author     thegeorg <[email protected]>  2022-02-10 16:45:08 +0300
committer  Daniil Cherednik <[email protected]>  2022-02-10 16:45:08 +0300
commit     4e839db24a3bbc9f1c610c43d6faaaa99824dcca (patch)
tree       506dac10f5df94fab310584ee51b24fc5a081c22 /contrib/libs/apache
parent     2d37894b1b037cf24231090eda8589bbb44fb6fc (diff)
Restoring authorship annotation for <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/apache')
-rw-r--r--  contrib/libs/apache/arrow/CHANGELOG.md  20112
-rw-r--r--  contrib/libs/apache/arrow/CODE_OF_CONDUCT.md  48
-rw-r--r--  contrib/libs/apache/arrow/CONTRIBUTING.md  154
-rw-r--r--  contrib/libs/apache/arrow/LICENSE.txt  4484
-rw-r--r--  contrib/libs/apache/arrow/NOTICE.txt  168
-rw-r--r--  contrib/libs/apache/arrow/README.md  170
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc  1190
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h  362
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc  2138
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h  114
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc  12
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h  42
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc  394
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h  114
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h  22
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h  40
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc  318
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h  86
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h  132
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc  70
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h  62
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h  68
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc  60
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h  98
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h  76
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h  102
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc  166
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc  48
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/data.h  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc  472
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/util.h  22
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc  668
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h  44
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/buffer.h  12
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h  132
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/builder.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc  40
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc  56
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compare.cc  1098
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compare.h  52
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc  292
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h  594
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc  754
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h  1410
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc  296
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h  342
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc  178
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h  68
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc  742
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h  62
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc  1646
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h  574
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc  2372
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h  538
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h  672
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc  536
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h  202
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc  3298
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h  1270
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc  476
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h  188
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc  1220
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h  344
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc  556
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h  342
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc  382
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h  272
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc  226
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h  1252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h  168
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc  902
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h  236
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h  242
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc  644
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc  986
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc  328
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc  344
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc  316
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h  806
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc  2758
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc  3100
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc  886
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc  252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc  114
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h  26
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc  126
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc  364
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc  254
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc  156
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc  754
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc  162
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc  3460
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc  234
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc  288
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc  6768
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc  1326
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc  234
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc  42
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h  222
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc  340
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc  52
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc  1080
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc  766
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc  3270
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc  108
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h  26
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/config.cc  72
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/config.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/datum.cc  120
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/datum.h  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc  308
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h  76
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/file.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc  116
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h  156
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc  190
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h  164
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc  26
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h  54
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h  22
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc  82
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc  46
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h  122
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc  120
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h  12
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc  106
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h  34
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h  132
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc  956
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h  102
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc  150
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h  72
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc  570
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc  140
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc  122
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/result.h  116
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc  142
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/scalar.h  48
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/status.cc  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/status.h  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h  292
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/table.cc  24
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/table.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc  174
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor.h  24
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type.cc  372
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type.h  332
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h  126
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h  202
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h  66
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h  3228
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc  1376
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h  326
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h  360
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h  706
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h  866
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc  112
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h  74
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h  334
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h  24
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc  116
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h  98
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h  372
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h  176
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h  204
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h  1252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc  452
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h  204
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc  60
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h  46
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc  94
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc  204
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h  38
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc  662
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h  234
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc  104
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h  68
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h  362
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc  36
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h  156
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc  252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/future.h  1418
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc  192
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h  42
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h  56
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc  336
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h  86
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h  316
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc  44
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h  52
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h  88
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h  266
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h  1652
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h  196
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/string.h  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc  168
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h  70
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc  834
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h  206
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc  272
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h  458
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h  66
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h  82
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc  46
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h  452
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h  184
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h  830
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h  202
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h  434
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/visitor.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp  34
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h  48
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp  14830
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h  5834
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/README  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc  1800
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h  310
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc  2496
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h  686
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc  1582
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h  244
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc  2174
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h  368
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc  444
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h  102
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc  964
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h  218
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc  324
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h  494
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_page.h  320
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc  3604
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h  752
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc  182
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h  524
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc  4134
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h  540
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc  5094
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encoding.h  920
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc  824
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h  1020
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h  232
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc  220
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc  480
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h  242
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc  340
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h  218
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/exception.cc  54
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/exception.h  316
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc  1736
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h  376
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc  1094
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h  468
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/hasher.h  144
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc  164
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h  80
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h  130
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc  366
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h  398
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h  714
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc  3566
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/metadata.h  968
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc  444
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h  108
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/platform.cc  82
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/platform.h  222
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/printer.cc  594
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/printer.h  92
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/properties.cc  128
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/properties.h  1626
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/schema.cc  1890
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/schema.h  988
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h  108
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc  1770
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/statistics.h  684
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc  1042
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h  598
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc  648
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h  486
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/symbols.map  80
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h  988
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h  86
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/types.cc  3134
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/types.h  1530
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h  60
-rw-r--r--  contrib/libs/apache/arrow/src/arrow/util/config.h  30
-rw-r--r--  contrib/libs/apache/arrow/src/parquet/parquet_version.h  62
-rw-r--r--  contrib/libs/apache/arrow/ya.make  400
-rw-r--r--  contrib/libs/apache/avro/.yandex_meta/devtools.licenses.report  14
-rw-r--r--  contrib/libs/apache/avro/AUTHORS  8
-rw-r--r--  contrib/libs/apache/avro/ChangeLog  2
-rw-r--r--  contrib/libs/apache/avro/LICENSE  522
-rw-r--r--  contrib/libs/apache/avro/MSBUILD.md  66
-rw-r--r--  contrib/libs/apache/avro/NEWS  10
-rw-r--r--  contrib/libs/apache/avro/NOTICE  12
-rw-r--r--  contrib/libs/apache/avro/README  138
-rw-r--r--  contrib/libs/apache/avro/api/AvroParse.hh  170
-rw-r--r--  contrib/libs/apache/avro/api/AvroSerialize.hh  132
-rw-r--r--  contrib/libs/apache/avro/api/AvroTraits.hh  232
-rw-r--r--  contrib/libs/apache/avro/api/Compiler.hh  126
-rw-r--r--  contrib/libs/apache/avro/api/Config.hh  90
-rw-r--r--  contrib/libs/apache/avro/api/DataFile.hh  838
-rw-r--r--  contrib/libs/apache/avro/api/Decoder.hh  452
-rw-r--r--  contrib/libs/apache/avro/api/Encoder.hh  346
-rw-r--r--  contrib/libs/apache/avro/api/Exception.hh  92
-rw-r--r--  contrib/libs/apache/avro/api/Generic.hh  298
-rw-r--r--  contrib/libs/apache/avro/api/GenericDatum.hh  1152
-rw-r--r--  contrib/libs/apache/avro/api/Layout.hh  166
-rw-r--r--  contrib/libs/apache/avro/api/LogicalType.hh  130
-rw-r--r--  contrib/libs/apache/avro/api/Node.hh  422
-rw-r--r--  contrib/libs/apache/avro/api/NodeConcepts.hh  434
-rw-r--r--  contrib/libs/apache/avro/api/NodeImpl.hh  1238
-rw-r--r--  contrib/libs/apache/avro/api/Parser.hh  302
-rw-r--r--  contrib/libs/apache/avro/api/Reader.hh  418
-rw-r--r--  contrib/libs/apache/avro/api/Resolver.hh  114
-rw-r--r--  contrib/libs/apache/avro/api/ResolverSchema.hh  112
-rw-r--r--  contrib/libs/apache/avro/api/ResolvingReader.hh  108
-rw-r--r--  contrib/libs/apache/avro/api/Schema.hh  292
-rw-r--r--  contrib/libs/apache/avro/api/SchemaResolution.hh  110
-rw-r--r--  contrib/libs/apache/avro/api/Serializer.hh  270
-rw-r--r--  contrib/libs/apache/avro/api/Specific.hh  696
-rw-r--r--  contrib/libs/apache/avro/api/Stream.hh  966
-rw-r--r--  contrib/libs/apache/avro/api/Types.hh  228
-rw-r--r--  contrib/libs/apache/avro/api/ValidSchema.hh  132
-rw-r--r--  contrib/libs/apache/avro/api/Validator.hh  308
-rw-r--r--  contrib/libs/apache/avro/api/Writer.hh  372
-rw-r--r--  contrib/libs/apache/avro/api/Zigzag.hh  86
-rw-r--r--  contrib/libs/apache/avro/api/buffer/Buffer.hh  1052
-rw-r--r--  contrib/libs/apache/avro/api/buffer/BufferReader.hh  576
-rw-r--r--  contrib/libs/apache/avro/api/buffer/detail/BufferDetail.hh  1108
-rw-r--r--  contrib/libs/apache/avro/api/buffer/detail/BufferDetailIterator.hh  460
-rw-r--r--  contrib/libs/apache/avro/avro/AvroParse.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/AvroSerialize.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/AvroTraits.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Compiler.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Config.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/DataFile.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Decoder.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Encoder.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Exception.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Generic.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/GenericDatum.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Layout.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/LogicalType.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Node.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/NodeConcepts.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/NodeImpl.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Parser.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Reader.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Resolver.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/ResolverSchema.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/ResolvingReader.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Schema.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/SchemaResolution.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Serializer.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Specific.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Stream.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Types.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/ValidSchema.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Validator.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Writer.hh  2
-rw-r--r--  contrib/libs/apache/avro/avro/Zigzag.hh  2
-rw-r--r--  contrib/libs/apache/avro/impl/BinaryDecoder.cc  504
-rw-r--r--  contrib/libs/apache/avro/impl/BinaryEncoder.cc  336
-rw-r--r--  contrib/libs/apache/avro/impl/Compiler.cc  1182
-rw-r--r--  contrib/libs/apache/avro/impl/DataFile.cc  1200
-rw-r--r--  contrib/libs/apache/avro/impl/FileStream.cc  794
-rw-r--r--  contrib/libs/apache/avro/impl/Generic.cc  520
-rw-r--r--  contrib/libs/apache/avro/impl/GenericDatum.cc  210
-rw-r--r--  contrib/libs/apache/avro/impl/LogicalType.cc  168
-rw-r--r--  contrib/libs/apache/avro/impl/Node.cc  322
-rw-r--r--  contrib/libs/apache/avro/impl/NodeImpl.cc  1094
-rw-r--r--  contrib/libs/apache/avro/impl/Resolver.cc  1744
-rw-r--r--  contrib/libs/apache/avro/impl/ResolverSchema.cc  78
-rw-r--r--  contrib/libs/apache/avro/impl/Schema.cc  278
-rw-r--r--  contrib/libs/apache/avro/impl/Stream.cc  396
-rw-r--r--  contrib/libs/apache/avro/impl/Types.cc  164
-rw-r--r--  contrib/libs/apache/avro/impl/ValidSchema.cc  386
-rw-r--r--  contrib/libs/apache/avro/impl/Validator.cc  602
-rw-r--r--  contrib/libs/apache/avro/impl/Zigzag.cc  172
-rw-r--r--  contrib/libs/apache/avro/impl/json/JsonDom.cc  406
-rw-r--r--  contrib/libs/apache/avro/impl/json/JsonDom.hh  324
-rw-r--r--  contrib/libs/apache/avro/impl/json/JsonIO.cc  884
-rw-r--r--  contrib/libs/apache/avro/impl/json/JsonIO.hh  964
-rw-r--r--  contrib/libs/apache/avro/impl/parsing/JsonCodec.cc  1436
-rw-r--r--  contrib/libs/apache/avro/impl/parsing/ResolvingDecoder.cc  1480
-rw-r--r--  contrib/libs/apache/avro/impl/parsing/Symbol.cc  222
-rw-r--r--  contrib/libs/apache/avro/impl/parsing/Symbol.hh  1708
-rw-r--r--  contrib/libs/apache/avro/impl/parsing/ValidatingCodec.cc  1182
-rw-r--r--  contrib/libs/apache/avro/impl/parsing/ValidatingCodec.hh  102
-rw-r--r--  contrib/libs/apache/avro/ya.make  104
-rw-r--r--  contrib/libs/apache/orc/c++/src/Adaptor.hh  4
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/InputStream.cc  2
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/InputStream.hh  2
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/OutputStream.cc  2
-rw-r--r--  contrib/libs/apache/orc/c++/src/io/OutputStream.hh  2
-rw-r--r--  contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h  2
-rw-r--r--  contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh  2
-rw-r--r--  contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h  2
-rw-r--r--  contrib/libs/apache/orc/ya.make  12
-rw-r--r--  contrib/libs/apache/ya.make  10
459 files changed, 115171 insertions, 115171 deletions
diff --git a/contrib/libs/apache/arrow/CHANGELOG.md b/contrib/libs/apache/arrow/CHANGELOG.md
index 3597d22334c..0c7aed36bdd 100644
--- a/contrib/libs/apache/arrow/CHANGELOG.md
+++ b/contrib/libs/apache/arrow/CHANGELOG.md
@@ -1,10056 +1,10056 @@
-
-# Apache Arrow 5.0.0 (2021-07-22)
-
-## Bug Fixes
-
-* [ARROW-6189](https://issues.apache.org/jira/browse/ARROW-6189) - [Rust] [Parquet] Plain encoded boolean column chunks limited to 2048 values
-* [ARROW-6312](https://issues.apache.org/jira/browse/ARROW-6312) - [C++] Declare required Libs.private in arrow.pc package config
-* [ARROW-7948](https://issues.apache.org/jira/browse/ARROW-7948) - [Go][Integration] Decimal integration failures
-* [ARROW-9594](https://issues.apache.org/jira/browse/ARROW-9594) - [Python] DictionaryArray.to\_numpy does not correctly convert null indexes to null values
-* [ARROW-10910](https://issues.apache.org/jira/browse/ARROW-10910) - [Python] Segmentation Fault when None given to read\_table with legacy dataset
-* [ARROW-10958](https://issues.apache.org/jira/browse/ARROW-10958) - [GLib] "Nested data conversions not implemented" through glib, but not through pyarrow
-* [ARROW-11077](https://issues.apache.org/jira/browse/ARROW-11077) - [Rust] ParquetFileArrowReader panicks when trying to read nested list
-* [ARROW-11146](https://issues.apache.org/jira/browse/ARROW-11146) - [CI][Python] Failing conda-python-3.8-jpype Nightly Build
-* [ARROW-11161](https://issues.apache.org/jira/browse/ARROW-11161) - [Python][C++] S3Filesystem: file Content-Type not set correctly?
-* [ARROW-11633](https://issues.apache.org/jira/browse/ARROW-11633) - [CI] [Documentation] Maven default skin not found
-* [ARROW-11780](https://issues.apache.org/jira/browse/ARROW-11780) - [C++][Python] StructArray.from\_arrays() crashes Python interpreter
-* [ARROW-11908](https://issues.apache.org/jira/browse/ARROW-11908) - [Rust] Intermittent Flight integration test failures
-* [ARROW-12007](https://issues.apache.org/jira/browse/ARROW-12007) - [C++] Loading parquet file returns "Invalid UTF8 payload" error
-* [ARROW-12055](https://issues.apache.org/jira/browse/ARROW-12055) - [R] is.na() evaluates to FALSE on Arrow NaN values
-* [ARROW-12096](https://issues.apache.org/jira/browse/ARROW-12096) - [Python][C++] Pyarrow Parquet reader overflows INT96 timestamps when converting to Arrow Array (timestamp[ns])
-* [ARROW-12122](https://issues.apache.org/jira/browse/ARROW-12122) - [Python] Cannot install via pip M1 mac
-* [ARROW-12142](https://issues.apache.org/jira/browse/ARROW-12142) - [Python] undefined symbol: \_ZN5arrow6StatusC1ENS\_10StatusCodeERKNSt7\_\_cxx1112basic\_stringIcSt11char\_traitsIcESaIcEEE
-* [ARROW-12150](https://issues.apache.org/jira/browse/ARROW-12150) - [Python] Bad type inference of mixed-precision Decimals
-* [ARROW-12232](https://issues.apache.org/jira/browse/ARROW-12232) - [Rust][Datafusion] Error with CAST: Unsupported SQL type Time
-* [ARROW-12240](https://issues.apache.org/jira/browse/ARROW-12240) - [Python] invalid-offsetof warning from apple clang-12
-* [ARROW-12377](https://issues.apache.org/jira/browse/ARROW-12377) - [Doc][Java] Java doc build broken
-* [ARROW-12407](https://issues.apache.org/jira/browse/ARROW-12407) - [Python] Deprecation warning when building PyArrow
-* [ARROW-12431](https://issues.apache.org/jira/browse/ARROW-12431) - [Python] pa.array mask inverted when type is binary and value to be converted is numpy array
-* [ARROW-12472](https://issues.apache.org/jira/browse/ARROW-12472) - [Python] read\_table fails when passing a PEP519 filesystem object
-* [ARROW-12482](https://issues.apache.org/jira/browse/ARROW-12482) - [Doc][Python] Mention CSVStreamingReader pitfalls with type inference
-* [ARROW-12491](https://issues.apache.org/jira/browse/ARROW-12491) - [Packaging] Required dependency on LZ4 \>= 1.8 missing from CentOS RPM packages
-* [ARROW-12503](https://issues.apache.org/jira/browse/ARROW-12503) - [C++] Ensure using "lib/" for jemalloc's library directory
-* [ARROW-12508](https://issues.apache.org/jira/browse/ARROW-12508) - [R] expect\_as\_vector implementation causes test failure on R <= 3.3 and variables defined outside of test\_that break build when no arrow install
-* [ARROW-12543](https://issues.apache.org/jira/browse/ARROW-12543) - [CI][Python] Failing conda-python-3.9 Nightly Build
-* [ARROW-12568](https://issues.apache.org/jira/browse/ARROW-12568) - [Python][C++] Segfault when casting a sliced ListArray of int64 in v4.0.0
-* [ARROW-12569](https://issues.apache.org/jira/browse/ARROW-12569) - [R] [CI] Run revdep in CI
-* [ARROW-12570](https://issues.apache.org/jira/browse/ARROW-12570) - [JS] Fix issues that blocked the v4.0.0 release
-* [ARROW-12579](https://issues.apache.org/jira/browse/ARROW-12579) - [Python] Pyarrow 4.0.0 dependency numpy 1.19.4 throws errors on Apple silicon/M1 compilation
-* [ARROW-12589](https://issues.apache.org/jira/browse/ARROW-12589) - [C++] Compiling on windows doesn't work when -DARROW\_WITH\_BACKTRACE=OFF
-* [ARROW-12601](https://issues.apache.org/jira/browse/ARROW-12601) - [R][Packaging] Fix pkg-config check in r/configure
-* [ARROW-12604](https://issues.apache.org/jira/browse/ARROW-12604) - [R][Packaging] Dataset, Parquet off in autobrew and CRAN Mac builds
-* [ARROW-12605](https://issues.apache.org/jira/browse/ARROW-12605) - [Documentation] Repair line numbers in dataset.rst
-* [ARROW-12606](https://issues.apache.org/jira/browse/ARROW-12606) - [C++] Quantile and Mode functions failing on arrays with offset
-* [ARROW-12610](https://issues.apache.org/jira/browse/ARROW-12610) - [C++] Skip TestS3FSGeneric TestDeleteDir and TestDeleteDirContents on windows as they are flaky
-* [ARROW-12611](https://issues.apache.org/jira/browse/ARROW-12611) - [CI][Python] Nightly test-conda-python-pandas-0.24 is failing due to numpy compat issue
-* [ARROW-12613](https://issues.apache.org/jira/browse/ARROW-12613) - [Python] AttributeError when comparing a Scalar with None
-* [ARROW-12614](https://issues.apache.org/jira/browse/ARROW-12614) - [C++][Compute] Revert support for Tables in ExecuteScalarExpression
-* [ARROW-12617](https://issues.apache.org/jira/browse/ARROW-12617) - [Python] pyarrow.orc.write\_table signature reverses that of pyarrow.parquet.write\_table
-* [ARROW-12620](https://issues.apache.org/jira/browse/ARROW-12620) - [C++] Dataset writing can only include projected columns if input columns are also included
-* [ARROW-12622](https://issues.apache.org/jira/browse/ARROW-12622) - [Python] Segfault when reading CSV inside Flight server
-* [ARROW-12630](https://issues.apache.org/jira/browse/ARROW-12630) - [Dev][Integration] conda-integration docker build fails
-* [ARROW-12639](https://issues.apache.org/jira/browse/ARROW-12639) - [CI][Archery] Archery build fails to create branch
-* [ARROW-12640](https://issues.apache.org/jira/browse/ARROW-12640) - [C++] Fix errors from VS 2019 in cpp/src/parquet/types.h
-* [ARROW-12642](https://issues.apache.org/jira/browse/ARROW-12642) - [R] LIBARROW\_MINIMAL, LIBARROW\_DOWNLOAD, NOT\_CRAN env vars should not be case-sensitive
-* [ARROW-12644](https://issues.apache.org/jira/browse/ARROW-12644) - [C++][Dataset] Support reading date/time-partitioned datasets accounting for URL encoding (Spark)
-* [ARROW-12646](https://issues.apache.org/jira/browse/ARROW-12646) - [C++][CI][Packaging][Python] Bump vcpkg version to its latest release
-* [ARROW-12663](https://issues.apache.org/jira/browse/ARROW-12663) - [C++] segfault when arrow header is compiled with nvcc 11.2
-* [ARROW-12668](https://issues.apache.org/jira/browse/ARROW-12668) - [C++][Dataset] CountRows occasionally segfaulting
-* [ARROW-12670](https://issues.apache.org/jira/browse/ARROW-12670) - [C++] extract\_regex gives bizarre behavior after nulls or non-matches
-* [ARROW-12672](https://issues.apache.org/jira/browse/ARROW-12672) - [C++] Segfault casting result of "fill\_null()" (not bitmap but unknown null\_count)
-* [ARROW-12679](https://issues.apache.org/jira/browse/ARROW-12679) - [Java] JDBC adapter does not preserve SQL-nullability
-* [ARROW-12684](https://issues.apache.org/jira/browse/ARROW-12684) - [Go][Flight] Fix nil dereference in error case
-* [ARROW-12708](https://issues.apache.org/jira/browse/ARROW-12708) - [C++] Valgrind errors when calling negate\_checked
-* [ARROW-12729](https://issues.apache.org/jira/browse/ARROW-12729) - [R] Fix length method for Table, RecordBatch
-* [ARROW-12746](https://issues.apache.org/jira/browse/ARROW-12746) - [Go][Flight] Client Auth handler overwrites outgoing metadata
-* [ARROW-12756](https://issues.apache.org/jira/browse/ARROW-12756) - [C++] MSVC build fails with latest gtest from vcpkg
-* [ARROW-12757](https://issues.apache.org/jira/browse/ARROW-12757) - [Dev][Archery] Warning about RUST variable in "archery docker run"
-* [ARROW-12762](https://issues.apache.org/jira/browse/ARROW-12762) - [Python] ListType doesn't preserve field name after pickle and unpickle
-* [ARROW-12769](https://issues.apache.org/jira/browse/ARROW-12769) - [Python] Negative out of range slices yield invalid arrays
-* [ARROW-12771](https://issues.apache.org/jira/browse/ARROW-12771) - [C++] Arrow compute hash\_count skips following chunked arrays in streaming execution
-* [ARROW-12772](https://issues.apache.org/jira/browse/ARROW-12772) - [CI] Merge script test fails due to missing dependency
-* [ARROW-12773](https://issues.apache.org/jira/browse/ARROW-12773) - [Docs] Clarify Java support for ORC and Parquet via JNI bindings
-* [ARROW-12774](https://issues.apache.org/jira/browse/ARROW-12774) - [C++][Compute] replace\_substring\_regex() creates invalid arrays =\> crash
-* [ARROW-12776](https://issues.apache.org/jira/browse/ARROW-12776) - [Archery][Integration] Fix decimal case generation in write\_js\_test\_json
-* [ARROW-12779](https://issues.apache.org/jira/browse/ARROW-12779) - [Python][FlightRPC] Flight server segfaults with certain data
-* [ARROW-12780](https://issues.apache.org/jira/browse/ARROW-12780) - [CI][C++] MinGW builds failing when trying to build Gandiva
-* [ARROW-12790](https://issues.apache.org/jira/browse/ARROW-12790) - [Python] Cannot read from HDFS with blanks in path names
-* [ARROW-12793](https://issues.apache.org/jira/browse/ARROW-12793) - [Python] PYARROW\_BUILD\_TYPE=Debug does not work correctly
-* [ARROW-12797](https://issues.apache.org/jira/browse/ARROW-12797) - [JS] Update readme with new links and remove outdated examples
-* [ARROW-12798](https://issues.apache.org/jira/browse/ARROW-12798) - [JS] Use == null Comparison
-* [ARROW-12799](https://issues.apache.org/jira/browse/ARROW-12799) - [JS] Use Nullish Coalescing Operator (??) For Defaults
-* [ARROW-12804](https://issues.apache.org/jira/browse/ARROW-12804) - [C++] Array methods IsNull and IsValid are confused for NullType
-* [ARROW-12807](https://issues.apache.org/jira/browse/ARROW-12807) - [C++] Fix merge conflicts with Future refactor/async IPC
-* [ARROW-12838](https://issues.apache.org/jira/browse/ARROW-12838) - [Java][Gandiva] Fix JNI CI test for Gandiva
-* [ARROW-12842](https://issues.apache.org/jira/browse/ARROW-12842) - [Java][FlightRPC] Error metadata from FlightStatusException is not propagated to client
-* [ARROW-12850](https://issues.apache.org/jira/browse/ARROW-12850) - [R] is.nan() evaluates to null on Arrow null values
-* [ARROW-12854](https://issues.apache.org/jira/browse/ARROW-12854) - [Dev][Release] Windows wheel verification script fails to download artifacts
-* [ARROW-12857](https://issues.apache.org/jira/browse/ARROW-12857) - [C++] hash\_aggregate\_test not building on master
-* [ARROW-12864](https://issues.apache.org/jira/browse/ARROW-12864) - [C++] Remove needless out argument from arrow::internal::InvertBitmap
-* [ARROW-12865](https://issues.apache.org/jira/browse/ARROW-12865) - [C++][Python] Python FlightRPC server cannot find RE2 symbols
-* [ARROW-12882](https://issues.apache.org/jira/browse/ARROW-12882) - [C++][Gandiva] Fix behavior of convert\_replace function for empty replacement char
-* [ARROW-12887](https://issues.apache.org/jira/browse/ARROW-12887) - [CI] AppVeyor pip install failure during setup
-* [ARROW-12906](https://issues.apache.org/jira/browse/ARROW-12906) - [Python] \`fill\_null\` called with a null value seg faults on non fixed-sized types.
-* [ARROW-12907](https://issues.apache.org/jira/browse/ARROW-12907) - [Java] Memory leak possible when exception reading from channel happens
-* [ARROW-12911](https://issues.apache.org/jira/browse/ARROW-12911) - [Python] Export scalar aggregate options to pc.sum (sum of zero rows gives null; should give 0)
-* [ARROW-12917](https://issues.apache.org/jira/browse/ARROW-12917) - [C++][R][pyarrow] Failure importing some decimal types using the C data interface
-* [ARROW-12918](https://issues.apache.org/jira/browse/ARROW-12918) - [C++] Build errors with Visual Studio 16.10.31321.278
-* [ARROW-12919](https://issues.apache.org/jira/browse/ARROW-12919) - [Developer Tools] Crossbow comment bot failing to react to comments
-* [ARROW-12935](https://issues.apache.org/jira/browse/ARROW-12935) - [C++][CI] Compiler error on some clang versions
-* [ARROW-12941](https://issues.apache.org/jira/browse/ARROW-12941) - [C++] csv reader skip\_row does not properly update num\_rows\_seen
-* [ARROW-12942](https://issues.apache.org/jira/browse/ARROW-12942) - [C++][Compute] The result of Arrow compute hash\_min\_max is incorrect if there are new groups in the subsequent chunks
-* [ARROW-12956](https://issues.apache.org/jira/browse/ARROW-12956) - [C++] Fix crash on Parquet file (OSS-Fuzz)
-* [ARROW-12969](https://issues.apache.org/jira/browse/ARROW-12969) - [C++] match\_substring doesn't match empty needle to empty haystack
-* [ARROW-12974](https://issues.apache.org/jira/browse/ARROW-12974) - [R] test-r-without-arrow build fails because of example requiring Arrow
-* [ARROW-12983](https://issues.apache.org/jira/browse/ARROW-12983) - [C++][Python] Converter::Extend gets stuck in infinite loop causing OOM if values don't fit in single chunk
-* [ARROW-12987](https://issues.apache.org/jira/browse/ARROW-12987) - [CI] test-ubuntu-18.04 nightly builds are failing due to Gandiva "TestUpper" test failure
-* [ARROW-12988](https://issues.apache.org/jira/browse/ARROW-12988) - [CI] The kartothek nightly integration build is failing (test\_update\_dataset\_from\_ddf\_empty)
-* [ARROW-12989](https://issues.apache.org/jira/browse/ARROW-12989) - [CI] "Dev PR" jobs unduly cancelled
-* [ARROW-12991](https://issues.apache.org/jira/browse/ARROW-12991) - [CI] Travis ARM builds often crash
-* [ARROW-12993](https://issues.apache.org/jira/browse/ARROW-12993) - [Python] Address boundary error with invalid Feather file and stackprinter
-* [ARROW-12995](https://issues.apache.org/jira/browse/ARROW-12995) - [C++] CSV reader should validate options
-* [ARROW-12998](https://issues.apache.org/jira/browse/ARROW-12998) - [C++] Datasets needs dependency on xsimd
-* [ARROW-13001](https://issues.apache.org/jira/browse/ARROW-13001) - [Go] Build failure in parquet/internal/bmi on s390x
-* [ARROW-13003](https://issues.apache.org/jira/browse/ARROW-13003) - [C++] unaligned access in compute/exec/ cc files
-* [ARROW-13008](https://issues.apache.org/jira/browse/ARROW-13008) - [C++] Deprecation warning when compiling minimal example
-* [ARROW-13010](https://issues.apache.org/jira/browse/ARROW-13010) - [C++][Compute] Support outputting to slices from kleene kernels
-* [ARROW-13018](https://issues.apache.org/jira/browse/ARROW-13018) - [C++][Docs] Use consistent terminology for nulls (min\_count) in scalar aggregate kernels
-* [ARROW-13026](https://issues.apache.org/jira/browse/ARROW-13026) - [C++][CI] s390x job setup fails
-* [ARROW-13037](https://issues.apache.org/jira/browse/ARROW-13037) - [R] Incorrect param when creating Expression crashes R
-* [ARROW-13039](https://issues.apache.org/jira/browse/ARROW-13039) - [R] Fix error message handling
-* [ARROW-13041](https://issues.apache.org/jira/browse/ARROW-13041) - [C++] Unary kernels can leave uninitialized data under null entries
-* [ARROW-13046](https://issues.apache.org/jira/browse/ARROW-13046) - [Release] JS package failing test prior to publish
-* [ARROW-13048](https://issues.apache.org/jira/browse/ARROW-13048) - [C++] S3FileSystem fails moving filepaths containing = or +
-* [ARROW-13053](https://issues.apache.org/jira/browse/ARROW-13053) - [Python] Build fails on MacOS Big Sur using homebrewed Arrow libraries
-* [ARROW-13069](https://issues.apache.org/jira/browse/ARROW-13069) - [Website] Add Daniël to committer list
-* [ARROW-13073](https://issues.apache.org/jira/browse/ARROW-13073) - [Developer] archery benchmark list: unexpected keyword 'benchmark\_filter'
-* [ARROW-13080](https://issues.apache.org/jira/browse/ARROW-13080) - [Release] Generate the API docs in ubuntu 20.10
-* [ARROW-13083](https://issues.apache.org/jira/browse/ARROW-13083) - [Python] Wrong SCM version detection both in setup.py and crossbow
-* [ARROW-13085](https://issues.apache.org/jira/browse/ARROW-13085) - [Python] Apache Arrow minimal cpp build segfaults with pyarrow libs
-* [ARROW-13090](https://issues.apache.org/jira/browse/ARROW-13090) - [Python] Test failure with fsspec 2021.6.0
-* [ARROW-13104](https://issues.apache.org/jira/browse/ARROW-13104) - [C++] ByteStreamSplit implementation uses invalid pointer cast
-* [ARROW-13108](https://issues.apache.org/jira/browse/ARROW-13108) - [Python] Pyarrow 4.0.0 crashes upon import on macOS 10.13.6
-* [ARROW-13116](https://issues.apache.org/jira/browse/ARROW-13116) - [R] Test for RecordBatchReader to C-interface fails on arrow-r-minimal due to missing dependencies
-* [ARROW-13125](https://issues.apache.org/jira/browse/ARROW-13125) - [R] Throw error when 2+ args passed to desc() in arrange()
-* [ARROW-13128](https://issues.apache.org/jira/browse/ARROW-13128) - [C\#] TimestampArray conversion logic for nano and micro is wrong
-* [ARROW-13135](https://issues.apache.org/jira/browse/ARROW-13135) - [C++] Fix Status propagation in END\_PARQUET\_CATCH\_EXCEPTIONS
-* [ARROW-13139](https://issues.apache.org/jira/browse/ARROW-13139) - [C++] ReadaheadGenerator cannot be safely copied/moved
-* [ARROW-13145](https://issues.apache.org/jira/browse/ARROW-13145) - [C++][CI] Flight test crashes on MinGW
-* [ARROW-13148](https://issues.apache.org/jira/browse/ARROW-13148) - [Dev][Archery] Crossbow build submission fails
-* [ARROW-13153](https://issues.apache.org/jira/browse/ARROW-13153) - [C++] \`parquet\_dataset\` loses ordering of files in \`\_metadata\`
-* [ARROW-13154](https://issues.apache.org/jira/browse/ARROW-13154) - [C++] Unions can not have 126 and 127 as type\_codes
-* [ARROW-13169](https://issues.apache.org/jira/browse/ARROW-13169) - [R] [C++] sorted partition keys can cause issues
-* [ARROW-13173](https://issues.apache.org/jira/browse/ARROW-13173) - [C++] TestAsyncUtil.ReadaheadFailed asserts occasionally
-* [ARROW-13187](https://issues.apache.org/jira/browse/ARROW-13187) - [C++][Python] Possibly memory not deallocated when reading in CSV
-* [ARROW-13189](https://issues.apache.org/jira/browse/ARROW-13189) - [R] Disable row-level metadata application on datasets
-* [ARROW-13203](https://issues.apache.org/jira/browse/ARROW-13203) - [R] Fix optional component checks causing failures
-* [ARROW-13207](https://issues.apache.org/jira/browse/ARROW-13207) - [Python][Doc] Dataset documentation still suggests deprecated scan method as the preferred iterative approach
-* [ARROW-13216](https://issues.apache.org/jira/browse/ARROW-13216) - [R] Type checks test fails with rtools35
-* [ARROW-13217](https://issues.apache.org/jira/browse/ARROW-13217) - [C++][Gandiva] Correct convert\_replace function for invalid chars on string beginning
-* [ARROW-13223](https://issues.apache.org/jira/browse/ARROW-13223) - [C++][CI] Fix thread sanitizer failures
-* [ARROW-13225](https://issues.apache.org/jira/browse/ARROW-13225) - [Go][Flight] Implement Custom Middleware Interface and Enable Integration Tests
-* [ARROW-13229](https://issues.apache.org/jira/browse/ARROW-13229) - [Python] ascii\_trim, ascii\_ltrim and ascii\_rtrim lack options
-* [ARROW-13239](https://issues.apache.org/jira/browse/ARROW-13239) - [Doc][Python] Dataset.head function doesn't mention required argument
-* [ARROW-13243](https://issues.apache.org/jira/browse/ARROW-13243) - [R] altrep function call in R 3.5
-* [ARROW-13246](https://issues.apache.org/jira/browse/ARROW-13246) - [C++] CSV skip\_rows\_after\_names can discard data prematurely
-* [ARROW-13249](https://issues.apache.org/jira/browse/ARROW-13249) - [Java][CI] Consistent timeout in the Java JNI build
-* [ARROW-13253](https://issues.apache.org/jira/browse/ARROW-13253) - [C++][FlightRPC] Segfault when sending record batch \>2GB
-* [ARROW-13254](https://issues.apache.org/jira/browse/ARROW-13254) - [Python] Processes killed and semaphore objects leaked when reading pandas data
-* [ARROW-13265](https://issues.apache.org/jira/browse/ARROW-13265) - [R] cli valgrind errors in nightlies
-* [ARROW-13266](https://issues.apache.org/jira/browse/ARROW-13266) - [JS] Improve benchmark names & add suite name to json
-* [ARROW-13281](https://issues.apache.org/jira/browse/ARROW-13281) - [C++][Gandiva] Error on timestampDiffMonth function behavior for negative diff values
-* [ARROW-13284](https://issues.apache.org/jira/browse/ARROW-13284) - [C++] Wrong pkg\_check\_modules() option name
-* [ARROW-13288](https://issues.apache.org/jira/browse/ARROW-13288) - [Python] Missing default values of kernel options in PyArrow
-* [ARROW-13290](https://issues.apache.org/jira/browse/ARROW-13290) - Compilation fails on clang-12 and gcc-11 due to missing include
-* [ARROW-13305](https://issues.apache.org/jira/browse/ARROW-13305) - [C++] Unable to install nightly on Ubuntu 21.04 due to CSV options
-* [ARROW-13315](https://issues.apache.org/jira/browse/ARROW-13315) - [R] Wrap r\_task\_group includes with ARROW\_R\_WITH\_ARROW checking
-* [ARROW-13321](https://issues.apache.org/jira/browse/ARROW-13321) - [C++][Python] MakeArrayFromScalar doesn't work for FixedSizeBinaryType
-* [ARROW-13324](https://issues.apache.org/jira/browse/ARROW-13324) - [R] Typo in bindings for utf8\_reverse and ascii\_reverse
-* [ARROW-13332](https://issues.apache.org/jira/browse/ARROW-13332) - [C++] TSAN failure in TestAsyncUtil.ReadaheadFailed
-* [ARROW-13341](https://issues.apache.org/jira/browse/ARROW-13341) - [C++] Segfault in arrow-compute-plan-test ExecPlanExecution.SourceScalarAggSink
-* [ARROW-13350](https://issues.apache.org/jira/browse/ARROW-13350) - [Python][CI] conda-python-3.7-pandas-0.24 nightly build failing in test\_extract\_datetime\_components
-* [ARROW-13352](https://issues.apache.org/jira/browse/ARROW-13352) - [C++] Valgrind failure in case\_when kernel
-* [ARROW-13353](https://issues.apache.org/jira/browse/ARROW-13353) - [Documentation] Build failing with sphinx.util.cfamily.DefinitionError
-* [ARROW-13360](https://issues.apache.org/jira/browse/ARROW-13360) - [C++] Missing dependencies in C++ thirdparty offline dependencies versions.txt
-* [ARROW-13363](https://issues.apache.org/jira/browse/ARROW-13363) - [R] is.nan() errors on non-floating point data
-* [ARROW-13368](https://issues.apache.org/jira/browse/ARROW-13368) - [C++][Doc] Rename project to make\_struct in docs
-* [ARROW-13381](https://issues.apache.org/jira/browse/ARROW-13381) - [C++] ArrayFromJSON doesn't work for float value dictionary type
-* [ARROW-13382](https://issues.apache.org/jira/browse/ARROW-13382) - [C++] Aggregation over scalars fails autobrew R job
-* [ARROW-13384](https://issues.apache.org/jira/browse/ARROW-13384) - [C++] Specify minimum required zstd version in cmake
-* [ARROW-13391](https://issues.apache.org/jira/browse/ARROW-13391) - [C++] CSV streaming reader does not include same error information as table reader
-* [ARROW-13417](https://issues.apache.org/jira/browse/ARROW-13417) - [C++] The merged generator can sometimes pull from source sync-reentrant
-* [ARROW-13419](https://issues.apache.org/jira/browse/ARROW-13419) - [JS] Fix perf tests
-* [ARROW-13428](https://issues.apache.org/jira/browse/ARROW-13428) - [C++][Flight] -lssl is missing with bundled gRPC and system shared OpenSSL
-* [ARROW-13431](https://issues.apache.org/jira/browse/ARROW-13431) - [Release] Bump go version to 1.15; don't verify rust source anymore
-* [ARROW-13432](https://issues.apache.org/jira/browse/ARROW-13432) - [Release] Fix ssh connection to the binary uploader container
-
-
-## New Features and Improvements
-
-* [ARROW-2665](https://issues.apache.org/jira/browse/ARROW-2665) - [Python/C++] Add index() method to find first occurrence of Python scalar
-* [ARROW-3014](https://issues.apache.org/jira/browse/ARROW-3014) - [C++] Minimal writer adapter for ORC file format
-* [ARROW-3316](https://issues.apache.org/jira/browse/ARROW-3316) - [R] Multi-threaded conversion from R data.frame to Arrow table / record batch
-* [ARROW-5385](https://issues.apache.org/jira/browse/ARROW-5385) - [Go] implement EXTENSION datatype
-* [ARROW-5640](https://issues.apache.org/jira/browse/ARROW-5640) - [Go] implement Map array
-* [ARROW-6513](https://issues.apache.org/jira/browse/ARROW-6513) - [CI] The conda environment files arrow/ci/conda\_env\_\*.yml should have .txt extension
-* [ARROW-7001](https://issues.apache.org/jira/browse/ARROW-7001) - [C++] Develop threading APIs to accommodate nested parallelism
-* [ARROW-7114](https://issues.apache.org/jira/browse/ARROW-7114) - [JS][CI] NodeJS build fails on Github Actions Windows node
-* [ARROW-7252](https://issues.apache.org/jira/browse/ARROW-7252) - [Rust] [Parquet] Reading UTF-8/JSON/ENUM field results in a lot of vec allocation
-* [ARROW-7396](https://issues.apache.org/jira/browse/ARROW-7396) - [Format] Register media types (MIME types) for Apache Arrow formats to IANA
-* [ARROW-8421](https://issues.apache.org/jira/browse/ARROW-8421) - [Rust] [Parquet] Implement parquet writer
-* [ARROW-8459](https://issues.apache.org/jira/browse/ARROW-8459) - [Dev][Archery] Use a more recent cmake-format
-* [ARROW-8527](https://issues.apache.org/jira/browse/ARROW-8527) - [C++][CSV] Add support for ReadOptions::skip\_rows \>= block\_size
-* [ARROW-8655](https://issues.apache.org/jira/browse/ARROW-8655) - [C++][Dataset][Python][R] Preserve partitioning information for a discovered Dataset
-* [ARROW-8676](https://issues.apache.org/jira/browse/ARROW-8676) - [Rust] Create implementation of IPC RecordBatch body buffer compression from ARROW-300
-* [ARROW-9054](https://issues.apache.org/jira/browse/ARROW-9054) - [C++] Add ScalarAggregateOptions
-* [ARROW-9056](https://issues.apache.org/jira/browse/ARROW-9056) - [C++] Support scalar aggregation over scalars
-* [ARROW-9140](https://issues.apache.org/jira/browse/ARROW-9140) - [R] Zero-copy Arrow to R where possible
-* [ARROW-9295](https://issues.apache.org/jira/browse/ARROW-9295) - [Archery] Support rust clippy in the lint command
-* [ARROW-9299](https://issues.apache.org/jira/browse/ARROW-9299) - [Python] Expose ORC metadata() in Python ORCFile
-* [ARROW-9313](https://issues.apache.org/jira/browse/ARROW-9313) - [Rust] Use feature enum
-* [ARROW-9421](https://issues.apache.org/jira/browse/ARROW-9421) - [C++][Parquet] Redundancies in SchemaManifest::GetFieldIndices
-* [ARROW-9430](https://issues.apache.org/jira/browse/ARROW-9430) - [C++/Python] Kernel for SetItem(BooleanArray, values)
-* [ARROW-9697](https://issues.apache.org/jira/browse/ARROW-9697) - [C++][Dataset] num\_rows method for Dataset/Scanner
-* [ARROW-10031](https://issues.apache.org/jira/browse/ARROW-10031) - [Java] Support Java benchmark in Archery
-* [ARROW-10115](https://issues.apache.org/jira/browse/ARROW-10115) - [C++] CSV empty quoted string is treated as NULL
-* [ARROW-10316](https://issues.apache.org/jira/browse/ARROW-10316) - [Python] Consider using \_\_wrapped\_\_ for compute function introspection
-* [ARROW-10391](https://issues.apache.org/jira/browse/ARROW-10391) - [Rust] [Parquet] Nested Arrow reader
-* [ARROW-10440](https://issues.apache.org/jira/browse/ARROW-10440) - [C++][Dataset][Python] Add a callback to visit file writers just before Finish()
-* [ARROW-10550](https://issues.apache.org/jira/browse/ARROW-10550) - [Rust] [Parquet] Write nested types (struct, list)
-* [ARROW-10557](https://issues.apache.org/jira/browse/ARROW-10557) - [C++] Add scalar string slicing/substring extract kernel
-* [ARROW-10640](https://issues.apache.org/jira/browse/ARROW-10640) - [C++] An "if\_else" kernel to combine two arrays based on a mask
-* [ARROW-10658](https://issues.apache.org/jira/browse/ARROW-10658) - [Python][Packaging] Wheel builds for Apple Silicon
-* [ARROW-10675](https://issues.apache.org/jira/browse/ARROW-10675) - [C++][Python] Support AWS S3 Web identity credentials
-* [ARROW-10797](https://issues.apache.org/jira/browse/ARROW-10797) - [C++] Investigate faster random generation for tests and benchmarks
-* [ARROW-10926](https://issues.apache.org/jira/browse/ARROW-10926) - [Rust] Add parquet reader / writer for decimal types
-* [ARROW-10959](https://issues.apache.org/jira/browse/ARROW-10959) - [C++] Add scalar string join kernel
-* [ARROW-11061](https://issues.apache.org/jira/browse/ARROW-11061) - [Rust] Validate array properties against schema
-* [ARROW-11173](https://issues.apache.org/jira/browse/ARROW-11173) - Add Map type as reader / writer in FieldReader / FieldWriter
-* [ARROW-11199](https://issues.apache.org/jira/browse/ARROW-11199) - [C++][Python] Fix the unit tests for the ORC reader
-* [ARROW-11206](https://issues.apache.org/jira/browse/ARROW-11206) - [C++][Compute][Python] Rename "project" kernel to "make\_struct"
-* [ARROW-11342](https://issues.apache.org/jira/browse/ARROW-11342) - [Python] [Gandiva] Expose ToString and result type information
-* [ARROW-11499](https://issues.apache.org/jira/browse/ARROW-11499) - [Packaging] Remove all use of bintray
-* [ARROW-11514](https://issues.apache.org/jira/browse/ARROW-11514) - [R][C++] Bindings for paste(), paste0(), str\_c()
-* [ARROW-11515](https://issues.apache.org/jira/browse/ARROW-11515) - [R] Bindings for strsplit
-* [ARROW-11565](https://issues.apache.org/jira/browse/ARROW-11565) - [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT\_CAP function
-* [ARROW-11608](https://issues.apache.org/jira/browse/ARROW-11608) - [CI] turbodbc integration tests are failing (build issue)
-* [ARROW-11660](https://issues.apache.org/jira/browse/ARROW-11660) - [C++] Move RecordBatch::SelectColumns method from R to C++ library
-* [ARROW-11673](https://issues.apache.org/jira/browse/ARROW-11673) - [C++] Casting dictionary type to use different index type
-* [ARROW-11675](https://issues.apache.org/jira/browse/ARROW-11675) - [CI][C++] Resolve ctest failures on VS 2019 builds
-* [ARROW-11705](https://issues.apache.org/jira/browse/ARROW-11705) - [R] Support scalar value recycling in RecordBatch/Table$create()
-* [ARROW-11759](https://issues.apache.org/jira/browse/ARROW-11759) - [C++] Kernel to extract datetime components (year, month, day, etc) from timestamp type
-* [ARROW-11769](https://issues.apache.org/jira/browse/ARROW-11769) - [R] Pull groups from grouped\_df into RecordBatch or Table
-* [ARROW-11772](https://issues.apache.org/jira/browse/ARROW-11772) - [C++] Add asynchronous read to ipc::RecordBatchFileReader
-* [ARROW-11782](https://issues.apache.org/jira/browse/ARROW-11782) - [GLib][Ruby][Dataset] Remove bindings for internal classes
-* [ARROW-11787](https://issues.apache.org/jira/browse/ARROW-11787) - [R] Implement write csv
-* [ARROW-11843](https://issues.apache.org/jira/browse/ARROW-11843) - [C++] Add asynchronous read to parquet::arrow::FileReader
-* [ARROW-11849](https://issues.apache.org/jira/browse/ARROW-11849) - [R] Use roxygen @examplesIf tag in R docs
-* [ARROW-11889](https://issues.apache.org/jira/browse/ARROW-11889) - [C++] Add parallelism to streaming CSV reader
-* [ARROW-11909](https://issues.apache.org/jira/browse/ARROW-11909) - [C++] Get rid of MakeIteratorGenerator
-* [ARROW-11926](https://issues.apache.org/jira/browse/ARROW-11926) - [R] Pass on the new UCRT CRAN windows builds
-* [ARROW-11928](https://issues.apache.org/jira/browse/ARROW-11928) - [C++][Compute] Add ExecNode hierarchy
-* [ARROW-11929](https://issues.apache.org/jira/browse/ARROW-11929) - [C++][Compute] Promote Expression to the compute namespace
-* [ARROW-11930](https://issues.apache.org/jira/browse/ARROW-11930) - [C++][Dataset][Compute] Refactor Dataset scans to use an ExecNode graph
-* [ARROW-11932](https://issues.apache.org/jira/browse/ARROW-11932) - [C++] Provide ArrayBuilder::AppendScalar
-* [ARROW-11950](https://issues.apache.org/jira/browse/ARROW-11950) - [C++][Compute] Add unary negative kernel
-* [ARROW-11960](https://issues.apache.org/jira/browse/ARROW-11960) - [C++][Gandiva] Support escape in LIKE
-* [ARROW-11980](https://issues.apache.org/jira/browse/ARROW-11980) - [Python] Remove "experimental" status from Table.replace\_schema\_metadata
-* [ARROW-11986](https://issues.apache.org/jira/browse/ARROW-11986) - [C++][Gandiva] Implement IN expressions for doubles and floats
-* [ARROW-11990](https://issues.apache.org/jira/browse/ARROW-11990) - [C++][Compute] Use Status/Result return consistently to indicate errors
-* [ARROW-12004](https://issues.apache.org/jira/browse/ARROW-12004) - [C++] Result<detail::Empty\> is annoying
-* [ARROW-12010](https://issues.apache.org/jira/browse/ARROW-12010) - [C++][Compute] Improve performance of the hash table used in GroupIdentifier
-* [ARROW-12016](https://issues.apache.org/jira/browse/ARROW-12016) - [C++] Implement array\_sort\_indices and sort\_indices for BOOL type
-* [ARROW-12050](https://issues.apache.org/jira/browse/ARROW-12050) - [C++][Python][FlightRPC] Use StopToken to enable interrupting long Flight operations
-* [ARROW-12074](https://issues.apache.org/jira/browse/ARROW-12074) - [C++][Compute] Add scalar arithmetic kernels for decimal inputs
-* [ARROW-12083](https://issues.apache.org/jira/browse/ARROW-12083) - [R] schema use in open\_dataset
-* [ARROW-12166](https://issues.apache.org/jira/browse/ARROW-12166) - [C++][Gandiva] Implement CONVERT\_TO(value, type) function
-* [ARROW-12184](https://issues.apache.org/jira/browse/ARROW-12184) - [R] Bindings for na.fail, na.omit, na.exclude, na.pass
-* [ARROW-12185](https://issues.apache.org/jira/browse/ARROW-12185) - [R] Bindings for any, all
-* [ARROW-12198](https://issues.apache.org/jira/browse/ARROW-12198) - [R] bindings for strptime
-* [ARROW-12199](https://issues.apache.org/jira/browse/ARROW-12199) - [R] bindings for stddev, variance
-* [ARROW-12205](https://issues.apache.org/jira/browse/ARROW-12205) - [C++][Gandiva] Implement TO\_TIME([number] secs) and TO\_TIMESTAMP([number] secs) functions
-* [ARROW-12231](https://issues.apache.org/jira/browse/ARROW-12231) - [C++][Dataset] Separate datasets backed by readers from InMemoryDataset
-* [ARROW-12253](https://issues.apache.org/jira/browse/ARROW-12253) - [Rust] [Ballista] Implement scalable joins
-* [ARROW-12255](https://issues.apache.org/jira/browse/ARROW-12255) - [Rust] [Ballista] Integrate scheduler with DataFusion
-* [ARROW-12256](https://issues.apache.org/jira/browse/ARROW-12256) - [Rust] [Ballista] Add DataFrame support
-* [ARROW-12257](https://issues.apache.org/jira/browse/ARROW-12257) - [Rust] [Ballista] Publish user guide to Arrow site
-* [ARROW-12261](https://issues.apache.org/jira/browse/ARROW-12261) - [Rust] [Ballista] Ballista should not have its own DataFrame API
-* [ARROW-12291](https://issues.apache.org/jira/browse/ARROW-12291) - [R] Determine the type of an unevaluated expression
-* [ARROW-12310](https://issues.apache.org/jira/browse/ARROW-12310) - [Java] ValueVector\#getObject should support covariance for complex types
-* [ARROW-12355](https://issues.apache.org/jira/browse/ARROW-12355) - [C++] Implement efficient async CSV scanning
-* [ARROW-12362](https://issues.apache.org/jira/browse/ARROW-12362) - [Rust] [DataFusion] topk\_query test failure
-* [ARROW-12364](https://issues.apache.org/jira/browse/ARROW-12364) - [Python] [Dataset] Add metadata\_collector option to ds.write\_dataset()
-* [ARROW-12378](https://issues.apache.org/jira/browse/ARROW-12378) - [C++][Gandiva] Implement castVARBINARY functions
-* [ARROW-12386](https://issues.apache.org/jira/browse/ARROW-12386) - [C++] Support file parallelism in AsyncScanner
-* [ARROW-12391](https://issues.apache.org/jira/browse/ARROW-12391) - [Rust][DataFusion] Implement date\_trunc() function
-* [ARROW-12392](https://issues.apache.org/jira/browse/ARROW-12392) - [C++] Restore asynchronous streaming CSV reader
-* [ARROW-12393](https://issues.apache.org/jira/browse/ARROW-12393) - [JS] Optimally use closure compiler
-* [ARROW-12403](https://issues.apache.org/jira/browse/ARROW-12403) - [Rust] [Ballista] Integration tests should check that query results are correct
-* [ARROW-12415](https://issues.apache.org/jira/browse/ARROW-12415) - [CI] [Python] ERROR: Failed building wheel for pygit2 on ARM64
-* [ARROW-12424](https://issues.apache.org/jira/browse/ARROW-12424) - [Go][Parquet] Add Schema Package
-* [ARROW-12428](https://issues.apache.org/jira/browse/ARROW-12428) - [Python] pyarrow.parquet.read\_\* should use pre\_buffer=True
-* [ARROW-12434](https://issues.apache.org/jira/browse/ARROW-12434) - [Rust] [Ballista] Show executed plans with metrics
-* [ARROW-12442](https://issues.apache.org/jira/browse/ARROW-12442) - [CI] Set job timeouts on GitHub Actions
-* [ARROW-12443](https://issues.apache.org/jira/browse/ARROW-12443) - [C++][Gandiva] Implement castVARCHAR function for binary input
-* [ARROW-12444](https://issues.apache.org/jira/browse/ARROW-12444) - [RUST] [CI] Remove Rust and point integration tests to arrow-rs repo
-* [ARROW-12445](https://issues.apache.org/jira/browse/ARROW-12445) - [Rust] Design and implement packaging process to bundle Rust in signed tar
-* [ARROW-12468](https://issues.apache.org/jira/browse/ARROW-12468) - [Python][R] Expose UseAsync to python/R
-* [ARROW-12478](https://issues.apache.org/jira/browse/ARROW-12478) - [C++] Support LLVM 12
-* [ARROW-12484](https://issues.apache.org/jira/browse/ARROW-12484) - [CI] Change jinja macros to not require CROSSBOW\_TOKEN to upload artifacts in Github Actions
-* [ARROW-12489](https://issues.apache.org/jira/browse/ARROW-12489) - [Developer] autotune is broken
-* [ARROW-12490](https://issues.apache.org/jira/browse/ARROW-12490) - [Dev] Use miniforge for all platforms
-* [ARROW-12492](https://issues.apache.org/jira/browse/ARROW-12492) - [Python] Add a helper method to decode a DictionaryArray back to a plain Array
-* [ARROW-12496](https://issues.apache.org/jira/browse/ARROW-12496) - [C++][Dataset] Ensure Scanner tests fully cover async
-* [ARROW-12499](https://issues.apache.org/jira/browse/ARROW-12499) - [C++][Compute][R] Add ScalarAggregateOptions to Any and All kernels
-* [ARROW-12500](https://issues.apache.org/jira/browse/ARROW-12500) - [C++][Dataset] Consolidate similar tests for file formats
-* [ARROW-12501](https://issues.apache.org/jira/browse/ARROW-12501) - [CI][Ruby] Remove needless workaround for MinGW build
-* [ARROW-12507](https://issues.apache.org/jira/browse/ARROW-12507) - [CI] Remove duplicated cron/nightly builds
-* [ARROW-12512](https://issues.apache.org/jira/browse/ARROW-12512) - [C++][Dataset] Implement CSV writing support
-* [ARROW-12514](https://issues.apache.org/jira/browse/ARROW-12514) - [Release] Don't run Gandiva related Ruby test with ARROW\_GANDIVA=OFF
-* [ARROW-12517](https://issues.apache.org/jira/browse/ARROW-12517) - [Go] Expose App Metadata in Flight client
-* [ARROW-12518](https://issues.apache.org/jira/browse/ARROW-12518) - [Python] Expose Parquet statistics has\_null\_count / has\_distinct\_count
-* [ARROW-12520](https://issues.apache.org/jira/browse/ARROW-12520) - [R] Minor docs updates
-* [ARROW-12522](https://issues.apache.org/jira/browse/ARROW-12522) - [C++] Implement asynchronous/"lazy" variants of ReadRangeCache
-* [ARROW-12525](https://issues.apache.org/jira/browse/ARROW-12525) - [JS] Vector toJSON returns an array
-* [ARROW-12527](https://issues.apache.org/jira/browse/ARROW-12527) - [Dev] Don't try getting JIRA information for MINOR PR
-* [ARROW-12528](https://issues.apache.org/jira/browse/ARROW-12528) - [JS] Support typed arrays in Table.new
-* [ARROW-12530](https://issues.apache.org/jira/browse/ARROW-12530) - [C++] Remove Buffer::mutable\_data\_ member and use const\_cast on data\_ only if is\_mutable\_ is true
-* [ARROW-12533](https://issues.apache.org/jira/browse/ARROW-12533) - [C++] Random real generator is slow on Arm64 Linux when built with clang
-* [ARROW-12534](https://issues.apache.org/jira/browse/ARROW-12534) - [C++][Gandiva] Implement LEFT and RIGHT functions on Gandiva for string input values
-* [ARROW-12537](https://issues.apache.org/jira/browse/ARROW-12537) - [JS] Docs build should not include test sources
-* [ARROW-12541](https://issues.apache.org/jira/browse/ARROW-12541) - [Docs] Improve styling/readability of tables in the new doc theme
-* [ARROW-12551](https://issues.apache.org/jira/browse/ARROW-12551) - [Java][Release] Java post-release tests fail due to missing testing data
-* [ARROW-12554](https://issues.apache.org/jira/browse/ARROW-12554) - Allow duplicates in the value\_set for compute::is\_in
-* [ARROW-12555](https://issues.apache.org/jira/browse/ARROW-12555) - [Java][Release] Java post-release script misses dataset JNI bindings
-* [ARROW-12556](https://issues.apache.org/jira/browse/ARROW-12556) - [C++][Gandiva] Implement BYTESUBSTRING functions on Gandiva
-* [ARROW-12560](https://issues.apache.org/jira/browse/ARROW-12560) - [C++] Investigate utilizing aggressive thread task creation when adding callback to finished future
-* [ARROW-12567](https://issues.apache.org/jira/browse/ARROW-12567) - [C++][Gandiva] Implement LPAD and RPAD functions for string input values
-* [ARROW-12571](https://issues.apache.org/jira/browse/ARROW-12571) - [R][CI] Run nightly R with valgrind
-* [ARROW-12575](https://issues.apache.org/jira/browse/ARROW-12575) - [R] Use unary negative kernel
-* [ARROW-12577](https://issues.apache.org/jira/browse/ARROW-12577) - [Website] Use Artifactory instead of Bintray in all places
-* [ARROW-12578](https://issues.apache.org/jira/browse/ARROW-12578) - [JS] Simplify UTF8 handling in NodeJS
-* [ARROW-12581](https://issues.apache.org/jira/browse/ARROW-12581) - [C++][FlightRPC] Benchmark compression with real data
-* [ARROW-12584](https://issues.apache.org/jira/browse/ARROW-12584) - [C++][Python] Expose method for benchmarking tools to release unused memory from the allocators
-* [ARROW-12591](https://issues.apache.org/jira/browse/ARROW-12591) - [Java][Gandiva] Create single Gandiva jar for MacOS and Linux
-* [ARROW-12593](https://issues.apache.org/jira/browse/ARROW-12593) - [Packaging][Ubuntu] Add support for Ubuntu 21.04
-* [ARROW-12597](https://issues.apache.org/jira/browse/ARROW-12597) - [C++] Implement OptionalParallelForAsync
-* [ARROW-12598](https://issues.apache.org/jira/browse/ARROW-12598) - [C++][Dataset] Implement row-count for CSV or allow selecting 0 columns from CSV
-* [ARROW-12599](https://issues.apache.org/jira/browse/ARROW-12599) - [Doc][Python] Documentation missing for pyarrow.Table
-* [ARROW-12600](https://issues.apache.org/jira/browse/ARROW-12600) - [CI] Push docker images from crossbow tasks
-* [ARROW-12602](https://issues.apache.org/jira/browse/ARROW-12602) - [R] Add BuildInfo from C++ to arrow\_info
-* [ARROW-12608](https://issues.apache.org/jira/browse/ARROW-12608) - [C++] Add split\_pattern\_regex function
-* [ARROW-12612](https://issues.apache.org/jira/browse/ARROW-12612) - [C++][Compute] Add Expression to type\_fwd.h
-* [ARROW-12619](https://issues.apache.org/jira/browse/ARROW-12619) - [Python] pyarrow sdist should not require git
-* [ARROW-12621](https://issues.apache.org/jira/browse/ARROW-12621) - [C++][Gandiva] Add alias to sha1 and sha256 functions
-* [ARROW-12631](https://issues.apache.org/jira/browse/ARROW-12631) - [Python] pyarrow.dataset.write\_table should accept a Scanner to write
-* [ARROW-12643](https://issues.apache.org/jira/browse/ARROW-12643) - Add documentation for experimental repos
-* [ARROW-12645](https://issues.apache.org/jira/browse/ARROW-12645) - [Python] Fix numpydoc validation
-* [ARROW-12648](https://issues.apache.org/jira/browse/ARROW-12648) - [C++][FlightRPC] Allow using TLS in benchmark
-* [ARROW-12649](https://issues.apache.org/jira/browse/ARROW-12649) - [Python/Packaging] Move conda-aarch64 to Azure with cross-compilation
-* [ARROW-12653](https://issues.apache.org/jira/browse/ARROW-12653) - [Archery] Allow adding a comment to crossbow requests
-* [ARROW-12658](https://issues.apache.org/jira/browse/ARROW-12658) - [C++] Bump aws-c-common to v0.5.10
-* [ARROW-12660](https://issues.apache.org/jira/browse/ARROW-12660) - [R] Post-4.0 adjustments for CRAN
-* [ARROW-12661](https://issues.apache.org/jira/browse/ARROW-12661) - [C++] CSV add skip rows after column names
-* [ARROW-12662](https://issues.apache.org/jira/browse/ARROW-12662) - [Website] Force to use squash merge
-* [ARROW-12667](https://issues.apache.org/jira/browse/ARROW-12667) - [Python] Ensure test coverage for conversion of strided numpy arrays
-* [ARROW-12675](https://issues.apache.org/jira/browse/ARROW-12675) - [C++] CSV should include line/row numbers in parsing error messages
-* [ARROW-12677](https://issues.apache.org/jira/browse/ARROW-12677) - [Python] Add a mask argument to pyarrow.StructArray.from\_arrays
-* [ARROW-12685](https://issues.apache.org/jira/browse/ARROW-12685) - [C++][Compute] Add unary absolute value kernel
-* [ARROW-12686](https://issues.apache.org/jira/browse/ARROW-12686) - [C++][Python][FlightRPC] Support export\_to\_c in DoGet/inherit from RecordBatchReader
-* [ARROW-12687](https://issues.apache.org/jira/browse/ARROW-12687) - [C++][Python][Dataset] Support C Data Interface with Scanner
-* [ARROW-12689](https://issues.apache.org/jira/browse/ARROW-12689) - [R] Implement ArrowArrayStream C interface
-* [ARROW-12692](https://issues.apache.org/jira/browse/ARROW-12692) - [R] Improve tests and comments for strsplit() bindings
-* [ARROW-12694](https://issues.apache.org/jira/browse/ARROW-12694) - [R][CI] rtools35 job failing on 32-bit build tests
-* [ARROW-12696](https://issues.apache.org/jira/browse/ARROW-12696) - [R] Improve testing of error messages converted to warnings
-* [ARROW-12699](https://issues.apache.org/jira/browse/ARROW-12699) - [CI][Packaging][Java] Generate a jar compatible with Linux and MacOS for all Arrow components
-* [ARROW-12701](https://issues.apache.org/jira/browse/ARROW-12701) - [Website][Release] Include Rust and DataFusion commits, contributors, changes in release notes
-* [ARROW-12702](https://issues.apache.org/jira/browse/ARROW-12702) - [JS] Upgrade Webpack and terser
-* [ARROW-12703](https://issues.apache.org/jira/browse/ARROW-12703) - [JS] Separate Table from DataFrame
-* [ARROW-12704](https://issues.apache.org/jira/browse/ARROW-12704) - [JS] use optional chaining
-* [ARROW-12709](https://issues.apache.org/jira/browse/ARROW-12709) - [C++] Add variadic string join kernel
-* [ARROW-12713](https://issues.apache.org/jira/browse/ARROW-12713) - [C++] String reverse kernel
-* [ARROW-12715](https://issues.apache.org/jira/browse/ARROW-12715) - [C++] SQL-style glob string match kernel
-* [ARROW-12716](https://issues.apache.org/jira/browse/ARROW-12716) - [C++] Left/right/center string padding kernels
-* [ARROW-12717](https://issues.apache.org/jira/browse/ARROW-12717) - [C++] Substring find position kernel
-* [ARROW-12719](https://issues.apache.org/jira/browse/ARROW-12719) - [C++][Python] pyarrow.fs.S3FileSystem: pass extra kwargs, e.g. ACL
-* [ARROW-12721](https://issues.apache.org/jira/browse/ARROW-12721) - [CI] Fix path for uploading aarch64 conda artifacts from the nightly builds
-* [ARROW-12722](https://issues.apache.org/jira/browse/ARROW-12722) - [R] Raise error when attempting to print table with duplicated naming
-* [ARROW-12730](https://issues.apache.org/jira/browse/ARROW-12730) - [MATLAB] Update featherreadmex and featherwritemex to build against latest arrow c++ APIs
-* [ARROW-12731](https://issues.apache.org/jira/browse/ARROW-12731) - [R] Use InMemoryDataset for Table/RecordBatch in dplyr code
-* [ARROW-12736](https://issues.apache.org/jira/browse/ARROW-12736) - [C++] Eliminate unnecessary copy in FieldPath::Get()
-* [ARROW-12738](https://issues.apache.org/jira/browse/ARROW-12738) - [CI] [Gandiva] Nightly build error in azure-conda-osx-clang-py38 (and py39, py\*-r\*)
-* [ARROW-12741](https://issues.apache.org/jira/browse/ARROW-12741) - [CI] Configure GitHub Token for Nightly Builds
-* [ARROW-12745](https://issues.apache.org/jira/browse/ARROW-12745) - [C++][Compute] Add floor, ceiling, and truncate kernels
-* [ARROW-12749](https://issues.apache.org/jira/browse/ARROW-12749) - [C++] Unnecessary copy caused by constructing RecordBatch/Table/Schema from lvalues
-* [ARROW-12750](https://issues.apache.org/jira/browse/ARROW-12750) - [CI] [R] Actually pass parameterized docker options to the templates
-* [ARROW-12751](https://issues.apache.org/jira/browse/ARROW-12751) - [C++] Add variadic row-wise min/max kernels (least/greatest)
-* [ARROW-12758](https://issues.apache.org/jira/browse/ARROW-12758) - [R] Add examples to more function documentation
-* [ARROW-12760](https://issues.apache.org/jira/browse/ARROW-12760) - [C++][Python][R] S3FileSystem: IO thread parallelism limited to 8 threads
-* [ARROW-12761](https://issues.apache.org/jira/browse/ARROW-12761) - [R] Better error handling for write\_to\_raw
-* [ARROW-12764](https://issues.apache.org/jira/browse/ARROW-12764) - [CI] Fix arguments in Conda Windows builds
-* [ARROW-12777](https://issues.apache.org/jira/browse/ARROW-12777) - [R] Convert all inputs to Arrow objects in match\_arrow and is\_in
-* [ARROW-12781](https://issues.apache.org/jira/browse/ARROW-12781) - [R] Implement is.type() functions for dplyr
-* [ARROW-12785](https://issues.apache.org/jira/browse/ARROW-12785) - [CI] The r-devdocs build fails when installing gcc via brew
-* [ARROW-12791](https://issues.apache.org/jira/browse/ARROW-12791) - [R] Better error handling for DatasetFactory$Finish() when no format specified
-* [ARROW-12796](https://issues.apache.org/jira/browse/ARROW-12796) - [JS] Support JSON output from benchmarks
-* [ARROW-12800](https://issues.apache.org/jira/browse/ARROW-12800) - [JS] Drop IE Support and remove text encoder and decoder polyfills
-* [ARROW-12801](https://issues.apache.org/jira/browse/ARROW-12801) - [CI][Packaging][Java] Include all modules in script that generate Arrow jars
-* [ARROW-12806](https://issues.apache.org/jira/browse/ARROW-12806) - [Python] test\_write\_to\_dataset\_filesystem missing a dataset mark
-* [ARROW-12808](https://issues.apache.org/jira/browse/ARROW-12808) - [JS] Document browser support
-* [ARROW-12810](https://issues.apache.org/jira/browse/ARROW-12810) - [Python] Run tests with AWS\_EC2\_METADATA\_DISABLED=true
-* [ARROW-12812](https://issues.apache.org/jira/browse/ARROW-12812) - [Packaging][Java] Improve JNI jars build
-* [ARROW-12824](https://issues.apache.org/jira/browse/ARROW-12824) - [R][CI] Upgrade builds for R 4.1 release
-* [ARROW-12827](https://issues.apache.org/jira/browse/ARROW-12827) - [C++] [Dataset] Review error pass-through in the datasets API
-* [ARROW-12829](https://issues.apache.org/jira/browse/ARROW-12829) - [GLib][Ruby] Add support for Apache Arrow Flight
-* [ARROW-12831](https://issues.apache.org/jira/browse/ARROW-12831) - [CI][macOS] Remove needless Homebrew workaround
-* [ARROW-12832](https://issues.apache.org/jira/browse/ARROW-12832) - [JS] Write benchmarks in TypeScript
-* [ARROW-12833](https://issues.apache.org/jira/browse/ARROW-12833) - [JS] Construct perf data in JS
-* [ARROW-12835](https://issues.apache.org/jira/browse/ARROW-12835) - [C++] Implement case insensitive match in match\_substring(\_regex) and match\_like
-* [ARROW-12836](https://issues.apache.org/jira/browse/ARROW-12836) - [C++] Installation on IBM i fails because of CxxFlags
-* [ARROW-12841](https://issues.apache.org/jira/browse/ARROW-12841) - [R] Add examples to more function documentation - part 2
-* [ARROW-12843](https://issues.apache.org/jira/browse/ARROW-12843) - [C++][Compute] Add is\_inf kernel for floating point arrays
-* [ARROW-12848](https://issues.apache.org/jira/browse/ARROW-12848) - [Release] Mail template points to 404
-* [ARROW-12851](https://issues.apache.org/jira/browse/ARROW-12851) - [Go][Parquet] Add Encoding Package Part 1
-* [ARROW-12856](https://issues.apache.org/jira/browse/ARROW-12856) - [C++][Gandiva] Implement castBIT and castBOOLEAN functions on Gandiva
-* [ARROW-12859](https://issues.apache.org/jira/browse/ARROW-12859) - [C++] Add ScalarFromJSON for easier testing
-* [ARROW-12861](https://issues.apache.org/jira/browse/ARROW-12861) - [C++][Compute] Add sign function kernels
-* [ARROW-12867](https://issues.apache.org/jira/browse/ARROW-12867) - [R] Bindings for abs()
-* [ARROW-12868](https://issues.apache.org/jira/browse/ARROW-12868) - [R] Bindings for find\_substring and find\_substring\_regex
-* [ARROW-12869](https://issues.apache.org/jira/browse/ARROW-12869) - [R] Bindings for utf8\_reverse and ascii\_reverse
-* [ARROW-12870](https://issues.apache.org/jira/browse/ARROW-12870) - [R] Bindings for stringr::str\_like
-* [ARROW-12875](https://issues.apache.org/jira/browse/ARROW-12875) - [JS] Upgrade Jest and other minor updates
-* [ARROW-12883](https://issues.apache.org/jira/browse/ARROW-12883) - [R] [CI] version compatibility fails on R 4.1
-* [ARROW-12891](https://issues.apache.org/jira/browse/ARROW-12891) - [C++][Compute][Dataset] Extract subtree pruning logic to compute::
-* [ARROW-12894](https://issues.apache.org/jira/browse/ARROW-12894) - [R] Bump R version
-* [ARROW-12895](https://issues.apache.org/jira/browse/ARROW-12895) - [CI] Use "concurrency" setting on Github Actions
-* [ARROW-12898](https://issues.apache.org/jira/browse/ARROW-12898) - [Release][C\#] Package upload script is broken
-* [ARROW-12900](https://issues.apache.org/jira/browse/ARROW-12900) - [Python][Documentation] Add missing np import in Reading Datasets docs
-* [ARROW-12901](https://issues.apache.org/jira/browse/ARROW-12901) - [R] Follow on to more examples
-* [ARROW-12909](https://issues.apache.org/jira/browse/ARROW-12909) - [R][Release] Build of ubuntu-docs is failing
-* [ARROW-12912](https://issues.apache.org/jira/browse/ARROW-12912) - [Website] Use .asf.yaml for publishing
-* [ARROW-12915](https://issues.apache.org/jira/browse/ARROW-12915) - [Release] Build of ubuntu-docs is failing on thrift
-* [ARROW-12936](https://issues.apache.org/jira/browse/ARROW-12936) - [C++][Gandiva] Implement ASCII Hive function on Gandiva
-* [ARROW-12937](https://issues.apache.org/jira/browse/ARROW-12937) - [C++] Allow specifying default metadata for new S3 files
-* [ARROW-12939](https://issues.apache.org/jira/browse/ARROW-12939) - [R] Simplify RTask stop handling
-* [ARROW-12940](https://issues.apache.org/jira/browse/ARROW-12940) - [R] Expose C interface as R6 methods
-* [ARROW-12948](https://issues.apache.org/jira/browse/ARROW-12948) - [C++] Add string slice replace kernel
-* [ARROW-12949](https://issues.apache.org/jira/browse/ARROW-12949) - [C++] Add string starts-with/ends-with kernels
-* [ARROW-12950](https://issues.apache.org/jira/browse/ARROW-12950) - [C++] Add substring count kernel
-* [ARROW-12951](https://issues.apache.org/jira/browse/ARROW-12951) - [C++] Refactor StringTransform
-* [ARROW-12952](https://issues.apache.org/jira/browse/ARROW-12952) - [C++] Add regex count kernel
-* [ARROW-12955](https://issues.apache.org/jira/browse/ARROW-12955) - [C++] Add additional type support for if\_else kernel
-* [ARROW-12957](https://issues.apache.org/jira/browse/ARROW-12957) - [R] rchk issues on cran
-* [ARROW-12961](https://issues.apache.org/jira/browse/ARROW-12961) - [C++] MSVC issues warning building PyArrow on Windows
-* [ARROW-12962](https://issues.apache.org/jira/browse/ARROW-12962) - [GLib][Ruby] Add Arrow::Scalar
-* [ARROW-12964](https://issues.apache.org/jira/browse/ARROW-12964) - [R] Add bindings for ifelse() and if\_else()
-* [ARROW-12966](https://issues.apache.org/jira/browse/ARROW-12966) - [Python] Expose Python binding for ElementWiseAggregateOptions
-* [ARROW-12967](https://issues.apache.org/jira/browse/ARROW-12967) - [R] Add bindings for pmin() and pmax()
-* [ARROW-12968](https://issues.apache.org/jira/browse/ARROW-12968) - [R] [CI] Add an rchk job to our nightlies
-* [ARROW-12972](https://issues.apache.org/jira/browse/ARROW-12972) - [CI][C++] archive\_write\_add\_filter\_zstd error on CentOS + ARM64
-* [ARROW-12975](https://issues.apache.org/jira/browse/ARROW-12975) - [C++][Python] if\_else kernel doesn't support upcasting
-* [ARROW-12982](https://issues.apache.org/jira/browse/ARROW-12982) - [C++] Re-enable unused-variable warning
-* [ARROW-12984](https://issues.apache.org/jira/browse/ARROW-12984) - [C++] Passing options parameter of Count/Index aggregation by reference
-* [ARROW-12985](https://issues.apache.org/jira/browse/ARROW-12985) - [Python][Packaging] Unable to install pygit2 in the arm64 wheel builds
-* [ARROW-12986](https://issues.apache.org/jira/browse/ARROW-12986) - [C++][Gandiva] Implement new cache eviction policy in Gandiva
-* [ARROW-12992](https://issues.apache.org/jira/browse/ARROW-12992) - [R] bindings for substr(), substring(), str\_sub()
-* [ARROW-12994](https://issues.apache.org/jira/browse/ARROW-12994) - [R] Fix tests that assume UTC local tz
-* [ARROW-12996](https://issues.apache.org/jira/browse/ARROW-12996) - [C++] CSV stream reader has no progress indication
-* [ARROW-13002](https://issues.apache.org/jira/browse/ARROW-13002) - [C++] Add a check for the utf8proc's version in CMake
-* [ARROW-13005](https://issues.apache.org/jira/browse/ARROW-13005) - [C++] Support filter/take for union data type
-* [ARROW-13006](https://issues.apache.org/jira/browse/ARROW-13006) - [C++][Gandiva] Implement BASE64 and UNBASE64 Hive functions on Gandiva
-* [ARROW-13009](https://issues.apache.org/jira/browse/ARROW-13009) - [Doc][Dev] Document builds mailing-list
-* [ARROW-13022](https://issues.apache.org/jira/browse/ARROW-13022) - [R] bindings for lubridate's year, isoyear, quarter, month, day, wday, yday, isoweek, hour, minute, and second functions
-* [ARROW-13025](https://issues.apache.org/jira/browse/ARROW-13025) - [C++][Compute] Enhance FunctionOptions with equality, debug representability, and serializability
-* [ARROW-13027](https://issues.apache.org/jira/browse/ARROW-13027) - [C++] Fix ASAN stack traces in CI
-* [ARROW-13030](https://issues.apache.org/jira/browse/ARROW-13030) - [CI][Go] Setup Arm64 golang CI
-* [ARROW-13031](https://issues.apache.org/jira/browse/ARROW-13031) - [JS] Support arm in closure compiler on macOS
-* [ARROW-13032](https://issues.apache.org/jira/browse/ARROW-13032) - [Java] Update guava version
-* [ARROW-13034](https://issues.apache.org/jira/browse/ARROW-13034) - [Python][Docs] Update outdated examples for hdfs/azure on the Parquet doc page
-* [ARROW-13036](https://issues.apache.org/jira/browse/ARROW-13036) - [Doc] Mention recommended file extension(s) for Arrow IPC
-* [ARROW-13042](https://issues.apache.org/jira/browse/ARROW-13042) - [C++] Automatic checks that kernels don't leave uninitialized data in output
-* [ARROW-13043](https://issues.apache.org/jira/browse/ARROW-13043) - [GLib][Ruby] Add GArrowEqualOptions
-* [ARROW-13044](https://issues.apache.org/jira/browse/ARROW-13044) - [Java] Union vectors should extend ValueVector
-* [ARROW-13045](https://issues.apache.org/jira/browse/ARROW-13045) - [Packaging][RPM][deb] Don't install system utf8proc if it's old
-* [ARROW-13047](https://issues.apache.org/jira/browse/ARROW-13047) - [Website] Add kiszk to committer list
-* [ARROW-13049](https://issues.apache.org/jira/browse/ARROW-13049) - [C++][Gandiva] Implement BIN Hive function on Gandiva
-* [ARROW-13050](https://issues.apache.org/jira/browse/ARROW-13050) - [C++][Gandiva] Implement SPACE Hive function on Gandiva
-* [ARROW-13054](https://issues.apache.org/jira/browse/ARROW-13054) - [C++] Add option to specify the first day of the week for the "day\_of\_week" temporal kernel
-* [ARROW-13064](https://issues.apache.org/jira/browse/ARROW-13064) - [C++] Add a general "if, ifelse, ..., else" kernel ("CASE WHEN")
-* [ARROW-13065](https://issues.apache.org/jira/browse/ARROW-13065) - [Packaging][RPM] Add missing required LZ4 version information
-* [ARROW-13068](https://issues.apache.org/jira/browse/ARROW-13068) - [GLib][Dataset] Change prefix to gadataset\_ from gad\_
-* [ARROW-13070](https://issues.apache.org/jira/browse/ARROW-13070) - [R] bindings for sd and var
-* [ARROW-13072](https://issues.apache.org/jira/browse/ARROW-13072) - [C++] Add bitwise arithmetic compute functions
-* [ARROW-13074](https://issues.apache.org/jira/browse/ARROW-13074) - [Python] Start with deprecating ParquetDataset custom attributes
-* [ARROW-13075](https://issues.apache.org/jira/browse/ARROW-13075) - [Python] Expose C data interface API for pyarrow.Field
-* [ARROW-13076](https://issues.apache.org/jira/browse/ARROW-13076) - [Java] Enable ExtensionType to use StructVector and UnionVector for underlying storage
-* [ARROW-13082](https://issues.apache.org/jira/browse/ARROW-13082) - [CI] Forward R argument to ubuntu-docs build
-* [ARROW-13086](https://issues.apache.org/jira/browse/ARROW-13086) - [Python] Expose Parquet ArrowReaderProperties::coerce\_int96\_timestamp\_unit\_
-* [ARROW-13091](https://issues.apache.org/jira/browse/ARROW-13091) - [Python] Add compression\_level argument to IpcWriteOptions constructor
-* [ARROW-13092](https://issues.apache.org/jira/browse/ARROW-13092) - [C++] CreateDir should fail if the target exists and is not a directory
-* [ARROW-13095](https://issues.apache.org/jira/browse/ARROW-13095) - [C++] Implement trigonometric compute functions
-* [ARROW-13096](https://issues.apache.org/jira/browse/ARROW-13096) - [C++] Implement logarithm compute functions
-* [ARROW-13097](https://issues.apache.org/jira/browse/ARROW-13097) - [C++] Provide a simple reflection utility for `struct`s
-* [ARROW-13098](https://issues.apache.org/jira/browse/ARROW-13098) - [Dev][Archery] Reorganize docker submodule to its own subpackage
-* [ARROW-13100](https://issues.apache.org/jira/browse/ARROW-13100) - [MATLAB] Integrate GoogleTest with MATLAB Interface C++ Code
-* [ARROW-13101](https://issues.apache.org/jira/browse/ARROW-13101) - [Python][Doc] pyarrow.FixedSizeListArray does not appear in the documentation
-* [ARROW-13110](https://issues.apache.org/jira/browse/ARROW-13110) - [C++] Deadlock can happen when using BackgroundGenerator without transferring callbacks
-* [ARROW-13113](https://issues.apache.org/jira/browse/ARROW-13113) - [R] Use RTasks to manage parallelism when converting Arrow to R
-* [ARROW-13117](https://issues.apache.org/jira/browse/ARROW-13117) - [R] Retain schema in new Expressions
-* [ARROW-13119](https://issues.apache.org/jira/browse/ARROW-13119) - [R] Set empty schema in scalar Expressions
-* [ARROW-13124](https://issues.apache.org/jira/browse/ARROW-13124) - [Ruby] Add support for memory view
-* [ARROW-13127](https://issues.apache.org/jira/browse/ARROW-13127) - [R] Valgrind nightly errors
-* [ARROW-13136](https://issues.apache.org/jira/browse/ARROW-13136) - [C++] Add a "coalesce" variadic scalar kernel
-* [ARROW-13137](https://issues.apache.org/jira/browse/ARROW-13137) - [C++][Documentation] Make in-table references consistent
-* [ARROW-13140](https://issues.apache.org/jira/browse/ARROW-13140) - [C++/Python] Upgrade libthrift pin in the nightlies
-* [ARROW-13142](https://issues.apache.org/jira/browse/ARROW-13142) - [Python] Use vector append when converting from list of non-strided numpy arrays
-* [ARROW-13147](https://issues.apache.org/jira/browse/ARROW-13147) - [Java] Respect the rounding policy when allocating vector buffers
-* [ARROW-13157](https://issues.apache.org/jira/browse/ARROW-13157) - [C++] Add find\_substring\_regex kernel and implement ignore\_case for find\_substring
-* [ARROW-13158](https://issues.apache.org/jira/browse/ARROW-13158) - [Python] Fix repr and contains of StructScalar with duplicate field names
-* [ARROW-13162](https://issues.apache.org/jira/browse/ARROW-13162) - [C++][Gandiva] Add new alias for extract date functions in Gandiva registry
-* [ARROW-13171](https://issues.apache.org/jira/browse/ARROW-13171) - [R] Add binding for str\_pad()
-* [ARROW-13190](https://issues.apache.org/jira/browse/ARROW-13190) - [C++] [Gandiva] Change behavior of INITCAP function
-* [ARROW-13194](https://issues.apache.org/jira/browse/ARROW-13194) - [Java][Document] Create prose document about Java algorithms
-* [ARROW-13195](https://issues.apache.org/jira/browse/ARROW-13195) - [R] Problem with rlang reverse dependency checks
-* [ARROW-13199](https://issues.apache.org/jira/browse/ARROW-13199) - [R] add ubuntu 21.04 to nightly builds
-* [ARROW-13200](https://issues.apache.org/jira/browse/ARROW-13200) - [R] Add binding for case\_when()
-* [ARROW-13201](https://issues.apache.org/jira/browse/ARROW-13201) - [R] Add binding for coalesce()
-* [ARROW-13210](https://issues.apache.org/jira/browse/ARROW-13210) - [Python][CI] Fix vcpkg caching mechanism for the macOS wheels
-* [ARROW-13211](https://issues.apache.org/jira/browse/ARROW-13211) - [C++][CI] Remove outdated Github Actions ARM builds
-* [ARROW-13212](https://issues.apache.org/jira/browse/ARROW-13212) - [Release] Support deploying to test PyPI in the python post release script
-* [ARROW-13215](https://issues.apache.org/jira/browse/ARROW-13215) - [R] [CI] Add ENV TZ to docker files
-* [ARROW-13218](https://issues.apache.org/jira/browse/ARROW-13218) - [Doc] Document/clarify conventions for timestamp storage
-* [ARROW-13219](https://issues.apache.org/jira/browse/ARROW-13219) - [C++][GLib] Demote/deprecate CompareOptions
-* [ARROW-13224](https://issues.apache.org/jira/browse/ARROW-13224) - [Python][Doc] Documentation missing for pyarrow.dataset.write\_dataset
-* [ARROW-13226](https://issues.apache.org/jira/browse/ARROW-13226) - [Python] Add a general purpose cython trampolining utility
-* [ARROW-13228](https://issues.apache.org/jira/browse/ARROW-13228) - [C++] S3 CreateBucket fails because AWS treats us-east-1 differently than other regions
-* [ARROW-13230](https://issues.apache.org/jira/browse/ARROW-13230) - Add CSV Writer documentation
-* [ARROW-13234](https://issues.apache.org/jira/browse/ARROW-13234) - [C++] Add string padding option to determine which side the extra space goes on
-* [ARROW-13235](https://issues.apache.org/jira/browse/ARROW-13235) - [C++] Make type\_name equal to options class name for all FunctionOptionTypes
-* [ARROW-13236](https://issues.apache.org/jira/browse/ARROW-13236) - [Python] Improve repr of pyarrow.compute.FunctionOptions
-* [ARROW-13238](https://issues.apache.org/jira/browse/ARROW-13238) - [C++][Dataset][Compute] Substitute ExecPlan impl for dataset scans
-* [ARROW-13242](https://issues.apache.org/jira/browse/ARROW-13242) - [C++] Improve decimal random generation
-* [ARROW-13244](https://issues.apache.org/jira/browse/ARROW-13244) - [C++] Add facility to get current thread id
-* [ARROW-13258](https://issues.apache.org/jira/browse/ARROW-13258) - [Python] Improve the repr of ParquetFileFragment
-* [ARROW-13262](https://issues.apache.org/jira/browse/ARROW-13262) - [R] transmute() fails after pulling data into R
-* [ARROW-13273](https://issues.apache.org/jira/browse/ARROW-13273) - [C++] Don't use .pc only in CMake paths for Requires.private
-* [ARROW-13274](https://issues.apache.org/jira/browse/ARROW-13274) - [JS] Remove Webpack
-* [ARROW-13275](https://issues.apache.org/jira/browse/ARROW-13275) - [JS] Fix perf tests
-* [ARROW-13276](https://issues.apache.org/jira/browse/ARROW-13276) - [GLib][Ruby][Flight] Add support for ListFlights
-* [ARROW-13277](https://issues.apache.org/jira/browse/ARROW-13277) - [JS] Add declaration maps
-* [ARROW-13280](https://issues.apache.org/jira/browse/ARROW-13280) - [R] Bindings for log and trig functions
-* [ARROW-13282](https://issues.apache.org/jira/browse/ARROW-13282) - [C++] Remove obsolete generated files
-* [ARROW-13283](https://issues.apache.org/jira/browse/ARROW-13283) - [Developer Tools] Support passing through memory limits in archery docker run
-* [ARROW-13286](https://issues.apache.org/jira/browse/ARROW-13286) - [CI] Require docker-compose 1.27.0 or later
-* [ARROW-13289](https://issues.apache.org/jira/browse/ARROW-13289) - [C++] Log functions don't have int kernels
-* [ARROW-13291](https://issues.apache.org/jira/browse/ARROW-13291) - [GLib][CI] Require gobject-introspection 3.4.5 or later
-* [ARROW-13296](https://issues.apache.org/jira/browse/ARROW-13296) - [C++] Provide reflection-compatible enum replacement
-* [ARROW-13299](https://issues.apache.org/jira/browse/ARROW-13299) - [JS] Upgrade ix and rxjs
-* [ARROW-13303](https://issues.apache.org/jira/browse/ARROW-13303) - [JS] Revise bundles
-* [ARROW-13306](https://issues.apache.org/jira/browse/ARROW-13306) - [Java][JDBC] use ResultSetMetaData.getColumnLabel instead of ResultSetMetaData.getColumnName
-* [ARROW-13313](https://issues.apache.org/jira/browse/ARROW-13313) - [C++][Compute] Add ScalarAggregateNode
-* [ARROW-13320](https://issues.apache.org/jira/browse/ARROW-13320) - [Website] Add MIME types to FAQ
-* [ARROW-13323](https://issues.apache.org/jira/browse/ARROW-13323) - [Archery] Validate docker compose configuration
-* [ARROW-13343](https://issues.apache.org/jira/browse/ARROW-13343) - [R] Update NEWS.md for 5.0
-* [ARROW-13346](https://issues.apache.org/jira/browse/ARROW-13346) - [C++] Remove compile time parsing from EnumType
-* [ARROW-13355](https://issues.apache.org/jira/browse/ARROW-13355) - [R] ensure that sf is installed in our revdep job
-* [ARROW-13357](https://issues.apache.org/jira/browse/ARROW-13357) - [R] bindings for sign()
-* [ARROW-13365](https://issues.apache.org/jira/browse/ARROW-13365) - [R] bindings for floor/ceiling/truncate
-* [ARROW-13385](https://issues.apache.org/jira/browse/ARROW-13385) - [C++][Compute] Document out-of-source addition to the FunctionRegistry
-* [ARROW-13386](https://issues.apache.org/jira/browse/ARROW-13386) - [R][C++] CSV streaming changes break Rtools 35 32-bit build
-* [ARROW-13418](https://issues.apache.org/jira/browse/ARROW-13418) - [R] typo in python.r
-* [PARQUET-1798](https://issues.apache.org/jira/browse/PARQUET-1798) - [C++] Review logic around automatic assignment of field\_ids
-* [PARQUET-1998](https://issues.apache.org/jira/browse/PARQUET-1998) - [C++] Implement LZ4\_RAW compression
-* [PARQUET-2056](https://issues.apache.org/jira/browse/PARQUET-2056) - [C++] Add ability for retrieving dictionary and indices separately for ColumnReader
-
-
-
-# Apache Arrow 3.0.0 (2021-01-18)
-
-## New Features and Improvements
-
-* [ARROW-1846](https://issues.apache.org/jira/browse/ARROW-1846) - [C++] Implement "any" reduction kernel for boolean data
-* [ARROW-3850](https://issues.apache.org/jira/browse/ARROW-3850) - [Python] Support MapType and StructType for enhanced PySpark integration
-* [ARROW-4193](https://issues.apache.org/jira/browse/ARROW-4193) - [Rust] Add support for decimal data type
-* [ARROW-4544](https://issues.apache.org/jira/browse/ARROW-4544) - [Rust] Read nested JSON structs into StructArrays
-* [ARROW-4804](https://issues.apache.org/jira/browse/ARROW-4804) - [Rust] Read temporal values from CSV - Parse Date32 and Date64 in CSV reader
-* [ARROW-4960](https://issues.apache.org/jira/browse/ARROW-4960) - [R] Add crossbow task for r-arrow-feedstock
-* [ARROW-4970](https://issues.apache.org/jira/browse/ARROW-4970) - [C++][Parquet] Implement parquet::FileMetaData::Equals
-* [ARROW-5336](https://issues.apache.org/jira/browse/ARROW-5336) - [C++] Implement arrow::Concatenate for dictionary-encoded arrays with unequal dictionaries
-* [ARROW-5350](https://issues.apache.org/jira/browse/ARROW-5350) - [Rust] Support filtering on primitive/string lists
-* [ARROW-5394](https://issues.apache.org/jira/browse/ARROW-5394) - [C++] Benchmarks for IsIn Kernel
-* [ARROW-5679](https://issues.apache.org/jira/browse/ARROW-5679) - [Python] Drop Python 3.5 from support matrix
-* [ARROW-5950](https://issues.apache.org/jira/browse/ARROW-5950) - [Rust] [DataFusion] Add ability to log via logger dependency
-* [ARROW-6071](https://issues.apache.org/jira/browse/ARROW-6071) - [C++] Implement casting Binary <-\> LargeBinary
-* [ARROW-6697](https://issues.apache.org/jira/browse/ARROW-6697) - [Rust] [DataFusion] Validate that all parquet partitions have the same schema
-* [ARROW-6715](https://issues.apache.org/jira/browse/ARROW-6715) - [Website] Describe that the "non-free" component is needed for Plasma packages on the install page
-* [ARROW-6883](https://issues.apache.org/jira/browse/ARROW-6883) - [C++] Support sending delta DictionaryBatch or replacement DictionaryBatch in IPC stream writer class
-* [ARROW-6995](https://issues.apache.org/jira/browse/ARROW-6995) - [Packaging][Crossbow] The windows conda artifacts are not uploaded to GitHub releases
-* [ARROW-7531](https://issues.apache.org/jira/browse/ARROW-7531) - [C++] Investigate header cost reduction
-* [ARROW-7800](https://issues.apache.org/jira/browse/ARROW-7800) - [Python] Expose GetRecordBatchReader API in PyArrow
-* [ARROW-7842](https://issues.apache.org/jira/browse/ARROW-7842) - [Rust] [Parquet] Implement array reader for list type
-* [ARROW-8113](https://issues.apache.org/jira/browse/ARROW-8113) - [C++] Implement a lighter-weight variant
-* [ARROW-8199](https://issues.apache.org/jira/browse/ARROW-8199) - [C++] Add support for multi-column sort on Table
-* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer
-* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata when writing parquet
-* [ARROW-8425](https://issues.apache.org/jira/browse/ARROW-8425) - [Rust] [Parquet] Add support for writing temporal types
-* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types
-* [ARROW-8853](https://issues.apache.org/jira/browse/ARROW-8853) - [Rust] [Integration Testing] Enable Flight tests
-* [ARROW-8876](https://issues.apache.org/jira/browse/ARROW-8876) - [C++] Implement casts from date types to Timestamp
-* [ARROW-8883](https://issues.apache.org/jira/browse/ARROW-8883) - [Rust] [Integration Testing] Enable passing tests and update spec doc
-* [ARROW-9001](https://issues.apache.org/jira/browse/ARROW-9001) - [R] Box outputs as correct type in call\_function
-* [ARROW-9164](https://issues.apache.org/jira/browse/ARROW-9164) - [C++] Provide APIs for adding "docstrings" to arrow::compute::Function classes that can be accessed by bindings
-* [ARROW-9187](https://issues.apache.org/jira/browse/ARROW-9187) - [R] Add bindings for arithmetic kernels
-* [ARROW-9296](https://issues.apache.org/jira/browse/ARROW-9296) - [CI][Rust] Enable more clippy lint checks
-* [ARROW-9304](https://issues.apache.org/jira/browse/ARROW-9304) - [C++] Add "AppendEmptyValue" builder APIs for use inside StructBuilder::AppendNull
-* [ARROW-9361](https://issues.apache.org/jira/browse/ARROW-9361) - [Rust] Move other array types into their own modules
-* [ARROW-9400](https://issues.apache.org/jira/browse/ARROW-9400) - [Python] Do not depend on conda-forge static libraries in Windows wheel builds
-* [ARROW-9475](https://issues.apache.org/jira/browse/ARROW-9475) - [Java] Clean up usages of BaseAllocator, use BufferAllocator instead
-* [ARROW-9489](https://issues.apache.org/jira/browse/ARROW-9489) - [C++] Add fill\_null kernel implementation for (array[string], scalar[string])
-* [ARROW-9555](https://issues.apache.org/jira/browse/ARROW-9555) - [Rust] [DataFusion] Add inner (hash) equijoin physical plan
-* [ARROW-9564](https://issues.apache.org/jira/browse/ARROW-9564) - [Packaging] Vendor r-arrow-feedstock conda-forge recipe
-* [ARROW-9674](https://issues.apache.org/jira/browse/ARROW-9674) - [Rust] Parquet reader should implement Send + Sync
-* [ARROW-9704](https://issues.apache.org/jira/browse/ARROW-9704) - [Java] TestEndianness.testLittleEndian fails on big endian platform
-* [ARROW-9707](https://issues.apache.org/jira/browse/ARROW-9707) - [Rust] [DataFusion] Re-implement threading model
-* [ARROW-9709](https://issues.apache.org/jira/browse/ARROW-9709) - [Java] Test cases in arrow-vector assume little-endian platform
-* [ARROW-9728](https://issues.apache.org/jira/browse/ARROW-9728) - [Rust] [Parquet] Compute nested definition and repetition for structs
-* [ARROW-9747](https://issues.apache.org/jira/browse/ARROW-9747) - [C++][Java][Format] Support Decimal256 Type
-* [ARROW-9771](https://issues.apache.org/jira/browse/ARROW-9771) - [Rust] [DataFusion] Predicate Pushdown Improvement: treat predicates separated by AND separately
-* [ARROW-9803](https://issues.apache.org/jira/browse/ARROW-9803) - [Go] Add initial support for s390x
-* [ARROW-9804](https://issues.apache.org/jira/browse/ARROW-9804) - [FlightRPC] Authentication Redesign
-* [ARROW-9828](https://issues.apache.org/jira/browse/ARROW-9828) - [Rust] [DataFusion] TableProvider trait should support predicate push-down
-* [ARROW-9861](https://issues.apache.org/jira/browse/ARROW-9861) - [Java] Failed Arrow Vector on big-endian platform
-* [ARROW-9862](https://issues.apache.org/jira/browse/ARROW-9862) - Throw an exception in UnsafeDirectLittleEndian on Big-Endian platform
-* [ARROW-9911](https://issues.apache.org/jira/browse/ARROW-9911) - [Rust][DataFusion] SELECT <expression\> with no FROM clause should produce a single row of output
-* [ARROW-9945](https://issues.apache.org/jira/browse/ARROW-9945) - [C++][Dataset] Refactor Expression::Assume to return a Result
-* [ARROW-9991](https://issues.apache.org/jira/browse/ARROW-9991) - [C++] split kernels for strings/binary
-* [ARROW-10002](https://issues.apache.org/jira/browse/ARROW-10002) - [Rust] Trait-specialization requires nightly
-* [ARROW-10021](https://issues.apache.org/jira/browse/ARROW-10021) - [C++][Compute] Support finding nth frequently used value in mode kernel
-* [ARROW-10032](https://issues.apache.org/jira/browse/ARROW-10032) - [Documentation] C++ Windows docs are out of date
-* [ARROW-10079](https://issues.apache.org/jira/browse/ARROW-10079) - [Rust] Benchmark and improve count\_set\_bits function
-* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes
-* [ARROW-10097](https://issues.apache.org/jira/browse/ARROW-10097) - [C++] Persist SetLookupState in between usages of IsIn when filtering dataset batches
-* [ARROW-10106](https://issues.apache.org/jira/browse/ARROW-10106) - [FlightRPC][Java] Expose onIsReady() callback on OutboundStreamListener
-* [ARROW-10108](https://issues.apache.org/jira/browse/ARROW-10108) - [Rust] [Parquet] Fix compiler warning about unused return value
-* [ARROW-10109](https://issues.apache.org/jira/browse/ARROW-10109) - [Rust] Add support to produce a C Data interface
-* [ARROW-10110](https://issues.apache.org/jira/browse/ARROW-10110) - [Rust] Add support to consume C Data Interface
-* [ARROW-10131](https://issues.apache.org/jira/browse/ARROW-10131) - [C++][Dataset] Lazily parse parquet metadata / statistics in ParquetDatasetFactory and ParquetFileFragment
-* [ARROW-10135](https://issues.apache.org/jira/browse/ARROW-10135) - [Rust] [Parquet] Refactor file module to help adding sources
-* [ARROW-10143](https://issues.apache.org/jira/browse/ARROW-10143) - [C++] ArrayRangeEquals should accept EqualOptions
-* [ARROW-10144](https://issues.apache.org/jira/browse/ARROW-10144) - [Flight] Add support for using the TLS\_SNI extension
-* [ARROW-10149](https://issues.apache.org/jira/browse/ARROW-10149) - [Rust] Add support to external release of un-owned buffers
-* [ARROW-10163](https://issues.apache.org/jira/browse/ARROW-10163) - [Rust] [DataFusion] Add DictionaryArray coercion support
-* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields
-* [ARROW-10173](https://issues.apache.org/jira/browse/ARROW-10173) - [Rust][DataFusion] Improve performance of equality to a constant predicate support
-* [ARROW-10180](https://issues.apache.org/jira/browse/ARROW-10180) - [C++][Doc] Update dependency management docs following aws-sdk-cpp addition
-* [ARROW-10182](https://issues.apache.org/jira/browse/ARROW-10182) - [C++] Add basic continuation support to futures
-* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip tests for single column batches
-* [ARROW-10197](https://issues.apache.org/jira/browse/ARROW-10197) - [Gandiva][Python] Execute expression on filtered data
-* [ARROW-10203](https://issues.apache.org/jira/browse/ARROW-10203) - [Doc] Capture guidance for endianness support in contributors guide.
-* [ARROW-10207](https://issues.apache.org/jira/browse/ARROW-10207) - [C++] Unary kernels that result in a list have no preallocated offset buffer
-* [ARROW-10208](https://issues.apache.org/jira/browse/ARROW-10208) - [C++] String split kernels do not propagate nulls correctly on sliced input
-* [ARROW-10216](https://issues.apache.org/jira/browse/ARROW-10216) - [Rust] Simd implementation of min/max aggregation kernels for primitive types
-* [ARROW-10224](https://issues.apache.org/jira/browse/ARROW-10224) - [Python] Add support for Python 3.9 except macOS wheel and Windows wheel
-* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests
-* [ARROW-10228](https://issues.apache.org/jira/browse/ARROW-10228) - [Julia] Donate Julia Implementation
-* [ARROW-10236](https://issues.apache.org/jira/browse/ARROW-10236) - [Rust] [DataFusion] Make DataFusion casting rules consistent with cast kernel
-* [ARROW-10241](https://issues.apache.org/jira/browse/ARROW-10241) - [C++][Compute] Add variance kernel benchmark
-* [ARROW-10249](https://issues.apache.org/jira/browse/ARROW-10249) - [Rust]: Support Dictionary types for ListArrays in arrow json reader
-* [ARROW-10259](https://issues.apache.org/jira/browse/ARROW-10259) - [Rust] Support field metadata
-* [ARROW-10261](https://issues.apache.org/jira/browse/ARROW-10261) - [Rust] [BREAKING] Lists should take Field instead of DataType
-* [ARROW-10263](https://issues.apache.org/jira/browse/ARROW-10263) - [C++][Compute] Improve numerical stability of variances merging
-* [ARROW-10268](https://issues.apache.org/jira/browse/ARROW-10268) - [Rust] Support writing dictionaries to IPC file and stream
-* [ARROW-10269](https://issues.apache.org/jira/browse/ARROW-10269) - [Rust] Update nightly: Oct 2020 Edition
-* [ARROW-10277](https://issues.apache.org/jira/browse/ARROW-10277) - [C++] Support comparing scalars approximately
-* [ARROW-10289](https://issues.apache.org/jira/browse/ARROW-10289) - [Rust] Support reading dictionary streams
-* [ARROW-10292](https://issues.apache.org/jira/browse/ARROW-10292) - [Rust] [DataFusion] Simplify merge
-* [ARROW-10295](https://issues.apache.org/jira/browse/ARROW-10295) - [Rust] [DataFusion] Simplify accumulators
-* [ARROW-10300](https://issues.apache.org/jira/browse/ARROW-10300) - [Rust] Improve benchmark documentation for generating/converting TPC-H data
-* [ARROW-10301](https://issues.apache.org/jira/browse/ARROW-10301) - [C++] Add "all" boolean reducing kernel
-* [ARROW-10302](https://issues.apache.org/jira/browse/ARROW-10302) - [Python] Don't double-package plasma-store-server
-* [ARROW-10304](https://issues.apache.org/jira/browse/ARROW-10304) - [C++][Compute] Optimize variance kernel for integers
-* [ARROW-10310](https://issues.apache.org/jira/browse/ARROW-10310) - [C++][Gandiva] Add single argument round() in Gandiva
-* [ARROW-10311](https://issues.apache.org/jira/browse/ARROW-10311) - [Release] Update crossbow verification process
-* [ARROW-10313](https://issues.apache.org/jira/browse/ARROW-10313) - [C++] Improve UTF8 validation speed and CSV string conversion
-* [ARROW-10318](https://issues.apache.org/jira/browse/ARROW-10318) - [C++] Use pimpl idiom in CSV parser
-* [ARROW-10319](https://issues.apache.org/jira/browse/ARROW-10319) - [Flight][Go] Add Context to Client Auth Handler functions for Flight
-* [ARROW-10320](https://issues.apache.org/jira/browse/ARROW-10320) - [Rust] Convert RecordBatchIterator to a Stream
-* [ARROW-10322](https://issues.apache.org/jira/browse/ARROW-10322) - [C++][Dataset] Minimize Expression to a wrapper around compute::Function
-* [ARROW-10323](https://issues.apache.org/jira/browse/ARROW-10323) - [Release][wheel] Add missing verification setup step
-* [ARROW-10325](https://issues.apache.org/jira/browse/ARROW-10325) - [C++][Compute] Separate aggregate kernel registration
-* [ARROW-10328](https://issues.apache.org/jira/browse/ARROW-10328) - [C++] Consider using fast-double-parser
-* [ARROW-10330](https://issues.apache.org/jira/browse/ARROW-10330) - [Rust][Datafusion] Implement nullif() function for DataFusion
-* [ARROW-10331](https://issues.apache.org/jira/browse/ARROW-10331) - [Rust] [DataFusion] Re-organize errors
-* [ARROW-10332](https://issues.apache.org/jira/browse/ARROW-10332) - [Rust] Allow CSV reader to start from a line
-* [ARROW-10334](https://issues.apache.org/jira/browse/ARROW-10334) - [Rust] [Parquet] Support reading and writing Arrow NullArray
-* [ARROW-10336](https://issues.apache.org/jira/browse/ARROW-10336) - [Rust] Support fromIter and toIter for strings
-* [ARROW-10337](https://issues.apache.org/jira/browse/ARROW-10337) - [C++] More liberal parsing of ISO8601 timestamps with fractional seconds
-* [ARROW-10338](https://issues.apache.org/jira/browse/ARROW-10338) - [Rust]: Use const fn for applicable methods
-* [ARROW-10340](https://issues.apache.org/jira/browse/ARROW-10340) - [Packaging][deb][RPM] Use Python 3.8 for pygit2
-* [ARROW-10356](https://issues.apache.org/jira/browse/ARROW-10356) - [Rust] [DataFusion] Add support for is\_in
-* [ARROW-10363](https://issues.apache.org/jira/browse/ARROW-10363) - [Python] Remove workaround for CMake bug in manylinux
-* [ARROW-10366](https://issues.apache.org/jira/browse/ARROW-10366) - [Rust] [DataFusion] Remove collect from merge
-* [ARROW-10375](https://issues.apache.org/jira/browse/ARROW-10375) - [Rust] Remove PrimitiveArrayOps
-* [ARROW-10378](https://issues.apache.org/jira/browse/ARROW-10378) - [Rust] Update take() kernel with support for large lists
-* [ARROW-10381](https://issues.apache.org/jira/browse/ARROW-10381) - [Rust] Generalize Arrow to support MergeSort
-* [ARROW-10382](https://issues.apache.org/jira/browse/ARROW-10382) - [Rust] Fix typos and spelling
-* [ARROW-10383](https://issues.apache.org/jira/browse/ARROW-10383) - [Doc] Fix typos and spelling
-* [ARROW-10384](https://issues.apache.org/jira/browse/ARROW-10384) - [C++] Fix typos and spelling
-* [ARROW-10385](https://issues.apache.org/jira/browse/ARROW-10385) - [C++][Gandiva] Add support for LLVM 11
-* [ARROW-10389](https://issues.apache.org/jira/browse/ARROW-10389) - [Rust][DataFusion] Make the custom source implementation API more explicit
-* [ARROW-10392](https://issues.apache.org/jira/browse/ARROW-10392) - [C++][Gandiva] Avoid string copy while evaluating IN expression
-* [ARROW-10396](https://issues.apache.org/jira/browse/ARROW-10396) - [Rust] [Parquet] Expose SliceableCursor and FileSource
-* [ARROW-10398](https://issues.apache.org/jira/browse/ARROW-10398) - [Rust] [Parquet] Re-export parquet::record::api::Field
-* [ARROW-10400](https://issues.apache.org/jira/browse/ARROW-10400) - Propagate TLS client peer\_identity when using mutual TLS
-* [ARROW-10402](https://issues.apache.org/jira/browse/ARROW-10402) - [Rust] Improve array equality
-* [ARROW-10407](https://issues.apache.org/jira/browse/ARROW-10407) - [C++] Division Support in Decimal256
-* [ARROW-10408](https://issues.apache.org/jira/browse/ARROW-10408) - [Java] Upgrade Avro dependency to 1.10
-* [ARROW-10410](https://issues.apache.org/jira/browse/ARROW-10410) - [Rust] Some refactorings
-* [ARROW-10416](https://issues.apache.org/jira/browse/ARROW-10416) - [R] Support Tables in Flight
-* [ARROW-10422](https://issues.apache.org/jira/browse/ARROW-10422) - [Rust] Removed unused BinaryArrayBuilder
-* [ARROW-10424](https://issues.apache.org/jira/browse/ARROW-10424) - [Rust] Simplify code for impl PrimitiveArray
-* [ARROW-10428](https://issues.apache.org/jira/browse/ARROW-10428) - [FlightRPC][Java] Add support for HTTP cookies
-* [ARROW-10445](https://issues.apache.org/jira/browse/ARROW-10445) - [Rust] Add DoubleEnded to PrimitiveArrayIter
-* [ARROW-10449](https://issues.apache.org/jira/browse/ARROW-10449) - [Rust] Make dictionary keys be a PrimitiveArray
-* [ARROW-10454](https://issues.apache.org/jira/browse/ARROW-10454) - [Rust][Datafusion] support creating ParquetExec from externally resolved file list and schema
-* [ARROW-10455](https://issues.apache.org/jira/browse/ARROW-10455) - [Rust] Fix CI cache misses on windows
-* [ARROW-10458](https://issues.apache.org/jira/browse/ARROW-10458) - [Rust] [Datafusion] context.create\_logical\_plan should not take a mutable self reference
-* [ARROW-10464](https://issues.apache.org/jira/browse/ARROW-10464) - [Rust] Implement utility to convert TPC-H tbl files to CSV and Parquet
-* [ARROW-10466](https://issues.apache.org/jira/browse/ARROW-10466) - [Rust] [Website] Update implementation status page
-* [ARROW-10467](https://issues.apache.org/jira/browse/ARROW-10467) - [FlightRPC][Java] Ability to pass arbitrary client properties to server
-* [ARROW-10468](https://issues.apache.org/jira/browse/ARROW-10468) - [C++][Compute] Refactor FunctionExecutor -\> KernelExecutor
-* [ARROW-10476](https://issues.apache.org/jira/browse/ARROW-10476) - [Rust] Allow string array to be built from iterator of &str
-* [ARROW-10477](https://issues.apache.org/jira/browse/ARROW-10477) - [Rust] Add support for iterators over binary arrays
-* [ARROW-10478](https://issues.apache.org/jira/browse/ARROW-10478) - [Dev][Release] Correct Java versions to 3.0.0-SNAPSHOT
-* [ARROW-10481](https://issues.apache.org/jira/browse/ARROW-10481) - [R] Bindings to add, remove, replace Table columns
-* [ARROW-10483](https://issues.apache.org/jira/browse/ARROW-10483) - [C++] Move Executor into a separate header
-* [ARROW-10484](https://issues.apache.org/jira/browse/ARROW-10484) - [C++] Future<{void,Status}\> could be more generic
-* [ARROW-10487](https://issues.apache.org/jira/browse/ARROW-10487) - [FlightRPC][C++] Header-based auth in clients
-* [ARROW-10490](https://issues.apache.org/jira/browse/ARROW-10490) - [C++][GLib] Fail to build with Xcode 12.0.1
-* [ARROW-10492](https://issues.apache.org/jira/browse/ARROW-10492) - [Java][JDBC] Allow users to config the mapping between SQL types and Arrow types
-* [ARROW-10504](https://issues.apache.org/jira/browse/ARROW-10504) - [C++] Suppress UBSAN pointer-overflow warning in RapidJSON
-* [ARROW-10510](https://issues.apache.org/jira/browse/ARROW-10510) - [Rust] [DataFusion] Add benchmarks for COUNT(DISTINCT)
-* [ARROW-10515](https://issues.apache.org/jira/browse/ARROW-10515) - [Julia][Doc] Update lists of supported languages to include Julia
-* [ARROW-10522](https://issues.apache.org/jira/browse/ARROW-10522) - [R] Allow rename Table and RecordBatch columns with names()
-* [ARROW-10526](https://issues.apache.org/jira/browse/ARROW-10526) - [FlightRPC][C++] HTTP cookie handling in clients
-* [ARROW-10530](https://issues.apache.org/jira/browse/ARROW-10530) - [R] Optionally use distro package in linuxlibs.R
-* [ARROW-10531](https://issues.apache.org/jira/browse/ARROW-10531) - [Rust] [DataFusion] Better display for logical plans: Graphviz and Schema information
-* [ARROW-10539](https://issues.apache.org/jira/browse/ARROW-10539) - [Packaging][Python] Use GitHub Actions to build wheels for Windows
-* [ARROW-10540](https://issues.apache.org/jira/browse/ARROW-10540) - [Rust] Allow unary kernels of arbitrary array types
-* [ARROW-10541](https://issues.apache.org/jira/browse/ARROW-10541) - [C++] Add re2 library to core arrow / ARROW\_WITH\_RE2
-* [ARROW-10542](https://issues.apache.org/jira/browse/ARROW-10542) - [C\#][Flight] Add beginning on flight code for net core
-* [ARROW-10543](https://issues.apache.org/jira/browse/ARROW-10543) - [Developer] Update dev instructions to note there may be a timelag
-* [ARROW-10552](https://issues.apache.org/jira/browse/ARROW-10552) - [Rust] Remove un-used Result from Buffer
-* [ARROW-10559](https://issues.apache.org/jira/browse/ARROW-10559) - [Rust] [DataFusion] Break up logical\_plan/mod.rs into smaller modules
-* [ARROW-10561](https://issues.apache.org/jira/browse/ARROW-10561) - [Rust] Simplify \`MutableBuffer::write\` and \`MutableBuffer::write\_bytes\`
-* [ARROW-10562](https://issues.apache.org/jira/browse/ARROW-10562) - [Rust] Potential UB on unsafe code
-* [ARROW-10566](https://issues.apache.org/jira/browse/ARROW-10566) - [C++] Array validation should work on ArrayData
-* [ARROW-10567](https://issues.apache.org/jira/browse/ARROW-10567) - [C++][FlightRPC] Add options to help increase precision of arrow-flight-benchmark
-* [ARROW-10572](https://issues.apache.org/jira/browse/ARROW-10572) - [Rust][DataFusion] Use aHash and std::collections hashmap for aggregates / distinct
-* [ARROW-10574](https://issues.apache.org/jira/browse/ARROW-10574) - [Python][Parquet] Allow collections for 'in' / 'not in' filter (in addition to sets)
-* [ARROW-10575](https://issues.apache.org/jira/browse/ARROW-10575) - [Rust] Rename union.rs to be consistent with other arrays
-* [ARROW-10581](https://issues.apache.org/jira/browse/ARROW-10581) - [Doc] IPC dictionary reference to relevant section
-* [ARROW-10582](https://issues.apache.org/jira/browse/ARROW-10582) - [Rust] [DataFusion] Implement "repartition" operator
-* [ARROW-10584](https://issues.apache.org/jira/browse/ARROW-10584) - [Rust] [DataFusion] Implement SQL join support using explicit JOIN ON syntax
-* [ARROW-10585](https://issues.apache.org/jira/browse/ARROW-10585) - [Rust] [DataFusion] Add join support to DataFrame and LogicalPlan
-* [ARROW-10586](https://issues.apache.org/jira/browse/ARROW-10586) - [Rust] [DataFusion] Add join support to query planner
-* [ARROW-10589](https://issues.apache.org/jira/browse/ARROW-10589) - [Rust]: Implement AVX-512 bit and operation
-* [ARROW-10590](https://issues.apache.org/jira/browse/ARROW-10590) - [Rust] Remove Date32(Millisecond) from test
-* [ARROW-10591](https://issues.apache.org/jira/browse/ARROW-10591) - [Rust] Add support for StructArrays to MutableArrayData
-* [ARROW-10595](https://issues.apache.org/jira/browse/ARROW-10595) - [Rust] Simplify inner loop of min/max kernels for non-null case
-* [ARROW-10596](https://issues.apache.org/jira/browse/ARROW-10596) - [Rust] Improve take benchmark
-* [ARROW-10598](https://issues.apache.org/jira/browse/ARROW-10598) - [C++] Improve performance of GenerateBitsUnrolled
-* [ARROW-10604](https://issues.apache.org/jira/browse/ARROW-10604) - [Ruby] Support Decimal256 type
-* [ARROW-10607](https://issues.apache.org/jira/browse/ARROW-10607) - [C++][Parquet] Support Reading/Writing Decimal256 type in Parquet
-* [ARROW-10609](https://issues.apache.org/jira/browse/ARROW-10609) - [Rust] Optimize min/max of non null strings
-* [ARROW-10628](https://issues.apache.org/jira/browse/ARROW-10628) - [Rust] Make clippy error on clippy warnings
-* [ARROW-10633](https://issues.apache.org/jira/browse/ARROW-10633) - [Rust][DataFusion] Dependency version upgrades
-* [ARROW-10634](https://issues.apache.org/jira/browse/ARROW-10634) - [C\#][CI] Change the build version from 2.2 to 3.1 in CI
-* [ARROW-10636](https://issues.apache.org/jira/browse/ARROW-10636) - [Rust] Remove specialisation from Rust parquet
-* [ARROW-10637](https://issues.apache.org/jira/browse/ARROW-10637) - [Rust] Add examples to boolean kernels
-* [ARROW-10638](https://issues.apache.org/jira/browse/ARROW-10638) - [Rust] Improve tests of boolean kernels
-* [ARROW-10639](https://issues.apache.org/jira/browse/ARROW-10639) - [Rust] Simplify signature of is\_null and add example
-* [ARROW-10644](https://issues.apache.org/jira/browse/ARROW-10644) - [Python] Consolidate path/filesystem handling in pyarrow.dataset and pyarrow.fs
-* [ARROW-10646](https://issues.apache.org/jira/browse/ARROW-10646) - [C++][FlightRPC] Disable flaky test
-* [ARROW-10648](https://issues.apache.org/jira/browse/ARROW-10648) - [Java] Prepare Java codebase for source release without requiring any git tags to be created or pushed
-* [ARROW-10651](https://issues.apache.org/jira/browse/ARROW-10651) - [C++] alloc-dealloc-mismatch in s3fs.cc
-* [ARROW-10652](https://issues.apache.org/jira/browse/ARROW-10652) - [C++][Gandiva] Make gandiva cache size configurable
-* [ARROW-10653](https://issues.apache.org/jira/browse/ARROW-10653) - [Rust]: Update toolchain version to bring new features
-* [ARROW-10654](https://issues.apache.org/jira/browse/ARROW-10654) - [Rust] Specialize parsing of floats / bools
-* [ARROW-10660](https://issues.apache.org/jira/browse/ARROW-10660) - [Rust] Implement AVX-512 bit or operation
-* [ARROW-10665](https://issues.apache.org/jira/browse/ARROW-10665) - [Rust] Add fast paths for common utf8 like patterns
-* [ARROW-10666](https://issues.apache.org/jira/browse/ARROW-10666) - [Rust] [DataFusion] Support nested SELECT statements
-* [ARROW-10669](https://issues.apache.org/jira/browse/ARROW-10669) - [C++][Compute] Support Scalar inputs to boolean kernels
-* [ARROW-10672](https://issues.apache.org/jira/browse/ARROW-10672) - [Rust] [DataFusion] Make limit be computed as a stream
-* [ARROW-10673](https://issues.apache.org/jira/browse/ARROW-10673) - [Rust] [DataFusion] Make sort be computed on the stream
-* [ARROW-10674](https://issues.apache.org/jira/browse/ARROW-10674) - [Rust] Add integration tests for Decimal type
-* [ARROW-10677](https://issues.apache.org/jira/browse/ARROW-10677) - [Rust] Fix Bug and Add tests as documentation showing supported csv parsing
-* [ARROW-10679](https://issues.apache.org/jira/browse/ARROW-10679) - [Rust] [DataFusion] Implement SQL CASE WHEN physical expression
-* [ARROW-10680](https://issues.apache.org/jira/browse/ARROW-10680) - [Rust] [DataFusion] Implement TPC-H Query 12
-* [ARROW-10682](https://issues.apache.org/jira/browse/ARROW-10682) - [Rust] Sort kernel performance tuning
-* [ARROW-10685](https://issues.apache.org/jira/browse/ARROW-10685) - [Rust] [DataFusion] Add join support to the filter pushdown optimizer
-* [ARROW-10688](https://issues.apache.org/jira/browse/ARROW-10688) - [Rust] [DataFusion] Support CASE WHEN from DataFrame API
-* [ARROW-10689](https://issues.apache.org/jira/browse/ARROW-10689) - [Rust] [DataFusion] Support CASE WHEN from SQL
-* [ARROW-10693](https://issues.apache.org/jira/browse/ARROW-10693) - [Rust] [DataFusion] Add support for the left join
-* [ARROW-10696](https://issues.apache.org/jira/browse/ARROW-10696) - [C++] Investigate a bit run reader that would only return runs of set bits
-* [ARROW-10697](https://issues.apache.org/jira/browse/ARROW-10697) - [C++] Consolidate bitmap word readers
-* [ARROW-10703](https://issues.apache.org/jira/browse/ARROW-10703) - [Rust] [DataFusion] Make join not collect left on every part
-* [ARROW-10704](https://issues.apache.org/jira/browse/ARROW-10704) - [Rust][DataFusion] Remove Nested from expression enum
-* [ARROW-10708](https://issues.apache.org/jira/browse/ARROW-10708) - [Packaging][deb] Add support for Ubuntu 20.10
-* [ARROW-10709](https://issues.apache.org/jira/browse/ARROW-10709) - [Python] Difficult to make an efficient zero-copy file reader in Python
-* [ARROW-10712](https://issues.apache.org/jira/browse/ARROW-10712) - [Rust] [DataFusion] Add tests to TPC-H benchmarks
-* [ARROW-10717](https://issues.apache.org/jira/browse/ARROW-10717) - [Rust] [DataFusion] Add support for right join
-* [ARROW-10720](https://issues.apache.org/jira/browse/ARROW-10720) - [C++] Add BasicDecimal256 Rescale Support
-* [ARROW-10721](https://issues.apache.org/jira/browse/ARROW-10721) - [C\#][CI] Use .NET 3.1 by default
-* [ARROW-10722](https://issues.apache.org/jira/browse/ARROW-10722) - [Rust][DataFusion] Reduce overhead in data types in aggregations / joins, improve benchmarks
-* [ARROW-10723](https://issues.apache.org/jira/browse/ARROW-10723) - [Packaging][deb][RPM] Enable Parquet encryption
-* [ARROW-10724](https://issues.apache.org/jira/browse/ARROW-10724) - [Developer Tools] Add labeler for when PRs need rebase
-* [ARROW-10725](https://issues.apache.org/jira/browse/ARROW-10725) - [Python][Compute] Exposing bindings for sort options
-* [ARROW-10728](https://issues.apache.org/jira/browse/ARROW-10728) - [Rust] [DataFusion] Add SQL support for JOIN with USING clause
-* [ARROW-10729](https://issues.apache.org/jira/browse/ARROW-10729) - [Rust] [DataFusion] Add SQL support for JOIN using implicit syntax
-* [ARROW-10732](https://issues.apache.org/jira/browse/ARROW-10732) - [Rust] [DataFusion] Add SQL support for table/relation aliases and compound identifiers
-* [ARROW-10733](https://issues.apache.org/jira/browse/ARROW-10733) - [R] Improvements to Linux installation troubleshooting
-* [ARROW-10740](https://issues.apache.org/jira/browse/ARROW-10740) - [Rust][DataFusion] Remove redundant clones found by clippy
-* [ARROW-10741](https://issues.apache.org/jira/browse/ARROW-10741) - Apply clippy lints to source code, remove them from ignore list
-* [ARROW-10742](https://issues.apache.org/jira/browse/ARROW-10742) - [Python] Mask not checked when creating array from numpy array
-* [ARROW-10745](https://issues.apache.org/jira/browse/ARROW-10745) - [Rust] Allocate padding bytes in filter context
-* [ARROW-10747](https://issues.apache.org/jira/browse/ARROW-10747) - [Rust] Optimizations for csv reader
-* [ARROW-10750](https://issues.apache.org/jira/browse/ARROW-10750) - [Rust] [DataFusion] Add SQL support for LEFT and RIGHT join
-* [ARROW-10752](https://issues.apache.org/jira/browse/ARROW-10752) - [GLib] Add garrow\_schema\_has\_metadata()
-* [ARROW-10754](https://issues.apache.org/jira/browse/ARROW-10754) - [GLib] Add support for metadata to GArrowField
-* [ARROW-10755](https://issues.apache.org/jira/browse/ARROW-10755) - [Rust] [Parquet] Add support for writing boolean type
-* [ARROW-10756](https://issues.apache.org/jira/browse/ARROW-10756) - [Rust] Clippy - fix redundant clone
-* [ARROW-10759](https://issues.apache.org/jira/browse/ARROW-10759) - [Rust][DataFusion] Implement support for casting string to date in sql expressions
-* [ARROW-10763](https://issues.apache.org/jira/browse/ARROW-10763) - [Rust] Speed up take kernels
-* [ARROW-10765](https://issues.apache.org/jira/browse/ARROW-10765) - [Rust] Optimize take strings for non-null arrays
-* [ARROW-10767](https://issues.apache.org/jira/browse/ARROW-10767) - [Rust] Speed up sum kernel with nulls
-* [ARROW-10770](https://issues.apache.org/jira/browse/ARROW-10770) - [Rust] Support reading nested JSON lists
-* [ARROW-10772](https://issues.apache.org/jira/browse/ARROW-10772) - [Rust] Improve take performance
-* [ARROW-10775](https://issues.apache.org/jira/browse/ARROW-10775) - [Rust][DataFusion] Use ahash in hash join
-* [ARROW-10776](https://issues.apache.org/jira/browse/ARROW-10776) - [C++] Provide iterator access to primitive elements inside an Array
-* [ARROW-10781](https://issues.apache.org/jira/browse/ARROW-10781) - [Rust] [DataFusion] TableProvider should provide row count statistics
-* [ARROW-10783](https://issues.apache.org/jira/browse/ARROW-10783) - [Rust] [DataFusion] Implement row count statistics for Parquet TableProvider
-* [ARROW-10785](https://issues.apache.org/jira/browse/ARROW-10785) - Further optimize take string
-* [ARROW-10786](https://issues.apache.org/jira/browse/ARROW-10786) - [Packaging][RPM] Drop support for CentOS 6
-* [ARROW-10788](https://issues.apache.org/jira/browse/ARROW-10788) - [C++] Make S3 recursive walks parallel
-* [ARROW-10789](https://issues.apache.org/jira/browse/ARROW-10789) - [Rust][DataFusion] Make TableProvider dynamically typed
-* [ARROW-10790](https://issues.apache.org/jira/browse/ARROW-10790) - [C++][Compute] Investigate ChunkedArray sort performance
-* [ARROW-10792](https://issues.apache.org/jira/browse/ARROW-10792) - [Rust] [CI] Modularize CI for faster and smaller builds
-* [ARROW-10795](https://issues.apache.org/jira/browse/ARROW-10795) - [Rust] Fix specialization for arrow datatypes
-* [ARROW-10796](https://issues.apache.org/jira/browse/ARROW-10796) - [C++] Investigate RecordBatch sort performance
-* [ARROW-10800](https://issues.apache.org/jira/browse/ARROW-10800) - [Rust] [Parquet] Provide access to the elements of parquet::record::{List, Map}
-* [ARROW-10802](https://issues.apache.org/jira/browse/ARROW-10802) - [C++] Remove Dictionary[NullType] special casing in parquet column writer
-* [ARROW-10808](https://issues.apache.org/jira/browse/ARROW-10808) - [Rust] [DataFusion] Support nested expressions in aggregations
-* [ARROW-10809](https://issues.apache.org/jira/browse/ARROW-10809) - [C++] Use Datum for SortIndices() input
-* [ARROW-10812](https://issues.apache.org/jira/browse/ARROW-10812) - [Rust] Make BooleanArray not a PrimitiveArray
-* [ARROW-10813](https://issues.apache.org/jira/browse/ARROW-10813) - [Rust] [DataFusion] Implement DFSchema
-* [ARROW-10814](https://issues.apache.org/jira/browse/ARROW-10814) - [Packaging][deb] Drop support for Debian GNU/Linux Stretch
-* [ARROW-10817](https://issues.apache.org/jira/browse/ARROW-10817) - [Rust] [DataFusion] Implement TypedString
-* [ARROW-10820](https://issues.apache.org/jira/browse/ARROW-10820) - [Rust] [DataFusion] Complete TPC-H Benchmark Queries
-* [ARROW-10821](https://issues.apache.org/jira/browse/ARROW-10821) - [Rust] [Datafusion] implement negative expression
-* [ARROW-10822](https://issues.apache.org/jira/browse/ARROW-10822) - [Rust] [Datafusion] support compiling datafusion with simd support
-* [ARROW-10824](https://issues.apache.org/jira/browse/ARROW-10824) - [Rust] Added PartialEq for NullArray
-* [ARROW-10825](https://issues.apache.org/jira/browse/ARROW-10825) - [Rust] Add support for NullArrays to MutableArrayData
-* [ARROW-10826](https://issues.apache.org/jira/browse/ARROW-10826) - [Rust] Add support for FixedSizeBinary to MutableArrayData
-* [ARROW-10827](https://issues.apache.org/jira/browse/ARROW-10827) - [Rust] Extend concatenate to all types
-* [ARROW-10828](https://issues.apache.org/jira/browse/ARROW-10828) - [Rust][DataFusion] Enable more clippy lints
-* [ARROW-10829](https://issues.apache.org/jira/browse/ARROW-10829) - [Rust] [DataFusion] Implement Into<Schema\> for DFSchema
-* [ARROW-10832](https://issues.apache.org/jira/browse/ARROW-10832) - [Rust] Evaluate latest snapshot flatc
-* [ARROW-10836](https://issues.apache.org/jira/browse/ARROW-10836) - [Rust] Extend take kernel to FixedSizeListArray
-* [ARROW-10838](https://issues.apache.org/jira/browse/ARROW-10838) - [Rust] [CI] Add CI for wasm32 target
-* [ARROW-10839](https://issues.apache.org/jira/browse/ARROW-10839) - [Rust] [DataFusion] Implement BETWEEN Operator
-* [ARROW-10843](https://issues.apache.org/jira/browse/ARROW-10843) - [C++] Add support for temporal types in sort family kernels
-* [ARROW-10845](https://issues.apache.org/jira/browse/ARROW-10845) - [Python][CI] Add python CI build using numpy nightly
-* [ARROW-10849](https://issues.apache.org/jira/browse/ARROW-10849) - [Python] Handle numpy deprecation warnings for builtin type aliases
-* [ARROW-10851](https://issues.apache.org/jira/browse/ARROW-10851) - [C++] Reduce code size of vector\_sort.cc
-* [ARROW-10857](https://issues.apache.org/jira/browse/ARROW-10857) - [Packaging] Follow PowerTools repository name change on CentOS 8
-* [ARROW-10858](https://issues.apache.org/jira/browse/ARROW-10858) - [C++][MSVC] Add missing Boost dependency
-* [ARROW-10861](https://issues.apache.org/jira/browse/ARROW-10861) - [Python] Update minimal NumPy version to 1.16.6
-* [ARROW-10864](https://issues.apache.org/jira/browse/ARROW-10864) - [Rust] Use standard ordering for floats
-* [ARROW-10865](https://issues.apache.org/jira/browse/ARROW-10865) - [Rust][DataFusion] More ergonomic conversion between Schema, SchemaRef, DFSchema, and DFSchemaRef
-* [ARROW-10867](https://issues.apache.org/jira/browse/ARROW-10867) - build failure on aarch64 with -DARROW\_PYTHON=ON and gcc
-* [ARROW-10869](https://issues.apache.org/jira/browse/ARROW-10869) - [GLib] Add garrow\_\*\_sort\_indices() and related options
-* [ARROW-10870](https://issues.apache.org/jira/browse/ARROW-10870) - [Julia] Update website with Julia implementation
-* [ARROW-10871](https://issues.apache.org/jira/browse/ARROW-10871) - [Julia] Setup Julia CI via GitHub Actions
-* [ARROW-10873](https://issues.apache.org/jira/browse/ARROW-10873) - [C++] Apple Silicon is reported as arm64 in CMake
-* [ARROW-10874](https://issues.apache.org/jira/browse/ARROW-10874) - [Rust][DataFusion] Add table statistics for MemTable
-* [ARROW-10877](https://issues.apache.org/jira/browse/ARROW-10877) - [Rust] [DataFusion] Add benchmark based on kaggle movies
-* [ARROW-10878](https://issues.apache.org/jira/browse/ARROW-10878) - [Rust] Simplify extend\_from\_slice
-* [ARROW-10879](https://issues.apache.org/jira/browse/ARROW-10879) - [Packaging][deb] Restore Debian GNU/Linux Buster configuration
-* [ARROW-10881](https://issues.apache.org/jira/browse/ARROW-10881) - [C++] EXC\_BAD\_ACCESS in BaseSetBitRunReader<false\>::NextRun
-* [ARROW-10885](https://issues.apache.org/jira/browse/ARROW-10885) - [Rust][DataFusion] Optimize join build vs probe based on statistics on row number
-* [ARROW-10887](https://issues.apache.org/jira/browse/ARROW-10887) - [C++][Doc] Document IPC API
-* [ARROW-10889](https://issues.apache.org/jira/browse/ARROW-10889) - [Rust] Document our approach to unsafe code in README
-* [ARROW-10890](https://issues.apache.org/jira/browse/ARROW-10890) - [Rust] [DataFusion] JOIN support
-* [ARROW-10891](https://issues.apache.org/jira/browse/ARROW-10891) - [Rust][DataFusion] More clippy lints
-* [ARROW-10893](https://issues.apache.org/jira/browse/ARROW-10893) - [Rust] [DataFusion] Easier clippy fixes
-* [ARROW-10896](https://issues.apache.org/jira/browse/ARROW-10896) - [C++][CMake] Rename internal RE2 package name to "re2" from "RE2"
-* [ARROW-10900](https://issues.apache.org/jira/browse/ARROW-10900) - [Rust][DataFusion] Resolve TableScan provider eagerly
-* [ARROW-10904](https://issues.apache.org/jira/browse/ARROW-10904) - [Python] Add support for Python 3.9 macOS wheels
-* [ARROW-10905](https://issues.apache.org/jira/browse/ARROW-10905) - [Python] Add support for Python 3.9 windows wheels
-* [ARROW-10908](https://issues.apache.org/jira/browse/ARROW-10908) - [Rust] [DataFusion] Update relevant tpch-queries with BETWEEN
-* [ARROW-10917](https://issues.apache.org/jira/browse/ARROW-10917) - [Rust][Doc] Update feature matrix
-* [ARROW-10918](https://issues.apache.org/jira/browse/ARROW-10918) - [C++][Doc] Document supported Parquet features
-* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary
-* [ARROW-10929](https://issues.apache.org/jira/browse/ARROW-10929) - [Rust] Migrate CI tests to stable rust
-* [ARROW-10933](https://issues.apache.org/jira/browse/ARROW-10933) - [Rust] Update docs in regard to stable rust
-* [ARROW-10934](https://issues.apache.org/jira/browse/ARROW-10934) - [Python] Tests fail with fsspec-0.8.5
-* [ARROW-10938](https://issues.apache.org/jira/browse/ARROW-10938) - [Rust] upgrade dependency "flatbuffers" to 0.8
-* [ARROW-10940](https://issues.apache.org/jira/browse/ARROW-10940) - [Rust] Extend sort kernel to ListArray
-* [ARROW-10941](https://issues.apache.org/jira/browse/ARROW-10941) - [Doc][C++] Document supported Parquet encryption features
-* [ARROW-10944](https://issues.apache.org/jira/browse/ARROW-10944) - [Rust] Implement min/max kernels for BooleanArray
-* [ARROW-10946](https://issues.apache.org/jira/browse/ARROW-10946) - [Rust] Make ChunkIter not depend on a buffer
-* [ARROW-10947](https://issues.apache.org/jira/browse/ARROW-10947) - [Rust][DataFusion] Refactor UTF8 to Date32 for Performance
-* [ARROW-10948](https://issues.apache.org/jira/browse/ARROW-10948) - [C++] Always use GTestConfig.cmake
-* [ARROW-10949](https://issues.apache.org/jira/browse/ARROW-10949) - [Rust] Avoid clones in getting values of boolean arrays
-* [ARROW-10951](https://issues.apache.org/jira/browse/ARROW-10951) - [Python][CI] Nightly pandas builds failing because of pytest monkeypatch issue
-* [ARROW-10952](https://issues.apache.org/jira/browse/ARROW-10952) - [Rust] Add pre-commit hook
-* [ARROW-10966](https://issues.apache.org/jira/browse/ARROW-10966) - [C++] Use FnOnce for ThreadPool's tasks instead of std::function
-* [ARROW-10968](https://issues.apache.org/jira/browse/ARROW-10968) - [Rust][DataFusion] Don't build hash table for right side of the join
-* [ARROW-10969](https://issues.apache.org/jira/browse/ARROW-10969) - [Rust][DataFusion] Implement ANSI SQL Functions
-* [ARROW-10985](https://issues.apache.org/jira/browse/ARROW-10985) - [Rust] Update unsafe guidelines for adding JIRA references
-* [ARROW-10986](https://issues.apache.org/jira/browse/ARROW-10986) - [Rust][DataFusion] Add average statistic to TPC-H benchmark too
-* [ARROW-10988](https://issues.apache.org/jira/browse/ARROW-10988) - [C++] Require CMake 3.5 or later
-* [ARROW-10989](https://issues.apache.org/jira/browse/ARROW-10989) - [Rust] Use slices for iterating primitive arrays
-* [ARROW-10993](https://issues.apache.org/jira/browse/ARROW-10993) - [CI][macOS] Fix Python 3.9 installation by Homebrew
-* [ARROW-10995](https://issues.apache.org/jira/browse/ARROW-10995) - [Rust] [DataFusion] Improve parallelism when reading Parquet files
-* [ARROW-11004](https://issues.apache.org/jira/browse/ARROW-11004) - [FlightRPC][Python] Header-based auth in clients
-* [ARROW-11005](https://issues.apache.org/jira/browse/ARROW-11005) - [Rust] Remove indirection from take kernel and simplify interface
-* [ARROW-11008](https://issues.apache.org/jira/browse/ARROW-11008) - [Rust][DataFusion] Simplify count accumulator
-* [ARROW-11009](https://issues.apache.org/jira/browse/ARROW-11009) - [Python] Add environment variable to elect default usage of system memory allocator instead of jemalloc/mimalloc
-* [ARROW-11010](https://issues.apache.org/jira/browse/ARROW-11010) - [Python] \`np.float\` deprecation warning in \`\_pandas\_logical\_type\_map\`
-* [ARROW-11012](https://issues.apache.org/jira/browse/ARROW-11012) - [Rust] [DataFusion] Make write\_csv and write\_parquet concurrent
-* [ARROW-11015](https://issues.apache.org/jira/browse/ARROW-11015) - [CI][Gandiva] Move gandiva nightly build from travis to github action
-* [ARROW-11018](https://issues.apache.org/jira/browse/ARROW-11018) - [Rust][DataFusion] Add null count column statistics
-* [ARROW-11026](https://issues.apache.org/jira/browse/ARROW-11026) - [Rust]: Run tests without requiring environment variables
-* [ARROW-11028](https://issues.apache.org/jira/browse/ARROW-11028) - [Rust] Somewhat pedantic pattern-matches
-* [ARROW-11029](https://issues.apache.org/jira/browse/ARROW-11029) - [Rust] [DataFusion] Document why join order optimization does not work with filter pushdown
-* [ARROW-11032](https://issues.apache.org/jira/browse/ARROW-11032) - [C++][FlightRPC] Add benchmark for local RPC through unix socket
-* [ARROW-11033](https://issues.apache.org/jira/browse/ARROW-11033) - [Rust] CSV writer performance improvements
-* [ARROW-11034](https://issues.apache.org/jira/browse/ARROW-11034) - [Rust] rustfmt cleanup
-* [ARROW-11035](https://issues.apache.org/jira/browse/ARROW-11035) - [Rust] Improve performance of cast to utf8 via FromIter
-* [ARROW-11037](https://issues.apache.org/jira/browse/ARROW-11037) - [Rust] Improve performance of string fromIter
-* [ARROW-11038](https://issues.apache.org/jira/browse/ARROW-11038) - [Rust] Remove \`BufferBuilderTrait\` and associated Result requirement.
-* [ARROW-11039](https://issues.apache.org/jira/browse/ARROW-11039) - [Rust] Improve performance for utf8 to float cast
-* [ARROW-11040](https://issues.apache.org/jira/browse/ARROW-11040) - [Rust] Simplify builders with generics
-* [ARROW-11042](https://issues.apache.org/jira/browse/ARROW-11042) - [Rust][DataFusion] Increase default batch size
-* [ARROW-11043](https://issues.apache.org/jira/browse/ARROW-11043) - [C++] Add "is\_nan" kernel
-* [ARROW-11046](https://issues.apache.org/jira/browse/ARROW-11046) - [Rust][DataFusion] Add count\_distinct to dataframe API
-* [ARROW-11049](https://issues.apache.org/jira/browse/ARROW-11049) - [Python] Expose alternate memory pools
-* [ARROW-11052](https://issues.apache.org/jira/browse/ARROW-11052) - [Rust] [DataFusion] Implement metrics in join operator
-* [ARROW-11053](https://issues.apache.org/jira/browse/ARROW-11053) - [Rust] [DataFusion] Optimize joins with dynamic capacity for output batches
-* [ARROW-11054](https://issues.apache.org/jira/browse/ARROW-11054) - Update SQLParser to 0.70
-* [ARROW-11055](https://issues.apache.org/jira/browse/ARROW-11055) - [Rust] [DataFusion] Support date\_trunc function
-* [ARROW-11058](https://issues.apache.org/jira/browse/ARROW-11058) - [Rust] [DataFusion] Implement "coalesce batches" operator
-* [ARROW-11063](https://issues.apache.org/jira/browse/ARROW-11063) - [Rust] Validate null counts when building arrays
-* [ARROW-11064](https://issues.apache.org/jira/browse/ARROW-11064) - [Rust][DataFusion] Speed up hash join on smaller batches
-* [ARROW-11072](https://issues.apache.org/jira/browse/ARROW-11072) - [Rust] [Parquet] Support int32 and int64 physical types
-* [ARROW-11076](https://issues.apache.org/jira/browse/ARROW-11076) - [Rust][DataFusion] Refactor usage of right indices in hash join
-* [ARROW-11079](https://issues.apache.org/jira/browse/ARROW-11079) - [R] Catch up on changelog since 2.0
-* [ARROW-11080](https://issues.apache.org/jira/browse/ARROW-11080) - [C++][Dataset] Improvements to implicit casting
-* [ARROW-11082](https://issues.apache.org/jira/browse/ARROW-11082) - [Rust] Add FFI for LargeUtf8
-* [ARROW-11086](https://issues.apache.org/jira/browse/ARROW-11086) - [Rust] Extend take to support more index types
-* [ARROW-11091](https://issues.apache.org/jira/browse/ARROW-11091) - [Rust][DataFusion] Fix clippy warning in rust 1.49
-* [ARROW-11095](https://issues.apache.org/jira/browse/ARROW-11095) - [Python] Access pyarrow.RecordBatch column by name
-* [ARROW-11096](https://issues.apache.org/jira/browse/ARROW-11096) - [Rust] Add FFI for [Large]Binary
-* [ARROW-11097](https://issues.apache.org/jira/browse/ARROW-11097) - [Rust] Simplify tests
-* [ARROW-11099](https://issues.apache.org/jira/browse/ARROW-11099) - [Rust]: Remove unsafe value\_slice method from PrimitiveArray and BooleanArray
-* [ARROW-11100](https://issues.apache.org/jira/browse/ARROW-11100) - [Rust] Speed up numeric to string cast using lexical\_core
-* [ARROW-11101](https://issues.apache.org/jira/browse/ARROW-11101) - [Rust] enable "cargo +nightly fmt" in git pre-commit hook
-* [ARROW-11104](https://issues.apache.org/jira/browse/ARROW-11104) - [GLib] Add append\_null/append\_nulls to GArrowArrayBuilder and use them
-* [ARROW-11105](https://issues.apache.org/jira/browse/ARROW-11105) - [Rust] Favor From/Into traits in MutableBuffer
-* [ARROW-11109](https://issues.apache.org/jira/browse/ARROW-11109) - [GLib] Add garrow\_array\_builder\_append\_empty\_value() and values()
-* [ARROW-11110](https://issues.apache.org/jira/browse/ARROW-11110) - [Rust] [Datafusion] context.table should not take a mutable self reference
-* [ARROW-11111](https://issues.apache.org/jira/browse/ARROW-11111) - [GLib] Add GArrowFixedSizeBinaryArrayBuilder
-* [ARROW-11121](https://issues.apache.org/jira/browse/ARROW-11121) - [Developer] Use pull\_request\_target for PR JIRA integration
-* [ARROW-11122](https://issues.apache.org/jira/browse/ARROW-11122) - [Rust] Add FFI for date and time
-* [ARROW-11124](https://issues.apache.org/jira/browse/ARROW-11124) - [Doc] Update status matrix for Decimal256
-* [ARROW-11125](https://issues.apache.org/jira/browse/ARROW-11125) - [Rust] Implement logical equality for list arrays
-* [ARROW-11126](https://issues.apache.org/jira/browse/ARROW-11126) - [Rust] Document and test ARROW-10656
-* [ARROW-11127](https://issues.apache.org/jira/browse/ARROW-11127) - [C++] Unused cpu\_info on non-x86 architecture
-* [ARROW-11129](https://issues.apache.org/jira/browse/ARROW-11129) - [Rust][DataFusion] Use tokio thread pool for loading parquet
-* [ARROW-11130](https://issues.apache.org/jira/browse/ARROW-11130) - [Website][CentOS 8][RHEL 8] Enable all required repositories by default
-* [ARROW-11131](https://issues.apache.org/jira/browse/ARROW-11131) - [Rust] Improve performance of bool\_equal
-* [ARROW-11136](https://issues.apache.org/jira/browse/ARROW-11136) - [R] Bindings for is.nan
-* [ARROW-11137](https://issues.apache.org/jira/browse/ARROW-11137) - [Rust][DataFusion] Fix Clippy needless\_range\_loop, needless\_lifetimes
-* [ARROW-11138](https://issues.apache.org/jira/browse/ARROW-11138) - [Rust] [DataFusion] Support ltrim, rtrim
-* [ARROW-11139](https://issues.apache.org/jira/browse/ARROW-11139) - [GLib] Add support for extension type
-* [ARROW-11155](https://issues.apache.org/jira/browse/ARROW-11155) - [C++][Packaging] Move gandiva crossbow jobs off of Travis-CI
-* [ARROW-11158](https://issues.apache.org/jira/browse/ARROW-11158) - [Julia] Implement Decimal256 support
-* [ARROW-11159](https://issues.apache.org/jira/browse/ARROW-11159) - [Developer] Consolidate pull request related jobs
-* [ARROW-11165](https://issues.apache.org/jira/browse/ARROW-11165) - [Rust] [DataFusion] Document the desired SQL dialect for DataFusion
-* [ARROW-11168](https://issues.apache.org/jira/browse/ARROW-11168) - [Rust] Fix cargo doc warnings
-* [ARROW-11169](https://issues.apache.org/jira/browse/ARROW-11169) - [Rust] Add a comment explaining where float total\_order algorithm came from
-* [ARROW-11175](https://issues.apache.org/jira/browse/ARROW-11175) - [R] Small docs fixes
-* [ARROW-11176](https://issues.apache.org/jira/browse/ARROW-11176) - [R] Expose memory pool name and document setting it
-* [ARROW-11187](https://issues.apache.org/jira/browse/ARROW-11187) - [Rust] [Parquet] Pin specific parquet-format-rs version
-* [ARROW-11188](https://issues.apache.org/jira/browse/ARROW-11188) - [Rust] Implement crypto functions from PostgreSQL dialect
-* [ARROW-11193](https://issues.apache.org/jira/browse/ARROW-11193) - [Documentation] Add docs for Java ListVector
-* [ARROW-11194](https://issues.apache.org/jira/browse/ARROW-11194) - [Rust] Enable SIMD for aarch64
-* [ARROW-11195](https://issues.apache.org/jira/browse/ARROW-11195) - [Rust] [DataFusion] Built-in table providers should expose relevant fields
-* [ARROW-11196](https://issues.apache.org/jira/browse/ARROW-11196) - [GLib] Add support for mock, HDFS and S3 file systems with factory function
-* [ARROW-11198](https://issues.apache.org/jira/browse/ARROW-11198) - [Packaging][Python] Ensure setuptools version during build supports markdown
-* [ARROW-11200](https://issues.apache.org/jira/browse/ARROW-11200) - [Rust] [DataFusion] Physical operators and expressions should have public accessor methods
-* [ARROW-11201](https://issues.apache.org/jira/browse/ARROW-11201) - [Rust] create\_batch\_empty - support more types
-* [ARROW-11203](https://issues.apache.org/jira/browse/ARROW-11203) - [Developer][Website] Enable JIRA and pull request integration
-* [ARROW-11204](https://issues.apache.org/jira/browse/ARROW-11204) - [C++] Fix build failure with bundled gRPC and Protobuf
-* [ARROW-11205](https://issues.apache.org/jira/browse/ARROW-11205) - [GLib][Dataset] Add GADFileFormat and its family
-* [ARROW-11209](https://issues.apache.org/jira/browse/ARROW-11209) - [Rust] DF - Provide better error message on unsupported GROUP BY
-* [ARROW-11210](https://issues.apache.org/jira/browse/ARROW-11210) - [CI] Restore workflows that had been blocked by INFRA
-* [ARROW-11212](https://issues.apache.org/jira/browse/ARROW-11212) - [Packaging][Python] Use vcpkg as dependency source for manylinux and windows wheels
-* [ARROW-11213](https://issues.apache.org/jira/browse/ARROW-11213) - [Packaging][Python] Dockerize wheel building on windows
-* [ARROW-11215](https://issues.apache.org/jira/browse/ARROW-11215) - [CI] Use named volumes by default for caching in docker-compose
-* [ARROW-11218](https://issues.apache.org/jira/browse/ARROW-11218) - [R] Make SubTreeFileSystem print method more informative
-* [ARROW-11219](https://issues.apache.org/jira/browse/ARROW-11219) - [CI][Ruby][MinGW] Reduce CI time
-* [ARROW-11221](https://issues.apache.org/jira/browse/ARROW-11221) - [Rust] DF Implement GROUP BY support for Float32/Float64
-* [ARROW-11231](https://issues.apache.org/jira/browse/ARROW-11231) - [Packaging] Add mimalloc to Linux builds
-* [ARROW-11234](https://issues.apache.org/jira/browse/ARROW-11234) - [CI][Ruby][macOS] Reduce CI time
-* [ARROW-11236](https://issues.apache.org/jira/browse/ARROW-11236) - [Java] Bump Jackson to 2.11.4
-* [ARROW-11240](https://issues.apache.org/jira/browse/ARROW-11240) - [Packaging][R] Add mimalloc to R packaging
-* [ARROW-11242](https://issues.apache.org/jira/browse/ARROW-11242) - [CI] Remove CMake 3.2 job
-* [ARROW-11245](https://issues.apache.org/jira/browse/ARROW-11245) - [C++][Gandiva] Add support for LLVM 11.1
-* [ARROW-11247](https://issues.apache.org/jira/browse/ARROW-11247) - [C++] Infer date32 columns in CSV
-* [ARROW-11256](https://issues.apache.org/jira/browse/ARROW-11256) - [Packaging][Linux] Don't buffer packaging output
-* [ARROW-11272](https://issues.apache.org/jira/browse/ARROW-11272) - [Release][wheel] Remove unsupported Python 3.5 and manylinux1
-* [ARROW-11273](https://issues.apache.org/jira/browse/ARROW-11273) - [Release][deb] Remove unsupported Debian GNU/Linux stretch
-* [ARROW-11278](https://issues.apache.org/jira/browse/ARROW-11278) - [Release][NodeJS] Don't touch \~/.bash\_profile
-* [ARROW-11280](https://issues.apache.org/jira/browse/ARROW-11280) - [Release][APT] Fix minimal build example check
-* [ARROW-11281](https://issues.apache.org/jira/browse/ARROW-11281) - [C++] Remove needless runtime RapidJSON dependency
-* [ARROW-11282](https://issues.apache.org/jira/browse/ARROW-11282) - [Packaging][deb] Add missing libgflags-dev dependency
-* [ARROW-11285](https://issues.apache.org/jira/browse/ARROW-11285) - [Release][APT] Add support for Ubuntu Groovy
-* [ARROW-11292](https://issues.apache.org/jira/browse/ARROW-11292) - [Release][JS] Use Node.JS LTS
-* [ARROW-11293](https://issues.apache.org/jira/browse/ARROW-11293) - [C++] Don't require Boost and gflags with find\_package(Arrow)
-* [ARROW-11307](https://issues.apache.org/jira/browse/ARROW-11307) - [Release][Ubuntu][20.10] Add workaround for dependency issue
-* [PARQUET-1566](https://issues.apache.org/jira/browse/PARQUET-1566) - [C++] Indicate if null count, distinct count are present in column statistics
-
-
-## Bug Fixes
-
-* [ARROW-2616](https://issues.apache.org/jira/browse/ARROW-2616) - [Python] Cross-compiling Pyarrow
-* [ARROW-6582](https://issues.apache.org/jira/browse/ARROW-6582) - [R] Arrow to R fails with embedded nuls in strings
-* [ARROW-7363](https://issues.apache.org/jira/browse/ARROW-7363) - [Python] Add combine\_chunks method to ChunkedArray
-* [ARROW-7909](https://issues.apache.org/jira/browse/ARROW-7909) - [Website] Add how to install on Red Hat Enterprise Linux
-* [ARROW-8258](https://issues.apache.org/jira/browse/ARROW-8258) - [Rust] [Parquet] ArrowReader fails on some timestamp types
-* [ARROW-9027](https://issues.apache.org/jira/browse/ARROW-9027) - [Python] Split in multiple files + clean-up pyarrow.parquet tests
-* [ARROW-9479](https://issues.apache.org/jira/browse/ARROW-9479) - [JS] Table.from fails for zero-item Lists, FixedSizeLists, Maps. ditto Table.empty
-* [ARROW-9636](https://issues.apache.org/jira/browse/ARROW-9636) - [Python] Update documentation about 'LZO' compression in parquet.write\_table
-* [ARROW-9776](https://issues.apache.org/jira/browse/ARROW-9776) - [R] read\_feather causes segfault in R if file doesn't exist
-* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern
-* [ARROW-9898](https://issues.apache.org/jira/browse/ARROW-9898) - [C++][Gandiva] Error handling in castINT fails in some environments
-* [ARROW-9903](https://issues.apache.org/jira/browse/ARROW-9903) - [R] open\_dataset freezes opening feather files on Windows
-* [ARROW-9963](https://issues.apache.org/jira/browse/ARROW-9963) - [Python] Recognize datetime.timezone.utc as UTC on conversion python-\>pyarrow
-* [ARROW-10039](https://issues.apache.org/jira/browse/ARROW-10039) - [Rust] Do not require memory alignment of buffers
-* [ARROW-10042](https://issues.apache.org/jira/browse/ARROW-10042) - [Rust] Buffer equalities may be incorrect
-* [ARROW-10080](https://issues.apache.org/jira/browse/ARROW-10080) - [R] Arrow does not release unused memory
-* [ARROW-10122](https://issues.apache.org/jira/browse/ARROW-10122) - [Python] Selecting one column of multi-index results in a duplicated value column.
-* [ARROW-10145](https://issues.apache.org/jira/browse/ARROW-10145) - [C++][Dataset] Assert integer overflow in partitioning falls back to string
-* [ARROW-10146](https://issues.apache.org/jira/browse/ARROW-10146) - [Python] Parquet metadata to\_dict raises attribute error
-* [ARROW-10174](https://issues.apache.org/jira/browse/ARROW-10174) - [Java] Reading of Dictionary encoded struct vector fails
-* [ARROW-10177](https://issues.apache.org/jira/browse/ARROW-10177) - [CI][Gandiva] Nightly gandiva-jar-xenial fails
-* [ARROW-10186](https://issues.apache.org/jira/browse/ARROW-10186) - [Rust] Tests fail when following instructions in README
-* [ARROW-10247](https://issues.apache.org/jira/browse/ARROW-10247) - [C++][Dataset] Cannot write dataset with dictionary column as partition field
-* [ARROW-10264](https://issues.apache.org/jira/browse/ARROW-10264) - [C++][Python] Parquet test failing with HadoopFileSystem URI
-* [ARROW-10270](https://issues.apache.org/jira/browse/ARROW-10270) - [R] Fix CSV timestamp\_parsers test on R-devel
-* [ARROW-10283](https://issues.apache.org/jira/browse/ARROW-10283) - [Python] Python deprecation warning for "PY\_SSIZE\_T\_CLEAN will be required for '\#' formats"
-* [ARROW-10293](https://issues.apache.org/jira/browse/ARROW-10293) - [Rust] [DataFusion] Fix benchmarks
-* [ARROW-10294](https://issues.apache.org/jira/browse/ARROW-10294) - [Java] Resolve problems of DecimalVector APIs on ArrowBufs
-* [ARROW-10321](https://issues.apache.org/jira/browse/ARROW-10321) - [C++] Building AVX512 code when we should not
-* [ARROW-10333](https://issues.apache.org/jira/browse/ARROW-10333) - [Java] Remove split packages in arrow-memory-core and arrow-vectors
-* [ARROW-10345](https://issues.apache.org/jira/browse/ARROW-10345) - [C++] NaN breaks sorting
-* [ARROW-10346](https://issues.apache.org/jira/browse/ARROW-10346) - [Python] Default S3 region is eu-central-1 even with LANG=C
-* [ARROW-10348](https://issues.apache.org/jira/browse/ARROW-10348) - [C++] Fix crash on invalid Parquet file (OSS-Fuzz)
-* [ARROW-10350](https://issues.apache.org/jira/browse/ARROW-10350) - [Rust] parquet\_derive crate cannot be published to crates.io
-* [ARROW-10353](https://issues.apache.org/jira/browse/ARROW-10353) - [C++] Parquet decompresses DataPageV2 pages even if is\_compressed==0
-* [ARROW-10358](https://issues.apache.org/jira/browse/ARROW-10358) - [R] Followups to 2.0.0 release
-* [ARROW-10365](https://issues.apache.org/jira/browse/ARROW-10365) - [R] Remove duplicate setting of S3 flag on macOS
-* [ARROW-10369](https://issues.apache.org/jira/browse/ARROW-10369) - [Dev] Fix archery release utility test cases
-* [ARROW-10371](https://issues.apache.org/jira/browse/ARROW-10371) - [R] Linux system requirements check needs to support older cmake versions
-* [ARROW-10386](https://issues.apache.org/jira/browse/ARROW-10386) - [R] List column class attributes not preserved in roundtrip
-* [ARROW-10388](https://issues.apache.org/jira/browse/ARROW-10388) - [Java] Fix Spark integration build failure
-* [ARROW-10390](https://issues.apache.org/jira/browse/ARROW-10390) - [Rust] [Parquet] Regression: cannot implement custom ParquetWriter because \`TryClone\` is not publicly exported
-* [ARROW-10393](https://issues.apache.org/jira/browse/ARROW-10393) - [Rust]: Fix null value reading in jsonreader for both dictionary and stringbuilders
-* [ARROW-10394](https://issues.apache.org/jira/browse/ARROW-10394) - [Rust] [Large]BinaryArray can be created from non-binary datatypes
-* [ARROW-10397](https://issues.apache.org/jira/browse/ARROW-10397) - [C++] Outdated and confusing comment on dictionary indices
-* [ARROW-10399](https://issues.apache.org/jira/browse/ARROW-10399) - [R] Fix performance regression from cpp11::r\_string
-* [ARROW-10411](https://issues.apache.org/jira/browse/ARROW-10411) - [C++] Fix incorrect child array lengths for Concatenate of FixedSizeList
-* [ARROW-10412](https://issues.apache.org/jira/browse/ARROW-10412) - [C++] CMake Build Fails with grpc 1.33.1, "GRPC\_CPP\_PLUGIN-NOTFOUND: program not found or is not executable"
-* [ARROW-10413](https://issues.apache.org/jira/browse/ARROW-10413) - [Rust] [Parquet] Unignore some roundtrip tests that are passing now
-* [ARROW-10414](https://issues.apache.org/jira/browse/ARROW-10414) - [R] open\_dataset doesn't work with absolute/expanded paths on Windows
-* [ARROW-10426](https://issues.apache.org/jira/browse/ARROW-10426) - [C++] Arrow type large\_string cannot be written to Parquet type column descriptor
-* [ARROW-10433](https://issues.apache.org/jira/browse/ARROW-10433) - [Python] pyarrow doesn't work with s3fs\>=0.5
-* [ARROW-10434](https://issues.apache.org/jira/browse/ARROW-10434) - [Rust] Debug formatting arrays with lengths greater than 10 and less than 20 produces incorrect values
-* [ARROW-10441](https://issues.apache.org/jira/browse/ARROW-10441) - [FlightRPC][Java] FlightClients from FlightGrpcUtils\#createFlightClient shutdown gRPC channel when closed
-* [ARROW-10446](https://issues.apache.org/jira/browse/ARROW-10446) - [C++][Python] Timezone-aware pd.Timestamps are incorrectly converted to Timestamp arrays
-* [ARROW-10448](https://issues.apache.org/jira/browse/ARROW-10448) - [Rust] PrimitiveArray::new can create arrays not in spec
-* [ARROW-10453](https://issues.apache.org/jira/browse/ARROW-10453) - [Rust] [DataFusion] Performance degradation after removing specialization
-* [ARROW-10457](https://issues.apache.org/jira/browse/ARROW-10457) - [CI] Fix Spark branch-3.0 integration tests
-* [ARROW-10461](https://issues.apache.org/jira/browse/ARROW-10461) - [Rust] Offset related bug in BitChunks::remainder\_bits
-* [ARROW-10462](https://issues.apache.org/jira/browse/ARROW-10462) - [Python] ParquetDatasetPiece's path broken when using fsspec fs on Windows
-* [ARROW-10463](https://issues.apache.org/jira/browse/ARROW-10463) - [R] Better messaging for currently unsupported CSV options in open\_dataset
-* [ARROW-10470](https://issues.apache.org/jira/browse/ARROW-10470) - [R] Fix missing file error causing NYC taxi example to fail
-* [ARROW-10471](https://issues.apache.org/jira/browse/ARROW-10471) - [CI][Python] Ensure we have tests with s3fs and run those on CI
-* [ARROW-10472](https://issues.apache.org/jira/browse/ARROW-10472) - [C++][Python] casting a scalar timestamp to date32 results in Aborted (core dump)
-* [ARROW-10475](https://issues.apache.org/jira/browse/ARROW-10475) - [C++][FlightRPC] Arrow Flight Server / Client cannot be initialized with IPv6 host
-* [ARROW-10480](https://issues.apache.org/jira/browse/ARROW-10480) - [Python] Parquet write\_table creates gzipped Parquet file, not Parquet with gzip compression
-* [ARROW-10482](https://issues.apache.org/jira/browse/ARROW-10482) - [Python] Specifying compression type on a column basis when writing Parquet not working
-* [ARROW-10489](https://issues.apache.org/jira/browse/ARROW-10489) - [C++] Unable to configure or make with intel compiler
-* [ARROW-10491](https://issues.apache.org/jira/browse/ARROW-10491) - [FlightRPC][Java] Fix NPE when using FlightProducer without interceptors
-* [ARROW-10493](https://issues.apache.org/jira/browse/ARROW-10493) - [C++][Parquet] Writing nullable nested strings results in wrong data in file
-* [ARROW-10495](https://issues.apache.org/jira/browse/ARROW-10495) - [C++] find\_package(Arrow) is broken on Ubuntu 18
-* [ARROW-10496](https://issues.apache.org/jira/browse/ARROW-10496) - [R][CI] Fix conda-r job
-* [ARROW-10499](https://issues.apache.org/jira/browse/ARROW-10499) - [C++][Java] Fix ORC Java JNI Crash
-* [ARROW-10502](https://issues.apache.org/jira/browse/ARROW-10502) - [C++/Python] CUDA detection messes up nightly conda-win builds
-* [ARROW-10503](https://issues.apache.org/jira/browse/ARROW-10503) - [C++] Uriparser will not compile using Intel compiler
-* [ARROW-10508](https://issues.apache.org/jira/browse/ARROW-10508) - [Java] Allow FixedSizeListVector to have empty children
-* [ARROW-10509](https://issues.apache.org/jira/browse/ARROW-10509) - [C++] Define operator<<(ostream, ParquetException) for clang+Windows
-* [ARROW-10511](https://issues.apache.org/jira/browse/ARROW-10511) - [Python] Table.to\_pandas() failing when timezone-awareness mismatch in metadata
-* [ARROW-10518](https://issues.apache.org/jira/browse/ARROW-10518) - Fix cast function issues in Gandiva
-* [ARROW-10519](https://issues.apache.org/jira/browse/ARROW-10519) - [Python] Deadlock when PyArrow imports Pandas from multiple threads
-* [ARROW-10525](https://issues.apache.org/jira/browse/ARROW-10525) - [C++] Fix crash on unsupported IPC stream (OSS-Fuzz)
-* [ARROW-10532](https://issues.apache.org/jira/browse/ARROW-10532) - [Python] Mangled pandas\_metadata when specified schema has a different order than DataFrame columns
-* [ARROW-10545](https://issues.apache.org/jira/browse/ARROW-10545) - [C++] Fix crash on invalid Parquet file (OSS-Fuzz)
-* [ARROW-10546](https://issues.apache.org/jira/browse/ARROW-10546) - [Python] Deprecate the S3FSWrapper class
-* [ARROW-10547](https://issues.apache.org/jira/browse/ARROW-10547) - [Rust][DataFusion] Filter pushdown loses filters if below a user defined node
-* [ARROW-10551](https://issues.apache.org/jira/browse/ARROW-10551) - [Rust] Fix unreproducible benchmarks
-* [ARROW-10558](https://issues.apache.org/jira/browse/ARROW-10558) - [Python] Filesystem S3 tests not independent (native s3 influences s3fs)
-* [ARROW-10560](https://issues.apache.org/jira/browse/ARROW-10560) - [Python] Crash when creating array with string over 2GB
-* [ARROW-10563](https://issues.apache.org/jira/browse/ARROW-10563) - [Packaging][C++] CMake find\_package(Arrow 2.0 CONFIG REQUIRED) broken
-* [ARROW-10565](https://issues.apache.org/jira/browse/ARROW-10565) - [Python] Table.from\_batches and Table.from\_pandas have argument Schema\_schema in documentation instead of schema
-* [ARROW-10568](https://issues.apache.org/jira/browse/ARROW-10568) - [C++][Parquet] Parquet writer crashes process when Tell() does not succeed
-* [ARROW-10569](https://issues.apache.org/jira/browse/ARROW-10569) - [C++][Python] Poor Table filtering performance
-* [ARROW-10577](https://issues.apache.org/jira/browse/ARROW-10577) - [Rust][DataFusion] Hash Aggregator stream finishes unexpectedly after going to Pending state
-* [ARROW-10578](https://issues.apache.org/jira/browse/ARROW-10578) - [C++] Comparison kernels crashing for string array with null string scalar
-* [ARROW-10610](https://issues.apache.org/jira/browse/ARROW-10610) - [C++] arrow-utility-test and arrow-csv-test cause failures on a big-endian platform
-* [ARROW-10616](https://issues.apache.org/jira/browse/ARROW-10616) - [Developer] Expand PR labeler to all supported languages
-* [ARROW-10617](https://issues.apache.org/jira/browse/ARROW-10617) - [Python] RecordBatchStreamReader's iterator doesn't work with python 3.8
-* [ARROW-10619](https://issues.apache.org/jira/browse/ARROW-10619) - [C++] Fix crash on unsupported IPC stream (OSS-Fuzz)
-* [ARROW-10620](https://issues.apache.org/jira/browse/ARROW-10620) - [Rust][Parquet] move column chunk range logic to metadata.rs
-* [ARROW-10621](https://issues.apache.org/jira/browse/ARROW-10621) - [Java] flight-core test causes a failure on s390x
-* [ARROW-10622](https://issues.apache.org/jira/browse/ARROW-10622) - [R] Nameof<\>() is incorrect in r-arrow build environment
-* [ARROW-10623](https://issues.apache.org/jira/browse/ARROW-10623) - [R] Version 1.0.1 breaks data.frame attributes when reading file written by 2.0.0
-* [ARROW-10624](https://issues.apache.org/jira/browse/ARROW-10624) - [R] Proactively remove "problems" attributes
-* [ARROW-10627](https://issues.apache.org/jira/browse/ARROW-10627) - [Rust] Github master does not compile for WASM target
-* [ARROW-10629](https://issues.apache.org/jira/browse/ARROW-10629) - [CI] MinGW builds broken on Github Actions
-* [ARROW-10631](https://issues.apache.org/jira/browse/ARROW-10631) - [Rust] Equality of fixed-size binary is incorrect
-* [ARROW-10642](https://issues.apache.org/jira/browse/ARROW-10642) - [R] Can't get Table from RecordBatchReader with 0 batches
-* [ARROW-10656](https://issues.apache.org/jira/browse/ARROW-10656) - [Rust] New RecordBatch requires exact match of Data Types
-* [ARROW-10661](https://issues.apache.org/jira/browse/ARROW-10661) - [C\#] Fix benchmarking project
-* [ARROW-10662](https://issues.apache.org/jira/browse/ARROW-10662) - [Java] Avoid integer overflow for Json file reader
-* [ARROW-10663](https://issues.apache.org/jira/browse/ARROW-10663) - [C++/Doc] The IsIn kernel ignores the skip\_nulls option of SetLookupOptions
-* [ARROW-10667](https://issues.apache.org/jira/browse/ARROW-10667) - [Rust] [Parquet] Add a convenience type for writing Parquet to memory
-* [ARROW-10668](https://issues.apache.org/jira/browse/ARROW-10668) - [R] Filtering does not work with .data pronoun
-* [ARROW-10681](https://issues.apache.org/jira/browse/ARROW-10681) - [Rust] [DataFusion] TPC-H Query 12 fails with scheduler error
-* [ARROW-10684](https://issues.apache.org/jira/browse/ARROW-10684) - [Rust] Logical equality should consider parent array nullability
-* [ARROW-10690](https://issues.apache.org/jira/browse/ARROW-10690) - [Java] ComplexCopier gives incorrect result for list vector if target vector is non-empty
-* [ARROW-10692](https://issues.apache.org/jira/browse/ARROW-10692) - [Rust] Segfault during array buffer append
-* [ARROW-10699](https://issues.apache.org/jira/browse/ARROW-10699) - [C++] BitmapUInt64Reader doesn't work on big-endian
-* [ARROW-10701](https://issues.apache.org/jira/browse/ARROW-10701) - [Rust] [DataFusion] Benchmark sort\_limit\_query\_sql fails because order by clause specifies column index instead of expression
-* [ARROW-10705](https://issues.apache.org/jira/browse/ARROW-10705) - [Rust] Lifetime annotations in the IPC writer are too strict, preventing code reuse
-* [ARROW-10710](https://issues.apache.org/jira/browse/ARROW-10710) - [Rust] Example flight server is broken after tokio upgrade (among other things)
-* [ARROW-10711](https://issues.apache.org/jira/browse/ARROW-10711) - [CI] Remove set-env from auto-tune to work with new GHA settings
-* [ARROW-10719](https://issues.apache.org/jira/browse/ARROW-10719) - [C\#] ArrowStreamWriter doesn't write schema metadata
-* [ARROW-10746](https://issues.apache.org/jira/browse/ARROW-10746) - [C++] Use GTEST\_SKIP in parquet encoding tests
-* [ARROW-10748](https://issues.apache.org/jira/browse/ARROW-10748) - [Java] TimeStampMilliVector cannot be cast to TimeStampMilliTZVector
-* [ARROW-10749](https://issues.apache.org/jira/browse/ARROW-10749) - [C++] Incorrect string format for Datum with the collection type
-* [ARROW-10751](https://issues.apache.org/jira/browse/ARROW-10751) - [C++] Add RE2 to minimal build example
-* [ARROW-10753](https://issues.apache.org/jira/browse/ARROW-10753) - [Rust] [DataFusion] Negative numbers in SQL WHERE clause not parsed correctly
-* [ARROW-10757](https://issues.apache.org/jira/browse/ARROW-10757) - [Rust] [CI] Sporadic failures due to disk filling up
-* [ARROW-10760](https://issues.apache.org/jira/browse/ARROW-10760) - [Rust] [DataFusion] Predicate push down does not support joins correctly
-* [ARROW-10769](https://issues.apache.org/jira/browse/ARROW-10769) - [CI] Integration tests are failing in master
-* [ARROW-10774](https://issues.apache.org/jira/browse/ARROW-10774) - [R] Set minimum cpp11 version
-* [ARROW-10777](https://issues.apache.org/jira/browse/ARROW-10777) - [Packaging][Python] PyPI pyarrow source dist (sdist) contains architecture dependent binaries
-* [ARROW-10778](https://issues.apache.org/jira/browse/ARROW-10778) - [Python] RowGroupInfo.statistics errors for empty row group
-* [ARROW-10779](https://issues.apache.org/jira/browse/ARROW-10779) - [Java] writeNull method in UnionListWriter doesn't work correctly if validity at that index is already set
-* [ARROW-10780](https://issues.apache.org/jira/browse/ARROW-10780) - [R] Update known R installation issues for CentOS 7
-* [ARROW-10791](https://issues.apache.org/jira/browse/ARROW-10791) - [Rust] StreamReader, read\_dictionary duplicating schema info
-* [ARROW-10801](https://issues.apache.org/jira/browse/ARROW-10801) - [Rust] [Flight] Support sending FlightData for Dictionaries with that of a RecordBatch
-* [ARROW-10803](https://issues.apache.org/jira/browse/ARROW-10803) - [R] Support R \>= 3.3 and add CI
-* [ARROW-10804](https://issues.apache.org/jira/browse/ARROW-10804) - [Rust] Remove UB on parquet crate
-* [ARROW-10807](https://issues.apache.org/jira/browse/ARROW-10807) - [Rust][DataFusion] Avoid double hashing
-* [ARROW-10810](https://issues.apache.org/jira/browse/ARROW-10810) - [Rust] Speed up comparison kernels
-* [ARROW-10811](https://issues.apache.org/jira/browse/ARROW-10811) - [R][CI] Remove nightly centos6 build
-* [ARROW-10823](https://issues.apache.org/jira/browse/ARROW-10823) - MutableArrayData with use\_null false yields wrong results
-* [ARROW-10830](https://issues.apache.org/jira/browse/ARROW-10830) - [Rust] json reader should not hard crash on invalid json
-* [ARROW-10833](https://issues.apache.org/jira/browse/ARROW-10833) - [Python] Avoid usage of NumPy's PyArray\_DescrCheck macro
-* [ARROW-10834](https://issues.apache.org/jira/browse/ARROW-10834) - [R] Fix print method for SubTreeFileSystem
-* [ARROW-10837](https://issues.apache.org/jira/browse/ARROW-10837) - [Rust] Use \`Vec<u8\>\` for hash key instead
-* [ARROW-10840](https://issues.apache.org/jira/browse/ARROW-10840) - [C++] Parquet FileMetaData does not have key\_value\_metadata when built from FileMetaDataBuilder
-* [ARROW-10842](https://issues.apache.org/jira/browse/ARROW-10842) - [Rust] decouple IO from json schema inference code
-* [ARROW-10844](https://issues.apache.org/jira/browse/ARROW-10844) - [Rust] [DataFusion] join of two DataFrames is not possible
-* [ARROW-10850](https://issues.apache.org/jira/browse/ARROW-10850) - [R] Unrecognized compression type: LZ4
-* [ARROW-10852](https://issues.apache.org/jira/browse/ARROW-10852) - [C++] AssertTablesEqual(verbose=true) segfaults if the left array has more rows
-* [ARROW-10854](https://issues.apache.org/jira/browse/ARROW-10854) - [Rust] [DataFusion] Simplified logical scans
-* [ARROW-10855](https://issues.apache.org/jira/browse/ARROW-10855) - [Python][Numpy] ArrowTypeError after upgrading NumPy to 1.20.0rc1
-* [ARROW-10856](https://issues.apache.org/jira/browse/ARROW-10856) - [R] CentOS 7 not correctly identifying compiler version
-* [ARROW-10859](https://issues.apache.org/jira/browse/ARROW-10859) - [Rust] [DataFusion] Make collect not require ExecutionContext
-* [ARROW-10860](https://issues.apache.org/jira/browse/ARROW-10860) - [Java] Avoid integer overflow for generated classes in Vector
-* [ARROW-10863](https://issues.apache.org/jira/browse/ARROW-10863) - [Python] ExtensionArray.to\_pandas not working
-* [ARROW-10875](https://issues.apache.org/jira/browse/ARROW-10875) - simplify simd cfg check
-* [ARROW-10876](https://issues.apache.org/jira/browse/ARROW-10876) - [Rust] json reader should validate value type
-* [ARROW-10897](https://issues.apache.org/jira/browse/ARROW-10897) - [Rust] Replace Arc<String\> by String in DataType::Timestamp
-* [ARROW-10907](https://issues.apache.org/jira/browse/ARROW-10907) - [Rust] Cast UTF8 to Date64 Incorrect
-* [ARROW-10913](https://issues.apache.org/jira/browse/ARROW-10913) - [Python][Doc] Code block typo in filesystems docs
-* [ARROW-10914](https://issues.apache.org/jira/browse/ARROW-10914) - [Rust] SIMD implementation of arithmetic kernels reads out of bounds
-* [ARROW-10915](https://issues.apache.org/jira/browse/ARROW-10915) - [Rust] Make ARROW\_TEST\_DATA and PARQUET\_TEST\_DATA absolute dirs
-* [ARROW-10921](https://issues.apache.org/jira/browse/ARROW-10921) - \`TypeError: 'coroutine' object is not iterable\` when reading parquet partitions via s3fs \>= 0.5 with pyarrow
-* [ARROW-10930](https://issues.apache.org/jira/browse/ARROW-10930) - [Python] LargeListType doesn't have a value\_field
-* [ARROW-10932](https://issues.apache.org/jira/browse/ARROW-10932) - [C++] BinaryMemoTable::CopyOffsets access out-of-bound address when data is empty
-* [ARROW-10942](https://issues.apache.org/jira/browse/ARROW-10942) - [C++] S3FileSystem::Impl::IsEmptyDirectory fails on Amazon S3
-* [ARROW-10943](https://issues.apache.org/jira/browse/ARROW-10943) - [Rust] Intermittent build failure in parquet encoding
-* [ARROW-10954](https://issues.apache.org/jira/browse/ARROW-10954) - [C++][Doc] PlasmaClient is thread-safe now but the doc is not updated
-* [ARROW-10955](https://issues.apache.org/jira/browse/ARROW-10955) - [C++] Reading empty json lists results in invalid non-nullable null type
-* [ARROW-10960](https://issues.apache.org/jira/browse/ARROW-10960) - [C++][FlightRPC] Missing protobuf data\_body should result in default value of empty bytes, not null
-* [ARROW-10962](https://issues.apache.org/jira/browse/ARROW-10962) - [Java][FlightRPC] FlightData deserializer should accept missing fields
-* [ARROW-10967](https://issues.apache.org/jira/browse/ARROW-10967) - [Rust] Make env vars ARROW\_TEST\_DATA and PARQUET\_TEST\_DATA optional
-* [ARROW-10990](https://issues.apache.org/jira/browse/ARROW-10990) - [Rust] SIMD implementation of compare kernels reads out of bounds
-* [ARROW-10994](https://issues.apache.org/jira/browse/ARROW-10994) - [Rust] Fix bugs in TPC-H file conversion
-* [ARROW-10996](https://issues.apache.org/jira/browse/ARROW-10996) - [Rust] Return error messages via Result for get\_arrow\_schema\_from\_metadata
-* [ARROW-10999](https://issues.apache.org/jira/browse/ARROW-10999) - [Rust] TPC-H parquet files cannot be read by Apache Spark
-* [ARROW-11014](https://issues.apache.org/jira/browse/ARROW-11014) - [Rust] [DataFusion] ParquetExec reports incorrect statistics
-* [ARROW-11023](https://issues.apache.org/jira/browse/ARROW-11023) - [C++][CMake] gRPC doesn't respect CMAKE\_CXX\_COMPILER
-* [ARROW-11024](https://issues.apache.org/jira/browse/ARROW-11024) - [C++][Parquet] Writing List<Struct\> to parquet sometimes writes wrong data
-* [ARROW-11025](https://issues.apache.org/jira/browse/ARROW-11025) - [Rust] Bench for boolean kernels measure array creation
-* [ARROW-11030](https://issues.apache.org/jira/browse/ARROW-11030) - [Rust] [DataFusion] HashJoinExec slow with many batches
-* [ARROW-11048](https://issues.apache.org/jira/browse/ARROW-11048) - [Rust] Add bench to MutableBuffer
-* [ARROW-11050](https://issues.apache.org/jira/browse/ARROW-11050) - [R] Handle RecordBatch in write\_parquet
-* [ARROW-11067](https://issues.apache.org/jira/browse/ARROW-11067) - [C++] CSV reader returns nulls for some strings on macOS
-* [ARROW-11069](https://issues.apache.org/jira/browse/ARROW-11069) - [C++] Parquet writer incorrect data being written when data type is struct
-* [ARROW-11073](https://issues.apache.org/jira/browse/ARROW-11073) - [Rust] Lint Error on CI Tests in /arrow/rust/arrow/src/ipc/reader.rs
-* [ARROW-11083](https://issues.apache.org/jira/browse/ARROW-11083) - [CI] Build "Source Release and Merge Script" is broken
-* [ARROW-11084](https://issues.apache.org/jira/browse/ARROW-11084) - [Rust] Clippy failing in master
-* [ARROW-11085](https://issues.apache.org/jira/browse/ARROW-11085) - [Rust] Rust CI no longer works because it uses action-rs: Migrate CI away from action-rs/\*
-* [ARROW-11092](https://issues.apache.org/jira/browse/ARROW-11092) - [CI] (Temporarily) move offending workflows to separate files
-* [ARROW-11102](https://issues.apache.org/jira/browse/ARROW-11102) - [Rust][DataFusion] fmt::Debug for ScalarValue(Utf8) is always quoted
-* [ARROW-11113](https://issues.apache.org/jira/browse/ARROW-11113) - [Rust] support as\_struct\_array cast
-* [ARROW-11114](https://issues.apache.org/jira/browse/ARROW-11114) - [Java] Metadata serialization is broken for Field class
-* [ARROW-11132](https://issues.apache.org/jira/browse/ARROW-11132) - [CI] Use pip to install crossbow's dependencies for the comment bot
-* [ARROW-11144](https://issues.apache.org/jira/browse/ARROW-11144) - [C++][Python][CI] Fix HDFS nightly build
-* [ARROW-11152](https://issues.apache.org/jira/browse/ARROW-11152) - [CI][C++] Fix Homebrew numpy installation on macOS builds
-* [ARROW-11162](https://issues.apache.org/jira/browse/ARROW-11162) - [C++] Fix crash on Decimal256 Parquet file (OSS-Fuzz)
-* [ARROW-11163](https://issues.apache.org/jira/browse/ARROW-11163) - [C++][Python] Compressed Feather file written with pyarrow 0.17 not readable in pyarrow 2.0.0+
-* [ARROW-11166](https://issues.apache.org/jira/browse/ARROW-11166) - [Python][Compute] Add bindings for ProjectOptions
-* [ARROW-11171](https://issues.apache.org/jira/browse/ARROW-11171) - [Go] Build fails on s390x with noasm tag
-* [ARROW-11189](https://issues.apache.org/jira/browse/ARROW-11189) - [Developer] Archery benchmark diff cannot compare two JSON files
-* [ARROW-11190](https://issues.apache.org/jira/browse/ARROW-11190) - [C++][Dataset] Clean up compiler warnings
-* [ARROW-11202](https://issues.apache.org/jira/browse/ARROW-11202) - [R][CI] Nightly builds not happening (or artifacts not exported)
-* [ARROW-11224](https://issues.apache.org/jira/browse/ARROW-11224) - [R] don't test metadata serialization on old R versions
-* [ARROW-11226](https://issues.apache.org/jira/browse/ARROW-11226) - [Python][CI] Filesystem tests failing with s3fs 0.5.2
-* [ARROW-11227](https://issues.apache.org/jira/browse/ARROW-11227) - [Python][CI] AMD64 Conda Python 3.7 Pandas 0.24 cron job failing in to\_pandas extension dtype test
-* [ARROW-11229](https://issues.apache.org/jira/browse/ARROW-11229) - [C++][Dataset] Static build fails
-* [ARROW-11230](https://issues.apache.org/jira/browse/ARROW-11230) - [R] Fix build failures on Windows when multiple libarrow binaries found
-* [ARROW-11232](https://issues.apache.org/jira/browse/ARROW-11232) - [C++] Table::CombineChunks() returns incorrect results if Table has no column
-* [ARROW-11233](https://issues.apache.org/jira/browse/ARROW-11233) - [C++][Flight] Fail to link with bundled gRPC and Abseil
-* [ARROW-11237](https://issues.apache.org/jira/browse/ARROW-11237) - [C++] Compiler error with GLog and unity build enabled
-* [ARROW-11251](https://issues.apache.org/jira/browse/ARROW-11251) - [CI] Make sure that devtoolset-8 is really installed + being used
-* [ARROW-11253](https://issues.apache.org/jira/browse/ARROW-11253) - [R] Make sure that large metadata tests are reproducible
-* [ARROW-11255](https://issues.apache.org/jira/browse/ARROW-11255) - [Packaging][Conda][macOS] Fix Python version
-* [ARROW-11271](https://issues.apache.org/jira/browse/ARROW-11271) - [Rust] [Parquet] List schema to Arrow parser misinterpreting child nullability
-* [ARROW-11274](https://issues.apache.org/jira/browse/ARROW-11274) - [Packaging][wheel][Windows] Fix wheels path for Gemfury
-* [ARROW-11275](https://issues.apache.org/jira/browse/ARROW-11275) - [Packaging][wheel][Linux] Fix paths for Gemfury
-* [ARROW-11283](https://issues.apache.org/jira/browse/ARROW-11283) - [Julia] Fix install link
-* [ARROW-11286](https://issues.apache.org/jira/browse/ARROW-11286) - [Release][Yum] Fix minimal build example check
-* [ARROW-11287](https://issues.apache.org/jira/browse/ARROW-11287) - [Packaging][RPM] Add missing dependencies
-* [ARROW-11301](https://issues.apache.org/jira/browse/ARROW-11301) - [C++] Fix reading LZ4-compressed Parquet files produced by Java Parquet implementation
-* [ARROW-11302](https://issues.apache.org/jira/browse/ARROW-11302) - [Release][Python] Remove verification of python 3.5 wheel on macOS
-* [ARROW-11306](https://issues.apache.org/jira/browse/ARROW-11306) - [Packaging][Ubuntu][16.04] Add missing libprotobuf-dev dependency
-* [PARQUET-1935](https://issues.apache.org/jira/browse/PARQUET-1935) - [C++][Parquet] nullptr access violation when writing arrays of non-nullable values
-
-
-
-# Apache Arrow 2.0.0 (2020-10-13)
-
-## Bug Fixes
-
-* [ARROW-2367](https://issues.apache.org/jira/browse/ARROW-2367) - [Python] ListArray has trouble with sizes greater than kMaximumCapacity
-* [ARROW-4189](https://issues.apache.org/jira/browse/ARROW-4189) - [CI] [Rust] Fix broken cargo coverage
-* [ARROW-4917](https://issues.apache.org/jira/browse/ARROW-4917) - [C++] orc\_ep fails in cpp-alpine docker
-* [ARROW-5578](https://issues.apache.org/jira/browse/ARROW-5578) - [C++][Flight] Flight does not build out of the box on Alpine Linux
-* [ARROW-7226](https://issues.apache.org/jira/browse/ARROW-7226) - [JSON][Python] JSON loader fails on example in documentation
-* [ARROW-7384](https://issues.apache.org/jira/browse/ARROW-7384) - [Website] Fix search indexing warning reported by Google
-* [ARROW-7517](https://issues.apache.org/jira/browse/ARROW-7517) - [C++] Builder does not honour dictionary type provided during initialization
-* [ARROW-7663](https://issues.apache.org/jira/browse/ARROW-7663) - [Python] from\_pandas gives TypeError instead of ArrowTypeError in some cases
-* [ARROW-7903](https://issues.apache.org/jira/browse/ARROW-7903) - [Rust] [DataFusion] Upgrade SQLParser dependency for DataFusion
-* [ARROW-7957](https://issues.apache.org/jira/browse/ARROW-7957) - [Python] ParquetDataset cannot take HadoopFileSystem as filesystem
-* [ARROW-8265](https://issues.apache.org/jira/browse/ARROW-8265) - [Rust] [DataFusion] Table API collect() should not require context
-* [ARROW-8394](https://issues.apache.org/jira/browse/ARROW-8394) - [JS] Typescript compiler errors for arrow d.ts files, when using es2015-esm package
-* [ARROW-8735](https://issues.apache.org/jira/browse/ARROW-8735) - [Rust] [Parquet] Parquet crate fails to compile on Arm architecture
-* [ARROW-8749](https://issues.apache.org/jira/browse/ARROW-8749) - [C++] IpcFormatWriter writes dictionary batches with wrong ID
-* [ARROW-8773](https://issues.apache.org/jira/browse/ARROW-8773) - [Python] pyarrow schema.empty\_table() does not preserve nullability of fields
-* [ARROW-9028](https://issues.apache.org/jira/browse/ARROW-9028) - [R] Should be able to convert an empty table
-* [ARROW-9096](https://issues.apache.org/jira/browse/ARROW-9096) - [Python] Pandas roundtrip with object-dtype column labels with integer values: data type "integer" not understood
-* [ARROW-9177](https://issues.apache.org/jira/browse/ARROW-9177) - [C++][Parquet] Tracking issue for cross-implementation LZ4 Parquet compression compatibility
-* [ARROW-9414](https://issues.apache.org/jira/browse/ARROW-9414) - [C++] apt package includes headers for S3 interface, but no support
-* [ARROW-9462](https://issues.apache.org/jira/browse/ARROW-9462) - [Go] Indentation after the first Record is missing in the arrjson writer
-* [ARROW-9463](https://issues.apache.org/jira/browse/ARROW-9463) - [Go] The writer is double-closed in TestReadWrite
-* [ARROW-9490](https://issues.apache.org/jira/browse/ARROW-9490) - [Python] pyarrow array creation for specific set of numpy scalars fails
-* [ARROW-9495](https://issues.apache.org/jira/browse/ARROW-9495) - [C++] Equality assertions don't handle Inf / -Inf properly
-* [ARROW-9520](https://issues.apache.org/jira/browse/ARROW-9520) - [Rust] [DataFusion] Can't alias an aggregate expression
-* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow
-* [ARROW-9532](https://issues.apache.org/jira/browse/ARROW-9532) - [Python] Building pyarrow for MacPorts on macOS
-* [ARROW-9535](https://issues.apache.org/jira/browse/ARROW-9535) - [Python] Remove symlink fixes from conda recipe
-* [ARROW-9536](https://issues.apache.org/jira/browse/ARROW-9536) - Missing parameters in PlasmaOutOfMemoryException.java
-* [ARROW-9541](https://issues.apache.org/jira/browse/ARROW-9541) - [C++] CMakeLists requires UTF8PROC\_STATIC when building static library
-* [ARROW-9544](https://issues.apache.org/jira/browse/ARROW-9544) - [R] version argument of write\_parquet not working
-* [ARROW-9546](https://issues.apache.org/jira/browse/ARROW-9546) - [Python] Clean up Pandas Metadata Conversion test
-* [ARROW-9548](https://issues.apache.org/jira/browse/ARROW-9548) - [Go] Test output files in tmp directory are not removed correctly
-* [ARROW-9549](https://issues.apache.org/jira/browse/ARROW-9549) - [Rust] Parquet no longer builds
-* [ARROW-9554](https://issues.apache.org/jira/browse/ARROW-9554) - [Java] FixedWidthInPlaceVectorSorter sometimes produces wrong result
-* [ARROW-9556](https://issues.apache.org/jira/browse/ARROW-9556) - [Python][C++] Segfaults in UnionArray with null values
-* [ARROW-9560](https://issues.apache.org/jira/browse/ARROW-9560) - [Packaging] conda recipes failing due to missing conda-forge.yml
-* [ARROW-9569](https://issues.apache.org/jira/browse/ARROW-9569) - [CI][R] Fix rtools35 builds for msys2 key change
-* [ARROW-9570](https://issues.apache.org/jira/browse/ARROW-9570) - [Doc] Clean up sphinx sidebar
-* [ARROW-9573](https://issues.apache.org/jira/browse/ARROW-9573) - [Python] Parquet doesn't load when partitioned column starts with '\_'
-* [ARROW-9574](https://issues.apache.org/jira/browse/ARROW-9574) - [R] Cleanups for CRAN 1.0.0 release
-* [ARROW-9575](https://issues.apache.org/jira/browse/ARROW-9575) - [R] gcc-UBSAN failure on CRAN
-* [ARROW-9577](https://issues.apache.org/jira/browse/ARROW-9577) - [Python][C++] posix\_madvise error on Debian in pyarrow 1.0.0
-* [ARROW-9583](https://issues.apache.org/jira/browse/ARROW-9583) - [Rust] Offset is mishandled in arithmetic and boolean compute kernels
-* [ARROW-9588](https://issues.apache.org/jira/browse/ARROW-9588) - [C++] clang/win: Copy constructor of ParquetInvalidOrCorruptedFileException not correctly triggered
-* [ARROW-9589](https://issues.apache.org/jira/browse/ARROW-9589) - [C++/R] arrow\_exports.h contains structs declared as class
-* [ARROW-9592](https://issues.apache.org/jira/browse/ARROW-9592) - [CI] Update homebrew before calling brew bundle
-* [ARROW-9596](https://issues.apache.org/jira/browse/ARROW-9596) - [CI][Crossbow] Fix homebrew-cpp again, again
-* [ARROW-9597](https://issues.apache.org/jira/browse/ARROW-9597) - [C++] AddAlias in compute::FunctionRegistry should be synchronized
-* [ARROW-9598](https://issues.apache.org/jira/browse/ARROW-9598) - [C++][Parquet] Spaced definition levels are not assigned correctly
-* [ARROW-9599](https://issues.apache.org/jira/browse/ARROW-9599) - [CI] Appveyor toolchain build fails because CMake detects different C and C++ compilers
-* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build
-* [ARROW-9602](https://issues.apache.org/jira/browse/ARROW-9602) - [R] Improve cmake detection in Linux build
-* [ARROW-9603](https://issues.apache.org/jira/browse/ARROW-9603) - [C++][Parquet] Write Arrow relies on unspecified behavior for nested types
-* [ARROW-9606](https://issues.apache.org/jira/browse/ARROW-9606) - [C++][Dataset] in expressions don't work with \>1 partition levels
-* [ARROW-9609](https://issues.apache.org/jira/browse/ARROW-9609) - [C++] CSV datasets don't materialize virtual columns
-* [ARROW-9621](https://issues.apache.org/jira/browse/ARROW-9621) - [Python] test\_move\_file() fails with fsspec 0.8.0
-* [ARROW-9622](https://issues.apache.org/jira/browse/ARROW-9622) - [Java] ComplexCopier fails if a structvector has a child UnionVector with nulls
-* [ARROW-9628](https://issues.apache.org/jira/browse/ARROW-9628) - [Rust] Clippy PR test failing intermittently on Rust / AMD64 MacOS
-* [ARROW-9629](https://issues.apache.org/jira/browse/ARROW-9629) - [Python] Kartothek integration tests failing due to missing freezegun module
-* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight
-* [ARROW-9642](https://issues.apache.org/jira/browse/ARROW-9642) - [C++] Let MakeBuilder refer to DictionaryType's index\_type when deciding the starting bit width of the indices
-* [ARROW-9643](https://issues.apache.org/jira/browse/ARROW-9643) - [C++] Illegal instruction on Haswell CPU
-* [ARROW-9644](https://issues.apache.org/jira/browse/ARROW-9644) - [C++][Dataset] Do not check for ignore\_prefixes in the base path
-* [ARROW-9652](https://issues.apache.org/jira/browse/ARROW-9652) - [Rust][DataFusion] Panic trying to select \* from a CSV (panicked at 'index out of bounds: the len is 0 but the index is 0)
-* [ARROW-9653](https://issues.apache.org/jira/browse/ARROW-9653) - [Rust][DataFusion] Multi-column Group by: Invalid Argument Error
-* [ARROW-9659](https://issues.apache.org/jira/browse/ARROW-9659) - [C++] RecordBatchStreamReader throws on CUDA device buffers
-* [ARROW-9660](https://issues.apache.org/jira/browse/ARROW-9660) - [C++] IPC - dictionaries in maps
-* [ARROW-9666](https://issues.apache.org/jira/browse/ARROW-9666) - [Python][wheel][Windows] Library missing failure caused by ARROW-9412
-* [ARROW-9670](https://issues.apache.org/jira/browse/ARROW-9670) - [C++][FlightRPC] Close()ing a DoPut with an ongoing read locks up the client
-* [ARROW-9684](https://issues.apache.org/jira/browse/ARROW-9684) - [C++] Fix undefined behaviour on invalid IPC / Parquet input (OSS-Fuzz)
-* [ARROW-9692](https://issues.apache.org/jira/browse/ARROW-9692) - [Python] distutils import warning
-* [ARROW-9693](https://issues.apache.org/jira/browse/ARROW-9693) - [CI][Docs] Nightly docs build fails
-* [ARROW-9696](https://issues.apache.org/jira/browse/ARROW-9696) - [Rust] [DataFusion] Nested binary expressions broken
-* [ARROW-9698](https://issues.apache.org/jira/browse/ARROW-9698) - [C++] Revert "Add -NDEBUG flag to arrow.pc"
-* [ARROW-9700](https://issues.apache.org/jira/browse/ARROW-9700) - [Python] create\_library\_symlinks doesn't work on macOS
-* [ARROW-9712](https://issues.apache.org/jira/browse/ARROW-9712) - [Rust] [DataFusion] ParquetScanExec panics on error
-* [ARROW-9714](https://issues.apache.org/jira/browse/ARROW-9714) - [Rust] [DataFusion] TypeCoercionRule not implemented for Limit or Sort
-* [ARROW-9716](https://issues.apache.org/jira/browse/ARROW-9716) - [Rust] [DataFusion] MergeExec should have concurrency limit
-* [ARROW-9726](https://issues.apache.org/jira/browse/ARROW-9726) - [Rust] [DataFusion] ParquetScanExec launches threads too early
-* [ARROW-9727](https://issues.apache.org/jira/browse/ARROW-9727) - [C++] Fix crash on invalid IPC input (OSS-Fuzz)
-* [ARROW-9729](https://issues.apache.org/jira/browse/ARROW-9729) - [Java] Error Prone causes other annotation processors to not work with Eclipse
-* [ARROW-9733](https://issues.apache.org/jira/browse/ARROW-9733) - [Rust][DataFusion] Aggregates COUNT/MIN/MAX don't work on VARCHAR columns
-* [ARROW-9734](https://issues.apache.org/jira/browse/ARROW-9734) - [Rust] [DataFusion] TableProvider.scan executing partitions prematurely
-* [ARROW-9741](https://issues.apache.org/jira/browse/ARROW-9741) - [Rust] [DataFusion] Incorrect count in TPC-H query 1 result set
-* [ARROW-9743](https://issues.apache.org/jira/browse/ARROW-9743) - [R] Sanitize paths in open\_dataset
-* [ARROW-9744](https://issues.apache.org/jira/browse/ARROW-9744) - [Python] Failed to install on aarch64
-* [ARROW-9764](https://issues.apache.org/jira/browse/ARROW-9764) - [CI][Java] Push wrong Docker image
-* [ARROW-9768](https://issues.apache.org/jira/browse/ARROW-9768) - [Python] Pyarrow allows for unsafe conversions of datetime objects to timestamp nanoseconds
-* [ARROW-9778](https://issues.apache.org/jira/browse/ARROW-9778) - [Rust] [DataFusion] Logical and physical schemas' nullability does not match in 8 out of 20 end-to-end tests
-* [ARROW-9783](https://issues.apache.org/jira/browse/ARROW-9783) - [Rust] [DataFusion] Logical aggregate expressions require explicit data type
-* [ARROW-9785](https://issues.apache.org/jira/browse/ARROW-9785) - [Python] pyarrow/tests/test\_fs.py::test\_s3\_options too slow
-* [ARROW-9789](https://issues.apache.org/jira/browse/ARROW-9789) - [C++] Don't install jemalloc in parallel
-* [ARROW-9790](https://issues.apache.org/jira/browse/ARROW-9790) - [Rust] [Parquet] ParquetFileArrowReader fails to decode all pages if batches fall exactly on row group boundaries
-* [ARROW-9793](https://issues.apache.org/jira/browse/ARROW-9793) - [Rust] [DataFusion] Tests failing in master
-* [ARROW-9797](https://issues.apache.org/jira/browse/ARROW-9797) - [Rust] AMD64 Conda Integration Tests is failing for the Master branch
-* [ARROW-9799](https://issues.apache.org/jira/browse/ARROW-9799) - [Rust] [DataFusion] Implementation of physical binary expression get\_type method is incorrect
-* [ARROW-9800](https://issues.apache.org/jira/browse/ARROW-9800) - [Rust] [Parquet] "min" and "max" written to standard out when writing columns
-* [ARROW-9809](https://issues.apache.org/jira/browse/ARROW-9809) - [Rust] [DataFusion] logical schema = physical schema is not true
-* [ARROW-9814](https://issues.apache.org/jira/browse/ARROW-9814) - [Python] Crash in test\_parquet.py::test\_read\_partitioned\_directory\_s3fs
-* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs
-* [ARROW-9816](https://issues.apache.org/jira/browse/ARROW-9816) - [C++] Escape quotes in config.h
-* [ARROW-9827](https://issues.apache.org/jira/browse/ARROW-9827) - [Python] pandas.read\_parquet fails for wide parquet files and pyarrow 1.0.X
-* [ARROW-9831](https://issues.apache.org/jira/browse/ARROW-9831) - [Rust] [DataFusion] Fix compilation error
-* [ARROW-9840](https://issues.apache.org/jira/browse/ARROW-9840) - [Python] Python fs documentation out of date with code
-* [ARROW-9846](https://issues.apache.org/jira/browse/ARROW-9846) - [Rust] Master branch broken build
-* [ARROW-9851](https://issues.apache.org/jira/browse/ARROW-9851) - [C++] Valgrind errors due to unrecognized instructions
-* [ARROW-9852](https://issues.apache.org/jira/browse/ARROW-9852) - [C++] Fix crash on invalid IPC input (OSS-Fuzz)
-* [ARROW-9855](https://issues.apache.org/jira/browse/ARROW-9855) - [R] Fix bad merge/Rcpp conflict
-* [ARROW-9859](https://issues.apache.org/jira/browse/ARROW-9859) - [C++] S3 FileSystemFromUri with special char in secret key fails
-* [ARROW-9864](https://issues.apache.org/jira/browse/ARROW-9864) - [Python] pathlib.Path not supported in write\_to\_dataset with partition columns
-* [ARROW-9874](https://issues.apache.org/jira/browse/ARROW-9874) - [C++] NewStreamWriter / NewFileWriter don't own output stream
-* [ARROW-9876](https://issues.apache.org/jira/browse/ARROW-9876) - [CI][C++] Travis ARM jobs timeout
-* [ARROW-9877](https://issues.apache.org/jira/browse/ARROW-9877) - [C++][CI] homebrew-cpp fails due to avx512
-* [ARROW-9879](https://issues.apache.org/jira/browse/ARROW-9879) - [Python] ChunkedArray.\_\_getitem\_\_ doesn't work with numpy scalars
-* [ARROW-9882](https://issues.apache.org/jira/browse/ARROW-9882) - [C++/Python] Update conda-forge-pinning to 3 for OSX conda packages
-* [ARROW-9883](https://issues.apache.org/jira/browse/ARROW-9883) - [R] Fix linuxlibs.R install script for R < 3.6
-* [ARROW-9888](https://issues.apache.org/jira/browse/ARROW-9888) - [Rust] [DataFusion] ExecutionContext cannot be shared between threads
-* [ARROW-9889](https://issues.apache.org/jira/browse/ARROW-9889) - [Rust][DataFusion] DataFusion CLI: CREATE EXTERNAL TABLE errors with "Unsupported logical plan variant"
-* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern
-* [ARROW-9906](https://issues.apache.org/jira/browse/ARROW-9906) - [Python] Crash in test\_parquet.py::test\_parquet\_writer\_filesystem\_s3\_uri (closing NativeFile from S3FileSystem)
-* [ARROW-9913](https://issues.apache.org/jira/browse/ARROW-9913) - [C++] Outputs of Decimal128::FromString depend on presence of one another
-* [ARROW-9920](https://issues.apache.org/jira/browse/ARROW-9920) - [Python] pyarrow.concat\_arrays segfaults when passing it a chunked array
-* [ARROW-9922](https://issues.apache.org/jira/browse/ARROW-9922) - [Rust] Add \`try\_from(Vec<Option<(&str, ArrayRef)\>\>)\` to StructArray
-* [ARROW-9924](https://issues.apache.org/jira/browse/ARROW-9924) - [Python] Performance regression reading individual Parquet files using Dataset interface
-* [ARROW-9931](https://issues.apache.org/jira/browse/ARROW-9931) - [C++] Fix undefined behaviour on invalid IPC (OSS-Fuzz)
-* [ARROW-9932](https://issues.apache.org/jira/browse/ARROW-9932) - [R] Arrow 1.0.1 R package fails to install on R 3.4 on Linux
-* [ARROW-9936](https://issues.apache.org/jira/browse/ARROW-9936) - [Python] Fix / test relative file paths in pyarrow.parquet
-* [ARROW-9937](https://issues.apache.org/jira/browse/ARROW-9937) - [Rust] [DataFusion] Average is not correct
-* [ARROW-9943](https://issues.apache.org/jira/browse/ARROW-9943) - [C++] Arrow metadata not applied recursively when reading Parquet file
-* [ARROW-9946](https://issues.apache.org/jira/browse/ARROW-9946) - [R] ParquetFileWriter segfaults when \`sink\` is a string
-* [ARROW-9953](https://issues.apache.org/jira/browse/ARROW-9953) - [R] Declare minimum version for bit64
-* [ARROW-9962](https://issues.apache.org/jira/browse/ARROW-9962) - [Python] Conversion to pandas with index column using fixed timezone fails
-* [ARROW-9968](https://issues.apache.org/jira/browse/ARROW-9968) - [C++] UBSAN link failure with \_\_int8\_t
-* [ARROW-9969](https://issues.apache.org/jira/browse/ARROW-9969) - [C++] RecordBatchBuilder yields invalid result with dictionary fields
-* [ARROW-9970](https://issues.apache.org/jira/browse/ARROW-9970) - [Go] checkptr failures in sum methods
-* [ARROW-9972](https://issues.apache.org/jira/browse/ARROW-9972) - [CI] Work around grpc-re2 clash on Homebrew
-* [ARROW-9973](https://issues.apache.org/jira/browse/ARROW-9973) - [Java] JDBC DateConsumer does not allow dates before epoch
-* [ARROW-9976](https://issues.apache.org/jira/browse/ARROW-9976) - [Python] ArrowCapacityError when doing Table.from\_pandas with large dataframe
-* [ARROW-9990](https://issues.apache.org/jira/browse/ARROW-9990) - [Rust] [DataFusion] NOT is not plannable
-* [ARROW-9993](https://issues.apache.org/jira/browse/ARROW-9993) - [Python] Tzinfo - string roundtrip fails on pytz.StaticTzInfo objects
-* [ARROW-9994](https://issues.apache.org/jira/browse/ARROW-9994) - [C++][Python] Auto-chunking nested arrays containing binary-like fields results in malformed output
-* [ARROW-9996](https://issues.apache.org/jira/browse/ARROW-9996) - [C++] Dictionary is unset when calling DictionaryArray.GetScalar for null values
-* [ARROW-10003](https://issues.apache.org/jira/browse/ARROW-10003) - [C++] Create directories in CopyFiles when copying within the same filesystem
-* [ARROW-10008](https://issues.apache.org/jira/browse/ARROW-10008) - [Python] pyarrow.parquet.read\_table fails with predicate pushdown on categorical data with use\_legacy\_dataset=False
-* [ARROW-10011](https://issues.apache.org/jira/browse/ARROW-10011) - [C++] Make FindRE2.cmake re-entrant
-* [ARROW-10012](https://issues.apache.org/jira/browse/ARROW-10012) - [C++] Sporadic failures in CopyFiles test
-* [ARROW-10013](https://issues.apache.org/jira/browse/ARROW-10013) - [C++][CI] Flight test failure in TestFlightClient.GenericOptions
-* [ARROW-10017](https://issues.apache.org/jira/browse/ARROW-10017) - [Java] LargeMemoryUtil.checkedCastToInt has buggy logic
-* [ARROW-10022](https://issues.apache.org/jira/browse/ARROW-10022) - [C++] [Compute] Core dump in some scalar arithmetic benchmarks
-* [ARROW-10027](https://issues.apache.org/jira/browse/ARROW-10027) - [Python] Incorrect null column returned when using a dataset filter expression.
-* [ARROW-10034](https://issues.apache.org/jira/browse/ARROW-10034) - [Rust] Master build broken
-* [ARROW-10041](https://issues.apache.org/jira/browse/ARROW-10041) - [Rust] Possible to create LargeStringArray with DataType::Utf8
-* [ARROW-10047](https://issues.apache.org/jira/browse/ARROW-10047) - [CI] Conda integration tests failing with cmake error
-* [ARROW-10048](https://issues.apache.org/jira/browse/ARROW-10048) - [Rust] Error in aggregate of min/max for strings
-* [ARROW-10049](https://issues.apache.org/jira/browse/ARROW-10049) - [C++/Python] Sync conda recipe with conda-forge
-* [ARROW-10060](https://issues.apache.org/jira/browse/ARROW-10060) - [Rust] [DataFusion] MergeExec currently discards partitions with errors
-* [ARROW-10062](https://issues.apache.org/jira/browse/ARROW-10062) - [Rust] Fix null elements in DoubleEndedIter for DictArray
-* [ARROW-10073](https://issues.apache.org/jira/browse/ARROW-10073) - [Python] Test test\_parquet\_nested\_storage relies on dict item ordering
-* [ARROW-10081](https://issues.apache.org/jira/browse/ARROW-10081) - [C++/Python] Fix bash syntax in drone.io conda builds
-* [ARROW-10085](https://issues.apache.org/jira/browse/ARROW-10085) - [C++] S3 tests fail on AppVeyor
-* [ARROW-10087](https://issues.apache.org/jira/browse/ARROW-10087) - [CI] Fix nightly docs job
-* [ARROW-10098](https://issues.apache.org/jira/browse/ARROW-10098) - [R][Doc] Fix copy\_files doc mismatch
-* [ARROW-10104](https://issues.apache.org/jira/browse/ARROW-10104) - [Python] Separate tests into their own conda package
-* [ARROW-10114](https://issues.apache.org/jira/browse/ARROW-10114) - [R] Segfault in to\_dataframe\_parallel with deeply nested structs
-* [ARROW-10116](https://issues.apache.org/jira/browse/ARROW-10116) - [Python][Packaging] Fix gRPC linking error in macOS wheels builds
-* [ARROW-10119](https://issues.apache.org/jira/browse/ARROW-10119) - [C++] Fix Parquet crashes on invalid input (OSS-Fuzz)
-* [ARROW-10121](https://issues.apache.org/jira/browse/ARROW-10121) - [C++][Python] Variable dictionaries do not survive roundtrip to IPC stream
-* [ARROW-10124](https://issues.apache.org/jira/browse/ARROW-10124) - [R] Write functions don't follow umask setting
-* [ARROW-10125](https://issues.apache.org/jira/browse/ARROW-10125) - [R] Int64 downcast check doesn't consider all chunks
-* [ARROW-10130](https://issues.apache.org/jira/browse/ARROW-10130) - [C++][Dataset] ParquetFileFragment::SplitByRowGroup does not preserve "complete\_metadata" status
-* [ARROW-10136](https://issues.apache.org/jira/browse/ARROW-10136) - [Rust][Arrow] Nulls are transformed into "" after filtering for StringArray
-* [ARROW-10137](https://issues.apache.org/jira/browse/ARROW-10137) - [R] Fix cpp helper that breaks if libarrow is not present
-* [ARROW-10147](https://issues.apache.org/jira/browse/ARROW-10147) - [Python] Constructing pandas metadata fails if an Index name is not JSON-serializable by default
-* [ARROW-10150](https://issues.apache.org/jira/browse/ARROW-10150) - [C++] Fix crashes on invalid Parquet file (OSS-Fuzz)
-* [ARROW-10169](https://issues.apache.org/jira/browse/ARROW-10169) - [Rust] Nulls should be rendered as "" rather than default value when pretty printing arrays
-* [ARROW-10175](https://issues.apache.org/jira/browse/ARROW-10175) - [CI] Nightly hdfs integration test job fails
-* [ARROW-10176](https://issues.apache.org/jira/browse/ARROW-10176) - [CI] Nightly valgrind job fails
-* [ARROW-10178](https://issues.apache.org/jira/browse/ARROW-10178) - [CI] Fix spark master integration test build setup
-* [ARROW-10179](https://issues.apache.org/jira/browse/ARROW-10179) - [Rust] Labeler is not labeling
-* [ARROW-10181](https://issues.apache.org/jira/browse/ARROW-10181) - [Rust] Arrow tests fail to compile on Raspberry Pi (32 bit)
-* [ARROW-10188](https://issues.apache.org/jira/browse/ARROW-10188) - [Rust] [DataFusion] Some examples are broken
-* [ARROW-10189](https://issues.apache.org/jira/browse/ARROW-10189) - [Doc] C data interface example for i32 uses \`l\`, not \`i\`, in the format
-* [ARROW-10192](https://issues.apache.org/jira/browse/ARROW-10192) - [C++][Python] Segfault when converting nested struct array with dictionary field to pandas series
-* [ARROW-10193](https://issues.apache.org/jira/browse/ARROW-10193) - [Python] Segfault when converting to fixed size binary array
-* [ARROW-10200](https://issues.apache.org/jira/browse/ARROW-10200) - [Java][CI] Fix failure of Java CI on s390x
-* [ARROW-10204](https://issues.apache.org/jira/browse/ARROW-10204) - [Rust] [DataFusion] Test failure in aggregate\_grouped\_empty with simd feature enabled
-* [ARROW-10214](https://issues.apache.org/jira/browse/ARROW-10214) - [Python] UnicodeDecodeError when printing schema with binary metadata
-* [ARROW-10226](https://issues.apache.org/jira/browse/ARROW-10226) - [Rust] [Parquet] Parquet reader reading wrong columns in some batches within a parquet file
-* [ARROW-10230](https://issues.apache.org/jira/browse/ARROW-10230) - [JS][Doc] JavaScript documentation fails to build
-* [ARROW-10232](https://issues.apache.org/jira/browse/ARROW-10232) - FixedSizeListArray is incorrectly written/read to/from parquet
-* [ARROW-10234](https://issues.apache.org/jira/browse/ARROW-10234) - [C++][Gandiva] Fix logic of round() for floats/decimals in Gandiva
-* [ARROW-10237](https://issues.apache.org/jira/browse/ARROW-10237) - [C++] Duplicate values in a dictionary result in corrupted parquet
-* [ARROW-10238](https://issues.apache.org/jira/browse/ARROW-10238) - [C\#] List<Struct\> is broken
-* [ARROW-10239](https://issues.apache.org/jira/browse/ARROW-10239) - [C++] aws-sdk-cpp apparently requires zlib too
-* [ARROW-10244](https://issues.apache.org/jira/browse/ARROW-10244) - [Python][Docs] Add docs on using pyarrow.dataset.parquet\_dataset
-* [ARROW-10248](https://issues.apache.org/jira/browse/ARROW-10248) - [C++][Dataset] Dataset writing does not write schema metadata
-* [ARROW-10262](https://issues.apache.org/jira/browse/ARROW-10262) - [C++] Some TypeClass in Scalar classes seem incorrect
-* [ARROW-10271](https://issues.apache.org/jira/browse/ARROW-10271) - [Rust] packed\_simd is broken and development continues under a new project
-* [ARROW-10279](https://issues.apache.org/jira/browse/ARROW-10279) - [Release][Python] Fix verification script to align with the new macos wheel platform tags
-* [ARROW-10280](https://issues.apache.org/jira/browse/ARROW-10280) - [Packaging][Python] Fix macOS wheel artifact patterns
-* [ARROW-10281](https://issues.apache.org/jira/browse/ARROW-10281) - [Python] Fix warnings when running tests
-* [ARROW-10284](https://issues.apache.org/jira/browse/ARROW-10284) - [Python] Pyarrow is raising deprecation warning about filesystems on import
-* [ARROW-10285](https://issues.apache.org/jira/browse/ARROW-10285) - [Python] pyarrow.orc submodule is using deprecated functionality
-* [ARROW-10286](https://issues.apache.org/jira/browse/ARROW-10286) - [C++][Flight] Misleading CMake errors
-* [ARROW-10288](https://issues.apache.org/jira/browse/ARROW-10288) - [C++] Compilation fails on i386
-* [ARROW-10290](https://issues.apache.org/jira/browse/ARROW-10290) - [C++] List POP\_BACK is not available in older CMake versions
-
-
-## New Features and Improvements
-
-* [ARROW-983](https://issues.apache.org/jira/browse/ARROW-983) - [C++] Implement InputStream and OutputStream classes for interacting with socket connections
-* [ARROW-1105](https://issues.apache.org/jira/browse/ARROW-1105) - [C++] SQLite record batch reader
-* [ARROW-1509](https://issues.apache.org/jira/browse/ARROW-1509) - [Python] Write serialized object as a stream of encapsulated IPC messages
-* [ARROW-1669](https://issues.apache.org/jira/browse/ARROW-1669) - [C++] Consider adding Abseil (Google C++11 standard library extensions) to toolchain
-* [ARROW-1797](https://issues.apache.org/jira/browse/ARROW-1797) - [C++] Implement binary arithmetic kernels for numeric arrays
-* [ARROW-2164](https://issues.apache.org/jira/browse/ARROW-2164) - [C++] Clean up unnecessary decimal module refs
-* [ARROW-3080](https://issues.apache.org/jira/browse/ARROW-3080) - [Python] Unify Arrow to Python object conversion paths
-* [ARROW-3757](https://issues.apache.org/jira/browse/ARROW-3757) - [R] R bindings for Flight RPC client
-* [ARROW-3872](https://issues.apache.org/jira/browse/ARROW-3872) - [R] Add ad hoc test of feather compatibility
-* [ARROW-4046](https://issues.apache.org/jira/browse/ARROW-4046) - [Python/CI] Exercise large memory tests
-* [ARROW-4248](https://issues.apache.org/jira/browse/ARROW-4248) - [C++][Plasma] Build on Windows / Visual Studio
-* [ARROW-4685](https://issues.apache.org/jira/browse/ARROW-4685) - [C++] Update Boost to 1.69 in manylinux1 docker image
-* [ARROW-4927](https://issues.apache.org/jira/browse/ARROW-4927) - [Rust] Update top level README to describe current functionality
-* [ARROW-4957](https://issues.apache.org/jira/browse/ARROW-4957) - [Rust] [DataFusion] Implement get\_supertype correctly
-* [ARROW-4965](https://issues.apache.org/jira/browse/ARROW-4965) - [Python] Timestamp array type detection should use tzname of datetime.datetime objects
-* [ARROW-5034](https://issues.apache.org/jira/browse/ARROW-5034) - [C\#] ArrowStreamWriter should expose synchronous Write methods
-* [ARROW-5123](https://issues.apache.org/jira/browse/ARROW-5123) - [Rust] derive RecordWriter from struct definitions
-* [ARROW-6075](https://issues.apache.org/jira/browse/ARROW-6075) - [FlightRPC] Handle uncaught exceptions in middleware
-* [ARROW-6281](https://issues.apache.org/jira/browse/ARROW-6281) - [Python] Produce chunked arrays for nested types in pyarrow.array
-* [ARROW-6282](https://issues.apache.org/jira/browse/ARROW-6282) - [Format] Support lossy compression
-* [ARROW-6437](https://issues.apache.org/jira/browse/ARROW-6437) - [R] Add AWS SDK to system dependencies for macOS and Windows
-* [ARROW-6535](https://issues.apache.org/jira/browse/ARROW-6535) - [C++] Status::WithMessage should accept variadic parameters
-* [ARROW-6537](https://issues.apache.org/jira/browse/ARROW-6537) - [R] Pass column\_types to CSV reader
-* [ARROW-6972](https://issues.apache.org/jira/browse/ARROW-6972) - [C\#] Should support StructField arrays
-* [ARROW-6982](https://issues.apache.org/jira/browse/ARROW-6982) - [R] Add bindings for compare and boolean kernels
-* [ARROW-7136](https://issues.apache.org/jira/browse/ARROW-7136) - [Rust][CI] Pre-install the rust dependencies in the dockerfile
-* [ARROW-7218](https://issues.apache.org/jira/browse/ARROW-7218) - [Python] Conversion from boolean numpy scalars not working
-* [ARROW-7302](https://issues.apache.org/jira/browse/ARROW-7302) - [C++] CSV: allow converting a column to a specific dictionary type
-* [ARROW-7372](https://issues.apache.org/jira/browse/ARROW-7372) - [C++] Allow creating dictionary array from simple JSON
-* [ARROW-7871](https://issues.apache.org/jira/browse/ARROW-7871) - [Python] Expose more compute kernels
-* [ARROW-7960](https://issues.apache.org/jira/browse/ARROW-7960) - [C++][Parquet] Add support for schema translation from parquet nodes back to arrow for missing types
-* [ARROW-8001](https://issues.apache.org/jira/browse/ARROW-8001) - [R][Dataset] Bindings for dataset writing
-* [ARROW-8002](https://issues.apache.org/jira/browse/ARROW-8002) - [C++][Dataset] Dataset writing should let you (re)partition the data
-* [ARROW-8048](https://issues.apache.org/jira/browse/ARROW-8048) - [Python] Run memory leak tests nightly as follow up to ARROW-4120
-* [ARROW-8172](https://issues.apache.org/jira/browse/ARROW-8172) - [C++] ArrayFromJSON for dictionary arrays
-* [ARROW-8205](https://issues.apache.org/jira/browse/ARROW-8205) - [Rust] [DataFusion] DataFusion should enforce unique field names in a schema
-* [ARROW-8253](https://issues.apache.org/jira/browse/ARROW-8253) - [Rust] [DataFusion] Improve ergonomics of registering UDFs
-* [ARROW-8262](https://issues.apache.org/jira/browse/ARROW-8262) - [Rust] [DataFusion] Add example that uses LogicalPlanBuilder
-* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer
-* [ARROW-8296](https://issues.apache.org/jira/browse/ARROW-8296) - [C++][Dataset] IpcFileFormat should support writing files with compressed buffers
-* [ARROW-8355](https://issues.apache.org/jira/browse/ARROW-8355) - [Python] Reduce the number of pandas dependent test cases in test\_feather
-* [ARROW-8359](https://issues.apache.org/jira/browse/ARROW-8359) - [C++/Python] Enable aarch64/ppc64le build in conda recipes
-* [ARROW-8383](https://issues.apache.org/jira/browse/ARROW-8383) - [Rust] Easier random access to DictionaryArray keys and values
-* [ARROW-8402](https://issues.apache.org/jira/browse/ARROW-8402) - [Java] Support ValidateFull methods in Java
-* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata when writing parquet
-* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types
-* [ARROW-8493](https://issues.apache.org/jira/browse/ARROW-8493) - [C++] Create unified schema resolution code for Array reconstruction.
-* [ARROW-8494](https://issues.apache.org/jira/browse/ARROW-8494) - [C++] Implement basic array-by-array reassembly logic
-* [ARROW-8581](https://issues.apache.org/jira/browse/ARROW-8581) - [C\#] Date32/64Array.Builder should accept DateTime, not DateTimeOffset
-* [ARROW-8601](https://issues.apache.org/jira/browse/ARROW-8601) - [Go][Flight] Implement Flight Writer interface
-* [ARROW-8618](https://issues.apache.org/jira/browse/ARROW-8618) - [C++] ASSIGN\_OR\_RAISE should move its argument
-* [ARROW-8678](https://issues.apache.org/jira/browse/ARROW-8678) - [C++][Parquet] Remove legacy arrow to level translation.
-* [ARROW-8712](https://issues.apache.org/jira/browse/ARROW-8712) - [R] Expose strptime timestamp parsing in read\_csv conversion options
-* [ARROW-8774](https://issues.apache.org/jira/browse/ARROW-8774) - [Rust] [DataFusion] Improve threading model
-* [ARROW-8810](https://issues.apache.org/jira/browse/ARROW-8810) - [R] Add documentation about Parquet format, appending to stream format
-* [ARROW-8824](https://issues.apache.org/jira/browse/ARROW-8824) - [Rust] [DataFusion] Implement new SQL parser
-* [ARROW-8828](https://issues.apache.org/jira/browse/ARROW-8828) - [Rust] Implement SQL tokenizer
-* [ARROW-8829](https://issues.apache.org/jira/browse/ARROW-8829) - [Rust] Implement SQL parser
-* [ARROW-9010](https://issues.apache.org/jira/browse/ARROW-9010) - [Java] Framework and interface changes for RecordBatch IPC buffer compression
-* [ARROW-9065](https://issues.apache.org/jira/browse/ARROW-9065) - [C++] Support parsing date32 in dataset partition folders
-* [ARROW-9068](https://issues.apache.org/jira/browse/ARROW-9068) - [C++][Dataset] Simplify Partitioning interface
-* [ARROW-9078](https://issues.apache.org/jira/browse/ARROW-9078) - [C++] Parquet writing of extension type with nested storage type fails
-* [ARROW-9104](https://issues.apache.org/jira/browse/ARROW-9104) - [C++] Parquet encryption tests should write files to a temporary directory instead of the testing submodule's directory
-* [ARROW-9107](https://issues.apache.org/jira/browse/ARROW-9107) - [C++][Dataset] Time-based types support
-* [ARROW-9147](https://issues.apache.org/jira/browse/ARROW-9147) - [C++][Dataset] Support null -\> other type promotion in Dataset scanning
-* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst
-* [ARROW-9266](https://issues.apache.org/jira/browse/ARROW-9266) - [Python][Packaging] Enable S3 support in macOS wheels
-* [ARROW-9271](https://issues.apache.org/jira/browse/ARROW-9271) - [R] Preserve data frame metadata in round trip
-* [ARROW-9286](https://issues.apache.org/jira/browse/ARROW-9286) - [C++] Add function "aliases" to compute::FunctionRegistry
-* [ARROW-9328](https://issues.apache.org/jira/browse/ARROW-9328) - [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string
-* [ARROW-9338](https://issues.apache.org/jira/browse/ARROW-9338) - [Rust] Add instructions for running clippy locally
-* [ARROW-9344](https://issues.apache.org/jira/browse/ARROW-9344) - [C++][Flight] measure latency quantile in flight benchmark
-* [ARROW-9358](https://issues.apache.org/jira/browse/ARROW-9358) - [Integration] Reconsider generated\_large\_batch.json
-* [ARROW-9371](https://issues.apache.org/jira/browse/ARROW-9371) - [Java] Run vector tests for both allocators
-* [ARROW-9377](https://issues.apache.org/jira/browse/ARROW-9377) - [Java] Support unsigned dictionary indices
-* [ARROW-9387](https://issues.apache.org/jira/browse/ARROW-9387) - [R] Use new C++ table select method
-* [ARROW-9388](https://issues.apache.org/jira/browse/ARROW-9388) - [C++] Division kernels
-* [ARROW-9394](https://issues.apache.org/jira/browse/ARROW-9394) - [Python] Support pickling of Scalars
-* [ARROW-9398](https://issues.apache.org/jira/browse/ARROW-9398) - [C++] Register the SIMD sum variants under function instance instead of a SIMD function
-* [ARROW-9402](https://issues.apache.org/jira/browse/ARROW-9402) - [C++] Add portable wrappers for \_\_builtin\_add\_overflow and friends
-* [ARROW-9405](https://issues.apache.org/jira/browse/ARROW-9405) - [R] Switch to cpp11
-* [ARROW-9412](https://issues.apache.org/jira/browse/ARROW-9412) - [C++] Add non-BUNDLED dependencies to exported INSTALL\_INTERFACE\_LIBS of arrow\_static and test that it works
-* [ARROW-9429](https://issues.apache.org/jira/browse/ARROW-9429) - [Python] ChunkedArray.to\_numpy
-* [ARROW-9454](https://issues.apache.org/jira/browse/ARROW-9454) - [GLib] Add binding of some dictionary builders
-* [ARROW-9465](https://issues.apache.org/jira/browse/ARROW-9465) - [Python] Improve ergonomics of compute functions
-* [ARROW-9469](https://issues.apache.org/jira/browse/ARROW-9469) - [Python] Make more objects weakrefable
-* [ARROW-9487](https://issues.apache.org/jira/browse/ARROW-9487) - [Developer] Cover the archery release utilities with unittests
-* [ARROW-9488](https://issues.apache.org/jira/browse/ARROW-9488) - [Release] Use the new changelog generation when updating the website
-* [ARROW-9507](https://issues.apache.org/jira/browse/ARROW-9507) - [Rust] [DataFusion] PhysicalExpr should implement Display trait
-* [ARROW-9508](https://issues.apache.org/jira/browse/ARROW-9508) - [Release][APT][Yum] Enable verification for arm64 binaries
-* [ARROW-9516](https://issues.apache.org/jira/browse/ARROW-9516) - [Rust][DataFusion] Refactor physical expressions to not care about their names nor indexes
-* [ARROW-9517](https://issues.apache.org/jira/browse/ARROW-9517) - [C++][Python] Allow session\_token argument when initializing S3FileSystem
-* [ARROW-9518](https://issues.apache.org/jira/browse/ARROW-9518) - [Python] Deprecate pyarrow serialization
-* [ARROW-9521](https://issues.apache.org/jira/browse/ARROW-9521) - [Rust] CsvReadOptions should allow file extension to be specified
-* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel
-* [ARROW-9534](https://issues.apache.org/jira/browse/ARROW-9534) - [Rust] [DataFusion] Implement functions for creating literal expressions for all types
-* [ARROW-9550](https://issues.apache.org/jira/browse/ARROW-9550) - [Rust] [DataFusion] Remove Rc<RefCell<\_\>\> from hash aggregate operator
-* [ARROW-9553](https://issues.apache.org/jira/browse/ARROW-9553) - [Rust] Release script doesn't bump parquet crate's arrow dependency version
-* [ARROW-9557](https://issues.apache.org/jira/browse/ARROW-9557) - [R] Iterating over parquet columns is slow in R
-* [ARROW-9559](https://issues.apache.org/jira/browse/ARROW-9559) - [Rust] [DataFusion] Revert privatization of exprlist\_to\_fields
-* [ARROW-9563](https://issues.apache.org/jira/browse/ARROW-9563) - [Dev][Release] Use archery's changelog generator when creating release notes for the website
-* [ARROW-9568](https://issues.apache.org/jira/browse/ARROW-9568) - [CI] Use official msys action on GHA
-* [ARROW-9576](https://issues.apache.org/jira/browse/ARROW-9576) - [Python][Doc] Fix error in code example for extension types
-* [ARROW-9580](https://issues.apache.org/jira/browse/ARROW-9580) - [JS] Docs have superfluous ()
-* [ARROW-9581](https://issues.apache.org/jira/browse/ARROW-9581) - [Dev][Release] Bump next snapshot versions to 2.0.0
-* [ARROW-9582](https://issues.apache.org/jira/browse/ARROW-9582) - [Rust] Implement Array::memory\_size()
-* [ARROW-9585](https://issues.apache.org/jira/browse/ARROW-9585) - [Rust] Remove duplicated to-do line in DataFusion readme
-* [ARROW-9587](https://issues.apache.org/jira/browse/ARROW-9587) - [FlightRPC][Java] Clean up DoPut/FlightStream memory handling
-* [ARROW-9593](https://issues.apache.org/jira/browse/ARROW-9593) - [Python] Add custom pickle reducers for DictionaryScalar
-* [ARROW-9604](https://issues.apache.org/jira/browse/ARROW-9604) - [C++] Add benchmark for aggregate min/max compute kernels
-* [ARROW-9605](https://issues.apache.org/jira/browse/ARROW-9605) - [C++] Optimize performance for aggregate min/max compute kernels
-* [ARROW-9607](https://issues.apache.org/jira/browse/ARROW-9607) - [C++][Gandiva] Add bitwise\_and(), bitwise\_or() and bitwise\_not() functions for integers
-* [ARROW-9608](https://issues.apache.org/jira/browse/ARROW-9608) - [Rust] Remove arrow flight from parquet's feature gating
-* [ARROW-9615](https://issues.apache.org/jira/browse/ARROW-9615) - [Rust] Add kernel to compute length of string array
-* [ARROW-9617](https://issues.apache.org/jira/browse/ARROW-9617) - [Rust] [DataFusion] Add length of string array
-* [ARROW-9618](https://issues.apache.org/jira/browse/ARROW-9618) - [Rust] [DataFusion] Make it easier to write optimizers
-* [ARROW-9619](https://issues.apache.org/jira/browse/ARROW-9619) - [Rust] [DataFusion] Add predicate push-down
-* [ARROW-9632](https://issues.apache.org/jira/browse/ARROW-9632) - [Rust] Add a "new" method for ExecutionContextSchemaProvider
-* [ARROW-9638](https://issues.apache.org/jira/browse/ARROW-9638) - [C++][Compute] Implement mode (most frequent number) kernel
-* [ARROW-9639](https://issues.apache.org/jira/browse/ARROW-9639) - [Ruby] Add dependency version check
-* [ARROW-9640](https://issues.apache.org/jira/browse/ARROW-9640) - [C++][Gandiva] Implement round() for integers and long integers
-* [ARROW-9641](https://issues.apache.org/jira/browse/ARROW-9641) - [C++][Gandiva] Implement round() for floating point and double floating point numbers
-* [ARROW-9645](https://issues.apache.org/jira/browse/ARROW-9645) - [Python] Deprecate the legacy pyarrow.filesystem interface
-* [ARROW-9646](https://issues.apache.org/jira/browse/ARROW-9646) - [C++][Dataset] Add support for writing parquet datasets
-* [ARROW-9650](https://issues.apache.org/jira/browse/ARROW-9650) - [Packaging][APT] Drop support for Ubuntu 19.10
-* [ARROW-9654](https://issues.apache.org/jira/browse/ARROW-9654) - [Rust][DataFusion] Add an EXPLAIN command to the datafusion CLI
-* [ARROW-9656](https://issues.apache.org/jira/browse/ARROW-9656) - [Rust][DataFusion] Slightly confusing error message when unsupported type is provided to CREATE EXTERNAL TABLE
-* [ARROW-9658](https://issues.apache.org/jira/browse/ARROW-9658) - [Python][Dataset] Bindings for dataset writing
-* [ARROW-9665](https://issues.apache.org/jira/browse/ARROW-9665) - [R] head/tail/take for Datasets
-* [ARROW-9667](https://issues.apache.org/jira/browse/ARROW-9667) - [CI][Crossbow] Segfault in 2 nightly R builds
-* [ARROW-9671](https://issues.apache.org/jira/browse/ARROW-9671) - [C++] BasicDecimal128 constructor interprets uint64\_t integers with highest bit set as negative
-* [ARROW-9673](https://issues.apache.org/jira/browse/ARROW-9673) - [Rust] Add a param "dialect" for DFParser::parse\_sql
-* [ARROW-9678](https://issues.apache.org/jira/browse/ARROW-9678) - [Rust] [DataFusion] Improve projection push down to remove unused columns
-* [ARROW-9679](https://issues.apache.org/jira/browse/ARROW-9679) - [Rust] [DataFusion] HashAggregate walks map many times building final batch
-* [ARROW-9681](https://issues.apache.org/jira/browse/ARROW-9681) - [Java] Arrow Memory - Core tests fail on big-endian platform
-* [ARROW-9683](https://issues.apache.org/jira/browse/ARROW-9683) - [Rust][DataFusion] Implement Debug for ExecutionPlan trait
-* [ARROW-9691](https://issues.apache.org/jira/browse/ARROW-9691) - [Rust] [DataFusion] Make sql\_statement\_to\_plan public
-* [ARROW-9695](https://issues.apache.org/jira/browse/ARROW-9695) - [Rust][DataFusion] Improve documentation on LogicalPlan variants
-* [ARROW-9699](https://issues.apache.org/jira/browse/ARROW-9699) - [C++][Compute] Improve mode kernel performance for small integer types
-* [ARROW-9701](https://issues.apache.org/jira/browse/ARROW-9701) - [Java][CI] Add a test job on s390x
-* [ARROW-9702](https://issues.apache.org/jira/browse/ARROW-9702) - [C++] Move bpacking simd to runtime path
-* [ARROW-9703](https://issues.apache.org/jira/browse/ARROW-9703) - [Developer][Archery] Restartable cherry-picking process for creating maintenance branches
-* [ARROW-9706](https://issues.apache.org/jira/browse/ARROW-9706) - [Java] Tests in TestLargeListVector fail on big-endian platform
-* [ARROW-9710](https://issues.apache.org/jira/browse/ARROW-9710) - [C++] Generalize Decimal ToString in preparation for Decimal256
-* [ARROW-9711](https://issues.apache.org/jira/browse/ARROW-9711) - [Rust] Add benchmark based on TPC-H
-* [ARROW-9713](https://issues.apache.org/jira/browse/ARROW-9713) - [Rust][DataFusion] Remove explicit panics
-* [ARROW-9715](https://issues.apache.org/jira/browse/ARROW-9715) - [R] changelog/doc updates for 1.0.1
-* [ARROW-9718](https://issues.apache.org/jira/browse/ARROW-9718) - [Python] Make pyarrow.parquet work with the new filesystem interfaces
-* [ARROW-9721](https://issues.apache.org/jira/browse/ARROW-9721) - [Packaging][Python] Update wheel dependency files
-* [ARROW-9722](https://issues.apache.org/jira/browse/ARROW-9722) - [Rust]: Shorten key lifetime for reverse lookup for dictionary arrays
-* [ARROW-9723](https://issues.apache.org/jira/browse/ARROW-9723) - [C++] Expected behaviour of "mode" kernel with NaNs?
-* [ARROW-9725](https://issues.apache.org/jira/browse/ARROW-9725) - [Rust] [DataFusion] LimitExec and SortExec should use MergeExec
-* [ARROW-9737](https://issues.apache.org/jira/browse/ARROW-9737) - [C++][Gandiva] Add bitwise\_xor() for integers
-* [ARROW-9739](https://issues.apache.org/jira/browse/ARROW-9739) - [CI][Ruby] Don't install gem documents
-* [ARROW-9742](https://issues.apache.org/jira/browse/ARROW-9742) - [Rust] Create one standard DataFrame API
-* [ARROW-9751](https://issues.apache.org/jira/browse/ARROW-9751) - [Rust] [DataFusion] Extend UDFs to accept more than one type per argument
-* [ARROW-9752](https://issues.apache.org/jira/browse/ARROW-9752) - [Rust] [DataFusion] Add support for Aggregate UDFs
-* [ARROW-9753](https://issues.apache.org/jira/browse/ARROW-9753) - [Rust] [DataFusion] Remove the use of Mutex in ExecutionPlan trait
-* [ARROW-9754](https://issues.apache.org/jira/browse/ARROW-9754) - [Rust] [DataFusion] Implement async in DataFusion traits
-* [ARROW-9757](https://issues.apache.org/jira/browse/ARROW-9757) - [Rust] [DataFusion] Use "pub use" to expose a clean public API
-* [ARROW-9758](https://issues.apache.org/jira/browse/ARROW-9758) - [Rust] [DataFusion] Implement extension API for DataFusion
-* [ARROW-9759](https://issues.apache.org/jira/browse/ARROW-9759) - [Rust] [DataFusion] Implement DataFrame::sort
-* [ARROW-9760](https://issues.apache.org/jira/browse/ARROW-9760) - [Rust] [DataFusion] Implement DataFrame::explain
-* [ARROW-9761](https://issues.apache.org/jira/browse/ARROW-9761) - [C++] Add experimental pull-based iterator structures to C interface implementation
-* [ARROW-9762](https://issues.apache.org/jira/browse/ARROW-9762) - [Rust] [DataFusion] ExecutionContext::sql should return DataFrame
-* [ARROW-9769](https://issues.apache.org/jira/browse/ARROW-9769) - [Python] Remove skip for in-memory fsspec in test\_move\_file
-* [ARROW-9775](https://issues.apache.org/jira/browse/ARROW-9775) - [C++] Automatic S3 region selection
-* [ARROW-9781](https://issues.apache.org/jira/browse/ARROW-9781) - [C++] Fix uninitialized value warnings
-* [ARROW-9782](https://issues.apache.org/jira/browse/ARROW-9782) - [C++][Dataset] Ability to write ".feather" files with IpcFileFormat
-* [ARROW-9784](https://issues.apache.org/jira/browse/ARROW-9784) - [Rust] [DataFusion] Improve instructions for running tpch benchmark
-* [ARROW-9786](https://issues.apache.org/jira/browse/ARROW-9786) - [R] Unvendor cpp11 before release
-* [ARROW-9788](https://issues.apache.org/jira/browse/ARROW-9788) - Handle naming inconsistencies between SQL, DataFrame API and struct names
-* [ARROW-9792](https://issues.apache.org/jira/browse/ARROW-9792) - [Rust] [DataFusion] Logical aggregate functions should not return Result
-* [ARROW-9794](https://issues.apache.org/jira/browse/ARROW-9794) - [C++] Add functionality to cpu\_info to discriminate between Intel vs AMD x86
-* [ARROW-9795](https://issues.apache.org/jira/browse/ARROW-9795) - [C++][Gandiva] Implement castTIMESTAMP(int64) in Gandiva
-* [ARROW-9806](https://issues.apache.org/jira/browse/ARROW-9806) - [R] More compute kernel bindings
-* [ARROW-9807](https://issues.apache.org/jira/browse/ARROW-9807) - [R] News update/version bump post-1.0.1
-* [ARROW-9808](https://issues.apache.org/jira/browse/ARROW-9808) - [Python] parquet.read\_table docstring has wrong use\_legacy\_dataset explanation
-* [ARROW-9811](https://issues.apache.org/jira/browse/ARROW-9811) - [C++] Unchecked floating point division by 0 should succeed
-* [ARROW-9813](https://issues.apache.org/jira/browse/ARROW-9813) - [C++] Disable semantic interposition
-* [ARROW-9819](https://issues.apache.org/jira/browse/ARROW-9819) - [C++] Bump mimalloc to 1.6.4
-* [ARROW-9821](https://issues.apache.org/jira/browse/ARROW-9821) - [Rust][DataFusion] User Defined PlanNode / Operator API
-* [ARROW-9823](https://issues.apache.org/jira/browse/ARROW-9823) - [CI][C++][MinGW] Enable S3
-* [ARROW-9832](https://issues.apache.org/jira/browse/ARROW-9832) - [Rust] [DataFusion] Refactor PhysicalPlan to remove Partition
-* [ARROW-9833](https://issues.apache.org/jira/browse/ARROW-9833) - [Rust] [DataFusion] Refactor TableProvider.scan to return ExecutionPlan
-* [ARROW-9834](https://issues.apache.org/jira/browse/ARROW-9834) - [Rust] [DataFusion] Remove Partition trait
-* [ARROW-9835](https://issues.apache.org/jira/browse/ARROW-9835) - [Rust] [DataFusion] Remove FunctionMeta
-* [ARROW-9836](https://issues.apache.org/jira/browse/ARROW-9836) - [Rust] [DataFusion] Improve API for usage of UDFs
-* [ARROW-9837](https://issues.apache.org/jira/browse/ARROW-9837) - [Rust] Add provider for variable
-* [ARROW-9838](https://issues.apache.org/jira/browse/ARROW-9838) - [Rust] [DataFusion] DefaultPhysicalPlanner should insert explicit MergeExec nodes
-* [ARROW-9839](https://issues.apache.org/jira/browse/ARROW-9839) - [Rust] [DataFusion] Add ability to downcast ExecutionPlan to specific operator
-* [ARROW-9841](https://issues.apache.org/jira/browse/ARROW-9841) - [Rust] Update checked-in flatbuffer files
-* [ARROW-9844](https://issues.apache.org/jira/browse/ARROW-9844) - [Go][CI] Add Travis CI job for Go on s390x
-* [ARROW-9845](https://issues.apache.org/jira/browse/ARROW-9845) - [Rust] [Parquet] serde\_json is only used in tests but isn't in dev-dependencies
-* [ARROW-9848](https://issues.apache.org/jira/browse/ARROW-9848) - [Rust] Implement changes to ensure flatbuffer alignment
-* [ARROW-9849](https://issues.apache.org/jira/browse/ARROW-9849) - [Rust] [DataFusion] Make UDFs not need a Field
-* [ARROW-9850](https://issues.apache.org/jira/browse/ARROW-9850) - [Go] Defer should not be used in a loop
-* [ARROW-9853](https://issues.apache.org/jira/browse/ARROW-9853) - [Rust] Implement "take" kernel for dictionary arrays
-* [ARROW-9854](https://issues.apache.org/jira/browse/ARROW-9854) - [R] Support reading/writing data to/from S3
-* [ARROW-9858](https://issues.apache.org/jira/browse/ARROW-9858) - [C++][Python][Docs] Expand user guide for FileSystem
-* [ARROW-9863](https://issues.apache.org/jira/browse/ARROW-9863) - [C++][Parquet] Optimize metadata recovery of ApplicationVersion
-* [ARROW-9867](https://issues.apache.org/jira/browse/ARROW-9867) - [C++][Dataset] FileSystemDataset should expose its filesystem
-* [ARROW-9868](https://issues.apache.org/jira/browse/ARROW-9868) - [C++] Provide utility for copying files between filesystems
-* [ARROW-9869](https://issues.apache.org/jira/browse/ARROW-9869) - [R] Implement full S3FileSystem/S3Options constructor
-* [ARROW-9870](https://issues.apache.org/jira/browse/ARROW-9870) - [R] Friendly interface for filesystems (S3)
-* [ARROW-9871](https://issues.apache.org/jira/browse/ARROW-9871) - [C++] Add uppercase support to ARROW\_USER\_SIMD\_LEVEL.
-* [ARROW-9873](https://issues.apache.org/jira/browse/ARROW-9873) - [C++][Compute] Improve mode kernel for integers within a limited value range
-* [ARROW-9875](https://issues.apache.org/jira/browse/ARROW-9875) - [Python] Let FileSystem.get\_file\_info accept a single path
-* [ARROW-9884](https://issues.apache.org/jira/browse/ARROW-9884) - [R] Bindings for writing datasets to Parquet
-* [ARROW-9885](https://issues.apache.org/jira/browse/ARROW-9885) - [Rust] [DataFusion] Simplify code of type coercion for binary types
-* [ARROW-9886](https://issues.apache.org/jira/browse/ARROW-9886) - [Rust] [DataFusion] Simplify code to test cast
-* [ARROW-9887](https://issues.apache.org/jira/browse/ARROW-9887) - [Rust] [DataFusion] Add support for complex return types of built-in functions
-* [ARROW-9890](https://issues.apache.org/jira/browse/ARROW-9890) - [R] Add zstandard compression codec in macOS build
-* [ARROW-9891](https://issues.apache.org/jira/browse/ARROW-9891) - [Rust] [DataFusion] Make math functions support f32
-* [ARROW-9892](https://issues.apache.org/jira/browse/ARROW-9892) - [Rust] [DataFusion] Add support for concat
-* [ARROW-9893](https://issues.apache.org/jira/browse/ARROW-9893) - [Python] Bindings for writing datasets to Parquet
-* [ARROW-9895](https://issues.apache.org/jira/browse/ARROW-9895) - [Rust] Improve sort kernels
-* [ARROW-9899](https://issues.apache.org/jira/browse/ARROW-9899) - [Rust] [DataFusion] Switch from Box<Schema\> --\> SchemaRef (Arc<Schema\>) to be consistent with the rest of Arrow
-* [ARROW-9900](https://issues.apache.org/jira/browse/ARROW-9900) - [Rust][DataFusion] Use Arc<\> instead of Box<\> in LogicalPlan
-* [ARROW-9901](https://issues.apache.org/jira/browse/ARROW-9901) - [C++] Add hand-crafted Parquet to Arrow reconstruction test for nested reading
-* [ARROW-9902](https://issues.apache.org/jira/browse/ARROW-9902) - [Rust] [DataFusion] Add support for array()
-* [ARROW-9904](https://issues.apache.org/jira/browse/ARROW-9904) - [C++] Unroll the loop manually for CountSetBits
-* [ARROW-9908](https://issues.apache.org/jira/browse/ARROW-9908) - [Rust] Support temporal data types in JSON reader
-* [ARROW-9910](https://issues.apache.org/jira/browse/ARROW-9910) - [Rust] [DataFusion] Type coercion of Variadic is wrong
-* [ARROW-9914](https://issues.apache.org/jira/browse/ARROW-9914) - [Rust][DataFusion] Document the SQL -\> Arrow type mapping
-* [ARROW-9916](https://issues.apache.org/jira/browse/ARROW-9916) - [Rust] Avoid cloning ArrayData in several places
-* [ARROW-9917](https://issues.apache.org/jira/browse/ARROW-9917) - [Python][Compute] Add bindings for mode kernel
-* [ARROW-9919](https://issues.apache.org/jira/browse/ARROW-9919) - [Rust] [DataFusion] Math functions
-* [ARROW-9921](https://issues.apache.org/jira/browse/ARROW-9921) - [Rust] Add \`from(Vec<Option<&str\>\>)\` to [Large]StringArray
-* [ARROW-9925](https://issues.apache.org/jira/browse/ARROW-9925) - [GLib] Add low level value readers for GArrowListArray family
-* [ARROW-9926](https://issues.apache.org/jira/browse/ARROW-9926) - [GLib] Use placement new for GArrowRecordBatchFileReader
-* [ARROW-9928](https://issues.apache.org/jira/browse/ARROW-9928) - [C++] Speed up integer parsing slightly
-* [ARROW-9929](https://issues.apache.org/jira/browse/ARROW-9929) - [Developer] Autotune cmake-format
-* [ARROW-9933](https://issues.apache.org/jira/browse/ARROW-9933) - [Developer] Add drone as a CI provider for crossbow
-* [ARROW-9934](https://issues.apache.org/jira/browse/ARROW-9934) - [Rust] Shape and stride check in tensor
-* [ARROW-9941](https://issues.apache.org/jira/browse/ARROW-9941) - [Python] Better string representation for extension types
-* [ARROW-9944](https://issues.apache.org/jira/browse/ARROW-9944) - [Rust] Implement TO\_TIMESTAMP function
-* [ARROW-9949](https://issues.apache.org/jira/browse/ARROW-9949) - [C++] Generalize Decimal128::FromString for reuse in Decimal256
-* [ARROW-9950](https://issues.apache.org/jira/browse/ARROW-9950) - [Rust] [DataFusion] Allow UDF usage without registry
-* [ARROW-9952](https://issues.apache.org/jira/browse/ARROW-9952) - [Python] Use pyarrow.dataset writing for pq.write\_to\_dataset
-* [ARROW-9954](https://issues.apache.org/jira/browse/ARROW-9954) - [Rust] [DataFusion] Simplify code of aggregate planning
-* [ARROW-9956](https://issues.apache.org/jira/browse/ARROW-9956) - [C++][Gandiva] Implement Binary string function in Gandiva
-* [ARROW-9957](https://issues.apache.org/jira/browse/ARROW-9957) - [Rust] Remove unmaintained tempdir dependency
-* [ARROW-9961](https://issues.apache.org/jira/browse/ARROW-9961) - [Rust][DataFusion] to\_timestamp function parses timestamp without timezone offset as UTC rather than local
-* [ARROW-9964](https://issues.apache.org/jira/browse/ARROW-9964) - [C++] CSV date support
-* [ARROW-9965](https://issues.apache.org/jira/browse/ARROW-9965) - [Java] Buffer capacity calculations are slow for fixed-width vectors
-* [ARROW-9966](https://issues.apache.org/jira/browse/ARROW-9966) - [Rust] Speedup aggregate kernels
-* [ARROW-9967](https://issues.apache.org/jira/browse/ARROW-9967) - [Python] Add compute module docs
-* [ARROW-9971](https://issues.apache.org/jira/browse/ARROW-9971) - [Rust] Speedup take
-* [ARROW-9977](https://issues.apache.org/jira/browse/ARROW-9977) - [Rust] Add min/max for [Large]String
-* [ARROW-9979](https://issues.apache.org/jira/browse/ARROW-9979) - [Rust] Fix arrow crate clippy lints
-* [ARROW-9980](https://issues.apache.org/jira/browse/ARROW-9980) - [Rust] Fix parquet crate clippy lints
-* [ARROW-9981](https://issues.apache.org/jira/browse/ARROW-9981) - [Rust] Allow configuring flight IPC with IpcWriteOptions
-* [ARROW-9983](https://issues.apache.org/jira/browse/ARROW-9983) - [C++][Dataset][Python] Use larger default batch size than 32K for Datasets API
-* [ARROW-9984](https://issues.apache.org/jira/browse/ARROW-9984) - [Rust] [DataFusion] DRY up function-to-string conversion
-* [ARROW-9986](https://issues.apache.org/jira/browse/ARROW-9986) - [Rust][DataFusion] TO\_TIMESTAMP function erroneously requires fractional seconds when no timezone is present
-* [ARROW-9987](https://issues.apache.org/jira/browse/ARROW-9987) - [Rust] [DataFusion] Improve docs of \`Expr\`.
-* [ARROW-9988](https://issues.apache.org/jira/browse/ARROW-9988) - [Rust] [DataFusion] Added std::ops to logical expressions
-* [ARROW-9992](https://issues.apache.org/jira/browse/ARROW-9992) - [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
-* [ARROW-9998](https://issues.apache.org/jira/browse/ARROW-9998) - [Python] Support pickling DictionaryScalar
-* [ARROW-9999](https://issues.apache.org/jira/browse/ARROW-9999) - [Python] Support constructing dictionary array directly through pa.array()
-* [ARROW-10000](https://issues.apache.org/jira/browse/ARROW-10000) - [C++][Python] Support constructing StructArray from list of key-value pairs
-* [ARROW-10001](https://issues.apache.org/jira/browse/ARROW-10001) - [Rust] [DataFusion] Add developer guide to README
-* [ARROW-10010](https://issues.apache.org/jira/browse/ARROW-10010) - [Rust] Speedup arithmetic
-* [ARROW-10015](https://issues.apache.org/jira/browse/ARROW-10015) - [Rust] Implement SIMD for aggregate kernel sum
-* [ARROW-10016](https://issues.apache.org/jira/browse/ARROW-10016) - [Rust] [DataFusion] Implement IsNull and IsNotNull
-* [ARROW-10018](https://issues.apache.org/jira/browse/ARROW-10018) - [CI] Disable Sphinx and API documentation build since it takes 6 hours on master
-* [ARROW-10019](https://issues.apache.org/jira/browse/ARROW-10019) - [Rust] Add substring kernel
-* [ARROW-10023](https://issues.apache.org/jira/browse/ARROW-10023) - [Gandiva][C++] Implement split part function in Gandiva
-* [ARROW-10024](https://issues.apache.org/jira/browse/ARROW-10024) - [C++][Parquet] Create nested reading benchmarks
-* [ARROW-10028](https://issues.apache.org/jira/browse/ARROW-10028) - [Rust] Simplify macro def\_numeric\_from\_vec
-* [ARROW-10030](https://issues.apache.org/jira/browse/ARROW-10030) - [Rust] Support fromIter and toIter
-* [ARROW-10035](https://issues.apache.org/jira/browse/ARROW-10035) - [C++] Bump versions of vendored code
-* [ARROW-10037](https://issues.apache.org/jira/browse/ARROW-10037) - [C++] Workaround to force find AWS SDK to look for shared libraries
-* [ARROW-10040](https://issues.apache.org/jira/browse/ARROW-10040) - [Rust] Create a way to slice unaligned offset buffers
-* [ARROW-10043](https://issues.apache.org/jira/browse/ARROW-10043) - [Rust] [DataFusion] Introduce support for DISTINCT by partially implementing COUNT(DISTINCT)
-* [ARROW-10044](https://issues.apache.org/jira/browse/ARROW-10044) - [Rust] Improve README
-* [ARROW-10046](https://issues.apache.org/jira/browse/ARROW-10046) - [Rust] [DataFusion] Made \`\*Iterator\` implement Iterator
-* [ARROW-10050](https://issues.apache.org/jira/browse/ARROW-10050) - [C++][Gandiva] Implement concat() in Gandiva for up to 10 arguments
-* [ARROW-10051](https://issues.apache.org/jira/browse/ARROW-10051) - [C++][Compute] Make aggregate kernel merge state mutable
-* [ARROW-10054](https://issues.apache.org/jira/browse/ARROW-10054) - [Python] Slice methods should return empty arrays instead of crashing
-* [ARROW-10055](https://issues.apache.org/jira/browse/ARROW-10055) - [Rust] Implement DoubleEndedIterator for NullableIter
-* [ARROW-10057](https://issues.apache.org/jira/browse/ARROW-10057) - [C++] Add Parquet-Arrow roundtrip tests for nested data
-* [ARROW-10058](https://issues.apache.org/jira/browse/ARROW-10058) - [C++] Investigate performance of LevelsToBitmap without BMI2
-* [ARROW-10059](https://issues.apache.org/jira/browse/ARROW-10059) - [R][Doc] Give more advice on how to set up C++ build
-* [ARROW-10063](https://issues.apache.org/jira/browse/ARROW-10063) - [Archery][CI] Fetch main branch in archery build only when it is a pull request
-* [ARROW-10064](https://issues.apache.org/jira/browse/ARROW-10064) - [C++] Resolve compile warnings on Apple Clang 12
-* [ARROW-10065](https://issues.apache.org/jira/browse/ARROW-10065) - [Rust] DRY downcasted Arrays
-* [ARROW-10066](https://issues.apache.org/jira/browse/ARROW-10066) - [C++] Make sure that default AWS region is respected
-* [ARROW-10068](https://issues.apache.org/jira/browse/ARROW-10068) - [C++] Add bundled external project for aws-sdk-cpp
-* [ARROW-10069](https://issues.apache.org/jira/browse/ARROW-10069) - [Java] Support running Java benchmarks from command line
-* [ARROW-10070](https://issues.apache.org/jira/browse/ARROW-10070) - [C++][Compute] Implement stdev aggregate kernel
-* [ARROW-10071](https://issues.apache.org/jira/browse/ARROW-10071) - [R] segfault with ArrowObject from previous session, or saved
-* [ARROW-10074](https://issues.apache.org/jira/browse/ARROW-10074) - [C++] Don't use string\_view.to\_string()
-* [ARROW-10075](https://issues.apache.org/jira/browse/ARROW-10075) - [C++] Don't use nonstd::nullopt; this breaks our vendoring abstraction.
-* [ARROW-10076](https://issues.apache.org/jira/browse/ARROW-10076) - [C++] Use TemporaryDir for all tests that don't already use it.
-* [ARROW-10077](https://issues.apache.org/jira/browse/ARROW-10077) - [C++] Potential overflow in bit\_stream\_utils.h multiplication.
-* [ARROW-10083](https://issues.apache.org/jira/browse/ARROW-10083) - [C++] Improve Parquet fuzz seed corpus
-* [ARROW-10084](https://issues.apache.org/jira/browse/ARROW-10084) - [Rust] [DataFusion] Add length of large string array
-* [ARROW-10086](https://issues.apache.org/jira/browse/ARROW-10086) - [Rust] Migrate min\_large\_string -\> min\_string kernels
-* [ARROW-10090](https://issues.apache.org/jira/browse/ARROW-10090) - [C++][Compute] Improve mode kernel
-* [ARROW-10092](https://issues.apache.org/jira/browse/ARROW-10092) - [Dev][Go] Add grpc generated go files to rat exclusion list
-* [ARROW-10093](https://issues.apache.org/jira/browse/ARROW-10093) - [R] Add ability to opt-out of int64 -\> int demotion
-* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes
-* [ARROW-10096](https://issues.apache.org/jira/browse/ARROW-10096) - [Rust] [DataFusion] Remove unused code
-* [ARROW-10099](https://issues.apache.org/jira/browse/ARROW-10099) - [C++][Dataset] Also allow integer partition fields to be dictionary encoded
-* [ARROW-10100](https://issues.apache.org/jira/browse/ARROW-10100) - [C++][Dataset] Ability to read/subset a ParquetFileFragment with given set of row group ids
-* [ARROW-10102](https://issues.apache.org/jira/browse/ARROW-10102) - [C++] Generalize BasicDecimal128::operator\*= for reuse in Decimal256
-* [ARROW-10103](https://issues.apache.org/jira/browse/ARROW-10103) - [Rust] Add a Contains kernel
-* [ARROW-10105](https://issues.apache.org/jira/browse/ARROW-10105) - [FlightRPC] Add client option to disable certificate validation with TLS
-* [ARROW-10120](https://issues.apache.org/jira/browse/ARROW-10120) - [C++][Parquet] Create reading benchmarks for 2-level nested data
-* [ARROW-10127](https://issues.apache.org/jira/browse/ARROW-10127) - [Format] Update specification to support 256-bit Decimal types
-* [ARROW-10129](https://issues.apache.org/jira/browse/ARROW-10129) - [Rust] Cargo build is rebuilding dependencies on arrow changes
-* [ARROW-10134](https://issues.apache.org/jira/browse/ARROW-10134) - [C++][Dataset] Add ParquetFileFragment::num\_row\_groups property
-* [ARROW-10139](https://issues.apache.org/jira/browse/ARROW-10139) - [C++] Add support for building arrow\_testing without building tests
-* [ARROW-10148](https://issues.apache.org/jira/browse/ARROW-10148) - [Rust] Add documentation to lib.rs
-* [ARROW-10151](https://issues.apache.org/jira/browse/ARROW-10151) - [Python] Add support for MapArray to\_pandas conversion
-* [ARROW-10155](https://issues.apache.org/jira/browse/ARROW-10155) - [Rust] [DataFusion] Add documentation to lib.rs
-* [ARROW-10156](https://issues.apache.org/jira/browse/ARROW-10156) - [Rust] Auto-label PRs
-* [ARROW-10157](https://issues.apache.org/jira/browse/ARROW-10157) - [Rust] Add more documentation about take
-* [ARROW-10160](https://issues.apache.org/jira/browse/ARROW-10160) - [Rust] Improve documentation of DictionaryType
-* [ARROW-10161](https://issues.apache.org/jira/browse/ARROW-10161) - [Rust] [DataFusion] Simplify expression tests
-* [ARROW-10162](https://issues.apache.org/jira/browse/ARROW-10162) - [Rust] Support display of DictionaryArrays in pretty printing
-* [ARROW-10164](https://issues.apache.org/jira/browse/ARROW-10164) - [Rust] Add support for DictionaryArray types to cast kernels
-* [ARROW-10167](https://issues.apache.org/jira/browse/ARROW-10167) - [Rust] Support display of DictionaryArrays in sql.rs
-* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields
-* [ARROW-10171](https://issues.apache.org/jira/browse/ARROW-10171) - [Rust] [DataFusion] Add \`ExecutionContext::from<ExecutionContextState\>\`
-* [ARROW-10190](https://issues.apache.org/jira/browse/ARROW-10190) - [Website] Add Jorge to list of committers
-* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip tests for single column batches
-* [ARROW-10196](https://issues.apache.org/jira/browse/ARROW-10196) - [C++] Add Future::DeferNotOk()
-* [ARROW-10199](https://issues.apache.org/jira/browse/ARROW-10199) - [Rust][Parquet] Release Parquet at crates.io to remove debug prints
-* [ARROW-10201](https://issues.apache.org/jira/browse/ARROW-10201) - [C++][CI] Disable S3 in arm64 job on Travis CI
-* [ARROW-10202](https://issues.apache.org/jira/browse/ARROW-10202) - [CI][Windows] Use sf.net mirror for MSYS2
-* [ARROW-10205](https://issues.apache.org/jira/browse/ARROW-10205) - [Java][FlightRPC] Add client option to disable server verification
-* [ARROW-10206](https://issues.apache.org/jira/browse/ARROW-10206) - [Python][C++][FlightRPC] Add client option to disable server validation
-* [ARROW-10215](https://issues.apache.org/jira/browse/ARROW-10215) - [Rust] [DataFusion] Rename "Source" typedef
-* [ARROW-10217](https://issues.apache.org/jira/browse/ARROW-10217) - [CI] Run fewer GitHub Actions jobs
-* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests
-* [ARROW-10227](https://issues.apache.org/jira/browse/ARROW-10227) - [Ruby] Use a table size as the default for parquet chunk\_size
-* [ARROW-10229](https://issues.apache.org/jira/browse/ARROW-10229) - [C++][Parquet] Remove left over ARROW\_LOG statement.
-* [ARROW-10231](https://issues.apache.org/jira/browse/ARROW-10231) - [CI] Unable to download minio in arm32v7 docker image
-* [ARROW-10233](https://issues.apache.org/jira/browse/ARROW-10233) - [Rust] Make array\_value\_to\_string available in all Arrow builds
-* [ARROW-10235](https://issues.apache.org/jira/browse/ARROW-10235) - [Rust][DataFusion] Improve documentation for type coercion
-* [ARROW-10240](https://issues.apache.org/jira/browse/ARROW-10240) - [Rust] [Datafusion] Optionally load tpch data into memory before running benchmark query
-* [ARROW-10251](https://issues.apache.org/jira/browse/ARROW-10251) - [Rust] [DataFusion] MemTable::load() should load partitions in parallel
-* [ARROW-10252](https://issues.apache.org/jira/browse/ARROW-10252) - [Python] Add option to skip inclusion of Arrow headers in Python installation
-* [ARROW-10256](https://issues.apache.org/jira/browse/ARROW-10256) - [C++][Flight] Disable -Werror carefully
-* [ARROW-10257](https://issues.apache.org/jira/browse/ARROW-10257) - [R] Prepare news/docs for 2.0 release
-* [ARROW-10260](https://issues.apache.org/jira/browse/ARROW-10260) - [Python] Missing MapType to Pandas dtype
-* [ARROW-10265](https://issues.apache.org/jira/browse/ARROW-10265) - [CI] Use smaller build when cache doesn't exist on Travis CI
-* [ARROW-10266](https://issues.apache.org/jira/browse/ARROW-10266) - [CI][macOS] Ensure using Python 3.8 with Homebrew
-* [ARROW-10267](https://issues.apache.org/jira/browse/ARROW-10267) - [Python] Skip flight test if disable\_server\_verification feature is not available
-* [ARROW-10272](https://issues.apache.org/jira/browse/ARROW-10272) - [Packaging][Python] Pin newer multibuild version to avoid updating homebrew
-* [ARROW-10273](https://issues.apache.org/jira/browse/ARROW-10273) - [CI][Homebrew] Fix "brew audit" usage
-* [ARROW-10287](https://issues.apache.org/jira/browse/ARROW-10287) - [C++] Avoid std::random\_device whenever possible
-* [PARQUET-1845](https://issues.apache.org/jira/browse/PARQUET-1845) - [C++] Int96 memory images in test cases assume only little-endian
-* [PARQUET-1878](https://issues.apache.org/jira/browse/PARQUET-1878) - [C++] lz4 codec is not compatible with Hadoop Lz4Codec
-* [PARQUET-1904](https://issues.apache.org/jira/browse/PARQUET-1904) - [C++] Export file\_offset in RowGroupMetaData
-
-
-
-# Apache Arrow 1.0.0 (2020-07-20)
-
-## Bug Fixes
-
-* [ARROW-1692](https://issues.apache.org/jira/browse/ARROW-1692) - [Python, Java] UnionArray round trip not working
-* [ARROW-3329](https://issues.apache.org/jira/browse/ARROW-3329) - [Python] Error casting decimal(38, 4) to int64
-* [ARROW-3861](https://issues.apache.org/jira/browse/ARROW-3861) - [Python] ParquetDataset().read columns argument always returns partition column
-* [ARROW-4018](https://issues.apache.org/jira/browse/ARROW-4018) - [C++] RLE decoder may not be big-endian compatible
-* [ARROW-4309](https://issues.apache.org/jira/browse/ARROW-4309) - [Documentation] Add a docker-compose entry which builds the documentation with CUDA enabled
-* [ARROW-4600](https://issues.apache.org/jira/browse/ARROW-4600) - [Ruby] Arrow::DictionaryArray\#[] should return the item in the indices array
-* [ARROW-5158](https://issues.apache.org/jira/browse/ARROW-5158) - [Packaging][Wheel] Symlink libraries in wheels
-* [ARROW-5310](https://issues.apache.org/jira/browse/ARROW-5310) - [Python] better error message on creating ParquetDataset from empty directory
-* [ARROW-5359](https://issues.apache.org/jira/browse/ARROW-5359) - [Python] timestamp\_as\_object support for pa.Table.to\_pandas in pyarrow
-* [ARROW-5572](https://issues.apache.org/jira/browse/ARROW-5572) - [Python] raise error message when passing invalid filter in parquet reading
-* [ARROW-5666](https://issues.apache.org/jira/browse/ARROW-5666) - [Python] Underscores in partition (string) values are dropped when reading dataset
-* [ARROW-5744](https://issues.apache.org/jira/browse/ARROW-5744) - [C++] Do not error in Table::CombineChunks for BinaryArray types that overflow 2GB limit
-* [ARROW-5875](https://issues.apache.org/jira/browse/ARROW-5875) - [FlightRPC] Test RPC features in integration tests
-* [ARROW-6235](https://issues.apache.org/jira/browse/ARROW-6235) - [R] Conversion from arrow::BinaryArray to R character vector not implemented
-* [ARROW-6523](https://issues.apache.org/jira/browse/ARROW-6523) - [C++][Dataset] arrow\_dataset target does not depend on anything
-* [ARROW-6848](https://issues.apache.org/jira/browse/ARROW-6848) - [C++] Specify -std=c++11 instead of -std=gnu++11 when building
-* [ARROW-7018](https://issues.apache.org/jira/browse/ARROW-7018) - [R] Non-UTF-8 data in Arrow <--\> R conversion
-* [ARROW-7028](https://issues.apache.org/jira/browse/ARROW-7028) - [R] Date roundtrip results in different R storage mode
-* [ARROW-7084](https://issues.apache.org/jira/browse/ARROW-7084) - [C++] ArrayRangeEquals should check for full type equality?
-* [ARROW-7173](https://issues.apache.org/jira/browse/ARROW-7173) - [Integration] Add test to verify Map field names can be arbitrary
-* [ARROW-7208](https://issues.apache.org/jira/browse/ARROW-7208) - [Python] Passing directory to ParquetFile class gives confusing error message
-* [ARROW-7273](https://issues.apache.org/jira/browse/ARROW-7273) - [Python] Non-nullable null field is allowed / crashes when writing to parquet
-* [ARROW-7480](https://issues.apache.org/jira/browse/ARROW-7480) - [Rust] [DataFusion] Query fails/incorrect when aggregated + grouped columns don't match the selected columns
-* [ARROW-7610](https://issues.apache.org/jira/browse/ARROW-7610) - [Java] Finish support for 64 bit int allocations
-* [ARROW-7654](https://issues.apache.org/jira/browse/ARROW-7654) - [Python] Ability to set column\_types to a Schema in csv.ConvertOptions is undocumented
-* [ARROW-7681](https://issues.apache.org/jira/browse/ARROW-7681) - [Rust] Explicitly seeking a BufReader will discard the internal buffer
-* [ARROW-7702](https://issues.apache.org/jira/browse/ARROW-7702) - [C++][Dataset] Provide (optional) deterministic order of batches
-* [ARROW-7782](https://issues.apache.org/jira/browse/ARROW-7782) - [Python] Losing index information when using write\_to\_dataset with partition\_cols
-* [ARROW-7840](https://issues.apache.org/jira/browse/ARROW-7840) - [Java] [Integration] Java executables fail
-* [ARROW-7925](https://issues.apache.org/jira/browse/ARROW-7925) - [C++][Documentation] Instructions about running IWYU and other tasks in cpp/development.rst have gone stale
-* [ARROW-7939](https://issues.apache.org/jira/browse/ARROW-7939) - [Python] crashes when reading parquet file compressed with snappy
-* [ARROW-7967](https://issues.apache.org/jira/browse/ARROW-7967) - [CI][Crossbow] Pin macOS version in autobrew job to match CRAN
-* [ARROW-8050](https://issues.apache.org/jira/browse/ARROW-8050) - [Python][Packaging] Do not include generated Cython source files in wheel packages
-* [ARROW-8078](https://issues.apache.org/jira/browse/ARROW-8078) - [Python] Missing links in the docs regarding field and schema DataTypes
-* [ARROW-8115](https://issues.apache.org/jira/browse/ARROW-8115) - [Python] Conversion when mixing NaT and datetime objects not working
-* [ARROW-8251](https://issues.apache.org/jira/browse/ARROW-8251) - [Python] pandas.ExtensionDtype does not survive round trip with write\_to\_dataset
-* [ARROW-8344](https://issues.apache.org/jira/browse/ARROW-8344) - [C\#] StringArray.Builder.Clear() corrupts subsequently-built array contents
-* [ARROW-8360](https://issues.apache.org/jira/browse/ARROW-8360) - [C++][Gandiva] Fixes date32 support for date/time functions
-* [ARROW-8374](https://issues.apache.org/jira/browse/ARROW-8374) - [R] Table to vector of DictionaryType will error when Arrays don't have the same Dictionary per array
-* [ARROW-8392](https://issues.apache.org/jira/browse/ARROW-8392) - [Java] Fix overflow related corner cases for vector value comparison
-* [ARROW-8448](https://issues.apache.org/jira/browse/ARROW-8448) - [Package] Can't build apt packages with ubuntu-focal
-* [ARROW-8455](https://issues.apache.org/jira/browse/ARROW-8455) - [Rust] [Parquet] Arrow column read on partially compatible files
-* [ARROW-8471](https://issues.apache.org/jira/browse/ARROW-8471) - [C++][Integration] Regression to /u?int64/ as JSON::number
-* [ARROW-8472](https://issues.apache.org/jira/browse/ARROW-8472) - [Go][Integration] Represent 64 bit integers as JSON::string
-* [ARROW-8473](https://issues.apache.org/jira/browse/ARROW-8473) - [Rust] "Statistics support" in rust/parquet readme is incorrect
-* [ARROW-8480](https://issues.apache.org/jira/browse/ARROW-8480) - [Rust] There is no check for allocation failure
-* [ARROW-8503](https://issues.apache.org/jira/browse/ARROW-8503) - [Packaging][deb] Can't build apache-arrow-archive-keyring for RC
-* [ARROW-8505](https://issues.apache.org/jira/browse/ARROW-8505) - [Release][C\#] "sourcelink test" fails on Apache.Arrow.AssemblyInfo.cs
-* [ARROW-8508](https://issues.apache.org/jira/browse/ARROW-8508) - [Rust] ListBuilder of FixedSizeListBuilder creates wrong offsets
-* [ARROW-8510](https://issues.apache.org/jira/browse/ARROW-8510) - [C++] arrow/dataset/file\_base.cc fails to compile with internal compiler error with "Visual Studio 15 2017 Win64" generator
-* [ARROW-8511](https://issues.apache.org/jira/browse/ARROW-8511) - [Developer][Release] Windows release verification script does not halt if C++ compilation fails
-* [ARROW-8514](https://issues.apache.org/jira/browse/ARROW-8514) - [Developer] Windows wheel verification script does not check Python 3.5
-* [ARROW-8529](https://issues.apache.org/jira/browse/ARROW-8529) - [C++] Fix usage of NextCounts() in GetBatchWithDict[Spaced]
-* [ARROW-8535](https://issues.apache.org/jira/browse/ARROW-8535) - [Rust] Arrow crate does not specify arrow-flight version
-* [ARROW-8536](https://issues.apache.org/jira/browse/ARROW-8536) - [Rust] Failed to locate format/Flight.proto in any parent directory
-* [ARROW-8537](https://issues.apache.org/jira/browse/ARROW-8537) - [C++] Performance regression from ARROW-8523
-* [ARROW-8539](https://issues.apache.org/jira/browse/ARROW-8539) - [CI] "AMD64 MacOS 10.15 GLib & Ruby" fails
-* [ARROW-8554](https://issues.apache.org/jira/browse/ARROW-8554) - [C++][Benchmark] Fix building error "cannot bind lvalue"
-* [ARROW-8556](https://issues.apache.org/jira/browse/ARROW-8556) - [R] zstd symbol not found if there are multiple installations of zstd
-* [ARROW-8566](https://issues.apache.org/jira/browse/ARROW-8566) - [R] error when writing POSIXct to spark
-* [ARROW-8568](https://issues.apache.org/jira/browse/ARROW-8568) - [C++][Python] Crash on decimal cast in debug mode
-* [ARROW-8577](https://issues.apache.org/jira/browse/ARROW-8577) - [Plasma] PlasmaClient::Connect() of CUDA-enabled build always fails on machines without a CUDA device
-* [ARROW-8583](https://issues.apache.org/jira/browse/ARROW-8583) - [C++][Doc] Undocumented parameter in Dataset namespace
-* [ARROW-8584](https://issues.apache.org/jira/browse/ARROW-8584) - [Packaging][C++] Protobuf link error in deb builds
-* [ARROW-8585](https://issues.apache.org/jira/browse/ARROW-8585) - [Packaging][Python] Windows wheels fail to build because of link error
-* [ARROW-8586](https://issues.apache.org/jira/browse/ARROW-8586) - [R] installation failure on CentOS 7
-* [ARROW-8587](https://issues.apache.org/jira/browse/ARROW-8587) - [C++] Compilation error when linking arrow-flight-perf-server
-* [ARROW-8592](https://issues.apache.org/jira/browse/ARROW-8592) - [C++] Docs still list LLVM 7 as compiler used
-* [ARROW-8593](https://issues.apache.org/jira/browse/ARROW-8593) - [C++] Parquet file\_serialize\_test.cc fails to build with musl libc
-* [ARROW-8598](https://issues.apache.org/jira/browse/ARROW-8598) - [Rust] simd\_compare\_op creates buffer of incorrect length when item count is not a multiple of T::lanes()
-* [ARROW-8602](https://issues.apache.org/jira/browse/ARROW-8602) - [CMake] Fix ws2\_32 link issue when cross-compiling on Linux
-* [ARROW-8603](https://issues.apache.org/jira/browse/ARROW-8603) - [Documentation] Fix Sphinx doxygen comment
-* [ARROW-8604](https://issues.apache.org/jira/browse/ARROW-8604) - [R][CI] Update CI to use R 4.0
-* [ARROW-8608](https://issues.apache.org/jira/browse/ARROW-8608) - [C++] Update vendored mpark/variant.h to latest to fix NVCC compilation issues
-* [ARROW-8609](https://issues.apache.org/jira/browse/ARROW-8609) - [C++] ORC JNI bridge crashed on null arrow buffer
-* [ARROW-8610](https://issues.apache.org/jira/browse/ARROW-8610) - [Rust] DivideByZero when running arrow crate when simd feature is disabled
-* [ARROW-8613](https://issues.apache.org/jira/browse/ARROW-8613) - [C++][Dataset] Raise error for unparsable partition value
-* [ARROW-8615](https://issues.apache.org/jira/browse/ARROW-8615) - [R] Improve errors and insist on RandomAccessFile in read\_feather
-* [ARROW-8617](https://issues.apache.org/jira/browse/ARROW-8617) - [Rust] simd\_load\_set\_invalid does not exist on aarch64
-* [ARROW-8632](https://issues.apache.org/jira/browse/ARROW-8632) - [C++] Fix conversion error warning in array\_union\_test.cc
-* [ARROW-8641](https://issues.apache.org/jira/browse/ARROW-8641) - [Python] Regression in feather: no longer supports permutation in column selection
-* [ARROW-8643](https://issues.apache.org/jira/browse/ARROW-8643) - [Python] Tests with pandas master failing due to freq assertion
-* [ARROW-8644](https://issues.apache.org/jira/browse/ARROW-8644) - [Python] Dask integration tests failing due to change in not including partition columns
-* [ARROW-8646](https://issues.apache.org/jira/browse/ARROW-8646) - [Java] Allow UnionListWriter to write null values
-* [ARROW-8649](https://issues.apache.org/jira/browse/ARROW-8649) - [Java] [Website] Java documentation on website is hidden
-* [ARROW-8657](https://issues.apache.org/jira/browse/ARROW-8657) - [Python][C++][Parquet] Forward compatibility issue from 0.16 to 0.17 when using version='2.0'
-* [ARROW-8663](https://issues.apache.org/jira/browse/ARROW-8663) - [Documentation] Small correction to building.rst
-* [ARROW-8680](https://issues.apache.org/jira/browse/ARROW-8680) - [Rust] ComplexObjectArrayReader incorrect null value shuffling
-* [ARROW-8684](https://issues.apache.org/jira/browse/ARROW-8684) - [Python] "SystemError: Bad call flags in \_PyMethodDef\_RawFastCallDict" in Python 3.7.7 on macOS when using pyarrow wheel
-* [ARROW-8689](https://issues.apache.org/jira/browse/ARROW-8689) - [C++] S3 benchmarks fail linking
-* [ARROW-8693](https://issues.apache.org/jira/browse/ARROW-8693) - [Python] Dataset.get\_fragments is missing an implicit cast when filtering
-* [ARROW-8694](https://issues.apache.org/jira/browse/ARROW-8694) - [Python][Parquet] parquet.read\_schema() fails when loading wide table created from Pandas DataFrame
-* [ARROW-8701](https://issues.apache.org/jira/browse/ARROW-8701) - [Rust] Unresolved import \`crate::compute::util::simd\_load\_set\_invalid\` on Raspberry Pi
-* [ARROW-8704](https://issues.apache.org/jira/browse/ARROW-8704) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz)
-* [ARROW-8705](https://issues.apache.org/jira/browse/ARROW-8705) - [Java] ComplexCopier is skipping null values
-* [ARROW-8706](https://issues.apache.org/jira/browse/ARROW-8706) - [C++][Parquet] Tracking JIRA for PARQUET-1857 (unencrypted INT16\_MAX Parquet row group limit)
-* [ARROW-8710](https://issues.apache.org/jira/browse/ARROW-8710) - [Rust] Continuation marker not written correctly in IPC writer, and stream not flushed
-* [ARROW-8722](https://issues.apache.org/jira/browse/ARROW-8722) - [Dev] "archery docker run -e" doesn't work
-* [ARROW-8726](https://issues.apache.org/jira/browse/ARROW-8726) - [C++][Dataset] Mis-specified DirectoryPartitioning incorrectly uses the file name as value
-* [ARROW-8728](https://issues.apache.org/jira/browse/ARROW-8728) - [C++] Bitmap operation may cause buffer overflow
-* [ARROW-8729](https://issues.apache.org/jira/browse/ARROW-8729) - [C++][Dataset] Only selecting a partition column results in empty table
-* [ARROW-8734](https://issues.apache.org/jira/browse/ARROW-8734) - [R] improve nightly build installation
-* [ARROW-8741](https://issues.apache.org/jira/browse/ARROW-8741) - [Python][Packaging] Keep VS2015 for the Windows wheels
-* [ARROW-8750](https://issues.apache.org/jira/browse/ARROW-8750) - [Python] pyarrow.feather.write\_feather does not default to lz4 compression if it's available
-* [ARROW-8768](https://issues.apache.org/jira/browse/ARROW-8768) - [R][CI] Fix nightly as-cran spurious failure
-* [ARROW-8775](https://issues.apache.org/jira/browse/ARROW-8775) - [C++][FlightRPC] Integration client doesn't run integration tests
-* [ARROW-8776](https://issues.apache.org/jira/browse/ARROW-8776) - [FlightRPC][C++] Flight/C++ middleware don't receive headers on failed calls to Java servers
-* [ARROW-8798](https://issues.apache.org/jira/browse/ARROW-8798) - [C++] Fix Parquet crashes on invalid input (OSS-Fuzz)
-* [ARROW-8799](https://issues.apache.org/jira/browse/ARROW-8799) - [C++][Dataset] Reading list column as nested dictionary segfaults
-* [ARROW-8801](https://issues.apache.org/jira/browse/ARROW-8801) - [Python] Memory leak on read from parquet file with UTC timestamps using pandas
-* [ARROW-8802](https://issues.apache.org/jira/browse/ARROW-8802) - [C++][Dataset] Schema metadata are lost when reading a subset of columns
-* [ARROW-8803](https://issues.apache.org/jira/browse/ARROW-8803) - [Java] Row count should be set before loading buffers in VectorLoader
-* [ARROW-8808](https://issues.apache.org/jira/browse/ARROW-8808) - [Rust] Divide by zero in arrays/builder.rs
-* [ARROW-8809](https://issues.apache.org/jira/browse/ARROW-8809) - [Rust] schema mismatch in integration test
-* [ARROW-8811](https://issues.apache.org/jira/browse/ARROW-8811) - [Java] Fix build on master
-* [ARROW-8820](https://issues.apache.org/jira/browse/ARROW-8820) - [C++][Gandiva] fix date\_trunc functions to return date types
-* [ARROW-8821](https://issues.apache.org/jira/browse/ARROW-8821) - [Rust] nested binary expression with Like, NotLike and Not operator results in type cast error
-* [ARROW-8825](https://issues.apache.org/jira/browse/ARROW-8825) - [C++] Compilation fails with -Wunused-parameter flag
-* [ARROW-8826](https://issues.apache.org/jira/browse/ARROW-8826) - [Crossbow] remote URL should always have .git
-* [ARROW-8832](https://issues.apache.org/jira/browse/ARROW-8832) - [Python] AttributeError: module 'pyarrow.fs' has no attribute 'S3FileSystem'
-* [ARROW-8848](https://issues.apache.org/jira/browse/ARROW-8848) - [CI][C/Glib] MinGW build error
-* [ARROW-8858](https://issues.apache.org/jira/browse/ARROW-8858) - [FlightRPC] Ensure headers are uniformly exposed
-* [ARROW-8860](https://issues.apache.org/jira/browse/ARROW-8860) - [C++] IPC/Feather decompression broken for nested arrays
-* [ARROW-8862](https://issues.apache.org/jira/browse/ARROW-8862) - [C++] NumericBuilder does not use MemoryPool passed to CTOR
-* [ARROW-8863](https://issues.apache.org/jira/browse/ARROW-8863) - [C++] Array subclass constructors must set ArrayData::null\_count to 0 when there is no validity bitmap
-* [ARROW-8869](https://issues.apache.org/jira/browse/ARROW-8869) - [Rust] [DataFusion] Type Coercion optimizer rule does not support new scan nodes
-* [ARROW-8871](https://issues.apache.org/jira/browse/ARROW-8871) - [C++] Gandiva build failure
-* [ARROW-8872](https://issues.apache.org/jira/browse/ARROW-8872) - [CI] Travis-CI jobs fail (can't open file 'ci/detect-changes.py')
-* [ARROW-8874](https://issues.apache.org/jira/browse/ARROW-8874) - [C++][Dataset] Scanner::ToTable race when ScanTask exit early with an error
-* [ARROW-8878](https://issues.apache.org/jira/browse/ARROW-8878) - [R] try\_download is confused when download.file.method isn't default
-* [ARROW-8882](https://issues.apache.org/jira/browse/ARROW-8882) - [C\#] Add .editorconfig to C\# code
-* [ARROW-8888](https://issues.apache.org/jira/browse/ARROW-8888) - [Python] Heuristic in dataframe\_to\_arrays that decides to multithread conversion causes slow conversions
-* [ARROW-8889](https://issues.apache.org/jira/browse/ARROW-8889) - [Python] Python 3.7 SIGSEGV when comparing RecordBatch to None
-* [ARROW-8892](https://issues.apache.org/jira/browse/ARROW-8892) - [C++][CI] CI builds for MSVC do not build benchmarks
-* [ARROW-8909](https://issues.apache.org/jira/browse/ARROW-8909) - [Java] Out of order writes using setSafe
-* [ARROW-8911](https://issues.apache.org/jira/browse/ARROW-8911) - [C++] Slicing a ChunkedArray with zero chunks segfaults
-* [ARROW-8924](https://issues.apache.org/jira/browse/ARROW-8924) - [C++][Gandiva] castDATE\_date32() may cause overflow
-* [ARROW-8925](https://issues.apache.org/jira/browse/ARROW-8925) - [Rust] [DataFusion] CsvExec::schema() returns incorrect results
-* [ARROW-8930](https://issues.apache.org/jira/browse/ARROW-8930) - [C++] libz.so linking error with liborc.a
-* [ARROW-8932](https://issues.apache.org/jira/browse/ARROW-8932) - [C++] symbol resolution failures with liborc.a
-* [ARROW-8946](https://issues.apache.org/jira/browse/ARROW-8946) - [Python] Add tests for parquet.write\_metadata metadata\_collector
-* [ARROW-8948](https://issues.apache.org/jira/browse/ARROW-8948) - [Java][Integration] enable duplicate field names integration tests
-* [ARROW-8951](https://issues.apache.org/jira/browse/ARROW-8951) - [C++] Fix compiler warning in compute/kernels/scalar\_cast\_temporal.cc
-* [ARROW-8954](https://issues.apache.org/jira/browse/ARROW-8954) - [Website] ca-certificates should be listed in installation instructions
-* [ARROW-8957](https://issues.apache.org/jira/browse/ARROW-8957) - [FlightRPC][C++] Fail to build due to IpcOptions
-* [ARROW-8959](https://issues.apache.org/jira/browse/ARROW-8959) - [Rust] Broken build due to new benchmark crate using old API
-* [ARROW-8962](https://issues.apache.org/jira/browse/ARROW-8962) - [C++] Linking failure with clang-4.0
-* [ARROW-8968](https://issues.apache.org/jira/browse/ARROW-8968) - [C++][Gandiva] Show link warning message on s390x
-* [ARROW-8975](https://issues.apache.org/jira/browse/ARROW-8975) - [FlightRPC][C++] Fix flaky MacOS tests
-* [ARROW-8977](https://issues.apache.org/jira/browse/ARROW-8977) - [R] Table$create with schema crashes with some dictionary index types
-* [ARROW-8978](https://issues.apache.org/jira/browse/ARROW-8978) - [C++][Compute] "Conditional jump or move depends on uninitialised value(s)" Valgrind warning
-* [ARROW-8980](https://issues.apache.org/jira/browse/ARROW-8980) - [Python] Metadata grows exponentially when using schema from disk
-* [ARROW-8982](https://issues.apache.org/jira/browse/ARROW-8982) - [CI] Remove allow\_failures for s390x in TravisCI
-* [ARROW-8986](https://issues.apache.org/jira/browse/ARROW-8986) - [Archery][ursabot] Fix benchmark diff checkout of origin/master
-* [ARROW-9000](https://issues.apache.org/jira/browse/ARROW-9000) - [Java] build crashes with JDK14
-* [ARROW-9009](https://issues.apache.org/jira/browse/ARROW-9009) - [C++][Dataset] ARROW:schema should be removed from schema's metadata when reading Parquet files
-* [ARROW-9013](https://issues.apache.org/jira/browse/ARROW-9013) - [C++] Validate enum-style CMake options
-* [ARROW-9020](https://issues.apache.org/jira/browse/ARROW-9020) - [Python] read\_json won't respect explicit\_schema in parse\_options
-* [ARROW-9024](https://issues.apache.org/jira/browse/ARROW-9024) - [C++/Python] Install anaconda-client in conda-clean job
-* [ARROW-9026](https://issues.apache.org/jira/browse/ARROW-9026) - [C++/Python] Force package removal from arrow-nightlies conda repository
-* [ARROW-9037](https://issues.apache.org/jira/browse/ARROW-9037) - [C++][C] unable to import array with null count == -1 (which could be exported)
-* [ARROW-9057](https://issues.apache.org/jira/browse/ARROW-9057) - [Rust] Projection should work on InMemoryScan without error
-* [ARROW-9059](https://issues.apache.org/jira/browse/ARROW-9059) - [Rust] Documentation for slicing array data has the wrong sign
-* [ARROW-9066](https://issues.apache.org/jira/browse/ARROW-9066) - [Python] Raise correct error in isnull()
-* [ARROW-9071](https://issues.apache.org/jira/browse/ARROW-9071) - [C++] MakeArrayOfNull makes invalid ListArray
-* [ARROW-9077](https://issues.apache.org/jira/browse/ARROW-9077) - [C++] Fix aggregate/scalar-compare benchmark null\_percent calculation
-* [ARROW-9080](https://issues.apache.org/jira/browse/ARROW-9080) - [C++] arrow::AllocateBuffer returns a Result<unique\_ptr<Buffer\>\>
-* [ARROW-9082](https://issues.apache.org/jira/browse/ARROW-9082) - [Rust] Stream reader fails when stream is not ended with (optional) 0xFFFFFFFF 0x00000000
-* [ARROW-9084](https://issues.apache.org/jira/browse/ARROW-9084) - [C++] CMake is unable to find zstd target when ZSTD\_SOURCE=SYSTEM
-* [ARROW-9085](https://issues.apache.org/jira/browse/ARROW-9085) - [C++][CI] Appveyor CI test failures
-* [ARROW-9087](https://issues.apache.org/jira/browse/ARROW-9087) - [C++] Missing HDFS options parsing
-* [ARROW-9098](https://issues.apache.org/jira/browse/ARROW-9098) - RecordBatch::ToStructArray cannot handle record batches with 0 columns
-* [ARROW-9105](https://issues.apache.org/jira/browse/ARROW-9105) - [C++] ParquetFileFragment scanning doesn't handle filter on partition field
-* [ARROW-9120](https://issues.apache.org/jira/browse/ARROW-9120) - [C++] Lint and Format C++ files with "codegen" in file name
-* [ARROW-9121](https://issues.apache.org/jira/browse/ARROW-9121) - [C++] Do not wipe the filesystem when path is empty
-* [ARROW-9122](https://issues.apache.org/jira/browse/ARROW-9122) - [C++] Adapt ascii\_lower/ascii\_upper bulk transforms to work on sliced arrays
-* [ARROW-9126](https://issues.apache.org/jira/browse/ARROW-9126) - [C++] Trimmed Boost bundle fails to build on Windows
-* [ARROW-9127](https://issues.apache.org/jira/browse/ARROW-9127) - [Rust] Update thrift library dependencies
-* [ARROW-9134](https://issues.apache.org/jira/browse/ARROW-9134) - [Python] Parquet partitioning degrades Int32 to float64
-* [ARROW-9141](https://issues.apache.org/jira/browse/ARROW-9141) - [R] Update cross-package documentation links
-* [ARROW-9142](https://issues.apache.org/jira/browse/ARROW-9142) - [C++] random::RandomArrayGenerator::Boolean "probability" misdocumented / incorrect
-* [ARROW-9143](https://issues.apache.org/jira/browse/ARROW-9143) - [C++] RecordBatch::Slice erroneously sets non-nullable field's internal null\_count to unknown
-* [ARROW-9146](https://issues.apache.org/jira/browse/ARROW-9146) - [C++][Dataset] Scanning a Fragment with a filter + mismatching schema shouldn't abort
-* [ARROW-9151](https://issues.apache.org/jira/browse/ARROW-9151) - [R][CI] Fix Rtools 4.0 build: pacman sync
-* [ARROW-9160](https://issues.apache.org/jira/browse/ARROW-9160) - [C++] Implement string/binary contains for exact matches
-* [ARROW-9174](https://issues.apache.org/jira/browse/ARROW-9174) - [Go] Tests panic with 386 (x86) builds
-* [ARROW-9183](https://issues.apache.org/jira/browse/ARROW-9183) - [C++] Failed to build arrow-cpp with gcc 4.9.2
-* [ARROW-9184](https://issues.apache.org/jira/browse/ARROW-9184) - [Rust][Datafusion] table scan without projection should return all columns
-* [ARROW-9194](https://issues.apache.org/jira/browse/ARROW-9194) - [C++] Array::GetScalar not implemented for decimal type
-* [ARROW-9195](https://issues.apache.org/jira/browse/ARROW-9195) - [Java] Wrong usage of Unsafe.get from bytearray in ByteFunctionsHelper class
-* [ARROW-9209](https://issues.apache.org/jira/browse/ARROW-9209) - [C++] Benchmarks fail to build with ARROW\_IPC=OFF and ARROW\_BUILD\_TESTS=OFF
-* [ARROW-9219](https://issues.apache.org/jira/browse/ARROW-9219) - [R] coerce\_timestamps in Parquet write options does not work
-* [ARROW-9221](https://issues.apache.org/jira/browse/ARROW-9221) - ArrowBuf\#setBytes(int, ByteBuffer) doesn't check the byte buffer's endianness
-* [ARROW-9223](https://issues.apache.org/jira/browse/ARROW-9223) - [Python] Fix to\_pandas() export for timestamps within structs
-* [ARROW-9230](https://issues.apache.org/jira/browse/ARROW-9230) - [FlightRPC][Python] flight.connect() doesn't pass through all arguments
-* [ARROW-9233](https://issues.apache.org/jira/browse/ARROW-9233) - [C++] is\_null on NullArray should be true for all values
-* [ARROW-9236](https://issues.apache.org/jira/browse/ARROW-9236) - [Rust] CSV WriterBuilder never writes header
-* [ARROW-9237](https://issues.apache.org/jira/browse/ARROW-9237) - [R] 0.17 install on Arch Linux
-* [ARROW-9238](https://issues.apache.org/jira/browse/ARROW-9238) - [C++][CI] Add test coverage for round-robin in IPC and Flight
-* [ARROW-9252](https://issues.apache.org/jira/browse/ARROW-9252) - [Integration] GitHub Actions integration test job does not test against "gold" 0.14.1 files in apache/arrow-testing
-* [ARROW-9260](https://issues.apache.org/jira/browse/ARROW-9260) - [CI] "ARM64v8 Ubuntu 20.04 C++" fails
-* [ARROW-9261](https://issues.apache.org/jira/browse/ARROW-9261) - [Python][Packaging] S3FileSystem curl errors in manylinux wheels
-* [ARROW-9274](https://issues.apache.org/jira/browse/ARROW-9274) - [Rust] [Integration Testing] Read i64 from json files as strings
-* [ARROW-9282](https://issues.apache.org/jira/browse/ARROW-9282) - [R] Remove usage of \_EXTPTR\_PTR
-* [ARROW-9284](https://issues.apache.org/jira/browse/ARROW-9284) - [Java] getMinorTypeForArrowType returns sparse minor type for dense union types
-* [ARROW-9288](https://issues.apache.org/jira/browse/ARROW-9288) - [C++][Dataset] Discovery of partition field as dictionary type segfaulting with HivePartitioning
-* [ARROW-9297](https://issues.apache.org/jira/browse/ARROW-9297) - [C++][Dataset] Dataset scanner cannot handle large binary column (\> 2 GB)
-* [ARROW-9298](https://issues.apache.org/jira/browse/ARROW-9298) - [C++] Fix crashes on invalid input (OSS-Fuzz)
-* [ARROW-9303](https://issues.apache.org/jira/browse/ARROW-9303) - [R] Linux static build should always bundle dependencies
-* [ARROW-9305](https://issues.apache.org/jira/browse/ARROW-9305) - [Python] Dependency load failure in Windows wheel build
-* [ARROW-9315](https://issues.apache.org/jira/browse/ARROW-9315) - [Java] Fix the failure of testAllocationManagerType
-* [ARROW-9317](https://issues.apache.org/jira/browse/ARROW-9317) - [Java] Add a few test cases for arrow-memory
-* [ARROW-9326](https://issues.apache.org/jira/browse/ARROW-9326) - [Python] Setuptools 49.1.0 appears to break our Python 3.6 builds
-* [ARROW-9330](https://issues.apache.org/jira/browse/ARROW-9330) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz)
-* [ARROW-9334](https://issues.apache.org/jira/browse/ARROW-9334) - [Dev][Archery] Push ancestor docker images
-* [ARROW-9336](https://issues.apache.org/jira/browse/ARROW-9336) - [Ruby] Creating RecordBatch with structs missing keys results in a malformed table
-* [ARROW-9343](https://issues.apache.org/jira/browse/ARROW-9343) - [C++][Gandiva] CastINT/Float functions from string should handle leading/trailing white spaces
-* [ARROW-9347](https://issues.apache.org/jira/browse/ARROW-9347) - [Python] Tests fail with latest fsspec
-* [ARROW-9350](https://issues.apache.org/jira/browse/ARROW-9350) - [C++][CI] Nightly valgrind job failures
-* [ARROW-9351](https://issues.apache.org/jira/browse/ARROW-9351) - [C++][CI] Nightly test-ubuntu-18.04-cpp-cmake32 fails
-* [ARROW-9353](https://issues.apache.org/jira/browse/ARROW-9353) - [Python][CI] Nightly dask integration jobs fail
-* [ARROW-9354](https://issues.apache.org/jira/browse/ARROW-9354) - [C++] Turbodbc latest fails to build in the integration tests
-* [ARROW-9355](https://issues.apache.org/jira/browse/ARROW-9355) - [R] Fix -Wimplicit-int-float-conversion
-* [ARROW-9360](https://issues.apache.org/jira/browse/ARROW-9360) - [CI][Crossbow] Nightly homebrew-cpp job times out
-* [ARROW-9363](https://issues.apache.org/jira/browse/ARROW-9363) - [C++][Dataset] ParquetDatasetFactory schema: pandas metadata is lost
-* [ARROW-9368](https://issues.apache.org/jira/browse/ARROW-9368) - [Python] Rename predicate argument to filter in split\_by\_row\_group()
-* [ARROW-9373](https://issues.apache.org/jira/browse/ARROW-9373) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz)
-* [ARROW-9380](https://issues.apache.org/jira/browse/ARROW-9380) - [C++] Segfaults in compute::CallFunction
-* [ARROW-9384](https://issues.apache.org/jira/browse/ARROW-9384) - [C++] Out-of-memory on invalid IPC input (OSS-Fuzz)
-* [ARROW-9385](https://issues.apache.org/jira/browse/ARROW-9385) - [Python] [CI] jpype integration failure
-* [ARROW-9389](https://issues.apache.org/jira/browse/ARROW-9389) - [C++] Can't call isin/match through CallFunction
-* [ARROW-9397](https://issues.apache.org/jira/browse/ARROW-9397) - [R] Pass CC/CXX to cmake when building libarrow in Linux build
-* [ARROW-9408](https://issues.apache.org/jira/browse/ARROW-9408) - [Integration] Tests do not run in Windows due to numpy 64-bit errors
-* [ARROW-9409](https://issues.apache.org/jira/browse/ARROW-9409) - [CI][Crossbow] Nightly conda-r fails
-* [ARROW-9410](https://issues.apache.org/jira/browse/ARROW-9410) - [CI][Crossbow] Fix homebrew-cpp again
-* [ARROW-9413](https://issues.apache.org/jira/browse/ARROW-9413) - [Rust] Fix clippy lint on master
-* [ARROW-9415](https://issues.apache.org/jira/browse/ARROW-9415) - [C++] Arrow does not compile on Power9
-* [ARROW-9416](https://issues.apache.org/jira/browse/ARROW-9416) - [Go] Add test cases for some datatypes
-* [ARROW-9417](https://issues.apache.org/jira/browse/ARROW-9417) - [C++][IPC] size in message written in native endian
-* [ARROW-9418](https://issues.apache.org/jira/browse/ARROW-9418) - [R] nyc-taxi Parquet files not downloaded in binary mode on Windows
-* [ARROW-9419](https://issues.apache.org/jira/browse/ARROW-9419) - [C++] Test that "fill\_null" function works with sliced inputs, expand tests
-* [ARROW-9428](https://issues.apache.org/jira/browse/ARROW-9428) - [C++] Update documentation for buffer allocation functions
-* [ARROW-9436](https://issues.apache.org/jira/browse/ARROW-9436) - [C++][CI] Valgrind errors in fill\_null kernel tests
-* [ARROW-9438](https://issues.apache.org/jira/browse/ARROW-9438) - [CI] Spark integration tests are failing
-* [ARROW-9439](https://issues.apache.org/jira/browse/ARROW-9439) - [C++] Fix crash on invalid IPC input (OSS-Fuzz)
-* [ARROW-9440](https://issues.apache.org/jira/browse/ARROW-9440) - [Python] Expose Fill Null Compute Kernel in PyArrow
-* [ARROW-9443](https://issues.apache.org/jira/browse/ARROW-9443) - [C++] Bundled bz2 build should only build libbz2
-* [ARROW-9448](https://issues.apache.org/jira/browse/ARROW-9448) - [Java] Circular initialization between ArrowBuf and BaseAllocator leads to null HistoricalLog for empty buffer
-* [ARROW-9449](https://issues.apache.org/jira/browse/ARROW-9449) - [R] Strip arrow.so
-* [ARROW-9450](https://issues.apache.org/jira/browse/ARROW-9450) - [Python] "pytest pyarrow" takes over 10 seconds to collect tests and start executing
-* [ARROW-9456](https://issues.apache.org/jira/browse/ARROW-9456) - [Python] Dataset segfault when not importing pyarrow.parquet
-* [ARROW-9458](https://issues.apache.org/jira/browse/ARROW-9458) - [Python] Dataset Scanner is single-threaded only
-* [ARROW-9460](https://issues.apache.org/jira/browse/ARROW-9460) - [C++] BinaryContainsExact doesn't cope with double characters in the pattern
-* [ARROW-9461](https://issues.apache.org/jira/browse/ARROW-9461) - [Rust] Reading Date32 and Date64 errors - they are incorrectly converted to RecordBatch
-* [ARROW-9476](https://issues.apache.org/jira/browse/ARROW-9476) - [C++][Dataset] HivePartitioning discovery with dictionary types fails for multiple fields
-* [ARROW-9486](https://issues.apache.org/jira/browse/ARROW-9486) - [C++][Dataset] Support implicit casting InExpression::set\_ to dict
-* [ARROW-9497](https://issues.apache.org/jira/browse/ARROW-9497) - [C++][Parquet] Fix failure caused by malformed repetition/definition levels
-* [ARROW-9499](https://issues.apache.org/jira/browse/ARROW-9499) - [C++] AdaptiveIntBuilder::AppendNull does not increment the null count
-* [ARROW-9500](https://issues.apache.org/jira/browse/ARROW-9500) - [C++] Fix segfault with std::to\_string in -O3 builds on gcc 7.5.0
-* [ARROW-9501](https://issues.apache.org/jira/browse/ARROW-9501) - [C++][Gandiva] Add logic in timestampdiff() when end date is last day of a month
-* [ARROW-9503](https://issues.apache.org/jira/browse/ARROW-9503) - [Rust] Comparison of sliced arrays is wrong
-* [ARROW-9504](https://issues.apache.org/jira/browse/ARROW-9504) - [Python] Segmentation fault on ChunkedArray.take
-* [ARROW-9506](https://issues.apache.org/jira/browse/ARROW-9506) - [Packaging][Python] Fix macOS wheel build failures
-* [ARROW-9512](https://issues.apache.org/jira/browse/ARROW-9512) - [C++] Variadic template unpack inside lambda doesn't compile with gcc
-* [ARROW-9524](https://issues.apache.org/jira/browse/ARROW-9524) - [CI][Gandiva] C++ unit test arrow-ipc-read-write failing in gandiva nightly build
-* [ARROW-9527](https://issues.apache.org/jira/browse/ARROW-9527) - [Rust] Remove un-needed dev-dependencies
-* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow
-* [PARQUET-1839](https://issues.apache.org/jira/browse/PARQUET-1839) - [C++] values\_read not updated in ReadBatchSpaced
-* [PARQUET-1857](https://issues.apache.org/jira/browse/PARQUET-1857) - [C++][Parquet] ParquetFileReader unable to read files with more than 32767 row groups
-* [PARQUET-1865](https://issues.apache.org/jira/browse/PARQUET-1865) - [C++] Failure from C++17 feature used in parquet/encoding\_benchmark.cc
-* [PARQUET-1877](https://issues.apache.org/jira/browse/PARQUET-1877) - [C++] Reconcile container size with string size for memory issues
-* [PARQUET-1882](https://issues.apache.org/jira/browse/PARQUET-1882) - [C++] Writing an all-null column and then reading it with buffered\_stream aborts the process
-
-
-## New Features and Improvements
-
-* [ARROW-300](https://issues.apache.org/jira/browse/ARROW-300) - [Format] Add body buffer compression option to IPC message protocol using LZ4 or ZSTD
-* [ARROW-842](https://issues.apache.org/jira/browse/ARROW-842) - [Python] Handle more kinds of null sentinel objects from pandas 0.x
-* [ARROW-971](https://issues.apache.org/jira/browse/ARROW-971) - [C++/Python] Implement Array.isvalid/notnull/isnull as scalar functions
-* [ARROW-974](https://issues.apache.org/jira/browse/ARROW-974) - [Website] Add Use Cases section to the website
-* [ARROW-1277](https://issues.apache.org/jira/browse/ARROW-1277) - Completing integration tests for major implemented data types
-* [ARROW-1567](https://issues.apache.org/jira/browse/ARROW-1567) - [C++] Implement "fill null" kernels that replace null values with some scalar replacement value
-* [ARROW-1570](https://issues.apache.org/jira/browse/ARROW-1570) - [C++] Define API for creating a kernel instance from function of scalar input and output with a particular signature
-* [ARROW-1682](https://issues.apache.org/jira/browse/ARROW-1682) - [Python] Add documentation / example for reading a directory of Parquet files on S3
-* [ARROW-1796](https://issues.apache.org/jira/browse/ARROW-1796) - [Python] RowGroup filtering on file level
-* [ARROW-2260](https://issues.apache.org/jira/browse/ARROW-2260) - [C++][Plasma] plasma\_store should show usage
-* [ARROW-2444](https://issues.apache.org/jira/browse/ARROW-2444) - [Python][C++] Better handle reading empty parquet files
-* [ARROW-2702](https://issues.apache.org/jira/browse/ARROW-2702) - [Python] Examine usages of Invalid and TypeError errors in numpy\_to\_arrow.cc to see if we are using the right error type in each instance
-* [ARROW-2714](https://issues.apache.org/jira/browse/ARROW-2714) - [C++/Python] Variable step size slicing for arrays
-* [ARROW-2912](https://issues.apache.org/jira/browse/ARROW-2912) - [Website] Build more detailed Community landing page a la Apache Spark
-* [ARROW-3089](https://issues.apache.org/jira/browse/ARROW-3089) - [Rust] Add ArrayBuilder for different Arrow arrays
-* [ARROW-3134](https://issues.apache.org/jira/browse/ARROW-3134) - [C++] Implement n-ary iterator for a collection of chunked arrays with possibly different chunking layouts
-* [ARROW-3154](https://issues.apache.org/jira/browse/ARROW-3154) - [Python][C++] Document how to write \_metadata, \_common\_metadata files with Parquet datasets
-* [ARROW-3244](https://issues.apache.org/jira/browse/ARROW-3244) - [Python] Multi-file parquet loading without scan
-* [ARROW-3275](https://issues.apache.org/jira/browse/ARROW-3275) - [Python] Add documentation about inspecting Parquet file metadata
-* [ARROW-3308](https://issues.apache.org/jira/browse/ARROW-3308) - [R] Convert R character vector with data exceeding 2GB to Large type
-* [ARROW-3317](https://issues.apache.org/jira/browse/ARROW-3317) - [R] Test/support conversions from data.frame with a single character column exceeding 2GB capacity of BinaryArray
-* [ARROW-3446](https://issues.apache.org/jira/browse/ARROW-3446) - [R] Document mapping of Arrow <-\> R types
-* [ARROW-3509](https://issues.apache.org/jira/browse/ARROW-3509) - [C++] Inconsistent child accessor naming
-* [ARROW-3520](https://issues.apache.org/jira/browse/ARROW-3520) - [C++] Implement List Flatten kernel
-* [ARROW-3688](https://issues.apache.org/jira/browse/ARROW-3688) - [Rust] Implement PrimitiveArrayBuilder<T\>.push\_values
-* [ARROW-3827](https://issues.apache.org/jira/browse/ARROW-3827) - [Rust] Implement UnionArray
-* [ARROW-4022](https://issues.apache.org/jira/browse/ARROW-4022) - [C++] Promote Datum variant out of compute namespace
-* [ARROW-4221](https://issues.apache.org/jira/browse/ARROW-4221) - [Format] Add canonical flag in COO sparse index
-* [ARROW-4390](https://issues.apache.org/jira/browse/ARROW-4390) - [R] Serialize "labeled" metadata in Feather files, IPC messages
-* [ARROW-4412](https://issues.apache.org/jira/browse/ARROW-4412) - [DOCUMENTATION] Add explicit version numbers to the arrow specification documents
-* [ARROW-4427](https://issues.apache.org/jira/browse/ARROW-4427) - [Doc] Move Confluence Wiki pages to the Sphinx docs
-* [ARROW-4429](https://issues.apache.org/jira/browse/ARROW-4429) - [Doc] Add git rebase tips to the 'Contributing' page in the developer docs
-* [ARROW-5035](https://issues.apache.org/jira/browse/ARROW-5035) - [C\#] ArrowBuffer.Builder<bool\> is broken
-* [ARROW-5082](https://issues.apache.org/jira/browse/ARROW-5082) - [Python][Packaging] Reduce size of macOS and manylinux1 wheels
-* [ARROW-5143](https://issues.apache.org/jira/browse/ARROW-5143) - [Flight] Enable integration testing of batches with dictionaries
-* [ARROW-5279](https://issues.apache.org/jira/browse/ARROW-5279) - [C++] Support reading delta dictionaries in IPC streams
-* [ARROW-5377](https://issues.apache.org/jira/browse/ARROW-5377) - [C++] Make IpcPayload public and add GetPayloadSize
-* [ARROW-5489](https://issues.apache.org/jira/browse/ARROW-5489) - [C++] Normalize kernels and ChunkedArray behavior
-* [ARROW-5548](https://issues.apache.org/jira/browse/ARROW-5548) - [Documentation] http://arrow.apache.org/docs/latest/ is not latest
-* [ARROW-5649](https://issues.apache.org/jira/browse/ARROW-5649) - [Integration][C++] Create round trip integration test for extension types
-* [ARROW-5708](https://issues.apache.org/jira/browse/ARROW-5708) - [C\#] Null support for BooleanArray
-* [ARROW-5760](https://issues.apache.org/jira/browse/ARROW-5760) - [C++] Optimize Take implementation
-* [ARROW-5854](https://issues.apache.org/jira/browse/ARROW-5854) - [Python] Expose compare kernels on Array class
-* [ARROW-6052](https://issues.apache.org/jira/browse/ARROW-6052) - [C++] Divide up arrow/array.h,cc into files in arrow/array/ similar to builder files
-* [ARROW-6110](https://issues.apache.org/jira/browse/ARROW-6110) - [Java] Support LargeList Type and add integration test with C++
-* [ARROW-6111](https://issues.apache.org/jira/browse/ARROW-6111) - [Java] Support LargeVarChar and LargeBinary types and add integration test with C++
-* [ARROW-6439](https://issues.apache.org/jira/browse/ARROW-6439) - [R] Implement S3 file-system interface in R
-* [ARROW-6456](https://issues.apache.org/jira/browse/ARROW-6456) - [C++] Possible to reduce object code generated in compute/kernels/take.cc?
-* [ARROW-6501](https://issues.apache.org/jira/browse/ARROW-6501) - [C++] Remove non\_zero\_length field from SparseIndex
-* [ARROW-6521](https://issues.apache.org/jira/browse/ARROW-6521) - [C++] Add function to arrow:: namespace that returns the current ABI version
-* [ARROW-6543](https://issues.apache.org/jira/browse/ARROW-6543) - [R] Support LargeBinary and LargeString types
-* [ARROW-6602](https://issues.apache.org/jira/browse/ARROW-6602) - [Doc] Add feature / implementation matrix
-* [ARROW-6603](https://issues.apache.org/jira/browse/ARROW-6603) - [C\#] ArrayBuilder API to support writing nulls
-* [ARROW-6645](https://issues.apache.org/jira/browse/ARROW-6645) - [Python] Faster boundschecking of dictionary indices when converting to Categorical
-* [ARROW-6689](https://issues.apache.org/jira/browse/ARROW-6689) - [Rust] [DataFusion] Query execution enhancements for 1.0.0 release
-* [ARROW-6691](https://issues.apache.org/jira/browse/ARROW-6691) - [Rust] [DataFusion] Use tokio and Futures instead of spawning threads
-* [ARROW-6775](https://issues.apache.org/jira/browse/ARROW-6775) - [C++] [Python] Proposal for several Array utility functions
-* [ARROW-6776](https://issues.apache.org/jira/browse/ARROW-6776) - [Python] Need a lite version of pyarrow
-* [ARROW-6800](https://issues.apache.org/jira/browse/ARROW-6800) - [C++] Add CMake option to build libraries targeting a C++14 or C++17 toolchain environment
-* [ARROW-6839](https://issues.apache.org/jira/browse/ARROW-6839) - [Java] Add APIs to read and write "custom\_metadata" field of IPC file footer
-* [ARROW-6856](https://issues.apache.org/jira/browse/ARROW-6856) - [C++] Use ArrayData instead of Array for ArrayData::dictionary
-* [ARROW-6917](https://issues.apache.org/jira/browse/ARROW-6917) - [Archery][Release] Add support for JIRA curation, changelog generation and commit cherry-picking for maintenance releases
-* [ARROW-6945](https://issues.apache.org/jira/browse/ARROW-6945) - [Rust] Enable integration tests
-* [ARROW-6959](https://issues.apache.org/jira/browse/ARROW-6959) - [C++] Clarify what signatures are preferred for compute kernels
-* [ARROW-6978](https://issues.apache.org/jira/browse/ARROW-6978) - [R] Add bindings for sum and mean compute kernels
-* [ARROW-6979](https://issues.apache.org/jira/browse/ARROW-6979) - [R] Enable jemalloc in autobrew formula
-* [ARROW-7009](https://issues.apache.org/jira/browse/ARROW-7009) - [C++] Refactor filter/take kernels to use Datum instead of overloads
-* [ARROW-7010](https://issues.apache.org/jira/browse/ARROW-7010) - [C++] Support lossy casts from decimal128 to float32 and float64/double
-* [ARROW-7011](https://issues.apache.org/jira/browse/ARROW-7011) - [C++] Implement casts from float/double to decimal128
-* [ARROW-7012](https://issues.apache.org/jira/browse/ARROW-7012) - [C++] Clarify ChunkedArray chunking strategy and policy
-* [ARROW-7068](https://issues.apache.org/jira/browse/ARROW-7068) - [C++] Expose the offsets of a ListArray as an Int32Array
-* [ARROW-7075](https://issues.apache.org/jira/browse/ARROW-7075) - [C++] Boolean kernels should not allocate in Call()
-* [ARROW-7175](https://issues.apache.org/jira/browse/ARROW-7175) - [Website] Add a security page to track when vulnerabilities are patched
-* [ARROW-7229](https://issues.apache.org/jira/browse/ARROW-7229) - [C++] Unify ConcatenateTables APIs
-* [ARROW-7230](https://issues.apache.org/jira/browse/ARROW-7230) - [C++] Use vendored std::optional instead of boost::optional in Gandiva
-* [ARROW-7237](https://issues.apache.org/jira/browse/ARROW-7237) - [C++] Add Result<T\> to APIs to arrow/json
-* [ARROW-7243](https://issues.apache.org/jira/browse/ARROW-7243) - [Docs] Add common "implementation status" table to the README of each native language implementation, as well as top level README
-* [ARROW-7285](https://issues.apache.org/jira/browse/ARROW-7285) - [C++] ensure C++ implementation meets clarified dictionary spec
-* [ARROW-7300](https://issues.apache.org/jira/browse/ARROW-7300) - [C++][Gandiva] Implement functions to cast from strings to integers/floats
-* [ARROW-7313](https://issues.apache.org/jira/browse/ARROW-7313) - [C++] Add function for retrieving a scalar from an array slot
-* [ARROW-7371](https://issues.apache.org/jira/browse/ARROW-7371) - [GLib] Add Datasets binding
-* [ARROW-7375](https://issues.apache.org/jira/browse/ARROW-7375) - [Python] Expose C++ MakeArrayOfNull
-* [ARROW-7391](https://issues.apache.org/jira/browse/ARROW-7391) - [Python] Remove unnecessary classes from the binding layer
-* [ARROW-7495](https://issues.apache.org/jira/browse/ARROW-7495) - [Java] Remove "empty" concept from ArrowBuf, replace with custom referencemanager
-* [ARROW-7605](https://issues.apache.org/jira/browse/ARROW-7605) - [C++] Create and install static library containing all dependencies built by Arrow
-* [ARROW-7607](https://issues.apache.org/jira/browse/ARROW-7607) - [C++] Add to cpp/examples minimal examples of using Arrow as a dependency of another CMake project
-* [ARROW-7673](https://issues.apache.org/jira/browse/ARROW-7673) - [C++][Dataset] Revisit File discovery failure mode
-* [ARROW-7676](https://issues.apache.org/jira/browse/ARROW-7676) - [Packaging][Python] Ensure that the static libraries are not built in the wheel scripts
-* [ARROW-7699](https://issues.apache.org/jira/browse/ARROW-7699) - [Java] Support concatenating dense union vectors in batch
-* [ARROW-7705](https://issues.apache.org/jira/browse/ARROW-7705) - [Rust] Initial sort implementation
-* [ARROW-7717](https://issues.apache.org/jira/browse/ARROW-7717) - [CI] Have nightly integration test for Spark's latest release
-* [ARROW-7759](https://issues.apache.org/jira/browse/ARROW-7759) - [C++][Dataset] Add CsvFileFormat for CSV support
-* [ARROW-7778](https://issues.apache.org/jira/browse/ARROW-7778) - [C++] Support nested dictionaries in JSON integration format
-* [ARROW-7784](https://issues.apache.org/jira/browse/ARROW-7784) - [C++] diff.cc is extremely slow to compile
-* [ARROW-7801](https://issues.apache.org/jira/browse/ARROW-7801) - [Developer] Add issue\_comment workflow to fix lint/style/codegen
-* [ARROW-7803](https://issues.apache.org/jira/browse/ARROW-7803) - [R][CI] Autobrew/homebrew tests should not always install from master
-* [ARROW-7831](https://issues.apache.org/jira/browse/ARROW-7831) - [Java] unnecessary buffer allocation when calling splitAndTransferTo on variable width vectors
-* [ARROW-7902](https://issues.apache.org/jira/browse/ARROW-7902) - [Integration] Unskip nested dictionary integration tests
-* [ARROW-7910](https://issues.apache.org/jira/browse/ARROW-7910) - [C++] Provide function to query page size portably
-* [ARROW-7924](https://issues.apache.org/jira/browse/ARROW-7924) - [Rust] Add sort for float types
-* [ARROW-7950](https://issues.apache.org/jira/browse/ARROW-7950) - [Python] When initializing pandas API shim, inform user if their installed pandas version is too old
-* [ARROW-7955](https://issues.apache.org/jira/browse/ARROW-7955) - [Java] Support large buffer for file/stream IPC
-* [ARROW-8020](https://issues.apache.org/jira/browse/ARROW-8020) - [Java] Implement vector validate functionality
-* [ARROW-8023](https://issues.apache.org/jira/browse/ARROW-8023) - [Website] Write a blog post about the C data interface
-* [ARROW-8025](https://issues.apache.org/jira/browse/ARROW-8025) - [C++] Implement cast to Binary and FixedSizeBinary
-* [ARROW-8046](https://issues.apache.org/jira/browse/ARROW-8046) - [Developer][Integration] Makefile.docker's target names are broken
-* [ARROW-8062](https://issues.apache.org/jira/browse/ARROW-8062) - [C++][Dataset] Parquet Dataset factory from a \_metadata/\_common\_metadata file
-* [ARROW-8065](https://issues.apache.org/jira/browse/ARROW-8065) - [C++][Dataset] Untangle Dataset, Fragment and ScanOptions
-* [ARROW-8074](https://issues.apache.org/jira/browse/ARROW-8074) - [C++][Dataset] Support for file-like objects (buffers) in FileSystemDataset?
-* [ARROW-8108](https://issues.apache.org/jira/browse/ARROW-8108) - [Java] Extract a common interface for dictionary encoders
-* [ARROW-8111](https://issues.apache.org/jira/browse/ARROW-8111) - [C++][CSV] Support MM/DD/YYYY date format
-* [ARROW-8114](https://issues.apache.org/jira/browse/ARROW-8114) - [Java][Integration] Enable custom\_metadata integration test
-* [ARROW-8121](https://issues.apache.org/jira/browse/ARROW-8121) - [Java] Enhance code style checking for Java code (add space after commas, semi-colons and type casts)
-* [ARROW-8149](https://issues.apache.org/jira/browse/ARROW-8149) - [C++/Python] Enable CUDA Support in conda recipes
-* [ARROW-8157](https://issues.apache.org/jira/browse/ARROW-8157) - [C++][Gandiva] Support building with LLVM 9
-* [ARROW-8162](https://issues.apache.org/jira/browse/ARROW-8162) - [Format][Python] Add serialization for CSF sparse tensors
-* [ARROW-8169](https://issues.apache.org/jira/browse/ARROW-8169) - [Java] Improve the performance of JDBC adapter by allocating memory proactively
-* [ARROW-8171](https://issues.apache.org/jira/browse/ARROW-8171) - Consider pre-allocating memory for fixed-width vectors in the Avro adapter iterator
-* [ARROW-8190](https://issues.apache.org/jira/browse/ARROW-8190) - [C++][Flight] Allow setting IpcWriteOptions and IpcReadOptions in Flight IPC message reader and writer classes
-* [ARROW-8229](https://issues.apache.org/jira/browse/ARROW-8229) - [Java] Move ArrowBuf into the Arrow package
-* [ARROW-8230](https://issues.apache.org/jira/browse/ARROW-8230) - [Java] Move Netty memory manager into a separate module
-* [ARROW-8261](https://issues.apache.org/jira/browse/ARROW-8261) - [Rust] [DataFusion] LogicalPlanBuilder.limit() should take a literal argument
-* [ARROW-8263](https://issues.apache.org/jira/browse/ARROW-8263) - [Rust] [DataFusion] Add documentation for supported SQL functions
-* [ARROW-8281](https://issues.apache.org/jira/browse/ARROW-8281) - [R] Name collision of arrow.dll on Windows conda
-* [ARROW-8283](https://issues.apache.org/jira/browse/ARROW-8283) - [Python][Dataset] Non-existent files are silently dropped in pa.dataset.FileSystemDataset
-* [ARROW-8287](https://issues.apache.org/jira/browse/ARROW-8287) - [Rust] Arrow examples should use utility to print results
-* [ARROW-8293](https://issues.apache.org/jira/browse/ARROW-8293) - [Python] Run flake8 on python/examples also
-* [ARROW-8297](https://issues.apache.org/jira/browse/ARROW-8297) - [FlightRPC][C++] Implement Flight DoExchange for C++
-* [ARROW-8301](https://issues.apache.org/jira/browse/ARROW-8301) - [R] Handle ChunkedArray and Table in C data interface
-* [ARROW-8312](https://issues.apache.org/jira/browse/ARROW-8312) - [Java][Gandiva] improve IN expression support
-* [ARROW-8314](https://issues.apache.org/jira/browse/ARROW-8314) - [Python] Provide a method to select a subset of columns of a Table
-* [ARROW-8318](https://issues.apache.org/jira/browse/ARROW-8318) - [C++][Dataset] Dataset should instantiate Fragment
-* [ARROW-8399](https://issues.apache.org/jira/browse/ARROW-8399) - [Rust] Extend memory alignments to include other architectures
-* [ARROW-8413](https://issues.apache.org/jira/browse/ARROW-8413) - [C++] Refactor DefLevelsToBitmap
-* [ARROW-8422](https://issues.apache.org/jira/browse/ARROW-8422) - [Rust] [Parquet] Implement function to convert Arrow schema to Parquet schema
-* [ARROW-8430](https://issues.apache.org/jira/browse/ARROW-8430) - [CI] Configure self-hosted runners for Github Actions
-* [ARROW-8434](https://issues.apache.org/jira/browse/ARROW-8434) - [C++] Ipc RecordBatchFileReader deserializes the Schema multiple times
-* [ARROW-8440](https://issues.apache.org/jira/browse/ARROW-8440) - [C++] Refine simd header files
-* [ARROW-8443](https://issues.apache.org/jira/browse/ARROW-8443) - [Gandiva][C++] Fix round/truncate to no-op for special cases
-* [ARROW-8447](https://issues.apache.org/jira/browse/ARROW-8447) - [C++][Dataset] Ensure Scanner::ToTable preserve ordering of ScanTasks
-* [ARROW-8467](https://issues.apache.org/jira/browse/ARROW-8467) - [C++] Test cases using ArrayFromJSON assume only a little-endian platform
-* [ARROW-8474](https://issues.apache.org/jira/browse/ARROW-8474) - [CI][Crossbow] Skip some nightlies we don't need to run
-* [ARROW-8477](https://issues.apache.org/jira/browse/ARROW-8477) - [C++] Enable reading and writing of long filenames for Windows
-* [ARROW-8481](https://issues.apache.org/jira/browse/ARROW-8481) - [Java] Provide an allocation manager based on Unsafe API
-* [ARROW-8483](https://issues.apache.org/jira/browse/ARROW-8483) - [Ruby] Arrow::Table documentation improvement
-* [ARROW-8485](https://issues.apache.org/jira/browse/ARROW-8485) - [Integration][Java] Implement extension types integration
-* [ARROW-8486](https://issues.apache.org/jira/browse/ARROW-8486) - [C++] arrow-utility-test causes failures on a big-endian platform
-* [ARROW-8487](https://issues.apache.org/jira/browse/ARROW-8487) - [FlightRPC][C++] Make it possible to target a specific payload size
-* [ARROW-8488](https://issues.apache.org/jira/browse/ARROW-8488) - [R] Replace VALUE\_OR\_STOP with ValueOrStop
-* [ARROW-8496](https://issues.apache.org/jira/browse/ARROW-8496) - [C++] Refine ByteStreamSplitDecodeScalar
-* [ARROW-8497](https://issues.apache.org/jira/browse/ARROW-8497) - [Archery] Add missing component to builds
-* [ARROW-8499](https://issues.apache.org/jira/browse/ARROW-8499) - [C++][Dataset] In ScannerBuilder, batch\_size will not work if the projector is not empty
-* [ARROW-8500](https://issues.apache.org/jira/browse/ARROW-8500) - [C++] Use selection vectors in Filter implementation for record batches, tables
-* [ARROW-8501](https://issues.apache.org/jira/browse/ARROW-8501) - [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6
-* [ARROW-8502](https://issues.apache.org/jira/browse/ARROW-8502) - [Release][APT][Yum] Ignore all arm64 verifications
-* [ARROW-8504](https://issues.apache.org/jira/browse/ARROW-8504) - [C++] Add Run Length Reader
-* [ARROW-8506](https://issues.apache.org/jira/browse/ARROW-8506) - [C++] Missing tests to verify expected\_buffer with bit\_width \> 8 in RLE
-* [ARROW-8507](https://issues.apache.org/jira/browse/ARROW-8507) - [Release] Detect .git directory automatically in changelog.py
-* [ARROW-8509](https://issues.apache.org/jira/browse/ARROW-8509) - [GLib] Add low level record batch read/write functions
-* [ARROW-8512](https://issues.apache.org/jira/browse/ARROW-8512) - [C++] Delete unused compute expr prototype code
-* [ARROW-8513](https://issues.apache.org/jira/browse/ARROW-8513) - [Python] Expose Take with Table input in Python
-* [ARROW-8515](https://issues.apache.org/jira/browse/ARROW-8515) - [C++] Bitmap ToString should have an option of grouping by bytes
-* [ARROW-8516](https://issues.apache.org/jira/browse/ARROW-8516) - [Rust] Slow BufferBuilder<BooleanType\> inserts within PrimitiveBuilder::append\_slice
-* [ARROW-8517](https://issues.apache.org/jira/browse/ARROW-8517) - [Developer][Release] Update Crossbow RC verification setup for changes since 0.16.0
-* [ARROW-8520](https://issues.apache.org/jira/browse/ARROW-8520) - [Developer] Use .asf.yaml to direct GitHub notifications to e-mail lists and JIRA
-* [ARROW-8521](https://issues.apache.org/jira/browse/ARROW-8521) - [Developer] Group Sub-task, Task, Test, and Wish issue types as "Improvement" in Changelog
-* [ARROW-8522](https://issues.apache.org/jira/browse/ARROW-8522) - [Developer] Add environment variable option to toggle whether ephemeral NodeJS is installed in release verification script
-* [ARROW-8524](https://issues.apache.org/jira/browse/ARROW-8524) - [CI] Free up space on github actions
-* [ARROW-8526](https://issues.apache.org/jira/browse/ARROW-8526) - [Python] Fix non-deterministic row order failure in dataset tests
-* [ARROW-8531](https://issues.apache.org/jira/browse/ARROW-8531) - [C++] Deprecate ARROW\_USE\_SIMD CMake option
-* [ARROW-8538](https://issues.apache.org/jira/browse/ARROW-8538) - [Packaging] Remove boost from homebrew formula
-* [ARROW-8540](https://issues.apache.org/jira/browse/ARROW-8540) - [C++] Create memory allocation benchmark
-* [ARROW-8541](https://issues.apache.org/jira/browse/ARROW-8541) - [Release] Don't remove previous source releases automatically
-* [ARROW-8542](https://issues.apache.org/jira/browse/ARROW-8542) - [Release] Fix checksum url in the website post release script
-* [ARROW-8543](https://issues.apache.org/jira/browse/ARROW-8543) - [C++] IO: single pass coalescing algorithm
-* [ARROW-8544](https://issues.apache.org/jira/browse/ARROW-8544) - [CI][Crossbow] Add a status.json to the gh-pages summary of nightly builds to get around rate limiting
-* [ARROW-8548](https://issues.apache.org/jira/browse/ARROW-8548) - [Website] 0.17 release post
-* [ARROW-8549](https://issues.apache.org/jira/browse/ARROW-8549) - [R] Assorted post-0.17 release cleanups
-* [ARROW-8550](https://issues.apache.org/jira/browse/ARROW-8550) - [CI] Don't run cron GHA jobs on forks
-* [ARROW-8551](https://issues.apache.org/jira/browse/ARROW-8551) - [CI][Gandiva] Use LLVM 8 to build gandiva linux jar
-* [ARROW-8552](https://issues.apache.org/jira/browse/ARROW-8552) - [Rust] support column iteration for parquet row
-* [ARROW-8553](https://issues.apache.org/jira/browse/ARROW-8553) - [C++] Optimize unaligned bitmap operations
-* [ARROW-8555](https://issues.apache.org/jira/browse/ARROW-8555) - [FlightRPC][Java] Implement Flight DoExchange for Java
-* [ARROW-8558](https://issues.apache.org/jira/browse/ARROW-8558) - [Rust] GitHub Actions missing rustfmt
-* [ARROW-8559](https://issues.apache.org/jira/browse/ARROW-8559) - [Rust] Consolidate Record Batch reader traits in main arrow crate
-* [ARROW-8560](https://issues.apache.org/jira/browse/ARROW-8560) - [Rust] Docs for MutableBuffer resize are incorrect
-* [ARROW-8561](https://issues.apache.org/jira/browse/ARROW-8561) - [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
-* [ARROW-8562](https://issues.apache.org/jira/browse/ARROW-8562) - [C++] IO: Parameterize I/O coalescing using S3 storage metrics
-* [ARROW-8563](https://issues.apache.org/jira/browse/ARROW-8563) - [Go] Minor change to make newBuilder public
-* [ARROW-8564](https://issues.apache.org/jira/browse/ARROW-8564) - [Website] Add Ubuntu 20.04 LTS to supported package list
-* [ARROW-8569](https://issues.apache.org/jira/browse/ARROW-8569) - [CI] Upgrade xcode version for testing homebrew formulae
-* [ARROW-8571](https://issues.apache.org/jira/browse/ARROW-8571) - [C++] Switch AppVeyor image to VS 2017
-* [ARROW-8572](https://issues.apache.org/jira/browse/ARROW-8572) - [Python] Expose UnionArray.array and other fields
-* [ARROW-8573](https://issues.apache.org/jira/browse/ARROW-8573) - [Rust] Upgrade to Rust 1.44 nightly
-* [ARROW-8574](https://issues.apache.org/jira/browse/ARROW-8574) - [Rust] Implement Debug for all plain types
-* [ARROW-8575](https://issues.apache.org/jira/browse/ARROW-8575) - [Developer] Add issue\_comment workflow to rebase a PR
-* [ARROW-8590](https://issues.apache.org/jira/browse/ARROW-8590) - [Rust] Use Arrow pretty print utility in DataFusion
-* [ARROW-8591](https://issues.apache.org/jira/browse/ARROW-8591) - [Rust] Reverse lookup for a key in DictionaryArray
-* [ARROW-8597](https://issues.apache.org/jira/browse/ARROW-8597) - [Rust] arrow crate lint and readability improvements
-* [ARROW-8606](https://issues.apache.org/jira/browse/ARROW-8606) - [CI] Don't trigger all builds on a change to any file in ci/
-* [ARROW-8607](https://issues.apache.org/jira/browse/ARROW-8607) - [R][CI] Unbreak builds following R 4.0 release
-* [ARROW-8611](https://issues.apache.org/jira/browse/ARROW-8611) - [R] Can't install arrow 0.17 on Ubuntu 18.04 R 3.6.3
-* [ARROW-8612](https://issues.apache.org/jira/browse/ARROW-8612) - [GLib] Add GArrowReadOptions and GArrowWriteOptions
-* [ARROW-8616](https://issues.apache.org/jira/browse/ARROW-8616) - [Rust] Turn explicit SIMD off by default
-* [ARROW-8619](https://issues.apache.org/jira/browse/ARROW-8619) - [C++] Use distinct Type::type values for interval types
-* [ARROW-8622](https://issues.apache.org/jira/browse/ARROW-8622) - [Rust] Parquet crate does not compile on aarch64
-* [ARROW-8623](https://issues.apache.org/jira/browse/ARROW-8623) - [C++][Gandiva] Reduce use of Boost, remove Boost headers from header files
-* [ARROW-8624](https://issues.apache.org/jira/browse/ARROW-8624) - [Website] Install page should mention arrow-dataset packages
-* [ARROW-8628](https://issues.apache.org/jira/browse/ARROW-8628) - [CI][Dev] Wrap docker-compose commands with archery
-* [ARROW-8629](https://issues.apache.org/jira/browse/ARROW-8629) - [Rust] Eliminate indirection of ZST allocations
-* [ARROW-8633](https://issues.apache.org/jira/browse/ARROW-8633) - [C++] Add ValidateAscii function
-* [ARROW-8634](https://issues.apache.org/jira/browse/ARROW-8634) - [Java] Create an example
-* [ARROW-8639](https://issues.apache.org/jira/browse/ARROW-8639) - [C++][Plasma] Require gflags
-* [ARROW-8645](https://issues.apache.org/jira/browse/ARROW-8645) - [C++] Missing gflags dependency for plasma
-* [ARROW-8647](https://issues.apache.org/jira/browse/ARROW-8647) - [C++][Dataset] Optionally encode partition field values as dictionary type
-* [ARROW-8648](https://issues.apache.org/jira/browse/ARROW-8648) - [Rust] Optimize Rust CI Build Times
-* [ARROW-8650](https://issues.apache.org/jira/browse/ARROW-8650) - [Rust] [Website] Add documentation to Arrow website
-* [ARROW-8651](https://issues.apache.org/jira/browse/ARROW-8651) - [Python][Dataset] Support pickling of Dataset objects
-* [ARROW-8655](https://issues.apache.org/jira/browse/ARROW-8655) - [C++][Dataset][Python][R] Preserve partitioning information for a discovered Dataset
-* [ARROW-8656](https://issues.apache.org/jira/browse/ARROW-8656) - [Python] Switch to VS2017 in the windows wheel builds
-* [ARROW-8659](https://issues.apache.org/jira/browse/ARROW-8659) - [Rust] ListBuilder and FixedSizeListBuilder capacity
-* [ARROW-8660](https://issues.apache.org/jira/browse/ARROW-8660) - [C++][Gandiva] Reduce dependence on Boost
-* [ARROW-8662](https://issues.apache.org/jira/browse/ARROW-8662) - [CI] Consolidate appveyor scripts
-* [ARROW-8664](https://issues.apache.org/jira/browse/ARROW-8664) - [Java] Add skip null check to all Vector types
-* [ARROW-8668](https://issues.apache.org/jira/browse/ARROW-8668) - [Packaging][APT][Yum][ARM] Use Travis CI's ARM machine to build packages
-* [ARROW-8669](https://issues.apache.org/jira/browse/ARROW-8669) - [C++] Add IpcWriteOptions argument to GetRecordBatchSize()
-* [ARROW-8671](https://issues.apache.org/jira/browse/ARROW-8671) - [C++] Use IPC body compression metadata approved in ARROW-300
-* [ARROW-8682](https://issues.apache.org/jira/browse/ARROW-8682) - [Ruby][Parquet] Add support for column level compression
-* [ARROW-8687](https://issues.apache.org/jira/browse/ARROW-8687) - [Java] Finish move of io.netty.buffer.ArrowBuf
-* [ARROW-8690](https://issues.apache.org/jira/browse/ARROW-8690) - [Python] Clean up dataset+parquet tests now that order is deterministic
-* [ARROW-8692](https://issues.apache.org/jira/browse/ARROW-8692) - [C++] Avoid memory copies when downloading from S3
-* [ARROW-8695](https://issues.apache.org/jira/browse/ARROW-8695) - [Java] remove references to PlatformDependent in memory module
-* [ARROW-8696](https://issues.apache.org/jira/browse/ARROW-8696) - [Java] Convert tests to integration tests
-* [ARROW-8699](https://issues.apache.org/jira/browse/ARROW-8699) - [R] Fix automatic r\_to\_py conversion
-* [ARROW-8702](https://issues.apache.org/jira/browse/ARROW-8702) - [Packaging][C\#] Build NuGet packages in release process
-* [ARROW-8703](https://issues.apache.org/jira/browse/ARROW-8703) - [R] schema$metadata should be properly typed
-* [ARROW-8707](https://issues.apache.org/jira/browse/ARROW-8707) - [CI] Docker push fails because of wrong dockerhub credentials
-* [ARROW-8708](https://issues.apache.org/jira/browse/ARROW-8708) - [CI] Utilize github actions cache for docker-compose volumes
-* [ARROW-8711](https://issues.apache.org/jira/browse/ARROW-8711) - [Python] Expose strptime timestamp parsing in read\_csv conversion options
-* [ARROW-8717](https://issues.apache.org/jira/browse/ARROW-8717) - [CI][Packaging] Add build dependency on boost to homebrew
-* [ARROW-8720](https://issues.apache.org/jira/browse/ARROW-8720) - [C++] Fix checked\_pointer\_cast
-* [ARROW-8721](https://issues.apache.org/jira/browse/ARROW-8721) - [CI] Fix R build matrix
-* [ARROW-8723](https://issues.apache.org/jira/browse/ARROW-8723) - [Rust] Remove SIMD specific benchmark code
-* [ARROW-8724](https://issues.apache.org/jira/browse/ARROW-8724) - [Packaging][deb][RPM] Use directory in host as build directory
-* [ARROW-8725](https://issues.apache.org/jira/browse/ARROW-8725) - [Rust] redundant directory walk in rust parquet datasource code
-* [ARROW-8727](https://issues.apache.org/jira/browse/ARROW-8727) - [C++] Do not require struct-initialization of StringConverter<T\> to parse strings to other types
-* [ARROW-8730](https://issues.apache.org/jira/browse/ARROW-8730) - [Rust] Use slice instead of &Vec for function arguments
-* [ARROW-8733](https://issues.apache.org/jira/browse/ARROW-8733) - [C++][Dataset][Python] ParquetFileFragment should provide access to parquet FileMetadata
-* [ARROW-8736](https://issues.apache.org/jira/browse/ARROW-8736) - [Rust] [DataFusion] Table API should provide a schema() method
-* [ARROW-8740](https://issues.apache.org/jira/browse/ARROW-8740) - [CI] Fix archery option in pandas master cron test
-* [ARROW-8742](https://issues.apache.org/jira/browse/ARROW-8742) - [C++][Python] Add flight client support for Mutual TLS
-* [ARROW-8743](https://issues.apache.org/jira/browse/ARROW-8743) - [C++][CI] Add a test job on s390x
-* [ARROW-8744](https://issues.apache.org/jira/browse/ARROW-8744) - [Rust] ParquetIterator's next method should be safe to call even after reaching the end of iteration
-* [ARROW-8745](https://issues.apache.org/jira/browse/ARROW-8745) - [C++] Bitmap.ToString causes failures on a big-endian platform
-* [ARROW-8747](https://issues.apache.org/jira/browse/ARROW-8747) - [C++] Feather tests with compression cause failure on big-endian platforms
-* [ARROW-8751](https://issues.apache.org/jira/browse/ARROW-8751) - [Rust] ParquetFileArrowReader should be able to read empty parquet file without error
-* [ARROW-8752](https://issues.apache.org/jira/browse/ARROW-8752) - [Rust] Remove unused hashmap
-* [ARROW-8753](https://issues.apache.org/jira/browse/ARROW-8753) - [C++][CI] Add a test job on ARM
-* [ARROW-8754](https://issues.apache.org/jira/browse/ARROW-8754) - [C++][CI] enable tests for additional components on big-endian platforms
-* [ARROW-8756](https://issues.apache.org/jira/browse/ARROW-8756) - [C++] Bitmap word tests cause failures on a big-endian platform
-* [ARROW-8757](https://issues.apache.org/jira/browse/ARROW-8757) - [C++] Plasma header is written in native endian
-* [ARROW-8758](https://issues.apache.org/jira/browse/ARROW-8758) - [R] Updates for compatibility with dplyr 1.0
-* [ARROW-8759](https://issues.apache.org/jira/browse/ARROW-8759) - [C++] TestPlasmaSerialization.DeleteReply tests failure on big-endian platforms
-* [ARROW-8762](https://issues.apache.org/jira/browse/ARROW-8762) - [C++][Gandiva] Replace Gandiva's BitmapAnd with common implementation
-* [ARROW-8763](https://issues.apache.org/jira/browse/ARROW-8763) - [C++] Create RandomAccessFile::WillNeed-like API
-* [ARROW-8764](https://issues.apache.org/jira/browse/ARROW-8764) - [C++] Make ThreadPool configurable in ReadRangeCache
-* [ARROW-8766](https://issues.apache.org/jira/browse/ARROW-8766) - [Python] A FileSystem implementation based on Python callbacks
-* [ARROW-8769](https://issues.apache.org/jira/browse/ARROW-8769) - [C++] Add convenience methods to access fields by name in StructScalar
-* [ARROW-8770](https://issues.apache.org/jira/browse/ARROW-8770) - [C++][CI] enable arrow-csv-test on s390x
-* [ARROW-8772](https://issues.apache.org/jira/browse/ARROW-8772) - [C++] Expand SumKernel benchmark to more types
-* [ARROW-8777](https://issues.apache.org/jira/browse/ARROW-8777) - [Rust] Parquet.rs does not support reading fixed-size binary fields
-* [ARROW-8778](https://issues.apache.org/jira/browse/ARROW-8778) - [C++][Gandiva] SelectionVector related test failed on big-endian platforms
-* [ARROW-8779](https://issues.apache.org/jira/browse/ARROW-8779) - [R] Implement conversion to List<Struct\>
-* [ARROW-8781](https://issues.apache.org/jira/browse/ARROW-8781) - [CI][C++] Enable ccache on GHA MinGW jobs
-* [ARROW-8782](https://issues.apache.org/jira/browse/ARROW-8782) - [Rust] [DataFusion] Add benchmarks based on NYC Taxi data set
-* [ARROW-8783](https://issues.apache.org/jira/browse/ARROW-8783) - [Rust] [DataFusion] Logical plan should have ParquetScan and CsvScan entries
-* [ARROW-8784](https://issues.apache.org/jira/browse/ARROW-8784) - [Rust] [DataFusion] Remove use of Arc from LogicalPlan
-* [ARROW-8785](https://issues.apache.org/jira/browse/ARROW-8785) - [Python][Packaging] Build the windows wheels with MIMALLOC enabled
-* [ARROW-8786](https://issues.apache.org/jira/browse/ARROW-8786) - [Packaging][rpm] Use bundled zstd in the CentOS 8 build
-* [ARROW-8788](https://issues.apache.org/jira/browse/ARROW-8788) - [C\#] Array builders to use bit-packed buffer builder rather than boolean array builder for validity map
-* [ARROW-8789](https://issues.apache.org/jira/browse/ARROW-8789) - [Rust] Add separate crate for integration test binaries
-* [ARROW-8790](https://issues.apache.org/jira/browse/ARROW-8790) - [C++][CI] Enable arrow-flight-test on s390x
-* [ARROW-8791](https://issues.apache.org/jira/browse/ARROW-8791) - [Rust] Creating StringDictionaryBuilder with existing dictionary values
-* [ARROW-8792](https://issues.apache.org/jira/browse/ARROW-8792) - [C++] Improved declarative compute function / kernel development framework, normalize calling conventions
-* [ARROW-8793](https://issues.apache.org/jira/browse/ARROW-8793) - [C++] BitUtil::SetBitsTo probably doesn't need to be inline
-* [ARROW-8794](https://issues.apache.org/jira/browse/ARROW-8794) - [C++] Expand benchmark coverage for arrow from parquet reading
-* [ARROW-8795](https://issues.apache.org/jira/browse/ARROW-8795) - [C++] Limited iOS support
-* [ARROW-8800](https://issues.apache.org/jira/browse/ARROW-8800) - [C++] Split arrow::ChunkedArray into arrow/chunked\_array.h
-* [ARROW-8804](https://issues.apache.org/jira/browse/ARROW-8804) - [R][CI] Followup to Rtools40 upgrade
-* [ARROW-8814](https://issues.apache.org/jira/browse/ARROW-8814) - [Dev][Release] Binary upload script keeps raising locale warnings
-* [ARROW-8815](https://issues.apache.org/jira/browse/ARROW-8815) - [Dev][Release] Binary upload script should retry on unexpected bintray request error
-* [ARROW-8818](https://issues.apache.org/jira/browse/ARROW-8818) - [Rust] Failing to build on master due to Flatbuffers/Union issues
-* [ARROW-8822](https://issues.apache.org/jira/browse/ARROW-8822) - [Rust] [DataFusion] Add MemoryScan variant to LogicalPlan
-* [ARROW-8827](https://issues.apache.org/jira/browse/ARROW-8827) - [Integration Testing] Initial skeleton for Rust integration tests
-* [ARROW-8830](https://issues.apache.org/jira/browse/ARROW-8830) - [GLib] Add support for Tell against a non-seekable GIO output stream
-* [ARROW-8831](https://issues.apache.org/jira/browse/ARROW-8831) - [Rust] incomplete SIMD implementation in simd\_compare\_op
-* [ARROW-8833](https://issues.apache.org/jira/browse/ARROW-8833) - [Rust] Implement VALIDATE mode in integration test binary
-* [ARROW-8834](https://issues.apache.org/jira/browse/ARROW-8834) - [Rust] Implement arrow-file-to-stream for integration testing
-* [ARROW-8835](https://issues.apache.org/jira/browse/ARROW-8835) - [Rust] Implement arrow-stream-to-file for integration testing
-* [ARROW-8836](https://issues.apache.org/jira/browse/ARROW-8836) - [Website] Update copyright end year automatically
-* [ARROW-8837](https://issues.apache.org/jira/browse/ARROW-8837) - [Rust] Add Null type
-* [ARROW-8838](https://issues.apache.org/jira/browse/ARROW-8838) - [Rust] File reader fails to read header from valid files
-* [ARROW-8839](https://issues.apache.org/jira/browse/ARROW-8839) - [Rust] DataFusion logical plan should support scanning CSV without a provided schema
-* [ARROW-8840](https://issues.apache.org/jira/browse/ARROW-8840) - [Rust] DataFusion ExecutionError should implement the std::error::Error trait
-* [ARROW-8841](https://issues.apache.org/jira/browse/ARROW-8841) - [C++] Add benchmark and unittest for PLAIN spaced
-* [ARROW-8843](https://issues.apache.org/jira/browse/ARROW-8843) - [C++] Optimize BitmapEquals unaligned case
-* [ARROW-8844](https://issues.apache.org/jira/browse/ARROW-8844) - [C++] Optimize TransferBitmap unaligned case
-* [ARROW-8846](https://issues.apache.org/jira/browse/ARROW-8846) - [Dev][Python] Autoformat Python sources with Archery
-* [ARROW-8847](https://issues.apache.org/jira/browse/ARROW-8847) - [C++] Pass task size / metrics in Executor API
-* [ARROW-8851](https://issues.apache.org/jira/browse/ARROW-8851) - [Python][Documentation] Fix FutureWarnings in Python Plasma docs
-* [ARROW-8852](https://issues.apache.org/jira/browse/ARROW-8852) - [R] Post-0.17.1 adjustments
-* [ARROW-8854](https://issues.apache.org/jira/browse/ARROW-8854) - [Rust] [Integration Testing] Show output from arrow-json-integration-test
-* [ARROW-8855](https://issues.apache.org/jira/browse/ARROW-8855) - [Rust] [Integration Testing] data type Date32(Day) not supported
-* [ARROW-8856](https://issues.apache.org/jira/browse/ARROW-8856) - [Rust] [Integration Testing] Return empty batch if MessageHeader is NONE
-* [ARROW-8864](https://issues.apache.org/jira/browse/ARROW-8864) - [R] Add methods to Table/RecordBatch for consistency with data.frame
-* [ARROW-8866](https://issues.apache.org/jira/browse/ARROW-8866) - [C++] Split Type::UNION into Type::SPARSE\_UNION and Type::DENSE\_UNION
-* [ARROW-8867](https://issues.apache.org/jira/browse/ARROW-8867) - [R] Support converting POSIXlt type
-* [ARROW-8875](https://issues.apache.org/jira/browse/ARROW-8875) - [C++] use AWS SDK SetResponseStreamFactory to avoid a copy of bytes
-* [ARROW-8877](https://issues.apache.org/jira/browse/ARROW-8877) - [Rust] add CSV read option struct to simplify datafusion interface
-* [ARROW-8880](https://issues.apache.org/jira/browse/ARROW-8880) - [R][Linux] Make R Binary Install Friendlier
-* [ARROW-8881](https://issues.apache.org/jira/browse/ARROW-8881) - [Rust] Add large list and binary support
-* [ARROW-8885](https://issues.apache.org/jira/browse/ARROW-8885) - [R] Don't include everything everywhere
-* [ARROW-8886](https://issues.apache.org/jira/browse/ARROW-8886) - [C\#] Decide and implement appropriate behaviour for Array builder resize to negative size
-* [ARROW-8887](https://issues.apache.org/jira/browse/ARROW-8887) - [Java] Buffer size for complex vectors increases rapidly in case of clear/write loop
-* [ARROW-8890](https://issues.apache.org/jira/browse/ARROW-8890) - [R] Fix C++ lint issue
-* [ARROW-8895](https://issues.apache.org/jira/browse/ARROW-8895) - [C++] Add C++ unit tests for filter and take functions on temporal type inputs, including timestamps
-* [ARROW-8896](https://issues.apache.org/jira/browse/ARROW-8896) - [C++] Reimplement dictionary unpacking in Cast kernels using Take
-* [ARROW-8899](https://issues.apache.org/jira/browse/ARROW-8899) - [R] Add R metadata like pandas metadata for round-trip fidelity
-* [ARROW-8901](https://issues.apache.org/jira/browse/ARROW-8901) - [C++] Reduce number of take kernels
-* [ARROW-8903](https://issues.apache.org/jira/browse/ARROW-8903) - [C++] Implement optimized "unsafe take" for use with selection vectors for kernel execution
-* [ARROW-8904](https://issues.apache.org/jira/browse/ARROW-8904) - [Python] Fix usages of deprecated C++ APIs related to child/field
-* [ARROW-8906](https://issues.apache.org/jira/browse/ARROW-8906) - [Rust] Support reading multiple CSV files for schema inference
-* [ARROW-8907](https://issues.apache.org/jira/browse/ARROW-8907) - [Rust] implement scalar comparison operations
-* [ARROW-8912](https://issues.apache.org/jira/browse/ARROW-8912) - [Ruby] Keep reference of Arrow::Buffer's data for GC
-* [ARROW-8913](https://issues.apache.org/jira/browse/ARROW-8913) - [Ruby] Use "field" instead of "child"
-* [ARROW-8914](https://issues.apache.org/jira/browse/ARROW-8914) - [C++][Gandiva] Decimal128 related test failed on big-endian platforms
-* [ARROW-8915](https://issues.apache.org/jira/browse/ARROW-8915) - [Dev][Archery] Require Click 7
-* [ARROW-8917](https://issues.apache.org/jira/browse/ARROW-8917) - [C++][Compute] Formalize "metafunction" concept
-* [ARROW-8918](https://issues.apache.org/jira/browse/ARROW-8918) - [C++] Add cast "metafunction" to FunctionRegistry that addresses dispatching to appropriate type-specific CastFunction
-* [ARROW-8922](https://issues.apache.org/jira/browse/ARROW-8922) - [C++] Implement example string scalar kernel function to assist with string kernels buildout per ARROW-555
-* [ARROW-8923](https://issues.apache.org/jira/browse/ARROW-8923) - [C++] Improve usability of arrow::compute::CallFunction by moving ExecContext\* argument to end and adding default
-* [ARROW-8926](https://issues.apache.org/jira/browse/ARROW-8926) - [C++] Improve docstrings in new public APIs in arrow/compute and fix miscellaneous typos
-* [ARROW-8927](https://issues.apache.org/jira/browse/ARROW-8927) - [C++] Support dictionary memos when reading/writing record batches using cuda IPC
-* [ARROW-8929](https://issues.apache.org/jira/browse/ARROW-8929) - [C++] Change compute::Arity::VarArgs min\_args default to 0
-* [ARROW-8931](https://issues.apache.org/jira/browse/ARROW-8931) - [Rust] Support lexical sort in arrow compute kernel
-* [ARROW-8933](https://issues.apache.org/jira/browse/ARROW-8933) - [C++] Reduce generated code in vector\_hash.cc
-* [ARROW-8934](https://issues.apache.org/jira/browse/ARROW-8934) - [C++] Add timestamp subtract kernel aliased to int64 subtract implementation
-* [ARROW-8937](https://issues.apache.org/jira/browse/ARROW-8937) - [C++] Add "parse\_strptime" function for string to timestamp conversions using the kernels framework
-* [ARROW-8938](https://issues.apache.org/jira/browse/ARROW-8938) - [R] Provide binding for arrow::compute::CallFunction
-* [ARROW-8940](https://issues.apache.org/jira/browse/ARROW-8940) - [Java] Fix the performance degradation of integration tests
-* [ARROW-8941](https://issues.apache.org/jira/browse/ARROW-8941) - [C++/Python] arrow-nightlies conda repository is full
-* [ARROW-8942](https://issues.apache.org/jira/browse/ARROW-8942) - [R] Detect compression in reading CSV/JSON
-* [ARROW-8943](https://issues.apache.org/jira/browse/ARROW-8943) - [C++][Dataset] Add support for Partitioning to ParquetDatasetFactory
-* [ARROW-8950](https://issues.apache.org/jira/browse/ARROW-8950) - [C++] Make head optional in s3fs
-* [ARROW-8958](https://issues.apache.org/jira/browse/ARROW-8958) - [FlightRPC][Python] Implement Flight DoExchange for Python
-* [ARROW-8960](https://issues.apache.org/jira/browse/ARROW-8960) - [MINOR] [FORMAT] Fix typos in comments
-* [ARROW-8961](https://issues.apache.org/jira/browse/ARROW-8961) - [C++] Add utf8proc library to toolchain
-* [ARROW-8963](https://issues.apache.org/jira/browse/ARROW-8963) - [C++][Parquet] Optimize memory allocation in Parquet C++
-* [ARROW-8965](https://issues.apache.org/jira/browse/ARROW-8965) - [Python][Documentation] Pyarrow documentation for pip nightlies references 404'd location
-* [ARROW-8966](https://issues.apache.org/jira/browse/ARROW-8966) - [C++] Move arrow::ArrayData to a separate header file
-* [ARROW-8969](https://issues.apache.org/jira/browse/ARROW-8969) - [C++] Reduce generated code in compute/kernels/scalar\_compare.cc
-* [ARROW-8970](https://issues.apache.org/jira/browse/ARROW-8970) - [C++] Reduce shared library / binary code size (umbrella issue)
-* [ARROW-8972](https://issues.apache.org/jira/browse/ARROW-8972) - [Java] Support range value comparison for large varchar/varbinary vectors
-* [ARROW-8973](https://issues.apache.org/jira/browse/ARROW-8973) - [Java] Support batch value appending for large varchar/varbinary vectors
-* [ARROW-8974](https://issues.apache.org/jira/browse/ARROW-8974) - [C++] Refine TransferBitmap template parameters
-* [ARROW-8976](https://issues.apache.org/jira/browse/ARROW-8976) - [C++] compute::CallFunction can't Filter/Take with ChunkedArray
-* [ARROW-8979](https://issues.apache.org/jira/browse/ARROW-8979) - [C++] Implement bitmap word reader and writer
-* [ARROW-8984](https://issues.apache.org/jira/browse/ARROW-8984) - [R] Revise install guides now that Windows conda package exists
-* [ARROW-8985](https://issues.apache.org/jira/browse/ARROW-8985) - [Format] Add "byte width" field with default of 16 to Decimal Flatbuffers type for forward compatibility
-* [ARROW-8989](https://issues.apache.org/jira/browse/ARROW-8989) - [C++] Document available functions in compute::FunctionRegistry
-* [ARROW-8993](https://issues.apache.org/jira/browse/ARROW-8993) - [Rust] Support reading non-seekable sources in text readers
-* [ARROW-8994](https://issues.apache.org/jira/browse/ARROW-8994) - [C++] Disable include-what-you-use cpplint lint checks
-* [ARROW-8996](https://issues.apache.org/jira/browse/ARROW-8996) - [C++] Runtime SIMD path for Aggregate Sum/Mean kernel
-* [ARROW-8997](https://issues.apache.org/jira/browse/ARROW-8997) - [Archery] Benchmark formatter should have friendly units
-* [ARROW-9004](https://issues.apache.org/jira/browse/ARROW-9004) - [C++][Gandiva] Support building with LLVM 10
-* [ARROW-9005](https://issues.apache.org/jira/browse/ARROW-9005) - [Rust] [DataFusion] Support sort expression
-* [ARROW-9007](https://issues.apache.org/jira/browse/ARROW-9007) - [Rust] Support appending arrays by merging array data
-* [ARROW-9014](https://issues.apache.org/jira/browse/ARROW-9014) - [Packaging] Bump the minor part of the automatically generated version in crossbow
-* [ARROW-9015](https://issues.apache.org/jira/browse/ARROW-9015) - [Java] Make BaseAllocator package private
-* [ARROW-9016](https://issues.apache.org/jira/browse/ARROW-9016) - [Java] Remove direct references to Netty/Unsafe Allocators
-* [ARROW-9017](https://issues.apache.org/jira/browse/ARROW-9017) - [Python] Refactor the Scalar classes
-* [ARROW-9018](https://issues.apache.org/jira/browse/ARROW-9018) - [C++] Remove APIs that were deprecated in 0.17.x and prior
-* [ARROW-9021](https://issues.apache.org/jira/browse/ARROW-9021) - [Python] The filesystem keyword in parquet.read\_table is not documented
-* [ARROW-9022](https://issues.apache.org/jira/browse/ARROW-9022) - [C++] Add/Sub/Mul arithmetic kernels with overflow check
-* [ARROW-9029](https://issues.apache.org/jira/browse/ARROW-9029) - [C++] Implement BitBlockCounter interface for blockwise popcounts of validity bitmaps
-* [ARROW-9030](https://issues.apache.org/jira/browse/ARROW-9030) - [Python] Clean up some usages of pyarrow.compat, move some common functions/symbols to lib.pyx
-* [ARROW-9031](https://issues.apache.org/jira/browse/ARROW-9031) - [R] Implement conversion from Type::UINT64 to R vector
-* [ARROW-9032](https://issues.apache.org/jira/browse/ARROW-9032) - [C++] Split arrow/util/bit\_util.h into multiple header files
-* [ARROW-9034](https://issues.apache.org/jira/browse/ARROW-9034) - [C++] Implement binary (two bitmap) version of BitBlockCounter
-* [ARROW-9042](https://issues.apache.org/jira/browse/ARROW-9042) - [C++] Add Subtract and Multiply arithmetic kernels with wrap-around behavior
-* [ARROW-9043](https://issues.apache.org/jira/browse/ARROW-9043) - [Go] Temporarily copy LICENSE.txt to go/
-* [ARROW-9045](https://issues.apache.org/jira/browse/ARROW-9045) - [C++] Improve and expand Take/Filter benchmarks
-* [ARROW-9046](https://issues.apache.org/jira/browse/ARROW-9046) - [C++][R] Put more things in type\_fwds
-* [ARROW-9047](https://issues.apache.org/jira/browse/ARROW-9047) - [Rust] Setting 0-bits of a 0-length bitset segfaults
-* [ARROW-9050](https://issues.apache.org/jira/browse/ARROW-9050) - [Release] Use 1.0.0 as the next version
-* [ARROW-9051](https://issues.apache.org/jira/browse/ARROW-9051) - [GLib] Refer Array related objects from Array
-* [ARROW-9052](https://issues.apache.org/jira/browse/ARROW-9052) - [CI][MinGW] Enable Gandiva
-* [ARROW-9055](https://issues.apache.org/jira/browse/ARROW-9055) - [C++] Add sum/mean kernels for Boolean type
-* [ARROW-9058](https://issues.apache.org/jira/browse/ARROW-9058) - [Packaging][wheel] Boost download fails
-* [ARROW-9060](https://issues.apache.org/jira/browse/ARROW-9060) - [GLib] Add support for building Apache Arrow Datasets GLib with non-installed Apache Arrow Datasets
-* [ARROW-9061](https://issues.apache.org/jira/browse/ARROW-9061) - [Packaging][APT][Yum][GLib] Add Apache Arrow Datasets GLib
-* [ARROW-9062](https://issues.apache.org/jira/browse/ARROW-9062) - [Rust] Support reading JSON into dictionary type
-* [ARROW-9067](https://issues.apache.org/jira/browse/ARROW-9067) - [C++] Create reusable branchless / vectorized index boundschecking functions
-* [ARROW-9070](https://issues.apache.org/jira/browse/ARROW-9070) - [C++] StructScalar needs field accessor methods
-* [ARROW-9073](https://issues.apache.org/jira/browse/ARROW-9073) - [C++] RapidJSON include directory detection doesn't work with RapidJSONConfig.cmake
-* [ARROW-9074](https://issues.apache.org/jira/browse/ARROW-9074) - [GLib] Add missing arrow-json check
-* [ARROW-9075](https://issues.apache.org/jira/browse/ARROW-9075) - [C++] Optimize Filter implementation
-* [ARROW-9079](https://issues.apache.org/jira/browse/ARROW-9079) - [C++] Write benchmark for arithmetic kernels
-* [ARROW-9083](https://issues.apache.org/jira/browse/ARROW-9083) - [R] collect int64, uint32, uint64 as R integer type if not out of bounds
-* [ARROW-9086](https://issues.apache.org/jira/browse/ARROW-9086) - [CI][Homebrew] Enable Gandiva
-* [ARROW-9088](https://issues.apache.org/jira/browse/ARROW-9088) - [Rust] Recent version of arrow crate does not compile into wasm target
-* [ARROW-9089](https://issues.apache.org/jira/browse/ARROW-9089) - [Python] A PyFileSystem handler for fsspec-based filesystems
-* [ARROW-9090](https://issues.apache.org/jira/browse/ARROW-9090) - [C++] Bump versions of bundled libraries
-* [ARROW-9091](https://issues.apache.org/jira/browse/ARROW-9091) - [C++] Utilize function's default options when passing no options to CallFunction for a function that requires them
-* [ARROW-9093](https://issues.apache.org/jira/browse/ARROW-9093) - [FlightRPC][C++][Python] Allow setting gRPC client options
-* [ARROW-9094](https://issues.apache.org/jira/browse/ARROW-9094) - [Python] Bump versions of compiled dependencies in manylinux wheels
-* [ARROW-9095](https://issues.apache.org/jira/browse/ARROW-9095) - [Rust] Fix NullArray to comply with spec
-* [ARROW-9099](https://issues.apache.org/jira/browse/ARROW-9099) - [C++][Gandiva] Add TRIM function for string
-* [ARROW-9100](https://issues.apache.org/jira/browse/ARROW-9100) - [C++] Add ascii\_lower kernel
-* [ARROW-9101](https://issues.apache.org/jira/browse/ARROW-9101) - [Doc][C++][Python] Document encoding expected by CSV and JSON readers
-* [ARROW-9102](https://issues.apache.org/jira/browse/ARROW-9102) - [Packaging] Upload built manylinux docker images
-* [ARROW-9106](https://issues.apache.org/jira/browse/ARROW-9106) - [C++] Add C++ foundation to ease file transcoding
-* [ARROW-9108](https://issues.apache.org/jira/browse/ARROW-9108) - [C++][Dataset] Add Parquet Statistics conversion for timestamp columns
-* [ARROW-9109](https://issues.apache.org/jira/browse/ARROW-9109) - [Python][Packaging] Enable S3 support in manylinux wheels
-* [ARROW-9110](https://issues.apache.org/jira/browse/ARROW-9110) - [C++] Fix CPU cache size detection on macOS
-* [ARROW-9112](https://issues.apache.org/jira/browse/ARROW-9112) - [R] Update autobrew script location
-* [ARROW-9115](https://issues.apache.org/jira/browse/ARROW-9115) - [C++] Process data buffers in batch in ascii\_lower / ascii\_upper kernels rather than using string\_view value iteration
-* [ARROW-9116](https://issues.apache.org/jira/browse/ARROW-9116) - [C++] Add BinaryArray::total\_values\_length()
-* [ARROW-9118](https://issues.apache.org/jira/browse/ARROW-9118) - [C++] Add more general BoundsCheck function that also checks for arbitrary lower limits in integer arrays
-* [ARROW-9119](https://issues.apache.org/jira/browse/ARROW-9119) - [C++] Add support for building with system static gRPC
-* [ARROW-9123](https://issues.apache.org/jira/browse/ARROW-9123) - [Python][wheel] Use libzstd.a explicitly
-* [ARROW-9124](https://issues.apache.org/jira/browse/ARROW-9124) - [Rust][Datafusion] DFParser should consume sql query as &str instead of String
-* [ARROW-9125](https://issues.apache.org/jira/browse/ARROW-9125) - [C++] Add missing include for arrow::internal::ZeroMemory() for Valgrind
-* [ARROW-9129](https://issues.apache.org/jira/browse/ARROW-9129) - [Python][JPype] Test fails with JPype 0.7.5
-* [ARROW-9130](https://issues.apache.org/jira/browse/ARROW-9130) - [Python] Add deprecated wrappers functions to a pyarrow/compat.py module for 1.0.0 that will be removed later
-* [ARROW-9131](https://issues.apache.org/jira/browse/ARROW-9131) - [C++] Faster ascii\_lower and ascii\_upper
-* [ARROW-9132](https://issues.apache.org/jira/browse/ARROW-9132) - [C++] Implement hash kernels for dictionary data with constant dictionaries
-* [ARROW-9133](https://issues.apache.org/jira/browse/ARROW-9133) - [C++] Add utf8\_upper and utf8\_lower
-* [ARROW-9137](https://issues.apache.org/jira/browse/ARROW-9137) - [GLib][Ruby] Allow reading Parquet files in chunks (by RowGroup)
-* [ARROW-9138](https://issues.apache.org/jira/browse/ARROW-9138) - [Docs][Format] Make sure format version is hard coded in the docs
-* [ARROW-9139](https://issues.apache.org/jira/browse/ARROW-9139) - [Python] parquet read\_table should not use\_legacy\_dataset
-* [ARROW-9144](https://issues.apache.org/jira/browse/ARROW-9144) - [CI] OSS-Fuzz build fails because of recent changes in the Google repository
-* [ARROW-9145](https://issues.apache.org/jira/browse/ARROW-9145) - [C++] Add true\_count / false\_count methods to BooleanArray
-* [ARROW-9152](https://issues.apache.org/jira/browse/ARROW-9152) - [C++] Create specialized filter implementation for varbinary types
-* [ARROW-9153](https://issues.apache.org/jira/browse/ARROW-9153) - [Python] Add bindings for StructScalar
-* [ARROW-9154](https://issues.apache.org/jira/browse/ARROW-9154) - [Developer] Use GitHub issue templates better
-* [ARROW-9155](https://issues.apache.org/jira/browse/ARROW-9155) - [Archery] Less precise but faster default settings for "archery benchmark diff"
-* [ARROW-9156](https://issues.apache.org/jira/browse/ARROW-9156) - [C++] Reduce the code size of the tensor module
-* [ARROW-9157](https://issues.apache.org/jira/browse/ARROW-9157) - [Rust][Datafusion] execution context's create\_physical\_plan should take self as immutable reference
-* [ARROW-9158](https://issues.apache.org/jira/browse/ARROW-9158) - [Rust][Datafusion] Projection physical plan compilation should preserve nullability
-* [ARROW-9159](https://issues.apache.org/jira/browse/ARROW-9159) - [Python] Expose the isnull/isvalid kernels
-* [ARROW-9162](https://issues.apache.org/jira/browse/ARROW-9162) - [Python] Expose Add/Subtract/Multiply arithmetic kernels
-* [ARROW-9163](https://issues.apache.org/jira/browse/ARROW-9163) - [C++] Add methods to StringArray, LargeStringArray, to validate whether its values are all UTF-8
-* [ARROW-9166](https://issues.apache.org/jira/browse/ARROW-9166) - [Website] Add overview page
-* [ARROW-9167](https://issues.apache.org/jira/browse/ARROW-9167) - [Doc][Website] /docs/c\_glib/index.html is overwritten
-* [ARROW-9168](https://issues.apache.org/jira/browse/ARROW-9168) - [C++][Flight] Allow flight benchmark to use separate TCP connections
-* [ARROW-9173](https://issues.apache.org/jira/browse/ARROW-9173) - [C++] Document how to use Arrow from a third-party CMake project
-* [ARROW-9175](https://issues.apache.org/jira/browse/ARROW-9175) - [FlightRPC][C++][Python] Expose connected peer
-* [ARROW-9176](https://issues.apache.org/jira/browse/ARROW-9176) - [Rust] Fix for memory leaks in Arrow allocator
-* [ARROW-9178](https://issues.apache.org/jira/browse/ARROW-9178) - [R] Improve documentation about CSV reader
-* [ARROW-9179](https://issues.apache.org/jira/browse/ARROW-9179) - [R] Replace usage of iris dataset in tests
-* [ARROW-9180](https://issues.apache.org/jira/browse/ARROW-9180) - [Developer] Remove usage of whitelist, blacklist, slave, etc.
-* [ARROW-9181](https://issues.apache.org/jira/browse/ARROW-9181) - [C++] Instantiate fewer templates in Cast kernel implementation
-* [ARROW-9182](https://issues.apache.org/jira/browse/ARROW-9182) - [C++] Use "applicator" namespace for kernel operator-to-kernel functors, streamline argument unboxing
-* [ARROW-9185](https://issues.apache.org/jira/browse/ARROW-9185) - [C++][Java][Gandiva] Make LLVM build optimisation configurable from Java
-* [ARROW-9188](https://issues.apache.org/jira/browse/ARROW-9188) - [C++] Do not always statically link Brotli libraries
-* [ARROW-9189](https://issues.apache.org/jira/browse/ARROW-9189) - [Website] Improve contributor guide
-* [ARROW-9190](https://issues.apache.org/jira/browse/ARROW-9190) - [Website][C++] Add blog post on efforts to make building lighter and easier
-* [ARROW-9191](https://issues.apache.org/jira/browse/ARROW-9191) - [Rust] Do not panic when int96 milliseconds are negative
-* [ARROW-9192](https://issues.apache.org/jira/browse/ARROW-9192) - [CI][Rust] Add support for running clippy
-* [ARROW-9193](https://issues.apache.org/jira/browse/ARROW-9193) - [C++] Add method to parse date from null-terminated string
-* [ARROW-9197](https://issues.apache.org/jira/browse/ARROW-9197) - [C++] Revamp numeric casts: faster performance and reduced binary size
-* [ARROW-9201](https://issues.apache.org/jira/browse/ARROW-9201) - [Archery] Render human-readable table when using "archery benchmark diff"
-* [ARROW-9202](https://issues.apache.org/jira/browse/ARROW-9202) - [GLib] Add GArrowDatum
-* [ARROW-9203](https://issues.apache.org/jira/browse/ARROW-9203) - [Packaging][deb] Add missing gir1.2-arrow-dataset-1.0.install
-* [ARROW-9204](https://issues.apache.org/jira/browse/ARROW-9204) - [C++][Flight] change records\_per\_stream to int64 in flight benchmark
-* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst
-* [ARROW-9206](https://issues.apache.org/jira/browse/ARROW-9206) - [C++][Flight] measure latency in flight benchmark
-* [ARROW-9207](https://issues.apache.org/jira/browse/ARROW-9207) - [Python][Dataset] Clean-up internal FileSource class
-* [ARROW-9210](https://issues.apache.org/jira/browse/ARROW-9210) - [C++] Use OptionalBitBlockCounter in ArrayDataInlineVisitor
-* [ARROW-9214](https://issues.apache.org/jira/browse/ARROW-9214) - [C++] Avoid util::optional in favor of separate inlineable functions in arrow/visitor\_inline.h
-* [ARROW-9216](https://issues.apache.org/jira/browse/ARROW-9216) - [C++][Parquet] Use BitBlockCounter for plain spaced encoding/decoding
-* [ARROW-9217](https://issues.apache.org/jira/browse/ARROW-9217) - [C++][Parquet] Cover 0.01% null for the plain spaced encoding/decoding benchmark
-* [ARROW-9220](https://issues.apache.org/jira/browse/ARROW-9220) - [C++] Disable relevant compute kernels if ARROW\_WITH\_UTF8PROC=OFF
-* [ARROW-9222](https://issues.apache.org/jira/browse/ARROW-9222) - [Format][Proposal] Remove validity bitmap from Union types
-* [ARROW-9224](https://issues.apache.org/jira/browse/ARROW-9224) - [Dev][Archery] Copy local repo on clone failure
-* [ARROW-9225](https://issues.apache.org/jira/browse/ARROW-9225) - [C++][Compute] Improve counting sort
-* [ARROW-9231](https://issues.apache.org/jira/browse/ARROW-9231) - [Format] Increment MetadataVersion from V4 to V5
-* [ARROW-9234](https://issues.apache.org/jira/browse/ARROW-9234) - [GLib][CUDA] Add support for dictionary memo on reading record batch from buffer
-* [ARROW-9241](https://issues.apache.org/jira/browse/ARROW-9241) - [C++] Add forward compatibility checks for Decimal::bitWidth
-* [ARROW-9242](https://issues.apache.org/jira/browse/ARROW-9242) - [Java] Add forward compatibility checks for Decimal::bitWidth
-* [ARROW-9247](https://issues.apache.org/jira/browse/ARROW-9247) - [Python] Expose BinaryArray::total\_values\_length in bindings
-* [ARROW-9248](https://issues.apache.org/jira/browse/ARROW-9248) - [C++] Add "list\_size" function that returns Int32Array/Int64Array giving list cell sizes
-* [ARROW-9249](https://issues.apache.org/jira/browse/ARROW-9249) - [C++] Implement "list\_parent\_indices" vector function
-* [ARROW-9250](https://issues.apache.org/jira/browse/ARROW-9250) - [C++] Compact generated code in compute/kernels/scalar\_set\_lookup.cc using same method as vector\_hash.cc
-* [ARROW-9251](https://issues.apache.org/jira/browse/ARROW-9251) - [C++] Move JSON testing code for integration tests to libarrow\_testing
-* [ARROW-9254](https://issues.apache.org/jira/browse/ARROW-9254) - [C++] Factor out some integer casting internals so it can be reused with temporal casts
-* [ARROW-9255](https://issues.apache.org/jira/browse/ARROW-9255) - [C++] Use CMake to build bundled Protobuf with CMake \>= 3.7
-* [ARROW-9256](https://issues.apache.org/jira/browse/ARROW-9256) - [C++] Incorrect variable name ARROW\_CXX\_FLAGS
-* [ARROW-9258](https://issues.apache.org/jira/browse/ARROW-9258) - [Format] Add V5 MetadataVersion
-* [ARROW-9259](https://issues.apache.org/jira/browse/ARROW-9259) - [Format] Permit unsigned dictionary indices in Columnar.rst
-* [ARROW-9262](https://issues.apache.org/jira/browse/ARROW-9262) - [Packaging][Linux][CI] Use Ubuntu 18.04 to build ARM64 packages on Travis CI
-* [ARROW-9263](https://issues.apache.org/jira/browse/ARROW-9263) - [C++] Benchmark: promote RegressionSetArgs size to L2
-* [ARROW-9264](https://issues.apache.org/jira/browse/ARROW-9264) - [C++] Cleanup Parquet Arrow Schema code
-* [ARROW-9265](https://issues.apache.org/jira/browse/ARROW-9265) - [C++] Add support for writing MetadataVersion::V4-compatible IPC messages for compatibility with library versions <= 0.17.1
-* [ARROW-9268](https://issues.apache.org/jira/browse/ARROW-9268) - [C++] Add is{alnum,alpha,...} kernels for strings
-* [ARROW-9272](https://issues.apache.org/jira/browse/ARROW-9272) - [C++][Python] Reduce complexity in python to arrow conversion
-* [ARROW-9276](https://issues.apache.org/jira/browse/ARROW-9276) - [Dev] Enable ARROW\_CUDA when generating API documentations
-* [ARROW-9277](https://issues.apache.org/jira/browse/ARROW-9277) - [C++] Fix documentation of Reading CSV files
-* [ARROW-9278](https://issues.apache.org/jira/browse/ARROW-9278) - [C++] Implement Union validity bitmap changes from ARROW-9222
-* [ARROW-9280](https://issues.apache.org/jira/browse/ARROW-9280) - [Rust] Write statistics to Parquet files
-* [ARROW-9281](https://issues.apache.org/jira/browse/ARROW-9281) - [R] Turn off utf8proc in R builds
-* [ARROW-9283](https://issues.apache.org/jira/browse/ARROW-9283) - [Python] Expose C++ build info
-* [ARROW-9287](https://issues.apache.org/jira/browse/ARROW-9287) - [C++] Implement support for unsigned dictionary indices
-* [ARROW-9289](https://issues.apache.org/jira/browse/ARROW-9289) - [R] Remove deprecated functions
-* [ARROW-9290](https://issues.apache.org/jira/browse/ARROW-9290) - [Rust] [Parquet] Add features to allow opting out of dependencies
-* [ARROW-9291](https://issues.apache.org/jira/browse/ARROW-9291) - [R] Support fixed size binary/list types
-* [ARROW-9292](https://issues.apache.org/jira/browse/ARROW-9292) - [Rust] Update feature matrix with passing tests
-* [ARROW-9294](https://issues.apache.org/jira/browse/ARROW-9294) - [GLib] Add GArrowFunction
-* [ARROW-9300](https://issues.apache.org/jira/browse/ARROW-9300) - [Java] Separate Netty Memory to its own module
-* [ARROW-9306](https://issues.apache.org/jira/browse/ARROW-9306) - [Ruby] Add support for Arrow::RecordBatch.new(raw\_table)
-* [ARROW-9307](https://issues.apache.org/jira/browse/ARROW-9307) - [Ruby] Add Arrow::RecordBatchIterator\#to\_a
-* [ARROW-9308](https://issues.apache.org/jira/browse/ARROW-9308) - [Format] Add Feature enum to schema.fbs for forward compatibility
-* [ARROW-9316](https://issues.apache.org/jira/browse/ARROW-9316) - [C++] Use "Dataset" instead of "Datasets"
-* [ARROW-9321](https://issues.apache.org/jira/browse/ARROW-9321) - [C++][Dataset] Allow "collecting" statistics for ParquetFragment row groups if not constructed from \_metadata
-* [ARROW-9322](https://issues.apache.org/jira/browse/ARROW-9322) - [R] Dataset documentation polishing
-* [ARROW-9323](https://issues.apache.org/jira/browse/ARROW-9323) - [Ruby] Add Red Arrow Dataset
-* [ARROW-9327](https://issues.apache.org/jira/browse/ARROW-9327) - Fix all clippy errors for arrow crate
-* [ARROW-9329](https://issues.apache.org/jira/browse/ARROW-9329) - [C++][Gandiva] Implement castTimestampToDate function
-* [ARROW-9331](https://issues.apache.org/jira/browse/ARROW-9331) - [C++] Improve the performance of Tensor-to-SparseTensor conversion
-* [ARROW-9333](https://issues.apache.org/jira/browse/ARROW-9333) - [Python] Expose more IPC write options in Python
-* [ARROW-9335](https://issues.apache.org/jira/browse/ARROW-9335) - [Website] Update website for 1.0
-* [ARROW-9337](https://issues.apache.org/jira/browse/ARROW-9337) - [R] On C++ library build failure, give an unambiguous message
-* [ARROW-9339](https://issues.apache.org/jira/browse/ARROW-9339) - [Rust] Comments on SIMD in Arrow README are incorrect
-* [ARROW-9340](https://issues.apache.org/jira/browse/ARROW-9340) - [R] Use CRAN version of decor package
-* [ARROW-9341](https://issues.apache.org/jira/browse/ARROW-9341) - [GLib] Use the arrow::Datum version of Take()
-* [ARROW-9345](https://issues.apache.org/jira/browse/ARROW-9345) - [C++][Dataset] Expression with dictionary type should work with operand of value type
-* [ARROW-9346](https://issues.apache.org/jira/browse/ARROW-9346) - [C++][Python][Dataset] Add total\_byte\_size metadata to RowGroupInfo
-* [ARROW-9362](https://issues.apache.org/jira/browse/ARROW-9362) - [Java] Add support for writing MetadataVersion::V4-compatible IPC messages for compatibility with library versions <= 0.17.1
-* [ARROW-9365](https://issues.apache.org/jira/browse/ARROW-9365) - [Go] Implement the rest of the typed array builders in NewBuilder
-* [ARROW-9370](https://issues.apache.org/jira/browse/ARROW-9370) - [Java] Bump Netty version
-* [ARROW-9374](https://issues.apache.org/jira/browse/ARROW-9374) - [C++][Python] Expose MakeArrayFromScalar
-* [ARROW-9379](https://issues.apache.org/jira/browse/ARROW-9379) - [Rust] Support unsigned dictionary indices
-* [ARROW-9383](https://issues.apache.org/jira/browse/ARROW-9383) - [Python] Support fsspec filesystems in Dataset API through fs handler
-* [ARROW-9386](https://issues.apache.org/jira/browse/ARROW-9386) - [Rust] RecordBatch.schema() should not return &Arc<Schema\>
-* [ARROW-9390](https://issues.apache.org/jira/browse/ARROW-9390) - [C++] Review compute function names
-* [ARROW-9391](https://issues.apache.org/jira/browse/ARROW-9391) - [Rust] Float32 values interpreted as zero when record batch has one row
-* [ARROW-9393](https://issues.apache.org/jira/browse/ARROW-9393) - [Doc] update supported types documentation for Java
-* [ARROW-9395](https://issues.apache.org/jira/browse/ARROW-9395) - [Python] Provide configurable MetadataVersion in IPC API and environment variable to set default to V4 when needed
-* [ARROW-9399](https://issues.apache.org/jira/browse/ARROW-9399) - [C++] Add forward compatibility checks for unrecognized future MetadataVersion
-* [ARROW-9403](https://issues.apache.org/jira/browse/ARROW-9403) - [Python] add .tolist as alias of .to\_pylist
-* [ARROW-9407](https://issues.apache.org/jira/browse/ARROW-9407) - [Python] Accept pd.NA as missing value in array constructor
-* [ARROW-9411](https://issues.apache.org/jira/browse/ARROW-9411) - [Rust] Update dependencies
-* [ARROW-9424](https://issues.apache.org/jira/browse/ARROW-9424) - [C++][Parquet] Disable writing files with LZ4 codec
-* [ARROW-9425](https://issues.apache.org/jira/browse/ARROW-9425) - [Rust][DataFusion] Make ExecutionContext sharable between threads
-* [ARROW-9427](https://issues.apache.org/jira/browse/ARROW-9427) - [Rust][DataFusion] Add pub fn ExecutionContext.tables()
-* [ARROW-9437](https://issues.apache.org/jira/browse/ARROW-9437) - [Python][Packaging] Homebrew fails to install build dependencies in the macOS wheel builds
-* [ARROW-9442](https://issues.apache.org/jira/browse/ARROW-9442) - [Python] Do not force Validate() to be called in pyarrow\_wrap\_table
-* [ARROW-9445](https://issues.apache.org/jira/browse/ARROW-9445) - [Python] Revert Array.equals changes + expose comparison ops in compute
-* [ARROW-9446](https://issues.apache.org/jira/browse/ARROW-9446) - [C++] Export compiler information in BuildInfo
-* [ARROW-9447](https://issues.apache.org/jira/browse/ARROW-9447) - [Rust][DataFusion] Allow closures as ScalarUDFs
-* [ARROW-9452](https://issues.apache.org/jira/browse/ARROW-9452) - [Rust] [DataFusion] Improve performance of Parquet scan
-* [ARROW-9470](https://issues.apache.org/jira/browse/ARROW-9470) - [CI][Java] Run Maven in parallel
-* [ARROW-9472](https://issues.apache.org/jira/browse/ARROW-9472) - [R] Provide configurable MetadataVersion in IPC API and environment variable to set default to V4 when needed
-* [ARROW-9473](https://issues.apache.org/jira/browse/ARROW-9473) - [Doc] Polishing for 1.0
-* [ARROW-9478](https://issues.apache.org/jira/browse/ARROW-9478) - [C++] Improve error message on unsupported cast types
-* [ARROW-9484](https://issues.apache.org/jira/browse/ARROW-9484) - [Docs] Update is\* functions to be is\_\* in the compute docs
-* [ARROW-9485](https://issues.apache.org/jira/browse/ARROW-9485) - [R] Better shared library stripping
-* [ARROW-9493](https://issues.apache.org/jira/browse/ARROW-9493) - [Python][Dataset] Dictionary encode string partition columns by default
-* [ARROW-9509](https://issues.apache.org/jira/browse/ARROW-9509) - [Release] Don't test Gandiva in the Windows wheel verification script
-* [ARROW-9511](https://issues.apache.org/jira/browse/ARROW-9511) - [Packaging][Release] Set conda packages' build number to 0
-* [ARROW-9519](https://issues.apache.org/jira/browse/ARROW-9519) - [Rust] Improve error message when getting a field by name from schema
-* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel
-* [ARROW-9529](https://issues.apache.org/jira/browse/ARROW-9529) - [Dev][Release] Improvements to release verification scripts
-* [ARROW-9531](https://issues.apache.org/jira/browse/ARROW-9531) - [Packaging][Release] Update conda forge dependency pins
-* [PARQUET-1820](https://issues.apache.org/jira/browse/PARQUET-1820) - [C++] Use a column filter hint to inform read prefetching in Arrow reads
-* [PARQUET-1843](https://issues.apache.org/jira/browse/PARQUET-1843) - [C++] Unnecessary assignment in DictDecoderImpl::Decode
-* [PARQUET-1855](https://issues.apache.org/jira/browse/PARQUET-1855) - [C++] Improve documentation on MetaData ownership
-* [PARQUET-1861](https://issues.apache.org/jira/browse/PARQUET-1861) - [Documentation][C++] Explain ReaderProperties.buffer\_stream\*
-
-
-
-# Apache Arrow 0.17.1 (2020-05-18)
-
-## Bug Fixes
-
-* [ARROW-8503](https://issues.apache.org/jira/browse/ARROW-8503) - [Packaging][deb] Can't build apache-arrow-archive-keyring for RC
-* [ARROW-8505](https://issues.apache.org/jira/browse/ARROW-8505) - [Release][C\#] "sourcelink test" fails on Apache.Arrow.AssemblyInfo.cs
-* [ARROW-8584](https://issues.apache.org/jira/browse/ARROW-8584) - [Packaging][C++] Protobuf link error in deb builds
-* [ARROW-8608](https://issues.apache.org/jira/browse/ARROW-8608) - [C++] Update vendored mpark/variant.h to latest to fix NVCC compilation issues
-* [ARROW-8609](https://issues.apache.org/jira/browse/ARROW-8609) - [C++] ORC JNI bridge crashed on null arrow buffer
-* [ARROW-8641](https://issues.apache.org/jira/browse/ARROW-8641) - [Python] Regression in feather: no longer supports permutation in column selection
-* [ARROW-8657](https://issues.apache.org/jira/browse/ARROW-8657) - [Python][C++][Parquet] Forward compatibility issue from 0.16 to 0.17 when using version='2.0'
-* [ARROW-8684](https://issues.apache.org/jira/browse/ARROW-8684) - [Python] "SystemError: Bad call flags in \_PyMethodDef\_RawFastCallDict" in Python 3.7.7 on macOS when using pyarrow wheel
-* [ARROW-8694](https://issues.apache.org/jira/browse/ARROW-8694) - [Python][Parquet] parquet.read\_schema() fails when loading wide table created from Pandas DataFrame
-* [ARROW-8704](https://issues.apache.org/jira/browse/ARROW-8704) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz)
-* [ARROW-8706](https://issues.apache.org/jira/browse/ARROW-8706) - [C++][Parquet] Tracking JIRA for PARQUET-1857 (unencrypted INT16\_MAX Parquet row group limit)
-* [ARROW-8728](https://issues.apache.org/jira/browse/ARROW-8728) - [C++] Bitmap operation may cause buffer overflow
-* [ARROW-8741](https://issues.apache.org/jira/browse/ARROW-8741) - [Python][Packaging] Keep VS2015 for the Windows wheels
-* [ARROW-8750](https://issues.apache.org/jira/browse/ARROW-8750) - [Python] pyarrow.feather.write\_feather does not default to lz4 compression if it's available
-* [PARQUET-1857](https://issues.apache.org/jira/browse/PARQUET-1857) - [C++][Parquet] ParquetFileReader unable to read files with more than 32767 row groups
-
-
-## New Features and Improvements
-
-* [ARROW-8501](https://issues.apache.org/jira/browse/ARROW-8501) - [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6
-* [ARROW-8549](https://issues.apache.org/jira/browse/ARROW-8549) - [R] Assorted post-0.17 release cleanups
-* [ARROW-8699](https://issues.apache.org/jira/browse/ARROW-8699) - [R] Fix automatic r\_to\_py conversion
-* [ARROW-8758](https://issues.apache.org/jira/browse/ARROW-8758) - [R] Updates for compatibility with dplyr 1.0
-* [ARROW-8786](https://issues.apache.org/jira/browse/ARROW-8786) - [Packaging][rpm] Use bundled zstd in the CentOS 8 build
-
-
-
-# Apache Arrow 0.17.0 (2020-04-20)
-
-## Bug Fixes
-
-* [ARROW-1907](https://issues.apache.org/jira/browse/ARROW-1907) - [C++/Python] Feather format cannot accommodate string columns containing more than a total of 2GB of data
-* [ARROW-2255](https://issues.apache.org/jira/browse/ARROW-2255) - [Developer][Integration] Serialize schema- and field-level custom metadata in integration test JSON format
-* [ARROW-2587](https://issues.apache.org/jira/browse/ARROW-2587) - [Python] Unable to write StructArrays with multiple children to parquet
-* [ARROW-3004](https://issues.apache.org/jira/browse/ARROW-3004) - [Documentation] Builds docs for master rather than a pinned commit
-* [ARROW-3543](https://issues.apache.org/jira/browse/ARROW-3543) - [R] Better support for timestamp format and time zones in R
-* [ARROW-5265](https://issues.apache.org/jira/browse/ARROW-5265) - [Python/CI] Add integration test with kartothek
-* [ARROW-5473](https://issues.apache.org/jira/browse/ARROW-5473) - [C++] Build failure on googletest\_ep on Windows when using Ninja
-* [ARROW-5981](https://issues.apache.org/jira/browse/ARROW-5981) - [C++] DictionaryBuilder<T\> initialization with Array can fail silently
-* [ARROW-6528](https://issues.apache.org/jira/browse/ARROW-6528) - [C++] Spurious Flight test failures (port allocation failure)
-* [ARROW-6547](https://issues.apache.org/jira/browse/ARROW-6547) - [C++] valgrind errors in diff-test
-* [ARROW-6738](https://issues.apache.org/jira/browse/ARROW-6738) - [Java] Fix problems with current union comparison logic
-* [ARROW-6757](https://issues.apache.org/jira/browse/ARROW-6757) - [Python] Creating csv.ParseOptions() causes "Windows fatal exception: access violation" with Visual Studio 2017
-* [ARROW-6871](https://issues.apache.org/jira/browse/ARROW-6871) - [Java] Enhance TransferPair related parameters check and tests
-* [ARROW-6872](https://issues.apache.org/jira/browse/ARROW-6872) - [C++][Python] Empty table with dictionary-columns raises ArrowNotImplementedError
-* [ARROW-6890](https://issues.apache.org/jira/browse/ARROW-6890) - [Rust] [Parquet] ArrowReader fails with seg fault
-* [ARROW-6895](https://issues.apache.org/jira/browse/ARROW-6895) - [C++][Parquet] parquet::arrow::ColumnReader: ByteArrayDictionaryRecordReader repeats returned values when calling \`NextBatch()\`
-* [ARROW-7008](https://issues.apache.org/jira/browse/ARROW-7008) - [Python] pyarrow.chunked\_array([array]) fails on array with all-None buffers
-* [ARROW-7049](https://issues.apache.org/jira/browse/ARROW-7049) - [C++] warnings building on mingw-w64
-* [ARROW-7301](https://issues.apache.org/jira/browse/ARROW-7301) - [Java] SQL type DATE should correspond to DateDayVector
-* [ARROW-7335](https://issues.apache.org/jira/browse/ARROW-7335) - [C++][Gandiva] Add castBIGINT, extractDay interval\_day functions in Gandiva
-* [ARROW-7390](https://issues.apache.org/jira/browse/ARROW-7390) - [C++][Dataset] Concurrency race in Projector::Project
-* [ARROW-7405](https://issues.apache.org/jira/browse/ARROW-7405) - [Java] ListVector isEmpty API is incorrect
-* [ARROW-7466](https://issues.apache.org/jira/browse/ARROW-7466) - [CI][Java] Fix gandiva-jar-osx nightly build failure
-* [ARROW-7467](https://issues.apache.org/jira/browse/ARROW-7467) - [Java] ComplexCopier does incorrect copy for Map nullable info
-* [ARROW-7507](https://issues.apache.org/jira/browse/ARROW-7507) - [Rust] Bump Thrift version to 0.13 in parquet-format and parquet
-* [ARROW-7520](https://issues.apache.org/jira/browse/ARROW-7520) - [R] Writing many batches causes a crash
-* [ARROW-7546](https://issues.apache.org/jira/browse/ARROW-7546) - [Java] Use new implementation to concat vectors values in batch
-* [ARROW-7624](https://issues.apache.org/jira/browse/ARROW-7624) - [Rust] Soundness issues via \`Buffer\` methods
-* [ARROW-7628](https://issues.apache.org/jira/browse/ARROW-7628) - [Python] Better document some read\_csv corner cases
-* [ARROW-7631](https://issues.apache.org/jira/browse/ARROW-7631) - [C++][Gandiva] return zero if there is an overflow while converting a decimal to a lower precision/scale
-* [ARROW-7672](https://issues.apache.org/jira/browse/ARROW-7672) - [C++] NULL pointer dereference bug
-* [ARROW-7680](https://issues.apache.org/jira/browse/ARROW-7680) - [C++][Dataset] Partition discovery is not working with Windows paths
-* [ARROW-7701](https://issues.apache.org/jira/browse/ARROW-7701) - [C++] [CI] Flight test error on macOS
-* [ARROW-7713](https://issues.apache.org/jira/browse/ARROW-7713) - [Java] TestLeak was put at the wrong location
-* [ARROW-7722](https://issues.apache.org/jira/browse/ARROW-7722) - [Java][FlightRPC] Memory leak
-* [ARROW-7734](https://issues.apache.org/jira/browse/ARROW-7734) - [C++] Segfault when comparing status with and without detail
-* [ARROW-7740](https://issues.apache.org/jira/browse/ARROW-7740) - [C++] Array internals corruption in StructArray::Flatten
-* [ARROW-7755](https://issues.apache.org/jira/browse/ARROW-7755) - [Python] Windows wheel cannot be installed on Python 3.8
-* [ARROW-7758](https://issues.apache.org/jira/browse/ARROW-7758) - [Python] Wrong conversion of timestamps that are out of bounds for pandas (eg 0000-01-01)
-* [ARROW-7760](https://issues.apache.org/jira/browse/ARROW-7760) - [Release] Fix verify-release-candidate.sh since pip3 seems to no longer be in miniconda
-* [ARROW-7762](https://issues.apache.org/jira/browse/ARROW-7762) - [Python] Exceptions in ParquetWriter get ignored
-* [ARROW-7766](https://issues.apache.org/jira/browse/ARROW-7766) - [Python][Packaging] Windows py38 wheels are built with wrong ABI tag
-* [ARROW-7772](https://issues.apache.org/jira/browse/ARROW-7772) - [R][C++][Dataset] Unable to filter on date32 object with date64 scalar
-* [ARROW-7775](https://issues.apache.org/jira/browse/ARROW-7775) - [Rust] Don't let safe code arbitrarily transmute readers and writers
-* [ARROW-7777](https://issues.apache.org/jira/browse/ARROW-7777) - [Go] StructBuilder/ListBuilder index out of range panic
-* [ARROW-7780](https://issues.apache.org/jira/browse/ARROW-7780) - [Release] Fix Windows wheel RC verification script given lack of "m" ABI tag in Python 3.8
-* [ARROW-7781](https://issues.apache.org/jira/browse/ARROW-7781) - [C++][Dataset] Filtering on a non-existent column gives a segfault
-* [ARROW-7783](https://issues.apache.org/jira/browse/ARROW-7783) - [C++] ARROW\_DATASET should enable ARROW\_COMPUTE
-* [ARROW-7785](https://issues.apache.org/jira/browse/ARROW-7785) - [C++] sparse\_tensor.cc is extremely slow to compile
-* [ARROW-7786](https://issues.apache.org/jira/browse/ARROW-7786) - [R] Wire up check\_metadata in Table.Equals method
-* [ARROW-7789](https://issues.apache.org/jira/browse/ARROW-7789) - [R] Can't initialize arrow objects when R.oo package is loaded
-* [ARROW-7791](https://issues.apache.org/jira/browse/ARROW-7791) - [C++][Parquet] Fix building error "cannot bind lvalue"
-* [ARROW-7792](https://issues.apache.org/jira/browse/ARROW-7792) - [R] read\_\* functions should close connection to file
-* [ARROW-7793](https://issues.apache.org/jira/browse/ARROW-7793) - [Java] If there is a leak, the base allocator should release the excess memory to the parent before throwing an exception
-* [ARROW-7794](https://issues.apache.org/jira/browse/ARROW-7794) - [Rust] cargo publish fails for arrow-flight due to relative path to Flight.proto
-* [ARROW-7797](https://issues.apache.org/jira/browse/ARROW-7797) - [Release][Rust] Fix arrow-flight's version in datafusion crate
-* [ARROW-7802](https://issues.apache.org/jira/browse/ARROW-7802) - [C++] Support for LargeBinary and LargeString in the hash kernel
-* [ARROW-7806](https://issues.apache.org/jira/browse/ARROW-7806) - [Python] Implement to\_pandas for lists of LargeBinary/String
-* [ARROW-7807](https://issues.apache.org/jira/browse/ARROW-7807) - [R] Installation on RHEL 7: cannot call io\_\_\_MemoryMappedFile\_\_Open()
-* [ARROW-7809](https://issues.apache.org/jira/browse/ARROW-7809) - [R] Vignette does not run on Win 10 or Ubuntu
-* [ARROW-7813](https://issues.apache.org/jira/browse/ARROW-7813) - [Rust] Fix undefined behaviour and remove unsafe
-* [ARROW-7815](https://issues.apache.org/jira/browse/ARROW-7815) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz)
-* [ARROW-7827](https://issues.apache.org/jira/browse/ARROW-7827) - [Python] conda-forge pyarrow package does not have s3 enabled
-* [ARROW-7832](https://issues.apache.org/jira/browse/ARROW-7832) - [R] Patches to 0.16.0 release
-* [ARROW-7836](https://issues.apache.org/jira/browse/ARROW-7836) - [Rust] "allocate\_aligned"/"reallocate" need to initialize memory to avoid UB
-* [ARROW-7837](https://issues.apache.org/jira/browse/ARROW-7837) - [Java] Bug in BaseVariableWidthVector.copyFromSafe results in an index out of bounds exception
-* [ARROW-7838](https://issues.apache.org/jira/browse/ARROW-7838) - [C++] Installed plasma-store-server fails finding Boost
-* [ARROW-7841](https://issues.apache.org/jira/browse/ARROW-7841) - [C++] HADOOP\_HOME doesn't work to find libhdfs.so
-* [ARROW-7844](https://issues.apache.org/jira/browse/ARROW-7844) - [R] array\_to\_vector is not thread safe
-* [ARROW-7848](https://issues.apache.org/jira/browse/ARROW-7848) - Add doc for MapType
-* [ARROW-7852](https://issues.apache.org/jira/browse/ARROW-7852) - [Python] 0.16.0 wheels not compatible with older numpy
-* [ARROW-7857](https://issues.apache.org/jira/browse/ARROW-7857) - [Python] Failing test with pandas master for extension type conversion
-* [ARROW-7861](https://issues.apache.org/jira/browse/ARROW-7861) - [C++][Parquet] Add fuzz regression corpus for parquet reader
-* [ARROW-7884](https://issues.apache.org/jira/browse/ARROW-7884) - [C++][Python] Crash in pq.read\_table()
-* [ARROW-7887](https://issues.apache.org/jira/browse/ARROW-7887) - [Rust] Filter kernel does not support temporal types
-* [ARROW-7889](https://issues.apache.org/jira/browse/ARROW-7889) - [Rust] Datafusion CLI does not support registering Parquet files
-* [ARROW-7899](https://issues.apache.org/jira/browse/ARROW-7899) - [Integration][Java] null type integration test
-* [ARROW-7908](https://issues.apache.org/jira/browse/ARROW-7908) - [R] Can't install package without setting LIBARROW\_DOWNLOAD=true
-* [ARROW-7922](https://issues.apache.org/jira/browse/ARROW-7922) - [CI][Crossbow] Nightly macOS wheel builds fail (brew bundle edition)
-* [ARROW-7923](https://issues.apache.org/jira/browse/ARROW-7923) - [CI][Crossbow] macOS autobrew fails on homebrew-versions
-* [ARROW-7926](https://issues.apache.org/jira/browse/ARROW-7926) - [Developer] "archery lint" target is not ergonomic for running a single check like IWYU
-* [ARROW-7928](https://issues.apache.org/jira/browse/ARROW-7928) - [Python] Example of flight server and client not working
-* [ARROW-7931](https://issues.apache.org/jira/browse/ARROW-7931) - [C++] Fix crash on corrupt Map array input (OSS-Fuzz)
-* [ARROW-7936](https://issues.apache.org/jira/browse/ARROW-7936) - [Python] FileSystem.from\_uri test fails on Python 3.5
-* [ARROW-7940](https://issues.apache.org/jira/browse/ARROW-7940) - [C++] Unable to generate cmake build with settings other than default
-* [ARROW-7944](https://issues.apache.org/jira/browse/ARROW-7944) - [Python] Test failures without Pandas
-* [ARROW-7956](https://issues.apache.org/jira/browse/ARROW-7956) - [Python] Memory leak in pyarrow functions .ipc.serialize\_pandas/deserialize\_pandas
-* [ARROW-7958](https://issues.apache.org/jira/browse/ARROW-7958) - [Java] Update Avro to version 1.9.2
-* [ARROW-7962](https://issues.apache.org/jira/browse/ARROW-7962) - [R][Dataset] Followup to "Consolidate Source and Dataset classes"
-* [ARROW-7968](https://issues.apache.org/jira/browse/ARROW-7968) - [C++] orc\_ep build fails on 64-bit Raspbian
-* [ARROW-7973](https://issues.apache.org/jira/browse/ARROW-7973) - [Developer][C++] ResourceWarnings in run\_cpplint.py
-* [ARROW-7974](https://issues.apache.org/jira/browse/ARROW-7974) - [Developer][C++] ResourceWarning in "make check-format"
-* [ARROW-7975](https://issues.apache.org/jira/browse/ARROW-7975) - [C++] Do not include padding bytes in "Buffer" IPC metadata accounting
-* [ARROW-7978](https://issues.apache.org/jira/browse/ARROW-7978) - [Developer] GitHub Actions "lint" task is running include-what-you-use and failing
-* [ARROW-7980](https://issues.apache.org/jira/browse/ARROW-7980) - [Python] Deserialization with pyarrow fails for certain Timestamp-based data frame
-* [ARROW-7981](https://issues.apache.org/jira/browse/ARROW-7981) - [C++][Dataset] Fails to compile on gcc 5.4
-* [ARROW-7985](https://issues.apache.org/jira/browse/ARROW-7985) - [C++] ListBuilder.Finish fails if underlying value builder is empty and .Reserve'd
-* [ARROW-7990](https://issues.apache.org/jira/browse/ARROW-7990) - [C++][Developer] Add "archery lint" option for running "iwyu.sh all"
-* [ARROW-7992](https://issues.apache.org/jira/browse/ARROW-7992) - [C++] MSVC warning causing Appveyor failure in sort\_to\_indices.cc
-* [ARROW-7996](https://issues.apache.org/jira/browse/ARROW-7996) - [Python] Error serializing empty pandas DataFrame with pyarrow
-* [ARROW-7997](https://issues.apache.org/jira/browse/ARROW-7997) - [Python] Schema equals method with inconsistent docs in pyarrow
-* [ARROW-7999](https://issues.apache.org/jira/browse/ARROW-7999) - [C++] Fix crash on corrupt Map array input (OSS-Fuzz)
-* [ARROW-8000](https://issues.apache.org/jira/browse/ARROW-8000) - [C++] gcc 4.8 build failures
-* [ARROW-8003](https://issues.apache.org/jira/browse/ARROW-8003) - [C++] -DBZip2\_SOURCE=BUNDLED fails when building with clang
-* [ARROW-8006](https://issues.apache.org/jira/browse/ARROW-8006) - [C++] Unsafe Arrow dictionary recovered from Parquet
-* [ARROW-8007](https://issues.apache.org/jira/browse/ARROW-8007) - [Python] Remove unused and defunct assert\_get\_object\_equal in plasma tests
-* [ARROW-8008](https://issues.apache.org/jira/browse/ARROW-8008) - [C++/Python] Framework Python is preferred even though not the activated one
-* [ARROW-8009](https://issues.apache.org/jira/browse/ARROW-8009) - [Java] Fix the hash code methods for BitVector
-* [ARROW-8011](https://issues.apache.org/jira/browse/ARROW-8011) - [C++] Some buffers not resized when reading from Parquet
-* [ARROW-8013](https://issues.apache.org/jira/browse/ARROW-8013) - [Python][Packaging] Fix manylinux wheels
-* [ARROW-8021](https://issues.apache.org/jira/browse/ARROW-8021) - [Python] Appveyor does not appear to be including pandas in test runs
-* [ARROW-8029](https://issues.apache.org/jira/browse/ARROW-8029) - [R] rstudio/r-base:3.6-centos7 GHA build failing on master
-* [ARROW-8036](https://issues.apache.org/jira/browse/ARROW-8036) - [C++] Compilation failure with gtest 1.10.0
-* [ARROW-8042](https://issues.apache.org/jira/browse/ARROW-8042) - [Python] pyarrow.ChunkedArray docstring is incorrect regarding zero-length ChunkedArray having no chunks
-* [ARROW-8057](https://issues.apache.org/jira/browse/ARROW-8057) - [Python] Don't check Schema metadata in \_\_eq\_\_ and \_\_ne\_\_
-* [ARROW-8070](https://issues.apache.org/jira/browse/ARROW-8070) - [C++] Cast segfaults on unsupported cast from list<binary\> to utf8
-* [ARROW-8071](https://issues.apache.org/jira/browse/ARROW-8071) - [GLib] Build error with configure
-* [ARROW-8075](https://issues.apache.org/jira/browse/ARROW-8075) - [R] Loading R.utils after arrow breaks some arrow functions
-* [ARROW-8088](https://issues.apache.org/jira/browse/ARROW-8088) - [C++][Dataset] Partition columns with specified dictionary type result in all nulls
-* [ARROW-8091](https://issues.apache.org/jira/browse/ARROW-8091) - [CI][Crossbow] Fix nightly homebrew and R failures
-* [ARROW-8092](https://issues.apache.org/jira/browse/ARROW-8092) - [CI][Crossbow] OSX wheels fail on bundled bzip2
-* [ARROW-8094](https://issues.apache.org/jira/browse/ARROW-8094) - [CI][Crossbow] Nightly valgrind test fails
-* [ARROW-8095](https://issues.apache.org/jira/browse/ARROW-8095) - [CI][Crossbow] Nightly turbodbc job fails
-* [ARROW-8098](https://issues.apache.org/jira/browse/ARROW-8098) - [Go] Checkptr failures on Go 1.14
-* [ARROW-8099](https://issues.apache.org/jira/browse/ARROW-8099) - [Integration] archery integration --with-LANG flags don't work
-* [ARROW-8101](https://issues.apache.org/jira/browse/ARROW-8101) - [FlightRPC][Java] Can't read/write only an empty null array
-* [ARROW-8102](https://issues.apache.org/jira/browse/ARROW-8102) - [Dev] Crossbow's version detection doesn't work in the comment bot's scenario
-* [ARROW-8105](https://issues.apache.org/jira/browse/ARROW-8105) - [Python] pyarrow.array segfaults when passed masked array with shrunken mask
-* [ARROW-8106](https://issues.apache.org/jira/browse/ARROW-8106) - [Python] Builds on master broken by pandas 1.0.2 release
-* [ARROW-8110](https://issues.apache.org/jira/browse/ARROW-8110) - [C\#] BuildArrays fails if NestedType is included
-* [ARROW-8112](https://issues.apache.org/jira/browse/ARROW-8112) - [FlightRPC][C++] Some status codes don't round-trip through gRPC
-* [ARROW-8119](https://issues.apache.org/jira/browse/ARROW-8119) - [Dev] Make Yaml optional dependency for archery
-* [ARROW-8122](https://issues.apache.org/jira/browse/ARROW-8122) - [Python] Empty numpy arrays with shape cannot be deserialized
-* [ARROW-8125](https://issues.apache.org/jira/browse/ARROW-8125) - [C++] "arrow-tests" target broken with ninja build
-* [ARROW-8127](https://issues.apache.org/jira/browse/ARROW-8127) - [C++] [Parquet] Incorrect column chunk metadata for multipage batch writes
-* [ARROW-8128](https://issues.apache.org/jira/browse/ARROW-8128) - [C\#] NestedType children serialized on wrong length
-* [ARROW-8132](https://issues.apache.org/jira/browse/ARROW-8132) - [C++] arrow-s3fs-test failing on master
-* [ARROW-8133](https://issues.apache.org/jira/browse/ARROW-8133) - [CI] Github Actions sometimes fail to checkout Arrow
-* [ARROW-8136](https://issues.apache.org/jira/browse/ARROW-8136) - [C++][Python] Creating dataset from relative path no longer working
-* [ARROW-8138](https://issues.apache.org/jira/browse/ARROW-8138) - [C++] parquet::arrow::FileReader cannot read multiple RowGroup
-* [ARROW-8139](https://issues.apache.org/jira/browse/ARROW-8139) - [C++] FileSystem enum causes attributes warning
-* [ARROW-8142](https://issues.apache.org/jira/browse/ARROW-8142) - [C++] Casting a chunked array with 0 chunks causes a critical failure
-* [ARROW-8144](https://issues.apache.org/jira/browse/ARROW-8144) - [CI] CMake 3.2 nightly build fails
-* [ARROW-8154](https://issues.apache.org/jira/browse/ARROW-8154) - [Python] HDFS Filesystem does not set environment variables in pyarrow 0.16.0 release
-* [ARROW-8159](https://issues.apache.org/jira/browse/ARROW-8159) - [Python] pyarrow.Schema.from\_pandas doesn't support ExtensionDtype
-* [ARROW-8166](https://issues.apache.org/jira/browse/ARROW-8166) - [C++] AVX512 intrinsics fail to compile with clang-8 on Ubuntu 18.04
-* [ARROW-8176](https://issues.apache.org/jira/browse/ARROW-8176) - [FlightRPC][Integration] Have Flight services bind to port 0 in integration
-* [ARROW-8186](https://issues.apache.org/jira/browse/ARROW-8186) - [Python] Dataset expression != returns bool instead of expression for invalid value
-* [ARROW-8188](https://issues.apache.org/jira/browse/ARROW-8188) - [R] Adapt to latest checks in R-devel
-* [ARROW-8193](https://issues.apache.org/jira/browse/ARROW-8193) - [C++] arrow-future-test fails to compile on gcc 4.8
-* [ARROW-8197](https://issues.apache.org/jira/browse/ARROW-8197) - [Rust] DataFusion "create\_physical\_plan" returns incorrect schema?
-* [ARROW-8206](https://issues.apache.org/jira/browse/ARROW-8206) - [R] Minor fix for backwards compatibility on Linux installation
-* [ARROW-8209](https://issues.apache.org/jira/browse/ARROW-8209) - [Python] Accessing duplicate column of Table by name gives wrong error
-* [ARROW-8213](https://issues.apache.org/jira/browse/ARROW-8213) - [Python][Dataset] Opening a dataset with a local incorrect path gives confusing error message
-* [ARROW-8216](https://issues.apache.org/jira/browse/ARROW-8216) - [R][C++][Dataset] Filtering returns all-missing rows where the filtering column is missing
-* [ARROW-8217](https://issues.apache.org/jira/browse/ARROW-8217) - [R][C++] Fix crashing test in test-dataset.R on 32-bit Windows from ARROW-7979
-* [ARROW-8219](https://issues.apache.org/jira/browse/ARROW-8219) - [Rust] sqlparser crate needs to be bumped to version 0.2.5
-* [ARROW-8223](https://issues.apache.org/jira/browse/ARROW-8223) - [Python] Schema.from\_pandas breaks with pandas nullable integer dtype
-* [ARROW-8233](https://issues.apache.org/jira/browse/ARROW-8233) - [CI] Build timeouts on "AMD64 Windows MinGW 64 GLib & Ruby"
-* [ARROW-8234](https://issues.apache.org/jira/browse/ARROW-8234) - [CI] Build timeouts on "AMD64 Windows RTools 35"
-* [ARROW-8236](https://issues.apache.org/jira/browse/ARROW-8236) - [Rust] Linting GitHub Actions task failing
-* [ARROW-8237](https://issues.apache.org/jira/browse/ARROW-8237) - [Python] Review Developer build instructions for conda and non-conda users
-* [ARROW-8238](https://issues.apache.org/jira/browse/ARROW-8238) - [C++][Compute] Failed to build compute tests on Windows with MSVC 2015
-* [ARROW-8239](https://issues.apache.org/jira/browse/ARROW-8239) - [Java] Fix param checks in splitAndTransfer method
-* [ARROW-8245](https://issues.apache.org/jira/browse/ARROW-8245) - [Python][Parquet] Skip hidden directories when reading partitioned parquet files
-* [ARROW-8254](https://issues.apache.org/jira/browse/ARROW-8254) - [Rust] [DataFusion] CLI is not working as expected
-* [ARROW-8255](https://issues.apache.org/jira/browse/ARROW-8255) - [Rust] [DataFusion] COUNT(\*) results in confusing error
-* [ARROW-8259](https://issues.apache.org/jira/browse/ARROW-8259) - [Rust] [DataFusion] ProjectionPushDownRule does not rewrite LIMIT
-* [ARROW-8268](https://issues.apache.org/jira/browse/ARROW-8268) - [Ruby] Test failure due to lack of built ZSTD support
-* [ARROW-8269](https://issues.apache.org/jira/browse/ARROW-8269) - [Python] Failure in "nopandas" build in test\_parquet\_row\_group\_fragments
-* [ARROW-8270](https://issues.apache.org/jira/browse/ARROW-8270) - [Python][Flight] Example Flight server with TLS's certificate and key is not working
-* [ARROW-8272](https://issues.apache.org/jira/browse/ARROW-8272) - [CI][Python] Test failure on Ubuntu 16.04
-* [ARROW-8274](https://issues.apache.org/jira/browse/ARROW-8274) - [C++] Use LZ4 frame format for "LZ4" compression in IPC write
-* [ARROW-8276](https://issues.apache.org/jira/browse/ARROW-8276) - [C++][Dataset] Scanning a Fragment does not take into account the partition columns
-* [ARROW-8280](https://issues.apache.org/jira/browse/ARROW-8280) - [C++] MinGW builds failing due to CARES-related toolchain issue
-* [ARROW-8286](https://issues.apache.org/jira/browse/ARROW-8286) - [Python] Creating dataset from pathlib results in UnionDataset instead of FileSystemDataset
-* [ARROW-8298](https://issues.apache.org/jira/browse/ARROW-8298) - [C++][CI] MinGW builds fail building grpc
-* [ARROW-8303](https://issues.apache.org/jira/browse/ARROW-8303) - [Python] Fix test failure caused by non-deterministic dict key ordering on Python 3.5
-* [ARROW-8304](https://issues.apache.org/jira/browse/ARROW-8304) - [Flight][Python] Flight client with TLS root certificate is reporting error on do\_get()
-* [ARROW-8305](https://issues.apache.org/jira/browse/ARROW-8305) - [Java] ExtensionTypeVector should make sure underlyingVector is not null
-* [ARROW-8310](https://issues.apache.org/jira/browse/ARROW-8310) - [C++] Minio's exceptions not recognized by IsConnectError()
-* [ARROW-8315](https://issues.apache.org/jira/browse/ARROW-8315) - [Python][Dataset] Don't rely on ordered dict keys in test\_dataset.py
-* [ARROW-8323](https://issues.apache.org/jira/browse/ARROW-8323) - [C++] Pin gRPC at v1.27 to avoid compilation error in its headers
-* [ARROW-8326](https://issues.apache.org/jira/browse/ARROW-8326) - [C++] Don't use deprecated TYPED\_TEST\_CASE
-* [ARROW-8327](https://issues.apache.org/jira/browse/ARROW-8327) - [FlightRPC][Java] gRPC trailers may be null
-* [ARROW-8331](https://issues.apache.org/jira/browse/ARROW-8331) - [C++] arrow-compute-filter-benchmark fails to compile
-* [ARROW-8333](https://issues.apache.org/jira/browse/ARROW-8333) - [C++][CI] Always compile benchmarks in some C++ CI entry
-* [ARROW-8334](https://issues.apache.org/jira/browse/ARROW-8334) - [C++] [Gandiva] Missing DATE32 in LLVM Types / Simple D32 Compute Functions
-* [ARROW-8342](https://issues.apache.org/jira/browse/ARROW-8342) - [Python] dask and kartothek integration tests are failing
-* [ARROW-8345](https://issues.apache.org/jira/browse/ARROW-8345) - [Python] feather.read\_table should not require pandas
-* [ARROW-8346](https://issues.apache.org/jira/browse/ARROW-8346) - [CI][Ruby] GLib/Ruby macOS build fails on zlib
-* [ARROW-8349](https://issues.apache.org/jira/browse/ARROW-8349) - [CI][NIGHTLY:gandiva-jar-osx] Use latest pygit2
-* [ARROW-8353](https://issues.apache.org/jira/browse/ARROW-8353) - [C++] is\_nullable may not be initialized in Parquet writer
-* [ARROW-8354](https://issues.apache.org/jira/browse/ARROW-8354) - [R] Fix segfault in Table to Array conversion
-* [ARROW-8357](https://issues.apache.org/jira/browse/ARROW-8357) - [Rust] [DataFusion] Dockerfile for CLI is missing format dir
-* [ARROW-8358](https://issues.apache.org/jira/browse/ARROW-8358) - [C++] Fix -Wrange-loop-construct warnings in clang-11
-* [ARROW-8365](https://issues.apache.org/jira/browse/ARROW-8365) - [C++] Error when writing files to S3 larger than 5 GB
-* [ARROW-8366](https://issues.apache.org/jira/browse/ARROW-8366) - [Rust] Need to revert recent arrow-flight build change
-* [ARROW-8369](https://issues.apache.org/jira/browse/ARROW-8369) - [CI] Fix crossbow wildcard groups
-* [ARROW-8373](https://issues.apache.org/jira/browse/ARROW-8373) - [GLib] Problems resolving gobject-introspection, arrow in Meson builds
-* [ARROW-8380](https://issues.apache.org/jira/browse/ARROW-8380) - [Rust] StringDictionaryBuilder not publicly exported from arrow::array
-* [ARROW-8384](https://issues.apache.org/jira/browse/ARROW-8384) - [C++][Python] arrow/filesystem/hdfs.h and its Python wrapper do not have an option for setting a path to a Kerberos ticket
-* [ARROW-8386](https://issues.apache.org/jira/browse/ARROW-8386) - [Python] pyarrow.jvm raises error for empty Arrays
-* [ARROW-8388](https://issues.apache.org/jira/browse/ARROW-8388) - [C++] GCC 4.8 fails to move on return
-* [ARROW-8397](https://issues.apache.org/jira/browse/ARROW-8397) - [C++] Fail to compile aggregate\_test.cc on Ubuntu 16.04
-* [ARROW-8406](https://issues.apache.org/jira/browse/ARROW-8406) - [Python] test\_fs fails when run from a different drive on Windows
-* [ARROW-8410](https://issues.apache.org/jira/browse/ARROW-8410) - [C++] CMake fails on aarch64 systems that do not support -march=armv8-a+crc+crypto
-* [ARROW-8414](https://issues.apache.org/jira/browse/ARROW-8414) - [Python] Non-deterministic row order failure in test\_parquet.py
-* [ARROW-8415](https://issues.apache.org/jira/browse/ARROW-8415) - [C++][Packaging] Fix Gandiva Linux job
-* [ARROW-8416](https://issues.apache.org/jira/browse/ARROW-8416) - [Python] Provide a "feather" alias in the dataset API
-* [ARROW-8420](https://issues.apache.org/jira/browse/ARROW-8420) - [C++] CMake fails to configure on armv7l platform (e.g. Raspberry Pi 3)
-* [ARROW-8427](https://issues.apache.org/jira/browse/ARROW-8427) - [C++][Dataset] Do not ignore file paths with underscore/dot when full path was specified
-* [ARROW-8428](https://issues.apache.org/jira/browse/ARROW-8428) - [C++][NIGHTLY:gandiva-jar-trusty] GCC 4.8 failures in C++ unit tests
-* [ARROW-8429](https://issues.apache.org/jira/browse/ARROW-8429) - [C++] Fix Buffer::CopySlice on 0-sized buffer
-* [ARROW-8432](https://issues.apache.org/jira/browse/ARROW-8432) - [Python][CI] Failure to download Hadoop
-* [ARROW-8437](https://issues.apache.org/jira/browse/ARROW-8437) - [C++] Remove std::move return value from MakeRandomNullBitmap test utility
-* [ARROW-8438](https://issues.apache.org/jira/browse/ARROW-8438) - [C++] arrow-io-memory-benchmark crashes
-* [ARROW-8439](https://issues.apache.org/jira/browse/ARROW-8439) - [Python] Filesystem docs are outdated
-* [ARROW-8441](https://issues.apache.org/jira/browse/ARROW-8441) - [C++] Fix crashes on invalid input (OSS-Fuzz)
-* [ARROW-8442](https://issues.apache.org/jira/browse/ARROW-8442) - [Python] NullType.to\_pandas\_dtype inconsistent with dtype returned in to\_pandas/to\_numpy
-* [ARROW-8460](https://issues.apache.org/jira/browse/ARROW-8460) - [Packaging][deb] Ubuntu Focal build fails
-* [ARROW-8465](https://issues.apache.org/jira/browse/ARROW-8465) - [Packaging][Python] Windows py35 wheel build fails because of boost
-* [ARROW-8466](https://issues.apache.org/jira/browse/ARROW-8466) - [Packaging] The python unittests are not running in the windows wheel builds
-* [ARROW-8468](https://issues.apache.org/jira/browse/ARROW-8468) - [Documentation] Fix the incorrect null bits description
-* [ARROW-8469](https://issues.apache.org/jira/browse/ARROW-8469) - [Dev] Fix nightly docker tests on azure
-* [ARROW-8478](https://issues.apache.org/jira/browse/ARROW-8478) - [Java] Roll back contrib package changes
-* [ARROW-8498](https://issues.apache.org/jira/browse/ARROW-8498) - [Python] Schema.from\_pandas fails on extension type, while Table.from\_pandas works
-* [PARQUET-1780](https://issues.apache.org/jira/browse/PARQUET-1780) - [C++] Set ColumnMetadata.encoding\_stats field
-* [PARQUET-1788](https://issues.apache.org/jira/browse/PARQUET-1788) - [C++] ColumnWriter has undefined behavior when writing arrow chunks
-* [PARQUET-1797](https://issues.apache.org/jira/browse/PARQUET-1797) - [C++] Fix fuzzing errors
-* [PARQUET-1799](https://issues.apache.org/jira/browse/PARQUET-1799) - [C++] Stream API: Relax schema checking when reading
-* [PARQUET-1810](https://issues.apache.org/jira/browse/PARQUET-1810) - [C++] Fix undefined behaviour on invalid enum values (OSS-Fuzz)
-* [PARQUET-1813](https://issues.apache.org/jira/browse/PARQUET-1813) - [C++] Remove logging statement in unit test
-* [PARQUET-1819](https://issues.apache.org/jira/browse/PARQUET-1819) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz)
-* [PARQUET-1823](https://issues.apache.org/jira/browse/PARQUET-1823) - [C++] Invalid RowGroup returned when reading with parquet::arrow::FileReader-\>RowGroup(i)-\>Column(j)
-* [PARQUET-1824](https://issues.apache.org/jira/browse/PARQUET-1824) - [C++] Fix crashes on invalid input (OSS-Fuzz)
-* [PARQUET-1829](https://issues.apache.org/jira/browse/PARQUET-1829) - [C++] Fix crashes on invalid input (OSS-Fuzz)
-* [PARQUET-1831](https://issues.apache.org/jira/browse/PARQUET-1831) - [C++] Fix crashes on invalid input (OSS-Fuzz)
-* [PARQUET-1835](https://issues.apache.org/jira/browse/PARQUET-1835) - [C++] Fix crashes on invalid input (OSS-Fuzz)
-
-
-## New Features and Improvements
-
-* [ARROW-590](https://issues.apache.org/jira/browse/ARROW-590) - [Integration] Add integration tests for Union types
-* [ARROW-1470](https://issues.apache.org/jira/browse/ARROW-1470) - [C++] Add BufferAllocator abstract interface
-* [ARROW-1560](https://issues.apache.org/jira/browse/ARROW-1560) - [C++] Kernel implementations for "match" function
-* [ARROW-1571](https://issues.apache.org/jira/browse/ARROW-1571) - [C++] Implement argsort kernels (sort indices) for integers using O(n) counting sort
-* [ARROW-1581](https://issues.apache.org/jira/browse/ARROW-1581) - [Packaging] Tooling to make nightly wheels available for install
-* [ARROW-1582](https://issues.apache.org/jira/browse/ARROW-1582) - [Python] Set up + document nightly conda builds for macOS
-* [ARROW-1636](https://issues.apache.org/jira/browse/ARROW-1636) - [Format] Integration tests for null type
-* [ARROW-2447](https://issues.apache.org/jira/browse/ARROW-2447) - [C++] Create a device abstraction
-* [ARROW-2882](https://issues.apache.org/jira/browse/ARROW-2882) - [C++][Python] Support AWS Firehose partition\_scheme implementation for Parquet datasets
-* [ARROW-3054](https://issues.apache.org/jira/browse/ARROW-3054) - [Packaging] Tooling to enable nightly conda packages to be updated to some anaconda.org channel
-* [ARROW-3410](https://issues.apache.org/jira/browse/ARROW-3410) - [C++][Dataset] Streaming CSV reader interface for memory-constrained environments
-* [ARROW-3750](https://issues.apache.org/jira/browse/ARROW-3750) - [R] Pass various wrapped Arrow objects created in Python into R with zero copy via reticulate
-* [ARROW-4120](https://issues.apache.org/jira/browse/ARROW-4120) - [Python] Define process for testing procedures that check for no macro-level memory leaks
-* [ARROW-4226](https://issues.apache.org/jira/browse/ARROW-4226) - [Format][C++] Add CSF sparse tensor support
-* [ARROW-4286](https://issues.apache.org/jira/browse/ARROW-4286) - [C++/R] Namespace vendored Boost
-* [ARROW-4304](https://issues.apache.org/jira/browse/ARROW-4304) - [Rust] Enhance documentation for arrow
-* [ARROW-4428](https://issues.apache.org/jira/browse/ARROW-4428) - [R] Feature flags for R build
-* [ARROW-4482](https://issues.apache.org/jira/browse/ARROW-4482) - [Website] Add blog archive page
-* [ARROW-4815](https://issues.apache.org/jira/browse/ARROW-4815) - [Rust] [DataFusion] Add support for \* in SQL projection
-* [ARROW-5357](https://issues.apache.org/jira/browse/ARROW-5357) - [Rust] Add capacity field in Buffer
-* [ARROW-5405](https://issues.apache.org/jira/browse/ARROW-5405) - [Documentation] Move integration testing documentation to Sphinx docs, add instructions for JavaScript
-* [ARROW-5497](https://issues.apache.org/jira/browse/ARROW-5497) - [Release] Build and publish R/Java/JS docs
-* [ARROW-5501](https://issues.apache.org/jira/browse/ARROW-5501) - [R] Reorganize read/write file/stream functions
-* [ARROW-5510](https://issues.apache.org/jira/browse/ARROW-5510) - [Format] Feather V2 based on Arrow IPC file format, with compression support
-* [ARROW-5563](https://issues.apache.org/jira/browse/ARROW-5563) - [Format] Update integration test JSON format documentation
-* [ARROW-5585](https://issues.apache.org/jira/browse/ARROW-5585) - [Go] Rename arrow.TypeEquals to arrow.TypeEqual
-* [ARROW-5742](https://issues.apache.org/jira/browse/ARROW-5742) - [CI] Add daily / weekly Valgrind build
-* [ARROW-5757](https://issues.apache.org/jira/browse/ARROW-5757) - [Python] Stop supporting Python 2.7
-* [ARROW-5949](https://issues.apache.org/jira/browse/ARROW-5949) - [Rust] Implement DictionaryArray
-* [ARROW-6165](https://issues.apache.org/jira/browse/ARROW-6165) - [Integration] Use multiprocessing to run integration tests on multiple CPU cores
-* [ARROW-6176](https://issues.apache.org/jira/browse/ARROW-6176) - [Python] Allow to subclass ExtensionArray to attach to custom extension type
-* [ARROW-6275](https://issues.apache.org/jira/browse/ARROW-6275) - [C++] Deprecate RecordBatchReader::ReadNext
-* [ARROW-6393](https://issues.apache.org/jira/browse/ARROW-6393) - [C++] Add EqualOptions support in SparseTensor::Equals
-* [ARROW-6479](https://issues.apache.org/jira/browse/ARROW-6479) - [C++] Inline errors from external projects' build logs
-* [ARROW-6510](https://issues.apache.org/jira/browse/ARROW-6510) - [Python][Filesystem] Expose nanosecond resolution mtime
-* [ARROW-6666](https://issues.apache.org/jira/browse/ARROW-6666) - [Rust] [DataFusion] Implement string literal expression
-* [ARROW-6724](https://issues.apache.org/jira/browse/ARROW-6724) - [C++] Add simpler static ctor for BufferOutputStream than the current Create function
-* [ARROW-6821](https://issues.apache.org/jira/browse/ARROW-6821) - [C++][Parquet] Do not require Thrift compiler when building (but still require library)
-* [ARROW-6823](https://issues.apache.org/jira/browse/ARROW-6823) - [C++][Python][R] Support metadata in the feather format?
-* [ARROW-6829](https://issues.apache.org/jira/browse/ARROW-6829) - [Docs] Migrate integration test docs to Sphinx, fix instructions after ARROW-6466
-* [ARROW-6837](https://issues.apache.org/jira/browse/ARROW-6837) - [C++/Python] Access File Footer custom\_metadata
-* [ARROW-6841](https://issues.apache.org/jira/browse/ARROW-6841) - [C++] Upgrade to LLVM 8
-* [ARROW-6875](https://issues.apache.org/jira/browse/ARROW-6875) - [FlightRPC] Implement Criteria for ListFlights RPC / list\_flights method
-* [ARROW-6915](https://issues.apache.org/jira/browse/ARROW-6915) - [Developer] Do not overwrite minor release version with merge script, even if not specified by committer
-* [ARROW-6947](https://issues.apache.org/jira/browse/ARROW-6947) - [Rust] [DataFusion] Add support for scalar UDFs
-* [ARROW-6996](https://issues.apache.org/jira/browse/ARROW-6996) - [Python] Expose boolean filter kernel on Table
-* [ARROW-7044](https://issues.apache.org/jira/browse/ARROW-7044) - [Release] Create a post-release script for the Homebrew formulae
-* [ARROW-7048](https://issues.apache.org/jira/browse/ARROW-7048) - [Java] Support for combining multiple vectors under VectorSchemaRoot
-* [ARROW-7063](https://issues.apache.org/jira/browse/ARROW-7063) - [C++] Schema print method prints too much metadata
-* [ARROW-7073](https://issues.apache.org/jira/browse/ARROW-7073) - [Java] Support concatenating vector values in batch
-* [ARROW-7080](https://issues.apache.org/jira/browse/ARROW-7080) - [Python][Parquet][C++] Expose parquet field\_id in Schema objects
-* [ARROW-7091](https://issues.apache.org/jira/browse/ARROW-7091) - [C++] Move all factories to type\_fwd.h
-* [ARROW-7119](https://issues.apache.org/jira/browse/ARROW-7119) - [C++][CI] Use scripts/util\_coredump.sh to show automatic backtraces
-* [ARROW-7201](https://issues.apache.org/jira/browse/ARROW-7201) - [GLib][Gandiva] Add support for BooleanNode
-* [ARROW-7202](https://issues.apache.org/jira/browse/ARROW-7202) - [R][CI] Improve rwinlib building on CI to stop re-downloading dependencies
-* [ARROW-7222](https://issues.apache.org/jira/browse/ARROW-7222) - [Python][Release] Wipe any existing generated Python API documentation when updating website
-* [ARROW-7233](https://issues.apache.org/jira/browse/ARROW-7233) - [C++] Add Result<T\> APIs to IPC module
-* [ARROW-7256](https://issues.apache.org/jira/browse/ARROW-7256) - [C++] Remove ARROW\_MEMORY\_POOL\_DEFAULT macro
-* [ARROW-7330](https://issues.apache.org/jira/browse/ARROW-7330) - [C++] Add Result<T\> to APIs to arrow/gpu
-* [ARROW-7332](https://issues.apache.org/jira/browse/ARROW-7332) - [C++][Parquet] Explicitly catch status exceptions in PARQUET\_CATCH\_NOT\_OK
-* [ARROW-7336](https://issues.apache.org/jira/browse/ARROW-7336) - [C++] Implement MinMax options to not skip nulls
-* [ARROW-7338](https://issues.apache.org/jira/browse/ARROW-7338) - [C++] Improve InMemoryDataSource to support generator instead of static list
-* [ARROW-7365](https://issues.apache.org/jira/browse/ARROW-7365) - [Python] Support FixedSizeList type in conversion to numpy/pandas
-* [ARROW-7373](https://issues.apache.org/jira/browse/ARROW-7373) - [C++][Dataset] Remove FileSource
-* [ARROW-7400](https://issues.apache.org/jira/browse/ARROW-7400) - [Java] Avoid the worst case for quicksort
-* [ARROW-7412](https://issues.apache.org/jira/browse/ARROW-7412) - [C++][Dataset] Ensure that dataset code is robust to schemas with duplicate field names
-* [ARROW-7419](https://issues.apache.org/jira/browse/ARROW-7419) - [Python] Support SparseCSCMatrix
-* [ARROW-7427](https://issues.apache.org/jira/browse/ARROW-7427) - [Python] Support SparseCSFTensor
-* [ARROW-7428](https://issues.apache.org/jira/browse/ARROW-7428) - [Format][C++] Add serialization for CSF sparse tensors
-* [ARROW-7444](https://issues.apache.org/jira/browse/ARROW-7444) - [GLib] Add LocalFileSystem support
-* [ARROW-7462](https://issues.apache.org/jira/browse/ARROW-7462) - [C++] Add CpuInfo detection for Arm64 Architecture
-* [ARROW-7491](https://issues.apache.org/jira/browse/ARROW-7491) - [Java] Improve the performance of aligning
-* [ARROW-7499](https://issues.apache.org/jira/browse/ARROW-7499) - [C++] CMake should collect libs when making static build
-* [ARROW-7501](https://issues.apache.org/jira/browse/ARROW-7501) - [C++] CMake build\_thrift should build flex and bison if necessary
-* [ARROW-7515](https://issues.apache.org/jira/browse/ARROW-7515) - [C++] Rename nonexistent and non\_existent to not\_found
-* [ARROW-7524](https://issues.apache.org/jira/browse/ARROW-7524) - [C++][CI] Build parquet support in the VS2019 GitHub Actions job
-* [ARROW-7530](https://issues.apache.org/jira/browse/ARROW-7530) - [Developer] Do not include list of commits from PR in squashed summary message
-* [ARROW-7534](https://issues.apache.org/jira/browse/ARROW-7534) - [Java] Create a new java/contrib module
-* [ARROW-7547](https://issues.apache.org/jira/browse/ARROW-7547) - [C++] [Python] [Dataset] Additional reader options in ParquetFileFormat
-* [ARROW-7555](https://issues.apache.org/jira/browse/ARROW-7555) - [Python] Drop support for Python 2.7
-* [ARROW-7587](https://issues.apache.org/jira/browse/ARROW-7587) - [C++][Compute] Add Top-K kernel
-* [ARROW-7608](https://issues.apache.org/jira/browse/ARROW-7608) - [C++][Dataset] Expose more informational properties
-* [ARROW-7615](https://issues.apache.org/jira/browse/ARROW-7615) - [CI][Gandiva] Ensure that the gandiva\_jni library has only a whitelisted set of shared dependencies as part of Travis CI job
-* [ARROW-7616](https://issues.apache.org/jira/browse/ARROW-7616) - [Java] Support comparing value ranges for dense union vector
-* [ARROW-7625](https://issues.apache.org/jira/browse/ARROW-7625) - [GLib] Parquet GLib and Red Parquet (Ruby) do not allow specifying compression type
-* [ARROW-7641](https://issues.apache.org/jira/browse/ARROW-7641) - [R] Make dataset vignette have executable code
-* [ARROW-7662](https://issues.apache.org/jira/browse/ARROW-7662) - [R] Support creating ListArray from R list
-* [ARROW-7664](https://issues.apache.org/jira/browse/ARROW-7664) - [C++] Extract localfs default from FileSystemFromUri
-* [ARROW-7675](https://issues.apache.org/jira/browse/ARROW-7675) - [R][CI] Move Windows CI from Appveyor to GHA
-* [ARROW-7679](https://issues.apache.org/jira/browse/ARROW-7679) - [R] Cleaner interface for creating UnionDataset
-* [ARROW-7684](https://issues.apache.org/jira/browse/ARROW-7684) - [Rust] Provide example of Flight server for DataFusion
-* [ARROW-7685](https://issues.apache.org/jira/browse/ARROW-7685) - [Developer] Add support for GitHub Actions to Crossbow
-* [ARROW-7691](https://issues.apache.org/jira/browse/ARROW-7691) - [C++] Verify missing fields when walking Flatbuffers data
-* [ARROW-7708](https://issues.apache.org/jira/browse/ARROW-7708) - [Release] Include PARQUET commits from git changelog in release changelogs
-* [ARROW-7712](https://issues.apache.org/jira/browse/ARROW-7712) - [CI][Crossbow] Fix or delete fuzzit jobs
-* [ARROW-7720](https://issues.apache.org/jira/browse/ARROW-7720) - [C++][Python] Add check\_metadata argument to Table.equals
-* [ARROW-7725](https://issues.apache.org/jira/browse/ARROW-7725) - [C++] Add infrastructure for unity builds and precompiled headers
-* [ARROW-7726](https://issues.apache.org/jira/browse/ARROW-7726) - [CI] [C++] Use boost binaries on Windows GHA build
-* [ARROW-7729](https://issues.apache.org/jira/browse/ARROW-7729) - [Python][CI] Pin pandas version to 0.25 in the dask integration test
-* [ARROW-7733](https://issues.apache.org/jira/browse/ARROW-7733) - [Developer] Install locally a new enough version of Go for release verification script
-* [ARROW-7735](https://issues.apache.org/jira/browse/ARROW-7735) - [Release] conda-forge channel is missing for verifying wheels
-* [ARROW-7736](https://issues.apache.org/jira/browse/ARROW-7736) - [Release] Binary verification sometimes fails with transient error
-* [ARROW-7739](https://issues.apache.org/jira/browse/ARROW-7739) - [GLib] Use placement new to initialize shared\_ptr object in private structs
-* [ARROW-7741](https://issues.apache.org/jira/browse/ARROW-7741) - [C++][Parquet] Incorporate new level generation logic in parquet write path with a flag to revert back to old logic
-* [ARROW-7742](https://issues.apache.org/jira/browse/ARROW-7742) - [GLib] Add support for MapArray
-* [ARROW-7745](https://issues.apache.org/jira/browse/ARROW-7745) - [Doc] [C++] Update Parquet documentation
-* [ARROW-7749](https://issues.apache.org/jira/browse/ARROW-7749) - [C++] Link some more tests together
-* [ARROW-7750](https://issues.apache.org/jira/browse/ARROW-7750) - [Release] Make the source release verification script restartable
-* [ARROW-7751](https://issues.apache.org/jira/browse/ARROW-7751) - [Release] macOS wheel verification also needs arrow-testing
-* [ARROW-7752](https://issues.apache.org/jira/browse/ARROW-7752) - [Release] Enable and test dataset in the verification script
-* [ARROW-7754](https://issues.apache.org/jira/browse/ARROW-7754) - [C++] Result<T\> is slow
-* [ARROW-7761](https://issues.apache.org/jira/browse/ARROW-7761) - [C++] Add S3 support to fs::FileSystemFromUri
-* [ARROW-7764](https://issues.apache.org/jira/browse/ARROW-7764) - [C++] Builders allocate a null bitmap buffer even if there are no nulls
-* [ARROW-7771](https://issues.apache.org/jira/browse/ARROW-7771) - [Developer] Use ARROW\_TMPDIR environment variable in the verification scripts instead of TMPDIR
-* [ARROW-7774](https://issues.apache.org/jira/browse/ARROW-7774) - [Packaging][Python] Update macos and windows wheel filenames
-* [ARROW-7787](https://issues.apache.org/jira/browse/ARROW-7787) - [Rust] Add collect to Table API
-* [ARROW-7788](https://issues.apache.org/jira/browse/ARROW-7788) - [C++] Add schema conversion support for map type
-* [ARROW-7790](https://issues.apache.org/jira/browse/ARROW-7790) - [Website] Update how to install Linux packages
-* [ARROW-7795](https://issues.apache.org/jira/browse/ARROW-7795) - [Rust] [DataFusion] Support boolean negation (NOT)
-* [ARROW-7796](https://issues.apache.org/jira/browse/ARROW-7796) - [R] write\_\* functions should invisibly return their inputs
-* [ARROW-7799](https://issues.apache.org/jira/browse/ARROW-7799) - [R][CI] Remove flatbuffers from homebrew formulae
-* [ARROW-7804](https://issues.apache.org/jira/browse/ARROW-7804) - [C++][R] Compile error on macOS 10.11
-* [ARROW-7812](https://issues.apache.org/jira/browse/ARROW-7812) - [Packaging][Python] Upgrade LLVM in manylinux1 docker image
-* [ARROW-7817](https://issues.apache.org/jira/browse/ARROW-7817) - [CI] macOS R autobrew nightly failed on installing dependency from source
-* [ARROW-7819](https://issues.apache.org/jira/browse/ARROW-7819) - [C++][Gandiva] Add DumpIR to Filter/Projector classes
-* [ARROW-7824](https://issues.apache.org/jira/browse/ARROW-7824) - [C++][Dataset] Provide Dataset writing to IPC format
-* [ARROW-7828](https://issues.apache.org/jira/browse/ARROW-7828) - [Release] Remove SSH keys for internal use
-* [ARROW-7829](https://issues.apache.org/jira/browse/ARROW-7829) - [R] Test R bindings on clang
-* [ARROW-7833](https://issues.apache.org/jira/browse/ARROW-7833) - [R] Make install\_arrow() actually install arrow
-* [ARROW-7834](https://issues.apache.org/jira/browse/ARROW-7834) - [Release] Post release task for updating the documentations
-* [ARROW-7839](https://issues.apache.org/jira/browse/ARROW-7839) - [Python][Dataset] Add IPC format to python bindings
-* [ARROW-7846](https://issues.apache.org/jira/browse/ARROW-7846) - [Python][Dev] Remove last dependencies on six
-* [ARROW-7847](https://issues.apache.org/jira/browse/ARROW-7847) - [Website] Write a blog post about fuzzing
-* [ARROW-7849](https://issues.apache.org/jira/browse/ARROW-7849) - [Packaging][Python] Remove the remaining py27 crossbow wheel tasks from the nightlies
-* [ARROW-7858](https://issues.apache.org/jira/browse/ARROW-7858) - [C++][Python] Support casting an Extension type to its storage type
-* [ARROW-7859](https://issues.apache.org/jira/browse/ARROW-7859) - [R] Minor patches for CRAN submission 0.16.0.2
-* [ARROW-7860](https://issues.apache.org/jira/browse/ARROW-7860) - [C++] Support cast to/from halffloat
-* [ARROW-7862](https://issues.apache.org/jira/browse/ARROW-7862) - [R] Linux installation should run quieter by default
-* [ARROW-7863](https://issues.apache.org/jira/browse/ARROW-7863) - [C++][Python][CI] Ensure running HDFS related tests
-* [ARROW-7864](https://issues.apache.org/jira/browse/ARROW-7864) - [R] Make sure bundled installation works even if there are system packages
-* [ARROW-7865](https://issues.apache.org/jira/browse/ARROW-7865) - [R] Test builds on latest Linux versions
-* [ARROW-7868](https://issues.apache.org/jira/browse/ARROW-7868) - [Crossbow] Reduce GitHub API query parallelism
-* [ARROW-7869](https://issues.apache.org/jira/browse/ARROW-7869) - [Python] Boost::system and boost::filesystem no longer necessary in Python wheels
-* [ARROW-7872](https://issues.apache.org/jira/browse/ARROW-7872) - [Python] Support conversion of list-of-struct in Array/Table.to\_pandas
-* [ARROW-7874](https://issues.apache.org/jira/browse/ARROW-7874) - [Python][Archery] Validate docstrings with numpydoc
-* [ARROW-7876](https://issues.apache.org/jira/browse/ARROW-7876) - [R] Installation fails in the documentation generation image
-* [ARROW-7877](https://issues.apache.org/jira/browse/ARROW-7877) - [Packaging] Fix crossbow deployment to github artifacts
-* [ARROW-7879](https://issues.apache.org/jira/browse/ARROW-7879) - [C++][Doc] Add doc for the Device API
-* [ARROW-7880](https://issues.apache.org/jira/browse/ARROW-7880) - [CI][R] R sanitizer job is not really working
-* [ARROW-7881](https://issues.apache.org/jira/browse/ARROW-7881) - [C++] Fix pedantic warnings
-* [ARROW-7882](https://issues.apache.org/jira/browse/ARROW-7882) - [C++][Gandiva] Optimise like function for substring pattern
-* [ARROW-7886](https://issues.apache.org/jira/browse/ARROW-7886) - [C++][Dataset] Consolidate Source and Dataset
-* [ARROW-7888](https://issues.apache.org/jira/browse/ARROW-7888) - [Python] Allow using a more modern version of jpype in pyarrow.jvm
-* [ARROW-7890](https://issues.apache.org/jira/browse/ARROW-7890) - [C++] Add Promise / Future implementation
-* [ARROW-7891](https://issues.apache.org/jira/browse/ARROW-7891) - [C++] RecordBatch-\>Equals should also have a check\_metadata argument
-* [ARROW-7892](https://issues.apache.org/jira/browse/ARROW-7892) - [Python] Expose FilesystemSource.format attribute
-* [ARROW-7895](https://issues.apache.org/jira/browse/ARROW-7895) - [Python] Remove more Python 2.7 cruft
-* [ARROW-7896](https://issues.apache.org/jira/browse/ARROW-7896) - [C++] Refactor from \#include guards to \#pragma once
-* [ARROW-7897](https://issues.apache.org/jira/browse/ARROW-7897) - [Packaging] Temporarily disable artifact uploading until we fix the deployment issues
-* [ARROW-7898](https://issues.apache.org/jira/browse/ARROW-7898) - [Python] Reduce the number of docstring violations using numpydoc
-* [ARROW-7904](https://issues.apache.org/jira/browse/ARROW-7904) - [C++] Decide about Field/Schema metadata printing parameters and how much to show by default
-* [ARROW-7907](https://issues.apache.org/jira/browse/ARROW-7907) - [Python] Conversion to pandas of empty table with timestamp type aborts
-* [ARROW-7912](https://issues.apache.org/jira/browse/ARROW-7912) - [Format] C data interface
-* [ARROW-7913](https://issues.apache.org/jira/browse/ARROW-7913) - [C++][Python][R] C++ implementation of C data interface
-* [ARROW-7915](https://issues.apache.org/jira/browse/ARROW-7915) - [CI] [Python] Run tests with Python development mode enabled
-* [ARROW-7916](https://issues.apache.org/jira/browse/ARROW-7916) - [C++][Dataset] Project IPC record batches to materialized fields
-* [ARROW-7917](https://issues.apache.org/jira/browse/ARROW-7917) - [CMake] FindPythonInterp should check for python3
-* [ARROW-7919](https://issues.apache.org/jira/browse/ARROW-7919) - [R] install\_arrow() should conda install if appropriate
-* [ARROW-7920](https://issues.apache.org/jira/browse/ARROW-7920) - [R] Fill in some missing input validation
-* [ARROW-7921](https://issues.apache.org/jira/browse/ARROW-7921) - [Go] Add Reset method to various components and clean up comments
-* [ARROW-7927](https://issues.apache.org/jira/browse/ARROW-7927) - [C++] Fix 'cpu\_info.cc' compilation warning
-* [ARROW-7929](https://issues.apache.org/jira/browse/ARROW-7929) - [C++] CMake target names differ from upstream provided names
-* [ARROW-7930](https://issues.apache.org/jira/browse/ARROW-7930) - [Python][CI] Test jpype integration in CI
-* [ARROW-7932](https://issues.apache.org/jira/browse/ARROW-7932) - [Rust] [Parquet] Implement array reader for temporal types
-* [ARROW-7934](https://issues.apache.org/jira/browse/ARROW-7934) - [C++] Fix UriEscape for empty string
-* [ARROW-7935](https://issues.apache.org/jira/browse/ARROW-7935) - [Java] Remove Netty dependency for BufferAllocator and ReferenceManager
-* [ARROW-7937](https://issues.apache.org/jira/browse/ARROW-7937) - [Python][Packaging] Remove boost from the macos wheels
-* [ARROW-7941](https://issues.apache.org/jira/browse/ARROW-7941) - [Rust] [DataFusion] Logical plan should support unresolved column references
-* [ARROW-7943](https://issues.apache.org/jira/browse/ARROW-7943) - [C++][Parquet] Add a new level builder capable of handling nested data
-* [ARROW-7947](https://issues.apache.org/jira/browse/ARROW-7947) - [Rust] [Flight] [DataFusion] Implement example for get\_schema
-* [ARROW-7949](https://issues.apache.org/jira/browse/ARROW-7949) - [Developer] Update '.gitignore' to not track the user-specific 'cpp/Brewfile.lock.json' file
-* [ARROW-7951](https://issues.apache.org/jira/browse/ARROW-7951) - [Python][Parquet] Expose BYTE\_STREAM\_SPLIT to pyarrow
-* [ARROW-7959](https://issues.apache.org/jira/browse/ARROW-7959) - [Ruby] Add support for Ruby 2.3 again
-* [ARROW-7963](https://issues.apache.org/jira/browse/ARROW-7963) - [C++][Python][Dataset] Expose listing fragments
-* [ARROW-7965](https://issues.apache.org/jira/browse/ARROW-7965) - [Python] Refine higher level dataset API
-* [ARROW-7966](https://issues.apache.org/jira/browse/ARROW-7966) - [Integration][Flight][C++] Client should verify each batch independently
-* [ARROW-7969](https://issues.apache.org/jira/browse/ARROW-7969) - [Packaging] Use cURL to upload artifacts
-* [ARROW-7970](https://issues.apache.org/jira/browse/ARROW-7970) - [Packaging][Python] Use system boost to build the macos wheels
-* [ARROW-7971](https://issues.apache.org/jira/browse/ARROW-7971) - [Rust] Create rowcount utility
-* [ARROW-7977](https://issues.apache.org/jira/browse/ARROW-7977) - [C++] Rename fs::FileStats to fs::FileInfo
-* [ARROW-7979](https://issues.apache.org/jira/browse/ARROW-7979) - [C++] Implement experimental buffer compression in IPC messages
-* [ARROW-7982](https://issues.apache.org/jira/browse/ARROW-7982) - [C++] Let ArrayDataVisitor accept void-returning functions
-* [ARROW-7983](https://issues.apache.org/jira/browse/ARROW-7983) - [CI][R] Nightly builds should be more verbose when they fail
-* [ARROW-7984](https://issues.apache.org/jira/browse/ARROW-7984) - [R] Check for valid inputs in more places
-* [ARROW-7986](https://issues.apache.org/jira/browse/ARROW-7986) - [Python] pa.Array.from\_pandas cannot convert pandas.Series containing pyspark.ml.linalg.SparseVector
-* [ARROW-7987](https://issues.apache.org/jira/browse/ARROW-7987) - [CI][R] Fix for verbose nightly builds
-* [ARROW-7988](https://issues.apache.org/jira/browse/ARROW-7988) - [R] Fix on.exit calls in reticulate bindings
-* [ARROW-7991](https://issues.apache.org/jira/browse/ARROW-7991) - [C++][Plasma] Allow option for evicting if full when creating an object
-* [ARROW-7993](https://issues.apache.org/jira/browse/ARROW-7993) - [Java] Support decimal type in ComplexCopier
-* [ARROW-7994](https://issues.apache.org/jira/browse/ARROW-7994) - [CI][C++] Move AppVeyor MinGW builds to GitHub Actions
-* [ARROW-7995](https://issues.apache.org/jira/browse/ARROW-7995) - [C++] IO: coalescing and caching read ranges
-* [ARROW-7998](https://issues.apache.org/jira/browse/ARROW-7998) - [C++][Plasma] Make Seal requests synchronous
-* [ARROW-8005](https://issues.apache.org/jira/browse/ARROW-8005) - [Website] Review and adjust any usages of Apache dist system from website / tools
-* [ARROW-8014](https://issues.apache.org/jira/browse/ARROW-8014) - [C++] Provide CMake targets to test only within a given label
-* [ARROW-8016](https://issues.apache.org/jira/browse/ARROW-8016) - [Developer] Fix deprecation warning in PR merge tool
-* [ARROW-8018](https://issues.apache.org/jira/browse/ARROW-8018) - [C++][Parquet] Parquet Modular Encryption
-* [ARROW-8024](https://issues.apache.org/jira/browse/ARROW-8024) - [R] Bindings for BinaryType and FixedBinaryType
-* [ARROW-8026](https://issues.apache.org/jira/browse/ARROW-8026) - [Python] Support memoryview in addition to string value types for constructing string and binary type arrays
-* [ARROW-8027](https://issues.apache.org/jira/browse/ARROW-8027) - [Developer][Integration] Add integration tests for duplicate field names
-* [ARROW-8028](https://issues.apache.org/jira/browse/ARROW-8028) - [Go] Allow duplicate field names in schemas and nested types
-* [ARROW-8030](https://issues.apache.org/jira/browse/ARROW-8030) - [C++][Plasma] Fix inconsistent comment style
-* [ARROW-8035](https://issues.apache.org/jira/browse/ARROW-8035) - [Developer][Integration] Add integration tests for extension types
-* [ARROW-8039](https://issues.apache.org/jira/browse/ARROW-8039) - [Python][Dataset] Support using dataset API in pyarrow.parquet with a minimal ParquetDataset shim
-* [ARROW-8044](https://issues.apache.org/jira/browse/ARROW-8044) - [CI][NIGHTLY:gandiva-jar-osx] pygit2 needs libgit2 v1.0.x
-* [ARROW-8055](https://issues.apache.org/jira/browse/ARROW-8055) - [GLib][Ruby] Add some metadata bindings to GArrowSchema
-* [ARROW-8058](https://issues.apache.org/jira/browse/ARROW-8058) - [C++][Python][Dataset] Provide an option to toggle validation and schema inference in FileSystemDatasetFactoryOptions
-* [ARROW-8059](https://issues.apache.org/jira/browse/ARROW-8059) - [Python] Make FileSystem objects serializable
-* [ARROW-8060](https://issues.apache.org/jira/browse/ARROW-8060) - [Python] Make dataset Expression objects serializable
-* [ARROW-8061](https://issues.apache.org/jira/browse/ARROW-8061) - [C++][Dataset] Ability to specify granularity of ParquetFileFragment (support row groups)
-* [ARROW-8063](https://issues.apache.org/jira/browse/ARROW-8063) - [Python] Add user guide documentation for Datasets API
-* [ARROW-8064](https://issues.apache.org/jira/browse/ARROW-8064) - [Dev] Implement Comment bot via Github actions
-* [ARROW-8069](https://issues.apache.org/jira/browse/ARROW-8069) - [C++] Should the default value of "check\_metadata" arguments of Equals methods be "true"?
-* [ARROW-8072](https://issues.apache.org/jira/browse/ARROW-8072) - [C++][Plasma] Add const constraint when parsing data
-* [ARROW-8077](https://issues.apache.org/jira/browse/ARROW-8077) - [Python] Add wheel build script and Crossbow configuration for Windows on Python 3.5
-* [ARROW-8079](https://issues.apache.org/jira/browse/ARROW-8079) - [Python] Implement a wrapper for KeyValueMetadata, duck-typing dict where relevant
-* [ARROW-8080](https://issues.apache.org/jira/browse/ARROW-8080) - [C++] Add AVX512 build option
-* [ARROW-8082](https://issues.apache.org/jira/browse/ARROW-8082) - [Java][Plasma] Add JNI list() interface
-* [ARROW-8083](https://issues.apache.org/jira/browse/ARROW-8083) - [GLib] Add support for Peek() to GIOInputStream
-* [ARROW-8086](https://issues.apache.org/jira/browse/ARROW-8086) - [Java] Support writing decimal from big endian byte array in UnionListWriter
-* [ARROW-8087](https://issues.apache.org/jira/browse/ARROW-8087) - [C++][Dataset] Order of keys with HivePartitioning is lost in resulting schema
-* [ARROW-8096](https://issues.apache.org/jira/browse/ARROW-8096) - [C++][Gandiva] Create null node of Interval type
-* [ARROW-8097](https://issues.apache.org/jira/browse/ARROW-8097) - [Dev] Comment bot's crossbow command acts on the master branch
-* [ARROW-8103](https://issues.apache.org/jira/browse/ARROW-8103) - [R] Make default Linux build more minimal
-* [ARROW-8104](https://issues.apache.org/jira/browse/ARROW-8104) - [C++] Don't install bundled Thrift
-* [ARROW-8107](https://issues.apache.org/jira/browse/ARROW-8107) - [Packaging][APT] Use HTTPS for LLVM APT repository for Debian GNU/Linux stretch
-* [ARROW-8109](https://issues.apache.org/jira/browse/ARROW-8109) - [Packaging][APT] Drop support for Ubuntu Disco
-* [ARROW-8117](https://issues.apache.org/jira/browse/ARROW-8117) - [Rust] [DataFusion] Allow CAST from number to timestamp
-* [ARROW-8118](https://issues.apache.org/jira/browse/ARROW-8118) - [R] dim method for FileSystemDataset
-* [ARROW-8120](https://issues.apache.org/jira/browse/ARROW-8120) - [Packaging][APT] Add support for Ubuntu Focal
-* [ARROW-8123](https://issues.apache.org/jira/browse/ARROW-8123) - [Rust] [DataFusion] Create LogicalPlanBuilder
-* [ARROW-8124](https://issues.apache.org/jira/browse/ARROW-8124) - [Rust] Update library dependencies
-* [ARROW-8126](https://issues.apache.org/jira/browse/ARROW-8126) - [C++][Compute] Add Top-K kernel benchmark
-* [ARROW-8129](https://issues.apache.org/jira/browse/ARROW-8129) - [C++][Compute] Refine compare sorting kernel
-* [ARROW-8130](https://issues.apache.org/jira/browse/ARROW-8130) - [C++][Gandiva] Fix Dex visitor in llvm\_generator to handle interval type
-* [ARROW-8140](https://issues.apache.org/jira/browse/ARROW-8140) - [Developer] Follow NullType -\> NullField change
-* [ARROW-8141](https://issues.apache.org/jira/browse/ARROW-8141) - [C++] Optimize BM\_PlainDecodingBoolean performance using AVX512 Intrinsics API
-* [ARROW-8145](https://issues.apache.org/jira/browse/ARROW-8145) - [C++] Rename GetTargetInfos
-* [ARROW-8146](https://issues.apache.org/jira/browse/ARROW-8146) - [C++] Add per-filesystem facility to sanitize a path
-* [ARROW-8150](https://issues.apache.org/jira/browse/ARROW-8150) - [Rust] Allow writing custom FileMetaData k/v pairs
-* [ARROW-8151](https://issues.apache.org/jira/browse/ARROW-8151) - [Benchmarking][Dataset] Benchmark Parquet read performance with S3File
-* [ARROW-8153](https://issues.apache.org/jira/browse/ARROW-8153) - [Packaging] Update the conda feedstock files and upload artifacts to Anaconda
-* [ARROW-8158](https://issues.apache.org/jira/browse/ARROW-8158) - [Java] Getting length of data buffer and base variable width vector
-* [ARROW-8164](https://issues.apache.org/jira/browse/ARROW-8164) - [C++][Dataset] Let datasets be viewable with non-identical schema
-* [ARROW-8165](https://issues.apache.org/jira/browse/ARROW-8165) - [Packaging] Make nightly wheels available on a PyPI server
-* [ARROW-8167](https://issues.apache.org/jira/browse/ARROW-8167) - [CI] Add support for skipping builds with skip pattern in pull request title
-* [ARROW-8168](https://issues.apache.org/jira/browse/ARROW-8168) - [Java][Plasma] Improve Java Plasma client off-heap memory usage
-* [ARROW-8177](https://issues.apache.org/jira/browse/ARROW-8177) - [Rust] Make schema\_to\_fb\_offset public
-* [ARROW-8178](https://issues.apache.org/jira/browse/ARROW-8178) - [C++] Upgrade to Flatbuffers 1.12
-* [ARROW-8179](https://issues.apache.org/jira/browse/ARROW-8179) - [R] Windows build script tweaking for nightly packaging on GHA
-* [ARROW-8181](https://issues.apache.org/jira/browse/ARROW-8181) - [Java][FlightRPC] Expose transport error metadata
-* [ARROW-8182](https://issues.apache.org/jira/browse/ARROW-8182) - [Packaging] Increment the version number detected from the latest git tag
-* [ARROW-8183](https://issues.apache.org/jira/browse/ARROW-8183) - [C++][FlightRPC] Expose transport error metadata
-* [ARROW-8184](https://issues.apache.org/jira/browse/ARROW-8184) - [Packaging] Use arrow-nightlies organization name on Anaconda and Gemfury to host the nightlies
-* [ARROW-8185](https://issues.apache.org/jira/browse/ARROW-8185) - [Packaging] Document the available nightly wheels and conda packages
-* [ARROW-8187](https://issues.apache.org/jira/browse/ARROW-8187) - [R] Make test assertions robust to i18n
-* [ARROW-8191](https://issues.apache.org/jira/browse/ARROW-8191) - [Packaging][APT] Fix cmake removal in Debian GNU/Linux Stretch
-* [ARROW-8192](https://issues.apache.org/jira/browse/ARROW-8192) - [C++] Script for unpacking AVX512 intrinsics code
-* [ARROW-8194](https://issues.apache.org/jira/browse/ARROW-8194) - [CI] Github Actions Windows job should run tests in parallel
-* [ARROW-8195](https://issues.apache.org/jira/browse/ARROW-8195) - [CI] Remove Boost download step in Github Actions
-* [ARROW-8198](https://issues.apache.org/jira/browse/ARROW-8198) - [C++] Diffing should handle null arrays
-* [ARROW-8200](https://issues.apache.org/jira/browse/ARROW-8200) - [GLib] Rename garrow\_file\_system\_target\_info{,s}() to ...\_file\_info{,s}()
-* [ARROW-8203](https://issues.apache.org/jira/browse/ARROW-8203) - [C\#] "dotnet pack" fails
-* [ARROW-8204](https://issues.apache.org/jira/browse/ARROW-8204) - [Rust] [DataFusion] Add support for aliased expressions in SQL
-* [ARROW-8207](https://issues.apache.org/jira/browse/ARROW-8207) - [Packaging][wheel] Use LLVM 8 in manylinux2010 and manylinux2014
-* [ARROW-8215](https://issues.apache.org/jira/browse/ARROW-8215) - [CI][GLib] Meson install fails in the macOS build
-* [ARROW-8218](https://issues.apache.org/jira/browse/ARROW-8218) - [C++] Parallelize decompression at field level in experimental IPC compression code
-* [ARROW-8220](https://issues.apache.org/jira/browse/ARROW-8220) - [Python] Make dataset FileFormat objects serializable
-* [ARROW-8222](https://issues.apache.org/jira/browse/ARROW-8222) - [C++] Use bcp to make a slim boost for bundled build
-* [ARROW-8224](https://issues.apache.org/jira/browse/ARROW-8224) - [C++] Remove APIs deprecated prior to 0.16.0
-* [ARROW-8225](https://issues.apache.org/jira/browse/ARROW-8225) - [Rust] IPC reader must respect continuation markers
-* [ARROW-8227](https://issues.apache.org/jira/browse/ARROW-8227) - [C++] Refine SIMD feature definitions
-* [ARROW-8231](https://issues.apache.org/jira/browse/ARROW-8231) - [Rust] Parse key\_value\_metadata from parquet FileMetaData into arrow schema metadata
-* [ARROW-8232](https://issues.apache.org/jira/browse/ARROW-8232) - [Python] Deprecate pa.open\_file and pa.open\_stream in favor of pa.ipc.open\_file/open\_stream
-* [ARROW-8235](https://issues.apache.org/jira/browse/ARROW-8235) - [C++][Compute] Filter out nulls by default
-* [ARROW-8241](https://issues.apache.org/jira/browse/ARROW-8241) - [Rust] Add convenience methods to Schema
-* [ARROW-8242](https://issues.apache.org/jira/browse/ARROW-8242) - [C++] Flight fails to compile on GCC 4.8
-* [ARROW-8243](https://issues.apache.org/jira/browse/ARROW-8243) - [Rust] [DataFusion] Fix inconsistent API in LogicalPlanBuilder
-* [ARROW-8244](https://issues.apache.org/jira/browse/ARROW-8244) - [Python][Parquet] Add \`write\_to\_dataset\` option to populate the "file\_path" metadata fields
-* [ARROW-8246](https://issues.apache.org/jira/browse/ARROW-8246) - [C++] Add -Wa,-mbig-obj when compiling with MinGW to avoid linking errors
-* [ARROW-8247](https://issues.apache.org/jira/browse/ARROW-8247) - [Python] Expose Parquet writing "engine" setting in pyarrow.parquet.write\_table
-* [ARROW-8249](https://issues.apache.org/jira/browse/ARROW-8249) - [Rust] [DataFusion] Make Table and LogicalPlanBuilder APIs more consistent
-* [ARROW-8252](https://issues.apache.org/jira/browse/ARROW-8252) - [CI][Ruby] Add Ubuntu 20.04
-* [ARROW-8256](https://issues.apache.org/jira/browse/ARROW-8256) - [Rust] [DataFusion] Update CLI documentation for 0.17.0 release
-* [ARROW-8264](https://issues.apache.org/jira/browse/ARROW-8264) - [Rust] [DataFusion] Create utility for printing record batches
-* [ARROW-8266](https://issues.apache.org/jira/browse/ARROW-8266) - [C++] Add backup mirrors for external project source downloads
-* [ARROW-8267](https://issues.apache.org/jira/browse/ARROW-8267) - [CI][GLib] Failed to build on Ubuntu 16.04
-* [ARROW-8271](https://issues.apache.org/jira/browse/ARROW-8271) - [Packaging] Allow wheel upload failures to gemfury
-* [ARROW-8275](https://issues.apache.org/jira/browse/ARROW-8275) - [Python][Docs] Review Feather + IPC file documentation per "Feather V2" changes
-* [ARROW-8277](https://issues.apache.org/jira/browse/ARROW-8277) - [Python] RecordBatch interface improvements
-* [ARROW-8279](https://issues.apache.org/jira/browse/ARROW-8279) - [C++] Do not export symbols from Codec implementations, remove need for PIMPL pattern
-* [ARROW-8288](https://issues.apache.org/jira/browse/ARROW-8288) - [Python] Expose with\_ modifiers on DataType
-* [ARROW-8290](https://issues.apache.org/jira/browse/ARROW-8290) - [Python][Dataset] Improve ergonomy of the FileSystemDataset constructor
-* [ARROW-8291](https://issues.apache.org/jira/browse/ARROW-8291) - [Packaging] Conda nightly builds can't locate Numpy
-* [ARROW-8292](https://issues.apache.org/jira/browse/ARROW-8292) - [Python][Dataset] Passthrough schema to Factory.finish() in dataset() function
-* [ARROW-8294](https://issues.apache.org/jira/browse/ARROW-8294) - [Format][Flight] Add DoExchange RPC to Flight protocol
-* [ARROW-8295](https://issues.apache.org/jira/browse/ARROW-8295) - [C++][Dataset] IpcFileFormat should explicitly push down column projection
-* [ARROW-8299](https://issues.apache.org/jira/browse/ARROW-8299) - [C++] Reusable "optional ParallelFor" function for optional use of multithreading
-* [ARROW-8300](https://issues.apache.org/jira/browse/ARROW-8300) - [R] Documentation and changelog updates for 0.17
-* [ARROW-8307](https://issues.apache.org/jira/browse/ARROW-8307) - [Python] Expose use\_memory\_map option in pyarrow.feather APIs
-* [ARROW-8308](https://issues.apache.org/jira/browse/ARROW-8308) - [Rust] [Flight] Implement DoExchange on examples
-* [ARROW-8309](https://issues.apache.org/jira/browse/ARROW-8309) - [CI] C++/Java/Rust workflows should trigger on changes to Flight.proto
-* [ARROW-8311](https://issues.apache.org/jira/browse/ARROW-8311) - [C++] Add push style stream format reader
-* [ARROW-8316](https://issues.apache.org/jira/browse/ARROW-8316) - [CI] Set docker-compose to use docker-cli instead of docker-py for building images
-* [ARROW-8319](https://issues.apache.org/jira/browse/ARROW-8319) - [CI] Install thrift compiler in the debian build
-* [ARROW-8320](https://issues.apache.org/jira/browse/ARROW-8320) - [Documentation][Format] Clarify (lack of) alignment requirements in C data interface
-* [ARROW-8321](https://issues.apache.org/jira/browse/ARROW-8321) - [CI] Use bundled thrift in Fedora 30 build
-* [ARROW-8322](https://issues.apache.org/jira/browse/ARROW-8322) - [CI] Fix C\# workflow file syntax
-* [ARROW-8325](https://issues.apache.org/jira/browse/ARROW-8325) - [R][CI] Stop including boost in R windows bundle
-* [ARROW-8329](https://issues.apache.org/jira/browse/ARROW-8329) - [Documentation][C++] Undocumented FilterOptions argument in Filter kernel
-* [ARROW-8330](https://issues.apache.org/jira/browse/ARROW-8330) - [Documentation] The post release script generates the documentation with a development version
-* [ARROW-8332](https://issues.apache.org/jira/browse/ARROW-8332) - [C++] Require Thrift compiler to use system libthrift for Parquet build
-* [ARROW-8335](https://issues.apache.org/jira/browse/ARROW-8335) - [Release] Add crossbow jobs to run release verification
-* [ARROW-8336](https://issues.apache.org/jira/browse/ARROW-8336) - [Packaging][deb] Use libthrift-dev on Debian 10 and Ubuntu 19.10 or later
-* [ARROW-8341](https://issues.apache.org/jira/browse/ARROW-8341) - [Packaging][deb] Build fails due to insufficient disk space
-* [ARROW-8343](https://issues.apache.org/jira/browse/ARROW-8343) - [GLib] Add GArrowRecordBatchIterator
-* [ARROW-8347](https://issues.apache.org/jira/browse/ARROW-8347) - [C++] Add Result<T\> APIs to Array methods
-* [ARROW-8351](https://issues.apache.org/jira/browse/ARROW-8351) - [R][CI] Store the Rtools-built Arrow C++ library as a build artifact
-* [ARROW-8352](https://issues.apache.org/jira/browse/ARROW-8352) - [R] Add install\_pyarrow()
-* [ARROW-8356](https://issues.apache.org/jira/browse/ARROW-8356) - [Developer] Support \* wildcards with "crossbow submit" via GitHub actions
-* [ARROW-8361](https://issues.apache.org/jira/browse/ARROW-8361) - [C++] Add Result<T\> APIs to Buffer methods and functions
-* [ARROW-8362](https://issues.apache.org/jira/browse/ARROW-8362) - [Crossbow] Ensure that the locally generated version is used in the docker tasks
-* [ARROW-8367](https://issues.apache.org/jira/browse/ARROW-8367) - [C++] Deprecate Buffer::FromString(..., pool)
-* [ARROW-8368](https://issues.apache.org/jira/browse/ARROW-8368) - [Format] In C interface, clarify resource management for consumers needing only a subset of child fields in ArrowArray
-* [ARROW-8370](https://issues.apache.org/jira/browse/ARROW-8370) - [C++] Add Result<T\> to type / schema APIs
-* [ARROW-8371](https://issues.apache.org/jira/browse/ARROW-8371) - [Crossbow] Implement and exercise sanity checks for tasks.yml
-* [ARROW-8372](https://issues.apache.org/jira/browse/ARROW-8372) - [C++] Add Result<T\> to table / record batch APIs
-* [ARROW-8375](https://issues.apache.org/jira/browse/ARROW-8375) - [CI][R] Make Windows tests more verbose in case of segfault
-* [ARROW-8376](https://issues.apache.org/jira/browse/ARROW-8376) - [R] Add experimental interface to ScanTask/RecordBatch iterators
-* [ARROW-8387](https://issues.apache.org/jira/browse/ARROW-8387) - [Rust] Make schema\_to\_fb public
-* [ARROW-8389](https://issues.apache.org/jira/browse/ARROW-8389) - [Integration] Run tests in parallel
-* [ARROW-8390](https://issues.apache.org/jira/browse/ARROW-8390) - [R] Expose schema unification features
-* [ARROW-8393](https://issues.apache.org/jira/browse/ARROW-8393) - [C++][Gandiva] Make gandiva function registry case-insensitive
-* [ARROW-8396](https://issues.apache.org/jira/browse/ARROW-8396) - [Rust] Remove libc from dependencies
-* [ARROW-8398](https://issues.apache.org/jira/browse/ARROW-8398) - [Python] Remove deprecation warnings originating from python tests
-* [ARROW-8401](https://issues.apache.org/jira/browse/ARROW-8401) - [C++] Add AVX2/AVX512 version of ByteStreamSplitDecode/ByteStreamSplitEncode
-* [ARROW-8403](https://issues.apache.org/jira/browse/ARROW-8403) - [C++] Add ToString() to ChunkedArray, Table and RecordBatch
-* [ARROW-8407](https://issues.apache.org/jira/browse/ARROW-8407) - [Rust] Add rustdoc for Dictionary type
-* [ARROW-8408](https://issues.apache.org/jira/browse/ARROW-8408) - [Python] Add memory\_map= toggle to pyarrow.feather.read\_feather
-* [ARROW-8409](https://issues.apache.org/jira/browse/ARROW-8409) - [R] Add arrow::cpu\_count, arrow::set\_cpu\_count wrapper functions a la Python
-* [ARROW-8412](https://issues.apache.org/jira/browse/ARROW-8412) - [C++][Gandiva] Fix gandiva date\_diff function definitions
-* [ARROW-8433](https://issues.apache.org/jira/browse/ARROW-8433) - [R] Add feather alias for ipc format in dataset API
-* [ARROW-8444](https://issues.apache.org/jira/browse/ARROW-8444) - [Documentation] Fix spelling errors across the codebase
-* [ARROW-8449](https://issues.apache.org/jira/browse/ARROW-8449) - [R] Use CMAKE\_UNITY\_BUILD everywhere
-* [ARROW-8450](https://issues.apache.org/jira/browse/ARROW-8450) - [Integration][C++] Implement large list/binary/utf8 integration
-* [ARROW-8457](https://issues.apache.org/jira/browse/ARROW-8457) - [C++] bridge test does not handle endianness
-* [ARROW-8458](https://issues.apache.org/jira/browse/ARROW-8458) - [C++] Prefer the original mirrors for the bundled thirdparty dependencies
-* [ARROW-8461](https://issues.apache.org/jira/browse/ARROW-8461) - [Packaging][deb] Use zstd package for Ubuntu Xenial
-* [ARROW-8463](https://issues.apache.org/jira/browse/ARROW-8463) - [CI] Balance the nightly test builds between CircleCI, Azure and Github
-* [ARROW-8679](https://issues.apache.org/jira/browse/ARROW-8679) - [Python] Support pandas sparse series in pyarrow
-* [PARQUET-458](https://issues.apache.org/jira/browse/PARQUET-458) - [C++] Implement support for DataPageV2
-* [PARQUET-1663](https://issues.apache.org/jira/browse/PARQUET-1663) - [C++] Provide API to check the presence of complex data types
-* [PARQUET-1716](https://issues.apache.org/jira/browse/PARQUET-1716) - [C++] Add support for BYTE\_STREAM\_SPLIT encoding
-* [PARQUET-1770](https://issues.apache.org/jira/browse/PARQUET-1770) - [C++][CI] Add fuzz target for reading Parquet files
-* [PARQUET-1785](https://issues.apache.org/jira/browse/PARQUET-1785) - [C++] Improve code reusability in encoding-test.cc
-* [PARQUET-1786](https://issues.apache.org/jira/browse/PARQUET-1786) - [C++] Use simd to improve BYTE\_STREAM\_SPLIT decoding performance
-* [PARQUET-1806](https://issues.apache.org/jira/browse/PARQUET-1806) - [C++] [CI] Improve fuzzing seed corpus
-* [PARQUET-1825](https://issues.apache.org/jira/browse/PARQUET-1825) - [C++] Fix compilation error in column\_io\_benchmark.cc
-* [PARQUET-1828](https://issues.apache.org/jira/browse/PARQUET-1828) - [C++] Add a SSE2 path for the ByteStreamSplit encoder implementation
-* [PARQUET-1840](https://issues.apache.org/jira/browse/PARQUET-1840) - [C++] DecodeSpaced copies more values than necessary
-
-
-
-# Apache Arrow 0.16.0 (2020-02-07)
-
-## Bug Fixes
-
-* [ARROW-3783](https://issues.apache.org/jira/browse/ARROW-3783) - [R] Incorrect collection of float type
-* [ARROW-3962](https://issues.apache.org/jira/browse/ARROW-3962) - [Go] Support null values while reading a CSV file.
-* [ARROW-4470](https://issues.apache.org/jira/browse/ARROW-4470) - [Python] Pyarrow using considerable more memory when reading partitioned Parquet file
-* [ARROW-4998](https://issues.apache.org/jira/browse/ARROW-4998) - [R] R package fails to install on OSX
-* [ARROW-5575](https://issues.apache.org/jira/browse/ARROW-5575) - [C++] arrowConfig.cmake includes uninstalled targets
-* [ARROW-5655](https://issues.apache.org/jira/browse/ARROW-5655) - [Python] Table.from\_pydict/from\_arrays not using types in specified schema correctly
-* [ARROW-5680](https://issues.apache.org/jira/browse/ARROW-5680) - [Rust] [DataFusion] Group-by tests depend on result set order
-* [ARROW-6157](https://issues.apache.org/jira/browse/ARROW-6157) - [Python][C++] UnionArray with invalid data passes validation / leads to segfaults
-* [ARROW-6195](https://issues.apache.org/jira/browse/ARROW-6195) - [C++] CMake fails with file not found error while bundling thrift if python is not installed
-* [ARROW-6298](https://issues.apache.org/jira/browse/ARROW-6298) - [Rust] [CI] Examples are not being tested in CI
-* [ARROW-6320](https://issues.apache.org/jira/browse/ARROW-6320) - [C++] Arrow utilities are linked statically
-* [ARROW-6429](https://issues.apache.org/jira/browse/ARROW-6429) - [CI][Crossbow] Nightly spark integration job fails
-* [ARROW-6445](https://issues.apache.org/jira/browse/ARROW-6445) - [CI][Crossbow] Nightly Gandiva jar trusty job fails
-* [ARROW-6567](https://issues.apache.org/jira/browse/ARROW-6567) - [Rust] [DataFusion] SQL aggregate query execution assumes grouping expressions precede aggregate expressions
-* [ARROW-6581](https://issues.apache.org/jira/browse/ARROW-6581) - [C++] Fix fuzzit job submission
-* [ARROW-6704](https://issues.apache.org/jira/browse/ARROW-6704) - [C++] Cast from timestamp to higher resolution does not check out of bounds timestamps
-* [ARROW-6708](https://issues.apache.org/jira/browse/ARROW-6708) - [C++] "cannot find -lboost\_filesystem\_static"
-* [ARROW-6728](https://issues.apache.org/jira/browse/ARROW-6728) - [C\#] Support reading and writing Date32 and Date64 arrays
-* [ARROW-6736](https://issues.apache.org/jira/browse/ARROW-6736) - [Rust] [DataFusion] Aggregate expressions get evaluated repeatedly
-* [ARROW-6740](https://issues.apache.org/jira/browse/ARROW-6740) - [Python] Unable to delete closed MemoryMappedFile on Windows
-* [ARROW-6745](https://issues.apache.org/jira/browse/ARROW-6745) - [Rust] Fix a variety of typos
-* [ARROW-6749](https://issues.apache.org/jira/browse/ARROW-6749) - [Python] Conversion of non-ns timestamp array to numpy gives wrong values
-* [ARROW-6750](https://issues.apache.org/jira/browse/ARROW-6750) - [Python] Silence S3 error logs by default
-* [ARROW-6761](https://issues.apache.org/jira/browse/ARROW-6761) - [Rust] Travis CI builds not respecting rust-toolchain
-* [ARROW-6762](https://issues.apache.org/jira/browse/ARROW-6762) - [C++] JSON reader segfaults on newline
-* [ARROW-6785](https://issues.apache.org/jira/browse/ARROW-6785) - [JS] Remove superfluous child assignment
-* [ARROW-6786](https://issues.apache.org/jira/browse/ARROW-6786) - [C++] arrow-dataset-file-parquet-test is slow
-* [ARROW-6795](https://issues.apache.org/jira/browse/ARROW-6795) - [C\#] Reading large Arrow files in C\# results in an exception
-* [ARROW-6798](https://issues.apache.org/jira/browse/ARROW-6798) - [CI] [Rust] Improve build times by caching dependencies in the Docker image
-* [ARROW-6801](https://issues.apache.org/jira/browse/ARROW-6801) - [Rust] Arrow source release tarball is missing benchmarks
-* [ARROW-6806](https://issues.apache.org/jira/browse/ARROW-6806) - [C++] Segfault deserializing ListArray containing null/empty list
-* [ARROW-6808](https://issues.apache.org/jira/browse/ARROW-6808) - [Ruby] Ensure requiring suitable MSYS2 package
-* [ARROW-6809](https://issues.apache.org/jira/browse/ARROW-6809) - [Ruby] Gem does not install on macOS due to glib2 3.3.7 compilation failure
-* [ARROW-6812](https://issues.apache.org/jira/browse/ARROW-6812) - [Java] Remove Dremio Corp. from License Header
-* [ARROW-6813](https://issues.apache.org/jira/browse/ARROW-6813) - [Ruby] Arrow::Table.load with headers=true leads to exception in Arrow 0.15
-* [ARROW-6820](https://issues.apache.org/jira/browse/ARROW-6820) - [C++] [Doc] [Format] Map specification and implementation inconsistent
-* [ARROW-6834](https://issues.apache.org/jira/browse/ARROW-6834) - [C++] Pin gtest to 1.8.1 to triage failing Appveyor / MSVC build
-* [ARROW-6835](https://issues.apache.org/jira/browse/ARROW-6835) - [Archery][CMake] Restore ARROW\_LINT\_ONLY
-* [ARROW-6842](https://issues.apache.org/jira/browse/ARROW-6842) - [Website] Jekyll error building website
-* [ARROW-6844](https://issues.apache.org/jira/browse/ARROW-6844) - [C++][Parquet][Python] List<scalar type\> columns read broken with 0.15.0
-* [ARROW-6846](https://issues.apache.org/jira/browse/ARROW-6846) - [C++] Build failures with glog enabled
-* [ARROW-6857](https://issues.apache.org/jira/browse/ARROW-6857) - [Python][C++] Segfault for dictionary\_encode on empty chunked\_array (edge case)
-* [ARROW-6859](https://issues.apache.org/jira/browse/ARROW-6859) - [CI][Nightly] Disable docker layer caching for CircleCI tasks
-* [ARROW-6860](https://issues.apache.org/jira/browse/ARROW-6860) - [Python] Only link libarrow\_flight.so to pyarrow.\_flight
-* [ARROW-6861](https://issues.apache.org/jira/browse/ARROW-6861) - [Python] arrow-0.15.0 reading arrow-0.14.1-output Parquet dictionary column: Failure reading column: IOError: Arrow error: Invalid: Resize cannot downsize
-* [ARROW-6864](https://issues.apache.org/jira/browse/ARROW-6864) - [C++] bz2 / zstd tests not enabled
-* [ARROW-6867](https://issues.apache.org/jira/browse/ARROW-6867) - [FlightRPC][Java] Flight server can hang JVM on shutdown
-* [ARROW-6868](https://issues.apache.org/jira/browse/ARROW-6868) - [Go] slicing Struct array does not slice child fields
-* [ARROW-6869](https://issues.apache.org/jira/browse/ARROW-6869) - [C++] Dictionary "delta" building logic in builder\_dict.h produces invalid arrays
-* [ARROW-6873](https://issues.apache.org/jira/browse/ARROW-6873) - [Python] Stale CColumn reference breaks Cython cimport of pyarrow
-* [ARROW-6874](https://issues.apache.org/jira/browse/ARROW-6874) - [Python] Memory leak in Table.to\_pandas() when converting to object dtype
-* [ARROW-6876](https://issues.apache.org/jira/browse/ARROW-6876) - [Python] Reading parquet file with many columns becomes slow for 0.15.0
-* [ARROW-6877](https://issues.apache.org/jira/browse/ARROW-6877) - [C++] Boost not found from the correct environment
-* [ARROW-6878](https://issues.apache.org/jira/browse/ARROW-6878) - [Python] pa.array() does not handle list of dicts with bytes keys correctly under python3
-* [ARROW-6882](https://issues.apache.org/jira/browse/ARROW-6882) - [Python] cannot create a chunked\_array from dictionary\_encoding result
-* [ARROW-6885](https://issues.apache.org/jira/browse/ARROW-6885) - [Python] Remove superfluous skipped timedelta test
-* [ARROW-6886](https://issues.apache.org/jira/browse/ARROW-6886) - [C++] arrow::io header nvcc compiler warnings
-* [ARROW-6898](https://issues.apache.org/jira/browse/ARROW-6898) - [Java] Fix potential memory leak in ArrowWriter and several test classes
-* [ARROW-6899](https://issues.apache.org/jira/browse/ARROW-6899) - [Python] to\_pandas() not implemented on list<dictionary<values=string, indices=int32\>\>
-* [ARROW-6901](https://issues.apache.org/jira/browse/ARROW-6901) - [Rust][Parquet] SerializedFileWriter writes total\_num\_rows as zero
-* [ARROW-6903](https://issues.apache.org/jira/browse/ARROW-6903) - [Python] Wheels broken after ARROW-6860 changes
-* [ARROW-6905](https://issues.apache.org/jira/browse/ARROW-6905) - [Packaging][OSX] Nightly builds on macOS are failing because of brew compile timeouts
-* [ARROW-6910](https://issues.apache.org/jira/browse/ARROW-6910) - [Python] pyarrow.parquet.read\_table(...) takes up lots of memory which is not released until program exits
-* [ARROW-6913](https://issues.apache.org/jira/browse/ARROW-6913) - [R] Potential bug in compute.cc
-* [ARROW-6914](https://issues.apache.org/jira/browse/ARROW-6914) - [CI] docker-clang-format nightly failing
-* [ARROW-6922](https://issues.apache.org/jira/browse/ARROW-6922) - [Python] Pandas master build is failing (MultiIndex.levels change)
-* [ARROW-6925](https://issues.apache.org/jira/browse/ARROW-6925) - [C++] Arrow fails to build on macOS 10.13.6 using brew gcc 7 and 8
-* [ARROW-6929](https://issues.apache.org/jira/browse/ARROW-6929) - [C++] ValidateArray is out of sync with the ListArray IPC specification
-* [ARROW-6937](https://issues.apache.org/jira/browse/ARROW-6937) - [Packaging][Python] Fix conda linux and OSX wheel nightly builds
-* [ARROW-6938](https://issues.apache.org/jira/browse/ARROW-6938) - [Python] Windows wheel depends on zstd.dll and libbz2.dll, which are not bundled
-* [ARROW-6948](https://issues.apache.org/jira/browse/ARROW-6948) - [Rust] [Parquet] Fix bool array support in arrow reader.
-* [ARROW-6950](https://issues.apache.org/jira/browse/ARROW-6950) - [C++][Dataset] Add example/benchmark for reading parquet files with dataset
-* [ARROW-6957](https://issues.apache.org/jira/browse/ARROW-6957) - [CI][Crossbow] Nightly R with sanitizers build fails installing dependencies
-* [ARROW-6962](https://issues.apache.org/jira/browse/ARROW-6962) - [C++] [CI] Stop compiling with -Weverything
-* [ARROW-6966](https://issues.apache.org/jira/browse/ARROW-6966) - [Go] 32bit memset is null
-* [ARROW-6977](https://issues.apache.org/jira/browse/ARROW-6977) - [C++] Only enable jemalloc background\_thread if feature is supported
-* [ARROW-6983](https://issues.apache.org/jira/browse/ARROW-6983) - [C++] Threaded task group crashes sometimes
-* [ARROW-6989](https://issues.apache.org/jira/browse/ARROW-6989) - [Python][C++] Assert is triggered when decimal type inference occurs on a value with out of range precision
-* [ARROW-6992](https://issues.apache.org/jira/browse/ARROW-6992) - [C++]: Undefined Behavior sanitizer build option fails with GCC
-* [ARROW-6999](https://issues.apache.org/jira/browse/ARROW-6999) - [Python] KeyError: '\_\_index\_level\_0\_\_' passing Table.from\_pandas its own schema
-* [ARROW-7013](https://issues.apache.org/jira/browse/ARROW-7013) - [C++] arrow-dataset pkgconfig is incomplete
-* [ARROW-7020](https://issues.apache.org/jira/browse/ARROW-7020) - [Java] Fix the bugs when calculating vector hash code
-* [ARROW-7021](https://issues.apache.org/jira/browse/ARROW-7021) - [Java] UnionFixedSizeListWriter decimal type should check writer index
-* [ARROW-7022](https://issues.apache.org/jira/browse/ARROW-7022) - [Python] \_\_arrow\_array\_\_ does not work for ExtensionTypes in Table.from\_pandas
-* [ARROW-7023](https://issues.apache.org/jira/browse/ARROW-7023) - [Python] pa.array does not use "from\_pandas" semantics for pd.Index
-* [ARROW-7024](https://issues.apache.org/jira/browse/ARROW-7024) - [CI][R] Update R dependencies for Conda build
-* [ARROW-7027](https://issues.apache.org/jira/browse/ARROW-7027) - [Python] pa.table(..) returns instead of raising an error when passed an invalid object
-* [ARROW-7033](https://issues.apache.org/jira/browse/ARROW-7033) - [C++] Error in ./configure step for jemalloc when building on OSX 10.14.6
-* [ARROW-7045](https://issues.apache.org/jira/browse/ARROW-7045) - [R] Factor type not preserved in Parquet roundtrip
-* [ARROW-7050](https://issues.apache.org/jira/browse/ARROW-7050) - [R] Fix compiler warnings in R bindings
-* [ARROW-7053](https://issues.apache.org/jira/browse/ARROW-7053) - [Python] setuptools-scm produces incorrect version at apache-arrow-0.15.1 tag
-* [ARROW-7056](https://issues.apache.org/jira/browse/ARROW-7056) - [Python] Test errors without S3
-* [ARROW-7059](https://issues.apache.org/jira/browse/ARROW-7059) - [Python] Reading parquet file with many columns is much slower in 0.15.x versus 0.14.x
-* [ARROW-7074](https://issues.apache.org/jira/browse/ARROW-7074) - [C++] ASSERT\_OK\_AND\_ASSIGN crashes when failing
-* [ARROW-7077](https://issues.apache.org/jira/browse/ARROW-7077) - [C++] Unsupported Dict-\>T cast crashes instead of returning error
-* [ARROW-7087](https://issues.apache.org/jira/browse/ARROW-7087) - [Python] Table Metadata disappear when we write a partitioned dataset
-* [ARROW-7097](https://issues.apache.org/jira/browse/ARROW-7097) - [Rust][CI] Builds failing due to rust nightly formatting
-* [ARROW-7100](https://issues.apache.org/jira/browse/ARROW-7100) - [C++] libjvm.so not found on Ubuntu 19.04 with openjdk-11
-* [ARROW-7105](https://issues.apache.org/jira/browse/ARROW-7105) - [CI][Crossbow] Nightly homebrew-cpp job fails
-* [ARROW-7106](https://issues.apache.org/jira/browse/ARROW-7106) - [Java] Fix the problem that flight perf test hangs endlessly
-* [ARROW-7117](https://issues.apache.org/jira/browse/ARROW-7117) - [C++][CI] Fix the hanging C++ tests in Windows 2019
-* [ARROW-7128](https://issues.apache.org/jira/browse/ARROW-7128) - [CI] Fedora cron jobs are failing because of wrong fedora version
-* [ARROW-7133](https://issues.apache.org/jira/browse/ARROW-7133) - [CI] Allow GH Actions to run on all branches
-* [ARROW-7142](https://issues.apache.org/jira/browse/ARROW-7142) - [C++] Compile error with GCC 5.4.0
-* [ARROW-7152](https://issues.apache.org/jira/browse/ARROW-7152) - [Java] Delete useless class DiffFunction
-* [ARROW-7157](https://issues.apache.org/jira/browse/ARROW-7157) - [R] Add validation, helpful error message to Object$new()
-* [ARROW-7158](https://issues.apache.org/jira/browse/ARROW-7158) - [C++][Visual Studio] Build config error on non-English versions of Visual Studio
-* [ARROW-7163](https://issues.apache.org/jira/browse/ARROW-7163) - [Doc] Fix double-and typos
-* [ARROW-7164](https://issues.apache.org/jira/browse/ARROW-7164) - [CI] Dev cron github action is failing every 15 minutes
-* [ARROW-7167](https://issues.apache.org/jira/browse/ARROW-7167) - [CI][Python] Add nightly tests for older pandas versions to Github Actions
-* [ARROW-7168](https://issues.apache.org/jira/browse/ARROW-7168) - [Python] pa.array() doesn't respect specified dictionary type
-* [ARROW-7170](https://issues.apache.org/jira/browse/ARROW-7170) - [C++] Bundled ORC fails linking
-* [ARROW-7180](https://issues.apache.org/jira/browse/ARROW-7180) - [CI] Java builds are not triggered on the master branch
-* [ARROW-7181](https://issues.apache.org/jira/browse/ARROW-7181) - [Python][Nightly] Wheel builds could NOT find ArrowPython
-* [ARROW-7183](https://issues.apache.org/jira/browse/ARROW-7183) - [CI][Crossbow] Re-skip r-sanitizer nightly tests
-* [ARROW-7187](https://issues.apache.org/jira/browse/ARROW-7187) - [C++][Doc] doxygen broken on master because of @
-* [ARROW-7188](https://issues.apache.org/jira/browse/ARROW-7188) - [C++][Doc] doxygen broken on master: missing param implicit\_casts
-* [ARROW-7189](https://issues.apache.org/jira/browse/ARROW-7189) - [CI][Crossbow] Nightly conda osx builds fail
-* [ARROW-7194](https://issues.apache.org/jira/browse/ARROW-7194) - [Rust] CSV Writer causing recursion errors
-* [ARROW-7199](https://issues.apache.org/jira/browse/ARROW-7199) - [Java] ConcurrentModificationException in BaseAllocator::getChildAllocators
-* [ARROW-7200](https://issues.apache.org/jira/browse/ARROW-7200) - [C++][Flight] Running Arrow Flight benchmark on two hosts doesn't work
-* [ARROW-7209](https://issues.apache.org/jira/browse/ARROW-7209) - [Python] Tests with pandas master are failing now that \_\_from\_arrow\_\_ support landed in pandas
-* [ARROW-7212](https://issues.apache.org/jira/browse/ARROW-7212) - [Go] "go test -bench=8192 -run=. ./math" fails
-* [ARROW-7214](https://issues.apache.org/jira/browse/ARROW-7214) - [Python] unpickling a pyarrow table with dictionary fields crashes
-* [ARROW-7217](https://issues.apache.org/jira/browse/ARROW-7217) - [CI][Python] Use correct Python version in GitHub Actions
-* [ARROW-7225](https://issues.apache.org/jira/browse/ARROW-7225) - [C++] \`\*std::move(Result<T\>)\` calls T copy constructor
-* [ARROW-7249](https://issues.apache.org/jira/browse/ARROW-7249) - [CI] Release test fails in master due to new arrow-flight Rust crate
-* [ARROW-7250](https://issues.apache.org/jira/browse/ARROW-7250) - [C++] Undefined symbols for StringToFloatConverter::Impl with clang 4.x
-* [ARROW-7253](https://issues.apache.org/jira/browse/ARROW-7253) - [CI] Fix master failure with release test
-* [ARROW-7254](https://issues.apache.org/jira/browse/ARROW-7254) - [Java] BaseVariableWidthVector\#setSafe appears to make value offsets inconsistent
-* [ARROW-7264](https://issues.apache.org/jira/browse/ARROW-7264) - [Java] RangeEqualsVisitor type check is not correct
-* [ARROW-7266](https://issues.apache.org/jira/browse/ARROW-7266) - [Python] dictionary\_encode() of a slice gives wrong result
-* [ARROW-7271](https://issues.apache.org/jira/browse/ARROW-7271) - [C++][Flight] Use the single parameter version of SetTotalBytesLimit
-* [ARROW-7281](https://issues.apache.org/jira/browse/ARROW-7281) - [C++] AdaptiveIntBuilder::length() does not consider pending\_pos\_.
-* [ARROW-7282](https://issues.apache.org/jira/browse/ARROW-7282) - [Python] IO functions should raise FileNotFoundError when appropriate
-* [ARROW-7291](https://issues.apache.org/jira/browse/ARROW-7291) - [Dev] Fix FORMAT\_DIR in update-flatbuffers.sh
-* [ARROW-7294](https://issues.apache.org/jira/browse/ARROW-7294) - [Python] converted\_type\_name\_from\_enum(): Incorrect name for INT\_64
-* [ARROW-7295](https://issues.apache.org/jira/browse/ARROW-7295) - [R] Fix bad test that causes failure on R < 3.5
-* [ARROW-7298](https://issues.apache.org/jira/browse/ARROW-7298) - [C++] cpp/thirdparty/download-dependencies.sh is broken
-* [ARROW-7314](https://issues.apache.org/jira/browse/ARROW-7314) - [Python] Compiler warning in pyarrow
-* [ARROW-7318](https://issues.apache.org/jira/browse/ARROW-7318) - [C\#] TimestampArray serialization failure
-* [ARROW-7320](https://issues.apache.org/jira/browse/ARROW-7320) - [C++] Target arrow-type-benchmark failed to be built on bullx Linux
-* [ARROW-7327](https://issues.apache.org/jira/browse/ARROW-7327) - [CI] Failing C GLib and R buildbot builders
-* [ARROW-7328](https://issues.apache.org/jira/browse/ARROW-7328) - [CI] GitHub Actions should trigger on changes to GitHub Actions configuration
-* [ARROW-7341](https://issues.apache.org/jira/browse/ARROW-7341) - [CI] Unbreak nightly Conda R job
-* [ARROW-7343](https://issues.apache.org/jira/browse/ARROW-7343) - [Java] Memory leak in Flight DoGet when client cancels
-* [ARROW-7349](https://issues.apache.org/jira/browse/ARROW-7349) - [C++] Fix bug in parsing hex string values
-* [ARROW-7353](https://issues.apache.org/jira/browse/ARROW-7353) - [C++] Disable -Wmissing-braces when building with clang
-* [ARROW-7354](https://issues.apache.org/jira/browse/ARROW-7354) - [C++] TestHadoopFileSystem::ThreadSafety fails with sigabort
-* [ARROW-7355](https://issues.apache.org/jira/browse/ARROW-7355) - [CI] Environment variables are defined twice for the fuzzit builds
-* [ARROW-7358](https://issues.apache.org/jira/browse/ARROW-7358) - [CI] [Dev] [C++] ccache disabled on conda-python-hdfs
-* [ARROW-7359](https://issues.apache.org/jira/browse/ARROW-7359) - [C++][Gandiva] Don't throw error for locate function with start position exceeding string length, return 0 instead
-* [ARROW-7360](https://issues.apache.org/jira/browse/ARROW-7360) - [R] Can't use dplyr filter() with variables defined in parent scope
-* [ARROW-7361](https://issues.apache.org/jira/browse/ARROW-7361) - [Rust] Build directory is not passed to ci/scripts/rust\_test.sh
-* [ARROW-7362](https://issues.apache.org/jira/browse/ARROW-7362) - [Python] ListArray.flatten() should take care of slicing offsets
-* [ARROW-7374](https://issues.apache.org/jira/browse/ARROW-7374) - [Dev] [C++] cuda-cpp docker image fails compiling Arrow
-* [ARROW-7381](https://issues.apache.org/jira/browse/ARROW-7381) - [C++][Packaging] Iterator change broke manylinux1 wheels
-* [ARROW-7386](https://issues.apache.org/jira/browse/ARROW-7386) - [C\#] Array offset does not work properly
-* [ARROW-7388](https://issues.apache.org/jira/browse/ARROW-7388) - [Python] Skip HDFS tests if libhdfs cannot be located
-* [ARROW-7389](https://issues.apache.org/jira/browse/ARROW-7389) - [Python][Packaging] Remove pyarrow.s3fs import check from the recipe
-* [ARROW-7393](https://issues.apache.org/jira/browse/ARROW-7393) - [Plasma] Fix plasma executable name in build for Java
-* [ARROW-7395](https://issues.apache.org/jira/browse/ARROW-7395) - [C++] Logical "or" with constants is a Clang warning
-* [ARROW-7397](https://issues.apache.org/jira/browse/ARROW-7397) - [C++] JSON whitespace length detection error
-* [ARROW-7404](https://issues.apache.org/jira/browse/ARROW-7404) - [C++][Gandiva] Fix utf8 char length error on Arm64
-* [ARROW-7406](https://issues.apache.org/jira/browse/ARROW-7406) - [Java] NonNullableStructVector\#hashCode should pass hasher to child vectors
-* [ARROW-7407](https://issues.apache.org/jira/browse/ARROW-7407) - [Python] Failed to install pyarrow 0.15.1 on Python 3.8
-* [ARROW-7408](https://issues.apache.org/jira/browse/ARROW-7408) - [C++] Reference benchmarks fail compiling
-* [ARROW-7435](https://issues.apache.org/jira/browse/ARROW-7435) - Security issue: ValidateOffsets() does not prevent buffer over-read
-* [ARROW-7436](https://issues.apache.org/jira/browse/ARROW-7436) - [Archery] Fix benchmark default configuration
-* [ARROW-7437](https://issues.apache.org/jira/browse/ARROW-7437) - [Java] ReadChannel\#readFully does not set writer index correctly
-* [ARROW-7442](https://issues.apache.org/jira/browse/ARROW-7442) - [Ruby] Specifying column type as time causes segmentation fault
-* [ARROW-7447](https://issues.apache.org/jira/browse/ARROW-7447) - [Java] ComplexCopier does incorrect copy in some cases
-* [ARROW-7450](https://issues.apache.org/jira/browse/ARROW-7450) - [CI][C++] test-ubuntu-18.04-cpp-static failing with linking error in arrow-io-hdfs-test
-* [ARROW-7458](https://issues.apache.org/jira/browse/ARROW-7458) - [GLib] incorrect build dependency in Makefile
-* [ARROW-7471](https://issues.apache.org/jira/browse/ARROW-7471) - [Python] Cython flake8 failures
-* [ARROW-7472](https://issues.apache.org/jira/browse/ARROW-7472) - [Java] Fix some incorrect behavior in UnionListWriter
-* [ARROW-7478](https://issues.apache.org/jira/browse/ARROW-7478) - [Rust] [DataFusion] Group by expression ignored unless paired with aggregate expression
-* [ARROW-7492](https://issues.apache.org/jira/browse/ARROW-7492) - [CI][Crossbow] Nightly homebrew-cpp job fails on Python installation
-* [ARROW-7497](https://issues.apache.org/jira/browse/ARROW-7497) - [Python] Test asserts: pandas.util.testing is deprecated, use pandas.testing instead
-* [ARROW-7500](https://issues.apache.org/jira/browse/ARROW-7500) - [C++][Dataset] regex\_error in hive partition on centos7 and opensuse42
-* [ARROW-7503](https://issues.apache.org/jira/browse/ARROW-7503) - [Rust] Rust builds are failing on master
-* [ARROW-7506](https://issues.apache.org/jira/browse/ARROW-7506) - [Java] JMH benchmarks should be called from main methods
-* [ARROW-7508](https://issues.apache.org/jira/browse/ARROW-7508) - [C\#] DateTime32 Reading is Broken
-* [ARROW-7510](https://issues.apache.org/jira/browse/ARROW-7510) - [C++] Array::null\_count() is not thread-compatible
-* [ARROW-7516](https://issues.apache.org/jira/browse/ARROW-7516) - [C\#] .NET Benchmarks are broken
-* [ARROW-7518](https://issues.apache.org/jira/browse/ARROW-7518) - [Python] Use PYARROW\_WITH\_HDFS when building wheels, conda packages
-* [ARROW-7527](https://issues.apache.org/jira/browse/ARROW-7527) - [Python] pandas/feather tests failing on pandas master
-* [ARROW-7528](https://issues.apache.org/jira/browse/ARROW-7528) - [Python] The pandas.datetime class (import of datetime.datetime) and pandas.np are deprecated
-* [ARROW-7535](https://issues.apache.org/jira/browse/ARROW-7535) - [C++] ASAN failure in validation
-* [ARROW-7543](https://issues.apache.org/jira/browse/ARROW-7543) - [R] arrow::write\_parquet() code examples do not work
-* [ARROW-7545](https://issues.apache.org/jira/browse/ARROW-7545) - [C++] [Dataset] Scanning dataset with dictionary type hangs
-* [ARROW-7551](https://issues.apache.org/jira/browse/ARROW-7551) - [FlightRPC][C++] Flight test on macOS fails due to Homebrew gRPC
-* [ARROW-7552](https://issues.apache.org/jira/browse/ARROW-7552) - [C++] TestSlowInputStream is flaky
-* [ARROW-7554](https://issues.apache.org/jira/browse/ARROW-7554) - [C++] Unknown CMake command "externalproject\_add".
-* [ARROW-7559](https://issues.apache.org/jira/browse/ARROW-7559) - [Rust] Possibly incorrect index check assertion in StringArray and BinaryArray
-* [ARROW-7561](https://issues.apache.org/jira/browse/ARROW-7561) - [Doc][Python] fix conda environment command
-* [ARROW-7563](https://issues.apache.org/jira/browse/ARROW-7563) - [Rust] failed to select a version for \`byteorder\`
-* [ARROW-7582](https://issues.apache.org/jira/browse/ARROW-7582) - [Rust][Flight] Unable to compile arrow.flight.protocol.rs
-* [ARROW-7583](https://issues.apache.org/jira/browse/ARROW-7583) - [C++][Flight] Auth handler tests fragile on Windows
-* [ARROW-7591](https://issues.apache.org/jira/browse/ARROW-7591) - [Python] DictionaryArray.to\_numpy returns dict of parts instead of numpy array
-* [ARROW-7592](https://issues.apache.org/jira/browse/ARROW-7592) - [C++] Fix crashes on corrupt IPC input
-* [ARROW-7593](https://issues.apache.org/jira/browse/ARROW-7593) - [CI][Python] Python datasets failing on master / not run on CI
-* [ARROW-7595](https://issues.apache.org/jira/browse/ARROW-7595) - [R][CI] R appveyor job fails due to pacman compression change
-* [ARROW-7596](https://issues.apache.org/jira/browse/ARROW-7596) - [Python] Only apply zero-copy DataFrame block optimizations when split\_blocks=True
-* [ARROW-7599](https://issues.apache.org/jira/browse/ARROW-7599) - [Java] Fix build break due to change in RangeEqualsVisitor
-* [ARROW-7603](https://issues.apache.org/jira/browse/ARROW-7603) - [CI][Crossbow] Nightly centos 8 job fails
-* [ARROW-7611](https://issues.apache.org/jira/browse/ARROW-7611) - [Packaging][Python] Artifacts patterns for wheel are wrong
-* [ARROW-7612](https://issues.apache.org/jira/browse/ARROW-7612) - [Packaging][Python] Artifact paths for Conda on Windows are wrong
-* [ARROW-7614](https://issues.apache.org/jira/browse/ARROW-7614) - [Python] Slow performance in test\_parquet.py::test\_set\_data\_page\_size
-* [ARROW-7618](https://issues.apache.org/jira/browse/ARROW-7618) - [C++] Fix crashes or undefined behaviour on corrupt IPC input
-* [ARROW-7620](https://issues.apache.org/jira/browse/ARROW-7620) - [Rust] Windows builds failing due to flatbuffer compile error
-* [ARROW-7621](https://issues.apache.org/jira/browse/ARROW-7621) - [Doc] Doc build fails
-* [ARROW-7634](https://issues.apache.org/jira/browse/ARROW-7634) - [Python] Dataset tests failing on Windows when parsing file paths
-* [ARROW-7638](https://issues.apache.org/jira/browse/ARROW-7638) - [Python] Segfault when inspecting dataset.Source with invalid file/partitioning
-* [ARROW-7639](https://issues.apache.org/jira/browse/ARROW-7639) - [R] Cannot convert Dictionary Array to R when values aren't strings
-* [ARROW-7640](https://issues.apache.org/jira/browse/ARROW-7640) - [C++][Dataset] segfault when reading compressed Parquet files if build didn't include support for codec
-* [ARROW-7647](https://issues.apache.org/jira/browse/ARROW-7647) - [C++] JSON reader fails to read arrays with few values
-* [ARROW-7650](https://issues.apache.org/jira/browse/ARROW-7650) - [C++] Dataset tests not built on Windows
-* [ARROW-7651](https://issues.apache.org/jira/browse/ARROW-7651) - [CI][Crossbow] Nightly macOS wheel builds fail
-* [ARROW-7652](https://issues.apache.org/jira/browse/ARROW-7652) - [Python][Dataset] Insert implicit cast in ScannerBuilder.filter
-* [ARROW-7661](https://issues.apache.org/jira/browse/ARROW-7661) - [Python] Non-optimal CSV chunking when no newline at end
-* [ARROW-7689](https://issues.apache.org/jira/browse/ARROW-7689) - [C++] Sporadic Flight test crash on macOS
-* [ARROW-7690](https://issues.apache.org/jira/browse/ARROW-7690) - [R] Cannot write parquet to OutputStream
-* [ARROW-7693](https://issues.apache.org/jira/browse/ARROW-7693) - [CI] Fix test-conda-python-3.7-spark-master nightly errors
-* [ARROW-7709](https://issues.apache.org/jira/browse/ARROW-7709) - [Python] Conversion from Table Column to Pandas loses name for Timestamps
-* [ARROW-7714](https://issues.apache.org/jira/browse/ARROW-7714) - [Release] Variable expansion is missing
-* [ARROW-7718](https://issues.apache.org/jira/browse/ARROW-7718) - [Release] Fix auto-retry in the binary release script
-* [ARROW-7723](https://issues.apache.org/jira/browse/ARROW-7723) - [Python] to\_pandas conversion error for StructArray with timezone-aware timestamp type
-* [ARROW-7727](https://issues.apache.org/jira/browse/ARROW-7727) - [Python] Unable to read a ParquetDataset when schema validation is on.
-* [ARROW-8135](https://issues.apache.org/jira/browse/ARROW-8135) - [Python] Problem importing PyArrow on a cluster
-* [ARROW-8638](https://issues.apache.org/jira/browse/ARROW-8638) - [Python] Arrow Cython API usage gives an error when calling CTable API endpoints
-* [PARQUET-1692](https://issues.apache.org/jira/browse/PARQUET-1692) - [C++] LogicalType::FromThrift error on Centos 7 RPM
-* [PARQUET-1693](https://issues.apache.org/jira/browse/PARQUET-1693) - [C++] Build examples don't account for CMAKE compression feature flags
-* [PARQUET-1702](https://issues.apache.org/jira/browse/PARQUET-1702) - [C++] Make BufferedRowGroupWriter compatible with parquet encryption
-* [PARQUET-1706](https://issues.apache.org/jira/browse/PARQUET-1706) - [C++] Wrong dictionary\_page\_offset when writing only data pages via BufferedPageWriter
-* [PARQUET-1707](https://issues.apache.org/jira/browse/PARQUET-1707) - [C++] parquet-arrow-test fails with undefined behaviour sanitizer
-* [PARQUET-1709](https://issues.apache.org/jira/browse/PARQUET-1709) - [C++] Avoid unnecessary temporary std::shared\_ptr copies
-* [PARQUET-1715](https://issues.apache.org/jira/browse/PARQUET-1715) - [C++] Add the Parquet code samples to CI + Refactor Parquet Encryption Samples
-* [PARQUET-1720](https://issues.apache.org/jira/browse/PARQUET-1720) - [C++] Parquet JSONPrint not showing version correctly
-* [PARQUET-1747](https://issues.apache.org/jira/browse/PARQUET-1747) - [C++] Access to ColumnChunkMetaData fails when encryption is on
-* [PARQUET-1766](https://issues.apache.org/jira/browse/PARQUET-1766) - [C++] Parquet NaN/null double statistics can result in an endless loop
-* [PARQUET-1772](https://issues.apache.org/jira/browse/PARQUET-1772) - [C++] ParquetFileWriter: Data overwritten when output stream opened in append mode
-
-
-## New Features and Improvements
-
-* [ARROW-412](https://issues.apache.org/jira/browse/ARROW-412) - [Format] Handling of buffer padding in the IPC metadata
-* [ARROW-501](https://issues.apache.org/jira/browse/ARROW-501) - [C++] Implement concurrent / buffering InputStream for streaming data use cases
-* [ARROW-772](https://issues.apache.org/jira/browse/ARROW-772) - [C++] Implement take kernel functions
-* [ARROW-843](https://issues.apache.org/jira/browse/ARROW-843) - [C++] Implement Schema unification, merging unequal but equivalent schemas
-* [ARROW-976](https://issues.apache.org/jira/browse/ARROW-976) - [C++][Python] Provide API for defining and reading Parquet datasets with more ad hoc partition schemes
-* [ARROW-1036](https://issues.apache.org/jira/browse/ARROW-1036) - [C++] Define abstract API for filtering Arrow streams (e.g. predicate evaluation)
-* [ARROW-1119](https://issues.apache.org/jira/browse/ARROW-1119) - [Python/C++] Implement NativeFile interfaces for Amazon S3
-* [ARROW-1175](https://issues.apache.org/jira/browse/ARROW-1175) - [Java] Implement/test dictionary-encoded subfields
-* [ARROW-1456](https://issues.apache.org/jira/browse/ARROW-1456) - [Python] Run s3fs unit tests in Travis CI
-* [ARROW-1562](https://issues.apache.org/jira/browse/ARROW-1562) - [C++] Numeric kernel implementations for add (+)
-* [ARROW-1638](https://issues.apache.org/jira/browse/ARROW-1638) - [Java] IPC roundtrip for null type
-* [ARROW-1900](https://issues.apache.org/jira/browse/ARROW-1900) - [C++] Add kernel functions for determining value range (maximum and minimum) of integer arrays
-* [ARROW-2428](https://issues.apache.org/jira/browse/ARROW-2428) - [Python] Add API to map Arrow types (including extension types) to pandas ExtensionArray instances for to\_pandas conversions
-* [ARROW-2602](https://issues.apache.org/jira/browse/ARROW-2602) - [Packaging] Automate build of development docker containers
-* [ARROW-2863](https://issues.apache.org/jira/browse/ARROW-2863) - [Python] Add context manager APIs to RecordBatch\*Writer/Reader classes
-* [ARROW-3085](https://issues.apache.org/jira/browse/ARROW-3085) - [Rust] Add an adapter for parquet.
-* [ARROW-3408](https://issues.apache.org/jira/browse/ARROW-3408) - [C++] Add option to CSV reader to dictionary encode individual columns or all string / binary columns
-* [ARROW-3444](https://issues.apache.org/jira/browse/ARROW-3444) - [Python] Table.nbytes attribute
-* [ARROW-3706](https://issues.apache.org/jira/browse/ARROW-3706) - [Rust] Add record batch reader trait.
-* [ARROW-3789](https://issues.apache.org/jira/browse/ARROW-3789) - [Python] Enable calling object in Table.to\_pandas to "self-destruct" for improved memory use
-* [ARROW-3808](https://issues.apache.org/jira/browse/ARROW-3808) - [R] Implement [.arrow::Array
-* [ARROW-3813](https://issues.apache.org/jira/browse/ARROW-3813) - [R] Lower-level construction of Dictionary Arrays
-* [ARROW-4059](https://issues.apache.org/jira/browse/ARROW-4059) - [Rust] Parquet/Arrow Integration
-* [ARROW-4091](https://issues.apache.org/jira/browse/ARROW-4091) - [C++] Curate default list of CSV null spellings
-* [ARROW-4208](https://issues.apache.org/jira/browse/ARROW-4208) - [CI][Python] Add automated tests for S3
-* [ARROW-4219](https://issues.apache.org/jira/browse/ARROW-4219) - [Rust] [Parquet] Implement ArrowReader
-* [ARROW-4223](https://issues.apache.org/jira/browse/ARROW-4223) - [Python] Support scipy.sparse integration
-* [ARROW-4224](https://issues.apache.org/jira/browse/ARROW-4224) - [Python] Support integration with pydata/sparse library
-* [ARROW-4225](https://issues.apache.org/jira/browse/ARROW-4225) - [Format][C++] Add CSC sparse matrix support
-* [ARROW-4722](https://issues.apache.org/jira/browse/ARROW-4722) - [C++] Implement Bitmap class to modularize handling of bitmaps
-* [ARROW-4748](https://issues.apache.org/jira/browse/ARROW-4748) - [Rust] [DataFusion] GROUP BY performance could be optimized
-* [ARROW-4930](https://issues.apache.org/jira/browse/ARROW-4930) - [Python] Remove LIBDIR assumptions in Python build
-* [ARROW-5180](https://issues.apache.org/jira/browse/ARROW-5180) - [Rust] IPC Support
-* [ARROW-5181](https://issues.apache.org/jira/browse/ARROW-5181) - [Rust] Create Arrow File reader
-* [ARROW-5182](https://issues.apache.org/jira/browse/ARROW-5182) - [Rust] Create Arrow File writer
-* [ARROW-5227](https://issues.apache.org/jira/browse/ARROW-5227) - [Rust] [DataFusion] Re-implement query execution with an extensible physical query plan
-* [ARROW-5277](https://issues.apache.org/jira/browse/ARROW-5277) - [C\#] MemoryAllocator.Allocate(length: 0) should not return null
-* [ARROW-5333](https://issues.apache.org/jira/browse/ARROW-5333) - [C++] Fit build option summary into narrower console
-* [ARROW-5366](https://issues.apache.org/jira/browse/ARROW-5366) - [Rust] Implement Duration and Interval Arrays
-* [ARROW-5400](https://issues.apache.org/jira/browse/ARROW-5400) - [Rust] Test/ensure that reader and writer support zero-length record batches
-* [ARROW-5445](https://issues.apache.org/jira/browse/ARROW-5445) - [Website] Remove language that encourages pinning a version
-* [ARROW-5454](https://issues.apache.org/jira/browse/ARROW-5454) - [C++] Implement Take on ChunkedArray for DataFrame use
-* [ARROW-5502](https://issues.apache.org/jira/browse/ARROW-5502) - [R] File readers should mmap
-* [ARROW-5508](https://issues.apache.org/jira/browse/ARROW-5508) - [C++] Create reusable Iterator<T\> interface
-* [ARROW-5523](https://issues.apache.org/jira/browse/ARROW-5523) - [Python] [Packaging] Use HTTPS consistently for downloading dependencies
-* [ARROW-5712](https://issues.apache.org/jira/browse/ARROW-5712) - [C++][Parquet] Arrow time32/time64/timestamp ConvertedType not being restored properly
-* [ARROW-5767](https://issues.apache.org/jira/browse/ARROW-5767) - [Format] Permit dictionary replacements in IPC protocol
-* [ARROW-5801](https://issues.apache.org/jira/browse/ARROW-5801) - [CI] Dockerize (add to docker-compose) all Travis CI Linux tasks
-* [ARROW-5802](https://issues.apache.org/jira/browse/ARROW-5802) - [CI] Dockerize "lint" Travis CI job
-* [ARROW-5804](https://issues.apache.org/jira/browse/ARROW-5804) - [C++] Dockerize C++ CI job with conda-forge toolchain, code coverage from Travis CI
-* [ARROW-5805](https://issues.apache.org/jira/browse/ARROW-5805) - [Python] Dockerize (add to docker-compose) Python Travis CI job
-* [ARROW-5806](https://issues.apache.org/jira/browse/ARROW-5806) - [CI] Dockerize (add to docker-compose) Integration tests Travis CI entry
-* [ARROW-5807](https://issues.apache.org/jira/browse/ARROW-5807) - [JS] Dockerize NodeJS Travis CI entry
-* [ARROW-5808](https://issues.apache.org/jira/browse/ARROW-5808) - [GLib][Ruby] Dockerize (add to docker-compose) current GLib + Ruby Travis CI entry
-* [ARROW-5809](https://issues.apache.org/jira/browse/ARROW-5809) - [Rust] Dockerize (add to docker-compose) Rust Travis CI build
-* [ARROW-5810](https://issues.apache.org/jira/browse/ARROW-5810) - [Go] Dockerize Travis CI Go build
-* [ARROW-5831](https://issues.apache.org/jira/browse/ARROW-5831) - [Release] Migrate and improve binary release verification script
-* [ARROW-5839](https://issues.apache.org/jira/browse/ARROW-5839) - [Python] Test manylinux2010 in CI
-* [ARROW-5855](https://issues.apache.org/jira/browse/ARROW-5855) - [Python] Add support for Duration type
-* [ARROW-5859](https://issues.apache.org/jira/browse/ARROW-5859) - [Python] Support ExtensionType on conversion to numpy/pandas
-* [ARROW-5971](https://issues.apache.org/jira/browse/ARROW-5971) - [Website] Blog post introducing Arrow Flight
-* [ARROW-5994](https://issues.apache.org/jira/browse/ARROW-5994) - [CI] [Rust] Create nightly releases of the Rust implementation
-* [ARROW-6003](https://issues.apache.org/jira/browse/ARROW-6003) - [C++] Better input validation and error messaging in CSV reader
-* [ARROW-6074](https://issues.apache.org/jira/browse/ARROW-6074) - [FlightRPC] Implement middleware
-* [ARROW-6091](https://issues.apache.org/jira/browse/ARROW-6091) - [Rust] [DataFusion] Implement parallel execution for limit
-* [ARROW-6109](https://issues.apache.org/jira/browse/ARROW-6109) - [Integration] Docker image for integration testing can't be built on Windows
-* [ARROW-6112](https://issues.apache.org/jira/browse/ARROW-6112) - [Java] Update APIs to support 64-bit address space
-* [ARROW-6184](https://issues.apache.org/jira/browse/ARROW-6184) - [Java] Provide hash table based dictionary encoder
-* [ARROW-6251](https://issues.apache.org/jira/browse/ARROW-6251) - [Developer] Add PR merge tool to apache/arrow-site
-* [ARROW-6257](https://issues.apache.org/jira/browse/ARROW-6257) - [C++] Add fnmatch compatible globbing function
-* [ARROW-6274](https://issues.apache.org/jira/browse/ARROW-6274) - [Rust] [DataFusion] Add support for writing results to CSV
-* [ARROW-6277](https://issues.apache.org/jira/browse/ARROW-6277) - [C++][Parquet] Support reading/writing other Parquet primitive types to DictionaryArray
-* [ARROW-6283](https://issues.apache.org/jira/browse/ARROW-6283) - [Rust] [DataFusion] Implement operator to write query results to partitioned CSV
-* [ARROW-6285](https://issues.apache.org/jira/browse/ARROW-6285) - [GLib] Add support for LargeBinary and LargeString types
-* [ARROW-6286](https://issues.apache.org/jira/browse/ARROW-6286) - [GLib] Add support for LargeList type
-* [ARROW-6299](https://issues.apache.org/jira/browse/ARROW-6299) - [C++] Simplify FileFormat classes to singletons
-* [ARROW-6321](https://issues.apache.org/jira/browse/ARROW-6321) - [Python] Ability to create ExtensionBlock on conversion to pandas
-* [ARROW-6340](https://issues.apache.org/jira/browse/ARROW-6340) - [R] Implements low-level bindings to Dataset classes
-* [ARROW-6341](https://issues.apache.org/jira/browse/ARROW-6341) - [Python] Implement low-level bindings for Dataset
-* [ARROW-6352](https://issues.apache.org/jira/browse/ARROW-6352) - [Java] Add implementation of DenseUnionVector.
-* [ARROW-6367](https://issues.apache.org/jira/browse/ARROW-6367) - [C++][Gandiva] Implement string reverse
-* [ARROW-6378](https://issues.apache.org/jira/browse/ARROW-6378) - [C++][Dataset] Implement TreeDataSource
-* [ARROW-6386](https://issues.apache.org/jira/browse/ARROW-6386) - [C++][Documentation] Explicit documentation of null slot interpretation
-* [ARROW-6394](https://issues.apache.org/jira/browse/ARROW-6394) - [Java] Support conversions between delta vector and partial sum vector
-* [ARROW-6396](https://issues.apache.org/jira/browse/ARROW-6396) - [C++] Add ResolveNullOptions to Logical kernels
-* [ARROW-6398](https://issues.apache.org/jira/browse/ARROW-6398) - [C++] Consolidate ScanOptions and ScanContext
-* [ARROW-6405](https://issues.apache.org/jira/browse/ARROW-6405) - [Python] Add std::move wrapper for use in Cython
-* [ARROW-6452](https://issues.apache.org/jira/browse/ARROW-6452) - [Java] Override ValueVector toString() method
-* [ARROW-6463](https://issues.apache.org/jira/browse/ARROW-6463) - [C++][Python] Rename arrow::fs::Selector to FileSelector
-* [ARROW-6466](https://issues.apache.org/jira/browse/ARROW-6466) - [Developer] Refactor integration/integration\_test.py into a proper Python package
-* [ARROW-6468](https://issues.apache.org/jira/browse/ARROW-6468) - [C++] Remove unused hashing routines
-* [ARROW-6473](https://issues.apache.org/jira/browse/ARROW-6473) - [Format] Clarify dictionary encoding edge cases
-* [ARROW-6503](https://issues.apache.org/jira/browse/ARROW-6503) - [C++] Add an argument of memory pool object to SparseTensorConverter
-* [ARROW-6508](https://issues.apache.org/jira/browse/ARROW-6508) - [C++] Add Tensor and SparseTensor factory function with validations
-* [ARROW-6515](https://issues.apache.org/jira/browse/ARROW-6515) - [C++] Clean type\_traits.h definitions
-* [ARROW-6578](https://issues.apache.org/jira/browse/ARROW-6578) - [C++] Casting int64 to string columns
-* [ARROW-6592](https://issues.apache.org/jira/browse/ARROW-6592) - [Java] Add support for skipping decoding of columns/fields in Avro converter
-* [ARROW-6594](https://issues.apache.org/jira/browse/ARROW-6594) - [Java] Support logical type encodings from Avro
-* [ARROW-6598](https://issues.apache.org/jira/browse/ARROW-6598) - [Java] Sort the code for ApproxEqualsVisitor
-* [ARROW-6608](https://issues.apache.org/jira/browse/ARROW-6608) - [C++] Make ARROW\_HDFS=OFF the default
-* [ARROW-6610](https://issues.apache.org/jira/browse/ARROW-6610) - [C++] Add ARROW\_FILESYSTEM=ON/OFF CMake configuration flag
-* [ARROW-6611](https://issues.apache.org/jira/browse/ARROW-6611) - [C++] Make ARROW\_JSON=OFF the default
-* [ARROW-6612](https://issues.apache.org/jira/browse/ARROW-6612) - [C++] Add ARROW\_CSV CMake build flag
-* [ARROW-6619](https://issues.apache.org/jira/browse/ARROW-6619) - [Ruby] Add support for building Gandiva::Expression by Arrow::Schema\#build\_expression
-* [ARROW-6624](https://issues.apache.org/jira/browse/ARROW-6624) - [C++] Add SparseTensor.ToTensor() method
-* [ARROW-6625](https://issues.apache.org/jira/browse/ARROW-6625) - [Python] Allow concat\_tables to null or default fill missing columns
-* [ARROW-6631](https://issues.apache.org/jira/browse/ARROW-6631) - [C++] Do not build with any compression library dependencies by default
-* [ARROW-6632](https://issues.apache.org/jira/browse/ARROW-6632) - [C++] Do not build with ARROW\_COMPUTE=on and ARROW\_DATASET=on by default
-* [ARROW-6633](https://issues.apache.org/jira/browse/ARROW-6633) - [C++] Do not require double-conversion for default build
-* [ARROW-6634](https://issues.apache.org/jira/browse/ARROW-6634) - [C++] Do not require flatbuffers or flatbuffers\_ep to build
-* [ARROW-6635](https://issues.apache.org/jira/browse/ARROW-6635) - [C++] Do not require glog for default build
-* [ARROW-6636](https://issues.apache.org/jira/browse/ARROW-6636) - [C++] Do not build C++ command line utilities by default
-* [ARROW-6637](https://issues.apache.org/jira/browse/ARROW-6637) - [C++] Zero-dependency default core build
-* [ARROW-6646](https://issues.apache.org/jira/browse/ARROW-6646) - [Go] Amend NullType IPC implementation to append no buffers in RecordBatch message
-* [ARROW-6650](https://issues.apache.org/jira/browse/ARROW-6650) - [Rust] [Integration] Create methods to test Arrow files against Integration JSON
-* [ARROW-6656](https://issues.apache.org/jira/browse/ARROW-6656) - [Rust] [DataFusion] Implement MIN and MAX aggregate expressions
-* [ARROW-6657](https://issues.apache.org/jira/browse/ARROW-6657) - [Rust] [DataFusion] Implement COUNT aggregate expression
-* [ARROW-6658](https://issues.apache.org/jira/browse/ARROW-6658) - [Rust] [DataFusion] Implement AVG aggregate expression
-* [ARROW-6659](https://issues.apache.org/jira/browse/ARROW-6659) - [Rust] [DataFusion] Refactor of HashAggregateExec to support custom merge
-* [ARROW-6662](https://issues.apache.org/jira/browse/ARROW-6662) - [Java] Implement equals/approxEquals API for VectorSchemaRoot
-* [ARROW-6671](https://issues.apache.org/jira/browse/ARROW-6671) - [C++] Sparse tensor naming
-* [ARROW-6672](https://issues.apache.org/jira/browse/ARROW-6672) - [Java] Extract a common interface for dictionary builders
-* [ARROW-6685](https://issues.apache.org/jira/browse/ARROW-6685) - [C++/Python] S3 FileStat object's base\_path and type depend on trailing slash
-* [ARROW-6686](https://issues.apache.org/jira/browse/ARROW-6686) - [CI] Pull and push docker images to speed up the nightly builds
-* [ARROW-6688](https://issues.apache.org/jira/browse/ARROW-6688) - [Packaging] Include s3 support in the conda packages
-* [ARROW-6690](https://issues.apache.org/jira/browse/ARROW-6690) - [Rust] [DataFusion] HashAggregate without GROUP BY should use SIMD
-* [ARROW-6692](https://issues.apache.org/jira/browse/ARROW-6692) - [Rust] [DataFusion] Update examples to use physical query plan
-* [ARROW-6693](https://issues.apache.org/jira/browse/ARROW-6693) - [Rust] [DataFusion] Update unit tests to use physical query plan
-* [ARROW-6694](https://issues.apache.org/jira/browse/ARROW-6694) - [Rust] [DataFusion] Update integration tests to use physical plan
-* [ARROW-6695](https://issues.apache.org/jira/browse/ARROW-6695) - [Rust] [DataFusion] Remove execution of logical plan
-* [ARROW-6696](https://issues.apache.org/jira/browse/ARROW-6696) - [Rust] [DataFusion] Implement simple math operations in physical query plan
-* [ARROW-6700](https://issues.apache.org/jira/browse/ARROW-6700) - [Rust] [DataFusion] Use new parquet arrow reader
-* [ARROW-6707](https://issues.apache.org/jira/browse/ARROW-6707) - [Java] Improve the performance of JDBC adapters by using nullable information
-* [ARROW-6710](https://issues.apache.org/jira/browse/ARROW-6710) - [Java] Add JDBC adapter test to cover cases which contains some null values
-* [ARROW-6711](https://issues.apache.org/jira/browse/ARROW-6711) - [C++] Consolidate Filter and Expression classes
-* [ARROW-6721](https://issues.apache.org/jira/browse/ARROW-6721) - [Java] Avro adapter benchmark only runs once in JMH
-* [ARROW-6722](https://issues.apache.org/jira/browse/ARROW-6722) - [Java] Provide a uniform way to get vector name
-* [ARROW-6729](https://issues.apache.org/jira/browse/ARROW-6729) - [C++] StlStringBuffer constructor is not zero-copy
-* [ARROW-6730](https://issues.apache.org/jira/browse/ARROW-6730) - [CI] Use GitHub Actions for "C++ with clang 7" docker image
-* [ARROW-6731](https://issues.apache.org/jira/browse/ARROW-6731) - [CI] [Rust] Set up Github Action to run Rust tests
-* [ARROW-6732](https://issues.apache.org/jira/browse/ARROW-6732) - [Java] Implement quick sort in a non-recursive way to avoid stack overflow
-* [ARROW-6741](https://issues.apache.org/jira/browse/ARROW-6741) - [Release] Update changelog.py to use APACHE\_ prefixed JIRA\_USERNAME and JIRA\_PASSWORD environment variables
-* [ARROW-6742](https://issues.apache.org/jira/browse/ARROW-6742) - [C++] Remove usage of boost::filesystem::path from arrow/io/hdfs\_internal.cc
-* [ARROW-6743](https://issues.apache.org/jira/browse/ARROW-6743) - [C++] Completely remove usage of boost::filesystem (except in hdfs\_internal)
-* [ARROW-6744](https://issues.apache.org/jira/browse/ARROW-6744) - [Rust] Export JsonEqual trait in the array module
-* [ARROW-6754](https://issues.apache.org/jira/browse/ARROW-6754) - [C++] Merge arrow/allocator.h and arrow/stl.h, or rename allocator.h
-* [ARROW-6758](https://issues.apache.org/jira/browse/ARROW-6758) - [Release] Install ephemeral node/npm/npx in release verification script
-* [ARROW-6764](https://issues.apache.org/jira/browse/ARROW-6764) - [C++] Add readahead iterator
-* [ARROW-6767](https://issues.apache.org/jira/browse/ARROW-6767) - [JS] Lazily bind batches in scan/scanReverse
-* [ARROW-6768](https://issues.apache.org/jira/browse/ARROW-6768) - [C++][Dataset] Implement dataset::Scan to Table helper function
-* [ARROW-6769](https://issues.apache.org/jira/browse/ARROW-6769) - [C++][Dataset] End to End dataset integration test case
-* [ARROW-6770](https://issues.apache.org/jira/browse/ARROW-6770) - [CI][Travis] Download Minio quietly
-* [ARROW-6777](https://issues.apache.org/jira/browse/ARROW-6777) - [GLib][CI] Unpin gobject-introspection gem
-* [ARROW-6778](https://issues.apache.org/jira/browse/ARROW-6778) - [C++] Support DurationType in Cast kernel
-* [ARROW-6782](https://issues.apache.org/jira/browse/ARROW-6782) - [C++] Build minimal core Arrow libraries without any Boost headers
-* [ARROW-6784](https://issues.apache.org/jira/browse/ARROW-6784) - [C++][R] Move filter and take code from Rcpp to C++ library
-* [ARROW-6787](https://issues.apache.org/jira/browse/ARROW-6787) - [CI] Decommission "C++ with clang 7 and system packages" Travis CI job
-* [ARROW-6788](https://issues.apache.org/jira/browse/ARROW-6788) - [CI] Migrate Travis CI lint job to GitHub Actions
-* [ARROW-6789](https://issues.apache.org/jira/browse/ARROW-6789) - [Python] Automatically box bytes/buffer-like values yielded from \`FlightServerBase.do\_action\` in Result values
-* [ARROW-6790](https://issues.apache.org/jira/browse/ARROW-6790) - [Release] Automatically disable integration test cases in release verification
-* [ARROW-6793](https://issues.apache.org/jira/browse/ARROW-6793) - [R] Arrow C++ binary packaging for Linux
-* [ARROW-6797](https://issues.apache.org/jira/browse/ARROW-6797) - [Release] Use a separately cloned arrow-site repository in the website post release script
-* [ARROW-6802](https://issues.apache.org/jira/browse/ARROW-6802) - [Packaging][deb][RPM] Update qemu-user-static package URL
-* [ARROW-6803](https://issues.apache.org/jira/browse/ARROW-6803) - [Rust] [DataFusion] Aggregate queries are slower with new physical query plan
-* [ARROW-6804](https://issues.apache.org/jira/browse/ARROW-6804) - [CI] [Rust] Migrate Travis Rust job to Github Actions
-* [ARROW-6807](https://issues.apache.org/jira/browse/ARROW-6807) - [Java][FlightRPC] Expose gRPC service
-* [ARROW-6810](https://issues.apache.org/jira/browse/ARROW-6810) - [Website] Add docs for R package 0.15 release
-* [ARROW-6811](https://issues.apache.org/jira/browse/ARROW-6811) - [R] Assorted post-0.15 release cleanups
-* [ARROW-6814](https://issues.apache.org/jira/browse/ARROW-6814) - [C++] Resolve compiler warnings occurred on release build
-* [ARROW-6822](https://issues.apache.org/jira/browse/ARROW-6822) - [Website] merge\_pr.py is published
-* [ARROW-6824](https://issues.apache.org/jira/browse/ARROW-6824) - [Plasma] Support batched create and seal requests for small objects
-* [ARROW-6825](https://issues.apache.org/jira/browse/ARROW-6825) - [C++] Rework CSV reader IO around readahead iterator
-* [ARROW-6831](https://issues.apache.org/jira/browse/ARROW-6831) - [R] Update R macOS/Windows builds for change in cmake compression defaults
-* [ARROW-6832](https://issues.apache.org/jira/browse/ARROW-6832) - [R] Implement Codec::IsAvailable
-* [ARROW-6833](https://issues.apache.org/jira/browse/ARROW-6833) - [R][CI] Add crossbow job for full R autobrew macOS build
-* [ARROW-6836](https://issues.apache.org/jira/browse/ARROW-6836) - [Format] add a custom\_metadata:[KeyValue] field to the Footer table in File.fbs
-* [ARROW-6843](https://issues.apache.org/jira/browse/ARROW-6843) - [Website] Disable deploy on pull request
-* [ARROW-6847](https://issues.apache.org/jira/browse/ARROW-6847) - [C++] Add a range\_expression interface to Iterator<\>
-* [ARROW-6850](https://issues.apache.org/jira/browse/ARROW-6850) - [Java] JDBC converter support for Null type
-* [ARROW-6852](https://issues.apache.org/jira/browse/ARROW-6852) - [C++] memory-benchmark build failed on Arm64
-* [ARROW-6853](https://issues.apache.org/jira/browse/ARROW-6853) - [Java] Support vector and dictionary encoder using different hashers for calculating hashCode
-* [ARROW-6855](https://issues.apache.org/jira/browse/ARROW-6855) - [C++][Python][Flight] Implement Flight middleware
-* [ARROW-6862](https://issues.apache.org/jira/browse/ARROW-6862) - [Developer] Check pull request title
-* [ARROW-6863](https://issues.apache.org/jira/browse/ARROW-6863) - [Java] Provide parallel searcher
-* [ARROW-6865](https://issues.apache.org/jira/browse/ARROW-6865) - [Java] Improve the performance of comparing an ArrowBuf against a byte array
-* [ARROW-6866](https://issues.apache.org/jira/browse/ARROW-6866) - [Java] Improve the performance of calculating hash code for struct vector
-* [ARROW-6879](https://issues.apache.org/jira/browse/ARROW-6879) - [Rust] Add explicit SIMD for sum kernel
-* [ARROW-6880](https://issues.apache.org/jira/browse/ARROW-6880) - [Rust] Add explicit SIMD for min/max kernel
-* [ARROW-6881](https://issues.apache.org/jira/browse/ARROW-6881) - [Rust] Remove "array\_ops" in favor of the "compute" sub-module
-* [ARROW-6884](https://issues.apache.org/jira/browse/ARROW-6884) - [Python][Flight] Make server-side RPC exceptions more friendly?
-* [ARROW-6887](https://issues.apache.org/jira/browse/ARROW-6887) - [Java] Create prose documentation for using ValueVectors
-* [ARROW-6888](https://issues.apache.org/jira/browse/ARROW-6888) - [Java] Support copy operation for vector value comparators
-* [ARROW-6889](https://issues.apache.org/jira/browse/ARROW-6889) - [Java] ComplexCopier: enable FixedSizeList type & fix RangeEqualsVisitor StackOverflow
-* [ARROW-6891](https://issues.apache.org/jira/browse/ARROW-6891) - [Rust] [Parquet] Add Utf8 support to ArrowReader
-* [ARROW-6902](https://issues.apache.org/jira/browse/ARROW-6902) - [C++] Add String\*/Binary\* support for Compare kernels
-* [ARROW-6904](https://issues.apache.org/jira/browse/ARROW-6904) - [Python] Implement MapArray and MapType
-* [ARROW-6907](https://issues.apache.org/jira/browse/ARROW-6907) - [C++][Plasma] Allow Plasma store to batch notifications to clients
-* [ARROW-6911](https://issues.apache.org/jira/browse/ARROW-6911) - [Java] Provide composite comparator
-* [ARROW-6912](https://issues.apache.org/jira/browse/ARROW-6912) - [Java] Extract a common base class for avro converter consumers
-* [ARROW-6916](https://issues.apache.org/jira/browse/ARROW-6916) - [Developer] Alphabetize task names in nightly Crossbow report
-* [ARROW-6918](https://issues.apache.org/jira/browse/ARROW-6918) - [R] Make docker-compose setup faster
-* [ARROW-6919](https://issues.apache.org/jira/browse/ARROW-6919) - [Python] Expose more builders in Cython
-* [ARROW-6920](https://issues.apache.org/jira/browse/ARROW-6920) - [Python] Create manylinux wheels for Python 3.8
-* [ARROW-6926](https://issues.apache.org/jira/browse/ARROW-6926) - [Python] Support \_\_sizeof\_\_ protocol for Python objects
-* [ARROW-6927](https://issues.apache.org/jira/browse/ARROW-6927) - [C++] Add gRPC version check
-* [ARROW-6928](https://issues.apache.org/jira/browse/ARROW-6928) - [Rust] Add FixedSizeList type
-* [ARROW-6930](https://issues.apache.org/jira/browse/ARROW-6930) - [Java] Create utility class for populating vector values used for test purposes only
-* [ARROW-6932](https://issues.apache.org/jira/browse/ARROW-6932) - [Java] Incorrect log on known extension type
-* [ARROW-6933](https://issues.apache.org/jira/browse/ARROW-6933) - [Java] Support linear dictionary encoder
-* [ARROW-6936](https://issues.apache.org/jira/browse/ARROW-6936) - [Python] Improve error message when object of wrong type is given
-* [ARROW-6942](https://issues.apache.org/jira/browse/ARROW-6942) - [Developer] Add support for Parquet in pull request check by GitHub Actions
-* [ARROW-6943](https://issues.apache.org/jira/browse/ARROW-6943) - [Website] Translate Apache Arrow Flight introduction to Japanese
-* [ARROW-6944](https://issues.apache.org/jira/browse/ARROW-6944) - [Rust] Add StringType
-* [ARROW-6949](https://issues.apache.org/jira/browse/ARROW-6949) - [Java] Fix promotable write to handle null vectors
-* [ARROW-6951](https://issues.apache.org/jira/browse/ARROW-6951) - [C++][Dataset] Ensure column projection is passed to ParquetDataFragment
-* [ARROW-6952](https://issues.apache.org/jira/browse/ARROW-6952) - [C++][Dataset] Ensure expression filter is passed to ParquetDataFragment
-* [ARROW-6954](https://issues.apache.org/jira/browse/ARROW-6954) - [Python] [CI] Add Python 3.8 to CI matrix
-* [ARROW-6960](https://issues.apache.org/jira/browse/ARROW-6960) - [R] Add support for more compression codecs in Windows build
-* [ARROW-6961](https://issues.apache.org/jira/browse/ARROW-6961) - [C++][Gandiva] Add lower\_utf8 function in Gandiva
-* [ARROW-6963](https://issues.apache.org/jira/browse/ARROW-6963) - [Packaging][Wheel][OSX] Use crossbow's command to deploy artifacts from travis builds
-* [ARROW-6964](https://issues.apache.org/jira/browse/ARROW-6964) - [C++][Dataset] Expose a nested parallel option for Scanner::ToTable
-* [ARROW-6965](https://issues.apache.org/jira/browse/ARROW-6965) - [C++][Dataset] Optionally expose partition keys as materialized columns
-* [ARROW-6967](https://issues.apache.org/jira/browse/ARROW-6967) - [C++] Add filter expressions for IN, IS\_VALID
-* [ARROW-6969](https://issues.apache.org/jira/browse/ARROW-6969) - [C++][Dataset] ParquetScanTask eagerly loads file
-* [ARROW-6970](https://issues.apache.org/jira/browse/ARROW-6970) - [Packaging][RPM] Add support for CentOS 8
-* [ARROW-6973](https://issues.apache.org/jira/browse/ARROW-6973) - [C++][ThreadPool] Use perfect forwarding in Submit
-* [ARROW-6975](https://issues.apache.org/jira/browse/ARROW-6975) - [C++] Put make\_unique in its own header
-* [ARROW-6980](https://issues.apache.org/jira/browse/ARROW-6980) - [R] dplyr backend for RecordBatch/Table
-* [ARROW-6984](https://issues.apache.org/jira/browse/ARROW-6984) - [C++] Update LZ4 to 1.9.2 for CVE-2019-17543
-* [ARROW-6986](https://issues.apache.org/jira/browse/ARROW-6986) - [R] Add basic Expression class
-* [ARROW-6987](https://issues.apache.org/jira/browse/ARROW-6987) - [CI] Travis OSX failing to install sdk headers
-* [ARROW-6991](https://issues.apache.org/jira/browse/ARROW-6991) - [Packaging][deb] Add support for Ubuntu 19.10
-* [ARROW-6994](https://issues.apache.org/jira/browse/ARROW-6994) - [C++] Research jemalloc memory page reclamation configuration on macOS when background\_thread option is unavailable
-* [ARROW-6997](https://issues.apache.org/jira/browse/ARROW-6997) - [Packaging] Add support for RHEL
-* [ARROW-7000](https://issues.apache.org/jira/browse/ARROW-7000) - [C++][Gandiva] Handle empty inputs in string lower, upper functions
-* [ARROW-7003](https://issues.apache.org/jira/browse/ARROW-7003) - [Format] [Rust] Generate flatbuffers files in build script
-* [ARROW-7004](https://issues.apache.org/jira/browse/ARROW-7004) - [Plasma] Make it possible to bump up object in LRU cache
-* [ARROW-7006](https://issues.apache.org/jira/browse/ARROW-7006) - [Rust] Bump flatbuffers version to avoid vulnerability
-* [ARROW-7007](https://issues.apache.org/jira/browse/ARROW-7007) - [C++] Enable mmap option for LocalFs
-* [ARROW-7014](https://issues.apache.org/jira/browse/ARROW-7014) - [Developer] Write script to verify Linux wheels given local environment with conda or virtualenv
-* [ARROW-7015](https://issues.apache.org/jira/browse/ARROW-7015) - [Developer] Write script to verify macOS wheels given local environment with conda or virtualenv
-* [ARROW-7016](https://issues.apache.org/jira/browse/ARROW-7016) - [Developer][Python] Write script to verify Windows wheels given local environment with conda
-* [ARROW-7019](https://issues.apache.org/jira/browse/ARROW-7019) - [Java] Improve the performance of loading validity buffers
-* [ARROW-7026](https://issues.apache.org/jira/browse/ARROW-7026) - [Java] Remove assertions in MessageSerializer/vector/writer/reader
-* [ARROW-7031](https://issues.apache.org/jira/browse/ARROW-7031) - [Python] Expose the offsets of a ListArray in python
-* [ARROW-7032](https://issues.apache.org/jira/browse/ARROW-7032) - [Release] Run the python unit tests in the release verification script
-* [ARROW-7034](https://issues.apache.org/jira/browse/ARROW-7034) - [CI][Crossbow] Skip known nightly failures
-* [ARROW-7035](https://issues.apache.org/jira/browse/ARROW-7035) - [R] Default arguments are unclear in write\_parquet docs
-* [ARROW-7036](https://issues.apache.org/jira/browse/ARROW-7036) - [C++] Update ORC to avoid compile errors
-* [ARROW-7037](https://issues.apache.org/jira/browse/ARROW-7037) - [C++] Compile error on the combination of protobuf \>= 3.9 and clang
-* [ARROW-7039](https://issues.apache.org/jira/browse/ARROW-7039) - [Python] Typecheck expects pandas to be installed
-* [ARROW-7047](https://issues.apache.org/jira/browse/ARROW-7047) - [C++][Dataset] Filter expressions should not require exact type match
-* [ARROW-7052](https://issues.apache.org/jira/browse/ARROW-7052) - [C++] Datasets example fails to build with ARROW\_SHARED=OFF
-* [ARROW-7054](https://issues.apache.org/jira/browse/ARROW-7054) - [Docs] Add option to override displayed docs version with an environment variable
-* [ARROW-7057](https://issues.apache.org/jira/browse/ARROW-7057) - [C++] Add API to parse URI query strings
-* [ARROW-7058](https://issues.apache.org/jira/browse/ARROW-7058) - [C++] FileSystemDataSourceDiscovery should apply partition schemes relative to the base\_dir of its selector
-* [ARROW-7060](https://issues.apache.org/jira/browse/ARROW-7060) - [R] Post-0.15.1 cleanup
-* [ARROW-7061](https://issues.apache.org/jira/browse/ARROW-7061) - [C++][Dataset] FileSystemDiscovery with ParquetFileFormat should ignore files that aren't Parquet
-* [ARROW-7062](https://issues.apache.org/jira/browse/ARROW-7062) - [C++] Parquet file parse error messages should include the file name
-* [ARROW-7064](https://issues.apache.org/jira/browse/ARROW-7064) - [R] Implement null type
-* [ARROW-7066](https://issues.apache.org/jira/browse/ARROW-7066) - [Python] Support returning ChunkedArray from \_\_arrow\_array\_\_?
-* [ARROW-7067](https://issues.apache.org/jira/browse/ARROW-7067) - [CI] Disable code coverage on Travis-CI
-* [ARROW-7069](https://issues.apache.org/jira/browse/ARROW-7069) - [C++][Dataset] Replace ConstantPartitionScheme with PrefixDictionaryPartitionScheme
-* [ARROW-7070](https://issues.apache.org/jira/browse/ARROW-7070) - [Packaging][deb] Update package names for 1.0.0
-* [ARROW-7072](https://issues.apache.org/jira/browse/ARROW-7072) - [Java] Support concatenating validity bits efficiently
-* [ARROW-7082](https://issues.apache.org/jira/browse/ARROW-7082) - [Packaging][deb] Add apache-arrow-archive-keyring
-* [ARROW-7086](https://issues.apache.org/jira/browse/ARROW-7086) - [C++] Provide a wrapper for invoking factories to produce a Result
-* [ARROW-7092](https://issues.apache.org/jira/browse/ARROW-7092) - [R] Add vignette for dplyr and datasets
-* [ARROW-7093](https://issues.apache.org/jira/browse/ARROW-7093) - [R] Support creating ScalarExpressions for more data types
-* [ARROW-7094](https://issues.apache.org/jira/browse/ARROW-7094) - [C++] FileSystemDataSource should use an owning pointer for fs::Filesystem
-* [ARROW-7095](https://issues.apache.org/jira/browse/ARROW-7095) - [R] Better handling of unsupported filter and mutate expressions in dplyr methods
-* [ARROW-7096](https://issues.apache.org/jira/browse/ARROW-7096) - [C++] Add options structs for concatenation-with-promotion and schema unification
-* [ARROW-7098](https://issues.apache.org/jira/browse/ARROW-7098) - [Java] Improve the performance of comparing two memory blocks
-* [ARROW-7099](https://issues.apache.org/jira/browse/ARROW-7099) - [C++] Disambiguate function calls in csv parser test
-* [ARROW-7101](https://issues.apache.org/jira/browse/ARROW-7101) - [CI] Refactor docker-compose setup and use it with GitHub Actions
-* [ARROW-7103](https://issues.apache.org/jira/browse/ARROW-7103) - [R] Various minor cleanups
-* [ARROW-7107](https://issues.apache.org/jira/browse/ARROW-7107) - [C++][MinGW] Enable Flight on AppVeyor
-* [ARROW-7110](https://issues.apache.org/jira/browse/ARROW-7110) - [GLib] Add filter support for GArrowTable, GArrowChunkedArray, and GArrowRecordBatch
-* [ARROW-7111](https://issues.apache.org/jira/browse/ARROW-7111) - [GLib] Add take support for GArrowTable, GArrowChunkedArray, and GArrowRecordBatch
-* [ARROW-7113](https://issues.apache.org/jira/browse/ARROW-7113) - [Rust] Buffer should accept memory owned by others
-* [ARROW-7116](https://issues.apache.org/jira/browse/ARROW-7116) - [CI] Use the docker repository provided by the Apache organisation
-* [ARROW-7120](https://issues.apache.org/jira/browse/ARROW-7120) - [C++][CI] Add .ccache to the docker-compose volume mounts
-* [ARROW-7146](https://issues.apache.org/jira/browse/ARROW-7146) - [R][CI] Various fixes and speedups for the R docker-compose setup
-* [ARROW-7147](https://issues.apache.org/jira/browse/ARROW-7147) - [C++][Dataset] Refactor dataset's API to use Result<T\>
-* [ARROW-7148](https://issues.apache.org/jira/browse/ARROW-7148) - [C++][Dataset] API cleanup
-* [ARROW-7149](https://issues.apache.org/jira/browse/ARROW-7149) - [C++] Remove experimental status on filesystem APIs
-* [ARROW-7155](https://issues.apache.org/jira/browse/ARROW-7155) - [Java][CI] Add Maven wrapper to make the setup process simple
-* [ARROW-7159](https://issues.apache.org/jira/browse/ARROW-7159) - [CI] Run HDFS tests as cron task
-* [ARROW-7160](https://issues.apache.org/jira/browse/ARROW-7160) - [C++] Update string\_view backport
-* [ARROW-7161](https://issues.apache.org/jira/browse/ARROW-7161) - [C++] Migrate filesystem layer from Status to Result
-* [ARROW-7162](https://issues.apache.org/jira/browse/ARROW-7162) - [C++] Cleanup warnings in cmake\_modules/SetupCxxFlags.cmake
-* [ARROW-7166](https://issues.apache.org/jira/browse/ARROW-7166) - [Java] Remove redundant code for JDBC adapters
-* [ARROW-7169](https://issues.apache.org/jira/browse/ARROW-7169) - [C++] Vendor uriparser library
-* [ARROW-7171](https://issues.apache.org/jira/browse/ARROW-7171) - [Ruby] Pass Array<Boolean\> for Arrow::Table\#filter
-* [ARROW-7172](https://issues.apache.org/jira/browse/ARROW-7172) - [C++][Dataset] Improve format of Expression::ToString
-* [ARROW-7176](https://issues.apache.org/jira/browse/ARROW-7176) - [C++] Fix arrow::ipc compiler warning
-* [ARROW-7178](https://issues.apache.org/jira/browse/ARROW-7178) - [C++] Vendor forward compatible std::optional
-* [ARROW-7185](https://issues.apache.org/jira/browse/ARROW-7185) - [R][Dataset] Add bindings for IN, IS\_VALID expressions
-* [ARROW-7186](https://issues.apache.org/jira/browse/ARROW-7186) - [R] Add inline comments to document the dplyr code
-* [ARROW-7192](https://issues.apache.org/jira/browse/ARROW-7192) - [Rust] Implement Flight crate
-* [ARROW-7193](https://issues.apache.org/jira/browse/ARROW-7193) - [Rust] Create Arrow stream reader
-* [ARROW-7195](https://issues.apache.org/jira/browse/ARROW-7195) - [Ruby] Improve \#filter, \#take, and \#is\_in
-* [ARROW-7196](https://issues.apache.org/jira/browse/ARROW-7196) - [Ruby] Remove needless BinaryArrayBuilder\#append\_values
-* [ARROW-7197](https://issues.apache.org/jira/browse/ARROW-7197) - [Ruby] Suppress keyword argument related warnings with Ruby 2.7
-* [ARROW-7204](https://issues.apache.org/jira/browse/ARROW-7204) - [C++][Dataset] In expression should not require exact type match
-* [ARROW-7206](https://issues.apache.org/jira/browse/ARROW-7206) - [Java] Avoid string concatenation when calling Preconditions\#checkArgument
-* [ARROW-7207](https://issues.apache.org/jira/browse/ARROW-7207) - [Rust] Update Generated Flatbuffer Files
-* [ARROW-7210](https://issues.apache.org/jira/browse/ARROW-7210) - [C++] Scalar cast should support time-based types
-* [ARROW-7211](https://issues.apache.org/jira/browse/ARROW-7211) - [Rust] [Parquet] Support writing to byte buffers
-* [ARROW-7216](https://issues.apache.org/jira/browse/ARROW-7216) - [Java] Improve the performance of setting/clearing individual bits
-* [ARROW-7219](https://issues.apache.org/jira/browse/ARROW-7219) - [CI][Python] Install pickle5 in the conda-python docker image for python version 3.6
-* [ARROW-7227](https://issues.apache.org/jira/browse/ARROW-7227) - [Python] Provide wrappers for ConcatenateWithPromotion()
-* [ARROW-7228](https://issues.apache.org/jira/browse/ARROW-7228) - [Python] Expose RecordBatch.FromStructArray in Python.
-* [ARROW-7235](https://issues.apache.org/jira/browse/ARROW-7235) - [C++] Add Result<T\> to APIs to arrow/io
-* [ARROW-7236](https://issues.apache.org/jira/browse/ARROW-7236) - [C++] Add Result<T\> to APIs to arrow/csv
-* [ARROW-7240](https://issues.apache.org/jira/browse/ARROW-7240) - [C++] Add Result<T\> to APIs to arrow/util
-* [ARROW-7246](https://issues.apache.org/jira/browse/ARROW-7246) - [CI][Python] wheel can't be built due to SSL\_ST\_INIT error
-* [ARROW-7247](https://issues.apache.org/jira/browse/ARROW-7247) - [CI][Python] wheel can't be built due to wget and OpenSSL errors
-* [ARROW-7248](https://issues.apache.org/jira/browse/ARROW-7248) - [Rust] Automatically regenerate IPC messages from Flatbuffers
-* [ARROW-7255](https://issues.apache.org/jira/browse/ARROW-7255) - [CI] Run source release test on pull request
-* [ARROW-7257](https://issues.apache.org/jira/browse/ARROW-7257) - [CI] Homebrew formula fails due to openssl formula name update
-* [ARROW-7258](https://issues.apache.org/jira/browse/ARROW-7258) - [CI] Fuzzit job fails due to nonexistent directory
-* [ARROW-7259](https://issues.apache.org/jira/browse/ARROW-7259) - [Java] Support subfield encoders using different hashers
-* [ARROW-7260](https://issues.apache.org/jira/browse/ARROW-7260) - [CI] Ubuntu 14.04 test fails due to user-defined literal
-* [ARROW-7261](https://issues.apache.org/jira/browse/ARROW-7261) - [Python] Python support for fixed size list type
-* [ARROW-7262](https://issues.apache.org/jira/browse/ARROW-7262) - [C++][Gandiva] Implement replace function in Gandiva
-* [ARROW-7263](https://issues.apache.org/jira/browse/ARROW-7263) - [C++][Gandiva] Implement locate and position functions
-* [ARROW-7268](https://issues.apache.org/jira/browse/ARROW-7268) - [Rust] Propagate \`custom\_metadata\` field from IPC message
-* [ARROW-7269](https://issues.apache.org/jira/browse/ARROW-7269) - [C++] Fix arrow::parquet compiler warning
-* [ARROW-7270](https://issues.apache.org/jira/browse/ARROW-7270) - [Go] Preserve CSV reading behaviour, improve memory usage
-* [ARROW-7274](https://issues.apache.org/jira/browse/ARROW-7274) - [C++] Add Result<T\> APIs to Decimal class
-* [ARROW-7275](https://issues.apache.org/jira/browse/ARROW-7275) - [Ruby] Add support for Arrow::ListDataType.new(data\_type)
-* [ARROW-7276](https://issues.apache.org/jira/browse/ARROW-7276) - [Ruby] Add support for building Arrow::ListArray from [[...]]
-* [ARROW-7277](https://issues.apache.org/jira/browse/ARROW-7277) - [Document] Add discussion about vector lifecycle
-* [ARROW-7279](https://issues.apache.org/jira/browse/ARROW-7279) - [C++] Rename UnionArray::type\_ids to UnionArray::type\_codes
-* [ARROW-7284](https://issues.apache.org/jira/browse/ARROW-7284) - [Java] Ensure Java implementation meets clarified dictionary spec
-* [ARROW-7289](https://issues.apache.org/jira/browse/ARROW-7289) - [C\#] ListType constructor argument is redundant
-* [ARROW-7290](https://issues.apache.org/jira/browse/ARROW-7290) - [C\#] Implement ListArray Builder
-* [ARROW-7292](https://issues.apache.org/jira/browse/ARROW-7292) - [C++] [CI] [Dev] Add ASAN / UBSAN CI run
-* [ARROW-7293](https://issues.apache.org/jira/browse/ARROW-7293) - [Dev] [C++] Persist ccache in docker-compose build volumes
-* [ARROW-7296](https://issues.apache.org/jira/browse/ARROW-7296) - [Python] Add ORC api documentation
-* [ARROW-7299](https://issues.apache.org/jira/browse/ARROW-7299) - [GLib] Use Result instead of Status
-* [ARROW-7303](https://issues.apache.org/jira/browse/ARROW-7303) - [C++] Refactor benchmarks to use new Result APIs
-* [ARROW-7306](https://issues.apache.org/jira/browse/ARROW-7306) - [C++] Add Result-returning version of FileSystemFromUri
-* [ARROW-7307](https://issues.apache.org/jira/browse/ARROW-7307) - [CI][GLib] Documentation isn't generated
-* [ARROW-7309](https://issues.apache.org/jira/browse/ARROW-7309) - [Python] Support HDFS federation viewfs://
-* [ARROW-7310](https://issues.apache.org/jira/browse/ARROW-7310) - [Python] Expose HDFS implementation for pyarrow.fs
-* [ARROW-7311](https://issues.apache.org/jira/browse/ARROW-7311) - [Python] Return filesystem and path from URI
-* [ARROW-7312](https://issues.apache.org/jira/browse/ARROW-7312) - [Rust] ArrowError should implement std::error::Error
-* [ARROW-7317](https://issues.apache.org/jira/browse/ARROW-7317) - [C++] Migrate Iterator API to Result<T\>
-* [ARROW-7319](https://issues.apache.org/jira/browse/ARROW-7319) - [C++] Refactor Iterator<T\> to yield Result<T\>
-* [ARROW-7321](https://issues.apache.org/jira/browse/ARROW-7321) - [CI][GLib] Failed to build with GLib warning
-* [ARROW-7322](https://issues.apache.org/jira/browse/ARROW-7322) - [CI][Python] Fall back to arrowdev dockerhub organization for manylinux images
-* [ARROW-7323](https://issues.apache.org/jira/browse/ARROW-7323) - [CI][Rust] Nightly CI fails due to different toolchain
-* [ARROW-7324](https://issues.apache.org/jira/browse/ARROW-7324) - [Rust] Add Timezone to Timestamp
-* [ARROW-7325](https://issues.apache.org/jira/browse/ARROW-7325) - [Rust] [Parquet] Update to parquet-format 2.6 and thrift 0.12
-* [ARROW-7329](https://issues.apache.org/jira/browse/ARROW-7329) - [Java] AllocationManager: Allow managing different types of memory other than those allocated using Netty
-* [ARROW-7333](https://issues.apache.org/jira/browse/ARROW-7333) - [CI][Rust] Remove duplicated nightly job
-* [ARROW-7334](https://issues.apache.org/jira/browse/ARROW-7334) - [CI][Python] macOS uses Python 2
-* [ARROW-7339](https://issues.apache.org/jira/browse/ARROW-7339) - [CMake] Thrift version not respected in CMake configuration version.txt
-* [ARROW-7340](https://issues.apache.org/jira/browse/ARROW-7340) - [CI] Prune defunct appveyor build setup
-* [ARROW-7344](https://issues.apache.org/jira/browse/ARROW-7344) - [Packaging][Python] Build manylinux2014 wheels
-* [ARROW-7346](https://issues.apache.org/jira/browse/ARROW-7346) - [CI] Explicit usage of ccache across the builds
-* [ARROW-7347](https://issues.apache.org/jira/browse/ARROW-7347) - [C++] Update bundled Boost to 1.71.0
-* [ARROW-7348](https://issues.apache.org/jira/browse/ARROW-7348) - [Rust] Add API to return references to the null bitmap buffer
-* [ARROW-7351](https://issues.apache.org/jira/browse/ARROW-7351) - [Developer] Only suggest cpp-\* fix versions when merging Parquet patches
-* [ARROW-7357](https://issues.apache.org/jira/browse/ARROW-7357) - [Go] Migrate from pkg/errors to x/xerrors
-* [ARROW-7366](https://issues.apache.org/jira/browse/ARROW-7366) - [C++][Dataset] Use PartitionSchemeDiscovery in DataSourceDiscovery
-* [ARROW-7367](https://issues.apache.org/jira/browse/ARROW-7367) - [Python] Use np.full instead of np.array.repeat in ParquetDatasetPiece
-* [ARROW-7368](https://issues.apache.org/jira/browse/ARROW-7368) - [Ruby] Use :arrow\_file and :arrow\_streaming for format name
-* [ARROW-7369](https://issues.apache.org/jira/browse/ARROW-7369) - [GLib] Add garrow\_table\_combine\_chunks
-* [ARROW-7370](https://issues.apache.org/jira/browse/ARROW-7370) - [C++] Build with old Protobuf and AUTO detection fails
-* [ARROW-7377](https://issues.apache.org/jira/browse/ARROW-7377) - [C++][Dataset] Simplify parquet column projection
-* [ARROW-7378](https://issues.apache.org/jira/browse/ARROW-7378) - [C++][Gandiva] Loop vectorization broken in IR optimization
-* [ARROW-7379](https://issues.apache.org/jira/browse/ARROW-7379) - [C++] Introduce SchemaBuilder companion class and Field::IsCompatibleWith
-* [ARROW-7380](https://issues.apache.org/jira/browse/ARROW-7380) - [C++][Dataset] Implement DatasetFactory
-* [ARROW-7382](https://issues.apache.org/jira/browse/ARROW-7382) - [C++][Dataset] Refactor FsDsDiscovery constructors
-* [ARROW-7387](https://issues.apache.org/jira/browse/ARROW-7387) - [C\#] Support ListType Serialization
-* [ARROW-7392](https://issues.apache.org/jira/browse/ARROW-7392) - [Packaging] Add conda packaging tasks for python 3.8
-* [ARROW-7398](https://issues.apache.org/jira/browse/ARROW-7398) - [Packaging][Python] Conda builds are failing on macOS
-* [ARROW-7399](https://issues.apache.org/jira/browse/ARROW-7399) - [C++][Gandiva] Gandiva does not pick up runtime CPU features
-* [ARROW-7402](https://issues.apache.org/jira/browse/ARROW-7402) - [C++] Add more information on CUDA error
-* [ARROW-7403](https://issues.apache.org/jira/browse/ARROW-7403) - [C++][JSON] Enable Rapidjson on Arm64 Neon
-* [ARROW-7410](https://issues.apache.org/jira/browse/ARROW-7410) - [Python] [Doc] Document filesystem APIs
-* [ARROW-7411](https://issues.apache.org/jira/browse/ARROW-7411) - [C++][Flight] Incorrect Arrow Flight benchmark output
-* [ARROW-7413](https://issues.apache.org/jira/browse/ARROW-7413) - [Python][Dataset] Add tests for PartitionSchemeDiscovery
-* [ARROW-7414](https://issues.apache.org/jira/browse/ARROW-7414) - [R][Dataset] Implement PartitionSchemeDiscovery
-* [ARROW-7415](https://issues.apache.org/jira/browse/ARROW-7415) - [C++][Dataset] Implement IpcFormat for sources composed of ipc files
-* [ARROW-7416](https://issues.apache.org/jira/browse/ARROW-7416) - [R][Nightly] Fix macos-r-autobrew build on R 3.6.2
-* [ARROW-7417](https://issues.apache.org/jira/browse/ARROW-7417) - [C++] Add a docker-compose entry for CUDA 10.1
-* [ARROW-7418](https://issues.apache.org/jira/browse/ARROW-7418) - [C++] Can't build with g++ 5.4.0 on Ubuntu 16.04
-* [ARROW-7420](https://issues.apache.org/jira/browse/ARROW-7420) - [C++] Migrate tensor related APIs to Result-returning version
-* [ARROW-7429](https://issues.apache.org/jira/browse/ARROW-7429) - [Java] Enhance code style checking for Java code (remove consecutive spaces)
-* [ARROW-7430](https://issues.apache.org/jira/browse/ARROW-7430) - [Python] Add more docstrings to dataset bindings
-* [ARROW-7431](https://issues.apache.org/jira/browse/ARROW-7431) - [Python] Add dataset API to reference docs
-* [ARROW-7432](https://issues.apache.org/jira/browse/ARROW-7432) - [Python] Add higher-level datasets functions
-* [ARROW-7439](https://issues.apache.org/jira/browse/ARROW-7439) - [C++][Dataset] Remove dataset pointer aliases
-* [ARROW-7449](https://issues.apache.org/jira/browse/ARROW-7449) - [GLib] Make GObject Introspection optional
-* [ARROW-7452](https://issues.apache.org/jira/browse/ARROW-7452) - [GLib] Make GArrowTimeDataType abstract
-* [ARROW-7453](https://issues.apache.org/jira/browse/ARROW-7453) - [Ruby] Add support for Arrow::NullArray\#[]
-* [ARROW-7454](https://issues.apache.org/jira/browse/ARROW-7454) - [Ruby] Add support for saving/loading TSV
-* [ARROW-7455](https://issues.apache.org/jira/browse/ARROW-7455) - [Ruby] Use Arrow::DataType.resolve for all GArrowDataType input
-* [ARROW-7456](https://issues.apache.org/jira/browse/ARROW-7456) - [C++] Add support for YYYY-MM-DDThh and YYYY-MM-DDThh:mm timestamp formats
-* [ARROW-7457](https://issues.apache.org/jira/browse/ARROW-7457) - [Doc] Fix typos
-* [ARROW-7459](https://issues.apache.org/jira/browse/ARROW-7459) - [Python] Documentation lint fails
-* [ARROW-7460](https://issues.apache.org/jira/browse/ARROW-7460) - [Rust] Improve some kernels with autovectorisation
-* [ARROW-7461](https://issues.apache.org/jira/browse/ARROW-7461) - [Java] Fix typos and spelling
-* [ARROW-7463](https://issues.apache.org/jira/browse/ARROW-7463) - [Doc] Fix a broken link and typos
-* [ARROW-7464](https://issues.apache.org/jira/browse/ARROW-7464) - [C++] Refine CpuInfo singleton with std::call\_once
-* [ARROW-7465](https://issues.apache.org/jira/browse/ARROW-7465) - [C++] Add Arrow memory benchmark for Arm64
-* [ARROW-7468](https://issues.apache.org/jira/browse/ARROW-7468) - [Python] Fix typos
-* [ARROW-7469](https://issues.apache.org/jira/browse/ARROW-7469) - [C++] Improve division related bit operations
-* [ARROW-7470](https://issues.apache.org/jira/browse/ARROW-7470) - [JS] Fix typos
-* [ARROW-7474](https://issues.apache.org/jira/browse/ARROW-7474) - [Ruby] Save CSV files faster
-* [ARROW-7475](https://issues.apache.org/jira/browse/ARROW-7475) - [Rust] Create Arrow Stream writer
-* [ARROW-7477](https://issues.apache.org/jira/browse/ARROW-7477) - [FlightRPC][Java] Flight gRPC service is missing reflection info
-* [ARROW-7479](https://issues.apache.org/jira/browse/ARROW-7479) - [Rust][Ruby][R] Fix typos
-* [ARROW-7481](https://issues.apache.org/jira/browse/ARROW-7481) - [C\#] Fix typos
-* [ARROW-7482](https://issues.apache.org/jira/browse/ARROW-7482) - [C++] Fix typos
-* [ARROW-7484](https://issues.apache.org/jira/browse/ARROW-7484) - [C++][Gandiva] Fix typos
-* [ARROW-7485](https://issues.apache.org/jira/browse/ARROW-7485) - [C++][Plasma] Fix typos
-* [ARROW-7487](https://issues.apache.org/jira/browse/ARROW-7487) - [Developer] Fix typos
-* [ARROW-7488](https://issues.apache.org/jira/browse/ARROW-7488) - [GLib] Fix typos and broken links
-* [ARROW-7489](https://issues.apache.org/jira/browse/ARROW-7489) - [CI] Fix typos
-* [ARROW-7490](https://issues.apache.org/jira/browse/ARROW-7490) - [Java] Avro converter should convert attributes and props to FieldType metadata
-* [ARROW-7493](https://issues.apache.org/jira/browse/ARROW-7493) - [Python] Expose sum kernel in pyarrow.compute and support ChunkedArray inputs
-* [ARROW-7498](https://issues.apache.org/jira/browse/ARROW-7498) - [C++][Dataset] Rename DataFragment/DataSource/PartitionScheme
-* [ARROW-7502](https://issues.apache.org/jira/browse/ARROW-7502) - [Integration] Remove Spark Integration patch that is no longer needed
-* [ARROW-7513](https://issues.apache.org/jira/browse/ARROW-7513) - [JS] Arrow Tutorial: Common data types
-* [ARROW-7514](https://issues.apache.org/jira/browse/ARROW-7514) - [C\#] Make GetValueOffset Obsolete
-* [ARROW-7519](https://issues.apache.org/jira/browse/ARROW-7519) - [Python] Build wheels, conda packages with dataset support
-* [ARROW-7521](https://issues.apache.org/jira/browse/ARROW-7521) - [Rust] Remove tuple on FixedSizeList datatype
-* [ARROW-7523](https://issues.apache.org/jira/browse/ARROW-7523) - [Developer] Relax clang-tidy check
-* [ARROW-7526](https://issues.apache.org/jira/browse/ARROW-7526) - [C++][Compute] Optimize small integer sorting
-* [ARROW-7532](https://issues.apache.org/jira/browse/ARROW-7532) - [CI] Unskip brew test after Homebrew fixes it upstream
-* [ARROW-7537](https://issues.apache.org/jira/browse/ARROW-7537) - [CI][R] Nightly macOS autobrew job should be more verbose if it fails
-* [ARROW-7538](https://issues.apache.org/jira/browse/ARROW-7538) - Clarify actual and desired size in AllocationManager
-* [ARROW-7540](https://issues.apache.org/jira/browse/ARROW-7540) - [C++] License files aren't installed
-* [ARROW-7541](https://issues.apache.org/jira/browse/ARROW-7541) - [GLib] Install license files
-* [ARROW-7542](https://issues.apache.org/jira/browse/ARROW-7542) - [CI][C++] nproc isn't available on macOS
-* [ARROW-7549](https://issues.apache.org/jira/browse/ARROW-7549) - [Java] Reorganize Flight modules to keep top level clean/organized
-* [ARROW-7550](https://issues.apache.org/jira/browse/ARROW-7550) - [R][CI] Run donttest examples in CI
-* [ARROW-7557](https://issues.apache.org/jira/browse/ARROW-7557) - [C++][Compute] Validate sorting stability in random test
-* [ARROW-7558](https://issues.apache.org/jira/browse/ARROW-7558) - [Packaging][deb][RPM] Use the host owner and group for artifacts
-* [ARROW-7560](https://issues.apache.org/jira/browse/ARROW-7560) - [Rust] Reduce Rc/Refcell usage
-* [ARROW-7565](https://issues.apache.org/jira/browse/ARROW-7565) - [Website] Add support for download URL redirect
-* [ARROW-7566](https://issues.apache.org/jira/browse/ARROW-7566) - [CI] Use more recent Miniconda on AppVeyor
-* [ARROW-7567](https://issues.apache.org/jira/browse/ARROW-7567) - [Java] Bump Checkstyle from 6.19 to 8.18
-* [ARROW-7568](https://issues.apache.org/jira/browse/ARROW-7568) - [Java] Bump Apache Avro from 1.9.0 to 1.9.1
-* [ARROW-7569](https://issues.apache.org/jira/browse/ARROW-7569) - [Python] Add API to map Arrow types to pandas ExtensionDtypes for to\_pandas conversions
-* [ARROW-7570](https://issues.apache.org/jira/browse/ARROW-7570) - [Java] Fix high severity issues reported by LGTM
-* [ARROW-7571](https://issues.apache.org/jira/browse/ARROW-7571) - [Java] Correct minimal Java version in README
-* [ARROW-7572](https://issues.apache.org/jira/browse/ARROW-7572) - [Java] Enforce Maven 3.3+ as mentioned in README
-* [ARROW-7573](https://issues.apache.org/jira/browse/ARROW-7573) - [Rust] Reduce boxing and cleanup
-* [ARROW-7575](https://issues.apache.org/jira/browse/ARROW-7575) - [R] Linux binary packaging followup
-* [ARROW-7576](https://issues.apache.org/jira/browse/ARROW-7576) - [C++][Dev] Improve fuzzing setup
-* [ARROW-7577](https://issues.apache.org/jira/browse/ARROW-7577) - [C++][CI] Check fuzzer setup in CI
-* [ARROW-7578](https://issues.apache.org/jira/browse/ARROW-7578) - [R] Add support for datasets with IPC files and with multiple sources
-* [ARROW-7580](https://issues.apache.org/jira/browse/ARROW-7580) - [Website] 0.16 release post
-* [ARROW-7581](https://issues.apache.org/jira/browse/ARROW-7581) - [R] Documentation/polishing for 0.16 release
-* [ARROW-7590](https://issues.apache.org/jira/browse/ARROW-7590) - [C++] Managed files in thirdparty/ are ignored
-* [ARROW-7597](https://issues.apache.org/jira/browse/ARROW-7597) - [C++] Improvements to CMake configuration console summary
-* [ARROW-7600](https://issues.apache.org/jira/browse/ARROW-7600) - [C++][Parquet] Add a basic disabled unit test to exercise nesting functionality
-* [ARROW-7601](https://issues.apache.org/jira/browse/ARROW-7601) - [Doc] [C++] Update fuzzing documentation
-* [ARROW-7602](https://issues.apache.org/jira/browse/ARROW-7602) - [Archery] Add more build options
-* [ARROW-7613](https://issues.apache.org/jira/browse/ARROW-7613) - [Rust] Remove redundant \`::\` prefixes
-* [ARROW-7622](https://issues.apache.org/jira/browse/ARROW-7622) - [Format] Mark Tensor and SparseTensor fields required
-* [ARROW-7623](https://issues.apache.org/jira/browse/ARROW-7623) - [C++] Update generated flatbuffers files
-* [ARROW-7626](https://issues.apache.org/jira/browse/ARROW-7626) - [Parquet][GLib] Add support for version macros
-* [ARROW-7627](https://issues.apache.org/jira/browse/ARROW-7627) - [C++][Gandiva] Optimize string truncate function
-* [ARROW-7629](https://issues.apache.org/jira/browse/ARROW-7629) - [C++][CI] Add fuzz regression files to arrow-testing
-* [ARROW-7630](https://issues.apache.org/jira/browse/ARROW-7630) - [C++][CI] Check fuzz crash regressions in CI
-* [ARROW-7632](https://issues.apache.org/jira/browse/ARROW-7632) - [C++] [CI] Improve fuzzing seed corpus
-* [ARROW-7635](https://issues.apache.org/jira/browse/ARROW-7635) - [C++] Add pkg-config support for each component
-* [ARROW-7636](https://issues.apache.org/jira/browse/ARROW-7636) - [Python] Clean-up the pyarrow.dataset.partitioning() API
-* [ARROW-7644](https://issues.apache.org/jira/browse/ARROW-7644) - Add vcpkg installation instructions
-* [ARROW-7645](https://issues.apache.org/jira/browse/ARROW-7645) - [Packaging][deb][RPM] arm64 build by crossbow is broken
-* [ARROW-7648](https://issues.apache.org/jira/browse/ARROW-7648) - [C++] Sanitize local paths on Windows
-* [ARROW-7658](https://issues.apache.org/jira/browse/ARROW-7658) - [R] Support dplyr filtering on date/time
-* [ARROW-7659](https://issues.apache.org/jira/browse/ARROW-7659) - [Rust] Reduce Rc usage
-* [ARROW-7660](https://issues.apache.org/jira/browse/ARROW-7660) - [C++][Gandiva] Optimise castVarchar(string, int) function for single byte characters
-* [ARROW-7665](https://issues.apache.org/jira/browse/ARROW-7665) - [R] linuxLibs.R should build in parallel
-* [ARROW-7666](https://issues.apache.org/jira/browse/ARROW-7666) - [Packaging][deb] Always use Ninja to reduce build time
-* [ARROW-7667](https://issues.apache.org/jira/browse/ARROW-7667) - [Packaging][deb] ubuntu-eoan is missing in nightly jobs
-* [ARROW-7668](https://issues.apache.org/jira/browse/ARROW-7668) - [Packaging][RPM] Use Ninja if possible to reduce build time
-* [ARROW-7670](https://issues.apache.org/jira/browse/ARROW-7670) - [Python][Dataset] Better ergonomics for the filter expressions
-* [ARROW-7671](https://issues.apache.org/jira/browse/ARROW-7671) - [Python][Dataset] Add bindings for the DatasetFactory
-* [ARROW-7674](https://issues.apache.org/jira/browse/ARROW-7674) - Add helpful message for captcha challenge in merge\_arrow\_pr.py
-* [ARROW-7682](https://issues.apache.org/jira/browse/ARROW-7682) - [Packaging][APT][Yum] Add support for arm64 APT/Yum repositories
-* [ARROW-7683](https://issues.apache.org/jira/browse/ARROW-7683) - [Packaging] Set 0.16.0 as the next version
-* [ARROW-7686](https://issues.apache.org/jira/browse/ARROW-7686) - [Packaging][deb][RPM] Include more arrow-\*.pc
-* [ARROW-7687](https://issues.apache.org/jira/browse/ARROW-7687) - [C++] C++ developer document links in README are broken
-* [ARROW-7692](https://issues.apache.org/jira/browse/ARROW-7692) - [Rust] Several pattern matches are hard to read
-* [ARROW-7694](https://issues.apache.org/jira/browse/ARROW-7694) - [Packaging][deb][RPM] Can't build repository packages for RC
-* [ARROW-7695](https://issues.apache.org/jira/browse/ARROW-7695) - [Release] Update java versions to 0.16-SNAPSHOT
-* [ARROW-7696](https://issues.apache.org/jira/browse/ARROW-7696) - [Release] Unit test on release branch fails
-* [ARROW-7697](https://issues.apache.org/jira/browse/ARROW-7697) - [Release] Add a test for updating Linux packages by 00-prepare.sh
-* [ARROW-7710](https://issues.apache.org/jira/browse/ARROW-7710) - [Release][C\#] .NET download URL is redirected
-* [ARROW-7711](https://issues.apache.org/jira/browse/ARROW-7711) - [C\#] Date32 test depends on system timezone
-* [ARROW-7715](https://issues.apache.org/jira/browse/ARROW-7715) - [Release][APT] Ignore some arm64 verifications
-* [ARROW-7716](https://issues.apache.org/jira/browse/ARROW-7716) - [Packaging][APT] Use the "main" component for Ubuntu 19.10
-* [ARROW-7719](https://issues.apache.org/jira/browse/ARROW-7719) - [Python][Dataset] Table equality check occasionally fails
-* [ARROW-7724](https://issues.apache.org/jira/browse/ARROW-7724) - [Release][Yum] Ignore some arm64 verifications
-* [ARROW-7743](https://issues.apache.org/jira/browse/ARROW-7743) - [Rust] [Parquet] Support reading timestamp micros
-* [ARROW-7768](https://issues.apache.org/jira/browse/ARROW-7768) - [Rust] Implement Length and TryClone traits for Cursor<Vec<u8\>\> in reader.rs
-* [ARROW-8015](https://issues.apache.org/jira/browse/ARROW-8015) - [Python] Build 0.16.0 wheel install for Windows + Python 3.5 and publish to PyPI
-* [PARQUET-517](https://issues.apache.org/jira/browse/PARQUET-517) - [C++] Use arrow::MemoryPool for all heap allocations
-* [PARQUET-1300](https://issues.apache.org/jira/browse/PARQUET-1300) - [C++] Parquet modular encryption
-* [PARQUET-1664](https://issues.apache.org/jira/browse/PARQUET-1664) - [C++] Provide API to return metadata string from FileMetadata
-* [PARQUET-1678](https://issues.apache.org/jira/browse/PARQUET-1678) - [C++] Provide classes for reading/writing using input/output operators
-* [PARQUET-1688](https://issues.apache.org/jira/browse/PARQUET-1688) - [C++] StreamWriter/StreamReader can't be built with g++ 4.8.5 on CentOS 7
-* [PARQUET-1689](https://issues.apache.org/jira/browse/PARQUET-1689) - [C++] Stream API: Allow for columns/rows to be skipped when reading
-* [PARQUET-1701](https://issues.apache.org/jira/browse/PARQUET-1701) - [C++] Stream API: Add support for optional fields
-* [PARQUET-1704](https://issues.apache.org/jira/browse/PARQUET-1704) - [C++] Add re-usable encryption buffer to SerializedPageWriter
-* [PARQUET-1705](https://issues.apache.org/jira/browse/PARQUET-1705) - [C++] Disable shrink-to-fit on the re-usable decryption buffer
-* [PARQUET-1712](https://issues.apache.org/jira/browse/PARQUET-1712) - [C++] Stop using deprecated APIs in examples
-* [PARQUET-1721](https://issues.apache.org/jira/browse/PARQUET-1721) - [C++] Arrow dependency is missing in parquet.pc
-* [PARQUET-1734](https://issues.apache.org/jira/browse/PARQUET-1734) - [C++] Fix typos
-* [PARQUET-1769](https://issues.apache.org/jira/browse/PARQUET-1769) - [C++] Update to parquet-format 2.8.0
-
-
-
-# Apache Arrow 0.15.1 (2019-11-01)
-
-## Bug Fixes
-
-* [ARROW-6464](https://issues.apache.org/jira/browse/ARROW-6464) - [Java] Refactor FixedSizeListVector\#splitAndTransfer with slice API
-* [ARROW-6728](https://issues.apache.org/jira/browse/ARROW-6728) - [C\#] Support reading and writing Date32 and Date64 arrays
-* [ARROW-6740](https://issues.apache.org/jira/browse/ARROW-6740) - [Python] Unable to delete closed MemoryMappedFile on Windows
-* [ARROW-6762](https://issues.apache.org/jira/browse/ARROW-6762) - [C++] JSON reader segfaults on newline
-* [ARROW-6795](https://issues.apache.org/jira/browse/ARROW-6795) - [C\#] Reading large Arrow files in C\# results in an exception
-* [ARROW-6806](https://issues.apache.org/jira/browse/ARROW-6806) - [C++] Segfault deserializing ListArray containing null/empty list
-* [ARROW-6809](https://issues.apache.org/jira/browse/ARROW-6809) - [Ruby] Gem does not install on macOS due to glib2 3.3.7 compilation failure
-* [ARROW-6813](https://issues.apache.org/jira/browse/ARROW-6813) - [Ruby] Arrow::Table.load with headers=true leads to exception in Arrow 0.15
-* [ARROW-6834](https://issues.apache.org/jira/browse/ARROW-6834) - [C++] Pin gtest to 1.8.1 to triage failing Appveyor / MSVC build
-* [ARROW-6844](https://issues.apache.org/jira/browse/ARROW-6844) - [C++][Parquet][Python] Reading List<scalar type\> columns is broken with 0.15.0
-* [ARROW-6857](https://issues.apache.org/jira/browse/ARROW-6857) - [Python][C++] Segfault for dictionary\_encode on empty chunked\_array (edge case)
-* [ARROW-6860](https://issues.apache.org/jira/browse/ARROW-6860) - [Python] Only link libarrow\_flight.so to pyarrow.\_flight
-* [ARROW-6861](https://issues.apache.org/jira/browse/ARROW-6861) - [Python] arrow-0.15.0 reading arrow-0.14.1-output Parquet dictionary column: Failure reading column: IOError: Arrow error: Invalid: Resize cannot downsize
-* [ARROW-6869](https://issues.apache.org/jira/browse/ARROW-6869) - [C++] Dictionary "delta" building logic in builder\_dict.h produces invalid arrays
-* [ARROW-6873](https://issues.apache.org/jira/browse/ARROW-6873) - [Python] Stale CColumn reference breaks Cython cimport of pyarrow
-* [ARROW-6874](https://issues.apache.org/jira/browse/ARROW-6874) - [Python] Memory leak in Table.to\_pandas() when converting to object dtype
-* [ARROW-6876](https://issues.apache.org/jira/browse/ARROW-6876) - [Python] Reading parquet file with many columns becomes slow for 0.15.0
-* [ARROW-6877](https://issues.apache.org/jira/browse/ARROW-6877) - [C++] Boost not found from the correct environment
-* [ARROW-6878](https://issues.apache.org/jira/browse/ARROW-6878) - [Python] pa.array() does not handle list of dicts with bytes keys correctly under python3
-* [ARROW-6882](https://issues.apache.org/jira/browse/ARROW-6882) - [Python] Cannot create a chunked\_array from dictionary\_encoding result
-* [ARROW-6886](https://issues.apache.org/jira/browse/ARROW-6886) - [C++] arrow::io header nvcc compiler warnings
-* [ARROW-6898](https://issues.apache.org/jira/browse/ARROW-6898) - [Java] Fix potential memory leak in ArrowWriter and several test classes
-* [ARROW-6903](https://issues.apache.org/jira/browse/ARROW-6903) - [Python] Wheels broken after ARROW-6860 changes
-* [ARROW-6905](https://issues.apache.org/jira/browse/ARROW-6905) - [Packaging][OSX] Nightly builds on macOS are failing because of brew compile timeouts
-* [ARROW-6910](https://issues.apache.org/jira/browse/ARROW-6910) - [Python] pyarrow.parquet.read\_table(...) takes up lots of memory which is not released until program exits
-* [ARROW-6922](https://issues.apache.org/jira/browse/ARROW-6922) - [Python] Pandas master build is failing (MultiIndex.levels change)
-* [ARROW-6937](https://issues.apache.org/jira/browse/ARROW-6937) - [Packaging][Python] Fix conda linux and OSX wheel nightly builds
-* [ARROW-6938](https://issues.apache.org/jira/browse/ARROW-6938) - [Python] Windows wheel depends on zstd.dll and libbz2.dll, which are not bundled
-* [ARROW-6962](https://issues.apache.org/jira/browse/ARROW-6962) - [C++] [CI] Stop compiling with -Weverything
-* [ARROW-6977](https://issues.apache.org/jira/browse/ARROW-6977) - [C++] Only enable jemalloc background\_thread if feature is supported
-* [ARROW-6983](https://issues.apache.org/jira/browse/ARROW-6983) - [C++] Threaded task group crashes sometimes
-* [ARROW-7422](https://issues.apache.org/jira/browse/ARROW-7422) - [Python] Improper CPU flags failing pyarrow install on ARM devices
-* [ARROW-7423](https://issues.apache.org/jira/browse/ARROW-7423) - Pyarrow ARM install fails from source with no clear error
-* [ARROW-9349](https://issues.apache.org/jira/browse/ARROW-9349) - [Python] parquet.read\_table causes crashes on Windows Server 2016 w/ Xeon Processor
-
-
-## New Features and Improvements
-
-* [ARROW-6610](https://issues.apache.org/jira/browse/ARROW-6610) - [C++] Add ARROW\_FILESYSTEM=ON/OFF CMake configuration flag
-* [ARROW-6661](https://issues.apache.org/jira/browse/ARROW-6661) - [Java] Implement APIs like slice to enhance VectorSchemaRoot
-* [ARROW-6777](https://issues.apache.org/jira/browse/ARROW-6777) - [GLib][CI] Unpin gobject-introspection gem
-* [ARROW-6852](https://issues.apache.org/jira/browse/ARROW-6852) - [C++] memory-benchmark build failed on Arm64
-* [ARROW-6927](https://issues.apache.org/jira/browse/ARROW-6927) - [C++] Add gRPC version check
-* [ARROW-6963](https://issues.apache.org/jira/browse/ARROW-6963) - [Packaging][Wheel][OSX] Use crossbow's command to deploy artifacts from travis builds
-
-
-
-# Apache Arrow 0.15.0 (2019-10-05)
-
-## New Features and Improvements
-
-* [ARROW-453](https://issues.apache.org/jira/browse/ARROW-453) - [C++] Add filesystem implementation for Amazon S3
-* [ARROW-517](https://issues.apache.org/jira/browse/ARROW-517) - [C++] Verbose Array::Equals
-* [ARROW-750](https://issues.apache.org/jira/browse/ARROW-750) - [Format] Add LargeBinary and LargeString types
-* [ARROW-1324](https://issues.apache.org/jira/browse/ARROW-1324) - [C++] Support ARROW\_BOOST\_VENDORED on Windows / MSVC
-* [ARROW-1561](https://issues.apache.org/jira/browse/ARROW-1561) - [C++] Kernel implementations for "isin" (set containment)
-* [ARROW-1566](https://issues.apache.org/jira/browse/ARROW-1566) - [C++] Implement non-materializing sort kernels
-* [ARROW-1741](https://issues.apache.org/jira/browse/ARROW-1741) - [C++] Comparison function for DictionaryArray to determine if indices are "compatible"
-* [ARROW-1786](https://issues.apache.org/jira/browse/ARROW-1786) - [Format] List expected on-wire buffer layouts for each kind of Arrow physical type in specification
-* [ARROW-1789](https://issues.apache.org/jira/browse/ARROW-1789) - [Format] Consolidate specification documents and improve clarity for new implementation authors
-* [ARROW-1875](https://issues.apache.org/jira/browse/ARROW-1875) - [Java] Write 64-bit ints as strings in integration test JSON files
-* [ARROW-2006](https://issues.apache.org/jira/browse/ARROW-2006) - [C++] Add option to trim excess padding when writing IPC messages
-* [ARROW-2431](https://issues.apache.org/jira/browse/ARROW-2431) - [Rust] Schema fidelity
-* [ARROW-2769](https://issues.apache.org/jira/browse/ARROW-2769) - [C++][Python] Deprecate and rename add\_metadata methods
-* [ARROW-2931](https://issues.apache.org/jira/browse/ARROW-2931) - [Crossbow] Windows builds are attempting to run linux and osx packaging tasks
-* [ARROW-3032](https://issues.apache.org/jira/browse/ARROW-3032) - [Python] Clean up NumPy-related C++ headers
-* [ARROW-3204](https://issues.apache.org/jira/browse/ARROW-3204) - [R] Enable package to be made available on CRAN
-* [ARROW-3243](https://issues.apache.org/jira/browse/ARROW-3243) - [C++] Upgrade jemalloc to version 5
-* [ARROW-3246](https://issues.apache.org/jira/browse/ARROW-3246) - [Python][Parquet] Direct reading/writing of pandas categoricals in parquet
-* [ARROW-3325](https://issues.apache.org/jira/browse/ARROW-3325) - [Python] Support reading Parquet binary/string columns directly as DictionaryArray
-* [ARROW-3531](https://issues.apache.org/jira/browse/ARROW-3531) - [Python] Deprecate Schema.field\_by\_name in favor of \_\_getitem\_\_
-* [ARROW-3538](https://issues.apache.org/jira/browse/ARROW-3538) - [Python] Ability to override the automated assignment of UUID for filenames when writing datasets
-* [ARROW-3579](https://issues.apache.org/jira/browse/ARROW-3579) - [Crossbow] Unintuitive error message when remote branch has not been pushed
-* [ARROW-3643](https://issues.apache.org/jira/browse/ARROW-3643) - [Rust] Optimize \`push\_slice\` of \`BufferBuilder<bool\>\`
-* [ARROW-3710](https://issues.apache.org/jira/browse/ARROW-3710) - [Crossbow][Python] Run nightly tests against pandas master
-* [ARROW-3772](https://issues.apache.org/jira/browse/ARROW-3772) - [C++] Read Parquet dictionary encoded ColumnChunks directly into an Arrow DictionaryArray
-* [ARROW-3777](https://issues.apache.org/jira/browse/ARROW-3777) - [C++] Implement a mock "high latency" filesystem
-* [ARROW-3817](https://issues.apache.org/jira/browse/ARROW-3817) - [R] $ method for RecordBatch
-* [ARROW-3829](https://issues.apache.org/jira/browse/ARROW-3829) - [Python] Support protocols to extract Arrow objects from third-party classes
-* [ARROW-3943](https://issues.apache.org/jira/browse/ARROW-3943) - [R] Write vignette for R package
-* [ARROW-4036](https://issues.apache.org/jira/browse/ARROW-4036) - [C++] Make status codes pluggable
-* [ARROW-4095](https://issues.apache.org/jira/browse/ARROW-4095) - [C++] Implement optimizations for dictionary unification where dictionaries are prefixes of the unified dictionary
-* [ARROW-4111](https://issues.apache.org/jira/browse/ARROW-4111) - [Python] Create time types from Python sequences of integers
-* [ARROW-4218](https://issues.apache.org/jira/browse/ARROW-4218) - [Rust] [Parquet] Implement ColumnReader
-* [ARROW-4220](https://issues.apache.org/jira/browse/ARROW-4220) - [Python] Add buffered input and output stream ASV benchmarks with simulated high latency IO
-* [ARROW-4365](https://issues.apache.org/jira/browse/ARROW-4365) - [Rust] [Parquet] Implement RecordReader
-* [ARROW-4398](https://issues.apache.org/jira/browse/ARROW-4398) - [Python] Add benchmarks for Arrow<\>Parquet BYTE\_ARRAY serialization (read and write)
-* [ARROW-4473](https://issues.apache.org/jira/browse/ARROW-4473) - [Website] Add instructions to do a test-deploy of Arrow website and fix bugs
-* [ARROW-4507](https://issues.apache.org/jira/browse/ARROW-4507) - [Format] Create outline and introduction for new document
-* [ARROW-4508](https://issues.apache.org/jira/browse/ARROW-4508) - [Format] Copy content from Layout.rst to new document
-* [ARROW-4509](https://issues.apache.org/jira/browse/ARROW-4509) - [Format] Copy content from Metadata.rst to new document
-* [ARROW-4510](https://issues.apache.org/jira/browse/ARROW-4510) - [Format] Copy content from IPC.rst to new document
-* [ARROW-4511](https://issues.apache.org/jira/browse/ARROW-4511) - [Format] Remove individual documents in favor of new document once all content is moved
-* [ARROW-4648](https://issues.apache.org/jira/browse/ARROW-4648) - [C++/Question] Naming/organizational inconsistencies in cpp codebase
-* [ARROW-4649](https://issues.apache.org/jira/browse/ARROW-4649) - [C++/CI/R] Add (nightly) job that builds \`brew install apache-arrow --HEAD\`
-* [ARROW-4752](https://issues.apache.org/jira/browse/ARROW-4752) - [Rust] Add explicit SIMD vectorization for the divide kernel
-* [ARROW-4810](https://issues.apache.org/jira/browse/ARROW-4810) - [Format][C++] Add "LargeList" type with 64-bit offsets
-* [ARROW-4841](https://issues.apache.org/jira/browse/ARROW-4841) - [C++] Persist CMake options in generated CMake config
-* [ARROW-4860](https://issues.apache.org/jira/browse/ARROW-4860) - [C++] Build AWS C++ SDK for Windows in conda-forge
-* [ARROW-5134](https://issues.apache.org/jira/browse/ARROW-5134) - [R][CI] Run nightly tests against multiple R versions
-* [ARROW-5211](https://issues.apache.org/jira/browse/ARROW-5211) - [Format] Missing documentation under \`Dictionary encoding\` section on MetaData page
-* [ARROW-5216](https://issues.apache.org/jira/browse/ARROW-5216) - [CI] Add Appveyor badge to README
-* [ARROW-5307](https://issues.apache.org/jira/browse/ARROW-5307) - [CI][GLib] Enable GTK-Doc
-* [ARROW-5337](https://issues.apache.org/jira/browse/ARROW-5337) - [C++] Add RecordBatch::field method, possibly deprecate "column"
-* [ARROW-5343](https://issues.apache.org/jira/browse/ARROW-5343) - [C++] Consider using Buffer for transpose maps in DictionaryType::Unify instead of std::vector
-* [ARROW-5344](https://issues.apache.org/jira/browse/ARROW-5344) - [C++] Use ArrayDataVisitor in implementation of dictionary unpacking in compute/kernels/cast.cc
-* [ARROW-5351](https://issues.apache.org/jira/browse/ARROW-5351) - [Rust] Add support for take kernel functions
-* [ARROW-5358](https://issues.apache.org/jira/browse/ARROW-5358) - [Rust] Implement equality check for ArrayData and Array
-* [ARROW-5380](https://issues.apache.org/jira/browse/ARROW-5380) - [C++] Fix and enable UBSan for unaligned accesses
-* [ARROW-5439](https://issues.apache.org/jira/browse/ARROW-5439) - [Java] Utilize stream EOS in File format
-* [ARROW-5444](https://issues.apache.org/jira/browse/ARROW-5444) - [Release][Website] After 0.14 release, update what is an "official" release
-* [ARROW-5458](https://issues.apache.org/jira/browse/ARROW-5458) - [C++] ARMv8 parallel CRC32c computation optimization
-* [ARROW-5480](https://issues.apache.org/jira/browse/ARROW-5480) - [Python] Pandas categorical type doesn't survive a round-trip through parquet
-* [ARROW-5483](https://issues.apache.org/jira/browse/ARROW-5483) - [Java] Add ValueVector constructors that take a Field object
-* [ARROW-5494](https://issues.apache.org/jira/browse/ARROW-5494) - [Python] Create FileSystem bindings
-* [ARROW-5505](https://issues.apache.org/jira/browse/ARROW-5505) - [R] Stop masking base R functions/rethink namespacing
-* [ARROW-5527](https://issues.apache.org/jira/browse/ARROW-5527) - [C++] HashTable/MemoTable should use Buffer(s)/Builder(s) for heap data
-* [ARROW-5558](https://issues.apache.org/jira/browse/ARROW-5558) - [C++] Support Array::View on arrays with non-zero offsets
-* [ARROW-5559](https://issues.apache.org/jira/browse/ARROW-5559) - [C++] Introduce IpcOptions struct object for better API-stability when adding new options
-* [ARROW-5564](https://issues.apache.org/jira/browse/ARROW-5564) - [C++] Add uriparser to conda-forge
-* [ARROW-5579](https://issues.apache.org/jira/browse/ARROW-5579) - [Java] Shade flatbuffer dependency
-* [ARROW-5580](https://issues.apache.org/jira/browse/ARROW-5580) - [C++][Gandiva] Correct definitions of timestamp functions in Gandiva
-* [ARROW-5588](https://issues.apache.org/jira/browse/ARROW-5588) - [C++] Better support for building UnionArrays
-* [ARROW-5594](https://issues.apache.org/jira/browse/ARROW-5594) - [C++] Add support for UnionArrays to Take and Filter
-* [ARROW-5610](https://issues.apache.org/jira/browse/ARROW-5610) - [Python] Define extension type API in Python to "receive" or "send" a foreign extension type
-* [ARROW-5646](https://issues.apache.org/jira/browse/ARROW-5646) - [Crossbow][Documentation] Move the user guide to the Sphinx documentation
-* [ARROW-5681](https://issues.apache.org/jira/browse/ARROW-5681) - [FlightRPC] Wrap gRPC exceptions/statuses
-* [ARROW-5686](https://issues.apache.org/jira/browse/ARROW-5686) - [R] Review R Windows CI build
-* [ARROW-5716](https://issues.apache.org/jira/browse/ARROW-5716) - [Developer] Improve merge PR script to acknowledge co-authors
-* [ARROW-5717](https://issues.apache.org/jira/browse/ARROW-5717) - [Python] Support dictionary unification when converting variable dictionaries to pandas
-* [ARROW-5719](https://issues.apache.org/jira/browse/ARROW-5719) - [Java] Support in-place vector sorting
-* [ARROW-5722](https://issues.apache.org/jira/browse/ARROW-5722) - [Rust] Implement std::fmt::Debug for ListArray, BinaryArray and StructArray
-* [ARROW-5734](https://issues.apache.org/jira/browse/ARROW-5734) - [Python] Dispatch to Table.from\_arrays from pyarrow.table factory function
-* [ARROW-5736](https://issues.apache.org/jira/browse/ARROW-5736) - [Format][C++] Support small bit-width indices in sparse tensor
-* [ARROW-5741](https://issues.apache.org/jira/browse/ARROW-5741) - [JS] Make numeric vector from functions consistent with TypedArray.from
-* [ARROW-5743](https://issues.apache.org/jira/browse/ARROW-5743) - [C++] Add CMake option to enable "large memory" unit tests
-* [ARROW-5746](https://issues.apache.org/jira/browse/ARROW-5746) - [Website] Move website source out of apache/arrow
-* [ARROW-5747](https://issues.apache.org/jira/browse/ARROW-5747) - [C++] Better column name and header support in CSV reader
-* [ARROW-5758](https://issues.apache.org/jira/browse/ARROW-5758) - [C++][Gandiva] Support casting decimals to varchar and vice versa
-* [ARROW-5762](https://issues.apache.org/jira/browse/ARROW-5762) - [Integration][JS] Integration Tests for Map Type
-* [ARROW-5777](https://issues.apache.org/jira/browse/ARROW-5777) - [C++] BasicDecimal128 is a small object; it doesn't always make sense to pass it by const ref
-* [ARROW-5778](https://issues.apache.org/jira/browse/ARROW-5778) - [Java] Extract the logic for vector data copying to the super classes
-* [ARROW-5784](https://issues.apache.org/jira/browse/ARROW-5784) - [Release][GLib] Replace c\_glib/ after running c\_glib/autogen.sh in dev/release/02-source.sh
-* [ARROW-5786](https://issues.apache.org/jira/browse/ARROW-5786) - [Release] Use arrow-jni profile in dev/release/01-prepare.sh
-* [ARROW-5788](https://issues.apache.org/jira/browse/ARROW-5788) - [Rust] Use { version = "...", path = "../..." } for arrow and parquet dependencies
-* [ARROW-5789](https://issues.apache.org/jira/browse/ARROW-5789) - [C++] Small Warning/Linkage cleanups
-* [ARROW-5792](https://issues.apache.org/jira/browse/ARROW-5792) - [Rust] [Parquet] A visitor trait for parquet types.
-* [ARROW-5798](https://issues.apache.org/jira/browse/ARROW-5798) - [Packaging][deb] Update doc architecture
-* [ARROW-5800](https://issues.apache.org/jira/browse/ARROW-5800) - [R] Dockerize R Travis CI tests so they can be run anywhere via docker-compose
-* [ARROW-5803](https://issues.apache.org/jira/browse/ARROW-5803) - [C++] Dockerize C++ with clang 7 Travis CI unit test logic
-* [ARROW-5812](https://issues.apache.org/jira/browse/ARROW-5812) - [Java] Refactor method name and param type in BaseIntVector
-* [ARROW-5813](https://issues.apache.org/jira/browse/ARROW-5813) - [C++] Support checking the equality of the different contiguous tensors
-* [ARROW-5814](https://issues.apache.org/jira/browse/ARROW-5814) - [Java] Implement a <Object, int\> HashMap for DictionaryEncoder
-* [ARROW-5827](https://issues.apache.org/jira/browse/ARROW-5827) - [C++] Require c-ares CMake config
-* [ARROW-5828](https://issues.apache.org/jira/browse/ARROW-5828) - [C++] Add Protocol Buffers version check
-* [ARROW-5830](https://issues.apache.org/jira/browse/ARROW-5830) - [C++] Stop using memcmp in TensorEquals
-* [ARROW-5832](https://issues.apache.org/jira/browse/ARROW-5832) - [Java] Support search operations for vector data
-* [ARROW-5833](https://issues.apache.org/jira/browse/ARROW-5833) - [C++] Factor out status copying code from cast.cc
-* [ARROW-5834](https://issues.apache.org/jira/browse/ARROW-5834) - [Java] Apply new hash map in DictionaryEncoder
-* [ARROW-5835](https://issues.apache.org/jira/browse/ARROW-5835) - [Java] Support Dictionary Encoding for binary type
-* [ARROW-5841](https://issues.apache.org/jira/browse/ARROW-5841) - [Website] Add 0.14.0 release note
-* [ARROW-5842](https://issues.apache.org/jira/browse/ARROW-5842) - [Java] Revise the semantic of lastSet in ListVector
-* [ARROW-5843](https://issues.apache.org/jira/browse/ARROW-5843) - [Java] Improve the readability and performance of BitVectorHelper\#getNullCount
-* [ARROW-5844](https://issues.apache.org/jira/browse/ARROW-5844) - [Java] Support comparison & sort for more numeric types
-* [ARROW-5846](https://issues.apache.org/jira/browse/ARROW-5846) - [Java] Create Avro adapter module and add dependencies
-* [ARROW-5853](https://issues.apache.org/jira/browse/ARROW-5853) - [Python] Expose boolean filter kernel on Array
-* [ARROW-5861](https://issues.apache.org/jira/browse/ARROW-5861) - [Java] Initial implement to convert Avro record with primitive types
-* [ARROW-5862](https://issues.apache.org/jira/browse/ARROW-5862) - [Java] Provide dictionary builder
-* [ARROW-5864](https://issues.apache.org/jira/browse/ARROW-5864) - [Python] simplify cython wrapping of Result
-* [ARROW-5865](https://issues.apache.org/jira/browse/ARROW-5865) - [Release] Helper script for rebasing open pull requests on master
-* [ARROW-5866](https://issues.apache.org/jira/browse/ARROW-5866) - [C++] Remove duplicate library in cpp/Brewfile
-* [ARROW-5867](https://issues.apache.org/jira/browse/ARROW-5867) - [C++][Gandiva] Add support for cast int to decimal
-* [ARROW-5872](https://issues.apache.org/jira/browse/ARROW-5872) - Support mod(double, double) method in Gandiva
-* [ARROW-5876](https://issues.apache.org/jira/browse/ARROW-5876) - [FlightRPC] Implement basic auth across all languages
-* [ARROW-5877](https://issues.apache.org/jira/browse/ARROW-5877) - [FlightRPC] Fix auth incompatibilities between Python/Java
-* [ARROW-5880](https://issues.apache.org/jira/browse/ARROW-5880) - [C++] Update arrow parquet writer to use TypedBufferBuilder
-* [ARROW-5881](https://issues.apache.org/jira/browse/ARROW-5881) - [Java] Provide functionality to efficiently determine whether a validity buffer is all 1 bits or all 0 bits
-* [ARROW-5883](https://issues.apache.org/jira/browse/ARROW-5883) - [Java] Support dictionary encoding for List and Struct type
-* [ARROW-5888](https://issues.apache.org/jira/browse/ARROW-5888) - [Python][C++] Add metadata to store Arrow time zones in Parquet file metadata
-* [ARROW-5891](https://issues.apache.org/jira/browse/ARROW-5891) - [C++][Gandiva] Remove duplicates in function registries
-* [ARROW-5892](https://issues.apache.org/jira/browse/ARROW-5892) - [C++][Gandiva] Support function aliases
-* [ARROW-5893](https://issues.apache.org/jira/browse/ARROW-5893) - [C++] Remove arrow::Column class from C++ library
-* [ARROW-5897](https://issues.apache.org/jira/browse/ARROW-5897) - [Java] Remove duplicated logic in MapVector
-* [ARROW-5898](https://issues.apache.org/jira/browse/ARROW-5898) - [Java] Provide functionality to efficiently compute the hash code for an arbitrary memory segment
-* [ARROW-5900](https://issues.apache.org/jira/browse/ARROW-5900) - [Gandiva] [Java] Decimal precision, scale bounds check
-* [ARROW-5901](https://issues.apache.org/jira/browse/ARROW-5901) - [Rust] Implement PartialEq to compare array and json values
-* [ARROW-5902](https://issues.apache.org/jira/browse/ARROW-5902) - [Java] Implement hash table and equals & hashCode API for dictionary encoding
-* [ARROW-5903](https://issues.apache.org/jira/browse/ARROW-5903) - [Java] Set methods in DecimalVector are slow
-* [ARROW-5904](https://issues.apache.org/jira/browse/ARROW-5904) - [Java] [Plasma] Fix compilation of Plasma Java client
-* [ARROW-5906](https://issues.apache.org/jira/browse/ARROW-5906) - [CI] Set -DARROW\_VERBOSE\_THIRDPARTY\_BUILD=OFF in builds running in Travis CI, maybe all docker-compose builds by default
-* [ARROW-5908](https://issues.apache.org/jira/browse/ARROW-5908) - [C\#] ArrowStreamWriter doesn't align buffers to 8 bytes
-* [ARROW-5909](https://issues.apache.org/jira/browse/ARROW-5909) - [Java] Optimize ByteFunctionHelpers equals & compare logic
-* [ARROW-5911](https://issues.apache.org/jira/browse/ARROW-5911) - [Java] Make ListVector and MapVector create reader lazily
-* [ARROW-5917](https://issues.apache.org/jira/browse/ARROW-5917) - [Java] Redesign the dictionary encoder
-* [ARROW-5918](https://issues.apache.org/jira/browse/ARROW-5918) - [Java] Add get to BaseIntVector interface
-* [ARROW-5919](https://issues.apache.org/jira/browse/ARROW-5919) - [R] Add nightly tests for building r-arrow with dependencies from conda-forge
-* [ARROW-5920](https://issues.apache.org/jira/browse/ARROW-5920) - [Java] Support sort & compare for all variable width vectors
-* [ARROW-5924](https://issues.apache.org/jira/browse/ARROW-5924) - [C++][Plasma] It is not convenient to release a GPU object
-* [ARROW-5934](https://issues.apache.org/jira/browse/ARROW-5934) - [Python] Bundle arrow's LICENSE with the wheels
-* [ARROW-5937](https://issues.apache.org/jira/browse/ARROW-5937) - [Release] Stop parallel binary upload
-* [ARROW-5938](https://issues.apache.org/jira/browse/ARROW-5938) - [Release] Create branch for adding release note automatically
-* [ARROW-5939](https://issues.apache.org/jira/browse/ARROW-5939) - [Release] Add support for generating vote email template separately
-* [ARROW-5940](https://issues.apache.org/jira/browse/ARROW-5940) - [Release] Add support for re-uploading sign/checksum for binary artifacts
-* [ARROW-5941](https://issues.apache.org/jira/browse/ARROW-5941) - [Release] Avoid re-uploading already uploaded binary artifacts
-* [ARROW-5943](https://issues.apache.org/jira/browse/ARROW-5943) - [GLib][Gandiva] Add support for function aliases
-* [ARROW-5944](https://issues.apache.org/jira/browse/ARROW-5944) - [C++][Gandiva] Remove 'div' alias for 'divide'
-* [ARROW-5945](https://issues.apache.org/jira/browse/ARROW-5945) - [Rust] [DataFusion] Table trait should support building complete queries
-* [ARROW-5947](https://issues.apache.org/jira/browse/ARROW-5947) - [Rust] [DataFusion] Remove serde\_json dependency
-* [ARROW-5948](https://issues.apache.org/jira/browse/ARROW-5948) - [Rust] [DataFusion] create\_logical\_plan should not call optimizer
-* [ARROW-5955](https://issues.apache.org/jira/browse/ARROW-5955) - [Plasma] Support setting memory quotas per plasma client for better isolation
-* [ARROW-5957](https://issues.apache.org/jira/browse/ARROW-5957) - [C++][Gandiva] Implement div function in Gandiva
-* [ARROW-5958](https://issues.apache.org/jira/browse/ARROW-5958) - [Python] Link zlib statically in the wheels
-* [ARROW-5961](https://issues.apache.org/jira/browse/ARROW-5961) - [R] Be able to run R-only tests even without C++ library
-* [ARROW-5962](https://issues.apache.org/jira/browse/ARROW-5962) - [CI][Python] Do not test manylinux1 wheels in Travis CI
-* [ARROW-5967](https://issues.apache.org/jira/browse/ARROW-5967) - [Java] DateUtility\#timeZoneList is not correct
-* [ARROW-5970](https://issues.apache.org/jira/browse/ARROW-5970) - [Java] Provide pointer to Arrow buffer
-* [ARROW-5974](https://issues.apache.org/jira/browse/ARROW-5974) - [Python][C++] Enable CSV reader to read from concatenated gzip stream
-* [ARROW-5975](https://issues.apache.org/jira/browse/ARROW-5975) - [C++][Gandiva] Add method to cast Date (in milliseconds) to timestamp
-* [ARROW-5976](https://issues.apache.org/jira/browse/ARROW-5976) - [C++] RETURN\_IF\_ERROR(ctx) should be namespaced
-* [ARROW-5977](https://issues.apache.org/jira/browse/ARROW-5977) - [C++] [Python] Method for read\_csv to limit which columns are read?
-* [ARROW-5979](https://issues.apache.org/jira/browse/ARROW-5979) - [FlightRPC] Expose (de)serialization of protocol types
-* [ARROW-5985](https://issues.apache.org/jira/browse/ARROW-5985) - [Developer] Do not suggest setting Fix Version for point releases in dev/merge\_arrow\_pr.py
-* [ARROW-5986](https://issues.apache.org/jira/browse/ARROW-5986) - [Java] Code cleanup for dictionary encoding
-* [ARROW-5988](https://issues.apache.org/jira/browse/ARROW-5988) - [Java] Avro adapter implement simple Record type
-* [ARROW-5997](https://issues.apache.org/jira/browse/ARROW-5997) - [Java] Support dictionary encoding for Union type
-* [ARROW-5998](https://issues.apache.org/jira/browse/ARROW-5998) - [Java] Open a document to track the API changes
-* [ARROW-6000](https://issues.apache.org/jira/browse/ARROW-6000) - [Python] Expose LargeBinaryType and LargeStringType
-* [ARROW-6008](https://issues.apache.org/jira/browse/ARROW-6008) - [Release] Don't parallelize the bintray upload script
-* [ARROW-6009](https://issues.apache.org/jira/browse/ARROW-6009) - [Release][JS] Ignore NPM errors in the javascript release script
-* [ARROW-6013](https://issues.apache.org/jira/browse/ARROW-6013) - [Java] Support range searcher
-* [ARROW-6017](https://issues.apache.org/jira/browse/ARROW-6017) - [FlightRPC] Allow creating Locations with unknown schemes
-* [ARROW-6020](https://issues.apache.org/jira/browse/ARROW-6020) - [Java] Refactor ByteFunctionHelper\#hash with new added ArrowBufHasher
-* [ARROW-6021](https://issues.apache.org/jira/browse/ARROW-6021) - [Java] Extract copyFrom and copyFromSafe methods to ValueVector interface
-* [ARROW-6022](https://issues.apache.org/jira/browse/ARROW-6022) - [Java] Support an equals API in ValueVector to compare two vectors for equality
-* [ARROW-6023](https://issues.apache.org/jira/browse/ARROW-6023) - [C++][Gandiva] Add functions in Gandiva
-* [ARROW-6024](https://issues.apache.org/jira/browse/ARROW-6024) - [Java] Provide more hash algorithms
-* [ARROW-6026](https://issues.apache.org/jira/browse/ARROW-6026) - [Doc] Add CONTRIBUTING.md
-* [ARROW-6030](https://issues.apache.org/jira/browse/ARROW-6030) - [Java] Efficiently compute hash code for ArrowBufPointer
-* [ARROW-6031](https://issues.apache.org/jira/browse/ARROW-6031) - [Java] Support iterating a vector by ArrowBufPointer
-* [ARROW-6034](https://issues.apache.org/jira/browse/ARROW-6034) - [C++][Gandiva] Add string functions in Gandiva
-* [ARROW-6035](https://issues.apache.org/jira/browse/ARROW-6035) - [Java] Avro adapter support convert nullable value
-* [ARROW-6036](https://issues.apache.org/jira/browse/ARROW-6036) - [GLib] Add support for skip rows and column\_names CSV read option
-* [ARROW-6037](https://issues.apache.org/jira/browse/ARROW-6037) - [GLib] Add a missing version macro
-* [ARROW-6039](https://issues.apache.org/jira/browse/ARROW-6039) - [GLib] Add garrow\_array\_filter()
-* [ARROW-6041](https://issues.apache.org/jira/browse/ARROW-6041) - [Website] Blog post announcing R package release
-* [ARROW-6042](https://issues.apache.org/jira/browse/ARROW-6042) - [C++] Implement alternative DictionaryBuilder that always yields int32 indices
-* [ARROW-6045](https://issues.apache.org/jira/browse/ARROW-6045) - [C++] Benchmark for Parquet float and NaN encoding/decoding
-* [ARROW-6048](https://issues.apache.org/jira/browse/ARROW-6048) - [C++] Add ChunkedArray::View which calls to Array::View
-* [ARROW-6049](https://issues.apache.org/jira/browse/ARROW-6049) - [C++] Support using Array::View from compatible dictionary type to another
-* [ARROW-6053](https://issues.apache.org/jira/browse/ARROW-6053) - [Python] RecordBatchStreamReader::Open2 cdef type signature doesn't match C++
-* [ARROW-6063](https://issues.apache.org/jira/browse/ARROW-6063) - [FlightRPC] Implement "half-closed" semantics for DoPut
-* [ARROW-6065](https://issues.apache.org/jira/browse/ARROW-6065) - [C++] Reorganize parquet/arrow/reader.cc, remove code duplication, improve readability
-* [ARROW-6069](https://issues.apache.org/jira/browse/ARROW-6069) - [Rust] [Parquet] Implement Converter to convert record reader to arrow primitive array.
-* [ARROW-6070](https://issues.apache.org/jira/browse/ARROW-6070) - [Java] Avoid creating new schema before IPC sending
-* [ARROW-6077](https://issues.apache.org/jira/browse/ARROW-6077) - [C++][Parquet] Build logical schema tree mapping Arrow fields to Parquet schema levels
-* [ARROW-6078](https://issues.apache.org/jira/browse/ARROW-6078) - [Java] Implement dictionary-encoded subfields for List type
-* [ARROW-6079](https://issues.apache.org/jira/browse/ARROW-6079) - [Java] Implement/test UnionFixedSizeListWriter for FixedSizeListVector
-* [ARROW-6080](https://issues.apache.org/jira/browse/ARROW-6080) - [Java] Support compare and search operation for BaseRepeatedValueVector
-* [ARROW-6083](https://issues.apache.org/jira/browse/ARROW-6083) - [Java] Refactor Jdbc adapter consume logic
-* [ARROW-6084](https://issues.apache.org/jira/browse/ARROW-6084) - [Python] Support LargeList
-* [ARROW-6085](https://issues.apache.org/jira/browse/ARROW-6085) - [Rust] [DataFusion] Create traits for physical query plan
-* [ARROW-6086](https://issues.apache.org/jira/browse/ARROW-6086) - [Rust] [DataFusion] Implement parallel execution for parquet scan
-* [ARROW-6087](https://issues.apache.org/jira/browse/ARROW-6087) - [Rust] [DataFusion] Implement parallel execution for CSV scan
-* [ARROW-6088](https://issues.apache.org/jira/browse/ARROW-6088) - [Rust] [DataFusion] Implement parallel execution for projection
-* [ARROW-6089](https://issues.apache.org/jira/browse/ARROW-6089) - [Rust] [DataFusion] Implement parallel execution for selection
-* [ARROW-6090](https://issues.apache.org/jira/browse/ARROW-6090) - [Rust] [DataFusion] Implement parallel execution for hash aggregate
-* [ARROW-6093](https://issues.apache.org/jira/browse/ARROW-6093) - [Java] reduce branches in algo for first match in VectorRangeSearcher
-* [ARROW-6094](https://issues.apache.org/jira/browse/ARROW-6094) - [Format][Flight] Add GetFlightSchema to Flight RPC
-* [ARROW-6096](https://issues.apache.org/jira/browse/ARROW-6096) - [C++] Conditionally depend on boost regex library
-* [ARROW-6097](https://issues.apache.org/jira/browse/ARROW-6097) - [Java] Avro adapter implement unions type
-* [ARROW-6100](https://issues.apache.org/jira/browse/ARROW-6100) - [Rust] Pin to specific Rust nightly release
-* [ARROW-6101](https://issues.apache.org/jira/browse/ARROW-6101) - [Rust] [DataFusion] Create physical plan from logical plan
-* [ARROW-6102](https://issues.apache.org/jira/browse/ARROW-6102) - [Testing] Add partitioned CSV file to arrow-testing repo
-* [ARROW-6104](https://issues.apache.org/jira/browse/ARROW-6104) - [Rust] [DataFusion] Don't allow bare\_trait\_objects
-* [ARROW-6105](https://issues.apache.org/jira/browse/ARROW-6105) - [C++][Parquet][Python] Add test case showing dictionary-encoded subfields in nested type
-* [ARROW-6113](https://issues.apache.org/jira/browse/ARROW-6113) - [Java] Support vector deduplicate function
-* [ARROW-6115](https://issues.apache.org/jira/browse/ARROW-6115) - [Python] support LargeList, LargeString, LargeBinary in conversion to pandas
-* [ARROW-6118](https://issues.apache.org/jira/browse/ARROW-6118) - [Java] Replace google Preconditions with Arrow Preconditions
-* [ARROW-6121](https://issues.apache.org/jira/browse/ARROW-6121) - [Tools] Improve merge tool CLI ergonomics
-* [ARROW-6125](https://issues.apache.org/jira/browse/ARROW-6125) - [Python] Remove any APIs deprecated prior to 0.14.x
-* [ARROW-6127](https://issues.apache.org/jira/browse/ARROW-6127) - [Website] Add favicons and meta tags
-* [ARROW-6128](https://issues.apache.org/jira/browse/ARROW-6128) - [C++] Can't build with g++ 8.3.0 by class-memaccess warning
-* [ARROW-6130](https://issues.apache.org/jira/browse/ARROW-6130) - [Release] Use 0.15.0 as the next release
-* [ARROW-6134](https://issues.apache.org/jira/browse/ARROW-6134) - [C++][Gandiva] Add concat function in Gandiva
-* [ARROW-6137](https://issues.apache.org/jira/browse/ARROW-6137) - [C++][Gandiva] Change output format of castVARCHAR(timestamp) in Gandiva
-* [ARROW-6138](https://issues.apache.org/jira/browse/ARROW-6138) - [C++] Add a basic (single RecordBatch) implementation of Dataset
-* [ARROW-6139](https://issues.apache.org/jira/browse/ARROW-6139) - [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
-* [ARROW-6141](https://issues.apache.org/jira/browse/ARROW-6141) - [C++] Enable memory-mapping a file region that is offset from the beginning of the file
-* [ARROW-6142](https://issues.apache.org/jira/browse/ARROW-6142) - [R] Install instructions on linux could be clearer
-* [ARROW-6143](https://issues.apache.org/jira/browse/ARROW-6143) - [Java] Unify the copyFrom and copyFromSafe methods for all vectors
-* [ARROW-6144](https://issues.apache.org/jira/browse/ARROW-6144) - [C++][Gandiva] Implement random function in Gandiva
-* [ARROW-6155](https://issues.apache.org/jira/browse/ARROW-6155) - [Java] Extract a super interface for vectors whose elements reside in contiguous memory segments
-* [ARROW-6156](https://issues.apache.org/jira/browse/ARROW-6156) - [Java] Support compare semantics for ArrowBufPointer
-* [ARROW-6161](https://issues.apache.org/jira/browse/ARROW-6161) - [C++] Implements dataset::ParquetFile and associated Scan structures
-* [ARROW-6162](https://issues.apache.org/jira/browse/ARROW-6162) - [C++][Gandiva] Do not truncate string in castVARCHAR\_varchar when out\_len parameter is zero
-* [ARROW-6164](https://issues.apache.org/jira/browse/ARROW-6164) - [Docs][Format] Document project versioning schema and forward/backward compatibility policies
-* [ARROW-6172](https://issues.apache.org/jira/browse/ARROW-6172) - [Java] Provide benchmarks to set IntVector with different methods
-* [ARROW-6177](https://issues.apache.org/jira/browse/ARROW-6177) - [C++] Add Array::Validate()
-* [ARROW-6180](https://issues.apache.org/jira/browse/ARROW-6180) - [C++] Create InputStream that is an isolated reader of a segment of a RandomAccessFile
-* [ARROW-6181](https://issues.apache.org/jira/browse/ARROW-6181) - [R] Only allow R package to install without libarrow on linux
-* [ARROW-6183](https://issues.apache.org/jira/browse/ARROW-6183) - [R] Document that you don't have to use tidyselect if you don't want to
-* [ARROW-6185](https://issues.apache.org/jira/browse/ARROW-6185) - [Java] Provide hash table based dictionary builder
-* [ARROW-6187](https://issues.apache.org/jira/browse/ARROW-6187) - [C++] fallback to storage type when writing ExtensionType to Parquet
-* [ARROW-6188](https://issues.apache.org/jira/browse/ARROW-6188) - [GLib] Add garrow\_array\_is\_in()
-* [ARROW-6192](https://issues.apache.org/jira/browse/ARROW-6192) - [GLib] Use the same SO version as C++
-* [ARROW-6194](https://issues.apache.org/jira/browse/ARROW-6194) - [Java] Add non-static approach in DictionaryEncoder making it easy to extend and reuse
-* [ARROW-6196](https://issues.apache.org/jira/browse/ARROW-6196) - [Ruby] Add support for building Arrow::TimeNNArray by .new
-* [ARROW-6197](https://issues.apache.org/jira/browse/ARROW-6197) - [GLib] Add garrow\_decimal128\_rescale()
-* [ARROW-6199](https://issues.apache.org/jira/browse/ARROW-6199) - [Java] Avro adapter avoid potential resource leak.
-* [ARROW-6203](https://issues.apache.org/jira/browse/ARROW-6203) - [GLib] Add garrow\_array\_sort\_to\_indices()
-* [ARROW-6204](https://issues.apache.org/jira/browse/ARROW-6204) - [GLib] Add garrow\_array\_is\_in\_chunked\_array()
-* [ARROW-6206](https://issues.apache.org/jira/browse/ARROW-6206) - [Java][Docs] Document environment variables/java properties
-* [ARROW-6209](https://issues.apache.org/jira/browse/ARROW-6209) - [Java] Extract set null method to the base class for fixed width vectors
-* [ARROW-6212](https://issues.apache.org/jira/browse/ARROW-6212) - [Java] Support vector rank operation
-* [ARROW-6216](https://issues.apache.org/jira/browse/ARROW-6216) - [C++] Allow user to select the compression level
-* [ARROW-6217](https://issues.apache.org/jira/browse/ARROW-6217) - [Website] Remove needless \_site/ directory
-* [ARROW-6219](https://issues.apache.org/jira/browse/ARROW-6219) - [Java] Add API for JDBC adapter that can convert less than the full result set at a time.
-* [ARROW-6220](https://issues.apache.org/jira/browse/ARROW-6220) - [Java] Add API to avro adapter to limit number of rows returned at a time.
-* [ARROW-6225](https://issues.apache.org/jira/browse/ARROW-6225) - [Website] Update arrow-site/README and any other places to point website contributors in the right direction
-* [ARROW-6229](https://issues.apache.org/jira/browse/ARROW-6229) - [C++] Add a DataSource implementation which scans a directory
-* [ARROW-6230](https://issues.apache.org/jira/browse/ARROW-6230) - [R] Reading in Parquet files is 20x slower than reading fst files in R
-* [ARROW-6231](https://issues.apache.org/jira/browse/ARROW-6231) - [C++][Python] Consider assigning default column names when reading CSV file and header\_rows=0
-* [ARROW-6232](https://issues.apache.org/jira/browse/ARROW-6232) - [C++] Rename Argsort kernel to SortToIndices
-* [ARROW-6237](https://issues.apache.org/jira/browse/ARROW-6237) - [R] Add option to set CXXFLAGS when compiling R package with $ARROW\_R\_CXXFLAGS
-* [ARROW-6238](https://issues.apache.org/jira/browse/ARROW-6238) - [C++] Implement SimpleDataSource/SimpleDataFragment
-* [ARROW-6240](https://issues.apache.org/jira/browse/ARROW-6240) - [Ruby] Arrow::Decimal128Array returns BigDecimal
-* [ARROW-6242](https://issues.apache.org/jira/browse/ARROW-6242) - [C++] Implements basic Dataset/Scanner/ScannerBuilder
-* [ARROW-6243](https://issues.apache.org/jira/browse/ARROW-6243) - [C++] Implement basic Filter expression classes
-* [ARROW-6244](https://issues.apache.org/jira/browse/ARROW-6244) - [C++] Implement Partition DataSource
-* [ARROW-6246](https://issues.apache.org/jira/browse/ARROW-6246) - [Website] Add link to R documentation site
-* [ARROW-6247](https://issues.apache.org/jira/browse/ARROW-6247) - [Java] Provide a common interface for float4 and float8 vectors
-* [ARROW-6249](https://issues.apache.org/jira/browse/ARROW-6249) - [Java] Remove useless class ByteArrayWrapper
-* [ARROW-6250](https://issues.apache.org/jira/browse/ARROW-6250) - [Java] Implement ApproxEqualsVisitor comparing approx for floating point
-* [ARROW-6252](https://issues.apache.org/jira/browse/ARROW-6252) - [Python] Add pyarrow.Array.diff method that exposes arrow::Diff
-* [ARROW-6253](https://issues.apache.org/jira/browse/ARROW-6253) - [Python] Expose "enable\_buffered\_stream" option from parquet::ReaderProperties in pyarrow.parquet.read\_table
-* [ARROW-6258](https://issues.apache.org/jira/browse/ARROW-6258) - [R] Add macOS build scripts
-* [ARROW-6260](https://issues.apache.org/jira/browse/ARROW-6260) - [Website] Use deploy key on Travis to build and push to asf-site
-* [ARROW-6262](https://issues.apache.org/jira/browse/ARROW-6262) - [Developer] Show JIRA issue before merging
-* [ARROW-6264](https://issues.apache.org/jira/browse/ARROW-6264) - [Java] There is no need to consider byte order in ArrowBufHasher
-* [ARROW-6265](https://issues.apache.org/jira/browse/ARROW-6265) - [Java] Avro adapter implement Array/Map/Fixed type
-* [ARROW-6267](https://issues.apache.org/jira/browse/ARROW-6267) - [Ruby] Add Arrow::Time for Arrow::Time{32,64}DataType value
-* [ARROW-6271](https://issues.apache.org/jira/browse/ARROW-6271) - [Rust] [DataFusion] Add example for running SQL against Parquet
-* [ARROW-6272](https://issues.apache.org/jira/browse/ARROW-6272) - [Rust] [DataFusion] Add register\_parquet convenience method to ExecutionContext
-* [ARROW-6278](https://issues.apache.org/jira/browse/ARROW-6278) - [R] Read parquet files from raw vector
-* [ARROW-6279](https://issues.apache.org/jira/browse/ARROW-6279) - [Python] Add Table.slice method or allow slices in \_\_getitem\_\_
-* [ARROW-6284](https://issues.apache.org/jira/browse/ARROW-6284) - [C++] Allow references in std::tuple when converting tuple to arrow array
-* [ARROW-6287](https://issues.apache.org/jira/browse/ARROW-6287) - [Rust] [DataFusion] Refactor TableProvider to return thread-safe BatchIterator
-* [ARROW-6288](https://issues.apache.org/jira/browse/ARROW-6288) - [Java] Implement TypeEqualsVisitor to compare vector types, considering names and metadata
-* [ARROW-6289](https://issues.apache.org/jira/browse/ARROW-6289) - [Java] Add empty() in UnionVector to create instance
-* [ARROW-6292](https://issues.apache.org/jira/browse/ARROW-6292) - [C++] Add an option to build with mimalloc
-* [ARROW-6294](https://issues.apache.org/jira/browse/ARROW-6294) - [C++] Use hyphen for plasma-store-server executable
-* [ARROW-6295](https://issues.apache.org/jira/browse/ARROW-6295) - [Rust][DataFusion] ExecutionError Cannot compare Float32 with Float64
-* [ARROW-6296](https://issues.apache.org/jira/browse/ARROW-6296) - [Java] Cleanup JDBC interfaces and eliminate one memcopy for binary/varchar fields
-* [ARROW-6297](https://issues.apache.org/jira/browse/ARROW-6297) - [Java] Compare ArrowBufPointers by unsigned integers
-* [ARROW-6300](https://issues.apache.org/jira/browse/ARROW-6300) - [C++] Add io::OutputStream::Abort()
-* [ARROW-6303](https://issues.apache.org/jira/browse/ARROW-6303) - [Rust] Add a feature to disable SIMD
-* [ARROW-6304](https://issues.apache.org/jira/browse/ARROW-6304) - [Java] Add description to each maven artifact
-* [ARROW-6306](https://issues.apache.org/jira/browse/ARROW-6306) - [Java] Support stable sort by stable comparators
-* [ARROW-6310](https://issues.apache.org/jira/browse/ARROW-6310) - [C++] Write 64-bit integers as strings in JSON integration test files
-* [ARROW-6311](https://issues.apache.org/jira/browse/ARROW-6311) - [Java] Make ApproxEqualsVisitor accept DiffFunction to make it more flexible
-* [ARROW-6313](https://issues.apache.org/jira/browse/ARROW-6313) - [Format] Tracking for ensuring flatbuffer-serialized values are aligned in streams/files.
-* [ARROW-6314](https://issues.apache.org/jira/browse/ARROW-6314) - [C++] Implement changes to ensure flatbuffer alignment.
-* [ARROW-6315](https://issues.apache.org/jira/browse/ARROW-6315) - [Java] Make change to ensure flatbuffer reads are aligned
-* [ARROW-6316](https://issues.apache.org/jira/browse/ARROW-6316) - [Go] Make change to ensure flatbuffer reads are aligned
-* [ARROW-6317](https://issues.apache.org/jira/browse/ARROW-6317) - [JS] Implement changes to ensure flatbuffer alignment
-* [ARROW-6318](https://issues.apache.org/jira/browse/ARROW-6318) - [Integration] Update integration test to use generated binaries to ensure backwards compatibility
-* [ARROW-6319](https://issues.apache.org/jira/browse/ARROW-6319) - [C++] Extract the core of NumericTensor<T\>::Value as Tensor::Value<T\>
-* [ARROW-6326](https://issues.apache.org/jira/browse/ARROW-6326) - [C++] Nullable fields when converting std::tuple to Table
-* [ARROW-6328](https://issues.apache.org/jira/browse/ARROW-6328) - Click options should have help text
-* [ARROW-6329](https://issues.apache.org/jira/browse/ARROW-6329) - [Format] Add 4-byte "stream continuation" to IPC message format to align Flatbuffers
-* [ARROW-6331](https://issues.apache.org/jira/browse/ARROW-6331) - [Java] Incorporate ErrorProne into the java build
-* [ARROW-6334](https://issues.apache.org/jira/browse/ARROW-6334) - [Java] Improve the dictionary builder API to return the position of the value in the dictionary
-* [ARROW-6335](https://issues.apache.org/jira/browse/ARROW-6335) - [Java] Improve the performance of DictionaryHashTable
-* [ARROW-6336](https://issues.apache.org/jira/browse/ARROW-6336) - [Python] Clarify pyarrow.serialize/deserialize docstrings vis-à-vis their relationship with the Arrow IPC protocol
-* [ARROW-6337](https://issues.apache.org/jira/browse/ARROW-6337) - [R] as\_tibble in R API is a misnomer
-* [ARROW-6338](https://issues.apache.org/jira/browse/ARROW-6338) - [R] Type function names don't match type names
-* [ARROW-6342](https://issues.apache.org/jira/browse/ARROW-6342) - [Python] Add pyarrow.record\_batch factory function with same basic API / semantics as pyarrow.table
-* [ARROW-6346](https://issues.apache.org/jira/browse/ARROW-6346) - [GLib] Add garrow\_array\_view()
-* [ARROW-6347](https://issues.apache.org/jira/browse/ARROW-6347) - [GLib] Add garrow\_array\_diff\_unified()
-* [ARROW-6350](https://issues.apache.org/jira/browse/ARROW-6350) - [Ruby] Remove Arrow::Struct and use Hash instead
-* [ARROW-6351](https://issues.apache.org/jira/browse/ARROW-6351) - [Ruby] Improve Arrow\#values performance
-* [ARROW-6353](https://issues.apache.org/jira/browse/ARROW-6353) - [Python] Allow user to select compression level in pyarrow.parquet.write\_table
-* [ARROW-6355](https://issues.apache.org/jira/browse/ARROW-6355) - [Java] Make range equal visitor reusable
-* [ARROW-6356](https://issues.apache.org/jira/browse/ARROW-6356) - [Java] Avro adapter implement Enum type and nested Record type
-* [ARROW-6357](https://issues.apache.org/jira/browse/ARROW-6357) - [C++] S3: allow for background writes
-* [ARROW-6358](https://issues.apache.org/jira/browse/ARROW-6358) - [C++] FileSystem::DeleteDir should make it optional to delete the directory itself
-* [ARROW-6360](https://issues.apache.org/jira/browse/ARROW-6360) - [R] Update support for compression
-* [ARROW-6362](https://issues.apache.org/jira/browse/ARROW-6362) - [C++] S3: more flexible credential options
-* [ARROW-6365](https://issues.apache.org/jira/browse/ARROW-6365) - [R] Should be able to coerce numeric to integer with schema
-* [ARROW-6366](https://issues.apache.org/jira/browse/ARROW-6366) - [Java] Make field vectors final explicitly
-* [ARROW-6368](https://issues.apache.org/jira/browse/ARROW-6368) - [C++] Add RecordBatch projection functionality
-* [ARROW-6373](https://issues.apache.org/jira/browse/ARROW-6373) - [C++] Make FixedWidthBinaryBuilder consistent with other primitive fixed width builders
-* [ARROW-6375](https://issues.apache.org/jira/browse/ARROW-6375) - [C++] Extend ConversionTraits to allow efficiently appending list values in STL API
-* [ARROW-6379](https://issues.apache.org/jira/browse/ARROW-6379) - [C++] Do not append any buffers when serializing NullType for IPC
-* [ARROW-6381](https://issues.apache.org/jira/browse/ARROW-6381) - [C++] BufferOutputStream::Write is slow for many small writes
-* [ARROW-6383](https://issues.apache.org/jira/browse/ARROW-6383) - [Java] report outstanding child allocators on parent allocator close
-* [ARROW-6384](https://issues.apache.org/jira/browse/ARROW-6384) - [C++] Bump dependencies
-* [ARROW-6385](https://issues.apache.org/jira/browse/ARROW-6385) - [C++] Investigate xxh3
-* [ARROW-6391](https://issues.apache.org/jira/browse/ARROW-6391) - [Python][Flight] Add built-in methods on FlightServerBase to start server and wait for it to be available
-* [ARROW-6397](https://issues.apache.org/jira/browse/ARROW-6397) - [C++][CI] Fix S3 minio failure
-* [ARROW-6401](https://issues.apache.org/jira/browse/ARROW-6401) - [Java] Implement dictionary-encoded subfields for Struct type
-* [ARROW-6402](https://issues.apache.org/jira/browse/ARROW-6402) - [C++] Suppress sign-compare warning with g++ 9.2.1
-* [ARROW-6403](https://issues.apache.org/jira/browse/ARROW-6403) - [Python] Expose FileReader::ReadRowGroups() to Python
-* [ARROW-6408](https://issues.apache.org/jira/browse/ARROW-6408) - [Rust] Use "if cfg!" pattern in SIMD kernel implementations
-* [ARROW-6413](https://issues.apache.org/jira/browse/ARROW-6413) - [R] Support autogenerating column names
-* [ARROW-6415](https://issues.apache.org/jira/browse/ARROW-6415) - [R] Remove usage of R CMD config CXXCPP
-* [ARROW-6416](https://issues.apache.org/jira/browse/ARROW-6416) - [Python] Confusing API & documentation regarding chunksizes
-* [ARROW-6417](https://issues.apache.org/jira/browse/ARROW-6417) - [C++][Parquet] Non-dictionary BinaryArray reads from Parquet format have slowed down since 0.11.x
-* [ARROW-6419](https://issues.apache.org/jira/browse/ARROW-6419) - [Website] Blog post about Parquet dictionary performance work coming in 0.15.x release
-* [ARROW-6422](https://issues.apache.org/jira/browse/ARROW-6422) - [Gandiva] Fix double-conversion linker issue
-* [ARROW-6426](https://issues.apache.org/jira/browse/ARROW-6426) - [FlightRPC] Expose gRPC configuration knobs in Flight
-* [ARROW-6427](https://issues.apache.org/jira/browse/ARROW-6427) - [GLib] Add support for column names autogeneration CSV read option
-* [ARROW-6438](https://issues.apache.org/jira/browse/ARROW-6438) - [R] Add bindings for filesystem API
-* [ARROW-6447](https://issues.apache.org/jira/browse/ARROW-6447) - [C++] Builds with ARROW\_JEMALLOC=ON wait until jemalloc\_ep is complete before building any libarrow .cc files
-* [ARROW-6450](https://issues.apache.org/jira/browse/ARROW-6450) - [C++] Use 2x reallocation strategy in arrow::BufferBuilder instead of 1.5x
-* [ARROW-6451](https://issues.apache.org/jira/browse/ARROW-6451) - [Format] Add clarifications to Columnar.rst about the contents of "null" slots in Varbinary or List arrays
-* [ARROW-6453](https://issues.apache.org/jira/browse/ARROW-6453) - [C++] More informative error messages from S3
-* [ARROW-6454](https://issues.apache.org/jira/browse/ARROW-6454) - [Developer] Add LLVM license to LICENSE.txt due to binary redistribution in packages
-* [ARROW-6458](https://issues.apache.org/jira/browse/ARROW-6458) - [Java] Remove value boxing/unboxing for ApproxEqualsVisitor
-* [ARROW-6460](https://issues.apache.org/jira/browse/ARROW-6460) - [Java] Add benchmark and large fake data UT for avro adapter
-* [ARROW-6462](https://issues.apache.org/jira/browse/ARROW-6462) - [C++] Can't build with bundled double-conversion on CentOS 6 x86\_64
-* [ARROW-6465](https://issues.apache.org/jira/browse/ARROW-6465) - [Python] Improve Windows build instructions
-* [ARROW-6474](https://issues.apache.org/jira/browse/ARROW-6474) - [Python] Provide mechanism for python to write out old format
-* [ARROW-6475](https://issues.apache.org/jira/browse/ARROW-6475) - [C++] Don't try to dictionary encode dictionary arrays
-* [ARROW-6477](https://issues.apache.org/jira/browse/ARROW-6477) - [Packaging][Crossbow] Use Azure Pipelines to build linux packages
-* [ARROW-6480](https://issues.apache.org/jira/browse/ARROW-6480) - [Developer] Add command to generate and send e-mail report for a Crossbow run
-* [ARROW-6484](https://issues.apache.org/jira/browse/ARROW-6484) - [Java] Enable creating indexType for DictionaryEncoding according to dictionary value count
-* [ARROW-6487](https://issues.apache.org/jira/browse/ARROW-6487) - [Rust] [DataFusion] Create test utils module
-* [ARROW-6489](https://issues.apache.org/jira/browse/ARROW-6489) - [Developer][Documentation] Fix merge script and readme
-* [ARROW-6490](https://issues.apache.org/jira/browse/ARROW-6490) - [Java] log error for leak in allocator close
-* [ARROW-6491](https://issues.apache.org/jira/browse/ARROW-6491) - [Java] fix master build failure caused by ErrorProne
-* [ARROW-6494](https://issues.apache.org/jira/browse/ARROW-6494) - [C++][Dataset] Implement basic PartitionScheme
-* [ARROW-6504](https://issues.apache.org/jira/browse/ARROW-6504) - [Python][Packaging] Add mimalloc to conda packages for better performance
-* [ARROW-6505](https://issues.apache.org/jira/browse/ARROW-6505) - [Website] Add new committers
-* [ARROW-6518](https://issues.apache.org/jira/browse/ARROW-6518) - [Packaging][Python] Flight failing in OSX Python wheel builds
-* [ARROW-6519](https://issues.apache.org/jira/browse/ARROW-6519) - [Java] Use IPC continuation token to mark EOS
-* [ARROW-6524](https://issues.apache.org/jira/browse/ARROW-6524) - [Developer][Packaging] Nightly build report's subject should contain Arrow
-* [ARROW-6525](https://issues.apache.org/jira/browse/ARROW-6525) - [C++] CloseFromDestructor() should perhaps not crash
-* [ARROW-6526](https://issues.apache.org/jira/browse/ARROW-6526) - [C++] Poison data in PoolBuffer destructor
-* [ARROW-6527](https://issues.apache.org/jira/browse/ARROW-6527) - [C++] Add OutputStream::Write() variant taking an owned buffer
-* [ARROW-6531](https://issues.apache.org/jira/browse/ARROW-6531) - [Python] Add detach() method to buffered streams
-* [ARROW-6532](https://issues.apache.org/jira/browse/ARROW-6532) - [R] Write parquet files with compression
-* [ARROW-6533](https://issues.apache.org/jira/browse/ARROW-6533) - [R] Compression codec should take a "level"
-* [ARROW-6534](https://issues.apache.org/jira/browse/ARROW-6534) - [Java] Fix typos and spelling
-* [ARROW-6539](https://issues.apache.org/jira/browse/ARROW-6539) - [R] Provide mechanism to write out old format
-* [ARROW-6540](https://issues.apache.org/jira/browse/ARROW-6540) - [R] Add Validate() methods
-* [ARROW-6541](https://issues.apache.org/jira/browse/ARROW-6541) - [Format][C++] Use two-part EOS and amend Format documentation
-* [ARROW-6542](https://issues.apache.org/jira/browse/ARROW-6542) - [R] Add View() method to array types
-* [ARROW-6544](https://issues.apache.org/jira/browse/ARROW-6544) - [R] Documentation/polishing for 0.15 release
-* [ARROW-6545](https://issues.apache.org/jira/browse/ARROW-6545) - [Go] Update Go IPC writer to use two-part EOS per mailing list discussion
-* [ARROW-6546](https://issues.apache.org/jira/browse/ARROW-6546) - [C++] Add missing FlatBuffers source dependency
-* [ARROW-6549](https://issues.apache.org/jira/browse/ARROW-6549) - [C++] Switch back to latest jemalloc 5.x
-* [ARROW-6556](https://issues.apache.org/jira/browse/ARROW-6556) - [Python] Prepare for pandas release without SparseDataFrame
-* [ARROW-6557](https://issues.apache.org/jira/browse/ARROW-6557) - [Python] Always return pandas.Series from Array/ChunkedArray.to\_pandas, propagate field names to Series from RecordBatch, Table
-* [ARROW-6558](https://issues.apache.org/jira/browse/ARROW-6558) - [C++] Refactor Iterator to a type erased handle
-* [ARROW-6559](https://issues.apache.org/jira/browse/ARROW-6559) - [Developer][C++] Add "archery" option to specify system toolchain for C++ builds
-* [ARROW-6563](https://issues.apache.org/jira/browse/ARROW-6563) - [Rust] [DataFusion] Create "merge" execution plan
-* [ARROW-6569](https://issues.apache.org/jira/browse/ARROW-6569) - [Website] Add support for auto deployment by GitHub Actions
-* [ARROW-6570](https://issues.apache.org/jira/browse/ARROW-6570) - [Python] Use MemoryPool to allocate memory for NumPy arrays in to\_pandas calls
-* [ARROW-6580](https://issues.apache.org/jira/browse/ARROW-6580) - [Java] Support comparison for unsigned integers
-* [ARROW-6584](https://issues.apache.org/jira/browse/ARROW-6584) - [Python][Wheel] Bundle zlib again with the windows wheels
-* [ARROW-6588](https://issues.apache.org/jira/browse/ARROW-6588) - [C++] Suppress class-memaccess warning with g++ 9.2.1
-* [ARROW-6589](https://issues.apache.org/jira/browse/ARROW-6589) - [C++] Support BinaryType in MakeArrayOfNull
-* [ARROW-6590](https://issues.apache.org/jira/browse/ARROW-6590) - [C++] Do not require ARROW\_JSON=ON when ARROW\_IPC=ON
-* [ARROW-6591](https://issues.apache.org/jira/browse/ARROW-6591) - [R] Ignore .Rhistory files in source control
-* [ARROW-6599](https://issues.apache.org/jira/browse/ARROW-6599) - [Rust] [DataFusion] Implement SUM aggregate expression
-* [ARROW-6601](https://issues.apache.org/jira/browse/ARROW-6601) - [Java] Improve JDBC adapter performance & add benchmark
-* [ARROW-6605](https://issues.apache.org/jira/browse/ARROW-6605) - [C++] Add recursion depth control to fs::Selector
-* [ARROW-6606](https://issues.apache.org/jira/browse/ARROW-6606) - [C++] Construct tree structure from std::vector<fs::FileStats\>
-* [ARROW-6609](https://issues.apache.org/jira/browse/ARROW-6609) - [C++] Add minimal build Dockerfile example
-* [ARROW-6613](https://issues.apache.org/jira/browse/ARROW-6613) - [C++] Remove dependency on boost::filesystem
-* [ARROW-6614](https://issues.apache.org/jira/browse/ARROW-6614) - [C++][Dataset] Implement FileSystemDataSourceDiscovery
-* [ARROW-6616](https://issues.apache.org/jira/browse/ARROW-6616) - [Website] Release announcement blog post for 0.15
-* [ARROW-6621](https://issues.apache.org/jira/browse/ARROW-6621) - [Rust][DataFusion] Examples for DataFusion are not executed in CI
-* [ARROW-6629](https://issues.apache.org/jira/browse/ARROW-6629) - [Doc][C++] Document the FileSystem API
-* [ARROW-6630](https://issues.apache.org/jira/browse/ARROW-6630) - [Doc][C++] Document the file readers (CSV, JSON, Parquet, etc.)
-* [ARROW-6644](https://issues.apache.org/jira/browse/ARROW-6644) - [JS] Amend NullType IPC protocol to append no buffers
-* [ARROW-6647](https://issues.apache.org/jira/browse/ARROW-6647) - [C++] Can't build with g++ 4.8.5 on CentOS 7 by member initializer for shared\_ptr
-* [ARROW-6648](https://issues.apache.org/jira/browse/ARROW-6648) - [Go] Expose the bitutil package
-* [ARROW-6649](https://issues.apache.org/jira/browse/ARROW-6649) - [R] print() methods for Table, RecordBatch, etc.
-* [ARROW-6653](https://issues.apache.org/jira/browse/ARROW-6653) - [Developer] Add support for auto JIRA link on pull request
-* [ARROW-6655](https://issues.apache.org/jira/browse/ARROW-6655) - [Python] Filesystem bindings for S3
-* [ARROW-6664](https://issues.apache.org/jira/browse/ARROW-6664) - [C++] Add option to build without SSE4.2
-* [ARROW-6665](https://issues.apache.org/jira/browse/ARROW-6665) - [Rust] [DataFusion] Implement numeric literal expressions
-* [ARROW-6667](https://issues.apache.org/jira/browse/ARROW-6667) - [Python] Avoid Reference Cycles in pyarrow.parquet
-* [ARROW-6668](https://issues.apache.org/jira/browse/ARROW-6668) - [Rust] [DataFusion] Implement CAST expression
-* [ARROW-6669](https://issues.apache.org/jira/browse/ARROW-6669) - [Rust] [DataFusion] Implement physical expression for binary expressions
-* [ARROW-6675](https://issues.apache.org/jira/browse/ARROW-6675) - [JS] Add scanReverse function to dataFrame and filteredDataframe
-* [ARROW-6683](https://issues.apache.org/jira/browse/ARROW-6683) - [Python] Add unit tests that validate cross-compatibility with pyarrow.parquet when fastparquet is installed
-* [ARROW-6725](https://issues.apache.org/jira/browse/ARROW-6725) - [CI] Disable 3rdparty fuzzit nightly builds
-* [ARROW-6735](https://issues.apache.org/jira/browse/ARROW-6735) - [C++] Suppress sign-compare warning with g++ 9.2.1
-* [ARROW-6752](https://issues.apache.org/jira/browse/ARROW-6752) - [Go] implement Stringer for Null array
-* [ARROW-6755](https://issues.apache.org/jira/browse/ARROW-6755) - [Release] Improvements to Windows release verification script
-* [ARROW-6771](https://issues.apache.org/jira/browse/ARROW-6771) - [Packaging][Python] Missing pytest dependency from conda and wheel builds
-* [PARQUET-1468](https://issues.apache.org/jira/browse/PARQUET-1468) - [C++] Consolidate RecordReader, ColumnReader code paths
-
-
-## Bug Fixes
-
-* [ARROW-1184](https://issues.apache.org/jira/browse/ARROW-1184) - [Java] Dictionary.equals is not working correctly
-* [ARROW-2041](https://issues.apache.org/jira/browse/ARROW-2041) - [Python] pyarrow.serialize has high overhead for list of NumPy arrays
-* [ARROW-2248](https://issues.apache.org/jira/browse/ARROW-2248) - [Python] Nightly or on-demand HDFS test builds
-* [ARROW-2317](https://issues.apache.org/jira/browse/ARROW-2317) - [Python] fix C linkage warning
-* [ARROW-2490](https://issues.apache.org/jira/browse/ARROW-2490) - [C++] input stream locking inconsistent
-* [ARROW-3176](https://issues.apache.org/jira/browse/ARROW-3176) - [Python] Overflow in Date32 column conversion to pandas
-* [ARROW-3203](https://issues.apache.org/jira/browse/ARROW-3203) - [C++] Build error on Debian Buster
-* [ARROW-3651](https://issues.apache.org/jira/browse/ARROW-3651) - [Python] Datetimes from non-DateTimeIndex cannot be deserialized
-* [ARROW-3652](https://issues.apache.org/jira/browse/ARROW-3652) - [Python] CategoricalIndex is lost after reading back
-* [ARROW-3762](https://issues.apache.org/jira/browse/ARROW-3762) - [C++] Parquet arrow::Table reads error when overflowing capacity of BinaryArray
-* [ARROW-3933](https://issues.apache.org/jira/browse/ARROW-3933) - [Python] Segfault reading Parquet files from GNOMAD
-* [ARROW-4187](https://issues.apache.org/jira/browse/ARROW-4187) - [C++] file-benchmark uses <poll.h\>
-* [ARROW-4746](https://issues.apache.org/jira/browse/ARROW-4746) - [C++/Python] PyDateTime\_Date wrongly cast to PyDateTime\_DateTime
-* [ARROW-4836](https://issues.apache.org/jira/browse/ARROW-4836) - [Python] "Cannot tell() a compressed stream" when using RecordBatchStreamWriter
-* [ARROW-4848](https://issues.apache.org/jira/browse/ARROW-4848) - [C++] Static libparquet not compiled with -DARROW\_STATIC on Windows
-* [ARROW-4880](https://issues.apache.org/jira/browse/ARROW-4880) - [Python] python/asv-build.sh is probably broken after CMake refactor
-* [ARROW-4883](https://issues.apache.org/jira/browse/ARROW-4883) - [Python] read\_csv() returns garbage if given file object in text mode
-* [ARROW-5028](https://issues.apache.org/jira/browse/ARROW-5028) - [Python][C++] Creating list<string\> with pyarrow.array can overflow child builder
-* [ARROW-5072](https://issues.apache.org/jira/browse/ARROW-5072) - [Python] write\_table fails silently on S3 errors
-* [ARROW-5085](https://issues.apache.org/jira/browse/ARROW-5085) - [Python/C++] Conversion of dict encoded null column fails in parquet writing when using RowGroups
-* [ARROW-5086](https://issues.apache.org/jira/browse/ARROW-5086) - [Python] Space leak in ParquetFile.read\_row\_group()
-* [ARROW-5089](https://issues.apache.org/jira/browse/ARROW-5089) - [C++/Python] Writing dictionary encoded columns to parquet is extremely slow when using chunk size
-* [ARROW-5103](https://issues.apache.org/jira/browse/ARROW-5103) - [Python] Segfault when using chunked\_array.to\_pandas on arrays of different types (edge case)
-* [ARROW-5125](https://issues.apache.org/jira/browse/ARROW-5125) - [Python] Cannot roundtrip extreme dates through pyarrow
-* [ARROW-5161](https://issues.apache.org/jira/browse/ARROW-5161) - [Python] Cannot convert struct type from Pandas object column
-* [ARROW-5220](https://issues.apache.org/jira/browse/ARROW-5220) - [Python] index / unknown columns in specified schema in Table.from\_pandas
-* [ARROW-5292](https://issues.apache.org/jira/browse/ARROW-5292) - [C++] Static libraries are built on AppVeyor
-* [ARROW-5300](https://issues.apache.org/jira/browse/ARROW-5300) - [C++] 0.13 FAILED to build with option -DARROW\_NO\_DEFAULT\_MEMORY\_POOL
-* [ARROW-5374](https://issues.apache.org/jira/browse/ARROW-5374) - [Python] Misleading error message when calling pyarrow.read\_record\_batch on a complete IPC stream
-* [ARROW-5414](https://issues.apache.org/jira/browse/ARROW-5414) - [C++] Using "Ninja" build system generator overrides default Release build type on Windows
-* [ARROW-5450](https://issues.apache.org/jira/browse/ARROW-5450) - [Python] TimestampArray.to\_pylist() fails with OverflowError: Python int too large to convert to C long
-* [ARROW-5471](https://issues.apache.org/jira/browse/ARROW-5471) - [C++][Gandiva] Array offset is ignored in Gandiva projector
-* [ARROW-5522](https://issues.apache.org/jira/browse/ARROW-5522) - [Packaging][Documentation] Comments out of date in python/manylinux1/build\_arrow.sh
-* [ARROW-5525](https://issues.apache.org/jira/browse/ARROW-5525) - [C++][CI] Enable continuous fuzzing
-* [ARROW-5560](https://issues.apache.org/jira/browse/ARROW-5560) - [C++][Plasma] Cannot create Plasma object after OutOfMemory error
-* [ARROW-5562](https://issues.apache.org/jira/browse/ARROW-5562) - [C++][Parquet] parquet writer does not handle negative zero correctly
-* [ARROW-5630](https://issues.apache.org/jira/browse/ARROW-5630) - [Python][Parquet] Table of nested arrays doesn't round trip
-* [ARROW-5638](https://issues.apache.org/jira/browse/ARROW-5638) - [C++] cmake fails to generate Xcode project when Gandiva JNI bindings are enabled
-* [ARROW-5651](https://issues.apache.org/jira/browse/ARROW-5651) - [Python] Incorrect conversion from strided Numpy array when other type is specified
-* [ARROW-5682](https://issues.apache.org/jira/browse/ARROW-5682) - [Python] from\_pandas conversion casts values to string inconsistently
-* [ARROW-5731](https://issues.apache.org/jira/browse/ARROW-5731) - [CI] Turbodbc integration tests are failing
-* [ARROW-5753](https://issues.apache.org/jira/browse/ARROW-5753) - [Rust] Fix test failure in CI code coverage
-* [ARROW-5772](https://issues.apache.org/jira/browse/ARROW-5772) - [GLib][Plasma][CUDA] Plasma::Client\#refer\_object test fails
-* [ARROW-5775](https://issues.apache.org/jira/browse/ARROW-5775) - [C++] StructArray : cached boxed fields not thread-safe
-* [ARROW-5776](https://issues.apache.org/jira/browse/ARROW-5776) - [Gandiva][Crossbow] Revert template to have commit ids.
-* [ARROW-5790](https://issues.apache.org/jira/browse/ARROW-5790) - [Python] Passing zero-dim numpy array to pa.array causes segfault
-* [ARROW-5817](https://issues.apache.org/jira/browse/ARROW-5817) - [Python] Use pytest marks for Flight test to avoid silently skipping unit tests due to import failures
-* [ARROW-5823](https://issues.apache.org/jira/browse/ARROW-5823) - [Rust] CI scripts miss --all-targets cargo argument
-* [ARROW-5824](https://issues.apache.org/jira/browse/ARROW-5824) - [Gandiva] [C++] Fix decimal null
-* [ARROW-5836](https://issues.apache.org/jira/browse/ARROW-5836) - [Java][OSX] Flight tests are failing: address already in use
-* [ARROW-5838](https://issues.apache.org/jira/browse/ARROW-5838) - [C++][Flight][OSX] Building 3rdparty grpc cannot find OpenSSL
-* [ARROW-5848](https://issues.apache.org/jira/browse/ARROW-5848) - [C++] SO versioning schema after release 1.0.0
-* [ARROW-5849](https://issues.apache.org/jira/browse/ARROW-5849) - [C++] Compiler warnings on mingw-w64
-* [ARROW-5850](https://issues.apache.org/jira/browse/ARROW-5850) - [CI][R] R appveyor job is broken after release
-* [ARROW-5851](https://issues.apache.org/jira/browse/ARROW-5851) - [C++] Compilation of reference benchmarks fails
-* [ARROW-5856](https://issues.apache.org/jira/browse/ARROW-5856) - [Python] linking 3rd party cython modules against pyarrow fails since 0.14.0
-* [ARROW-5860](https://issues.apache.org/jira/browse/ARROW-5860) - [Java] [Vector] Fix decimal byte setter
-* [ARROW-5863](https://issues.apache.org/jira/browse/ARROW-5863) - [Python] Segmentation Fault via pytest-runner
-* [ARROW-5868](https://issues.apache.org/jira/browse/ARROW-5868) - [Python] manylinux2010 wheels have shared library dependency on liblz4
-* [ARROW-5870](https://issues.apache.org/jira/browse/ARROW-5870) - [C++] Development compile instructions need to include "make"
-* [ARROW-5873](https://issues.apache.org/jira/browse/ARROW-5873) - [Python] Segmentation fault when comparing schema with None
-* [ARROW-5874](https://issues.apache.org/jira/browse/ARROW-5874) - [Python] pyarrow 0.14.0 macOS wheels depend on shared libs under /usr/local/opt
-* [ARROW-5878](https://issues.apache.org/jira/browse/ARROW-5878) - [Python][C++] Parquet reader not forward compatible for timestamps without timezone
-* [ARROW-5884](https://issues.apache.org/jira/browse/ARROW-5884) - [Java] Fix the get method of StructVector
-* [ARROW-5886](https://issues.apache.org/jira/browse/ARROW-5886) - [Python][Packaging] Manylinux1/2010 compliance issue with libz
-* [ARROW-5887](https://issues.apache.org/jira/browse/ARROW-5887) - [C\#] ArrowStreamWriter writes FieldNodes in wrong order
-* [ARROW-5889](https://issues.apache.org/jira/browse/ARROW-5889) - [Python][C++] Parquet backwards compat for timestamps without timezone broken
-* [ARROW-5894](https://issues.apache.org/jira/browse/ARROW-5894) - [C++] libgandiva.so.14 is exporting libstdc++ symbols
-* [ARROW-5899](https://issues.apache.org/jira/browse/ARROW-5899) - [Python][Packaging] Bundle uriparser.dll in windows wheels
-* [ARROW-5910](https://issues.apache.org/jira/browse/ARROW-5910) - [Python] read\_tensor() fails on non-seekable streams
-* [ARROW-5921](https://issues.apache.org/jira/browse/ARROW-5921) - [C++][Fuzzing] Missing nullptr checks in IPC
-* [ARROW-5923](https://issues.apache.org/jira/browse/ARROW-5923) - [C++] Fix int96 comment
-* [ARROW-5925](https://issues.apache.org/jira/browse/ARROW-5925) - [Gandiva][C++] cast decimal to int should round up
-* [ARROW-5930](https://issues.apache.org/jira/browse/ARROW-5930) - [FlightRPC] [Python] Flight CI tests are failing
-* [ARROW-5935](https://issues.apache.org/jira/browse/ARROW-5935) - [C++] ArrayBuilders with mutable type are not robustly supported
-* [ARROW-5946](https://issues.apache.org/jira/browse/ARROW-5946) - [Rust] [DataFusion] Projection push down with aggregate producing incorrect results
-* [ARROW-5952](https://issues.apache.org/jira/browse/ARROW-5952) - [Python] Segfault when reading empty table with category as pandas dataframe
-* [ARROW-5959](https://issues.apache.org/jira/browse/ARROW-5959) - [C++][CI] Fuzzit does not know about branch + commit hash
-* [ARROW-5960](https://issues.apache.org/jira/browse/ARROW-5960) - [C++] Boost dependencies are specified in wrong order
-* [ARROW-5963](https://issues.apache.org/jira/browse/ARROW-5963) - [R] R Appveyor job does not test changes in the C++ library
-* [ARROW-5964](https://issues.apache.org/jira/browse/ARROW-5964) - [C++][Gandiva] Cast double to decimal with rounding returns 0
-* [ARROW-5965](https://issues.apache.org/jira/browse/ARROW-5965) - [Python] Regression: segfault when reading hive table with v0.14
-* [ARROW-5966](https://issues.apache.org/jira/browse/ARROW-5966) - [Python] Capacity error when converting large UTF32 numpy array to arrow array
-* [ARROW-5968](https://issues.apache.org/jira/browse/ARROW-5968) - [Java] Remove duplicate Preconditions check in JDBC adapter
-* [ARROW-5969](https://issues.apache.org/jira/browse/ARROW-5969) - [CI] [R] Lint failures
-* [ARROW-5973](https://issues.apache.org/jira/browse/ARROW-5973) - [Java] Variable width vectors' get methods should return null when the underlying data is null
-* [ARROW-5978](https://issues.apache.org/jira/browse/ARROW-5978) - [FlightRPC] [Java] Integration test client doesn't close buffers
-* [ARROW-5989](https://issues.apache.org/jira/browse/ARROW-5989) - [C++][Python] pyarrow.lib.ArrowIOError: Unable to load libjvm when using openjdk-8
-* [ARROW-5990](https://issues.apache.org/jira/browse/ARROW-5990) - [Python] RowGroupMetaData.column misses bounds check
-* [ARROW-5992](https://issues.apache.org/jira/browse/ARROW-5992) - [C++] Array::View fails for string/utf8 as binary
-* [ARROW-5993](https://issues.apache.org/jira/browse/ARROW-5993) - [Python] Reading a dictionary column from Parquet results in disproportionate memory usage
-* [ARROW-5996](https://issues.apache.org/jira/browse/ARROW-5996) - [Java] Avoid resource leak in flight service
-* [ARROW-5999](https://issues.apache.org/jira/browse/ARROW-5999) - [C++] Required header files missing when built with -DARROW\_DATASET=OFF
-* [ARROW-6002](https://issues.apache.org/jira/browse/ARROW-6002) - [C++][Gandiva] TestCastFunctions does not test int64 casting
-* [ARROW-6004](https://issues.apache.org/jira/browse/ARROW-6004) - [C++] CSV reader ignore\_empty\_lines option doesn't handle empty lines
-* [ARROW-6005](https://issues.apache.org/jira/browse/ARROW-6005) - [C++] parquet::arrow::FileReader::GetRecordBatchReader() does not behave as documented since ARROW-1012
-* [ARROW-6006](https://issues.apache.org/jira/browse/ARROW-6006) - [C++] Empty IPC streams containing a dictionary are corrupt
-* [ARROW-6012](https://issues.apache.org/jira/browse/ARROW-6012) - [C++] Fall back on known Apache mirror for Thrift downloads
-* [ARROW-6015](https://issues.apache.org/jira/browse/ARROW-6015) - [Python] pyarrow wheel: \`DLL load failed\` when importing on windows
-* [ARROW-6016](https://issues.apache.org/jira/browse/ARROW-6016) - [Python] pyarrow get\_library\_dirs assertion error
-* [ARROW-6029](https://issues.apache.org/jira/browse/ARROW-6029) - [R] Improve R docs on how to fix library version mismatch
-* [ARROW-6032](https://issues.apache.org/jira/browse/ARROW-6032) - [C++] CountSetBits doesn't ensure 64-bit aligned accesses
-* [ARROW-6038](https://issues.apache.org/jira/browse/ARROW-6038) - [Python] pyarrow.Table.from\_batches produces corrupted table if any of the batches were empty
-* [ARROW-6040](https://issues.apache.org/jira/browse/ARROW-6040) - [Java] Dictionary entries are required in IPC streams even when empty
-* [ARROW-6046](https://issues.apache.org/jira/browse/ARROW-6046) - [C++] Slice RecordBatch of String array with offset 0 returns whole batch
-* [ARROW-6047](https://issues.apache.org/jira/browse/ARROW-6047) - [Rust] Rust nightly 1.38.0 builds failing
-* [ARROW-6050](https://issues.apache.org/jira/browse/ARROW-6050) - [Java] Update out-of-date java/flight/README.md
-* [ARROW-6054](https://issues.apache.org/jira/browse/ARROW-6054) - pyarrow.serialize should respect the value of structured dtype of numpy
-* [ARROW-6058](https://issues.apache.org/jira/browse/ARROW-6058) - [Python][Parquet] Failure when reading Parquet file from S3 with s3fs
-* [ARROW-6059](https://issues.apache.org/jira/browse/ARROW-6059) - [Python] Regression memory issue when calling pandas.read\_parquet
-* [ARROW-6060](https://issues.apache.org/jira/browse/ARROW-6060) - [Python] too large memory cost using pyarrow.parquet.read\_table with use\_threads=True
-* [ARROW-6061](https://issues.apache.org/jira/browse/ARROW-6061) - [C++] Cannot build libarrow without rapidjson
-* [ARROW-6066](https://issues.apache.org/jira/browse/ARROW-6066) - [Website] Fix blog post author header
-* [ARROW-6067](https://issues.apache.org/jira/browse/ARROW-6067) - [Python] Large memory test failures
-* [ARROW-6068](https://issues.apache.org/jira/browse/ARROW-6068) - [Python] Hypothesis test failure; add StructType::Make that accepts a vector of fields
-* [ARROW-6073](https://issues.apache.org/jira/browse/ARROW-6073) - [C++] Decimal128Builder is not reset in Finish()
-* [ARROW-6082](https://issues.apache.org/jira/browse/ARROW-6082) - [Python] create pa.dictionary() type with non-integer indices type crashes
-* [ARROW-6092](https://issues.apache.org/jira/browse/ARROW-6092) - [C++] Python 2.7: arrow\_python\_test failure
-* [ARROW-6095](https://issues.apache.org/jira/browse/ARROW-6095) - [C++] Python subproject ignores ARROW\_TEST\_LINKAGE
-* [ARROW-6108](https://issues.apache.org/jira/browse/ARROW-6108) - [C++] Appveyor Build\_Debug configuration is hanging in C++ unit tests
-* [ARROW-6116](https://issues.apache.org/jira/browse/ARROW-6116) - [C++][Gandiva] Fix bug in TimedTestFilterAdd2
-* [ARROW-6117](https://issues.apache.org/jira/browse/ARROW-6117) - [Java] Fix the set method of FixedSizeBinaryVector
-* [ARROW-6119](https://issues.apache.org/jira/browse/ARROW-6119) - [Python] PyArrow wheel import fails on Windows Python 3.7
-* [ARROW-6120](https://issues.apache.org/jira/browse/ARROW-6120) - [C++][Gandiva] including some headers causes decimal\_test to fail
-* [ARROW-6126](https://issues.apache.org/jira/browse/ARROW-6126) - [C++] IPC stream reader handling of empty streams potentially not robust
-* [ARROW-6132](https://issues.apache.org/jira/browse/ARROW-6132) - [Python] ListArray.from\_arrays does not check validity of input arrays
-* [ARROW-6135](https://issues.apache.org/jira/browse/ARROW-6135) - [C++] KeyValueMetadata::Equals should not be order-sensitive
-* [ARROW-6136](https://issues.apache.org/jira/browse/ARROW-6136) - [FlightRPC][Java] Don't double-close response stream
-* [ARROW-6145](https://issues.apache.org/jira/browse/ARROW-6145) - [Java] UnionVector created by MinorType\#getNewVector could not keep field type info properly
-* [ARROW-6148](https://issues.apache.org/jira/browse/ARROW-6148) - [C++][Packaging] Improve aarch64 support
-* [ARROW-6152](https://issues.apache.org/jira/browse/ARROW-6152) - [C++][Parquet] Write arrow::Array directly into parquet::TypedColumnWriter<T\>
-* [ARROW-6153](https://issues.apache.org/jira/browse/ARROW-6153) - [R] Address parquet deprecation warning
-* [ARROW-6158](https://issues.apache.org/jira/browse/ARROW-6158) - [Python] possible to create StructArray with type that conflicts with child array's types
-* [ARROW-6159](https://issues.apache.org/jira/browse/ARROW-6159) - [C++] PrettyPrint of arrow::Schema missing indentation for first line
-* [ARROW-6160](https://issues.apache.org/jira/browse/ARROW-6160) - [Java] AbstractStructVector\#getPrimitiveVectors fails to work with complex child vectors
-* [ARROW-6166](https://issues.apache.org/jira/browse/ARROW-6166) - [Go] Slice of slice causes index out of range panic
-* [ARROW-6167](https://issues.apache.org/jira/browse/ARROW-6167) - [R] macOS binary R packages on CRAN don't have arrow\_available
-* [ARROW-6168](https://issues.apache.org/jira/browse/ARROW-6168) - [C++] IWYU docker-compose job is broken
-* [ARROW-6170](https://issues.apache.org/jira/browse/ARROW-6170) - [R] "docker-compose build r" is slow
-* [ARROW-6171](https://issues.apache.org/jira/browse/ARROW-6171) - [R] "docker-compose run r" fails
-* [ARROW-6174](https://issues.apache.org/jira/browse/ARROW-6174) - [C++] Validate chunks in ChunkedArray::Validate
-* [ARROW-6175](https://issues.apache.org/jira/browse/ARROW-6175) - [Java] Fix MapVector\#getMinorType and extend AbstractContainerVector addOrGet complex vector API
-* [ARROW-6178](https://issues.apache.org/jira/browse/ARROW-6178) - [Developer] Don't fail in merge script on bad primary author input in multi-author PRs
-* [ARROW-6182](https://issues.apache.org/jira/browse/ARROW-6182) - [R] Add note to README about r-arrow conda installation
-* [ARROW-6186](https://issues.apache.org/jira/browse/ARROW-6186) - [Packaging][C++] Plasma headers not included for ubuntu-xenial libplasma-dev debian package
-* [ARROW-6190](https://issues.apache.org/jira/browse/ARROW-6190) - [C++] Define and declare functions regardless of NDEBUG
-* [ARROW-6193](https://issues.apache.org/jira/browse/ARROW-6193) - [GLib] Add missing require in test
-* [ARROW-6200](https://issues.apache.org/jira/browse/ARROW-6200) - [Java] Method getBufferSizeFor in BaseRepeatedValueVector/ListVector not correct
-* [ARROW-6202](https://issues.apache.org/jira/browse/ARROW-6202) - [Java] Exception in thread "main" org.apache.arrow.memory.OutOfMemoryException: Unable to allocate buffer of size 4 due to memory limit. Current allocation: 2147483646
-* [ARROW-6205](https://issues.apache.org/jira/browse/ARROW-6205) - [C++] ARROW\_DEPRECATED warning when including io/interfaces.h from CUDA (.cu) source
-* [ARROW-6208](https://issues.apache.org/jira/browse/ARROW-6208) - [Java] Correct byte order before comparing in ByteFunctionHelpers
-* [ARROW-6210](https://issues.apache.org/jira/browse/ARROW-6210) - [Java] remove equals API from ValueVector
-* [ARROW-6211](https://issues.apache.org/jira/browse/ARROW-6211) - [Java] Remove dependency on RangeEqualsVisitor from ValueVector interface
-* [ARROW-6214](https://issues.apache.org/jira/browse/ARROW-6214) - [R] Sanitizer errors triggered via R bindings
-* [ARROW-6215](https://issues.apache.org/jira/browse/ARROW-6215) - [Java] RangeEqualVisitor does not properly compare ZeroVector
-* [ARROW-6218](https://issues.apache.org/jira/browse/ARROW-6218) - [Java] Add UINT type test in integration to avoid potential overflow
-* [ARROW-6223](https://issues.apache.org/jira/browse/ARROW-6223) - [C++] Configuration error with Anaconda Python 3.7.4
-* [ARROW-6224](https://issues.apache.org/jira/browse/ARROW-6224) - [Python] remaining usages of the 'data' attribute (from previous Column) cause warnings
-* [ARROW-6227](https://issues.apache.org/jira/browse/ARROW-6227) - [Python] pyarrow.array() shouldn't coerce np.nan to string
-* [ARROW-6234](https://issues.apache.org/jira/browse/ARROW-6234) - [Java] ListVector hashCode() is not correct
-* [ARROW-6241](https://issues.apache.org/jira/browse/ARROW-6241) - [Java] Failures on master
-* [ARROW-6255](https://issues.apache.org/jira/browse/ARROW-6255) - [Rust] [Parquet] Cannot use any published parquet crate due to parquet-format breaking change
-* [ARROW-6259](https://issues.apache.org/jira/browse/ARROW-6259) - [C++][CI] Flatbuffers-related failures in CI on macOS
-* [ARROW-6263](https://issues.apache.org/jira/browse/ARROW-6263) - [Python] RecordBatch.from\_arrays does not check array types against a passed schema
-* [ARROW-6266](https://issues.apache.org/jira/browse/ARROW-6266) - [Java] Resolve the ambiguous method overload in RangeEqualsVisitor
-* [ARROW-6268](https://issues.apache.org/jira/browse/ARROW-6268) - Empty buffer should have a valid address
-* [ARROW-6269](https://issues.apache.org/jira/browse/ARROW-6269) - [C++][Fuzzing] IPC reads do not check decimal precision
-* [ARROW-6270](https://issues.apache.org/jira/browse/ARROW-6270) - [C++][Fuzzing] IPC reads do not check buffer indices
-* [ARROW-6290](https://issues.apache.org/jira/browse/ARROW-6290) - [Rust] [DataFusion] sql\_csv example errors when running
-* [ARROW-6291](https://issues.apache.org/jira/browse/ARROW-6291) - [C++] CMake ignores ARROW\_PARQUET
-* [ARROW-6293](https://issues.apache.org/jira/browse/ARROW-6293) - [Rust] datafusion 0.15.0-SNAPSHOT error
-* [ARROW-6301](https://issues.apache.org/jira/browse/ARROW-6301) - [Python] atexit: pyarrow.lib.ArrowKeyError: 'No type extension with name arrow.py\_extension\_type found'
-* [ARROW-6302](https://issues.apache.org/jira/browse/ARROW-6302) - [Python][Parquet] Reading dictionary type with serialized Arrow schema does not restore "ordered" type property
-* [ARROW-6309](https://issues.apache.org/jira/browse/ARROW-6309) - [C++] Parquet tests and executables are linked statically
-* [ARROW-6323](https://issues.apache.org/jira/browse/ARROW-6323) - [R] Expand file paths when passing to readers
-* [ARROW-6325](https://issues.apache.org/jira/browse/ARROW-6325) - [Python] wrong conversion of DataFrame with boolean values
-* [ARROW-6330](https://issues.apache.org/jira/browse/ARROW-6330) - [C++] Include missing headers in api.h
-* [ARROW-6332](https://issues.apache.org/jira/browse/ARROW-6332) - [Java][C++][Gandiva] Handle size of varchar vectors correctly
-* [ARROW-6339](https://issues.apache.org/jira/browse/ARROW-6339) - [Python][C++] Rowgroup statistics for pd.NaT array ill defined
-* [ARROW-6343](https://issues.apache.org/jira/browse/ARROW-6343) - [Java] [Vector] Fix allocation helper
-* [ARROW-6344](https://issues.apache.org/jira/browse/ARROW-6344) - [C++][Gandiva] substring does not handle multibyte characters
-* [ARROW-6345](https://issues.apache.org/jira/browse/ARROW-6345) - [C++][Python] "ordered" flag seemingly not taken into account when comparing DictionaryType values for equality
-* [ARROW-6348](https://issues.apache.org/jira/browse/ARROW-6348) - [R] arrow::read\_csv\_arrow namespace error when package not loaded
-* [ARROW-6354](https://issues.apache.org/jira/browse/ARROW-6354) - [C++] Building without Parquet fails
-* [ARROW-6363](https://issues.apache.org/jira/browse/ARROW-6363) - [R] segfault in Table\_\_from\_dots with unexpected schema
-* [ARROW-6364](https://issues.apache.org/jira/browse/ARROW-6364) - [R] Handling unexpected input to time64() et al
-* [ARROW-6369](https://issues.apache.org/jira/browse/ARROW-6369) - [Python] Support list-of-boolean in Array.to\_pandas conversion
-* [ARROW-6371](https://issues.apache.org/jira/browse/ARROW-6371) - [Doc] Row to columnar conversion example mentions arrow::Column in comments
-* [ARROW-6372](https://issues.apache.org/jira/browse/ARROW-6372) - [Rust][Datafusion] Casting from Un-signed to Signed Integers not supported
-* [ARROW-6376](https://issues.apache.org/jira/browse/ARROW-6376) - [Developer] PR merge script has "master" target ref hard-coded
-* [ARROW-6387](https://issues.apache.org/jira/browse/ARROW-6387) - [Archery] Errors with make
-* [ARROW-6392](https://issues.apache.org/jira/browse/ARROW-6392) - [Python][Flight] list\_actions Server RPC is not tested in test\_flight.py, nor is return value validated
-* [ARROW-6395](https://issues.apache.org/jira/browse/ARROW-6395) - [Python] Bug when using bool arrays with stride greater than 1
-* [ARROW-6406](https://issues.apache.org/jira/browse/ARROW-6406) - [C++] jemalloc\_ep fails for offline build
-* [ARROW-6411](https://issues.apache.org/jira/browse/ARROW-6411) - [C++][Parquet] DictEncoderImpl<T\>::PutIndicesTyped has bad performance on some systems
-* [ARROW-6412](https://issues.apache.org/jira/browse/ARROW-6412) - [C++] arrow-flight-test can crash because of port allocation
-* [ARROW-6418](https://issues.apache.org/jira/browse/ARROW-6418) - [C++] Plasma cmake targets are not exported
-* [ARROW-6423](https://issues.apache.org/jira/browse/ARROW-6423) - [Python] pyarrow.CompressedOutputStream() never completes with compression='snappy'
-* [ARROW-6424](https://issues.apache.org/jira/browse/ARROW-6424) - [C++][Fuzzing] Fuzzit nightly is broken
-* [ARROW-6425](https://issues.apache.org/jira/browse/ARROW-6425) - [C++] ValidateArray fail for slice of list array
-* [ARROW-6428](https://issues.apache.org/jira/browse/ARROW-6428) - [CI][Crossbow] Nightly turbodbc job fails
-* [ARROW-6430](https://issues.apache.org/jira/browse/ARROW-6430) - [CI][Crossbow] Nightly R docker job fails
-* [ARROW-6431](https://issues.apache.org/jira/browse/ARROW-6431) - [Python] Test suite fails without pandas installed
-* [ARROW-6432](https://issues.apache.org/jira/browse/ARROW-6432) - [CI][Crossbow] Remove alpine crossbow jobs
-* [ARROW-6433](https://issues.apache.org/jira/browse/ARROW-6433) - [CI][Crossbow] Nightly java docker job fails
-* [ARROW-6434](https://issues.apache.org/jira/browse/ARROW-6434) - [CI][Crossbow] Nightly HDFS integration job fails
-* [ARROW-6435](https://issues.apache.org/jira/browse/ARROW-6435) - [CI][Crossbow] Nightly dask integration job fails
-* [ARROW-6440](https://issues.apache.org/jira/browse/ARROW-6440) - [CI][Crossbow] Nightly ubuntu, debian, and centos package builds fail
-* [ARROW-6441](https://issues.apache.org/jira/browse/ARROW-6441) - [CI][Crossbow] Nightly Centos 6 job fails
-* [ARROW-6442](https://issues.apache.org/jira/browse/ARROW-6442) - [CI][Crossbow] Nightly gandiva jar osx build fails
-* [ARROW-6443](https://issues.apache.org/jira/browse/ARROW-6443) - [CI][Crossbow] Nightly conda osx builds fail
-* [ARROW-6444](https://issues.apache.org/jira/browse/ARROW-6444) - [CI][Crossbow] Nightly conda Windows builds fail (time out)
-* [ARROW-6446](https://issues.apache.org/jira/browse/ARROW-6446) - [OSX][Python][Wheel] Turn off ORC feature in the wheel building scripts
-* [ARROW-6449](https://issues.apache.org/jira/browse/ARROW-6449) - [R] io "tell()" methods are inconsistently named and untested
-* [ARROW-6457](https://issues.apache.org/jira/browse/ARROW-6457) - [C++] CMake build locally fails with MSVC 2015 build generator
-* [ARROW-6461](https://issues.apache.org/jira/browse/ARROW-6461) - [Java] EchoServer can close socket before client has finished reading
-* [ARROW-6472](https://issues.apache.org/jira/browse/ARROW-6472) - [Java] ValueVector\#accept may have a potential cast exception
-* [ARROW-6476](https://issues.apache.org/jira/browse/ARROW-6476) - [Java][CI] Travis java all-jdks job is broken
-* [ARROW-6478](https://issues.apache.org/jira/browse/ARROW-6478) - [C++] Roll back to jemalloc stable-4 branch until performance issues in 5.2.x addressed
-* [ARROW-6481](https://issues.apache.org/jira/browse/ARROW-6481) - [Python][C++] Bad performance of read\_csv() with column\_types
-* [ARROW-6488](https://issues.apache.org/jira/browse/ARROW-6488) - [Python] pyarrow.NULL equals itself
-* [ARROW-6492](https://issues.apache.org/jira/browse/ARROW-6492) - [Python] file written with latest fastparquet cannot be read with latest pyarrow
-* [ARROW-6502](https://issues.apache.org/jira/browse/ARROW-6502) - [GLib][CI] MinGW failure in CI
-* [ARROW-6506](https://issues.apache.org/jira/browse/ARROW-6506) - [C++] Validation of ExtensionType with nested type fails
-* [ARROW-6509](https://issues.apache.org/jira/browse/ARROW-6509) - [C++][Gandiva] Re-enable Gandiva JNI tests and fix Travis CI failure
-* [ARROW-6520](https://issues.apache.org/jira/browse/ARROW-6520) - [Python] Segmentation fault on writing tables with fixed size binary fields
-* [ARROW-6522](https://issues.apache.org/jira/browse/ARROW-6522) - [Python] Test suite fails with pandas 0.23.4, pytest 3.8.1
-* [ARROW-6530](https://issues.apache.org/jira/browse/ARROW-6530) - [CI][Crossbow][R] Nightly R job doesn't install all dependencies
-* [ARROW-6550](https://issues.apache.org/jira/browse/ARROW-6550) - [C++] Filter expressions PR failing manylinux package builds
-* [ARROW-6551](https://issues.apache.org/jira/browse/ARROW-6551) - [Python] Dask Parquet integration test failure
-* [ARROW-6552](https://issues.apache.org/jira/browse/ARROW-6552) - [C++] boost::optional in STL test fails to compile with gcc 4.8.2
-* [ARROW-6560](https://issues.apache.org/jira/browse/ARROW-6560) - [Python] Failures in \*-nopandas integration tests
-* [ARROW-6561](https://issues.apache.org/jira/browse/ARROW-6561) - [Python] pandas-master integration test failure
-* [ARROW-6562](https://issues.apache.org/jira/browse/ARROW-6562) - [GLib] Fix wrong sliced data of GArrowBuffer
-* [ARROW-6564](https://issues.apache.org/jira/browse/ARROW-6564) - [Python] Do not require pandas for invoking Array.\_\_array\_\_
-* [ARROW-6565](https://issues.apache.org/jira/browse/ARROW-6565) - [Rust] [DataFusion] Intermittent test failure due to temp dir already existing
-* [ARROW-6568](https://issues.apache.org/jira/browse/ARROW-6568) - [C++][Python][Parquet] pyarrow.parquet crash writing zero-chunk dictionary-type column
-* [ARROW-6572](https://issues.apache.org/jira/browse/ARROW-6572) - [C++] Reading some Parquet data can return uninitialized memory
-* [ARROW-6573](https://issues.apache.org/jira/browse/ARROW-6573) - [Python] Segfault when writing to parquet
-* [ARROW-6576](https://issues.apache.org/jira/browse/ARROW-6576) - [R] Fix sparklyr integration tests
-* [ARROW-6586](https://issues.apache.org/jira/browse/ARROW-6586) - [Python][Packaging] Windows wheel builds failing with "DLL load failure"
-* [ARROW-6597](https://issues.apache.org/jira/browse/ARROW-6597) - [Python] Segfault in test\_pandas with Python 2.7
-* [ARROW-6618](https://issues.apache.org/jira/browse/ARROW-6618) - [Python] Reading a zero-size buffer can segfault
-* [ARROW-6620](https://issues.apache.org/jira/browse/ARROW-6620) - [Python][CI] pandas-master build failing due to removal of "to\_sparse" method
-* [ARROW-6622](https://issues.apache.org/jira/browse/ARROW-6622) - [C++][R] SubTreeFileSystem path error on Windows
-* [ARROW-6623](https://issues.apache.org/jira/browse/ARROW-6623) - [CI][Python] Dask docker integration test broken perhaps by statistics-related change
-* [ARROW-6639](https://issues.apache.org/jira/browse/ARROW-6639) - [Packaging][RPM] Add support for CentOS 7 on aarch64
-* [ARROW-6640](https://issues.apache.org/jira/browse/ARROW-6640) - [C++] Error when BufferedInputStream Peek more than bytes buffered
-* [ARROW-6641](https://issues.apache.org/jira/browse/ARROW-6641) - [C++] Remove Deprecated WriteableFile warning
-* [ARROW-6642](https://issues.apache.org/jira/browse/ARROW-6642) - [Python] chained access of ParquetDataset's metadata segfaults
-* [ARROW-6651](https://issues.apache.org/jira/browse/ARROW-6651) - [R] Fix R conda job
-* [ARROW-6652](https://issues.apache.org/jira/browse/ARROW-6652) - [Python] to\_pandas conversion removes timezone from type
-* [ARROW-6660](https://issues.apache.org/jira/browse/ARROW-6660) - [Rust] [DataFusion] Minor docs update for 0.15.0 release
-* [ARROW-6670](https://issues.apache.org/jira/browse/ARROW-6670) - [CI][R] Fix fix for R nightly jobs
-* [ARROW-6674](https://issues.apache.org/jira/browse/ARROW-6674) - [Python] Fix or ignore the test warnings
-* [ARROW-6677](https://issues.apache.org/jira/browse/ARROW-6677) - [FlightRPC][C++] Document using Flight in C++
-* [ARROW-6678](https://issues.apache.org/jira/browse/ARROW-6678) - [C++] Regression in Parquet file compatibility introduced by ARROW-3246
-* [ARROW-6679](https://issues.apache.org/jira/browse/ARROW-6679) - [RELEASE] autobrew license in LICENSE.txt is not acceptable
-* [ARROW-6682](https://issues.apache.org/jira/browse/ARROW-6682) - [C\#] Arrow R/C++ hangs reading binary file generated by C\#
-* [ARROW-6687](https://issues.apache.org/jira/browse/ARROW-6687) - [Rust] [DataFusion] Query returns incorrect row count
-* [ARROW-6701](https://issues.apache.org/jira/browse/ARROW-6701) - [C++][R] Lint failing on R cpp code
-* [ARROW-6703](https://issues.apache.org/jira/browse/ARROW-6703) - [Packaging][Linux] Restore ARROW\_VERSION environment variable
-* [ARROW-6705](https://issues.apache.org/jira/browse/ARROW-6705) - [Rust] [DataFusion] README has invalid github URL
-* [ARROW-6709](https://issues.apache.org/jira/browse/ARROW-6709) - [Java] JDBC adapter currentIndex should increment when value is null
-* [ARROW-6714](https://issues.apache.org/jira/browse/ARROW-6714) - [R] Fix untested RecordBatchWriter case
-* [ARROW-6716](https://issues.apache.org/jira/browse/ARROW-6716) - [CI] [Rust] New 1.40.0 nightly causing builds to fail
-* [ARROW-6748](https://issues.apache.org/jira/browse/ARROW-6748) - [Ruby] gem compilation error
-* [ARROW-6751](https://issues.apache.org/jira/browse/ARROW-6751) - [CI] ccache doesn't cache on Travis-CI
-* [ARROW-6760](https://issues.apache.org/jira/browse/ARROW-6760) - [C++] JSON: improve error message when column changed type
-* [ARROW-6773](https://issues.apache.org/jira/browse/ARROW-6773) - [C++] Filter kernel returns invalid data when filtering with an Array slice
-* [ARROW-6796](https://issues.apache.org/jira/browse/ARROW-6796) - Certain moderately-sized (\~100MB) default-Snappy-compressed Parquet files take enormous memory and a long time to load via pyarrow.parquet.read\_table
-* [ARROW-7112](https://issues.apache.org/jira/browse/ARROW-7112) - Wrong contents when initializing a pyarrow.Table from a boolean DataFrame
-* [PARQUET-1623](https://issues.apache.org/jira/browse/PARQUET-1623) - [C++] Invalid memory access with a magic number of records
-* [PARQUET-1631](https://issues.apache.org/jira/browse/PARQUET-1631) - [C++] ParquetInputWrapper::GetSize always returns 0
-* [PARQUET-1640](https://issues.apache.org/jira/browse/PARQUET-1640) - [C++] parquet-encoding-benchmark crashes
-
-
-
-# Apache Arrow 0.14.1 (2019-07-22)
-
-## Bug Fixes
-
-* [ARROW-5775](https://issues.apache.org/jira/browse/ARROW-5775) - [C++] StructArray : cached boxed fields not thread-safe
-* [ARROW-5790](https://issues.apache.org/jira/browse/ARROW-5790) - [Python] Passing zero-dim numpy array to pa.array causes segfault
-* [ARROW-5791](https://issues.apache.org/jira/browse/ARROW-5791) - [Python] pyarrow.csv.read\_csv hangs + eats all RAM
-* [ARROW-5816](https://issues.apache.org/jira/browse/ARROW-5816) - [Release] Parallel curl does not work reliably in verify-release-candidate.sh
-* [ARROW-5836](https://issues.apache.org/jira/browse/ARROW-5836) - [Java][OSX] Flight tests are failing: address already in use
-* [ARROW-5838](https://issues.apache.org/jira/browse/ARROW-5838) - [C++][Flight][OSX] Building 3rdparty grpc cannot find OpenSSL
-* [ARROW-5849](https://issues.apache.org/jira/browse/ARROW-5849) - [C++] Compiler warnings on mingw-w64
-* [ARROW-5850](https://issues.apache.org/jira/browse/ARROW-5850) - [CI][R] R appveyor job is broken after release
-* [ARROW-5851](https://issues.apache.org/jira/browse/ARROW-5851) - [C++] Compilation of reference benchmarks fails
-* [ARROW-5856](https://issues.apache.org/jira/browse/ARROW-5856) - [Python] linking 3rd party cython modules against pyarrow fails since 0.14.0
-* [ARROW-5863](https://issues.apache.org/jira/browse/ARROW-5863) - [Python] Segmentation Fault via pytest-runner
-* [ARROW-5868](https://issues.apache.org/jira/browse/ARROW-5868) - [Python] manylinux2010 wheels have shared library dependency on liblz4
-* [ARROW-5873](https://issues.apache.org/jira/browse/ARROW-5873) - [Python] Segmentation fault when comparing schema with None
-* [ARROW-5874](https://issues.apache.org/jira/browse/ARROW-5874) - [Python] pyarrow 0.14.0 macOS wheels depend on shared libs under /usr/local/opt
-* [ARROW-5878](https://issues.apache.org/jira/browse/ARROW-5878) - [Python][C++] Parquet reader not forward compatible for timestamps without timezone
-* [ARROW-5886](https://issues.apache.org/jira/browse/ARROW-5886) - [Python][Packaging] Manylinux1/2010 compliance issue with libz
-* [ARROW-5887](https://issues.apache.org/jira/browse/ARROW-5887) - [C\#] ArrowStreamWriter writes FieldNodes in wrong order
-* [ARROW-5889](https://issues.apache.org/jira/browse/ARROW-5889) - [Python][C++] Parquet backwards compat for timestamps without timezone broken
-* [ARROW-5899](https://issues.apache.org/jira/browse/ARROW-5899) - [Python][Packaging] Bundle uriparser.dll in windows wheels
-* [ARROW-5921](https://issues.apache.org/jira/browse/ARROW-5921) - [C++][Fuzzing] Missing nullptr checks in IPC
-* [PARQUET-1623](https://issues.apache.org/jira/browse/PARQUET-1623) - [C++] Invalid memory access with a magic number of records
-
-
-## New Features and Improvements
-
-* [ARROW-5101](https://issues.apache.org/jira/browse/ARROW-5101) - [Packaging] Avoid bundling static libraries in Windows conda packages
-* [ARROW-5380](https://issues.apache.org/jira/browse/ARROW-5380) - [C++] Fix and enable UBSan for unaligned accesses
-* [ARROW-5564](https://issues.apache.org/jira/browse/ARROW-5564) - [C++] Add uriparser to conda-forge
-* [ARROW-5609](https://issues.apache.org/jira/browse/ARROW-5609) - [C++] Set CMP0068 CMake policy to avoid macOS warnings
-* [ARROW-5784](https://issues.apache.org/jira/browse/ARROW-5784) - [Release][GLib] Replace c\_glib/ after running c\_glib/autogen.sh in dev/release/02-source.sh
-* [ARROW-5785](https://issues.apache.org/jira/browse/ARROW-5785) - [Rust] Rust datafusion implementation should not depend on rustyline
-* [ARROW-5787](https://issues.apache.org/jira/browse/ARROW-5787) - [Release][Rust] Use local modules to verify RC
-* [ARROW-5793](https://issues.apache.org/jira/browse/ARROW-5793) - [Release] Avoid duplicate known host SSH error in dev/release/03-binary.sh
-* [ARROW-5794](https://issues.apache.org/jira/browse/ARROW-5794) - [Release] Skip uploading already uploaded binaries
-* [ARROW-5795](https://issues.apache.org/jira/browse/ARROW-5795) - [Release] Add missing waits on uploading binaries
-* [ARROW-5796](https://issues.apache.org/jira/browse/ARROW-5796) - [Release][APT] Update expected package list
-* [ARROW-5797](https://issues.apache.org/jira/browse/ARROW-5797) - [Release][APT] Update supported distributions
-* [ARROW-5820](https://issues.apache.org/jira/browse/ARROW-5820) - [Release] Remove undefined variable check from verify script
-* [ARROW-5827](https://issues.apache.org/jira/browse/ARROW-5827) - [C++] Require c-ares CMake config
-* [ARROW-5828](https://issues.apache.org/jira/browse/ARROW-5828) - [C++] Add Protocol Buffers version check
-* [ARROW-5866](https://issues.apache.org/jira/browse/ARROW-5866) - [C++] Remove duplicate library in cpp/Brewfile
-* [ARROW-5877](https://issues.apache.org/jira/browse/ARROW-5877) - [FlightRPC] Fix auth incompatibilities between Python/Java
-* [ARROW-5904](https://issues.apache.org/jira/browse/ARROW-5904) - [Java] [Plasma] Fix compilation of Plasma Java client
-* [ARROW-5908](https://issues.apache.org/jira/browse/ARROW-5908) - [C\#] ArrowStreamWriter doesn't align buffers to 8 bytes
-* [ARROW-5934](https://issues.apache.org/jira/browse/ARROW-5934) - [Python] Bundle arrow's LICENSE with the wheels
-* [ARROW-5937](https://issues.apache.org/jira/browse/ARROW-5937) - [Release] Stop parallel binary upload
-* [ARROW-5938](https://issues.apache.org/jira/browse/ARROW-5938) - [Release] Create branch for adding release note automatically
-* [ARROW-5939](https://issues.apache.org/jira/browse/ARROW-5939) - [Release] Add support for generating vote email template separately
-* [ARROW-5940](https://issues.apache.org/jira/browse/ARROW-5940) - [Release] Add support for re-uploading sign/checksum for binary artifacts
-* [ARROW-5941](https://issues.apache.org/jira/browse/ARROW-5941) - [Release] Avoid re-uploading already uploaded binary artifacts
-* [ARROW-5958](https://issues.apache.org/jira/browse/ARROW-5958) - [Python] Link zlib statically in the wheels
-
-
-
-# Apache Arrow 0.14.0 (2019-07-04)
-
-## New Features and Improvements
-
-* [ARROW-258](https://issues.apache.org/jira/browse/ARROW-258) - [Format] clarify definition of Buffer in context of RPC, IPC, File
-* [ARROW-653](https://issues.apache.org/jira/browse/ARROW-653) - [Python / C++] Add debugging function to print an array's buffer contents in hexadecimal
-* [ARROW-767](https://issues.apache.org/jira/browse/ARROW-767) - [C++] Adopt FileSystem abstraction
-* [ARROW-835](https://issues.apache.org/jira/browse/ARROW-835) - [Format] Add Timedelta type to describe time intervals
-* [ARROW-840](https://issues.apache.org/jira/browse/ARROW-840) - [Python] Provide Python API for creating user-defined data types that can survive Arrow IPC
-* [ARROW-973](https://issues.apache.org/jira/browse/ARROW-973) - [Website] Add FAQ page about project
-* [ARROW-1012](https://issues.apache.org/jira/browse/ARROW-1012) - [C++] Create a configurable implementation of RecordBatchReader that reads from Apache Parquet files
-* [ARROW-1207](https://issues.apache.org/jira/browse/ARROW-1207) - [C++] Implement Map logical type
-* [ARROW-1261](https://issues.apache.org/jira/browse/ARROW-1261) - [Java] Add container type for Map logical type
-* [ARROW-1278](https://issues.apache.org/jira/browse/ARROW-1278) - Integration tests for Fixed Size List type
-* [ARROW-1279](https://issues.apache.org/jira/browse/ARROW-1279) - [Integration][Java] Integration tests for Map type
-* [ARROW-1280](https://issues.apache.org/jira/browse/ARROW-1280) - [C++] Implement Fixed Size List type
-* [ARROW-1349](https://issues.apache.org/jira/browse/ARROW-1349) - [Packaging] Provide APT and Yum repositories
-* [ARROW-1496](https://issues.apache.org/jira/browse/ARROW-1496) - [JS] Upload coverage data to codecov.io
-* [ARROW-1558](https://issues.apache.org/jira/browse/ARROW-1558) - [C++] Implement boolean selection kernels
-* [ARROW-1587](https://issues.apache.org/jira/browse/ARROW-1587) - [Format] Add metadata for user-defined logical types
-* [ARROW-1774](https://issues.apache.org/jira/browse/ARROW-1774) - [C++] Add "view" function to create zero-copy views for compatible types, if supported
-* [ARROW-1833](https://issues.apache.org/jira/browse/ARROW-1833) - [Java] Add accessor methods for data buffers that skip null checking
-* [ARROW-1957](https://issues.apache.org/jira/browse/ARROW-1957) - [Python] Write nanosecond timestamps using new NANO LogicalType Parquet unit
-* [ARROW-1983](https://issues.apache.org/jira/browse/ARROW-1983) - [Python] Add ability to write parquet \`\_metadata\` file
-* [ARROW-2057](https://issues.apache.org/jira/browse/ARROW-2057) - [Python] Configure size of data pages in pyarrow.parquet.write\_table
-* [ARROW-2102](https://issues.apache.org/jira/browse/ARROW-2102) - [C++] Implement take kernel functions - primitive value type
-* [ARROW-2103](https://issues.apache.org/jira/browse/ARROW-2103) - [C++] Implement take kernel functions - string/binary value type
-* [ARROW-2104](https://issues.apache.org/jira/browse/ARROW-2104) - [C++] Implement take kernel functions - nested array value type
-* [ARROW-2105](https://issues.apache.org/jira/browse/ARROW-2105) - [C++] Implement take kernel functions - properly handle special indices
-* [ARROW-2186](https://issues.apache.org/jira/browse/ARROW-2186) - [C++] Clean up architecture specific compiler flags
-* [ARROW-2217](https://issues.apache.org/jira/browse/ARROW-2217) - [C++] Add option to use dynamic linking for compression library dependencies
-* [ARROW-2298](https://issues.apache.org/jira/browse/ARROW-2298) - [Python] Add option to not consider NaN to be null when converting to an integer Arrow type
-* [ARROW-2412](https://issues.apache.org/jira/browse/ARROW-2412) - [Integration] Add nested dictionary integration test
-* [ARROW-2467](https://issues.apache.org/jira/browse/ARROW-2467) - [Rust] Generate code using Flatbuffers
-* [ARROW-2517](https://issues.apache.org/jira/browse/ARROW-2517) - [Java] Add list<decimal\> writer
-* [ARROW-2618](https://issues.apache.org/jira/browse/ARROW-2618) - [Rust] Bitmap constructor should accept a flag for the default state (0 or 1)
-* [ARROW-2667](https://issues.apache.org/jira/browse/ARROW-2667) - [C++/Python] Add pandas-like take method to Array
-* [ARROW-2707](https://issues.apache.org/jira/browse/ARROW-2707) - [C++] Implement Table::Slice methods using Column::Slice
-* [ARROW-2709](https://issues.apache.org/jira/browse/ARROW-2709) - [Python] write\_to\_dataset poor performance when splitting
-* [ARROW-2730](https://issues.apache.org/jira/browse/ARROW-2730) - [C++] Set up CMAKE\_C\_FLAGS more thoughtfully instead of using CMAKE\_CXX\_FLAGS
-* [ARROW-2796](https://issues.apache.org/jira/browse/ARROW-2796) - [C++] Simplify symbols.map file, use when building libarrow\_python
-* [ARROW-2818](https://issues.apache.org/jira/browse/ARROW-2818) - [Python] Better error message when passing SparseDataFrame into Table.from\_pandas
-* [ARROW-2835](https://issues.apache.org/jira/browse/ARROW-2835) - [C++] ReadAt/WriteAt are inconsistent with moving the files position
-* [ARROW-2969](https://issues.apache.org/jira/browse/ARROW-2969) - [R] Convert between StructArray and "nested" data.frame column containing data frame in each cell
-* [ARROW-2981](https://issues.apache.org/jira/browse/ARROW-2981) - [C++] Support scripts / documentation for running clang-tidy on codebase
-* [ARROW-2984](https://issues.apache.org/jira/browse/ARROW-2984) - [JS] Refactor release verification script to share code with main source release verification script
-* [ARROW-3040](https://issues.apache.org/jira/browse/ARROW-3040) - [Go] add support for comparing Arrays
-* [ARROW-3041](https://issues.apache.org/jira/browse/ARROW-3041) - [Go] add support for TimeArray
-* [ARROW-3052](https://issues.apache.org/jira/browse/ARROW-3052) - [C++] Detect ORC system packages
-* [ARROW-3087](https://issues.apache.org/jira/browse/ARROW-3087) - [C++] Add kernels for comparison operations to scalars
-* [ARROW-3144](https://issues.apache.org/jira/browse/ARROW-3144) - [C++] Move "dictionary" member from DictionaryType to ArrayData to allow for changing dictionaries between Array chunks
-* [ARROW-3150](https://issues.apache.org/jira/browse/ARROW-3150) - [Python] Ship Flight-enabled Python wheels on Linux and Windows
-* [ARROW-3166](https://issues.apache.org/jira/browse/ARROW-3166) - [C++] Consolidate IO interfaces used in arrow/io and parquet-cpp
-* [ARROW-3191](https://issues.apache.org/jira/browse/ARROW-3191) - [Java] Add support for ArrowBuf to point to arbitrary memory
-* [ARROW-3200](https://issues.apache.org/jira/browse/ARROW-3200) - [C++] Add support for reading Flight streams with dictionaries
-* [ARROW-3290](https://issues.apache.org/jira/browse/ARROW-3290) - [C++] Toolchain support for secure gRPC
-* [ARROW-3294](https://issues.apache.org/jira/browse/ARROW-3294) - [C++] Test Flight RPC on Windows / Appveyor
-* [ARROW-3314](https://issues.apache.org/jira/browse/ARROW-3314) - [R] Set -rpath using pkg-config when building
-* [ARROW-3330](https://issues.apache.org/jira/browse/ARROW-3330) - [C++] Spawn multiple Flight performance servers in flight-benchmark to test parallel get performance
-* [ARROW-3419](https://issues.apache.org/jira/browse/ARROW-3419) - [C++] Run include-what-you-use checks as nightly build
-* [ARROW-3459](https://issues.apache.org/jira/browse/ARROW-3459) - [C++][Gandiva] Add support for variable length output vectors
-* [ARROW-3475](https://issues.apache.org/jira/browse/ARROW-3475) - [C++] Int64Builder.Finish(NumericArray<Int64Type\>)
-* [ARROW-3570](https://issues.apache.org/jira/browse/ARROW-3570) - [Packaging] Don't bundle test data files with python wheels
-* [ARROW-3572](https://issues.apache.org/jira/browse/ARROW-3572) - [Packaging] Correctly handle ssh origin urls for crossbow
-* [ARROW-3671](https://issues.apache.org/jira/browse/ARROW-3671) - [Go] implement Interval array
-* [ARROW-3676](https://issues.apache.org/jira/browse/ARROW-3676) - [Go] implement Decimal128 array
-* [ARROW-3679](https://issues.apache.org/jira/browse/ARROW-3679) - [Go] implement IPC protocol
-* [ARROW-3680](https://issues.apache.org/jira/browse/ARROW-3680) - [Go] implement Float16 array
-* [ARROW-3686](https://issues.apache.org/jira/browse/ARROW-3686) - [Python] Support for masked arrays in to/from numpy
-* [ARROW-3702](https://issues.apache.org/jira/browse/ARROW-3702) - [R] POSIXct mapped to DateType not TimestampType?
-* [ARROW-3714](https://issues.apache.org/jira/browse/ARROW-3714) - [CI] Run RAT checks in pre-commit hooks
-* [ARROW-3729](https://issues.apache.org/jira/browse/ARROW-3729) - [C++] Support for writing TIMESTAMP\_NANOS Parquet metadata
-* [ARROW-3732](https://issues.apache.org/jira/browse/ARROW-3732) - [R] Add functions to write RecordBatch or Schema to Message value, then read back
-* [ARROW-3758](https://issues.apache.org/jira/browse/ARROW-3758) - [R] Build R library on Windows, document build instructions for Windows developers
-* [ARROW-3759](https://issues.apache.org/jira/browse/ARROW-3759) - [R][CI] Build and test on Windows in Appveyor
-* [ARROW-3767](https://issues.apache.org/jira/browse/ARROW-3767) - [C++] Add cast for Null to any type
-* [ARROW-3780](https://issues.apache.org/jira/browse/ARROW-3780) - [R] Failed to fetch data: invalid data when collecting int16
-* [ARROW-3791](https://issues.apache.org/jira/browse/ARROW-3791) - [C++] Add type inference for boolean values in CSV files
-* [ARROW-3794](https://issues.apache.org/jira/browse/ARROW-3794) - [R] Consider mapping INT8 to integer() not raw()
-* [ARROW-3804](https://issues.apache.org/jira/browse/ARROW-3804) - [R] Consider lowering required R runtime
-* [ARROW-3810](https://issues.apache.org/jira/browse/ARROW-3810) - [R] type= argument for Array and ChunkedArray
-* [ARROW-3811](https://issues.apache.org/jira/browse/ARROW-3811) - [R] struct arrays inference
-* [ARROW-3814](https://issues.apache.org/jira/browse/ARROW-3814) - [R] RecordBatch$from\_arrays()
-* [ARROW-3815](https://issues.apache.org/jira/browse/ARROW-3815) - [R] refine record batch factory
-* [ARROW-3848](https://issues.apache.org/jira/browse/ARROW-3848) - [R] allow nbytes to be missing in RandomAccessFile$Read()
-* [ARROW-3897](https://issues.apache.org/jira/browse/ARROW-3897) - [MATLAB] Add MATLAB support for writing numeric datatypes to a Feather file
-* [ARROW-3904](https://issues.apache.org/jira/browse/ARROW-3904) - [C++/Python] Validate scale and precision of decimal128 type
-* [ARROW-4013](https://issues.apache.org/jira/browse/ARROW-4013) - [Documentation][C++] Document how to build Apache Arrow on MSYS2
-* [ARROW-4020](https://issues.apache.org/jira/browse/ARROW-4020) - [Release] Remove source artifacts from dev dist system after release vote passes
-* [ARROW-4047](https://issues.apache.org/jira/browse/ARROW-4047) - [Python] Document use of int96 timestamps and options in Parquet docs
-* [ARROW-4086](https://issues.apache.org/jira/browse/ARROW-4086) - [Java] Add apis to debug alloc failures
-* [ARROW-4121](https://issues.apache.org/jira/browse/ARROW-4121) - [C++] Refactor memory allocation from InvertKernel
-* [ARROW-4159](https://issues.apache.org/jira/browse/ARROW-4159) - [C++] Check for -Wdocumentation issues
-* [ARROW-4194](https://issues.apache.org/jira/browse/ARROW-4194) - [Format] Metadata.rst does not specify timezone for Timestamp type
-* [ARROW-4302](https://issues.apache.org/jira/browse/ARROW-4302) - [C++] Add OpenSSL to C++ build toolchain
-* [ARROW-4337](https://issues.apache.org/jira/browse/ARROW-4337) - [C\#] Array / RecordBatch Builder Fluent API
-* [ARROW-4343](https://issues.apache.org/jira/browse/ARROW-4343) - [C++] Add as complete as possible Ubuntu Trusty / 14.04 build to docker-compose setup
-* [ARROW-4356](https://issues.apache.org/jira/browse/ARROW-4356) - [CI] Add integration (docker) test for turbodbc
-* [ARROW-4369](https://issues.apache.org/jira/browse/ARROW-4369) - [Packaging] Release verification script should test linux packages via docker
-* [ARROW-4452](https://issues.apache.org/jira/browse/ARROW-4452) - [Python] Serializing sparse torch tensors
-* [ARROW-4453](https://issues.apache.org/jira/browse/ARROW-4453) - [Python] Create Cython wrappers for SparseTensor
-* [ARROW-4467](https://issues.apache.org/jira/browse/ARROW-4467) - [Rust] [DataFusion] Create a REPL & Dockerfile for DataFusion
-* [ARROW-4503](https://issues.apache.org/jira/browse/ARROW-4503) - [C\#] ArrowStreamReader allocates and copies data excessively
-* [ARROW-4504](https://issues.apache.org/jira/browse/ARROW-4504) - [C++] Reduce the number of unit test executables
-* [ARROW-4505](https://issues.apache.org/jira/browse/ARROW-4505) - [C++] Nicer PrettyPrint for date32
-* [ARROW-4566](https://issues.apache.org/jira/browse/ARROW-4566) - [C++][Flight] Add option to run arrow-flight-benchmark against a perf server running on a different host
-* [ARROW-4596](https://issues.apache.org/jira/browse/ARROW-4596) - [Rust] [DataFusion] Implement COUNT aggregate function
-* [ARROW-4622](https://issues.apache.org/jira/browse/ARROW-4622) - [C++] [Python] MakeDense and MakeSparse in UnionArray should accept a vector of Field
-* [ARROW-4625](https://issues.apache.org/jira/browse/ARROW-4625) - [Flight] Wrap server busy-wait methods
-* [ARROW-4626](https://issues.apache.org/jira/browse/ARROW-4626) - [Flight] Add application metadata field to DoGet
-* [ARROW-4627](https://issues.apache.org/jira/browse/ARROW-4627) - [Flight] Add application metadata field to DoPut
-* [ARROW-4701](https://issues.apache.org/jira/browse/ARROW-4701) - [C++] Add JSON chunker benchmarks
-* [ARROW-4702](https://issues.apache.org/jira/browse/ARROW-4702) - [C++] Upgrade dependency versions
-* [ARROW-4708](https://issues.apache.org/jira/browse/ARROW-4708) - [C++] Add multithreaded JSON reader
-* [ARROW-4714](https://issues.apache.org/jira/browse/ARROW-4714) - [C++][Java] Providing JNI interface to Read ORC file via Arrow C++
-* [ARROW-4717](https://issues.apache.org/jira/browse/ARROW-4717) - [C\#] Consider exposing ValueTask instead of Task
-* [ARROW-4719](https://issues.apache.org/jira/browse/ARROW-4719) - [C\#] Implement ChunkedArray, Column and Table in C\#
-* [ARROW-4741](https://issues.apache.org/jira/browse/ARROW-4741) - [Java] Add documentation to all classes and enable checkstyle for class javadocs
-* [ARROW-4787](https://issues.apache.org/jira/browse/ARROW-4787) - [C++] Include "null" values (perhaps with an option to toggle on/off) in hash kernel actions
-* [ARROW-4788](https://issues.apache.org/jira/browse/ARROW-4788) - [C++] Develop less verbose API for constructing StructArray
-* [ARROW-4800](https://issues.apache.org/jira/browse/ARROW-4800) - [C++] Create/port a StatusOr implementation to be able to return a status or a type
-* [ARROW-4805](https://issues.apache.org/jira/browse/ARROW-4805) - [Rust] Write temporal arrays to CSV
-* [ARROW-4806](https://issues.apache.org/jira/browse/ARROW-4806) - [Rust] Support casting temporal arrays in cast kernels
-* [ARROW-4824](https://issues.apache.org/jira/browse/ARROW-4824) - [Python] read\_csv should accept io.StringIO objects
-* [ARROW-4827](https://issues.apache.org/jira/browse/ARROW-4827) - [C++] Implement benchmark comparison between two git revisions
-* [ARROW-4847](https://issues.apache.org/jira/browse/ARROW-4847) - [Python] Add pyarrow.table factory function that dispatches to various ctors based on type of input
-* [ARROW-4904](https://issues.apache.org/jira/browse/ARROW-4904) - [C++] Move implementations in arrow/ipc/test-common.h into libarrow\_testing
-* [ARROW-4911](https://issues.apache.org/jira/browse/ARROW-4911) - [R] Support for building package for Windows
-* [ARROW-4912](https://issues.apache.org/jira/browse/ARROW-4912) - [C++, Python] Allow specifying column names to CSV reader
-* [ARROW-4913](https://issues.apache.org/jira/browse/ARROW-4913) - [Java][Memory] Limit number of ledgers and arrowbufs
-* [ARROW-4945](https://issues.apache.org/jira/browse/ARROW-4945) - [Flight] Enable Flight integration tests in Travis
-* [ARROW-4956](https://issues.apache.org/jira/browse/ARROW-4956) - [C\#] Allow ArrowBuffers to wrap external Memory in C\#
-* [ARROW-4959](https://issues.apache.org/jira/browse/ARROW-4959) - [Gandiva][Crossbow] Builds broken
-* [ARROW-4968](https://issues.apache.org/jira/browse/ARROW-4968) - [Rust] StructArray builder and From<\> methods should check that field types match schema
-* [ARROW-4971](https://issues.apache.org/jira/browse/ARROW-4971) - [Go] DataType equality
-* [ARROW-4972](https://issues.apache.org/jira/browse/ARROW-4972) - [Go] Array equality
-* [ARROW-4973](https://issues.apache.org/jira/browse/ARROW-4973) - [Go] Slice Array equality
-* [ARROW-4974](https://issues.apache.org/jira/browse/ARROW-4974) - [Go] Array approx equality
-* [ARROW-4990](https://issues.apache.org/jira/browse/ARROW-4990) - [C++] Kernel to compare array with array
-* [ARROW-4993](https://issues.apache.org/jira/browse/ARROW-4993) - [C++] Display summary at the end of CMake configuration
-* [ARROW-5000](https://issues.apache.org/jira/browse/ARROW-5000) - [Python] Fix deprecation warning from setup.py
-* [ARROW-5007](https://issues.apache.org/jira/browse/ARROW-5007) - [C++] Move DCHECK out of sse-utils
-* [ARROW-5020](https://issues.apache.org/jira/browse/ARROW-5020) - [C++][Gandiva] Split Gandiva-related conda packages for builds into separate .yml conda env file
-* [ARROW-5027](https://issues.apache.org/jira/browse/ARROW-5027) - [Python] Add JSON Reader
-* [ARROW-5037](https://issues.apache.org/jira/browse/ARROW-5037) - [Rust] [DataFusion] Refactor aggregate module
-* [ARROW-5038](https://issues.apache.org/jira/browse/ARROW-5038) - [Rust] [DataFusion] Implement AVG aggregate function
-* [ARROW-5039](https://issues.apache.org/jira/browse/ARROW-5039) - [Rust] [DataFusion] Fix bugs in CAST support
-* [ARROW-5040](https://issues.apache.org/jira/browse/ARROW-5040) - [C++] ArrayFromJSON can't parse Timestamp from strings
-* [ARROW-5045](https://issues.apache.org/jira/browse/ARROW-5045) - [Rust] Code coverage silently failing in CI
-* [ARROW-5053](https://issues.apache.org/jira/browse/ARROW-5053) - [Rust] [DataFusion] Use env var for location of arrow test data
-* [ARROW-5054](https://issues.apache.org/jira/browse/ARROW-5054) - [C++][Release] Test Flight in verify-release-candidate.sh
-* [ARROW-5056](https://issues.apache.org/jira/browse/ARROW-5056) - [Packaging] Adjust conda recipes to use ORC conda-forge package on unix systems
-* [ARROW-5061](https://issues.apache.org/jira/browse/ARROW-5061) - [Release] Improve 03-binary performance
-* [ARROW-5062](https://issues.apache.org/jira/browse/ARROW-5062) - [Java] Shade Java Guava dependency for Flight
-* [ARROW-5063](https://issues.apache.org/jira/browse/ARROW-5063) - [Java] FlightClient should not create a child allocator
-* [ARROW-5064](https://issues.apache.org/jira/browse/ARROW-5064) - [Release] Pass PKG\_CONFIG\_PATH to glib in the verification script
-* [ARROW-5066](https://issues.apache.org/jira/browse/ARROW-5066) - [Integration] Add flags to enable/disable implementations in integration/integration\_test.py
-* [ARROW-5071](https://issues.apache.org/jira/browse/ARROW-5071) - [Benchmarking] Perform a benchmark run with archery
-* [ARROW-5076](https://issues.apache.org/jira/browse/ARROW-5076) - [Packaging] Improve post binary upload performance
-* [ARROW-5077](https://issues.apache.org/jira/browse/ARROW-5077) - [Rust] Release process should change Cargo.toml to use release versions
-* [ARROW-5078](https://issues.apache.org/jira/browse/ARROW-5078) - [Documentation] Sphinx build fails with RemovedInSphinx30Warning
-* [ARROW-5079](https://issues.apache.org/jira/browse/ARROW-5079) - [Release] Add a script to release C\# package
-* [ARROW-5080](https://issues.apache.org/jira/browse/ARROW-5080) - [Release] Add a script to release Rust packages
-* [ARROW-5081](https://issues.apache.org/jira/browse/ARROW-5081) - [C++] Consistently use PATH\_SUFFIXES in CMake config
-* [ARROW-5083](https://issues.apache.org/jira/browse/ARROW-5083) - [Developer] In merge\_arrow\_pr.py script, allow user to set a released Fix Version
-* [ARROW-5088](https://issues.apache.org/jira/browse/ARROW-5088) - [C++] Do not set -Werror when using BUILD\_WARNING\_LEVEL=CHECKIN in release mode
-* [ARROW-5091](https://issues.apache.org/jira/browse/ARROW-5091) - [Flight] Rename FlightGetInfo message to FlightInfo
-* [ARROW-5093](https://issues.apache.org/jira/browse/ARROW-5093) - [Packaging] Add support for selective binary upload
-* [ARROW-5094](https://issues.apache.org/jira/browse/ARROW-5094) - [Packaging] Add APT/Yum verification scripts
-* [ARROW-5102](https://issues.apache.org/jira/browse/ARROW-5102) - [C++] Reduce header dependencies
-* [ARROW-5108](https://issues.apache.org/jira/browse/ARROW-5108) - [Go] implement reading primitive arrays from Arrow file
-* [ARROW-5109](https://issues.apache.org/jira/browse/ARROW-5109) - [Go] implement reading binary/string arrays from Arrow file
-* [ARROW-5110](https://issues.apache.org/jira/browse/ARROW-5110) - [Go] implement reading struct arrays from Arrow file
-* [ARROW-5111](https://issues.apache.org/jira/browse/ARROW-5111) - [Go] implement reading list arrays from Arrow file
-* [ARROW-5112](https://issues.apache.org/jira/browse/ARROW-5112) - [Go] implement writing arrays to Arrow file
-* [ARROW-5113](https://issues.apache.org/jira/browse/ARROW-5113) - [C++][Flight] Unit tests in C++ for DoPut
-* [ARROW-5115](https://issues.apache.org/jira/browse/ARROW-5115) - [JS] Implement the Vector Builders
-* [ARROW-5116](https://issues.apache.org/jira/browse/ARROW-5116) - [Rust] move kernel related files under compute/kernels
-* [ARROW-5124](https://issues.apache.org/jira/browse/ARROW-5124) - [C++] Add support for Parquet in MinGW build
-* [ARROW-5126](https://issues.apache.org/jira/browse/ARROW-5126) - [Rust] [Parquet] Convert parquet column desc to arrow data type
-* [ARROW-5127](https://issues.apache.org/jira/browse/ARROW-5127) - [Rust] [Parquet] Add page iterator
-* [ARROW-5136](https://issues.apache.org/jira/browse/ARROW-5136) - [Flight] Implement call options (timeouts)
-* [ARROW-5137](https://issues.apache.org/jira/browse/ARROW-5137) - [Flight] Implement authentication APIs
-* [ARROW-5145](https://issues.apache.org/jira/browse/ARROW-5145) - [C++] Release mode lacks convenience input validation
-* [ARROW-5150](https://issues.apache.org/jira/browse/ARROW-5150) - [Ruby] Add Arrow::Table\#raw\_records
-* [ARROW-5155](https://issues.apache.org/jira/browse/ARROW-5155) - [GLib][Ruby] Add support for building union arrays from data type
-* [ARROW-5157](https://issues.apache.org/jira/browse/ARROW-5157) - [Website] Add MATLAB to powered by Apache Arrow page
-* [ARROW-5162](https://issues.apache.org/jira/browse/ARROW-5162) - [Rust] [Parquet] Rename mod reader to arrow
-* [ARROW-5163](https://issues.apache.org/jira/browse/ARROW-5163) - [Gandiva] Cast timestamp/date are incorrectly evaluating year 0097 to 1997
-* [ARROW-5164](https://issues.apache.org/jira/browse/ARROW-5164) - [Gandiva] [C++] Introduce 32bit hash functions
-* [ARROW-5165](https://issues.apache.org/jira/browse/ARROW-5165) - [Python][Documentation] Build docs don't suggest assigning $ARROW\_BUILD\_TYPE
-* [ARROW-5168](https://issues.apache.org/jira/browse/ARROW-5168) - [GLib] Add garrow\_array\_take()
-* [ARROW-5171](https://issues.apache.org/jira/browse/ARROW-5171) - [C++] Use LESS instead of LOWER in compare enum option
-* [ARROW-5172](https://issues.apache.org/jira/browse/ARROW-5172) - [Go] implement reading fixed-size binary arrays from Arrow file
-* [ARROW-5178](https://issues.apache.org/jira/browse/ARROW-5178) - [Python] Allow creating Table from Python dict
-* [ARROW-5179](https://issues.apache.org/jira/browse/ARROW-5179) - [Python] Return plain dicts, not OrderedDict, on Python 3.7+
-* [ARROW-5185](https://issues.apache.org/jira/browse/ARROW-5185) - [C++] Add support for Boost with CMake configuration file
-* [ARROW-5187](https://issues.apache.org/jira/browse/ARROW-5187) - [Rust] Ability to flatten StructArray into a RecordBatch
-* [ARROW-5188](https://issues.apache.org/jira/browse/ARROW-5188) - [Rust] Add temporal builders for StructArray
-* [ARROW-5189](https://issues.apache.org/jira/browse/ARROW-5189) - [Rust] [Parquet] Format individual fields within a parquet row
-* [ARROW-5190](https://issues.apache.org/jira/browse/ARROW-5190) - [R] Discussion: tibble dependency in R package
-* [ARROW-5191](https://issues.apache.org/jira/browse/ARROW-5191) - [Rust] Expose CSV and JSON reader schemas
-* [ARROW-5203](https://issues.apache.org/jira/browse/ARROW-5203) - [GLib] Add support for Compare filter
-* [ARROW-5204](https://issues.apache.org/jira/browse/ARROW-5204) - [C++] Improve BufferBuilder performance
-* [ARROW-5212](https://issues.apache.org/jira/browse/ARROW-5212) - [Go] Array BinaryBuilder in Go library provides no way to resize the values buffer
-* [ARROW-5218](https://issues.apache.org/jira/browse/ARROW-5218) - [C++] Improve build when third-party library locations are specified
-* [ARROW-5219](https://issues.apache.org/jira/browse/ARROW-5219) - [C++] Build protobuf\_ep in parallel when using Ninja
-* [ARROW-5222](https://issues.apache.org/jira/browse/ARROW-5222) - [Python] Issues with installing pyarrow for development on MacOS
-* [ARROW-5225](https://issues.apache.org/jira/browse/ARROW-5225) - [Java] Improve performance of BaseValueVector\#getValidityBufferSizeFromCount
-* [ARROW-5226](https://issues.apache.org/jira/browse/ARROW-5226) - [Gandiva] support compare operators for decimal
-* [ARROW-5238](https://issues.apache.org/jira/browse/ARROW-5238) - [Python] Improve usability of pyarrow.dictionary function
-* [ARROW-5241](https://issues.apache.org/jira/browse/ARROW-5241) - [Python] Add option to disable writing statistics to parquet file
-* [ARROW-5250](https://issues.apache.org/jira/browse/ARROW-5250) - [Java] remove javadoc suppression on methods.
-* [ARROW-5252](https://issues.apache.org/jira/browse/ARROW-5252) - [C++] Change variant implementation
-* [ARROW-5256](https://issues.apache.org/jira/browse/ARROW-5256) - [Packaging][deb] Failed to build with LLVM 7.1.0
-* [ARROW-5257](https://issues.apache.org/jira/browse/ARROW-5257) - [Website] Update site to use "official" Apache Arrow logo, add clearly marked links to logo
-* [ARROW-5258](https://issues.apache.org/jira/browse/ARROW-5258) - [C++/Python] Expose file metadata of dataset pieces to caller
-* [ARROW-5261](https://issues.apache.org/jira/browse/ARROW-5261) - [C++] Finish implementation of scalar types for Duration and Interval
-* [ARROW-5262](https://issues.apache.org/jira/browse/ARROW-5262) - [Python] Fix typo
-* [ARROW-5264](https://issues.apache.org/jira/browse/ARROW-5264) - [Java] Allow enabling/disabling boundary checking by environment variable
-* [ARROW-5266](https://issues.apache.org/jira/browse/ARROW-5266) - [Go] implement read/write IPC for Float16
-* [ARROW-5268](https://issues.apache.org/jira/browse/ARROW-5268) - [GLib] Add GArrowJSONReader
-* [ARROW-5269](https://issues.apache.org/jira/browse/ARROW-5269) - [C++] Whitelist benchmarks candidates for regression checks
-* [ARROW-5275](https://issues.apache.org/jira/browse/ARROW-5275) - [C++] Write generic filesystem tests
-* [ARROW-5281](https://issues.apache.org/jira/browse/ARROW-5281) - [Rust] [Parquet] Move DataPageBuilder to test\_common
-* [ARROW-5284](https://issues.apache.org/jira/browse/ARROW-5284) - [Rust] Replace libc with std::alloc for memory allocation
-* [ARROW-5286](https://issues.apache.org/jira/browse/ARROW-5286) - [Python] support Structs in Table.from\_pandas given a known schema
-* [ARROW-5288](https://issues.apache.org/jira/browse/ARROW-5288) - [Documentation] Enrich the contribution guidelines
-* [ARROW-5289](https://issues.apache.org/jira/browse/ARROW-5289) - [C++] Move arrow/util/concatenate.h to arrow/array/
-* [ARROW-5290](https://issues.apache.org/jira/browse/ARROW-5290) - [Java] Provide a flag to enable/disable null-checking in vectors' get methods
-* [ARROW-5291](https://issues.apache.org/jira/browse/ARROW-5291) - [Python] Add wrapper for "take" kernel on Array
-* [ARROW-5298](https://issues.apache.org/jira/browse/ARROW-5298) - [Rust] Add debug implementation for Buffer
-* [ARROW-5299](https://issues.apache.org/jira/browse/ARROW-5299) - [C++] ListArray comparison is incorrect
-* [ARROW-5309](https://issues.apache.org/jira/browse/ARROW-5309) - [Python] Add clarifications to Python "append" methods that return new objects
-* [ARROW-5311](https://issues.apache.org/jira/browse/ARROW-5311) - [C++] Return more specific invalid Status in Take kernel
-* [ARROW-5313](https://issues.apache.org/jira/browse/ARROW-5313) - [Format] Comments on Field table are a bit confusing
-* [ARROW-5317](https://issues.apache.org/jira/browse/ARROW-5317) - [Rust] [Parquet] impl IntoIterator for SerializedFileReader
-* [ARROW-5319](https://issues.apache.org/jira/browse/ARROW-5319) - [CI] Enable ccache with MinGW builds
-* [ARROW-5321](https://issues.apache.org/jira/browse/ARROW-5321) - [Gandiva][C++] add isnull and isnotnull for utf8 and binary types
-* [ARROW-5323](https://issues.apache.org/jira/browse/ARROW-5323) - [CI] Use compression with clcache
-* [ARROW-5328](https://issues.apache.org/jira/browse/ARROW-5328) - [R] Add shell scripts to do a full package rebuild and test locally
-* [ARROW-5329](https://issues.apache.org/jira/browse/ARROW-5329) - Add support for building MATLAB interface to Feather directly within MATLAB
-* [ARROW-5334](https://issues.apache.org/jira/browse/ARROW-5334) - [C++] Add "Type" to names of arrow::Integer, arrow::FloatingPoint classes for consistency
-* [ARROW-5335](https://issues.apache.org/jira/browse/ARROW-5335) - [Python] Raise on variable dictionaries when converting to pandas
-* [ARROW-5339](https://issues.apache.org/jira/browse/ARROW-5339) - [C++] Add jemalloc to thirdparty dependency download script
-* [ARROW-5341](https://issues.apache.org/jira/browse/ARROW-5341) - [C++] Add instructions about fixing and testing for -Wdocumentation clang warnings locally
-* [ARROW-5342](https://issues.apache.org/jira/browse/ARROW-5342) - [Format] Formalize extension type metadata in IPC protocol
-* [ARROW-5346](https://issues.apache.org/jira/browse/ARROW-5346) - [C++] Revert changes to qualify duration in vendored date code
-* [ARROW-5349](https://issues.apache.org/jira/browse/ARROW-5349) - [Python/C++] Provide a way to specify the file path in parquet ColumnChunkMetaData
-* [ARROW-5361](https://issues.apache.org/jira/browse/ARROW-5361) - [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144
-* [ARROW-5363](https://issues.apache.org/jira/browse/ARROW-5363) - [GLib] Fix coding styles
-* [ARROW-5364](https://issues.apache.org/jira/browse/ARROW-5364) - [C++] Use ASCII rather than UTF-8 in BuildUtils.cmake comment
-* [ARROW-5365](https://issues.apache.org/jira/browse/ARROW-5365) - [C++][CI] Add UBSan and ASAN into CI
-* [ARROW-5368](https://issues.apache.org/jira/browse/ARROW-5368) - [C++] Disable jemalloc by default with MinGW
-* [ARROW-5369](https://issues.apache.org/jira/browse/ARROW-5369) - [C++] Add support for glog on Windows
-* [ARROW-5370](https://issues.apache.org/jira/browse/ARROW-5370) - [C++] Detect system uriparser by default
-* [ARROW-5372](https://issues.apache.org/jira/browse/ARROW-5372) - [GLib] Add support for null/boolean values CSV read option
-* [ARROW-5378](https://issues.apache.org/jira/browse/ARROW-5378) - [C++] Add local FileSystem implementation
-* [ARROW-5384](https://issues.apache.org/jira/browse/ARROW-5384) - [Go] add FixedSizeList array
-* [ARROW-5389](https://issues.apache.org/jira/browse/ARROW-5389) - [C++] Add an internal temporary directory API
-* [ARROW-5392](https://issues.apache.org/jira/browse/ARROW-5392) - [C++][CI][MinGW] Disable static library build on AppVeyor
-* [ARROW-5393](https://issues.apache.org/jira/browse/ARROW-5393) - [R] Add tests and example for read\_parquet()
-* [ARROW-5395](https://issues.apache.org/jira/browse/ARROW-5395) - [C++] Utilize stream EOS in File format
-* [ARROW-5396](https://issues.apache.org/jira/browse/ARROW-5396) - [JS] Ensure reader and writer support files and streams with no RecordBatches
-* [ARROW-5401](https://issues.apache.org/jira/browse/ARROW-5401) - [CI] [C++] Print ccache statistics on Travis-CI
-* [ARROW-5404](https://issues.apache.org/jira/browse/ARROW-5404) - [C++] nonstd::string\_view conflicts with std::string\_view in c++17
-* [ARROW-5407](https://issues.apache.org/jira/browse/ARROW-5407) - [C++] Integration test Travis CI entry builds many unnecessary targets
-* [ARROW-5413](https://issues.apache.org/jira/browse/ARROW-5413) - [C++] CSV reader doesn't remove BOM
-* [ARROW-5415](https://issues.apache.org/jira/browse/ARROW-5415) - [Release] Release script should update R version everywhere
-* [ARROW-5416](https://issues.apache.org/jira/browse/ARROW-5416) - [Website] Add Homebrew to project installation page
-* [ARROW-5418](https://issues.apache.org/jira/browse/ARROW-5418) - [CI][R] Run code coverage and report to codecov.io
-* [ARROW-5420](https://issues.apache.org/jira/browse/ARROW-5420) - [Java] Implement or remove getCurrentSizeInBytes in VariableWidthVector
-* [ARROW-5427](https://issues.apache.org/jira/browse/ARROW-5427) - [Python] RangeIndex serialization change implications
-* [ARROW-5428](https://issues.apache.org/jira/browse/ARROW-5428) - [C++] Add option to set "read extent" in arrow::io::BufferedInputStream
-* [ARROW-5429](https://issues.apache.org/jira/browse/ARROW-5429) - [Java] Provide alternative buffer allocation policy
-* [ARROW-5432](https://issues.apache.org/jira/browse/ARROW-5432) - [Python] Add 'read\_at' method to pyarrow.NativeFile
-* [ARROW-5433](https://issues.apache.org/jira/browse/ARROW-5433) - [C++][Parquet] improve parquet-reader columns information
-* [ARROW-5434](https://issues.apache.org/jira/browse/ARROW-5434) - [Java] Introduce wrappers for backward compatibility for ArrowBuf changes in ARROW-3191
-* [ARROW-5436](https://issues.apache.org/jira/browse/ARROW-5436) - [Python] expose filters argument in parquet.read\_table
-* [ARROW-5438](https://issues.apache.org/jira/browse/ARROW-5438) - [JS] Utilize stream EOS in File format
-* [ARROW-5441](https://issues.apache.org/jira/browse/ARROW-5441) - [C++] Implement FindArrowFlight.cmake
-* [ARROW-5442](https://issues.apache.org/jira/browse/ARROW-5442) - [Website] Clarify what makes a release artifact "official"
-* [ARROW-5443](https://issues.apache.org/jira/browse/ARROW-5443) - [Gandiva][Crossbow] Turn parquet encryption off
-* [ARROW-5447](https://issues.apache.org/jira/browse/ARROW-5447) - [CI] [Ruby] CI fails on AppVeyor
-* [ARROW-5449](https://issues.apache.org/jira/browse/ARROW-5449) - [C++] Local filesystem implementation: investigate Windows UNC paths
-* [ARROW-5451](https://issues.apache.org/jira/browse/ARROW-5451) - [C++][Gandiva] Add round functions for decimals
-* [ARROW-5452](https://issues.apache.org/jira/browse/ARROW-5452) - [R] Add documentation website (pkgdown)
-* [ARROW-5461](https://issues.apache.org/jira/browse/ARROW-5461) - [Java] Add micro-benchmarks for Float8Vector and allocators
-* [ARROW-5463](https://issues.apache.org/jira/browse/ARROW-5463) - [Rust] Implement AsRef for Buffer
-* [ARROW-5464](https://issues.apache.org/jira/browse/ARROW-5464) - [Archery] Bad --benchmark-filter default
-* [ARROW-5465](https://issues.apache.org/jira/browse/ARROW-5465) - [Crossbow] Support writing submitted job definition yaml to a file
-* [ARROW-5466](https://issues.apache.org/jira/browse/ARROW-5466) - [Java] Dockerize Java builds in Travis CI, run multiple JDKs in single entry
-* [ARROW-5467](https://issues.apache.org/jira/browse/ARROW-5467) - [Go] implement read/write IPC for Time32/Time64 arrays
-* [ARROW-5468](https://issues.apache.org/jira/browse/ARROW-5468) - [Go] implement read/write IPC for Timestamp arrays
-* [ARROW-5469](https://issues.apache.org/jira/browse/ARROW-5469) - [Go] implement read/write IPC for Date32/Date64 arrays
-* [ARROW-5470](https://issues.apache.org/jira/browse/ARROW-5470) - [CI] C++ local filesystem patch breaks Travis R job
-* [ARROW-5472](https://issues.apache.org/jira/browse/ARROW-5472) - [Development] Add warning to PR merge tool if no JIRA component is set
-* [ARROW-5474](https://issues.apache.org/jira/browse/ARROW-5474) - [C++] Document required Boost version
-* [ARROW-5475](https://issues.apache.org/jira/browse/ARROW-5475) - [Python] Add Python binding for arrow::Concatenate
-* [ARROW-5476](https://issues.apache.org/jira/browse/ARROW-5476) - [Java][Memory] Fix Netty ArrowBuf Slice
-* [ARROW-5477](https://issues.apache.org/jira/browse/ARROW-5477) - [C++] Check required RapidJSON version
-* [ARROW-5478](https://issues.apache.org/jira/browse/ARROW-5478) - [Packaging] Drop Ubuntu 14.04 support
-* [ARROW-5481](https://issues.apache.org/jira/browse/ARROW-5481) - [GLib] garrow\_seekable\_input\_stream\_peek() misses "error" parameter document
-* [ARROW-5485](https://issues.apache.org/jira/browse/ARROW-5485) - [Gandiva][Crossbow] OSx builds failing
-* [ARROW-5486](https://issues.apache.org/jira/browse/ARROW-5486) - [GLib] Add binding of gandiva::FunctionRegistry and related things
-* [ARROW-5488](https://issues.apache.org/jira/browse/ARROW-5488) - [R] Workaround when C++ lib not available
-* [ARROW-5490](https://issues.apache.org/jira/browse/ARROW-5490) - [C++] Remove ARROW\_BOOST\_HEADER\_ONLY
-* [ARROW-5491](https://issues.apache.org/jira/browse/ARROW-5491) - [C++] Remove unnecessary semicolons following MACRO definitions
-* [ARROW-5492](https://issues.apache.org/jira/browse/ARROW-5492) - [R] Add "col\_select" argument to read\_\* functions to read subset of columns
-* [ARROW-5495](https://issues.apache.org/jira/browse/ARROW-5495) - [C++] Use HTTPS consistently for downloading dependencies
-* [ARROW-5496](https://issues.apache.org/jira/browse/ARROW-5496) - [R][CI] Fix relative paths in R codecov.io reporting
-* [ARROW-5498](https://issues.apache.org/jira/browse/ARROW-5498) - [C++] Build failure with Flatbuffers 1.11.0 and MinGW
-* [ARROW-5499](https://issues.apache.org/jira/browse/ARROW-5499) - [R] Alternate bindings for when libarrow is not found
-* [ARROW-5500](https://issues.apache.org/jira/browse/ARROW-5500) - [R] read\_csv\_arrow() signature should match readr::read\_csv()
-* [ARROW-5503](https://issues.apache.org/jira/browse/ARROW-5503) - [R] add read\_json()
-* [ARROW-5504](https://issues.apache.org/jira/browse/ARROW-5504) - [R] move use\_threads argument to global option
-* [ARROW-5509](https://issues.apache.org/jira/browse/ARROW-5509) - [R] write\_parquet()
-* [ARROW-5511](https://issues.apache.org/jira/browse/ARROW-5511) - [Packaging] Enable Flight in Conda packages
-* [ARROW-5512](https://issues.apache.org/jira/browse/ARROW-5512) - [C++] Draft initial public APIs for Datasets project
-* [ARROW-5513](https://issues.apache.org/jira/browse/ARROW-5513) - [Java] Refactor method name for getstartOffset to use camel case
-* [ARROW-5516](https://issues.apache.org/jira/browse/ARROW-5516) - [Python] Development page for pyarrow is missing a dependency when using pip
-* [ARROW-5518](https://issues.apache.org/jira/browse/ARROW-5518) - [Java] Set VectorSchemaRoot rowCount to 0 on allocateNew and clear
-* [ARROW-5524](https://issues.apache.org/jira/browse/ARROW-5524) - [C++] Turn off PARQUET\_BUILD\_ENCRYPTION in CMake if OpenSSL not found
-* [ARROW-5526](https://issues.apache.org/jira/browse/ARROW-5526) - [Developer] Add more prominent notice to GitHub issue template to direct bug reports to JIRA
-* [ARROW-5529](https://issues.apache.org/jira/browse/ARROW-5529) - [Flight] Allow serving with multiple TLS certificates
-* [ARROW-5531](https://issues.apache.org/jira/browse/ARROW-5531) - [Python] Support binary, utf8, and nested types in Array.from\_buffers
-* [ARROW-5533](https://issues.apache.org/jira/browse/ARROW-5533) - [Plasma] Plasma client should be thread-safe
-* [ARROW-5534](https://issues.apache.org/jira/browse/ARROW-5534) - [GLib] Add garrow\_table\_concatenate()
-* [ARROW-5535](https://issues.apache.org/jira/browse/ARROW-5535) - [GLib] Add garrow\_table\_slice()
-* [ARROW-5537](https://issues.apache.org/jira/browse/ARROW-5537) - [JS] Support delta dictionaries in RecordBatchWriter and DictionaryBuilder
-* [ARROW-5538](https://issues.apache.org/jira/browse/ARROW-5538) - [C++] Restrict minimum OpenSSL version to 1.0.2
-* [ARROW-5541](https://issues.apache.org/jira/browse/ARROW-5541) - [R] casts from negative int32 to uint32 and uint64 are now safe
-* [ARROW-5544](https://issues.apache.org/jira/browse/ARROW-5544) - [Archery] should not return non-zero in \`benchmark diff\` subcommand on regression
-* [ARROW-5545](https://issues.apache.org/jira/browse/ARROW-5545) - [C++][Docs] Clarify expectation of UTC values for timestamps with time zones in C++ API docs
-* [ARROW-5547](https://issues.apache.org/jira/browse/ARROW-5547) - [C++][FlightRPC] arrow-flight.pc isn't provided
-* [ARROW-5552](https://issues.apache.org/jira/browse/ARROW-5552) - [Go] make Schema and Field implement Stringer
-* [ARROW-5554](https://issues.apache.org/jira/browse/ARROW-5554) - Add a Python wrapper for arrow::Concatenate
-* [ARROW-5555](https://issues.apache.org/jira/browse/ARROW-5555) - [R] Add install\_arrow() function to assist the user in obtaining C++ runtime libraries
-* [ARROW-5556](https://issues.apache.org/jira/browse/ARROW-5556) - [Doc] Document JSON reader
-* [ARROW-5557](https://issues.apache.org/jira/browse/ARROW-5557) - [C++] Investigate performance of VisitBitsUnrolled on different platforms
-* [ARROW-5565](https://issues.apache.org/jira/browse/ARROW-5565) - [Python] Document how to use gdb when working on pyarrow
-* [ARROW-5567](https://issues.apache.org/jira/browse/ARROW-5567) - [C++] Fix build error of memory-benchmark
-* [ARROW-5571](https://issues.apache.org/jira/browse/ARROW-5571) - [R] Rework handling of ARROW\_R\_WITH\_PARQUET
-* [ARROW-5574](https://issues.apache.org/jira/browse/ARROW-5574) - [R] documentation error for read\_arrow()
-* [ARROW-5581](https://issues.apache.org/jira/browse/ARROW-5581) - [Java] Provide interfaces and initial implementations for vector sorting
-* [ARROW-5582](https://issues.apache.org/jira/browse/ARROW-5582) - [Go] add support for comparing Records
-* [ARROW-5586](https://issues.apache.org/jira/browse/ARROW-5586) - [R] convert Array of LIST type to R lists
-* [ARROW-5587](https://issues.apache.org/jira/browse/ARROW-5587) - [Java] Add more maven style check for Java code
-* [ARROW-5590](https://issues.apache.org/jira/browse/ARROW-5590) - [R] Run "no libarrow" R build in the same CI entry if possible
-* [ARROW-5591](https://issues.apache.org/jira/browse/ARROW-5591) - [Go] implement read/write IPC for Duration & Intervals
-* [ARROW-5597](https://issues.apache.org/jira/browse/ARROW-5597) - [Packaging][deb] Add Flight packages
-* [ARROW-5600](https://issues.apache.org/jira/browse/ARROW-5600) - [R] R package namespace cleanup
-* [ARROW-5602](https://issues.apache.org/jira/browse/ARROW-5602) - [Java][Gandiva] Add test for decimal round functions
-* [ARROW-5604](https://issues.apache.org/jira/browse/ARROW-5604) - [Go] improve test coverage of type-traits
-* [ARROW-5609](https://issues.apache.org/jira/browse/ARROW-5609) - [C++] Set CMP0068 CMake policy to avoid macOS warnings
-* [ARROW-5612](https://issues.apache.org/jira/browse/ARROW-5612) - [Python][Documentation] Clarify date\_as\_object option behavior
-* [ARROW-5621](https://issues.apache.org/jira/browse/ARROW-5621) - [Go] implement read/write IPC for Decimal128 arrays
-* [ARROW-5622](https://issues.apache.org/jira/browse/ARROW-5622) - [C++][Dataset] arrow-dataset.pc isn't provided
-* [ARROW-5625](https://issues.apache.org/jira/browse/ARROW-5625) - [R] convert Array of struct type to data frame columns
-* [ARROW-5632](https://issues.apache.org/jira/browse/ARROW-5632) - [Doc] Add some documentation describing compile/debug workflow on macOS with Xcode IDE
-* [ARROW-5633](https://issues.apache.org/jira/browse/ARROW-5633) - [Python] Enable bz2 in Linux wheels
-* [ARROW-5635](https://issues.apache.org/jira/browse/ARROW-5635) - [C++] Support "compacting" a table
-* [ARROW-5637](https://issues.apache.org/jira/browse/ARROW-5637) - [Gandiva][Java] Complete IN Expression
-* [ARROW-5639](https://issues.apache.org/jira/browse/ARROW-5639) - [Java] Remove floating point computation from getOffsetBufferValueCapacity
-* [ARROW-5641](https://issues.apache.org/jira/browse/ARROW-5641) - [GLib] Remove enums files generated by GNU Autotools from Git targets
-* [ARROW-5643](https://issues.apache.org/jira/browse/ARROW-5643) - [Flight] Add ability to override hostname checking
-* [ARROW-5650](https://issues.apache.org/jira/browse/ARROW-5650) - [Python] Update manylinux dependency versions
-* [ARROW-5652](https://issues.apache.org/jira/browse/ARROW-5652) - [CI] Fix iwyu docker image
-* [ARROW-5653](https://issues.apache.org/jira/browse/ARROW-5653) - [CI] Fix cpp docker image
-* [ARROW-5656](https://issues.apache.org/jira/browse/ARROW-5656) - [Python] Enable Flight wheels on macOS
-* [ARROW-5659](https://issues.apache.org/jira/browse/ARROW-5659) - [C++] Add support for finding OpenSSL installed by Homebrew
-* [ARROW-5660](https://issues.apache.org/jira/browse/ARROW-5660) - [GLib][CI] Use the latest macOS image and all Homebrew based libraries
-* [ARROW-5661](https://issues.apache.org/jira/browse/ARROW-5661) - Support hash functions for decimal in Gandiva
-* [ARROW-5662](https://issues.apache.org/jira/browse/ARROW-5662) - [C++] Add support for BOOST\_SOURCE=AUTO|BUNDLED|SYSTEM
-* [ARROW-5663](https://issues.apache.org/jira/browse/ARROW-5663) - [Packaging][RPM] Update CentOS packages for 0.14.0
-* [ARROW-5664](https://issues.apache.org/jira/browse/ARROW-5664) - [Crossbow] Execute nightly crossbow tests on CircleCI instead of Travis
-* [ARROW-5668](https://issues.apache.org/jira/browse/ARROW-5668) - [Python] Display "not null" in Schema.\_\_repr\_\_ for non-nullable fields
-* [ARROW-5669](https://issues.apache.org/jira/browse/ARROW-5669) - [Crossbow] manylinux1 wheel building failing
-* [ARROW-5670](https://issues.apache.org/jira/browse/ARROW-5670) - [Crossbow] get\_apache\_mirror.py fails with TLS error on macOS with Python 3.5
-* [ARROW-5671](https://issues.apache.org/jira/browse/ARROW-5671) - [Crossbow] macOS Python wheels failing
-* [ARROW-5672](https://issues.apache.org/jira/browse/ARROW-5672) - [Java] Refactor redundant method modifier
-* [ARROW-5683](https://issues.apache.org/jira/browse/ARROW-5683) - [R] Add snappy to Rtools Windows builds
-* [ARROW-5684](https://issues.apache.org/jira/browse/ARROW-5684) - [Packaging][deb] Add support for Ubuntu 19.04
-* [ARROW-5685](https://issues.apache.org/jira/browse/ARROW-5685) - [Packaging][deb] Add support for Apache Arrow Datasets
-* [ARROW-5687](https://issues.apache.org/jira/browse/ARROW-5687) - [C++] Remove remaining uses of ARROW\_BOOST\_VENDORED
-* [ARROW-5690](https://issues.apache.org/jira/browse/ARROW-5690) - [Packaging][Python] macOS wheels broken: libprotobuf.18.dylib missing
-* [ARROW-5694](https://issues.apache.org/jira/browse/ARROW-5694) - [Python] List of decimals are not supported when converting to pandas
-* [ARROW-5695](https://issues.apache.org/jira/browse/ARROW-5695) - [C\#][Release] Run sourcelink test in verify-release-candidate.sh
-* [ARROW-5696](https://issues.apache.org/jira/browse/ARROW-5696) - [Gandiva] [C++] Introduce castVarcharVarchar
-* [ARROW-5699](https://issues.apache.org/jira/browse/ARROW-5699) - [C++] Optimize parsing of Decimal128 in CSV
-* [ARROW-5701](https://issues.apache.org/jira/browse/ARROW-5701) - [C++][Gandiva] Build expressions only for the required selection vector types
-* [ARROW-5702](https://issues.apache.org/jira/browse/ARROW-5702) - [C++] parquet::arrow::FileReader::GetSchema()
-* [ARROW-5704](https://issues.apache.org/jira/browse/ARROW-5704) - [C++] Stop using ARROW\_TEMPLATE\_EXPORT for SparseTensorImpl class
-* [ARROW-5705](https://issues.apache.org/jira/browse/ARROW-5705) - [Java] Optimize BaseValueVector\#computeCombinedBufferSize logic
-* [ARROW-5706](https://issues.apache.org/jira/browse/ARROW-5706) - [Java] Remove type conversion in getValidityBufferValueCapacity
-* [ARROW-5707](https://issues.apache.org/jira/browse/ARROW-5707) - [Java] Improve the performance and code structure for ArrowRecordBatch
-* [ARROW-5710](https://issues.apache.org/jira/browse/ARROW-5710) - [C++] Allow compiling Gandiva with Ninja on Windows
-* [ARROW-5715](https://issues.apache.org/jira/browse/ARROW-5715) - [Release] Verify Ubuntu 19.04 APT repository
-* [ARROW-5718](https://issues.apache.org/jira/browse/ARROW-5718) - [R] auto splice data frames in record\_batch() and table()
-* [ARROW-5720](https://issues.apache.org/jira/browse/ARROW-5720) - [C++] Create benchmarks for decimal related classes.
-* [ARROW-5721](https://issues.apache.org/jira/browse/ARROW-5721) - [Rust] Move array related code into a separate module
-* [ARROW-5724](https://issues.apache.org/jira/browse/ARROW-5724) - [R] [CI] AppVeyor build should use ccache
-* [ARROW-5725](https://issues.apache.org/jira/browse/ARROW-5725) - [Crossbow] Port conda recipes to azure pipelines
-* [ARROW-5726](https://issues.apache.org/jira/browse/ARROW-5726) - [Java] Implement a common interface for int vectors
-* [ARROW-5727](https://issues.apache.org/jira/browse/ARROW-5727) - [Python] [CI] Install pytest-faulthandler before running tests
-* [ARROW-5748](https://issues.apache.org/jira/browse/ARROW-5748) - [Packaging][deb] Add support for Debian GNU/Linux buster
-* [ARROW-5749](https://issues.apache.org/jira/browse/ARROW-5749) - [Python] Add Python binding for Table::CombineChunks()
-* [ARROW-5751](https://issues.apache.org/jira/browse/ARROW-5751) - [Packaging][Python] Python macOS wheels have dynamic dependency on libcares
-* [ARROW-5752](https://issues.apache.org/jira/browse/ARROW-5752) - [Java] Improve the performance of ArrowBuf\#setZero
-* [ARROW-5755](https://issues.apache.org/jira/browse/ARROW-5755) - [Rust] [Parquet] Add derived clone for Type
-* [ARROW-5768](https://issues.apache.org/jira/browse/ARROW-5768) - [Release] There are needless newlines at the end of CHANGELOG.md
-* [ARROW-5773](https://issues.apache.org/jira/browse/ARROW-5773) - [R] Clean up documentation before release
-* [ARROW-5780](https://issues.apache.org/jira/browse/ARROW-5780) - [C++] Add benchmark for Decimal128 operations
-* [ARROW-5782](https://issues.apache.org/jira/browse/ARROW-5782) - [Release] Setup test data for Flight in dev/release/01-perform.sh
-* [ARROW-5783](https://issues.apache.org/jira/browse/ARROW-5783) - [Release][C\#] Exclude dummy.git from RAT check
-* [ARROW-5785](https://issues.apache.org/jira/browse/ARROW-5785) - [Rust] Rust datafusion implementation should not depend on rustyline
-* [ARROW-5787](https://issues.apache.org/jira/browse/ARROW-5787) - [Release][Rust] Use local modules to verify RC
-* [ARROW-5793](https://issues.apache.org/jira/browse/ARROW-5793) - [Release] Avoid duplicate known host SSH error in dev/release/03-binary.sh
-* [ARROW-5794](https://issues.apache.org/jira/browse/ARROW-5794) - [Release] Skip uploading already uploaded binaries
-* [ARROW-5795](https://issues.apache.org/jira/browse/ARROW-5795) - [Release] Add missing waits on uploading binaries
-* [ARROW-5796](https://issues.apache.org/jira/browse/ARROW-5796) - [Release][APT] Update expected package list
-* [ARROW-5797](https://issues.apache.org/jira/browse/ARROW-5797) - [Release][APT] Update supported distributions
-* [ARROW-5818](https://issues.apache.org/jira/browse/ARROW-5818) - [Java][Gandiva] support varlen output vectors
-* [ARROW-5820](https://issues.apache.org/jira/browse/ARROW-5820) - [Release] Remove undefined variable check from verify script
-* [ARROW-5826](https://issues.apache.org/jira/browse/ARROW-5826) - [Website] Blog post for 0.14.0 release announcement
-* [PARQUET-1243](https://issues.apache.org/jira/browse/PARQUET-1243) - [C++] Improve quality of error message for zero-length files, otherwise corrupted files
-* [PARQUET-1411](https://issues.apache.org/jira/browse/PARQUET-1411) - [C++] Upgrade to use LogicalType annotations instead of ConvertedType
-* [PARQUET-1422](https://issues.apache.org/jira/browse/PARQUET-1422) - [C++] Use Arrow IO interfaces natively rather than current parquet:: wrappers
-* [PARQUET-1517](https://issues.apache.org/jira/browse/PARQUET-1517) - [C++] Update cpp crypto package to match signed-off specification
-* [PARQUET-1523](https://issues.apache.org/jira/browse/PARQUET-1523) - [C++] Vectorize comparator interface
-* [PARQUET-1569](https://issues.apache.org/jira/browse/PARQUET-1569) - [C++] Consolidate testing header files
-* [PARQUET-1582](https://issues.apache.org/jira/browse/PARQUET-1582) - [C++] Add ToString method to ColumnDescriptor
-* [PARQUET-1583](https://issues.apache.org/jira/browse/PARQUET-1583) - [C++] Remove parquet::Vector class
-* [PARQUET-1586](https://issues.apache.org/jira/browse/PARQUET-1586) - [C++] Add --dump options to parquet-reader tool to dump def/rep levels
-* [PARQUET-1603](https://issues.apache.org/jira/browse/PARQUET-1603) - [C++] rename parquet::LogicalType to parquet::ConvertedType
-
-
-## Bug Fixes
-
-* [ARROW-61](https://issues.apache.org/jira/browse/ARROW-61) - [Java] Method can return the value bigger than long MAX\_VALUE
-* [ARROW-352](https://issues.apache.org/jira/browse/ARROW-352) - [Format] Interval(DAY\_TIME) has no unit
-* [ARROW-1837](https://issues.apache.org/jira/browse/ARROW-1837) - [Java] Unable to read unsigned integers outside signed range for bit width in integration tests
-* [ARROW-2119](https://issues.apache.org/jira/browse/ARROW-2119) - [C++][Java] Handle Arrow stream with zero record batch
-* [ARROW-2136](https://issues.apache.org/jira/browse/ARROW-2136) - [Python] Non-nullable schema fields not checked in conversions from pandas
-* [ARROW-2256](https://issues.apache.org/jira/browse/ARROW-2256) - [C++] Fuzzer builds fail out of the box on Ubuntu 16.04 using LLVM apt repos
-* [ARROW-2461](https://issues.apache.org/jira/browse/ARROW-2461) - [Python] Build wheels for manylinux2010 tag
-* [ARROW-2590](https://issues.apache.org/jira/browse/ARROW-2590) - [Python] Pyspark python\_udf serialization error on grouped map (Amazon EMR)
-* [ARROW-3344](https://issues.apache.org/jira/browse/ARROW-3344) - [Python] test\_plasma.py fails (in test\_plasma\_list)
-* [ARROW-3399](https://issues.apache.org/jira/browse/ARROW-3399) - [Python] Cannot serialize numpy matrix object
-* [ARROW-3650](https://issues.apache.org/jira/browse/ARROW-3650) - [Python] Mixed column indexes are read back as strings
-* [ARROW-3801](https://issues.apache.org/jira/browse/ARROW-3801) - [Python] Pandas-Arrow roundtrip makes pd categorical index not writeable
-* [ARROW-4021](https://issues.apache.org/jira/browse/ARROW-4021) - [Ruby] Error building red-arrow on msys2
-* [ARROW-4076](https://issues.apache.org/jira/browse/ARROW-4076) - [Python] schema validation and filters
-* [ARROW-4139](https://issues.apache.org/jira/browse/ARROW-4139) - [Python] Cast Parquet column statistics to unicode if UTF8 ConvertedType is set
-* [ARROW-4301](https://issues.apache.org/jira/browse/ARROW-4301) - [Java][Gandiva] Maven snapshot version update does not seem to update Gandiva submodule
-* [ARROW-4324](https://issues.apache.org/jira/browse/ARROW-4324) - [Python] Array dtype inference incorrect when created from list of mixed numpy scalars
-* [ARROW-4350](https://issues.apache.org/jira/browse/ARROW-4350) - [Python] dtype=object arrays cannot be converted to a list-of-list ListArray
-* [ARROW-4433](https://issues.apache.org/jira/browse/ARROW-4433) - [R] Segmentation fault when instantiating arrow::table from data frame
-* [ARROW-4447](https://issues.apache.org/jira/browse/ARROW-4447) - [C++] Investigate dynamic linking for libthrift
-* [ARROW-4516](https://issues.apache.org/jira/browse/ARROW-4516) - [Python] Error while creating a ParquetDataset on a path without \`\_common\_dataset\` but with an empty \`\_tempfile\`
-* [ARROW-4523](https://issues.apache.org/jira/browse/ARROW-4523) - [JS] Add row proxy generation benchmark
-* [ARROW-4651](https://issues.apache.org/jira/browse/ARROW-4651) - [Format] Flight Location should be more flexible than a (host, port) pair
-* [ARROW-4665](https://issues.apache.org/jira/browse/ARROW-4665) - [C++] With glog activated, DCHECK macros are redefined
-* [ARROW-4675](https://issues.apache.org/jira/browse/ARROW-4675) - [Python] Error serializing bool ndarray in py2 and deserializing in py3
-* [ARROW-4694](https://issues.apache.org/jira/browse/ARROW-4694) - [CI] detect-changes.py is inconsistent
-* [ARROW-4723](https://issues.apache.org/jira/browse/ARROW-4723) - [Python] Skip \_files when reading a directory containing parquet files
-* [ARROW-4725](https://issues.apache.org/jira/browse/ARROW-4725) - [C++] Dictionary tests disabled under MinGW builds
-* [ARROW-4823](https://issues.apache.org/jira/browse/ARROW-4823) - [Python] read\_csv shouldn't close file handles it doesn't own
-* [ARROW-4832](https://issues.apache.org/jira/browse/ARROW-4832) - [Python] pandas Index metadata for RangeIndex is incorrect
-* [ARROW-4845](https://issues.apache.org/jira/browse/ARROW-4845) - [R] Compiler warnings on Windows MingW64
-* [ARROW-4851](https://issues.apache.org/jira/browse/ARROW-4851) - [Java] BoundsChecking.java defaulting behavior for old drill parameter seems off
-* [ARROW-4877](https://issues.apache.org/jira/browse/ARROW-4877) - [Plasma] CI failure in test\_plasma\_list
-* [ARROW-4884](https://issues.apache.org/jira/browse/ARROW-4884) - [C++] conda-forge thrift-cpp package not available via pkg-config or cmake
-* [ARROW-4885](https://issues.apache.org/jira/browse/ARROW-4885) - [Python] read\_csv() can't handle decimal128 columns
-* [ARROW-4886](https://issues.apache.org/jira/browse/ARROW-4886) - [Rust] Inconsistent behaviour with casting sliced primitive array to list array
-* [ARROW-4923](https://issues.apache.org/jira/browse/ARROW-4923) - Expose setters for Decimal vector that take long and double inputs
-* [ARROW-4934](https://issues.apache.org/jira/browse/ARROW-4934) - [Python] Address deprecation notice that will be a bug in Python 3.8
-* [ARROW-5019](https://issues.apache.org/jira/browse/ARROW-5019) - [C\#] ArrowStreamWriter doesn't work on a non-seekable stream
-* [ARROW-5049](https://issues.apache.org/jira/browse/ARROW-5049) - [Python] org/apache/hadoop/fs/FileSystem class not found when pyarrow FileSystem used in spark
-* [ARROW-5051](https://issues.apache.org/jira/browse/ARROW-5051) - [GLib][Gandiva] Test failure in release verification script
-* [ARROW-5055](https://issues.apache.org/jira/browse/ARROW-5055) - [Ruby][MSYS2] libparquet needs to be installed in MSYS2 for Ruby
-* [ARROW-5058](https://issues.apache.org/jira/browse/ARROW-5058) - [Release] 02-source.sh generates e-mail template with wrong links
-* [ARROW-5059](https://issues.apache.org/jira/browse/ARROW-5059) - [C++][Gandiva] cbrt\_\* floating point tests can fail due to exact comparisons
-* [ARROW-5065](https://issues.apache.org/jira/browse/ARROW-5065) - [Rust] cast kernel does not support casting from Int64
-* [ARROW-5068](https://issues.apache.org/jira/browse/ARROW-5068) - [Gandiva][Packaging] Fix gandiva nightly builds after the CMake refactor
-* [ARROW-5090](https://issues.apache.org/jira/browse/ARROW-5090) - Parquet linking fails on macOS due to @rpath in dylib
-* [ARROW-5092](https://issues.apache.org/jira/browse/ARROW-5092) - [C\#] Source Link doesn't work with the C\# release script
-* [ARROW-5095](https://issues.apache.org/jira/browse/ARROW-5095) - [Flight][C++] Flight DoGet doesn't expose server error message
-* [ARROW-5096](https://issues.apache.org/jira/browse/ARROW-5096) - [Packaging][deb] plasma-store-server packages are missing
-* [ARROW-5097](https://issues.apache.org/jira/browse/ARROW-5097) - [Packaging][CentOS6] arrow-lib has unresolvable dependencies
-* [ARROW-5098](https://issues.apache.org/jira/browse/ARROW-5098) - [Website] Update APT install document for 0.13.0
-* [ARROW-5100](https://issues.apache.org/jira/browse/ARROW-5100) - [JS] Writer swaps byte order if buffers share the same underlying ArrayBuffer
-* [ARROW-5117](https://issues.apache.org/jira/browse/ARROW-5117) - [Go] Panic when appending zero slices after initializing a builder
-* [ARROW-5119](https://issues.apache.org/jira/browse/ARROW-5119) - [Go] invalid Stringer implementation for array.Boolean
-* [ARROW-5122](https://issues.apache.org/jira/browse/ARROW-5122) - [Python] pyarrow.parquet.read\_table raises non-file path error when given a windows path to a directory
-* [ARROW-5128](https://issues.apache.org/jira/browse/ARROW-5128) - [Packaging][CentOS][Conda] Numpy not found in nightly builds
-* [ARROW-5129](https://issues.apache.org/jira/browse/ARROW-5129) - [Rust][Parquet] Column writer bug: check dictionary encoder when adding a new data page
-* [ARROW-5130](https://issues.apache.org/jira/browse/ARROW-5130) - [Python] Segfault when importing TensorFlow after Pyarrow
-* [ARROW-5132](https://issues.apache.org/jira/browse/ARROW-5132) - [Java] Errors on building gandiva\_jni.dll on Windows with Visual Studio 2017
-* [ARROW-5138](https://issues.apache.org/jira/browse/ARROW-5138) - [Python/C++] Row group retrieval doesn't restore index properly
-* [ARROW-5140](https://issues.apache.org/jira/browse/ARROW-5140) - [Bug?][Parquet] Can write a jagged array column of strings to disk, but hit \`ArrowNotImplementedError\` on read
-* [ARROW-5142](https://issues.apache.org/jira/browse/ARROW-5142) - [CI] Fix conda calls in AppVeyor scripts
-* [ARROW-5144](https://issues.apache.org/jira/browse/ARROW-5144) - [Python] ParquetDataset and ParquetPiece not serializable
-* [ARROW-5146](https://issues.apache.org/jira/browse/ARROW-5146) - [Dev] Merge script imposes directory name
-* [ARROW-5147](https://issues.apache.org/jira/browse/ARROW-5147) - [C++] get an error in building: Could NOT find DoubleConversion
-* [ARROW-5148](https://issues.apache.org/jira/browse/ARROW-5148) - [CI] [C++] LLVM-related compile errors
-* [ARROW-5149](https://issues.apache.org/jira/browse/ARROW-5149) - [Packaging][Wheel] Pin LLVM to version 7 in windows builds
-* [ARROW-5152](https://issues.apache.org/jira/browse/ARROW-5152) - [Python] CMake warnings when building
-* [ARROW-5159](https://issues.apache.org/jira/browse/ARROW-5159) - Unable to build benches in arrow crate.
-* [ARROW-5160](https://issues.apache.org/jira/browse/ARROW-5160) - [C++] ABORT\_NOT\_OK evaluates expression twice
-* [ARROW-5166](https://issues.apache.org/jira/browse/ARROW-5166) - [Python][Parquet] Statistics for uint64 columns may overflow
-* [ARROW-5167](https://issues.apache.org/jira/browse/ARROW-5167) - [C++] Upgrade string-view-light to latest
-* [ARROW-5169](https://issues.apache.org/jira/browse/ARROW-5169) - [Python] non-nullable fields are converted to nullable in Table.from\_pandas
-* [ARROW-5173](https://issues.apache.org/jira/browse/ARROW-5173) - [Go] handle multiple concatenated streams back-to-back
-* [ARROW-5174](https://issues.apache.org/jira/browse/ARROW-5174) - [Go] implement Stringer for DataTypes
-* [ARROW-5177](https://issues.apache.org/jira/browse/ARROW-5177) - [Python] ParquetReader.read\_column() doesn't check bounds
-* [ARROW-5183](https://issues.apache.org/jira/browse/ARROW-5183) - [CI] MinGW build failures on AppVeyor
-* [ARROW-5184](https://issues.apache.org/jira/browse/ARROW-5184) - [Rust] Broken links and other documentation warnings
-* [ARROW-5186](https://issues.apache.org/jira/browse/ARROW-5186) - [Plasma] Crash on deleting CUDA memory
-* [ARROW-5194](https://issues.apache.org/jira/browse/ARROW-5194) - [C++][Plasma] TEST(PlasmaSerialization, GetReply) is failing
-* [ARROW-5195](https://issues.apache.org/jira/browse/ARROW-5195) - [Python] read\_csv ignores null\_values on string types
-* [ARROW-5201](https://issues.apache.org/jira/browse/ARROW-5201) - [Python] Import ABCs from collections is deprecated in Python 3.7
-* [ARROW-5208](https://issues.apache.org/jira/browse/ARROW-5208) - [Python] Inconsistent resulting type during casting in pa.array() when mask is present
-* [ARROW-5214](https://issues.apache.org/jira/browse/ARROW-5214) - [C++] Offline dependency downloader misses some libraries
-* [ARROW-5217](https://issues.apache.org/jira/browse/ARROW-5217) - [Rust] [CI] DataFusion test failure
-* [ARROW-5232](https://issues.apache.org/jira/browse/ARROW-5232) - [Java] value vector size increases rapidly in case of clear/setSafe loop
-* [ARROW-5233](https://issues.apache.org/jira/browse/ARROW-5233) - [Go] migrate to new flatbuffers-v1.11.0
-* [ARROW-5237](https://issues.apache.org/jira/browse/ARROW-5237) - [Python] pandas\_version key in pandas metadata no longer populated
-* [ARROW-5240](https://issues.apache.org/jira/browse/ARROW-5240) - [C++][CI] cmake\_format 0.5.0 appears to fail the build
-* [ARROW-5242](https://issues.apache.org/jira/browse/ARROW-5242) - [C++] Arrow doesn't compile cleanly with Visual Studio 2017 Update 9 or later due to narrowing
-* [ARROW-5243](https://issues.apache.org/jira/browse/ARROW-5243) - [Java][Gandiva] Add test for decimal compare functions
-* [ARROW-5245](https://issues.apache.org/jira/browse/ARROW-5245) - [C++][CI] Unpin cmake\_format
-* [ARROW-5246](https://issues.apache.org/jira/browse/ARROW-5246) - [Go] use Go-1.12 in CI
-* [ARROW-5249](https://issues.apache.org/jira/browse/ARROW-5249) - [Java] Flight client doesn't handle auth correctly in some cases
-* [ARROW-5253](https://issues.apache.org/jira/browse/ARROW-5253) - [C++] external Snappy fails on Alpine
-* [ARROW-5254](https://issues.apache.org/jira/browse/ARROW-5254) - [Flight][Java] DoAction does not support result streams
-* [ARROW-5255](https://issues.apache.org/jira/browse/ARROW-5255) - [Java] Implement user-defined data types API
-* [ARROW-5260](https://issues.apache.org/jira/browse/ARROW-5260) - [Python][C++] Crash when deserializing from components in a fresh new process
-* [ARROW-5274](https://issues.apache.org/jira/browse/ARROW-5274) - [JavaScript] Wrong array type for countBy
-* [ARROW-5283](https://issues.apache.org/jira/browse/ARROW-5283) - [C++][Plasma] Server crash when creating an aborted object 3 times
-* [ARROW-5285](https://issues.apache.org/jira/browse/ARROW-5285) - [C++][Plasma] GpuProcessHandle is not released when GPU object deleted
-* [ARROW-5293](https://issues.apache.org/jira/browse/ARROW-5293) - [C++] Take kernel on DictionaryArray does not preserve ordered flag
-* [ARROW-5294](https://issues.apache.org/jira/browse/ARROW-5294) - [CI] setuptools\_scm failures
-* [ARROW-5296](https://issues.apache.org/jira/browse/ARROW-5296) - [Java] Sporadic Flight test failures
-* [ARROW-5301](https://issues.apache.org/jira/browse/ARROW-5301) - [Python] parquet documentation outdated on nthreads argument
-* [ARROW-5304](https://issues.apache.org/jira/browse/ARROW-5304) - [C++] CudaDeviceManager::GetInstance is not thread-safe
-* [ARROW-5306](https://issues.apache.org/jira/browse/ARROW-5306) - [CI] [GLib] Disable GTK-Doc
-* [ARROW-5308](https://issues.apache.org/jira/browse/ARROW-5308) - [Go] remove deprecated Feather format
-* [ARROW-5314](https://issues.apache.org/jira/browse/ARROW-5314) - [Go] Incorrect Printing for String Arrays with Offsets
-* [ARROW-5318](https://issues.apache.org/jira/browse/ARROW-5318) - [Python] pyarrow hdfs reader overrequests
-* [ARROW-5325](https://issues.apache.org/jira/browse/ARROW-5325) - [Archery][Benchmark] Output properly formatted jsonlines from benchmark diff cli command
-* [ARROW-5330](https://issues.apache.org/jira/browse/ARROW-5330) - [Python] [CI] Run Python Flight tests on Travis-CI
-* [ARROW-5332](https://issues.apache.org/jira/browse/ARROW-5332) - [R] R package fails to build/install: error in dyn.load()
-* [ARROW-5348](https://issues.apache.org/jira/browse/ARROW-5348) - [CI] [Java] Gandiva checkstyle failure
-* [ARROW-5360](https://issues.apache.org/jira/browse/ARROW-5360) - [Rust] Builds are broken by rustyline on nightly 2019-05-16+
-* [ARROW-5362](https://issues.apache.org/jira/browse/ARROW-5362) - [C++] Compression round trip test can cause some sanitizers to fail
-* [ARROW-5371](https://issues.apache.org/jira/browse/ARROW-5371) - [Release] Add tests for dev/release/00-prepare.sh
-* [ARROW-5373](https://issues.apache.org/jira/browse/ARROW-5373) - [Java] Add missing details for Gandiva Java Build
-* [ARROW-5376](https://issues.apache.org/jira/browse/ARROW-5376) - [C++] Compile failure on gcc 5.4.0
-* [ARROW-5383](https://issues.apache.org/jira/browse/ARROW-5383) - [Go] update IPC flatbuf (new Duration type)
-* [ARROW-5387](https://issues.apache.org/jira/browse/ARROW-5387) - [Go] properly handle sub-slice of List
-* [ARROW-5388](https://issues.apache.org/jira/browse/ARROW-5388) - [Go] use arrow.TypeEqual in array.NewChunked
-* [ARROW-5390](https://issues.apache.org/jira/browse/ARROW-5390) - [CI] Job time limit exceeded on Travis
-* [ARROW-5397](https://issues.apache.org/jira/browse/ARROW-5397) - Test Flight TLS support
-* [ARROW-5398](https://issues.apache.org/jira/browse/ARROW-5398) - [Python] Flight tests broken by URI changes
-* [ARROW-5403](https://issues.apache.org/jira/browse/ARROW-5403) - [C++] Test failures not propagated in Windows shared builds
-* [ARROW-5411](https://issues.apache.org/jira/browse/ARROW-5411) - [C++][Python] Build error when building on macOS Mojave
-* [ARROW-5412](https://issues.apache.org/jira/browse/ARROW-5412) - [Java] Integration test fails with UnsupportedOperationException
-* [ARROW-5419](https://issues.apache.org/jira/browse/ARROW-5419) - [C++] CSV strings\_can\_be\_null option doesn't respect all null\_values
-* [ARROW-5421](https://issues.apache.org/jira/browse/ARROW-5421) - [Packaging][Crossbow] Duplicated key in nightly test configuration
-* [ARROW-5422](https://issues.apache.org/jira/browse/ARROW-5422) - [CI] [C++] Build failure with Google Benchmark
-* [ARROW-5430](https://issues.apache.org/jira/browse/ARROW-5430) - [Python] Can read but not write parquet partitioned on large ints
-* [ARROW-5435](https://issues.apache.org/jira/browse/ARROW-5435) - [Java] add test for IntervalYearVector\#getAsStringBuilder
-* [ARROW-5437](https://issues.apache.org/jira/browse/ARROW-5437) - [Python] Missing pandas pytest marker from parquet tests
-* [ARROW-5446](https://issues.apache.org/jira/browse/ARROW-5446) - [C++] Use cmake header install directory instead of include
-* [ARROW-5448](https://issues.apache.org/jira/browse/ARROW-5448) - [CI] MinGW build failures on AppVeyor
-* [ARROW-5453](https://issues.apache.org/jira/browse/ARROW-5453) - [C++] Just-released cmake-format 0.5.2 breaks the build
-* [ARROW-5455](https://issues.apache.org/jira/browse/ARROW-5455) - [Rust] Build broken by 2019-05-30 Rust nightly
-* [ARROW-5456](https://issues.apache.org/jira/browse/ARROW-5456) - [GLib][Plasma] Installed plasma-glib may be used on building document
-* [ARROW-5457](https://issues.apache.org/jira/browse/ARROW-5457) - [GLib][Plasma] Environment variable name for test is wrong
-* [ARROW-5459](https://issues.apache.org/jira/browse/ARROW-5459) - [Go] implement Stringer for Float16 DataType
-* [ARROW-5462](https://issues.apache.org/jira/browse/ARROW-5462) - [Go] support writing zero-length List
-* [ARROW-5479](https://issues.apache.org/jira/browse/ARROW-5479) - [Rust] [DataFusion] Use ARROW\_TEST\_DATA instead of relative path for testing
-* [ARROW-5487](https://issues.apache.org/jira/browse/ARROW-5487) - [CI] [Python] Failure in docs build
-* [ARROW-5493](https://issues.apache.org/jira/browse/ARROW-5493) - [Integration/Go] add Go support for IPC integration tests
-* [ARROW-5507](https://issues.apache.org/jira/browse/ARROW-5507) - [Plasma] [CUDA] Compile error
-* [ARROW-5514](https://issues.apache.org/jira/browse/ARROW-5514) - [C++] Printer for uint64 shows wrong values
-* [ARROW-5517](https://issues.apache.org/jira/browse/ARROW-5517) - [C++] Header collection CMake logic should only consider filename without directory included
-* [ARROW-5520](https://issues.apache.org/jira/browse/ARROW-5520) - [C++][Packaging] No NVidia CUDA toolkit on AArch64
-* [ARROW-5521](https://issues.apache.org/jira/browse/ARROW-5521) - [Packaging] License check fails with Apache RAT 0.13
-* [ARROW-5528](https://issues.apache.org/jira/browse/ARROW-5528) - Concatenate() crashes when concatenating empty binary arrays.
-* [ARROW-5532](https://issues.apache.org/jira/browse/ARROW-5532) - [JS] Field Metadata Not Read
-* [ARROW-5551](https://issues.apache.org/jira/browse/ARROW-5551) - [Go] invalid FixedSizeArray representation
-* [ARROW-5553](https://issues.apache.org/jira/browse/ARROW-5553) - [Ruby] red-arrow gem does not compile on ruby:2.5 docker image
-* [ARROW-5576](https://issues.apache.org/jira/browse/ARROW-5576) - [C++] Flaky thrift\_ep tarball downloads
-* [ARROW-5577](https://issues.apache.org/jira/browse/ARROW-5577) - [C++] Link failure due to googletest shared library on Alpine Linux
-* [ARROW-5583](https://issues.apache.org/jira/browse/ARROW-5583) - [Java] When the isSet of a NullableValueHolder is 0, the buffer field should not be used
-* [ARROW-5584](https://issues.apache.org/jira/browse/ARROW-5584) - [Java] Add import for link reference in FieldReader javadoc
-* [ARROW-5589](https://issues.apache.org/jira/browse/ARROW-5589) - [C++][Fuzzing] arrow-ipc-fuzzing-test crash 2354085db0125113f04f7bd23f54b85cca104713
-* [ARROW-5592](https://issues.apache.org/jira/browse/ARROW-5592) - [Go] implement Duration array
-* [ARROW-5596](https://issues.apache.org/jira/browse/ARROW-5596) - [Python] Flight tests failing on Python 2.7
-* [ARROW-5601](https://issues.apache.org/jira/browse/ARROW-5601) - [Gandiva] Error when building a projector with a string field
-* [ARROW-5603](https://issues.apache.org/jira/browse/ARROW-5603) - [Python] register pytest markers to avoid warnings
-* [ARROW-5605](https://issues.apache.org/jira/browse/ARROW-5605) - [C++][Fuzzing] arrow-ipc-fuzzing-test crash 74aec871d14bb6b07c72ea8f0e8c9f72cbe6b73c
-* [ARROW-5606](https://issues.apache.org/jira/browse/ARROW-5606) - [Python] pandas.RangeIndex.\_start/\_stop/\_step are deprecated
-* [ARROW-5608](https://issues.apache.org/jira/browse/ARROW-5608) - [C++][parquet] Invalid memory access when using parquet::arrow::ColumnReader
-* [ARROW-5615](https://issues.apache.org/jira/browse/ARROW-5615) - [C++] Compilation error due to C++11 string literals on gcc 5.4.0 Ubuntu 16.04
-* [ARROW-5616](https://issues.apache.org/jira/browse/ARROW-5616) - [Python] C++ build failure against Python 2.7 headers
-* [ARROW-5617](https://issues.apache.org/jira/browse/ARROW-5617) - [C++] thrift\_ep 0.12.0 fails to build when using ARROW\_BOOST\_VENDORED=ON
-* [ARROW-5619](https://issues.apache.org/jira/browse/ARROW-5619) - [C++] get\_apache\_mirror.py doesn't work with Python 3.5
-* [ARROW-5623](https://issues.apache.org/jira/browse/ARROW-5623) - [CI][GLib] Failed on macOS
-* [ARROW-5624](https://issues.apache.org/jira/browse/ARROW-5624) - [C++] -Duriparser\_SOURCE=BUNDLED is broken
-* [ARROW-5626](https://issues.apache.org/jira/browse/ARROW-5626) - [C++][Gandiva] Expression cache should consider precision and scale too
-* [ARROW-5629](https://issues.apache.org/jira/browse/ARROW-5629) - [C++] Fix Coverity issues
-* [ARROW-5631](https://issues.apache.org/jira/browse/ARROW-5631) - [C++] CMake 3.2 build is broken
-* [ARROW-5644](https://issues.apache.org/jira/browse/ARROW-5644) - [Python] test\_flight.py::test\_tls\_do\_get appears to hang
-* [ARROW-5647](https://issues.apache.org/jira/browse/ARROW-5647) - [Python] Accessing a file from Databricks with pandas read\_parquet using the pyarrow engine fails with: Passed non-file path: /mnt/aa/example.parquet
-* [ARROW-5648](https://issues.apache.org/jira/browse/ARROW-5648) - [C++] Build fails on mingw without codecvt
-* [ARROW-5654](https://issues.apache.org/jira/browse/ARROW-5654) - [C++] ChunkedArray should validate the types of the arrays
-* [ARROW-5657](https://issues.apache.org/jira/browse/ARROW-5657) - [C++] "docker-compose run cpp" broken in master
-* [ARROW-5674](https://issues.apache.org/jira/browse/ARROW-5674) - [Python] Missing pandas pytest markers from test\_parquet.py
-* [ARROW-5675](https://issues.apache.org/jira/browse/ARROW-5675) - [Doc] Fix typo in documentation describing compile/debug workflow on macOS with Xcode IDE
-* [ARROW-5678](https://issues.apache.org/jira/browse/ARROW-5678) - [R][Lint] Fix hadolint docker linting error
-* [ARROW-5693](https://issues.apache.org/jira/browse/ARROW-5693) - [Go] skip IPC integration test for Decimal128
-* [ARROW-5697](https://issues.apache.org/jira/browse/ARROW-5697) - [GLib] c\_glib/Dockerfile is broken
-* [ARROW-5698](https://issues.apache.org/jira/browse/ARROW-5698) - [R] r/Dockerfile docker-compose build is broken
-* [ARROW-5709](https://issues.apache.org/jira/browse/ARROW-5709) - [C++] gandiva-date\_time\_test failure on Windows
-* [ARROW-5714](https://issues.apache.org/jira/browse/ARROW-5714) - [JS] Inconsistent behavior in Int64Builder with/without BigNum
-* [ARROW-5723](https://issues.apache.org/jira/browse/ARROW-5723) - [Gandiva][Crossbow] Builds failing
-* [ARROW-5728](https://issues.apache.org/jira/browse/ARROW-5728) - [Python] [CI] Travis-CI failures in test\_jvm.py
-* [ARROW-5729](https://issues.apache.org/jira/browse/ARROW-5729) - [Python][Java] ArrowType.Int object has no attribute 'isSigned'
-* [ARROW-5730](https://issues.apache.org/jira/browse/ARROW-5730) - [Python][CI] Selectively skip test cases in the dask integration test
-* [ARROW-5732](https://issues.apache.org/jira/browse/ARROW-5732) - [C++] macOS builds failing idiosyncratically on master with warnings from pmmintrin.h
-* [ARROW-5735](https://issues.apache.org/jira/browse/ARROW-5735) - [C++] Appveyor builds failing persistently in thrift\_ep build
-* [ARROW-5737](https://issues.apache.org/jira/browse/ARROW-5737) - [C++][Gandiva] Gandiva not building in manylinux
-* [ARROW-5738](https://issues.apache.org/jira/browse/ARROW-5738) - [Crossbow][Conda] OSX package builds are failing with missing intrinsics
-* [ARROW-5739](https://issues.apache.org/jira/browse/ARROW-5739) - [CI] Fix docker python build
-* [ARROW-5750](https://issues.apache.org/jira/browse/ARROW-5750) - [Java] Java compilation failures on master
-* [ARROW-5754](https://issues.apache.org/jira/browse/ARROW-5754) - [C++] Missing override for \~GrpcStreamWriter?
-* [ARROW-5765](https://issues.apache.org/jira/browse/ARROW-5765) - [C++] TestDictionary.Validate test crashes with release build
-* [ARROW-5769](https://issues.apache.org/jira/browse/ARROW-5769) - [Java] org.apache.arrow.flight.TestTls fails via dev/release/00-prepare.sh
-* [ARROW-5770](https://issues.apache.org/jira/browse/ARROW-5770) - [C++] Fix -Wpessimizing-move in result.h
-* [ARROW-5771](https://issues.apache.org/jira/browse/ARROW-5771) - [Python] Docker python-nopandas job fails
-* [ARROW-5774](https://issues.apache.org/jira/browse/ARROW-5774) - [Java][Documentation] Document the need to checkout git submodules for flight
-* [ARROW-5781](https://issues.apache.org/jira/browse/ARROW-5781) - [Archery] Ensure benchmark clone accepts remotes in revision
-* [ARROW-5791](https://issues.apache.org/jira/browse/ARROW-5791) - [Python] pyarrow.csv.read\_csv hangs and consumes all RAM
-* [ARROW-5816](https://issues.apache.org/jira/browse/ARROW-5816) - [Release] Parallel curl does not work reliably in verify-release-candidate.sh
-* [ARROW-5922](https://issues.apache.org/jira/browse/ARROW-5922) - [Python] Unable to connect to HDFS from a worker/data node on a Kerberized cluster using pyarrow's hdfs API
-* [PARQUET-1402](https://issues.apache.org/jira/browse/PARQUET-1402) - [C++] incorrect calculation of column start offset for files created by parquet-mr 1.8.1
-* [PARQUET-1405](https://issues.apache.org/jira/browse/PARQUET-1405) - [C++] 'Couldn't deserialize thrift' error when reading large binary column
-* [PARQUET-1565](https://issues.apache.org/jira/browse/PARQUET-1565) - [C++] SEGV in FromParquetSchema with corrupt file from PARQUET-1481
-* [PARQUET-1571](https://issues.apache.org/jira/browse/PARQUET-1571) - [C++] Can't read data from parquet file in C++ library
-* [PARQUET-1574](https://issues.apache.org/jira/browse/PARQUET-1574) - [C++] parquet-encoding-test failed with msvc
-* [PARQUET-1581](https://issues.apache.org/jira/browse/PARQUET-1581) - [C++] Fix undefined behavior in encoding.cc when num\_dictionary\_values is 0.
-
-
-
-# Apache Arrow 0.13.0 (2019-04-01)
-
-## Bug Fixes
-
-* [ARROW-295](https://issues.apache.org/jira/browse/ARROW-295) - Create DOAP File
-* [ARROW-1171](https://issues.apache.org/jira/browse/ARROW-1171) - [C++] Segmentation faults on Fedora 24 with pyarrow-manylinux1 and self-compiled turbodbc
-* [ARROW-2392](https://issues.apache.org/jira/browse/ARROW-2392) - [Python] pyarrow RecordBatchStreamWriter allows writing batches with different schemas
-* [ARROW-2399](https://issues.apache.org/jira/browse/ARROW-2399) - [Rust] Builder<T\> should not provide a set() method
-* [ARROW-2598](https://issues.apache.org/jira/browse/ARROW-2598) - [Python] table.to\_pandas segfault
-* [ARROW-3086](https://issues.apache.org/jira/browse/ARROW-3086) - [GLib] GISCAN fails due to conda-shipped openblas
-* [ARROW-3096](https://issues.apache.org/jira/browse/ARROW-3096) - [Python] Update Python source build instructions given Anaconda/conda-forge toolchain migration
-* [ARROW-3133](https://issues.apache.org/jira/browse/ARROW-3133) - [C++] Logical boolean kernels in kernels/boolean.cc cannot write into preallocated memory
-* [ARROW-3208](https://issues.apache.org/jira/browse/ARROW-3208) - [C++] Segmentation fault when casting dictionary to numeric with nullptr valid\_bitmap
-* [ARROW-3426](https://issues.apache.org/jira/browse/ARROW-3426) - [CI] Java integration test very verbose
-* [ARROW-3564](https://issues.apache.org/jira/browse/ARROW-3564) - [Python] writing version 2.0 parquet format with dictionary encoding enabled
-* [ARROW-3578](https://issues.apache.org/jira/browse/ARROW-3578) - [Release] Address spurious Apache RAT failures in source release script
-* [ARROW-3593](https://issues.apache.org/jira/browse/ARROW-3593) - [R] CI builds failing due to GitHub API rate limits
-* [ARROW-3606](https://issues.apache.org/jira/browse/ARROW-3606) - [Python] flake8 fails on Crossbow
-* [ARROW-3669](https://issues.apache.org/jira/browse/ARROW-3669) - [Python] Convert big-endian numbers or raise error in pyarrow.array
-* [ARROW-3843](https://issues.apache.org/jira/browse/ARROW-3843) - [Python] Writing Parquet file from empty table created with Table.from\_pandas(..., preserve\_index=False) fails
-* [ARROW-3923](https://issues.apache.org/jira/browse/ARROW-3923) - [Java] JDBC-to-Arrow Conversion: Unnecessary Calendar Requirement
-* [ARROW-4007](https://issues.apache.org/jira/browse/ARROW-4007) - [Java][Plasma] Plasma JNI tests failing
-* [ARROW-4050](https://issues.apache.org/jira/browse/ARROW-4050) - [Python][Parquet] core dump on reading parquet file
-* [ARROW-4081](https://issues.apache.org/jira/browse/ARROW-4081) - [Go] Sum methods on Mac OS X panic when the array is empty
-* [ARROW-4104](https://issues.apache.org/jira/browse/ARROW-4104) - [Java] race in AllocationManager during release
-* [ARROW-4108](https://issues.apache.org/jira/browse/ARROW-4108) - [Python/Java] Spark integration tests do not work
-* [ARROW-4117](https://issues.apache.org/jira/browse/ARROW-4117) - [Python] "asv dev" command fails with latest revision
-* [ARROW-4140](https://issues.apache.org/jira/browse/ARROW-4140) - [C++][Gandiva] Compiled LLVM bitcode file path may result in libraries being non-relocatable
-* [ARROW-4145](https://issues.apache.org/jira/browse/ARROW-4145) - [C++] Find Windows-compatible strptime implementation
-* [ARROW-4181](https://issues.apache.org/jira/browse/ARROW-4181) - [Python] TestConvertStructTypes.test\_from\_numpy\_large failing
-* [ARROW-4192](https://issues.apache.org/jira/browse/ARROW-4192) - "./dev/run\_docker\_compose.sh" is out of date
-* [ARROW-4213](https://issues.apache.org/jira/browse/ARROW-4213) - [Flight] C++ and Java implementations are incompatible
-* [ARROW-4244](https://issues.apache.org/jira/browse/ARROW-4244) - Clarify language around padding/alignment
-* [ARROW-4250](https://issues.apache.org/jira/browse/ARROW-4250) - [C++][Gandiva] Use approximate comparisons for floating point numbers in gandiva-projector-test
-* [ARROW-4252](https://issues.apache.org/jira/browse/ARROW-4252) - [C++] Status error context strings missing lines of code
-* [ARROW-4253](https://issues.apache.org/jira/browse/ARROW-4253) - [GLib] Cannot use non-system Boost specified with $BOOST\_ROOT
-* [ARROW-4254](https://issues.apache.org/jira/browse/ARROW-4254) - [C++] Gandiva tests fail to compile with Boost in Ubuntu 14.04 apt
-* [ARROW-4255](https://issues.apache.org/jira/browse/ARROW-4255) - [C++] Schema::GetFieldIndex is not thread-safe
-* [ARROW-4261](https://issues.apache.org/jira/browse/ARROW-4261) - [C++] CMake paths for IPC, Flight, Thrift, and Plasma don't support using Arrow as a subproject
-* [ARROW-4264](https://issues.apache.org/jira/browse/ARROW-4264) - [C++] Document why DCHECKs are used in kernels
-* [ARROW-4267](https://issues.apache.org/jira/browse/ARROW-4267) - [Python/C++][Parquet] Segfault when reading rowgroups with duplicated columns
-* [ARROW-4274](https://issues.apache.org/jira/browse/ARROW-4274) - [Gandiva] static jni library broken after decimal changes
-* [ARROW-4275](https://issues.apache.org/jira/browse/ARROW-4275) - [C++] gandiva-decimal\_single\_test extremely slow
-* [ARROW-4280](https://issues.apache.org/jira/browse/ARROW-4280) - [C++][Documentation] It looks like flex and bison are required for parquet
-* [ARROW-4282](https://issues.apache.org/jira/browse/ARROW-4282) - [Rust] builder benchmark is broken
-* [ARROW-4284](https://issues.apache.org/jira/browse/ARROW-4284) - [C\#] File / Stream serialization fails due to type mismatch / missing footer
-* [ARROW-4295](https://issues.apache.org/jira/browse/ARROW-4295) - [Plasma] Incorrect log message when evicting objects
-* [ARROW-4296](https://issues.apache.org/jira/browse/ARROW-4296) - [Plasma] Starting Plasma store with use\_one\_memory\_mapped\_file enabled crashes due to improper memory alignment
-* [ARROW-4308](https://issues.apache.org/jira/browse/ARROW-4308) - [Python] pyarrow has a hard dependency on pandas
-* [ARROW-4311](https://issues.apache.org/jira/browse/ARROW-4311) - [Python] Regression on pq.ParquetWriter incorrectly handling source string
-* [ARROW-4312](https://issues.apache.org/jira/browse/ARROW-4312) - [C++] Lint doesn't work anymore ("[Errno 24] Too many open files")
-* [ARROW-4319](https://issues.apache.org/jira/browse/ARROW-4319) - plasma/store.h pulls in flatbuffer dependency
-* [ARROW-4320](https://issues.apache.org/jira/browse/ARROW-4320) - [C++] Add tests for non-contiguous tensors
-* [ARROW-4322](https://issues.apache.org/jira/browse/ARROW-4322) - [CI] docker nightlies fails after conda-forge compiler migration
-* [ARROW-4323](https://issues.apache.org/jira/browse/ARROW-4323) - [Packaging] Fix failing OSX clang conda forge builds
-* [ARROW-4326](https://issues.apache.org/jira/browse/ARROW-4326) - [C++] Development instructions in python/development.rst will not work for many Linux distros with new conda-forge toolchain
-* [ARROW-4327](https://issues.apache.org/jira/browse/ARROW-4327) - [Python] Add requirements-build.txt file to simplify setting up Python build environment
-* [ARROW-4328](https://issues.apache.org/jira/browse/ARROW-4328) - Make R build compatible with -DARROW\_TENSORFLOW=ON
-* [ARROW-4329](https://issues.apache.org/jira/browse/ARROW-4329) - Python should include the parquet headers
-* [ARROW-4342](https://issues.apache.org/jira/browse/ARROW-4342) - [Gandiva][Java] spurious failures in projector cache test
-* [ARROW-4347](https://issues.apache.org/jira/browse/ARROW-4347) - [Python] Run Python Travis CI unit tests on Linux when Java codebase changed
-* [ARROW-4349](https://issues.apache.org/jira/browse/ARROW-4349) - [C++] Build all benchmarks on Windows without failing
-* [ARROW-4351](https://issues.apache.org/jira/browse/ARROW-4351) - [C++] Fail to build with static parquet
-* [ARROW-4355](https://issues.apache.org/jira/browse/ARROW-4355) - [C++] test-util functions are no longer part of libarrow
-* [ARROW-4360](https://issues.apache.org/jira/browse/ARROW-4360) - [C++] Query homebrew for Thrift
-* [ARROW-4364](https://issues.apache.org/jira/browse/ARROW-4364) - [C++] Fix -weverything -wextra compilation errors
-* [ARROW-4366](https://issues.apache.org/jira/browse/ARROW-4366) - [Docs] Change extension from format/README.md to format/README.rst
-* [ARROW-4367](https://issues.apache.org/jira/browse/ARROW-4367) - [C++] StringDictionaryBuilder segfaults on Finish with only null entries
-* [ARROW-4368](https://issues.apache.org/jira/browse/ARROW-4368) - Bintray repository signature verification fails
-* [ARROW-4370](https://issues.apache.org/jira/browse/ARROW-4370) - [Python] Table to pandas conversion fails for list of bool
-* [ARROW-4374](https://issues.apache.org/jira/browse/ARROW-4374) - [C++] DictionaryBuilder does not correctly report length and null\_count
-* [ARROW-4381](https://issues.apache.org/jira/browse/ARROW-4381) - [Docker] docker-compose build lint fails
-* [ARROW-4382](https://issues.apache.org/jira/browse/ARROW-4382) - [C++] Improve new cpplint output readability
-* [ARROW-4384](https://issues.apache.org/jira/browse/ARROW-4384) - [C++] Running "format" target on new Windows 10 install opens "how do you want to open this file" dialog
-* [ARROW-4385](https://issues.apache.org/jira/browse/ARROW-4385) - [Python] default\_version of a release should not include SNAPSHOT
-* [ARROW-4389](https://issues.apache.org/jira/browse/ARROW-4389) - [R] Installing clang-tools in CI is failing on trusty
-* [ARROW-4395](https://issues.apache.org/jira/browse/ARROW-4395) - ts-node throws type error running \`bin/arrow2csv.js\`
-* [ARROW-4400](https://issues.apache.org/jira/browse/ARROW-4400) - [CI] install of clang tools failing
-* [ARROW-4403](https://issues.apache.org/jira/browse/ARROW-4403) - [Rust] CI fails due to formatting errors
-* [ARROW-4404](https://issues.apache.org/jira/browse/ARROW-4404) - [CI] AppVeyor toolchain build does not build anything
-* [ARROW-4407](https://issues.apache.org/jira/browse/ARROW-4407) - [C++] ExternalProject\_Add does not capture CC/CXX correctly
-* [ARROW-4410](https://issues.apache.org/jira/browse/ARROW-4410) - [C++] Fix InvertKernel edge cases
-* [ARROW-4413](https://issues.apache.org/jira/browse/ARROW-4413) - [Python] pyarrow.hdfs.connect() failing
-* [ARROW-4414](https://issues.apache.org/jira/browse/ARROW-4414) - [C++] Stop using cmake COMMAND\_EXPAND\_LISTS because it breaks package builds for older distros
-* [ARROW-4417](https://issues.apache.org/jira/browse/ARROW-4417) - [C++] Doc build broken
-* [ARROW-4420](https://issues.apache.org/jira/browse/ARROW-4420) - [INTEGRATION] Make spark integration test pass and test against spark's master branch
-* [ARROW-4421](https://issues.apache.org/jira/browse/ARROW-4421) - [Flight][C++] Handle large Flight data messages
-* [ARROW-4434](https://issues.apache.org/jira/browse/ARROW-4434) - [Python] Cannot create empty StructArray via pa.StructArray.from\_arrays
-* [ARROW-4440](https://issues.apache.org/jira/browse/ARROW-4440) - [C++] Fix flatbuffers build using msvc
-* [ARROW-4457](https://issues.apache.org/jira/browse/ARROW-4457) - [Python] Cannot create Decimal128 array using integers
-* [ARROW-4469](https://issues.apache.org/jira/browse/ARROW-4469) - [Python][C++] CI Failing for Python 2.7 and 3.6 with valgrind
-* [ARROW-4471](https://issues.apache.org/jira/browse/ARROW-4471) - [C++] Pass AR and RANLIB to all external projects
-* [ARROW-4474](https://issues.apache.org/jira/browse/ARROW-4474) - [Flight] FlightInfo should use signed integer types for payload size
-* [ARROW-4480](https://issues.apache.org/jira/browse/ARROW-4480) - [Python] Drive letter removed when writing parquet file
-* [ARROW-4487](https://issues.apache.org/jira/browse/ARROW-4487) - [C++] Appveyor toolchain build does not actually build the project
-* [ARROW-4494](https://issues.apache.org/jira/browse/ARROW-4494) - [Java] arrow-jdbc JAR is not uploaded on release
-* [ARROW-4496](https://issues.apache.org/jira/browse/ARROW-4496) - [CI] CI failing for python Xcode 7.3
-* [ARROW-4498](https://issues.apache.org/jira/browse/ARROW-4498) - [Plasma] Plasma fails building with CUDA enabled
-* [ARROW-4500](https://issues.apache.org/jira/browse/ARROW-4500) - [C++] librt and pthread hacks can cause linking problems
-* [ARROW-4501](https://issues.apache.org/jira/browse/ARROW-4501) - [C++] Unique returns non-unique strings
-* [ARROW-4525](https://issues.apache.org/jira/browse/ARROW-4525) - [Rust] [Parquet] Convert ArrowError to ParquetError
-* [ARROW-4527](https://issues.apache.org/jira/browse/ARROW-4527) - [Packaging] Update linux packaging tasks to align with the LLVM 7 migration
-* [ARROW-4532](https://issues.apache.org/jira/browse/ARROW-4532) - [Java] varchar value buffer much larger than expected
-* [ARROW-4533](https://issues.apache.org/jira/browse/ARROW-4533) - [Python] Document how to run hypothesis tests
-* [ARROW-4535](https://issues.apache.org/jira/browse/ARROW-4535) - [C++] Fix MakeBuilder to preserve ListType's field name
-* [ARROW-4536](https://issues.apache.org/jira/browse/ARROW-4536) - Add data\_type argument in garrow\_list\_array\_new
-* [ARROW-4538](https://issues.apache.org/jira/browse/ARROW-4538) - [Python] Remove index column from subschema in write\_to\_dataframe
-* [ARROW-4549](https://issues.apache.org/jira/browse/ARROW-4549) - [C++] Can't build benchmark code on CUDA-enabled build
-* [ARROW-4550](https://issues.apache.org/jira/browse/ARROW-4550) - [JS] Fix AMD pattern
-* [ARROW-4559](https://issues.apache.org/jira/browse/ARROW-4559) - [Python] pyarrow can't read/write filenames with special characters
-* [ARROW-4563](https://issues.apache.org/jira/browse/ARROW-4563) - [Python] pa.decimal128 should validate inputs
-* [ARROW-4571](https://issues.apache.org/jira/browse/ARROW-4571) - [Format] Tensor.fbs file has multiple root\_type declarations
-* [ARROW-4573](https://issues.apache.org/jira/browse/ARROW-4573) - [Python] Add Flight unit tests
-* [ARROW-4576](https://issues.apache.org/jira/browse/ARROW-4576) - [Python] Benchmark failures
-* [ARROW-4577](https://issues.apache.org/jira/browse/ARROW-4577) - [C++] Interface link libraries declared on arrow\_shared target that are actually non-interface
-* [ARROW-4581](https://issues.apache.org/jira/browse/ARROW-4581) - [C++] gbenchmark\_ep is a dependency of unit tests when ARROW\_BUILD\_BENCHMARKS=ON
-* [ARROW-4582](https://issues.apache.org/jira/browse/ARROW-4582) - [C++/Python] Memory corruption on Pandas-\>Arrow conversion
-* [ARROW-4584](https://issues.apache.org/jira/browse/ARROW-4584) - [Python] Add built wheel to manylinux1 dockerignore
-* [ARROW-4585](https://issues.apache.org/jira/browse/ARROW-4585) - [C++] Dependency of Flight C++ sources on generated protobuf is not respected
-* [ARROW-4587](https://issues.apache.org/jira/browse/ARROW-4587) - Flight C++ DoPut segfaults
-* [ARROW-4597](https://issues.apache.org/jira/browse/ARROW-4597) - [C++] Targets for system Google Mock shared library are missing
-* [ARROW-4601](https://issues.apache.org/jira/browse/ARROW-4601) - [Python] Master build is broken due to missing licence for .dockerignore
-* [ARROW-4606](https://issues.apache.org/jira/browse/ARROW-4606) - [Rust] [DataFusion] FilterRelation created RecordBatch with empty schema
-* [ARROW-4608](https://issues.apache.org/jira/browse/ARROW-4608) - [C++] cmake script assumes that double-conversion installs static libs
-* [ARROW-4617](https://issues.apache.org/jira/browse/ARROW-4617) - [C++] Support double-conversion < 3.1
-* [ARROW-4624](https://issues.apache.org/jira/browse/ARROW-4624) - [C++] Linker errors when building benchmarks
-* [ARROW-4629](https://issues.apache.org/jira/browse/ARROW-4629) - [Python] Pandas to arrow conversion slowed down by local imports
-* [ARROW-4635](https://issues.apache.org/jira/browse/ARROW-4635) - [Java] StructVector growing validity buffer unnecessarily
-* [ARROW-4639](https://issues.apache.org/jira/browse/ARROW-4639) - [CI] Crossbow build failing for Gandiva jars
-* [ARROW-4641](https://issues.apache.org/jira/browse/ARROW-4641) - [C++] Flight builds complain of -Wstrict-aliasing
-* [ARROW-4642](https://issues.apache.org/jira/browse/ARROW-4642) - [R] Change \`f\` to \`file\` in \`read\_parquet\_file()\`
-* [ARROW-4653](https://issues.apache.org/jira/browse/ARROW-4653) - [C++] decimal multiply broken when both args are negative
-* [ARROW-4654](https://issues.apache.org/jira/browse/ARROW-4654) - [C++] Implicit Flight target dependencies cause compilation failure
-* [ARROW-4657](https://issues.apache.org/jira/browse/ARROW-4657) - [Release] gbenchmark should not be needed for verification
-* [ARROW-4658](https://issues.apache.org/jira/browse/ARROW-4658) - [C++] Shared gflags is also a run-time conda requirement
-* [ARROW-4659](https://issues.apache.org/jira/browse/ARROW-4659) - [CI] ubuntu/debian nightlies fail because of missing gandiva files
-* [ARROW-4660](https://issues.apache.org/jira/browse/ARROW-4660) - [C++] gflags fails to build due to CMake error
-* [ARROW-4664](https://issues.apache.org/jira/browse/ARROW-4664) - [C++] DCHECK macro conditions are evaluated in release builds
-* [ARROW-4669](https://issues.apache.org/jira/browse/ARROW-4669) - [Java] No bounds checking on ArrowBuf.slice
-* [ARROW-4672](https://issues.apache.org/jira/browse/ARROW-4672) - [C++] clang-7 matrix entry is built using gcc
-* [ARROW-4680](https://issues.apache.org/jira/browse/ARROW-4680) - [CI] [Rust] Travis CI builds fail with latest Rust 1.34.0-nightly (2019-02-25)
-* [ARROW-4684](https://issues.apache.org/jira/browse/ARROW-4684) - [Python] CI failures in test\_cython.py
-* [ARROW-4687](https://issues.apache.org/jira/browse/ARROW-4687) - [Python] FlightServerBase.run should exit on Ctrl-C
-* [ARROW-4688](https://issues.apache.org/jira/browse/ARROW-4688) - [C++][Parquet] 16MB limit on (nested) column chunk prevents tuning row\_group\_size
-* [ARROW-4696](https://issues.apache.org/jira/browse/ARROW-4696) - Verify release script is over-optimistic with CUDA detection
-* [ARROW-4699](https://issues.apache.org/jira/browse/ARROW-4699) - [C++] json parser should not rely on null terminated buffers
-* [ARROW-4704](https://issues.apache.org/jira/browse/ARROW-4704) - [CI][GLib] Plasma test is flaky
-* [ARROW-4710](https://issues.apache.org/jira/browse/ARROW-4710) - [C++][R] New linting script skips files with "cpp" extension
-* [ARROW-4712](https://issues.apache.org/jira/browse/ARROW-4712) - [C++][CI] Clang7 Valgrind complains when not moving shared\_ptr
-* [ARROW-4721](https://issues.apache.org/jira/browse/ARROW-4721) - [Rust] [DataFusion] Propagate schema in filter
-* [ARROW-4724](https://issues.apache.org/jira/browse/ARROW-4724) - [C++] Python not being built nor tested under MinGW builds
-* [ARROW-4728](https://issues.apache.org/jira/browse/ARROW-4728) - [JS] Failing test: Table\#assign with a zero-length Null column round-trips through serialization
-* [ARROW-4737](https://issues.apache.org/jira/browse/ARROW-4737) - [C\#] tests are not running in CI
-* [ARROW-4744](https://issues.apache.org/jira/browse/ARROW-4744) - [CI][C++] Mingw32 builds failing
-* [ARROW-4750](https://issues.apache.org/jira/browse/ARROW-4750) - [C++] RapidJSON triggers Wclass-memaccess on GCC 8+
-* [ARROW-4760](https://issues.apache.org/jira/browse/ARROW-4760) - [C++] protobuf 3.7 defines EXPECT\_OK that clashes with Arrow's macro
-* [ARROW-4766](https://issues.apache.org/jira/browse/ARROW-4766) - [C++] Casting empty boolean array causes segfault
-* [ARROW-4767](https://issues.apache.org/jira/browse/ARROW-4767) - [C\#] ArrowStreamReader crashes while reading the end of a stream
-* [ARROW-4768](https://issues.apache.org/jira/browse/ARROW-4768) - [C++][CI] arrow-test-array sometimes gets stuck in MinGW build
-* [ARROW-4774](https://issues.apache.org/jira/browse/ARROW-4774) - [C++][Parquet] Call Table::Validate when writing a table
-* [ARROW-4775](https://issues.apache.org/jira/browse/ARROW-4775) - [Website] Site navbar cannot be expanded
-* [ARROW-4783](https://issues.apache.org/jira/browse/ARROW-4783) - [C++][CI] Mingw32 builds sometimes timeout
-* [ARROW-4793](https://issues.apache.org/jira/browse/ARROW-4793) - [Ruby] Suppress unused variable warning
-* [ARROW-4796](https://issues.apache.org/jira/browse/ARROW-4796) - [Flight][Python] segfault in simple server implementation
-* [ARROW-4802](https://issues.apache.org/jira/browse/ARROW-4802) - [Python] Hadoop classpath discovery broken when HADOOP\_HOME is a symlink
-* [ARROW-4807](https://issues.apache.org/jira/browse/ARROW-4807) - [Rust] Fix csv\_writer benchmark
-* [ARROW-4811](https://issues.apache.org/jira/browse/ARROW-4811) - [C++] An incorrect dependency leads "ninja" to re-evaluate steps unnecessarily on subsequent calls
-* [ARROW-4813](https://issues.apache.org/jira/browse/ARROW-4813) - [Ruby] Add tests for \#== and \#!=
-* [ARROW-4820](https://issues.apache.org/jira/browse/ARROW-4820) - [Python] derived Hadoop classpath is not correct
-* [ARROW-4822](https://issues.apache.org/jira/browse/ARROW-4822) - [C++/Python] pyarrow.Table.equals segmentation fault on None
-* [ARROW-4828](https://issues.apache.org/jira/browse/ARROW-4828) - [Python] manylinux1 docker-compose context should be python/manylinux1
-* [ARROW-4850](https://issues.apache.org/jira/browse/ARROW-4850) - [CI] Integration test failures do not fail the Travis CI build
-* [ARROW-4853](https://issues.apache.org/jira/browse/ARROW-4853) - [Rust] Array slice doesn't work on ListArray and StructArray
-* [ARROW-4857](https://issues.apache.org/jira/browse/ARROW-4857) - [C++/Python/CI] docker-compose in manylinux1 crossbow jobs too old
-* [ARROW-4866](https://issues.apache.org/jira/browse/ARROW-4866) - [C++] zstd ExternalProject failing on Windows
-* [ARROW-4867](https://issues.apache.org/jira/browse/ARROW-4867) - [Python] Table.from\_pandas() column order not respected
-* [ARROW-4869](https://issues.apache.org/jira/browse/ARROW-4869) - [C++] Use of gmock fails in compute/kernels/util-internal-test.cc
-* [ARROW-4870](https://issues.apache.org/jira/browse/ARROW-4870) - [Ruby] gemspec has wrong msys2 dependency listed
-* [ARROW-4871](https://issues.apache.org/jira/browse/ARROW-4871) - [Flight][Java] Handle large Flight messages
-* [ARROW-4872](https://issues.apache.org/jira/browse/ARROW-4872) - [Python] Keep backward compatibility for ParquetDatasetPiece
-* [ARROW-4879](https://issues.apache.org/jira/browse/ARROW-4879) - [C++] cmake can't use conda's flatbuffers
-* [ARROW-4881](https://issues.apache.org/jira/browse/ARROW-4881) - [Python] bundle\_zlib CMake function still uses ARROW\_BUILD\_TOOLCHAIN
-* [ARROW-4900](https://issues.apache.org/jira/browse/ARROW-4900) - mingw-w64 < 5 does not have \_\_cpuidex
-* [ARROW-4903](https://issues.apache.org/jira/browse/ARROW-4903) - [C++] Building tests using only static libs not possible
-* [ARROW-4906](https://issues.apache.org/jira/browse/ARROW-4906) - [Format] Fix document to describe that SparseMatrixIndexCSR assumes indptr is sorted for each row
-* [ARROW-4918](https://issues.apache.org/jira/browse/ARROW-4918) - [C++] Add cmake-format to pre-commit
-* [ARROW-4928](https://issues.apache.org/jira/browse/ARROW-4928) - [Python] Hypothesis test failures
-* [ARROW-4931](https://issues.apache.org/jira/browse/ARROW-4931) - [C++] CMake fails on gRPC ExternalProject
-* [ARROW-4938](https://issues.apache.org/jira/browse/ARROW-4938) - [GLib] Undefined symbols error occurs when GIR file is being generated
-* [ARROW-4942](https://issues.apache.org/jira/browse/ARROW-4942) - [Ruby] Remove needless omits
-* [ARROW-4948](https://issues.apache.org/jira/browse/ARROW-4948) - [JS] Nightly test failing with "Cannot assign to read only property"
-* [ARROW-4950](https://issues.apache.org/jira/browse/ARROW-4950) - [C++] Thirdparty CMake error: get\_target\_property() called with non-existent target LZ4::lz4
-* [ARROW-4952](https://issues.apache.org/jira/browse/ARROW-4952) - [C++] Equals / ApproxEquals behaviour undefined on FP NaNs
-* [ARROW-4953](https://issues.apache.org/jira/browse/ARROW-4953) - [Ruby] Not loading libarrow-glib
-* [ARROW-4954](https://issues.apache.org/jira/browse/ARROW-4954) - [Python] test failure with Flight enabled
-* [ARROW-4958](https://issues.apache.org/jira/browse/ARROW-4958) - [C++] Purely static linking broken
-* [ARROW-4961](https://issues.apache.org/jira/browse/ARROW-4961) - [C++][Python] Add GTest\_SOURCE=BUNDLED to relevant build docs that use conda-forge toolchain
-* [ARROW-4962](https://issues.apache.org/jira/browse/ARROW-4962) - [C++] Warning level to CHECKIN can't compile on modern GCC
-* [ARROW-4976](https://issues.apache.org/jira/browse/ARROW-4976) - [JS] RecordBatchReader should reset its Node/DOM streams
-* [ARROW-4982](https://issues.apache.org/jira/browse/ARROW-4982) - [GLib][CI] Run tests on AppVeyor
-* [ARROW-4984](https://issues.apache.org/jira/browse/ARROW-4984) - [Flight][C++] Flight server segfaults when port is in use
-* [ARROW-4986](https://issues.apache.org/jira/browse/ARROW-4986) - [CI] Travis fails to install llvm@7
-* [ARROW-4989](https://issues.apache.org/jira/browse/ARROW-4989) - [C++] Builds fail to find Ubuntu-packaged re2 library
-* [ARROW-4991](https://issues.apache.org/jira/browse/ARROW-4991) - [CI] Bump travis node version to 11.12
-* [ARROW-4997](https://issues.apache.org/jira/browse/ARROW-4997) - [C\#] ArrowStreamReader doesn't consume whole stream and doesn't implement sync read
-* [ARROW-5009](https://issues.apache.org/jira/browse/ARROW-5009) - [C++] Clean up \`using std::\*\` in files
-* [ARROW-5010](https://issues.apache.org/jira/browse/ARROW-5010) - [Release] Fix release script with llvm-7
-* [ARROW-5012](https://issues.apache.org/jira/browse/ARROW-5012) - [C++] "testing" headers not installed
-* [ARROW-5023](https://issues.apache.org/jira/browse/ARROW-5023) - [Release] Default value syntax in shell is wrong
-* [ARROW-5024](https://issues.apache.org/jira/browse/ARROW-5024) - [Release] crossbow.py --arrow-version causes missing variable error
-* [ARROW-5025](https://issues.apache.org/jira/browse/ARROW-5025) - [Python][Packaging] wheels for Windows are broken
-* [ARROW-5026](https://issues.apache.org/jira/browse/ARROW-5026) - [Python][Packaging] conda package on non-Windows is broken
-* [ARROW-5029](https://issues.apache.org/jira/browse/ARROW-5029) - [C++] Compilation warnings in release mode
-* [ARROW-5031](https://issues.apache.org/jira/browse/ARROW-5031) - [Dev] Release verification script does not run CUDA tests in Python
-* [ARROW-5042](https://issues.apache.org/jira/browse/ARROW-5042) - [Release] Wrong ARROW\_DEPENDENCY\_SOURCE in verification script
-* [ARROW-5043](https://issues.apache.org/jira/browse/ARROW-5043) - [Release][Ruby] red-arrow dependency can't be resolved in verification script
-* [ARROW-5044](https://issues.apache.org/jira/browse/ARROW-5044) - [Release][Rust] Format error in verification script
-* [ARROW-5046](https://issues.apache.org/jira/browse/ARROW-5046) - [Release][C++] Plasma test is fragile in verification script
-* [ARROW-5047](https://issues.apache.org/jira/browse/ARROW-5047) - [Release] Always set up parquet-testing in verification script
-* [ARROW-5048](https://issues.apache.org/jira/browse/ARROW-5048) - [Release][Rust] arrow-testing is missing in verification script
-* [ARROW-5050](https://issues.apache.org/jira/browse/ARROW-5050) - [C++] cares\_ep should build before grpc\_ep
-* [ARROW-5087](https://issues.apache.org/jira/browse/ARROW-5087) - [Debian] APT repository no longer contains libarrow-dev
-* [ARROW-5658](https://issues.apache.org/jira/browse/ARROW-5658) - [Java] Provide ability to resync VectorSchemaRoot if types change
-* [PARQUET-1482](https://issues.apache.org/jira/browse/PARQUET-1482) - [C++] Unable to read data from parquet file generated with parquetjs
-* [PARQUET-1494](https://issues.apache.org/jira/browse/PARQUET-1494) - [C++] Can't access parquet statistics on binary columns
-* [PARQUET-1532](https://issues.apache.org/jira/browse/PARQUET-1532) - [C++] Can't build column reader test with MinGW
-
-
-## New Features and Improvements
-
-* [ARROW-47](https://issues.apache.org/jira/browse/ARROW-47) - [C++] Consider adding a scalar type object model
-* [ARROW-331](https://issues.apache.org/jira/browse/ARROW-331) - [Python] Timeline for dropping Python 2.7 support
-* [ARROW-549](https://issues.apache.org/jira/browse/ARROW-549) - [C++] Add function to concatenate like-typed arrays
-* [ARROW-572](https://issues.apache.org/jira/browse/ARROW-572) - [C++] Apply visitor pattern in IPC metadata
-* [ARROW-585](https://issues.apache.org/jira/browse/ARROW-585) - [C++] Define public API for user-defined data types
-* [ARROW-694](https://issues.apache.org/jira/browse/ARROW-694) - [C++] Build JSON "scanner" for reading record batches from line-delimited JSON files
-* [ARROW-1425](https://issues.apache.org/jira/browse/ARROW-1425) - [Python] Document semantic differences between Spark timestamps and Arrow timestamps
-* [ARROW-1572](https://issues.apache.org/jira/browse/ARROW-1572) - [C++] Implement "value counts" kernels for tabulating value frequencies
-* [ARROW-1639](https://issues.apache.org/jira/browse/ARROW-1639) - [Python] More efficient serialization for RangeIndex in serialize\_pandas
-* [ARROW-1642](https://issues.apache.org/jira/browse/ARROW-1642) - [GLib] Build GLib using Meson in Appveyor
-* [ARROW-1807](https://issues.apache.org/jira/browse/ARROW-1807) - [Java] Reduce Heap Usage (Phase 3): consolidate buffers
-* [ARROW-1896](https://issues.apache.org/jira/browse/ARROW-1896) - [C++] Do not allocate memory for primitive outputs in CastKernel::Call implementation
-* [ARROW-2015](https://issues.apache.org/jira/browse/ARROW-2015) - [Java] Use Java Time and Date APIs instead of JodaTime
-* [ARROW-2022](https://issues.apache.org/jira/browse/ARROW-2022) - [Format] Add custom metadata field specific to a RecordBatch message
-* [ARROW-2112](https://issues.apache.org/jira/browse/ARROW-2112) - [C++] Enable cpplint to be run on Windows
-* [ARROW-2243](https://issues.apache.org/jira/browse/ARROW-2243) - [C++] Enable IPO/LTO
-* [ARROW-2409](https://issues.apache.org/jira/browse/ARROW-2409) - [Rust] Test for build warnings, remove current warnings
-* [ARROW-2460](https://issues.apache.org/jira/browse/ARROW-2460) - [Rust] Schema and DataType::Struct should use Vec<Rc<Field\>\>
-* [ARROW-2487](https://issues.apache.org/jira/browse/ARROW-2487) - [C++] Provide a variant of AppendValues that takes bytemaps for the nullability
-* [ARROW-2523](https://issues.apache.org/jira/browse/ARROW-2523) - [Rust] Implement CAST operations for arrays
-* [ARROW-2620](https://issues.apache.org/jira/browse/ARROW-2620) - [Rust] Integrate memory pool abstraction with rest of codebase
-* [ARROW-2627](https://issues.apache.org/jira/browse/ARROW-2627) - [Python] Add option (or some equivalent) to toggle memory mapping functionality when using parquet.ParquetFile or other read entry points
-* [ARROW-2904](https://issues.apache.org/jira/browse/ARROW-2904) - [C++] Use FirstTimeBitmapWriter instead of SetBit functions in builder.h/cc
-* [ARROW-3066](https://issues.apache.org/jira/browse/ARROW-3066) - [Wiki] Add "How to contribute" to developer wiki
-* [ARROW-3084](https://issues.apache.org/jira/browse/ARROW-3084) - [Python] Do we need to build both unicode variants of pyarrow wheels?
-* [ARROW-3107](https://issues.apache.org/jira/browse/ARROW-3107) - [C++] arrow::PrettyPrint for Column instances
-* [ARROW-3121](https://issues.apache.org/jira/browse/ARROW-3121) - [C++] Mean kernel aggregate
-* [ARROW-3123](https://issues.apache.org/jira/browse/ARROW-3123) - [C++] Incremental Count, Count Not Null aggregator
-* [ARROW-3135](https://issues.apache.org/jira/browse/ARROW-3135) - [C++] Add helper functions for validity bitmap propagation in kernel context
-* [ARROW-3149](https://issues.apache.org/jira/browse/ARROW-3149) - [C++] Use gRPC (when it exists) from conda-forge for CI builds
-* [ARROW-3162](https://issues.apache.org/jira/browse/ARROW-3162) - [Python] Enable Flight servers to be implemented in pure Python
-* [ARROW-3239](https://issues.apache.org/jira/browse/ARROW-3239) - [C++] Improve random data generation functions
-* [ARROW-3255](https://issues.apache.org/jira/browse/ARROW-3255) - [C++/Python] Migrate Travis CI jobs off Xcode 6.4
-* [ARROW-3289](https://issues.apache.org/jira/browse/ARROW-3289) - [C++] Implement DoPut command for Flight on client and server side
-* [ARROW-3292](https://issues.apache.org/jira/browse/ARROW-3292) - [C++] Test Flight RPC in Travis CI
-* [ARROW-3295](https://issues.apache.org/jira/browse/ARROW-3295) - [Packaging] Package gRPC libraries in conda-forge for use in builds, packaging
-* [ARROW-3297](https://issues.apache.org/jira/browse/ARROW-3297) - [Python] Python bindings for Flight C++ client
-* [ARROW-3311](https://issues.apache.org/jira/browse/ARROW-3311) - [R] Functions for deserializing IPC components from arrow::Buffer or from IO interface
-* [ARROW-3328](https://issues.apache.org/jira/browse/ARROW-3328) - [Flight] Allow for optional unique flight identifier to be sent with FlightGetInfo
-* [ARROW-3361](https://issues.apache.org/jira/browse/ARROW-3361) - [R] Run cpp/build-support/cpplint.py on C++ source files
-* [ARROW-3364](https://issues.apache.org/jira/browse/ARROW-3364) - [Doc] Document docker compose setup
-* [ARROW-3367](https://issues.apache.org/jira/browse/ARROW-3367) - [INTEGRATION] Port Spark integration test to the docker-compose setup
-* [ARROW-3422](https://issues.apache.org/jira/browse/ARROW-3422) - [C++] Add "toolchain" target to ensure that all required toolchain libraries are built
-* [ARROW-3434](https://issues.apache.org/jira/browse/ARROW-3434) - [Packaging] Add Apache ORC C++ library to conda-forge
-* [ARROW-3435](https://issues.apache.org/jira/browse/ARROW-3435) - [C++] Add option to use dynamic linking with re2
-* [ARROW-3511](https://issues.apache.org/jira/browse/ARROW-3511) - [Gandiva] support input selection vectors for both projector and filter
-* [ARROW-3532](https://issues.apache.org/jira/browse/ARROW-3532) - [Python] Schema, StructType, StructArray field retrieval by name should raise warning or exception for multiple matches
-* [ARROW-3550](https://issues.apache.org/jira/browse/ARROW-3550) - [C++] Use kUnknownNullCount in NumericArray constructor
-* [ARROW-3554](https://issues.apache.org/jira/browse/ARROW-3554) - [C++] Reverse traits for C++
-* [ARROW-3594](https://issues.apache.org/jira/browse/ARROW-3594) - [Packaging] Build "cares" library in conda-forge
-* [ARROW-3595](https://issues.apache.org/jira/browse/ARROW-3595) - [Packaging] Build boringssl in conda-forge
-* [ARROW-3596](https://issues.apache.org/jira/browse/ARROW-3596) - [Packaging] Build gRPC in conda-forge
-* [ARROW-3619](https://issues.apache.org/jira/browse/ARROW-3619) - [R] Expose global thread pool options
-* [ARROW-3631](https://issues.apache.org/jira/browse/ARROW-3631) - [C\#] Add Appveyor build for C\#
-* [ARROW-3653](https://issues.apache.org/jira/browse/ARROW-3653) - [Python/C++] Support data copying between different GPU devices
-* [ARROW-3735](https://issues.apache.org/jira/browse/ARROW-3735) - [Python] Proper error handling in \_ensure\_type
-* [ARROW-3761](https://issues.apache.org/jira/browse/ARROW-3761) - [R] Bindings for CompressedInputStream, CompressedOutputStream
-* [ARROW-3763](https://issues.apache.org/jira/browse/ARROW-3763) - [C++] Write Parquet ByteArray / FixedLenByteArray reader batches directly into arrow::BinaryBuilder
-* [ARROW-3769](https://issues.apache.org/jira/browse/ARROW-3769) - [C++] Support reading non-dictionary encoded binary Parquet columns directly as DictionaryArray
-* [ARROW-3770](https://issues.apache.org/jira/browse/ARROW-3770) - [C++] Validate or add option to validate arrow::Table schema in parquet::arrow::FileWriter::WriteTable
-* [ARROW-3816](https://issues.apache.org/jira/browse/ARROW-3816) - [R] nrow.RecordBatch method
-* [ARROW-3824](https://issues.apache.org/jira/browse/ARROW-3824) - [R] Document developer workflow for building project, running unit tests in r/README.md
-* [ARROW-3838](https://issues.apache.org/jira/browse/ARROW-3838) - [Rust] Implement CSV Writer
-* [ARROW-3846](https://issues.apache.org/jira/browse/ARROW-3846) - [Gandiva] Build on Windows
-* [ARROW-3882](https://issues.apache.org/jira/browse/ARROW-3882) - [Rust] PrimitiveArray<T\> should support cast operations
-* [ARROW-3903](https://issues.apache.org/jira/browse/ARROW-3903) - [Python] Random array generator for Arrow conversion and Parquet testing
-* [ARROW-3926](https://issues.apache.org/jira/browse/ARROW-3926) - [Python] Add Gandiva bindings to Python wheels
-* [ARROW-3951](https://issues.apache.org/jira/browse/ARROW-3951) - [Go] implement a CSV writer
-* [ARROW-3954](https://issues.apache.org/jira/browse/ARROW-3954) - [Rust] Add Slice to Array and ArrayData
-* [ARROW-3965](https://issues.apache.org/jira/browse/ARROW-3965) - [Java] JDBC-to-Arrow Conversion: Configuration Object
-* [ARROW-3966](https://issues.apache.org/jira/browse/ARROW-3966) - [Java] JDBC-to-Arrow Conversion: JDBC Metadata in Schema Fields
-* [ARROW-3972](https://issues.apache.org/jira/browse/ARROW-3972) - [C++] Update LLVM and Clang bits to 7.0
-* [ARROW-3981](https://issues.apache.org/jira/browse/ARROW-3981) - [C++] Rename json.h
-* [ARROW-3985](https://issues.apache.org/jira/browse/ARROW-3985) - [C++] Pass -C option when compiling with ccache to avoid some warnings
-* [ARROW-4012](https://issues.apache.org/jira/browse/ARROW-4012) - [Documentation][C++] Document how to install Apache Arrow on MSYS2
-* [ARROW-4014](https://issues.apache.org/jira/browse/ARROW-4014) - [C++] Fix "LIBCMT" warnings on MSVC
-* [ARROW-4023](https://issues.apache.org/jira/browse/ARROW-4023) - [Gandiva] Address long CI times in macOS builds
-* [ARROW-4024](https://issues.apache.org/jira/browse/ARROW-4024) - [Python] Cython compilation error on cython==0.27.3
-* [ARROW-4031](https://issues.apache.org/jira/browse/ARROW-4031) - [C++] Refactor ArrayBuilder bitmap logic into TypedBufferBuilder<bool\>
-* [ARROW-4040](https://issues.apache.org/jira/browse/ARROW-4040) - [Rust] Add array\_ops method for filtering an array
-* [ARROW-4056](https://issues.apache.org/jira/browse/ARROW-4056) - [C++] Upgrade to boost-cpp 1.69.0 again
-* [ARROW-4061](https://issues.apache.org/jira/browse/ARROW-4061) - [Rust] [Parquet] Implement "spaced" version for non-dictionary encoding/decoding
-* [ARROW-4068](https://issues.apache.org/jira/browse/ARROW-4068) - [Gandiva] Support building with Xcode 6.4
-* [ARROW-4071](https://issues.apache.org/jira/browse/ARROW-4071) - [Rust] Add rustfmt as a pre-commit hook
-* [ARROW-4072](https://issues.apache.org/jira/browse/ARROW-4072) - [Rust] Set default value for PARQUET\_TEST\_DATA
-* [ARROW-4092](https://issues.apache.org/jira/browse/ARROW-4092) - [Rust] Implement common Reader / DataSource trait for CSV and Parquet
-* [ARROW-4094](https://issues.apache.org/jira/browse/ARROW-4094) - [Python] Store RangeIndex in Parquet files as metadata rather than a physical data column
-* [ARROW-4110](https://issues.apache.org/jira/browse/ARROW-4110) - [C++] Do not generate distinct cast kernels when input and output type are the same
-* [ARROW-4123](https://issues.apache.org/jira/browse/ARROW-4123) - [C++] Improve linting workflow and documentation for Windows-based developers
-* [ARROW-4124](https://issues.apache.org/jira/browse/ARROW-4124) - [C++] Abstract aggregation kernel API
-* [ARROW-4142](https://issues.apache.org/jira/browse/ARROW-4142) - [Java] JDBC-to-Arrow: JDBC Arrays
-* [ARROW-4165](https://issues.apache.org/jira/browse/ARROW-4165) - [C++] Port cpp/apidoc/Windows.md and other files to Sphinx / rst
-* [ARROW-4180](https://issues.apache.org/jira/browse/ARROW-4180) - [Java] Reduce verbose logging of ArrowBuf creation events?
-* [ARROW-4196](https://issues.apache.org/jira/browse/ARROW-4196) - [Rust] Add explicit SIMD vectorization for arithmetic ops in "array\_ops"
-* [ARROW-4198](https://issues.apache.org/jira/browse/ARROW-4198) - [Gandiva] Add support to cast timestamp
-* [ARROW-4204](https://issues.apache.org/jira/browse/ARROW-4204) - [Gandiva] implement decimal subtract
-* [ARROW-4205](https://issues.apache.org/jira/browse/ARROW-4205) - [Gandiva] Implement decimal multiply
-* [ARROW-4206](https://issues.apache.org/jira/browse/ARROW-4206) - [Gandiva] Implement decimal divide
-* [ARROW-4212](https://issues.apache.org/jira/browse/ARROW-4212) - [Python] [CUDA] Creating a CUDA buffer from Numba device array should be easier
-* [ARROW-4230](https://issues.apache.org/jira/browse/ARROW-4230) - [C++] Enable building flight against system gRPC
-* [ARROW-4232](https://issues.apache.org/jira/browse/ARROW-4232) - [C++] Follow conda-forge compiler ABI migration
-* [ARROW-4234](https://issues.apache.org/jira/browse/ARROW-4234) - [C++] Add memory bandwidth benchmarks to arrow/util/machine-benchmark.cc
-* [ARROW-4235](https://issues.apache.org/jira/browse/ARROW-4235) - [GLib] Use "column\_builder" in GArrowRecordBatchBuilder
-* [ARROW-4236](https://issues.apache.org/jira/browse/ARROW-4236) - [Java] Distinct plasma client create exceptions
-* [ARROW-4245](https://issues.apache.org/jira/browse/ARROW-4245) - [Rust] Add Rustdoc header to each source file
-* [ARROW-4247](https://issues.apache.org/jira/browse/ARROW-4247) - [Packaging] Update verify script for 0.12.0
-* [ARROW-4251](https://issues.apache.org/jira/browse/ARROW-4251) - [C++] Add option to use vendored Boost in verify-release-candidate.sh
-* [ARROW-4262](https://issues.apache.org/jira/browse/ARROW-4262) - [Website] Blog post to give preview into using R and Arrow with Apache Spark
-* [ARROW-4263](https://issues.apache.org/jira/browse/ARROW-4263) - [Rust] Donate DataFusion
-* [ARROW-4265](https://issues.apache.org/jira/browse/ARROW-4265) - [C++] Automatic conversion between Table and std::vector<std::tuple<..\>\>
-* [ARROW-4268](https://issues.apache.org/jira/browse/ARROW-4268) - [C++] Add C primitive to Arrow::Type compile-time mapping in TypeTraits
-* [ARROW-4271](https://issues.apache.org/jira/browse/ARROW-4271) - [Rust] Move Parquet specific info to Parquet Readme
-* [ARROW-4273](https://issues.apache.org/jira/browse/ARROW-4273) - [Release] Fix verification script to use cf201901 conda-forge label
-* [ARROW-4277](https://issues.apache.org/jira/browse/ARROW-4277) - [C++] Add gmock to toolchain
-* [ARROW-4281](https://issues.apache.org/jira/browse/ARROW-4281) - [CI] Use Ubuntu Xenial (16.04) VMs on Travis-CI
-* [ARROW-4285](https://issues.apache.org/jira/browse/ARROW-4285) - [Python] Use proper builder interface for serialization
-* [ARROW-4287](https://issues.apache.org/jira/browse/ARROW-4287) - [C++] Ensure minimal bison version on OSX for Thrift
-* [ARROW-4289](https://issues.apache.org/jira/browse/ARROW-4289) - [C++] Forward AR and RANLIB to thirdparty builds
-* [ARROW-4290](https://issues.apache.org/jira/browse/ARROW-4290) - [C++/Gandiva] Support detecting correct LLVM version in Homebrew
-* [ARROW-4291](https://issues.apache.org/jira/browse/ARROW-4291) - [Dev] Support selecting features in release scripts
-* [ARROW-4294](https://issues.apache.org/jira/browse/ARROW-4294) - [Plasma] Add support for evicting objects to external store
-* [ARROW-4297](https://issues.apache.org/jira/browse/ARROW-4297) - [C++] Fix build for 32-bit MSYS2
-* [ARROW-4298](https://issues.apache.org/jira/browse/ARROW-4298) - [Java] Building Flight fails with OpenJDK 11
-* [ARROW-4299](https://issues.apache.org/jira/browse/ARROW-4299) - [Ruby] Depend on the same version as Red Arrow
-* [ARROW-4300](https://issues.apache.org/jira/browse/ARROW-4300) - [C++] Restore apache-arrow Homebrew recipe and define process for maintaining and updating for releases
-* [ARROW-4303](https://issues.apache.org/jira/browse/ARROW-4303) - [Gandiva/Python] Build LLVM with RTTI in manylinux1 container
-* [ARROW-4305](https://issues.apache.org/jira/browse/ARROW-4305) - [Rust] Fix parquet version number in README
-* [ARROW-4307](https://issues.apache.org/jira/browse/ARROW-4307) - [C++] Fix doxygen warnings, include doxygen warning checks in CI linting
-* [ARROW-4310](https://issues.apache.org/jira/browse/ARROW-4310) - [Website] Update install document for 0.12.0
-* [ARROW-4313](https://issues.apache.org/jira/browse/ARROW-4313) - Define general benchmark database schema
-* [ARROW-4315](https://issues.apache.org/jira/browse/ARROW-4315) - [Website] Home page of https://arrow.apache.org/ does not mention Go or Rust
-* [ARROW-4318](https://issues.apache.org/jira/browse/ARROW-4318) - [C++] Add Tensor::CountNonZero
-* [ARROW-4321](https://issues.apache.org/jira/browse/ARROW-4321) - [CI] Setup conda-forge channel globally in docker containers
-* [ARROW-4330](https://issues.apache.org/jira/browse/ARROW-4330) - [C++] Use FindThreads.cmake to handle -pthread compiler/link options
-* [ARROW-4331](https://issues.apache.org/jira/browse/ARROW-4331) - [C++] Extend Scalar Datum to support more types
-* [ARROW-4332](https://issues.apache.org/jira/browse/ARROW-4332) - [Website] Instructions and scripts for publishing web site appear to be incorrect
-* [ARROW-4334](https://issues.apache.org/jira/browse/ARROW-4334) - [CI] Setup conda-forge channel globally in travis builds
-* [ARROW-4335](https://issues.apache.org/jira/browse/ARROW-4335) - [C++] Better document sparse tensor support
-* [ARROW-4336](https://issues.apache.org/jira/browse/ARROW-4336) - [C++] Default BUILD\_WARNING\_LEVEL to CHECKIN
-* [ARROW-4339](https://issues.apache.org/jira/browse/ARROW-4339) - [C++] rewrite cpp/README to be shorter, with a separate contribution guide
-* [ARROW-4340](https://issues.apache.org/jira/browse/ARROW-4340) - [C++] Update IWYU version in the \`lint\` dockerfile
-* [ARROW-4341](https://issues.apache.org/jira/browse/ARROW-4341) - [C++] Use TypedBufferBuilder<bool\> in BooleanBuilder
-* [ARROW-4344](https://issues.apache.org/jira/browse/ARROW-4344) - [Java] Further cleanup maven output
-* [ARROW-4345](https://issues.apache.org/jira/browse/ARROW-4345) - [C++] Add Apache 2.0 license file to the Parquet-testing repository
-* [ARROW-4346](https://issues.apache.org/jira/browse/ARROW-4346) - [C++] Fix compiler warnings with gcc 8.2.0
-* [ARROW-4352](https://issues.apache.org/jira/browse/ARROW-4352) - [C++] Add support for system Google Test
-* [ARROW-4353](https://issues.apache.org/jira/browse/ARROW-4353) - [CI] Add jobs for 32-bit and 64-bit MinGW
-* [ARROW-4358](https://issues.apache.org/jira/browse/ARROW-4358) - [Gandiva][Crossbow] Trusty build broken
-* [ARROW-4361](https://issues.apache.org/jira/browse/ARROW-4361) - [Website] Update committers list
-* [ARROW-4362](https://issues.apache.org/jira/browse/ARROW-4362) - [Java] Test OpenJDK 11 in CI
-* [ARROW-4363](https://issues.apache.org/jira/browse/ARROW-4363) - [C++] Add CMake format checks
-* [ARROW-4372](https://issues.apache.org/jira/browse/ARROW-4372) - [C++] Embed precompiled bitcode in the gandiva library
-* [ARROW-4373](https://issues.apache.org/jira/browse/ARROW-4373) - [Packaging] Travis fails to deploy conda packages on OSX
-* [ARROW-4375](https://issues.apache.org/jira/browse/ARROW-4375) - [CI] Sphinx dependencies were removed from docs conda environment
-* [ARROW-4376](https://issues.apache.org/jira/browse/ARROW-4376) - [Rust] Implement from\_buf\_reader for csv::Reader
-* [ARROW-4377](https://issues.apache.org/jira/browse/ARROW-4377) - [Rust] Implement std::fmt::Debug for all PrimitiveArrays
-* [ARROW-4379](https://issues.apache.org/jira/browse/ARROW-4379) - Register pyarrow serializers for collections.Counter and collections.deque
-* [ARROW-4383](https://issues.apache.org/jira/browse/ARROW-4383) - [C++] Use the CMake's standard find features
-* [ARROW-4386](https://issues.apache.org/jira/browse/ARROW-4386) - [Rust] Implement Date and Time Arrays
-* [ARROW-4388](https://issues.apache.org/jira/browse/ARROW-4388) - [Go] add DimNames() method to tensor Interface?
-* [ARROW-4393](https://issues.apache.org/jira/browse/ARROW-4393) - [Rust] coding style: apply 90 characters per line limit
-* [ARROW-4396](https://issues.apache.org/jira/browse/ARROW-4396) - Update Typedoc to support TypeScript 3.2
-* [ARROW-4397](https://issues.apache.org/jira/browse/ARROW-4397) - [C++] dim\_names in Tensor and SparseTensor
-* [ARROW-4399](https://issues.apache.org/jira/browse/ARROW-4399) - [C++] Remove usage of "extern template class" from NumericArray<T\>
-* [ARROW-4401](https://issues.apache.org/jira/browse/ARROW-4401) - [Python] Alpine dockerfile fails to build because pandas requires numpy as build dependency
-* [ARROW-4406](https://issues.apache.org/jira/browse/ARROW-4406) - Ignore "\*\_$folder$" files on S3
-* [ARROW-4408](https://issues.apache.org/jira/browse/ARROW-4408) - [C++/Doc] Remove outdated Parquet documentation
-* [ARROW-4422](https://issues.apache.org/jira/browse/ARROW-4422) - [Plasma] Enforce memory limit in plasma, rather than relying on dlmalloc\_set\_footprint\_limit
-* [ARROW-4423](https://issues.apache.org/jira/browse/ARROW-4423) - [C++] Update version of vendored gtest to 1.8.1
-* [ARROW-4424](https://issues.apache.org/jira/browse/ARROW-4424) - [Python] Manylinux CI builds failing
-* [ARROW-4425](https://issues.apache.org/jira/browse/ARROW-4425) - Add link to 'Contributing' page in the top-level Arrow README
-* [ARROW-4430](https://issues.apache.org/jira/browse/ARROW-4430) - [C++] add unit test for currently unused append method
-* [ARROW-4431](https://issues.apache.org/jira/browse/ARROW-4431) - [C++] Build gRPC as ExternalProject without allowing it to build its vendored dependencies
-* [ARROW-4435](https://issues.apache.org/jira/browse/ARROW-4435) - [C\#] Add .sln file and minor .csproj fix ups
-* [ARROW-4436](https://issues.apache.org/jira/browse/ARROW-4436) - [Documentation] Clarify instructions for building documentation
-* [ARROW-4442](https://issues.apache.org/jira/browse/ARROW-4442) - [JS] Overly broad type annotation for Chunked typeId leading to type mismatches in generated typing
-* [ARROW-4444](https://issues.apache.org/jira/browse/ARROW-4444) - [Testing] Add DataFusion test files to arrow-testing repo
-* [ARROW-4445](https://issues.apache.org/jira/browse/ARROW-4445) - [C++][Gandiva] Run Gandiva-LLVM tests in Appveyor
-* [ARROW-4446](https://issues.apache.org/jira/browse/ARROW-4446) - [Python] Run Gandiva tests on Windows and Appveyor
-* [ARROW-4448](https://issues.apache.org/jira/browse/ARROW-4448) - [Java][Flight] Flaky Flight Java test
-* [ARROW-4449](https://issues.apache.org/jira/browse/ARROW-4449) - [Rust] Convert File to T: Read + Seek for schema inference
-* [ARROW-4454](https://issues.apache.org/jira/browse/ARROW-4454) - [C++] fix unused parameter warnings
-* [ARROW-4455](https://issues.apache.org/jira/browse/ARROW-4455) - [Plasma] g++ 8 reports class-memaccess warnings
-* [ARROW-4459](https://issues.apache.org/jira/browse/ARROW-4459) - [Testing] Add git submodule for arrow-testing data files
-* [ARROW-4460](https://issues.apache.org/jira/browse/ARROW-4460) - [Website] Write blog post to announce DataFusion donation
-* [ARROW-4461](https://issues.apache.org/jira/browse/ARROW-4461) - [C++] Expose bit-util methods for binary boolean operations that don't allocate
-* [ARROW-4462](https://issues.apache.org/jira/browse/ARROW-4462) - [C++] Upgrade LZ4 v1.7.5 to v1.8.3 to compile with VS2017
-* [ARROW-4464](https://issues.apache.org/jira/browse/ARROW-4464) - [Rust] [DataFusion] Add support for LIMIT
-* [ARROW-4466](https://issues.apache.org/jira/browse/ARROW-4466) - [Rust] [DataFusion] Add support for Parquet data sources
-* [ARROW-4468](https://issues.apache.org/jira/browse/ARROW-4468) - [Rust] Implement BitAnd/BitOr for &Buffer (with SIMD)
-* [ARROW-4472](https://issues.apache.org/jira/browse/ARROW-4472) - [Website][Python] Blog post about Python string memory use improvements in 0.12
-* [ARROW-4475](https://issues.apache.org/jira/browse/ARROW-4475) - [Python] Serializing objects that contain themselves
-* [ARROW-4476](https://issues.apache.org/jira/browse/ARROW-4476) - [Rust] [DataFusion] Post donation clean up tasks
-* [ARROW-4481](https://issues.apache.org/jira/browse/ARROW-4481) - [Website] Instructions for publishing web site are missing a step
-* [ARROW-4483](https://issues.apache.org/jira/browse/ARROW-4483) - [Website] Fix broken link (author) in DataFusion blog post
-* [ARROW-4485](https://issues.apache.org/jira/browse/ARROW-4485) - [CI] Determine maintenance approach to pinned conda-forge binutils package
-* [ARROW-4486](https://issues.apache.org/jira/browse/ARROW-4486) - [Python][CUDA] pyarrow.cuda.Context.foreign\_buffer should have a \`base=None\` argument
-* [ARROW-4488](https://issues.apache.org/jira/browse/ARROW-4488) - [Rust] From AsRef<[u8]\> for Buffer does not ensure correct padding
-* [ARROW-4489](https://issues.apache.org/jira/browse/ARROW-4489) - [Rust] PrimitiveArray.value\_slice performs bounds checking when it should not
-* [ARROW-4490](https://issues.apache.org/jira/browse/ARROW-4490) - [Rust] Add explicit SIMD vectorization for boolean ops in "array\_ops"
-* [ARROW-4491](https://issues.apache.org/jira/browse/ARROW-4491) - [Python] Remove usage of std::to\_string and std::stoi
-* [ARROW-4499](https://issues.apache.org/jira/browse/ARROW-4499) - [Python][CI] Upgrade to latest flake8 3.7.5 in travis\_lint.sh
-* [ARROW-4502](https://issues.apache.org/jira/browse/ARROW-4502) - [C\#] Add support for zero-copy reads
-* [ARROW-4506](https://issues.apache.org/jira/browse/ARROW-4506) - [Ruby] Add Arrow::RecordBatch\#raw\_records
-* [ARROW-4513](https://issues.apache.org/jira/browse/ARROW-4513) - [Rust] Implement BitAnd/BitOr for &Bitmap
-* [ARROW-4517](https://issues.apache.org/jira/browse/ARROW-4517) - [JS] remove version number as it is not used
-* [ARROW-4518](https://issues.apache.org/jira/browse/ARROW-4518) - [JS] add jsdelivr to package.json
-* [ARROW-4528](https://issues.apache.org/jira/browse/ARROW-4528) - [C++] Update lint docker container to LLVM-7
-* [ARROW-4529](https://issues.apache.org/jira/browse/ARROW-4529) - [C++] Add test coverage for BitUtils::RoundDown
-* [ARROW-4531](https://issues.apache.org/jira/browse/ARROW-4531) - [C++] Handling of non-aligned slices in Sum kernel
-* [ARROW-4537](https://issues.apache.org/jira/browse/ARROW-4537) - [CI] Suppress shell warning on travis-ci
-* [ARROW-4539](https://issues.apache.org/jira/browse/ARROW-4539) - [Java] List vector child value count not set correctly
-* [ARROW-4540](https://issues.apache.org/jira/browse/ARROW-4540) - [Rust] Add basic JSON reader
-* [ARROW-4543](https://issues.apache.org/jira/browse/ARROW-4543) - [C\#] Update Flat Buffers code to latest version
-* [ARROW-4546](https://issues.apache.org/jira/browse/ARROW-4546) - [C++] LICENSE.txt should be updated
-* [ARROW-4547](https://issues.apache.org/jira/browse/ARROW-4547) - [Python][Documentation] Update python/development.rst with instructions for CUDA-enabled builds
-* [ARROW-4556](https://issues.apache.org/jira/browse/ARROW-4556) - [Rust] Preserve order of JSON inferred schema
-* [ARROW-4558](https://issues.apache.org/jira/browse/ARROW-4558) - [C++][Flight] Avoid undefined behavior with gRPC memory optimizations
-* [ARROW-4560](https://issues.apache.org/jira/browse/ARROW-4560) - [R] array() needs to take a single input, not ...
-* [ARROW-4562](https://issues.apache.org/jira/browse/ARROW-4562) - [C++][Flight] Create outgoing composite grpc::ByteBuffer instead of allocating contiguous slice and copying IpcPayload into it
-* [ARROW-4564](https://issues.apache.org/jira/browse/ARROW-4564) - [C++] IWYU docker image silently fails
-* [ARROW-4565](https://issues.apache.org/jira/browse/ARROW-4565) - [R] Reading records with all non-null decimals SEGFAULTs
-* [ARROW-4568](https://issues.apache.org/jira/browse/ARROW-4568) - [C++] Add version macros to headers
-* [ARROW-4572](https://issues.apache.org/jira/browse/ARROW-4572) - [C++] Remove memory zeroing from PrimitiveAllocatingUnaryKernel
-* [ARROW-4583](https://issues.apache.org/jira/browse/ARROW-4583) - [Plasma] There are bugs reported by code scan tool
-* [ARROW-4586](https://issues.apache.org/jira/browse/ARROW-4586) - [Rust] Remove arrow/mod.rs as it is not needed
-* [ARROW-4589](https://issues.apache.org/jira/browse/ARROW-4589) - [Rust] [DataFusion] Implement projection push down query optimizer rule
-* [ARROW-4590](https://issues.apache.org/jira/browse/ARROW-4590) - [Rust] Add explicit SIMD vectorization for comparison ops in "array\_ops"
-* [ARROW-4592](https://issues.apache.org/jira/browse/ARROW-4592) - [GLib] Stop configure immediately when GLib isn't available
-* [ARROW-4593](https://issues.apache.org/jira/browse/ARROW-4593) - [Ruby] Arrow::Array\#[out\_of\_range] returns nil
-* [ARROW-4594](https://issues.apache.org/jira/browse/ARROW-4594) - [Ruby] Arrow::StructArray\#[] returns Arrow::Struct instead of Arrow::Array
-* [ARROW-4595](https://issues.apache.org/jira/browse/ARROW-4595) - [Rust] [DataFusion] Implement DataFrame style API
-* [ARROW-4598](https://issues.apache.org/jira/browse/ARROW-4598) - [CI] Remove needless LLVM\_DIR for macOS
-* [ARROW-4599](https://issues.apache.org/jira/browse/ARROW-4599) - [C++] Add support for system GFlags
-* [ARROW-4602](https://issues.apache.org/jira/browse/ARROW-4602) - [Rust] [DataFusion] Integrate query optimizer with ExecutionContext
-* [ARROW-4603](https://issues.apache.org/jira/browse/ARROW-4603) - [Rust] [DataFusion] Execution context should allow in-memory data sources to be registered
-* [ARROW-4604](https://issues.apache.org/jira/browse/ARROW-4604) - [Rust] [DataFusion] Add benchmarks for SQL query execution
-* [ARROW-4605](https://issues.apache.org/jira/browse/ARROW-4605) - [Rust] Move filter and limit code from DataFusion into compute module
-* [ARROW-4609](https://issues.apache.org/jira/browse/ARROW-4609) - [C++] Use google benchmark from toolchain
-* [ARROW-4610](https://issues.apache.org/jira/browse/ARROW-4610) - [Plasma] Avoid JNI from crashing
-* [ARROW-4611](https://issues.apache.org/jira/browse/ARROW-4611) - [C++] Rework CMake third-party logic
-* [ARROW-4612](https://issues.apache.org/jira/browse/ARROW-4612) - [Python] Use cython from PyPI for windows wheels build
-* [ARROW-4613](https://issues.apache.org/jira/browse/ARROW-4613) - [C++] Alpine build failing as libgtestd.so is not found
-* [ARROW-4614](https://issues.apache.org/jira/browse/ARROW-4614) - [C++/CI] Activate flight build in ci/docker\_build\_cpp.sh
-* [ARROW-4615](https://issues.apache.org/jira/browse/ARROW-4615) - [C++] Add checked\_pointer\_cast
-* [ARROW-4616](https://issues.apache.org/jira/browse/ARROW-4616) - [C++] Log message in BuildUtils as STATUS
-* [ARROW-4618](https://issues.apache.org/jira/browse/ARROW-4618) - [Docker] Makefile to build dependent docker images
-* [ARROW-4619](https://issues.apache.org/jira/browse/ARROW-4619) - [R] Fix the autobrew script
-* [ARROW-4620](https://issues.apache.org/jira/browse/ARROW-4620) - [C\#] Add unit tests for "Types" in arrow/csharp
-* [ARROW-4623](https://issues.apache.org/jira/browse/ARROW-4623) - [R] update Rcpp dependency
-* [ARROW-4628](https://issues.apache.org/jira/browse/ARROW-4628) - [Rust] [DataFusion] Implement type coercion query optimizer rule
-* [ARROW-4632](https://issues.apache.org/jira/browse/ARROW-4632) - [Ruby] Add BigDecimal\#to\_arrow
-* [ARROW-4634](https://issues.apache.org/jira/browse/ARROW-4634) - [Rust] [Parquet] Reorganize test\_common mod to allow more test util code.
-* [ARROW-4637](https://issues.apache.org/jira/browse/ARROW-4637) - [Python] Avoid importing Pandas unless necessary
-* [ARROW-4638](https://issues.apache.org/jira/browse/ARROW-4638) - [R] install instructions using brew
-* [ARROW-4640](https://issues.apache.org/jira/browse/ARROW-4640) - [Python] Add docker-compose configuration to build and test the project without pandas installed
-* [ARROW-4643](https://issues.apache.org/jira/browse/ARROW-4643) - [C++] Add compiler diagnostic color when using Ninja
-* [ARROW-4644](https://issues.apache.org/jira/browse/ARROW-4644) - [C++/Docker] Build Gandiva in the docker containers
-* [ARROW-4645](https://issues.apache.org/jira/browse/ARROW-4645) - [C++/Packaging] Ship Gandiva with OSX and Windows wheels
-* [ARROW-4646](https://issues.apache.org/jira/browse/ARROW-4646) - [C++/Packaging] Ship gandiva with the conda-forge packages
-* [ARROW-4655](https://issues.apache.org/jira/browse/ARROW-4655) - [Packaging] Parallelize binary upload
-* [ARROW-4662](https://issues.apache.org/jira/browse/ARROW-4662) - [Python] Add type\_codes property in UnionType
-* [ARROW-4667](https://issues.apache.org/jira/browse/ARROW-4667) - [C++] Suppress unused function warnings with MinGW
-* [ARROW-4670](https://issues.apache.org/jira/browse/ARROW-4670) - [Rust] compute::sum performance issue
-* [ARROW-4671](https://issues.apache.org/jira/browse/ARROW-4671) - [C++] MakeBuilder doesn't support Type::DICTIONARY
-* [ARROW-4673](https://issues.apache.org/jira/browse/ARROW-4673) - [C++] Implement AssertDatumEquals
-* [ARROW-4676](https://issues.apache.org/jira/browse/ARROW-4676) - [C++] Add support for debug build with MinGW
-* [ARROW-4678](https://issues.apache.org/jira/browse/ARROW-4678) - [Rust] Minimize unstable feature usage
-* [ARROW-4679](https://issues.apache.org/jira/browse/ARROW-4679) - [Rust] [DataFusion] Implement in-memory DataSource
-* [ARROW-4681](https://issues.apache.org/jira/browse/ARROW-4681) - [Rust] [DataFusion] Implement parallel query execution using threads
-* [ARROW-4686](https://issues.apache.org/jira/browse/ARROW-4686) - Only accept 'y' or 'n' in merge\_arrow\_pr.py prompts
-* [ARROW-4689](https://issues.apache.org/jira/browse/ARROW-4689) - [Go] add support for WASM
-* [ARROW-4690](https://issues.apache.org/jira/browse/ARROW-4690) - [Python] Building TensorFlow compatible wheels for Arrow
-* [ARROW-4692](https://issues.apache.org/jira/browse/ARROW-4692) - [Format][Documentation] Add more details about "sidecar" to flight proto
-* [ARROW-4693](https://issues.apache.org/jira/browse/ARROW-4693) - [CI] Build boost library with multi precision
-* [ARROW-4697](https://issues.apache.org/jira/browse/ARROW-4697) - [C++] Add URI parsing facility
-* [ARROW-4703](https://issues.apache.org/jira/browse/ARROW-4703) - [C++] Upgrade dependency versions
-* [ARROW-4705](https://issues.apache.org/jira/browse/ARROW-4705) - [Rust] CSV reader should show line number and error message when failing to parse a line
-* [ARROW-4707](https://issues.apache.org/jira/browse/ARROW-4707) - [C++] move BitsetStack to bit-util.h
-* [ARROW-4718](https://issues.apache.org/jira/browse/ARROW-4718) - Add ArrowStreamWriter/Reader ctors that leave open the underlying Stream
-* [ARROW-4727](https://issues.apache.org/jira/browse/ARROW-4727) - [Rust] Implement ability to check if two schemas are the same
-* [ARROW-4730](https://issues.apache.org/jira/browse/ARROW-4730) - [C++] Add docker-compose entry for testing Fedora build with system packages
-* [ARROW-4731](https://issues.apache.org/jira/browse/ARROW-4731) - [C++] Add docker-compose entry for testing Ubuntu Xenial build with system packages
-* [ARROW-4732](https://issues.apache.org/jira/browse/ARROW-4732) - [C++] Add docker-compose entry for testing Debian Testing build with system packages
-* [ARROW-4733](https://issues.apache.org/jira/browse/ARROW-4733) - [C++] Add CI entry that builds without the conda-forge toolchain but with system packages
-* [ARROW-4734](https://issues.apache.org/jira/browse/ARROW-4734) - [Go] Add option to write a header for CSV writer
-* [ARROW-4735](https://issues.apache.org/jira/browse/ARROW-4735) - [Go] Benchmark strconv.Format vs. fmt.Sprintf for CSV writer
-* [ARROW-4739](https://issues.apache.org/jira/browse/ARROW-4739) - [Rust] [DataFusion] It should be possible to share a logical plan between threads
-* [ARROW-4740](https://issues.apache.org/jira/browse/ARROW-4740) - [Java] Upgrade to JUnit 5
-* [ARROW-4743](https://issues.apache.org/jira/browse/ARROW-4743) - [Java] Fix documentation in arrow memory module
-* [ARROW-4745](https://issues.apache.org/jira/browse/ARROW-4745) - [C++][Documentation] Document process for replicating static\_crt builds on windows
-* [ARROW-4749](https://issues.apache.org/jira/browse/ARROW-4749) - [Rust] RecordBatch::new() should return result instead of panicking
-* [ARROW-4751](https://issues.apache.org/jira/browse/ARROW-4751) - [C++] Add pkg-config to conda\_env\_cpp.yml
-* [ARROW-4754](https://issues.apache.org/jira/browse/ARROW-4754) - [CI][Java] Flaky TestAuth Flight test
-* [ARROW-4756](https://issues.apache.org/jira/browse/ARROW-4756) - [CI] document the procedure to update docker image for manylinux1 builds
-* [ARROW-4758](https://issues.apache.org/jira/browse/ARROW-4758) - [Flight] Build fails on Mac due to missing Schema\_generated.h
-* [ARROW-4769](https://issues.apache.org/jira/browse/ARROW-4769) - [Rust] Improve array limit function where max records \> len
-* [ARROW-4772](https://issues.apache.org/jira/browse/ARROW-4772) - Provide new ORC adapter interface that allows users to specify row number
-* [ARROW-4776](https://issues.apache.org/jira/browse/ARROW-4776) - [C++] DictionaryBuilder should support bootstrapping from an existing dict type
-* [ARROW-4777](https://issues.apache.org/jira/browse/ARROW-4777) - [C++/Python] manylinux1: Update lz4 to 1.8.3
-* [ARROW-4778](https://issues.apache.org/jira/browse/ARROW-4778) - [C++/Python] manylinux1: Update Thrift to 0.12.0
-* [ARROW-4782](https://issues.apache.org/jira/browse/ARROW-4782) - [C++] Prototype scalar and array expression types for developing deferred operator algebra
-* [ARROW-4786](https://issues.apache.org/jira/browse/ARROW-4786) - [C++/Python] Support better parallelisation in manylinux1 base build
-* [ARROW-4789](https://issues.apache.org/jira/browse/ARROW-4789) - [C++] Deprecate and later remove arrow::io::ReadableFileInterface
-* [ARROW-4790](https://issues.apache.org/jira/browse/ARROW-4790) - [Python/Packaging] Update manylinux docker image in crossbow task
-* [ARROW-4791](https://issues.apache.org/jira/browse/ARROW-4791) - Unused dependencies in arrow and datafusion
-* [ARROW-4794](https://issues.apache.org/jira/browse/ARROW-4794) - [Python] Make pandas an optional test dependency
-* [ARROW-4797](https://issues.apache.org/jira/browse/ARROW-4797) - [Plasma] Avoid store crash if not enough memory is available
-* [ARROW-4801](https://issues.apache.org/jira/browse/ARROW-4801) - [GLib] Suppress pkgconfig.generate() warnings
-* [ARROW-4808](https://issues.apache.org/jira/browse/ARROW-4808) - [Java][Vector] Convenience methods for setting decimal vector
-* [ARROW-4812](https://issues.apache.org/jira/browse/ARROW-4812) - [Rust] [DataFusion] Table.scan() should return one iterator per partition
-* [ARROW-4817](https://issues.apache.org/jira/browse/ARROW-4817) - [Rust] [DataFusion] Small re-org of modules
-* [ARROW-4818](https://issues.apache.org/jira/browse/ARROW-4818) - [Rust] [DataFusion] Parquet data source does not support null values
-* [ARROW-4826](https://issues.apache.org/jira/browse/ARROW-4826) - [Go] export Flush method for CSV writer
-* [ARROW-4831](https://issues.apache.org/jira/browse/ARROW-4831) - [C++] CMAKE\_AR is not passed to ZSTD thirdparty dependency
-* [ARROW-4833](https://issues.apache.org/jira/browse/ARROW-4833) - [Release] Document how to update the brew formula in the release management guide
-* [ARROW-4834](https://issues.apache.org/jira/browse/ARROW-4834) - [R] Feature flag to disable parquet
-* [ARROW-4835](https://issues.apache.org/jira/browse/ARROW-4835) - [GLib] Add boolean operations
-* [ARROW-4837](https://issues.apache.org/jira/browse/ARROW-4837) - [C++] Support c++filt on a custom path in the run-test.sh script
-* [ARROW-4839](https://issues.apache.org/jira/browse/ARROW-4839) - [C\#] Add NuGet support
-* [ARROW-4843](https://issues.apache.org/jira/browse/ARROW-4843) - [Rust] [DataFusion] Parquet data source should support DATE
-* [ARROW-4846](https://issues.apache.org/jira/browse/ARROW-4846) - [Java] Update Jackson to 2.9.8
-* [ARROW-4849](https://issues.apache.org/jira/browse/ARROW-4849) - [C++] Add docker-compose entry for testing Ubuntu Bionic build with system packages
-* [ARROW-4854](https://issues.apache.org/jira/browse/ARROW-4854) - [Rust] Use Array Slice for limit kernel
-* [ARROW-4855](https://issues.apache.org/jira/browse/ARROW-4855) - [Packaging] Generate default package version based on cpp tags in crossbow.py
-* [ARROW-4858](https://issues.apache.org/jira/browse/ARROW-4858) - [Flight][Python] Enable custom FlightDataStream in Python
-* [ARROW-4859](https://issues.apache.org/jira/browse/ARROW-4859) - [GLib] Add garrow\_numeric\_array\_mean()
-* [ARROW-4862](https://issues.apache.org/jira/browse/ARROW-4862) - [GLib] Add GArrowCastOptions::allow-invalid-utf8 property
-* [ARROW-4865](https://issues.apache.org/jira/browse/ARROW-4865) - [Rust] Support casting lists and primitives to lists
-* [ARROW-4873](https://issues.apache.org/jira/browse/ARROW-4873) - [C++] Clarify documentation about how to use external ARROW\_PACKAGE\_PREFIX while also using CONDA dependency resolution
-* [ARROW-4878](https://issues.apache.org/jira/browse/ARROW-4878) - [C++] ARROW\_DEPENDENCY\_SOURCE=CONDA does not work properly with MSVC
-* [ARROW-4882](https://issues.apache.org/jira/browse/ARROW-4882) - [GLib] Add "Sum" functions
-* [ARROW-4887](https://issues.apache.org/jira/browse/ARROW-4887) - [GLib] Add garrow\_array\_count()
-* [ARROW-4889](https://issues.apache.org/jira/browse/ARROW-4889) - [C++] Add STATUS messages for Protobuf in CMake
-* [ARROW-4891](https://issues.apache.org/jira/browse/ARROW-4891) - [C++] ZLIB include directories not added
-* [ARROW-4892](https://issues.apache.org/jira/browse/ARROW-4892) - [Rust] [DataFusion] Move SQL parser and planner into sql package
-* [ARROW-4893](https://issues.apache.org/jira/browse/ARROW-4893) - [C++] conda packages should use $PREFIX inside of conda-build
-* [ARROW-4894](https://issues.apache.org/jira/browse/ARROW-4894) - [Rust] [DataFusion] Remove all uses of panic! from aggregate.rs
-* [ARROW-4895](https://issues.apache.org/jira/browse/ARROW-4895) - [Rust] [DataFusion] Move error.rs to top level package
-* [ARROW-4896](https://issues.apache.org/jira/browse/ARROW-4896) - [Rust] [DataFusion] Remove all uses of panic! from tests
-* [ARROW-4897](https://issues.apache.org/jira/browse/ARROW-4897) - [Rust] [DataFusion] Improve Rustdoc
-* [ARROW-4898](https://issues.apache.org/jira/browse/ARROW-4898) - [C++] Old versions of FindProtobuf.cmake use ALL-CAPS for variables
-* [ARROW-4899](https://issues.apache.org/jira/browse/ARROW-4899) - [Rust] [DataFusion] Remove all uses of panic! from expression.rs
-* [ARROW-4901](https://issues.apache.org/jira/browse/ARROW-4901) - [Go] Run tests in Appveyor
-* [ARROW-4905](https://issues.apache.org/jira/browse/ARROW-4905) - [C++][Plasma] Remove dlmalloc from client library
-* [ARROW-4907](https://issues.apache.org/jira/browse/ARROW-4907) - [CI] Add docker container to inspect docker context
-* [ARROW-4908](https://issues.apache.org/jira/browse/ARROW-4908) - [Rust] [DataFusion] Add support for parquet date/time in int32/64 encoding
-* [ARROW-4909](https://issues.apache.org/jira/browse/ARROW-4909) - [CI] Use hadolint to lint Dockerfiles
-* [ARROW-4910](https://issues.apache.org/jira/browse/ARROW-4910) - [Rust] [DataFusion] Remove all uses of unimplemented!
-* [ARROW-4915](https://issues.apache.org/jira/browse/ARROW-4915) - [GLib] Add support for arrow::NullBuilder
-* [ARROW-4922](https://issues.apache.org/jira/browse/ARROW-4922) - [Packaging] Use system libraries for .deb and .rpm
-* [ARROW-4924](https://issues.apache.org/jira/browse/ARROW-4924) - [Ruby] Add Decimal128\#to\_s(scale=nil)
-* [ARROW-4925](https://issues.apache.org/jira/browse/ARROW-4925) - [Rust] [DataFusion] Remove duplicate implementations of collect\_expr
-* [ARROW-4926](https://issues.apache.org/jira/browse/ARROW-4926) - [Rust] [DataFusion] Update README for 0.13.0 release
-* [ARROW-4929](https://issues.apache.org/jira/browse/ARROW-4929) - [GLib] Add garrow\_array\_count\_values()
-* [ARROW-4932](https://issues.apache.org/jira/browse/ARROW-4932) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE macro
-* [ARROW-4933](https://issues.apache.org/jira/browse/ARROW-4933) - [R] Autodetect Parquet support using pkg-config
-* [ARROW-4937](https://issues.apache.org/jira/browse/ARROW-4937) - [R] Clean pkg-config related logic
-* [ARROW-4939](https://issues.apache.org/jira/browse/ARROW-4939) - [Python] Add wrapper for "sum" kernel
-* [ARROW-4940](https://issues.apache.org/jira/browse/ARROW-4940) - [Rust] Enhance documentation for datafusion
-* [ARROW-4944](https://issues.apache.org/jira/browse/ARROW-4944) - [C++] Raise minimum required thrift-cpp to 0.11 in conda environment
-* [ARROW-4946](https://issues.apache.org/jira/browse/ARROW-4946) - [C++] Support detection of flatbuffers without FlatbuffersConfig.cmake
-* [ARROW-4947](https://issues.apache.org/jira/browse/ARROW-4947) - [Flight][C++/Python] Remove redundant schema parameter in DoGet
-* [ARROW-4951](https://issues.apache.org/jira/browse/ARROW-4951) - [C++] Turn off cpp benchmarks in cpp docker images
-* [ARROW-4955](https://issues.apache.org/jira/browse/ARROW-4955) - [GLib] Add garrow\_file\_is\_closed()
-* [ARROW-4964](https://issues.apache.org/jira/browse/ARROW-4964) - [Ruby] Add closed check if available on auto close
-* [ARROW-4969](https://issues.apache.org/jira/browse/ARROW-4969) - [C++] Set RPATH in correct order for test executables on OSX
-* [ARROW-4977](https://issues.apache.org/jira/browse/ARROW-4977) - [Ruby] Add support for building on Windows
-* [ARROW-4978](https://issues.apache.org/jira/browse/ARROW-4978) - [Ruby] Fix wrong internal variable name for table data
-* [ARROW-4979](https://issues.apache.org/jira/browse/ARROW-4979) - [GLib] Add missing lock to garrow::GIOInputStream
-* [ARROW-4980](https://issues.apache.org/jira/browse/ARROW-4980) - [GLib] Use GInputStream as the parent of GArrowInputStream
-* [ARROW-4981](https://issues.apache.org/jira/browse/ARROW-4981) - [Ruby] Add support for CSV data encoding conversion
-* [ARROW-4983](https://issues.apache.org/jira/browse/ARROW-4983) - [Plasma] Unmap memory when the client is destroyed
-* [ARROW-4994](https://issues.apache.org/jira/browse/ARROW-4994) - [website] Update Details for ptgoetz
-* [ARROW-4995](https://issues.apache.org/jira/browse/ARROW-4995) - [R] Make sure winbuilder tests pass for package
-* [ARROW-4996](https://issues.apache.org/jira/browse/ARROW-4996) - [Plasma] There are many log files in /tmp
-* [ARROW-5003](https://issues.apache.org/jira/browse/ARROW-5003) - [R] remove dependency on withr
-* [ARROW-5006](https://issues.apache.org/jira/browse/ARROW-5006) - [R] parquet.cpp does not include enough Rcpp
-* [ARROW-5011](https://issues.apache.org/jira/browse/ARROW-5011) - [Release] Add support in the source release script for custom hash
-* [ARROW-5013](https://issues.apache.org/jira/browse/ARROW-5013) - [Rust] [DataFusion] Refactor runtime expression support
-* [ARROW-5014](https://issues.apache.org/jira/browse/ARROW-5014) - [Java] Fix typos in Flight module
-* [ARROW-5018](https://issues.apache.org/jira/browse/ARROW-5018) - [Release] Include JavaScript implementation
-* [ARROW-5032](https://issues.apache.org/jira/browse/ARROW-5032) - [C++] Headers in vendored/datetime directory aren't installed
-* [ARROW-5041](https://issues.apache.org/jira/browse/ARROW-5041) - [Release][C++] use bundled gtest and gmock in verify-release-candidate.bat
-* [ARROW-5075](https://issues.apache.org/jira/browse/ARROW-5075) - [Release] Add 0.13.0 release note
-* [ARROW-5084](https://issues.apache.org/jira/browse/ARROW-5084) - [Website] Blog post / release announcement for 0.13.0
-* [PARQUET-1477](https://issues.apache.org/jira/browse/PARQUET-1477) - Thrift crypto updates
-* [PARQUET-1508](https://issues.apache.org/jira/browse/PARQUET-1508) - [C++] Enable reading from ByteArray and FixedLenByteArray decoders directly into arrow::BinaryBuilder or arrow::BinaryDictionaryBuilder
-* [PARQUET-1519](https://issues.apache.org/jira/browse/PARQUET-1519) - [C++] Remove use of "extern template class" from parquet/column\_reader.h
-* [PARQUET-1521](https://issues.apache.org/jira/browse/PARQUET-1521) - [C++] Do not use "extern template class" with parquet::ColumnWriter
-* [PARQUET-1525](https://issues.apache.org/jira/browse/PARQUET-1525) - [C++] remove dependency on getopt in parquet tools
-
-
-
-# Apache Arrow 0.12.1 (2019-02-25)
-
-## Bug Fixes
-
-* [ARROW-3564](https://issues.apache.org/jira/browse/ARROW-3564) - [Python] writing version 2.0 parquet format with dictionary encoding enabled
-* [ARROW-4255](https://issues.apache.org/jira/browse/ARROW-4255) - [C++] Schema::GetFieldIndex is not thread-safe
-* [ARROW-4267](https://issues.apache.org/jira/browse/ARROW-4267) - [Python/C++][Parquet] Segfault when reading rowgroups with duplicated columns
-* [ARROW-4323](https://issues.apache.org/jira/browse/ARROW-4323) - [Packaging] Fix failing OSX clang conda forge builds
-* [ARROW-4367](https://issues.apache.org/jira/browse/ARROW-4367) - [C++] StringDictionaryBuilder segfaults on Finish with only null entries
-* [ARROW-4374](https://issues.apache.org/jira/browse/ARROW-4374) - [C++] DictionaryBuilder does not correctly report length and null\_count
-* [ARROW-4492](https://issues.apache.org/jira/browse/ARROW-4492) - [Python] Failure reading Parquet column as pandas Categorical in 0.12
-* [ARROW-4501](https://issues.apache.org/jira/browse/ARROW-4501) - [C++] Unique returns non-unique strings
-* [ARROW-4582](https://issues.apache.org/jira/browse/ARROW-4582) - [C++/Python] Memory corruption on Pandas-\>Arrow conversion
-* [ARROW-4629](https://issues.apache.org/jira/browse/ARROW-4629) - [Python] Pandas to arrow conversion slowed down by local imports
-* [ARROW-4636](https://issues.apache.org/jira/browse/ARROW-4636) - [Python/Packaging] Crossbow builds for conda-osx fail on upload with Ruby linkage errors
-* [ARROW-4647](https://issues.apache.org/jira/browse/ARROW-4647) - [Packaging] dev/release/00-prepare.sh fails for minor version changes
-
-
-## New Features and Improvements
-
-* [ARROW-4291](https://issues.apache.org/jira/browse/ARROW-4291) - [Dev] Support selecting features in release scripts
-* [ARROW-4298](https://issues.apache.org/jira/browse/ARROW-4298) - [Java] Building Flight fails with OpenJDK 11
-* [ARROW-4373](https://issues.apache.org/jira/browse/ARROW-4373) - [Packaging] Travis fails to deploy conda packages on OSX
-
-
-
-# Apache Arrow 0.12.0 (2019-01-20)
-
-## New Features and Improvements
-
-* [ARROW-45](https://issues.apache.org/jira/browse/ARROW-45) - [Python] Add unnest/flatten function for List types
-* [ARROW-536](https://issues.apache.org/jira/browse/ARROW-536) - [C++] Provide non-SSE4 versions of functions that use CPU intrinsics for older processors
-* [ARROW-554](https://issues.apache.org/jira/browse/ARROW-554) - [C++] Implement functions to conform unequal dictionaries amongst multiple Arrow arrays
-* [ARROW-766](https://issues.apache.org/jira/browse/ARROW-766) - [C++] Introduce zero-copy "StringPiece" type
-* [ARROW-854](https://issues.apache.org/jira/browse/ARROW-854) - [Format] Support sparse tensor
-* [ARROW-912](https://issues.apache.org/jira/browse/ARROW-912) - [Python] Account for multiarch systems in development.rst
-* [ARROW-1019](https://issues.apache.org/jira/browse/ARROW-1019) - [C++] Implement input stream and output stream with Gzip codec
-* [ARROW-1055](https://issues.apache.org/jira/browse/ARROW-1055) - [C++] GPU support library development
-* [ARROW-1262](https://issues.apache.org/jira/browse/ARROW-1262) - [Packaging] Packaging automation in arrow-dist
-* [ARROW-1423](https://issues.apache.org/jira/browse/ARROW-1423) - [C++] Create non-owned CudaContext from context handle provided by thirdparty user
-* [ARROW-1492](https://issues.apache.org/jira/browse/ARROW-1492) - [C++] Type casting function kernel suite
-* [ARROW-1688](https://issues.apache.org/jira/browse/ARROW-1688) - [Java] Fail build on checkstyle warnings
-* [ARROW-1696](https://issues.apache.org/jira/browse/ARROW-1696) - [C++] Add codec benchmarks
-* [ARROW-1822](https://issues.apache.org/jira/browse/ARROW-1822) - [C++] Add SSE4.2-accelerated hash kernels and use if host CPU supports
-* [ARROW-1993](https://issues.apache.org/jira/browse/ARROW-1993) - [Python] Add function for determining implied Arrow schema from pandas.DataFrame
-* [ARROW-1994](https://issues.apache.org/jira/browse/ARROW-1994) - [Python] Test against Pandas master
-* [ARROW-2183](https://issues.apache.org/jira/browse/ARROW-2183) - [C++] Add helper CMake function for globbing the right header files
-* [ARROW-2211](https://issues.apache.org/jira/browse/ARROW-2211) - [C++] Use simpler hash functions for integers
-* [ARROW-2216](https://issues.apache.org/jira/browse/ARROW-2216) - [CI] CI descriptions and envars are misleading
-* [ARROW-2337](https://issues.apache.org/jira/browse/ARROW-2337) - [Scripts] Windows release verification script should use boost DSOs instead of static linkage
-* [ARROW-2374](https://issues.apache.org/jira/browse/ARROW-2374) - [Rust] Add support for array of List<T\>
-* [ARROW-2475](https://issues.apache.org/jira/browse/ARROW-2475) - [Format] Confusing array length description
-* [ARROW-2476](https://issues.apache.org/jira/browse/ARROW-2476) - [Python/Question] Maximum length of an Array created from ndarray
-* [ARROW-2483](https://issues.apache.org/jira/browse/ARROW-2483) - [Rust] use bit-packing for boolean vectors
-* [ARROW-2504](https://issues.apache.org/jira/browse/ARROW-2504) - [Website] Add ApacheCon NA link
-* [ARROW-2535](https://issues.apache.org/jira/browse/ARROW-2535) - [Python] Provide pre-commit hooks that check flake8
-* [ARROW-2560](https://issues.apache.org/jira/browse/ARROW-2560) - [Rust] The Rust README should include Rust-specific information on contributing
-* [ARROW-2624](https://issues.apache.org/jira/browse/ARROW-2624) - [Python] Random schema and data generator for Arrow conversion and Parquet testing
-* [ARROW-2637](https://issues.apache.org/jira/browse/ARROW-2637) - [C++/Python] Build support and instructions for development on Alpine Linux
-* [ARROW-2648](https://issues.apache.org/jira/browse/ARROW-2648) - [Packaging] Follow up packaging tasks
-* [ARROW-2653](https://issues.apache.org/jira/browse/ARROW-2653) - [C++] Refactor hash table support
-* [ARROW-2670](https://issues.apache.org/jira/browse/ARROW-2670) - [C++/Python] Add Ubuntu 18.04 / gcc7 as a nightly build
-* [ARROW-2673](https://issues.apache.org/jira/browse/ARROW-2673) - [Python] Add documentation + docstring for ARROW-2661
-* [ARROW-2684](https://issues.apache.org/jira/browse/ARROW-2684) - [Python] Various documentation improvements
-* [ARROW-2712](https://issues.apache.org/jira/browse/ARROW-2712) - [C\#] Initial C\# .NET library
-* [ARROW-2720](https://issues.apache.org/jira/browse/ARROW-2720) - [C++] Clean up cmake CXX\_STANDARD and PIC flag setting
-* [ARROW-2759](https://issues.apache.org/jira/browse/ARROW-2759) - Export notification socket of Plasma
-* [ARROW-2803](https://issues.apache.org/jira/browse/ARROW-2803) - [C++] Put hashing function into src/arrow/util
-* [ARROW-2807](https://issues.apache.org/jira/browse/ARROW-2807) - [Python] Enable memory-mapping to be toggled in get\_reader when reading Parquet files
-* [ARROW-2808](https://issues.apache.org/jira/browse/ARROW-2808) - [Python] Add unit tests for ProxyMemoryPool, enable new default MemoryPool to be constructed
-* [ARROW-2919](https://issues.apache.org/jira/browse/ARROW-2919) - [C++] Improve error message when listing empty HDFS file
-* [ARROW-2968](https://issues.apache.org/jira/browse/ARROW-2968) - [R] Multi-threaded conversion from Arrow table to R data.frame
-* [ARROW-2995](https://issues.apache.org/jira/browse/ARROW-2995) - [CI] Build Python libraries in same run when running C++ unit tests so project does not need to be rebuilt again right away
-* [ARROW-3020](https://issues.apache.org/jira/browse/ARROW-3020) - [Python] Addition of option to allow empty Parquet row groups
-* [ARROW-3038](https://issues.apache.org/jira/browse/ARROW-3038) - [Go] add support for StringArray
-* [ARROW-3063](https://issues.apache.org/jira/browse/ARROW-3063) - [Go] move list of supported/TODO features to confluence
-* [ARROW-3070](https://issues.apache.org/jira/browse/ARROW-3070) - [Release] Host binary artifacts for RCs and releases on ASF Bintray account instead of dist/mirror system
-* [ARROW-3108](https://issues.apache.org/jira/browse/ARROW-3108) - [C++] arrow::PrettyPrint for Table instances
-* [ARROW-3126](https://issues.apache.org/jira/browse/ARROW-3126) - [Python] Make Buffered\* IO classes available to Python, incorporate into input\_stream, output\_stream factory functions
-* [ARROW-3131](https://issues.apache.org/jira/browse/ARROW-3131) - [Go] add test for Go-1.11
-* [ARROW-3161](https://issues.apache.org/jira/browse/ARROW-3161) - [Packaging] Ensure to run pyarrow unit tests in conda and wheel builds
-* [ARROW-3169](https://issues.apache.org/jira/browse/ARROW-3169) - [C++] Break array-test.cc and array.cc into multiple compilation units
-* [ARROW-3184](https://issues.apache.org/jira/browse/ARROW-3184) - [C++] Add modular build targets, "all" target, and require explicit target when invoking make or ninja
-* [ARROW-3194](https://issues.apache.org/jira/browse/ARROW-3194) - [Java] Fix setValueCount in splitAndTransfer for variable width vectors
-* [ARROW-3199](https://issues.apache.org/jira/browse/ARROW-3199) - [Plasma] Check for EAGAIN in recvmsg and sendmsg
-* [ARROW-3209](https://issues.apache.org/jira/browse/ARROW-3209) - [C++] Rename libarrow\_gpu to libarrow\_cuda
-* [ARROW-3230](https://issues.apache.org/jira/browse/ARROW-3230) - [Python] Missing comparisons on ChunkedArray, Table
-* [ARROW-3233](https://issues.apache.org/jira/browse/ARROW-3233) - [Python] Sphinx documentation for pyarrow.cuda GPU support
-* [ARROW-3248](https://issues.apache.org/jira/browse/ARROW-3248) - [C++] Arrow tests should have label "arrow"
-* [ARROW-3254](https://issues.apache.org/jira/browse/ARROW-3254) - [C++] Add option to ADD\_ARROW\_TEST to compose a test executable from multiple .cc files containing unit tests
-* [ARROW-3260](https://issues.apache.org/jira/browse/ARROW-3260) - [CI] Make linting a separate job
-* [ARROW-3272](https://issues.apache.org/jira/browse/ARROW-3272) - [Java] Document checkstyle deviations from Google style guide
-* [ARROW-3273](https://issues.apache.org/jira/browse/ARROW-3273) - [Java] checkstyle - fix javadoc style
-* [ARROW-3278](https://issues.apache.org/jira/browse/ARROW-3278) - [Python] Retrieve StructType's and StructArray's field by name
-* [ARROW-3291](https://issues.apache.org/jira/browse/ARROW-3291) - [C++] Convenience API for constructing arrow::io::BufferReader from std::string
-* [ARROW-3293](https://issues.apache.org/jira/browse/ARROW-3293) - [C++] Test Flight RPC in Travis CI
-* [ARROW-3296](https://issues.apache.org/jira/browse/ARROW-3296) - [Python] Add Flight support to manylinux1 wheels
-* [ARROW-3303](https://issues.apache.org/jira/browse/ARROW-3303) - [C++] Enable example arrays to be written with a simplified JSON representation
-* [ARROW-3306](https://issues.apache.org/jira/browse/ARROW-3306) - [R] Objects and support functions for different kinds of arrow::Buffer
-* [ARROW-3307](https://issues.apache.org/jira/browse/ARROW-3307) - [R] Convert chunked arrow::Column to R vector
-* [ARROW-3310](https://issues.apache.org/jira/browse/ARROW-3310) - [R] Create wrapper classes for various Arrow IO interfaces
-* [ARROW-3312](https://issues.apache.org/jira/browse/ARROW-3312) - [R] Use same .clang-format file for both R binding C++ code and main C++ codebase
-* [ARROW-3315](https://issues.apache.org/jira/browse/ARROW-3315) - [R] Support for multi-threaded conversions from RecordBatch, Table to R data.frame
-* [ARROW-3318](https://issues.apache.org/jira/browse/ARROW-3318) - [C++] Convenience method for reading all batches from an IPC stream or file as arrow::Table
-* [ARROW-3323](https://issues.apache.org/jira/browse/ARROW-3323) - [Java] checkstyle - fix naming
-* [ARROW-3331](https://issues.apache.org/jira/browse/ARROW-3331) - [C++] Add re2 to ThirdpartyToolchain
-* [ARROW-3340](https://issues.apache.org/jira/browse/ARROW-3340) - [R] support for dates and time classes
-* [ARROW-3347](https://issues.apache.org/jira/browse/ARROW-3347) - [Rust] Implement PrimitiveArrayBuilder
-* [ARROW-3353](https://issues.apache.org/jira/browse/ARROW-3353) - [Packaging] Build python 3.7 wheels
-* [ARROW-3355](https://issues.apache.org/jira/browse/ARROW-3355) - [R] Support for factors
-* [ARROW-3358](https://issues.apache.org/jira/browse/ARROW-3358) - [Gandiva][C++] Replace usages of gandiva/status.h with arrow/status.h
-* [ARROW-3362](https://issues.apache.org/jira/browse/ARROW-3362) - [R] Guard against null buffers
-* [ARROW-3366](https://issues.apache.org/jira/browse/ARROW-3366) - [R] Dockerfile for docker-compose setup
-* [ARROW-3368](https://issues.apache.org/jira/browse/ARROW-3368) - [Integration/CI/Python] Add dask integration test to docker-compose setup
-* [ARROW-3380](https://issues.apache.org/jira/browse/ARROW-3380) - [Python] Support reading CSV files and more from a gzipped file
-* [ARROW-3381](https://issues.apache.org/jira/browse/ARROW-3381) - [C++] Implement InputStream for bz2 files
-* [ARROW-3383](https://issues.apache.org/jira/browse/ARROW-3383) - [Java] Run Gandiva tests in Travis CI
-* [ARROW-3384](https://issues.apache.org/jira/browse/ARROW-3384) - [Gandiva] Sync remaining commits from gandiva repo
-* [ARROW-3385](https://issues.apache.org/jira/browse/ARROW-3385) - [Java] [Gandiva] Deploy gandiva snapshot jars automatically
-* [ARROW-3387](https://issues.apache.org/jira/browse/ARROW-3387) - [C++] Function to cast binary to string/utf8 with UTF8 validation
-* [ARROW-3398](https://issues.apache.org/jira/browse/ARROW-3398) - [Rust] Update existing Builder to use MutableBuffer internally
-* [ARROW-3402](https://issues.apache.org/jira/browse/ARROW-3402) - [Gandiva][C++] Utilize common bitmap operation implementations in precompiled IR routines
-* [ARROW-3407](https://issues.apache.org/jira/browse/ARROW-3407) - [C++] Add UTF8 conversion modes in CSV reader conversion options
-* [ARROW-3409](https://issues.apache.org/jira/browse/ARROW-3409) - [C++] Add streaming compression interfaces
-* [ARROW-3421](https://issues.apache.org/jira/browse/ARROW-3421) - [C++] Add include-what-you-use setup to primary docker-compose.yml
-* [ARROW-3427](https://issues.apache.org/jira/browse/ARROW-3427) - [C++] Add Windows support, Unix static libs for double-conversion package in conda-forge
-* [ARROW-3429](https://issues.apache.org/jira/browse/ARROW-3429) - [Packaging] Add a script to release binaries that use source archive at dist.apache.org
-* [ARROW-3430](https://issues.apache.org/jira/browse/ARROW-3430) - [Packaging] Add workaround to verify 0.11.0
-* [ARROW-3431](https://issues.apache.org/jira/browse/ARROW-3431) - [GLib] Include Gemfile to archive
-* [ARROW-3432](https://issues.apache.org/jira/browse/ARROW-3432) - [Packaging] Variables aren't expanded in Subversion commit message
-* [ARROW-3433](https://issues.apache.org/jira/browse/ARROW-3433) - [C++] Validate re2 with Windows toolchain, EP
-* [ARROW-3439](https://issues.apache.org/jira/browse/ARROW-3439) - [R] R language bindings for Feather format
-* [ARROW-3440](https://issues.apache.org/jira/browse/ARROW-3440) - [Gandiva][C++] Remove outdated cpp/src/gandiva/README.md, add build documentation to cpp/README.md
-* [ARROW-3441](https://issues.apache.org/jira/browse/ARROW-3441) - [Gandiva][C++] Produce fewer test executables
-* [ARROW-3442](https://issues.apache.org/jira/browse/ARROW-3442) - [C++] Use dynamic linking for unit tests, ensure coverage working properly with clang
-* [ARROW-3450](https://issues.apache.org/jira/browse/ARROW-3450) - [R] Wrap MemoryMappedFile class
-* [ARROW-3451](https://issues.apache.org/jira/browse/ARROW-3451) - [Python] Allocate CUDA memory from a CUcontext created by numba.cuda
-* [ARROW-3455](https://issues.apache.org/jira/browse/ARROW-3455) - [Gandiva][C++] Support pkg-config for Gandiva
-* [ARROW-3456](https://issues.apache.org/jira/browse/ARROW-3456) - [CI] Reuse docker images and optimize docker-compose containers
-* [ARROW-3460](https://issues.apache.org/jira/browse/ARROW-3460) - [Packaging] Add a script to rebase master on local release branch
-* [ARROW-3461](https://issues.apache.org/jira/browse/ARROW-3461) - [Packaging] Add a script to upload RC artifacts as the official release
-* [ARROW-3462](https://issues.apache.org/jira/browse/ARROW-3462) - [Packaging] Update CHANGELOG for 0.11.0
-* [ARROW-3463](https://issues.apache.org/jira/browse/ARROW-3463) - [Website] Update for 0.11.0
-* [ARROW-3464](https://issues.apache.org/jira/browse/ARROW-3464) - [Packaging] Build shared libraries for gandiva fat JAR via crossbow
-* [ARROW-3465](https://issues.apache.org/jira/browse/ARROW-3465) - [Documentation] Fix gen\_apidocs' docker image
-* [ARROW-3469](https://issues.apache.org/jira/browse/ARROW-3469) - [Gandiva] add travis entry for gandiva on OSX
-* [ARROW-3472](https://issues.apache.org/jira/browse/ARROW-3472) - [Gandiva] remove gandiva helpers library
-* [ARROW-3473](https://issues.apache.org/jira/browse/ARROW-3473) - [Format] Update Layout.md document to clarify use of 64-bit array lengths
-* [ARROW-3474](https://issues.apache.org/jira/browse/ARROW-3474) - [GLib] Extend gparquet API with get\_schema and read\_column
-* [ARROW-3479](https://issues.apache.org/jira/browse/ARROW-3479) - [R] Support to write record\_batch as stream
-* [ARROW-3482](https://issues.apache.org/jira/browse/ARROW-3482) - [C++] Build with JEMALLOC by default
-* [ARROW-3487](https://issues.apache.org/jira/browse/ARROW-3487) - [Gandiva] simplify NULL\_IF\_NULL functions that can return errors
-* [ARROW-3488](https://issues.apache.org/jira/browse/ARROW-3488) - [Packaging] Separate crossbow task definition files for packaging and tests
-* [ARROW-3489](https://issues.apache.org/jira/browse/ARROW-3489) - [Gandiva] Support for in expressions
-* [ARROW-3490](https://issues.apache.org/jira/browse/ARROW-3490) - [R] streaming arrow objects to output streams
-* [ARROW-3492](https://issues.apache.org/jira/browse/ARROW-3492) - [C++] Build jemalloc in parallel
-* [ARROW-3493](https://issues.apache.org/jira/browse/ARROW-3493) - [Java] Document BOUNDS\_CHECKING\_ENABLED
-* [ARROW-3499](https://issues.apache.org/jira/browse/ARROW-3499) - [R] Expose arrow::ipc::Message type
-* [ARROW-3501](https://issues.apache.org/jira/browse/ARROW-3501) - [Gandiva] Enable building with gcc 4.8.x on Ubuntu Trusty, similar distros
-* [ARROW-3504](https://issues.apache.org/jira/browse/ARROW-3504) - [Plasma] Add support for Plasma Client to put/get raw bytes without pyarrow serialization.
-* [ARROW-3505](https://issues.apache.org/jira/browse/ARROW-3505) - [R] Read record batch and table
-* [ARROW-3506](https://issues.apache.org/jira/browse/ARROW-3506) - [Packaging] Nightly tests for docker-compose images
-* [ARROW-3508](https://issues.apache.org/jira/browse/ARROW-3508) - [C++] Build against double-conversion from conda-forge
-* [ARROW-3515](https://issues.apache.org/jira/browse/ARROW-3515) - Introduce NumericTensor class
-* [ARROW-3518](https://issues.apache.org/jira/browse/ARROW-3518) - [C++] Detect HOMEBREW\_PREFIX automatically
-* [ARROW-3519](https://issues.apache.org/jira/browse/ARROW-3519) - [Gandiva] Add support for functions that can return variable len output
-* [ARROW-3521](https://issues.apache.org/jira/browse/ARROW-3521) - [GLib] Run Python using find\_program in meson.build
-* [ARROW-3529](https://issues.apache.org/jira/browse/ARROW-3529) - [Ruby] Import Red Parquet
-* [ARROW-3530](https://issues.apache.org/jira/browse/ARROW-3530) - [Java/Python] Add conversion for pyarrow.Schema from org.apache…pojo.Schema
-* [ARROW-3533](https://issues.apache.org/jira/browse/ARROW-3533) - [Python/Documentation] Use sphinx\_rtd\_theme instead of Bootstrap
-* [ARROW-3536](https://issues.apache.org/jira/browse/ARROW-3536) - [C++] Fast UTF8 validation functions
-* [ARROW-3537](https://issues.apache.org/jira/browse/ARROW-3537) - [Rust] Implement Tensor Type
-* [ARROW-3539](https://issues.apache.org/jira/browse/ARROW-3539) - [CI/Packaging] Update scripts to build against vendored jemalloc
-* [ARROW-3540](https://issues.apache.org/jira/browse/ARROW-3540) - [Rust] Incorporate BooleanArray into PrimitiveArray
-* [ARROW-3542](https://issues.apache.org/jira/browse/ARROW-3542) - [C++] Use unsafe appends when building array from CSV
-* [ARROW-3545](https://issues.apache.org/jira/browse/ARROW-3545) - [C++/Python] Normalize child/field terminology with StructType
-* [ARROW-3547](https://issues.apache.org/jira/browse/ARROW-3547) - [R] Protect against Null crash when reading from RecordBatch
-* [ARROW-3548](https://issues.apache.org/jira/browse/ARROW-3548) - Speed up storing small objects in the object store.
-* [ARROW-3551](https://issues.apache.org/jira/browse/ARROW-3551) - Change MapD to OmniSci on Powered By page
-* [ARROW-3553](https://issues.apache.org/jira/browse/ARROW-3553) - [R] Error when losing data on int64, uint64 conversions to double
-* [ARROW-3555](https://issues.apache.org/jira/browse/ARROW-3555) - [Plasma] Unify plasma client get function using metadata.
-* [ARROW-3556](https://issues.apache.org/jira/browse/ARROW-3556) - [CI] Disable optimizations on Windows
-* [ARROW-3557](https://issues.apache.org/jira/browse/ARROW-3557) - [Python] Set language\_level in Cython sources
-* [ARROW-3558](https://issues.apache.org/jira/browse/ARROW-3558) - [Plasma] Remove fatal error when plasma client calls get on an unsealed object that it created.
-* [ARROW-3559](https://issues.apache.org/jira/browse/ARROW-3559) - Statically link libraries for plasma\_store\_server executable.
-* [ARROW-3562](https://issues.apache.org/jira/browse/ARROW-3562) - [R] Disallow creation of objects with null shared\_ptr<T\>
-* [ARROW-3563](https://issues.apache.org/jira/browse/ARROW-3563) - [C++] Declare public link dependencies so arrow\_static, plasma\_static automatically pull in transitive dependencies
-* [ARROW-3566](https://issues.apache.org/jira/browse/ARROW-3566) - Clarify that the type of a dictionary-encoded field should be the encoded (index) type
-* [ARROW-3567](https://issues.apache.org/jira/browse/ARROW-3567) - [Gandiva] [GLib] Add GLib bindings of Gandiva
-* [ARROW-3568](https://issues.apache.org/jira/browse/ARROW-3568) - [Packaging] Run pyarrow unittests for windows wheels
-* [ARROW-3569](https://issues.apache.org/jira/browse/ARROW-3569) - [Packaging] Run pyarrow unittests when building conda package
-* [ARROW-3574](https://issues.apache.org/jira/browse/ARROW-3574) - Fix remaining bug with plasma static versus shared libraries.
-* [ARROW-3575](https://issues.apache.org/jira/browse/ARROW-3575) - [Python] New documentation page for CSV reader
-* [ARROW-3576](https://issues.apache.org/jira/browse/ARROW-3576) - [Python] Expose compressed file readers as NativeFile
-* [ARROW-3577](https://issues.apache.org/jira/browse/ARROW-3577) - [Go] add support for ChunkedArray
-* [ARROW-3581](https://issues.apache.org/jira/browse/ARROW-3581) - [Gandiva][C++] ARROW\_PROTOBUF\_USE\_SHARED isn't used
-* [ARROW-3582](https://issues.apache.org/jira/browse/ARROW-3582) - [CI] Gandiva C++ build is always triggered
-* [ARROW-3583](https://issues.apache.org/jira/browse/ARROW-3583) - [Python/Java] Create RecordBatch from VectorSchemaRoot
-* [ARROW-3584](https://issues.apache.org/jira/browse/ARROW-3584) - [Go] add support for Table
-* [ARROW-3587](https://issues.apache.org/jira/browse/ARROW-3587) - [Python] Efficient serialization for Arrow Objects (array, table, tensor, etc)
-* [ARROW-3588](https://issues.apache.org/jira/browse/ARROW-3588) - [Java] checkstyle - fix license
-* [ARROW-3589](https://issues.apache.org/jira/browse/ARROW-3589) - [Gandiva] Make it possible to compile gandiva without JNI
-* [ARROW-3591](https://issues.apache.org/jira/browse/ARROW-3591) - [R] Support to collect decimal type
-* [ARROW-3592](https://issues.apache.org/jira/browse/ARROW-3592) - [Python] Get BinaryArray value as zero copy memory view
-* [ARROW-3597](https://issues.apache.org/jira/browse/ARROW-3597) - [Gandiva] gandiva should integrate with ADD\_ARROW\_TEST for tests
-* [ARROW-3600](https://issues.apache.org/jira/browse/ARROW-3600) - [Packaging] Support Ubuntu 18.10
-* [ARROW-3601](https://issues.apache.org/jira/browse/ARROW-3601) - [Rust] Release 0.11.0
-* [ARROW-3602](https://issues.apache.org/jira/browse/ARROW-3602) - [Gandiva] [Python] Add preliminary Cython bindings for Gandiva
-* [ARROW-3603](https://issues.apache.org/jira/browse/ARROW-3603) - [Gandiva][C++] Can't build with vendored Boost
-* [ARROW-3605](https://issues.apache.org/jira/browse/ARROW-3605) - Remove AE library from plasma header files.
-* [ARROW-3607](https://issues.apache.org/jira/browse/ARROW-3607) - [Java] delete() method via JNI for plasma
-* [ARROW-3608](https://issues.apache.org/jira/browse/ARROW-3608) - [R] Support for time32 and time64 array types
-* [ARROW-3609](https://issues.apache.org/jira/browse/ARROW-3609) - [Gandiva] Move benchmark tests out of unit test
-* [ARROW-3610](https://issues.apache.org/jira/browse/ARROW-3610) - [C++] Add interface to turn stl\_allocator into arrow::MemoryPool
-* [ARROW-3611](https://issues.apache.org/jira/browse/ARROW-3611) - Give error more quickly when pyarrow serialization context is used incorrectly.
-* [ARROW-3612](https://issues.apache.org/jira/browse/ARROW-3612) - [Go] implement RecordBatch and RecordBatchReader
-* [ARROW-3615](https://issues.apache.org/jira/browse/ARROW-3615) - [R] Support for NaN
-* [ARROW-3616](https://issues.apache.org/jira/browse/ARROW-3616) - [Java] checkstyle - fix remaining coding checks
-* [ARROW-3618](https://issues.apache.org/jira/browse/ARROW-3618) - [Packaging/Documentation] Add \`-c conda-forge\` option to avoid PackagesNotFoundError
-* [ARROW-3620](https://issues.apache.org/jira/browse/ARROW-3620) - [Python] Document multithreading options in Sphinx and add to api.rst
-* [ARROW-3621](https://issues.apache.org/jira/browse/ARROW-3621) - [Go] implement TableBatchReader
-* [ARROW-3622](https://issues.apache.org/jira/browse/ARROW-3622) - [Go] implement Schema.Equal
-* [ARROW-3623](https://issues.apache.org/jira/browse/ARROW-3623) - [Go] implement Field.Equal
-* [ARROW-3624](https://issues.apache.org/jira/browse/ARROW-3624) - [Python/C++] Support for zero-sized device buffers
-* [ARROW-3625](https://issues.apache.org/jira/browse/ARROW-3625) - [Go] add examples for Table, Record and {Table,Record}Reader
-* [ARROW-3626](https://issues.apache.org/jira/browse/ARROW-3626) - [Go] add a CSV TableReader
-* [ARROW-3627](https://issues.apache.org/jira/browse/ARROW-3627) - [Go] add RecordBatchBuilder
-* [ARROW-3629](https://issues.apache.org/jira/browse/ARROW-3629) - [Python] Add write\_to\_dataset to Python Sphinx API listing
-* [ARROW-3630](https://issues.apache.org/jira/browse/ARROW-3630) - [Plasma] [GLib] Add GLib bindings of Plasma
-* [ARROW-3632](https://issues.apache.org/jira/browse/ARROW-3632) - [Packaging] Update deb names in dev/tasks/tasks.yml in dev/release/00-prepare.sh
-* [ARROW-3633](https://issues.apache.org/jira/browse/ARROW-3633) - [Packaging] Update deb names in dev/tasks/tasks.yml for 0.12.0
-* [ARROW-3636](https://issues.apache.org/jira/browse/ARROW-3636) - [C++/Python] Update arrow/python/pyarrow\_api.h
-* [ARROW-3638](https://issues.apache.org/jira/browse/ARROW-3638) - [C++][Python] Move reading from Feather as Table feature to C++ from Python
-* [ARROW-3639](https://issues.apache.org/jira/browse/ARROW-3639) - [Packaging] Run gandiva nightly packaging tasks
-* [ARROW-3640](https://issues.apache.org/jira/browse/ARROW-3640) - [Go] add support for Tensors
-* [ARROW-3641](https://issues.apache.org/jira/browse/ARROW-3641) - [C++/Python] remove public keyword from Cython api functions
-* [ARROW-3642](https://issues.apache.org/jira/browse/ARROW-3642) - [C++] Add arrowConfig.cmake generation
-* [ARROW-3644](https://issues.apache.org/jira/browse/ARROW-3644) - [Rust] Implement ListArrayBuilder
-* [ARROW-3645](https://issues.apache.org/jira/browse/ARROW-3645) - [Python] Document compression support in Sphinx
-* [ARROW-3646](https://issues.apache.org/jira/browse/ARROW-3646) - [Python] Add convenience factories to create IO streams
-* [ARROW-3647](https://issues.apache.org/jira/browse/ARROW-3647) - [R] Crash after unloading bit64 package
-* [ARROW-3648](https://issues.apache.org/jira/browse/ARROW-3648) - [Plasma] Add API to get metadata and data at the same time
-* [ARROW-3649](https://issues.apache.org/jira/browse/ARROW-3649) - [Rust] Refactor MutableBuffer's resize
-* [ARROW-3656](https://issues.apache.org/jira/browse/ARROW-3656) - [C++] Allow whitespace in numeric CSV fields
-* [ARROW-3657](https://issues.apache.org/jira/browse/ARROW-3657) - [R] Require bit64 package
-* [ARROW-3659](https://issues.apache.org/jira/browse/ARROW-3659) - [C++] Clang Travis build (matrix entry 2) might not actually be using clang
-* [ARROW-3660](https://issues.apache.org/jira/browse/ARROW-3660) - [C++] Don't unnecessarily lock MemoryMappedFile for resizing in readonly files
-* [ARROW-3661](https://issues.apache.org/jira/browse/ARROW-3661) - [Gandiva][GLib] Improve constant name
-* [ARROW-3662](https://issues.apache.org/jira/browse/ARROW-3662) - [C++] Add a const overload to MemoryMappedFile::GetSize
-* [ARROW-3664](https://issues.apache.org/jira/browse/ARROW-3664) - [Rust] Add benchmark for PrimitiveArrayBuilder
-* [ARROW-3665](https://issues.apache.org/jira/browse/ARROW-3665) - [Rust] Implement StructArrayBuilder
-* [ARROW-3666](https://issues.apache.org/jira/browse/ARROW-3666) - [C++] Improve CSV parser performance
-* [ARROW-3672](https://issues.apache.org/jira/browse/ARROW-3672) - [Go] implement Time32 array
-* [ARROW-3673](https://issues.apache.org/jira/browse/ARROW-3673) - [Go] implement Time64 array
-* [ARROW-3674](https://issues.apache.org/jira/browse/ARROW-3674) - [Go] implement Date32 array
-* [ARROW-3675](https://issues.apache.org/jira/browse/ARROW-3675) - [Go] implement Date64 array
-* [ARROW-3677](https://issues.apache.org/jira/browse/ARROW-3677) - [Go] implement FixedSizedBinary array
-* [ARROW-3681](https://issues.apache.org/jira/browse/ARROW-3681) - [Go] add benchmarks for CSV reader
-* [ARROW-3682](https://issues.apache.org/jira/browse/ARROW-3682) - [Go] unexport encoding/csv.Reader from CSV reader
-* [ARROW-3683](https://issues.apache.org/jira/browse/ARROW-3683) - [Go] add functional-option style to CSV reader
-* [ARROW-3684](https://issues.apache.org/jira/browse/ARROW-3684) - [Go] add chunk size option to CSV reader
-* [ARROW-3692](https://issues.apache.org/jira/browse/ARROW-3692) - [Gandiva] [Ruby] Add Ruby bindings of Gandiva
-* [ARROW-3693](https://issues.apache.org/jira/browse/ARROW-3693) - [R] Invalid buffer for empty characters with null data
-* [ARROW-3694](https://issues.apache.org/jira/browse/ARROW-3694) - [Java] Avoid superfluous string creation when logging level is disabled
-* [ARROW-3695](https://issues.apache.org/jira/browse/ARROW-3695) - [Gandiva] Use add\_arrow\_lib()
-* [ARROW-3696](https://issues.apache.org/jira/browse/ARROW-3696) - [C++] Add feather::TableWriter::Write(table)
-* [ARROW-3697](https://issues.apache.org/jira/browse/ARROW-3697) - [Ruby] Add schema\#[]
-* [ARROW-3701](https://issues.apache.org/jira/browse/ARROW-3701) - [Gandiva] Add support for decimal operations
-* [ARROW-3708](https://issues.apache.org/jira/browse/ARROW-3708) - [Packaging] Nightly CentOS builds are failing
-* [ARROW-3713](https://issues.apache.org/jira/browse/ARROW-3713) - [Rust] Implement BinaryArrayBuilder
-* [ARROW-3718](https://issues.apache.org/jira/browse/ARROW-3718) - [Gandiva] Remove spurious gtest include
-* [ARROW-3719](https://issues.apache.org/jira/browse/ARROW-3719) - [GLib] Support read/write table to/from Feather
-* [ARROW-3720](https://issues.apache.org/jira/browse/ARROW-3720) - [GLib] Use "indices" instead of "indexes"
-* [ARROW-3721](https://issues.apache.org/jira/browse/ARROW-3721) - [Gandiva] [Python] Support all Gandiva literals
-* [ARROW-3722](https://issues.apache.org/jira/browse/ARROW-3722) - [C++] Allow specifying column types to CSV reader
-* [ARROW-3723](https://issues.apache.org/jira/browse/ARROW-3723) - [Plasma] [Ruby] Add Ruby bindings of Plasma
-* [ARROW-3724](https://issues.apache.org/jira/browse/ARROW-3724) - [GLib] Update gitignore
-* [ARROW-3725](https://issues.apache.org/jira/browse/ARROW-3725) - [GLib] Add field readers to GArrowStructDataType
-* [ARROW-3726](https://issues.apache.org/jira/browse/ARROW-3726) - [Rust] CSV Reader & Writer
-* [ARROW-3727](https://issues.apache.org/jira/browse/ARROW-3727) - [Python] Document use of pyarrow.foreign\_buffer, cuda.foreign\_buffer in Sphinx
-* [ARROW-3731](https://issues.apache.org/jira/browse/ARROW-3731) - [R] R API for reading and writing Parquet files
-* [ARROW-3733](https://issues.apache.org/jira/browse/ARROW-3733) - [GLib] Add to\_string() to GArrowTable and GArrowColumn
-* [ARROW-3736](https://issues.apache.org/jira/browse/ARROW-3736) - [CI/Docker] Ninja test in docker-compose run cpp hangs
-* [ARROW-3738](https://issues.apache.org/jira/browse/ARROW-3738) - [C++] Add CSV conversion option to parse ISO8601-like timestamp strings
-* [ARROW-3741](https://issues.apache.org/jira/browse/ARROW-3741) - [R] Add support for arrow::compute::Cast to convert Arrow arrays from one type to another
-* [ARROW-3743](https://issues.apache.org/jira/browse/ARROW-3743) - [Ruby] Add support for saving/loading Feather
-* [ARROW-3744](https://issues.apache.org/jira/browse/ARROW-3744) - [Ruby] Use garrow\_table\_to\_string() in Arrow::Table\#to\_s
-* [ARROW-3746](https://issues.apache.org/jira/browse/ARROW-3746) - [Gandiva] [Python] Make it possible to list all functions registered with Gandiva
-* [ARROW-3747](https://issues.apache.org/jira/browse/ARROW-3747) - [C++] Flip order of data members in arrow::Decimal128
-* [ARROW-3748](https://issues.apache.org/jira/browse/ARROW-3748) - [GLib] Add GArrowCSVReader
-* [ARROW-3749](https://issues.apache.org/jira/browse/ARROW-3749) - [GLib] Typos in documentation and test case name
-* [ARROW-3751](https://issues.apache.org/jira/browse/ARROW-3751) - [Python] Add more cython bindings for gandiva
-* [ARROW-3752](https://issues.apache.org/jira/browse/ARROW-3752) - [C++] Remove unused status::ArrowError
-* [ARROW-3753](https://issues.apache.org/jira/browse/ARROW-3753) - [Gandiva] Remove debug print
-* [ARROW-3755](https://issues.apache.org/jira/browse/ARROW-3755) - [GLib] Support for CompressedInputStream, CompressedOutputStream
-* [ARROW-3760](https://issues.apache.org/jira/browse/ARROW-3760) - [R] Support Arrow CSV reader
-* [ARROW-3773](https://issues.apache.org/jira/browse/ARROW-3773) - [C++] Remove duplicated AssertArraysEqual code in parquet/arrow/arrow-reader-writer-test.cc
-* [ARROW-3778](https://issues.apache.org/jira/browse/ARROW-3778) - [C++] Don't put implementations in test-util.h
-* [ARROW-3781](https://issues.apache.org/jira/browse/ARROW-3781) - [C++] Configure buffer size in arrow::io::BufferedOutputStream
-* [ARROW-3782](https://issues.apache.org/jira/browse/ARROW-3782) - [C++] Implement BufferedReader for C++
-* [ARROW-3784](https://issues.apache.org/jira/browse/ARROW-3784) - [R] Array with type fails with x is not a vector
-* [ARROW-3785](https://issues.apache.org/jira/browse/ARROW-3785) - [C++] Use double-conversion conda package in CI toolchain
-* [ARROW-3787](https://issues.apache.org/jira/browse/ARROW-3787) - Implement From<ListArray\> for BinaryArray
-* [ARROW-3788](https://issues.apache.org/jira/browse/ARROW-3788) - [Ruby] Add support for CSV parser written in C++
-* [ARROW-3795](https://issues.apache.org/jira/browse/ARROW-3795) - [R] Support for retrieving NAs from INT64 arrays
-* [ARROW-3796](https://issues.apache.org/jira/browse/ARROW-3796) - [Rust] Add Example for PrimitiveArrayBuilder
-* [ARROW-3798](https://issues.apache.org/jira/browse/ARROW-3798) - [GLib] Add support for column type CSV read options
-* [ARROW-3800](https://issues.apache.org/jira/browse/ARROW-3800) - [C++] Vendor a string\_view backport
-* [ARROW-3803](https://issues.apache.org/jira/browse/ARROW-3803) - [C++/Python] Split C++ and Python unit test Travis CI jobs, run all C++ tests (including Gandiva) together
-* [ARROW-3807](https://issues.apache.org/jira/browse/ARROW-3807) - [R] Missing Field API
-* [ARROW-3819](https://issues.apache.org/jira/browse/ARROW-3819) - [Packaging] Update conda variant files to conform with feedstock after compiler migration
-* [ARROW-3821](https://issues.apache.org/jira/browse/ARROW-3821) - [Format/Documentation]: Fix typos and grammar issues in Flight.proto comments
-* [ARROW-3823](https://issues.apache.org/jira/browse/ARROW-3823) - [R] + buffer.complex
-* [ARROW-3825](https://issues.apache.org/jira/browse/ARROW-3825) - [Python] The Python README.md does not show how to run the unit test suite
-* [ARROW-3826](https://issues.apache.org/jira/browse/ARROW-3826) - [C++] Determine if using ccache caching in Travis CI actually improves build times
-* [ARROW-3830](https://issues.apache.org/jira/browse/ARROW-3830) - [GLib] Add GArrowCodec
-* [ARROW-3834](https://issues.apache.org/jira/browse/ARROW-3834) - [Doc] Merge Python & C++ and move to top-level
-* [ARROW-3836](https://issues.apache.org/jira/browse/ARROW-3836) - [C++] Add PREFIX option to ADD\_ARROW\_BENCHMARK
-* [ARROW-3839](https://issues.apache.org/jira/browse/ARROW-3839) - [Rust] Add ability to infer schema in CSV reader
-* [ARROW-3841](https://issues.apache.org/jira/browse/ARROW-3841) - [C++] warning: catching polymorphic type by value
-* [ARROW-3842](https://issues.apache.org/jira/browse/ARROW-3842) - [R] RecordBatchStreamWriter api
-* [ARROW-3844](https://issues.apache.org/jira/browse/ARROW-3844) - [C++] Remove ARROW\_USE\_SSE and ARROW\_SSE3
-* [ARROW-3845](https://issues.apache.org/jira/browse/ARROW-3845) - [Gandiva] [GLib] Add GGandivaNode
-* [ARROW-3847](https://issues.apache.org/jira/browse/ARROW-3847) - [GLib] Remove unnecessary “\”.
-* [ARROW-3849](https://issues.apache.org/jira/browse/ARROW-3849) - Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64.
-* [ARROW-3851](https://issues.apache.org/jira/browse/ARROW-3851) - [C++] "make check-format" is slow
-* [ARROW-3852](https://issues.apache.org/jira/browse/ARROW-3852) - [C++] "used uninitialized" warning
-* [ARROW-3853](https://issues.apache.org/jira/browse/ARROW-3853) - [C++] Implement string to timestamp cast
-* [ARROW-3854](https://issues.apache.org/jira/browse/ARROW-3854) - [GLib] Deprecate garrow\_gio\_{input,output}\_stream\_get\_raw()
-* [ARROW-3855](https://issues.apache.org/jira/browse/ARROW-3855) - [Rust] Schema/Field/Datatype should implement serde traits
-* [ARROW-3856](https://issues.apache.org/jira/browse/ARROW-3856) - [Ruby] Support compressed CSV save/load
-* [ARROW-3858](https://issues.apache.org/jira/browse/ARROW-3858) - [GLib] Use {class\_name}\_get\_instance\_private
-* [ARROW-3859](https://issues.apache.org/jira/browse/ARROW-3859) - [Java] Fix ComplexWriter backward incompatible change
-* [ARROW-3860](https://issues.apache.org/jira/browse/ARROW-3860) - [Gandiva] [C++] Add option to use -static-libstdc++ when building libgandiva\_jni.so
-* [ARROW-3862](https://issues.apache.org/jira/browse/ARROW-3862) - [C++] Improve dependencies download script
-* [ARROW-3863](https://issues.apache.org/jira/browse/ARROW-3863) - [GLib] Use travis\_retry with brew bundle command
-* [ARROW-3864](https://issues.apache.org/jira/browse/ARROW-3864) - [GLib] Add support for allow-float-truncate cast option
-* [ARROW-3865](https://issues.apache.org/jira/browse/ARROW-3865) - [Packaging] Add double-conversion dependency to conda forge recipes and the windows wheel build
-* [ARROW-3867](https://issues.apache.org/jira/browse/ARROW-3867) - [Documentation] Uploading binary release artifacts to Bintray
-* [ARROW-3868](https://issues.apache.org/jira/browse/ARROW-3868) - [Rust] Build against nightly Rust in CI
-* [ARROW-3870](https://issues.apache.org/jira/browse/ARROW-3870) - [C++] Add Peek to InputStream API
-* [ARROW-3871](https://issues.apache.org/jira/browse/ARROW-3871) - [R] Replace usages of C++ GetValuesSafely with new methods on ArrayData
-* [ARROW-3878](https://issues.apache.org/jira/browse/ARROW-3878) - [Rust] Improve primitive types
-* [ARROW-3880](https://issues.apache.org/jira/browse/ARROW-3880) - [Rust] PrimitiveArray<T\> should support simple math operations
-* [ARROW-3881](https://issues.apache.org/jira/browse/ARROW-3881) - [Rust] PrimitiveArray<T\> should support comparison operators
-* [ARROW-3883](https://issues.apache.org/jira/browse/ARROW-3883) - [Rust] Update Rust README to reflect new functionality
-* [ARROW-3884](https://issues.apache.org/jira/browse/ARROW-3884) - [Python] Add LLVM6 to manylinux1 base image
-* [ARROW-3885](https://issues.apache.org/jira/browse/ARROW-3885) - [Rust] Update version to 0.12.0 and update release instructions on wiki
-* [ARROW-3886](https://issues.apache.org/jira/browse/ARROW-3886) - [C++] Additional test cases for ARROW-3831
-* [ARROW-3891](https://issues.apache.org/jira/browse/ARROW-3891) - [Java] Replace Long.bitCount with simple bitmap operations
-* [ARROW-3893](https://issues.apache.org/jira/browse/ARROW-3893) - [C++] Improve adaptive int builder performance
-* [ARROW-3895](https://issues.apache.org/jira/browse/ARROW-3895) - [Rust] CSV reader should return Result<Option<\>\> not Option<Result<\>\>
-* [ARROW-3899](https://issues.apache.org/jira/browse/ARROW-3899) - [Python] Table.to\_pandas converts Arrow date32[day] to pandas datetime64[ns]
-* [ARROW-3900](https://issues.apache.org/jira/browse/ARROW-3900) - [GLib] Add garrow\_mutable\_buffer\_set\_data()
-* [ARROW-3905](https://issues.apache.org/jira/browse/ARROW-3905) - [Ruby] Add StructDataType\#[]
-* [ARROW-3906](https://issues.apache.org/jira/browse/ARROW-3906) - [C++] Break builder.cc into multiple compilation units
-* [ARROW-3908](https://issues.apache.org/jira/browse/ARROW-3908) - [Rust] Update rust dockerfile to use nightly toolchain
-* [ARROW-3910](https://issues.apache.org/jira/browse/ARROW-3910) - [Python] Set date\_as\_object to True in \*.to\_pandas as default after deduplicating logic implemented
-* [ARROW-3911](https://issues.apache.org/jira/browse/ARROW-3911) - [Python] Deduplicate datetime.date objects in Table.to\_pandas internals
-* [ARROW-3912](https://issues.apache.org/jira/browse/ARROW-3912) - [Plasma][GLib] Add support for creating and referring objects
-* [ARROW-3913](https://issues.apache.org/jira/browse/ARROW-3913) - [Gandiva] [GLib] Add GGandivaLiteralNode
-* [ARROW-3914](https://issues.apache.org/jira/browse/ARROW-3914) - [C++/Python/Packaging] Docker-compose setup for Alpine linux
-* [ARROW-3916](https://issues.apache.org/jira/browse/ARROW-3916) - [Python] Support caller-provided filesystem in \`ParquetWriter\` constructor
-* [ARROW-3921](https://issues.apache.org/jira/browse/ARROW-3921) - [CI][GLib] Log Homebrew output
-* [ARROW-3922](https://issues.apache.org/jira/browse/ARROW-3922) - [C++] improve the performance of bitmap operations
-* [ARROW-3924](https://issues.apache.org/jira/browse/ARROW-3924) - [Packaging][Plasma] Add support for Plasma deb/rpm packages
-* [ARROW-3925](https://issues.apache.org/jira/browse/ARROW-3925) - [Python] Include autoconf in Linux/macOS dependencies in conda environment
-* [ARROW-3928](https://issues.apache.org/jira/browse/ARROW-3928) - [Python] Add option to deduplicate PyBytes / PyString / PyUnicode objects in Table.to\_pandas conversion path
-* [ARROW-3929](https://issues.apache.org/jira/browse/ARROW-3929) - [Go] improve memory usage of CSV reader to improve runtime performance
-* [ARROW-3930](https://issues.apache.org/jira/browse/ARROW-3930) - [C++] Random test data generation is slow
-* [ARROW-3932](https://issues.apache.org/jira/browse/ARROW-3932) - [Python/Documentation] Include Benchmarks.md in Sphinx docs
-* [ARROW-3934](https://issues.apache.org/jira/browse/ARROW-3934) - [Gandiva] Don't compile precompiled tests if ARROW\_GANDIVA\_BUILD\_TESTS=off
-* [ARROW-3938](https://issues.apache.org/jira/browse/ARROW-3938) - [Packaging] Stop referring to java/pom.xml for version information
-* [ARROW-3939](https://issues.apache.org/jira/browse/ARROW-3939) - [Rust] Remove macro definition for ListArrayBuilder
-* [ARROW-3945](https://issues.apache.org/jira/browse/ARROW-3945) - [Website] Blog post about Gandiva code donation
-* [ARROW-3946](https://issues.apache.org/jira/browse/ARROW-3946) - [GLib] Add support for union
-* [ARROW-3948](https://issues.apache.org/jira/browse/ARROW-3948) - [CI][GLib] Set timeout to Homebrew
-* [ARROW-3950](https://issues.apache.org/jira/browse/ARROW-3950) - [Plasma] Don't force loading the TensorFlow op on import
-* [ARROW-3952](https://issues.apache.org/jira/browse/ARROW-3952) - [Rust] Specify edition="2018" in Cargo.toml
-* [ARROW-3958](https://issues.apache.org/jira/browse/ARROW-3958) - [Plasma] Reduce number of IPCs
-* [ARROW-3959](https://issues.apache.org/jira/browse/ARROW-3959) - [Rust] Time and Timestamp Support
-* [ARROW-3960](https://issues.apache.org/jira/browse/ARROW-3960) - [Rust] remove extern crate for Rust 2018
-* [ARROW-3963](https://issues.apache.org/jira/browse/ARROW-3963) - [Packaging/Docker] Nightly test for building sphinx documentations
-* [ARROW-3964](https://issues.apache.org/jira/browse/ARROW-3964) - [Go] More readable example for csv.Reader
-* [ARROW-3967](https://issues.apache.org/jira/browse/ARROW-3967) - [Gandiva] [C++] Make gandiva/node.h public
-* [ARROW-3970](https://issues.apache.org/jira/browse/ARROW-3970) - [Gandiva][C++] Remove unnecessary boost dependencies
-* [ARROW-3971](https://issues.apache.org/jira/browse/ARROW-3971) - [Python] Remove APIs deprecated in 0.11 and prior
-* [ARROW-3974](https://issues.apache.org/jira/browse/ARROW-3974) - [C++] Combine field\_builders\_ and children\_ members in array/builder.h
-* [ARROW-3982](https://issues.apache.org/jira/browse/ARROW-3982) - [C++] Allow "binary" input in simple JSON format
-* [ARROW-3983](https://issues.apache.org/jira/browse/ARROW-3983) - [Gandiva][Crossbow] Use static boost while packaging
-* [ARROW-3984](https://issues.apache.org/jira/browse/ARROW-3984) - [C++] Exit with error if user hits zstd ExternalProject path
-* [ARROW-3986](https://issues.apache.org/jira/browse/ARROW-3986) - [C++] Write prose documentation
-* [ARROW-3987](https://issues.apache.org/jira/browse/ARROW-3987) - [Java] Benchmark results for ARROW-1807
-* [ARROW-3988](https://issues.apache.org/jira/browse/ARROW-3988) - [C++] Do not build unit tests by default in build system
-* [ARROW-3993](https://issues.apache.org/jira/browse/ARROW-3993) - [JS] CI Jobs Failing
-* [ARROW-3994](https://issues.apache.org/jira/browse/ARROW-3994) - [C++] Remove ARROW\_GANDIVA\_BUILD\_TESTS option
-* [ARROW-3995](https://issues.apache.org/jira/browse/ARROW-3995) - [CI] Use understandable names in Travis Matrix
-* [ARROW-3997](https://issues.apache.org/jira/browse/ARROW-3997) - [C++] [Doc] Clarify dictionary encoding integer signedness (and width?)
-* [ARROW-4002](https://issues.apache.org/jira/browse/ARROW-4002) - [C++][Gandiva] Remove CMake version check
-* [ARROW-4004](https://issues.apache.org/jira/browse/ARROW-4004) - [GLib] Replace GPU with CUDA
-* [ARROW-4005](https://issues.apache.org/jira/browse/ARROW-4005) - [Plasma] [GLib] Add gplasma\_client\_disconnect()
-* [ARROW-4006](https://issues.apache.org/jira/browse/ARROW-4006) - Add CODE\_OF\_CONDUCT.md
-* [ARROW-4009](https://issues.apache.org/jira/browse/ARROW-4009) - [CI] Run Valgrind and C++ code coverage in different builds
-* [ARROW-4010](https://issues.apache.org/jira/browse/ARROW-4010) - [C++] Enable Travis CI scripts to build and install only certain targets
-* [ARROW-4015](https://issues.apache.org/jira/browse/ARROW-4015) - [Plasma] remove legacy interfaces for plasma manager
-* [ARROW-4017](https://issues.apache.org/jira/browse/ARROW-4017) - [C++] Check and update vendored libraries
-* [ARROW-4026](https://issues.apache.org/jira/browse/ARROW-4026) - [C++] Use separate modular $COMPONENT-test targets for unit tests
-* [ARROW-4028](https://issues.apache.org/jira/browse/ARROW-4028) - [Rust] Merge parquet-rs codebase
-* [ARROW-4029](https://issues.apache.org/jira/browse/ARROW-4029) - [C++] Define and document naming convention for internal / private header files not to be installed
-* [ARROW-4030](https://issues.apache.org/jira/browse/ARROW-4030) - [CI] Use travis\_terminate to halt builds when a step fails
-* [ARROW-4035](https://issues.apache.org/jira/browse/ARROW-4035) - [Ruby] Support msys2 mingw dependencies
-* [ARROW-4037](https://issues.apache.org/jira/browse/ARROW-4037) - [Packaging] Remove workaround to verify 0.11.0
-* [ARROW-4038](https://issues.apache.org/jira/browse/ARROW-4038) - [Rust] Add array\_ops methods for boolean AND, OR, NOT
-* [ARROW-4039](https://issues.apache.org/jira/browse/ARROW-4039) - [Python] Update link to 'development.rst' page from Python README.md
-* [ARROW-4042](https://issues.apache.org/jira/browse/ARROW-4042) - [Rust] Inconsistent method naming between BinaryArray and PrimitiveArray
-* [ARROW-4043](https://issues.apache.org/jira/browse/ARROW-4043) - [Packaging/Docker] Python tests on alpine miss pytest dependency
-* [ARROW-4044](https://issues.apache.org/jira/browse/ARROW-4044) - [Packaging/Python] Add hypothesis test dependency to pyarrow conda recipe
-* [ARROW-4045](https://issues.apache.org/jira/browse/ARROW-4045) - [Packaging/Python] Add hypothesis test dependency to wheel crossbow tests
-* [ARROW-4048](https://issues.apache.org/jira/browse/ARROW-4048) - [GLib] Return ChunkedArray instead of Array in gparquet\_arrow\_file\_reader\_read\_column
-* [ARROW-4051](https://issues.apache.org/jira/browse/ARROW-4051) - [Gandiva] [GLib] Add support for null literal
-* [ARROW-4054](https://issues.apache.org/jira/browse/ARROW-4054) - [Python] Update gtest, flatbuffers and OpenSSL in manylinux1 base image
-* [ARROW-4060](https://issues.apache.org/jira/browse/ARROW-4060) - [Rust] Add Parquet/Arrow schema converter
-* [ARROW-4069](https://issues.apache.org/jira/browse/ARROW-4069) - [Python] Add tests for casting from binary to utf8
-* [ARROW-4075](https://issues.apache.org/jira/browse/ARROW-4075) - [Rust] Reuse array builder after calling finish()
-* [ARROW-4079](https://issues.apache.org/jira/browse/ARROW-4079) - [C++] Add machine benchmarks
-* [ARROW-4080](https://issues.apache.org/jira/browse/ARROW-4080) - [Rust] Improving lengthy build times in AppVeyor
-* [ARROW-4082](https://issues.apache.org/jira/browse/ARROW-4082) - [C++] CMake tweaks: allow RelWithDebInfo, improve FindClangTools
-* [ARROW-4084](https://issues.apache.org/jira/browse/ARROW-4084) - [C++] Simplify Status and stringstream boilerplate
-* [ARROW-4085](https://issues.apache.org/jira/browse/ARROW-4085) - [GLib] Use "field" for struct data type
-* [ARROW-4087](https://issues.apache.org/jira/browse/ARROW-4087) - [C++] Make CSV nulls configurable
-* [ARROW-4093](https://issues.apache.org/jira/browse/ARROW-4093) - [C++] Deprecated method suggests wrong method
-* [ARROW-4098](https://issues.apache.org/jira/browse/ARROW-4098) - [Python] Deprecate pyarrow.open\_stream,open\_file in favor of pa.ipc.open\_stream/open\_file
-* [ARROW-4100](https://issues.apache.org/jira/browse/ARROW-4100) - [Gandiva][C++] Fix regex to ignore "." character
-* [ARROW-4102](https://issues.apache.org/jira/browse/ARROW-4102) - [C++] FixedSizeBinary identity cast not implemented
-* [ARROW-4103](https://issues.apache.org/jira/browse/ARROW-4103) - [Documentation] Add README to docs/ root
-* [ARROW-4105](https://issues.apache.org/jira/browse/ARROW-4105) - Add rust-toolchain to enforce use of the nightly toolchain for building
-* [ARROW-4107](https://issues.apache.org/jira/browse/ARROW-4107) - [Python] Use ninja in pyarrow manylinux1 build
-* [ARROW-4112](https://issues.apache.org/jira/browse/ARROW-4112) - [Packaging][Gandiva] Add support for deb packages
-* [ARROW-4116](https://issues.apache.org/jira/browse/ARROW-4116) - [Python] Clarify in development.rst that virtualenv cannot be used with miniconda/Anaconda
-* [ARROW-4122](https://issues.apache.org/jira/browse/ARROW-4122) - [C++] Initialize some uninitialized class members
-* [ARROW-4127](https://issues.apache.org/jira/browse/ARROW-4127) - [Documentation] Add Docker build instructions
-* [ARROW-4129](https://issues.apache.org/jira/browse/ARROW-4129) - [Python] Fix syntax problem in benchmark docs
-* [ARROW-4132](https://issues.apache.org/jira/browse/ARROW-4132) - [GLib] Add more GArrowTable constructors
-* [ARROW-4141](https://issues.apache.org/jira/browse/ARROW-4141) - [Ruby] Add support for creating schema from raw Ruby objects
-* [ARROW-4148](https://issues.apache.org/jira/browse/ARROW-4148) - [CI/Python] Disable ORC on nightly Alpine builds
-* [ARROW-4150](https://issues.apache.org/jira/browse/ARROW-4150) - [C++] Do not return buffers containing nullptr from internal allocations
-* [ARROW-4151](https://issues.apache.org/jira/browse/ARROW-4151) - [Rust] Restructure project directories
-* [ARROW-4152](https://issues.apache.org/jira/browse/ARROW-4152) - [GLib] Remove an example to show Torch integration
-* [ARROW-4153](https://issues.apache.org/jira/browse/ARROW-4153) - [GLib] Add builder\_append\_value() for consistency
-* [ARROW-4154](https://issues.apache.org/jira/browse/ARROW-4154) - [GLib] Add GArrowDecimal128DataType
-* [ARROW-4155](https://issues.apache.org/jira/browse/ARROW-4155) - [Rust] Implement array\_ops::sum() for PrimitiveArray<T\>
-* [ARROW-4156](https://issues.apache.org/jira/browse/ARROW-4156) - [C++] xcodebuild failure for cmake generated project
-* [ARROW-4158](https://issues.apache.org/jira/browse/ARROW-4158) - [Dev] Allow maintainers to use a GitHub API token when merging pull requests
-* [ARROW-4160](https://issues.apache.org/jira/browse/ARROW-4160) - [Rust] Add README and executable files to parquet
-* [ARROW-4161](https://issues.apache.org/jira/browse/ARROW-4161) - [GLib] Add GPlasmaClientOptions
-* [ARROW-4162](https://issues.apache.org/jira/browse/ARROW-4162) - [Ruby] Add support for creating data types from description
-* [ARROW-4166](https://issues.apache.org/jira/browse/ARROW-4166) - [Ruby] Add support for saving to and loading from buffer
-* [ARROW-4167](https://issues.apache.org/jira/browse/ARROW-4167) - [Gandiva] switch to arrow/util/variant
-* [ARROW-4168](https://issues.apache.org/jira/browse/ARROW-4168) - [GLib] Use property to keep GArrowDataType passed in garrow\_field\_new()
-* [ARROW-4172](https://issues.apache.org/jira/browse/ARROW-4172) - [Rust] more consistent naming in array builders
-* [ARROW-4174](https://issues.apache.org/jira/browse/ARROW-4174) - [Ruby] Add support for building composite array from raw Ruby objects
-* [ARROW-4175](https://issues.apache.org/jira/browse/ARROW-4175) - [GLib] Add support for decimal compare operators
-* [ARROW-4177](https://issues.apache.org/jira/browse/ARROW-4177) - [C++] Add ThreadPool and TaskGroup microbenchmarks
-* [ARROW-4183](https://issues.apache.org/jira/browse/ARROW-4183) - [Ruby] Add Arrow::Struct as an element of Arrow::StructArray
-* [ARROW-4184](https://issues.apache.org/jira/browse/ARROW-4184) - [Ruby] Add Arrow::RecordBatch\#to\_table
-* [ARROW-4191](https://issues.apache.org/jira/browse/ARROW-4191) - [C++] Use same CC and AR for jemalloc as for the main sources
-* [ARROW-4199](https://issues.apache.org/jira/browse/ARROW-4199) - [GLib] Add garrow\_seekable\_input\_stream\_peek()
-* [ARROW-4207](https://issues.apache.org/jira/browse/ARROW-4207) - [Gandiva] [GLib] Add support for IfNode
-* [ARROW-4210](https://issues.apache.org/jira/browse/ARROW-4210) - [Python] Mention boost-cpp directly in the conda meta.yaml for pyarrow
-* [ARROW-4211](https://issues.apache.org/jira/browse/ARROW-4211) - [GLib] Add GArrowFixedSizeBinaryDataType
-* [ARROW-4214](https://issues.apache.org/jira/browse/ARROW-4214) - [Ruby] Add support for building RecordBatch from raw Ruby objects
-* [ARROW-4216](https://issues.apache.org/jira/browse/ARROW-4216) - [Python] Add CUDA API docs
-* [ARROW-4228](https://issues.apache.org/jira/browse/ARROW-4228) - [GLib] Add garrow\_list\_data\_type\_get\_field()
-* [ARROW-4229](https://issues.apache.org/jira/browse/ARROW-4229) - [Packaging] Set crossbow target explicitly to enable building arbitrary arrow repo
-* [ARROW-4233](https://issues.apache.org/jira/browse/ARROW-4233) - [Packaging] Create a Dockerfile to build source archive
-* [ARROW-4239](https://issues.apache.org/jira/browse/ARROW-4239) - [Release] Updating .deb package names in the prepare script failed to run on OSX
-* [ARROW-4240](https://issues.apache.org/jira/browse/ARROW-4240) - [Packaging] Documents for Plasma GLib and Gandiva GLib are missing in source archive
-* [ARROW-4241](https://issues.apache.org/jira/browse/ARROW-4241) - [Packaging] Disable crossbow conda OSX clang builds
-* [ARROW-4243](https://issues.apache.org/jira/browse/ARROW-4243) - [Python] Test failure with pandas 0.24.0rc1
-* [ARROW-4249](https://issues.apache.org/jira/browse/ARROW-4249) - [Plasma] Remove reference to logging.h from plasma/common.h
-* [ARROW-4257](https://issues.apache.org/jira/browse/ARROW-4257) - [Release] Update release verification script to check binaries on Bintray
-* [ARROW-4266](https://issues.apache.org/jira/browse/ARROW-4266) - [Python][CI] Disable ORC tests in dask integration test
-* [ARROW-4269](https://issues.apache.org/jira/browse/ARROW-4269) - [Python] AttributeError: module 'pandas.core' has no attribute 'arrays'
-* [ARROW-4270](https://issues.apache.org/jira/browse/ARROW-4270) - [Packaging][Conda] Update xcode version and remove toolchain builds
-* [ARROW-4276](https://issues.apache.org/jira/browse/ARROW-4276) - [Release] Remove needless Bintray authentication from binaries verify script
-* [ARROW-4306](https://issues.apache.org/jira/browse/ARROW-4306) - [Release] Update website and add blog post announcing 0.12.0 release
-* [PARQUET-690](https://issues.apache.org/jira/browse/PARQUET-690) - [C++] Investigate / improve performance of Thrift utilities
-* [PARQUET-1271](https://issues.apache.org/jira/browse/PARQUET-1271) - [C++] "parquet\_reader" should be "parquet-reader"
-* [PARQUET-1439](https://issues.apache.org/jira/browse/PARQUET-1439) - [C++] Parquet build fails when PARQUET\_ARROW\_LINKAGE is static
-* [PARQUET-1449](https://issues.apache.org/jira/browse/PARQUET-1449) - [C++] Can't build with ARROW\_BOOST\_VENDORED=ON
-* [PARQUET-1463](https://issues.apache.org/jira/browse/PARQUET-1463) - [C++] Utilize revamped common hashing machinery for dictionary encoding
-* [PARQUET-1467](https://issues.apache.org/jira/browse/PARQUET-1467) - [C++] Remove ChunkedAllocator code, now unused
-* [PARQUET-1473](https://issues.apache.org/jira/browse/PARQUET-1473) - [C++] Add helper function that converts ParquetVersion to human-friendly string
-* [PARQUET-1484](https://issues.apache.org/jira/browse/PARQUET-1484) - [C++] Improve memory usage of FileMetaDataBuilder
-
-
-## Bug Fixes
-
-* [ARROW-1847](https://issues.apache.org/jira/browse/ARROW-1847) - [Doc] Document the difference between RecordBatch and Table in an FAQ fashion
-* [ARROW-2026](https://issues.apache.org/jira/browse/ARROW-2026) - [Python] Cast all timestamp resolutions to INT96 when use\_deprecated\_int96\_timestamps=True
-* [ARROW-2038](https://issues.apache.org/jira/browse/ARROW-2038) - [Python] Follow-up bug fixes for s3fs Parquet support
-* [ARROW-2113](https://issues.apache.org/jira/browse/ARROW-2113) - [Python] Incomplete CLASSPATH with "hadoop" contained in it can fool the classpath setting HDFS logic
-* [ARROW-2591](https://issues.apache.org/jira/browse/ARROW-2591) - [Python] Segmentation fault when writing empty ListType column to Parquet
-* [ARROW-2592](https://issues.apache.org/jira/browse/ARROW-2592) - [Python] Error reading old Parquet file due to metadata backwards compatibility issue
-* [ARROW-2654](https://issues.apache.org/jira/browse/ARROW-2654) - [Python] Error with errno 22 when loading 3.6 GB Parquet file
-* [ARROW-2708](https://issues.apache.org/jira/browse/ARROW-2708) - [C++] Internal GetValues function in arrow::compute should check for nullptr
-* [ARROW-2831](https://issues.apache.org/jira/browse/ARROW-2831) - [Plasma] MemoryError in teardown
-* [ARROW-2970](https://issues.apache.org/jira/browse/ARROW-2970) - [Python] NumPyConverter::Visit for Binary/String/FixedSizeBinary can overflow
-* [ARROW-2987](https://issues.apache.org/jira/browse/ARROW-2987) - [Python] test\_cython\_api can fail if run in an environment where vsvarsall.bat has been run more than once
-* [ARROW-3048](https://issues.apache.org/jira/browse/ARROW-3048) - [Python] Import pyarrow fails if scikit-learn is installed from conda (boost-cpp / libboost issue)
-* [ARROW-3058](https://issues.apache.org/jira/browse/ARROW-3058) - [Python] Feather reads fail with unintuitive error when conversion from pandas yields ChunkedArray
-* [ARROW-3186](https://issues.apache.org/jira/browse/ARROW-3186) - [GLib] mesonbuild failures in Travis CI
-* [ARROW-3202](https://issues.apache.org/jira/browse/ARROW-3202) - [C++] Build does not succeed on Alpine Linux
-* [ARROW-3225](https://issues.apache.org/jira/browse/ARROW-3225) - [C++/Python] Pandas object conversion of ListType<DateType\> and ListType<TimeType\>
-* [ARROW-3324](https://issues.apache.org/jira/browse/ARROW-3324) - [Parquet] Free more internal resources when writing multiple row groups
-* [ARROW-3343](https://issues.apache.org/jira/browse/ARROW-3343) - [Java] Java tests fail non-deterministically with memory leak from Flight tests
-* [ARROW-3405](https://issues.apache.org/jira/browse/ARROW-3405) - [Python] Document CSV reader
-* [ARROW-3428](https://issues.apache.org/jira/browse/ARROW-3428) - [Python] from\_pandas gives incorrect results when converting floating point to bool
-* [ARROW-3436](https://issues.apache.org/jira/browse/ARROW-3436) - [C++] Boost version required by Gandiva is too new for Ubuntu 14.04
-* [ARROW-3437](https://issues.apache.org/jira/browse/ARROW-3437) - [Gandiva][C++] Configure static linking of libgcc, libstdc++ with LDFLAGS
-* [ARROW-3438](https://issues.apache.org/jira/browse/ARROW-3438) - [Packaging] Escaped bulletpoints in changelog
-* [ARROW-3445](https://issues.apache.org/jira/browse/ARROW-3445) - [GLib] Parquet GLib doesn't link Arrow GLib
-* [ARROW-3449](https://issues.apache.org/jira/browse/ARROW-3449) - [C++] Support CMake 3.2 for "out of the box" builds
-* [ARROW-3466](https://issues.apache.org/jira/browse/ARROW-3466) - [Python] Crash when importing tensorflow and pyarrow
-* [ARROW-3467](https://issues.apache.org/jira/browse/ARROW-3467) - Building against external double conversion is broken
-* [ARROW-3470](https://issues.apache.org/jira/browse/ARROW-3470) - [C++] Row-wise conversion tutorial has fallen out of date
-* [ARROW-3477](https://issues.apache.org/jira/browse/ARROW-3477) - [C++] Testsuite fails on 32 bit arch
-* [ARROW-3480](https://issues.apache.org/jira/browse/ARROW-3480) - [Website] Install document for Ubuntu is broken
-* [ARROW-3483](https://issues.apache.org/jira/browse/ARROW-3483) - [CI] Python 3.6 build failure on Travis-CI
-* [ARROW-3485](https://issues.apache.org/jira/browse/ARROW-3485) - [C++] Examples fail with Protobuf error
-* [ARROW-3494](https://issues.apache.org/jira/browse/ARROW-3494) - [C++] re2 conda-forge package not working in toolchain
-* [ARROW-3498](https://issues.apache.org/jira/browse/ARROW-3498) - [R] Make IPC APIs consistent
-* [ARROW-3516](https://issues.apache.org/jira/browse/ARROW-3516) - [C++] Use unsigned type for difference of pointers in parallel\_memcpy
-* [ARROW-3517](https://issues.apache.org/jira/browse/ARROW-3517) - [C++] MinGW 32bit build causes g++ segv
-* [ARROW-3524](https://issues.apache.org/jira/browse/ARROW-3524) - [C++] Fix compiler warnings from ARROW-3409 on clang-6
-* [ARROW-3527](https://issues.apache.org/jira/browse/ARROW-3527) - [R] Unused variables in R-package C++ code
-* [ARROW-3528](https://issues.apache.org/jira/browse/ARROW-3528) - [R] Typo in R documentation
-* [ARROW-3535](https://issues.apache.org/jira/browse/ARROW-3535) - [Python] pip install tensorflow installs too new a numpy in manylinux1 build
-* [ARROW-3541](https://issues.apache.org/jira/browse/ARROW-3541) - [Rust] Update BufferBuilder to allow for new bit-packed BooleanArray
-* [ARROW-3544](https://issues.apache.org/jira/browse/ARROW-3544) - [Gandiva] Populate function registry in multiple compilation units to mitigate long compile times in release mode
-* [ARROW-3549](https://issues.apache.org/jira/browse/ARROW-3549) - [Rust] Replace i64 with usize for some bit utility functions
-* [ARROW-3573](https://issues.apache.org/jira/browse/ARROW-3573) - [Rust] with\_bitset does not set valid bits correctly
-* [ARROW-3580](https://issues.apache.org/jira/browse/ARROW-3580) - [Gandiva][C++] Build error with g++ 8.2.0
-* [ARROW-3586](https://issues.apache.org/jira/browse/ARROW-3586) - [Python] Segmentation fault when converting empty table to pandas with categoricals
-* [ARROW-3598](https://issues.apache.org/jira/browse/ARROW-3598) - [Plasma] plasma\_store\_server fails linking with GPU enabled
-* [ARROW-3613](https://issues.apache.org/jira/browse/ARROW-3613) - [Go] Resize does not correctly update the length
-* [ARROW-3614](https://issues.apache.org/jira/browse/ARROW-3614) - [R] Handle Type::TIMESTAMP from Arrow to R
-* [ARROW-3634](https://issues.apache.org/jira/browse/ARROW-3634) - [GLib] cuda.cpp compile error
-* [ARROW-3637](https://issues.apache.org/jira/browse/ARROW-3637) - [Go] Implement Stringer for arrays
-* [ARROW-3658](https://issues.apache.org/jira/browse/ARROW-3658) - [Rust] validation of offsets buffer is incorrect for \`List<T\>\`
-* [ARROW-3670](https://issues.apache.org/jira/browse/ARROW-3670) - [C++] Use FindBacktrace to find execinfo.h support
-* [ARROW-3687](https://issues.apache.org/jira/browse/ARROW-3687) - [Rust] Anything measuring array slots should be \`usize\`
-* [ARROW-3698](https://issues.apache.org/jira/browse/ARROW-3698) - [C++] Segmentation fault when using a large table in Gandiva
-* [ARROW-3700](https://issues.apache.org/jira/browse/ARROW-3700) - [C++] CSV parser should allow ignoring empty lines
-* [ARROW-3703](https://issues.apache.org/jira/browse/ARROW-3703) - [Python] DataFrame.to\_parquet crashes if datetime column has time zones
-* [ARROW-3704](https://issues.apache.org/jira/browse/ARROW-3704) - [Gandiva] Can't build with g++ 8.2.0
-* [ARROW-3707](https://issues.apache.org/jira/browse/ARROW-3707) - [C++] test failure with zstd 1.3.7
-* [ARROW-3711](https://issues.apache.org/jira/browse/ARROW-3711) - [C++] Don't pass CXX\_FLAGS to C\_FLAGS
-* [ARROW-3712](https://issues.apache.org/jira/browse/ARROW-3712) - [CI] License check regression (RAT failure)
-* [ARROW-3715](https://issues.apache.org/jira/browse/ARROW-3715) - [C++] gflags\_ep fails to build with CMake 3.13
-* [ARROW-3716](https://issues.apache.org/jira/browse/ARROW-3716) - [R] Missing cases for ChunkedArray conversion
-* [ARROW-3728](https://issues.apache.org/jira/browse/ARROW-3728) - [Python] Merging Parquet Files - Pandas Meta in Schema Mismatch
-* [ARROW-3734](https://issues.apache.org/jira/browse/ARROW-3734) - [C++] Linking static zstd library fails on Arch x86-64
-* [ARROW-3740](https://issues.apache.org/jira/browse/ARROW-3740) - [C++] Calling ArrayBuilder::Resize with length smaller than current appended length results in invalid state
-* [ARROW-3742](https://issues.apache.org/jira/browse/ARROW-3742) - Fix pyarrow.types & gandiva cython bindings
-* [ARROW-3745](https://issues.apache.org/jira/browse/ARROW-3745) - [C++] CMake passes static libraries multiple times to linker
-* [ARROW-3754](https://issues.apache.org/jira/browse/ARROW-3754) - [Packaging] Zstd configure error on linux package builds
-* [ARROW-3756](https://issues.apache.org/jira/browse/ARROW-3756) - [CI/Docker/Java] Java tests are failing in docker-compose setup
-* [ARROW-3765](https://issues.apache.org/jira/browse/ARROW-3765) - [Gandiva] Segfault when the validity bitmap has not been allocated
-* [ARROW-3766](https://issues.apache.org/jira/browse/ARROW-3766) - [Python] pa.Table.from\_pandas doesn't use schema ordering
-* [ARROW-3768](https://issues.apache.org/jira/browse/ARROW-3768) - [Python] set classpath to hdfs not hadoop executable
-* [ARROW-3775](https://issues.apache.org/jira/browse/ARROW-3775) - [C++] Handling Parquet Arrow reads that overflow a BinaryArray capacity
-* [ARROW-3790](https://issues.apache.org/jira/browse/ARROW-3790) - [C++] Signed to unsigned integer cast yields incorrect results when type sizes are the same
-* [ARROW-3792](https://issues.apache.org/jira/browse/ARROW-3792) - [Python] Segmentation fault when writing empty RecordBatches to Parquet
-* [ARROW-3793](https://issues.apache.org/jira/browse/ARROW-3793) - [C++] TestScalarAppendUnsafe is not testing unsafe appends
-* [ARROW-3797](https://issues.apache.org/jira/browse/ARROW-3797) - [Rust] BinaryArray::value\_offset incorrect in offset case
-* [ARROW-3805](https://issues.apache.org/jira/browse/ARROW-3805) - [Gandiva] handle null validity bitmap in if-else expressions
-* [ARROW-3831](https://issues.apache.org/jira/browse/ARROW-3831) - [C++] arrow::util::Codec::Decompress() doesn't return decompressed data size
-* [ARROW-3835](https://issues.apache.org/jira/browse/ARROW-3835) - [C++] arrow::io::CompressedOutputStream::raw() implementation is missing
-* [ARROW-3837](https://issues.apache.org/jira/browse/ARROW-3837) - [C++] gflags link errors on Windows
-* [ARROW-3866](https://issues.apache.org/jira/browse/ARROW-3866) - [Python] Column metadata is not transferred to tables in pyarrow
-* [ARROW-3869](https://issues.apache.org/jira/browse/ARROW-3869) - [Rust] "invalid fastbin errors" since Rust nightly-2018-11-03
-* [ARROW-3874](https://issues.apache.org/jira/browse/ARROW-3874) - [Gandiva] Cannot build: LLVM not detected correctly
-* [ARROW-3879](https://issues.apache.org/jira/browse/ARROW-3879) - [C++] cuda-test failure
-* [ARROW-3888](https://issues.apache.org/jira/browse/ARROW-3888) - [C++] Compilation warnings with gcc 7.3.0
-* [ARROW-3889](https://issues.apache.org/jira/browse/ARROW-3889) - [Python] creating schema with invalid parameters causes segmentation fault
-* [ARROW-3890](https://issues.apache.org/jira/browse/ARROW-3890) - [Python] Creating Array with explicit string type fails on Python 2.7
-* [ARROW-3894](https://issues.apache.org/jira/browse/ARROW-3894) - [Python] Error reading IPC file with no record batches
-* [ARROW-3898](https://issues.apache.org/jira/browse/ARROW-3898) - parquet-arrow example has compilation errors
-* [ARROW-3909](https://issues.apache.org/jira/browse/ARROW-3909) - [Python] Table.from\_pandas call that seemingly should zero copy does not
-* [ARROW-3918](https://issues.apache.org/jira/browse/ARROW-3918) - [Python] ParquetWriter.write\_table doesn't support coerce\_timestamps or allow\_truncated\_timestamps
-* [ARROW-3920](https://issues.apache.org/jira/browse/ARROW-3920) - Plasma reference counting not properly done in TensorFlow custom operator
-* [ARROW-3931](https://issues.apache.org/jira/browse/ARROW-3931) - [C++] Make possible to build regardless of LANG
-* [ARROW-3936](https://issues.apache.org/jira/browse/ARROW-3936) - Add \_O\_NOINHERIT to the file open flags on Windows
-* [ARROW-3937](https://issues.apache.org/jira/browse/ARROW-3937) - [Rust] Rust nightly build is failing
-* [ARROW-3940](https://issues.apache.org/jira/browse/ARROW-3940) - [Python/Documentation] Add required packages to the development instruction
-* [ARROW-3941](https://issues.apache.org/jira/browse/ARROW-3941) - [R] RecordBatchStreamReader$schema
-* [ARROW-3942](https://issues.apache.org/jira/browse/ARROW-3942) - [R] Feather api fixes
-* [ARROW-3953](https://issues.apache.org/jira/browse/ARROW-3953) - Compat with pandas 0.24 rename of MultiIndex labels -\> codes
-* [ARROW-3955](https://issues.apache.org/jira/browse/ARROW-3955) - [GLib] Add (transfer full) to free when no longer needed
-* [ARROW-3957](https://issues.apache.org/jira/browse/ARROW-3957) - [Python] Better error message when user connects to HDFS cluster with wrong port
-* [ARROW-3961](https://issues.apache.org/jira/browse/ARROW-3961) - [Python/Documentation] Fix wrong path in the pyarrow README
-* [ARROW-3969](https://issues.apache.org/jira/browse/ARROW-3969) - [Rust] CI build broken because rustfmt not available on nightly toolchain
-* [ARROW-3976](https://issues.apache.org/jira/browse/ARROW-3976) - [Ruby] Homebrew donation solicitation on CLI breaking CI builds
-* [ARROW-3977](https://issues.apache.org/jira/browse/ARROW-3977) - [Gandiva] gandiva cpp tests not running in CI
-* [ARROW-3979](https://issues.apache.org/jira/browse/ARROW-3979) - [Gandiva] fix all valgrind reported errors
-* [ARROW-3980](https://issues.apache.org/jira/browse/ARROW-3980) - [C++] Fix CRTP use in json-simple.cc
-* [ARROW-3989](https://issues.apache.org/jira/browse/ARROW-3989) - [Rust] CSV reader should handle case sensitivity for boolean values
-* [ARROW-3996](https://issues.apache.org/jira/browse/ARROW-3996) - [C++] Insufficient description on build
-* [ARROW-4008](https://issues.apache.org/jira/browse/ARROW-4008) - [C++] Integration test executable failure
-* [ARROW-4011](https://issues.apache.org/jira/browse/ARROW-4011) - [Gandiva] Refer irhelpers.bc in build directory
-* [ARROW-4019](https://issues.apache.org/jira/browse/ARROW-4019) - [C++] Fix coverity issues
-* [ARROW-4033](https://issues.apache.org/jira/browse/ARROW-4033) - [C++] thirdparty/download\_dependencies.sh uses tools or options not available in older Linuxes
-* [ARROW-4034](https://issues.apache.org/jira/browse/ARROW-4034) - [Ruby] Interface for FileOutputStream doesn't respect append=True
-* [ARROW-4041](https://issues.apache.org/jira/browse/ARROW-4041) - [CI] Python 2.7 run uses Python 3.6
-* [ARROW-4049](https://issues.apache.org/jira/browse/ARROW-4049) - [C++] Arrow never uses glog even though glog is linked
-* [ARROW-4052](https://issues.apache.org/jira/browse/ARROW-4052) - [C++] Linker errors with glog and gflags
-* [ARROW-4053](https://issues.apache.org/jira/browse/ARROW-4053) - [Python/Integration] HDFS Tests failing with I/O operation on closed file
-* [ARROW-4055](https://issues.apache.org/jira/browse/ARROW-4055) - [Python] Fails to convert pytz.utc with versions 2018.3 and earlier
-* [ARROW-4058](https://issues.apache.org/jira/browse/ARROW-4058) - [C++] arrow-io-hdfs-test fails when run against HDFS cluster from docker-compose
-* [ARROW-4065](https://issues.apache.org/jira/browse/ARROW-4065) - [C++] arrowTargets.cmake is broken
-* [ARROW-4066](https://issues.apache.org/jira/browse/ARROW-4066) - Instructions to create Sphinx documentation
-* [ARROW-4070](https://issues.apache.org/jira/browse/ARROW-4070) - [C++] ARROW\_BOOST\_VENDORED doesn't work properly with ninja build
-* [ARROW-4073](https://issues.apache.org/jira/browse/ARROW-4073) - [Python] Parquet test failures on AppVeyor
-* [ARROW-4074](https://issues.apache.org/jira/browse/ARROW-4074) - [Python] test\_get\_library\_dirs\_win32 fails if libraries installed someplace different from conda or wheel packages
-* [ARROW-4078](https://issues.apache.org/jira/browse/ARROW-4078) - [CI] Run Travis job where documentation is built when docs/ is changed
-* [ARROW-4088](https://issues.apache.org/jira/browse/ARROW-4088) - [Python] Table.from\_batches() fails when passed a schema with metadata
-* [ARROW-4089](https://issues.apache.org/jira/browse/ARROW-4089) - [Plasma] The tutorial is wrong regarding the parameter type of PlasmaClient.Create
-* [ARROW-4101](https://issues.apache.org/jira/browse/ARROW-4101) - [C++] Binary identity cast not implemented
-* [ARROW-4106](https://issues.apache.org/jira/browse/ARROW-4106) - [Python] Tests fail to run because hypothesis update broke its API
-* [ARROW-4109](https://issues.apache.org/jira/browse/ARROW-4109) - [Packaging] Missing glog dependency from arrow-cpp conda recipe
-* [ARROW-4113](https://issues.apache.org/jira/browse/ARROW-4113) - [R] Version number patch broke build
-* [ARROW-4114](https://issues.apache.org/jira/browse/ARROW-4114) - [C++][DOCUMENTATION] Add "python" to Linux build instructions
-* [ARROW-4115](https://issues.apache.org/jira/browse/ARROW-4115) - [Gandiva] valgrind complains that boolean output data buffer has uninitialized data
-* [ARROW-4118](https://issues.apache.org/jira/browse/ARROW-4118) - [Python] Error with "asv run"
-* [ARROW-4125](https://issues.apache.org/jira/browse/ARROW-4125) - [Python] ASV benchmarks fail to run if Plasma extension is not built (e.g. on Windows)
-* [ARROW-4126](https://issues.apache.org/jira/browse/ARROW-4126) - [Go] offset not used when accessing boolean array
-* [ARROW-4128](https://issues.apache.org/jira/browse/ARROW-4128) - [C++][DOCUMENTATION] Update style guide to reflect some more exceptions
-* [ARROW-4130](https://issues.apache.org/jira/browse/ARROW-4130) - [Go] offset not used when accessing binary array
-* [ARROW-4134](https://issues.apache.org/jira/browse/ARROW-4134) - [Packaging] Properly setup timezone in docker tests to prevent ORC adapter's abort
-* [ARROW-4135](https://issues.apache.org/jira/browse/ARROW-4135) - [Python] Can't reload a pandas dataframe containing a list of datetime.time
-* [ARROW-4137](https://issues.apache.org/jira/browse/ARROW-4137) - [Rust] Move parquet code into a separate crate
-* [ARROW-4138](https://issues.apache.org/jira/browse/ARROW-4138) - [Python] setuptools\_scm customization does not work for versions above 0.9.0 on Windows
-* [ARROW-4147](https://issues.apache.org/jira/browse/ARROW-4147) - [Java] Reduce heap usage for variable width vectors
-* [ARROW-4149](https://issues.apache.org/jira/browse/ARROW-4149) - [CI/C++] Parquet test misses ZSTD compression codec in CMake 3.2 nightly builds
-* [ARROW-4157](https://issues.apache.org/jira/browse/ARROW-4157) - [C++] -Wdocumentation failures with clang 6.0 on Ubuntu 18.04
-* [ARROW-4171](https://issues.apache.org/jira/browse/ARROW-4171) - [Rust] fix parquet crate release version
-* [ARROW-4173](https://issues.apache.org/jira/browse/ARROW-4173) - JIRA library name is wrong in error message of dev/merge\_arrow\_pr.py
-* [ARROW-4178](https://issues.apache.org/jira/browse/ARROW-4178) - [C++] Fix TSan and UBSan errors
-* [ARROW-4179](https://issues.apache.org/jira/browse/ARROW-4179) - [Python] Tests crashing on all platforms in CI
-* [ARROW-4182](https://issues.apache.org/jira/browse/ARROW-4182) - [Python][CI] SEGV frequency
-* [ARROW-4185](https://issues.apache.org/jira/browse/ARROW-4185) - [Rust] AppVeyor builds are broken
-* [ARROW-4186](https://issues.apache.org/jira/browse/ARROW-4186) - [C++] BitmapWriters clobber the first byte when length=0
-* [ARROW-4188](https://issues.apache.org/jira/browse/ARROW-4188) - [Rust] There should be a README in the top level rust directory
-* [ARROW-4197](https://issues.apache.org/jira/browse/ARROW-4197) - [C++] Emscripten compiler fails building Arrow
-* [ARROW-4200](https://issues.apache.org/jira/browse/ARROW-4200) - [C++] conda\_env\_\* files cannot be used to create a fresh conda environment on Windows
-* [ARROW-4209](https://issues.apache.org/jira/browse/ARROW-4209) - [Gandiva] returning IR structs causes issues with Windows
-* [ARROW-4215](https://issues.apache.org/jira/browse/ARROW-4215) - [GLib] Fix typos in documentation
-* [ARROW-4227](https://issues.apache.org/jira/browse/ARROW-4227) - [GLib] Field in composite data type returns wrong data type
-* [ARROW-4237](https://issues.apache.org/jira/browse/ARROW-4237) - [Packaging] Fix CMAKE\_INSTALL\_LIBDIR in release verification script
-* [ARROW-4238](https://issues.apache.org/jira/browse/ARROW-4238) - [Packaging] Fix RC version conflict between crossbow and rake
-* [ARROW-4246](https://issues.apache.org/jira/browse/ARROW-4246) - [Plasma][Python] PlasmaClient.list doesn't work with CUDA enabled Plasma
-* [ARROW-4256](https://issues.apache.org/jira/browse/ARROW-4256) - [Release] Update Windows verification script for 0.12 release
-* [ARROW-4258](https://issues.apache.org/jira/browse/ARROW-4258) - [Python] Safe cast fails from numpy float64 array with nans to integer
-* [ARROW-4260](https://issues.apache.org/jira/browse/ARROW-4260) - [Python] test\_serialize\_deserialize\_pandas is failing in multiple build entries
-* [PARQUET-1426](https://issues.apache.org/jira/browse/PARQUET-1426) - [C++] parquet-dump-schema has poor usability
-* [PARQUET-1458](https://issues.apache.org/jira/browse/PARQUET-1458) - [C++] parquet::CompressionToString not recognizing brotli compression
-* [PARQUET-1469](https://issues.apache.org/jira/browse/PARQUET-1469) - [C++] DefinitionLevelsToBitmap can overwrite prior decoded data
-* [PARQUET-1471](https://issues.apache.org/jira/browse/PARQUET-1471) - [C++] Out of bounds access in statistics UpdateSpaced when writing optional list with null list slots
-* [PARQUET-1481](https://issues.apache.org/jira/browse/PARQUET-1481) - [C++] SEGV when reading corrupt parquet file
-
-
-
-# Apache Arrow 0.11.1 (2018-10-23)
-
-## New Features and Improvements
-
-* [ARROW-3353](https://issues.apache.org/jira/browse/ARROW-3353) - [Packaging] Build python 3.7 wheels
-* [ARROW-3534](https://issues.apache.org/jira/browse/ARROW-3534) - [Python] Update zlib library in manylinux1 image
-* [ARROW-3546](https://issues.apache.org/jira/browse/ARROW-3546) - [Python] Provide testing setup to verify wheel binaries work in one or more common Linux distributions
-* [ARROW-3565](https://issues.apache.org/jira/browse/ARROW-3565) - [Python] Pin tensorflow to 1.11.0 in manylinux1 container
-
-
-## Bug Fixes
-
-* [ARROW-3514](https://issues.apache.org/jira/browse/ARROW-3514) - [Python] zlib deflate exception when writing Parquet file
-* [ARROW-3907](https://issues.apache.org/jira/browse/ARROW-3907) - [Python] from\_pandas errors when schemas are used with lower resolution timestamps
-
-
-
-# Apache Arrow 0.11.0 (2018-10-08)
-
-## New Features and Improvements
-
-* [ARROW-25](https://issues.apache.org/jira/browse/ARROW-25) - [C++] Implement delimited file scanner / CSV reader
-* [ARROW-249](https://issues.apache.org/jira/browse/ARROW-249) - [Flight] Define GRPC IDL / wire protocol for messaging with Arrow data
-* [ARROW-614](https://issues.apache.org/jira/browse/ARROW-614) - [C++] Use glog (or some other tool) to print stack traces in debug builds on errors
-* [ARROW-1325](https://issues.apache.org/jira/browse/ARROW-1325) - [R] Bootstrap R bindings subproject
-* [ARROW-1424](https://issues.apache.org/jira/browse/ARROW-1424) - [Python] Initial bindings for libarrow\_gpu
-* [ARROW-1491](https://issues.apache.org/jira/browse/ARROW-1491) - [C++] Add casting implementations from strings to numbers or boolean
-* [ARROW-1521](https://issues.apache.org/jira/browse/ARROW-1521) - [C++] Add Reset method to BufferOutputStream to enable object reuse
-* [ARROW-1563](https://issues.apache.org/jira/browse/ARROW-1563) - [C++] Implement logical unary and binary kernels for boolean arrays
-* [ARROW-1860](https://issues.apache.org/jira/browse/ARROW-1860) - [C++] Add data structure to "stage" a sequence of IPC messages from in-memory data
-* [ARROW-1949](https://issues.apache.org/jira/browse/ARROW-1949) - [Python/C++] Add option to Array.from\_pandas and pyarrow.array to perform unsafe casts
-* [ARROW-1963](https://issues.apache.org/jira/browse/ARROW-1963) - [C++/Python] Create Array from sequence of numpy.datetime64
-* [ARROW-1968](https://issues.apache.org/jira/browse/ARROW-1968) - [Python] Unit testing setup for ORC files
-* [ARROW-2165](https://issues.apache.org/jira/browse/ARROW-2165) - enhance AllocatorListener to listen for child allocator addition and removal
-* [ARROW-2338](https://issues.apache.org/jira/browse/ARROW-2338) - [Scripts] Windows release verification script should create a conda environment
-* [ARROW-2352](https://issues.apache.org/jira/browse/ARROW-2352) - [C++/Python] Test OSX packaging in Travis matrix
-* [ARROW-2519](https://issues.apache.org/jira/browse/ARROW-2519) - [Rust] Implement min/max for primitive arrays
-* [ARROW-2520](https://issues.apache.org/jira/browse/ARROW-2520) - [Rust] CI should also build against nightly Rust
-* [ARROW-2555](https://issues.apache.org/jira/browse/ARROW-2555) - [Python] Provide an option to convert on coerce\_timestamps instead of error
-* [ARROW-2583](https://issues.apache.org/jira/browse/ARROW-2583) - [Rust] Buffer should be typeless
-* [ARROW-2617](https://issues.apache.org/jira/browse/ARROW-2617) - [Rust] Schema should contain fields not columns
-* [ARROW-2687](https://issues.apache.org/jira/browse/ARROW-2687) - [JS] Example usage in README is outdated
-* [ARROW-2734](https://issues.apache.org/jira/browse/ARROW-2734) - [Python] Cython api example doesn't work by default on macOS
-* [ARROW-2750](https://issues.apache.org/jira/browse/ARROW-2750) - [MATLAB] Add MATLAB support for reading numeric types from Feather files
-* [ARROW-2799](https://issues.apache.org/jira/browse/ARROW-2799) - [Python] Add safe option to Table.from\_pandas to avoid unsafe casts
-* [ARROW-2813](https://issues.apache.org/jira/browse/ARROW-2813) - [C++] Strip uninformative lcov output from Travis CI logs
-* [ARROW-2817](https://issues.apache.org/jira/browse/ARROW-2817) - [C++] Enable libraries to be installed in msys2 on Windows
-* [ARROW-2840](https://issues.apache.org/jira/browse/ARROW-2840) - [C++] See if stream alignment logic can be simplified
-* [ARROW-2865](https://issues.apache.org/jira/browse/ARROW-2865) - [C++/Python] Reduce some duplicated code in python/builtin\_convert.cc
-* [ARROW-2889](https://issues.apache.org/jira/browse/ARROW-2889) - [C++] Add optional argument to ADD\_ARROW\_TEST CMake function to add unit test prefix
-* [ARROW-2900](https://issues.apache.org/jira/browse/ARROW-2900) - [Python] Improve performance of appending nested NumPy arrays in builtin\_convert.cc
-* [ARROW-2936](https://issues.apache.org/jira/browse/ARROW-2936) - [Python] Implement Table.cast for casting from one schema to another (if possible)
-* [ARROW-2948](https://issues.apache.org/jira/browse/ARROW-2948) - [Packaging] Generate changelog with crossbow
-* [ARROW-2950](https://issues.apache.org/jira/browse/ARROW-2950) - [C++] Clean up util/bit-util.h
-* [ARROW-2952](https://issues.apache.org/jira/browse/ARROW-2952) - [C++] Dockerfile for running include-what-you-use checks
-* [ARROW-2958](https://issues.apache.org/jira/browse/ARROW-2958) - [C++] Flatbuffers EP fails to compile with GCC 8.1
-* [ARROW-2960](https://issues.apache.org/jira/browse/ARROW-2960) - [Packaging] Fix verify-release-candidate for binary packages and fix release cutting script for lib64 cmake issue
-* [ARROW-2964](https://issues.apache.org/jira/browse/ARROW-2964) - [Go] wire all currently implemented array types in array.MakeFromData
-* [ARROW-2971](https://issues.apache.org/jira/browse/ARROW-2971) - [Python] Give more descriptive names to python\_to\_arrow.cc/arrow\_to\_python.cc
-* [ARROW-2972](https://issues.apache.org/jira/browse/ARROW-2972) - [Python] Implement inference logic for uint64 conversions in builtin\_convert.cc
-* [ARROW-2975](https://issues.apache.org/jira/browse/ARROW-2975) - [Plasma] TensorFlow op: Compilation only working if arrow found by pkg-config
-* [ARROW-2976](https://issues.apache.org/jira/browse/ARROW-2976) - [Python] Directory in pyarrow.get\_library\_dirs() on Travis doesn't contain libarrow.so
-* [ARROW-2979](https://issues.apache.org/jira/browse/ARROW-2979) - [GLib] Add operator functions in GArrowDecimal128
-* [ARROW-2983](https://issues.apache.org/jira/browse/ARROW-2983) - [Packaging] Verify source release and binary artifacts in different scripts
-* [ARROW-2989](https://issues.apache.org/jira/browse/ARROW-2989) - [C++] Remove deprecated APIs in 0.10.0 and below
-* [ARROW-2991](https://issues.apache.org/jira/browse/ARROW-2991) - [CI] Cut down number of AppVeyor jobs
-* [ARROW-2994](https://issues.apache.org/jira/browse/ARROW-2994) - [C++] Only include Python C header directories for Python-related compilation units
-* [ARROW-2996](https://issues.apache.org/jira/browse/ARROW-2996) - [C++] Fix typo in cpp/.clang-tidy
-* [ARROW-2998](https://issues.apache.org/jira/browse/ARROW-2998) - [C++] Add variants of AllocateBuffer, AllocateResizeableBuffer that return unique\_ptr<Buffer\>
-* [ARROW-2999](https://issues.apache.org/jira/browse/ARROW-2999) - [Python] Do not run ASV benchmarks in every Travis CI build to improve runtimes
-* [ARROW-3000](https://issues.apache.org/jira/browse/ARROW-3000) - [Python] Do not build unit tests other than python-test in travis\_script\_python.sh
-* [ARROW-3001](https://issues.apache.org/jira/browse/ARROW-3001) - [Packaging] Don't modify PATH during rust release verification
-* [ARROW-3002](https://issues.apache.org/jira/browse/ARROW-3002) - [Python] Implement better DataType hash function
-* [ARROW-3003](https://issues.apache.org/jira/browse/ARROW-3003) - [Doc] Enable Java doc in dev/gen\_apidocs/create\_documents.sh
-* [ARROW-3005](https://issues.apache.org/jira/browse/ARROW-3005) - [Website] Update website and write blog post for 0.10.0 release announcement
-* [ARROW-3008](https://issues.apache.org/jira/browse/ARROW-3008) - [Packaging] Verify GPU related modules if available
-* [ARROW-3009](https://issues.apache.org/jira/browse/ARROW-3009) - [Python] pyarrow.orc uses APIs now prohibited in 0.10.0
-* [ARROW-3010](https://issues.apache.org/jira/browse/ARROW-3010) - [GLib] Update README to use Bundler
-* [ARROW-3017](https://issues.apache.org/jira/browse/ARROW-3017) - [C++] Don't throw exception in arrow/util/thread-pool.h
-* [ARROW-3018](https://issues.apache.org/jira/browse/ARROW-3018) - [Plasma] Improve random ObjectID generation
-* [ARROW-3019](https://issues.apache.org/jira/browse/ARROW-3019) - [Packaging] Use Bundler to verify Arrow GLib
-* [ARROW-3021](https://issues.apache.org/jira/browse/ARROW-3021) - [Go] support for List
-* [ARROW-3022](https://issues.apache.org/jira/browse/ARROW-3022) - [Go] support for Struct
-* [ARROW-3023](https://issues.apache.org/jira/browse/ARROW-3023) - [C++] Use gold linker in builds if it is available
-* [ARROW-3024](https://issues.apache.org/jira/browse/ARROW-3024) - [C++] Replace usages of std::mutex with atomics in memory\_pool.cc
-* [ARROW-3025](https://issues.apache.org/jira/browse/ARROW-3025) - [C++] Add option to switch between dynamic and static linking in unit test executables
-* [ARROW-3026](https://issues.apache.org/jira/browse/ARROW-3026) - [Plasma] Only run Plasma Python unit tests under valgrind once instead of twice in CI
-* [ARROW-3027](https://issues.apache.org/jira/browse/ARROW-3027) - [Ruby] Stop "rake release" from running "git tag"
-* [ARROW-3028](https://issues.apache.org/jira/browse/ARROW-3028) - [Python] Trim unneeded work from documentation build in Travis CI
-* [ARROW-3029](https://issues.apache.org/jira/browse/ARROW-3029) - [Python] pkg\_resources is slow
-* [ARROW-3031](https://issues.apache.org/jira/browse/ARROW-3031) - [Go] Streamline release of Arrays and Builders
-* [ARROW-3033](https://issues.apache.org/jira/browse/ARROW-3033) - [Dev] docker-compose test tooling does not seem to cache built Docker images
-* [ARROW-3034](https://issues.apache.org/jira/browse/ARROW-3034) - [Packaging] Source archive can't be extracted by bsdtar on MSYS2
-* [ARROW-3035](https://issues.apache.org/jira/browse/ARROW-3035) - [Rust] Examples in README.md do not run
-* [ARROW-3036](https://issues.apache.org/jira/browse/ARROW-3036) - [Go] add support for slicing Arrays
-* [ARROW-3037](https://issues.apache.org/jira/browse/ARROW-3037) - [Go] add support for NullArray
-* [ARROW-3042](https://issues.apache.org/jira/browse/ARROW-3042) - [Go] add badge to GoDoc in the Go-Arrow README
-* [ARROW-3043](https://issues.apache.org/jira/browse/ARROW-3043) - [C++] pthread doesn't exist on MinGW
-* [ARROW-3044](https://issues.apache.org/jira/browse/ARROW-3044) - [Python] Remove all occurrences of cython's legacy property definition syntax
-* [ARROW-3045](https://issues.apache.org/jira/browse/ARROW-3045) - [Python] Remove nullcheck from ipc Message and MessageReader
-* [ARROW-3046](https://issues.apache.org/jira/browse/ARROW-3046) - [GLib] Use rubyish method in test-orc-file-reader.rb
-* [ARROW-3050](https://issues.apache.org/jira/browse/ARROW-3050) - [C++] Adopt HiveServer2 client C++ codebase
-* [ARROW-3051](https://issues.apache.org/jira/browse/ARROW-3051) - [C++] Status performance optimization from Impala/Kudu
-* [ARROW-3057](https://issues.apache.org/jira/browse/ARROW-3057) - [INTEGRATION] Fix spark and hdfs dockerfiles
-* [ARROW-3059](https://issues.apache.org/jira/browse/ARROW-3059) - [C++] Streamline namespace array::test
-* [ARROW-3060](https://issues.apache.org/jira/browse/ARROW-3060) - [C++] Factor out parsing routines
-* [ARROW-3062](https://issues.apache.org/jira/browse/ARROW-3062) - [Python] Extend fast libtensorflow\_framework.so compatibility workaround to Python 2.7
-* [ARROW-3064](https://issues.apache.org/jira/browse/ARROW-3064) - [C++] Add option to ADD\_ARROW\_TEST to indicate additional dependencies for particular unit test executables
-* [ARROW-3067](https://issues.apache.org/jira/browse/ARROW-3067) - [Packaging] Support dev/rc/release .deb/.rpm builds
-* [ARROW-3068](https://issues.apache.org/jira/browse/ARROW-3068) - [Packaging] Bump version to 0.11.0-SNAPSHOT
-* [ARROW-3069](https://issues.apache.org/jira/browse/ARROW-3069) - [Release] Stop using SHA1 checksums per ASF policy
-* [ARROW-3072](https://issues.apache.org/jira/browse/ARROW-3072) - [C++] Use ARROW\_RETURN\_NOT\_OK instead of RETURN\_NOT\_OK in header files
-* [ARROW-3075](https://issues.apache.org/jira/browse/ARROW-3075) - [C++] Incorporate apache/parquet-cpp codebase into Arrow C++ codebase and build system
-* [ARROW-3076](https://issues.apache.org/jira/browse/ARROW-3076) - [Website] Add Google Analytics tags to C++, Python API docs
-* [ARROW-3088](https://issues.apache.org/jira/browse/ARROW-3088) - [Rust] Use internal \`Result<T\>\` type instead of \`Result<T, ArrowError\>\`
-* [ARROW-3090](https://issues.apache.org/jira/browse/ARROW-3090) - [Rust] Accompany error messages with assertions
-* [ARROW-3094](https://issues.apache.org/jira/browse/ARROW-3094) - [Python] Allow lighter construction of pa.Schema / pa.StructType
-* [ARROW-3099](https://issues.apache.org/jira/browse/ARROW-3099) - [C++] Add benchmark for number parsing
-* [ARROW-3105](https://issues.apache.org/jira/browse/ARROW-3105) - [Plasma] Improve flushing error message
-* [ARROW-3106](https://issues.apache.org/jira/browse/ARROW-3106) - [Website] Update committers and PMC roster on website
-* [ARROW-3109](https://issues.apache.org/jira/browse/ARROW-3109) - [Python] Add Python 3.7 virtualenvs to manylinux1 container
-* [ARROW-3110](https://issues.apache.org/jira/browse/ARROW-3110) - [C++] Compilation warnings with gcc 7.3.0
-* [ARROW-3111](https://issues.apache.org/jira/browse/ARROW-3111) - [Java] Enable changing default logging level when running tests
-* [ARROW-3114](https://issues.apache.org/jira/browse/ARROW-3114) - [Website] Add information about user@ mailing list to website / Community page
-* [ARROW-3115](https://issues.apache.org/jira/browse/ARROW-3115) - [Java] Style Checks - Fix import ordering
-* [ARROW-3116](https://issues.apache.org/jira/browse/ARROW-3116) - [Plasma] Add "ls" to object store
-* [ARROW-3117](https://issues.apache.org/jira/browse/ARROW-3117) - [GLib] Add garrow\_chunked\_array\_to\_string()
-* [ARROW-3119](https://issues.apache.org/jira/browse/ARROW-3119) - [Packaging] Nightly packaging script fails
-* [ARROW-3127](https://issues.apache.org/jira/browse/ARROW-3127) - [C++] Add Tutorial about Sending Tensor from C++ to Python
-* [ARROW-3128](https://issues.apache.org/jira/browse/ARROW-3128) - [C++] Support system shared zlib
-* [ARROW-3129](https://issues.apache.org/jira/browse/ARROW-3129) - [Packaging] Stop using deprecated BuildRoot and Group in .rpm
-* [ARROW-3130](https://issues.apache.org/jira/browse/ARROW-3130) - [Go] add initial support for Go modules
-* [ARROW-3136](https://issues.apache.org/jira/browse/ARROW-3136) - [C++] Clean up arrow:: public API
-* [ARROW-3142](https://issues.apache.org/jira/browse/ARROW-3142) - [C++] Fetch all libs from toolchain environment
-* [ARROW-3143](https://issues.apache.org/jira/browse/ARROW-3143) - [C++] CopyBitmap into existing memory
-* [ARROW-3146](https://issues.apache.org/jira/browse/ARROW-3146) - [C++] Barebones Flight RPC server and client implementations
-* [ARROW-3147](https://issues.apache.org/jira/browse/ARROW-3147) - [C++] MSVC version isn't detected in code page 932
-* [ARROW-3148](https://issues.apache.org/jira/browse/ARROW-3148) - [C++] MSVC shows C4819 warning on code page 932
-* [ARROW-3152](https://issues.apache.org/jira/browse/ARROW-3152) - [C++][Packaging] Use dynamic linking for zlib in conda recipes
-* [ARROW-3153](https://issues.apache.org/jira/browse/ARROW-3153) - [Packaging] Fix broken nightly package builds introduced with recent cmake changes and orc tests
-* [ARROW-3157](https://issues.apache.org/jira/browse/ARROW-3157) - [C++] Improve buffer creation for typed data
-* [ARROW-3158](https://issues.apache.org/jira/browse/ARROW-3158) - [C++] Handle float truncation during casting
-* [ARROW-3160](https://issues.apache.org/jira/browse/ARROW-3160) - [Python] Improve pathlib.Path support in parquet and filesystem modules
-* [ARROW-3163](https://issues.apache.org/jira/browse/ARROW-3163) - [Python] Cython dependency is missing in non wheel package
-* [ARROW-3167](https://issues.apache.org/jira/browse/ARROW-3167) - [CI] Limit clcache cache size
-* [ARROW-3168](https://issues.apache.org/jira/browse/ARROW-3168) - [C++] Restore pkgconfig for Parquet C++ libraries
-* [ARROW-3170](https://issues.apache.org/jira/browse/ARROW-3170) - [C++] Implement "readahead spooler" class for background input buffering
-* [ARROW-3171](https://issues.apache.org/jira/browse/ARROW-3171) - [Java] checkstyle - fix line length and indentation
-* [ARROW-3172](https://issues.apache.org/jira/browse/ARROW-3172) - [Rust] Update documentation for datatypes.rs
-* [ARROW-3174](https://issues.apache.org/jira/browse/ARROW-3174) - [Rust] run examples as part of CI
-* [ARROW-3177](https://issues.apache.org/jira/browse/ARROW-3177) - [Rust] Update expected error messages for tests that 'should panic'
-* [ARROW-3180](https://issues.apache.org/jira/browse/ARROW-3180) - [C++] Add docker-compose setup to simulate Travis CI run locally
-* [ARROW-3181](https://issues.apache.org/jira/browse/ARROW-3181) - [Packaging] Adjust conda package scripts to account for Parquet codebase migration
-* [ARROW-3182](https://issues.apache.org/jira/browse/ARROW-3182) - [C++] Merge Gandiva codebase
-* [ARROW-3187](https://issues.apache.org/jira/browse/ARROW-3187) - [Plasma] Make Plasma Log pluggable with glog
-* [ARROW-3195](https://issues.apache.org/jira/browse/ARROW-3195) - [C++] NumPy initialization error check is missing in test
-* [ARROW-3196](https://issues.apache.org/jira/browse/ARROW-3196) - Enable merge\_arrow\_pr.py script to merge Parquet patches and set fix versions
-* [ARROW-3197](https://issues.apache.org/jira/browse/ARROW-3197) - [C++] Add instructions to cpp/README.md about Parquet-only development and Arrow+Parquet
-* [ARROW-3198](https://issues.apache.org/jira/browse/ARROW-3198) - [Website] Blog post for 0.11 release
-* [ARROW-3211](https://issues.apache.org/jira/browse/ARROW-3211) - [C++] gold linker doesn't work with MinGW-w64
-* [ARROW-3212](https://issues.apache.org/jira/browse/ARROW-3212) - [C++] Create deterministic IPC metadata
-* [ARROW-3213](https://issues.apache.org/jira/browse/ARROW-3213) - [C++] Use CMake to build vendored Snappy on Windows
-* [ARROW-3214](https://issues.apache.org/jira/browse/ARROW-3214) - [C++] Disable insecure warnings with MinGW build
-* [ARROW-3215](https://issues.apache.org/jira/browse/ARROW-3215) - [C++] Add support for finding libpython on MSYS2
-* [ARROW-3216](https://issues.apache.org/jira/browse/ARROW-3216) - [C++] libpython isn't linked to libarrow\_python in MinGW build
-* [ARROW-3217](https://issues.apache.org/jira/browse/ARROW-3217) - [C++] ARROW\_STATIC definition is missing in MinGW build
-* [ARROW-3218](https://issues.apache.org/jira/browse/ARROW-3218) - [C++] Utilities has needless pthread link in MinGW build
-* [ARROW-3219](https://issues.apache.org/jira/browse/ARROW-3219) - [C++] Use Win32 API in MinGW
-* [ARROW-3223](https://issues.apache.org/jira/browse/ARROW-3223) - [GLib] Use the same shared object versioning rule as C++
-* [ARROW-3229](https://issues.apache.org/jira/browse/ARROW-3229) - [Packaging]: Adjust wheel package scripts to account for Parquet codebase migration
-* [ARROW-3234](https://issues.apache.org/jira/browse/ARROW-3234) - [C++] Link order is wrong when ARROW\_ORC=on and ARROW\_PROTOBUF\_USE\_SHARED=ON
-* [ARROW-3235](https://issues.apache.org/jira/browse/ARROW-3235) - [Packaging] Update deb names
-* [ARROW-3236](https://issues.apache.org/jira/browse/ARROW-3236) - [C++] OutputStream bookkeeping logic when writing IPC file format is incorrect
-* [ARROW-3240](https://issues.apache.org/jira/browse/ARROW-3240) - [GLib] Add build instructions using Meson
-* [ARROW-3242](https://issues.apache.org/jira/browse/ARROW-3242) - [C++] Use coarser-grained dispatch to SIMD hash functions
-* [ARROW-3249](https://issues.apache.org/jira/browse/ARROW-3249) - [Python] Run flake8 on integration\_test.py and crossbow.py
-* [ARROW-3250](https://issues.apache.org/jira/browse/ARROW-3250) - [C++] Create Buffer implementation that takes ownership for the memory from a std::string via std::move
-* [ARROW-3252](https://issues.apache.org/jira/browse/ARROW-3252) - [C++] Do not hard code the "v" part of versions in thirdparty toolchain
-* [ARROW-3257](https://issues.apache.org/jira/browse/ARROW-3257) - [C++] Stop using IMPORTED\_LINK\_INTERFACE\_LIBRARIES
-* [ARROW-3258](https://issues.apache.org/jira/browse/ARROW-3258) - [GLib] CI is failing on macOS
-* [ARROW-3259](https://issues.apache.org/jira/browse/ARROW-3259) - [GLib] Rename "writeable" to "writable"
-* [ARROW-3261](https://issues.apache.org/jira/browse/ARROW-3261) - [Python] Add "field" method to select fields from StructArray
-* [ARROW-3262](https://issues.apache.org/jira/browse/ARROW-3262) - [Python] Implement \_\_getitem\_\_ with integers on pyarrow.Column
-* [ARROW-3264](https://issues.apache.org/jira/browse/ARROW-3264) - [Java] checkstyle - fix whitespace
-* [ARROW-3267](https://issues.apache.org/jira/browse/ARROW-3267) - [Python] Create empty table from schema
-* [ARROW-3268](https://issues.apache.org/jira/browse/ARROW-3268) - [CI] Reduce conda times on AppVeyor
-* [ARROW-3269](https://issues.apache.org/jira/browse/ARROW-3269) - [Python] Fix warnings in unit test suite
-* [ARROW-3270](https://issues.apache.org/jira/browse/ARROW-3270) - [Release] Adjust release verification scripts to recent parquet migration
-* [ARROW-3274](https://issues.apache.org/jira/browse/ARROW-3274) - [Packaging] Missing glog dependency from conda-forge recipes
-* [ARROW-3276](https://issues.apache.org/jira/browse/ARROW-3276) - [Packaging] Add support for Parquet-related Linux packages
-* [ARROW-3281](https://issues.apache.org/jira/browse/ARROW-3281) - [Java] Make sure that WritableByteChannel in WriteChannel writes out complete bytes
-* [ARROW-3282](https://issues.apache.org/jira/browse/ARROW-3282) - [R] initial R functionality
-* [ARROW-3284](https://issues.apache.org/jira/browse/ARROW-3284) - [R] Adding R Error in Status
-* [ARROW-3285](https://issues.apache.org/jira/browse/ARROW-3285) - [GLib] Add arrow\_cpp\_build\_type and arrow\_cpp\_build\_dir Meson options
-* [ARROW-3286](https://issues.apache.org/jira/browse/ARROW-3286) - [C++] ARROW\_EXPORT for RecordBatchBuilder is missing
-* [ARROW-3287](https://issues.apache.org/jira/browse/ARROW-3287) - [C++] "redeclared without dllimport attribute after being referenced with dll linkage" with MinGW
-* [ARROW-3288](https://issues.apache.org/jira/browse/ARROW-3288) - [GLib] Add new API index for 0.11.0
-* [ARROW-3300](https://issues.apache.org/jira/browse/ARROW-3300) - [Release] Update .deb package names in preparation
-* [ARROW-3301](https://issues.apache.org/jira/browse/ARROW-3301) - [Website] Update Jekyll and Bootstrap 4
-* [ARROW-3305](https://issues.apache.org/jira/browse/ARROW-3305) - [JS] Incorrect development documentation link in javascript readme
-* [ARROW-3309](https://issues.apache.org/jira/browse/ARROW-3309) - [JS] Missing links from DEVELOP.md
-* [ARROW-3313](https://issues.apache.org/jira/browse/ARROW-3313) - [R] Run clang-format, cpplint checks on R C++ code
-* [ARROW-3319](https://issues.apache.org/jira/browse/ARROW-3319) - [GLib] Expose AlignStream methods in InputStream, OutputStream classes
-* [ARROW-3320](https://issues.apache.org/jira/browse/ARROW-3320) - [C++] Improve float parsing performance
-* [ARROW-3321](https://issues.apache.org/jira/browse/ARROW-3321) - [C++] Improve integer parsing performance
-* [ARROW-3334](https://issues.apache.org/jira/browse/ARROW-3334) - [Python] Update conda packages to new numpy requirement
-* [ARROW-3335](https://issues.apache.org/jira/browse/ARROW-3335) - [Python] Add ccache to manylinux1 container
-* [ARROW-3339](https://issues.apache.org/jira/browse/ARROW-3339) - [R] Support for character vectors
-* [ARROW-3341](https://issues.apache.org/jira/browse/ARROW-3341) - [R] Support for logical vector
-* [ARROW-3349](https://issues.apache.org/jira/browse/ARROW-3349) - [C++] Use aligned API in MinGW
-* [ARROW-3350](https://issues.apache.org/jira/browse/ARROW-3350) - [Website] Fix powered by links
-* [ARROW-3352](https://issues.apache.org/jira/browse/ARROW-3352) - [Packaging] Fix recently failing wheel builds
-* [ARROW-3356](https://issues.apache.org/jira/browse/ARROW-3356) - [Python] Document parameters of Table.to\_pandas method
-* [ARROW-3357](https://issues.apache.org/jira/browse/ARROW-3357) - [Rust] Add a mutable buffer implementation
-* [ARROW-3360](https://issues.apache.org/jira/browse/ARROW-3360) - [GLib] Import Parquet bindings
-* [ARROW-3363](https://issues.apache.org/jira/browse/ARROW-3363) - [C++/Python] Add helper functions to detect scalar Python types
-* [ARROW-3371](https://issues.apache.org/jira/browse/ARROW-3371) - [Python] Remove check\_metadata argument for Field.equals docstring
-* [ARROW-3375](https://issues.apache.org/jira/browse/ARROW-3375) - [Rust] Remove memory\_pool.rs
-* [ARROW-3376](https://issues.apache.org/jira/browse/ARROW-3376) - [C++] Add double-conversion to cpp/thirdparty/download\_dependencies.sh
-* [ARROW-3377](https://issues.apache.org/jira/browse/ARROW-3377) - [Gandiva][C++] Remove If statement from bit map set function
-* [ARROW-3382](https://issues.apache.org/jira/browse/ARROW-3382) - [C++] Run Gandiva tests in Travis CI
-* [ARROW-3392](https://issues.apache.org/jira/browse/ARROW-3392) - [Python] Support filters in disjunctive normal form in ParquetDataset
-* [ARROW-3395](https://issues.apache.org/jira/browse/ARROW-3395) - [C++/Python] Add docker container for linting
-* [ARROW-3397](https://issues.apache.org/jira/browse/ARROW-3397) - [C++] Use relative CMake path for modules
-* [ARROW-3400](https://issues.apache.org/jira/browse/ARROW-3400) - [Packaging] Add support for Parquet GLib-related Linux packages
-* [ARROW-3404](https://issues.apache.org/jira/browse/ARROW-3404) - [C++] Make CSV chunker faster
-* [ARROW-3411](https://issues.apache.org/jira/browse/ARROW-3411) - [Packaging] dev/release/01-perform.sh doesn't have executable bit
-* [ARROW-3412](https://issues.apache.org/jira/browse/ARROW-3412) - [Packaging] rat failure in dev/release/02-source.sh
-* [ARROW-3413](https://issues.apache.org/jira/browse/ARROW-3413) - [Packaging] dev/release/02-source.sh doesn't generate Parquet GLib document
-* [ARROW-3415](https://issues.apache.org/jira/browse/ARROW-3415) - [Packaging] dev/release/verify-release-candidate.sh fails in "conda activate arrow-test"
-* [ARROW-3416](https://issues.apache.org/jira/browse/ARROW-3416) - [Packaging] dev/release/02-source.sh must use SHA512 instead of SHA1
-* [ARROW-3417](https://issues.apache.org/jira/browse/ARROW-3417) - [Packaging] dev/release/verify-release-candidate.sh fails Parquet C++ test
-* [ARROW-3418](https://issues.apache.org/jira/browse/ARROW-3418) - [C++] Update Parquet snapshot version for release
-* [ARROW-3423](https://issues.apache.org/jira/browse/ARROW-3423) - [Packaging] Remove RC information from deb/rpm
-* [ARROW-3443](https://issues.apache.org/jira/browse/ARROW-3443) - [Java] Flight reports memory leaks in TestBasicOperation
-* [PARQUET-169](https://issues.apache.org/jira/browse/PARQUET-169) - Parquet-cpp: Implement support for bulk reading and writing repetition/definition levels.
-* [PARQUET-267](https://issues.apache.org/jira/browse/PARQUET-267) - Detach thirdparty code from build configuration.
-* [PARQUET-416](https://issues.apache.org/jira/browse/PARQUET-416) - C++11, cpplint cleanup, package target and header installation
-* [PARQUET-418](https://issues.apache.org/jira/browse/PARQUET-418) - Add a utility to print contents of a Parquet file to stdout
-* [PARQUET-428](https://issues.apache.org/jira/browse/PARQUET-428) - Support INT96 and FIXED\_LEN\_BYTE\_ARRAY types
-* [PARQUET-434](https://issues.apache.org/jira/browse/PARQUET-434) - Add a ParquetFileReader class to encapsulate some low-level details of interacting with Parquet files
-* [PARQUET-435](https://issues.apache.org/jira/browse/PARQUET-435) - Provide vectorized ColumnReader interface
-* [PARQUET-436](https://issues.apache.org/jira/browse/PARQUET-436) - Implement ParquetFileWriter class entry point for generating new Parquet files
-* [PARQUET-437](https://issues.apache.org/jira/browse/PARQUET-437) - Incorporate googletest thirdparty dependency and add cmake tools (ADD\_PARQUET\_TEST) to simplify adding new unit tests
-* [PARQUET-438](https://issues.apache.org/jira/browse/PARQUET-438) - Update RLE encoder/decoder modules from Impala upstream changes and adapt unit tests
-* [PARQUET-439](https://issues.apache.org/jira/browse/PARQUET-439) - Conform all copyright headers to ASF requirements
-* [PARQUET-442](https://issues.apache.org/jira/browse/PARQUET-442) - Convert flat SchemaElement vector to implied nested schema data structure
-* [PARQUET-448](https://issues.apache.org/jira/browse/PARQUET-448) - Add cmake option to skip building the unit tests
-* [PARQUET-449](https://issues.apache.org/jira/browse/PARQUET-449) - Update to latest parquet.thrift
-* [PARQUET-451](https://issues.apache.org/jira/browse/PARQUET-451) - Add a RowGroup reader interface class
-* [PARQUET-456](https://issues.apache.org/jira/browse/PARQUET-456) - Add zlib codec support
-* [PARQUET-463](https://issues.apache.org/jira/browse/PARQUET-463) - Add DCHECK\* macros for assertions in debug builds
-* [PARQUET-468](https://issues.apache.org/jira/browse/PARQUET-468) - Add a cmake option to generate the Parquet thrift headers with the thriftc in the environment
-* [PARQUET-477](https://issues.apache.org/jira/browse/PARQUET-477) - Enable clang-format check during the Travis CI build
-* [PARQUET-482](https://issues.apache.org/jira/browse/PARQUET-482) - Organize src code file structure to have a very clear folder with public headers.
-* [PARQUET-485](https://issues.apache.org/jira/browse/PARQUET-485) - Decouple data page delimiting from column reader / scanner classes, create test fixtures
-* [PARQUET-488](https://issues.apache.org/jira/browse/PARQUET-488) - Add SSE-related cmake options to manage compiler flags
-* [PARQUET-489](https://issues.apache.org/jira/browse/PARQUET-489) - Add visibility macros to be used for public and internal APIs of libparquet
-* [PARQUET-494](https://issues.apache.org/jira/browse/PARQUET-494) - Implement PLAIN\_DICTIONARY encoding and decoding
-* [PARQUET-496](https://issues.apache.org/jira/browse/PARQUET-496) - Fix cpplint configuration to be more restrictive
-* [PARQUET-497](https://issues.apache.org/jira/browse/PARQUET-497) - Decouple Parquet physical file structure from FileReader class
-* [PARQUET-499](https://issues.apache.org/jira/browse/PARQUET-499) - Complete PlainEncoder implementation for all primitive types and test end to end
-* [PARQUET-501](https://issues.apache.org/jira/browse/PARQUET-501) - Add an OutputStream abstraction (capable of memory allocation) for Encoder public API
-* [PARQUET-503](https://issues.apache.org/jira/browse/PARQUET-503) - Re-enable parquet 2.0 encodings
-* [PARQUET-508](https://issues.apache.org/jira/browse/PARQUET-508) - Add ParquetFilePrinter
-* [PARQUET-512](https://issues.apache.org/jira/browse/PARQUET-512) - Add optional google/benchmark 3rd-party dependency for performance testing
-* [PARQUET-515](https://issues.apache.org/jira/browse/PARQUET-515) - Add "Reset" to LevelEncoder and LevelDecoder
-* [PARQUET-518](https://issues.apache.org/jira/browse/PARQUET-518) - Review usages of size\_t and unsigned integers generally per Google style guide
-* [PARQUET-519](https://issues.apache.org/jira/browse/PARQUET-519) - Disable compiler warning suppressions and fix all DEBUG build warnings
-* [PARQUET-520](https://issues.apache.org/jira/browse/PARQUET-520) - Add version of LocalFileSource that uses memory-mapping for zero-copy reads
-* [PARQUET-533](https://issues.apache.org/jira/browse/PARQUET-533) - Simplify RandomAccessSource API to combine Seek/Read
-* [PARQUET-538](https://issues.apache.org/jira/browse/PARQUET-538) - Improve ColumnReader Tests
-* [PARQUET-542](https://issues.apache.org/jira/browse/PARQUET-542) - Support memory allocation from external memory
-* [PARQUET-545](https://issues.apache.org/jira/browse/PARQUET-545) - Improve API to support Decimal type
-* [PARQUET-547](https://issues.apache.org/jira/browse/PARQUET-547) - Refactor most templates to use DataType structs rather than the Type::type enum
-* [PARQUET-551](https://issues.apache.org/jira/browse/PARQUET-551) - Handle compiler warnings due to disabled DCHECKs in release builds
-* [PARQUET-556](https://issues.apache.org/jira/browse/PARQUET-556) - Extend RowGroupStatistics to include "min" "max" statistics
-* [PARQUET-559](https://issues.apache.org/jira/browse/PARQUET-559) - Enable InputStream as a source to the ParquetFileReader
-* [PARQUET-564](https://issues.apache.org/jira/browse/PARQUET-564) - Add option to run unit tests with valgrind --tool=memcheck
-* [PARQUET-566](https://issues.apache.org/jira/browse/PARQUET-566) - Add method to retrieve the full column path
-* [PARQUET-568](https://issues.apache.org/jira/browse/PARQUET-568) - Read only specified top-level columns in DebugPrint
-* [PARQUET-572](https://issues.apache.org/jira/browse/PARQUET-572) - Rename parquet\_cpp namespace to parquet
-* [PARQUET-573](https://issues.apache.org/jira/browse/PARQUET-573) - C++: Create a public API for reading and writing file metadata
-* [PARQUET-582](https://issues.apache.org/jira/browse/PARQUET-582) - Conversion functions for Parquet enums to Thrift enums
-* [PARQUET-583](https://issues.apache.org/jira/browse/PARQUET-583) - Implement Parquet to Thrift schema conversion
-* [PARQUET-587](https://issues.apache.org/jira/browse/PARQUET-587) - Implement BufferReader::Read(int64\_t,uint8\_t\*)
-* [PARQUET-589](https://issues.apache.org/jira/browse/PARQUET-589) - Implement Chunked InMemoryInputStream for better memory usage
-* [PARQUET-592](https://issues.apache.org/jira/browse/PARQUET-592) - Support compressed writes
-* [PARQUET-593](https://issues.apache.org/jira/browse/PARQUET-593) - Add API for writing Page statistics
-* [PARQUET-595](https://issues.apache.org/jira/browse/PARQUET-595) - Add API for key-value metadata
-* [PARQUET-597](https://issues.apache.org/jira/browse/PARQUET-597) - Add data rates to benchmark output
-* [PARQUET-598](https://issues.apache.org/jira/browse/PARQUET-598) - [C++] Test writing all primitive data types
-* [PARQUET-600](https://issues.apache.org/jira/browse/PARQUET-600) - Add benchmarks for RLE-Level encoding
-* [PARQUET-603](https://issues.apache.org/jira/browse/PARQUET-603) - Implement missing information in schema descriptor
-* [PARQUET-605](https://issues.apache.org/jira/browse/PARQUET-605) - Expose schema node in ColumnDescriptor
-* [PARQUET-607](https://issues.apache.org/jira/browse/PARQUET-607) - Public Writer header
-* [PARQUET-610](https://issues.apache.org/jira/browse/PARQUET-610) - Print ColumnMetaData for each RowGroup
-* [PARQUET-616](https://issues.apache.org/jira/browse/PARQUET-616) - C++: WriteBatch should accept const arrays
-* [PARQUET-619](https://issues.apache.org/jira/browse/PARQUET-619) - C++: Add OutputStream for local files
-* [PARQUET-625](https://issues.apache.org/jira/browse/PARQUET-625) - Improve RLE read performance
-* [PARQUET-633](https://issues.apache.org/jira/browse/PARQUET-633) - Add version to WriterProperties
-* [PARQUET-634](https://issues.apache.org/jira/browse/PARQUET-634) - Consistent private linking of dependencies
-* [PARQUET-636](https://issues.apache.org/jira/browse/PARQUET-636) - Expose selection for different encodings
-* [PARQUET-641](https://issues.apache.org/jira/browse/PARQUET-641) - Instantiate stringstream only if needed in SerializedPageReader::NextPage
-* [PARQUET-646](https://issues.apache.org/jira/browse/PARQUET-646) - [C++] Enable easier 3rd-party toolchain clang builds on Linux
-* [PARQUET-666](https://issues.apache.org/jira/browse/PARQUET-666) - PLAIN\_DICTIONARY write support
-* [PARQUET-671](https://issues.apache.org/jira/browse/PARQUET-671) - Improve performance of RLE/bit-packed decoding in parquet-cpp
-* [PARQUET-679](https://issues.apache.org/jira/browse/PARQUET-679) - [C++] Build and unit tests support for MSVC on Windows
-* [PARQUET-681](https://issues.apache.org/jira/browse/PARQUET-681) - Add tool to scan a parquet file
-* [PARQUET-687](https://issues.apache.org/jira/browse/PARQUET-687) - C++: Switch to PLAIN encoding if dictionary grows too large
-* [PARQUET-689](https://issues.apache.org/jira/browse/PARQUET-689) - C++: Compress DataPages eagerly
-* [PARQUET-699](https://issues.apache.org/jira/browse/PARQUET-699) - Update parquet.thrift from https://github.com/apache/parquet-format
-* [PARQUET-712](https://issues.apache.org/jira/browse/PARQUET-712) - C++: Read into Arrow memory
-* [PARQUET-721](https://issues.apache.org/jira/browse/PARQUET-721) - Performance benchmarks for reading into Arrow structures
-* [PARQUET-724](https://issues.apache.org/jira/browse/PARQUET-724) - Test more advanced properties setting
-* [PARQUET-728](https://issues.apache.org/jira/browse/PARQUET-728) - [C++] Bring parquet::arrow up to date with API changes in arrow::io
-* [PARQUET-731](https://issues.apache.org/jira/browse/PARQUET-731) - [CPP] Add API to return metadata size and skip reading values
-* [PARQUET-737](https://issues.apache.org/jira/browse/PARQUET-737) - Use absolute namespace in macros
-* [PARQUET-752](https://issues.apache.org/jira/browse/PARQUET-752) - [C++] Conform parquet\_arrow to upstream API changes
-* [PARQUET-762](https://issues.apache.org/jira/browse/PARQUET-762) - C++: Use optimistic allocation instead of Arrow Builders
-* [PARQUET-763](https://issues.apache.org/jira/browse/PARQUET-763) - C++: Expose ParquetFileReader through Arrow reader
-* [PARQUET-769](https://issues.apache.org/jira/browse/PARQUET-769) - C++: Add support for Brotli Compression
-* [PARQUET-778](https://issues.apache.org/jira/browse/PARQUET-778) - Standardize the schema output to match the parquet-mr format
-* [PARQUET-782](https://issues.apache.org/jira/browse/PARQUET-782) - C++: Support writing to Arrow sinks
-* [PARQUET-785](https://issues.apache.org/jira/browse/PARQUET-785) - C++: List conversion for Arrow Schemas
-* [PARQUET-805](https://issues.apache.org/jira/browse/PARQUET-805) - C++: Read Int96 into Arrow Timestamp(ns)
-* [PARQUET-807](https://issues.apache.org/jira/browse/PARQUET-807) - [C++] Add API to read file metadata only from a file handle
-* [PARQUET-809](https://issues.apache.org/jira/browse/PARQUET-809) - [C++] Add API to determine if two files' schemas are compatible
-* [PARQUET-813](https://issues.apache.org/jira/browse/PARQUET-813) - C++: Build dependencies using CMake External project
-* [PARQUET-820](https://issues.apache.org/jira/browse/PARQUET-820) - C++: Decoders should directly emit arrays with spacing for null entries
-* [PARQUET-829](https://issues.apache.org/jira/browse/PARQUET-829) - C++: Make use of ARROW-469
-* [PARQUET-830](https://issues.apache.org/jira/browse/PARQUET-830) - [C++] Add additional configuration options to parquet::arrow::OpenFile
-* [PARQUET-833](https://issues.apache.org/jira/browse/PARQUET-833) - C++: Provide API to write spaced arrays (e.g. Arrow)
-* [PARQUET-834](https://issues.apache.org/jira/browse/PARQUET-834) - C++: Support r/w of arrow::ListArray
-* [PARQUET-835](https://issues.apache.org/jira/browse/PARQUET-835) - [C++] Add option to parquet::arrow to read columns in parallel using a thread pool
-* [PARQUET-836](https://issues.apache.org/jira/browse/PARQUET-836) - [C++] Add column selection to parquet::arrow::FileReader
-* [PARQUET-844](https://issues.apache.org/jira/browse/PARQUET-844) - [C++] Consolidate encodings, schema, and compression subdirectories into fewer files
-* [PARQUET-848](https://issues.apache.org/jira/browse/PARQUET-848) - [C++] Consolidate libparquet\_thrift subcomponent
-* [PARQUET-857](https://issues.apache.org/jira/browse/PARQUET-857) - [C++] Flatten parquet/encodings directory
-* [PARQUET-858](https://issues.apache.org/jira/browse/PARQUET-858) - [C++] Flatten parquet/column directory, consolidate related code
-* [PARQUET-859](https://issues.apache.org/jira/browse/PARQUET-859) - [C++] Flatten parquet/file directory
-* [PARQUET-862](https://issues.apache.org/jira/browse/PARQUET-862) - Provide default cache size values if CPU info probing is not available
-* [PARQUET-866](https://issues.apache.org/jira/browse/PARQUET-866) - [C++] Account for API changes in ARROW-33
-* [PARQUET-867](https://issues.apache.org/jira/browse/PARQUET-867) - [C++] Support writing sliced Arrow arrays
-* [PARQUET-874](https://issues.apache.org/jira/browse/PARQUET-874) - [C++] Use default memory allocator from Arrow
-* [PARQUET-877](https://issues.apache.org/jira/browse/PARQUET-877) - C++: Update Arrow Hash, update Version in metadata.
-* [PARQUET-882](https://issues.apache.org/jira/browse/PARQUET-882) - [CPP] Improve Application Version parsing
-* [PARQUET-890](https://issues.apache.org/jira/browse/PARQUET-890) - C++: Support I/O of DATE columns in parquet\_arrow
-* [PARQUET-894](https://issues.apache.org/jira/browse/PARQUET-894) - Fix compilation warning
-* [PARQUET-897](https://issues.apache.org/jira/browse/PARQUET-897) - [C++] Only use designated public headers from libarrow
-* [PARQUET-903](https://issues.apache.org/jira/browse/PARQUET-903) - C++: Add option to set RPATH to ORIGIN
-* [PARQUET-909](https://issues.apache.org/jira/browse/PARQUET-909) - [CPP]: Reduce buffer allocations (mallocs) on critical path
-* [PARQUET-911](https://issues.apache.org/jira/browse/PARQUET-911) - C++: Support nested structs in parquet\_arrow
-* [PARQUET-928](https://issues.apache.org/jira/browse/PARQUET-928) - [C++] Support pkg-config
-* [PARQUET-929](https://issues.apache.org/jira/browse/PARQUET-929) - [C++] Handle arrow::DictionaryArray when writing Arrow data
-* [PARQUET-930](https://issues.apache.org/jira/browse/PARQUET-930) - [C++] Account for all Arrow date/time types
-* [PARQUET-934](https://issues.apache.org/jira/browse/PARQUET-934) - [C++] Support multiarch on Debian
-* [PARQUET-935](https://issues.apache.org/jira/browse/PARQUET-935) - [C++] Set shared library version for .deb packages
-* [PARQUET-946](https://issues.apache.org/jira/browse/PARQUET-946) - [C++] Refactoring in parquet::arrow::FileReader to be able to read a single row group
-* [PARQUET-953](https://issues.apache.org/jira/browse/PARQUET-953) - [C++] Change arrow::FileWriter API to be initialized from a Schema, and provide for writing multiple tables
-* [PARQUET-967](https://issues.apache.org/jira/browse/PARQUET-967) - [C++] Combine libparquet/libparquet\_arrow libraries
-* [PARQUET-970](https://issues.apache.org/jira/browse/PARQUET-970) - Add Lz4 and Zstd compression codecs
-* [PARQUET-978](https://issues.apache.org/jira/browse/PARQUET-978) - [C++] Minimizing footer reads for small(ish) metadata
-* [PARQUET-984](https://issues.apache.org/jira/browse/PARQUET-984) - C++: Add abi and so version to pkg-config
-* [PARQUET-991](https://issues.apache.org/jira/browse/PARQUET-991) - [C++] Fix compiler warnings on MSVC and build with /WX in Appveyor
-* [PARQUET-999](https://issues.apache.org/jira/browse/PARQUET-999) - Improve MSVC build - Enable PARQUET\_BUILD\_BENCHMARKS
-* [PARQUET-1008](https://issues.apache.org/jira/browse/PARQUET-1008) - Update TypedColumnReader::ReadBatch method to accept batch\_size as int64\_t
-* [PARQUET-1035](https://issues.apache.org/jira/browse/PARQUET-1035) - Write Int96 from Arrow Timestamp(ns)
-* [PARQUET-1037](https://issues.apache.org/jira/browse/PARQUET-1037) - Allow final RowGroup to be unfilled
-* [PARQUET-1041](https://issues.apache.org/jira/browse/PARQUET-1041) - C++: Support Arrow's NullArray
-* [PARQUET-1043](https://issues.apache.org/jira/browse/PARQUET-1043) - [C++] Raise minimum supported CMake version to 3.2
-* [PARQUET-1044](https://issues.apache.org/jira/browse/PARQUET-1044) - [C++] Use compression libraries from Apache Arrow
-* [PARQUET-1045](https://issues.apache.org/jira/browse/PARQUET-1045) - [C++] Refactor to account for computational utility code migration in ARROW-1154
-* [PARQUET-1053](https://issues.apache.org/jira/browse/PARQUET-1053) - Fix unused result warnings due to unchecked Statuses
-* [PARQUET-1068](https://issues.apache.org/jira/browse/PARQUET-1068) - [C++] Use more vanilla Google C++ code formatting
-* [PARQUET-1072](https://issues.apache.org/jira/browse/PARQUET-1072) - [C++] Add ARROW\_NO\_DEPRECATED\_API to CI to check for deprecated API use
-* [PARQUET-1078](https://issues.apache.org/jira/browse/PARQUET-1078) - [C++] Add Arrow writer option to coerce timestamps to milliseconds or microseconds
-* [PARQUET-1079](https://issues.apache.org/jira/browse/PARQUET-1079) - [C++] Account for Arrow API change in ARROW-1335
-* [PARQUET-1083](https://issues.apache.org/jira/browse/PARQUET-1083) - [C++] Refactor core logic in parquet-scan.cc so that it can be used as a library function for benchmarking
-* [PARQUET-1086](https://issues.apache.org/jira/browse/PARQUET-1086) - [C++] Remove usage of arrow/util/compiler-util.h after 1.3.0 release
-* [PARQUET-1087](https://issues.apache.org/jira/browse/PARQUET-1087) - [C++] Add wrapper for ScanFileContents in parquet::arrow that catches exceptions
-* [PARQUET-1092](https://issues.apache.org/jira/browse/PARQUET-1092) - [C++] Write Arrow tables with chunked columns
-* [PARQUET-1093](https://issues.apache.org/jira/browse/PARQUET-1093) - C++: Improve Arrow level generation error message
-* [PARQUET-1094](https://issues.apache.org/jira/browse/PARQUET-1094) - C++: Add benchmark for boolean Arrow column I/O
-* [PARQUET-1095](https://issues.apache.org/jira/browse/PARQUET-1095) - [C++] Read and write Arrow decimal values
-* [PARQUET-1104](https://issues.apache.org/jira/browse/PARQUET-1104) - [C++] Upgrade to Apache Arrow 0.7.0 RC0
-* [PARQUET-1150](https://issues.apache.org/jira/browse/PARQUET-1150) - C++: Hide statically linked boost symbols
-* [PARQUET-1160](https://issues.apache.org/jira/browse/PARQUET-1160) - [C++] Implement BYTE\_ARRAY-backed Decimal reads
-* [PARQUET-1164](https://issues.apache.org/jira/browse/PARQUET-1164) - [C++] Follow API changes in ARROW-1808
-* [PARQUET-1165](https://issues.apache.org/jira/browse/PARQUET-1165) - [C++] Pin clang-format version to 4.0
-* [PARQUET-1166](https://issues.apache.org/jira/browse/PARQUET-1166) - [API Proposal] Add GetRecordBatchReader in parquet/arrow/reader.h
-* [PARQUET-1177](https://issues.apache.org/jira/browse/PARQUET-1177) - [C++] Add more extensive compiler warnings when using Clang
-* [PARQUET-1196](https://issues.apache.org/jira/browse/PARQUET-1196) - [C++] Provide a parquet\_arrow example project incl. CMake setup
-* [PARQUET-1200](https://issues.apache.org/jira/browse/PARQUET-1200) - [C++] Support reading a single Arrow column from a Parquet file
-* [PARQUET-1218](https://issues.apache.org/jira/browse/PARQUET-1218) - [C++] More informative error message on too short pages
-* [PARQUET-1225](https://issues.apache.org/jira/browse/PARQUET-1225) - NaN values may lead to incorrect filtering under certain circumstances
-* [PARQUET-1227](https://issues.apache.org/jira/browse/PARQUET-1227) - Thrift crypto metadata structures
-* [PARQUET-1256](https://issues.apache.org/jira/browse/PARQUET-1256) - [C++] Add --print-key-value-metadata option to parquet\_reader tool
-* [PARQUET-1267](https://issues.apache.org/jira/browse/PARQUET-1267) - Replace "unsafe" std::equal with std::memcmp
-* [PARQUET-1276](https://issues.apache.org/jira/browse/PARQUET-1276) - [C++] Reduce the amount of memory used for writing null decimal values
-* [PARQUET-1279](https://issues.apache.org/jira/browse/PARQUET-1279) - Use ASSERT\_NO\_FATAL\_FAILURE in C++ unit tests
-* [PARQUET-1301](https://issues.apache.org/jira/browse/PARQUET-1301) - [C++] Crypto package in parquet-cpp
-* [PARQUET-1308](https://issues.apache.org/jira/browse/PARQUET-1308) - [C++] parquet::arrow should use thread pool, not ParallelFor
-* [PARQUET-1323](https://issues.apache.org/jira/browse/PARQUET-1323) - [C++] Fix compiler warnings with clang-6.0
-* [PARQUET-1332](https://issues.apache.org/jira/browse/PARQUET-1332) - [C++] Add bloom filter utility class
-* [PARQUET-1340](https://issues.apache.org/jira/browse/PARQUET-1340) - [C++] Fix Travis Ci valgrind errors related to std::random\_device
-* [PARQUET-1346](https://issues.apache.org/jira/browse/PARQUET-1346) - [C++] Protect against null values data in empty Arrow array
-* [PARQUET-1348](https://issues.apache.org/jira/browse/PARQUET-1348) - [C++] Allow Arrow FileWriter To Write FileMetaData
-* [PARQUET-1350](https://issues.apache.org/jira/browse/PARQUET-1350) - [C++] Use abstract ResizableBuffer instead of concrete PoolBuffer
-* [PARQUET-1360](https://issues.apache.org/jira/browse/PARQUET-1360) - [C++] Minor API + style changes follow up to PARQUET-1348
-* [PARQUET-1366](https://issues.apache.org/jira/browse/PARQUET-1366) - [C++] Streamline use of Arrow bit-util.h
-* [PARQUET-1372](https://issues.apache.org/jira/browse/PARQUET-1372) - [C++] Add an API to allow writing RowGroups based on their size rather than num\_rows
-* [PARQUET-1378](https://issues.apache.org/jira/browse/PARQUET-1378) - [C++] Allow RowGroups with zero rows to be written
-* [PARQUET-1382](https://issues.apache.org/jira/browse/PARQUET-1382) - [C++] Prepare for arrow::test namespace removal
-* [PARQUET-1392](https://issues.apache.org/jira/browse/PARQUET-1392) - [C++] Supply row group indices to parquet::arrow::FileReader::ReadTable
-* [PARQUET-1398](https://issues.apache.org/jira/browse/PARQUET-1398) - Separate iv\_prefix for GCM and CTR modes
-* [PARQUET-1401](https://issues.apache.org/jira/browse/PARQUET-1401) - RowGroup offset and total compressed size fields
-* [PARQUET-1427](https://issues.apache.org/jira/browse/PARQUET-1427) - [C++] Move example executables and CLI tools to Apache Arrow repo
-* [PARQUET-1431](https://issues.apache.org/jira/browse/PARQUET-1431) - [C++] Automatically set Thrift to use Boost for Thrift versions before 0.11
-
-
-## Bug Fixes
-
-* [ARROW-1380](https://issues.apache.org/jira/browse/ARROW-1380) - [C++] Fix "still reachable" valgrind warnings when PLASMA\_VALGRIND=1
-* [ARROW-1661](https://issues.apache.org/jira/browse/ARROW-1661) - [Python] Python 3.7 support
-* [ARROW-1799](https://issues.apache.org/jira/browse/ARROW-1799) - [Plasma C++] \`make unittest\` does not create the plasma store executable
-* [ARROW-1996](https://issues.apache.org/jira/browse/ARROW-1996) - [Python] pyarrow.read\_serialized cannot read concatenated records
-* [ARROW-2027](https://issues.apache.org/jira/browse/ARROW-2027) - [C++] ipc::Message::SerializeTo does not pad the message body
-* [ARROW-2220](https://issues.apache.org/jira/browse/ARROW-2220) - Change default fix version in merge tool to be the next mainline release version
-* [ARROW-2310](https://issues.apache.org/jira/browse/ARROW-2310) - Source release scripts fail with Java8
-* [ARROW-2646](https://issues.apache.org/jira/browse/ARROW-2646) - [C++/Python] Pandas roundtrip for date objects
-* [ARROW-2775](https://issues.apache.org/jira/browse/ARROW-2775) - [Python] ccache error when building manylinux1 wheels
-* [ARROW-2776](https://issues.apache.org/jira/browse/ARROW-2776) - [C++] Do not pass -Wno-noexcept-type for compilers that do not support it
-* [ARROW-2782](https://issues.apache.org/jira/browse/ARROW-2782) - [Python] Ongoing Travis CI failures in Plasma unit tests
-* [ARROW-2785](https://issues.apache.org/jira/browse/ARROW-2785) - [C++] Crash in json-integration-test
-* [ARROW-2814](https://issues.apache.org/jira/browse/ARROW-2814) - [Python] Unify PyObject\* sequence conversion paths for built-in sequences, NumPy arrays
-* [ARROW-2854](https://issues.apache.org/jira/browse/ARROW-2854) - [C++/Python] Casting float NaN to int should raise an error on safe cast
-* [ARROW-2925](https://issues.apache.org/jira/browse/ARROW-2925) - [JS] Documentation failing in docker container
-* [ARROW-2965](https://issues.apache.org/jira/browse/ARROW-2965) - [Python] Possible uint64 overflow issues in python\_to\_arrow.cc
-* [ARROW-2966](https://issues.apache.org/jira/browse/ARROW-2966) - [Python] Data type conversion error
-* [ARROW-2973](https://issues.apache.org/jira/browse/ARROW-2973) - [Python] pitrou/asv.git@customize\_commands does not work with the "new" way of activating conda
-* [ARROW-2974](https://issues.apache.org/jira/browse/ARROW-2974) - [Python] Replace usages of "source activate" with "conda activate" in CI scripts
-* [ARROW-2986](https://issues.apache.org/jira/browse/ARROW-2986) - [C++] /EHsc possibly needed for Visual Studio 2015 builds
-* [ARROW-2992](https://issues.apache.org/jira/browse/ARROW-2992) - [Python] Parquet benchmark failure
-* [ARROW-3006](https://issues.apache.org/jira/browse/ARROW-3006) - [GLib] .gir/.typelib for GPU aren't installed
-* [ARROW-3007](https://issues.apache.org/jira/browse/ARROW-3007) - [Packaging] libarrow-gpu10 deb for Ubuntu 18.04 has broken dependencies
-* [ARROW-3011](https://issues.apache.org/jira/browse/ARROW-3011) - [CI] Remove Slack notification
-* [ARROW-3012](https://issues.apache.org/jira/browse/ARROW-3012) - [Python] Installation crashes with setuptools\_scm error
-* [ARROW-3013](https://issues.apache.org/jira/browse/ARROW-3013) - [Website] Fix download links on website for tarballs, checksums
-* [ARROW-3015](https://issues.apache.org/jira/browse/ARROW-3015) - [Python] Fix documentation typo for pa.uint8
-* [ARROW-3047](https://issues.apache.org/jira/browse/ARROW-3047) - [C++] cmake downloads and builds ORC even though it's installed
-* [ARROW-3049](https://issues.apache.org/jira/browse/ARROW-3049) - [C++/Python] ORC reader fails on empty file
-* [ARROW-3053](https://issues.apache.org/jira/browse/ARROW-3053) - [Python] Pandas decimal conversion segfault
-* [ARROW-3056](https://issues.apache.org/jira/browse/ARROW-3056) - [Python] Indicate in NativeFile docstrings methods that are part of the RawIOBase API but not implemented
-* [ARROW-3061](https://issues.apache.org/jira/browse/ARROW-3061) - [Java] headroom does not take into account reservation
-* [ARROW-3065](https://issues.apache.org/jira/browse/ARROW-3065) - [Python] concat\_tables() failing from bad Pandas Metadata
-* [ARROW-3083](https://issues.apache.org/jira/browse/ARROW-3083) - [Python] Version in manylinux1 wheel builds is wrong
-* [ARROW-3093](https://issues.apache.org/jira/browse/ARROW-3093) - [C++] Linking errors with ORC enabled
-* [ARROW-3095](https://issues.apache.org/jira/browse/ARROW-3095) - [Python] test\_plasma.py fails
-* [ARROW-3098](https://issues.apache.org/jira/browse/ARROW-3098) - [Python] BufferReader doesn't adhere to the seek protocol
-* [ARROW-3100](https://issues.apache.org/jira/browse/ARROW-3100) - [CI] C/glib build broken on OS X
-* [ARROW-3125](https://issues.apache.org/jira/browse/ARROW-3125) - [Python] Update ASV instructions
-* [ARROW-3132](https://issues.apache.org/jira/browse/ARROW-3132) - Regenerate 0.10.0 changelog
-* [ARROW-3137](https://issues.apache.org/jira/browse/ARROW-3137) - [Python] pyarrow 0.10 requires newer version of numpy than specified in requirements
-* [ARROW-3140](https://issues.apache.org/jira/browse/ARROW-3140) - [Plasma] Plasma fails building with GPU enabled
-* [ARROW-3141](https://issues.apache.org/jira/browse/ARROW-3141) - [Python] Tensorflow support in pyarrow wheels pins numpy\>=1.14
-* [ARROW-3145](https://issues.apache.org/jira/browse/ARROW-3145) - [C++] Thrift compiler reruns in arrow/dbi/hiveserver2/thrift when using Ninja build
-* [ARROW-3173](https://issues.apache.org/jira/browse/ARROW-3173) - [Rust] dynamic\_types example does not run
-* [ARROW-3175](https://issues.apache.org/jira/browse/ARROW-3175) - [Java] Upgrade to official FlatBuffers release (Flatbuffers incompatibility)
-* [ARROW-3183](https://issues.apache.org/jira/browse/ARROW-3183) - [Python] get\_library\_dirs on Windows can give the wrong directory
-* [ARROW-3188](https://issues.apache.org/jira/browse/ARROW-3188) - [Python] Table.from\_arrays segfaults if lists and schema are passed
-* [ARROW-3190](https://issues.apache.org/jira/browse/ARROW-3190) - [C++] "WriteableFile" is misspelled, should be renamed "WritableFile" with deprecation for old name
-* [ARROW-3206](https://issues.apache.org/jira/browse/ARROW-3206) - [C++] Building with ARROW\_HIVESERVER2=ON with unit tests disabled causes error
-* [ARROW-3227](https://issues.apache.org/jira/browse/ARROW-3227) - [Python] NativeFile.write shouldn't accept unicode strings
-* [ARROW-3228](https://issues.apache.org/jira/browse/ARROW-3228) - [Python] Immutability of bytes is ignored
-* [ARROW-3231](https://issues.apache.org/jira/browse/ARROW-3231) - [Python] Sphinx's autodoc\_default\_flags is now deprecated
-* [ARROW-3237](https://issues.apache.org/jira/browse/ARROW-3237) - [CI] Update linux packaging filenames in rat exclusion list
-* [ARROW-3241](https://issues.apache.org/jira/browse/ARROW-3241) - [Plasma] test\_plasma\_list test failure on Ubuntu 14.04
-* [ARROW-3251](https://issues.apache.org/jira/browse/ARROW-3251) - [C++] Conversion warnings in cast.cc
-* [ARROW-3256](https://issues.apache.org/jira/browse/ARROW-3256) - [JS] File footer and message metadata is inconsistent
-* [ARROW-3271](https://issues.apache.org/jira/browse/ARROW-3271) - [Python] Manylinux1 builds timing out in Travis CI
-* [ARROW-3279](https://issues.apache.org/jira/browse/ARROW-3279) - [C++] Allow linking Arrow tests dynamically on Windows
-* [ARROW-3299](https://issues.apache.org/jira/browse/ARROW-3299) - [C++] Appveyor builds failing
-* [ARROW-3322](https://issues.apache.org/jira/browse/ARROW-3322) - [CI] Rust job always runs on AppVeyor
-* [ARROW-3327](https://issues.apache.org/jira/browse/ARROW-3327) - [Python] manylinux container is confusing
-* [ARROW-3338](https://issues.apache.org/jira/browse/ARROW-3338) - [Python] Crash when schema and columns do not match
-* [ARROW-3342](https://issues.apache.org/jira/browse/ARROW-3342) - Appveyor builds have stopped triggering on GitHub
-* [ARROW-3348](https://issues.apache.org/jira/browse/ARROW-3348) - Plasma store dies when an object that a dead client is waiting for gets created.
-* [ARROW-3354](https://issues.apache.org/jira/browse/ARROW-3354) - [Python] read\_record\_batch interfaces differ in pyarrow and pyarrow.cuda
-* [ARROW-3369](https://issues.apache.org/jira/browse/ARROW-3369) - [Packaging] Wheel builds are failing due to wheel 0.32 release
-* [ARROW-3370](https://issues.apache.org/jira/browse/ARROW-3370) - [Packaging] Centos 6 build is failing
-* [ARROW-3373](https://issues.apache.org/jira/browse/ARROW-3373) - Fix bug in which plasma store can die when client gets multiple objects and object becomes available.
-* [ARROW-3374](https://issues.apache.org/jira/browse/ARROW-3374) - [Python] Dictionary has out-of-bound index when creating DictionaryArray from Pandas with NaN
-* [ARROW-3390](https://issues.apache.org/jira/browse/ARROW-3390) - [C++] cmake file under windows msys2 system doesn't work
-* [ARROW-3393](https://issues.apache.org/jira/browse/ARROW-3393) - [C++] Fix compiler warning in util/task-group.cc on clang 6
-* [ARROW-3394](https://issues.apache.org/jira/browse/ARROW-3394) - [Java] Remove duplicate dependency entry in Flight
-* [ARROW-3403](https://issues.apache.org/jira/browse/ARROW-3403) - [Website] Source tarball link missing from install page
-* [ARROW-3420](https://issues.apache.org/jira/browse/ARROW-3420) - [C++] Fix outstanding include-what-you-use issues in src/arrow, src/parquet codebases
-* [PARQUET-232](https://issues.apache.org/jira/browse/PARQUET-232) - minor compilation issue
-* [PARQUET-446](https://issues.apache.org/jira/browse/PARQUET-446) - Hide thrift dependency in parquet-cpp
-* [PARQUET-454](https://issues.apache.org/jira/browse/PARQUET-454) - Address inconsistencies in boolean decoding
-* [PARQUET-455](https://issues.apache.org/jira/browse/PARQUET-455) - Fix compiler warnings on OS X / Clang
-* [PARQUET-457](https://issues.apache.org/jira/browse/PARQUET-457) - Add compressed data page unit tests
-* [PARQUET-469](https://issues.apache.org/jira/browse/PARQUET-469) - Roll back Thrift bindings to 0.9.0
-* [PARQUET-472](https://issues.apache.org/jira/browse/PARQUET-472) - Clean up InputStream ownership semantics in ColumnReader
-* [PARQUET-505](https://issues.apache.org/jira/browse/PARQUET-505) - Column reader: automatically handle large data pages
-* [PARQUET-507](https://issues.apache.org/jira/browse/PARQUET-507) - Improve runtime of rle-test.cc
-* [PARQUET-513](https://issues.apache.org/jira/browse/PARQUET-513) - Valgrind errors are not failing the Travis CI build
-* [PARQUET-525](https://issues.apache.org/jira/browse/PARQUET-525) - Test coverage for malformed file failure modes on the read path
-* [PARQUET-537](https://issues.apache.org/jira/browse/PARQUET-537) - LocalFileSource leaks resources
-* [PARQUET-549](https://issues.apache.org/jira/browse/PARQUET-549) - Add scanner and column reader tests for dictionary data pages
-* [PARQUET-555](https://issues.apache.org/jira/browse/PARQUET-555) - Dictionary page metadata handling inconsistencies
-* [PARQUET-561](https://issues.apache.org/jira/browse/PARQUET-561) - ParquetFileReader::Contents PIMPL missing a virtual destructor
-* [PARQUET-599](https://issues.apache.org/jira/browse/PARQUET-599) - ColumnWriter::RleEncodeLevels' size estimation might be wrong
-* [PARQUET-604](https://issues.apache.org/jira/browse/PARQUET-604) - Install writer.h headers
-* [PARQUET-614](https://issues.apache.org/jira/browse/PARQUET-614) - C++: Remove unneeded LZ4-related code
-* [PARQUET-620](https://issues.apache.org/jira/browse/PARQUET-620) - C++: Duplicate calls to ParquetFileWriter::Close cause duplicate metadata writes
-* [PARQUET-621](https://issues.apache.org/jira/browse/PARQUET-621) - C++: Uninitialised DecimalMetadata is read
-* [PARQUET-629](https://issues.apache.org/jira/browse/PARQUET-629) - RowGroupSerializer should only close itself once
-* [PARQUET-639](https://issues.apache.org/jira/browse/PARQUET-639) - Do not export DCHECK in public headers
-* [PARQUET-643](https://issues.apache.org/jira/browse/PARQUET-643) - Add const modifier to schema pointer reference in ParquetFileWriter
-* [PARQUET-657](https://issues.apache.org/jira/browse/PARQUET-657) - [C++] Don't define DISALLOW\_COPY\_AND\_ASSIGN if already defined
-* [PARQUET-658](https://issues.apache.org/jira/browse/PARQUET-658) - ColumnReader has no virtual destructor
-* [PARQUET-659](https://issues.apache.org/jira/browse/PARQUET-659) - [C++] Instantiated template visibility is broken on clang / OS X
-* [PARQUET-662](https://issues.apache.org/jira/browse/PARQUET-662) - [C++] ParquetException must be explicitly exported in dynamic libraries
-* [PARQUET-676](https://issues.apache.org/jira/browse/PARQUET-676) - MAX\_VALUES\_PER\_LITERAL\_RUN causes RLE encoding failure
-* [PARQUET-691](https://issues.apache.org/jira/browse/PARQUET-691) - [C++] Write ColumnChunk metadata after each column chunk in the file
-* [PARQUET-694](https://issues.apache.org/jira/browse/PARQUET-694) - C++: Revert default data page size back to 1M
-* [PARQUET-700](https://issues.apache.org/jira/browse/PARQUET-700) - C++: Disable dictionary encoding for boolean columns
-* [PARQUET-701](https://issues.apache.org/jira/browse/PARQUET-701) - C++: Dictionary is written multiple times if close is called multiple times.
-* [PARQUET-702](https://issues.apache.org/jira/browse/PARQUET-702) - Add a writer + reader example with detailed comments
-* [PARQUET-703](https://issues.apache.org/jira/browse/PARQUET-703) - [C++] Validate num\_values metadata for columns with nulls
-* [PARQUET-704](https://issues.apache.org/jira/browse/PARQUET-704) - [C++] scan-all.h is not being installed
-* [PARQUET-708](https://issues.apache.org/jira/browse/PARQUET-708) - [C++] RleEncoder does not account for "worst case scenario" in MaxBufferSize for bit\_width \> 1
-* [PARQUET-710](https://issues.apache.org/jira/browse/PARQUET-710) - Remove unneeded private member variables from RowGroupReader ABI
-* [PARQUET-711](https://issues.apache.org/jira/browse/PARQUET-711) - Use metadata builders in parquet writer
-* [PARQUET-718](https://issues.apache.org/jira/browse/PARQUET-718) - Reading boolean pages written by parquet-cpp fails
-* [PARQUET-719](https://issues.apache.org/jira/browse/PARQUET-719) - Fix WriterBatch API to handle NULL values
-* [PARQUET-720](https://issues.apache.org/jira/browse/PARQUET-720) - Parquet-cpp fails to link when included in multiple TUs
-* [PARQUET-739](https://issues.apache.org/jira/browse/PARQUET-739) - Rle-decoding uses static buffer that is shared across threads
-* [PARQUET-741](https://issues.apache.org/jira/browse/PARQUET-741) - compression\_buffer\_ is reused although it shouldn't
-* [PARQUET-742](https://issues.apache.org/jira/browse/PARQUET-742) - Add missing license headers
-* [PARQUET-745](https://issues.apache.org/jira/browse/PARQUET-745) - TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType
-* [PARQUET-747](https://issues.apache.org/jira/browse/PARQUET-747) - [C++] TypedRowGroupStatistics are not being exported in libparquet.so
-* [PARQUET-759](https://issues.apache.org/jira/browse/PARQUET-759) - Cannot store columns consisting of empty strings
-* [PARQUET-760](https://issues.apache.org/jira/browse/PARQUET-760) - On switching from dictionary to the fallback encoding, an incorrect encoding is set
-* [PARQUET-764](https://issues.apache.org/jira/browse/PARQUET-764) - [CPP] Parquet Writer does not write Boolean values correctly
-* [PARQUET-766](https://issues.apache.org/jira/browse/PARQUET-766) - C++: Expose ParquetFileReader through Arrow reader as const
-* [PARQUET-775](https://issues.apache.org/jira/browse/PARQUET-775) - C++: TrackingAllocator is not thread-safe
-* [PARQUET-779](https://issues.apache.org/jira/browse/PARQUET-779) - Export TypedRowGroupStatistics in libparquet
-* [PARQUET-780](https://issues.apache.org/jira/browse/PARQUET-780) - WriterBatch API does not properly handle NULL values for byte array types
-* [PARQUET-789](https://issues.apache.org/jira/browse/PARQUET-789) - [C++] Catch and translate ParquetException in parquet::arrow::FileReader::{ReadFlatColumn, ReadFlatTable}
-* [PARQUET-793](https://issues.apache.org/jira/browse/PARQUET-793) - [CPP] Do not return incorrect statistics
-* [PARQUET-797](https://issues.apache.org/jira/browse/PARQUET-797) - [C++] Update for API changes in ARROW-418
-* [PARQUET-799](https://issues.apache.org/jira/browse/PARQUET-799) - concurrent usage of the file reader API
-* [PARQUET-812](https://issues.apache.org/jira/browse/PARQUET-812) - [C++] Failure reading BYTE\_ARRAY data from file in parquet-compatibility project
-* [PARQUET-816](https://issues.apache.org/jira/browse/PARQUET-816) - [C++] Failure decoding sample dict-encoded file from parquet-compatibility project
-* [PARQUET-818](https://issues.apache.org/jira/browse/PARQUET-818) - [C++] Refactor library to share IO, Buffer, and memory management abstractions with Apache Arrow
-* [PARQUET-819](https://issues.apache.org/jira/browse/PARQUET-819) - C++: Trying to install non-existing parquet/arrow/utils.h
-* [PARQUET-827](https://issues.apache.org/jira/browse/PARQUET-827) - [C++] Incorporate addition of arrow::MemoryPool::Reallocate
-* [PARQUET-828](https://issues.apache.org/jira/browse/PARQUET-828) - [C++] "version" field set improperly in file metadata
-* [PARQUET-837](https://issues.apache.org/jira/browse/PARQUET-837) - [C++] SerializedFile::ParseMetaData uses Seek, followed by Read, and could have race conditions
-* [PARQUET-841](https://issues.apache.org/jira/browse/PARQUET-841) - [C++] Writing wrong format version when using ParquetVersion::PARQUET\_1\_0
-* [PARQUET-842](https://issues.apache.org/jira/browse/PARQUET-842) - [C++] Impala rejects DOUBLE columns if decimal metadata is set
-* [PARQUET-843](https://issues.apache.org/jira/browse/PARQUET-843) - [C++] Impala unable to read files created by parquet-cpp
-* [PARQUET-846](https://issues.apache.org/jira/browse/PARQUET-846) - [CPP] CpuInfo::Init() is not thread safe
-* [PARQUET-880](https://issues.apache.org/jira/browse/PARQUET-880) - [CPP] Prevent destructors from throwing
-* [PARQUET-888](https://issues.apache.org/jira/browse/PARQUET-888) - C++ Memory leak in RowGroupSerializer
-* [PARQUET-889](https://issues.apache.org/jira/browse/PARQUET-889) - Fix compilation when PARQUET\_USE\_SSE is on
-* [PARQUET-892](https://issues.apache.org/jira/browse/PARQUET-892) - [C++] Clean up link library targets in CMake files
-* [PARQUET-895](https://issues.apache.org/jira/browse/PARQUET-895) - Reading of nested columns is broken
-* [PARQUET-898](https://issues.apache.org/jira/browse/PARQUET-898) - [C++] Change Travis CI OS X image to Xcode 6.4 and fix our thirdparty build
-* [PARQUET-908](https://issues.apache.org/jira/browse/PARQUET-908) - Fix for PARQUET-890 introduces undefined symbol in libparquet\_arrow.so
-* [PARQUET-914](https://issues.apache.org/jira/browse/PARQUET-914) - [C++] Throw more informative exception when user writes too many values to a column in a row group
-* [PARQUET-915](https://issues.apache.org/jira/browse/PARQUET-915) - Support Arrow Time Types in Schema
-* [PARQUET-918](https://issues.apache.org/jira/browse/PARQUET-918) - FromParquetSchema API crashes on nested schemas
-* [PARQUET-919](https://issues.apache.org/jira/browse/PARQUET-919) - [C++] Account for API changes in ARROW-683
-* [PARQUET-923](https://issues.apache.org/jira/browse/PARQUET-923) - [C++] Account for Time metadata changes in ARROW-686
-* [PARQUET-933](https://issues.apache.org/jira/browse/PARQUET-933) - [C++] Account for Arrow Table API changes coming in ARROW-728
-* [PARQUET-936](https://issues.apache.org/jira/browse/PARQUET-936) - [C++] parquet::arrow::WriteTable can enter infinite loop if chunk\_size is 0
-* [PARQUET-943](https://issues.apache.org/jira/browse/PARQUET-943) - [C++] Overflow build error on x86
-* [PARQUET-947](https://issues.apache.org/jira/browse/PARQUET-947) - [C++] Refactor to account for ARROW-795 Arrow core library consolidation
-* [PARQUET-958](https://issues.apache.org/jira/browse/PARQUET-958) - [C++] Print Parquet metadata in JSON format
-* [PARQUET-963](https://issues.apache.org/jira/browse/PARQUET-963) - [C++] Disallow reading struct types in Arrow reader for now
-* [PARQUET-965](https://issues.apache.org/jira/browse/PARQUET-965) - [C++] FIXED\_LEN\_BYTE\_ARRAY types are unhandled in the Arrow reader
-* [PARQUET-979](https://issues.apache.org/jira/browse/PARQUET-979) - [C++] Limit size of min, max or disable stats for long binary types
-* [PARQUET-992](https://issues.apache.org/jira/browse/PARQUET-992) - [C++] parquet/compression.h leaks zlib.h
-* [PARQUET-995](https://issues.apache.org/jira/browse/PARQUET-995) - [C++] Int96 reader in parquet\_arrow uses size of Int96Type instead of Int96
-* [PARQUET-997](https://issues.apache.org/jira/browse/PARQUET-997) - Fix override compiler warnings
-* [PARQUET-1002](https://issues.apache.org/jira/browse/PARQUET-1002) - [C++] Compute statistics based on Logical Types
-* [PARQUET-1003](https://issues.apache.org/jira/browse/PARQUET-1003) - [C++] Modify DEFAULT\_CREATED\_BY value for every new release version
-* [PARQUET-1007](https://issues.apache.org/jira/browse/PARQUET-1007) - [C++] Update parquet.thrift from https://github.com/apache/parquet-format
-* [PARQUET-1029](https://issues.apache.org/jira/browse/PARQUET-1029) - [C++] TypedColumnReader/TypeColumnWriter symbols are no longer being exported
-* [PARQUET-1033](https://issues.apache.org/jira/browse/PARQUET-1033) - Mismatched Read and Write
-* [PARQUET-1038](https://issues.apache.org/jira/browse/PARQUET-1038) - Key value metadata should be nullptr if not set
-* [PARQUET-1040](https://issues.apache.org/jira/browse/PARQUET-1040) - Missing writer method implementations
-* [PARQUET-1042](https://issues.apache.org/jira/browse/PARQUET-1042) - C++: Compilation breaks on GCC 4.8
-* [PARQUET-1048](https://issues.apache.org/jira/browse/PARQUET-1048) - [C++] Static linking of libarrow is no longer supported
-* [PARQUET-1054](https://issues.apache.org/jira/browse/PARQUET-1054) - [C++] Account for Arrow API changes in ARROW-1199
-* [PARQUET-1071](https://issues.apache.org/jira/browse/PARQUET-1071) - [C++] parquet::arrow::FileWriter::Close is not idempotent
-* [PARQUET-1085](https://issues.apache.org/jira/browse/PARQUET-1085) - [C++] Backwards compatibility from macro cleanup in transitive dependencies in ARROW-1452
-* [PARQUET-1088](https://issues.apache.org/jira/browse/PARQUET-1088) - [CPP] remove parquet\_version.h from version control since it gets auto generated
-* [PARQUET-1090](https://issues.apache.org/jira/browse/PARQUET-1090) - [C++] Fix int32 overflow in Arrow table writer, add max row group size property
-* [PARQUET-1098](https://issues.apache.org/jira/browse/PARQUET-1098) - [C++] Install new header in parquet/util
-* [PARQUET-1100](https://issues.apache.org/jira/browse/PARQUET-1100) - [C++] Reading repeated types should decode number of records rather than number of values
-* [PARQUET-1108](https://issues.apache.org/jira/browse/PARQUET-1108) - [C++] Fix Int96 comparators
-* [PARQUET-1114](https://issues.apache.org/jira/browse/PARQUET-1114) - Apply fix for ARROW-1601 and ARROW-1611 to parquet-cpp
-* [PARQUET-1121](https://issues.apache.org/jira/browse/PARQUET-1121) - C++: DictionaryArrays of NullType cannot be written
-* [PARQUET-1123](https://issues.apache.org/jira/browse/PARQUET-1123) - [C++] Update parquet-cpp to use Arrow's AssertArraysEqual
-* [PARQUET-1138](https://issues.apache.org/jira/browse/PARQUET-1138) - [C++] Fix compilation with Arrow 0.7.1
-* [PARQUET-1167](https://issues.apache.org/jira/browse/PARQUET-1167) - [C++] FieldToNode function should return a status when throwing an exception
-* [PARQUET-1175](https://issues.apache.org/jira/browse/PARQUET-1175) - [C++] Fix usage of deprecated Arrow API
-* [PARQUET-1179](https://issues.apache.org/jira/browse/PARQUET-1179) - [C++] Support Apache Thrift 0.11
-* [PARQUET-1180](https://issues.apache.org/jira/browse/PARQUET-1180) - C++: Fix behaviour of num\_children element of primitive nodes
-* [PARQUET-1193](https://issues.apache.org/jira/browse/PARQUET-1193) - [CPP] Implement ColumnOrder to support min\_value and max\_value
-* [PARQUET-1226](https://issues.apache.org/jira/browse/PARQUET-1226) - [C++] Fix new build warnings with clang 5.0
-* [PARQUET-1233](https://issues.apache.org/jira/browse/PARQUET-1233) - [CPP] Enable option to switch between stl classes and boost classes for thrift header
-* [PARQUET-1245](https://issues.apache.org/jira/browse/PARQUET-1245) - [C++] Segfault when writing Arrow table with duplicate columns
-* [PARQUET-1255](https://issues.apache.org/jira/browse/PARQUET-1255) - [C++] Exceptions thrown in some tests
-* [PARQUET-1265](https://issues.apache.org/jira/browse/PARQUET-1265) - Segfault on static ApplicationVersion initialization
-* [PARQUET-1268](https://issues.apache.org/jira/browse/PARQUET-1268) - [C++] Conversion of Arrow null list columns fails
-* [PARQUET-1270](https://issues.apache.org/jira/browse/PARQUET-1270) - [C++] Executable tools do not get installed
-* [PARQUET-1272](https://issues.apache.org/jira/browse/PARQUET-1272) - [C++] ScanFileContents reports wrong row count for nested columns
-* [PARQUET-1273](https://issues.apache.org/jira/browse/PARQUET-1273) - [Python] Error writing to partitioned Parquet dataset
-* [PARQUET-1274](https://issues.apache.org/jira/browse/PARQUET-1274) - [Python] SegFault in pyarrow.parquet.write\_table with specific options
-* [PARQUET-1283](https://issues.apache.org/jira/browse/PARQUET-1283) - [C++] FormatStatValue appends trailing space to string and int96
-* [PARQUET-1307](https://issues.apache.org/jira/browse/PARQUET-1307) - [C++] memory-test fails with latest Arrow
-* [PARQUET-1315](https://issues.apache.org/jira/browse/PARQUET-1315) - [C++] ColumnChunkMetaData.has\_dictionary\_page() should return bool, not int64\_t
-* [PARQUET-1333](https://issues.apache.org/jira/browse/PARQUET-1333) - [C++] Reading of files with dictionary size 0 fails on Windows with bad\_alloc
-* [PARQUET-1334](https://issues.apache.org/jira/browse/PARQUET-1334) - [C++] memory\_map parameter seems misleading in parquet file opener
-* [PARQUET-1357](https://issues.apache.org/jira/browse/PARQUET-1357) - [C++] FormatStatValue truncates binary statistics on zero character
-* [PARQUET-1358](https://issues.apache.org/jira/browse/PARQUET-1358) - [C++] index\_page\_offset should be unset as it is not supported.
-* [PARQUET-1369](https://issues.apache.org/jira/browse/PARQUET-1369) - [Python] Unavailable Parquet column statistics from Spark-generated file
-* [PARQUET-1384](https://issues.apache.org/jira/browse/PARQUET-1384) - [C++] Clang compiler warnings in bloom\_filter-test.cc
-
-
-
-# Apache Arrow 0.10.0 (2018-08-06)
-
-## Bug Fixes
-
-* [ARROW-198](https://issues.apache.org/jira/browse/ARROW-198) - [Java] OutOfMemoryError for vector test case
-* [ARROW-640](https://issues.apache.org/jira/browse/ARROW-640) - [Python] Arrow scalar values should have a sensible \_\_hash\_\_ and comparison
-* [ARROW-2020](https://issues.apache.org/jira/browse/ARROW-2020) - [Python] Parquet segfaults if coercing ns timestamps and writing 96-bit timestamps
-* [ARROW-2059](https://issues.apache.org/jira/browse/ARROW-2059) - [Python] Possible performance regression in Feather read/write path
-* [ARROW-2101](https://issues.apache.org/jira/browse/ARROW-2101) - [Python] from\_pandas reads 'str' type as binary Arrow data with Python 2
-* [ARROW-2122](https://issues.apache.org/jira/browse/ARROW-2122) - [Python] Pyarrow fails to serialize dataframe with timestamp.
-* [ARROW-2182](https://issues.apache.org/jira/browse/ARROW-2182) - [Python] ASV benchmark setup does not account for C++ library changing
-* [ARROW-2189](https://issues.apache.org/jira/browse/ARROW-2189) - [C++] Seg. fault on make\_shared<PoolBuffer\>
-* [ARROW-2193](https://issues.apache.org/jira/browse/ARROW-2193) - [Plasma] plasma\_store has runtime dependency on Boost shared libraries when ARROW\_BOOST\_USE\_SHARED=on
-* [ARROW-2195](https://issues.apache.org/jira/browse/ARROW-2195) - [Plasma] Segfault when retrieving RecordBatch from plasma store
-* [ARROW-2247](https://issues.apache.org/jira/browse/ARROW-2247) - [Python] Statically-linking boost\_regex in both libarrow and libparquet results in segfault
-* [ARROW-2273](https://issues.apache.org/jira/browse/ARROW-2273) - Cannot deserialize pandas SparseDataFrame
-* [ARROW-2300](https://issues.apache.org/jira/browse/ARROW-2300) - [Python] python/testing/test\_hdfs.sh no longer works
-* [ARROW-2305](https://issues.apache.org/jira/browse/ARROW-2305) - [Python] Cython 0.25.2 compilation failure
-* [ARROW-2314](https://issues.apache.org/jira/browse/ARROW-2314) - [Python] Union array slicing is defective
-* [ARROW-2326](https://issues.apache.org/jira/browse/ARROW-2326) - [Python] cannot import pip installed pyarrow on OS X (10.9)
-* [ARROW-2328](https://issues.apache.org/jira/browse/ARROW-2328) - Writing a slice with feather ignores the offset
-* [ARROW-2331](https://issues.apache.org/jira/browse/ARROW-2331) - [Python] Fix indexing implementations
-* [ARROW-2333](https://issues.apache.org/jira/browse/ARROW-2333) - [Python] boost bundling fails in setup.py
-* [ARROW-2342](https://issues.apache.org/jira/browse/ARROW-2342) - [Python] Aware timestamp type fails pickling
-* [ARROW-2346](https://issues.apache.org/jira/browse/ARROW-2346) - [Python] PYARROW\_CXXFLAGS doesn't accept multiple options
-* [ARROW-2349](https://issues.apache.org/jira/browse/ARROW-2349) - [Python] Boost shared library bundling is broken for MSVC
-* [ARROW-2351](https://issues.apache.org/jira/browse/ARROW-2351) - [C++] StringBuilder::append(vector<string\>...) not implemented
-* [ARROW-2354](https://issues.apache.org/jira/browse/ARROW-2354) - [C++] PyDecimal\_Check() is much too slow
-* [ARROW-2355](https://issues.apache.org/jira/browse/ARROW-2355) - [Python] Unable to import pyarrow [0.9.0] OSX
-* [ARROW-2357](https://issues.apache.org/jira/browse/ARROW-2357) - Benchmark PandasObjectIsNull
-* [ARROW-2368](https://issues.apache.org/jira/browse/ARROW-2368) - DecimalVector\#setBigEndian is not padding correctly for negative values
-* [ARROW-2369](https://issues.apache.org/jira/browse/ARROW-2369) - Large (\>\~20 GB) files written to Parquet via PyArrow are corrupted
-* [ARROW-2370](https://issues.apache.org/jira/browse/ARROW-2370) - [GLib] include path is wrong on Meson build
-* [ARROW-2371](https://issues.apache.org/jira/browse/ARROW-2371) - [GLib] gio-2.0 isn't required on GNU Autotools build
-* [ARROW-2372](https://issues.apache.org/jira/browse/ARROW-2372) - [Python] ArrowIOError: Invalid argument when reading Parquet file
-* [ARROW-2375](https://issues.apache.org/jira/browse/ARROW-2375) - [Rust] Buffer should release memory when dropped
-* [ARROW-2377](https://issues.apache.org/jira/browse/ARROW-2377) - [GLib] Travis-CI failures
-* [ARROW-2380](https://issues.apache.org/jira/browse/ARROW-2380) - [Python] Correct issues in numpy\_to\_arrow conversion routines
-* [ARROW-2382](https://issues.apache.org/jira/browse/ARROW-2382) - [Rust] List<T\> was not using memory safely
-* [ARROW-2383](https://issues.apache.org/jira/browse/ARROW-2383) - [C++] Debian packages need to depend on libprotobuf
-* [ARROW-2387](https://issues.apache.org/jira/browse/ARROW-2387) - [Python] negative decimal values get spurious rescaling error
-* [ARROW-2391](https://issues.apache.org/jira/browse/ARROW-2391) - [Python] Segmentation fault from PyArrow when mapping Pandas datetime column to pyarrow.date64
-* [ARROW-2393](https://issues.apache.org/jira/browse/ARROW-2393) - [C++] arrow/status.h does not define ARROW\_CHECK needed for ARROW\_CHECK\_OK
-* [ARROW-2403](https://issues.apache.org/jira/browse/ARROW-2403) - [C++] arrow::CpuInfo::model\_name\_ destructed twice on exit
-* [ARROW-2405](https://issues.apache.org/jira/browse/ARROW-2405) - [C++] <functional\> is missing in plasma/client.h
-* [ARROW-2418](https://issues.apache.org/jira/browse/ARROW-2418) - [Rust] List builder fails due to memory not being reserved correctly
-* [ARROW-2419](https://issues.apache.org/jira/browse/ARROW-2419) - [Site] Website generation depends on local timezone
-* [ARROW-2420](https://issues.apache.org/jira/browse/ARROW-2420) - [Rust] Memory is never released
-* [ARROW-2421](https://issues.apache.org/jira/browse/ARROW-2421) - [C++] Update LLVM version in cpp README
-* [ARROW-2423](https://issues.apache.org/jira/browse/ARROW-2423) - [Python] PyArrow datatypes raise ValueError on equality checks against non-PyArrow objects
-* [ARROW-2424](https://issues.apache.org/jira/browse/ARROW-2424) - [Rust] Missing import causing broken build
-* [ARROW-2425](https://issues.apache.org/jira/browse/ARROW-2425) - [Rust] Array::from missing mapping for u8 type
-* [ARROW-2426](https://issues.apache.org/jira/browse/ARROW-2426) - [CI] glib build failure
-* [ARROW-2432](https://issues.apache.org/jira/browse/ARROW-2432) - [Python] from\_pandas fails when converting decimals if they have None values
-* [ARROW-2437](https://issues.apache.org/jira/browse/ARROW-2437) - [C++] Change of arrow::ipc::ReadMessage signature breaks ABI compatibility
-* [ARROW-2438](https://issues.apache.org/jira/browse/ARROW-2438) - [Rust] memory\_pool.rs misses license header
-* [ARROW-2441](https://issues.apache.org/jira/browse/ARROW-2441) - [Rust] Builder<T\>::slice\_mut assertions are too strict
-* [ARROW-2443](https://issues.apache.org/jira/browse/ARROW-2443) - [Python] Conversion from pandas of empty categorical fails with ArrowInvalid
-* [ARROW-2450](https://issues.apache.org/jira/browse/ARROW-2450) - [Python] Saving to parquet fails for empty lists
-* [ARROW-2452](https://issues.apache.org/jira/browse/ARROW-2452) - [TEST] Spark integration test fails with permission error
-* [ARROW-2454](https://issues.apache.org/jira/browse/ARROW-2454) - [Python] Empty chunked array slice crashes
-* [ARROW-2455](https://issues.apache.org/jira/browse/ARROW-2455) - [C++] The bytes\_allocated\_ in CudaContextImpl isn't initialized
-* [ARROW-2457](https://issues.apache.org/jira/browse/ARROW-2457) - garrow\_array\_builder\_append\_values() won't work for large arrays
-* [ARROW-2459](https://issues.apache.org/jira/browse/ARROW-2459) - pyarrow: Segfault with pyarrow.deserialize\_pandas
-* [ARROW-2462](https://issues.apache.org/jira/browse/ARROW-2462) - [C++] Segfault when writing a parquet table containing a dictionary column from Record Batch Stream
-* [ARROW-2465](https://issues.apache.org/jira/browse/ARROW-2465) - [Plasma] plasma\_store fails to find libarrow\_gpu.so
-* [ARROW-2466](https://issues.apache.org/jira/browse/ARROW-2466) - [C++] misleading "append" flag to FileOutputStream
-* [ARROW-2468](https://issues.apache.org/jira/browse/ARROW-2468) - [Rust] Builder::slice\_mut should take mut self
-* [ARROW-2471](https://issues.apache.org/jira/browse/ARROW-2471) - [Rust] Assertion when pushing value to Builder/ListBuilder with zero capacity
-* [ARROW-2473](https://issues.apache.org/jira/browse/ARROW-2473) - [Rust] List assertion error with list of zero length
-* [ARROW-2474](https://issues.apache.org/jira/browse/ARROW-2474) - [Rust] Add windows support for memory pool abstraction
-* [ARROW-2489](https://issues.apache.org/jira/browse/ARROW-2489) - [Plasma] test\_plasma.py crashes
-* [ARROW-2491](https://issues.apache.org/jira/browse/ARROW-2491) - [Python] Array.from\_buffers does not work for ListArray
-* [ARROW-2492](https://issues.apache.org/jira/browse/ARROW-2492) - [Python] Prevent segfault on accidental call of pyarrow.Array
-* [ARROW-2500](https://issues.apache.org/jira/browse/ARROW-2500) - [Java] IPC Writers/readers are not always setting validity bits correctly
-* [ARROW-2502](https://issues.apache.org/jira/browse/ARROW-2502) - [Rust] Restore Windows Compatibility
-* [ARROW-2503](https://issues.apache.org/jira/browse/ARROW-2503) - [Python] Trailing space character in RowGroup statistics of pyarrow.parquet.ParquetFile
-* [ARROW-2509](https://issues.apache.org/jira/browse/ARROW-2509) - [CI] Intermittent npm failures
-* [ARROW-2510](https://issues.apache.org/jira/browse/ARROW-2510) - [Python] Segmentation fault when converting empty column as categorical
-* [ARROW-2511](https://issues.apache.org/jira/browse/ARROW-2511) - BaseVariableWidthVector.allocateNew is not throwing OOM when it can't allocate memory
-* [ARROW-2514](https://issues.apache.org/jira/browse/ARROW-2514) - [Python] Inferring / converting nested Numpy array is very slow
-* [ARROW-2515](https://issues.apache.org/jira/browse/ARROW-2515) - Errors with DictionaryArray inside of ListArray or other DictionaryArray
-* [ARROW-2518](https://issues.apache.org/jira/browse/ARROW-2518) - [Java] Restore Java unit tests and javadoc test to CI matrix
-* [ARROW-2530](https://issues.apache.org/jira/browse/ARROW-2530) - [GLib] Out-of-source build fails
-* [ARROW-2534](https://issues.apache.org/jira/browse/ARROW-2534) - [C++] libarrow.so leaks zlib symbols
-* [ARROW-2545](https://issues.apache.org/jira/browse/ARROW-2545) - [Python] Arrow fails linking against statically-compiled Python
-* [ARROW-2554](https://issues.apache.org/jira/browse/ARROW-2554) - pa.array type inference bug when using NS-timestamp
-* [ARROW-2557](https://issues.apache.org/jira/browse/ARROW-2557) - [Rust] Add badge for code coverage in README
-* [ARROW-2561](https://issues.apache.org/jira/browse/ARROW-2561) - [C++] Crash in cuda-test shutdown with coverage enabled
-* [ARROW-2564](https://issues.apache.org/jira/browse/ARROW-2564) - [C++] Rowwise Tutorial is out of date
-* [ARROW-2565](https://issues.apache.org/jira/browse/ARROW-2565) - [Plasma] new subscriber cannot receive notifications about existing objects
-* [ARROW-2570](https://issues.apache.org/jira/browse/ARROW-2570) - [Python] Add support for writing parquet files with LZ4 compression
-* [ARROW-2571](https://issues.apache.org/jira/browse/ARROW-2571) - [C++] Lz4Codec doesn't properly handle empty data
-* [ARROW-2575](https://issues.apache.org/jira/browse/ARROW-2575) - [Python] Exclude hidden files when reading Parquet dataset
-* [ARROW-2578](https://issues.apache.org/jira/browse/ARROW-2578) - [Plasma] Valgrind errors related to std::random\_device
-* [ARROW-2589](https://issues.apache.org/jira/browse/ARROW-2589) - [Python] test\_parquet.py regression with Pandas 0.23.0
-* [ARROW-2593](https://issues.apache.org/jira/browse/ARROW-2593) - [Python] TypeError: data type "mixed-integer" not understood
-* [ARROW-2594](https://issues.apache.org/jira/browse/ARROW-2594) - [Java] Vector reallocation does not properly clear reused buffers
-* [ARROW-2599](https://issues.apache.org/jira/browse/ARROW-2599) - [Python] pip install is not working without Arrow C++ being installed
-* [ARROW-2601](https://issues.apache.org/jira/browse/ARROW-2601) - [Python] MemoryPool bytes\_allocated causes seg
-* [ARROW-2603](https://issues.apache.org/jira/browse/ARROW-2603) - [Python] from pandas raises ArrowInvalid for date(time) subclasses
-* [ARROW-2615](https://issues.apache.org/jira/browse/ARROW-2615) - [Rust] Refactor introduced a bug around Arrays of String
-* [ARROW-2622](https://issues.apache.org/jira/browse/ARROW-2622) - [C++] Array methods IsNull and IsValid are not complementary
-* [ARROW-2629](https://issues.apache.org/jira/browse/ARROW-2629) - [Plasma] Iterator invalidation for pending\_notifications\_
-* [ARROW-2630](https://issues.apache.org/jira/browse/ARROW-2630) - [Java] Typo in the document
-* [ARROW-2632](https://issues.apache.org/jira/browse/ARROW-2632) - [Java] ArrowStreamWriter accumulates ArrowBlock but does not use them
-* [ARROW-2640](https://issues.apache.org/jira/browse/ARROW-2640) - JS Writer should serialize schema metadata
-* [ARROW-2642](https://issues.apache.org/jira/browse/ARROW-2642) - [Python] Fail building parquet binding on Windows
-* [ARROW-2643](https://issues.apache.org/jira/browse/ARROW-2643) - [C++] Travis-CI build failure with cpp toolchain enabled
-* [ARROW-2644](https://issues.apache.org/jira/browse/ARROW-2644) - [Python] parquet binding fails building on AppVeyor
-* [ARROW-2655](https://issues.apache.org/jira/browse/ARROW-2655) - [C++] Failure with -Werror=conversion on gcc 7.3.0
-* [ARROW-2657](https://issues.apache.org/jira/browse/ARROW-2657) - Segfault when importing TensorFlow after Pyarrow
-* [ARROW-2668](https://issues.apache.org/jira/browse/ARROW-2668) - [C++] -Wnull-pointer-arithmetic warning with dlmalloc.c on clang 6.0, Ubuntu 14.04
-* [ARROW-2669](https://issues.apache.org/jira/browse/ARROW-2669) - [C++] EP\_CXX\_FLAGS not passed on when building gbenchmark
-* [ARROW-2675](https://issues.apache.org/jira/browse/ARROW-2675) - Arrow build error with clang-10 (Apple Clang / LLVM)
-* [ARROW-2683](https://issues.apache.org/jira/browse/ARROW-2683) - [Python] Resource Warning (Unclosed File) when using pyarrow.parquet.read\_table()
-* [ARROW-2690](https://issues.apache.org/jira/browse/ARROW-2690) - [C++] Plasma does not follow style conventions for variable and function names
-* [ARROW-2691](https://issues.apache.org/jira/browse/ARROW-2691) - [Rust] Travis fails due to formatting diff
-* [ARROW-2693](https://issues.apache.org/jira/browse/ARROW-2693) - [Python] pa.chunked\_array causes a segmentation fault on empty input
-* [ARROW-2694](https://issues.apache.org/jira/browse/ARROW-2694) - [Python] ArrayValue string conversion returns the representation instead of the converted python object string
-* [ARROW-2698](https://issues.apache.org/jira/browse/ARROW-2698) - [Python] Exception when passing a string to Table.column
-* [ARROW-2711](https://issues.apache.org/jira/browse/ARROW-2711) - [Python/C++] Pandas-Arrow doesn't roundtrip when column of lists has empty first element
-* [ARROW-2715](https://issues.apache.org/jira/browse/ARROW-2715) - Address apt flakiness with launchpad.net
-* [ARROW-2716](https://issues.apache.org/jira/browse/ARROW-2716) - [Python] Make manylinux1 base image independent of Python patch releases
-* [ARROW-2721](https://issues.apache.org/jira/browse/ARROW-2721) - [C++] Link error with Arrow C++ build with -DARROW\_ORC=ON on CentOS 7
-* [ARROW-2722](https://issues.apache.org/jira/browse/ARROW-2722) - [Python] ndarray to arrow conversion fails when downcasted from pandas to\_numeric
-* [ARROW-2723](https://issues.apache.org/jira/browse/ARROW-2723) - [C++] arrow-orc.pc is missing
-* [ARROW-2726](https://issues.apache.org/jira/browse/ARROW-2726) - [C++] The latest Boost version is wrong
-* [ARROW-2727](https://issues.apache.org/jira/browse/ARROW-2727) - [Java] Unable to build java/adapters module
-* [ARROW-2741](https://issues.apache.org/jira/browse/ARROW-2741) - [Python] pa.array from np.datetime[D] and type=pa.date64 produces invalid results
-* [ARROW-2744](https://issues.apache.org/jira/browse/ARROW-2744) - [Python] Writing to parquet crashes when writing a ListArray of empty lists
-* [ARROW-2745](https://issues.apache.org/jira/browse/ARROW-2745) - [C++] ORC ExternalProject needs to declare dependency on vendored protobuf
-* [ARROW-2747](https://issues.apache.org/jira/browse/ARROW-2747) - [CI] [Plasma] huge tables test failure on Travis
-* [ARROW-2754](https://issues.apache.org/jira/browse/ARROW-2754) - [Python] When installing pyarrow via pip, a debug build is created
-* [ARROW-2770](https://issues.apache.org/jira/browse/ARROW-2770) - [Packaging] Account for conda-forge compiler migration in conda recipes
-* [ARROW-2773](https://issues.apache.org/jira/browse/ARROW-2773) - [Python] Corrected parquet docs partition\_cols parameter name
-* [ARROW-2781](https://issues.apache.org/jira/browse/ARROW-2781) - [Python] Download boost using curl in manylinux1 image
-* [ARROW-2787](https://issues.apache.org/jira/browse/ARROW-2787) - [Python] Memory Issue passing table from python to c++ via cython
-* [ARROW-2795](https://issues.apache.org/jira/browse/ARROW-2795) - [Python] Run TensorFlow import workaround only on Linux
-* [ARROW-2806](https://issues.apache.org/jira/browse/ARROW-2806) - [Python] Inconsistent handling of np.nan
-* [ARROW-2810](https://issues.apache.org/jira/browse/ARROW-2810) - [Plasma] Plasma public headers leak flatbuffers.h
-* [ARROW-2812](https://issues.apache.org/jira/browse/ARROW-2812) - [Ruby] StructArray\#[] raises NoMethodError
-* [ARROW-2820](https://issues.apache.org/jira/browse/ARROW-2820) - [Python] RecordBatch.from\_arrays does not validate array lengths are all equal
-* [ARROW-2823](https://issues.apache.org/jira/browse/ARROW-2823) - [C++] Search for flatbuffers in <root\>/lib64
-* [ARROW-2841](https://issues.apache.org/jira/browse/ARROW-2841) - [Go] Fix recent Go build failures in Travis CI
-* [ARROW-2850](https://issues.apache.org/jira/browse/ARROW-2850) - [C++/Python] PARQUET\_RPATH\_ORIGIN=ON missing in manylinux1 build
-* [ARROW-2851](https://issues.apache.org/jira/browse/ARROW-2851) - [C++] Update RAT excludes for new install file names
-* [ARROW-2852](https://issues.apache.org/jira/browse/ARROW-2852) - [Rust] Mark Array as Sync and Send
-* [ARROW-2856](https://issues.apache.org/jira/browse/ARROW-2856) - [Python/C++] Array constructor should not truncate floats when casting to int
-* [ARROW-2862](https://issues.apache.org/jira/browse/ARROW-2862) - [C++] Ensure thirdparty download directory has been created in thirdparty/download\_thirdparty.sh
-* [ARROW-2867](https://issues.apache.org/jira/browse/ARROW-2867) - [Python] Incorrect example for Cython usage
-* [ARROW-2871](https://issues.apache.org/jira/browse/ARROW-2871) - [Python] Array.to\_numpy is invalid for boolean arrays
-* [ARROW-2872](https://issues.apache.org/jira/browse/ARROW-2872) - [Python] Add pytest mark to opt into TensorFlow-related unit tests
-* [ARROW-2876](https://issues.apache.org/jira/browse/ARROW-2876) - [Packaging] Crossbow builds can hang if you cloned using SSH
-* [ARROW-2877](https://issues.apache.org/jira/browse/ARROW-2877) - [Packaging] crossbow submit results in duplicate Travis CI build
-* [ARROW-2878](https://issues.apache.org/jira/browse/ARROW-2878) - [Packaging] README.md does not mention setting GitHub API token in user's crossbow repo settings
-* [ARROW-2883](https://issues.apache.org/jira/browse/ARROW-2883) - [Plasma] Compilation warnings
-* [ARROW-2891](https://issues.apache.org/jira/browse/ARROW-2891) - [Python] Preserve schema in write\_to\_dataset
-* [ARROW-2894](https://issues.apache.org/jira/browse/ARROW-2894) - [GLib] Format tests broken due to recent refactor
-* [ARROW-2895](https://issues.apache.org/jira/browse/ARROW-2895) - [Ruby] CI isn't run when C++ is changed
-* [ARROW-2896](https://issues.apache.org/jira/browse/ARROW-2896) - [GLib] exports are missing
-* [ARROW-2901](https://issues.apache.org/jira/browse/ARROW-2901) - [Java] Build is failing on Java9
-* [ARROW-2902](https://issues.apache.org/jira/browse/ARROW-2902) - [Python] HDFS Docker integration tests leave around files created by root
-* [ARROW-2903](https://issues.apache.org/jira/browse/ARROW-2903) - [C++] Setting -DARROW\_HDFS=OFF breaks arrow build when linking against boost libraries
-* [ARROW-2911](https://issues.apache.org/jira/browse/ARROW-2911) - [Python] Parquet binary statistics that end in '\0' truncate last byte
-* [ARROW-2917](https://issues.apache.org/jira/browse/ARROW-2917) - [Python] Tensor requiring gradient cannot be serialized with pyarrow.serialize
-* [ARROW-2920](https://issues.apache.org/jira/browse/ARROW-2920) - [Python] Segfault with pytorch 0.4
-* [ARROW-2926](https://issues.apache.org/jira/browse/ARROW-2926) - [Python] ParquetWriter segfaults in example where passed schema and table schema do not match
-* [ARROW-2930](https://issues.apache.org/jira/browse/ARROW-2930) - [C++] Trying to set target properties on a non-existent CMake target
-* [ARROW-2940](https://issues.apache.org/jira/browse/ARROW-2940) - [Python] Import error with pytorch 0.3
-* [ARROW-2945](https://issues.apache.org/jira/browse/ARROW-2945) - [Packaging] Update argument check for 02-source.sh
-* [ARROW-2955](https://issues.apache.org/jira/browse/ARROW-2955) - [Python] Typo in pyarrow's HDFS API result
-* [ARROW-2963](https://issues.apache.org/jira/browse/ARROW-2963) - [Python] Deadlock during fork-join and use\_threads=True
-* [ARROW-2978](https://issues.apache.org/jira/browse/ARROW-2978) - [Rust] Travis CI build is failing
-* [ARROW-2982](https://issues.apache.org/jira/browse/ARROW-2982) - The "--show-progress" option is only supported in wget 1.16 and higher
-* [ARROW-3210](https://issues.apache.org/jira/browse/ARROW-3210) - [Python] Creating ParquetDataset creates partitioned ParquetFiles with mismatched Parquet schemas
-
-
-## New Features and Improvements
-
-* [ARROW-530](https://issues.apache.org/jira/browse/ARROW-530) - C++/Python: Provide subpools for better memory allocation tracking
-* [ARROW-564](https://issues.apache.org/jira/browse/ARROW-564) - [Python] Add methods to return vanilla NumPy arrays (plus boolean mask array if there are nulls)
-* [ARROW-665](https://issues.apache.org/jira/browse/ARROW-665) - C++: Move zeroing logic for (re)allocations to the Allocator
-* [ARROW-889](https://issues.apache.org/jira/browse/ARROW-889) - [C++] Implement arrow::PrettyPrint for ChunkedArray
-* [ARROW-902](https://issues.apache.org/jira/browse/ARROW-902) - [C++] Build C++ project including thirdparty dependencies from local tarballs
-* [ARROW-906](https://issues.apache.org/jira/browse/ARROW-906) - [C++] Serialize Field metadata to IPC metadata
-* [ARROW-1018](https://issues.apache.org/jira/browse/ARROW-1018) - [C++] Add option to create FileOutputStream, ReadableFile from OS file descriptor
-* [ARROW-1163](https://issues.apache.org/jira/browse/ARROW-1163) - [Plasma][Java] Java client for Plasma
-* [ARROW-1388](https://issues.apache.org/jira/browse/ARROW-1388) - [Python] Add Table.drop method for removing columns
-* [ARROW-1454](https://issues.apache.org/jira/browse/ARROW-1454) - [Python] More informative error message when attempting to write an unsupported Arrow type to Parquet format
-* [ARROW-1715](https://issues.apache.org/jira/browse/ARROW-1715) - [Python] Implement pickling for Column, ChunkedArray, RecordBatch, Table
-* [ARROW-1722](https://issues.apache.org/jira/browse/ARROW-1722) - [C++] Add linting script to look for C++/CLI issues
-* [ARROW-1731](https://issues.apache.org/jira/browse/ARROW-1731) - [Python] Provide for selecting a subset of columns to convert in RecordBatch/Table.from\_pandas
-* [ARROW-1744](https://issues.apache.org/jira/browse/ARROW-1744) - [Plasma] Provide TensorFlow operator to read tensors from plasma
-* [ARROW-1780](https://issues.apache.org/jira/browse/ARROW-1780) - [Java] JDBC Adapter for Apache Arrow
-* [ARROW-1858](https://issues.apache.org/jira/browse/ARROW-1858) - [Python] Add documentation about parquet.write\_to\_dataset and related methods
-* [ARROW-1868](https://issues.apache.org/jira/browse/ARROW-1868) - [Java] Change vector getMinorType to use MinorType instead of Types.MinorType
-* [ARROW-1886](https://issues.apache.org/jira/browse/ARROW-1886) - [Python] Add function to "flatten" structs within tables
-* [ARROW-1913](https://issues.apache.org/jira/browse/ARROW-1913) - [Java] Fix Javadoc generation bugs with JDK8
-* [ARROW-1928](https://issues.apache.org/jira/browse/ARROW-1928) - [C++] Add benchmarks comparing performance of internal::BitmapReader/Writer with naive approaches
-* [ARROW-1954](https://issues.apache.org/jira/browse/ARROW-1954) - [Python] Add metadata accessor to pyarrow.Field
-* [ARROW-1964](https://issues.apache.org/jira/browse/ARROW-1964) - [Python] Expose Builder classes
-* [ARROW-2014](https://issues.apache.org/jira/browse/ARROW-2014) - [Python] Document read\_pandas method in pyarrow.parquet
-* [ARROW-2055](https://issues.apache.org/jira/browse/ARROW-2055) - [Java] Upgrade to Java 8
-* [ARROW-2060](https://issues.apache.org/jira/browse/ARROW-2060) - [Python] Documentation for creating StructArray using from\_arrays or a sequence of dicts
-* [ARROW-2061](https://issues.apache.org/jira/browse/ARROW-2061) - [C++] Run ASAN builds in Travis CI
-* [ARROW-2074](https://issues.apache.org/jira/browse/ARROW-2074) - [Python] Allow type inference for struct arrays
-* [ARROW-2097](https://issues.apache.org/jira/browse/ARROW-2097) - [Python] Suppress valgrind stdout/stderr in Travis CI builds when there are no errors
-* [ARROW-2100](https://issues.apache.org/jira/browse/ARROW-2100) - [Python] Drop Python 3.4 support
-* [ARROW-2140](https://issues.apache.org/jira/browse/ARROW-2140) - [Python] Conversion from Numpy float16 array unimplemented
-* [ARROW-2141](https://issues.apache.org/jira/browse/ARROW-2141) - [Python] Conversion from Numpy object array to varsize binary unimplemented
-* [ARROW-2147](https://issues.apache.org/jira/browse/ARROW-2147) - [Python] Type inference doesn't work on lists of Numpy arrays
-* [ARROW-2207](https://issues.apache.org/jira/browse/ARROW-2207) - [GLib] Support decimal type
-* [ARROW-2222](https://issues.apache.org/jira/browse/ARROW-2222) - [C++] Add option to validate Flatbuffers messages
-* [ARROW-2224](https://issues.apache.org/jira/browse/ARROW-2224) - [C++] Get rid of boost regex usage
-* [ARROW-2241](https://issues.apache.org/jira/browse/ARROW-2241) - [Python] Simple script for running all current ASV benchmarks at a commit or tag
-* [ARROW-2264](https://issues.apache.org/jira/browse/ARROW-2264) - [Python] Efficiently serialize numpy arrays with dtype of unicode fixed length string
-* [ARROW-2267](https://issues.apache.org/jira/browse/ARROW-2267) - Rust bindings
-* [ARROW-2276](https://issues.apache.org/jira/browse/ARROW-2276) - [Python] Tensor could implement the buffer protocol
-* [ARROW-2281](https://issues.apache.org/jira/browse/ARROW-2281) - [Python] Expose MakeArray to construct arrays from buffers
-* [ARROW-2285](https://issues.apache.org/jira/browse/ARROW-2285) - [Python] Can't convert Numpy string arrays
-* [ARROW-2286](https://issues.apache.org/jira/browse/ARROW-2286) - [Python] Allow subscripting pyarrow.lib.StructValue
-* [ARROW-2287](https://issues.apache.org/jira/browse/ARROW-2287) - [Python] chunked array not iterable, not indexable
-* [ARROW-2299](https://issues.apache.org/jira/browse/ARROW-2299) - [Go] Go language implementation
-* [ARROW-2301](https://issues.apache.org/jira/browse/ARROW-2301) - [Python] Add source distribution publishing instructions to package / release management documentation
-* [ARROW-2302](https://issues.apache.org/jira/browse/ARROW-2302) - [GLib] Run autotools and meson Linux builds in same Travis CI build entry
-* [ARROW-2308](https://issues.apache.org/jira/browse/ARROW-2308) - Serialized tensor data should be 64-byte aligned.
-* [ARROW-2315](https://issues.apache.org/jira/browse/ARROW-2315) - [C++/Python] Add method to flatten a struct array
-* [ARROW-2319](https://issues.apache.org/jira/browse/ARROW-2319) - [C++] Add buffered output class implementing OutputStream interface
-* [ARROW-2322](https://issues.apache.org/jira/browse/ARROW-2322) - Document requirements to run dev/release/01-perform.sh
-* [ARROW-2325](https://issues.apache.org/jira/browse/ARROW-2325) - [Python] Update setup.py to use Markdown project description
-* [ARROW-2330](https://issues.apache.org/jira/browse/ARROW-2330) - [C++] Optimize delta buffer creation with partially finishable array builders
-* [ARROW-2332](https://issues.apache.org/jira/browse/ARROW-2332) - [Python] Provide API for reading multiple Feather files
-* [ARROW-2334](https://issues.apache.org/jira/browse/ARROW-2334) - [C++] Update boost to 1.66.0
-* [ARROW-2335](https://issues.apache.org/jira/browse/ARROW-2335) - [Go] Move Go README one directory higher
-* [ARROW-2340](https://issues.apache.org/jira/browse/ARROW-2340) - [Website] Add blog post about Go codebase donation
-* [ARROW-2341](https://issues.apache.org/jira/browse/ARROW-2341) - [Python] pa.union() mode argument unintuitive
-* [ARROW-2343](https://issues.apache.org/jira/browse/ARROW-2343) - [Java/Packaging] Run mvn clean in API doc builds
-* [ARROW-2344](https://issues.apache.org/jira/browse/ARROW-2344) - [Go] Run Go unit tests in Travis CI
-* [ARROW-2345](https://issues.apache.org/jira/browse/ARROW-2345) - [Documentation] Fix bundle exec and set sphinx nosidebar to True
-* [ARROW-2348](https://issues.apache.org/jira/browse/ARROW-2348) - [GLib] Remove Go example
-* [ARROW-2350](https://issues.apache.org/jira/browse/ARROW-2350) - Shrink size of spark\_integration Docker container
-* [ARROW-2353](https://issues.apache.org/jira/browse/ARROW-2353) - Test correctness of built wheel on AppVeyor
-* [ARROW-2361](https://issues.apache.org/jira/browse/ARROW-2361) - [Rust] Start native Rust Implementation
-* [ARROW-2364](https://issues.apache.org/jira/browse/ARROW-2364) - [Plasma] PlasmaClient::Get() could take vector of object ids
-* [ARROW-2376](https://issues.apache.org/jira/browse/ARROW-2376) - [Rust] Travis should run tests for Rust library
-* [ARROW-2378](https://issues.apache.org/jira/browse/ARROW-2378) - [Rust] Use rustfmt to format source code
-* [ARROW-2381](https://issues.apache.org/jira/browse/ARROW-2381) - [Rust] Buffer<T\> should have an Iterator
-* [ARROW-2384](https://issues.apache.org/jira/browse/ARROW-2384) - Rust: Use Traits rather than defining methods directly
-* [ARROW-2385](https://issues.apache.org/jira/browse/ARROW-2385) - [Rust] Implement to\_json() for Field and DataType
-* [ARROW-2388](https://issues.apache.org/jira/browse/ARROW-2388) - [C++] Arrow::StringBuilder::Append() uses null\_bytes not valid\_bytes
-* [ARROW-2389](https://issues.apache.org/jira/browse/ARROW-2389) - [C++] Add StatusCode::OverflowError
-* [ARROW-2390](https://issues.apache.org/jira/browse/ARROW-2390) - [C++/Python] CheckPyError() could inspect exception type
-* [ARROW-2394](https://issues.apache.org/jira/browse/ARROW-2394) - [Python] Correct flake8 errors in benchmarks
-* [ARROW-2395](https://issues.apache.org/jira/browse/ARROW-2395) - [Python] Correct flake8 errors outside of pyarrow/ directory
-* [ARROW-2396](https://issues.apache.org/jira/browse/ARROW-2396) - Unify Rust Errors
-* [ARROW-2397](https://issues.apache.org/jira/browse/ARROW-2397) - Document changes in Tensor encoding in IPC.md.
-* [ARROW-2398](https://issues.apache.org/jira/browse/ARROW-2398) - [Rust] Provide a zero-copy builder for type-safe Buffer<T\>
-* [ARROW-2400](https://issues.apache.org/jira/browse/ARROW-2400) - [C++] Status destructor is expensive
-* [ARROW-2401](https://issues.apache.org/jira/browse/ARROW-2401) - Support filters on Hive partitioned Parquet files
-* [ARROW-2402](https://issues.apache.org/jira/browse/ARROW-2402) - [C++] FixedSizeBinaryBuilder::Append lacks "const char\*" overload
-* [ARROW-2404](https://issues.apache.org/jira/browse/ARROW-2404) - Fix declaration of 'type\_id' hides class member warning in msvc build
-* [ARROW-2407](https://issues.apache.org/jira/browse/ARROW-2407) - [GLib] Add garrow\_string\_array\_builder\_append\_values()
-* [ARROW-2408](https://issues.apache.org/jira/browse/ARROW-2408) - [Rust] It should be possible to get a &mut[T] from Builder<T\>
-* [ARROW-2411](https://issues.apache.org/jira/browse/ARROW-2411) - [C++] Add method to append batches of null-terminated strings to StringBuilder
-* [ARROW-2413](https://issues.apache.org/jira/browse/ARROW-2413) - [Rust] Remove useless use of \`format!\`
-* [ARROW-2414](https://issues.apache.org/jira/browse/ARROW-2414) - [Documentation] Fix miscellaneous documentation typos
-* [ARROW-2415](https://issues.apache.org/jira/browse/ARROW-2415) - [Rust] Fix using references in pattern matching
-* [ARROW-2416](https://issues.apache.org/jira/browse/ARROW-2416) - [C++] Support system libprotobuf
-* [ARROW-2417](https://issues.apache.org/jira/browse/ARROW-2417) - [Rust] Review APIs for safety
-* [ARROW-2422](https://issues.apache.org/jira/browse/ARROW-2422) - [Python] Support more filter operators on Hive partitioned Parquet files
-* [ARROW-2427](https://issues.apache.org/jira/browse/ARROW-2427) - [C++] ReadAt implementations suboptimal
-* [ARROW-2430](https://issues.apache.org/jira/browse/ARROW-2430) - MVP for branch based packaging automation
-* [ARROW-2433](https://issues.apache.org/jira/browse/ARROW-2433) - [Rust] Add Builder.push\_slice(&[T])
-* [ARROW-2434](https://issues.apache.org/jira/browse/ARROW-2434) - [Rust] Add windows support
-* [ARROW-2435](https://issues.apache.org/jira/browse/ARROW-2435) - [Rust] Add memory pool abstraction.
-* [ARROW-2436](https://issues.apache.org/jira/browse/ARROW-2436) - [Rust] Add windows CI
-* [ARROW-2439](https://issues.apache.org/jira/browse/ARROW-2439) - [Rust] Run license header checks also in Rust CI entry
-* [ARROW-2440](https://issues.apache.org/jira/browse/ARROW-2440) - [Rust] Implement ListBuilder<T\>
-* [ARROW-2442](https://issues.apache.org/jira/browse/ARROW-2442) - [C++] Disambiguate Builder::Append overloads
-* [ARROW-2445](https://issues.apache.org/jira/browse/ARROW-2445) - [Rust] Add documentation and make some fields private
-* [ARROW-2448](https://issues.apache.org/jira/browse/ARROW-2448) - Segfault when plasma client goes out of scope before buffer.
-* [ARROW-2451](https://issues.apache.org/jira/browse/ARROW-2451) - Handle more dtypes efficiently in custom numpy array serializer.
-* [ARROW-2453](https://issues.apache.org/jira/browse/ARROW-2453) - [Python] Improve Table column access
-* [ARROW-2458](https://issues.apache.org/jira/browse/ARROW-2458) - [Plasma] PlasmaClient uses global variable
-* [ARROW-2463](https://issues.apache.org/jira/browse/ARROW-2463) - [C++] Update flatbuffers to 1.9.0
-* [ARROW-2464](https://issues.apache.org/jira/browse/ARROW-2464) - [Python] Use a python\_version marker instead of a condition
-* [ARROW-2469](https://issues.apache.org/jira/browse/ARROW-2469) - Make out arguments last in ReadMessage API.
-* [ARROW-2470](https://issues.apache.org/jira/browse/ARROW-2470) - [C++] FileGetSize() should not seek
-* [ARROW-2472](https://issues.apache.org/jira/browse/ARROW-2472) - [Rust] The Schema and Fields types should not have public attributes
-* [ARROW-2477](https://issues.apache.org/jira/browse/ARROW-2477) - [Rust] Set up code coverage in CI
-* [ARROW-2478](https://issues.apache.org/jira/browse/ARROW-2478) - [C++] Introduce a checked\_cast function that performs a dynamic\_cast in debug mode
-* [ARROW-2479](https://issues.apache.org/jira/browse/ARROW-2479) - [C++] Have a global thread pool
-* [ARROW-2480](https://issues.apache.org/jira/browse/ARROW-2480) - [C++] Enable casting the value of a decimal to int32\_t or int64\_t
-* [ARROW-2481](https://issues.apache.org/jira/browse/ARROW-2481) - [Rust] Move calls to free() into memory.rs
-* [ARROW-2482](https://issues.apache.org/jira/browse/ARROW-2482) - [Rust] support nested types
-* [ARROW-2484](https://issues.apache.org/jira/browse/ARROW-2484) - [C++] Document ABI compliance checking
-* [ARROW-2485](https://issues.apache.org/jira/browse/ARROW-2485) - [C++] Output diff when run\_clang\_format.py reports a change
-* [ARROW-2486](https://issues.apache.org/jira/browse/ARROW-2486) - [C++/Python] Provide a Docker image that contains all dependencies for development
-* [ARROW-2488](https://issues.apache.org/jira/browse/ARROW-2488) - [C++] List Boost 1.67 as supported version
-* [ARROW-2493](https://issues.apache.org/jira/browse/ARROW-2493) - [Python] Add support for pickling to buffers and arrays
-* [ARROW-2494](https://issues.apache.org/jira/browse/ARROW-2494) - Return status codes from PlasmaClient::Seal
-* [ARROW-2498](https://issues.apache.org/jira/browse/ARROW-2498) - [Java] Upgrade to JDK 1.8
-* [ARROW-2499](https://issues.apache.org/jira/browse/ARROW-2499) - [C++] Add iterator facility for Python sequences
-* [ARROW-2505](https://issues.apache.org/jira/browse/ARROW-2505) - [C++] Disable MSVC warning C4800
-* [ARROW-2506](https://issues.apache.org/jira/browse/ARROW-2506) - [Plasma] Build error on macOS
-* [ARROW-2507](https://issues.apache.org/jira/browse/ARROW-2507) - [Rust] Don't take a reference when not needed
-* [ARROW-2508](https://issues.apache.org/jira/browse/ARROW-2508) - [Python] pytest API changes make tests fail
-* [ARROW-2513](https://issues.apache.org/jira/browse/ARROW-2513) - [Python] DictionaryType should give access to index type and dictionary array
-* [ARROW-2516](https://issues.apache.org/jira/browse/ARROW-2516) - AppVeyor Build Matrix should be specific to the changes made in a PR
-* [ARROW-2521](https://issues.apache.org/jira/browse/ARROW-2521) - [Rust] Refactor Rust API to use traits and generics
-* [ARROW-2522](https://issues.apache.org/jira/browse/ARROW-2522) - [C++] Version shared library files
-* [ARROW-2525](https://issues.apache.org/jira/browse/ARROW-2525) - [GLib] Add garrow\_struct\_array\_flatten()
-* [ARROW-2526](https://issues.apache.org/jira/browse/ARROW-2526) - [GLib] Update .gitignore
-* [ARROW-2527](https://issues.apache.org/jira/browse/ARROW-2527) - [GLib] Enable GPU document
-* [ARROW-2528](https://issues.apache.org/jira/browse/ARROW-2528) - [Rust] Add trait bounds for T in Buffer/List
-* [ARROW-2529](https://issues.apache.org/jira/browse/ARROW-2529) - [C++] Update mention of clang-format to 5.0 in the docs
-* [ARROW-2531](https://issues.apache.org/jira/browse/ARROW-2531) - [C++] Update clang bits to 6.0
-* [ARROW-2533](https://issues.apache.org/jira/browse/ARROW-2533) - [CI] Fast finish failing AppVeyor builds
-* [ARROW-2536](https://issues.apache.org/jira/browse/ARROW-2536) - [Rust] ListBuilder uses wrong initial size for offset builder
-* [ARROW-2537](https://issues.apache.org/jira/browse/ARROW-2537) - [Ruby] Import
-* [ARROW-2539](https://issues.apache.org/jira/browse/ARROW-2539) - [Plasma] Use unique\_ptr instead of raw pointer
-* [ARROW-2540](https://issues.apache.org/jira/browse/ARROW-2540) - [Plasma] add constructor/destructor to make sure dlfree is called automatically
-* [ARROW-2541](https://issues.apache.org/jira/browse/ARROW-2541) - [Plasma] Clean up macro usage
-* [ARROW-2543](https://issues.apache.org/jira/browse/ARROW-2543) - [Rust] CI should cache dependencies for faster builds
-* [ARROW-2544](https://issues.apache.org/jira/browse/ARROW-2544) - [CI] Run C++ tests with two jobs on Travis-CI
-* [ARROW-2547](https://issues.apache.org/jira/browse/ARROW-2547) - [Format] Fix off-by-one in List<List<byte\>\> example
-* [ARROW-2548](https://issues.apache.org/jira/browse/ARROW-2548) - [Format] Clarify \`List<Char\>\` Array example
-* [ARROW-2549](https://issues.apache.org/jira/browse/ARROW-2549) - [GLib] Apply arrow::StatusCodes changes to GArrowError
-* [ARROW-2550](https://issues.apache.org/jira/browse/ARROW-2550) - [C++] Add missing status codes into arrow::StatusCode::CodeAsString()
-* [ARROW-2551](https://issues.apache.org/jira/browse/ARROW-2551) - [Plasma] Improve notification logic
-* [ARROW-2552](https://issues.apache.org/jira/browse/ARROW-2552) - [Plasma] Unit tests are flaky
-* [ARROW-2553](https://issues.apache.org/jira/browse/ARROW-2553) - [Python] Set MACOSX\_DEPLOYMENT\_TARGET in wheel build
-* [ARROW-2558](https://issues.apache.org/jira/browse/ARROW-2558) - [Plasma] avoid walking through all the objects when a client disconnects
-* [ARROW-2562](https://issues.apache.org/jira/browse/ARROW-2562) - [C++] Upload coverage data to codecov.io
-* [ARROW-2563](https://issues.apache.org/jira/browse/ARROW-2563) - [Rust] Poor caching in Travis-CI
-* [ARROW-2566](https://issues.apache.org/jira/browse/ARROW-2566) - [CI] Add codecov.io badge to README
-* [ARROW-2567](https://issues.apache.org/jira/browse/ARROW-2567) - [C++/Python] Unit is ignored on comparison of TimestampArrays
-* [ARROW-2568](https://issues.apache.org/jira/browse/ARROW-2568) - [Python] Expose thread pool size setting to Python, and deprecate "nthreads"
-* [ARROW-2569](https://issues.apache.org/jira/browse/ARROW-2569) - [C++] Improve thread pool size heuristic
-* [ARROW-2574](https://issues.apache.org/jira/browse/ARROW-2574) - [CI] Collect and publish Python coverage
-* [ARROW-2576](https://issues.apache.org/jira/browse/ARROW-2576) - [GLib] Add abs functions for Decimal128.
-* [ARROW-2577](https://issues.apache.org/jira/browse/ARROW-2577) - [Plasma] Add ASV benchmarks
-* [ARROW-2580](https://issues.apache.org/jira/browse/ARROW-2580) - [GLib] Fix abs functions for Decimal128
-* [ARROW-2582](https://issues.apache.org/jira/browse/ARROW-2582) - [GLib] Add negate functions for Decimal128
-* [ARROW-2585](https://issues.apache.org/jira/browse/ARROW-2585) - [C++] Add Decimal128::FromBigEndian
-* [ARROW-2586](https://issues.apache.org/jira/browse/ARROW-2586) - [C++] Make child builders of ListBuilder and StructBuilder shared\_ptr's
-* [ARROW-2595](https://issues.apache.org/jira/browse/ARROW-2595) - [Plasma] operator[] creates entries in map
-* [ARROW-2596](https://issues.apache.org/jira/browse/ARROW-2596) - [GLib] Use the default value of GTK-Doc
-* [ARROW-2597](https://issues.apache.org/jira/browse/ARROW-2597) - [Plasma] remove UniqueIDHasher
-* [ARROW-2604](https://issues.apache.org/jira/browse/ARROW-2604) - [Java] Add method overload for VarCharVector.set(int,String)
-* [ARROW-2608](https://issues.apache.org/jira/browse/ARROW-2608) - [Java/Python] Add pyarrow.{Array,Field}.from\_jvm / jvm\_buffer
-* [ARROW-2611](https://issues.apache.org/jira/browse/ARROW-2611) - [Python] Python 2 integer serialization
-* [ARROW-2612](https://issues.apache.org/jira/browse/ARROW-2612) - [Plasma] Fix deprecated PLASMA\_DEFAULT\_RELEASE\_DELAY
-* [ARROW-2613](https://issues.apache.org/jira/browse/ARROW-2613) - [Docs] Update the gen\_apidocs docker script
-* [ARROW-2614](https://issues.apache.org/jira/browse/ARROW-2614) - [CI] Remove 'group: deprecated' in Travis
-* [ARROW-2626](https://issues.apache.org/jira/browse/ARROW-2626) - [Python] pandas ArrowInvalid message should include failing column name
-* [ARROW-2634](https://issues.apache.org/jira/browse/ARROW-2634) - [Go] Add LICENSE additions for Go subproject
-* [ARROW-2635](https://issues.apache.org/jira/browse/ARROW-2635) - [Ruby] LICENSE.txt isn't suitable
-* [ARROW-2636](https://issues.apache.org/jira/browse/ARROW-2636) - [Ruby] "Unofficial" package note is missing
-* [ARROW-2638](https://issues.apache.org/jira/browse/ARROW-2638) - [Python] Prevent calling extension class constructors directly
-* [ARROW-2639](https://issues.apache.org/jira/browse/ARROW-2639) - [Python] Remove unnecessary \_check\_nullptr methods
-* [ARROW-2641](https://issues.apache.org/jira/browse/ARROW-2641) - [C++] Investigate spurious memset() calls
-* [ARROW-2645](https://issues.apache.org/jira/browse/ARROW-2645) - [Java] ArrowStreamWriter accumulates DictionaryBatch ArrowBlocks
-* [ARROW-2649](https://issues.apache.org/jira/browse/ARROW-2649) - [C++] Add std::generate()-like function for faster bitmap writing
-* [ARROW-2656](https://issues.apache.org/jira/browse/ARROW-2656) - [Python] Improve ParquetManifest creation time
-* [ARROW-2660](https://issues.apache.org/jira/browse/ARROW-2660) - [Python] Experiment with zero-copy pickling
-* [ARROW-2661](https://issues.apache.org/jira/browse/ARROW-2661) - [Python/C++] Allow passing HDFS Config values via map/dict instead of needing an hdfs-site.xml file
-* [ARROW-2662](https://issues.apache.org/jira/browse/ARROW-2662) - [Python] Add to\_pandas / to\_numpy to ChunkedArray
-* [ARROW-2663](https://issues.apache.org/jira/browse/ARROW-2663) - [Python] Make dictionary\_encode and unique accessible on Column / ChunkedArray
-* [ARROW-2664](https://issues.apache.org/jira/browse/ARROW-2664) - [Python] Implement \_\_getitem\_\_ / slicing on Buffer
-* [ARROW-2666](https://issues.apache.org/jira/browse/ARROW-2666) - [Python] numpy.asarray should trigger to\_pandas on Array/ChunkedArray
-* [ARROW-2672](https://issues.apache.org/jira/browse/ARROW-2672) - [Python] Build ORC extension in manylinux1 wheels
-* [ARROW-2674](https://issues.apache.org/jira/browse/ARROW-2674) - [Packaging] Start building nightlies
-* [ARROW-2676](https://issues.apache.org/jira/browse/ARROW-2676) - [Packaging] Deploy build artifacts to github releases
-* [ARROW-2677](https://issues.apache.org/jira/browse/ARROW-2677) - [Python] Expose Parquet ZSTD compression
-* [ARROW-2678](https://issues.apache.org/jira/browse/ARROW-2678) - [GLib] Add extra information to common build problems on macOS
-* [ARROW-2680](https://issues.apache.org/jira/browse/ARROW-2680) - [Python] Add documentation about type inference in Table.from\_pandas
-* [ARROW-2682](https://issues.apache.org/jira/browse/ARROW-2682) - [CI] Notify in Slack about broken builds
-* [ARROW-2689](https://issues.apache.org/jira/browse/ARROW-2689) - [Python] Remove references to timestamps\_to\_ms argument from documentation
-* [ARROW-2692](https://issues.apache.org/jira/browse/ARROW-2692) - [Python] Add test for writing dictionary encoded columns to chunked Parquet files
-* [ARROW-2695](https://issues.apache.org/jira/browse/ARROW-2695) - [Python] Prevent calling scalar constructors directly
-* [ARROW-2696](https://issues.apache.org/jira/browse/ARROW-2696) - [JAVA] enhance AllocationListener with an onFailedAllocation() call
-* [ARROW-2699](https://issues.apache.org/jira/browse/ARROW-2699) - [C++/Python] Add Table method that replaces a column with a new supplied column
-* [ARROW-2700](https://issues.apache.org/jira/browse/ARROW-2700) - [Python] Add simple examples to Array.cast docstring
-* [ARROW-2701](https://issues.apache.org/jira/browse/ARROW-2701) - [C++] Make MemoryMappedFile resizable
-* [ARROW-2704](https://issues.apache.org/jira/browse/ARROW-2704) - [Java] IPC stream handling should be more friendly to low level processing
-* [ARROW-2713](https://issues.apache.org/jira/browse/ARROW-2713) - [Packaging] Fix linux package builds
-* [ARROW-2717](https://issues.apache.org/jira/browse/ARROW-2717) - [Packaging] Postfix conda artifacts with target arch
-* [ARROW-2718](https://issues.apache.org/jira/browse/ARROW-2718) - [Packaging] GPG sign downloaded artifacts
-* [ARROW-2724](https://issues.apache.org/jira/browse/ARROW-2724) - [Packaging] Determine whether all the expected artifacts are uploaded
-* [ARROW-2725](https://issues.apache.org/jira/browse/ARROW-2725) - [JAVA] make Accountant.AllocationOutcome publicly visible
-* [ARROW-2729](https://issues.apache.org/jira/browse/ARROW-2729) - [GLib] Add decimal128 array builder
-* [ARROW-2731](https://issues.apache.org/jira/browse/ARROW-2731) - Allow usage of external ORC library
-* [ARROW-2732](https://issues.apache.org/jira/browse/ARROW-2732) - Update brew packages for macOS
-* [ARROW-2733](https://issues.apache.org/jira/browse/ARROW-2733) - [GLib] Cast garrow\_decimal128 to gint64
-* [ARROW-2738](https://issues.apache.org/jira/browse/ARROW-2738) - [GLib] Use Brewfile on installation process
-* [ARROW-2739](https://issues.apache.org/jira/browse/ARROW-2739) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE for GArrowDecimalDataType and GArrowDecimal128ArrayBuilder
-* [ARROW-2740](https://issues.apache.org/jira/browse/ARROW-2740) - [Python] Add address property to Buffer
-* [ARROW-2742](https://issues.apache.org/jira/browse/ARROW-2742) - [Python] Allow Table.from\_batches to use Iterator of ArrowRecordBatches
-* [ARROW-2748](https://issues.apache.org/jira/browse/ARROW-2748) - [GLib] Add garrow\_decimal\_data\_type\_get\_scale() (and \_precision())
-* [ARROW-2749](https://issues.apache.org/jira/browse/ARROW-2749) - [GLib] Rename \*garrow\_decimal128\_array\_get\_value to \*garrow\_decimal128\_array\_format\_value
-* [ARROW-2751](https://issues.apache.org/jira/browse/ARROW-2751) - [GLib] Add garrow\_table\_replace\_column()
-* [ARROW-2752](https://issues.apache.org/jira/browse/ARROW-2752) - [GLib] Document garrow\_decimal\_data\_type\_new()
-* [ARROW-2753](https://issues.apache.org/jira/browse/ARROW-2753) - [GLib] Add garrow\_schema\_\*\_field()
-* [ARROW-2755](https://issues.apache.org/jira/browse/ARROW-2755) - [Python] Allow using Ninja to build extension
-* [ARROW-2756](https://issues.apache.org/jira/browse/ARROW-2756) - [Python] Remove redundant imports and minor fixes in parquet tests
-* [ARROW-2758](https://issues.apache.org/jira/browse/ARROW-2758) - [Plasma] Use Scope enum in Plasma
-* [ARROW-2760](https://issues.apache.org/jira/browse/ARROW-2760) - [Python] Remove legacy property definition syntax from parquet module and test them
-* [ARROW-2761](https://issues.apache.org/jira/browse/ARROW-2761) - Support set filter operators on Hive partitioned Parquet files
-* [ARROW-2763](https://issues.apache.org/jira/browse/ARROW-2763) - [Python] Make parquet \_metadata file accessible from ParquetDataset
-* [ARROW-2780](https://issues.apache.org/jira/browse/ARROW-2780) - [Go] Run code coverage analysis
-* [ARROW-2784](https://issues.apache.org/jira/browse/ARROW-2784) - [C++] MemoryMappedFile::WriteAt allow writing past the end
-* [ARROW-2790](https://issues.apache.org/jira/browse/ARROW-2790) - [C++] Buffers contain uninitialized memory
-* [ARROW-2791](https://issues.apache.org/jira/browse/ARROW-2791) - [Packaging] Build Ubuntu 18.04 packages
-* [ARROW-2792](https://issues.apache.org/jira/browse/ARROW-2792) - [Packaging] Consider uploading tarballs to avoid naming conflicts
-* [ARROW-2794](https://issues.apache.org/jira/browse/ARROW-2794) - [Plasma] Add Delete method for multiple objects
-* [ARROW-2798](https://issues.apache.org/jira/browse/ARROW-2798) - [Plasma] Use hashing function that takes into account all UniqueID bytes
-* [ARROW-2802](https://issues.apache.org/jira/browse/ARROW-2802) - [Docs] Move release management guide to project wiki
-* [ARROW-2804](https://issues.apache.org/jira/browse/ARROW-2804) - [Website] Link to Developer wiki (Confluence) from front page
-* [ARROW-2805](https://issues.apache.org/jira/browse/ARROW-2805) - [Python] TensorFlow import workaround not working with tensorflow-gpu if CUDA is not installed
-* [ARROW-2809](https://issues.apache.org/jira/browse/ARROW-2809) - [C++] Decrease verbosity of lint checks in Travis CI
-* [ARROW-2811](https://issues.apache.org/jira/browse/ARROW-2811) - [Python] Test serialization for determinism
-* [ARROW-2815](https://issues.apache.org/jira/browse/ARROW-2815) - [CI] Suppress DEBUG logging when building Java library in C++ CI entries
-* [ARROW-2816](https://issues.apache.org/jira/browse/ARROW-2816) - [Python] Add \_\_iter\_\_ method to NativeFile (see the sketch after this list)
-* [ARROW-2821](https://issues.apache.org/jira/browse/ARROW-2821) - [C++] Only zero memory in BooleanBuilder in one place
-* [ARROW-2822](https://issues.apache.org/jira/browse/ARROW-2822) - [C++] Zero padding bytes in PoolBuffer::Resize
-* [ARROW-2824](https://issues.apache.org/jira/browse/ARROW-2824) - [GLib] Add garrow\_decimal128\_array\_get\_value()
-* [ARROW-2825](https://issues.apache.org/jira/browse/ARROW-2825) - [C++] Need AllocateBuffer / AllocateResizableBuffer variant with default memory pool
-* [ARROW-2826](https://issues.apache.org/jira/browse/ARROW-2826) - [C++] Clarification needed between ArrayBuilder::Init(), Resize() and Reserve()
-* [ARROW-2827](https://issues.apache.org/jira/browse/ARROW-2827) - [C++] LZ4 and Zstd builds may fail in parallel builds
-* [ARROW-2829](https://issues.apache.org/jira/browse/ARROW-2829) - [GLib] Add GArrowORCFileReader
-* [ARROW-2830](https://issues.apache.org/jira/browse/ARROW-2830) - [Packaging] Enable parallel build for deb package build again
-* [ARROW-2832](https://issues.apache.org/jira/browse/ARROW-2832) - [Python] Pretty-print schema metadata in Schema.\_\_repr\_\_
-* [ARROW-2833](https://issues.apache.org/jira/browse/ARROW-2833) - [Python] Column.\_\_repr\_\_ will lock up Jupyter with large datasets
-* [ARROW-2834](https://issues.apache.org/jira/browse/ARROW-2834) - [GLib] Remove "enable\_" prefix from Meson options
-* [ARROW-2836](https://issues.apache.org/jira/browse/ARROW-2836) - [Packaging] Expand build matrices to multiple tasks
-* [ARROW-2837](https://issues.apache.org/jira/browse/ARROW-2837) - [C++] ArrayBuilder::null\_bitmap returns PoolBuffer
-* [ARROW-2838](https://issues.apache.org/jira/browse/ARROW-2838) - [Python] Speed up null testing with Pandas semantics
-* [ARROW-2844](https://issues.apache.org/jira/browse/ARROW-2844) - [Packaging] Test OSX wheels after build
-* [ARROW-2845](https://issues.apache.org/jira/browse/ARROW-2845) - [Packaging] Upload additional debian artifacts
-* [ARROW-2846](https://issues.apache.org/jira/browse/ARROW-2846) - [Packaging] Update nightly build in crossbow as well as the sample configuration
-* [ARROW-2847](https://issues.apache.org/jira/browse/ARROW-2847) - [Packaging] Fix artifact name matching for conda forge packages
-* [ARROW-2848](https://issues.apache.org/jira/browse/ARROW-2848) - [Packaging] lib\*.deb package name doesn't match soversion
-* [ARROW-2849](https://issues.apache.org/jira/browse/ARROW-2849) - [Ruby] Arrow::Table\#load supports ORC
-* [ARROW-2855](https://issues.apache.org/jira/browse/ARROW-2855) - [C++] Blog post that outlines the benefits of using jemalloc
-* [ARROW-2859](https://issues.apache.org/jira/browse/ARROW-2859) - [Python] Handle objects exporting the buffer protocol in open\_stream, open\_file, and RecordBatch\*Reader APIs
-* [ARROW-2861](https://issues.apache.org/jira/browse/ARROW-2861) - [Python] Add extra tips about using Parquet to store index-less pandas data
-* [ARROW-2864](https://issues.apache.org/jira/browse/ARROW-2864) - [Plasma] Add deletion cache to delete objects later
-* [ARROW-2868](https://issues.apache.org/jira/browse/ARROW-2868) - [Packaging] Fix centos-7 build
-* [ARROW-2869](https://issues.apache.org/jira/browse/ARROW-2869) - [Python] Add documentation for Array.to\_numpy
-* [ARROW-2874](https://issues.apache.org/jira/browse/ARROW-2874) - [Packaging] Pass job prefix when putting on Queue
-* [ARROW-2875](https://issues.apache.org/jira/browse/ARROW-2875) - [Packaging] Don't attempt to download arrow archive in linux builds
-* [ARROW-2881](https://issues.apache.org/jira/browse/ARROW-2881) - [Website] Add Community tab to website
-* [ARROW-2884](https://issues.apache.org/jira/browse/ARROW-2884) - [Packaging] Options to build packages from apache source archive
-* [ARROW-2886](https://issues.apache.org/jira/browse/ARROW-2886) - [Release] An unused variable exists
-* [ARROW-2890](https://issues.apache.org/jira/browse/ARROW-2890) - [Plasma] Make Python PlasmaClient.release private
-* [ARROW-2893](https://issues.apache.org/jira/browse/ARROW-2893) - [C++] Remove PoolBuffer class from public API and hide implementation details behind factory functions
-* [ARROW-2897](https://issues.apache.org/jira/browse/ARROW-2897) - Organize supported Ubuntu versions
-* [ARROW-2898](https://issues.apache.org/jira/browse/ARROW-2898) - [Packaging] Setuptools\_scm just shipped a new version which fails to parse \`apache-arrow-<version\>\` tag
-* [ARROW-2906](https://issues.apache.org/jira/browse/ARROW-2906) - [Website] Remove the link to slack channel
-* [ARROW-2907](https://issues.apache.org/jira/browse/ARROW-2907) - [GitHub] Improve "How to contribute patches"
-* [ARROW-2908](https://issues.apache.org/jira/browse/ARROW-2908) - [Rust] Update version to 0.10.0
-* [ARROW-2914](https://issues.apache.org/jira/browse/ARROW-2914) - [Integration] Add WindowPandasUDFTests to Spark Integration
-* [ARROW-2915](https://issues.apache.org/jira/browse/ARROW-2915) - [Packaging] Remove artifact from ubuntu-trusty build
-* [ARROW-2918](https://issues.apache.org/jira/browse/ARROW-2918) - [C++] Improve formatting of Struct pretty prints
-* [ARROW-2921](https://issues.apache.org/jira/browse/ARROW-2921) - [Release] Update .deb/.rpm changelogs in preparation
-* [ARROW-2922](https://issues.apache.org/jira/browse/ARROW-2922) - [Release] Make python command name customizable
-* [ARROW-2923](https://issues.apache.org/jira/browse/ARROW-2923) - [Doc] Add instructions for running Spark integration tests
-* [ARROW-2924](https://issues.apache.org/jira/browse/ARROW-2924) - [Java] mvn release fails when an older maven javadoc plugin is installed
-* [ARROW-2927](https://issues.apache.org/jira/browse/ARROW-2927) - [Packaging] AppVeyor wheel task is failing on initial checkout
-* [ARROW-2928](https://issues.apache.org/jira/browse/ARROW-2928) - [Packaging] AppVeyor crossbow conda builds are picking up boost 1.63.0 instead of the installed version
-* [ARROW-2929](https://issues.apache.org/jira/browse/ARROW-2929) - [C++] ARROW-2826 Breaks parquet-cpp 1.4.0 builds
-* [ARROW-2934](https://issues.apache.org/jira/browse/ARROW-2934) - [Packaging] Add checksums creation to sign subcommand
-* [ARROW-2935](https://issues.apache.org/jira/browse/ARROW-2935) - [Packaging] Add verify\_binary\_artifacts function to verify-release-candidate.sh
-* [ARROW-2937](https://issues.apache.org/jira/browse/ARROW-2937) - [Java] Follow-up changes to ARROW-2704
-* [ARROW-2943](https://issues.apache.org/jira/browse/ARROW-2943) - [C++] Implement BufferedOutputStream::Flush
-* [ARROW-2944](https://issues.apache.org/jira/browse/ARROW-2944) - [Format] Arrow columnar format docs mentions VectorLayout that does not exist anymore
-* [ARROW-2946](https://issues.apache.org/jira/browse/ARROW-2946) - [Packaging] Stop using PWD in debian/rules
-* [ARROW-2947](https://issues.apache.org/jira/browse/ARROW-2947) - [Packaging] Remove Ubuntu Artful
-* [ARROW-2949](https://issues.apache.org/jira/browse/ARROW-2949) - [CI] repo.continuum.io can be flaky in builds
-* [ARROW-2951](https://issues.apache.org/jira/browse/ARROW-2951) - [CI] Changes in format/ should cause Appveyor builds to run
-* [ARROW-2953](https://issues.apache.org/jira/browse/ARROW-2953) - [Plasma] Store memory usage
-* [ARROW-2954](https://issues.apache.org/jira/browse/ARROW-2954) - [Plasma] Store object\_id only once in object table
-* [ARROW-2962](https://issues.apache.org/jira/browse/ARROW-2962) - [Packaging] Bintray descriptor files are no longer needed
-* [ARROW-2977](https://issues.apache.org/jira/browse/ARROW-2977) - [Packaging] Release verification script should check rust too
-* [ARROW-2985](https://issues.apache.org/jira/browse/ARROW-2985) - [Ruby] Run unit tests in verify-release-candidate.sh
-* [ARROW-2988](https://issues.apache.org/jira/browse/ARROW-2988) - [Release] More automated release verification on Windows
-* [ARROW-2990](https://issues.apache.org/jira/browse/ARROW-2990) - [GLib] Fail to build with rpath-ed Arrow C++ on macOS
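-
-As a minimal illustration of two of the Python-facing additions above (not part of the original release notes): ARROW-2664 made `Buffer` sliceable and ARROW-2816 made `NativeFile` iterable. The sketch below assumes a pyarrow build that includes both changes; the literals are illustrative only.
-
-```python
-import pyarrow as pa
-
-# Buffer supports __getitem__ / slicing (ARROW-2664); a slice is a view
-# over the same memory, converted to bytes here only for the assertion.
-buf = pa.py_buffer(b"hello arrow")
-assert buf[6:].to_pybytes() == b"arrow"
-
-# NativeFile supports line-by-line iteration (ARROW-2816); each line is bytes.
-reader = pa.BufferReader(b"line one\nline two\n")
-for line in reader:
-    print(line)
-```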
-
-
-
-# Apache Arrow 0.9.0 (2018-03-19)
-
-## New Features and Improvements
-
-* [ARROW-232](https://issues.apache.org/jira/browse/ARROW-232) - C++/Parquet: Support writing chunked arrays as part of a table
-* [ARROW-633](https://issues.apache.org/jira/browse/ARROW-633) - [Java] Add support for FixedSizeBinary type
-* [ARROW-634](https://issues.apache.org/jira/browse/ARROW-634) - Add integration tests for FixedSizeBinary
-* [ARROW-760](https://issues.apache.org/jira/browse/ARROW-760) - [Python] document differences w.r.t. fastparquet
-* [ARROW-764](https://issues.apache.org/jira/browse/ARROW-764) - [C++] Improve performance of CopyBitmap, add benchmarks
-* [ARROW-969](https://issues.apache.org/jira/browse/ARROW-969) - [C++/Python] Add add/remove field functions for RecordBatch
-* [ARROW-1021](https://issues.apache.org/jira/browse/ARROW-1021) - [Python] Add documentation about using pyarrow from other Cython and C++ projects
-* [ARROW-1035](https://issues.apache.org/jira/browse/ARROW-1035) - [Python] Add ASV benchmarks for streaming columnar deserialization
-* [ARROW-1394](https://issues.apache.org/jira/browse/ARROW-1394) - [Plasma] Add optional extension for allocating memory on GPUs
-* [ARROW-1463](https://issues.apache.org/jira/browse/ARROW-1463) - [JAVA] Restructure ValueVector hierarchy to minimize compile-time generated code
-* [ARROW-1579](https://issues.apache.org/jira/browse/ARROW-1579) - [Java] Add dockerized test setup to validate Spark integration
-* [ARROW-1580](https://issues.apache.org/jira/browse/ARROW-1580) - [Python] Instructions for setting up nightly builds on Linux
-* [ARROW-1623](https://issues.apache.org/jira/browse/ARROW-1623) - [C++] Add convenience method to construct Buffer from a string that owns its memory
-* [ARROW-1632](https://issues.apache.org/jira/browse/ARROW-1632) - [Python] Permit categorical conversions in Table.to\_pandas on a per-column basis
-* [ARROW-1643](https://issues.apache.org/jira/browse/ARROW-1643) - [Python] Accept hdfs:// prefixes in parquet.read\_table and attempt to connect to HDFS
-* [ARROW-1705](https://issues.apache.org/jira/browse/ARROW-1705) - [Python] Create StructArray from sequence of dicts given a known data type
-* [ARROW-1706](https://issues.apache.org/jira/browse/ARROW-1706) - [Python] StructArray.from\_arrays should handle sequences that are coercible to arrays
-* [ARROW-1712](https://issues.apache.org/jira/browse/ARROW-1712) - [C++] Add method to BinaryBuilder to reserve space for value data
-* [ARROW-1757](https://issues.apache.org/jira/browse/ARROW-1757) - [C++] Add DictionaryArray::FromArrays alternate ctor that can check or sanitize "untrusted" indices
-* [ARROW-1815](https://issues.apache.org/jira/browse/ARROW-1815) - [Java] Rename MapVector to StructVector
-* [ARROW-1832](https://issues.apache.org/jira/browse/ARROW-1832) - [JS] Implement JSON reader for integration tests
-* [ARROW-1835](https://issues.apache.org/jira/browse/ARROW-1835) - [C++] Create Arrow schema from std::tuple types
-* [ARROW-1861](https://issues.apache.org/jira/browse/ARROW-1861) - [Python] Fix up ASV setup, add developer instructions for writing new benchmarks and running benchmark suite locally
-* [ARROW-1872](https://issues.apache.org/jira/browse/ARROW-1872) - [Website] Populate hard-coded fields for current release from a YAML file
-* [ARROW-1899](https://issues.apache.org/jira/browse/ARROW-1899) - [Python] Refactor handling of null sentinels in python/numpy\_to\_arrow.cc
-* [ARROW-1920](https://issues.apache.org/jira/browse/ARROW-1920) - Add support for reading ORC files
-* [ARROW-1926](https://issues.apache.org/jira/browse/ARROW-1926) - [GLib] Add garrow\_timestamp\_data\_type\_get\_unit()
-* [ARROW-1927](https://issues.apache.org/jira/browse/ARROW-1927) - [Plasma] Implement delete function
-* [ARROW-1929](https://issues.apache.org/jira/browse/ARROW-1929) - [C++] Move various Arrow testing utility code from Parquet to Arrow codebase
-* [ARROW-1930](https://issues.apache.org/jira/browse/ARROW-1930) - [C++] Implement Slice for ChunkedArray and Column
-* [ARROW-1931](https://issues.apache.org/jira/browse/ARROW-1931) - [C++] w4996 warning due to std::tr1 failing builds on Visual Studio 2017
-* [ARROW-1937](https://issues.apache.org/jira/browse/ARROW-1937) - [Python] Add documentation for different forms of constructing nested arrays from Python data structures
-* [ARROW-1942](https://issues.apache.org/jira/browse/ARROW-1942) - [C++] Hash table specializations for small integers
-* [ARROW-1947](https://issues.apache.org/jira/browse/ARROW-1947) - [Plasma] Change Client Create and Get to use Buffers
-* [ARROW-1951](https://issues.apache.org/jira/browse/ARROW-1951) - Add memcopy\_threads to serialization context
-* [ARROW-1962](https://issues.apache.org/jira/browse/ARROW-1962) - [Java] Add reset() to ValueVector interface
-* [ARROW-1965](https://issues.apache.org/jira/browse/ARROW-1965) - [GLib] Add garrow\_array\_builder\_get\_value\_data\_type() and garrow\_array\_builder\_get\_value\_type()
-* [ARROW-1969](https://issues.apache.org/jira/browse/ARROW-1969) - [C++] Do not build ORC adapter by default
-* [ARROW-1970](https://issues.apache.org/jira/browse/ARROW-1970) - [GLib] Add garrow\_chunked\_array\_get\_value\_data\_type() and garrow\_chunked\_array\_get\_value\_type()
-* [ARROW-1977](https://issues.apache.org/jira/browse/ARROW-1977) - [C++] Update windows dev docs
-* [ARROW-1978](https://issues.apache.org/jira/browse/ARROW-1978) - [Website] Add more visible link to "Powered By" page to front page, simplify Powered By
-* [ARROW-2004](https://issues.apache.org/jira/browse/ARROW-2004) - [C++] Add shrink\_to\_fit option in BufferBuilder::Resize
-* [ARROW-2007](https://issues.apache.org/jira/browse/ARROW-2007) - [Python] Sequence converter for float32 not implemented
-* [ARROW-2011](https://issues.apache.org/jira/browse/ARROW-2011) - Allow setting the pickler to use in pyarrow serialization.
-* [ARROW-2012](https://issues.apache.org/jira/browse/ARROW-2012) - [GLib] Support "make distclean"
-* [ARROW-2018](https://issues.apache.org/jira/browse/ARROW-2018) - [C++] Build instruction on macOS and Homebrew is incomplete
-* [ARROW-2019](https://issues.apache.org/jira/browse/ARROW-2019) - Control the memory allocated for inner vector in LIST
-* [ARROW-2024](https://issues.apache.org/jira/browse/ARROW-2024) - [Python] Remove global SerializationContext variables
-* [ARROW-2028](https://issues.apache.org/jira/browse/ARROW-2028) - [Python] extra\_cmake\_args needs to be passed through shlex.split
-* [ARROW-2031](https://issues.apache.org/jira/browse/ARROW-2031) - HadoopFileSystem isn't pickleable
-* [ARROW-2035](https://issues.apache.org/jira/browse/ARROW-2035) - [C++] Update vendored cpplint.py to a Py3-compatible one
-* [ARROW-2036](https://issues.apache.org/jira/browse/ARROW-2036) - NativeFile should support standard IOBase methods
-* [ARROW-2042](https://issues.apache.org/jira/browse/ARROW-2042) - [Plasma] Revert API change of plasma::Create to output a MutableBuffer
-* [ARROW-2043](https://issues.apache.org/jira/browse/ARROW-2043) - [C++] Change description from OS X to macOS
-* [ARROW-2046](https://issues.apache.org/jira/browse/ARROW-2046) - [Python] Add support for PEP519 - pathlib and similar objects
-* [ARROW-2048](https://issues.apache.org/jira/browse/ARROW-2048) - [Python/C++] Update Thrift pin to 0.11
-* [ARROW-2050](https://issues.apache.org/jira/browse/ARROW-2050) - Support \`setup.py pytest\` to automatically fetch the test dependencies
-* [ARROW-2052](https://issues.apache.org/jira/browse/ARROW-2052) - Unify OwnedRef and ScopedRef
-* [ARROW-2053](https://issues.apache.org/jira/browse/ARROW-2053) - [C++] Build instruction is incomplete
-* [ARROW-2054](https://issues.apache.org/jira/browse/ARROW-2054) - Compilation warnings
-* [ARROW-2064](https://issues.apache.org/jira/browse/ARROW-2064) - [GLib] Add common build problems link to the install section
-* [ARROW-2065](https://issues.apache.org/jira/browse/ARROW-2065) - Fix bug in SerializationContext.clone().
-* [ARROW-2066](https://issues.apache.org/jira/browse/ARROW-2066) - [Python] Document reading Parquet files from Azure Blob Store
-* [ARROW-2068](https://issues.apache.org/jira/browse/ARROW-2068) - [Python] Expose Array's buffers to Python users
-* [ARROW-2069](https://issues.apache.org/jira/browse/ARROW-2069) - [Python] Document that Plasma is not (yet) supported on Windows
-* [ARROW-2071](https://issues.apache.org/jira/browse/ARROW-2071) - [Python] Reduce runtime of builds in Travis CI
-* [ARROW-2073](https://issues.apache.org/jira/browse/ARROW-2073) - [Python] Create StructArray from sequence of tuples given a known data type
-* [ARROW-2076](https://issues.apache.org/jira/browse/ARROW-2076) - [Python] Display slowest test durations
-* [ARROW-2083](https://issues.apache.org/jira/browse/ARROW-2083) - Support skipping builds
-* [ARROW-2084](https://issues.apache.org/jira/browse/ARROW-2084) - [C++] Support newer Brotli static library names
-* [ARROW-2086](https://issues.apache.org/jira/browse/ARROW-2086) - [Python] Shrink size of arrow\_manylinux1\_x86\_64\_base docker image
-* [ARROW-2087](https://issues.apache.org/jira/browse/ARROW-2087) - [Python] Binaries of 3rdparty are not stripped in manylinux1 base image
-* [ARROW-2088](https://issues.apache.org/jira/browse/ARROW-2088) - [GLib] Add GArrowNumericArray
-* [ARROW-2089](https://issues.apache.org/jira/browse/ARROW-2089) - [GLib] Rename to GARROW\_TYPE\_BOOLEAN for consistency
-* [ARROW-2090](https://issues.apache.org/jira/browse/ARROW-2090) - [Python] Add context manager methods to ParquetWriter (see the sketch after this list)
-* [ARROW-2093](https://issues.apache.org/jira/browse/ARROW-2093) - [Python] Possibly do not test pytorch serialization in Travis CI
-* [ARROW-2094](https://issues.apache.org/jira/browse/ARROW-2094) - [Python] Use toolchain libraries and PROTOBUF\_HOME for protocol buffers
-* [ARROW-2095](https://issues.apache.org/jira/browse/ARROW-2095) - [C++] Suppress ORC EP build logging by default
-* [ARROW-2096](https://issues.apache.org/jira/browse/ARROW-2096) - [C++] Turn off Boost\_DEBUG to trim build output
-* [ARROW-2099](https://issues.apache.org/jira/browse/ARROW-2099) - [Python] Support DictionaryArray::FromArrays in Python bindings
-* [ARROW-2107](https://issues.apache.org/jira/browse/ARROW-2107) - [GLib] Follow arrow::gpu::CudaIpcMemHandle API change
-* [ARROW-2108](https://issues.apache.org/jira/browse/ARROW-2108) - [Python] Update instructions for ASV
-* [ARROW-2110](https://issues.apache.org/jira/browse/ARROW-2110) - [Python] Only require pytest-runner on test commands
-* [ARROW-2111](https://issues.apache.org/jira/browse/ARROW-2111) - [C++] Linting could be faster
-* [ARROW-2114](https://issues.apache.org/jira/browse/ARROW-2114) - [Python] Pull latest docker manylinux1 image
-* [ARROW-2117](https://issues.apache.org/jira/browse/ARROW-2117) - [C++] Pin clang to version 5.0
-* [ARROW-2118](https://issues.apache.org/jira/browse/ARROW-2118) - [Python] Improve error message when calling parquet.read\_table on an empty file
-* [ARROW-2120](https://issues.apache.org/jira/browse/ARROW-2120) - Add possibility to use empty \_MSVC\_STATIC\_LIB\_SUFFIX for Thirdparties
-* [ARROW-2121](https://issues.apache.org/jira/browse/ARROW-2121) - [Python] Consider special casing object arrays in pandas serializers.
-* [ARROW-2123](https://issues.apache.org/jira/browse/ARROW-2123) - [JS] Upgrade to TS 2.7.1
-* [ARROW-2132](https://issues.apache.org/jira/browse/ARROW-2132) - [Doc] Add links / mentions of Plasma store to main README
-* [ARROW-2134](https://issues.apache.org/jira/browse/ARROW-2134) - [CI] Make Travis commit inspection more robust
-* [ARROW-2137](https://issues.apache.org/jira/browse/ARROW-2137) - [Python] Don't print paths that are ignored when reading Parquet files
-* [ARROW-2138](https://issues.apache.org/jira/browse/ARROW-2138) - [C++] Have FatalLog abort instead of exiting
-* [ARROW-2142](https://issues.apache.org/jira/browse/ARROW-2142) - [Python] Conversion from Numpy struct array unimplemented
-* [ARROW-2143](https://issues.apache.org/jira/browse/ARROW-2143) - [Python] Provide a manylinux1 wheel for cp27m
-* [ARROW-2146](https://issues.apache.org/jira/browse/ARROW-2146) - [GLib] Implement Slice for ChunkedArray
-* [ARROW-2149](https://issues.apache.org/jira/browse/ARROW-2149) - [Python] reorganize test\_convert\_pandas.py
-* [ARROW-2154](https://issues.apache.org/jira/browse/ARROW-2154) - [Python] \_\_eq\_\_ unimplemented on Buffer
-* [ARROW-2155](https://issues.apache.org/jira/browse/ARROW-2155) - [Python] pa.frombuffer(bytearray) returns immutable Buffer
-* [ARROW-2156](https://issues.apache.org/jira/browse/ARROW-2156) - [CI] Isolate Sphinx dependencies
-* [ARROW-2163](https://issues.apache.org/jira/browse/ARROW-2163) - Install apt dependencies separate from built-in Travis commands, retry on flakiness
-* [ARROW-2166](https://issues.apache.org/jira/browse/ARROW-2166) - [GLib] Implement Slice for Column
-* [ARROW-2168](https://issues.apache.org/jira/browse/ARROW-2168) - [C++] Build toolchain builds with jemalloc
-* [ARROW-2169](https://issues.apache.org/jira/browse/ARROW-2169) - [C++] MSVC is complaining about uncaptured variables
-* [ARROW-2174](https://issues.apache.org/jira/browse/ARROW-2174) - [JS] Export format and schema enums
-* [ARROW-2176](https://issues.apache.org/jira/browse/ARROW-2176) - [C++] Extend DictionaryBuilder to support delta dictionaries
-* [ARROW-2177](https://issues.apache.org/jira/browse/ARROW-2177) - [C++] Remove support for specifying negative scale values in DecimalType
-* [ARROW-2180](https://issues.apache.org/jira/browse/ARROW-2180) - [C++] Remove APIs deprecated in 0.8.0 release
-* [ARROW-2181](https://issues.apache.org/jira/browse/ARROW-2181) - [Python] Add concat\_tables to API reference, add documentation on use
-* [ARROW-2184](https://issues.apache.org/jira/browse/ARROW-2184) - [C++] Add static constructor for FileOutputStream returning shared\_ptr to base OutputStream
-* [ARROW-2185](https://issues.apache.org/jira/browse/ARROW-2185) - Remove CI directives from squashed commit messages
-* [ARROW-2190](https://issues.apache.org/jira/browse/ARROW-2190) - [GLib] Add add/remove field functions for RecordBatch.
-* [ARROW-2191](https://issues.apache.org/jira/browse/ARROW-2191) - [C++] Only use specific version of jemalloc
-* [ARROW-2197](https://issues.apache.org/jira/browse/ARROW-2197) - Document "undefined symbol" issue and workaround
-* [ARROW-2198](https://issues.apache.org/jira/browse/ARROW-2198) - [Python] Docstring for parquet.read\_table is misleading or incorrect
-* [ARROW-2199](https://issues.apache.org/jira/browse/ARROW-2199) - [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
-* [ARROW-2203](https://issues.apache.org/jira/browse/ARROW-2203) - [C++] StderrStream class
-* [ARROW-2204](https://issues.apache.org/jira/browse/ARROW-2204) - [C++] Build fails with TLS error on parquet-cpp clone
-* [ARROW-2205](https://issues.apache.org/jira/browse/ARROW-2205) - [Python] Option for integer object nulls
-* [ARROW-2206](https://issues.apache.org/jira/browse/ARROW-2206) - [JS] Add Perspective as a community project
-* [ARROW-2218](https://issues.apache.org/jira/browse/ARROW-2218) - [Python] PythonFile should infer mode when not given
-* [ARROW-2231](https://issues.apache.org/jira/browse/ARROW-2231) - [CI] Use clcache on AppVeyor
-* [ARROW-2238](https://issues.apache.org/jira/browse/ARROW-2238) - [C++] Detect clcache in cmake configuration
-* [ARROW-2239](https://issues.apache.org/jira/browse/ARROW-2239) - [C++] Update build docs for Windows
-* [ARROW-2250](https://issues.apache.org/jira/browse/ARROW-2250) - plasma\_store process should cleanup on INT and TERM signals
-* [ARROW-2252](https://issues.apache.org/jira/browse/ARROW-2252) - [Python] Create buffer from address, size and base
-* [ARROW-2253](https://issues.apache.org/jira/browse/ARROW-2253) - [Python] Support \_\_eq\_\_ on scalar values
-* [ARROW-2257](https://issues.apache.org/jira/browse/ARROW-2257) - [C++] Add high-level option to toggle CXX11 ABI
-* [ARROW-2261](https://issues.apache.org/jira/browse/ARROW-2261) - [GLib] Can't share the same memory in GArrowBuffer safely
-* [ARROW-2262](https://issues.apache.org/jira/browse/ARROW-2262) - [Python] Support slicing on pyarrow.ChunkedArray
-* [ARROW-2279](https://issues.apache.org/jira/browse/ARROW-2279) - [Python] Better error message if lib cannot be found
-* [ARROW-2282](https://issues.apache.org/jira/browse/ARROW-2282) - [Python] Create StringArray from buffers
-* [ARROW-2283](https://issues.apache.org/jira/browse/ARROW-2283) - [C++] Support Arrow C++ installed in /usr detection by pkg-config
-* [ARROW-2289](https://issues.apache.org/jira/browse/ARROW-2289) - [GLib] Add Numeric, Integer and FloatingPoint data types
-* [ARROW-2291](https://issues.apache.org/jira/browse/ARROW-2291) - [C++] README missing instructions for libboost-regex-dev
-* [ARROW-2292](https://issues.apache.org/jira/browse/ARROW-2292) - [Python] More consistent / intuitive name for pyarrow.frombuffer
-* [ARROW-2309](https://issues.apache.org/jira/browse/ARROW-2309) - [C++] Use std::make\_unsigned
-* [ARROW-2321](https://issues.apache.org/jira/browse/ARROW-2321) - [C++] Release verification script fails with if CMAKE\_INSTALL\_LIBDIR is not $ARROW\_HOME/lib
-* [ARROW-2329](https://issues.apache.org/jira/browse/ARROW-2329) - [Website] 0.9.0 release update
-* [ARROW-2336](https://issues.apache.org/jira/browse/ARROW-2336) - [Website] Blog post for 0.9.0 release
-* [ARROW-2768](https://issues.apache.org/jira/browse/ARROW-2768) - [Packaging] Support Ubuntu 18.04
-* [ARROW-2783](https://issues.apache.org/jira/browse/ARROW-2783) - Importing conda-forge pyarrow fails
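-
-As a minimal illustration of two entries above (not part of the original release notes): ARROW-2090 added context manager methods to ParquetWriter and ARROW-2046 added support for PEP 519 path-like objects. The sketch assumes a recent pyarrow; the file name and table contents are illustrative only.
-
-```python
-import pathlib
-
-import pyarrow as pa
-import pyarrow.parquet as pq
-
-table = pa.table({"x": [1, 2, 3]})
-path = pathlib.Path("example.parquet")  # PEP 519 path-like object (ARROW-2046)
-
-# Used as a context manager, the writer closes its output on exit (ARROW-2090).
-with pq.ParquetWriter(path, table.schema) as writer:
-    writer.write_table(table)
-
-print(pq.read_table(path))
-```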
-
-
-## Bug Fixes
-
-* [ARROW-1345](https://issues.apache.org/jira/browse/ARROW-1345) - [Python] Conversion from nested NumPy arrays fails on integers other than int64, float32
-* [ARROW-1589](https://issues.apache.org/jira/browse/ARROW-1589) - [C++] Fuzzing for certain input formats
-* [ARROW-1646](https://issues.apache.org/jira/browse/ARROW-1646) - [Python] pyarrow.array cannot handle NumPy scalar types
-* [ARROW-1856](https://issues.apache.org/jira/browse/ARROW-1856) - [Python] Auto-detect Parquet ABI version when using PARQUET\_HOME
-* [ARROW-1909](https://issues.apache.org/jira/browse/ARROW-1909) - [C++] Bug: Build fails on windows with "-DARROW\_BUILD\_BENCHMARKS=ON"
-* [ARROW-1912](https://issues.apache.org/jira/browse/ARROW-1912) - [Website] Add org affiliations to committers.html
-* [ARROW-1919](https://issues.apache.org/jira/browse/ARROW-1919) - Plasma hanging if object id is not 20 bytes
-* [ARROW-1924](https://issues.apache.org/jira/browse/ARROW-1924) - [Python] Bring back pickle=True option for serialization
-* [ARROW-1933](https://issues.apache.org/jira/browse/ARROW-1933) - [GLib] Build failure with --with-arrow-cpp-build-dir and GPU enabled Arrow C++
-* [ARROW-1940](https://issues.apache.org/jira/browse/ARROW-1940) - [Python] Extra metadata gets added after multiple conversions between pd.DataFrame and pa.Table
-* [ARROW-1941](https://issues.apache.org/jira/browse/ARROW-1941) - Table <-\> DataFrame roundtrip failing
-* [ARROW-1943](https://issues.apache.org/jira/browse/ARROW-1943) - Handle setInitialCapacity() for deeply nested lists of lists
-* [ARROW-1944](https://issues.apache.org/jira/browse/ARROW-1944) - FindArrow has wrong ARROW\_STATIC\_LIB
-* [ARROW-1945](https://issues.apache.org/jira/browse/ARROW-1945) - [C++] Fix doxygen documentation of array.h
-* [ARROW-1946](https://issues.apache.org/jira/browse/ARROW-1946) - Add APIs to decimal vector for writing big endian data
-* [ARROW-1948](https://issues.apache.org/jira/browse/ARROW-1948) - [Java] ListVector does not handle ipc with all non-null values with none set
-* [ARROW-1950](https://issues.apache.org/jira/browse/ARROW-1950) - [Python] pandas\_type in pandas metadata incorrect for List types
-* [ARROW-1953](https://issues.apache.org/jira/browse/ARROW-1953) - [JS] JavaScript builds broken on master
-* [ARROW-1955](https://issues.apache.org/jira/browse/ARROW-1955) - MSVC generates "attempting to reference a deleted function" during build.
-* [ARROW-1958](https://issues.apache.org/jira/browse/ARROW-1958) - [Python] Error in pandas conversion for datetimetz row index
-* [ARROW-1961](https://issues.apache.org/jira/browse/ARROW-1961) - [Python] Writing Parquet file with flavor='spark' loses pandas schema metadata
-* [ARROW-1966](https://issues.apache.org/jira/browse/ARROW-1966) - [C++] Support JAVA\_HOME paths in HDFS libjvm loading that include the jre directory
-* [ARROW-1967](https://issues.apache.org/jira/browse/ARROW-1967) - Python: AssertionError w.r.t Pandas conversion on Parquet files in 0.8.0 dev version
-* [ARROW-1971](https://issues.apache.org/jira/browse/ARROW-1971) - [Python] Add pandas serialization to the default
-* [ARROW-1972](https://issues.apache.org/jira/browse/ARROW-1972) - Deserialization of buffer objects (and pandas dataframes) segfaults on different processes.
-* [ARROW-1973](https://issues.apache.org/jira/browse/ARROW-1973) - [Python] Memory leak when converting Arrow tables with array columns to Pandas dataframes.
-* [ARROW-1976](https://issues.apache.org/jira/browse/ARROW-1976) - [Python] Handling unicode pandas columns on parquet.read\_table
-* [ARROW-1979](https://issues.apache.org/jira/browse/ARROW-1979) - [JS] JS builds hanging in es2015:umd tests
-* [ARROW-1980](https://issues.apache.org/jira/browse/ARROW-1980) - [Python] Race condition in \`write\_to\_dataset\`
-* [ARROW-1982](https://issues.apache.org/jira/browse/ARROW-1982) - [Python] Return parquet statistics min/max as values instead of strings
-* [ARROW-1986](https://issues.apache.org/jira/browse/ARROW-1986) - [Python] HadoopFileSystem is not picklable and cannot currently be used with multiprocessing
-* [ARROW-1991](https://issues.apache.org/jira/browse/ARROW-1991) - [GLib] Docker-based documentation build is broken
-* [ARROW-1992](https://issues.apache.org/jira/browse/ARROW-1992) - [Python] to\_pandas crashes when using strings\_to\_categoricals on empty string cols on 0.8.0
-* [ARROW-1997](https://issues.apache.org/jira/browse/ARROW-1997) - [Python] to\_pandas with strings\_to\_categorical fails
-* [ARROW-1998](https://issues.apache.org/jira/browse/ARROW-1998) - [Python] Table.from\_pandas crashes when data frame is empty
-* [ARROW-1999](https://issues.apache.org/jira/browse/ARROW-1999) - [Python] from\_numpy\_dtype returns wrong types
-* [ARROW-2000](https://issues.apache.org/jira/browse/ARROW-2000) - Deduplicate file descriptors when plasma store replies to get request.
-* [ARROW-2002](https://issues.apache.org/jira/browse/ARROW-2002) - Using pyarrow to download a file sometimes raises queue.Full exceptions
-* [ARROW-2003](https://issues.apache.org/jira/browse/ARROW-2003) - [Python] Do not use deprecated kwarg in pandas.core.internals.make\_block
-* [ARROW-2005](https://issues.apache.org/jira/browse/ARROW-2005) - [Python] pyflakes warnings on Cython files not failing build
-* [ARROW-2008](https://issues.apache.org/jira/browse/ARROW-2008) - [Python] Type inference for int32 NumPy arrays (expecting list<int32\>) returns int64 and then conversion fails
-* [ARROW-2010](https://issues.apache.org/jira/browse/ARROW-2010) - [C++] Compiler warnings with CHECKIN warning level in ORC adapter
-* [ARROW-2017](https://issues.apache.org/jira/browse/ARROW-2017) - Array initialization with large (\>2\*\*31-1) uint64 values fails
-* [ARROW-2023](https://issues.apache.org/jira/browse/ARROW-2023) - [C++] Test opening IPC stream reader or file reader on an empty InputStream
-* [ARROW-2025](https://issues.apache.org/jira/browse/ARROW-2025) - [Python/C++] HDFS Client disconnect closes all open clients
-* [ARROW-2029](https://issues.apache.org/jira/browse/ARROW-2029) - [Python] Program crash on \`HdfsFile.tell\` if file is closed
-* [ARROW-2032](https://issues.apache.org/jira/browse/ARROW-2032) - [C++] ORC ep installs on each call to ninja build (even if no work to do)
-* [ARROW-2033](https://issues.apache.org/jira/browse/ARROW-2033) - pa.array() doesn't work with iterators
-* [ARROW-2039](https://issues.apache.org/jira/browse/ARROW-2039) - [Python] pyarrow.Buffer().to\_pybytes() segfaults
-* [ARROW-2040](https://issues.apache.org/jira/browse/ARROW-2040) - [Python] Deserialized Numpy array must keep ref to underlying tensor
-* [ARROW-2047](https://issues.apache.org/jira/browse/ARROW-2047) - [Python] test\_serialization.py uses a python executable in PATH rather than that used for a test run
-* [ARROW-2049](https://issues.apache.org/jira/browse/ARROW-2049) - [Python] Use python -m cython to run Cython, instead of CYTHON\_EXECUTABLE
-* [ARROW-2062](https://issues.apache.org/jira/browse/ARROW-2062) - [C++] Stalled builds in test\_serialization.py in Travis CI
-* [ARROW-2070](https://issues.apache.org/jira/browse/ARROW-2070) - [Python] chdir logic in setup.py buggy
-* [ARROW-2072](https://issues.apache.org/jira/browse/ARROW-2072) - [Python] decimal128.byte\_width crashes
-* [ARROW-2080](https://issues.apache.org/jira/browse/ARROW-2080) - [Python] Update documentation after ARROW-2024
-* [ARROW-2085](https://issues.apache.org/jira/browse/ARROW-2085) - HadoopFileSystem.isdir and .isfile should return False if the path doesn't exist
-* [ARROW-2106](https://issues.apache.org/jira/browse/ARROW-2106) - [Python] pyarrow.array can't take a pandas Series of python datetime objects.
-* [ARROW-2109](https://issues.apache.org/jira/browse/ARROW-2109) - [C++] Boost 1.66 compilation fails on Windows on linkage stage
-* [ARROW-2124](https://issues.apache.org/jira/browse/ARROW-2124) - [Python] ArrowInvalid raised if the first item of a nested list of numpy arrays is empty
-* [ARROW-2128](https://issues.apache.org/jira/browse/ARROW-2128) - [Python] Cannot serialize array of empty lists
-* [ARROW-2129](https://issues.apache.org/jira/browse/ARROW-2129) - [Python] Segmentation fault on conversion of empty array to Pandas
-* [ARROW-2131](https://issues.apache.org/jira/browse/ARROW-2131) - [Python] Serialization test fails on Windows when library has been built in place / not installed
-* [ARROW-2133](https://issues.apache.org/jira/browse/ARROW-2133) - [Python] Segmentation fault on conversion of empty nested arrays to Pandas
-* [ARROW-2135](https://issues.apache.org/jira/browse/ARROW-2135) - [Python] NaN values silently casted to int64 when passing explicit schema for conversion in Table.from\_pandas
-* [ARROW-2139](https://issues.apache.org/jira/browse/ARROW-2139) - [Python] Address Sphinx deprecation warning when building docs
-* [ARROW-2145](https://issues.apache.org/jira/browse/ARROW-2145) - [Python] Decimal conversion not working for NaN values
-* [ARROW-2150](https://issues.apache.org/jira/browse/ARROW-2150) - [Python] array equality defaults to identity
-* [ARROW-2151](https://issues.apache.org/jira/browse/ARROW-2151) - [Python] Error when converting from list of uint64 arrays
-* [ARROW-2153](https://issues.apache.org/jira/browse/ARROW-2153) - [C++/Python] Decimal conversion not working for exponential notation
-* [ARROW-2157](https://issues.apache.org/jira/browse/ARROW-2157) - [Python] Decimal arrays cannot be constructed from Python lists
-* [ARROW-2158](https://issues.apache.org/jira/browse/ARROW-2158) - [Python] Construction of Decimal array with None or np.nan fails
-* [ARROW-2160](https://issues.apache.org/jira/browse/ARROW-2160) - [C++/Python] Fix decimal precision inference
-* [ARROW-2161](https://issues.apache.org/jira/browse/ARROW-2161) - [Python] Skip test\_cython\_api if ARROW\_HOME isn't defined
-* [ARROW-2162](https://issues.apache.org/jira/browse/ARROW-2162) - [Python/C++] Decimal Values with too-high precision are multiplied by 100
-* [ARROW-2167](https://issues.apache.org/jira/browse/ARROW-2167) - [C++] Building Orc extensions fails with the default BUILD\_WARNING\_LEVEL=Production
-* [ARROW-2170](https://issues.apache.org/jira/browse/ARROW-2170) - [Python] construct\_metadata fails on reading files where no index was preserved
-* [ARROW-2171](https://issues.apache.org/jira/browse/ARROW-2171) - [Python] OwnedRef is fragile
-* [ARROW-2172](https://issues.apache.org/jira/browse/ARROW-2172) - [Python] Incorrect conversion from Numpy array when stride % itemsize != 0
-* [ARROW-2173](https://issues.apache.org/jira/browse/ARROW-2173) - [Python] NumPyBuffer destructor should hold the GIL
-* [ARROW-2175](https://issues.apache.org/jira/browse/ARROW-2175) - [Python] arrow\_ep build is triggering during parquet-cpp build in Travis CI
-* [ARROW-2178](https://issues.apache.org/jira/browse/ARROW-2178) - [JS] Fix JS html FileReader example
-* [ARROW-2179](https://issues.apache.org/jira/browse/ARROW-2179) - [C++] arrow/util/io-util.h missing from libarrow-dev
-* [ARROW-2192](https://issues.apache.org/jira/browse/ARROW-2192) - Commits to master should run all builds in CI matrix
-* [ARROW-2194](https://issues.apache.org/jira/browse/ARROW-2194) - [Python] Pandas columns metadata incorrect for empty string columns
-* [ARROW-2208](https://issues.apache.org/jira/browse/ARROW-2208) - [Python] install issues with jemalloc
-* [ARROW-2209](https://issues.apache.org/jira/browse/ARROW-2209) - [Python] Partition columns are not correctly loaded in schema of ParquetDataset
-* [ARROW-2210](https://issues.apache.org/jira/browse/ARROW-2210) - [C++] TestBuffer\_ResizeOOM has a memory leak with jemalloc
-* [ARROW-2212](https://issues.apache.org/jira/browse/ARROW-2212) - [C++/Python] Build Protobuf in base manylinux 1 docker image
-* [ARROW-2223](https://issues.apache.org/jira/browse/ARROW-2223) - [JS] installing umd release throws an error
-* [ARROW-2227](https://issues.apache.org/jira/browse/ARROW-2227) - [Python] Table.from\_pandas does not create chunked\_arrays.
-* [ARROW-2228](https://issues.apache.org/jira/browse/ARROW-2228) - [Python] Unsigned int type for arrow Table not supported
-* [ARROW-2230](https://issues.apache.org/jira/browse/ARROW-2230) - [Python] JS version number is sometimes picked up
-* [ARROW-2232](https://issues.apache.org/jira/browse/ARROW-2232) - [Python] pyarrow.Tensor constructor segfaults
-* [ARROW-2234](https://issues.apache.org/jira/browse/ARROW-2234) - [JS] Read timestamp low bits as Uint32s
-* [ARROW-2240](https://issues.apache.org/jira/browse/ARROW-2240) - [Python] Array initialization with leading numpy nan fails with exception
-* [ARROW-2244](https://issues.apache.org/jira/browse/ARROW-2244) - [C++] Slicing NullArray should not cause the null count on the internal data to be unknown
-* [ARROW-2245](https://issues.apache.org/jira/browse/ARROW-2245) - [Python] Revert static linkage of parquet-cpp in manylinux1 wheel
-* [ARROW-2246](https://issues.apache.org/jira/browse/ARROW-2246) - [Python] Use namespaced boost in manylinux1 package
-* [ARROW-2251](https://issues.apache.org/jira/browse/ARROW-2251) - [GLib] Destroying GArrowBuffer while GArrowTensor that uses the buffer causes a crash
-* [ARROW-2254](https://issues.apache.org/jira/browse/ARROW-2254) - [Python] Local in-place dev versions picking up JS tags
-* [ARROW-2258](https://issues.apache.org/jira/browse/ARROW-2258) - [C++] Appveyor builds failing on master
-* [ARROW-2263](https://issues.apache.org/jira/browse/ARROW-2263) - [Python] test\_cython.py fails if pyarrow is not in import path (e.g. with inplace builds)
-* [ARROW-2265](https://issues.apache.org/jira/browse/ARROW-2265) - [Python] Serializing subclasses of np.ndarray returns a np.ndarray.
-* [ARROW-2268](https://issues.apache.org/jira/browse/ARROW-2268) - Remove MD5 checksums from release process
-* [ARROW-2269](https://issues.apache.org/jira/browse/ARROW-2269) - [Python] Cannot build bdist\_wheel for Python
-* [ARROW-2270](https://issues.apache.org/jira/browse/ARROW-2270) - [Python] ForeignBuffer doesn't tie Python object lifetime to C++ buffer lifetime
-* [ARROW-2272](https://issues.apache.org/jira/browse/ARROW-2272) - [Python] test\_plasma spams /tmp
-* [ARROW-2275](https://issues.apache.org/jira/browse/ARROW-2275) - [C++] Buffer::mutable\_data\_ member uninitialized
-* [ARROW-2280](https://issues.apache.org/jira/browse/ARROW-2280) - [Python] pyarrow.Array.buffers should also include the offsets
-* [ARROW-2284](https://issues.apache.org/jira/browse/ARROW-2284) - [Python] test\_plasma error on plasma\_store error
-* [ARROW-2288](https://issues.apache.org/jira/browse/ARROW-2288) - [Python] slicing logic defective
-* [ARROW-2297](https://issues.apache.org/jira/browse/ARROW-2297) - [JS] babel-jest is not listed as a dev dependency
-* [ARROW-2304](https://issues.apache.org/jira/browse/ARROW-2304) - [C++] MultipleClients test in io-hdfs-test fails on trunk
-* [ARROW-2306](https://issues.apache.org/jira/browse/ARROW-2306) - [Python] HDFS test failures
-* [ARROW-2307](https://issues.apache.org/jira/browse/ARROW-2307) - [Python] Unable to read arrow stream containing 0 record batches
-* [ARROW-2311](https://issues.apache.org/jira/browse/ARROW-2311) - [Python] Struct array slicing defective
-* [ARROW-2312](https://issues.apache.org/jira/browse/ARROW-2312) - [JS] verify-release-candidate.sh must be updated to include JS in integration tests
-* [ARROW-2313](https://issues.apache.org/jira/browse/ARROW-2313) - [GLib] Release builds must define NDEBUG
-* [ARROW-2316](https://issues.apache.org/jira/browse/ARROW-2316) - [C++] Revert Buffer::mutable\_data member to always inline
-* [ARROW-2318](https://issues.apache.org/jira/browse/ARROW-2318) - [C++] TestPlasmaStore.MultipleClientTest is flaky (hangs) in release builds
-* [ARROW-2320](https://issues.apache.org/jira/browse/ARROW-2320) - [C++] Vendored Boost build does not build regex library
-* [ARROW-2406](https://issues.apache.org/jira/browse/ARROW-2406) - [Python] Segfault when creating PyArrow table from Pandas for empty string column when schema provided
-
-
-
-# Apache Arrow 0.8.0 (2017-12-18)
-
-## Bug Fixes
-
-* [ARROW-226](https://issues.apache.org/jira/browse/ARROW-226) - [C++] libhdfs: feedback to help determining cause of failure in opening file path
-* [ARROW-641](https://issues.apache.org/jira/browse/ARROW-641) - [C++] Do not build/run io-hdfs-test if ARROW\_HDFS=off
-* [ARROW-1282](https://issues.apache.org/jira/browse/ARROW-1282) - Large memory reallocation by Arrow causes hang in jemalloc
-* [ARROW-1298](https://issues.apache.org/jira/browse/ARROW-1298) - C++: Add prefix to jemalloc functions to guard against issues when using multiple allocators in the same process
-* [ARROW-1341](https://issues.apache.org/jira/browse/ARROW-1341) - [C++] Deprecate arrow::MakeTable in favor of new ctor from ARROW-1334
-* [ARROW-1347](https://issues.apache.org/jira/browse/ARROW-1347) - [JAVA] List null type should use consistent name for inner field
-* [ARROW-1398](https://issues.apache.org/jira/browse/ARROW-1398) - [Python] No support reading columns of type decimal(19,4)
-* [ARROW-1409](https://issues.apache.org/jira/browse/ARROW-1409) - [Format] Use for "page" attribute in Buffer in metadata
-* [ARROW-1431](https://issues.apache.org/jira/browse/ARROW-1431) - [Java] JsonFileReader doesn't initialize some vectors appropriately
-* [ARROW-1436](https://issues.apache.org/jira/browse/ARROW-1436) - PyArrow Timestamps written to Parquet as INT96 appear in Spark as 'bigint'
-* [ARROW-1540](https://issues.apache.org/jira/browse/ARROW-1540) - [C++] Fix valgrind warnings in cuda-test if possible
-* [ARROW-1541](https://issues.apache.org/jira/browse/ARROW-1541) - [C++] Race condition with arrow\_gpu
-* [ARROW-1543](https://issues.apache.org/jira/browse/ARROW-1543) - [C++] row\_wise\_conversion example doesn't correspond to ListBuilder constructor arguments
-* [ARROW-1549](https://issues.apache.org/jira/browse/ARROW-1549) - [JS] Integrate auto-generated Arrow test files
-* [ARROW-1555](https://issues.apache.org/jira/browse/ARROW-1555) - [Python] write\_to\_dataset on s3
-* [ARROW-1584](https://issues.apache.org/jira/browse/ARROW-1584) - [PYTHON] serialize\_pandas on empty dataframe
-* [ARROW-1585](https://issues.apache.org/jira/browse/ARROW-1585) - serialize\_pandas round trip fails on integer columns
-* [ARROW-1586](https://issues.apache.org/jira/browse/ARROW-1586) - [PYTHON] serialize\_pandas roundtrip loses columns name
-* [ARROW-1609](https://issues.apache.org/jira/browse/ARROW-1609) - Plasma: Build fails with Xcode 9.0
-* [ARROW-1615](https://issues.apache.org/jira/browse/ARROW-1615) - CXX flags for development more permissive than Travis CI builds
-* [ARROW-1617](https://issues.apache.org/jira/browse/ARROW-1617) - [Python] Do not use symlinks in python/cmake\_modules
-* [ARROW-1620](https://issues.apache.org/jira/browse/ARROW-1620) - Python: Download Boost in manylinux1 build from bintray
-* [ARROW-1622](https://issues.apache.org/jira/browse/ARROW-1622) - [Plasma] Plasma doesn't compile with XCode 9
-* [ARROW-1624](https://issues.apache.org/jira/browse/ARROW-1624) - [C++] Follow up fixes / tweaks to compiler warnings for Plasma / LLVM 4.0, add to readme
-* [ARROW-1625](https://issues.apache.org/jira/browse/ARROW-1625) - [Serialization] Support OrderedDict properly
-* [ARROW-1629](https://issues.apache.org/jira/browse/ARROW-1629) - [C++] Fix problematic code paths identified by infer tool
-* [ARROW-1633](https://issues.apache.org/jira/browse/ARROW-1633) - [Python] numpy "unicode" arrays not understood
-* [ARROW-1640](https://issues.apache.org/jira/browse/ARROW-1640) - Resolve OpenSSL issues in Travis CI
-* [ARROW-1647](https://issues.apache.org/jira/browse/ARROW-1647) - [Plasma] Potential bug when reading/writing messages.
-* [ARROW-1653](https://issues.apache.org/jira/browse/ARROW-1653) - [Plasma] Use static cast to avoid compiler warning.
-* [ARROW-1655](https://issues.apache.org/jira/browse/ARROW-1655) - [Java] Add Scale and Precision to ValueVectorTypes.tdd for Decimals
-* [ARROW-1656](https://issues.apache.org/jira/browse/ARROW-1656) - [C++] Endianness Macro is Incorrect on Windows And Mac
-* [ARROW-1657](https://issues.apache.org/jira/browse/ARROW-1657) - [C++] Multithreaded Read Test Failing on Arch Linux
-* [ARROW-1658](https://issues.apache.org/jira/browse/ARROW-1658) - [Python] Out of bounds dictionary indices causes segfault after converting to pandas
-* [ARROW-1663](https://issues.apache.org/jira/browse/ARROW-1663) - [Java] Follow up on ARROW-1347 and make schema backward compatible
-* [ARROW-1670](https://issues.apache.org/jira/browse/ARROW-1670) - [Python] Speed up deserialization code path
-* [ARROW-1672](https://issues.apache.org/jira/browse/ARROW-1672) - [Python] Failure to write Feather bytes column
-* [ARROW-1673](https://issues.apache.org/jira/browse/ARROW-1673) - [Python] NumPy boolean arrays get converted to uint8 arrays on NdarrayToTensor roundtrip
-* [ARROW-1676](https://issues.apache.org/jira/browse/ARROW-1676) - [C++] Correctly truncate oversized validity bitmaps when writing Feather format
-* [ARROW-1678](https://issues.apache.org/jira/browse/ARROW-1678) - [Python] Incorrect serialization of numpy.float16
-* [ARROW-1680](https://issues.apache.org/jira/browse/ARROW-1680) - [Python] Timestamp unit change not done in from\_pandas() conversion
-* [ARROW-1681](https://issues.apache.org/jira/browse/ARROW-1681) - [Python] Error writing with nulls in lists
-* [ARROW-1686](https://issues.apache.org/jira/browse/ARROW-1686) - Documentation generation script creates "apidocs" directory under site/java
-* [ARROW-1693](https://issues.apache.org/jira/browse/ARROW-1693) - [JS] Error reading dictionary-encoded integration test files
-* [ARROW-1694](https://issues.apache.org/jira/browse/ARROW-1694) - [Java] Unclosed VectorSchemaRoot in JsonFileReader\#readDictionaryBatches()
-* [ARROW-1695](https://issues.apache.org/jira/browse/ARROW-1695) - [Serialization] Fix reference counting of numpy arrays created in custom serializer
-* [ARROW-1698](https://issues.apache.org/jira/browse/ARROW-1698) - [JS] File reader attempts to load the same dictionary batch more than once
-* [ARROW-1704](https://issues.apache.org/jira/browse/ARROW-1704) - [GLib] Go example in test suite is broken
-* [ARROW-1708](https://issues.apache.org/jira/browse/ARROW-1708) - [JS] Linter problem breaks master build
-* [ARROW-1709](https://issues.apache.org/jira/browse/ARROW-1709) - [C++] Decimal.ToString is incorrect for negative scale
-* [ARROW-1711](https://issues.apache.org/jira/browse/ARROW-1711) - [Python] flake8 checks still not failing builds
-* [ARROW-1714](https://issues.apache.org/jira/browse/ARROW-1714) - [Python] No named pd.Series name serialized as u'None'
-* [ARROW-1720](https://issues.apache.org/jira/browse/ARROW-1720) - [Python] Segmentation fault while trying to access an out-of-bound chunk
-* [ARROW-1723](https://issues.apache.org/jira/browse/ARROW-1723) - Windows: \_\_declspec(dllexport) specified when building arrow static library
-* [ARROW-1730](https://issues.apache.org/jira/browse/ARROW-1730) - [Python] Incorrect result from pyarrow.array when passing timestamp type
-* [ARROW-1732](https://issues.apache.org/jira/browse/ARROW-1732) - [Python] RecordBatch.from\_pandas fails on DataFrame with no columns when preserve\_index=False
-* [ARROW-1735](https://issues.apache.org/jira/browse/ARROW-1735) - [C++] Cast kernels cannot write into sliced output array
-* [ARROW-1738](https://issues.apache.org/jira/browse/ARROW-1738) - [Python] Wrong datetime conversion when pa.array is called with a unit
-* [ARROW-1739](https://issues.apache.org/jira/browse/ARROW-1739) - [Python] Fix usages of assertRaises causing broken build
-* [ARROW-1742](https://issues.apache.org/jira/browse/ARROW-1742) - C++: clang-format is not detected correctly on OSX anymore
-* [ARROW-1743](https://issues.apache.org/jira/browse/ARROW-1743) - [Python] Table to\_pandas fails when index contains categorical column
-* [ARROW-1745](https://issues.apache.org/jira/browse/ARROW-1745) - Compilation failure on Mac OS in plasma tests
-* [ARROW-1749](https://issues.apache.org/jira/browse/ARROW-1749) - [C++] Handle range of Decimal128 values that require 39 digits to be displayed
-* [ARROW-1751](https://issues.apache.org/jira/browse/ARROW-1751) - [Python] Pandas 0.21.0 introduces a breaking API change for MultiIndex construction
-* [ARROW-1754](https://issues.apache.org/jira/browse/ARROW-1754) - [Python] Fix buggy Parquet roundtrip when an index name is the same as a column name
-* [ARROW-1756](https://issues.apache.org/jira/browse/ARROW-1756) - [Python] Observed int32 overflow in Feather write/read path
-* [ARROW-1762](https://issues.apache.org/jira/browse/ARROW-1762) - [C++] unittest failure for language environment
-* [ARROW-1764](https://issues.apache.org/jira/browse/ARROW-1764) - [Python] Add -c conda-forge for Windows dev installation instructions
-* [ARROW-1766](https://issues.apache.org/jira/browse/ARROW-1766) - [GLib] Fix failing builds on OSX
-* [ARROW-1768](https://issues.apache.org/jira/browse/ARROW-1768) - [Python] Fix suppressed exception in ParquetWriter.\_\_del\_\_
-* [ARROW-1769](https://issues.apache.org/jira/browse/ARROW-1769) - Python: pyarrow.parquet.write\_to\_dataset creates cyclic references
-* [ARROW-1770](https://issues.apache.org/jira/browse/ARROW-1770) - [GLib] Fix GLib compiler warning
-* [ARROW-1771](https://issues.apache.org/jira/browse/ARROW-1771) - [C++] ARROW-1749 Breaks Public API test in parquet-cpp
-* [ARROW-1776](https://issues.apache.org/jira/browse/ARROW-1776) - [C++] arrow::gpu::CudaContext::bytes\_allocated() isn't defined
-* [ARROW-1778](https://issues.apache.org/jira/browse/ARROW-1778) - [Python] Link parquet-cpp statically, privately in manylinux1 wheels
-* [ARROW-1781](https://issues.apache.org/jira/browse/ARROW-1781) - [CI] OSX Builds on Travis-CI time out often
-* [ARROW-1788](https://issues.apache.org/jira/browse/ARROW-1788) - Plasma store crashes when trying to abort objects for disconnected client
-* [ARROW-1791](https://issues.apache.org/jira/browse/ARROW-1791) - Integration tests generate date[DAY] values outside of reasonable range
-* [ARROW-1793](https://issues.apache.org/jira/browse/ARROW-1793) - [Integration] fix a typo in README.md
-* [ARROW-1800](https://issues.apache.org/jira/browse/ARROW-1800) - [C++] Fix and simplify random\_decimals
-* [ARROW-1805](https://issues.apache.org/jira/browse/ARROW-1805) - [Python] ignore non-parquet files when exploring dataset
-* [ARROW-1811](https://issues.apache.org/jira/browse/ARROW-1811) - [C++/Python] Rename all Decimal based APIs to Decimal128
-* [ARROW-1812](https://issues.apache.org/jira/browse/ARROW-1812) - Plasma store modifies hash table while iterating during client disconnect
-* [ARROW-1813](https://issues.apache.org/jira/browse/ARROW-1813) - Enforce checkstyle failure in JAVA build and fix all checkstyle
-* [ARROW-1821](https://issues.apache.org/jira/browse/ARROW-1821) - Add integration test case to explicitly check for optional validity buffer
-* [ARROW-1829](https://issues.apache.org/jira/browse/ARROW-1829) - [Plasma] Clean up eviction policy bookkeeping
-* [ARROW-1830](https://issues.apache.org/jira/browse/ARROW-1830) - [Python] Error when loading all the files in a directory
-* [ARROW-1831](https://issues.apache.org/jira/browse/ARROW-1831) - [Python] Docker-based documentation build does not properly set LD\_LIBRARY\_PATH
-* [ARROW-1836](https://issues.apache.org/jira/browse/ARROW-1836) - [C++] Fix C4996 warning from arrow/util/variant.h on MSVC builds
-* [ARROW-1839](https://issues.apache.org/jira/browse/ARROW-1839) - [C++/Python] Add Decimal Parquet Read/Write Tests
-* [ARROW-1840](https://issues.apache.org/jira/browse/ARROW-1840) - [Website] The installation command failed in a Windows 10 Anaconda environment
-* [ARROW-1845](https://issues.apache.org/jira/browse/ARROW-1845) - [Python] Expose Decimal128Type
-* [ARROW-1852](https://issues.apache.org/jira/browse/ARROW-1852) - [Plasma] Make retrieving manager file descriptor const
-* [ARROW-1853](https://issues.apache.org/jira/browse/ARROW-1853) - [Plasma] Fix off-by-one error in retry processing
-* [ARROW-1863](https://issues.apache.org/jira/browse/ARROW-1863) - [Python] PyObjectStringify could render bytes-like output for more types of objects
-* [ARROW-1865](https://issues.apache.org/jira/browse/ARROW-1865) - [C++] Adding a column to an empty Table fails
-* [ARROW-1869](https://issues.apache.org/jira/browse/ARROW-1869) - Fix typo in LowCostIdentityHashMap
-* [ARROW-1871](https://issues.apache.org/jira/browse/ARROW-1871) - [Python/C++] Appending Python Decimals with different scales requires rescaling
-* [ARROW-1873](https://issues.apache.org/jira/browse/ARROW-1873) - [Python] Segmentation fault when loading a total of 2GB of parquet files
-* [ARROW-1877](https://issues.apache.org/jira/browse/ARROW-1877) - Incorrect comparison in JsonStringArrayList.equals
-* [ARROW-1879](https://issues.apache.org/jira/browse/ARROW-1879) - [Python] Dask integration tests are not skipped if dask is not installed
-* [ARROW-1881](https://issues.apache.org/jira/browse/ARROW-1881) - [Python] setuptools\_scm picks up JS version tags
-* [ARROW-1882](https://issues.apache.org/jira/browse/ARROW-1882) - [C++] Reintroduce DictionaryBuilder
-* [ARROW-1883](https://issues.apache.org/jira/browse/ARROW-1883) - [Python] BUG: Table.to\_pandas metadata checking fails if columns are not present
-* [ARROW-1889](https://issues.apache.org/jira/browse/ARROW-1889) - [Python] --exclude is not available in older git versions
-* [ARROW-1890](https://issues.apache.org/jira/browse/ARROW-1890) - [Python] Masking for date32 arrays not working
-* [ARROW-1891](https://issues.apache.org/jira/browse/ARROW-1891) - [Python] NaT date32 values are only converted to nulls if from\_pandas is used
-* [ARROW-1892](https://issues.apache.org/jira/browse/ARROW-1892) - [Python] Unknown list item type: binary
-* [ARROW-1893](https://issues.apache.org/jira/browse/ARROW-1893) - [Python] test\_primitive\_serialization fails on Python 2.7.3
-* [ARROW-1895](https://issues.apache.org/jira/browse/ARROW-1895) - [Python] Add field\_name to pandas index metadata
-* [ARROW-1897](https://issues.apache.org/jira/browse/ARROW-1897) - [Python] Incorrect numpy\_type for pandas metadata of Categoricals
-* [ARROW-1904](https://issues.apache.org/jira/browse/ARROW-1904) - [C++] Deprecate PrimitiveArray::raw\_values
-* [ARROW-1906](https://issues.apache.org/jira/browse/ARROW-1906) - [Python] Creating a pyarrow.Array with a timestamp of a different unit is not cast
-* [ARROW-1908](https://issues.apache.org/jira/browse/ARROW-1908) - [Python] Construction of arrow table from pandas DataFrame with duplicate column names crashes
-* [ARROW-1910](https://issues.apache.org/jira/browse/ARROW-1910) - CPP README Brewfile link incorrect
-* [ARROW-1914](https://issues.apache.org/jira/browse/ARROW-1914) - [C++] make -j may fail to build with -DARROW\_GPU=on
-* [ARROW-1915](https://issues.apache.org/jira/browse/ARROW-1915) - [Python] Parquet tests should be optional
-* [ARROW-1916](https://issues.apache.org/jira/browse/ARROW-1916) - [Java] Do not exclude java/dev/checkstyle from source releases
-* [ARROW-1917](https://issues.apache.org/jira/browse/ARROW-1917) - [GLib] Must set GI\_TYPELIB\_PATH in verify-release-candidate.sh
-* [ARROW-1935](https://issues.apache.org/jira/browse/ARROW-1935) - Download page must not link to snapshots / nightly builds
-* [ARROW-1936](https://issues.apache.org/jira/browse/ARROW-1936) - Broken links to signatures/hashes etc
-* [ARROW-1939](https://issues.apache.org/jira/browse/ARROW-1939) - Correct links in release 0.8 blog post
-
-
-## New Features and Improvements
-
-* [ARROW-480](https://issues.apache.org/jira/browse/ARROW-480) - [Python] Add accessors for Parquet column statistics
-* [ARROW-504](https://issues.apache.org/jira/browse/ARROW-504) - [Python] Add adapter to write pandas.DataFrame in user-selected chunk size to streaming format
-* [ARROW-507](https://issues.apache.org/jira/browse/ARROW-507) - [C++/Python] Construct List container from offsets and values subarrays
-* [ARROW-541](https://issues.apache.org/jira/browse/ARROW-541) - [JS] Implement JavaScript-compatible implementation
-* [ARROW-571](https://issues.apache.org/jira/browse/ARROW-571) - [Python] Add APIs to build Parquet files incrementally from Arrow tables
-* [ARROW-587](https://issues.apache.org/jira/browse/ARROW-587) - Add JIRA fix version to merge tool
-* [ARROW-609](https://issues.apache.org/jira/browse/ARROW-609) - [C++] Function for casting from days since UNIX epoch to int64 date
-* [ARROW-838](https://issues.apache.org/jira/browse/ARROW-838) - [Python] Efficient construction of arrays from non-pandas 1D NumPy arrays
-* [ARROW-905](https://issues.apache.org/jira/browse/ARROW-905) - [Docs] Add Dockerfile for reproducible documentation generation
-* [ARROW-911](https://issues.apache.org/jira/browse/ARROW-911) - [Python] Expand development.rst with build instructions without conda
-* [ARROW-942](https://issues.apache.org/jira/browse/ARROW-942) - Support integration testing on Python 2.7
-* [ARROW-950](https://issues.apache.org/jira/browse/ARROW-950) - [Site] Add Google Analytics tag
-* [ARROW-972](https://issues.apache.org/jira/browse/ARROW-972) - [Python] Add test cases and basic APIs for UnionArray
-* [ARROW-1032](https://issues.apache.org/jira/browse/ARROW-1032) - [JS] Support custom\_metadata
-* [ARROW-1047](https://issues.apache.org/jira/browse/ARROW-1047) - [Java] Add generalized stream writer and reader interfaces that are decoupled from IO / message framing
-* [ARROW-1087](https://issues.apache.org/jira/browse/ARROW-1087) - [Python] add get\_include to expose directory containing header files
-* [ARROW-1114](https://issues.apache.org/jira/browse/ARROW-1114) - [C++] Create Record Batch Builder class as a reusable and efficient way to transpose row-by-row data to columns
-* [ARROW-1134](https://issues.apache.org/jira/browse/ARROW-1134) - [C++] Allow C++/CLI projects to build with Arrow
-* [ARROW-1178](https://issues.apache.org/jira/browse/ARROW-1178) - [Python] Create alternative to Table.from\_pandas that yields a list of RecordBatch objects with a given chunk size
-* [ARROW-1226](https://issues.apache.org/jira/browse/ARROW-1226) - [C++] Improve / correct doxygen function documentation in arrow::ipc
-* [ARROW-1250](https://issues.apache.org/jira/browse/ARROW-1250) - [Python] Define API for user type checking of array types
-* [ARROW-1362](https://issues.apache.org/jira/browse/ARROW-1362) - [Integration] Validate vector type layout in IPC messages
-* [ARROW-1367](https://issues.apache.org/jira/browse/ARROW-1367) - [Website] Divide CHANGELOG issues by component and add subheaders
-* [ARROW-1369](https://issues.apache.org/jira/browse/ARROW-1369) - Support boolean types in the javascript arrow reader library
-* [ARROW-1371](https://issues.apache.org/jira/browse/ARROW-1371) - [Website] Add "Powered By" page to the website
-* [ARROW-1455](https://issues.apache.org/jira/browse/ARROW-1455) - [Python] Add Dockerfile for validating Dask integration outside of usual CI
-* [ARROW-1471](https://issues.apache.org/jira/browse/ARROW-1471) - [JAVA] Document requirements and non-requirements for ValueVector updates
-* [ARROW-1472](https://issues.apache.org/jira/browse/ARROW-1472) - [JAVA] Design updated ValueVector Object Hierarchy
-* [ARROW-1473](https://issues.apache.org/jira/browse/ARROW-1473) - [JAVA] Create Prototype Code Hierarchy (Implementation Phase 1)
-* [ARROW-1474](https://issues.apache.org/jira/browse/ARROW-1474) - [JAVA] ValueVector hierarchy (Implementation Phase 2)
-* [ARROW-1476](https://issues.apache.org/jira/browse/ARROW-1476) - [JAVA] Implement final ValueVector updates
-* [ARROW-1482](https://issues.apache.org/jira/browse/ARROW-1482) - [C++] Implement casts between date32 and date64
-* [ARROW-1483](https://issues.apache.org/jira/browse/ARROW-1483) - [C++] Implement casts between time32 and time64
-* [ARROW-1484](https://issues.apache.org/jira/browse/ARROW-1484) - [C++] Implement (safe and unsafe) casts between timestamps and times of different units
-* [ARROW-1485](https://issues.apache.org/jira/browse/ARROW-1485) - [C++] Implement union-like data type for accommodating kernel arguments which may be scalars or arrays
-* [ARROW-1486](https://issues.apache.org/jira/browse/ARROW-1486) - [C++] Decide if arrow::RecordBatch needs to be copyable
-* [ARROW-1487](https://issues.apache.org/jira/browse/ARROW-1487) - [C++] Implement casts from List<A\> to List<B\>, where a cast function is defined from any A to B
-* [ARROW-1488](https://issues.apache.org/jira/browse/ARROW-1488) - [C++] Implement ArrayBuilder::Finish in terms of internal::ArrayData
-* [ARROW-1498](https://issues.apache.org/jira/browse/ARROW-1498) - [GitHub] Add CONTRIBUTING.md and ISSUE\_TEMPLATE.md
-* [ARROW-1503](https://issues.apache.org/jira/browse/ARROW-1503) - [Python] Add serialization callbacks for pandas objects in pyarrow.serialize
-* [ARROW-1522](https://issues.apache.org/jira/browse/ARROW-1522) - [C++] Support pyarrow.Buffer as built-in type in pyarrow.serialize
-* [ARROW-1523](https://issues.apache.org/jira/browse/ARROW-1523) - [C++] Add helper data struct with methods for reading a validity bitmap possibly having a non-zero offset
-* [ARROW-1524](https://issues.apache.org/jira/browse/ARROW-1524) - [C++] More graceful solution for handling non-zero offsets on inputs and outputs in compute library
-* [ARROW-1525](https://issues.apache.org/jira/browse/ARROW-1525) - [C++] Change functions in arrow/compare.h to not return Status
-* [ARROW-1526](https://issues.apache.org/jira/browse/ARROW-1526) - [Python] Unit tests to exercise code path in PARQUET-1100
-* [ARROW-1535](https://issues.apache.org/jira/browse/ARROW-1535) - [Python] Enable sdist source tarballs to build assuming that Arrow C++ libraries are available on the host system
-* [ARROW-1538](https://issues.apache.org/jira/browse/ARROW-1538) - [C++] Support Ubuntu 14.04 in .deb packaging automation
-* [ARROW-1539](https://issues.apache.org/jira/browse/ARROW-1539) - [C++] Remove functions deprecated as of 0.7.0 and prior releases
-* [ARROW-1556](https://issues.apache.org/jira/browse/ARROW-1556) - [C++] Incorporate AssertArraysEqual function from PARQUET-1100 patch
-* [ARROW-1559](https://issues.apache.org/jira/browse/ARROW-1559) - [C++] Kernel implementations for "unique" (compute distinct elements of array)
-* [ARROW-1573](https://issues.apache.org/jira/browse/ARROW-1573) - [C++] Implement stateful kernel function that uses DictionaryBuilder to compute dictionary indices
-* [ARROW-1575](https://issues.apache.org/jira/browse/ARROW-1575) - [Python] Add pyarrow.column factory function
-* [ARROW-1576](https://issues.apache.org/jira/browse/ARROW-1576) - [Python] Add utility functions (or a richer type hierarchy) for checking whether data type instances are members of various type classes
-* [ARROW-1577](https://issues.apache.org/jira/browse/ARROW-1577) - [JS] Package release script for NPM modules
-* [ARROW-1588](https://issues.apache.org/jira/browse/ARROW-1588) - [C++/Format] Harden Decimal Format
-* [ARROW-1593](https://issues.apache.org/jira/browse/ARROW-1593) - [PYTHON] serialize\_pandas should pass through the preserve\_index keyword
-* [ARROW-1594](https://issues.apache.org/jira/browse/ARROW-1594) - [Python] Enable multi-threaded conversions in Table.from\_pandas
-* [ARROW-1600](https://issues.apache.org/jira/browse/ARROW-1600) - [C++] Zero-copy Buffer constructor from std::string
-* [ARROW-1602](https://issues.apache.org/jira/browse/ARROW-1602) - [C++] Add IsValid/IsNotNull method to arrow::Array
-* [ARROW-1603](https://issues.apache.org/jira/browse/ARROW-1603) - [C++] Add BinaryArray method to get a value as a std::string
-* [ARROW-1604](https://issues.apache.org/jira/browse/ARROW-1604) - [Python] Support common type aliases in cast(...) and various type= arguments
-* [ARROW-1605](https://issues.apache.org/jira/browse/ARROW-1605) - [Python] pyarrow.array should be able to yield smaller integer types without an explicit cast
-* [ARROW-1607](https://issues.apache.org/jira/browse/ARROW-1607) - [C++] Implement DictionaryBuilder for Decimals
-* [ARROW-1613](https://issues.apache.org/jira/browse/ARROW-1613) - [Java] ArrowReader should not close the input ReadChannel
-* [ARROW-1616](https://issues.apache.org/jira/browse/ARROW-1616) - [Python] Add "write" method to RecordBatchStreamWriter that dispatches to write\_table/write\_batch as appropriate
-* [ARROW-1626](https://issues.apache.org/jira/browse/ARROW-1626) - Add make targets to run the inter-procedural static analysis tool called "infer".
-* [ARROW-1627](https://issues.apache.org/jira/browse/ARROW-1627) - [JAVA] Reduce heap usage (Phase 2) - memory footprint in AllocationManager.BufferLedger
-* [ARROW-1630](https://issues.apache.org/jira/browse/ARROW-1630) - [Serialization] Support Python datetime objects
-* [ARROW-1631](https://issues.apache.org/jira/browse/ARROW-1631) - [C++] Add GRPC to ThirdpartyToolchain.cmake
-* [ARROW-1635](https://issues.apache.org/jira/browse/ARROW-1635) - Add release management guide for PMCs
-* [ARROW-1637](https://issues.apache.org/jira/browse/ARROW-1637) - [C++] IPC round-trip for null type
-* [ARROW-1641](https://issues.apache.org/jira/browse/ARROW-1641) - [C++] Do not include <mutex\> in public headers
-* [ARROW-1648](https://issues.apache.org/jira/browse/ARROW-1648) - C++: Add cast from Dictionary[NullType] to NullType
-* [ARROW-1649](https://issues.apache.org/jira/browse/ARROW-1649) - C++: Print number of nulls in PrettyPrint for NullArray
-* [ARROW-1651](https://issues.apache.org/jira/browse/ARROW-1651) - [JS] Lazy row accessor in Table
-* [ARROW-1652](https://issues.apache.org/jira/browse/ARROW-1652) - [JS] Separate Vector into BatchVector and CompositeVector
-* [ARROW-1654](https://issues.apache.org/jira/browse/ARROW-1654) - [Python] pa.DataType cannot be pickled
-* [ARROW-1662](https://issues.apache.org/jira/browse/ARROW-1662) - Move OSX Dependency management into brew bundle Brewfiles
-* [ARROW-1665](https://issues.apache.org/jira/browse/ARROW-1665) - [Serialization] Support more custom datatypes in the default serialization context
-* [ARROW-1666](https://issues.apache.org/jira/browse/ARROW-1666) - [GLib] Enable gtk-doc on Travis CI Mac environment
-* [ARROW-1667](https://issues.apache.org/jira/browse/ARROW-1667) - [GLib] Support Meson
-* [ARROW-1671](https://issues.apache.org/jira/browse/ARROW-1671) - [C++] Change arrow::MakeArray to not return Status
-* [ARROW-1675](https://issues.apache.org/jira/browse/ARROW-1675) - [Python] Use RecordBatch.from\_pandas in FeatherWriter.write
-* [ARROW-1677](https://issues.apache.org/jira/browse/ARROW-1677) - [Blog] Add blog post on Ray and Arrow Python serialization
-* [ARROW-1679](https://issues.apache.org/jira/browse/ARROW-1679) - [GLib] Add garrow\_record\_batch\_reader\_read\_next()
-* [ARROW-1683](https://issues.apache.org/jira/browse/ARROW-1683) - [Python] Restore "TimestampType" to pyarrow namespace
-* [ARROW-1684](https://issues.apache.org/jira/browse/ARROW-1684) - [Python] Simplify user API for reading nested Parquet columns
-* [ARROW-1685](https://issues.apache.org/jira/browse/ARROW-1685) - [GLib] Add GArrowTableReader
-* [ARROW-1687](https://issues.apache.org/jira/browse/ARROW-1687) - [Python] Expose UnionArray to pyarrow
-* [ARROW-1689](https://issues.apache.org/jira/browse/ARROW-1689) - [Python] Categorical Indices Should Be Zero-Copy
-* [ARROW-1690](https://issues.apache.org/jira/browse/ARROW-1690) - [GLib] Add garrow\_array\_is\_valid()
-* [ARROW-1691](https://issues.apache.org/jira/browse/ARROW-1691) - [Java] Conform Java Decimal type implementation to format decisions in ARROW-1588
-* [ARROW-1697](https://issues.apache.org/jira/browse/ARROW-1697) - [GitHub] Add ISSUE\_TEMPLATE.md
-* [ARROW-1701](https://issues.apache.org/jira/browse/ARROW-1701) - [Serialization] Support zero copy PyTorch Tensor serialization
-* [ARROW-1702](https://issues.apache.org/jira/browse/ARROW-1702) - Update jemalloc in manylinux1 build
-* [ARROW-1703](https://issues.apache.org/jira/browse/ARROW-1703) - [C++] Vendor exact version of jemalloc we depend on
-* [ARROW-1707](https://issues.apache.org/jira/browse/ARROW-1707) - Update dev README after movement to GitBox
-* [ARROW-1710](https://issues.apache.org/jira/browse/ARROW-1710) - [Java] Remove non-nullable vectors in new vector class hierarchy
-* [ARROW-1716](https://issues.apache.org/jira/browse/ARROW-1716) - [Format/JSON] Use string integer value for Decimals in JSON
-* [ARROW-1717](https://issues.apache.org/jira/browse/ARROW-1717) - [Java] Remove public static helper method in vector classes for JSONReader/Writer
-* [ARROW-1718](https://issues.apache.org/jira/browse/ARROW-1718) - [Python] Implement casts from timestamp to date32/date64 and support in Array.from\_pandas
-* [ARROW-1719](https://issues.apache.org/jira/browse/ARROW-1719) - [Java] Remove accessor/mutator
-* [ARROW-1721](https://issues.apache.org/jira/browse/ARROW-1721) - [Python] Support null mask in places where it isn't supported in numpy\_to\_arrow.cc
-* [ARROW-1724](https://issues.apache.org/jira/browse/ARROW-1724) - [Packaging] Support Ubuntu 17.10
-* [ARROW-1725](https://issues.apache.org/jira/browse/ARROW-1725) - [Packaging] Upload .deb for Ubuntu 17.10
-* [ARROW-1726](https://issues.apache.org/jira/browse/ARROW-1726) - [GLib] Add setup description to verify C GLib build
-* [ARROW-1727](https://issues.apache.org/jira/browse/ARROW-1727) - [Format] Expand Arrow streaming format to permit new dictionaries and deltas / additions to existing dictionaries
-* [ARROW-1728](https://issues.apache.org/jira/browse/ARROW-1728) - [C++] Run clang-format checks in Travis CI
-* [ARROW-1734](https://issues.apache.org/jira/browse/ARROW-1734) - C++/Python: Add cast function at the Column level
-* [ARROW-1736](https://issues.apache.org/jira/browse/ARROW-1736) - [GLib] Add GArrowCastOptions:allow-time-truncate
-* [ARROW-1737](https://issues.apache.org/jira/browse/ARROW-1737) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE
-* [ARROW-1740](https://issues.apache.org/jira/browse/ARROW-1740) - C++: Kernel to get unique values of an Array/Column
-* [ARROW-1746](https://issues.apache.org/jira/browse/ARROW-1746) - [Python] Add build dependencies for Arch Linux
-* [ARROW-1747](https://issues.apache.org/jira/browse/ARROW-1747) - [C++] Don't export symbols of statically linked libraries
-* [ARROW-1748](https://issues.apache.org/jira/browse/ARROW-1748) - [GLib] Add GArrowRecordBatchBuilder
-* [ARROW-1750](https://issues.apache.org/jira/browse/ARROW-1750) - [C++] Remove the need for arrow/util/random.h
-* [ARROW-1752](https://issues.apache.org/jira/browse/ARROW-1752) - [Packaging] Add GPU packages for Debian and Ubuntu
-* [ARROW-1753](https://issues.apache.org/jira/browse/ARROW-1753) - [Python] Provide for matching subclasses with register\_type in serialization context
-* [ARROW-1755](https://issues.apache.org/jira/browse/ARROW-1755) - [C++] Add build options for MSVC to use static runtime libraries
-* [ARROW-1758](https://issues.apache.org/jira/browse/ARROW-1758) - [Python] Remove pickle=True option for object serialization
-* [ARROW-1759](https://issues.apache.org/jira/browse/ARROW-1759) - [Python] Add function / property to get implied Arrow schema from Parquet file
-* [ARROW-1763](https://issues.apache.org/jira/browse/ARROW-1763) - [Python] DataType should be hashable
-* [ARROW-1765](https://issues.apache.org/jira/browse/ARROW-1765) - [Doc] Use dependencies from conda in C++ docker build
-* [ARROW-1767](https://issues.apache.org/jira/browse/ARROW-1767) - [C++] Support file reads and writes over 2GB on Windows
-* [ARROW-1772](https://issues.apache.org/jira/browse/ARROW-1772) - [C++] Add public-api-test module in style of parquet-cpp
-* [ARROW-1773](https://issues.apache.org/jira/browse/ARROW-1773) - [C++] Add casts from date/time types to compatible signed integers
-* [ARROW-1775](https://issues.apache.org/jira/browse/ARROW-1775) - Ability to abort created but unsealed Plasma objects
-* [ARROW-1777](https://issues.apache.org/jira/browse/ARROW-1777) - [C++] Add static ctor ArrayData::Make for nicer syntax in places
-* [ARROW-1779](https://issues.apache.org/jira/browse/ARROW-1779) - [Java] Integration test breaks without zeroing out validity vectors
-* [ARROW-1782](https://issues.apache.org/jira/browse/ARROW-1782) - [Python] Expose compressors as pyarrow.compress, pyarrow.decompress
-* [ARROW-1783](https://issues.apache.org/jira/browse/ARROW-1783) - [Python] Convert SerializedPyObject to/from sequence of component buffers with minimal memory allocation / copying
-* [ARROW-1784](https://issues.apache.org/jira/browse/ARROW-1784) - [Python] Read and write pandas.DataFrame in pyarrow.serialize by decomposing the BlockManager rather than coercing to Arrow format
-* [ARROW-1785](https://issues.apache.org/jira/browse/ARROW-1785) - [Format/C++/Java] Remove VectorLayout metadata from Flatbuffers metadata
-* [ARROW-1787](https://issues.apache.org/jira/browse/ARROW-1787) - [Python] Support reading parquet files into DataFrames in a backward compatible way
-* [ARROW-1794](https://issues.apache.org/jira/browse/ARROW-1794) - [C++/Python] Rename DecimalArray to Decimal128Array
-* [ARROW-1795](https://issues.apache.org/jira/browse/ARROW-1795) - [Plasma C++] change eviction policy
-* [ARROW-1801](https://issues.apache.org/jira/browse/ARROW-1801) - [Docs] Update install instructions to use red-data-tools repos
-* [ARROW-1802](https://issues.apache.org/jira/browse/ARROW-1802) - [GLib] Add Arrow GPU support
-* [ARROW-1806](https://issues.apache.org/jira/browse/ARROW-1806) - [GLib] Add garrow\_record\_batch\_writer\_write\_table()
-* [ARROW-1808](https://issues.apache.org/jira/browse/ARROW-1808) - [C++] Make RecordBatch interface virtual to permit record batches that lazy-materialize columns
-* [ARROW-1809](https://issues.apache.org/jira/browse/ARROW-1809) - [GLib] Use .xml instead of .sgml for GTK-Doc main file
-* [ARROW-1810](https://issues.apache.org/jira/browse/ARROW-1810) - [Plasma] Remove test shell scripts
-* [ARROW-1816](https://issues.apache.org/jira/browse/ARROW-1816) - [Java] Resolve new vector classes structure for timestamp, date and maybe interval
-* [ARROW-1817](https://issues.apache.org/jira/browse/ARROW-1817) - Configure JsonFileReader to read NaN for floats
-* [ARROW-1818](https://issues.apache.org/jira/browse/ARROW-1818) - Examine Java Dependencies
-* [ARROW-1819](https://issues.apache.org/jira/browse/ARROW-1819) - [Java] Remove legacy vector classes
-* [ARROW-1820](https://issues.apache.org/jira/browse/ARROW-1820) - [C++] Create arrow\_compute shared library subcomponent
-* [ARROW-1826](https://issues.apache.org/jira/browse/ARROW-1826) - [JAVA] Avoid branching at cell level (copyFrom)
-* [ARROW-1827](https://issues.apache.org/jira/browse/ARROW-1827) - [Java] Add checkstyle config file and header file
-* [ARROW-1828](https://issues.apache.org/jira/browse/ARROW-1828) - [C++] Implement hash kernel specialization for BooleanType
-* [ARROW-1834](https://issues.apache.org/jira/browse/ARROW-1834) - [Doc] Build documentation in separate build folders
-* [ARROW-1838](https://issues.apache.org/jira/browse/ARROW-1838) - [C++] Use compute::Datum uniformly for input argument to kernels
-* [ARROW-1841](https://issues.apache.org/jira/browse/ARROW-1841) - [JS] Update text-encoding-utf-8 and tslib for node ESModules support
-* [ARROW-1844](https://issues.apache.org/jira/browse/ARROW-1844) - [C++] Basic benchmark suite for hash kernels
-* [ARROW-1849](https://issues.apache.org/jira/browse/ARROW-1849) - [GLib] Add input checks to GArrowRecordBatch
-* [ARROW-1850](https://issues.apache.org/jira/browse/ARROW-1850) - [C++] Use const void\* in Writable::Write instead of const uint8\_t\*
-* [ARROW-1854](https://issues.apache.org/jira/browse/ARROW-1854) - [Python] Improve performance of serializing object dtype ndarrays
-* [ARROW-1855](https://issues.apache.org/jira/browse/ARROW-1855) - [GLib] Add workaround for build failure on macOS
-* [ARROW-1857](https://issues.apache.org/jira/browse/ARROW-1857) - [Python] Add switch for boost linkage with static parquet in wheels
-* [ARROW-1859](https://issues.apache.org/jira/browse/ARROW-1859) - [GLib] Add GArrowDictionaryDataType
-* [ARROW-1862](https://issues.apache.org/jira/browse/ARROW-1862) - [GLib] Add GArrowDictionaryArray
-* [ARROW-1864](https://issues.apache.org/jira/browse/ARROW-1864) - [Java] Upgrade Netty to 4.1.x
-* [ARROW-1866](https://issues.apache.org/jira/browse/ARROW-1866) - [Java] Combine MapVector and NonNullableMapVector Classes
-* [ARROW-1867](https://issues.apache.org/jira/browse/ARROW-1867) - [Java] Add BitVector APIs from old vector class
-* [ARROW-1874](https://issues.apache.org/jira/browse/ARROW-1874) - [GLib] Add garrow\_array\_unique()
-* [ARROW-1878](https://issues.apache.org/jira/browse/ARROW-1878) - [GLib] Add garrow\_array\_dictionary\_encode()
-* [ARROW-1884](https://issues.apache.org/jira/browse/ARROW-1884) - [C++] Make JsonReader/JsonWriter classes internal APIs
-* [ARROW-1885](https://issues.apache.org/jira/browse/ARROW-1885) - [Java] Restore previous MapVector class names
-* [ARROW-1901](https://issues.apache.org/jira/browse/ARROW-1901) - [Python] Support recursive mkdir for DaskFilesystem
-* [ARROW-1902](https://issues.apache.org/jira/browse/ARROW-1902) - [Python] Remove mkdir race condition from write\_to\_dataset
-* [ARROW-1905](https://issues.apache.org/jira/browse/ARROW-1905) - [Python] Add more functions for checking exact types in pyarrow.types
-* [ARROW-1911](https://issues.apache.org/jira/browse/ARROW-1911) - Add Graphistry to Arrow JS proof points
-* [ARROW-1922](https://issues.apache.org/jira/browse/ARROW-1922) - Blog post on recent improvements/changes in JAVA Vectors
-* [ARROW-1932](https://issues.apache.org/jira/browse/ARROW-1932) - [Website] Update site for 0.8.0
-* [ARROW-1934](https://issues.apache.org/jira/browse/ARROW-1934) - [Website] Blog post summarizing highlights of 0.8.0 release
-
-
-
-# Apache Arrow 0.7.1 (2017-10-01)
-
-## New Features and Improvements
-
-* [ARROW-559](https://issues.apache.org/jira/browse/ARROW-559) - Script to easily verify release in all languages
-* [ARROW-1464](https://issues.apache.org/jira/browse/ARROW-1464) - [GLib] Documentation for troubleshooting of build errors
-* [ARROW-1537](https://issues.apache.org/jira/browse/ARROW-1537) - [C++] Support building with full path install\_name on macOS
-* [ARROW-1546](https://issues.apache.org/jira/browse/ARROW-1546) - [GLib] Support GLib 2.40 again
-* [ARROW-1548](https://issues.apache.org/jira/browse/ARROW-1548) - [GLib] Support build append in builder
-* [ARROW-1578](https://issues.apache.org/jira/browse/ARROW-1578) - [C++/Python] Run lint checks in Travis CI to fail for linting issues as early as possible
-* [ARROW-1592](https://issues.apache.org/jira/browse/ARROW-1592) - [GLib] Add GArrowUIntArrayBuilder
-* [ARROW-1608](https://issues.apache.org/jira/browse/ARROW-1608) - Support Release verification script on macOS
-* [ARROW-1612](https://issues.apache.org/jira/browse/ARROW-1612) - [GLib] add how to install for mac os to README
-* [ARROW-1618](https://issues.apache.org/jira/browse/ARROW-1618) - [JAVA] Reduce Heap Usage (Phase 1): move release listener logic to Allocation Manager
-* [ARROW-1634](https://issues.apache.org/jira/browse/ARROW-1634) - [Website] Updates for 0.7.1 release
-
-
-## Bug Fixes
-
-* [ARROW-1497](https://issues.apache.org/jira/browse/ARROW-1497) - [Java] JsonFileReader doesn't set value count for some vectors
-* [ARROW-1500](https://issues.apache.org/jira/browse/ARROW-1500) - [C++] Result of ftruncate ignored in MemoryMappedFile::Create
-* [ARROW-1529](https://issues.apache.org/jira/browse/ARROW-1529) - [GLib] Fix failure on macOS on Travis CI
-* [ARROW-1533](https://issues.apache.org/jira/browse/ARROW-1533) - [JAVA] realloc should consider the existing buffer capacity for computing target memory requirement
-* [ARROW-1536](https://issues.apache.org/jira/browse/ARROW-1536) - [C++] Do not transitively depend on libboost\_system
-* [ARROW-1542](https://issues.apache.org/jira/browse/ARROW-1542) - [C++] Windows release verification script should not modify conda environment
-* [ARROW-1544](https://issues.apache.org/jira/browse/ARROW-1544) - [JS] Export Vector type definitions
-* [ARROW-1545](https://issues.apache.org/jira/browse/ARROW-1545) - Int64Builder should not need int64() as arg
-* [ARROW-1547](https://issues.apache.org/jira/browse/ARROW-1547) - [JAVA] Fix 8x memory over-allocation in BitVector
-* [ARROW-1550](https://issues.apache.org/jira/browse/ARROW-1550) - [Python] Fix flaky test on Windows
-* [ARROW-1553](https://issues.apache.org/jira/browse/ARROW-1553) - [JAVA] Implement setInitialCapacity for MapWriter and pass on this capacity during lazy creation of child vectors
-* [ARROW-1554](https://issues.apache.org/jira/browse/ARROW-1554) - [Python] Document that pip wheels depend on MSVC14 runtime
-* [ARROW-1557](https://issues.apache.org/jira/browse/ARROW-1557) - [PYTHON] pyarrow.Table.from\_arrays doesn't validate names length
-* [ARROW-1590](https://issues.apache.org/jira/browse/ARROW-1590) - Flow TS Table method generics
-* [ARROW-1591](https://issues.apache.org/jira/browse/ARROW-1591) - C++: Xcode 9 is not correctly detected
-* [ARROW-1595](https://issues.apache.org/jira/browse/ARROW-1595) - [Python] Fix package dependency issues causing build failures
-* [ARROW-1598](https://issues.apache.org/jira/browse/ARROW-1598) - [C++/Tutorials] Mismatch between code comment and actual code about Object ID
-* [ARROW-1601](https://issues.apache.org/jira/browse/ARROW-1601) - [C++] READ\_NEXT\_BITSET reads one byte past the last byte on last iteration
-* [ARROW-1606](https://issues.apache.org/jira/browse/ARROW-1606) - Python: Windows wheels don't include .lib files
-* [ARROW-1610](https://issues.apache.org/jira/browse/ARROW-1610) - C++/Python: Only call python-prefix if the default PYTHON\_LIBRARY is not present
-* [ARROW-1611](https://issues.apache.org/jira/browse/ARROW-1611) - Crash in BitmapReader when length is zero
-* [ARROW-1619](https://issues.apache.org/jira/browse/ARROW-1619) - [Java] Correctly set "lastSet" for variable vectors in JsonReader
-
-
-
-# Apache Arrow 0.7.0 (2017-09-17)
-
-## Bug Fixes
-
-* [ARROW-12](https://issues.apache.org/jira/browse/ARROW-12) - Get Github activity mirrored to JIRA
-* [ARROW-248](https://issues.apache.org/jira/browse/ARROW-248) - UnionVector.close() should call clear()
-* [ARROW-269](https://issues.apache.org/jira/browse/ARROW-269) - UnionVector getBuffers method does not include typevector
-* [ARROW-407](https://issues.apache.org/jira/browse/ARROW-407) - BitVector.copyFromSafe() should re-allocate if necessary instead of returning false
-* [ARROW-801](https://issues.apache.org/jira/browse/ARROW-801) - [JAVA] Provide direct access to underlying buffer memory addresses in a consistent way without generating garbage or a large amount of indirection
-* [ARROW-1302](https://issues.apache.org/jira/browse/ARROW-1302) - C++: ${MAKE} variable not set sometimes on older MacOS installations
-* [ARROW-1332](https://issues.apache.org/jira/browse/ARROW-1332) - [Packaging] Building Windows wheels in Apache repos
-* [ARROW-1354](https://issues.apache.org/jira/browse/ARROW-1354) - [Python] Segfault in Table.from\_pandas with Mixed-Type Categories
-* [ARROW-1357](https://issues.apache.org/jira/browse/ARROW-1357) - [Python] Data corruption in reading multi-file parquet dataset
-* [ARROW-1363](https://issues.apache.org/jira/browse/ARROW-1363) - [C++] IPC writer sends buffer layout for dictionary rather than indices
-* [ARROW-1365](https://issues.apache.org/jira/browse/ARROW-1365) - [Python] Remove usage of removed jemalloc\_memory\_pool in Python API docs
-* [ARROW-1373](https://issues.apache.org/jira/browse/ARROW-1373) - [Java] Implement get<type\>Buffer() methods at the ValueVector interface
-* [ARROW-1375](https://issues.apache.org/jira/browse/ARROW-1375) - [C++] Visual Studio 2017 Appveyor builds failing
-* [ARROW-1378](https://issues.apache.org/jira/browse/ARROW-1378) - [Python] whl is not a supported wheel on this platform on Debian/Jessie
-* [ARROW-1379](https://issues.apache.org/jira/browse/ARROW-1379) - [Java] maven dependency issues - both unused and undeclared
-* [ARROW-1390](https://issues.apache.org/jira/browse/ARROW-1390) - [Python] Extend tests for python serialization
-* [ARROW-1407](https://issues.apache.org/jira/browse/ARROW-1407) - Dictionaries can only hold a maximum of 4096 indices
-* [ARROW-1411](https://issues.apache.org/jira/browse/ARROW-1411) - [Python] Booleans in Float Columns cause Segfault
-* [ARROW-1414](https://issues.apache.org/jira/browse/ARROW-1414) - [GLib] Cast after status check
-* [ARROW-1421](https://issues.apache.org/jira/browse/ARROW-1421) - [Python] pyarrow.serialize cannot serialize a Python dict input
-* [ARROW-1426](https://issues.apache.org/jira/browse/ARROW-1426) - [Website] The title element of the top page is empty
-* [ARROW-1429](https://issues.apache.org/jira/browse/ARROW-1429) - [Python] Error loading parquet file with \_metadata from HDFS
-* [ARROW-1430](https://issues.apache.org/jira/browse/ARROW-1430) - [Python] flake8 warnings are not failing CI builds
-* [ARROW-1434](https://issues.apache.org/jira/browse/ARROW-1434) - [C++/Python] pyarrow.Array.from\_pandas does not support datetime64[D] arrays
-* [ARROW-1435](https://issues.apache.org/jira/browse/ARROW-1435) - [Python] PyArrow not propagating timezone information from Parquet to Python
-* [ARROW-1437](https://issues.apache.org/jira/browse/ARROW-1437) - [Python] pa.Array.from\_pandas segfaults when given a mixed-type array
-* [ARROW-1439](https://issues.apache.org/jira/browse/ARROW-1439) - [Packaging] Automate updating RPM in RPM build
-* [ARROW-1443](https://issues.apache.org/jira/browse/ARROW-1443) - [Java] Bug on ArrowBuf.setBytes with unsliced ByteBuffers
-* [ARROW-1444](https://issues.apache.org/jira/browse/ARROW-1444) - [JAVA] BitVector.splitAndTransfer copies last byte incorrectly
-* [ARROW-1446](https://issues.apache.org/jira/browse/ARROW-1446) - Python: Writing more than 2^31 rows from pandas dataframe causes row count overflow error
-* [ARROW-1450](https://issues.apache.org/jira/browse/ARROW-1450) - [Python] Raise proper error if custom serialization handler fails
-* [ARROW-1452](https://issues.apache.org/jira/browse/ARROW-1452) - [C++] Make UNUSED macro name more unique so it does not conflict with thirdparty projects
-* [ARROW-1453](https://issues.apache.org/jira/browse/ARROW-1453) - [Python] Implement WriteTensor for non-contiguous tensors
-* [ARROW-1457](https://issues.apache.org/jira/browse/ARROW-1457) - [C++] Optimize strided WriteTensor
-* [ARROW-1458](https://issues.apache.org/jira/browse/ARROW-1458) - [Python] Document that HadoopFileSystem.mkdir with create\_parents=False has no effect
-* [ARROW-1459](https://issues.apache.org/jira/browse/ARROW-1459) - [Python] PyArrow fails to load partitioned parquet files with non-primitive types
-* [ARROW-1461](https://issues.apache.org/jira/browse/ARROW-1461) - [C++] Disable builds using LLVM apt packages temporarily
-* [ARROW-1467](https://issues.apache.org/jira/browse/ARROW-1467) - [JAVA]: Fix reset() and allocateNew() in Nullable Value Vectors template
-* [ARROW-1469](https://issues.apache.org/jira/browse/ARROW-1469) - Segfault when serializing Pandas series with mixed object type
-* [ARROW-1490](https://issues.apache.org/jira/browse/ARROW-1490) - [Java] Allow Travis CI failures for JDK9 for now
-* [ARROW-1493](https://issues.apache.org/jira/browse/ARROW-1493) - [C++] Flush the output stream at the end of each PrettyPrint function
-* [ARROW-1495](https://issues.apache.org/jira/browse/ARROW-1495) - [C++] Store shared\_ptr to boxed arrays in RecordBatch
-* [ARROW-1507](https://issues.apache.org/jira/browse/ARROW-1507) - [C++] arrow/compute/api.h can't be used without arrow/array.h
-* [ARROW-1512](https://issues.apache.org/jira/browse/ARROW-1512) - [Docs] NumericArray has no member named 'raw\_data'
-* [ARROW-1514](https://issues.apache.org/jira/browse/ARROW-1514) - [C++] Fix a typo in document
-* [ARROW-1527](https://issues.apache.org/jira/browse/ARROW-1527) - Fix Travis JDK9 build
-* [ARROW-1531](https://issues.apache.org/jira/browse/ARROW-1531) - [C++] Return ToBytes by value from Decimal128
-* [ARROW-1532](https://issues.apache.org/jira/browse/ARROW-1532) - [Python] Referencing an Empty Schema causes a SegFault
-
-
-## New Features and Improvements
-
-* [ARROW-34](https://issues.apache.org/jira/browse/ARROW-34) - C++: establish a basic function evaluation model
-* [ARROW-229](https://issues.apache.org/jira/browse/ARROW-229) - [C++] Implement safe casts for primitive types
-* [ARROW-592](https://issues.apache.org/jira/browse/ARROW-592) - [C++] Provide .deb and .rpm packages
-* [ARROW-594](https://issues.apache.org/jira/browse/ARROW-594) - [Python] Provide interface to write pyarrow.Table to a stream
-* [ARROW-695](https://issues.apache.org/jira/browse/ARROW-695) - Integration tests for Decimal types
-* [ARROW-696](https://issues.apache.org/jira/browse/ARROW-696) - [C++] Add JSON read/write support for decimals for integration tests
-* [ARROW-759](https://issues.apache.org/jira/browse/ARROW-759) - [Python] Implement a transient list serialization function that can handle a mix of scalars, lists, ndarrays, dicts
-* [ARROW-786](https://issues.apache.org/jira/browse/ARROW-786) - [Format] In-memory format for 128-bit Decimals, handling of sign bit
-* [ARROW-837](https://issues.apache.org/jira/browse/ARROW-837) - [Python] Expose buffer allocation, FixedSizeBufferWriter
-* [ARROW-941](https://issues.apache.org/jira/browse/ARROW-941) - [Docs] Improve "cold start" integration testing instructions
-* [ARROW-989](https://issues.apache.org/jira/browse/ARROW-989) - [Python] Write pyarrow.Table to FileWriter or StreamWriter
-* [ARROW-1156](https://issues.apache.org/jira/browse/ARROW-1156) - [Python] pyarrow.Array.from\_pandas should take a type parameter
-* [ARROW-1238](https://issues.apache.org/jira/browse/ARROW-1238) - [Java] Add JSON read/write support for decimals for integration tests
-* [ARROW-1286](https://issues.apache.org/jira/browse/ARROW-1286) - PYTHON: support Categorical serialization to/from parquet
-* [ARROW-1307](https://issues.apache.org/jira/browse/ARROW-1307) - [Python] Add pandas serialization section + Feather API to Sphinx docs
-* [ARROW-1317](https://issues.apache.org/jira/browse/ARROW-1317) - [Python] Add function to set Hadoop CLASSPATH
-* [ARROW-1331](https://issues.apache.org/jira/browse/ARROW-1331) - [Java] Refactor tests
-* [ARROW-1339](https://issues.apache.org/jira/browse/ARROW-1339) - [C++] Use boost::filesystem for handling of platform-specific file path encodings
-* [ARROW-1344](https://issues.apache.org/jira/browse/ARROW-1344) - [C++] Calling BufferOutputStream::Write after calling Finish crashes
-* [ARROW-1348](https://issues.apache.org/jira/browse/ARROW-1348) - [C++/Python] Add release verification script for Windows
-* [ARROW-1351](https://issues.apache.org/jira/browse/ARROW-1351) - Automate updating CHANGELOG.md as part of release scripts
-* [ARROW-1352](https://issues.apache.org/jira/browse/ARROW-1352) - [Integration] Improve print formatting for producer, consumer line
-* [ARROW-1355](https://issues.apache.org/jira/browse/ARROW-1355) - Make arrow buildable with java9
-* [ARROW-1356](https://issues.apache.org/jira/browse/ARROW-1356) - [Website] Add new committers
-* [ARROW-1358](https://issues.apache.org/jira/browse/ARROW-1358) - Update source release scripts to account for new SHA checksum policy
-* [ARROW-1359](https://issues.apache.org/jira/browse/ARROW-1359) - [Python] Add Parquet writer option to normalize field names for use in Spark
-* [ARROW-1364](https://issues.apache.org/jira/browse/ARROW-1364) - [C++] IPC reader and writer specialized for GPU device memory
-* [ARROW-1366](https://issues.apache.org/jira/browse/ARROW-1366) - [Python] Add instructions for starting the Plasma store when installing pyarrow from wheels
-* [ARROW-1372](https://issues.apache.org/jira/browse/ARROW-1372) - [Plasma] Support for storing data in huge pages
-* [ARROW-1376](https://issues.apache.org/jira/browse/ARROW-1376) - [C++] RecordBatchStreamReader::Open API is inconsistent with writer
-* [ARROW-1377](https://issues.apache.org/jira/browse/ARROW-1377) - [Python] Add function to assist with benchmarking Parquet scan performance
-* [ARROW-1381](https://issues.apache.org/jira/browse/ARROW-1381) - [Python] Improve performance of SerializedPyObject.to\_buffer
-* [ARROW-1383](https://issues.apache.org/jira/browse/ARROW-1383) - [C++] Support std::vector<bool\> in builder vector appends
-* [ARROW-1384](https://issues.apache.org/jira/browse/ARROW-1384) - [C++] Add convenience function for serializing a record batch to an IPC message
-* [ARROW-1386](https://issues.apache.org/jira/browse/ARROW-1386) - [C++] Unpin CMake version in MSVC build toolchain
-* [ARROW-1387](https://issues.apache.org/jira/browse/ARROW-1387) - [C++] Set up GPU leaf library build toolchain
-* [ARROW-1392](https://issues.apache.org/jira/browse/ARROW-1392) - [C++] Implement reader and writer IO interfaces for GPU buffers
-* [ARROW-1395](https://issues.apache.org/jira/browse/ARROW-1395) - [C++] Remove APIs deprecated as of 0.5.0 and later versions
-* [ARROW-1396](https://issues.apache.org/jira/browse/ARROW-1396) - [C++] Add PrettyPrint function for Schemas, which also outputs any dictionaries
-* [ARROW-1397](https://issues.apache.org/jira/browse/ARROW-1397) - [Packaging] Use Docker instead of Vagrant
-* [ARROW-1399](https://issues.apache.org/jira/browse/ARROW-1399) - [C++] Add CUDA build version in a public header to help prevent ABI conflicts
-* [ARROW-1400](https://issues.apache.org/jira/browse/ARROW-1400) - [Python] Ability to create partitions when writing to Parquet
-* [ARROW-1401](https://issues.apache.org/jira/browse/ARROW-1401) - [C++] Add extra debugging context to failures in RETURN\_NOT\_OK in debug builds
-* [ARROW-1402](https://issues.apache.org/jira/browse/ARROW-1402) - [C++] Possibly deprecate public APIs that use MutableBuffer
-* [ARROW-1404](https://issues.apache.org/jira/browse/ARROW-1404) - [Packaging] Build .deb and .rpm on Travis CI
-* [ARROW-1405](https://issues.apache.org/jira/browse/ARROW-1405) - [Python] Add logging option for verbose memory allocations
-* [ARROW-1406](https://issues.apache.org/jira/browse/ARROW-1406) - [Python] Harden user API for generating serialized schema and record batch messages as memoryview-compatible objects
-* [ARROW-1408](https://issues.apache.org/jira/browse/ARROW-1408) - [C++] Refactor and make IPC read / write APIs more consistent, add appropriate deprecations
-* [ARROW-1410](https://issues.apache.org/jira/browse/ARROW-1410) - Plasma object store occasionally pauses for a long time
-* [ARROW-1412](https://issues.apache.org/jira/browse/ARROW-1412) - [Plasma] Add higher level API for putting and getting Python objects
-* [ARROW-1413](https://issues.apache.org/jira/browse/ARROW-1413) - [C++] Add include-what-you-use configuration
-* [ARROW-1415](https://issues.apache.org/jira/browse/ARROW-1415) - [GLib] Support date32 and date64
-* [ARROW-1416](https://issues.apache.org/jira/browse/ARROW-1416) - [Format] Clarify example array in memory layout documentation
-* [ARROW-1417](https://issues.apache.org/jira/browse/ARROW-1417) - [Python] Allow more generic filesystem objects to be passed to ParquetDataset
-* [ARROW-1418](https://issues.apache.org/jira/browse/ARROW-1418) - [Python] Introduce SerializationContext to register custom serialization callbacks
-* [ARROW-1419](https://issues.apache.org/jira/browse/ARROW-1419) - [GLib] Suppress sign-conversion warning on Clang
-* [ARROW-1427](https://issues.apache.org/jira/browse/ARROW-1427) - [GLib] Add a link to readme of Arrow GLib
-* [ARROW-1428](https://issues.apache.org/jira/browse/ARROW-1428) - [C++] Append steps to clone source code to README.md
-* [ARROW-1432](https://issues.apache.org/jira/browse/ARROW-1432) - [C++] Build bundled jemalloc functions with private prefix
-* [ARROW-1433](https://issues.apache.org/jira/browse/ARROW-1433) - [C++] Simplify implementation of Array::Slice
-* [ARROW-1438](https://issues.apache.org/jira/browse/ARROW-1438) - [Plasma] Pull SerializationContext through PlasmaClient put and get
-* [ARROW-1441](https://issues.apache.org/jira/browse/ARROW-1441) - [Site] Add Ruby to Flexible section
-* [ARROW-1442](https://issues.apache.org/jira/browse/ARROW-1442) - [Website] Add pointer to nightly conda packages on /install
-* [ARROW-1447](https://issues.apache.org/jira/browse/ARROW-1447) - [C++] Round of include-what-you-use include cleanups
-* [ARROW-1448](https://issues.apache.org/jira/browse/ARROW-1448) - [Packaging] Support uploading built .deb and .rpm to Bintray
-* [ARROW-1449](https://issues.apache.org/jira/browse/ARROW-1449) - Implement Decimal using only Int128
-* [ARROW-1451](https://issues.apache.org/jira/browse/ARROW-1451) - [C++] Create arrow/io/api.h
-* [ARROW-1460](https://issues.apache.org/jira/browse/ARROW-1460) - [C++] Upgrade clang-format used to LLVM 4.0
-* [ARROW-1462](https://issues.apache.org/jira/browse/ARROW-1462) - [GLib] Support time array
-* [ARROW-1466](https://issues.apache.org/jira/browse/ARROW-1466) - [C++] Support DecimalArray in arrow::PrettyPrint
-* [ARROW-1468](https://issues.apache.org/jira/browse/ARROW-1468) - [C++] Append to PrimitiveBuilder from std::vector<CTYPE\>
-* [ARROW-1479](https://issues.apache.org/jira/browse/ARROW-1479) - [JS] Expand JavaScript implementation
-* [ARROW-1480](https://issues.apache.org/jira/browse/ARROW-1480) - [Python] Improve performance of serializing sets
-* [ARROW-1481](https://issues.apache.org/jira/browse/ARROW-1481) - [C++] Expose type casts as generic callable object that can write into pre-allocated memory
-* [ARROW-1494](https://issues.apache.org/jira/browse/ARROW-1494) - [C++] Document that shared\_ptr returned by RecordBatch::column needs to be retained
-* [ARROW-1499](https://issues.apache.org/jira/browse/ARROW-1499) - [Python] Consider adding option to parquet.write\_table that sets options for maximum Spark compatibility
-* [ARROW-1504](https://issues.apache.org/jira/browse/ARROW-1504) - [GLib] Support timestamp
-* [ARROW-1505](https://issues.apache.org/jira/browse/ARROW-1505) - [GLib] Simplify arguments check
-* [ARROW-1506](https://issues.apache.org/jira/browse/ARROW-1506) - [C++] Support pkg-config for compute modules
-* [ARROW-1508](https://issues.apache.org/jira/browse/ARROW-1508) - C++: Add support for FixedSizeBinaryType in DictionaryBuilder
-* [ARROW-1510](https://issues.apache.org/jira/browse/ARROW-1510) - [C++] Support cast
-* [ARROW-1511](https://issues.apache.org/jira/browse/ARROW-1511) - [C++] Deprecate arrow::MakePrimitiveArray
-* [ARROW-1513](https://issues.apache.org/jira/browse/ARROW-1513) - C++: Add cast from Dictionary to plain arrays
-* [ARROW-1515](https://issues.apache.org/jira/browse/ARROW-1515) - [GLib] Detect version directly
-* [ARROW-1516](https://issues.apache.org/jira/browse/ARROW-1516) - [GLib] Update document
-* [ARROW-1517](https://issues.apache.org/jira/browse/ARROW-1517) - Remove unnecessary temporary in DecimalUtil::ToString function
-* [ARROW-1519](https://issues.apache.org/jira/browse/ARROW-1519) - [C++] Move DecimalUtil functions to methods on the Int128 class
-* [ARROW-1528](https://issues.apache.org/jira/browse/ARROW-1528) - [GLib] Resolve include dependency
-* [ARROW-1530](https://issues.apache.org/jira/browse/ARROW-1530) - [C++] Install arrow/util/parallel.h
-* [ARROW-1551](https://issues.apache.org/jira/browse/ARROW-1551) - [Website] Updates for 0.7.0 release
-* [ARROW-1597](https://issues.apache.org/jira/browse/ARROW-1597) - [Packaging] arrow-compute.pc is missing in .deb/.rpm file list
-
-
-
-# Apache Arrow 0.6.0 (2017-08-14)
-
-## Bug Fixes
-
-* [ARROW-187](https://issues.apache.org/jira/browse/ARROW-187) - [C++] Decide on how pedantic we want to be about exceptions
-* [ARROW-276](https://issues.apache.org/jira/browse/ARROW-276) - [JAVA] Nullable Value Vectors should extend BaseValueVector instead of BaseDataValueVector
-* [ARROW-573](https://issues.apache.org/jira/browse/ARROW-573) - [Python/C++] Support ordered dictionary data, pandas Categorical
-* [ARROW-884](https://issues.apache.org/jira/browse/ARROW-884) - [C++] Exclude internal classes from documentation
-* [ARROW-932](https://issues.apache.org/jira/browse/ARROW-932) - [Python] Fix compiler warnings on MSVC
-* [ARROW-968](https://issues.apache.org/jira/browse/ARROW-968) - [Python] RecordBatch [i:j] syntax is incomplete
-* [ARROW-1192](https://issues.apache.org/jira/browse/ARROW-1192) - [JAVA] Improve splitAndTransfer performance for List and Union vectors
-* [ARROW-1195](https://issues.apache.org/jira/browse/ARROW-1195) - [C++] CpuInfo doesn't get cache size on Windows
-* [ARROW-1204](https://issues.apache.org/jira/browse/ARROW-1204) - [C++] lz4 ExternalProject fails in Visual Studio 2015
-* [ARROW-1225](https://issues.apache.org/jira/browse/ARROW-1225) - [Python] pyarrow.array does not attempt to convert bytes to UTF8 when passed a StringType
-* [ARROW-1237](https://issues.apache.org/jira/browse/ARROW-1237) - [JAVA] Expose the ability to set lastSet
-* [ARROW-1239](https://issues.apache.org/jira/browse/ARROW-1239) - issue with current version of git-commit-id-plugin
-* [ARROW-1240](https://issues.apache.org/jira/browse/ARROW-1240) - security: upgrade logback to address CVE-2017-5929
-* [ARROW-1241](https://issues.apache.org/jira/browse/ARROW-1241) - [C++] Visual Studio 2017 Appveyor build job
-* [ARROW-1242](https://issues.apache.org/jira/browse/ARROW-1242) - [Java] security - upgrade Jackson to mitigate 3 CVE vulnerabilities
-* [ARROW-1245](https://issues.apache.org/jira/browse/ARROW-1245) - [Integration] Java Integration Tests Disabled
-* [ARROW-1248](https://issues.apache.org/jira/browse/ARROW-1248) - [Python] C linkage warnings in Clang with public Cython API
-* [ARROW-1249](https://issues.apache.org/jira/browse/ARROW-1249) - [JAVA] Expose the fillEmpties function from Nullable<Varlength\>Vector.mutator
-* [ARROW-1263](https://issues.apache.org/jira/browse/ARROW-1263) - [C++] CpuInfo should be able to get CPU features on Windows
-* [ARROW-1265](https://issues.apache.org/jira/browse/ARROW-1265) - [Plasma] Plasma store memory leak warnings in Python test suite
-* [ARROW-1267](https://issues.apache.org/jira/browse/ARROW-1267) - [Java] Handle zero length case in BitVector.splitAndTransfer
-* [ARROW-1269](https://issues.apache.org/jira/browse/ARROW-1269) - [Packaging] Add Windows wheel build scripts from ARROW-1068 to arrow-dist
-* [ARROW-1275](https://issues.apache.org/jira/browse/ARROW-1275) - [C++] Default static library prefix for Snappy should be "\_static"
-* [ARROW-1276](https://issues.apache.org/jira/browse/ARROW-1276) - Cannot serialize empty DataFrame to parquet
-* [ARROW-1283](https://issues.apache.org/jira/browse/ARROW-1283) - [Java] VectorSchemaRoot should be able to be closed() more than once
-* [ARROW-1285](https://issues.apache.org/jira/browse/ARROW-1285) - PYTHON: NotImplemented exception creates empty parquet file
-* [ARROW-1287](https://issues.apache.org/jira/browse/ARROW-1287) - [Python] Emulate "whence" argument of seek in NativeFile
-* [ARROW-1290](https://issues.apache.org/jira/browse/ARROW-1290) - [C++] Use array capacity doubling in arrow::BufferBuilder
-* [ARROW-1291](https://issues.apache.org/jira/browse/ARROW-1291) - [Python] pa.RecordBatch.from\_pandas doesn't accept DataFrame with numeric column names
-* [ARROW-1294](https://issues.apache.org/jira/browse/ARROW-1294) - [C++] New Appveyor build failures
-* [ARROW-1296](https://issues.apache.org/jira/browse/ARROW-1296) - [Java] templates/FixValueVectors reset() method doesn't set allocationSizeInBytes correctly
-* [ARROW-1300](https://issues.apache.org/jira/browse/ARROW-1300) - [JAVA] Fix ListVector Tests
-* [ARROW-1306](https://issues.apache.org/jira/browse/ARROW-1306) - [Python] Encoding? issue with error reporting for parquet.read\_table
-* [ARROW-1308](https://issues.apache.org/jira/browse/ARROW-1308) - [C++] ld tries to link 'arrow\_static' even when -DARROW\_BUILD\_STATIC=off
-* [ARROW-1309](https://issues.apache.org/jira/browse/ARROW-1309) - [Python] Error inferring List type in Array.from\_pandas when inner values are all None
-* [ARROW-1310](https://issues.apache.org/jira/browse/ARROW-1310) - [JAVA] Revert ARROW-886
-* [ARROW-1311](https://issues.apache.org/jira/browse/ARROW-1311) - python hangs after writing a few parquet tables
-* [ARROW-1312](https://issues.apache.org/jira/browse/ARROW-1312) - [C++] Set default value of ARROW\_JEMALLOC to OFF until ARROW-1282 is resolved
-* [ARROW-1326](https://issues.apache.org/jira/browse/ARROW-1326) - [Python] Fix Sphinx build in Travis CI
-* [ARROW-1327](https://issues.apache.org/jira/browse/ARROW-1327) - [Python] Failing to release GIL in MemoryMappedFile.\_open causes deadlock
-* [ARROW-1328](https://issues.apache.org/jira/browse/ARROW-1328) - [Python] pyarrow.Table.from\_pandas option timestamps\_to\_ms changes column values
-* [ARROW-1330](https://issues.apache.org/jira/browse/ARROW-1330) - [Plasma] Turn on plasma tests on manylinux1
-* [ARROW-1335](https://issues.apache.org/jira/browse/ARROW-1335) - [C++] PrimitiveArray::raw\_values has inconsistent semantics re: offsets compared with subclasses
-* [ARROW-1338](https://issues.apache.org/jira/browse/ARROW-1338) - [Python] Investigate non-deterministic core dump on Python 2.7, Travis CI builds
-* [ARROW-1340](https://issues.apache.org/jira/browse/ARROW-1340) - [Java] NullableMapVector field doesn't maintain metadata
-* [ARROW-1342](https://issues.apache.org/jira/browse/ARROW-1342) - [Python] Support strided array of lists
-* [ARROW-1343](https://issues.apache.org/jira/browse/ARROW-1343) - [Format/Java/C++] Ensuring encapsulated stream / IPC message sizes are always a multiple of 8
-* [ARROW-1350](https://issues.apache.org/jira/browse/ARROW-1350) - [C++] Include Plasma source tree in source distribution
-
-
-## New Features and Improvements
-
-* [ARROW-439](https://issues.apache.org/jira/browse/ARROW-439) - [Python] Add option in "to\_pandas" conversions to yield Categorical from String/Binary arrays
-* [ARROW-622](https://issues.apache.org/jira/browse/ARROW-622) - [Python] Investigate alternatives to timestamps\_to\_ms argument in pandas conversion
-* [ARROW-1076](https://issues.apache.org/jira/browse/ARROW-1076) - [Python] Handle nanosecond timestamps more gracefully when writing to Parquet format
-* [ARROW-1093](https://issues.apache.org/jira/browse/ARROW-1093) - [Python] Fail Python builds if flake8 yields warnings
-* [ARROW-1104](https://issues.apache.org/jira/browse/ARROW-1104) - Integrate in-memory object store from Ray
-* [ARROW-1116](https://issues.apache.org/jira/browse/ARROW-1116) - [Python] Create single external GitHub repo for building wheels for all platforms in one shot
-* [ARROW-1121](https://issues.apache.org/jira/browse/ARROW-1121) - [C++] Improve error message when opening OS file fails
-* [ARROW-1140](https://issues.apache.org/jira/browse/ARROW-1140) - [C++] Allow optional build of plasma
-* [ARROW-1149](https://issues.apache.org/jira/browse/ARROW-1149) - [Plasma] Create Cython client library for Plasma
-* [ARROW-1173](https://issues.apache.org/jira/browse/ARROW-1173) - [Plasma] Blog post for Plasma
-* [ARROW-1211](https://issues.apache.org/jira/browse/ARROW-1211) - [C++] Consider making default\_memory\_pool() the default for builder classes
-* [ARROW-1213](https://issues.apache.org/jira/browse/ARROW-1213) - [Python] Enable s3fs to be used with ParquetDataset and reader/writer functions
-* [ARROW-1219](https://issues.apache.org/jira/browse/ARROW-1219) - [C++] Use more vanilla Google C++ formatting
-* [ARROW-1224](https://issues.apache.org/jira/browse/ARROW-1224) - [Format] Clarify language around buffer padding and alignment in IPC
-* [ARROW-1230](https://issues.apache.org/jira/browse/ARROW-1230) - [Plasma] Install libraries and headers
-* [ARROW-1243](https://issues.apache.org/jira/browse/ARROW-1243) - [Java] security: upgrade all libraries to latest stable versions
-* [ARROW-1246](https://issues.apache.org/jira/browse/ARROW-1246) - [Format] Add Map logical type to metadata
-* [ARROW-1251](https://issues.apache.org/jira/browse/ARROW-1251) - [Python/C++] Revise build documentation to account for latest build toolchain
-* [ARROW-1253](https://issues.apache.org/jira/browse/ARROW-1253) - [C++] Use pre-built toolchain libraries where prudent to speed up CI builds
-* [ARROW-1255](https://issues.apache.org/jira/browse/ARROW-1255) - [Plasma] Check plasma flatbuffer messages with the flatbuffer verifier
-* [ARROW-1256](https://issues.apache.org/jira/browse/ARROW-1256) - [Plasma] Fix compile warnings on macOS
-* [ARROW-1257](https://issues.apache.org/jira/browse/ARROW-1257) - [Plasma] Plasma documentation
-* [ARROW-1258](https://issues.apache.org/jira/browse/ARROW-1258) - [C++] Suppress dlmalloc warnings on Clang
-* [ARROW-1259](https://issues.apache.org/jira/browse/ARROW-1259) - [Plasma] Speed up Plasma tests
-* [ARROW-1260](https://issues.apache.org/jira/browse/ARROW-1260) - [Plasma] Use factory method to create Python PlasmaClient
-* [ARROW-1264](https://issues.apache.org/jira/browse/ARROW-1264) - [Plasma] Don't exit the Python interpreter if the plasma client can't connect to the store
-* [ARROW-1268](https://issues.apache.org/jira/browse/ARROW-1268) - [Website] Blog post on Arrow integration with Spark
-* [ARROW-1270](https://issues.apache.org/jira/browse/ARROW-1270) - [Packaging] Add Python wheel build scripts for macOS to arrow-dist
-* [ARROW-1272](https://issues.apache.org/jira/browse/ARROW-1272) - [Python] Add script to arrow-dist to generate and upload manylinux1 Python wheels
-* [ARROW-1273](https://issues.apache.org/jira/browse/ARROW-1273) - [Python] Add convenience functions for reading only Parquet metadata or effective Arrow schema from a particular Parquet file
-* [ARROW-1274](https://issues.apache.org/jira/browse/ARROW-1274) - [C++] add\_compiler\_export\_flags() throws warning with CMake \>= 3.3
-* [ARROW-1281](https://issues.apache.org/jira/browse/ARROW-1281) - [C++/Python] Add Docker setup for running HDFS tests and other tests we may not run in Travis CI
-* [ARROW-1288](https://issues.apache.org/jira/browse/ARROW-1288) - Clean up many ASF license headers
-* [ARROW-1289](https://issues.apache.org/jira/browse/ARROW-1289) - [Python] Add PYARROW\_BUILD\_PLASMA option like Parquet
-* [ARROW-1297](https://issues.apache.org/jira/browse/ARROW-1297) - 0.6.0 Release
-* [ARROW-1301](https://issues.apache.org/jira/browse/ARROW-1301) - [C++/Python] Add remaining supported libhdfs UNIX-like filesystem APIs
-* [ARROW-1303](https://issues.apache.org/jira/browse/ARROW-1303) - [C++] Support downloading Boost
-* [ARROW-1304](https://issues.apache.org/jira/browse/ARROW-1304) - [Java] Fix checkstyle checks warning
-* [ARROW-1305](https://issues.apache.org/jira/browse/ARROW-1305) - [GLib] Add GArrowIntArrayBuilder
-* [ARROW-1315](https://issues.apache.org/jira/browse/ARROW-1315) - [GLib] Status check of arrow::ArrayBuilder::Finish() is missing
-* [ARROW-1323](https://issues.apache.org/jira/browse/ARROW-1323) - [GLib] Add garrow\_boolean\_array\_get\_values()
-* [ARROW-1333](https://issues.apache.org/jira/browse/ARROW-1333) - [Plasma] Sorting example for DataFrames in plasma
-* [ARROW-1334](https://issues.apache.org/jira/browse/ARROW-1334) - [C++] Instantiate arrow::Table from vector of Array objects (instead of Columns)
-* [ARROW-1336](https://issues.apache.org/jira/browse/ARROW-1336) - [C++] Add arrow::schema factory function
-* [ARROW-1353](https://issues.apache.org/jira/browse/ARROW-1353) - [Website] Updates + blog post for 0.6.0 release
-
-
-
-# Apache Arrow 0.5.0 (2017-07-23)
-
-## New Features and Improvements
-
-* [ARROW-111](https://issues.apache.org/jira/browse/ARROW-111) - [C++] Add static analyzer to tool chain to verify checking of Status returns
-* [ARROW-195](https://issues.apache.org/jira/browse/ARROW-195) - [C++] Upgrade clang bits to clang-3.8 and move back to trusty.
-* [ARROW-460](https://issues.apache.org/jira/browse/ARROW-460) - [C++] Implement JSON round trip for DictionaryArray
-* [ARROW-462](https://issues.apache.org/jira/browse/ARROW-462) - [C++] Implement in-memory conversions between non-nested primitive types and DictionaryArray equivalent
-* [ARROW-575](https://issues.apache.org/jira/browse/ARROW-575) - Python: Auto-detect nested lists and nested numpy arrays in Pandas
-* [ARROW-597](https://issues.apache.org/jira/browse/ARROW-597) - [Python] Add convenience function to yield DataFrame from any object that a StreamReader or FileReader can read from
-* [ARROW-599](https://issues.apache.org/jira/browse/ARROW-599) - [C++] Add LZ4 codec to 3rd-party toolchain
-* [ARROW-600](https://issues.apache.org/jira/browse/ARROW-600) - [C++] Add ZSTD codec to 3rd-party toolchain
-* [ARROW-692](https://issues.apache.org/jira/browse/ARROW-692) - Java<-\>C++ Integration tests for dictionary-encoded vectors
-* [ARROW-693](https://issues.apache.org/jira/browse/ARROW-693) - [Java] Add JSON support for dictionary vectors
-* [ARROW-742](https://issues.apache.org/jira/browse/ARROW-742) - Handling exceptions during execution of std::wstring\_convert
-* [ARROW-834](https://issues.apache.org/jira/browse/ARROW-834) - [Python] Support creating Arrow arrays from Python iterables
-* [ARROW-915](https://issues.apache.org/jira/browse/ARROW-915) - Limited support for Struct Array reads
-* [ARROW-935](https://issues.apache.org/jira/browse/ARROW-935) - [Java] Build Javadoc in Travis CI
-* [ARROW-960](https://issues.apache.org/jira/browse/ARROW-960) - [Python] Add source build guide for macOS + Homebrew
-* [ARROW-962](https://issues.apache.org/jira/browse/ARROW-962) - [Python] Add schema attribute to FileReader
-* [ARROW-964](https://issues.apache.org/jira/browse/ARROW-964) - [Python] Improve api docs
-* [ARROW-966](https://issues.apache.org/jira/browse/ARROW-966) - [Python] pyarrow.list\_ should also accept Field instance
-* [ARROW-978](https://issues.apache.org/jira/browse/ARROW-978) - [Python] Use sphinx-bootstrap-theme for Sphinx documentation
-* [ARROW-1041](https://issues.apache.org/jira/browse/ARROW-1041) - [Python] Support read\_pandas on a directory of Parquet files
-* [ARROW-1048](https://issues.apache.org/jira/browse/ARROW-1048) - Allow user LD\_LIBRARY\_PATH to be used with source release script
-* [ARROW-1052](https://issues.apache.org/jira/browse/ARROW-1052) - Arrow 0.5.0 release
-* [ARROW-1071](https://issues.apache.org/jira/browse/ARROW-1071) - [Python] RecordBatchFileReader does not have a schema property
-* [ARROW-1073](https://issues.apache.org/jira/browse/ARROW-1073) - C++: Adaptive integer builder
-* [ARROW-1095](https://issues.apache.org/jira/browse/ARROW-1095) - [Website] Add Arrow icon asset
-* [ARROW-1100](https://issues.apache.org/jira/browse/ARROW-1100) - [Python] Add "mode" property to NativeFile instances
-* [ARROW-1102](https://issues.apache.org/jira/browse/ARROW-1102) - Make MessageSerializer.serializeMessage() public
-* [ARROW-1120](https://issues.apache.org/jira/browse/ARROW-1120) - [Python] Write support for int96
-* [ARROW-1122](https://issues.apache.org/jira/browse/ARROW-1122) - [Website] Guest blog post on Arrow + ODBC from turbodbc
-* [ARROW-1123](https://issues.apache.org/jira/browse/ARROW-1123) - C++: Make jemalloc the default allocator
-* [ARROW-1135](https://issues.apache.org/jira/browse/ARROW-1135) - Upgrade Travis CI clang builds to use LLVM 4.0
-* [ARROW-1137](https://issues.apache.org/jira/browse/ARROW-1137) - Python: Ensure Pandas roundtrip of all-None column
-* [ARROW-1142](https://issues.apache.org/jira/browse/ARROW-1142) - [C++] Move over compression library toolchain from parquet-cpp
-* [ARROW-1145](https://issues.apache.org/jira/browse/ARROW-1145) - [GLib] Add get\_values()
-* [ARROW-1146](https://issues.apache.org/jira/browse/ARROW-1146) - Add .gitignore for \*\_generated.h files in src/plasma/format
-* [ARROW-1148](https://issues.apache.org/jira/browse/ARROW-1148) - [C++] Raise minimum CMake version to 3.2
-* [ARROW-1151](https://issues.apache.org/jira/browse/ARROW-1151) - [C++] Add gcc branch prediction to status check macro
-* [ARROW-1154](https://issues.apache.org/jira/browse/ARROW-1154) - [C++] Migrate more computational utility code from parquet-cpp
-* [ARROW-1160](https://issues.apache.org/jira/browse/ARROW-1160) - C++: Implement DictionaryBuilder
-* [ARROW-1165](https://issues.apache.org/jira/browse/ARROW-1165) - [C++] Refactor PythonDecimalToArrowDecimal to not use templates
-* [ARROW-1172](https://issues.apache.org/jira/browse/ARROW-1172) - [C++] Use unique\_ptr with array builder classes
-* [ARROW-1183](https://issues.apache.org/jira/browse/ARROW-1183) - [Python] Implement time type conversions in to\_pandas
-* [ARROW-1185](https://issues.apache.org/jira/browse/ARROW-1185) - [C++] Clean up arrow::Status implementation, add warn\_unused\_result attribute for clang
-* [ARROW-1187](https://issues.apache.org/jira/browse/ARROW-1187) - Serialize a DataFrame with None column
-* [ARROW-1193](https://issues.apache.org/jira/browse/ARROW-1193) - [C++] Support pkg-config for arrow\_python.so
-* [ARROW-1196](https://issues.apache.org/jira/browse/ARROW-1196) - [C++] Appveyor separate jobs for Debug/Release builds from sources; Build with conda toolchain; Build with NMake Makefiles Generator
-* [ARROW-1198](https://issues.apache.org/jira/browse/ARROW-1198) - Python: Add public C++ API to unwrap PyArrow object
-* [ARROW-1199](https://issues.apache.org/jira/browse/ARROW-1199) - [C++] Introduce mutable POD struct for generic array data
-* [ARROW-1202](https://issues.apache.org/jira/browse/ARROW-1202) - Remove semicolons from status macros
-* [ARROW-1212](https://issues.apache.org/jira/browse/ARROW-1212) - [GLib] Add garrow\_binary\_array\_get\_offsets\_buffer()
-* [ARROW-1214](https://issues.apache.org/jira/browse/ARROW-1214) - [Python] Add classes / functions to enable stream message components to be handled outside of the stream reader class
-* [ARROW-1217](https://issues.apache.org/jira/browse/ARROW-1217) - [GLib] Add GInputStream based arrow::io::RandomAccessFile
-* [ARROW-1220](https://issues.apache.org/jira/browse/ARROW-1220) - [C++] Standardize usage of \*\_HOME cmake script variables for 3rd party libs
-* [ARROW-1221](https://issues.apache.org/jira/browse/ARROW-1221) - [C++] Pin clang-format version
-* [ARROW-1227](https://issues.apache.org/jira/browse/ARROW-1227) - [GLib] Support GOutputStream
-* [ARROW-1229](https://issues.apache.org/jira/browse/ARROW-1229) - [GLib] Follow Reader API change (get -\> read)
-* [ARROW-1244](https://issues.apache.org/jira/browse/ARROW-1244) - [C++] Do not include cpp/src/plasma in source release pending IP clearance
-* [ARROW-1252](https://issues.apache.org/jira/browse/ARROW-1252) - [Website] Update for 0.5.0 release, add blog post summarizing changes from 0.4.x
-
-
-## Bug Fixes
-
-* [ARROW-288](https://issues.apache.org/jira/browse/ARROW-288) - Implement Arrow adapter for Spark Datasets
-* [ARROW-601](https://issues.apache.org/jira/browse/ARROW-601) - Some logical types not supported when loading Parquet
-* [ARROW-784](https://issues.apache.org/jira/browse/ARROW-784) - Cleaning up thirdparty toolchain support in Arrow on Windows
-* [ARROW-785](https://issues.apache.org/jira/browse/ARROW-785) - possible issue on writing parquet via pyarrow, subsequently read in Hive
-* [ARROW-924](https://issues.apache.org/jira/browse/ARROW-924) - Setting GTEST\_HOME Fails on CMake run
-* [ARROW-992](https://issues.apache.org/jira/browse/ARROW-992) - [Python] In place development builds do not have a \_\_version\_\_
-* [ARROW-1043](https://issues.apache.org/jira/browse/ARROW-1043) - [Python] Make sure pandas metadata created by arrow conforms to the pandas spec
-* [ARROW-1074](https://issues.apache.org/jira/browse/ARROW-1074) - from\_pandas doesn't convert ndarray to list
-* [ARROW-1079](https://issues.apache.org/jira/browse/ARROW-1079) - [Python] Empty "private" directories should be ignored by Parquet interface
-* [ARROW-1081](https://issues.apache.org/jira/browse/ARROW-1081) - C++: arrow::test::TestBase::MakePrimitive doesn't fill null\_bitmap
-* [ARROW-1096](https://issues.apache.org/jira/browse/ARROW-1096) - [C++] Memory mapping file over 4GB fails on Windows
-* [ARROW-1097](https://issues.apache.org/jira/browse/ARROW-1097) - Reading tensor needs file to be opened in writeable mode
-* [ARROW-1098](https://issues.apache.org/jira/browse/ARROW-1098) - Document Error?
-* [ARROW-1101](https://issues.apache.org/jira/browse/ARROW-1101) - UnionListWriter is not implementing all methods on interface ScalarWriter
-* [ARROW-1103](https://issues.apache.org/jira/browse/ARROW-1103) - [Python] Utilize pandas metadata from common \_metadata Parquet file if it exists
-* [ARROW-1107](https://issues.apache.org/jira/browse/ARROW-1107) - [JAVA] NullableMapVector getField() should return nullable type
-* [ARROW-1108](https://issues.apache.org/jira/browse/ARROW-1108) - Check if ArrowBuf is empty buffer in getActualConsumedMemory() and getPossibleConsumedMemory()
-* [ARROW-1109](https://issues.apache.org/jira/browse/ARROW-1109) - [JAVA] transferOwnership fails when readerIndex is not 0
-* [ARROW-1110](https://issues.apache.org/jira/browse/ARROW-1110) - [JAVA] make union vector naming consistent
-* [ARROW-1111](https://issues.apache.org/jira/browse/ARROW-1111) - [JAVA] Make aligning buffers optional, and allow -1 for unknown null count
-* [ARROW-1112](https://issues.apache.org/jira/browse/ARROW-1112) - [JAVA] Set lastSet for VarLength and List vectors when loading
-* [ARROW-1113](https://issues.apache.org/jira/browse/ARROW-1113) - [C++] gflags EP build gets triggered (as a no-op) on subsequent calls to make or ninja build
-* [ARROW-1115](https://issues.apache.org/jira/browse/ARROW-1115) - [C++] Use absolute path for ccache
-* [ARROW-1117](https://issues.apache.org/jira/browse/ARROW-1117) - [Docs] Minor issues in GLib README
-* [ARROW-1124](https://issues.apache.org/jira/browse/ARROW-1124) - [Python] pyarrow needs to depend on numpy\>=1.10 (not 1.9)
-* [ARROW-1125](https://issues.apache.org/jira/browse/ARROW-1125) - Python: Table.from\_pandas doesn't work anymore on partial schemas
-* [ARROW-1128](https://issues.apache.org/jira/browse/ARROW-1128) - [Docs] command to build a wheel is not properly rendered
-* [ARROW-1129](https://issues.apache.org/jira/browse/ARROW-1129) - [C++] Fix Linux toolchain build regression from ARROW-742
-* [ARROW-1130](https://issues.apache.org/jira/browse/ARROW-1130) - io-hdfs-test failure
-* [ARROW-1131](https://issues.apache.org/jira/browse/ARROW-1131) - Python: Parquet unit tests are always skipped
-* [ARROW-1132](https://issues.apache.org/jira/browse/ARROW-1132) - [Python] Unable to write pandas DataFrame w/MultiIndex containing duplicate values to parquet
-* [ARROW-1136](https://issues.apache.org/jira/browse/ARROW-1136) - [C++/Python] Segfault on empty stream
-* [ARROW-1138](https://issues.apache.org/jira/browse/ARROW-1138) - Travis: Use OpenJDK7 instead of OracleJDK7
-* [ARROW-1139](https://issues.apache.org/jira/browse/ARROW-1139) - [C++] dlmalloc doesn't allow arrow to be built with clang 4 or gcc 7.1.1
-* [ARROW-1141](https://issues.apache.org/jira/browse/ARROW-1141) - on import get libjemalloc.so.2: cannot allocate memory in static TLS block
-* [ARROW-1143](https://issues.apache.org/jira/browse/ARROW-1143) - C++: Fix comparison of NullArray
-* [ARROW-1144](https://issues.apache.org/jira/browse/ARROW-1144) - [C++] Remove unused variable
-* [ARROW-1147](https://issues.apache.org/jira/browse/ARROW-1147) - [C++] Allow optional vendoring of flatbuffers in plasma
-* [ARROW-1150](https://issues.apache.org/jira/browse/ARROW-1150) - [C++] AdaptiveIntBuilder compiler warning on MSVC
-* [ARROW-1152](https://issues.apache.org/jira/browse/ARROW-1152) - [Cython] read\_tensor should work with a readable file
-* [ARROW-1153](https://issues.apache.org/jira/browse/ARROW-1153) - All non-Pandas column throws NotImplemented: unhandled type
-* [ARROW-1155](https://issues.apache.org/jira/browse/ARROW-1155) - segmentation fault when running pa.Int16Value()
-* [ARROW-1157](https://issues.apache.org/jira/browse/ARROW-1157) - C++/Python: Decimal templates are not correctly exported on OSX
-* [ARROW-1159](https://issues.apache.org/jira/browse/ARROW-1159) - [C++] Static data members cannot be accessed from inline functions in Arrow headers by thirdparty users
-* [ARROW-1162](https://issues.apache.org/jira/browse/ARROW-1162) - Transfer Between Empty Lists Should Not Invoke Callback
-* [ARROW-1164](https://issues.apache.org/jira/browse/ARROW-1164) - C++: Templated functions need ARROW\_EXPORT instead of ARROW\_TEMPLATE\_EXPORT
-* [ARROW-1166](https://issues.apache.org/jira/browse/ARROW-1166) - Errors in Struct type's example and missing reference in Layout.md
-* [ARROW-1167](https://issues.apache.org/jira/browse/ARROW-1167) - [Python] Create chunked BinaryArray in Table.from\_pandas when a column's data exceeds 2GB
-* [ARROW-1168](https://issues.apache.org/jira/browse/ARROW-1168) - [Python] pandas metadata may contain "mixed" data types
-* [ARROW-1169](https://issues.apache.org/jira/browse/ARROW-1169) - C++: jemalloc externalproject doesn't build with CMake's ninja generator
-* [ARROW-1170](https://issues.apache.org/jira/browse/ARROW-1170) - C++: ARROW\_JEMALLOC=OFF breaks linking on unittest
-* [ARROW-1174](https://issues.apache.org/jira/browse/ARROW-1174) - [GLib] Investigate root cause of ListArray glib test failure
-* [ARROW-1177](https://issues.apache.org/jira/browse/ARROW-1177) - [C++] Detect int32 overflow in ListBuilder::Append
-* [ARROW-1179](https://issues.apache.org/jira/browse/ARROW-1179) - C++: Add missing virtual destructors
-* [ARROW-1180](https://issues.apache.org/jira/browse/ARROW-1180) - [GLib] garrow\_tensor\_get\_dimension\_name() returns invalid address
-* [ARROW-1181](https://issues.apache.org/jira/browse/ARROW-1181) - [Python] Parquet test fail if not enabled
-* [ARROW-1182](https://issues.apache.org/jira/browse/ARROW-1182) - C++: Specify BUILD\_BYPRODUCTS for zlib and zstd
-* [ARROW-1186](https://issues.apache.org/jira/browse/ARROW-1186) - [C++] Enable option to build arrow with minimal dependencies needed to build Parquet library
-* [ARROW-1188](https://issues.apache.org/jira/browse/ARROW-1188) - Segfault when trying to serialize a DataFrame with Null-only Categorical Column
-* [ARROW-1190](https://issues.apache.org/jira/browse/ARROW-1190) - VectorLoader corrupts vectors with duplicate names
-* [ARROW-1191](https://issues.apache.org/jira/browse/ARROW-1191) - [JAVA] Implement getField() method for the complex readers
-* [ARROW-1194](https://issues.apache.org/jira/browse/ARROW-1194) - Getting record batch size with pa.get\_record\_batch\_size returns a size that is too small for pandas DataFrame.
-* [ARROW-1197](https://issues.apache.org/jira/browse/ARROW-1197) - [GLib] record\_batch.hpp Inclusion is missing
-* [ARROW-1200](https://issues.apache.org/jira/browse/ARROW-1200) - [C++] DictionaryBuilder should use signed integers for indices
-* [ARROW-1201](https://issues.apache.org/jira/browse/ARROW-1201) - [Python] Incomplete Python types cause a core dump when repr-ing
-* [ARROW-1203](https://issues.apache.org/jira/browse/ARROW-1203) - [C++] Disallow BinaryBuilder to append byte strings larger than the maximum value of int32\_t
-* [ARROW-1205](https://issues.apache.org/jira/browse/ARROW-1205) - C++: Reference to type objects in ArrayLoader may cause segmentation faults.
-* [ARROW-1206](https://issues.apache.org/jira/browse/ARROW-1206) - [C++] Enable MSVC builds to work with some compression library support disabled
-* [ARROW-1208](https://issues.apache.org/jira/browse/ARROW-1208) - [C++] Toolchain build with ZSTD library from conda-forge failure
-* [ARROW-1215](https://issues.apache.org/jira/browse/ARROW-1215) - [Python] Class methods in API reference
-* [ARROW-1216](https://issues.apache.org/jira/browse/ARROW-1216) - Numpy arrays cannot be created from Arrow Buffers on Python 2
-* [ARROW-1218](https://issues.apache.org/jira/browse/ARROW-1218) - Arrow doesn't compile if all compression libraries are deactivated
-* [ARROW-1222](https://issues.apache.org/jira/browse/ARROW-1222) - [Python] pyarrow.array returns NullArray for array of unsupported Python objects
-* [ARROW-1223](https://issues.apache.org/jira/browse/ARROW-1223) - [GLib] Fix function name that returns wrapped object
-* [ARROW-1228](https://issues.apache.org/jira/browse/ARROW-1228) - [GLib] Test file name should be the same name as target class
-* [ARROW-1233](https://issues.apache.org/jira/browse/ARROW-1233) - [C++] Validate cmake script resolving of 3rd party linked libs from correct location in toolchain build
-* [ARROW-1235](https://issues.apache.org/jira/browse/ARROW-1235) - [C++] macOS linker failure with operator<< and std::ostream
-* [ARROW-1236](https://issues.apache.org/jira/browse/ARROW-1236) - Library paths in exported pkg-config file are incorrect
-* [ARROW-1284](https://issues.apache.org/jira/browse/ARROW-1284) - Windows can't install pyarrow 0.4.1 and 0.5.0
-
-
-
-# Apache Arrow 0.4.1 (2017-06-09)
-
-## Bug Fixes
-
-* [ARROW-424](https://issues.apache.org/jira/browse/ARROW-424) - [C++] Threadsafety in arrow/io/hdfs.h
-* [ARROW-1039](https://issues.apache.org/jira/browse/ARROW-1039) - Python: pyarrow.Filesystem.read\_parquet causing error if nthreads\>1
-* [ARROW-1050](https://issues.apache.org/jira/browse/ARROW-1050) - [C++] Export arrow::ValidateArray
-* [ARROW-1051](https://issues.apache.org/jira/browse/ARROW-1051) - [Python] If pyarrow.parquet fails to import due to a shared library ABI conflict, the test\_parquet.py tests silently do not run
-* [ARROW-1056](https://issues.apache.org/jira/browse/ARROW-1056) - [Python] Parquet+HDFS test failure due to writing pandas index
-* [ARROW-1057](https://issues.apache.org/jira/browse/ARROW-1057) - Fix cmake warning and msvc debug asserts
-* [ARROW-1060](https://issues.apache.org/jira/browse/ARROW-1060) - [Python] Add unit test for ARROW-1053
-* [ARROW-1062](https://issues.apache.org/jira/browse/ARROW-1062) - [GLib] Examples use old API
-* [ARROW-1066](https://issues.apache.org/jira/browse/ARROW-1066) - remove warning on feather for pandas \>= 0.20.1
-* [ARROW-1070](https://issues.apache.org/jira/browse/ARROW-1070) - [C++] Feather files for date/time types should be written with the physical types
-* [ARROW-1075](https://issues.apache.org/jira/browse/ARROW-1075) - [GLib] Build error on macOS
-* [ARROW-1082](https://issues.apache.org/jira/browse/ARROW-1082) - [GLib] Add CI on macOS
-* [ARROW-1085](https://issues.apache.org/jira/browse/ARROW-1085) - [java] Follow up on template cleanup. Missing method for IntervalYear
-* [ARROW-1086](https://issues.apache.org/jira/browse/ARROW-1086) - [Python] pyarrow 0.4.0 on pypi is missing pxd files
-* [ARROW-1088](https://issues.apache.org/jira/browse/ARROW-1088) - [Python] test\_unicode\_filename test fails when unicode filenames aren't supported by system
-* [ARROW-1090](https://issues.apache.org/jira/browse/ARROW-1090) - [Python] build\_ext usability
-* [ARROW-1091](https://issues.apache.org/jira/browse/ARROW-1091) - Decimal scale and precision are flipped
-* [ARROW-1092](https://issues.apache.org/jira/browse/ARROW-1092) - More Decimal and scale flipped follow-up
-* [ARROW-1094](https://issues.apache.org/jira/browse/ARROW-1094) - [C++] Incomplete buffer reads in arrow::io::ReadableFile should exactly truncate returned buffer
-* [ARROW-1127](https://issues.apache.org/jira/browse/ARROW-1127) - pyarrow 4.1 import failure on Travis
-
-
-## New Features and Improvements
-
-* [ARROW-897](https://issues.apache.org/jira/browse/ARROW-897) - [GLib] Build arrow-glib as a separate build in the Travis CI build matrix
-* [ARROW-986](https://issues.apache.org/jira/browse/ARROW-986) - [Format] Update IPC.md to account for dictionary batches
-* [ARROW-990](https://issues.apache.org/jira/browse/ARROW-990) - [JS] Add tslint support for linting TypeScript
-* [ARROW-1020](https://issues.apache.org/jira/browse/ARROW-1020) - [Format] Add additional language to Schema.fbs to clarify naive vs. localized Timestamp values
-* [ARROW-1034](https://issues.apache.org/jira/browse/ARROW-1034) - [Python] Enable creation of binary wheels on Windows / MSVC
-* [ARROW-1049](https://issues.apache.org/jira/browse/ARROW-1049) - [java] vector template cleanup
-* [ARROW-1063](https://issues.apache.org/jira/browse/ARROW-1063) - [Website] Blog post and website updates for 0.4.0 release
-* [ARROW-1068](https://issues.apache.org/jira/browse/ARROW-1068) - [Python] Create external repo with appveyor.yml configured for building Python wheel installers
-* [ARROW-1069](https://issues.apache.org/jira/browse/ARROW-1069) - Add instructions for publishing maven artifacts
-* [ARROW-1078](https://issues.apache.org/jira/browse/ARROW-1078) - [Python] Account for PARQUET-967
-* [ARROW-1080](https://issues.apache.org/jira/browse/ARROW-1080) - C++: Add tutorial about converting to/from row-wise representation
-* [ARROW-1084](https://issues.apache.org/jira/browse/ARROW-1084) - Implementations of BufferAllocator should handle Netty's OutOfDirectMemoryError
-* [ARROW-1118](https://issues.apache.org/jira/browse/ARROW-1118) - [Website] Site updates for 0.4.1
-
-
-
-# Apache Arrow 0.4.0 (2017-05-22)
-
-## Bug Fixes
-
-* [ARROW-813](https://issues.apache.org/jira/browse/ARROW-813) - [Python] setup.py sdist must also bundle dependent cmake modules
-* [ARROW-824](https://issues.apache.org/jira/browse/ARROW-824) - Date and Time Vectors should reflect timezone-less semantics
-* [ARROW-856](https://issues.apache.org/jira/browse/ARROW-856) - CMake error caused by unknown compiler.
-* [ARROW-909](https://issues.apache.org/jira/browse/ARROW-909) - libjemalloc.so.2: cannot open shared object file:
-* [ARROW-939](https://issues.apache.org/jira/browse/ARROW-939) - Fix division by zero for zero-dimensional Tensors
-* [ARROW-940](https://issues.apache.org/jira/browse/ARROW-940) - [JS] Generate multiple sets of artifacts
-* [ARROW-944](https://issues.apache.org/jira/browse/ARROW-944) - Python: Compat broken for pandas==0.18.1
-* [ARROW-948](https://issues.apache.org/jira/browse/ARROW-948) - [GLib] Update C++ header file list
-* [ARROW-952](https://issues.apache.org/jira/browse/ARROW-952) - Compilation error on macOS with clang-802.0.42
-* [ARROW-958](https://issues.apache.org/jira/browse/ARROW-958) - [Python] Conda build guide still needs ARROW\_HOME, PARQUET\_HOME
-* [ARROW-979](https://issues.apache.org/jira/browse/ARROW-979) - [Python] Fix setuptools\_scm version when release tag is not in the master timeline
-* [ARROW-991](https://issues.apache.org/jira/browse/ARROW-991) - [Python] PyArray\_SimpleNew should not be used with NPY\_DATETIME
-* [ARROW-995](https://issues.apache.org/jira/browse/ARROW-995) - [Website] 0.3 release announce has a typo in reference
-* [ARROW-998](https://issues.apache.org/jira/browse/ARROW-998) - [Doc] File format documents incorrect schema location
-* [ARROW-1003](https://issues.apache.org/jira/browse/ARROW-1003) - [C++] Hdfs and java dlls fail to load when built for Windows with MSVC
-* [ARROW-1004](https://issues.apache.org/jira/browse/ARROW-1004) - ArrowInvalid: Invalid: Python object of type float is not None and is not a string, bool, or date object
-* [ARROW-1017](https://issues.apache.org/jira/browse/ARROW-1017) - Python: Table.to\_pandas leaks memory
-* [ARROW-1023](https://issues.apache.org/jira/browse/ARROW-1023) - Python: Fix bundling of arrow-cpp for macOS
-* [ARROW-1033](https://issues.apache.org/jira/browse/ARROW-1033) - [Python] pytest discovers scripts/test\_leak.py
-* [ARROW-1045](https://issues.apache.org/jira/browse/ARROW-1045) - [JAVA] Add support for custom metadata in org.apache.arrow.vector.types.pojo.\*
-* [ARROW-1046](https://issues.apache.org/jira/browse/ARROW-1046) - [Python] Conform DataFrame metadata to pandas spec
-* [ARROW-1053](https://issues.apache.org/jira/browse/ARROW-1053) - [Python] Memory leak with RecordBatchFileReader
-* [ARROW-1054](https://issues.apache.org/jira/browse/ARROW-1054) - [Python] Test suite fails on pandas 0.19.2
-* [ARROW-1061](https://issues.apache.org/jira/browse/ARROW-1061) - [C++] Harden decimal parsing against invalid strings
-* [ARROW-1064](https://issues.apache.org/jira/browse/ARROW-1064) - ModuleNotFoundError: No module named 'pyarrow.\_parquet'
-
-
-## New Features and Improvements
-
-* [ARROW-29](https://issues.apache.org/jira/browse/ARROW-29) - C++: Add re2 as optional 3rd-party toolchain dependency
-* [ARROW-182](https://issues.apache.org/jira/browse/ARROW-182) - [C++] Remove Array::Validate virtual function and make a separate method
-* [ARROW-376](https://issues.apache.org/jira/browse/ARROW-376) - Python: Convert non-range Pandas indices (optionally) to Arrow
-* [ARROW-446](https://issues.apache.org/jira/browse/ARROW-446) - [Python] Document NativeFile interfaces, HDFS client in Sphinx
-* [ARROW-482](https://issues.apache.org/jira/browse/ARROW-482) - [Java] Provide API access to "custom\_metadata" Field attribute in IPC setting
-* [ARROW-532](https://issues.apache.org/jira/browse/ARROW-532) - [Python] Expand pyarrow.parquet documentation for 0.3 release
-* [ARROW-579](https://issues.apache.org/jira/browse/ARROW-579) - Python: Provide redistributable pyarrow wheels on OSX
-* [ARROW-596](https://issues.apache.org/jira/browse/ARROW-596) - [Python] Add convenience function to convert pandas.DataFrame to pyarrow.Buffer containing a file or stream representation
-* [ARROW-629](https://issues.apache.org/jira/browse/ARROW-629) - [JS] Add unit test suite
-* [ARROW-714](https://issues.apache.org/jira/browse/ARROW-714) - [C++] Add import\_pyarrow C API in the style of NumPy for thirdparty C++ users
-* [ARROW-819](https://issues.apache.org/jira/browse/ARROW-819) - [Python] Define public Cython API
-* [ARROW-872](https://issues.apache.org/jira/browse/ARROW-872) - [JS] Read streaming format
-* [ARROW-873](https://issues.apache.org/jira/browse/ARROW-873) - [JS] Implement fixed width list type
-* [ARROW-874](https://issues.apache.org/jira/browse/ARROW-874) - [JS] Read dictionary-encoded vectors
-* [ARROW-881](https://issues.apache.org/jira/browse/ARROW-881) - [Python] Reconstruct Pandas DataFrame indexes using custom\_metadata
-* [ARROW-891](https://issues.apache.org/jira/browse/ARROW-891) - [Python] Expand Windows build instructions to not require looking at separate C++ docs
-* [ARROW-899](https://issues.apache.org/jira/browse/ARROW-899) - [Docs] Add CHANGELOG for 0.3.0
-* [ARROW-901](https://issues.apache.org/jira/browse/ARROW-901) - [Python] Write FixedSizeBinary to Parquet
-* [ARROW-913](https://issues.apache.org/jira/browse/ARROW-913) - [Python] Only link jemalloc to the Cython extension where it's needed
-* [ARROW-923](https://issues.apache.org/jira/browse/ARROW-923) - [Docs] Generate Changelog for website with JIRA links
-* [ARROW-929](https://issues.apache.org/jira/browse/ARROW-929) - Move KEYS file to SVN, remove from git
-* [ARROW-943](https://issues.apache.org/jira/browse/ARROW-943) - [GLib] Support running unit tests with source archive
-* [ARROW-945](https://issues.apache.org/jira/browse/ARROW-945) - [GLib] Add a Lua example to show Torch integration
-* [ARROW-946](https://issues.apache.org/jira/browse/ARROW-946) - [GLib] Use "new" instead of "open" for constructor name
-* [ARROW-947](https://issues.apache.org/jira/browse/ARROW-947) - [Python] Improve execution time of manylinux1 build
-* [ARROW-953](https://issues.apache.org/jira/browse/ARROW-953) - Use cmake / curl from conda-forge in CI builds
-* [ARROW-954](https://issues.apache.org/jira/browse/ARROW-954) - Make it possible to compile Arrow with header-only boost
-* [ARROW-956](https://issues.apache.org/jira/browse/ARROW-956) - remove pandas pre-0.20.0 compat
-* [ARROW-957](https://issues.apache.org/jira/browse/ARROW-957) - [Doc] Add HDFS and Windows documents to doxygen output
-* [ARROW-961](https://issues.apache.org/jira/browse/ARROW-961) - [Python] Rename InMemoryOutputStream to BufferOutputStream
-* [ARROW-963](https://issues.apache.org/jira/browse/ARROW-963) - [GLib] Add equal
-* [ARROW-967](https://issues.apache.org/jira/browse/ARROW-967) - [GLib] Support initializing array with buffer
-* [ARROW-970](https://issues.apache.org/jira/browse/ARROW-970) - [Python] Accidentally calling pyarrow.Table() should not segfault process
-* [ARROW-977](https://issues.apache.org/jira/browse/ARROW-977) - [java] Add Timezone aware timestamp vectors
-* [ARROW-980](https://issues.apache.org/jira/browse/ARROW-980) - Fix detection of "msvc" COMPILER\_FAMILY
-* [ARROW-982](https://issues.apache.org/jira/browse/ARROW-982) - [Website] Improve website front copy to highlight serialization efficiency benefits
-* [ARROW-984](https://issues.apache.org/jira/browse/ARROW-984) - [GLib] Add Go examples
-* [ARROW-985](https://issues.apache.org/jira/browse/ARROW-985) - [GLib] Update package information
-* [ARROW-988](https://issues.apache.org/jira/browse/ARROW-988) - [JS] Add entry to Travis CI matrix
-* [ARROW-993](https://issues.apache.org/jira/browse/ARROW-993) - [GLib] Add missing error checks in Go examples
-* [ARROW-996](https://issues.apache.org/jira/browse/ARROW-996) - [Website] Add 0.3 release announce in Japanese
-* [ARROW-997](https://issues.apache.org/jira/browse/ARROW-997) - [Java] Implement transfer in FixedSizeListVector
-* [ARROW-1000](https://issues.apache.org/jira/browse/ARROW-1000) - [GLib] Move install document to Website
-* [ARROW-1001](https://issues.apache.org/jira/browse/ARROW-1001) - [GLib] Unify writer files
-* [ARROW-1002](https://issues.apache.org/jira/browse/ARROW-1002) - [C++] It is not necessary to add padding after the magic header in the FileWriter implementation
-* [ARROW-1008](https://issues.apache.org/jira/browse/ARROW-1008) - [C++] Define abstract interface for stream iteration
-* [ARROW-1010](https://issues.apache.org/jira/browse/ARROW-1010) - [Website] Only show English posts in /blog/
-* [ARROW-1011](https://issues.apache.org/jira/browse/ARROW-1011) - [Format] Clarify requirements around buffer padding in validity bitmaps
-* [ARROW-1014](https://issues.apache.org/jira/browse/ARROW-1014) - 0.4.0 release
-* [ARROW-1015](https://issues.apache.org/jira/browse/ARROW-1015) - [Java] Implement schema-level metadata
-* [ARROW-1016](https://issues.apache.org/jira/browse/ARROW-1016) - Python: Include C++ headers (optionally) in wheels
-* [ARROW-1022](https://issues.apache.org/jira/browse/ARROW-1022) - [Python] Add nthreads option to Feather read method
-* [ARROW-1024](https://issues.apache.org/jira/browse/ARROW-1024) - Python: Update build time numpy version to 1.10.1
-* [ARROW-1025](https://issues.apache.org/jira/browse/ARROW-1025) - [Website] Improve changelog on website
-* [ARROW-1027](https://issues.apache.org/jira/browse/ARROW-1027) - [Python] Allow negative indexing in fields/columns on pyarrow Table and Schema objects
-* [ARROW-1028](https://issues.apache.org/jira/browse/ARROW-1028) - [Python] Documentation updates after ARROW-1008
-* [ARROW-1029](https://issues.apache.org/jira/browse/ARROW-1029) - [Python] Fix --with-parquet build on Windows, add unit tests to Appveyor
-* [ARROW-1030](https://issues.apache.org/jira/browse/ARROW-1030) - Python: Account for library versioning in parquet-cpp
-* [ARROW-1031](https://issues.apache.org/jira/browse/ARROW-1031) - [GLib] Support pretty print
-* [ARROW-1037](https://issues.apache.org/jira/browse/ARROW-1037) - [GLib] Follow reader name change
-* [ARROW-1038](https://issues.apache.org/jira/browse/ARROW-1038) - [GLib] Follow writer name change
-* [ARROW-1040](https://issues.apache.org/jira/browse/ARROW-1040) - [GLib] Follow tensor IO
-* [ARROW-1044](https://issues.apache.org/jira/browse/ARROW-1044) - [GLib] Support Feather
-* [ARROW-1126](https://issues.apache.org/jira/browse/ARROW-1126) - Python: Add function to convert NumPy/Pandas dtypes to Arrow DataTypes
-
-
-
-# Apache Arrow 0.3.0 (2017-05-05)
-
-## Bug Fixes
-
-* [ARROW-109](https://issues.apache.org/jira/browse/ARROW-109) - [C++] Investigate recursive data types limit in flatbuffers
-* [ARROW-208](https://issues.apache.org/jira/browse/ARROW-208) - Add checkstyle policy to java project
-* [ARROW-347](https://issues.apache.org/jira/browse/ARROW-347) - Add method to pass CallBack when creating a transfer pair
-* [ARROW-413](https://issues.apache.org/jira/browse/ARROW-413) - DATE type is not specified clearly
-* [ARROW-431](https://issues.apache.org/jira/browse/ARROW-431) - [Python] Review GIL release and acquisition in to\_pandas conversion
-* [ARROW-443](https://issues.apache.org/jira/browse/ARROW-443) - [Python] Support for converting from strided pandas data in Table.from\_pandas
-* [ARROW-451](https://issues.apache.org/jira/browse/ARROW-451) - [C++] Override DataType::Equals for other types with additional metadata
-* [ARROW-454](https://issues.apache.org/jira/browse/ARROW-454) - pojo.Field doesn't implement hashCode()
-* [ARROW-526](https://issues.apache.org/jira/browse/ARROW-526) - [Format] Update IPC.md to account for File format changes and Streaming format
-* [ARROW-565](https://issues.apache.org/jira/browse/ARROW-565) - [C++] Examine "Field::dictionary" member
-* [ARROW-570](https://issues.apache.org/jira/browse/ARROW-570) - Determine Java tools JAR location from project metadata
-* [ARROW-584](https://issues.apache.org/jira/browse/ARROW-584) - [C++] Fix compiler warnings exposed with -Wconversion
-* [ARROW-586](https://issues.apache.org/jira/browse/ARROW-586) - Problem with reading parquet files saved by Apache Spark
-* [ARROW-588](https://issues.apache.org/jira/browse/ARROW-588) - [C++] Fix compiler warnings on 32-bit platforms
-* [ARROW-595](https://issues.apache.org/jira/browse/ARROW-595) - [Python] StreamReader.schema returns None
-* [ARROW-604](https://issues.apache.org/jira/browse/ARROW-604) - Python: boxed Field instances are missing the reference to DataType
-* [ARROW-611](https://issues.apache.org/jira/browse/ARROW-611) - [Java] TimeVector TypeLayout is incorrectly specified as 64 bit width
-* [ARROW-613](https://issues.apache.org/jira/browse/ARROW-613) - [JS] Implement random-access file format
-* [ARROW-617](https://issues.apache.org/jira/browse/ARROW-617) - Time type is not specified clearly
-* [ARROW-619](https://issues.apache.org/jira/browse/ARROW-619) - Python: Fix typos in setup.py args and LD\_LIBRARY\_PATH
-* [ARROW-623](https://issues.apache.org/jira/browse/ARROW-623) - segfault with \_\_repr\_\_ of empty Field
-* [ARROW-624](https://issues.apache.org/jira/browse/ARROW-624) - [C++] Restore MakePrimitiveArray function
-* [ARROW-627](https://issues.apache.org/jira/browse/ARROW-627) - [C++] Compatibility macros for exported extern template class declarations
-* [ARROW-628](https://issues.apache.org/jira/browse/ARROW-628) - [Python] Install nomkl metapackage when building parquet-cpp for faster Travis builds
-* [ARROW-630](https://issues.apache.org/jira/browse/ARROW-630) - [C++] IPC unloading for BooleanArray does not account for offset
-* [ARROW-636](https://issues.apache.org/jira/browse/ARROW-636) - [C++] Add Boost / other system requirements to C++ README
-* [ARROW-639](https://issues.apache.org/jira/browse/ARROW-639) - [C++] Invalid offset in slices
-* [ARROW-642](https://issues.apache.org/jira/browse/ARROW-642) - [Java] Remove temporary file in java/tools
-* [ARROW-644](https://issues.apache.org/jira/browse/ARROW-644) - Python: Cython should be a setup-only requirement
-* [ARROW-652](https://issues.apache.org/jira/browse/ARROW-652) - Remove trailing f in merge script output
-* [ARROW-654](https://issues.apache.org/jira/browse/ARROW-654) - [C++] Support timezone metadata in file/stream formats
-* [ARROW-666](https://issues.apache.org/jira/browse/ARROW-666) - [Python] Error in DictionaryArray \_\_repr\_\_
-* [ARROW-667](https://issues.apache.org/jira/browse/ARROW-667) - build of arrow-master/cpp fails with altivec error?
-* [ARROW-668](https://issues.apache.org/jira/browse/ARROW-668) - [Python] Convert nanosecond timestamps to pandas.Timestamp when converting from TimestampValue
-* [ARROW-671](https://issues.apache.org/jira/browse/ARROW-671) - [GLib] License file isn't installed
-* [ARROW-673](https://issues.apache.org/jira/browse/ARROW-673) - [Java] Support additional Time metadata
-* [ARROW-677](https://issues.apache.org/jira/browse/ARROW-677) - [java] Fix checkstyle jcl-over-slf4j conflict issue
-* [ARROW-678](https://issues.apache.org/jira/browse/ARROW-678) - [GLib] Fix dependencies
-* [ARROW-680](https://issues.apache.org/jira/browse/ARROW-680) - [C++] Multiarch support impacts user-supplied install prefix
-* [ARROW-682](https://issues.apache.org/jira/browse/ARROW-682) - Add self-validation checks in integration tests
-* [ARROW-683](https://issues.apache.org/jira/browse/ARROW-683) - [C++] Support date32 (DateUnit::DAY) in IPC metadata, rename date to date64
-* [ARROW-685](https://issues.apache.org/jira/browse/ARROW-685) - [GLib] AX\_CXX\_COMPILE\_STDCXX\_11 error running ./configure
-* [ARROW-686](https://issues.apache.org/jira/browse/ARROW-686) - [C++] Account for time metadata changes, add time32 and time64 types
-* [ARROW-689](https://issues.apache.org/jira/browse/ARROW-689) - [GLib] Install header files and documents to wrong directories
-* [ARROW-691](https://issues.apache.org/jira/browse/ARROW-691) - [Java] Encode dictionary Int type in message format
-* [ARROW-697](https://issues.apache.org/jira/browse/ARROW-697) - [Java] Raise appropriate exceptions when encountering large (\> INT32\_MAX) record batches
-* [ARROW-699](https://issues.apache.org/jira/browse/ARROW-699) - [C++] Arrow dynamic libraries are missed on run of unit tests on Windows
-* [ARROW-702](https://issues.apache.org/jira/browse/ARROW-702) - Fix BitVector.copyFromSafe to reAllocate instead of returning false
-* [ARROW-703](https://issues.apache.org/jira/browse/ARROW-703) - Fix issue where setValueCount(0) doesn’t work in the case that we’ve shipped vectors across the wire
-* [ARROW-704](https://issues.apache.org/jira/browse/ARROW-704) - Fix bad import caused by conflicting changes
-* [ARROW-709](https://issues.apache.org/jira/browse/ARROW-709) - [C++] Restore type comparator for DecimalType
-* [ARROW-713](https://issues.apache.org/jira/browse/ARROW-713) - [C++] Fix linking issue with ipc benchmark
-* [ARROW-715](https://issues.apache.org/jira/browse/ARROW-715) - Python: Explicit pandas import makes it a hard requirement
-* [ARROW-716](https://issues.apache.org/jira/browse/ARROW-716) - error building arrow/python
-* [ARROW-720](https://issues.apache.org/jira/browse/ARROW-720) - [java] arrow should not have a dependency on slf4j bridges in compile
-* [ARROW-723](https://issues.apache.org/jira/browse/ARROW-723) - Arrow freezes on write if chunk\_size=0
-* [ARROW-726](https://issues.apache.org/jira/browse/ARROW-726) - [C++] PyBuffer dtor may segfault if constructor passed an object not exporting buffer protocol
-* [ARROW-732](https://issues.apache.org/jira/browse/ARROW-732) - Schema comparison bugs in struct and union types
-* [ARROW-736](https://issues.apache.org/jira/browse/ARROW-736) - [Python] Mixed-type object DataFrame columns should not silently coerce to an Arrow type by default
-* [ARROW-738](https://issues.apache.org/jira/browse/ARROW-738) - [Python] Fix manylinux1 packaging
-* [ARROW-739](https://issues.apache.org/jira/browse/ARROW-739) - Parallel build fails non-deterministically.
-* [ARROW-740](https://issues.apache.org/jira/browse/ARROW-740) - FileReader fails for large objects
-* [ARROW-747](https://issues.apache.org/jira/browse/ARROW-747) - [C++] Fix spurious warning caused by passing dl to add\_dependencies
-* [ARROW-749](https://issues.apache.org/jira/browse/ARROW-749) - [Python] Delete incomplete binary files when writing fails
-* [ARROW-753](https://issues.apache.org/jira/browse/ARROW-753) - [Python] Unit tests in arrow/python fail to link on some OS X platforms
-* [ARROW-756](https://issues.apache.org/jira/browse/ARROW-756) - [C++] Do not pass -fPIC when compiling with MSVC
-* [ARROW-757](https://issues.apache.org/jira/browse/ARROW-757) - [C++] MSVC build fails on googletest when using NMake
-* [ARROW-762](https://issues.apache.org/jira/browse/ARROW-762) - Kerberos Problem with PyArrow
-* [ARROW-776](https://issues.apache.org/jira/browse/ARROW-776) - [GLib] Cast type is wrong
-* [ARROW-777](https://issues.apache.org/jira/browse/ARROW-777) - [Java] Resolve getObject behavior per changes / discussion in ARROW-729
-* [ARROW-778](https://issues.apache.org/jira/browse/ARROW-778) - Modify merge tool to work on Windows
-* [ARROW-780](https://issues.apache.org/jira/browse/ARROW-780) - PYTHON\_EXECUTABLE Required to be set during build
-* [ARROW-781](https://issues.apache.org/jira/browse/ARROW-781) - [Python/C++] Increase reference count for base object?
-* [ARROW-783](https://issues.apache.org/jira/browse/ARROW-783) - Integration tests fail for length-0 record batch
-* [ARROW-787](https://issues.apache.org/jira/browse/ARROW-787) - [GLib] Fix compilation errors caused by ARROW-758
-* [ARROW-789](https://issues.apache.org/jira/browse/ARROW-789) - Fix issue where setValueCount(0) doesn’t work in the case that we’ve shipped vectors across the wire
-* [ARROW-793](https://issues.apache.org/jira/browse/ARROW-793) - [GLib] Wrong indent
-* [ARROW-794](https://issues.apache.org/jira/browse/ARROW-794) - [C++] Check whether data is contiguous in ipc::WriteTensor
-* [ARROW-796](https://issues.apache.org/jira/browse/ARROW-796) - [Java] Checkstyle additions causing build failure in some environments
-* [ARROW-797](https://issues.apache.org/jira/browse/ARROW-797) - [Python] Add updated pyarrow.\* public API listing in Sphinx docs
-* [ARROW-800](https://issues.apache.org/jira/browse/ARROW-800) - [C++] Boost headers being transitively included in pyarrow
-* [ARROW-805](https://issues.apache.org/jira/browse/ARROW-805) - listing empty HDFS directory returns an error instead of returning empty list
-* [ARROW-809](https://issues.apache.org/jira/browse/ARROW-809) - C++: Writing sliced record batch to IPC writes the entire array
-* [ARROW-812](https://issues.apache.org/jira/browse/ARROW-812) - Pip install pyarrow on mac failed.
-* [ARROW-817](https://issues.apache.org/jira/browse/ARROW-817) - [C++] Fix incorrect code comment from ARROW-722
-* [ARROW-821](https://issues.apache.org/jira/browse/ARROW-821) - [Python] Extra file \_table\_api.h generated during Python build process
-* [ARROW-822](https://issues.apache.org/jira/browse/ARROW-822) - [Python] StreamWriter fails to open with socket as sink
-* [ARROW-826](https://issues.apache.org/jira/browse/ARROW-826) - Compilation error on Mac with -DARROW\_PYTHON=on
-* [ARROW-829](https://issues.apache.org/jira/browse/ARROW-829) - Python: Parquet: Dictionary encoding is deactivated if column-wise compression was selected
-* [ARROW-830](https://issues.apache.org/jira/browse/ARROW-830) - Python: jemalloc is not anymore publicly exposed
-* [ARROW-836](https://issues.apache.org/jira/browse/ARROW-836) - Test for timedelta compat with pandas
-* [ARROW-839](https://issues.apache.org/jira/browse/ARROW-839) - [C++] Portable alternative to PyDate\_to\_ms function
-* [ARROW-847](https://issues.apache.org/jira/browse/ARROW-847) - C++: BUILD\_BYPRODUCTS not specified anymore for gtest
-* [ARROW-852](https://issues.apache.org/jira/browse/ARROW-852) - Python: Also set Arrow Library PATHS when detection was done through pkg-config
-* [ARROW-853](https://issues.apache.org/jira/browse/ARROW-853) - [Python] It is no longer necessary to modify the RPATH of the Cython extensions on many environments
-* [ARROW-858](https://issues.apache.org/jira/browse/ARROW-858) - Remove dependency on boost regex
-* [ARROW-866](https://issues.apache.org/jira/browse/ARROW-866) - [Python] Error from file object destructor
-* [ARROW-867](https://issues.apache.org/jira/browse/ARROW-867) - [Python] Miscellaneous pyarrow MSVC fixes
-* [ARROW-875](https://issues.apache.org/jira/browse/ARROW-875) - Nullable variable length vector fillEmpties() fills an extra value
-* [ARROW-879](https://issues.apache.org/jira/browse/ARROW-879) - compat with pandas 0.20.0
-* [ARROW-882](https://issues.apache.org/jira/browse/ARROW-882) - [C++] On Windows statically built lib file overwrites lib file of shared build
-* [ARROW-883](https://issues.apache.org/jira/browse/ARROW-883) - [JAVA] Introduction of new types has shifted Enumerations
-* [ARROW-885](https://issues.apache.org/jira/browse/ARROW-885) - [Python/C++] Decimal test failure on MSVC
-* [ARROW-886](https://issues.apache.org/jira/browse/ARROW-886) - VariableLengthVectors don't reAlloc offsets
-* [ARROW-887](https://issues.apache.org/jira/browse/ARROW-887) - [format] For backward compatibility, new unit fields must have default values matching previous implied unit
-* [ARROW-888](https://issues.apache.org/jira/browse/ARROW-888) - BitVector transfer() does not transfer ownership
-* [ARROW-895](https://issues.apache.org/jira/browse/ARROW-895) - Nullable variable length vector lastSet not set correctly
-* [ARROW-900](https://issues.apache.org/jira/browse/ARROW-900) - [Python] UnboundLocalError in ParquetDatasetPiece
-* [ARROW-903](https://issues.apache.org/jira/browse/ARROW-903) - [GLib] Remove a needless "."
-* [ARROW-914](https://issues.apache.org/jira/browse/ARROW-914) - [C++/Python] Fix Decimal ToBytes
-* [ARROW-922](https://issues.apache.org/jira/browse/ARROW-922) - Allow Flatbuffers and RapidJSON to be used locally on Windows
-* [ARROW-927](https://issues.apache.org/jira/browse/ARROW-927) - C++/Python: Add manylinux1 builds to Travis matrix
-* [ARROW-928](https://issues.apache.org/jira/browse/ARROW-928) - Update CMAKE script to detect unsupported msvc compilers versions
-* [ARROW-933](https://issues.apache.org/jira/browse/ARROW-933) - [Python] arrow\_python bindings have debug print statement
-* [ARROW-934](https://issues.apache.org/jira/browse/ARROW-934) - [GLib] Glib sources missing from result of 02-source.sh
-* [ARROW-936](https://issues.apache.org/jira/browse/ARROW-936) - Fix release README
-* [ARROW-938](https://issues.apache.org/jira/browse/ARROW-938) - Fix Apache Rat errors from source release build
-
-
-## New Features and Improvements
-
-* [ARROW-6](https://issues.apache.org/jira/browse/ARROW-6) - Hope to add development document
-* [ARROW-39](https://issues.apache.org/jira/browse/ARROW-39) - C++: Logical chunked arrays / columns: conforming to fixed chunk sizes
-* [ARROW-52](https://issues.apache.org/jira/browse/ARROW-52) - Set up project blog
-* [ARROW-95](https://issues.apache.org/jira/browse/ARROW-95) - Scaffold Main Documentation using asciidoc
-* [ARROW-98](https://issues.apache.org/jira/browse/ARROW-98) - Java: API documentation
-* [ARROW-99](https://issues.apache.org/jira/browse/ARROW-99) - C++: Explore if RapidCheck may be helpful for testing / worth adding to toolchain
-* [ARROW-183](https://issues.apache.org/jira/browse/ARROW-183) - C++: Add storage type to DecimalType
-* [ARROW-231](https://issues.apache.org/jira/browse/ARROW-231) - C++: Add typed Resize to PoolBuffer
-* [ARROW-281](https://issues.apache.org/jira/browse/ARROW-281) - [C++] IPC/RPC support on Win32 platforms
-* [ARROW-316](https://issues.apache.org/jira/browse/ARROW-316) - Finalize Date type
-* [ARROW-341](https://issues.apache.org/jira/browse/ARROW-341) - [Python] Making libpyarrow available to third parties
-* [ARROW-452](https://issues.apache.org/jira/browse/ARROW-452) - [C++/Python] Merge "Feather" file format implementation
-* [ARROW-459](https://issues.apache.org/jira/browse/ARROW-459) - [C++] Implement IPC round trip for DictionaryArray, dictionaries shared across record batches
-* [ARROW-483](https://issues.apache.org/jira/browse/ARROW-483) - [C++/Python] Provide access to "custom\_metadata" Field attribute in IPC setting
-* [ARROW-491](https://issues.apache.org/jira/browse/ARROW-491) - [C++] Add FixedWidthBinary type
-* [ARROW-492](https://issues.apache.org/jira/browse/ARROW-492) - [C++] Add arrow/arrow.h public API
-* [ARROW-493](https://issues.apache.org/jira/browse/ARROW-493) - [C++] Allow in-memory array over 2^31 -1 elements but require splitting at IPC / RPC boundaries
-* [ARROW-502](https://issues.apache.org/jira/browse/ARROW-502) - [C++/Python] Add MemoryPool implementation that logs allocation activity to std::cout
-* [ARROW-510](https://issues.apache.org/jira/browse/ARROW-510) - Add integration tests for date and time types
-* [ARROW-518](https://issues.apache.org/jira/browse/ARROW-518) - C++: Make Status::OK method constexpr
-* [ARROW-520](https://issues.apache.org/jira/browse/ARROW-520) - [C++] Add STL-compliant allocator that hooks into an arrow::MemoryPool
-* [ARROW-528](https://issues.apache.org/jira/browse/ARROW-528) - [Python] Support \_metadata or \_common\_metadata files when reading Parquet directories
-* [ARROW-534](https://issues.apache.org/jira/browse/ARROW-534) - [C++] Add IPC tests for date/time types
-* [ARROW-539](https://issues.apache.org/jira/browse/ARROW-539) - [Python] Support reading Parquet datasets with standard partition directory schemes
-* [ARROW-542](https://issues.apache.org/jira/browse/ARROW-542) - [Java] Implement dictionaries in stream/file encoding
-* [ARROW-550](https://issues.apache.org/jira/browse/ARROW-550) - [Format] Add a TensorMessage type
-* [ARROW-552](https://issues.apache.org/jira/browse/ARROW-552) - [Python] Add scalar value support for Dictionary type
-* [ARROW-557](https://issues.apache.org/jira/browse/ARROW-557) - [Python] Explicitly opt in to HDFS unit tests
-* [ARROW-563](https://issues.apache.org/jira/browse/ARROW-563) - C++: Support non-standard gcc version strings
-* [ARROW-566](https://issues.apache.org/jira/browse/ARROW-566) - Python: Deterministic position of libarrow in manylinux1 wheels
-* [ARROW-568](https://issues.apache.org/jira/browse/ARROW-568) - [C++] Add default implementations for TypeVisitor, ArrayVisitor methods that return NotImplemented
-* [ARROW-569](https://issues.apache.org/jira/browse/ARROW-569) - [C++] Set version for \*.pc
-* [ARROW-574](https://issues.apache.org/jira/browse/ARROW-574) - Python: Add support for nested Python lists in Pandas conversion
-* [ARROW-576](https://issues.apache.org/jira/browse/ARROW-576) - [C++] Complete round trip Union file/stream IPC tests
-* [ARROW-577](https://issues.apache.org/jira/browse/ARROW-577) - [C++] Refactor StreamWriter and FileWriter to have private implementations
-* [ARROW-578](https://issues.apache.org/jira/browse/ARROW-578) - [C++] Add CMake option to add custom $CXXFLAGS
-* [ARROW-580](https://issues.apache.org/jira/browse/ARROW-580) - C++: Also provide jemalloc\_X targets if only a static or shared version is found
-* [ARROW-582](https://issues.apache.org/jira/browse/ARROW-582) - [Java] Add Date/Time Support to JSON File
-* [ARROW-589](https://issues.apache.org/jira/browse/ARROW-589) - C++: Use system provided shared jemalloc if static is unavailable
-* [ARROW-591](https://issues.apache.org/jira/browse/ARROW-591) - [C++] Add round trip testing fixture for JSON format
-* [ARROW-593](https://issues.apache.org/jira/browse/ARROW-593) - [C++] Rename ReadableFileInterface to RandomAccessFile
-* [ARROW-598](https://issues.apache.org/jira/browse/ARROW-598) - [Python] Add support for converting pyarrow.Buffer to a memoryview with zero copy
-* [ARROW-603](https://issues.apache.org/jira/browse/ARROW-603) - [C++] Add RecordBatch::Validate method that at least checks that schema matches the array metadata
-* [ARROW-605](https://issues.apache.org/jira/browse/ARROW-605) - [C++] Refactor generic ArrayLoader class, support work for Feather merge
-* [ARROW-606](https://issues.apache.org/jira/browse/ARROW-606) - [C++] Upgrade to flatbuffers 1.6.0
-* [ARROW-608](https://issues.apache.org/jira/browse/ARROW-608) - [Format] Days since epoch date type
-* [ARROW-610](https://issues.apache.org/jira/browse/ARROW-610) - [C++] Win32 compatibility in file.cc
-* [ARROW-612](https://issues.apache.org/jira/browse/ARROW-612) - [Java] Field toString should show nullable flag status
-* [ARROW-615](https://issues.apache.org/jira/browse/ARROW-615) - Move ByteArrayReadableSeekableByteChannel to vector.util package
-* [ARROW-616](https://issues.apache.org/jira/browse/ARROW-616) - [C++] Remove -g flag in release builds
-* [ARROW-618](https://issues.apache.org/jira/browse/ARROW-618) - [Python] Implement support for DatetimeTZ custom type from pandas
-* [ARROW-620](https://issues.apache.org/jira/browse/ARROW-620) - [C++] Add date/time support to JSON reader/writer for integration testing
-* [ARROW-621](https://issues.apache.org/jira/browse/ARROW-621) - [C++] Implement an "inline visitor" template that enables visitor-pattern-like code without virtual function dispatch
-* [ARROW-625](https://issues.apache.org/jira/browse/ARROW-625) - [C++] Add time unit to TimeType::ToString
-* [ARROW-626](https://issues.apache.org/jira/browse/ARROW-626) - [Python] Enable pyarrow.BufferReader to read from any Python object implementing the buffer/memoryview protocol
-* [ARROW-631](https://issues.apache.org/jira/browse/ARROW-631) - [GLib] Import C API (C++ API wrapper) based on GLib from https://github.com/kou/arrow-glib
-* [ARROW-632](https://issues.apache.org/jira/browse/ARROW-632) - [Python] Add support for FixedWidthBinary type
-* [ARROW-635](https://issues.apache.org/jira/browse/ARROW-635) - [C++] Add JSON read/write support for FixedWidthBinary
-* [ARROW-637](https://issues.apache.org/jira/browse/ARROW-637) - [Format] Add time zone metadata to Timestamp type
-* [ARROW-646](https://issues.apache.org/jira/browse/ARROW-646) - Cache miniconda packages
-* [ARROW-647](https://issues.apache.org/jira/browse/ARROW-647) - [C++] Don't require Boost static libraries to support CentOS 7
-* [ARROW-648](https://issues.apache.org/jira/browse/ARROW-648) - [C++] Support multiarch on Debian
-* [ARROW-650](https://issues.apache.org/jira/browse/ARROW-650) - [GLib] Follow ReadableFileInterface -\> RandomAccessFile change
-* [ARROW-651](https://issues.apache.org/jira/browse/ARROW-651) - [C++] Set shared library version for .deb packages
-* [ARROW-655](https://issues.apache.org/jira/browse/ARROW-655) - Implement DecimalArray
-* [ARROW-656](https://issues.apache.org/jira/browse/ARROW-656) - [C++] Implement IO interface that can read and write to a fixed-size mutable buffer
-* [ARROW-657](https://issues.apache.org/jira/browse/ARROW-657) - [Python] Write and read tensors (with zero copy) into shared memory
-* [ARROW-658](https://issues.apache.org/jira/browse/ARROW-658) - [C++] Implement in-memory arrow::Tensor objects
-* [ARROW-659](https://issues.apache.org/jira/browse/ARROW-659) - [C++] Add multithreaded memcpy implementation (for hardware where it helps)
-* [ARROW-660](https://issues.apache.org/jira/browse/ARROW-660) - [C++] Restore function that can read a complete encapsulated record batch message
-* [ARROW-661](https://issues.apache.org/jira/browse/ARROW-661) - [C++] Add a Flatbuffer metadata type that supports array data over 2^31 - 1 elements
-* [ARROW-662](https://issues.apache.org/jira/browse/ARROW-662) - [Format] Factor Flatbuffer schema metadata into a Schema.fbs
-* [ARROW-663](https://issues.apache.org/jira/browse/ARROW-663) - [Java] Support additional Time metadata + vector value accessors
-* [ARROW-664](https://issues.apache.org/jira/browse/ARROW-664) - Make C++ Arrow serialization deterministic
-* [ARROW-669](https://issues.apache.org/jira/browse/ARROW-669) - [Python] Attach proper tzinfo when computing boxed scalars for TimestampArray
-* [ARROW-670](https://issues.apache.org/jira/browse/ARROW-670) - Arrow 0.3 release
-* [ARROW-672](https://issues.apache.org/jira/browse/ARROW-672) - [Format] Bump metadata version for 0.3 release
-* [ARROW-674](https://issues.apache.org/jira/browse/ARROW-674) - [Java] Support additional Timestamp timezone metadata
-* [ARROW-675](https://issues.apache.org/jira/browse/ARROW-675) - [GLib] Update package metadata
-* [ARROW-676](https://issues.apache.org/jira/browse/ARROW-676) - [Java] move from MinorType to FieldType in ValueVectors to carry all the relevant type bits
-* [ARROW-679](https://issues.apache.org/jira/browse/ARROW-679) - [Format] Change RecordBatch and Field length members from int to long
-* [ARROW-681](https://issues.apache.org/jira/browse/ARROW-681) - [C++] Build Arrow on Windows with dynamically linked boost
-* [ARROW-684](https://issues.apache.org/jira/browse/ARROW-684) - Python: More informative message when parquet-cpp but not parquet-arrow is available
-* [ARROW-687](https://issues.apache.org/jira/browse/ARROW-687) - [C++] Build and run full test suite in Appveyor
-* [ARROW-688](https://issues.apache.org/jira/browse/ARROW-688) - [C++] Use CMAKE\_INSTALL\_INCLUDEDIR for consistency
-* [ARROW-690](https://issues.apache.org/jira/browse/ARROW-690) - Only send JIRA updates to [email protected]
-* [ARROW-698](https://issues.apache.org/jira/browse/ARROW-698) - [C++] Add options to StreamWriter/FileWriter to permit large record batches
-* [ARROW-700](https://issues.apache.org/jira/browse/ARROW-700) - Add headroom interface for allocator.
-* [ARROW-701](https://issues.apache.org/jira/browse/ARROW-701) - [Java] Support additional Date metadata
-* [ARROW-706](https://issues.apache.org/jira/browse/ARROW-706) - [GLib] Add package install document
-* [ARROW-707](https://issues.apache.org/jira/browse/ARROW-707) - Python: All non-Pandas columns should be converted to NullArray
-* [ARROW-708](https://issues.apache.org/jira/browse/ARROW-708) - [C++] Some IPC code simplification, perf analysis
-* [ARROW-710](https://issues.apache.org/jira/browse/ARROW-710) - [Python] Enable Feather APIs to read and write using Python file-like objects
-* [ARROW-711](https://issues.apache.org/jira/browse/ARROW-711) - [C++] Remove extern template declarations for NumericArray<T\> types
-* [ARROW-712](https://issues.apache.org/jira/browse/ARROW-712) - [C++] Implement Array::Accept as inline visitor
-* [ARROW-717](https://issues.apache.org/jira/browse/ARROW-717) - [C++] IPC zero-copy round trips for arrow::Tensor
-* [ARROW-718](https://issues.apache.org/jira/browse/ARROW-718) - [Python] Expose arrow::Tensor with conversions to/from NumPy arrays
-* [ARROW-719](https://issues.apache.org/jira/browse/ARROW-719) - [GLib] Support prepared source archive release
-* [ARROW-722](https://issues.apache.org/jira/browse/ARROW-722) - [Python] pandas conversions for new date and time types/metadata
-* [ARROW-724](https://issues.apache.org/jira/browse/ARROW-724) - Add "How to Contribute" section to README
-* [ARROW-725](https://issues.apache.org/jira/browse/ARROW-725) - [Format] Constant length list type
-* [ARROW-727](https://issues.apache.org/jira/browse/ARROW-727) - [Python] Write memoryview-compatible objects in NativeFile.write with zero copy
-* [ARROW-728](https://issues.apache.org/jira/browse/ARROW-728) - [C++/Python] Add arrow::Table function for removing a column
-* [ARROW-729](https://issues.apache.org/jira/browse/ARROW-729) - [Java] Add vector type for 32-bit date as days since UNIX epoch
-* [ARROW-731](https://issues.apache.org/jira/browse/ARROW-731) - [C++] Add shared library related versions to .pc
-* [ARROW-733](https://issues.apache.org/jira/browse/ARROW-733) - [C++/Format] Change name of Fixed Width Binary to Fixed \*Size\* Binary for consistency
-* [ARROW-734](https://issues.apache.org/jira/browse/ARROW-734) - [Python] Support for pyarrow on Windows / MSVC
-* [ARROW-735](https://issues.apache.org/jira/browse/ARROW-735) - [C++] Developer instruction document for MSVC on Windows
-* [ARROW-737](https://issues.apache.org/jira/browse/ARROW-737) - [C++] Support obtaining mutable slices of mutable buffers
-* [ARROW-741](https://issues.apache.org/jira/browse/ARROW-741) - [Python] Add Python 3.6 to Travis CI
-* [ARROW-743](https://issues.apache.org/jira/browse/ARROW-743) - [C++] Consolidate unit tests for code in array.h
-* [ARROW-744](https://issues.apache.org/jira/browse/ARROW-744) - [GLib] Re-add an assertion to garrow\_table\_new() test
-* [ARROW-745](https://issues.apache.org/jira/browse/ARROW-745) - [C++] Allow use of system cpplint
-* [ARROW-746](https://issues.apache.org/jira/browse/ARROW-746) - [GLib] Add garrow\_array\_get\_data\_type()
-* [ARROW-748](https://issues.apache.org/jira/browse/ARROW-748) - [Python] Pin runtime library versions in conda-forge packages to force upgrades
-* [ARROW-751](https://issues.apache.org/jira/browse/ARROW-751) - [Python] Rename all Cython extensions to "private" status with leading underscore
-* [ARROW-752](https://issues.apache.org/jira/browse/ARROW-752) - [Python] Construct pyarrow.DictionaryArray from boxed pyarrow array objects
-* [ARROW-754](https://issues.apache.org/jira/browse/ARROW-754) - [GLib] Add garrow\_array\_is\_null()
-* [ARROW-755](https://issues.apache.org/jira/browse/ARROW-755) - [GLib] Add garrow\_array\_get\_value\_type()
-* [ARROW-758](https://issues.apache.org/jira/browse/ARROW-758) - [C++] Fix compiler warnings on MSVC x64
-* [ARROW-761](https://issues.apache.org/jira/browse/ARROW-761) - [Python] Add function to compute the total size of tensor payloads, including metadata and padding
-* [ARROW-763](https://issues.apache.org/jira/browse/ARROW-763) - C++: Use \`python-config\` to find libpythonX.X.dylib
-* [ARROW-765](https://issues.apache.org/jira/browse/ARROW-765) - [Python] Make generic ArrowException subclass ValueError
-* [ARROW-768](https://issues.apache.org/jira/browse/ARROW-768) - [Java] Change the "boxed" object representation of date and time types
-* [ARROW-769](https://issues.apache.org/jira/browse/ARROW-769) - [GLib] Support building without installed Arrow C++
-* [ARROW-770](https://issues.apache.org/jira/browse/ARROW-770) - [C++] Move clang-tidy/format config files back to C++ source tree
-* [ARROW-771](https://issues.apache.org/jira/browse/ARROW-771) - [Python] Add APIs for reading individual Parquet row groups
-* [ARROW-773](https://issues.apache.org/jira/browse/ARROW-773) - [C++] Add function to create arrow::Table with column appended to existing table
-* [ARROW-774](https://issues.apache.org/jira/browse/ARROW-774) - [GLib] Remove needless LICENSE.txt copy
-* [ARROW-775](https://issues.apache.org/jira/browse/ARROW-775) - [Java] add simple constructors to value vectors
-* [ARROW-779](https://issues.apache.org/jira/browse/ARROW-779) - [C++/Python] Raise exception if old metadata encountered
-* [ARROW-782](https://issues.apache.org/jira/browse/ARROW-782) - [C++] Change struct to class for objects that meet the criteria in the Google style guide
-* [ARROW-788](https://issues.apache.org/jira/browse/ARROW-788) - Possible nondeterminism in Tensor serialization code
-* [ARROW-795](https://issues.apache.org/jira/browse/ARROW-795) - [C++] Combine libarrow/libarrow\_io/libarrow\_ipc
-* [ARROW-798](https://issues.apache.org/jira/browse/ARROW-798) - [Docs] Publish Format Markdown documents somehow on arrow.apache.org
-* [ARROW-802](https://issues.apache.org/jira/browse/ARROW-802) - [GLib] Add read examples
-* [ARROW-803](https://issues.apache.org/jira/browse/ARROW-803) - [GLib] Update package repository URL
-* [ARROW-804](https://issues.apache.org/jira/browse/ARROW-804) - [GLib] Update build document
-* [ARROW-806](https://issues.apache.org/jira/browse/ARROW-806) - [GLib] Support add/remove a column from table
-* [ARROW-807](https://issues.apache.org/jira/browse/ARROW-807) - [GLib] Update "Since" tag
-* [ARROW-808](https://issues.apache.org/jira/browse/ARROW-808) - [GLib] Remove needless ignore entries
-* [ARROW-810](https://issues.apache.org/jira/browse/ARROW-810) - [GLib] Remove io/ipc prefix
-* [ARROW-811](https://issues.apache.org/jira/browse/ARROW-811) - [GLib] Add GArrowBuffer
-* [ARROW-815](https://issues.apache.org/jira/browse/ARROW-815) - [Java] Allow for expanding underlying buffer size after allocation
-* [ARROW-816](https://issues.apache.org/jira/browse/ARROW-816) - [C++] Use conda packages for RapidJSON, Flatbuffers to speed up builds
-* [ARROW-818](https://issues.apache.org/jira/browse/ARROW-818) - [Python] Review public pyarrow.\* API completeness and update docs
-* [ARROW-820](https://issues.apache.org/jira/browse/ARROW-820) - [C++] Build dependencies for Parquet library without arrow support
-* [ARROW-825](https://issues.apache.org/jira/browse/ARROW-825) - [Python] Generalize pyarrow.from\_pylist to accept any object implementing the PySequence protocol
-* [ARROW-827](https://issues.apache.org/jira/browse/ARROW-827) - [Python] Variety of Parquet improvements to support Dask integration
-* [ARROW-828](https://issues.apache.org/jira/browse/ARROW-828) - [C++] Document new requirement (libboost-regex-dev) in README.md
-* [ARROW-831](https://issues.apache.org/jira/browse/ARROW-831) - Switch from boost::regex to std::regex
-* [ARROW-832](https://issues.apache.org/jira/browse/ARROW-832) - [C++] Upgrade thirdparty gtest to 1.8.0
-* [ARROW-833](https://issues.apache.org/jira/browse/ARROW-833) - [Python] "Quickstart" build / environment setup guide for Python developers
-* [ARROW-841](https://issues.apache.org/jira/browse/ARROW-841) - [Python] Add pyarrow build to Appveyor
-* [ARROW-844](https://issues.apache.org/jira/browse/ARROW-844) - [Format] Revise format/README.md to reflect progress reaching a more complete specification
-* [ARROW-845](https://issues.apache.org/jira/browse/ARROW-845) - [Python] Sync FindArrow.cmake changes from parquet-cpp
-* [ARROW-846](https://issues.apache.org/jira/browse/ARROW-846) - [GLib] Add GArrowTensor, GArrowInt8Tensor and GArrowUInt8Tensor
-* [ARROW-848](https://issues.apache.org/jira/browse/ARROW-848) - [Python] Improvements / fixes to conda quickstart guide
-* [ARROW-849](https://issues.apache.org/jira/browse/ARROW-849) - [C++] Add optional $ARROW\_BUILD\_TOOLCHAIN environment variable option for configuring build environment
-* [ARROW-857](https://issues.apache.org/jira/browse/ARROW-857) - [Python] Automate publishing Python documentation to arrow-site
-* [ARROW-859](https://issues.apache.org/jira/browse/ARROW-859) - [C++] Do not build unit tests by default?
-* [ARROW-860](https://issues.apache.org/jira/browse/ARROW-860) - [C++] Decide if typed Tensor subclasses are worthwhile
-* [ARROW-861](https://issues.apache.org/jira/browse/ARROW-861) - [Python] Move DEVELOPMENT.md to Sphinx docs
-* [ARROW-862](https://issues.apache.org/jira/browse/ARROW-862) - [Python] Improve source build instructions in README
-* [ARROW-863](https://issues.apache.org/jira/browse/ARROW-863) - [GLib] Use GBytes to implement zero-copy
-* [ARROW-864](https://issues.apache.org/jira/browse/ARROW-864) - [GLib] Unify Array files
-* [ARROW-865](https://issues.apache.org/jira/browse/ARROW-865) - [Python] Verify Parquet roundtrips for new date/time types
-* [ARROW-868](https://issues.apache.org/jira/browse/ARROW-868) - [GLib] Use GBytes to reduce copy
-* [ARROW-869](https://issues.apache.org/jira/browse/ARROW-869) - [JS] Rename directory to js/
-* [ARROW-871](https://issues.apache.org/jira/browse/ARROW-871) - [GLib] Unify DataType files
-* [ARROW-876](https://issues.apache.org/jira/browse/ARROW-876) - [GLib] Unify ArrayBuffer files
-* [ARROW-877](https://issues.apache.org/jira/browse/ARROW-877) - [GLib] Add garrow\_array\_get\_null\_bitmap()
-* [ARROW-878](https://issues.apache.org/jira/browse/ARROW-878) - [GLib] Add garrow\_binary\_array\_get\_buffer()
-* [ARROW-880](https://issues.apache.org/jira/browse/ARROW-880) - [GLib] Add garrow\_primitive\_array\_get\_buffer()
-* [ARROW-890](https://issues.apache.org/jira/browse/ARROW-890) - [GLib] Add GArrowMutableBuffer
-* [ARROW-892](https://issues.apache.org/jira/browse/ARROW-892) - [GLib] Fix GArrowTensor document
-* [ARROW-893](https://issues.apache.org/jira/browse/ARROW-893) - Add GLib document to Web site
-* [ARROW-894](https://issues.apache.org/jira/browse/ARROW-894) - [GLib] Add GArrowPoolBuffer
-* [ARROW-896](https://issues.apache.org/jira/browse/ARROW-896) - [Docs] Add Jekyll plugin for including rendered Jupyter notebooks on website
-* [ARROW-898](https://issues.apache.org/jira/browse/ARROW-898) - [C++] Expand metadata support to field level, provide for sharing instances of KeyValueMetadata
-* [ARROW-904](https://issues.apache.org/jira/browse/ARROW-904) - [GLib] Simplify error check codes
-* [ARROW-907](https://issues.apache.org/jira/browse/ARROW-907) - C++: Convenience construct Table from schema and arrays
-* [ARROW-908](https://issues.apache.org/jira/browse/ARROW-908) - [GLib] Unify OutputStream files
-* [ARROW-910](https://issues.apache.org/jira/browse/ARROW-910) - [C++] Write 0-length EOS indicator at end of stream
-* [ARROW-916](https://issues.apache.org/jira/browse/ARROW-916) - [GLib] Add GArrowBufferOutputStream
-* [ARROW-917](https://issues.apache.org/jira/browse/ARROW-917) - [GLib] Add GArrowBufferReader
-* [ARROW-918](https://issues.apache.org/jira/browse/ARROW-918) - [GLib] Use GArrowBuffer for read
-* [ARROW-919](https://issues.apache.org/jira/browse/ARROW-919) - [GLib] Use "id" to get type enum value from GArrowDataType
-* [ARROW-920](https://issues.apache.org/jira/browse/ARROW-920) - [GLib] Add Lua examples
-* [ARROW-925](https://issues.apache.org/jira/browse/ARROW-925) - [GLib] Fix GArrowBufferReader test
-* [ARROW-926](https://issues.apache.org/jira/browse/ARROW-926) - Update KEYS to include wesm
-* [ARROW-930](https://issues.apache.org/jira/browse/ARROW-930) - javadoc generation fails with java 8
-* [ARROW-931](https://issues.apache.org/jira/browse/ARROW-931) - [GLib] Reconstruct input stream
-* [ARROW-965](https://issues.apache.org/jira/browse/ARROW-965) - Website updates for 0.3.0 release
-
-
-
-# Apache Arrow 0.2.0 (2017-02-18)
-
-## Bug Fixes
-
-* [ARROW-112](https://issues.apache.org/jira/browse/ARROW-112) - [C++] Style fix for constants/enums
-* [ARROW-202](https://issues.apache.org/jira/browse/ARROW-202) - [C++] Integrate with appveyor ci for windows support and get arrow building on windows
-* [ARROW-220](https://issues.apache.org/jira/browse/ARROW-220) - [C++] Build conda artifacts in a build environment with better cross-linux ABI compatibility
-* [ARROW-224](https://issues.apache.org/jira/browse/ARROW-224) - [C++] Address static linking of boost dependencies
-* [ARROW-230](https://issues.apache.org/jira/browse/ARROW-230) - Python: Do not name modules like native ones (i.e. rename pyarrow.io)
-* [ARROW-239](https://issues.apache.org/jira/browse/ARROW-239) - [Python] HdfsFile.read called with no arguments should read remainder of file
-* [ARROW-261](https://issues.apache.org/jira/browse/ARROW-261) - [C++] Refactor BinaryArray/StringArray classes to not inherit from ListArray
-* [ARROW-273](https://issues.apache.org/jira/browse/ARROW-273) - Lists use unsigned offset vectors instead of signed (as defined in the spec)
-* [ARROW-275](https://issues.apache.org/jira/browse/ARROW-275) - Add tests for UnionVector in Arrow File
-* [ARROW-294](https://issues.apache.org/jira/browse/ARROW-294) - [C++] Do not use fopen / fclose / etc. methods for memory mapped file implementation
-* [ARROW-322](https://issues.apache.org/jira/browse/ARROW-322) - [C++] Do not build HDFS IO interface optionally
-* [ARROW-323](https://issues.apache.org/jira/browse/ARROW-323) - [Python] Opt-in to PyArrow parquet build rather than skipping silently on failure
-* [ARROW-334](https://issues.apache.org/jira/browse/ARROW-334) - [Python] OS X rpath issues on some configurations
-* [ARROW-337](https://issues.apache.org/jira/browse/ARROW-337) - UnionListWriter.list() is doing more than it should, this can cause data corruption
-* [ARROW-339](https://issues.apache.org/jira/browse/ARROW-339) - Make merge\_arrow\_pr script work with Python 3
-* [ARROW-340](https://issues.apache.org/jira/browse/ARROW-340) - [C++] Opening a writeable file on disk that already exists does not truncate to zero
-* [ARROW-342](https://issues.apache.org/jira/browse/ARROW-342) - Set Python version on release
-* [ARROW-345](https://issues.apache.org/jira/browse/ARROW-345) - libhdfs integration doesn't work for Mac
-* [ARROW-346](https://issues.apache.org/jira/browse/ARROW-346) - Python API Documentation
-* [ARROW-348](https://issues.apache.org/jira/browse/ARROW-348) - [Python] CMake build type should be configurable on the command line
-* [ARROW-349](https://issues.apache.org/jira/browse/ARROW-349) - Six is missing as a requirement in the python setup.py
-* [ARROW-351](https://issues.apache.org/jira/browse/ARROW-351) - Time type has no unit
-* [ARROW-354](https://issues.apache.org/jira/browse/ARROW-354) - Cannot compare an array of empty strings to another
-* [ARROW-357](https://issues.apache.org/jira/browse/ARROW-357) - Default Parquet chunk\_size of 64k is too small
-* [ARROW-358](https://issues.apache.org/jira/browse/ARROW-358) - [C++] libhdfs can be in non-standard locations in some Hadoop distributions
-* [ARROW-362](https://issues.apache.org/jira/browse/ARROW-362) - Python: Calling to\_pandas on a table read from Parquet leaks memory
-* [ARROW-371](https://issues.apache.org/jira/browse/ARROW-371) - Python: Table with null timestamp becomes float in pandas
-* [ARROW-375](https://issues.apache.org/jira/browse/ARROW-375) - columns parameter in parquet.read\_table() raises KeyError for valid column
-* [ARROW-384](https://issues.apache.org/jira/browse/ARROW-384) - Align Java and C++ RecordBatch data and metadata layout
-* [ARROW-386](https://issues.apache.org/jira/browse/ARROW-386) - [Java] Respect case of struct / map field names
-* [ARROW-387](https://issues.apache.org/jira/browse/ARROW-387) - [C++] arrow::io::BufferReader does not permit shared memory ownership in zero-copy reads
-* [ARROW-390](https://issues.apache.org/jira/browse/ARROW-390) - C++: CMake fails on json-integration-test with ARROW\_BUILD\_TESTS=OFF
-* [ARROW-392](https://issues.apache.org/jira/browse/ARROW-392) - Fix string/binary integration tests
-* [ARROW-393](https://issues.apache.org/jira/browse/ARROW-393) - [Java] JSON file reader fails to set the buffer size on String data vector
-* [ARROW-395](https://issues.apache.org/jira/browse/ARROW-395) - Arrow file format writes record batches in reverse order.
-* [ARROW-398](https://issues.apache.org/jira/browse/ARROW-398) - [Java] Java file format requires bitmaps of all 1's to be written when there are no nulls
-* [ARROW-399](https://issues.apache.org/jira/browse/ARROW-399) - [Java] ListVector.loadFieldBuffers ignores the ArrowFieldNode length metadata
-* [ARROW-400](https://issues.apache.org/jira/browse/ARROW-400) - [Java] ArrowWriter writes length 0 for Struct types
-* [ARROW-401](https://issues.apache.org/jira/browse/ARROW-401) - [Java] Floating point vectors should do an approximate comparison in integration tests
-* [ARROW-402](https://issues.apache.org/jira/browse/ARROW-402) - [Java] "refCnt gone negative" error in integration tests
-* [ARROW-403](https://issues.apache.org/jira/browse/ARROW-403) - [Java] UnionVector: Creating a transfer pair doesn't transfer the schema to destination vector
-* [ARROW-404](https://issues.apache.org/jira/browse/ARROW-404) - [Python] Closing an HdfsClient while there are still open file handles results in a crash
-* [ARROW-405](https://issues.apache.org/jira/browse/ARROW-405) - [C++] Be less stringent about finding include/hdfs.h in HADOOP\_HOME
-* [ARROW-406](https://issues.apache.org/jira/browse/ARROW-406) - [C++] Large HDFS reads must utilize the set file buffer size when making RPCs
-* [ARROW-408](https://issues.apache.org/jira/browse/ARROW-408) - [C++/Python] Remove defunct conda recipes
-* [ARROW-414](https://issues.apache.org/jira/browse/ARROW-414) - [Java] "Buffer too large to resize to ..." error
-* [ARROW-420](https://issues.apache.org/jira/browse/ARROW-420) - Align Date implementation between Java and C++
-* [ARROW-421](https://issues.apache.org/jira/browse/ARROW-421) - [Python] Zero-copy buffers read by pyarrow::PyBytesReader must retain a reference to the parent PyBytes to avoid premature garbage collection issues
-* [ARROW-422](https://issues.apache.org/jira/browse/ARROW-422) - C++: IPC should depend on rapidjson\_ep if RapidJSON is vendored
-* [ARROW-429](https://issues.apache.org/jira/browse/ARROW-429) - git-archive SHA-256 checksums are changing
-* [ARROW-433](https://issues.apache.org/jira/browse/ARROW-433) - [Python] Date conversion is locale-dependent
-* [ARROW-434](https://issues.apache.org/jira/browse/ARROW-434) - Segfaults and encoding issues in Python Parquet reads
-* [ARROW-435](https://issues.apache.org/jira/browse/ARROW-435) - C++: Spelling mistake in if(RAPIDJSON\_VENDORED)
-* [ARROW-437](https://issues.apache.org/jira/browse/ARROW-437) - [C++] clang compiler warnings from overridden virtual functions
-* [ARROW-445](https://issues.apache.org/jira/browse/ARROW-445) - C++: arrow\_ipc is built before arrow/ipc/Message\_generated.h was generated
-* [ARROW-447](https://issues.apache.org/jira/browse/ARROW-447) - Python: Align scalar/pylist string encoding with that of pandas.
-* [ARROW-455](https://issues.apache.org/jira/browse/ARROW-455) - [C++] BufferOutputStream dtor does not call Close()
-* [ARROW-469](https://issues.apache.org/jira/browse/ARROW-469) - C++: Add option so that resize doesn't decrease the capacity
-* [ARROW-481](https://issues.apache.org/jira/browse/ARROW-481) - [Python] Fix Python 2.7 regression in patch for PARQUET-472
-* [ARROW-486](https://issues.apache.org/jira/browse/ARROW-486) - [C++] arrow::io::MemoryMappedFile can't be casted to arrow::io::FileInterface
-* [ARROW-487](https://issues.apache.org/jira/browse/ARROW-487) - Python: ConvertTableToPandas segfaults if ObjectBlock::Write fails
-* [ARROW-494](https://issues.apache.org/jira/browse/ARROW-494) - [C++] When MemoryMappedFile is destructed, memory is unmapped even if buffer references still exist
-* [ARROW-499](https://issues.apache.org/jira/browse/ARROW-499) - Update file serialization to use streaming serialization format
-* [ARROW-505](https://issues.apache.org/jira/browse/ARROW-505) - [C++] Fix compiler warnings in release mode
-* [ARROW-511](https://issues.apache.org/jira/browse/ARROW-511) - [Python] List[T] conversions not implemented for single arrays
-* [ARROW-513](https://issues.apache.org/jira/browse/ARROW-513) - [C++] Fix Appveyor build
-* [ARROW-516](https://issues.apache.org/jira/browse/ARROW-516) - Building pyarrow with parquet
-* [ARROW-519](https://issues.apache.org/jira/browse/ARROW-519) - [C++] Missing vtable in libarrow.dylib on Xcode 6.4
-* [ARROW-523](https://issues.apache.org/jira/browse/ARROW-523) - Python: Account for changes in PARQUET-834
-* [ARROW-533](https://issues.apache.org/jira/browse/ARROW-533) - [C++] arrow::TimestampArray / TimeArray has a broken constructor
-* [ARROW-535](https://issues.apache.org/jira/browse/ARROW-535) - [Python] Add type mapping for NPY\_LONGLONG
-* [ARROW-537](https://issues.apache.org/jira/browse/ARROW-537) - [C++] StringArray/BinaryArray comparisons may be incorrect when values with non-zero length are null
-* [ARROW-540](https://issues.apache.org/jira/browse/ARROW-540) - [C++] Fix build in aftermath of ARROW-33
-* [ARROW-543](https://issues.apache.org/jira/browse/ARROW-543) - C++: Lazily computed null\_counts counts the number of non-null entries
-* [ARROW-544](https://issues.apache.org/jira/browse/ARROW-544) - [C++] ArrayLoader::LoadBinary fails for length-0 arrays
-* [ARROW-545](https://issues.apache.org/jira/browse/ARROW-545) - [Python] Ignore files without .parq or .parquet prefix when reading directory of files
-* [ARROW-548](https://issues.apache.org/jira/browse/ARROW-548) - [Python] Add nthreads option to pyarrow.Filesystem.read\_parquet
-* [ARROW-551](https://issues.apache.org/jira/browse/ARROW-551) - C++: Construction of Column with nullptr Array segfaults
-* [ARROW-556](https://issues.apache.org/jira/browse/ARROW-556) - [Integration] Cannot run integration tests with a different cpp build path
-* [ARROW-561](https://issues.apache.org/jira/browse/ARROW-561) - Update java & python dependencies to improve downstream packaging experience
-* [ARROW-562](https://issues.apache.org/jira/browse/ARROW-562) - Mockito should be in test scope
-
-
-## New Features and Improvements
-
-* [ARROW-33](https://issues.apache.org/jira/browse/ARROW-33) - C++: Implement zero-copy array slicing
-* [ARROW-81](https://issues.apache.org/jira/browse/ARROW-81) - [Format] Add a Category logical type (distinct from dictionary-encoding)
-* [ARROW-96](https://issues.apache.org/jira/browse/ARROW-96) - C++: API documentation using Doxygen
-* [ARROW-97](https://issues.apache.org/jira/browse/ARROW-97) - Python: API documentation via sphinx-apidoc
-* [ARROW-108](https://issues.apache.org/jira/browse/ARROW-108) - [C++] Add IPC round trip for union types
-* [ARROW-189](https://issues.apache.org/jira/browse/ARROW-189) - C++: Use ExternalProject to build thirdparty dependencies
-* [ARROW-191](https://issues.apache.org/jira/browse/ARROW-191) - Python: Provide infrastructure for manylinux1 wheels
-* [ARROW-221](https://issues.apache.org/jira/browse/ARROW-221) - Add switch for writing Parquet 1.0 compatible logical types
-* [ARROW-227](https://issues.apache.org/jira/browse/ARROW-227) - [C++/Python] Hook arrow\_io generic reader / writer interface into arrow\_parquet
-* [ARROW-228](https://issues.apache.org/jira/browse/ARROW-228) - [Python] Create an Arrow-cpp-compatible interface for reading bytes from Python file-like objects
-* [ARROW-240](https://issues.apache.org/jira/browse/ARROW-240) - Installation instructions for pyarrow
-* [ARROW-243](https://issues.apache.org/jira/browse/ARROW-243) - [C++] Add "driver" option to HdfsClient to choose between libhdfs and libhdfs3 at runtime
-* [ARROW-268](https://issues.apache.org/jira/browse/ARROW-268) - [C++] Flesh out union implementation to have all required methods for IPC
-* [ARROW-303](https://issues.apache.org/jira/browse/ARROW-303) - [C++] Also build static libraries for leaf libraries
-* [ARROW-312](https://issues.apache.org/jira/browse/ARROW-312) - [Python] Provide Python API to read/write the Arrow IPC file format
-* [ARROW-317](https://issues.apache.org/jira/browse/ARROW-317) - [C++] Implement zero-copy Slice method on arrow::Buffer that retains reference to parent
-* [ARROW-327](https://issues.apache.org/jira/browse/ARROW-327) - [Python] Remove conda builds from Travis CI processes
-* [ARROW-328](https://issues.apache.org/jira/browse/ARROW-328) - [C++] Return shared\_ptr by value instead of const-ref?
-* [ARROW-330](https://issues.apache.org/jira/browse/ARROW-330) - [C++] CMake functions to simplify shared / static library configuration
-* [ARROW-332](https://issues.apache.org/jira/browse/ARROW-332) - [Python] Add helper function to convert RecordBatch to pandas.DataFrame
-* [ARROW-333](https://issues.apache.org/jira/browse/ARROW-333) - Make writers update their internal schema even when no data is written.
-* [ARROW-335](https://issues.apache.org/jira/browse/ARROW-335) - Improve Type apis and toString() by encapsulating flatbuffers better
-* [ARROW-336](https://issues.apache.org/jira/browse/ARROW-336) - Run Apache Rat in Travis builds
-* [ARROW-338](https://issues.apache.org/jira/browse/ARROW-338) - [C++] Refactor IPC vector "loading" and "unloading" to be based on cleaner visitor pattern
-* [ARROW-344](https://issues.apache.org/jira/browse/ARROW-344) - Instructions for building with conda
-* [ARROW-350](https://issues.apache.org/jira/browse/ARROW-350) - Add Kerberos support to HDFS shim
-* [ARROW-353](https://issues.apache.org/jira/browse/ARROW-353) - Arrow release 0.2
-* [ARROW-355](https://issues.apache.org/jira/browse/ARROW-355) - Add tests for serialising arrays of empty strings to Parquet
-* [ARROW-356](https://issues.apache.org/jira/browse/ARROW-356) - Add documentation about reading Parquet
-* [ARROW-359](https://issues.apache.org/jira/browse/ARROW-359) - Need to document ARROW\_LIBHDFS\_DIR
-* [ARROW-360](https://issues.apache.org/jira/browse/ARROW-360) - C++: Add method to shrink PoolBuffer using realloc
-* [ARROW-361](https://issues.apache.org/jira/browse/ARROW-361) - Python: Support reading a column-selection from Parquet files
-* [ARROW-363](https://issues.apache.org/jira/browse/ARROW-363) - Set up Java/C++ integration test harness
-* [ARROW-365](https://issues.apache.org/jira/browse/ARROW-365) - Python: Provide Array.to\_pandas()
-* [ARROW-366](https://issues.apache.org/jira/browse/ARROW-366) - [Java] implement Dictionary vector
-* [ARROW-367](https://issues.apache.org/jira/browse/ARROW-367) - [Java] converter csv/json <=\> Arrow file format for Integration tests
-* [ARROW-368](https://issues.apache.org/jira/browse/ARROW-368) - Document use of LD\_LIBRARY\_PATH when using Python
-* [ARROW-369](https://issues.apache.org/jira/browse/ARROW-369) - [Python] Add ability to convert multiple record batches at once to pandas
-* [ARROW-370](https://issues.apache.org/jira/browse/ARROW-370) - Python: Pandas conversion from \`datetime.date\` columns
-* [ARROW-372](https://issues.apache.org/jira/browse/ARROW-372) - Create JSON arrow file format for integration tests
-* [ARROW-373](https://issues.apache.org/jira/browse/ARROW-373) - [C++] Implement C++ version of JSON file format for testing
-* [ARROW-374](https://issues.apache.org/jira/browse/ARROW-374) - Python: clarify unicode vs. binary in API
-* [ARROW-377](https://issues.apache.org/jira/browse/ARROW-377) - Python: Add support for conversion of Pandas.Categorical
-* [ARROW-379](https://issues.apache.org/jira/browse/ARROW-379) - Python: Use setuptools\_scm/setuptools\_scm\_git\_archive to provide the version number
-* [ARROW-380](https://issues.apache.org/jira/browse/ARROW-380) - [Java] optimize null count when serializing vectors.
-* [ARROW-381](https://issues.apache.org/jira/browse/ARROW-381) - [C++] Simplify primitive array type builders to use a default type singleton
-* [ARROW-382](https://issues.apache.org/jira/browse/ARROW-382) - Python: Extend API documentation
-* [ARROW-383](https://issues.apache.org/jira/browse/ARROW-383) - [C++] Implement C++ version of ARROW-367 integration test validator
-* [ARROW-389](https://issues.apache.org/jira/browse/ARROW-389) - Python: Write Parquet files to pyarrow.io.NativeFile objects
-* [ARROW-394](https://issues.apache.org/jira/browse/ARROW-394) - Add integration tests for boolean, list, struct, and other basic types
-* [ARROW-396](https://issues.apache.org/jira/browse/ARROW-396) - Python: Add pyarrow.schema.Schema.equals
-* [ARROW-409](https://issues.apache.org/jira/browse/ARROW-409) - Python: Change pyarrow.Table.dataframe\_from\_batches API to create Table instead
-* [ARROW-410](https://issues.apache.org/jira/browse/ARROW-410) - [C++] Add Flush method to arrow::io::OutputStream
-* [ARROW-411](https://issues.apache.org/jira/browse/ARROW-411) - [Java] Move Integration.compare and Integration.compareSchemas to a public utils class
-* [ARROW-415](https://issues.apache.org/jira/browse/ARROW-415) - C++: Add Equals implementation to compare Tables
-* [ARROW-416](https://issues.apache.org/jira/browse/ARROW-416) - C++: Add Equals implementation to compare Columns
-* [ARROW-417](https://issues.apache.org/jira/browse/ARROW-417) - C++: Add Equals implementation to compare ChunkedArrays
-* [ARROW-418](https://issues.apache.org/jira/browse/ARROW-418) - [C++] Consolidate array container and builder code, remove arrow/types
-* [ARROW-419](https://issues.apache.org/jira/browse/ARROW-419) - [C++] Promote util/{status.h, buffer.h, memory-pool.h} to top level of arrow/ source directory
-* [ARROW-423](https://issues.apache.org/jira/browse/ARROW-423) - C++: Define BUILD\_BYPRODUCTS in external project to support non-make CMake generators
-* [ARROW-425](https://issues.apache.org/jira/browse/ARROW-425) - Python: Expose a C function to convert arrow::Table to pyarrow.Table
-* [ARROW-426](https://issues.apache.org/jira/browse/ARROW-426) - Python: Conversion from pyarrow.Array to a Python list
-* [ARROW-427](https://issues.apache.org/jira/browse/ARROW-427) - [C++] Implement dictionary-encoded array container
-* [ARROW-428](https://issues.apache.org/jira/browse/ARROW-428) - [Python] Deserialize from Arrow record batches to pandas in parallel using a thread pool
-* [ARROW-430](https://issues.apache.org/jira/browse/ARROW-430) - Python: Better version handling
-* [ARROW-432](https://issues.apache.org/jira/browse/ARROW-432) - [Python] Avoid unnecessary memory copy in to\_pandas conversion by using low-level pandas internals APIs
-* [ARROW-438](https://issues.apache.org/jira/browse/ARROW-438) - [Python] Concatenate Table instances with equal schemas
-* [ARROW-440](https://issues.apache.org/jira/browse/ARROW-440) - [C++] Support pkg-config
-* [ARROW-441](https://issues.apache.org/jira/browse/ARROW-441) - [Python] Expose Arrow's file and memory map classes as NativeFile subclasses
-* [ARROW-442](https://issues.apache.org/jira/browse/ARROW-442) - [Python] Add public Python API to inspect Parquet file metadata
-* [ARROW-444](https://issues.apache.org/jira/browse/ARROW-444) - [Python] Avoid unnecessary memory copies from use of PyBytes\_\* C APIs
-* [ARROW-449](https://issues.apache.org/jira/browse/ARROW-449) - Python: Conversion from pyarrow.{Table,RecordBatch} to a Python dict
-* [ARROW-450](https://issues.apache.org/jira/browse/ARROW-450) - Python: Fixes for PARQUET-818
-* [ARROW-456](https://issues.apache.org/jira/browse/ARROW-456) - C++: Add jemalloc based MemoryPool
-* [ARROW-457](https://issues.apache.org/jira/browse/ARROW-457) - Python: Better control over memory pool
-* [ARROW-458](https://issues.apache.org/jira/browse/ARROW-458) - [Python] Expose jemalloc MemoryPool
-* [ARROW-461](https://issues.apache.org/jira/browse/ARROW-461) - [Python] Implement conversion between arrow::DictionaryArray and pandas.Categorical
-* [ARROW-463](https://issues.apache.org/jira/browse/ARROW-463) - C++: Support jemalloc 4.x
-* [ARROW-466](https://issues.apache.org/jira/browse/ARROW-466) - C++: ExternalProject for jemalloc
-* [ARROW-467](https://issues.apache.org/jira/browse/ARROW-467) - [Python] Run parquet-cpp unit tests in Travis CI
-* [ARROW-468](https://issues.apache.org/jira/browse/ARROW-468) - Python: Conversion of nested data in pd.DataFrames to/from Arrow structures
-* [ARROW-470](https://issues.apache.org/jira/browse/ARROW-470) - [Python] Add "FileSystem" abstraction to access directories of files in a uniform way
-* [ARROW-471](https://issues.apache.org/jira/browse/ARROW-471) - [Python] Enable ParquetFile to pass down separately-obtained file metadata
-* [ARROW-472](https://issues.apache.org/jira/browse/ARROW-472) - [Python] Expose parquet::{SchemaDescriptor, ColumnDescriptor}::Equals
-* [ARROW-474](https://issues.apache.org/jira/browse/ARROW-474) - Create an Arrow streaming file format
-* [ARROW-475](https://issues.apache.org/jira/browse/ARROW-475) - [Python] High level support for reading directories of Parquet files (as a single Arrow table) from supported file system interfaces
-* [ARROW-476](https://issues.apache.org/jira/browse/ARROW-476) - [Integration] Add integration tests for Binary / Varbytes type
-* [ARROW-477](https://issues.apache.org/jira/browse/ARROW-477) - [Java] Add support for second/microsecond/nanosecond timestamps in-memory and in IPC/JSON layer
-* [ARROW-478](https://issues.apache.org/jira/browse/ARROW-478) - [Python] Accept a PyBytes object in the pyarrow.io.BufferReader ctor
-* [ARROW-479](https://issues.apache.org/jira/browse/ARROW-479) - Python: Test for expected schema in Pandas conversion
-* [ARROW-484](https://issues.apache.org/jira/browse/ARROW-484) - Add more detail to the README about what technology can be found in the Arrow implementations
-* [ARROW-485](https://issues.apache.org/jira/browse/ARROW-485) - [Java] Users are required to initialize VariableLengthVectors.offsetVector before calling VariableLengthVectors.mutator.getSafe
-* [ARROW-490](https://issues.apache.org/jira/browse/ARROW-490) - Python: Update manylinux1 build scripts
-* [ARROW-495](https://issues.apache.org/jira/browse/ARROW-495) - [C++] Add C++ implementation of streaming serialized format
-* [ARROW-497](https://issues.apache.org/jira/browse/ARROW-497) - [Java] Integration test harness for streaming format
-* [ARROW-498](https://issues.apache.org/jira/browse/ARROW-498) - [C++] Integration test harness for streaming format
-* [ARROW-503](https://issues.apache.org/jira/browse/ARROW-503) - [Python] Interface to streaming binary format
-* [ARROW-506](https://issues.apache.org/jira/browse/ARROW-506) - Implement Arrow Echo server for integration testing
-* [ARROW-508](https://issues.apache.org/jira/browse/ARROW-508) - [C++] Make file/memory-mapped file interfaces threadsafe
-* [ARROW-509](https://issues.apache.org/jira/browse/ARROW-509) - [Python] Add support for PARQUET-835 (parallel column reads)
-* [ARROW-512](https://issues.apache.org/jira/browse/ARROW-512) - C++: Add method to check for primitive types
-* [ARROW-514](https://issues.apache.org/jira/browse/ARROW-514) - [Python] Accept pyarrow.io.Buffer as input to StreamReader, FileReader classes
-* [ARROW-515](https://issues.apache.org/jira/browse/ARROW-515) - [Python] Add StreamReader/FileReader methods that read all record batches as a Table
-* [ARROW-521](https://issues.apache.org/jira/browse/ARROW-521) - [C++/Python] Track peak memory use in default MemoryPool
-* [ARROW-524](https://issues.apache.org/jira/browse/ARROW-524) - [Java] provide APIs to access nested vectors and buffers
-* [ARROW-525](https://issues.apache.org/jira/browse/ARROW-525) - Python: Add more documentation to the package
-* [ARROW-527](https://issues.apache.org/jira/browse/ARROW-527) - clean drill-module.conf file
-* [ARROW-529](https://issues.apache.org/jira/browse/ARROW-529) - Python: Add jemalloc and Python 3.6 to manylinux1 build
-* [ARROW-531](https://issues.apache.org/jira/browse/ARROW-531) - Python: Document jemalloc, extend Pandas section, add Getting Involved
-* [ARROW-538](https://issues.apache.org/jira/browse/ARROW-538) - [C++] Set up AddressSanitizer (ASAN) builds
-* [ARROW-546](https://issues.apache.org/jira/browse/ARROW-546) - Python: Account for changes in PARQUET-867
-* [ARROW-547](https://issues.apache.org/jira/browse/ARROW-547) - [Python] Expose Array::Slice and RecordBatch::Slice
-* [ARROW-553](https://issues.apache.org/jira/browse/ARROW-553) - C++: Faster valid bitmap building
-* [ARROW-558](https://issues.apache.org/jira/browse/ARROW-558) - Add KEYS files
-
-
-
-# Apache Arrow 0.1.0 (2016-10-10)
-
-## New Features and Improvements
-
-* [ARROW-1](https://issues.apache.org/jira/browse/ARROW-1) - Import Initial Codebase
-* [ARROW-2](https://issues.apache.org/jira/browse/ARROW-2) - Post Simple Website
-* [ARROW-3](https://issues.apache.org/jira/browse/ARROW-3) - Post Initial Arrow Format Spec
-* [ARROW-4](https://issues.apache.org/jira/browse/ARROW-4) - Initial Arrow CPP Implementation
-* [ARROW-7](https://issues.apache.org/jira/browse/ARROW-7) - Add Python library build toolchain
-* [ARROW-8](https://issues.apache.org/jira/browse/ARROW-8) - Set up Travis CI
-* [ARROW-9](https://issues.apache.org/jira/browse/ARROW-9) - Rename some unchanged "Drill" to "Arrow"
-* [ARROW-10](https://issues.apache.org/jira/browse/ARROW-10) - Fix mismatch of javadoc names and method parameters
-* [ARROW-11](https://issues.apache.org/jira/browse/ARROW-11) - Mirror JIRA activity to [email protected]
-* [ARROW-13](https://issues.apache.org/jira/browse/ARROW-13) - Add PR merge tool similar to that used in Parquet
-* [ARROW-14](https://issues.apache.org/jira/browse/ARROW-14) - Add JIRA components
-* [ARROW-15](https://issues.apache.org/jira/browse/ARROW-15) - Fix a naming typo for memory.AllocationManager.AllocationOutcome
-* [ARROW-19](https://issues.apache.org/jira/browse/ARROW-19) - C++: Externalize memory allocations and add a MemoryPool abstract interface to builder classes
-* [ARROW-20](https://issues.apache.org/jira/browse/ARROW-20) - C++: Add null count member to Array containers, remove nullable member
-* [ARROW-21](https://issues.apache.org/jira/browse/ARROW-21) - C++: Add in-memory schema metadata container
-* [ARROW-22](https://issues.apache.org/jira/browse/ARROW-22) - C++: Add schema adapter routines for converting flat Parquet schemas to in-memory Arrow schemas
-* [ARROW-23](https://issues.apache.org/jira/browse/ARROW-23) - C++: Add logical "Column" container for chunked data
-* [ARROW-24](https://issues.apache.org/jira/browse/ARROW-24) - C++: Add logical "Table" container
-* [ARROW-26](https://issues.apache.org/jira/browse/ARROW-26) - C++: Add developer instructions for building parquet-cpp integration
-* [ARROW-28](https://issues.apache.org/jira/browse/ARROW-28) - C++: Add google/benchmark to the 3rd-party build toolchain
-* [ARROW-30](https://issues.apache.org/jira/browse/ARROW-30) - Python: pandas/NumPy to/from Arrow conversion routines
-* [ARROW-31](https://issues.apache.org/jira/browse/ARROW-31) - Python: basic PyList <-\> Arrow marshaling code
-* [ARROW-35](https://issues.apache.org/jira/browse/ARROW-35) - Add a short call-to-action / how-to-get-involved to the main README.md
-* [ARROW-37](https://issues.apache.org/jira/browse/ARROW-37) - C++: Represent boolean array data in bit-packed form
-* [ARROW-42](https://issues.apache.org/jira/browse/ARROW-42) - Python: Add to Travis CI build
-* [ARROW-43](https://issues.apache.org/jira/browse/ARROW-43) - Python: Add rudimentary console \_\_repr\_\_ for array types
-* [ARROW-44](https://issues.apache.org/jira/browse/ARROW-44) - Python: Implement basic object model for scalar values (i.e. results of arrow\_arr[i])
-* [ARROW-48](https://issues.apache.org/jira/browse/ARROW-48) - Python: Add Schema object wrapper
-* [ARROW-49](https://issues.apache.org/jira/browse/ARROW-49) - Python: Add Column and Table wrapper interface
-* [ARROW-50](https://issues.apache.org/jira/browse/ARROW-50) - C++: Enable library builds for 3rd-party users without having to build thirdparty googletest
-* [ARROW-53](https://issues.apache.org/jira/browse/ARROW-53) - Python: Fix RPATH and add source installation instructions
-* [ARROW-54](https://issues.apache.org/jira/browse/ARROW-54) - Python: rename package to "pyarrow"
-* [ARROW-56](https://issues.apache.org/jira/browse/ARROW-56) - Format: Specify LSB bit ordering in bit arrays
-* [ARROW-57](https://issues.apache.org/jira/browse/ARROW-57) - Format: Draft data headers IDL for data interchange
-* [ARROW-58](https://issues.apache.org/jira/browse/ARROW-58) - Format: Draft type metadata ("schemas") IDL
-* [ARROW-59](https://issues.apache.org/jira/browse/ARROW-59) - Python: Boolean data support for builtin data structures
-* [ARROW-60](https://issues.apache.org/jira/browse/ARROW-60) - C++: Struct type builder API
-* [ARROW-64](https://issues.apache.org/jira/browse/ARROW-64) - Add zsh support to C++ build scripts
-* [ARROW-66](https://issues.apache.org/jira/browse/ARROW-66) - Maybe some missing steps in installation guide
-* [ARROW-67](https://issues.apache.org/jira/browse/ARROW-67) - C++: Draft type metadata conversion to/from IPC representation
-* [ARROW-68](https://issues.apache.org/jira/browse/ARROW-68) - Update setup\_build\_env and third-party script to be more user-friendly
-* [ARROW-70](https://issues.apache.org/jira/browse/ARROW-70) - C++: Add "lite" DCHECK macros used in parquet-cpp
-* [ARROW-71](https://issues.apache.org/jira/browse/ARROW-71) - C++: Add script to run clang-tidy on codebase
-* [ARROW-73](https://issues.apache.org/jira/browse/ARROW-73) - Support CMake 2.8
-* [ARROW-76](https://issues.apache.org/jira/browse/ARROW-76) - Revise format document to include null count, defer non-nullable arrays to the domain of metadata
-* [ARROW-78](https://issues.apache.org/jira/browse/ARROW-78) - C++: Add constructor for DecimalType
-* [ARROW-79](https://issues.apache.org/jira/browse/ARROW-79) - Python: Add benchmarks
-* [ARROW-82](https://issues.apache.org/jira/browse/ARROW-82) - C++: Implement IPC exchange for List types
-* [ARROW-85](https://issues.apache.org/jira/browse/ARROW-85) - C++: memcmp can be avoided in Equal when comparing with the same Buffer
-* [ARROW-86](https://issues.apache.org/jira/browse/ARROW-86) - Python: Implement zero-copy Arrow-to-Pandas conversion
-* [ARROW-87](https://issues.apache.org/jira/browse/ARROW-87) - Implement Decimal schema conversion for all ways supported in Parquet
-* [ARROW-89](https://issues.apache.org/jira/browse/ARROW-89) - Python: Add benchmarks for Arrow<-\>Pandas conversion
-* [ARROW-90](https://issues.apache.org/jira/browse/ARROW-90) - Apache Arrow cpp code does not support the POWER architecture
-* [ARROW-91](https://issues.apache.org/jira/browse/ARROW-91) - C++: First draft of an adapter class for parquet-cpp's ParquetFileReader that produces Arrow table/row batch objects
-* [ARROW-92](https://issues.apache.org/jira/browse/ARROW-92) - C++: Arrow to Parquet Schema conversion
-* [ARROW-100](https://issues.apache.org/jira/browse/ARROW-100) - [C++] Computing RowBatch size
-* [ARROW-101](https://issues.apache.org/jira/browse/ARROW-101) - Fix java warnings emitted by java compiler
-* [ARROW-102](https://issues.apache.org/jira/browse/ARROW-102) - travis-ci support for java project
-* [ARROW-106](https://issues.apache.org/jira/browse/ARROW-106) - Add IPC round trip for string types (string, char, varchar, binary)
-* [ARROW-107](https://issues.apache.org/jira/browse/ARROW-107) - [C++] Add IPC round trip for struct types
-* [ARROW-190](https://issues.apache.org/jira/browse/ARROW-190) - Python: Provide installable sdist builds
-* [ARROW-196](https://issues.apache.org/jira/browse/ARROW-196) - [C++] Add conda dev recipe for libarrow and libarrow\_parquet
-* [ARROW-197](https://issues.apache.org/jira/browse/ARROW-197) - [Python] Add conda dev recipe for pyarrow
-* [ARROW-199](https://issues.apache.org/jira/browse/ARROW-199) - [C++] Refine third party dependency
-* [ARROW-201](https://issues.apache.org/jira/browse/ARROW-201) - C++: Initial ParquetWriter implementation
-* [ARROW-203](https://issues.apache.org/jira/browse/ARROW-203) - Python: Basic filename based Parquet read/write
-* [ARROW-204](https://issues.apache.org/jira/browse/ARROW-204) - [Python] Automate uploading conda build artifacts for libarrow and pyarrow
-* [ARROW-206](https://issues.apache.org/jira/browse/ARROW-206) - [C++] Expose an equality API for arrays that compares a range of slots on two arrays
-* [ARROW-207](https://issues.apache.org/jira/browse/ARROW-207) - Extend BufferAllocator interface to allow decorators around BufferAllocator
-* [ARROW-212](https://issues.apache.org/jira/browse/ARROW-212) - [C++] Clarify the fact that PrimitiveArray is now an abstract class
-* [ARROW-213](https://issues.apache.org/jira/browse/ARROW-213) - Exposing static arrow build
-* [ARROW-214](https://issues.apache.org/jira/browse/ARROW-214) - C++: Add String support to Parquet I/O
-* [ARROW-215](https://issues.apache.org/jira/browse/ARROW-215) - C++: Support other integer types in Parquet I/O
-* [ARROW-218](https://issues.apache.org/jira/browse/ARROW-218) - Add option to use GitHub API token via environment variable when merging PRs
-* [ARROW-222](https://issues.apache.org/jira/browse/ARROW-222) - [C++] Create prototype file-like interface to HDFS (via libhdfs) and begin defining more general IO interface for Arrow data adapters
-* [ARROW-233](https://issues.apache.org/jira/browse/ARROW-233) - [C++] Add visibility defines for limiting shared library symbol visibility
-* [ARROW-234](https://issues.apache.org/jira/browse/ARROW-234) - [C++] Build with libhdfs support in arrow\_io in conda builds
-* [ARROW-236](https://issues.apache.org/jira/browse/ARROW-236) - [Python] Enable Parquet read/write to work with HDFS file objects
-* [ARROW-237](https://issues.apache.org/jira/browse/ARROW-237) - [C++] Create Arrow specializations of Parquet allocator and read interfaces
-* [ARROW-238](https://issues.apache.org/jira/browse/ARROW-238) - C++: InternalMemoryPool::Free() should throw an error when there is insufficient allocated memory
-* [ARROW-242](https://issues.apache.org/jira/browse/ARROW-242) - C++/Python: Support Timestamp Data Type
-* [ARROW-245](https://issues.apache.org/jira/browse/ARROW-245) - [Format] Clarify Arrow's relationship with big endian platforms
-* [ARROW-251](https://issues.apache.org/jira/browse/ARROW-251) - [C++] Expose APIs for getting code and message of the status
-* [ARROW-252](https://issues.apache.org/jira/browse/ARROW-252) - Add implementation guidelines to the documentation
-* [ARROW-253](https://issues.apache.org/jira/browse/ARROW-253) - Int types should only have width of 8\*2^n (8, 16, 32, 64)
-* [ARROW-254](https://issues.apache.org/jira/browse/ARROW-254) - Remove Bit type as it is redundant with boolean
-* [ARROW-255](https://issues.apache.org/jira/browse/ARROW-255) - Finalize Dictionary representation
-* [ARROW-256](https://issues.apache.org/jira/browse/ARROW-256) - Add versioning to the arrow spec.
-* [ARROW-257](https://issues.apache.org/jira/browse/ARROW-257) - Add a typeids Vector to Union type
-* [ARROW-262](https://issues.apache.org/jira/browse/ARROW-262) - [Format] Add a new format document for metadata and logical types for messaging and IPC / on-wire/file representations
-* [ARROW-264](https://issues.apache.org/jira/browse/ARROW-264) - Create an Arrow File format
-* [ARROW-267](https://issues.apache.org/jira/browse/ARROW-267) - [C++] C++ implementation of file-like layout for RPC / IPC
-* [ARROW-270](https://issues.apache.org/jira/browse/ARROW-270) - [Format] Define more generic Interval logical type
-* [ARROW-271](https://issues.apache.org/jira/browse/ARROW-271) - Update Field structure to be more explicit
-* [ARROW-272](https://issues.apache.org/jira/browse/ARROW-272) - Arrow release 0.1
-* [ARROW-279](https://issues.apache.org/jira/browse/ARROW-279) - Rename vector module to arrow-vector for consistency
-* [ARROW-280](https://issues.apache.org/jira/browse/ARROW-280) - [C++] Consolidate file and shared memory IO interfaces
-* [ARROW-282](https://issues.apache.org/jira/browse/ARROW-282) - Make parquet-cpp an optional dependency of pyarrow
-* [ARROW-285](https://issues.apache.org/jira/browse/ARROW-285) - Allow for custom flatc compiler
-* [ARROW-286](https://issues.apache.org/jira/browse/ARROW-286) - Build thirdparty dependencies in parallel
-* [ARROW-289](https://issues.apache.org/jira/browse/ARROW-289) - Install test-util.h
-* [ARROW-290](https://issues.apache.org/jira/browse/ARROW-290) - Specialize alloc() in ArrowBuf
-* [ARROW-291](https://issues.apache.org/jira/browse/ARROW-291) - [Python] Update NOTICE file for Python codebase
-* [ARROW-292](https://issues.apache.org/jira/browse/ARROW-292) - [Java] Upgrade Netty to 4.041
-* [ARROW-293](https://issues.apache.org/jira/browse/ARROW-293) - [C++] Implementations of IO interfaces for operating system files
-* [ARROW-296](https://issues.apache.org/jira/browse/ARROW-296) - [C++] Remove arrow\_parquet C++ module and related parts of build system
-* [ARROW-298](https://issues.apache.org/jira/browse/ARROW-298) - Create release scripts
-* [ARROW-299](https://issues.apache.org/jira/browse/ARROW-299) - Use absolute namespace in macros
-* [ARROW-301](https://issues.apache.org/jira/browse/ARROW-301) - [Format] Add some form of user field metadata to IPC schemas
-* [ARROW-302](https://issues.apache.org/jira/browse/ARROW-302) - [Python] Add support to use the Arrow file format with file-like objects
-* [ARROW-305](https://issues.apache.org/jira/browse/ARROW-305) - Add compression and use\_dictionary options to Parquet interface
-* [ARROW-306](https://issues.apache.org/jira/browse/ARROW-306) - Add option to pass cmake arguments via environment variable
-* [ARROW-315](https://issues.apache.org/jira/browse/ARROW-315) - Finalize timestamp type
-* [ARROW-318](https://issues.apache.org/jira/browse/ARROW-318) - [Python] Revise README to reflect current state of project
-* [ARROW-319](https://issues.apache.org/jira/browse/ARROW-319) - Add canonical Arrow Schema json representation
-* [ARROW-324](https://issues.apache.org/jira/browse/ARROW-324) - Update arrow metadata diagram
-* [ARROW-325](https://issues.apache.org/jira/browse/ARROW-325) - Make TestArrowFile not dependent on timezone
-
-
-## Bug Fixes
-
-* [ARROW-5](https://issues.apache.org/jira/browse/ARROW-5) - Error when running maven install
-* [ARROW-16](https://issues.apache.org/jira/browse/ARROW-16) - Building cpp issues on XCode 7.2.1
-* [ARROW-17](https://issues.apache.org/jira/browse/ARROW-17) - Set some vector fields to default access level for Drill compatibility
-* [ARROW-18](https://issues.apache.org/jira/browse/ARROW-18) - Fix bug with decimal precision and scale
-* [ARROW-36](https://issues.apache.org/jira/browse/ARROW-36) - Remove fixVersions from patch tool (until we have them)
-* [ARROW-46](https://issues.apache.org/jira/browse/ARROW-46) - Port DRILL-4410 to Arrow
-* [ARROW-51](https://issues.apache.org/jira/browse/ARROW-51) - Move ValueVector test from Drill project
-* [ARROW-55](https://issues.apache.org/jira/browse/ARROW-55) - Python: fix legacy Python (2.7) tests and add to Travis CI
-* [ARROW-62](https://issues.apache.org/jira/browse/ARROW-62) - Format: Are the null bits 0 or 1 for null values?
-* [ARROW-63](https://issues.apache.org/jira/browse/ARROW-63) - C++: ctest fails if Python 3 is the active Python interpreter
-* [ARROW-65](https://issues.apache.org/jira/browse/ARROW-65) - Python: FindPythonLibsNew does not work in a virtualenv
-* [ARROW-69](https://issues.apache.org/jira/browse/ARROW-69) - Change permissions for assignable users
-* [ARROW-72](https://issues.apache.org/jira/browse/ARROW-72) - FindParquet searches for non-existent header
-* [ARROW-75](https://issues.apache.org/jira/browse/ARROW-75) - C++: Fix handling of empty strings
-* [ARROW-77](https://issues.apache.org/jira/browse/ARROW-77) - C++: conform null bit interpretation to match ARROW-62
-* [ARROW-80](https://issues.apache.org/jira/browse/ARROW-80) - Segmentation fault on len(Array) for empty arrays
-* [ARROW-83](https://issues.apache.org/jira/browse/ARROW-83) - Add basic test infrastructure for DecimalType
-* [ARROW-84](https://issues.apache.org/jira/browse/ARROW-84) - C++: separate test codes
-* [ARROW-88](https://issues.apache.org/jira/browse/ARROW-88) - C++: Refactor given PARQUET-572
-* [ARROW-93](https://issues.apache.org/jira/browse/ARROW-93) - XCode 7.3 breaks builds
-* [ARROW-94](https://issues.apache.org/jira/browse/ARROW-94) - Expand list example to clarify null vs empty list
-* [ARROW-103](https://issues.apache.org/jira/browse/ARROW-103) - Missing patterns from .gitignore
-* [ARROW-104](https://issues.apache.org/jira/browse/ARROW-104) - Update Layout.md based on discussion on the mailing list
-* [ARROW-105](https://issues.apache.org/jira/browse/ARROW-105) - Unit tests fail if assertions are disabled
-* [ARROW-113](https://issues.apache.org/jira/browse/ARROW-113) - TestValueVector test fails if cannot allocate 2GB of memory
-* [ARROW-185](https://issues.apache.org/jira/browse/ARROW-185) - [C++] Make sure alignment and memory padding conform to spec
-* [ARROW-188](https://issues.apache.org/jira/browse/ARROW-188) - Python: Add numpy as install requirement
-* [ARROW-193](https://issues.apache.org/jira/browse/ARROW-193) - For the instruction, typos "int his" should be "in this"
-* [ARROW-194](https://issues.apache.org/jira/browse/ARROW-194) - C++: Allow read-only memory mapped source
-* [ARROW-200](https://issues.apache.org/jira/browse/ARROW-200) - [Python] Convert Values String looks like it has incorrect error handling
-* [ARROW-205](https://issues.apache.org/jira/browse/ARROW-205) - Builds failing on master branch with apt-get error
-* [ARROW-209](https://issues.apache.org/jira/browse/ARROW-209) - [C++] Broken builds: llvm.org apt repos are unavailable
-* [ARROW-210](https://issues.apache.org/jira/browse/ARROW-210) - [C++] Tidy up the type system a little bit
-* [ARROW-211](https://issues.apache.org/jira/browse/ARROW-211) - Several typos/errors in Layout.md examples
-* [ARROW-217](https://issues.apache.org/jira/browse/ARROW-217) - Fix Travis w.r.t conda 4.1.0 changes
-* [ARROW-219](https://issues.apache.org/jira/browse/ARROW-219) - [C++] Passed CMAKE\_CXX\_FLAGS are being dropped, fix compiler warnings
-* [ARROW-223](https://issues.apache.org/jira/browse/ARROW-223) - Do not link against libpython
-* [ARROW-225](https://issues.apache.org/jira/browse/ARROW-225) - [C++/Python] master Travis CI build is broken
-* [ARROW-244](https://issues.apache.org/jira/browse/ARROW-244) - [C++] Some global APIs of IPC module should be visible to the outside
-* [ARROW-246](https://issues.apache.org/jira/browse/ARROW-246) - [Java] UnionVector doesn't call allocateNew() when creating its vectorType
-* [ARROW-247](https://issues.apache.org/jira/browse/ARROW-247) - [C++] Missing explicit destructor in RowBatchReader causes an incomplete type error
-* [ARROW-250](https://issues.apache.org/jira/browse/ARROW-250) - Fix for ARROW-246 may cause memory leaks
-* [ARROW-259](https://issues.apache.org/jira/browse/ARROW-259) - Use flatbuffer fields in java implementation
-* [ARROW-260](https://issues.apache.org/jira/browse/ARROW-260) - TestValueVector.testFixedVectorReallocation and testVariableVectorReallocation are flaky
-* [ARROW-265](https://issues.apache.org/jira/browse/ARROW-265) - Negative decimal values have wrong padding
-* [ARROW-266](https://issues.apache.org/jira/browse/ARROW-266) - [C++] Fix the broken build
-* [ARROW-274](https://issues.apache.org/jira/browse/ARROW-274) - Make the MapVector nullable
-* [ARROW-277](https://issues.apache.org/jira/browse/ARROW-277) - Flatbuf serialization fails for Timestamp type
-* [ARROW-278](https://issues.apache.org/jira/browse/ARROW-278) - [Format] Struct type name consistency in implementations and metadata
-* [ARROW-283](https://issues.apache.org/jira/browse/ARROW-283) - [C++] Update arrow\_parquet to account for API changes in PARQUET-573
-* [ARROW-284](https://issues.apache.org/jira/browse/ARROW-284) - [C++] Triage builds by disabling Arrow-Parquet module
-* [ARROW-287](https://issues.apache.org/jira/browse/ARROW-287) - [Java] Make nullable vectors use a BitVector instead of UInt1Vector for bits
-* [ARROW-297](https://issues.apache.org/jira/browse/ARROW-297) - Fix Arrow pom for release
-* [ARROW-304](https://issues.apache.org/jira/browse/ARROW-304) - NullableMapReaderImpl.isSet() always returns true
-* [ARROW-308](https://issues.apache.org/jira/browse/ARROW-308) - UnionListWriter.setPosition() should not call startList()
-* [ARROW-309](https://issues.apache.org/jira/browse/ARROW-309) - Types.getMinorTypeForArrowType() does not work for Union type
-* [ARROW-313](https://issues.apache.org/jira/browse/ARROW-313) - XCode 8.0 breaks builds
-* [ARROW-314](https://issues.apache.org/jira/browse/ARROW-314) - JSONScalar is unnecessary and unused.
-* [ARROW-320](https://issues.apache.org/jira/browse/ARROW-320) - ComplexCopier.copy(FieldReader, FieldWriter) should not start a list if reader is not set
-* [ARROW-321](https://issues.apache.org/jira/browse/ARROW-321) - Fix Arrow licences
-* [ARROW-855](https://issues.apache.org/jira/browse/ARROW-855) - Arrow Memory Leak
-
-
+
+# Apache Arrow 5.0.0 (2021-07-22)
+
+## Bug Fixes
+
+* [ARROW-6189](https://issues.apache.org/jira/browse/ARROW-6189) - [Rust] [Parquet] Plain encoded boolean column chunks limited to 2048 values
+* [ARROW-6312](https://issues.apache.org/jira/browse/ARROW-6312) - [C++] Declare required Libs.private in arrow.pc package config
+* [ARROW-7948](https://issues.apache.org/jira/browse/ARROW-7948) - [Go][Integration] Decimal integration failures
+* [ARROW-9594](https://issues.apache.org/jira/browse/ARROW-9594) - [Python] DictionaryArray.to\_numpy does not correctly convert null indexes to null values
+* [ARROW-10910](https://issues.apache.org/jira/browse/ARROW-10910) - [Python] Segmentation Fault when None given to read\_table with legacy dataset
+* [ARROW-10958](https://issues.apache.org/jira/browse/ARROW-10958) - [GLib] "Nested data conversions not implemented" through glib, but not through pyarrow
+* [ARROW-11077](https://issues.apache.org/jira/browse/ARROW-11077) - [Rust] ParquetFileArrowReader panics when trying to read nested list
+* [ARROW-11146](https://issues.apache.org/jira/browse/ARROW-11146) - [CI][Python] Failing conda-python-3.8-jpype Nightly Build
+* [ARROW-11161](https://issues.apache.org/jira/browse/ARROW-11161) - [Python][C++] S3Filesystem: file Content-Type not set correctly?
+* [ARROW-11633](https://issues.apache.org/jira/browse/ARROW-11633) - [CI] [Documentation] Maven default skin not found
+* [ARROW-11780](https://issues.apache.org/jira/browse/ARROW-11780) - [C++][Python] StructArray.from\_arrays() crashes Python interpreter
+* [ARROW-11908](https://issues.apache.org/jira/browse/ARROW-11908) - [Rust] Intermittent Flight integration test failures
+* [ARROW-12007](https://issues.apache.org/jira/browse/ARROW-12007) - [C++] Loading parquet file returns "Invalid UTF8 payload" error
+* [ARROW-12055](https://issues.apache.org/jira/browse/ARROW-12055) - [R] is.na() evaluates to FALSE on Arrow NaN values
+* [ARROW-12096](https://issues.apache.org/jira/browse/ARROW-12096) - [Python][C++] Pyarrow Parquet reader overflows INT96 timestamps when converting to Arrow Array (timestamp[ns])
+* [ARROW-12122](https://issues.apache.org/jira/browse/ARROW-12122) - [Python] Cannot install via pip on M1 Mac
+* [ARROW-12142](https://issues.apache.org/jira/browse/ARROW-12142) - [Python] undefined symbol: \_ZN5arrow6StatusC1ENS\_10StatusCodeERKNSt7\_\_cxx1112basic\_stringIcSt11char\_traitsIcESaIcEEE
+* [ARROW-12150](https://issues.apache.org/jira/browse/ARROW-12150) - [Python] Bad type inference of mixed-precision Decimals
+* [ARROW-12232](https://issues.apache.org/jira/browse/ARROW-12232) - [Rust][Datafusion] Error with CAST: Unsupported SQL type Time
+* [ARROW-12240](https://issues.apache.org/jira/browse/ARROW-12240) - [Python] invalid-offsetof warning from apple clang-12
+* [ARROW-12377](https://issues.apache.org/jira/browse/ARROW-12377) - [Doc][Java] Java doc build broken
+* [ARROW-12407](https://issues.apache.org/jira/browse/ARROW-12407) - [Python] Deprecation warning when building PyArrow
+* [ARROW-12431](https://issues.apache.org/jira/browse/ARROW-12431) - [Python] pa.array mask inverted when type is binary and value to be converted is numpy array
+* [ARROW-12472](https://issues.apache.org/jira/browse/ARROW-12472) - [Python] read\_table fails when passing a PEP519 filesystem object
+* [ARROW-12482](https://issues.apache.org/jira/browse/ARROW-12482) - [Doc][Python] Mention CSVStreamingReader pitfalls with type inference
+* [ARROW-12491](https://issues.apache.org/jira/browse/ARROW-12491) - [Packaging] Required dependency on LZ4 \>= 1.8 missing from CentOS RPM packages
+* [ARROW-12503](https://issues.apache.org/jira/browse/ARROW-12503) - [C++] Ensure using "lib/" for jemalloc's library directory
+* [ARROW-12508](https://issues.apache.org/jira/browse/ARROW-12508) - [R] expect\_as\_vector implementation causes test failure on R <= 3.3 and variables defined outside of test\_that break build when arrow is not installed
+* [ARROW-12543](https://issues.apache.org/jira/browse/ARROW-12543) - [CI][Python] Failing conda-python-3.9 Nightly Build
+* [ARROW-12568](https://issues.apache.org/jira/browse/ARROW-12568) - [Python][C++] Segfault when casting a sliced ListArray of int64 in v4.0.0
+* [ARROW-12569](https://issues.apache.org/jira/browse/ARROW-12569) - [R] [CI] Run revdep in CI
+* [ARROW-12570](https://issues.apache.org/jira/browse/ARROW-12570) - [JS] Fix issues that blocked the v4.0.0 release
+* [ARROW-12579](https://issues.apache.org/jira/browse/ARROW-12579) - [Python] Pyarrow 4.0.0 dependency numpy 1.19.4 throws errors on Apple silicon/M1 compilation
+* [ARROW-12589](https://issues.apache.org/jira/browse/ARROW-12589) - [C++] Compiling on Windows doesn't work when -DARROW\_WITH\_BACKTRACE=OFF
+* [ARROW-12601](https://issues.apache.org/jira/browse/ARROW-12601) - [R][Packaging] Fix pkg-config check in r/configure
+* [ARROW-12604](https://issues.apache.org/jira/browse/ARROW-12604) - [R][Packaging] Dataset, Parquet off in autobrew and CRAN Mac builds
+* [ARROW-12605](https://issues.apache.org/jira/browse/ARROW-12605) - [Documentation] Repair line numbers in dataset.rst
+* [ARROW-12606](https://issues.apache.org/jira/browse/ARROW-12606) - [C++] Quantile and Mode functions failing on arrays with offset
+* [ARROW-12610](https://issues.apache.org/jira/browse/ARROW-12610) - [C++] Skip TestS3FSGeneric TestDeleteDir and TestDeleteDirContents on Windows as they are flaky
+* [ARROW-12611](https://issues.apache.org/jira/browse/ARROW-12611) - [CI][Python] Nightly test-conda-python-pandas-0.24 is failing due to numpy compat issue
+* [ARROW-12613](https://issues.apache.org/jira/browse/ARROW-12613) - [Python] AttributeError when comparing a Scalar with None
+* [ARROW-12614](https://issues.apache.org/jira/browse/ARROW-12614) - [C++][Compute] Revert support for Tables in ExecuteScalarExpression
+* [ARROW-12617](https://issues.apache.org/jira/browse/ARROW-12617) - [Python] pyarrow.orc.write\_table signature reverses that of pyarrow.parquet.write\_table
+* [ARROW-12620](https://issues.apache.org/jira/browse/ARROW-12620) - [C++] Dataset writing can only include projected columns if input columns are also included
+* [ARROW-12622](https://issues.apache.org/jira/browse/ARROW-12622) - [Python] Segfault when reading CSV inside Flight server
+* [ARROW-12630](https://issues.apache.org/jira/browse/ARROW-12630) - [Dev][Integration] conda-integration docker build fails
+* [ARROW-12639](https://issues.apache.org/jira/browse/ARROW-12639) - [CI][Archery] Archery build fails to create branch
+* [ARROW-12640](https://issues.apache.org/jira/browse/ARROW-12640) - [C++] Fix errors from VS 2019 in cpp/src/parquet/types.h
+* [ARROW-12642](https://issues.apache.org/jira/browse/ARROW-12642) - [R] LIBARROW\_MINIMAL, LIBARROW\_DOWNLOAD, NOT\_CRAN env vars should not be case-sensitive
+* [ARROW-12644](https://issues.apache.org/jira/browse/ARROW-12644) - [C++][Dataset] Support reading date/time-partitioned datasets accounting for URL encoding (Spark)
+* [ARROW-12646](https://issues.apache.org/jira/browse/ARROW-12646) - [C++][CI][Packaging][Python] Bump vcpkg version to its latest release
+* [ARROW-12663](https://issues.apache.org/jira/browse/ARROW-12663) - [C++] segfault when arrow header is compiled with nvcc 11.2
+* [ARROW-12668](https://issues.apache.org/jira/browse/ARROW-12668) - [C++][Dataset] CountRows occasionally segfaulting
+* [ARROW-12670](https://issues.apache.org/jira/browse/ARROW-12670) - [C++] extract\_regex gives bizarre behavior after nulls or non-matches
+* [ARROW-12672](https://issues.apache.org/jira/browse/ARROW-12672) - [C++] Segfault casting result of "fill\_null()" (not bitmap but unknown null\_count)
+* [ARROW-12679](https://issues.apache.org/jira/browse/ARROW-12679) - [Java] JDBC adapter does not preserve SQL-nullability
+* [ARROW-12684](https://issues.apache.org/jira/browse/ARROW-12684) - [Go][Flight] Fix nil dereference in error case
+* [ARROW-12708](https://issues.apache.org/jira/browse/ARROW-12708) - [C++] Valgrind errors when calling negate\_checked
+* [ARROW-12729](https://issues.apache.org/jira/browse/ARROW-12729) - [R] Fix length method for Table, RecordBatch
+* [ARROW-12746](https://issues.apache.org/jira/browse/ARROW-12746) - [Go][Flight] Client Auth handler overwrites outgoing metadata
+* [ARROW-12756](https://issues.apache.org/jira/browse/ARROW-12756) - [C++] MSVC build fails with latest gtest from vcpkg
+* [ARROW-12757](https://issues.apache.org/jira/browse/ARROW-12757) - [Dev][Archery] Warning about RUST variable in "archery docker run"
+* [ARROW-12762](https://issues.apache.org/jira/browse/ARROW-12762) - [Python] ListType doesn't preserve field name after pickle and unpickle
+* [ARROW-12769](https://issues.apache.org/jira/browse/ARROW-12769) - [Python] Negative out of range slices yield invalid arrays
+* [ARROW-12771](https://issues.apache.org/jira/browse/ARROW-12771) - [C++] Arrow compute hash\_count skips following chunked arrays in streaming execution
+* [ARROW-12772](https://issues.apache.org/jira/browse/ARROW-12772) - [CI] Merge script test fails due to missing dependency
+* [ARROW-12773](https://issues.apache.org/jira/browse/ARROW-12773) - [Docs] Clarify Java support for ORC and Parquet via JNI bindings
+* [ARROW-12774](https://issues.apache.org/jira/browse/ARROW-12774) - [C++][Compute] replace\_substring\_regex() creates invalid arrays =\> crash
+* [ARROW-12776](https://issues.apache.org/jira/browse/ARROW-12776) - [Archery][Integration] Fix decimal case generation in write\_js\_test\_json
+* [ARROW-12779](https://issues.apache.org/jira/browse/ARROW-12779) - [Python][FlightRPC] Flight server segfaults with certain data
+* [ARROW-12780](https://issues.apache.org/jira/browse/ARROW-12780) - [CI][C++] MinGW builds failing when trying to build Gandiva
+* [ARROW-12790](https://issues.apache.org/jira/browse/ARROW-12790) - [Python] Cannot read from HDFS with blanks in path names
+* [ARROW-12793](https://issues.apache.org/jira/browse/ARROW-12793) - [Python] PYARROW\_BUILD\_TYPE=Debug does not work correctly
+* [ARROW-12797](https://issues.apache.org/jira/browse/ARROW-12797) - [JS] Update readme with new links and remove outdated examples
+* [ARROW-12798](https://issues.apache.org/jira/browse/ARROW-12798) - [JS] Use == null Comparison
+* [ARROW-12799](https://issues.apache.org/jira/browse/ARROW-12799) - [JS] Use Nullish Coalescing Operator (??) For Defaults
+* [ARROW-12804](https://issues.apache.org/jira/browse/ARROW-12804) - [C++] Array methods IsNull and IsValid are confused for NullType
+* [ARROW-12807](https://issues.apache.org/jira/browse/ARROW-12807) - [C++] Fix merge conflicts with Future refactor/async IPC
+* [ARROW-12838](https://issues.apache.org/jira/browse/ARROW-12838) - [Java][Gandiva] Fix JNI CI test for Gandiva
+* [ARROW-12842](https://issues.apache.org/jira/browse/ARROW-12842) - [Java][FlightRPC] Error metadata from FlightStatusException is not propagated to client
+* [ARROW-12850](https://issues.apache.org/jira/browse/ARROW-12850) - [R] is.nan() evaluates to null on Arrow null values
+* [ARROW-12854](https://issues.apache.org/jira/browse/ARROW-12854) - [Dev][Release] Windows wheel verification script fails to download artifacts
+* [ARROW-12857](https://issues.apache.org/jira/browse/ARROW-12857) - [C++] hash\_aggregate\_test not building on master
+* [ARROW-12864](https://issues.apache.org/jira/browse/ARROW-12864) - [C++] Remove needless out argument from arrow::internal::InvertBitmap
+* [ARROW-12865](https://issues.apache.org/jira/browse/ARROW-12865) - [C++][Python] Python FlightRPC server cannot find RE2 symbols
+* [ARROW-12882](https://issues.apache.org/jira/browse/ARROW-12882) - [C++][Gandiva] Fix behavior of convert\_replace function for empty replacement char
+* [ARROW-12887](https://issues.apache.org/jira/browse/ARROW-12887) - [CI] AppVeyor pip install failure during setup
+* [ARROW-12906](https://issues.apache.org/jira/browse/ARROW-12906) - [Python] \`fill\_null\` called with a null value segfaults on non-fixed-size types.
+* [ARROW-12907](https://issues.apache.org/jira/browse/ARROW-12907) - [Java] Memory leak possible when exception reading from channel happens
+* [ARROW-12911](https://issues.apache.org/jira/browse/ARROW-12911) - [Python] Export scalar aggregate options to pc.sum (sum of zero rows gives null; should give 0)
+* [ARROW-12917](https://issues.apache.org/jira/browse/ARROW-12917) - [C++][R][pyarrow] Failure importing some decimal types using the C data interface
+* [ARROW-12918](https://issues.apache.org/jira/browse/ARROW-12918) - [C++] Build errors with Visual Studio 16.10.31321.278
+* [ARROW-12919](https://issues.apache.org/jira/browse/ARROW-12919) - [Developer Tools] Crossbow comment bot failing to react to comments
+* [ARROW-12935](https://issues.apache.org/jira/browse/ARROW-12935) - [C++][CI] Compiler error on some clang versions
+* [ARROW-12941](https://issues.apache.org/jira/browse/ARROW-12941) - [C++] CSV reader skip\_row does not properly update num\_rows\_seen
+* [ARROW-12942](https://issues.apache.org/jira/browse/ARROW-12942) - [C++][Compute] The result of Arrow compute hash\_min\_max is incorrect if there are new groups in the subsequent chunks
+* [ARROW-12956](https://issues.apache.org/jira/browse/ARROW-12956) - [C++] Fix crash on Parquet file (OSS-Fuzz)
+* [ARROW-12969](https://issues.apache.org/jira/browse/ARROW-12969) - [C++] match\_substring doesn't match empty needle to empty haystack
+* [ARROW-12974](https://issues.apache.org/jira/browse/ARROW-12974) - [R] test-r-without-arrow build fails because of example requiring Arrow
+* [ARROW-12983](https://issues.apache.org/jira/browse/ARROW-12983) - [C++][Python] Converter::Extend gets stuck in infinite loop causing OOM if values don't fit in single chunk
+* [ARROW-12987](https://issues.apache.org/jira/browse/ARROW-12987) - [CI] test-ubuntu-18.04 nightly builds are failing due to Gandiva "TestUpper" test failure
+* [ARROW-12988](https://issues.apache.org/jira/browse/ARROW-12988) - [CI] The kartothek nightly integration build is failing (test\_update\_dataset\_from\_ddf\_empty)
+* [ARROW-12989](https://issues.apache.org/jira/browse/ARROW-12989) - [CI] "Dev PR" jobs unduly cancelled
+* [ARROW-12991](https://issues.apache.org/jira/browse/ARROW-12991) - [CI] Travis ARM builds often crash
+* [ARROW-12993](https://issues.apache.org/jira/browse/ARROW-12993) - [Python] Address boundary error with invalid Feather file and stackprinter
+* [ARROW-12995](https://issues.apache.org/jira/browse/ARROW-12995) - [C++] CSV reader should validate options
+* [ARROW-12998](https://issues.apache.org/jira/browse/ARROW-12998) - [C++] Datasets needs dependency on xsimd
+* [ARROW-13001](https://issues.apache.org/jira/browse/ARROW-13001) - [Go] Build failure in parquet/internal/bmi on s390x
+* [ARROW-13003](https://issues.apache.org/jira/browse/ARROW-13003) - [C++] unaligned access in compute/exec/ cc files
+* [ARROW-13008](https://issues.apache.org/jira/browse/ARROW-13008) - [C++] Deprecation warning when compiling minimal example
+* [ARROW-13010](https://issues.apache.org/jira/browse/ARROW-13010) - [C++][Compute] Support outputting to slices from Kleene kernels
+* [ARROW-13018](https://issues.apache.org/jira/browse/ARROW-13018) - [C++][Docs] Use consistent terminology for nulls (min\_count) in scalar aggregate kernels
+* [ARROW-13026](https://issues.apache.org/jira/browse/ARROW-13026) - [C++][CI] s390x job setup fails
+* [ARROW-13037](https://issues.apache.org/jira/browse/ARROW-13037) - [R] Incorrect param when creating Expression crashes R
+* [ARROW-13039](https://issues.apache.org/jira/browse/ARROW-13039) - [R] Fix error message handling
+* [ARROW-13041](https://issues.apache.org/jira/browse/ARROW-13041) - [C++] Unary kernels can leave uninitialized data under null entries
+* [ARROW-13046](https://issues.apache.org/jira/browse/ARROW-13046) - [Release] JS package failing test prior to publish
+* [ARROW-13048](https://issues.apache.org/jira/browse/ARROW-13048) - [C++] S3FileSystem fails moving filepaths containing = or +
+* [ARROW-13053](https://issues.apache.org/jira/browse/ARROW-13053) - [Python] Build fails on macOS Big Sur using homebrewed Arrow libraries
+* [ARROW-13069](https://issues.apache.org/jira/browse/ARROW-13069) - [Website] Add Daniël to committer list
+* [ARROW-13073](https://issues.apache.org/jira/browse/ARROW-13073) - [Developer] archery benchmark list: unexpected keyword 'benchmark\_filter'
+* [ARROW-13080](https://issues.apache.org/jira/browse/ARROW-13080) - [Release] Generate the API docs in ubuntu 20.10
+* [ARROW-13083](https://issues.apache.org/jira/browse/ARROW-13083) - [Python] Wrong SCM version detection both in setup.py and crossbow
+* [ARROW-13085](https://issues.apache.org/jira/browse/ARROW-13085) - [Python] Apache Arrow minimal cpp build segfaults with pyarrow libs
+* [ARROW-13090](https://issues.apache.org/jira/browse/ARROW-13090) - [Python] Test failure with fsspec 2021.6.0
+* [ARROW-13104](https://issues.apache.org/jira/browse/ARROW-13104) - [C++] ByteStreamSplit implementation uses invalid pointer cast
+* [ARROW-13108](https://issues.apache.org/jira/browse/ARROW-13108) - [Python] Pyarrow 4.0.0 crashes upon import on macOS 10.13.6
+* [ARROW-13116](https://issues.apache.org/jira/browse/ARROW-13116) - [R] Test for RecordBatchReader to C-interface fails on arrow-r-minimal due to missing dependencies
+* [ARROW-13125](https://issues.apache.org/jira/browse/ARROW-13125) - [R] Throw error when 2+ args passed to desc() in arrange()
+* [ARROW-13128](https://issues.apache.org/jira/browse/ARROW-13128) - [C\#] TimestampArray conversion logic for nano and micro is wrong
+* [ARROW-13135](https://issues.apache.org/jira/browse/ARROW-13135) - [C++] Fix Status propagation in END\_PARQUET\_CATCH\_EXCEPTIONS
+* [ARROW-13139](https://issues.apache.org/jira/browse/ARROW-13139) - [C++] ReadaheadGenerator cannot be safely copied/moved
+* [ARROW-13145](https://issues.apache.org/jira/browse/ARROW-13145) - [C++][CI] Flight test crashes on MinGW
+* [ARROW-13148](https://issues.apache.org/jira/browse/ARROW-13148) - [Dev][Archery] Crossbow build submission fails
+* [ARROW-13153](https://issues.apache.org/jira/browse/ARROW-13153) - [C++] \`parquet\_dataset\` loses ordering of files in \`\_metadata\`
+* [ARROW-13154](https://issues.apache.org/jira/browse/ARROW-13154) - [C++] Unions can not have 126 and 127 as type\_codes
+* [ARROW-13169](https://issues.apache.org/jira/browse/ARROW-13169) - [R] [C++] Sorted partition keys can cause issues
+* [ARROW-13173](https://issues.apache.org/jira/browse/ARROW-13173) - [C++] TestAsyncUtil.ReadaheadFailed asserts occasionally
+* [ARROW-13187](https://issues.apache.org/jira/browse/ARROW-13187) - [C++][Python] Possibly memory not deallocated when reading in CSV
+* [ARROW-13189](https://issues.apache.org/jira/browse/ARROW-13189) - [R] Disable row-level metadata application on datasets
+* [ARROW-13203](https://issues.apache.org/jira/browse/ARROW-13203) - [R] Fix optional component checks causing failures
+* [ARROW-13207](https://issues.apache.org/jira/browse/ARROW-13207) - [Python][Doc] Dataset documentation still suggests deprecated scan method as the preferred iterative approach
+* [ARROW-13216](https://issues.apache.org/jira/browse/ARROW-13216) - [R] Type checks test fails with rtools35
+* [ARROW-13217](https://issues.apache.org/jira/browse/ARROW-13217) - [C++][Gandiva] Correct convert\_replace function for invalid chars on string beginning
+* [ARROW-13223](https://issues.apache.org/jira/browse/ARROW-13223) - [C++][CI] Fix thread sanitizer failures
+* [ARROW-13225](https://issues.apache.org/jira/browse/ARROW-13225) - [Go][Flight] Implement Custom Middleware Interface and Enable Integration Tests
+* [ARROW-13229](https://issues.apache.org/jira/browse/ARROW-13229) - [Python] ascii\_trim, ascii\_ltrim and ascii\_rtrim lack options
+* [ARROW-13239](https://issues.apache.org/jira/browse/ARROW-13239) - [Doc][Python] Dataset.head function doesn't mention required argument
+* [ARROW-13243](https://issues.apache.org/jira/browse/ARROW-13243) - [R] altrep function call in R 3.5
+* [ARROW-13246](https://issues.apache.org/jira/browse/ARROW-13246) - [C++] CSV skip\_rows\_after\_names can discard data prematurely
+* [ARROW-13249](https://issues.apache.org/jira/browse/ARROW-13249) - [Java][CI] Consistent timeout in the Java JNI build
+* [ARROW-13253](https://issues.apache.org/jira/browse/ARROW-13253) - [C++][FlightRPC] Segfault when sending record batch \>2GB
+* [ARROW-13254](https://issues.apache.org/jira/browse/ARROW-13254) - [Python] Processes killed and semaphore objects leaked when reading pandas data
+* [ARROW-13265](https://issues.apache.org/jira/browse/ARROW-13265) - [R] cli valgrind errors in nightlies
+* [ARROW-13266](https://issues.apache.org/jira/browse/ARROW-13266) - [JS] Improve benchmark names & add suite name to json
+* [ARROW-13281](https://issues.apache.org/jira/browse/ARROW-13281) - [C++][Gandiva] Error on timestampDiffMonth function behavior for negative diff values
+* [ARROW-13284](https://issues.apache.org/jira/browse/ARROW-13284) - [C++] Wrong pkg\_check\_modules() option name
+* [ARROW-13288](https://issues.apache.org/jira/browse/ARROW-13288) - [Python] Missing default values of kernel options in PyArrow
+* [ARROW-13290](https://issues.apache.org/jira/browse/ARROW-13290) - Compilation fails on clang-12 and gcc-11 due to missing include
+* [ARROW-13305](https://issues.apache.org/jira/browse/ARROW-13305) - [C++] Unable to install nightly on Ubuntu 21.04 due to CSV options
+* [ARROW-13315](https://issues.apache.org/jira/browse/ARROW-13315) - [R] Wrap r\_task\_group includes with ARROW\_R\_WITH\_ARROW checking
+* [ARROW-13321](https://issues.apache.org/jira/browse/ARROW-13321) - [C++][Python] MakeArrayFromScalar doesn't work for FixedSizeBinaryType
+* [ARROW-13324](https://issues.apache.org/jira/browse/ARROW-13324) - [R] Typo in bindings for utf8\_reverse and ascii\_reverse
+* [ARROW-13332](https://issues.apache.org/jira/browse/ARROW-13332) - [C++] TSAN failure in TestAsyncUtil.ReadaheadFailed
+* [ARROW-13341](https://issues.apache.org/jira/browse/ARROW-13341) - [C++] Segfault in arrow-compute-plan-test ExecPlanExecution.SourceScalarAggSink
+* [ARROW-13350](https://issues.apache.org/jira/browse/ARROW-13350) - [Python][CI] conda-python-3.7-pandas-0.24 nightly build failing in test\_extract\_datetime\_components
+* [ARROW-13352](https://issues.apache.org/jira/browse/ARROW-13352) - [C++] Valgrind failure in case\_when kernel
+* [ARROW-13353](https://issues.apache.org/jira/browse/ARROW-13353) - [Documentation] Build failing with sphinx.util.cfamily.DefinitionError
+* [ARROW-13360](https://issues.apache.org/jira/browse/ARROW-13360) - [C++] Missing dependencies in C++ thirdparty offline dependencies versions.txt
+* [ARROW-13363](https://issues.apache.org/jira/browse/ARROW-13363) - [R] is.nan() errors on non-floating point data
+* [ARROW-13368](https://issues.apache.org/jira/browse/ARROW-13368) - [C++][Doc] Rename project to make\_struct in docs
+* [ARROW-13381](https://issues.apache.org/jira/browse/ARROW-13381) - [C++] ArrayFromJSON doesn't work for float value dictionary type
+* [ARROW-13382](https://issues.apache.org/jira/browse/ARROW-13382) - [C++] Aggregation over scalars fails autobrew R job
+* [ARROW-13384](https://issues.apache.org/jira/browse/ARROW-13384) - [C++] Specify minimum required zstd version in cmake
+* [ARROW-13391](https://issues.apache.org/jira/browse/ARROW-13391) - [C++] CSV streaming reader does not include same error information as table reader
+* [ARROW-13417](https://issues.apache.org/jira/browse/ARROW-13417) - [C++] The merged generator can sometimes pull from source sync-reentrant
+* [ARROW-13419](https://issues.apache.org/jira/browse/ARROW-13419) - [JS] Fix perf tests
+* [ARROW-13428](https://issues.apache.org/jira/browse/ARROW-13428) - [C++][Flight] -lssl is missing with bundled gRPC and system shared OpenSSL
+* [ARROW-13431](https://issues.apache.org/jira/browse/ARROW-13431) - [Release] Bump Go version to 1.15; don't verify Rust source anymore
+* [ARROW-13432](https://issues.apache.org/jira/browse/ARROW-13432) - [Release] Fix ssh connection to the binary uploader container
+
+
+## New Features and Improvements
+
+* [ARROW-2665](https://issues.apache.org/jira/browse/ARROW-2665) - [Python/C++] Add index() method to find first occurrence of Python scalar
+* [ARROW-3014](https://issues.apache.org/jira/browse/ARROW-3014) - [C++] Minimal writer adapter for ORC file format
+* [ARROW-3316](https://issues.apache.org/jira/browse/ARROW-3316) - [R] Multi-threaded conversion from R data.frame to Arrow table / record batch
+* [ARROW-5385](https://issues.apache.org/jira/browse/ARROW-5385) - [Go] Implement EXTENSION datatype
+* [ARROW-5640](https://issues.apache.org/jira/browse/ARROW-5640) - [Go] Implement Map array
+* [ARROW-6513](https://issues.apache.org/jira/browse/ARROW-6513) - [CI] The conda environment files arrow/ci/conda\_env\_\*.yml should have .txt extension
+* [ARROW-7001](https://issues.apache.org/jira/browse/ARROW-7001) - [C++] Develop threading APIs to accommodate nested parallelism
+* [ARROW-7114](https://issues.apache.org/jira/browse/ARROW-7114) - [JS][CI] NodeJS build fails on Github Actions Windows node
+* [ARROW-7252](https://issues.apache.org/jira/browse/ARROW-7252) - [Rust] [Parquet] Reading UTF-8/JSON/ENUM field results in a lot of vec allocation
+* [ARROW-7396](https://issues.apache.org/jira/browse/ARROW-7396) - [Format] Register media types (MIME types) for Apache Arrow formats to IANA
+* [ARROW-8421](https://issues.apache.org/jira/browse/ARROW-8421) - [Rust] [Parquet] Implement parquet writer
+* [ARROW-8459](https://issues.apache.org/jira/browse/ARROW-8459) - [Dev][Archery] Use a more recent cmake-format
+* [ARROW-8527](https://issues.apache.org/jira/browse/ARROW-8527) - [C++][CSV] Add support for ReadOptions::skip\_rows \>= block\_size
+* [ARROW-8655](https://issues.apache.org/jira/browse/ARROW-8655) - [C++][Dataset][Python][R] Preserve partitioning information for a discovered Dataset
+* [ARROW-8676](https://issues.apache.org/jira/browse/ARROW-8676) - [Rust] Create implementation of IPC RecordBatch body buffer compression from ARROW-300
+* [ARROW-9054](https://issues.apache.org/jira/browse/ARROW-9054) - [C++] Add ScalarAggregateOptions
+* [ARROW-9056](https://issues.apache.org/jira/browse/ARROW-9056) - [C++] Support scalar aggregation over scalars
+* [ARROW-9140](https://issues.apache.org/jira/browse/ARROW-9140) - [R] Zero-copy Arrow to R where possible
+* [ARROW-9295](https://issues.apache.org/jira/browse/ARROW-9295) - [Archery] Support rust clippy in the lint command
+* [ARROW-9299](https://issues.apache.org/jira/browse/ARROW-9299) - [Python] Expose ORC metadata() in Python ORCFile
+* [ARROW-9313](https://issues.apache.org/jira/browse/ARROW-9313) - [Rust] Use feature enum
+* [ARROW-9421](https://issues.apache.org/jira/browse/ARROW-9421) - [C++][Parquet] Redundancies in SchemaManifest::GetFieldIndices
+* [ARROW-9430](https://issues.apache.org/jira/browse/ARROW-9430) - [C++/Python] Kernel for SetItem(BooleanArray, values)
+* [ARROW-9697](https://issues.apache.org/jira/browse/ARROW-9697) - [C++][Dataset] num\_rows method for Dataset/Scanner
+* [ARROW-10031](https://issues.apache.org/jira/browse/ARROW-10031) - [Java] Support Java benchmark in Archery
+* [ARROW-10115](https://issues.apache.org/jira/browse/ARROW-10115) - [C++] CSV empty quoted string is treated as NULL
+* [ARROW-10316](https://issues.apache.org/jira/browse/ARROW-10316) - [Python] Consider using \_\_wrapped\_\_ for compute function introspection
+* [ARROW-10391](https://issues.apache.org/jira/browse/ARROW-10391) - [Rust] [Parquet] Nested Arrow reader
+* [ARROW-10440](https://issues.apache.org/jira/browse/ARROW-10440) - [C++][Dataset][Python] Add a callback to visit file writers just before Finish()
+* [ARROW-10550](https://issues.apache.org/jira/browse/ARROW-10550) - [Rust] [Parquet] Write nested types (struct, list)
+* [ARROW-10557](https://issues.apache.org/jira/browse/ARROW-10557) - [C++] Add scalar string slicing/substring extract kernel
+* [ARROW-10640](https://issues.apache.org/jira/browse/ARROW-10640) - [C++] An "if\_else" kernel to combine two arrays based on a mask
+* [ARROW-10658](https://issues.apache.org/jira/browse/ARROW-10658) - [Python][Packaging] Wheel builds for Apple Silicon
+* [ARROW-10675](https://issues.apache.org/jira/browse/ARROW-10675) - [C++][Python] Support AWS S3 Web identity credentials
+* [ARROW-10797](https://issues.apache.org/jira/browse/ARROW-10797) - [C++] Investigate faster random generation for tests and benchmarks
+* [ARROW-10926](https://issues.apache.org/jira/browse/ARROW-10926) - [Rust] Add parquet reader / writer for decimal types
+* [ARROW-10959](https://issues.apache.org/jira/browse/ARROW-10959) - [C++] Add scalar string join kernel
+* [ARROW-11061](https://issues.apache.org/jira/browse/ARROW-11061) - [Rust] Validate array properties against schema
+* [ARROW-11173](https://issues.apache.org/jira/browse/ARROW-11173) - Add Map type as reader / writer in FieldReader / FieldWriter
+* [ARROW-11199](https://issues.apache.org/jira/browse/ARROW-11199) - [C++][Python] Fix the unit tests for the ORC reader
+* [ARROW-11206](https://issues.apache.org/jira/browse/ARROW-11206) - [C++][Compute][Python] Rename "project" kernel to "make\_struct"
+* [ARROW-11342](https://issues.apache.org/jira/browse/ARROW-11342) - [Python] [Gandiva] Expose ToString and result type information
+* [ARROW-11499](https://issues.apache.org/jira/browse/ARROW-11499) - [Packaging] Remove all use of bintray
+* [ARROW-11514](https://issues.apache.org/jira/browse/ARROW-11514) - [R][C++] Bindings for paste(), paste0(), str\_c()
+* [ARROW-11515](https://issues.apache.org/jira/browse/ARROW-11515) - [R] Bindings for strsplit
+* [ARROW-11565](https://issues.apache.org/jira/browse/ARROW-11565) - [C++][Gandiva] Modify upper()/lower() to work with UTF8 and add INIT\_CAP function
+* [ARROW-11608](https://issues.apache.org/jira/browse/ARROW-11608) - [CI] turbodbc integration tests are failing (build issue)
+* [ARROW-11660](https://issues.apache.org/jira/browse/ARROW-11660) - [C++] Move RecordBatch::SelectColumns method from R to C++ library
+* [ARROW-11673](https://issues.apache.org/jira/browse/ARROW-11673) - [C++] Casting dictionary type to use different index type
+* [ARROW-11675](https://issues.apache.org/jira/browse/ARROW-11675) - [CI][C++] Resolve ctest failures on VS 2019 builds
+* [ARROW-11705](https://issues.apache.org/jira/browse/ARROW-11705) - [R] Support scalar value recycling in RecordBatch/Table$create()
+* [ARROW-11759](https://issues.apache.org/jira/browse/ARROW-11759) - [C++] Kernel to extract datetime components (year, month, day, etc) from timestamp type
+* [ARROW-11769](https://issues.apache.org/jira/browse/ARROW-11769) - [R] Pull groups from grouped\_df into RecordBatch or Table
+* [ARROW-11772](https://issues.apache.org/jira/browse/ARROW-11772) - [C++] Add asynchronous read to ipc::RecordBatchFileReader
+* [ARROW-11782](https://issues.apache.org/jira/browse/ARROW-11782) - [GLib][Ruby][Dataset] Remove bindings for internal classes
+* [ARROW-11787](https://issues.apache.org/jira/browse/ARROW-11787) - [R] Implement write csv
+* [ARROW-11843](https://issues.apache.org/jira/browse/ARROW-11843) - [C++] Add asynchronous read to parquet::arrow::FileReader
+* [ARROW-11849](https://issues.apache.org/jira/browse/ARROW-11849) - [R] Use roxygen @examplesIf tag in R docs
+* [ARROW-11889](https://issues.apache.org/jira/browse/ARROW-11889) - [C++] Add parallelism to streaming CSV reader
+* [ARROW-11909](https://issues.apache.org/jira/browse/ARROW-11909) - [C++] Get rid of MakeIteratorGenerator
+* [ARROW-11926](https://issues.apache.org/jira/browse/ARROW-11926) - [R] Pass on the new UCRT CRAN Windows builds
+* [ARROW-11928](https://issues.apache.org/jira/browse/ARROW-11928) - [C++][Compute] Add ExecNode hierarchy
+* [ARROW-11929](https://issues.apache.org/jira/browse/ARROW-11929) - [C++][Compute] Promote Expression to the compute namespace
+* [ARROW-11930](https://issues.apache.org/jira/browse/ARROW-11930) - [C++][Dataset][Compute] Refactor Dataset scans to use an ExecNode graph
+* [ARROW-11932](https://issues.apache.org/jira/browse/ARROW-11932) - [C++] Provide ArrayBuilder::AppendScalar
+* [ARROW-11950](https://issues.apache.org/jira/browse/ARROW-11950) - [C++][Compute] Add unary negative kernel
+* [ARROW-11960](https://issues.apache.org/jira/browse/ARROW-11960) - [C++][Gandiva] Support escape in LIKE
+* [ARROW-11980](https://issues.apache.org/jira/browse/ARROW-11980) - [Python] Remove "experimental" status from Table.replace\_schema\_metadata
+* [ARROW-11986](https://issues.apache.org/jira/browse/ARROW-11986) - [C++][Gandiva] Implement IN expressions for doubles and floats
+* [ARROW-11990](https://issues.apache.org/jira/browse/ARROW-11990) - [C++][Compute] Use Status/Result return consistently to indicate errors
+* [ARROW-12004](https://issues.apache.org/jira/browse/ARROW-12004) - [C++] Result\<detail::Empty\> is annoying
+* [ARROW-12010](https://issues.apache.org/jira/browse/ARROW-12010) - [C++][Compute] Improve performance of the hash table used in GroupIdentifier
+* [ARROW-12016](https://issues.apache.org/jira/browse/ARROW-12016) - [C++] Implement array\_sort\_indices and sort\_indices for BOOL type
+* [ARROW-12050](https://issues.apache.org/jira/browse/ARROW-12050) - [C++][Python][FlightRPC] Use StopToken to enable interrupting long Flight operations
+* [ARROW-12074](https://issues.apache.org/jira/browse/ARROW-12074) - [C++][Compute] Add scalar arithmetic kernels for decimal inputs
+* [ARROW-12083](https://issues.apache.org/jira/browse/ARROW-12083) - [R] schema use in open\_dataset
+* [ARROW-12166](https://issues.apache.org/jira/browse/ARROW-12166) - [C++][Gandiva] Implements CONVERT\_TO(value, type) function
+* [ARROW-12184](https://issues.apache.org/jira/browse/ARROW-12184) - [R] Bindings for na.fail, na.omit, na.exclude, na.pass
+* [ARROW-12185](https://issues.apache.org/jira/browse/ARROW-12185) - [R] Bindings for any, all
+* [ARROW-12198](https://issues.apache.org/jira/browse/ARROW-12198) - [R] Bindings for strptime
+* [ARROW-12199](https://issues.apache.org/jira/browse/ARROW-12199) - [R] Bindings for stddev, variance
+* [ARROW-12205](https://issues.apache.org/jira/browse/ARROW-12205) - [C++][Gandiva] Implement TO\_TIME([number] secs) and TO\_TIMESTAMP([number] secs) function
+* [ARROW-12231](https://issues.apache.org/jira/browse/ARROW-12231) - [C++][Dataset] Separate datasets backed by readers from InMemoryDataset
+* [ARROW-12253](https://issues.apache.org/jira/browse/ARROW-12253) - [Rust] [Ballista] Implement scalable joins
+* [ARROW-12255](https://issues.apache.org/jira/browse/ARROW-12255) - [Rust] [Ballista] Integrate scheduler with DataFusion
+* [ARROW-12256](https://issues.apache.org/jira/browse/ARROW-12256) - [Rust] [Ballista] Add DataFrame support
+* [ARROW-12257](https://issues.apache.org/jira/browse/ARROW-12257) - [Rust] [Ballista] Publish user guide to Arrow site
+* [ARROW-12261](https://issues.apache.org/jira/browse/ARROW-12261) - [Rust] [Ballista] Ballista should not have its own DataFrame API
+* [ARROW-12291](https://issues.apache.org/jira/browse/ARROW-12291) - [R] Determine the type of an unevaluated expression
+* [ARROW-12310](https://issues.apache.org/jira/browse/ARROW-12310) - [Java] ValueVector\#getObject should support covariance for complex types
+* [ARROW-12355](https://issues.apache.org/jira/browse/ARROW-12355) - [C++] Implement efficient async CSV scanning
+* [ARROW-12362](https://issues.apache.org/jira/browse/ARROW-12362) - [Rust] [DataFusion] topk\_query test failure
+* [ARROW-12364](https://issues.apache.org/jira/browse/ARROW-12364) - [Python] [Dataset] Add metadata\_collector option to ds.write\_dataset()
+* [ARROW-12378](https://issues.apache.org/jira/browse/ARROW-12378) - [C++][Gandiva] Implement castVARBINARY functions
+* [ARROW-12386](https://issues.apache.org/jira/browse/ARROW-12386) - [C++] Support file parallelism in AsyncScanner
+* [ARROW-12391](https://issues.apache.org/jira/browse/ARROW-12391) - [Rust][DataFusion] Implement date\_trunc() function
+* [ARROW-12392](https://issues.apache.org/jira/browse/ARROW-12392) - [C++] Restore asynchronous streaming CSV reader
+* [ARROW-12393](https://issues.apache.org/jira/browse/ARROW-12393) - [JS] Optimally use Closure Compiler
+* [ARROW-12403](https://issues.apache.org/jira/browse/ARROW-12403) - [Rust] [Ballista] Integration tests should check that query results are correct
+* [ARROW-12415](https://issues.apache.org/jira/browse/ARROW-12415) - [CI] [Python] ERROR: Failed building wheel for pygit2 on ARM64
+* [ARROW-12424](https://issues.apache.org/jira/browse/ARROW-12424) - [Go][Parquet] Add Schema Package
+* [ARROW-12428](https://issues.apache.org/jira/browse/ARROW-12428) - [Python] pyarrow.parquet.read\_\* should use pre\_buffer=True
+* [ARROW-12434](https://issues.apache.org/jira/browse/ARROW-12434) - [Rust] [Ballista] Show executed plans with metrics
+* [ARROW-12442](https://issues.apache.org/jira/browse/ARROW-12442) - [CI] Set job timeouts on GitHub Actions
+* [ARROW-12443](https://issues.apache.org/jira/browse/ARROW-12443) - [C++][Gandiva] Implement castVARCHAR function for binary input
+* [ARROW-12444](https://issues.apache.org/jira/browse/ARROW-12444) - [Rust] [CI] Remove Rust and point integration tests to arrow-rs repo
+* [ARROW-12445](https://issues.apache.org/jira/browse/ARROW-12445) - [Rust] Design and implement packaging process to bundle Rust in signed tar
+* [ARROW-12468](https://issues.apache.org/jira/browse/ARROW-12468) - [Python][R] Expose UseAsync to python/R
+* [ARROW-12478](https://issues.apache.org/jira/browse/ARROW-12478) - [C++] Support LLVM 12
+* [ARROW-12484](https://issues.apache.org/jira/browse/ARROW-12484) - [CI] Change jinja macros to not require CROSSBOW\_TOKEN to upload artifacts in Github Actions
+* [ARROW-12489](https://issues.apache.org/jira/browse/ARROW-12489) - [Developer] autotune is broken
+* [ARROW-12490](https://issues.apache.org/jira/browse/ARROW-12490) - [Dev] Use miniforge for all platforms
+* [ARROW-12492](https://issues.apache.org/jira/browse/ARROW-12492) - [Python] Add a helper method to decode a DictionaryArray back to a plain Array
+* [ARROW-12496](https://issues.apache.org/jira/browse/ARROW-12496) - [C++][Dataset] Ensure Scanner tests fully cover async
+* [ARROW-12499](https://issues.apache.org/jira/browse/ARROW-12499) - [C++][Compute][R] Add ScalarAggregateOptions to Any and All kernels
+* [ARROW-12500](https://issues.apache.org/jira/browse/ARROW-12500) - [C++][Dataset] Consolidate similar tests for file formats
+* [ARROW-12501](https://issues.apache.org/jira/browse/ARROW-12501) - [CI][Ruby] Remove needless workaround for MinGW build
+* [ARROW-12507](https://issues.apache.org/jira/browse/ARROW-12507) - [CI] Remove duplicated cron/nightly builds
+* [ARROW-12512](https://issues.apache.org/jira/browse/ARROW-12512) - [C++][Dataset] Implement CSV writing support
+* [ARROW-12514](https://issues.apache.org/jira/browse/ARROW-12514) - [Release] Don't run Gandiva related Ruby test with ARROW\_GANDIVA=OFF
+* [ARROW-12517](https://issues.apache.org/jira/browse/ARROW-12517) - [Go] Expose App Metadata in Flight client
+* [ARROW-12518](https://issues.apache.org/jira/browse/ARROW-12518) - [Python] Expose Parquet statistics has\_null\_count / has\_distinct\_count
+* [ARROW-12520](https://issues.apache.org/jira/browse/ARROW-12520) - [R] Minor docs updates
+* [ARROW-12522](https://issues.apache.org/jira/browse/ARROW-12522) - [C++] Implement asynchronous/"lazy" variants of ReadRangeCache
+* [ARROW-12525](https://issues.apache.org/jira/browse/ARROW-12525) - [JS] Vector toJSON returns an array
+* [ARROW-12527](https://issues.apache.org/jira/browse/ARROW-12527) - [Dev] Don't try getting JIRA information for MINOR PR
+* [ARROW-12528](https://issues.apache.org/jira/browse/ARROW-12528) - [JS] Support typed arrays in Table.new
+* [ARROW-12530](https://issues.apache.org/jira/browse/ARROW-12530) - [C++] Remove Buffer::mutable\_data\_ member and use const\_cast on data\_ only if is\_mutable\_ is true
+* [ARROW-12533](https://issues.apache.org/jira/browse/ARROW-12533) - [C++] Random real generator is slow on Arm64 Linux when built with clang
+* [ARROW-12534](https://issues.apache.org/jira/browse/ARROW-12534) - [C++][Gandiva] Implement LEFT and RIGHT functions on Gandiva for string input values
+* [ARROW-12537](https://issues.apache.org/jira/browse/ARROW-12537) - [JS] Docs build should not include test sources
+* [ARROW-12541](https://issues.apache.org/jira/browse/ARROW-12541) - [Docs] Improve styling/readability of tables in the new doc theme
+* [ARROW-12551](https://issues.apache.org/jira/browse/ARROW-12551) - [Java][Release] Java post-release tests fail due to missing testing data
+* [ARROW-12554](https://issues.apache.org/jira/browse/ARROW-12554) - Allow duplicates in the value\_set for compute::is\_in
+* [ARROW-12555](https://issues.apache.org/jira/browse/ARROW-12555) - [Java][Release] Java post-release script misses dataset JNI bindings
+* [ARROW-12556](https://issues.apache.org/jira/browse/ARROW-12556) - [C++][Gandiva] Implement BYTESUBSTRING functions on Gandiva
+* [ARROW-12560](https://issues.apache.org/jira/browse/ARROW-12560) - [C++] Investigate utilizing aggressive thread task creation when adding callback to finished future
+* [ARROW-12567](https://issues.apache.org/jira/browse/ARROW-12567) - [C++][Gandiva] Implement LPAD and RPAD functions for string input values
+* [ARROW-12571](https://issues.apache.org/jira/browse/ARROW-12571) - [R][CI] Run nightly R with valgrind
+* [ARROW-12575](https://issues.apache.org/jira/browse/ARROW-12575) - [R] Use unary negative kernel
+* [ARROW-12577](https://issues.apache.org/jira/browse/ARROW-12577) - [Website] Use Artifactory instead of Bintray in all places
+* [ARROW-12578](https://issues.apache.org/jira/browse/ARROW-12578) - [JS] Simplify UTF8 handling in NodeJS
+* [ARROW-12581](https://issues.apache.org/jira/browse/ARROW-12581) - [C++][FlightRPC] Benchmark compression with real data
+* [ARROW-12584](https://issues.apache.org/jira/browse/ARROW-12584) - [C++][Python] Expose method for benchmarking tools to release unused memory from the allocators
+* [ARROW-12591](https://issues.apache.org/jira/browse/ARROW-12591) - [Java][Gandiva] Create single Gandiva jar for macOS and Linux
+* [ARROW-12593](https://issues.apache.org/jira/browse/ARROW-12593) - [Packaging][Ubuntu] Add support for Ubuntu 21.04
+* [ARROW-12597](https://issues.apache.org/jira/browse/ARROW-12597) - [C++] Implement OptionalParallelForAsync
+* [ARROW-12598](https://issues.apache.org/jira/browse/ARROW-12598) - [C++][Dataset] Implement row-count for CSV or allow selecting 0 columns from CSV
+* [ARROW-12599](https://issues.apache.org/jira/browse/ARROW-12599) - [Doc][Python] Documentation missing for pyarrow.Table
+* [ARROW-12600](https://issues.apache.org/jira/browse/ARROW-12600) - [CI] Push docker images from crossbow tasks
+* [ARROW-12602](https://issues.apache.org/jira/browse/ARROW-12602) - [R] Add BuildInfo from C++ to arrow\_info
+* [ARROW-12608](https://issues.apache.org/jira/browse/ARROW-12608) - [C++] Add split\_pattern\_regex function
+* [ARROW-12612](https://issues.apache.org/jira/browse/ARROW-12612) - [C++][Compute] Add Expression to type\_fwd.h
+* [ARROW-12619](https://issues.apache.org/jira/browse/ARROW-12619) - [Python] pyarrow sdist should not require git
+* [ARROW-12621](https://issues.apache.org/jira/browse/ARROW-12621) - [C++][Gandiva] Add alias to sha1 and sha256 functions
+* [ARROW-12631](https://issues.apache.org/jira/browse/ARROW-12631) - [Python] pyarrow.dataset.write\_table should accept a Scanner to write
+* [ARROW-12643](https://issues.apache.org/jira/browse/ARROW-12643) - Add documentation for experimental repos
+* [ARROW-12645](https://issues.apache.org/jira/browse/ARROW-12645) - [Python] Fix numpydoc validation
+* [ARROW-12648](https://issues.apache.org/jira/browse/ARROW-12648) - [C++][FlightRPC] Allow using TLS in benchmark
+* [ARROW-12649](https://issues.apache.org/jira/browse/ARROW-12649) - [Python/Packaging] Move conda-aarch64 to Azure with cross-compilation
+* [ARROW-12653](https://issues.apache.org/jira/browse/ARROW-12653) - [Archery] Allow adding a comment to crossbow requests
+* [ARROW-12658](https://issues.apache.org/jira/browse/ARROW-12658) - [C++] Bump aws-c-common to v0.5.10
+* [ARROW-12660](https://issues.apache.org/jira/browse/ARROW-12660) - [R] Post-4.0 adjustments for CRAN
+* [ARROW-12661](https://issues.apache.org/jira/browse/ARROW-12661) - [C++] CSV add skip rows after column names
+* [ARROW-12662](https://issues.apache.org/jira/browse/ARROW-12662) - [Website] Force the use of squash merge
+* [ARROW-12667](https://issues.apache.org/jira/browse/ARROW-12667) - [Python] Ensure test coverage for conversion of strided numpy arrays
+* [ARROW-12675](https://issues.apache.org/jira/browse/ARROW-12675) - [C++] CSV should include line/row numbers in parsing error messages
+* [ARROW-12677](https://issues.apache.org/jira/browse/ARROW-12677) - [Python] Add a mask argument to pyarrow.StructArray.from\_arrays
+* [ARROW-12685](https://issues.apache.org/jira/browse/ARROW-12685) - [C++][Compute] Add unary absolute value kernel
+* [ARROW-12686](https://issues.apache.org/jira/browse/ARROW-12686) - [C++][Python][FlightRPC] Support export\_to\_c in DoGet/inherit from RecordBatchReader
+* [ARROW-12687](https://issues.apache.org/jira/browse/ARROW-12687) - [C++][Python][Dataset] Support C Data Interface with Scanner
+* [ARROW-12689](https://issues.apache.org/jira/browse/ARROW-12689) - [R] Implement ArrowArrayStream C interface
+* [ARROW-12692](https://issues.apache.org/jira/browse/ARROW-12692) - [R] Improve tests and comments for strsplit() bindings
+* [ARROW-12694](https://issues.apache.org/jira/browse/ARROW-12694) - [R][CI] rtools35 job failing on 32-bit build tests
+* [ARROW-12696](https://issues.apache.org/jira/browse/ARROW-12696) - [R] Improve testing of error messages converted to warnings
+* [ARROW-12699](https://issues.apache.org/jira/browse/ARROW-12699) - [CI][Packaging][Java] Generate a jar compatible with Linux and MacOS for all Arrow components
+* [ARROW-12701](https://issues.apache.org/jira/browse/ARROW-12701) - [Website][Release] Include Rust and DataFusion commits, contributors, changes in release notes
+* [ARROW-12702](https://issues.apache.org/jira/browse/ARROW-12702) - [JS] Upgrade Webpack and terser
+* [ARROW-12703](https://issues.apache.org/jira/browse/ARROW-12703) - [JS] Separate Table from DataFrame
+* [ARROW-12704](https://issues.apache.org/jira/browse/ARROW-12704) - [JS] use optional chaining
+* [ARROW-12709](https://issues.apache.org/jira/browse/ARROW-12709) - [C++] Add variadic string join kernel
+* [ARROW-12713](https://issues.apache.org/jira/browse/ARROW-12713) - [C++] String reverse kernel
+* [ARROW-12715](https://issues.apache.org/jira/browse/ARROW-12715) - [C++] SQL-style glob string match kernel
+* [ARROW-12716](https://issues.apache.org/jira/browse/ARROW-12716) - [C++] Left/right/center string padding kernels
+* [ARROW-12717](https://issues.apache.org/jira/browse/ARROW-12717) - [C++] Substring find position kernel
+* [ARROW-12719](https://issues.apache.org/jira/browse/ARROW-12719) - [C++][Python] pyarrow.fs.S3FileSystem: pass extra kwargs, e.g. ACL
+* [ARROW-12721](https://issues.apache.org/jira/browse/ARROW-12721) - [CI] Fix path for uploading aarch64 conda artifacts from the nightly builds
+* [ARROW-12722](https://issues.apache.org/jira/browse/ARROW-12722) - [R] Raise error when attempting to print table with duplicated names
+* [ARROW-12730](https://issues.apache.org/jira/browse/ARROW-12730) - [MATLAB] Update featherreadmex and featherwritemex to build against latest arrow c++ APIs
+* [ARROW-12731](https://issues.apache.org/jira/browse/ARROW-12731) - [R] Use InMemoryDataset for Table/RecordBatch in dplyr code
+* [ARROW-12736](https://issues.apache.org/jira/browse/ARROW-12736) - [C++] Eliminate unnecessary copy in FieldPath::Get()
+* [ARROW-12738](https://issues.apache.org/jira/browse/ARROW-12738) - [CI] [Gandiva] Nightly build error in azure-conda-osx-clang-py38 (and py39, py\*-r\*)
+* [ARROW-12741](https://issues.apache.org/jira/browse/ARROW-12741) - [CI] Configure GitHub Token for Nightly Builds
+* [ARROW-12745](https://issues.apache.org/jira/browse/ARROW-12745) - [C++][Compute] Add floor, ceiling, and truncate kernels
+* [ARROW-12749](https://issues.apache.org/jira/browse/ARROW-12749) - [C++] Unnecessary copy caused by constructing RecordBatch/Table/Schema from lvalues
+* [ARROW-12750](https://issues.apache.org/jira/browse/ARROW-12750) - [CI] [R] Actually pass parameterized docker options to the templates
+* [ARROW-12751](https://issues.apache.org/jira/browse/ARROW-12751) - [C++] Add variadic row-wise min/max kernels (least/greatest)
+* [ARROW-12758](https://issues.apache.org/jira/browse/ARROW-12758) - [R] Add examples to more function documentation
+* [ARROW-12760](https://issues.apache.org/jira/browse/ARROW-12760) - [C++][Python][R] S3FileSystem: IO thread parallelism limited to 8 threads
+* [ARROW-12761](https://issues.apache.org/jira/browse/ARROW-12761) - [R] Better error handling for write\_to\_raw
+* [ARROW-12764](https://issues.apache.org/jira/browse/ARROW-12764) - [CI] Fix arguments in Conda Windows builds
+* [ARROW-12777](https://issues.apache.org/jira/browse/ARROW-12777) - [R] Convert all inputs to Arrow objects in match\_arrow and is\_in
+* [ARROW-12781](https://issues.apache.org/jira/browse/ARROW-12781) - [R] Implement is.type() functions for dplyr
+* [ARROW-12785](https://issues.apache.org/jira/browse/ARROW-12785) - [CI] the r-devdocs build errors when brew installing gcc
+* [ARROW-12791](https://issues.apache.org/jira/browse/ARROW-12791) - [R] Better error handling for DatasetFactory$Finish() when no format specified
+* [ARROW-12796](https://issues.apache.org/jira/browse/ARROW-12796) - [JS] Support JSON output from benchmarks
+* [ARROW-12800](https://issues.apache.org/jira/browse/ARROW-12800) - [JS] Drop IE Support and remove text encoder and decoder polyfills
+* [ARROW-12801](https://issues.apache.org/jira/browse/ARROW-12801) - [CI][Packaging][Java] Include all modules in script that generates Arrow jars
+* [ARROW-12806](https://issues.apache.org/jira/browse/ARROW-12806) - [Python] test\_write\_to\_dataset\_filesystem missing a dataset mark
+* [ARROW-12808](https://issues.apache.org/jira/browse/ARROW-12808) - [JS] Document browser support
+* [ARROW-12810](https://issues.apache.org/jira/browse/ARROW-12810) - [Python] Run tests with AWS\_EC2\_METADATA\_DISABLED=true
+* [ARROW-12812](https://issues.apache.org/jira/browse/ARROW-12812) - [Packaging][Java] Improve JNI jars build
+* [ARROW-12824](https://issues.apache.org/jira/browse/ARROW-12824) - [R][CI] Upgrade builds for R 4.1 release
+* [ARROW-12827](https://issues.apache.org/jira/browse/ARROW-12827) - [C++] [Dataset] Review error pass-through in the datasets API
+* [ARROW-12829](https://issues.apache.org/jira/browse/ARROW-12829) - [GLib][Ruby] Add support for Apache Arrow Flight
+* [ARROW-12831](https://issues.apache.org/jira/browse/ARROW-12831) - [CI][macOS] Remove needless Homebrew workaround
+* [ARROW-12832](https://issues.apache.org/jira/browse/ARROW-12832) - [JS] Write benchmarks in TypeScript
+* [ARROW-12833](https://issues.apache.org/jira/browse/ARROW-12833) - [JS] Construct perf data in JS
+* [ARROW-12835](https://issues.apache.org/jira/browse/ARROW-12835) - [C++] Implement case insensitive match in match\_substring(\_regex) and match\_like
+* [ARROW-12836](https://issues.apache.org/jira/browse/ARROW-12836) - [C++] Installation on IBM i fails because of CxxFlags
+* [ARROW-12841](https://issues.apache.org/jira/browse/ARROW-12841) - [R] Add examples to more function documentation - part 2
+* [ARROW-12843](https://issues.apache.org/jira/browse/ARROW-12843) - [C++][Compute] Add is\_inf kernel for floating point arrays
+* [ARROW-12848](https://issues.apache.org/jira/browse/ARROW-12848) - [Release] Mail template points to 404
+* [ARROW-12851](https://issues.apache.org/jira/browse/ARROW-12851) - [Go][Parquet] Add Encoding Package Part 1
+* [ARROW-12856](https://issues.apache.org/jira/browse/ARROW-12856) - [C++][Gandiva] Implement castBIT and castBOOLEAN functions on Gandiva
+* [ARROW-12859](https://issues.apache.org/jira/browse/ARROW-12859) - [C++] Add ScalarFromJSON for easier testing
+* [ARROW-12861](https://issues.apache.org/jira/browse/ARROW-12861) - [C++][Compute] Add sign function kernels
+* [ARROW-12867](https://issues.apache.org/jira/browse/ARROW-12867) - [R] Bindings for abs()
+* [ARROW-12868](https://issues.apache.org/jira/browse/ARROW-12868) - [R] Bindings for find\_substring and find\_substring\_regex
+* [ARROW-12869](https://issues.apache.org/jira/browse/ARROW-12869) - [R] Bindings for utf8\_reverse and ascii\_reverse
+* [ARROW-12870](https://issues.apache.org/jira/browse/ARROW-12870) - [R] Bindings for stringr::str\_like
+* [ARROW-12875](https://issues.apache.org/jira/browse/ARROW-12875) - [JS] Upgrade Jest and other minor updates
+* [ARROW-12883](https://issues.apache.org/jira/browse/ARROW-12883) - [R] [CI] version compatibility fails on R 4.1
+* [ARROW-12891](https://issues.apache.org/jira/browse/ARROW-12891) - [C++][Compute][Dataset] Extract subtree pruning logic to compute::
+* [ARROW-12894](https://issues.apache.org/jira/browse/ARROW-12894) - [R] Bump R version
+* [ARROW-12895](https://issues.apache.org/jira/browse/ARROW-12895) - [CI] Use "concurrency" setting on GitHub Actions
+* [ARROW-12898](https://issues.apache.org/jira/browse/ARROW-12898) - [Release][C\#] Package upload script is broken
+* [ARROW-12900](https://issues.apache.org/jira/browse/ARROW-12900) - [Python][Documentation] Missing np import in Reading Datasets docs
+* [ARROW-12901](https://issues.apache.org/jira/browse/ARROW-12901) - [R] Follow on to more examples
+* [ARROW-12909](https://issues.apache.org/jira/browse/ARROW-12909) - [R][Release] Build of ubuntu-docs is failing
+* [ARROW-12912](https://issues.apache.org/jira/browse/ARROW-12912) - [Website] Use .asf.yaml for publishing
+* [ARROW-12915](https://issues.apache.org/jira/browse/ARROW-12915) - [Release] Build of ubuntu-docs is failing on thrift
+* [ARROW-12936](https://issues.apache.org/jira/browse/ARROW-12936) - [C++][Gandiva] Implement ASCII Hive function on Gandiva
+* [ARROW-12937](https://issues.apache.org/jira/browse/ARROW-12937) - [C++] Allow specifying default metadata for new S3 files
+* [ARROW-12939](https://issues.apache.org/jira/browse/ARROW-12939) - [R] Simplify RTask stop handling
+* [ARROW-12940](https://issues.apache.org/jira/browse/ARROW-12940) - [R] Expose C interface as R6 methods
+* [ARROW-12948](https://issues.apache.org/jira/browse/ARROW-12948) - [C++] Add string slice replace kernel
+* [ARROW-12949](https://issues.apache.org/jira/browse/ARROW-12949) - [C++] Add string starts-with/ends-with kernels
+* [ARROW-12950](https://issues.apache.org/jira/browse/ARROW-12950) - [C++] Add substring count kernel
+* [ARROW-12951](https://issues.apache.org/jira/browse/ARROW-12951) - [C++] Refactor StringTransform
+* [ARROW-12952](https://issues.apache.org/jira/browse/ARROW-12952) - [C++] Add regex count kernel
+* [ARROW-12955](https://issues.apache.org/jira/browse/ARROW-12955) - [C++] Add additional type support for if\_else kernel
+* [ARROW-12957](https://issues.apache.org/jira/browse/ARROW-12957) - [R] rchk issues on CRAN
+* [ARROW-12961](https://issues.apache.org/jira/browse/ARROW-12961) - [C++] MSVC issues warning building PyArrow on Windows
+* [ARROW-12962](https://issues.apache.org/jira/browse/ARROW-12962) - [GLib][Ruby] Add Arrow::Scalar
+* [ARROW-12964](https://issues.apache.org/jira/browse/ARROW-12964) - [R] Add bindings for ifelse() and if\_else()
+* [ARROW-12966](https://issues.apache.org/jira/browse/ARROW-12966) - [Python] Expose Python binding for ElementWiseAggregateOptions
+* [ARROW-12967](https://issues.apache.org/jira/browse/ARROW-12967) - [R] Add bindings for pmin() and pmax()
+* [ARROW-12968](https://issues.apache.org/jira/browse/ARROW-12968) - [R] [CI] Add an rchk job to our nightlies
+* [ARROW-12972](https://issues.apache.org/jira/browse/ARROW-12972) - [CI][C++] archive\_write\_add\_filter\_zstd error on CentOS + ARM64
+* [ARROW-12975](https://issues.apache.org/jira/browse/ARROW-12975) - [C++][Python] if\_else kernel doesn't support upcasting
+* [ARROW-12982](https://issues.apache.org/jira/browse/ARROW-12982) - [C++] Re-enable unused-variable warning
+* [ARROW-12984](https://issues.apache.org/jira/browse/ARROW-12984) - [C++] Passing options parameter of Count/Index aggregation by reference
+* [ARROW-12985](https://issues.apache.org/jira/browse/ARROW-12985) - [Python][Packaging] Unable to install pygit2 in the arm64 wheel builds
+* [ARROW-12986](https://issues.apache.org/jira/browse/ARROW-12986) - [C++][Gandiva] Implement new cache eviction policy in Gandiva
+* [ARROW-12992](https://issues.apache.org/jira/browse/ARROW-12992) - [R] bindings for substr(), substring(), str\_sub()
+* [ARROW-12994](https://issues.apache.org/jira/browse/ARROW-12994) - [R] Fix tests that assume UTC local tz
+* [ARROW-12996](https://issues.apache.org/jira/browse/ARROW-12996) - [C++] CSV stream reader has no progress indication
+* [ARROW-13002](https://issues.apache.org/jira/browse/ARROW-13002) - [C++] Add a check for the utf8proc's version in CMake
+* [ARROW-13005](https://issues.apache.org/jira/browse/ARROW-13005) - [C++] Support filter/take for union data type.
+* [ARROW-13006](https://issues.apache.org/jira/browse/ARROW-13006) - [C++][Gandiva] Implement BASE64 and UNBASE64 Hive functions on Gandiva
+* [ARROW-13009](https://issues.apache.org/jira/browse/ARROW-13009) - [Doc][Dev] Document builds mailing-list
+* [ARROW-13022](https://issues.apache.org/jira/browse/ARROW-13022) - [R] bindings for lubridate's year, isoyear, quarter, month, day, wday, yday, isoweek, hour, minute, and second functions
+* [ARROW-13025](https://issues.apache.org/jira/browse/ARROW-13025) - [C++][Compute] Enhance FunctionOptions with equality, debug representability, and serializability
+* [ARROW-13027](https://issues.apache.org/jira/browse/ARROW-13027) - [C++] Fix ASAN stack traces in CI
+* [ARROW-13030](https://issues.apache.org/jira/browse/ARROW-13030) - [CI][Go] Setup Arm64 golang CI
+* [ARROW-13031](https://issues.apache.org/jira/browse/ARROW-13031) - [JS] Support arm in closure compiler on macOS
+* [ARROW-13032](https://issues.apache.org/jira/browse/ARROW-13032) - [Java] Update guava version
+* [ARROW-13034](https://issues.apache.org/jira/browse/ARROW-13034) - [Python][Docs] Update outdated examples for hdfs/azure on the Parquet doc page
+* [ARROW-13036](https://issues.apache.org/jira/browse/ARROW-13036) - [Doc] Mention recommended file extension(s) for Arrow IPC
+* [ARROW-13042](https://issues.apache.org/jira/browse/ARROW-13042) - [C++] Automatic checks that kernels don't leave uninitialized data in output
+* [ARROW-13043](https://issues.apache.org/jira/browse/ARROW-13043) - [GLib][Ruby] Add GArrowEqualOptions
+* [ARROW-13044](https://issues.apache.org/jira/browse/ARROW-13044) - [Java] Union vectors should extend ValueVector
+* [ARROW-13045](https://issues.apache.org/jira/browse/ARROW-13045) - [Packaging][RPM][deb] Don't install system utf8proc if it's old
+* [ARROW-13047](https://issues.apache.org/jira/browse/ARROW-13047) - [Website] Add kiszk to committer list
+* [ARROW-13049](https://issues.apache.org/jira/browse/ARROW-13049) - [C++][Gandiva] Implement BIN Hive function on Gandiva
+* [ARROW-13050](https://issues.apache.org/jira/browse/ARROW-13050) - [C++][Gandiva] Implement SPACE Hive function on Gandiva
+* [ARROW-13054](https://issues.apache.org/jira/browse/ARROW-13054) - [C++] Add option to specify the first day of the week for the "day\_of\_week" temporal kernel
+* [ARROW-13064](https://issues.apache.org/jira/browse/ARROW-13064) - [C++] Add a general "if, ifelse, ..., else" kernel ("CASE WHEN")
+* [ARROW-13065](https://issues.apache.org/jira/browse/ARROW-13065) - [Packaging][RPM] Add missing required LZ4 version information
+* [ARROW-13068](https://issues.apache.org/jira/browse/ARROW-13068) - [GLib][Dataset] Change prefix to gadataset\_ from gad\_
+* [ARROW-13070](https://issues.apache.org/jira/browse/ARROW-13070) - [R] bindings for sd and var
+* [ARROW-13072](https://issues.apache.org/jira/browse/ARROW-13072) - [C++] Add bitwise arithmetic compute functions
+* [ARROW-13074](https://issues.apache.org/jira/browse/ARROW-13074) - [Python] Start with deprecating ParquetDataset custom attributes
+* [ARROW-13075](https://issues.apache.org/jira/browse/ARROW-13075) - [Python] Expose C data interface API for pyarrow.Field
+* [ARROW-13076](https://issues.apache.org/jira/browse/ARROW-13076) - [Java] Enable ExtensionType to use StructVector and UnionVector for underlying storage
+* [ARROW-13082](https://issues.apache.org/jira/browse/ARROW-13082) - [CI] Forward R argument to ubuntu-docs build
+* [ARROW-13086](https://issues.apache.org/jira/browse/ARROW-13086) - [Python] Expose Parquet ArrowReaderProperties::coerce\_int96\_timestamp\_unit\_
+* [ARROW-13091](https://issues.apache.org/jira/browse/ARROW-13091) - [Python] Add compression\_level argument to IpcWriteOptions constructor
+* [ARROW-13092](https://issues.apache.org/jira/browse/ARROW-13092) - [C++] CreateDir should fail if the target exists and is not a directory
+* [ARROW-13095](https://issues.apache.org/jira/browse/ARROW-13095) - [C++] Implement trigonometric compute functions
+* [ARROW-13096](https://issues.apache.org/jira/browse/ARROW-13096) - [C++] Implement logarithm compute functions
+* [ARROW-13097](https://issues.apache.org/jira/browse/ARROW-13097) - [C++] Provide a simple reflection utility for `struct`s
+* [ARROW-13098](https://issues.apache.org/jira/browse/ARROW-13098) - [Dev][Archery] Reorganize docker submodule to its own subpackage
+* [ARROW-13100](https://issues.apache.org/jira/browse/ARROW-13100) - [MATLAB] Integrate GoogleTest with MATLAB Interface C++ Code
+* [ARROW-13101](https://issues.apache.org/jira/browse/ARROW-13101) - [Python][Doc] pyarrow.FixedSizeListArray does not appear in the documentation
+* [ARROW-13110](https://issues.apache.org/jira/browse/ARROW-13110) - [C++] Deadlock can happen when using BackgroundGenerator without transferring callbacks
+* [ARROW-13113](https://issues.apache.org/jira/browse/ARROW-13113) - [R] use RTasks to manage parallelism when converting arrow to R
+* [ARROW-13117](https://issues.apache.org/jira/browse/ARROW-13117) - [R] Retain schema in new Expressions
+* [ARROW-13119](https://issues.apache.org/jira/browse/ARROW-13119) - [R] Set empty schema in scalar Expressions
+* [ARROW-13124](https://issues.apache.org/jira/browse/ARROW-13124) - [Ruby] Add support for memory view
+* [ARROW-13127](https://issues.apache.org/jira/browse/ARROW-13127) - [R] Valgrind nightly errors
+* [ARROW-13136](https://issues.apache.org/jira/browse/ARROW-13136) - [C++] Add a "coalesce" variadic scalar kernel
+* [ARROW-13137](https://issues.apache.org/jira/browse/ARROW-13137) - [C++][Documentation] Make in-table references consistent
+* [ARROW-13140](https://issues.apache.org/jira/browse/ARROW-13140) - [C++/Python] Upgrade libthrift pin in the nightlies
+* [ARROW-13142](https://issues.apache.org/jira/browse/ARROW-13142) - [Python] Use vector append when converting from list of non-strided numpy arrays
+* [ARROW-13147](https://issues.apache.org/jira/browse/ARROW-13147) - [Java] Respect the rounding policy when allocating vector buffers
+* [ARROW-13157](https://issues.apache.org/jira/browse/ARROW-13157) - [C++] Add find\_substring\_regex kernel and implement ignore\_case for find\_substring
+* [ARROW-13158](https://issues.apache.org/jira/browse/ARROW-13158) - [Python] Fix repr and contains of StructScalar with duplicate field names
+* [ARROW-13162](https://issues.apache.org/jira/browse/ARROW-13162) - [C++][Gandiva] Add new alias for extract date functions in Gandiva registry
+* [ARROW-13171](https://issues.apache.org/jira/browse/ARROW-13171) - [R] Add binding for str\_pad()
+* [ARROW-13190](https://issues.apache.org/jira/browse/ARROW-13190) - [C++] [Gandiva] Change behavior of INITCAP function
+* [ARROW-13194](https://issues.apache.org/jira/browse/ARROW-13194) - [Java][Document] Create prose document about Java algorithms
+* [ARROW-13195](https://issues.apache.org/jira/browse/ARROW-13195) - [R] Problem with rlang reverse dependency checks
+* [ARROW-13199](https://issues.apache.org/jira/browse/ARROW-13199) - [R] add ubuntu 21.04 to nightly builds
+* [ARROW-13200](https://issues.apache.org/jira/browse/ARROW-13200) - [R] Add binding for case\_when()
+* [ARROW-13201](https://issues.apache.org/jira/browse/ARROW-13201) - [R] Add binding for coalesce()
+* [ARROW-13210](https://issues.apache.org/jira/browse/ARROW-13210) - [Python][CI] Fix vcpkg caching mechanism for the macOS wheels
+* [ARROW-13211](https://issues.apache.org/jira/browse/ARROW-13211) - [C++][CI] Remove outdated GitHub Actions ARM builds
+* [ARROW-13212](https://issues.apache.org/jira/browse/ARROW-13212) - [Release] Support deploying to test PyPI in the python post release script
+* [ARROW-13215](https://issues.apache.org/jira/browse/ARROW-13215) - [R] [CI] Add ENV TZ to docker files
+* [ARROW-13218](https://issues.apache.org/jira/browse/ARROW-13218) - [Doc] Document/clarify conventions for timestamp storage
+* [ARROW-13219](https://issues.apache.org/jira/browse/ARROW-13219) - [C++][GLib] Demote/deprecate CompareOptions
+* [ARROW-13224](https://issues.apache.org/jira/browse/ARROW-13224) - [Python][Doc] Documentation missing for pyarrow.dataset.write\_dataset
+* [ARROW-13226](https://issues.apache.org/jira/browse/ARROW-13226) - [Python] Add a general purpose cython trampolining utility
+* [ARROW-13228](https://issues.apache.org/jira/browse/ARROW-13228) - [C++] S3 CreateBucket fails because AWS treats us-east-1 differently than other regions
+* [ARROW-13230](https://issues.apache.org/jira/browse/ARROW-13230) - Add CSV Writer documentation
+* [ARROW-13234](https://issues.apache.org/jira/browse/ARROW-13234) - [C++] Add string padding option to determine which side the extra space goes on
+* [ARROW-13235](https://issues.apache.org/jira/browse/ARROW-13235) - [C++] Make type\_name equal to options class name for all FunctionOptionTypes
+* [ARROW-13236](https://issues.apache.org/jira/browse/ARROW-13236) - [Python] Improve repr of pyarrow.compute.FunctionOptions
+* [ARROW-13238](https://issues.apache.org/jira/browse/ARROW-13238) - [C++][Dataset][Compute] Substitute ExecPlan impl for dataset scans
+* [ARROW-13242](https://issues.apache.org/jira/browse/ARROW-13242) - [C++] Improve decimal random generation
+* [ARROW-13244](https://issues.apache.org/jira/browse/ARROW-13244) - [C++] Add facility to get current thread id
+* [ARROW-13258](https://issues.apache.org/jira/browse/ARROW-13258) - [Python] Improve the repr of ParquetFileFragment
+* [ARROW-13262](https://issues.apache.org/jira/browse/ARROW-13262) - [R] transmute() fails after pulling data into R
+* [ARROW-13273](https://issues.apache.org/jira/browse/ARROW-13273) - [C++] Don't use .pc only in CMake paths for Requires.private
+* [ARROW-13274](https://issues.apache.org/jira/browse/ARROW-13274) - [JS] Remove Webpack
+* [ARROW-13275](https://issues.apache.org/jira/browse/ARROW-13275) - [JS] Fix perf tests
+* [ARROW-13276](https://issues.apache.org/jira/browse/ARROW-13276) - [GLib][Ruby][Flight] Add support for ListFlights
+* [ARROW-13277](https://issues.apache.org/jira/browse/ARROW-13277) - [JS] Add declaration maps
+* [ARROW-13280](https://issues.apache.org/jira/browse/ARROW-13280) - [R] Bindings for log and trig functions
+* [ARROW-13282](https://issues.apache.org/jira/browse/ARROW-13282) - [C++] Remove obsolete generated files
+* [ARROW-13283](https://issues.apache.org/jira/browse/ARROW-13283) - [Developer Tools] Support passing through memory limits in archery docker run
+* [ARROW-13286](https://issues.apache.org/jira/browse/ARROW-13286) - [CI] Require docker-compose 1.27.0 or later
+* [ARROW-13289](https://issues.apache.org/jira/browse/ARROW-13289) - [C++] Log functions don't have int kernels
+* [ARROW-13291](https://issues.apache.org/jira/browse/ARROW-13291) - [GLib][CI] Require gobject-introspection 3.4.5 or later
+* [ARROW-13296](https://issues.apache.org/jira/browse/ARROW-13296) - [C++] Provide reflection-compatible enum replacement
+* [ARROW-13299](https://issues.apache.org/jira/browse/ARROW-13299) - [JS] Upgrade ix and rxjs
+* [ARROW-13303](https://issues.apache.org/jira/browse/ARROW-13303) - [JS] Revise bundles
+* [ARROW-13306](https://issues.apache.org/jira/browse/ARROW-13306) - [Java][JDBC] use ResultSetMetaData.getColumnLabel instead of ResultSetMetaData.getColumnName
+* [ARROW-13313](https://issues.apache.org/jira/browse/ARROW-13313) - [C++][Compute] Add ScalarAggregateNode
+* [ARROW-13320](https://issues.apache.org/jira/browse/ARROW-13320) - [Website] Add MIME types to FAQ
+* [ARROW-13323](https://issues.apache.org/jira/browse/ARROW-13323) - [Archery] Validate docker compose configuration
+* [ARROW-13343](https://issues.apache.org/jira/browse/ARROW-13343) - [R] Update NEWS.md for 5.0
+* [ARROW-13346](https://issues.apache.org/jira/browse/ARROW-13346) - [C++] Remove compile time parsing from EnumType
+* [ARROW-13355](https://issues.apache.org/jira/browse/ARROW-13355) - [R] ensure that sf is installed in our revdep job
+* [ARROW-13357](https://issues.apache.org/jira/browse/ARROW-13357) - [R] bindings for sign()
+* [ARROW-13365](https://issues.apache.org/jira/browse/ARROW-13365) - [R] bindings for floor/ceiling/truncate
+* [ARROW-13385](https://issues.apache.org/jira/browse/ARROW-13385) - [C++][Compute] Document out-of-source addition to the FunctionRegistry
+* [ARROW-13386](https://issues.apache.org/jira/browse/ARROW-13386) - [R][C++] CSV streaming changes break Rtools 35 32-bit build
+* [ARROW-13418](https://issues.apache.org/jira/browse/ARROW-13418) - [R] typo in python.r
+* [PARQUET-1798](https://issues.apache.org/jira/browse/PARQUET-1798) - [C++] Review logic around automatic assignment of field\_id's
+* [PARQUET-1998](https://issues.apache.org/jira/browse/PARQUET-1998) - [C++] Implement LZ4\_RAW compression
+* [PARQUET-2056](https://issues.apache.org/jira/browse/PARQUET-2056) - [C++] Add ability for retrieving dictionary and indices separately for ColumnReader
+
+
+
+# Apache Arrow 3.0.0 (2021-01-18)
+
+## New Features and Improvements
+
+* [ARROW-1846](https://issues.apache.org/jira/browse/ARROW-1846) - [C++] Implement "any" reduction kernel for boolean data
+* [ARROW-3850](https://issues.apache.org/jira/browse/ARROW-3850) - [Python] Support MapType and StructType for enhanced PySpark integration
+* [ARROW-4193](https://issues.apache.org/jira/browse/ARROW-4193) - [Rust] Add support for decimal data type
+* [ARROW-4544](https://issues.apache.org/jira/browse/ARROW-4544) - [Rust] Read nested JSON structs into StructArrays
+* [ARROW-4804](https://issues.apache.org/jira/browse/ARROW-4804) - [Rust] Read temporal values from CSV - Parse Date32 and Date64 in CSV reader
+* [ARROW-4960](https://issues.apache.org/jira/browse/ARROW-4960) - [R] Add crossbow task for r-arrow-feedstock
+* [ARROW-4970](https://issues.apache.org/jira/browse/ARROW-4970) - [C++][Parquet] Implement parquet::FileMetaData::Equals
+* [ARROW-5336](https://issues.apache.org/jira/browse/ARROW-5336) - [C++] Implement arrow::Concatenate for dictionary-encoded arrays with unequal dictionaries
+* [ARROW-5350](https://issues.apache.org/jira/browse/ARROW-5350) - [Rust] Support filtering on primitive/string lists
+* [ARROW-5394](https://issues.apache.org/jira/browse/ARROW-5394) - [C++] Benchmarks for IsIn Kernel
+* [ARROW-5679](https://issues.apache.org/jira/browse/ARROW-5679) - [Python] Drop Python 3.5 from support matrix
+* [ARROW-5950](https://issues.apache.org/jira/browse/ARROW-5950) - [Rust] [DataFusion] Add ability to log via logger dependency
+* [ARROW-6071](https://issues.apache.org/jira/browse/ARROW-6071) - [C++] Implement casting Binary <-\> LargeBinary
+* [ARROW-6697](https://issues.apache.org/jira/browse/ARROW-6697) - [Rust] [DataFusion] Validate that all parquet partitions have the same schema
+* [ARROW-6715](https://issues.apache.org/jira/browse/ARROW-6715) - [Website] Describe "non-free" component is needed for Plasma packages in install page
+* [ARROW-6883](https://issues.apache.org/jira/browse/ARROW-6883) - [C++] Support sending delta DictionaryBatch or replacement DictionaryBatch in IPC stream writer class
+* [ARROW-6995](https://issues.apache.org/jira/browse/ARROW-6995) - [Packaging][Crossbow] The windows conda artifacts are not uploaded to GitHub releases
+* [ARROW-7531](https://issues.apache.org/jira/browse/ARROW-7531) - [C++] Investigate header cost reduction
+* [ARROW-7800](https://issues.apache.org/jira/browse/ARROW-7800) - [Python] Expose GetRecordBatchReader API in PyArrow
+* [ARROW-7842](https://issues.apache.org/jira/browse/ARROW-7842) - [Rust] [Parquet] Implement array reader for list type
+* [ARROW-8113](https://issues.apache.org/jira/browse/ARROW-8113) - [C++] Implement a lighter-weight variant
+* [ARROW-8199](https://issues.apache.org/jira/browse/ARROW-8199) - [C++] Add support for multi-column sort on Table
+* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer
+* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata when writing parquet
+* [ARROW-8425](https://issues.apache.org/jira/browse/ARROW-8425) - [Rust] [Parquet] Add support for writing temporal types
+* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types
+* [ARROW-8853](https://issues.apache.org/jira/browse/ARROW-8853) - [Rust] [Integration Testing] Enable Flight tests
+* [ARROW-8876](https://issues.apache.org/jira/browse/ARROW-8876) - [C++] Implement casts from date types to Timestamp
+* [ARROW-8883](https://issues.apache.org/jira/browse/ARROW-8883) - [Rust] [Integration Testing] Enable passing tests and update spec doc
+* [ARROW-9001](https://issues.apache.org/jira/browse/ARROW-9001) - [R] Box outputs as correct type in call\_function
+* [ARROW-9164](https://issues.apache.org/jira/browse/ARROW-9164) - [C++] Provide APIs for adding "docstrings" to arrow::compute::Function classes that can be accessed by bindings
+* [ARROW-9187](https://issues.apache.org/jira/browse/ARROW-9187) - [R] Add bindings for arithmetic kernels
+* [ARROW-9296](https://issues.apache.org/jira/browse/ARROW-9296) - [CI][Rust] Enable more clippy lint checks
+* [ARROW-9304](https://issues.apache.org/jira/browse/ARROW-9304) - [C++] Add "AppendEmptyValue" builder APIs for use inside StructBuilder::AppendNull
+* [ARROW-9361](https://issues.apache.org/jira/browse/ARROW-9361) - [Rust] Move other array types into their own modules
+* [ARROW-9400](https://issues.apache.org/jira/browse/ARROW-9400) - [Python] Do not depend on conda-forge static libraries in Windows wheel builds
+* [ARROW-9475](https://issues.apache.org/jira/browse/ARROW-9475) - [Java] Clean up usages of BaseAllocator, use BufferAllocator instead
+* [ARROW-9489](https://issues.apache.org/jira/browse/ARROW-9489) - [C++] Add fill\_null kernel implementation for (array[string], scalar[string])
+* [ARROW-9555](https://issues.apache.org/jira/browse/ARROW-9555) - [Rust] [DataFusion] Add inner (hash) equijoin physical plan
+* [ARROW-9564](https://issues.apache.org/jira/browse/ARROW-9564) - [Packaging] Vendor r-arrow-feedstock conda-forge recipe
+* [ARROW-9674](https://issues.apache.org/jira/browse/ARROW-9674) - [Rust] Parquet reader should implement Send + Sync
+* [ARROW-9704](https://issues.apache.org/jira/browse/ARROW-9704) - [Java] TestEndianness.testLittleEndian fails on big endian platform
+* [ARROW-9707](https://issues.apache.org/jira/browse/ARROW-9707) - [Rust] [DataFusion] Re-implement threading model
+* [ARROW-9709](https://issues.apache.org/jira/browse/ARROW-9709) - [Java] Test cases in arrow-vector assume little-endian platform
+* [ARROW-9728](https://issues.apache.org/jira/browse/ARROW-9728) - [Rust] [Parquet] Compute nested definition and repetition for structs
+* [ARROW-9747](https://issues.apache.org/jira/browse/ARROW-9747) - [C++][Java][Format] Support Decimal256 Type
+* [ARROW-9771](https://issues.apache.org/jira/browse/ARROW-9771) - [Rust] [DataFusion] Predicate Pushdown Improvement: treat predicates separated by AND separately
+* [ARROW-9803](https://issues.apache.org/jira/browse/ARROW-9803) - [Go] Add initial support for s390x
+* [ARROW-9804](https://issues.apache.org/jira/browse/ARROW-9804) - [FlightRPC] Authentication Redesign
+* [ARROW-9828](https://issues.apache.org/jira/browse/ARROW-9828) - [Rust] [DataFusion] TableProvider trait should support predicate push-down
+* [ARROW-9861](https://issues.apache.org/jira/browse/ARROW-9861) - [Java] Failed Arrow Vector on big-endian platform
+* [ARROW-9862](https://issues.apache.org/jira/browse/ARROW-9862) - Throw an exception in UnsafeDirectLittleEndian on Big-Endian platform
+* [ARROW-9911](https://issues.apache.org/jira/browse/ARROW-9911) - [Rust][DataFusion] SELECT <expression\> with no FROM clause should produce a single row of output
+* [ARROW-9945](https://issues.apache.org/jira/browse/ARROW-9945) - [C++][Dataset] Refactor Expression::Assume to return a Result
+* [ARROW-9991](https://issues.apache.org/jira/browse/ARROW-9991) - [C++] split kernels for strings/binary
+* [ARROW-10002](https://issues.apache.org/jira/browse/ARROW-10002) - [Rust] Trait-specialization requires nightly
+* [ARROW-10021](https://issues.apache.org/jira/browse/ARROW-10021) - [C++][Compute] Support finding nth frequently used value in mode kernel
+* [ARROW-10032](https://issues.apache.org/jira/browse/ARROW-10032) - [Documentation] C++ Windows docs are out of date
+* [ARROW-10079](https://issues.apache.org/jira/browse/ARROW-10079) - [Rust]: Benchmark and improve count\_set\_bits function
+* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes
+* [ARROW-10097](https://issues.apache.org/jira/browse/ARROW-10097) - [C++] Persist SetLookupState in between usages of IsIn when filtering dataset batches
+* [ARROW-10106](https://issues.apache.org/jira/browse/ARROW-10106) - [FlightRPC][Java] Expose onIsReady() callback on OutboundStreamListener
+* [ARROW-10108](https://issues.apache.org/jira/browse/ARROW-10108) - [Rust] [Parquet] Fix compiler warning about unused return value
+* [ARROW-10109](https://issues.apache.org/jira/browse/ARROW-10109) - [Rust] Add support to produce a C Data interface
+* [ARROW-10110](https://issues.apache.org/jira/browse/ARROW-10110) - [Rust] Add support to consume C Data Interface
+* [ARROW-10131](https://issues.apache.org/jira/browse/ARROW-10131) - [C++][Dataset] Lazily parse parquet metadata / statistics in ParquetDatasetFactory and ParquetFileFragment
+* [ARROW-10135](https://issues.apache.org/jira/browse/ARROW-10135) - [Rust] [Parquet] Refactor file module to help adding sources
+* [ARROW-10143](https://issues.apache.org/jira/browse/ARROW-10143) - [C++] ArrayRangeEquals should accept EqualOptions
+* [ARROW-10144](https://issues.apache.org/jira/browse/ARROW-10144) - [Flight] Add support for using the TLS\_SNI extension
+* [ARROW-10149](https://issues.apache.org/jira/browse/ARROW-10149) - [Rust] Add support to external release of un-owned buffers
+* [ARROW-10163](https://issues.apache.org/jira/browse/ARROW-10163) - [Rust] [DataFusion] Add DictionaryArray coercion support
+* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields
+* [ARROW-10173](https://issues.apache.org/jira/browse/ARROW-10173) - [Rust][DataFusion] Improve performance of equality to a constant predicate support
+* [ARROW-10180](https://issues.apache.org/jira/browse/ARROW-10180) - [C++][Doc] Update dependency management docs following aws-sdk-cpp addition
+* [ARROW-10182](https://issues.apache.org/jira/browse/ARROW-10182) - [C++] Add basic continuation support to futures
+* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip tests for single column batches
+* [ARROW-10197](https://issues.apache.org/jira/browse/ARROW-10197) - [Gandiva][python] Execute expression on filtered data
+* [ARROW-10203](https://issues.apache.org/jira/browse/ARROW-10203) - [Doc] Capture guidance for endianness support in contributors guide.
+* [ARROW-10207](https://issues.apache.org/jira/browse/ARROW-10207) - [C++] Unary kernels that result in a list have no preallocated offset buffer
+* [ARROW-10208](https://issues.apache.org/jira/browse/ARROW-10208) - [C++] String split kernels do not propagate nulls correctly on sliced input
+* [ARROW-10216](https://issues.apache.org/jira/browse/ARROW-10216) - [Rust] Simd implementation of min/max aggregation kernels for primitive types
+* [ARROW-10224](https://issues.apache.org/jira/browse/ARROW-10224) - [Python] Add support for Python 3.9 except macOS wheel and Windows wheel
+* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests
+* [ARROW-10228](https://issues.apache.org/jira/browse/ARROW-10228) - [Julia] Donate Julia Implementation
+* [ARROW-10236](https://issues.apache.org/jira/browse/ARROW-10236) - [Rust] [DataFusion] Make DataFusion casting rules consistent with cast kernel
+* [ARROW-10241](https://issues.apache.org/jira/browse/ARROW-10241) - [C++][Compute] Add variance kernel benchmark
+* [ARROW-10249](https://issues.apache.org/jira/browse/ARROW-10249) - [Rust]: Support Dictionary types for ListArrays in arrow json reader
+* [ARROW-10259](https://issues.apache.org/jira/browse/ARROW-10259) - [Rust] Support field metadata
+* [ARROW-10261](https://issues.apache.org/jira/browse/ARROW-10261) - [Rust] [BREAKING] Lists should take Field instead of DataType
+* [ARROW-10263](https://issues.apache.org/jira/browse/ARROW-10263) - [C++][Compute] Improve numerical stability of variances merging
+* [ARROW-10268](https://issues.apache.org/jira/browse/ARROW-10268) - [Rust] Support writing dictionaries to IPC file and stream
+* [ARROW-10269](https://issues.apache.org/jira/browse/ARROW-10269) - [Rust] Update nightly: Oct 2020 Edition
+* [ARROW-10277](https://issues.apache.org/jira/browse/ARROW-10277) - [C++] Support comparing scalars approximately
+* [ARROW-10289](https://issues.apache.org/jira/browse/ARROW-10289) - [Rust] Support reading dictionary streams
+* [ARROW-10292](https://issues.apache.org/jira/browse/ARROW-10292) - [Rust] [DataFusion] Simplify merge
+* [ARROW-10295](https://issues.apache.org/jira/browse/ARROW-10295) - [Rust] [DataFusion] Simplify accumulators
+* [ARROW-10300](https://issues.apache.org/jira/browse/ARROW-10300) - [Rust] Improve benchmark documentation for generating/converting TPC-H data
+* [ARROW-10301](https://issues.apache.org/jira/browse/ARROW-10301) - [C++] Add "all" boolean reducing kernel
+* [ARROW-10302](https://issues.apache.org/jira/browse/ARROW-10302) - [Python] Don't double-package plasma-store-server
+* [ARROW-10304](https://issues.apache.org/jira/browse/ARROW-10304) - [C++][Compute] Optimize variance kernel for integers
+* [ARROW-10310](https://issues.apache.org/jira/browse/ARROW-10310) - [C++][Gandiva] Add single argument round() in Gandiva
+* [ARROW-10311](https://issues.apache.org/jira/browse/ARROW-10311) - [Release] Update crossbow verification process
+* [ARROW-10313](https://issues.apache.org/jira/browse/ARROW-10313) - [C++] Improve UTF8 validation speed and CSV string conversion
+* [ARROW-10318](https://issues.apache.org/jira/browse/ARROW-10318) - [C++] Use pimpl idiom in CSV parser
+* [ARROW-10319](https://issues.apache.org/jira/browse/ARROW-10319) - [Flight][Go] Add Context to Client Auth Handler functions for Flight
+* [ARROW-10320](https://issues.apache.org/jira/browse/ARROW-10320) - [Rust] Convert RecordBatchIterator to a Stream
+* [ARROW-10322](https://issues.apache.org/jira/browse/ARROW-10322) - [C++][Dataset] Minimize Expression to a wrapper around compute::Function
+* [ARROW-10323](https://issues.apache.org/jira/browse/ARROW-10323) - [Release][wheel] Add missing verification setup step
+* [ARROW-10325](https://issues.apache.org/jira/browse/ARROW-10325) - [C++][Compute] Separate aggregate kernel registration
+* [ARROW-10328](https://issues.apache.org/jira/browse/ARROW-10328) - [C++] Consider using fast-double-parser
+* [ARROW-10330](https://issues.apache.org/jira/browse/ARROW-10330) - [Rust][Datafusion] Implement nullif() function for DataFusion
+* [ARROW-10331](https://issues.apache.org/jira/browse/ARROW-10331) - [Rust] [DataFusion] Re-organize errors
+* [ARROW-10332](https://issues.apache.org/jira/browse/ARROW-10332) - [Rust] Allow CSV reader to start from a line
+* [ARROW-10334](https://issues.apache.org/jira/browse/ARROW-10334) - [Rust] [Parquet] Support reading and writing Arrow NullArray
+* [ARROW-10336](https://issues.apache.org/jira/browse/ARROW-10336) - [Rust] Support fromIter and toIter for strings
+* [ARROW-10337](https://issues.apache.org/jira/browse/ARROW-10337) - [C++] More liberal parsing of ISO8601 timestamps with fractional seconds
+* [ARROW-10338](https://issues.apache.org/jira/browse/ARROW-10338) - [Rust]: Use const fn for applicable methods
+* [ARROW-10340](https://issues.apache.org/jira/browse/ARROW-10340) - [Packaging][deb][RPM] Use Python 3.8 for pygit2
+* [ARROW-10356](https://issues.apache.org/jira/browse/ARROW-10356) - [Rust] [DataFusion] Add support for is\_in
+* [ARROW-10363](https://issues.apache.org/jira/browse/ARROW-10363) - [Python] Remove workaround for CMake bug in manylinux
+* [ARROW-10366](https://issues.apache.org/jira/browse/ARROW-10366) - [Rust] [DataFusion] Remove collect from merge
+* [ARROW-10375](https://issues.apache.org/jira/browse/ARROW-10375) - [Rust] Remove PrimitiveArrayOps
+* [ARROW-10378](https://issues.apache.org/jira/browse/ARROW-10378) - [Rust] Update take() kernel with support for large lists
+* [ARROW-10381](https://issues.apache.org/jira/browse/ARROW-10381) - [Rust] Generalize Arrow to support MergeSort
+* [ARROW-10382](https://issues.apache.org/jira/browse/ARROW-10382) - [Rust] Fix typos and spelling
+* [ARROW-10383](https://issues.apache.org/jira/browse/ARROW-10383) - [Doc] Fix typos and spelling
+* [ARROW-10384](https://issues.apache.org/jira/browse/ARROW-10384) - [C++] Fix typos and spelling
+* [ARROW-10385](https://issues.apache.org/jira/browse/ARROW-10385) - [C++][Gandiva] Add support for LLVM 11
+* [ARROW-10389](https://issues.apache.org/jira/browse/ARROW-10389) - [Rust][DataFusion] Make the custom source implementation API more explicit
+* [ARROW-10392](https://issues.apache.org/jira/browse/ARROW-10392) - [C++][Gandiva] Avoid string copy while evaluating IN expression
+* [ARROW-10396](https://issues.apache.org/jira/browse/ARROW-10396) - [Rust] [Parquet] Expose SliceableCursor and FileSource
+* [ARROW-10398](https://issues.apache.org/jira/browse/ARROW-10398) - [Rust] [Parquet] Re-export parquet::record::api::Field
+* [ARROW-10400](https://issues.apache.org/jira/browse/ARROW-10400) - Propagate TLS client peer\_identity when using mutual TLS
+* [ARROW-10402](https://issues.apache.org/jira/browse/ARROW-10402) - [Rust] Improve array equality
+* [ARROW-10407](https://issues.apache.org/jira/browse/ARROW-10407) - [C++] Division Support in Decimal256
+* [ARROW-10408](https://issues.apache.org/jira/browse/ARROW-10408) - [Java] Upgrade Avro dependency to 1.10
+* [ARROW-10410](https://issues.apache.org/jira/browse/ARROW-10410) - [Rust] Some refactorings
+* [ARROW-10416](https://issues.apache.org/jira/browse/ARROW-10416) - [R] Support Tables in Flight
+* [ARROW-10422](https://issues.apache.org/jira/browse/ARROW-10422) - [Rust] Removed unused BinaryArrayBuilder
+* [ARROW-10424](https://issues.apache.org/jira/browse/ARROW-10424) - [Rust] Simplify code for impl PrimitiveArray
+* [ARROW-10428](https://issues.apache.org/jira/browse/ARROW-10428) - [FlightRPC][Java] Add support for HTTP cookies
+* [ARROW-10445](https://issues.apache.org/jira/browse/ARROW-10445) - [Rust] Add DoubleEnded to PrimitiveArrayIter
+* [ARROW-10449](https://issues.apache.org/jira/browse/ARROW-10449) - [Rust] Make dictionary keys be a PrimitiveArray
+* [ARROW-10454](https://issues.apache.org/jira/browse/ARROW-10454) - [Rust][Datafusion] support creating ParquetExec from externally resolved file list and schema
+* [ARROW-10455](https://issues.apache.org/jira/browse/ARROW-10455) - [Rust] Fix CI cache misses on windows
+* [ARROW-10458](https://issues.apache.org/jira/browse/ARROW-10458) - [Rust] [Datafusion] context.create\_logical\_plan should not take a mutable self reference
+* [ARROW-10464](https://issues.apache.org/jira/browse/ARROW-10464) - [Rust] Implement utility to convert TPC-H tbl files to CSV and Parquet
+* [ARROW-10466](https://issues.apache.org/jira/browse/ARROW-10466) - [Rust] [Website] Update implementation status page
+* [ARROW-10467](https://issues.apache.org/jira/browse/ARROW-10467) - [FlightRPC][Java] Ability to pass arbitrary client properties to server
+* [ARROW-10468](https://issues.apache.org/jira/browse/ARROW-10468) - [C++][Compute] Refactor FunctionExecutor -\> KernelExecutor
+* [ARROW-10476](https://issues.apache.org/jira/browse/ARROW-10476) - [Rust] Allow string array to be built from iterator of &str
+* [ARROW-10477](https://issues.apache.org/jira/browse/ARROW-10477) - [Rust] Add support for iterators over binary arrays
+* [ARROW-10478](https://issues.apache.org/jira/browse/ARROW-10478) - [Dev][Release] Correct Java versions to 3.0.0-SNAPSHOT
+* [ARROW-10481](https://issues.apache.org/jira/browse/ARROW-10481) - [R] Bindings to add, remove, replace Table columns
+* [ARROW-10483](https://issues.apache.org/jira/browse/ARROW-10483) - [C++] Move Executor into a separate header
+* [ARROW-10484](https://issues.apache.org/jira/browse/ARROW-10484) - [C++] Future<{void,Status}\> could be more generic
+* [ARROW-10487](https://issues.apache.org/jira/browse/ARROW-10487) - [FlightRPC][C++] Header-based auth in clients
+* [ARROW-10490](https://issues.apache.org/jira/browse/ARROW-10490) - [C++][GLib] Fail to build with Xcode 12.0.1
+* [ARROW-10492](https://issues.apache.org/jira/browse/ARROW-10492) - [Java][JDBC] Allow users to config the mapping between SQL types and Arrow types
+* [ARROW-10504](https://issues.apache.org/jira/browse/ARROW-10504) - [C++] Suppress UBSAN pointer-overflow warning in RapidJSON
+* [ARROW-10510](https://issues.apache.org/jira/browse/ARROW-10510) - [Rust] [DataFusion] Add benchmarks for COUNT(DISTINCT)
+* [ARROW-10515](https://issues.apache.org/jira/browse/ARROW-10515) - [Julia][Doc] Update lists of supported languages to include Julia
+* [ARROW-10522](https://issues.apache.org/jira/browse/ARROW-10522) - [R] Allow rename Table and RecordBatch columns with names()
+* [ARROW-10526](https://issues.apache.org/jira/browse/ARROW-10526) - [FlightRPC][C++] HTTP cookie handling in clients
+* [ARROW-10530](https://issues.apache.org/jira/browse/ARROW-10530) - [R] Optionally use distro package in linuxlibs.R
+* [ARROW-10531](https://issues.apache.org/jira/browse/ARROW-10531) - [Rust] [DataFusion] Better display for logical plans: Graphviz and Schema information
+* [ARROW-10539](https://issues.apache.org/jira/browse/ARROW-10539) - [Packaging][Python] Use GitHub Actions to build wheels for Windows
+* [ARROW-10540](https://issues.apache.org/jira/browse/ARROW-10540) - [Rust] Allow unary kernels of arbitrary array types
+* [ARROW-10541](https://issues.apache.org/jira/browse/ARROW-10541) - [C++] Add re2 library to core arrow / ARROW\_WITH\_RE2
+* [ARROW-10542](https://issues.apache.org/jira/browse/ARROW-10542) - [C\#][Flight] Add beginning on flight code for net core
+* [ARROW-10543](https://issues.apache.org/jira/browse/ARROW-10543) - [Developer] Update dev instructions to note there may be a timelag
+* [ARROW-10552](https://issues.apache.org/jira/browse/ARROW-10552) - [Rust] Remove unused Result from Buffer
+* [ARROW-10559](https://issues.apache.org/jira/browse/ARROW-10559) - [Rust] [DataFusion] Break up logical\_plan/mod.rs into smaller modules
+* [ARROW-10561](https://issues.apache.org/jira/browse/ARROW-10561) - [Rust] Simplify \`MutableBuffer::write\` and \`MutableBuffer::write\_bytes\`
+* [ARROW-10562](https://issues.apache.org/jira/browse/ARROW-10562) - [Rust] Potential UB on unsafe code
+* [ARROW-10566](https://issues.apache.org/jira/browse/ARROW-10566) - [C++] Array validation should work on ArrayData
+* [ARROW-10567](https://issues.apache.org/jira/browse/ARROW-10567) - [C++][FlightRPC] Add options to help increase precision of arrow-flight-benchmark
+* [ARROW-10572](https://issues.apache.org/jira/browse/ARROW-10572) - [Rust][DataFusion] Use aHash and std::collections hashmap for aggregates / distinct
+* [ARROW-10574](https://issues.apache.org/jira/browse/ARROW-10574) - [Python][Parquet] Allow collections for 'in' / 'not in' filter (in addition to sets)
+* [ARROW-10575](https://issues.apache.org/jira/browse/ARROW-10575) - [Rust] Rename union.rs to be consistent with other arrays
+* [ARROW-10581](https://issues.apache.org/jira/browse/ARROW-10581) - [Doc] IPC dictionary reference to relevant section
+* [ARROW-10582](https://issues.apache.org/jira/browse/ARROW-10582) - [Rust] [DataFusion] Implement "repartition" operator
+* [ARROW-10584](https://issues.apache.org/jira/browse/ARROW-10584) - [Rust] [DataFusion] Implement SQL join support using explicit JOIN ON syntax
+* [ARROW-10585](https://issues.apache.org/jira/browse/ARROW-10585) - [Rust] [DataFusion] Add join support to DataFrame and LogicalPlan
+* [ARROW-10586](https://issues.apache.org/jira/browse/ARROW-10586) - [Rust] [DataFusion] Add join support to query planner
+* [ARROW-10589](https://issues.apache.org/jira/browse/ARROW-10589) - [Rust]: Implement AVX-512 bit and operation
+* [ARROW-10590](https://issues.apache.org/jira/browse/ARROW-10590) - [Rust] Remove Date32(Millisecond) from test
+* [ARROW-10591](https://issues.apache.org/jira/browse/ARROW-10591) - [Rust] Add support to structArrays for MutableArrayData
+* [ARROW-10595](https://issues.apache.org/jira/browse/ARROW-10595) - [Rust] Simplify inner loop of min/max kernels for non-null case
+* [ARROW-10596](https://issues.apache.org/jira/browse/ARROW-10596) - [Rust] Improve take benchmark
+* [ARROW-10598](https://issues.apache.org/jira/browse/ARROW-10598) - [C++] Improve performance of GenerateBitsUnrolled
+* [ARROW-10604](https://issues.apache.org/jira/browse/ARROW-10604) - [Ruby] Support Decimal256 type
+* [ARROW-10607](https://issues.apache.org/jira/browse/ARROW-10607) - [C++][Parquet] Support Reading/Writing Decimal256 type in Parquet
+* [ARROW-10609](https://issues.apache.org/jira/browse/ARROW-10609) - [Rust] Optimize min/max of non null strings
+* [ARROW-10628](https://issues.apache.org/jira/browse/ARROW-10628) - [Rust] Make clippy error on clippy warnings
+* [ARROW-10633](https://issues.apache.org/jira/browse/ARROW-10633) - [Rust][DataFusion] Dependency version upgrades
+* [ARROW-10634](https://issues.apache.org/jira/browse/ARROW-10634) - [C\#][CI] Change the build version from 2.2 to 3.1 in CI
+* [ARROW-10636](https://issues.apache.org/jira/browse/ARROW-10636) - [Rust] Remove specialisation from Rust parquet
+* [ARROW-10637](https://issues.apache.org/jira/browse/ARROW-10637) - [Rust] Add examples to boolean kernels
+* [ARROW-10638](https://issues.apache.org/jira/browse/ARROW-10638) - [Rust] Improve tests of boolean kernels
+* [ARROW-10639](https://issues.apache.org/jira/browse/ARROW-10639) - [Rust] Simplify signature of is\_null and add example
+* [ARROW-10644](https://issues.apache.org/jira/browse/ARROW-10644) - [Python] Consolidate path/filesystem handling in pyarrow.dataset and pyarrow.fs
+* [ARROW-10646](https://issues.apache.org/jira/browse/ARROW-10646) - [C++][FlightRPC] Disable flaky test
+* [ARROW-10648](https://issues.apache.org/jira/browse/ARROW-10648) - [Java] Prepare Java codebase for source release without requiring any git tags to be created or pushed
+* [ARROW-10651](https://issues.apache.org/jira/browse/ARROW-10651) - [C++] alloc-dealloc-mismatch in s3fs.cc
+* [ARROW-10652](https://issues.apache.org/jira/browse/ARROW-10652) - [C++][Gandiva] Make gandiva cache size configurable
+* [ARROW-10653](https://issues.apache.org/jira/browse/ARROW-10653) - [Rust]: Update toolchain version to bring new features
+* [ARROW-10654](https://issues.apache.org/jira/browse/ARROW-10654) - [Rust] Specialize parsing of floats / bools
+* [ARROW-10660](https://issues.apache.org/jira/browse/ARROW-10660) - [Rust] Implement AVX-512 bit or operation
+* [ARROW-10665](https://issues.apache.org/jira/browse/ARROW-10665) - [Rust] Add fast paths for common utf8 LIKE patterns
+* [ARROW-10666](https://issues.apache.org/jira/browse/ARROW-10666) - [Rust] [DataFusion] Support nested SELECT statements
+* [ARROW-10669](https://issues.apache.org/jira/browse/ARROW-10669) - [C++][Compute] Support Scalar inputs to boolean kernels
+* [ARROW-10672](https://issues.apache.org/jira/browse/ARROW-10672) - [Rust] [DataFusion] Make limit be computed as a stream
+* [ARROW-10673](https://issues.apache.org/jira/browse/ARROW-10673) - [Rust] [DataFusion] Make sort be computed on the stream
+* [ARROW-10674](https://issues.apache.org/jira/browse/ARROW-10674) - [Rust] Add integration tests for Decimal type
+* [ARROW-10677](https://issues.apache.org/jira/browse/ARROW-10677) - [Rust] Fix Bug and Add tests as documentation showing supported csv parsing
+* [ARROW-10679](https://issues.apache.org/jira/browse/ARROW-10679) - [Rust] [DataFusion] Implement SQL CASE WHEN physical expression
+* [ARROW-10680](https://issues.apache.org/jira/browse/ARROW-10680) - [Rust] [DataFusion] Implement TPC-H Query 12
+* [ARROW-10682](https://issues.apache.org/jira/browse/ARROW-10682) - [Rust] Sort kernel performance tuning
+* [ARROW-10685](https://issues.apache.org/jira/browse/ARROW-10685) - [Rust] [DataFusion] Add support for join on filter pushdown optimizer
+* [ARROW-10688](https://issues.apache.org/jira/browse/ARROW-10688) - [Rust] [DataFusion] Support CASE WHEN from DataFrame API
+* [ARROW-10689](https://issues.apache.org/jira/browse/ARROW-10689) - [Rust] [DataFusion] Support CASE WHEN from SQL
+* [ARROW-10693](https://issues.apache.org/jira/browse/ARROW-10693) - [Rust] [DataFusion] Add support for the left join
+* [ARROW-10696](https://issues.apache.org/jira/browse/ARROW-10696) - [C++] Investigate a bit run reader that would only return runs of set bits
+* [ARROW-10697](https://issues.apache.org/jira/browse/ARROW-10697) - [C++] Consolidate bitmap word readers
+* [ARROW-10703](https://issues.apache.org/jira/browse/ARROW-10703) - [Rust] [DataFusion] Make join not collect left on every part
+* [ARROW-10704](https://issues.apache.org/jira/browse/ARROW-10704) - [Rust][DataFusion] Remove Nested from expression enum
+* [ARROW-10708](https://issues.apache.org/jira/browse/ARROW-10708) - [Packaging][deb] Add support for Ubuntu 20.10
+* [ARROW-10709](https://issues.apache.org/jira/browse/ARROW-10709) - [Python] Difficult to make an efficient zero-copy file reader in Python
+* [ARROW-10712](https://issues.apache.org/jira/browse/ARROW-10712) - [Rust] [DataFusion] Add tests to TPC-H benchmarks
+* [ARROW-10717](https://issues.apache.org/jira/browse/ARROW-10717) - [Rust] [DataFusion] Add support for right join
+* [ARROW-10720](https://issues.apache.org/jira/browse/ARROW-10720) - [C++] Add BasicDecimal256 Rescale Support
+* [ARROW-10721](https://issues.apache.org/jira/browse/ARROW-10721) - [C\#][CI] Use .NET 3.1 by default
+* [ARROW-10722](https://issues.apache.org/jira/browse/ARROW-10722) - [Rust][DataFusion] Reduce overhead in data types in aggregations / joins, improve benchmarks
+* [ARROW-10723](https://issues.apache.org/jira/browse/ARROW-10723) - [Packaging][deb][RPM] Enable Parquet encryption
+* [ARROW-10724](https://issues.apache.org/jira/browse/ARROW-10724) - [Developer Tools] Add labeler for when PRs need rebase
+* [ARROW-10725](https://issues.apache.org/jira/browse/ARROW-10725) - [Python][Compute] Exposing bindings for sort options
+* [ARROW-10728](https://issues.apache.org/jira/browse/ARROW-10728) - [Rust] [DataFusion] Add SQL support for JOIN with USING clause
+* [ARROW-10729](https://issues.apache.org/jira/browse/ARROW-10729) - [Rust] [DataFusion] Add SQL support for JOIN using implicit syntax
+* [ARROW-10732](https://issues.apache.org/jira/browse/ARROW-10732) - [Rust] [DataFusion] Add SQL support for table/relation aliases and compound identifiers
+* [ARROW-10733](https://issues.apache.org/jira/browse/ARROW-10733) - [R] Improvements to Linux installation troubleshooting
+* [ARROW-10740](https://issues.apache.org/jira/browse/ARROW-10740) - [Rust][DataFusion] Remove redundant clones found by clippy
+* [ARROW-10741](https://issues.apache.org/jira/browse/ARROW-10741) - Apply clippy lints to source code, remove them from ignore list
+* [ARROW-10742](https://issues.apache.org/jira/browse/ARROW-10742) - [Python] Mask not checked when creating array from numpy array
+* [ARROW-10745](https://issues.apache.org/jira/browse/ARROW-10745) - [Rust] Allocate padding bytes in filter context
+* [ARROW-10747](https://issues.apache.org/jira/browse/ARROW-10747) - [Rust] Optimizations for csv reader
+* [ARROW-10750](https://issues.apache.org/jira/browse/ARROW-10750) - [Rust] [DataFusion] Add SQL support for LEFT and RIGHT join
+* [ARROW-10752](https://issues.apache.org/jira/browse/ARROW-10752) - [GLib] Add garrow\_schema\_has\_metadata()
+* [ARROW-10754](https://issues.apache.org/jira/browse/ARROW-10754) - [GLib] Add support for metadata to GArrowField
+* [ARROW-10755](https://issues.apache.org/jira/browse/ARROW-10755) - [Rust] [Parquet] Add support for writing boolean type
+* [ARROW-10756](https://issues.apache.org/jira/browse/ARROW-10756) - [Rust] Clippy - fix redundant clone
+* [ARROW-10759](https://issues.apache.org/jira/browse/ARROW-10759) - [Rust][DataFusion] Implement support for casting string to date in sql expressions
+* [ARROW-10763](https://issues.apache.org/jira/browse/ARROW-10763) - [Rust] Speed up take kernels
+* [ARROW-10765](https://issues.apache.org/jira/browse/ARROW-10765) - [Rust] Optimize take strings for non-null arrays
+* [ARROW-10767](https://issues.apache.org/jira/browse/ARROW-10767) - [Rust] Speed up sum kernel with nulls
+* [ARROW-10770](https://issues.apache.org/jira/browse/ARROW-10770) - [Rust] Support reading nested JSON lists
+* [ARROW-10772](https://issues.apache.org/jira/browse/ARROW-10772) - [Rust] Improve take performance
+* [ARROW-10775](https://issues.apache.org/jira/browse/ARROW-10775) - [Rust][DataFusion] Use ahash in hash join
+* [ARROW-10776](https://issues.apache.org/jira/browse/ARROW-10776) - [C++] Provide iterator access to primitive elements inside an Array
+* [ARROW-10781](https://issues.apache.org/jira/browse/ARROW-10781) - [Rust] [DataFusion] TableProvider should provide row count statistics
+* [ARROW-10783](https://issues.apache.org/jira/browse/ARROW-10783) - [Rust] [DataFusion] Implement row count statistics for Parquet TableProvider
+* [ARROW-10785](https://issues.apache.org/jira/browse/ARROW-10785) - Further optimize take string
+* [ARROW-10786](https://issues.apache.org/jira/browse/ARROW-10786) - [Packaging][RPM] Drop support for CentOS 6
+* [ARROW-10788](https://issues.apache.org/jira/browse/ARROW-10788) - [C++] Make S3 recursive walks parallel
+* [ARROW-10789](https://issues.apache.org/jira/browse/ARROW-10789) - [Rust][DataFusion] Make TableProvider dynamically typed
+* [ARROW-10790](https://issues.apache.org/jira/browse/ARROW-10790) - [C++][Compute] Investigate ChunkedArray sort performance
+* [ARROW-10792](https://issues.apache.org/jira/browse/ARROW-10792) - [Rust] [CI] Modularize CI for faster and smaller builds
+* [ARROW-10795](https://issues.apache.org/jira/browse/ARROW-10795) - [Rust] Fix specialization for arrow datatypes
+* [ARROW-10796](https://issues.apache.org/jira/browse/ARROW-10796) - [C++] Investigate RecordBatch sort performance
+* [ARROW-10800](https://issues.apache.org/jira/browse/ARROW-10800) - [Rust] [Parquet] Provide access to the elements of parquet::record::{List, Map}
+* [ARROW-10802](https://issues.apache.org/jira/browse/ARROW-10802) - [C++] Remove Dictionary[NullType] special casing in parquet column writer
+* [ARROW-10808](https://issues.apache.org/jira/browse/ARROW-10808) - [Rust] [DataFusion] Support nested expressions in aggregations
+* [ARROW-10809](https://issues.apache.org/jira/browse/ARROW-10809) - [C++] Use Datum for SortIndices() input
+* [ARROW-10812](https://issues.apache.org/jira/browse/ARROW-10812) - [Rust] Make BooleanArray not a PrimitiveArray
+* [ARROW-10813](https://issues.apache.org/jira/browse/ARROW-10813) - [Rust] [DataFusion] Implement DFSchema
+* [ARROW-10814](https://issues.apache.org/jira/browse/ARROW-10814) - [Packaging][deb] Drop support for Debian GNU/Linux Stretch
+* [ARROW-10817](https://issues.apache.org/jira/browse/ARROW-10817) - [Rust] [DataFusion] Implement TypedString
+* [ARROW-10820](https://issues.apache.org/jira/browse/ARROW-10820) - [Rust] [DataFusion] Complete TPC-H Benchmark Queries
+* [ARROW-10821](https://issues.apache.org/jira/browse/ARROW-10821) - [Rust] [DataFusion] Implement negative expression
+* [ARROW-10822](https://issues.apache.org/jira/browse/ARROW-10822) - [Rust] [DataFusion] Support compiling DataFusion with SIMD support
+* [ARROW-10824](https://issues.apache.org/jira/browse/ARROW-10824) - [Rust] Added PartialEq for NullArray
+* [ARROW-10825](https://issues.apache.org/jira/browse/ARROW-10825) - [Rust] Add support to NullArrays for MutableArrayData
+* [ARROW-10826](https://issues.apache.org/jira/browse/ARROW-10826) - [Rust] Add support for FixedSizeBinary to MutableArrayData
+* [ARROW-10827](https://issues.apache.org/jira/browse/ARROW-10827) - [Rust] Extend concatenate to all types
+* [ARROW-10828](https://issues.apache.org/jira/browse/ARROW-10828) - [Rust][DataFusion] Enable more clippy lints
+* [ARROW-10829](https://issues.apache.org/jira/browse/ARROW-10829) - [Rust] [DataFusion] Implement Into<Schema\> for DFSchema
+* [ARROW-10832](https://issues.apache.org/jira/browse/ARROW-10832) - [Rust] Evaluate latest snapshot flatc
+* [ARROW-10836](https://issues.apache.org/jira/browse/ARROW-10836) - [Rust] Extend take kernel to FixedSizeListArray
+* [ARROW-10838](https://issues.apache.org/jira/browse/ARROW-10838) - [Rust] [CI] Add CI for wasm32 target
+* [ARROW-10839](https://issues.apache.org/jira/browse/ARROW-10839) - [Rust] [DataFusion] Implement BETWEEN Operator
+* [ARROW-10843](https://issues.apache.org/jira/browse/ARROW-10843) - [C++] Add support for temporal types in sort family kernels
+* [ARROW-10845](https://issues.apache.org/jira/browse/ARROW-10845) - [Python][CI] Add python CI build using numpy nightly
+* [ARROW-10849](https://issues.apache.org/jira/browse/ARROW-10849) - [Python] Handle numpy deprecation warnings for builtin type aliases
+* [ARROW-10851](https://issues.apache.org/jira/browse/ARROW-10851) - [C++] Reduce code size of vector\_sort.cc
+* [ARROW-10857](https://issues.apache.org/jira/browse/ARROW-10857) - [Packaging] Follow PowerTools repository name change on CentOS 8
+* [ARROW-10858](https://issues.apache.org/jira/browse/ARROW-10858) - [C++][MSVC] Add missing Boost dependency
+* [ARROW-10861](https://issues.apache.org/jira/browse/ARROW-10861) - [Python] Update minimal NumPy version to 1.16.6
+* [ARROW-10864](https://issues.apache.org/jira/browse/ARROW-10864) - [Rust] Use standard ordering for floats
+* [ARROW-10865](https://issues.apache.org/jira/browse/ARROW-10865) - [Rust][DataFusion] More ergonomic conversion between Schema, SchemaRef, DFSchema, and DFSchemaRef
+* [ARROW-10867](https://issues.apache.org/jira/browse/ARROW-10867) - Build failure on aarch64 with -DARROW\_PYTHON=ON and gcc
+* [ARROW-10869](https://issues.apache.org/jira/browse/ARROW-10869) - [GLib] Add garrow\_\*\_sort\_indices() and related options
+* [ARROW-10870](https://issues.apache.org/jira/browse/ARROW-10870) - [Julia] Update website with Julia implementation
+* [ARROW-10871](https://issues.apache.org/jira/browse/ARROW-10871) - [Julia] Setup Julia CI via GitHub Actions
+* [ARROW-10873](https://issues.apache.org/jira/browse/ARROW-10873) - [C++] Apple Silicon is reported as arm64 in CMake
+* [ARROW-10874](https://issues.apache.org/jira/browse/ARROW-10874) - [Rust][DataFusion] Add table statistics for MemTable
+* [ARROW-10877](https://issues.apache.org/jira/browse/ARROW-10877) - [Rust] [DataFusion] Add benchmark based on kaggle movies
+* [ARROW-10878](https://issues.apache.org/jira/browse/ARROW-10878) - [Rust] Simplify extend\_from\_slice
+* [ARROW-10879](https://issues.apache.org/jira/browse/ARROW-10879) - [Packaging][deb] Restore Debian GNU/Linux Buster configuration
+* [ARROW-10881](https://issues.apache.org/jira/browse/ARROW-10881) - [C++] EXC\_BAD\_ACCESS in BaseSetBitRunReader<false\>::NextRun
+* [ARROW-10885](https://issues.apache.org/jira/browse/ARROW-10885) - [Rust][DataFusion] Optimize join build vs probe based on statistics on row number
+* [ARROW-10887](https://issues.apache.org/jira/browse/ARROW-10887) - [C++][Doc] Document IPC API
+* [ARROW-10889](https://issues.apache.org/jira/browse/ARROW-10889) - [Rust] Document our approach to unsafe code in README
+* [ARROW-10890](https://issues.apache.org/jira/browse/ARROW-10890) - [Rust] [DataFusion] JOIN support
+* [ARROW-10891](https://issues.apache.org/jira/browse/ARROW-10891) - [Rust][DataFusion] More clippy lints
+* [ARROW-10893](https://issues.apache.org/jira/browse/ARROW-10893) - [Rust] [DataFusion] Easier clippy fixes
+* [ARROW-10896](https://issues.apache.org/jira/browse/ARROW-10896) - [C++][CMake] Rename internal RE2 package name to "re2" from "RE2"
+* [ARROW-10900](https://issues.apache.org/jira/browse/ARROW-10900) - [Rust][DataFusion] Resolve TableScan provider eagerly
+* [ARROW-10904](https://issues.apache.org/jira/browse/ARROW-10904) - [Python] Add support for Python 3.9 macOS wheels
+* [ARROW-10905](https://issues.apache.org/jira/browse/ARROW-10905) - [Python] Add support for Python 3.9 windows wheels
+* [ARROW-10908](https://issues.apache.org/jira/browse/ARROW-10908) - [Rust] [DataFusion] Update relevant tpch-queries with BETWEEN
+* [ARROW-10917](https://issues.apache.org/jira/browse/ARROW-10917) - [Rust][Doc] Update feature matrix
+* [ARROW-10918](https://issues.apache.org/jira/browse/ARROW-10918) - [C++][Doc] Document supported Parquet features
+* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary
+* [ARROW-10929](https://issues.apache.org/jira/browse/ARROW-10929) - [Rust] Migrate CI tests to stable rust
+* [ARROW-10933](https://issues.apache.org/jira/browse/ARROW-10933) - [Rust] Update docs in regard to stable rust
+* [ARROW-10934](https://issues.apache.org/jira/browse/ARROW-10934) - [Python] Tests are failed with fsspec-0.8.5
+* [ARROW-10938](https://issues.apache.org/jira/browse/ARROW-10938) - [Rust] upgrade dependency "flatbuffers" to 0.8
+* [ARROW-10940](https://issues.apache.org/jira/browse/ARROW-10940) - [Rust] Extend sort kernel to ListArray
+* [ARROW-10941](https://issues.apache.org/jira/browse/ARROW-10941) - [Doc][C++] Document supported Parquet encryption features
+* [ARROW-10944](https://issues.apache.org/jira/browse/ARROW-10944) - [Rust] Implement min/max kernels for BooleanArray
+* [ARROW-10946](https://issues.apache.org/jira/browse/ARROW-10946) - [Rust] Make ChunkIter not depend on a buffer
+* [ARROW-10947](https://issues.apache.org/jira/browse/ARROW-10947) - [Rust][DataFusion] Refactor UTF8 to Date32 for Performance
+* [ARROW-10948](https://issues.apache.org/jira/browse/ARROW-10948) - [C++] Always use GTestConfig.cmake
+* [ARROW-10949](https://issues.apache.org/jira/browse/ARROW-10949) - [Rust] Avoid clones in getting values of boolean arrays
+* [ARROW-10951](https://issues.apache.org/jira/browse/ARROW-10951) - [Python][CI] Nightly pandas builds failing because of pytest monkeypatch issue
+* [ARROW-10952](https://issues.apache.org/jira/browse/ARROW-10952) - [Rust] Add pre-commit hook
+* [ARROW-10966](https://issues.apache.org/jira/browse/ARROW-10966) - [C++] Use FnOnce for ThreadPool's tasks instead of std::function
+* [ARROW-10968](https://issues.apache.org/jira/browse/ARROW-10968) - [Rust][DataFusion] Don't build hash table for right side of the join
+* [ARROW-10969](https://issues.apache.org/jira/browse/ARROW-10969) - [Rust][DataFusion] Implement ANSI SQL Functions
+* [ARROW-10985](https://issues.apache.org/jira/browse/ARROW-10985) - [Rust] Update unsafe guidelines for adding JIRA references
+* [ARROW-10986](https://issues.apache.org/jira/browse/ARROW-10986) - [Rust][DataFusion] Add average statistic to TPC-H benchmark too
+* [ARROW-10988](https://issues.apache.org/jira/browse/ARROW-10988) - [C++] Require CMake 3.5 or later
+* [ARROW-10989](https://issues.apache.org/jira/browse/ARROW-10989) - [Rust] Use slices for iterating primitive arrays
+* [ARROW-10993](https://issues.apache.org/jira/browse/ARROW-10993) - [CI][macOS] Fix Python 3.9 installation by Homebrew
+* [ARROW-10995](https://issues.apache.org/jira/browse/ARROW-10995) - [Rust] [DataFusion] Improve parallelism when reading Parquet files
+* [ARROW-11004](https://issues.apache.org/jira/browse/ARROW-11004) - [FlightRPC][Python] Header-based auth in clients
+* [ARROW-11005](https://issues.apache.org/jira/browse/ARROW-11005) - [Rust] Remove indirection from take kernel and simplify interface
+* [ARROW-11008](https://issues.apache.org/jira/browse/ARROW-11008) - [Rust][DataFusion] Simplify count accumulator
+* [ARROW-11009](https://issues.apache.org/jira/browse/ARROW-11009) - [Python] Add environment variable to elect default usage of system memory allocator instead of jemalloc/mimalloc
+* [ARROW-11010](https://issues.apache.org/jira/browse/ARROW-11010) - [Python] \`np.float\` deprecation warning in \`\_pandas\_logical\_type\_map\`
+* [ARROW-11012](https://issues.apache.org/jira/browse/ARROW-11012) - [Rust] [DataFusion] Make write\_csv and write\_parquet concurrent
+* [ARROW-11015](https://issues.apache.org/jira/browse/ARROW-11015) - [CI][Gandiva] Move gandiva nightly build from travis to github action
+* [ARROW-11018](https://issues.apache.org/jira/browse/ARROW-11018) - [Rust][DataFusion] Add null count column statistics
+* [ARROW-11026](https://issues.apache.org/jira/browse/ARROW-11026) - [Rust]: Run tests without requiring environment variables
+* [ARROW-11028](https://issues.apache.org/jira/browse/ARROW-11028) - [Rust] Somewhat pedantic pattern-matches
+* [ARROW-11029](https://issues.apache.org/jira/browse/ARROW-11029) - [Rust] [DataFusion] Document why join order optimization does not work with filter pushdown
+* [ARROW-11032](https://issues.apache.org/jira/browse/ARROW-11032) - [C++][FlightRPC] Add benchmark for local RPC through unix socket
+* [ARROW-11033](https://issues.apache.org/jira/browse/ARROW-11033) - [Rust] CSV writer performance improvements
+* [ARROW-11034](https://issues.apache.org/jira/browse/ARROW-11034) - [Rust] rustfmt cleanup
+* [ARROW-11035](https://issues.apache.org/jira/browse/ARROW-11035) - [Rust] Improve performance of cast to utf8 via FromIter
+* [ARROW-11037](https://issues.apache.org/jira/browse/ARROW-11037) - [Rust] Improve performance of string fromIter
+* [ARROW-11038](https://issues.apache.org/jira/browse/ARROW-11038) - [Rust] Remove \`BufferBuilderTrait\` and associated Result requirement.
+* [ARROW-11039](https://issues.apache.org/jira/browse/ARROW-11039) - [Rust] Improve performance for utf8 to float cast
+* [ARROW-11040](https://issues.apache.org/jira/browse/ARROW-11040) - [Rust] Simplify builders with generics
+* [ARROW-11042](https://issues.apache.org/jira/browse/ARROW-11042) - [Rust][DataFusion] Increase default batch size
+* [ARROW-11043](https://issues.apache.org/jira/browse/ARROW-11043) - [C++] Add "is\_nan" kernel
+* [ARROW-11046](https://issues.apache.org/jira/browse/ARROW-11046) - [Rust][DataFusion] Add count\_distinct to dataframe API
+* [ARROW-11049](https://issues.apache.org/jira/browse/ARROW-11049) - [Python] Expose alternate memory pools
+* [ARROW-11052](https://issues.apache.org/jira/browse/ARROW-11052) - [Rust] [DataFusion] Implement metrics in join operator
+* [ARROW-11053](https://issues.apache.org/jira/browse/ARROW-11053) - [Rust] [DataFusion] Optimize joins with dynamic capacity for output batches
+* [ARROW-11054](https://issues.apache.org/jira/browse/ARROW-11054) - Update SQLParser to 0.70
+* [ARROW-11055](https://issues.apache.org/jira/browse/ARROW-11055) - [Rust] [DataFusion] Support date\_trunc function
+* [ARROW-11058](https://issues.apache.org/jira/browse/ARROW-11058) - [Rust] [DataFusion] Implement "coalesce batches" operator
+* [ARROW-11063](https://issues.apache.org/jira/browse/ARROW-11063) - [Rust] Validate null counts when building arrays
+* [ARROW-11064](https://issues.apache.org/jira/browse/ARROW-11064) - [Rust][DataFusion] Speed up hash join on smaller batches
+* [ARROW-11072](https://issues.apache.org/jira/browse/ARROW-11072) - [Rust] [Parquet] Support int32 and int64 physical types
+* [ARROW-11076](https://issues.apache.org/jira/browse/ARROW-11076) - [Rust][DataFusion] Refactor usage of right indices in hash join
+* [ARROW-11079](https://issues.apache.org/jira/browse/ARROW-11079) - [R] Catch up on changelog since 2.0
+* [ARROW-11080](https://issues.apache.org/jira/browse/ARROW-11080) - [C++][Dataset] Improvements to implicit casting
+* [ARROW-11082](https://issues.apache.org/jira/browse/ARROW-11082) - [Rust] Add FFI for LargeUtf8
+* [ARROW-11086](https://issues.apache.org/jira/browse/ARROW-11086) - [Rust] Extend take to support more index types
+* [ARROW-11091](https://issues.apache.org/jira/browse/ARROW-11091) - [Rust][DataFusion] Fix clippy warning in rust 1.49
+* [ARROW-11095](https://issues.apache.org/jira/browse/ARROW-11095) - [Python] Access pyarrow.RecordBatch column by name
+* [ARROW-11096](https://issues.apache.org/jira/browse/ARROW-11096) - [Rust] Add FFI for [Large]Binary
+* [ARROW-11097](https://issues.apache.org/jira/browse/ARROW-11097) - [Rust] Simplify tests
+* [ARROW-11099](https://issues.apache.org/jira/browse/ARROW-11099) - [Rust]: Remove unsafe value\_slice method from PrimitiveArray and BooleanArray
+* [ARROW-11100](https://issues.apache.org/jira/browse/ARROW-11100) - [Rust] Speed up numeric to string cast using lexical\_core
+* [ARROW-11101](https://issues.apache.org/jira/browse/ARROW-11101) - [Rust] enable "cargo +nightly fmt" in git pre-commit hook
+* [ARROW-11104](https://issues.apache.org/jira/browse/ARROW-11104) - [GLib] Add append\_null/append\_nulls to GArrowArrayBuilder and use them
+* [ARROW-11105](https://issues.apache.org/jira/browse/ARROW-11105) - [Rust] Favor From/Into traits in MutableBuffer
+* [ARROW-11109](https://issues.apache.org/jira/browse/ARROW-11109) - [GLib] Add garrow\_array\_builder\_append\_empty\_value() and values()
+* [ARROW-11110](https://issues.apache.org/jira/browse/ARROW-11110) - [Rust] [DataFusion] context.table should not take a mutable self reference
+* [ARROW-11111](https://issues.apache.org/jira/browse/ARROW-11111) - [GLib] Add GArrowFixedSizeBinaryArrayBuilder
+* [ARROW-11121](https://issues.apache.org/jira/browse/ARROW-11121) - [Developer] Use pull\_request\_target for PR JIRA integration
+* [ARROW-11122](https://issues.apache.org/jira/browse/ARROW-11122) - [Rust] Add FFI for date and time
+* [ARROW-11124](https://issues.apache.org/jira/browse/ARROW-11124) - [Doc] Update status matrix for Decimal256
+* [ARROW-11125](https://issues.apache.org/jira/browse/ARROW-11125) - [Rust] Implement logical equality for list arrays
+* [ARROW-11126](https://issues.apache.org/jira/browse/ARROW-11126) - [Rust] Document and test ARROW-10656
+* [ARROW-11127](https://issues.apache.org/jira/browse/ARROW-11127) - [C++] Unused cpu\_info on non-x86 architecture
+* [ARROW-11129](https://issues.apache.org/jira/browse/ARROW-11129) - [Rust][DataFusion] Use tokio thread pool for loading parquet
+* [ARROW-11130](https://issues.apache.org/jira/browse/ARROW-11130) - [Website][CentOS 8][RHEL 8] Enable all required repositories by default
+* [ARROW-11131](https://issues.apache.org/jira/browse/ARROW-11131) - [Rust] Improve performance of bool\_equal
+* [ARROW-11136](https://issues.apache.org/jira/browse/ARROW-11136) - [R] Bindings for is.nan
+* [ARROW-11137](https://issues.apache.org/jira/browse/ARROW-11137) - [Rust][DataFusion] Fix Clippy needless\_range\_loop, needless\_lifetimes
+* [ARROW-11138](https://issues.apache.org/jira/browse/ARROW-11138) - [Rust] [DataFusion] Support ltrim, rtrim
+* [ARROW-11139](https://issues.apache.org/jira/browse/ARROW-11139) - [GLib] Add support for extension type
+* [ARROW-11155](https://issues.apache.org/jira/browse/ARROW-11155) - [C++][Packaging] Move gandiva crossbow jobs off of Travis-CI
+* [ARROW-11158](https://issues.apache.org/jira/browse/ARROW-11158) - [Julia] Implement Decimal256 support
+* [ARROW-11159](https://issues.apache.org/jira/browse/ARROW-11159) - [Developer] Consolidate pull request related jobs
+* [ARROW-11165](https://issues.apache.org/jira/browse/ARROW-11165) - [Rust] [DataFusion] Document the desired SQL dialect for DataFusion
+* [ARROW-11168](https://issues.apache.org/jira/browse/ARROW-11168) - [Rust] Fix cargo doc warnings
+* [ARROW-11169](https://issues.apache.org/jira/browse/ARROW-11169) - [Rust] Add a comment explaining where float total\_order algorithm came from
+* [ARROW-11175](https://issues.apache.org/jira/browse/ARROW-11175) - [R] Small docs fixes
+* [ARROW-11176](https://issues.apache.org/jira/browse/ARROW-11176) - [R] Expose memory pool name and document setting it
+* [ARROW-11187](https://issues.apache.org/jira/browse/ARROW-11187) - [Rust] [Parquet] Pin specific parquet-format-rs version
+* [ARROW-11188](https://issues.apache.org/jira/browse/ARROW-11188) - [Rust] Implement crypto functions from PostgreSQL dialect
+* [ARROW-11193](https://issues.apache.org/jira/browse/ARROW-11193) - [Documentation] Add docs for Java ListVector
+* [ARROW-11194](https://issues.apache.org/jira/browse/ARROW-11194) - [Rust] Enable SIMD for aarch64
+* [ARROW-11195](https://issues.apache.org/jira/browse/ARROW-11195) - [Rust] [DataFusion] Built-in table providers should expose relevant fields
+* [ARROW-11196](https://issues.apache.org/jira/browse/ARROW-11196) - [GLib] Add support for mock, HDFS and S3 file systems with factory function
+* [ARROW-11198](https://issues.apache.org/jira/browse/ARROW-11198) - [Packaging][Python] Ensure setuptools version during build supports markdown
+* [ARROW-11200](https://issues.apache.org/jira/browse/ARROW-11200) - [Rust] [DataFusion] Physical operators and expressions should have public accessor methods
+* [ARROW-11201](https://issues.apache.org/jira/browse/ARROW-11201) - [Rust] create\_batch\_empty - support more types
+* [ARROW-11203](https://issues.apache.org/jira/browse/ARROW-11203) - [Developer][Website] Enable JIRA and pull request integration
+* [ARROW-11204](https://issues.apache.org/jira/browse/ARROW-11204) - [C++] Fix build failure with bundled gRPC and Protobuf
+* [ARROW-11205](https://issues.apache.org/jira/browse/ARROW-11205) - [GLib][Dataset] Add GADFileFormat and its family
+* [ARROW-11209](https://issues.apache.org/jira/browse/ARROW-11209) - [Rust] [DataFusion] Provide better error message on unsupported GROUP BY
+* [ARROW-11210](https://issues.apache.org/jira/browse/ARROW-11210) - [CI] Restore workflows that had been blocked by INFRA
+* [ARROW-11212](https://issues.apache.org/jira/browse/ARROW-11212) - [Packaging][Python] Use vcpkg as dependency source for manylinux and windows wheels
+* [ARROW-11213](https://issues.apache.org/jira/browse/ARROW-11213) - [Packaging][Python] Dockerize wheel building on windows
+* [ARROW-11215](https://issues.apache.org/jira/browse/ARROW-11215) - [CI] Use named volumes by default for caching in docker-compose
+* [ARROW-11218](https://issues.apache.org/jira/browse/ARROW-11218) - [R] Make SubTreeFileSystem print method more informative
+* [ARROW-11219](https://issues.apache.org/jira/browse/ARROW-11219) - [CI][Ruby][MinGW] Reduce CI time
+* [ARROW-11221](https://issues.apache.org/jira/browse/ARROW-11221) - [Rust] [DataFusion] Implement GROUP BY support for Float32/Float64
+* [ARROW-11231](https://issues.apache.org/jira/browse/ARROW-11231) - [Packaging] Add mimalloc to Linux builds
+* [ARROW-11234](https://issues.apache.org/jira/browse/ARROW-11234) - [CI][Ruby][macOS] Reduce CI time
+* [ARROW-11236](https://issues.apache.org/jira/browse/ARROW-11236) - [Java] Bump Jackson to 2.11.4
+* [ARROW-11240](https://issues.apache.org/jira/browse/ARROW-11240) - [Packaging][R] Add mimalloc to R packaging
+* [ARROW-11242](https://issues.apache.org/jira/browse/ARROW-11242) - [CI] Remove CMake 3.2 job
+* [ARROW-11245](https://issues.apache.org/jira/browse/ARROW-11245) - [C++][Gandiva] Add support for LLVM 11.1
+* [ARROW-11247](https://issues.apache.org/jira/browse/ARROW-11247) - [C++] Infer date32 columns in CSV
+* [ARROW-11256](https://issues.apache.org/jira/browse/ARROW-11256) - [Packaging][Linux] Don't buffer packaging output
+* [ARROW-11272](https://issues.apache.org/jira/browse/ARROW-11272) - [Release][wheel] Remove unsupported Python 3.5 and manylinux1
+* [ARROW-11273](https://issues.apache.org/jira/browse/ARROW-11273) - [Release][deb] Remove unsupported Debian GNU/Linux stretch
+* [ARROW-11278](https://issues.apache.org/jira/browse/ARROW-11278) - [Release][NodeJS] Don't touch \~/.bash\_profile
+* [ARROW-11280](https://issues.apache.org/jira/browse/ARROW-11280) - [Release][APT] Fix minimal build example check
+* [ARROW-11281](https://issues.apache.org/jira/browse/ARROW-11281) - [C++] Remove needless runtime RapidJSON dependency
+* [ARROW-11282](https://issues.apache.org/jira/browse/ARROW-11282) - [Packaging][deb] Add missing libgflags-dev dependency
+* [ARROW-11285](https://issues.apache.org/jira/browse/ARROW-11285) - [Release][APT] Add support for Ubuntu Groovy
+* [ARROW-11292](https://issues.apache.org/jira/browse/ARROW-11292) - [Release][JS] Use Node.js LTS
+* [ARROW-11293](https://issues.apache.org/jira/browse/ARROW-11293) - [C++] Don't require Boost and gflags with find\_package(Arrow)
+* [ARROW-11307](https://issues.apache.org/jira/browse/ARROW-11307) - [Release][Ubuntu][20.10] Add workaround for dependency issue
+* [PARQUET-1566](https://issues.apache.org/jira/browse/PARQUET-1566) - [C++] Indicate if null count, distinct count are present in column statistics
+
+
+## Bug Fixes
+
+* [ARROW-2616](https://issues.apache.org/jira/browse/ARROW-2616) - [Python] Cross-compiling Pyarrow
+* [ARROW-6582](https://issues.apache.org/jira/browse/ARROW-6582) - [R] Arrow to R fails with embedded nuls in strings
+* [ARROW-7363](https://issues.apache.org/jira/browse/ARROW-7363) - [Python] Add combine\_chunks method to ChunkedArray
+* [ARROW-7909](https://issues.apache.org/jira/browse/ARROW-7909) - [Website] Add how to install on Red Hat Enterprise Linux
+* [ARROW-8258](https://issues.apache.org/jira/browse/ARROW-8258) - [Rust] [Parquet] ArrowReader fails on some timestamp types
+* [ARROW-9027](https://issues.apache.org/jira/browse/ARROW-9027) - [Python] Split in multiple files + clean-up pyarrow.parquet tests
+* [ARROW-9479](https://issues.apache.org/jira/browse/ARROW-9479) - [JS] Table.from fails for zero-item Lists, FixedSizeLists, Maps. ditto Table.empty
+* [ARROW-9636](https://issues.apache.org/jira/browse/ARROW-9636) - [Python] Update documentation about 'LZO' compression in parquet.write\_table
+* [ARROW-9776](https://issues.apache.org/jira/browse/ARROW-9776) - [R] read\_feather causes segfault in R if file doesn't exist
+* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern
+* [ARROW-9898](https://issues.apache.org/jira/browse/ARROW-9898) - [C++][Gandiva] Error handling in castINT fails in some environments
+* [ARROW-9903](https://issues.apache.org/jira/browse/ARROW-9903) - [R] open\_dataset freezes opening feather files on Windows
+* [ARROW-9963](https://issues.apache.org/jira/browse/ARROW-9963) - [Python] Recognize datetime.timezone.utc as UTC on conversion python-\>pyarrow
+* [ARROW-10039](https://issues.apache.org/jira/browse/ARROW-10039) - [Rust] Do not require memory alignment of buffers
+* [ARROW-10042](https://issues.apache.org/jira/browse/ARROW-10042) - [Rust] Buffer equalities may be incorrect
+* [ARROW-10080](https://issues.apache.org/jira/browse/ARROW-10080) - [R] Arrow does not release unused memory
+* [ARROW-10122](https://issues.apache.org/jira/browse/ARROW-10122) - [Python] Selecting one column of multi-index results in a duplicated value column.
+* [ARROW-10145](https://issues.apache.org/jira/browse/ARROW-10145) - [C++][Dataset] Assert integer overflow in partitioning falls back to string
+* [ARROW-10146](https://issues.apache.org/jira/browse/ARROW-10146) - [Python] Parquet metadata to\_dict raises attribute error
+* [ARROW-10174](https://issues.apache.org/jira/browse/ARROW-10174) - [Java] Reading of Dictionary encoded struct vector fails
+* [ARROW-10177](https://issues.apache.org/jira/browse/ARROW-10177) - [CI][Gandiva] Nightly gandiva-jar-xenial fails
+* [ARROW-10186](https://issues.apache.org/jira/browse/ARROW-10186) - [Rust] Tests fail when following instructions in README
+* [ARROW-10247](https://issues.apache.org/jira/browse/ARROW-10247) - [C++][Dataset] Cannot write dataset with dictionary column as partition field
+* [ARROW-10264](https://issues.apache.org/jira/browse/ARROW-10264) - [C++][Python] Parquet test failing with HadoopFileSystem URI
+* [ARROW-10270](https://issues.apache.org/jira/browse/ARROW-10270) - [R] Fix CSV timestamp\_parsers test on R-devel
+* [ARROW-10283](https://issues.apache.org/jira/browse/ARROW-10283) - [Python] Python deprecation warning for "PY\_SSIZE\_T\_CLEAN will be required for '\#' formats"
+* [ARROW-10293](https://issues.apache.org/jira/browse/ARROW-10293) - [Rust] [DataFusion] Fix benchmarks
+* [ARROW-10294](https://issues.apache.org/jira/browse/ARROW-10294) - [Java] Resolve problems of DecimalVector APIs on ArrowBufs
+* [ARROW-10321](https://issues.apache.org/jira/browse/ARROW-10321) - [C++] Building AVX512 code when we should not
+* [ARROW-10333](https://issues.apache.org/jira/browse/ARROW-10333) - [Java] Remove split packages in arrow-memory-core and arrow-vectors
+* [ARROW-10345](https://issues.apache.org/jira/browse/ARROW-10345) - [C++] NaN breaks sorting
+* [ARROW-10346](https://issues.apache.org/jira/browse/ARROW-10346) - [Python] Default S3 region is eu-central-1 even with LANG=C
+* [ARROW-10348](https://issues.apache.org/jira/browse/ARROW-10348) - [C++] Fix crash on invalid Parquet file (OSS-Fuzz)
+* [ARROW-10350](https://issues.apache.org/jira/browse/ARROW-10350) - [Rust] parquet\_derive crate cannot be published to crates.io
+* [ARROW-10353](https://issues.apache.org/jira/browse/ARROW-10353) - [C++] Parquet decompresses DataPageV2 pages even if is\_compressed==0
+* [ARROW-10358](https://issues.apache.org/jira/browse/ARROW-10358) - [R] Followups to 2.0.0 release
+* [ARROW-10365](https://issues.apache.org/jira/browse/ARROW-10365) - [R] Remove duplicate setting of S3 flag on macOS
+* [ARROW-10369](https://issues.apache.org/jira/browse/ARROW-10369) - [Dev] Fix archery release utility test cases
+* [ARROW-10371](https://issues.apache.org/jira/browse/ARROW-10371) - [R] Linux system requirements check needs to support older cmake versions
+* [ARROW-10386](https://issues.apache.org/jira/browse/ARROW-10386) - [R] List column class attributes not preserved in roundtrip
+* [ARROW-10388](https://issues.apache.org/jira/browse/ARROW-10388) - [Java] Fix Spark integration build failure
+* [ARROW-10390](https://issues.apache.org/jira/browse/ARROW-10390) - [Rust] [Parquet] Regression: cannot implement custom ParquetWriter because \`TryClone\` is not publicly exported
+* [ARROW-10393](https://issues.apache.org/jira/browse/ARROW-10393) - [Rust]: Fix null value reading in the JSON reader for both dictionary and string builders
+* [ARROW-10394](https://issues.apache.org/jira/browse/ARROW-10394) - [Rust] [Large]BinaryArray can be created from non-binary datatypes
+* [ARROW-10397](https://issues.apache.org/jira/browse/ARROW-10397) - [C++] Outdated and confusing comment on dictionary indices
+* [ARROW-10399](https://issues.apache.org/jira/browse/ARROW-10399) - [R] Fix performance regression from cpp11::r\_string
+* [ARROW-10411](https://issues.apache.org/jira/browse/ARROW-10411) - [C++] Fix incorrect child array lengths for Concatenate of FixedSizeList
+* [ARROW-10412](https://issues.apache.org/jira/browse/ARROW-10412) - [C++] CMake Build Fails with grpc 1.33.1, "GRPC\_CPP\_PLUGIN-NOTFOUND: program not found or is not executable"
+* [ARROW-10413](https://issues.apache.org/jira/browse/ARROW-10413) - [Rust] [Parquet] Unignore some roundtrip tests that are passing now
+* [ARROW-10414](https://issues.apache.org/jira/browse/ARROW-10414) - [R] open\_dataset doesn't work with absolute/expanded paths on Windows
+* [ARROW-10426](https://issues.apache.org/jira/browse/ARROW-10426) - [C++] Arrow type large\_string cannot be written to Parquet type column descriptor
+* [ARROW-10433](https://issues.apache.org/jira/browse/ARROW-10433) - [Python] pyarrow doesn't work with s3fs\>=0.5
+* [ARROW-10434](https://issues.apache.org/jira/browse/ARROW-10434) - [Rust] Debug formatting arrays with lengths greater than 10 and less than 20 produces incorrect values
+* [ARROW-10441](https://issues.apache.org/jira/browse/ARROW-10441) - [FlightRPC][Java] FlightClients from FlightGrpcUtils\#createFlightClient shutdown gRPC channel when closed
+* [ARROW-10446](https://issues.apache.org/jira/browse/ARROW-10446) - [C++][Python] Timezone-aware pd.Timestamp's are incorrectly converted to Timestamp arrays
+* [ARROW-10448](https://issues.apache.org/jira/browse/ARROW-10448) - [Rust] PrimitiveArray::new can create arrays not in spec
+* [ARROW-10453](https://issues.apache.org/jira/browse/ARROW-10453) - [Rust] [DataFusion] Performance degradation after removing specialization
+* [ARROW-10457](https://issues.apache.org/jira/browse/ARROW-10457) - [CI] Fix Spark branch-3.0 integration tests
+* [ARROW-10461](https://issues.apache.org/jira/browse/ARROW-10461) - [Rust] Offset related bug in BitChunks::remainder\_bits
+* [ARROW-10462](https://issues.apache.org/jira/browse/ARROW-10462) - [Python] ParquetDatasetPiece's path broken when using fsspec fs on Windows
+* [ARROW-10463](https://issues.apache.org/jira/browse/ARROW-10463) - [R] Better messaging for currently unsupported CSV options in open\_dataset
+* [ARROW-10470](https://issues.apache.org/jira/browse/ARROW-10470) - [R] Fix missing file error causing NYC taxi example to fail
+* [ARROW-10471](https://issues.apache.org/jira/browse/ARROW-10471) - [CI][Python] Ensure we have tests with s3fs and run those on CI
+* [ARROW-10472](https://issues.apache.org/jira/browse/ARROW-10472) - [C++][Python] casting a scalar timestamp to date32 results in Aborted (core dump)
+* [ARROW-10475](https://issues.apache.org/jira/browse/ARROW-10475) - [C++][FlightRPC] Arrow Flight Server / Client cannot be initialized with IPv6 host
+* [ARROW-10480](https://issues.apache.org/jira/browse/ARROW-10480) - [Python] Parquet write\_table creates gzipped Parquet file, not Parquet with gzip compression
+* [ARROW-10482](https://issues.apache.org/jira/browse/ARROW-10482) - [Python] Specifying compression type on a column basis when writing Parquet not working
+* [ARROW-10489](https://issues.apache.org/jira/browse/ARROW-10489) - [C++] Unable to configure or make with Intel compiler
+* [ARROW-10491](https://issues.apache.org/jira/browse/ARROW-10491) - [FlightRPC][Java] Fix NPE when using FlightProducer without interceptors
+* [ARROW-10493](https://issues.apache.org/jira/browse/ARROW-10493) - [C++][Parquet] Writing nullable nested strings results in wrong data in file
+* [ARROW-10495](https://issues.apache.org/jira/browse/ARROW-10495) - [C++] find\_package(Arrow) is broken on Ubuntu 18
+* [ARROW-10496](https://issues.apache.org/jira/browse/ARROW-10496) - [R][CI] Fix conda-r job
+* [ARROW-10499](https://issues.apache.org/jira/browse/ARROW-10499) - [C++][Java] Fix ORC Java JNI Crash
+* [ARROW-10502](https://issues.apache.org/jira/browse/ARROW-10502) - [C++/Python] CUDA detection messes up nightly conda-win builds
+* [ARROW-10503](https://issues.apache.org/jira/browse/ARROW-10503) - [C++] Uriparser will not compile using Intel compiler
+* [ARROW-10508](https://issues.apache.org/jira/browse/ARROW-10508) - [Java] Allow FixedSizeListVector to have empty children
+* [ARROW-10509](https://issues.apache.org/jira/browse/ARROW-10509) - [C++] Define operator<<(ostream, ParquetException) for clang+Windows
+* [ARROW-10511](https://issues.apache.org/jira/browse/ARROW-10511) - [Python] Table.to\_pandas() failing when timezone-awareness mismatch in metadata
+* [ARROW-10518](https://issues.apache.org/jira/browse/ARROW-10518) - Fix cast function issues in Gandiva
+* [ARROW-10519](https://issues.apache.org/jira/browse/ARROW-10519) - [Python] Deadlock when PyArrow imports Pandas from multiple threads
+* [ARROW-10525](https://issues.apache.org/jira/browse/ARROW-10525) - [C++] Fix crash on unsupported IPC stream (OSS-Fuzz)
+* [ARROW-10532](https://issues.apache.org/jira/browse/ARROW-10532) - [Python] Mangled pandas\_metadata when specified schema has different order as DataFrame columns
+* [ARROW-10545](https://issues.apache.org/jira/browse/ARROW-10545) - [C++] Fix crash on invalid Parquet file (OSS-Fuzz)
+* [ARROW-10546](https://issues.apache.org/jira/browse/ARROW-10546) - [Python] Deprecate the S3FSWrapper class
+* [ARROW-10547](https://issues.apache.org/jira/browse/ARROW-10547) - [Rust][DataFusion] Filter pushdown loses filters if below a user defined node
+* [ARROW-10551](https://issues.apache.org/jira/browse/ARROW-10551) - [Rust]: Fix unreproducible benchmarks
+* [ARROW-10558](https://issues.apache.org/jira/browse/ARROW-10558) - [Python] Filesystem S3 tests not independent (native s3 influences s3fs)
+* [ARROW-10560](https://issues.apache.org/jira/browse/ARROW-10560) - [Python] Crash when creating array with string over 2GB
+* [ARROW-10563](https://issues.apache.org/jira/browse/ARROW-10563) - [Packaging][C++] CMake find\_package(Arrow 2.0 CONFIG REQUIRED) broken
+* [ARROW-10565](https://issues.apache.org/jira/browse/ARROW-10565) - [Python] Table.from\_batches and Table.from\_pandas have argument Schema\_schema in documentation instead of schema
+* [ARROW-10568](https://issues.apache.org/jira/browse/ARROW-10568) - [C++][Parquet] Parquet writer crashes process when Tell() does not succeed
+* [ARROW-10569](https://issues.apache.org/jira/browse/ARROW-10569) - [C++][Python] Poor Table filtering performance
+* [ARROW-10577](https://issues.apache.org/jira/browse/ARROW-10577) - [Rust][DataFusion] Hash Aggregator stream finishes unexpectedly after going to Pending state
+* [ARROW-10578](https://issues.apache.org/jira/browse/ARROW-10578) - [C++] Comparison kernels crashing for string array with null string scalar
+* [ARROW-10610](https://issues.apache.org/jira/browse/ARROW-10610) - [C++] arrow-utility-test and arrow-csv-test causes failures on a big-endian platform
+* [ARROW-10616](https://issues.apache.org/jira/browse/ARROW-10616) - [Developer] Expand PR labeler to all supported languages
+* [ARROW-10617](https://issues.apache.org/jira/browse/ARROW-10617) - [Python] RecordBatchStreamReader's iterator doesn't work with python 3.8
+* [ARROW-10619](https://issues.apache.org/jira/browse/ARROW-10619) - [C++] Fix crash on unsupported IPC stream (OSS-Fuzz)
+* [ARROW-10620](https://issues.apache.org/jira/browse/ARROW-10620) - [Rust][Parquet] move column chunk range logic to metadata.rs
+* [ARROW-10621](https://issues.apache.org/jira/browse/ARROW-10621) - [Java] flight-core test causes a failure on s390x
+* [ARROW-10622](https://issues.apache.org/jira/browse/ARROW-10622) - [R] Nameof<\>() is incorrect in r-arrow build environment
+* [ARROW-10623](https://issues.apache.org/jira/browse/ARROW-10623) - [R] Version 1.0.1 breaks data.frame attributes when reading file written by 2.0.0
+* [ARROW-10624](https://issues.apache.org/jira/browse/ARROW-10624) - [R] Proactively remove "problems" attributes
+* [ARROW-10627](https://issues.apache.org/jira/browse/ARROW-10627) - [Rust] Github master does not compile for WASM target
+* [ARROW-10629](https://issues.apache.org/jira/browse/ARROW-10629) - [CI] MinGW builds broken on Github Actions
+* [ARROW-10631](https://issues.apache.org/jira/browse/ARROW-10631) - [Rust] Equality of fixed-sized binary is incorrect.
+* [ARROW-10642](https://issues.apache.org/jira/browse/ARROW-10642) - [R] Can't get Table from RecordBatchReader with 0 batches
+* [ARROW-10656](https://issues.apache.org/jira/browse/ARROW-10656) - [Rust] New RecordBatch requires exact match of Data Types
+* [ARROW-10661](https://issues.apache.org/jira/browse/ARROW-10661) - [C\#] Fix benchmarking project
+* [ARROW-10662](https://issues.apache.org/jira/browse/ARROW-10662) - [Java] Avoid integer overflow for Json file reader
+* [ARROW-10663](https://issues.apache.org/jira/browse/ARROW-10663) - [C++/Doc] The IsIn kernel ignores the skip\_nulls option of SetLookupOptions
+* [ARROW-10667](https://issues.apache.org/jira/browse/ARROW-10667) - [Rust] [Parquet] Add a convenience type for writing Parquet to memory
+* [ARROW-10668](https://issues.apache.org/jira/browse/ARROW-10668) - [R] Filtering does not work with .data pronoun
+* [ARROW-10681](https://issues.apache.org/jira/browse/ARROW-10681) - [Rust] [DataFusion] TPC-H Query 12 fails with scheduler error
+* [ARROW-10684](https://issues.apache.org/jira/browse/ARROW-10684) - [Rust] Logical equality should consider parent array nullability
+* [ARROW-10690](https://issues.apache.org/jira/browse/ARROW-10690) - [Java] ComplexCopier gives incorrect result for list vector if target vector is non-empty
+* [ARROW-10692](https://issues.apache.org/jira/browse/ARROW-10692) - [Rust] Segfault while array buffer append
+* [ARROW-10699](https://issues.apache.org/jira/browse/ARROW-10699) - [C++] BitmapUInt64Reader doesn't work on big-endian
+* [ARROW-10701](https://issues.apache.org/jira/browse/ARROW-10701) - [Rust] [DataFusion] Benchmark sort\_limit\_query\_sql fails because ORDER BY clause specifies column index instead of expression
+* [ARROW-10705](https://issues.apache.org/jira/browse/ARROW-10705) - [Rust] Lifetime annotations in the IPC writer are too strict, preventing code reuse
+* [ARROW-10710](https://issues.apache.org/jira/browse/ARROW-10710) - [Rust] Example flight server is broken after tokio upgrade (among other things)
+* [ARROW-10711](https://issues.apache.org/jira/browse/ARROW-10711) - [CI] Remove set-env from auto-tune to work with new GHA settings
+* [ARROW-10719](https://issues.apache.org/jira/browse/ARROW-10719) - [C\#] ArrowStreamWriter doesn't write schema metadata
+* [ARROW-10746](https://issues.apache.org/jira/browse/ARROW-10746) - [C++] Use GTEST\_SKIP in parquet encoding tests
+* [ARROW-10748](https://issues.apache.org/jira/browse/ARROW-10748) - [Java] TimeStampMilliVector cannot be cast to TimeStampMilliTZVector
+* [ARROW-10749](https://issues.apache.org/jira/browse/ARROW-10749) - [C++] Incorrect string format for Datum with the collection type
+* [ARROW-10751](https://issues.apache.org/jira/browse/ARROW-10751) - [C++] Add RE2 to minimal build example
+* [ARROW-10753](https://issues.apache.org/jira/browse/ARROW-10753) - [Rust] [DataFusion] Negative numbers in SQL WHERE clause not parsed correctly
+* [ARROW-10757](https://issues.apache.org/jira/browse/ARROW-10757) - [Rust] [CI] Sporadic failures due to disk filling up
+* [ARROW-10760](https://issues.apache.org/jira/browse/ARROW-10760) - [Rust] [DataFusion] Predicate push down does not support joins correctly
+* [ARROW-10769](https://issues.apache.org/jira/browse/ARROW-10769) - [CI] Integration tests are failing in master
+* [ARROW-10774](https://issues.apache.org/jira/browse/ARROW-10774) - [R] Set minimum cpp11 version
+* [ARROW-10777](https://issues.apache.org/jira/browse/ARROW-10777) - [Packaging][Python] PyPI pyarrow source dist (sdist) contains architecture dependent binaries
+* [ARROW-10778](https://issues.apache.org/jira/browse/ARROW-10778) - [Python] RowGroupInfo.statistics errors for empty row group
+* [ARROW-10779](https://issues.apache.org/jira/browse/ARROW-10779) - [Java] writeNull method in UnionListWriter doesn't work correctly if validity at that index is already set
+* [ARROW-10780](https://issues.apache.org/jira/browse/ARROW-10780) - [R] Update known R installation issues for CentOS 7
+* [ARROW-10791](https://issues.apache.org/jira/browse/ARROW-10791) - [Rust] StreamReader, read\_dictionary duplicating schema info
+* [ARROW-10801](https://issues.apache.org/jira/browse/ARROW-10801) - [Rust] [Flight] Support sending FlightData for Dictionaries with that of a RecordBatch
+* [ARROW-10803](https://issues.apache.org/jira/browse/ARROW-10803) - [R] Support R \>= 3.3 and add CI
+* [ARROW-10804](https://issues.apache.org/jira/browse/ARROW-10804) - [Rust] Remove UB on parquet crate
+* [ARROW-10807](https://issues.apache.org/jira/browse/ARROW-10807) - [Rust][DataFusion] Avoid double hashing
+* [ARROW-10810](https://issues.apache.org/jira/browse/ARROW-10810) - [Rust] Speed up comparison kernels
+* [ARROW-10811](https://issues.apache.org/jira/browse/ARROW-10811) - [R][CI] Remove nightly centos6 build
+* [ARROW-10823](https://issues.apache.org/jira/browse/ARROW-10823) - MutableArrayData with use\_null false yields wrong results
+* [ARROW-10830](https://issues.apache.org/jira/browse/ARROW-10830) - [Rust] json reader should not hard crash on invalid json
+* [ARROW-10833](https://issues.apache.org/jira/browse/ARROW-10833) - [Python] Avoid usage of NumPy's PyArray\_DescrCheck macro
+* [ARROW-10834](https://issues.apache.org/jira/browse/ARROW-10834) - [R] Fix print method for SubTreeFileSystem
+* [ARROW-10837](https://issues.apache.org/jira/browse/ARROW-10837) - [Rust] Use \`Vec<u8\>\` for hash key instead
+* [ARROW-10840](https://issues.apache.org/jira/browse/ARROW-10840) - [C++] Parquet FileMetaData does not have key\_value\_metadata when built from FileMetaDataBuilder
+* [ARROW-10842](https://issues.apache.org/jira/browse/ARROW-10842) - [Rust] decouple IO from json schema inference code
+* [ARROW-10844](https://issues.apache.org/jira/browse/ARROW-10844) - [Rust] [DataFusion] join of two DataFrames is not possible
+* [ARROW-10850](https://issues.apache.org/jira/browse/ARROW-10850) - [R] Unrecognized compression type: LZ4
+* [ARROW-10852](https://issues.apache.org/jira/browse/ARROW-10852) - [C++] AssertTablesEqual(verbose=true) segfaults if the left array has more rows
+* [ARROW-10854](https://issues.apache.org/jira/browse/ARROW-10854) - [Rust] [DataFusion] Simplified logical scans
+* [ARROW-10855](https://issues.apache.org/jira/browse/ARROW-10855) - [Python][Numpy] ArrowTypeError after upgrading NumPy to 1.20.0rc1
+* [ARROW-10856](https://issues.apache.org/jira/browse/ARROW-10856) - [R] CentOS 7 not correctly identifying compiler version
+* [ARROW-10859](https://issues.apache.org/jira/browse/ARROW-10859) - [Rust] [DataFusion] Make collect not require ExecutionContext
+* [ARROW-10860](https://issues.apache.org/jira/browse/ARROW-10860) - [Java] Avoid integer overflow for generated classes in Vector
+* [ARROW-10863](https://issues.apache.org/jira/browse/ARROW-10863) - [Python] ExtensionArray.to\_pandas not working
+* [ARROW-10875](https://issues.apache.org/jira/browse/ARROW-10875) - Simplify SIMD cfg check
+* [ARROW-10876](https://issues.apache.org/jira/browse/ARROW-10876) - [Rust] json reader should validate value type
+* [ARROW-10897](https://issues.apache.org/jira/browse/ARROW-10897) - [Rust] Replace Arc<String\> by String in DataType::Timestamp
+* [ARROW-10907](https://issues.apache.org/jira/browse/ARROW-10907) - [Rust] Cast from UTF8 to Date64 is incorrect
+* [ARROW-10913](https://issues.apache.org/jira/browse/ARROW-10913) - [Python][Doc] Code block typo in filesystems docs
+* [ARROW-10914](https://issues.apache.org/jira/browse/ARROW-10914) - [Rust]: SIMD implementation of arithmetic kernels reads out of bounds
+* [ARROW-10915](https://issues.apache.org/jira/browse/ARROW-10915) - [Rust] Make ARROW\_TEST\_DATA and PARQUET\_TEST\_DATA absolute dirs
+* [ARROW-10921](https://issues.apache.org/jira/browse/ARROW-10921) - \`TypeError: 'coroutine' object is not iterable\` when reading parquet partitions via s3fs \>= 0.5 with pyarrow
+* [ARROW-10930](https://issues.apache.org/jira/browse/ARROW-10930) - [Python] LargeListType doesn't have a value\_field
+* [ARROW-10932](https://issues.apache.org/jira/browse/ARROW-10932) - [C++] BinaryMemoTable::CopyOffsets access out-of-bound address when data is empty
+* [ARROW-10942](https://issues.apache.org/jira/browse/ARROW-10942) - [C++] S3FileSystem::Impl::IsEmptyDirectory fails on Amazon S3
+* [ARROW-10943](https://issues.apache.org/jira/browse/ARROW-10943) - [Rust] Intermittent build failure in parquet encoding
+* [ARROW-10954](https://issues.apache.org/jira/browse/ARROW-10954) - [C++][Doc] PlasmaClient is thread-safe now, but the doc is not updated
+* [ARROW-10955](https://issues.apache.org/jira/browse/ARROW-10955) - [C++] Reading empty json lists results in invalid non-nullable null type
+* [ARROW-10960](https://issues.apache.org/jira/browse/ARROW-10960) - [C++][FlightRPC] Missing protobuf data\_body should result in default value of empty bytes, not null
+* [ARROW-10962](https://issues.apache.org/jira/browse/ARROW-10962) - [Java][FlightRPC] FlightData deserializer should accept missing fields
+* [ARROW-10967](https://issues.apache.org/jira/browse/ARROW-10967) - [Rust] Make env vars ARROW\_TEST\_DATA and PARQUET\_TEST\_DATA optional
+* [ARROW-10990](https://issues.apache.org/jira/browse/ARROW-10990) - [Rust]: SIMD implementation of compare kernels reads out of bounds
+* [ARROW-10994](https://issues.apache.org/jira/browse/ARROW-10994) - [Rust] Fix bugs in TPC-H file conversion
+* [ARROW-10996](https://issues.apache.org/jira/browse/ARROW-10996) - [Rust] Return error messages via Result for get\_arrow\_schema\_from\_metadata
+* [ARROW-10999](https://issues.apache.org/jira/browse/ARROW-10999) - [Rust] TPC-H parquet files cannot be read by Apache Spark
+* [ARROW-11014](https://issues.apache.org/jira/browse/ARROW-11014) - [Rust] [DataFusion] ParquetExec reports incorrect statistics
+* [ARROW-11023](https://issues.apache.org/jira/browse/ARROW-11023) - [C++][CMake] gRPC doesn't respect CMAKE\_CXX\_COMPILER
+* [ARROW-11024](https://issues.apache.org/jira/browse/ARROW-11024) - [C++][Parquet] Writing List<Struct\> to parquet sometimes writes wrong data
+* [ARROW-11025](https://issues.apache.org/jira/browse/ARROW-11025) - [Rust] Bench for boolean kernels measure array creation
+* [ARROW-11030](https://issues.apache.org/jira/browse/ARROW-11030) - [Rust] [DataFusion] HashJoinExec slow with many batches
+* [ARROW-11048](https://issues.apache.org/jira/browse/ARROW-11048) - [Rust] Add bench to MutableBuffer
+* [ARROW-11050](https://issues.apache.org/jira/browse/ARROW-11050) - [R] Handle RecordBatch in write\_parquet
+* [ARROW-11067](https://issues.apache.org/jira/browse/ARROW-11067) - [C++] CSV reader returns nulls for some strings on macOS
+* [ARROW-11069](https://issues.apache.org/jira/browse/ARROW-11069) - [C++] Parquet writer incorrect data being written when data type is struct
+* [ARROW-11073](https://issues.apache.org/jira/browse/ARROW-11073) - [Rust] Lint Error on CI Tests in /arrow/rust/arrow/src/ipc/reader.rs
+* [ARROW-11083](https://issues.apache.org/jira/browse/ARROW-11083) - [CI] Build "Source Release and Merge Script" is broken
+* [ARROW-11084](https://issues.apache.org/jira/browse/ARROW-11084) - [Rust] Clippy failing in master
+* [ARROW-11085](https://issues.apache.org/jira/browse/ARROW-11085) - [Rust] Rust CI no longer works because it uses action-rs: Migrate CI away from action-rs/\*
+* [ARROW-11092](https://issues.apache.org/jira/browse/ARROW-11092) - [CI] (Temporarily) move offending workflows to separate files
+* [ARROW-11102](https://issues.apache.org/jira/browse/ARROW-11102) - [Rust][DataFusion] fmt::Debug for ScalarValue(Utf8) is always quoted
+* [ARROW-11113](https://issues.apache.org/jira/browse/ARROW-11113) - [Rust] support as\_struct\_array cast
+* [ARROW-11114](https://issues.apache.org/jira/browse/ARROW-11114) - [Java] Metadata serialization is broken for Field class
+* [ARROW-11132](https://issues.apache.org/jira/browse/ARROW-11132) - [CI] Use pip to install crossbow's dependencies for the comment bot
+* [ARROW-11144](https://issues.apache.org/jira/browse/ARROW-11144) - [C++][Python][CI] Fix HDFS nightly build
+* [ARROW-11152](https://issues.apache.org/jira/browse/ARROW-11152) - [CI][C++] Fix Homebrew numpy installation on macOS builds
+* [ARROW-11162](https://issues.apache.org/jira/browse/ARROW-11162) - [C++] Fix crash on Decimal256 Parquet file (OSS-Fuzz)
+* [ARROW-11163](https://issues.apache.org/jira/browse/ARROW-11163) - [C++][Python] Compressed Feather file written with pyarrow 0.17 not readable in pyarrow 2.0.0+
+* [ARROW-11166](https://issues.apache.org/jira/browse/ARROW-11166) - [Python][Compute] Add bindings for ProjectOptions
+* [ARROW-11171](https://issues.apache.org/jira/browse/ARROW-11171) - [Go] Build fails on s390x with noasm tag
+* [ARROW-11189](https://issues.apache.org/jira/browse/ARROW-11189) - [Developer] Archery benchmark diff cannot compare two JSONs
+* [ARROW-11190](https://issues.apache.org/jira/browse/ARROW-11190) - [C++][Dataset] Clean up compiler warnings
+* [ARROW-11202](https://issues.apache.org/jira/browse/ARROW-11202) - [R][CI] Nightly builds not happening (or artifacts not exported)
+* [ARROW-11224](https://issues.apache.org/jira/browse/ARROW-11224) - [R] don't test metadata serialization on old R versions
+* [ARROW-11226](https://issues.apache.org/jira/browse/ARROW-11226) - [Python][CI] Filesystem tests failing with s3fs 0.5.2
+* [ARROW-11227](https://issues.apache.org/jira/browse/ARROW-11227) - [Python][CI] AMD64 Conda Python 3.7 Pandas 0.24 cron job failing in to\_pandas extension dtype test
+* [ARROW-11229](https://issues.apache.org/jira/browse/ARROW-11229) - [C++][Dataset] Static build fails
+* [ARROW-11230](https://issues.apache.org/jira/browse/ARROW-11230) - [R] Fix build failures on Windows when multiple libarrow binaries found
+* [ARROW-11232](https://issues.apache.org/jira/browse/ARROW-11232) - [C++] Table::CombineChunks() returns incorrect results if Table has no column
+* [ARROW-11233](https://issues.apache.org/jira/browse/ARROW-11233) - [C++][Flight] Fail to link with bundled gRPC and Abseil
+* [ARROW-11237](https://issues.apache.org/jira/browse/ARROW-11237) - [C++] Compiler error with GLog and unity build enabled
+* [ARROW-11251](https://issues.apache.org/jira/browse/ARROW-11251) - [CI] Make sure that devtoolset-8 is really installed + being used
+* [ARROW-11253](https://issues.apache.org/jira/browse/ARROW-11253) - [R] Make sure that large metadata tests are reproducible
+* [ARROW-11255](https://issues.apache.org/jira/browse/ARROW-11255) - [Packaging][Conda][macOS] Fix Python version
+* [ARROW-11271](https://issues.apache.org/jira/browse/ARROW-11271) - [Rust] [Parquet] List schema to Arrow parser misinterpreting child nullability
+* [ARROW-11274](https://issues.apache.org/jira/browse/ARROW-11274) - [Packaging][wheel][Windows] Fix wheels path for Gemfury
+* [ARROW-11275](https://issues.apache.org/jira/browse/ARROW-11275) - [Packaging][wheel][Linux] Fix paths for Gemfury
+* [ARROW-11283](https://issues.apache.org/jira/browse/ARROW-11283) - [Julia] Fix install link
+* [ARROW-11286](https://issues.apache.org/jira/browse/ARROW-11286) - [Release][Yum] Fix minimal build example check
+* [ARROW-11287](https://issues.apache.org/jira/browse/ARROW-11287) - [Packaging][RPM] Add missing dependencies
+* [ARROW-11301](https://issues.apache.org/jira/browse/ARROW-11301) - [C++] Fix reading LZ4-compressed Parquet files produced by Java Parquet implementation
+* [ARROW-11302](https://issues.apache.org/jira/browse/ARROW-11302) - [Release][Python] Remove verification of python 3.5 wheel on macOS
+* [ARROW-11306](https://issues.apache.org/jira/browse/ARROW-11306) - [Packaging][Ubuntu][16.04] Add missing libprotobuf-dev dependency
+* [PARQUET-1935](https://issues.apache.org/jira/browse/PARQUET-1935) - [C++][Parquet] nullptr access violation when writing arrays of non-nullable values
+
+
+
+# Apache Arrow 2.0.0 (2020-10-13)
+
+## Bug Fixes
+
+* [ARROW-2367](https://issues.apache.org/jira/browse/ARROW-2367) - [Python] ListArray has trouble with sizes greater than kMaximumCapacity
+* [ARROW-4189](https://issues.apache.org/jira/browse/ARROW-4189) - [CI] [Rust] Fix broken cargo coverage
+* [ARROW-4917](https://issues.apache.org/jira/browse/ARROW-4917) - [C++] orc\_ep fails in cpp-alpine docker
+* [ARROW-5578](https://issues.apache.org/jira/browse/ARROW-5578) - [C++][Flight] Flight does not build out of the box on Alpine Linux
+* [ARROW-7226](https://issues.apache.org/jira/browse/ARROW-7226) - [JSON][Python] JSON loader fails on example in documentation.
+* [ARROW-7384](https://issues.apache.org/jira/browse/ARROW-7384) - [Website] Fix search indexing warning reported by Google
+* [ARROW-7517](https://issues.apache.org/jira/browse/ARROW-7517) - [C++] Builder does not honour dictionary type provided during initialization
+* [ARROW-7663](https://issues.apache.org/jira/browse/ARROW-7663) - [Python] from\_pandas gives TypeError instead of ArrowTypeError in some cases
+* [ARROW-7903](https://issues.apache.org/jira/browse/ARROW-7903) - [Rust] [DataFusion] Upgrade SQLParser dependency for DataFusion
+* [ARROW-7957](https://issues.apache.org/jira/browse/ARROW-7957) - [Python] ParquetDataset cannot take HadoopFileSystem as filesystem
+* [ARROW-8265](https://issues.apache.org/jira/browse/ARROW-8265) - [Rust] [DataFusion] Table API collect() should not require context
+* [ARROW-8394](https://issues.apache.org/jira/browse/ARROW-8394) - [JS] Typescript compiler errors for arrow d.ts files, when using es2015-esm package
+* [ARROW-8735](https://issues.apache.org/jira/browse/ARROW-8735) - [Rust] [Parquet] Parquet crate fails to compile on Arm architecture
+* [ARROW-8749](https://issues.apache.org/jira/browse/ARROW-8749) - [C++] IpcFormatWriter writes dictionary batches with wrong ID
+* [ARROW-8773](https://issues.apache.org/jira/browse/ARROW-8773) - [Python] pyarrow schema.empty\_table() does not preserve nullability of fields
+* [ARROW-9028](https://issues.apache.org/jira/browse/ARROW-9028) - [R] Should be able to convert an empty table
+* [ARROW-9096](https://issues.apache.org/jira/browse/ARROW-9096) - [Python] Pandas roundtrip with object-dtype column labels with integer values: data type "integer" not understood
+* [ARROW-9177](https://issues.apache.org/jira/browse/ARROW-9177) - [C++][Parquet] Tracking issue for cross-implementation LZ4 Parquet compression compatibility
+* [ARROW-9414](https://issues.apache.org/jira/browse/ARROW-9414) - [C++] apt package includes headers for S3 interface, but no support
+* [ARROW-9462](https://issues.apache.org/jira/browse/ARROW-9462) - [Go] Indentation after the first Record is missing in the arrjson writer
+* [ARROW-9463](https://issues.apache.org/jira/browse/ARROW-9463) - [Go] The writer is double closed in TestReadWrite
+* [ARROW-9490](https://issues.apache.org/jira/browse/ARROW-9490) - [Python] pyarrow array creation for specific set of numpy scalars fails
+* [ARROW-9495](https://issues.apache.org/jira/browse/ARROW-9495) - [C++] Equality assertions don't handle Inf / -Inf properly
+* [ARROW-9520](https://issues.apache.org/jira/browse/ARROW-9520) - [Rust] [DataFusion] Can't alias an aggregate expression
+* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow
+* [ARROW-9532](https://issues.apache.org/jira/browse/ARROW-9532) - [Python] Building pyarrow for MacPorts on macOS
+* [ARROW-9535](https://issues.apache.org/jira/browse/ARROW-9535) - [Python] Remove symlink fixes from conda recipe
+* [ARROW-9536](https://issues.apache.org/jira/browse/ARROW-9536) - Missing parameters in PlasmaOutOfMemoryException.java
+* [ARROW-9541](https://issues.apache.org/jira/browse/ARROW-9541) - [C++] CMakeLists requires UTF8PROC\_STATIC when building static library
+* [ARROW-9544](https://issues.apache.org/jira/browse/ARROW-9544) - [R] version argument of write\_parquet not working
+* [ARROW-9546](https://issues.apache.org/jira/browse/ARROW-9546) - [Python] Clean up Pandas Metadata Conversion test
+* [ARROW-9548](https://issues.apache.org/jira/browse/ARROW-9548) - [Go] Test output files in tmp directory are not removed correctly
+* [ARROW-9549](https://issues.apache.org/jira/browse/ARROW-9549) - [Rust] Parquet no longer builds
+* [ARROW-9554](https://issues.apache.org/jira/browse/ARROW-9554) - [Java] FixedWidthInPlaceVectorSorter sometimes produces wrong result
+* [ARROW-9556](https://issues.apache.org/jira/browse/ARROW-9556) - [Python][C++] Segfaults in UnionArray with null values
+* [ARROW-9560](https://issues.apache.org/jira/browse/ARROW-9560) - [Packaging] conda recipes failing due to missing conda-forge.yml
+* [ARROW-9569](https://issues.apache.org/jira/browse/ARROW-9569) - [CI][R] Fix rtools35 builds for msys2 key change
+* [ARROW-9570](https://issues.apache.org/jira/browse/ARROW-9570) - [Doc] Clean up sphinx sidebar
+* [ARROW-9573](https://issues.apache.org/jira/browse/ARROW-9573) - [Python] Parquet doesn't load when partitioned column starts with '\_'
+* [ARROW-9574](https://issues.apache.org/jira/browse/ARROW-9574) - [R] Cleanups for CRAN 1.0.0 release
+* [ARROW-9575](https://issues.apache.org/jira/browse/ARROW-9575) - [R] gcc-UBSAN failure on CRAN
+* [ARROW-9577](https://issues.apache.org/jira/browse/ARROW-9577) - [Python][C++] posix\_madvise error on Debian in pyarrow 1.0.0
+* [ARROW-9583](https://issues.apache.org/jira/browse/ARROW-9583) - [Rust] Offset is mishandled in arithmetic and boolean compute kernels
+* [ARROW-9588](https://issues.apache.org/jira/browse/ARROW-9588) - [C++] clang/win: Copy constructor of ParquetInvalidOrCorruptedFileException not correctly triggered
+* [ARROW-9589](https://issues.apache.org/jira/browse/ARROW-9589) - [C++/R] arrow\_exports.h contains structs declared as class
+* [ARROW-9592](https://issues.apache.org/jira/browse/ARROW-9592) - [CI] Update homebrew before calling brew bundle
+* [ARROW-9596](https://issues.apache.org/jira/browse/ARROW-9596) - [CI][Crossbow] Fix homebrew-cpp again, again
+* [ARROW-9597](https://issues.apache.org/jira/browse/ARROW-9597) - [C++] AddAlias in compute::FunctionRegistry should be synchronized
+* [ARROW-9598](https://issues.apache.org/jira/browse/ARROW-9598) - [C++][Parquet] Spaced definition levels are not assigned correctly
+* [ARROW-9599](https://issues.apache.org/jira/browse/ARROW-9599) - [CI] Appveyor toolchain build fails because CMake detects different C and C++ compilers
+* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build
+* [ARROW-9602](https://issues.apache.org/jira/browse/ARROW-9602) - [R] Improve cmake detection in Linux build
+* [ARROW-9603](https://issues.apache.org/jira/browse/ARROW-9603) - [C++][Parquet] Write Arrow relies on unspecified behavior for nested types
+* [ARROW-9606](https://issues.apache.org/jira/browse/ARROW-9606) - [C++][Dataset] in expressions don't work with \>1 partition levels
+* [ARROW-9609](https://issues.apache.org/jira/browse/ARROW-9609) - [C++] CSV datasets don't materialize virtual columns
+* [ARROW-9621](https://issues.apache.org/jira/browse/ARROW-9621) - [Python] test\_move\_file() fails with fsspec 0.8.0
+* [ARROW-9622](https://issues.apache.org/jira/browse/ARROW-9622) - [Java] ComplexCopier fails if a structvector has a child UnionVector with nulls
+* [ARROW-9628](https://issues.apache.org/jira/browse/ARROW-9628) - [Rust] Clippy PR test failing intermittently on Rust / AMD64 MacOS
+* [ARROW-9629](https://issues.apache.org/jira/browse/ARROW-9629) - [Python] Kartothek integration tests failing due to missing freezegun module
+* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight
+* [ARROW-9642](https://issues.apache.org/jira/browse/ARROW-9642) - [C++] Let MakeBuilder refer to DictionaryType's index\_type when deciding the starting bit width of the indices
+* [ARROW-9643](https://issues.apache.org/jira/browse/ARROW-9643) - [C++] Illegal instruction on haswell cpu
+* [ARROW-9644](https://issues.apache.org/jira/browse/ARROW-9644) - [C++][Dataset] Do not check for ignore\_prefixes in the base path
+* [ARROW-9652](https://issues.apache.org/jira/browse/ARROW-9652) - [Rust][DataFusion] Panic trying to select \* from a CSV (panicked at 'index out of bounds: the len is 0 but the index is 0)
+* [ARROW-9653](https://issues.apache.org/jira/browse/ARROW-9653) - [Rust][DataFusion] Multi-column Group by: Invalid Argument Error
+* [ARROW-9659](https://issues.apache.org/jira/browse/ARROW-9659) - [C++] RecordBatchStreamReader throws on CUDA device buffers
+* [ARROW-9660](https://issues.apache.org/jira/browse/ARROW-9660) - [C++] IPC - dictionaries in maps
+* [ARROW-9666](https://issues.apache.org/jira/browse/ARROW-9666) - [Python][wheel][Windows] Missing library failure caused by ARROW-9412
+* [ARROW-9670](https://issues.apache.org/jira/browse/ARROW-9670) - [C++][FlightRPC] Close()ing a DoPut with an ongoing read locks up the client
+* [ARROW-9684](https://issues.apache.org/jira/browse/ARROW-9684) - [C++] Fix undefined behaviour on invalid IPC / Parquet input (OSS-Fuzz)
+* [ARROW-9692](https://issues.apache.org/jira/browse/ARROW-9692) - [Python] distutils import warning
+* [ARROW-9693](https://issues.apache.org/jira/browse/ARROW-9693) - [CI][Docs] Nightly docs build fails
+* [ARROW-9696](https://issues.apache.org/jira/browse/ARROW-9696) - [Rust] [DataFusion] Nested binary expressions broken
+* [ARROW-9698](https://issues.apache.org/jira/browse/ARROW-9698) - [C++] Revert "Add -NDEBUG flag to arrow.pc"
+* [ARROW-9700](https://issues.apache.org/jira/browse/ARROW-9700) - [Python] create\_library\_symlinks doesn't work on macOS
+* [ARROW-9712](https://issues.apache.org/jira/browse/ARROW-9712) - [Rust] [DataFusion] ParquetScanExec panics on error
+* [ARROW-9714](https://issues.apache.org/jira/browse/ARROW-9714) - [Rust] [DataFusion] TypeCoercionRule not implemented for Limit or Sort
+* [ARROW-9716](https://issues.apache.org/jira/browse/ARROW-9716) - [Rust] [DataFusion] MergeExec should have concurrency limit
+* [ARROW-9726](https://issues.apache.org/jira/browse/ARROW-9726) - [Rust] [DataFusion] ParquetScanExec launches threads too early
+* [ARROW-9727](https://issues.apache.org/jira/browse/ARROW-9727) - [C++] Fix crash on invalid IPC input (OSS-Fuzz)
+* [ARROW-9729](https://issues.apache.org/jira/browse/ARROW-9729) - [Java] Error Prone causes other annotation processors to not work with Eclipse
+* [ARROW-9733](https://issues.apache.org/jira/browse/ARROW-9733) - [Rust][DataFusion] Aggregates COUNT/MIN/MAX don't work on VARCHAR columns
+* [ARROW-9734](https://issues.apache.org/jira/browse/ARROW-9734) - [Rust] [DataFusion] TableProvider.scan executing partitions prematurely
+* [ARROW-9741](https://issues.apache.org/jira/browse/ARROW-9741) - [Rust] [DataFusion] Incorrect count in TPC-H query 1 result set
+* [ARROW-9743](https://issues.apache.org/jira/browse/ARROW-9743) - [R] Sanitize paths in open\_dataset
+* [ARROW-9744](https://issues.apache.org/jira/browse/ARROW-9744) - [Python] Failed to install on aarch64
+* [ARROW-9764](https://issues.apache.org/jira/browse/ARROW-9764) - [CI][Java] Push wrong Docker image
+* [ARROW-9768](https://issues.apache.org/jira/browse/ARROW-9768) - [Python] Pyarrow allows for unsafe conversions of datetime objects to timestamp nanoseconds
+* [ARROW-9778](https://issues.apache.org/jira/browse/ARROW-9778) - [Rust] [DataFusion] Logical and physical schemas' nullability does not match in 8 out of 20 end-to-end tests
+* [ARROW-9783](https://issues.apache.org/jira/browse/ARROW-9783) - [Rust] [DataFusion] Logical aggregate expressions require explicit data type
+* [ARROW-9785](https://issues.apache.org/jira/browse/ARROW-9785) - [Python] pyarrow/tests/test\_fs.py::test\_s3\_options too slow
+* [ARROW-9789](https://issues.apache.org/jira/browse/ARROW-9789) - [C++] Don't install jemalloc in parallel
+* [ARROW-9790](https://issues.apache.org/jira/browse/ARROW-9790) - [Rust] [Parquet] ParquetFileArrowReader fails to decode all pages if batches fall exactly on row group boundaries
+* [ARROW-9793](https://issues.apache.org/jira/browse/ARROW-9793) - [Rust] [DataFusion] Tests failing in master
+* [ARROW-9797](https://issues.apache.org/jira/browse/ARROW-9797) - [Rust] AMD64 Conda Integration Tests is failing for the Master branch
+* [ARROW-9799](https://issues.apache.org/jira/browse/ARROW-9799) - [Rust] [DataFusion] Implementation of physical binary expression get\_type method is incorrect
+* [ARROW-9800](https://issues.apache.org/jira/browse/ARROW-9800) - [Rust] [Parquet] "min" and "max" written to standard out when writing columns
+* [ARROW-9809](https://issues.apache.org/jira/browse/ARROW-9809) - [Rust] [DataFusion] logical schema = physical schema is not true
+* [ARROW-9814](https://issues.apache.org/jira/browse/ARROW-9814) - [Python] Crash in test\_parquet.py::test\_read\_partitioned\_directory\_s3fs
+* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs
+* [ARROW-9816](https://issues.apache.org/jira/browse/ARROW-9816) - [C++] Escape quotes in config.h
+* [ARROW-9827](https://issues.apache.org/jira/browse/ARROW-9827) - [Python] pandas.read\_parquet fails for wide parquet files and pyarrow 1.0.X
+* [ARROW-9831](https://issues.apache.org/jira/browse/ARROW-9831) - [Rust] [DataFusion] Fix compilation error
+* [ARROW-9840](https://issues.apache.org/jira/browse/ARROW-9840) - [Python] Python fs documentation out of date with code
+* [ARROW-9846](https://issues.apache.org/jira/browse/ARROW-9846) - [Rust] Master branch broken build
+* [ARROW-9851](https://issues.apache.org/jira/browse/ARROW-9851) - [C++] Valgrind errors due to unrecognized instructions
+* [ARROW-9852](https://issues.apache.org/jira/browse/ARROW-9852) - [C++] Fix crash on invalid IPC input (OSS-Fuzz)
+* [ARROW-9855](https://issues.apache.org/jira/browse/ARROW-9855) - [R] Fix bad merge/Rcpp conflict
+* [ARROW-9859](https://issues.apache.org/jira/browse/ARROW-9859) - [C++] S3 FileSystemFromUri with special char in secret key fails
+* [ARROW-9864](https://issues.apache.org/jira/browse/ARROW-9864) - [Python] pathlib.Path not supported in write\_to\_dataset with partition columns
+* [ARROW-9874](https://issues.apache.org/jira/browse/ARROW-9874) - [C++] NewStreamWriter / NewFileWriter don't own output stream
+* [ARROW-9876](https://issues.apache.org/jira/browse/ARROW-9876) - [CI][C++] Travis ARM jobs timeout
+* [ARROW-9877](https://issues.apache.org/jira/browse/ARROW-9877) - [C++][CI] homebrew-cpp fails due to avx512
+* [ARROW-9879](https://issues.apache.org/jira/browse/ARROW-9879) - [Python] ChunkedArray.\_\_getitem\_\_ doesn't work with numpy scalars
+* [ARROW-9882](https://issues.apache.org/jira/browse/ARROW-9882) - [C++/Python] Update conda-forge-pinning to 3 for OSX conda packages
+* [ARROW-9883](https://issues.apache.org/jira/browse/ARROW-9883) - [R] Fix linuxlibs.R install script for R < 3.6
+* [ARROW-9888](https://issues.apache.org/jira/browse/ARROW-9888) - [Rust] [DataFusion] ExecutionContext cannot be shared between threads
+* [ARROW-9889](https://issues.apache.org/jira/browse/ARROW-9889) - [Rust][DataFusion] DataFusion CLI: CREATE EXTERNAL TABLE errors with "Unsupported logical plan variant"
+* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern
+* [ARROW-9906](https://issues.apache.org/jira/browse/ARROW-9906) - [Python] Crash in test\_parquet.py::test\_parquet\_writer\_filesystem\_s3\_uri (closing NativeFile from S3FileSystem)
+* [ARROW-9913](https://issues.apache.org/jira/browse/ARROW-9913) - [C++] Outputs of Decimal128::FromString depend on presence of one another
+* [ARROW-9920](https://issues.apache.org/jira/browse/ARROW-9920) - [Python] pyarrow.concat\_arrays segfaults when passing it a chunked array
+* [ARROW-9922](https://issues.apache.org/jira/browse/ARROW-9922) - [Rust] Add \`try\_from(Vec<Option<(&str, ArrayRef)\>\>)\` to StructArray
+* [ARROW-9924](https://issues.apache.org/jira/browse/ARROW-9924) - [Python] Performance regression reading individual Parquet files using Dataset interface
+* [ARROW-9931](https://issues.apache.org/jira/browse/ARROW-9931) - [C++] Fix undefined behaviour on invalid IPC (OSS-Fuzz)
+* [ARROW-9932](https://issues.apache.org/jira/browse/ARROW-9932) - [R] Arrow 1.0.1 R package fails to install on R 3.4 on Linux
+* [ARROW-9936](https://issues.apache.org/jira/browse/ARROW-9936) - [Python] Fix / test relative file paths in pyarrow.parquet
+* [ARROW-9937](https://issues.apache.org/jira/browse/ARROW-9937) - [Rust] [DataFusion] Average is not correct
+* [ARROW-9943](https://issues.apache.org/jira/browse/ARROW-9943) - [C++] Arrow metadata not applied recursively when reading Parquet file
+* [ARROW-9946](https://issues.apache.org/jira/browse/ARROW-9946) - [R] ParquetFileWriter segfaults when \`sink\` is a string
+* [ARROW-9953](https://issues.apache.org/jira/browse/ARROW-9953) - [R] Declare minimum version for bit64
+* [ARROW-9962](https://issues.apache.org/jira/browse/ARROW-9962) - [Python] Conversion to pandas with index column using fixed timezone fails
+* [ARROW-9968](https://issues.apache.org/jira/browse/ARROW-9968) - [C++] UBSAN link failure with \_\_int8\_t
+* [ARROW-9969](https://issues.apache.org/jira/browse/ARROW-9969) - [C++] RecordBatchBuilder yields invalid result with dictionary fields
+* [ARROW-9970](https://issues.apache.org/jira/browse/ARROW-9970) - [Go] checkptr failures in sum methods
+* [ARROW-9972](https://issues.apache.org/jira/browse/ARROW-9972) - [CI] Work around grpc-re2 clash on Homebrew
+* [ARROW-9973](https://issues.apache.org/jira/browse/ARROW-9973) - [Java] JDBC DateConsumer does not allow dates before epoch
+* [ARROW-9976](https://issues.apache.org/jira/browse/ARROW-9976) - [Python] ArrowCapacityError when doing Table.from\_pandas with large dataframe
+* [ARROW-9990](https://issues.apache.org/jira/browse/ARROW-9990) - [Rust] [DataFusion] NOT is not plannable
+* [ARROW-9993](https://issues.apache.org/jira/browse/ARROW-9993) - [Python] Tzinfo - string roundtrip fails on pytz.StaticTzInfo objects
+* [ARROW-9994](https://issues.apache.org/jira/browse/ARROW-9994) - [C++][Python] Auto chunking nested arrays containing binary-like fields results in malformed output
+* [ARROW-9996](https://issues.apache.org/jira/browse/ARROW-9996) - [C++] Dictionary is unset when calling DictionaryArray.GetScalar for null values
+* [ARROW-10003](https://issues.apache.org/jira/browse/ARROW-10003) - [C++] Create directories in CopyFiles when copying within the same filesystem
+* [ARROW-10008](https://issues.apache.org/jira/browse/ARROW-10008) - [Python] pyarrow.parquet.read\_table fails with predicate pushdown on categorical data with use\_legacy\_dataset=False
+* [ARROW-10011](https://issues.apache.org/jira/browse/ARROW-10011) - [C++] Make FindRE2.cmake re-entrant
+* [ARROW-10012](https://issues.apache.org/jira/browse/ARROW-10012) - [C++] Sporadic failures in CopyFiles test
+* [ARROW-10013](https://issues.apache.org/jira/browse/ARROW-10013) - [C++][CI] Flight test failure in TestFlightClient.GenericOptions
+* [ARROW-10017](https://issues.apache.org/jira/browse/ARROW-10017) - [Java] LargeMemoryUtil.checkedCastToInt has buggy logic
+* [ARROW-10022](https://issues.apache.org/jira/browse/ARROW-10022) - [C++] [Compute] Core dump in some scalar arithmetic benchmarks
+* [ARROW-10027](https://issues.apache.org/jira/browse/ARROW-10027) - [Python] Incorrect null column returned when using a dataset filter expression
+* [ARROW-10034](https://issues.apache.org/jira/browse/ARROW-10034) - [Rust] Master build broken
+* [ARROW-10041](https://issues.apache.org/jira/browse/ARROW-10041) - [Rust] Possible to create LargeStringArray with DataType::Utf8
+* [ARROW-10047](https://issues.apache.org/jira/browse/ARROW-10047) - [CI] Conda integration tests failing with cmake error
+* [ARROW-10048](https://issues.apache.org/jira/browse/ARROW-10048) - [Rust] Error in aggregate of min/max for strings
+* [ARROW-10049](https://issues.apache.org/jira/browse/ARROW-10049) - [C++/Python] Sync conda recipe with conda-forge
+* [ARROW-10060](https://issues.apache.org/jira/browse/ARROW-10060) - [Rust] [DataFusion] MergeExec currently discards partitions with errors
+* [ARROW-10062](https://issues.apache.org/jira/browse/ARROW-10062) - [Rust]: Fix for null elems for DoubleEndedIter for DictArray
+* [ARROW-10073](https://issues.apache.org/jira/browse/ARROW-10073) - [Python] Test test\_parquet\_nested\_storage relies on dict item ordering
+* [ARROW-10081](https://issues.apache.org/jira/browse/ARROW-10081) - [C++/Python] Fix bash syntax in drone.io conda builds
+* [ARROW-10085](https://issues.apache.org/jira/browse/ARROW-10085) - [C++] S3 tests fail on AppVeyor
+* [ARROW-10087](https://issues.apache.org/jira/browse/ARROW-10087) - [CI] Fix nightly docs job
+* [ARROW-10098](https://issues.apache.org/jira/browse/ARROW-10098) - [R][Doc] Fix copy\_files doc mismatch
+* [ARROW-10104](https://issues.apache.org/jira/browse/ARROW-10104) - [Python] Separate tests into their own conda package
+* [ARROW-10114](https://issues.apache.org/jira/browse/ARROW-10114) - [R] Segfault in to\_dataframe\_parallel with deeply nested structs
+* [ARROW-10116](https://issues.apache.org/jira/browse/ARROW-10116) - [Python][Packaging] Fix gRPC linking error in macOS wheels builds
+* [ARROW-10119](https://issues.apache.org/jira/browse/ARROW-10119) - [C++] Fix Parquet crashes on invalid input (OSS-Fuzz)
+* [ARROW-10121](https://issues.apache.org/jira/browse/ARROW-10121) - [C++][Python] Variable dictionaries do not survive roundtrip to IPC stream
+* [ARROW-10124](https://issues.apache.org/jira/browse/ARROW-10124) - [R] Write functions don't follow umask setting
+* [ARROW-10125](https://issues.apache.org/jira/browse/ARROW-10125) - [R] Int64 downcast check doesn't consider all chunks
+* [ARROW-10130](https://issues.apache.org/jira/browse/ARROW-10130) - [C++][Dataset] ParquetFileFragment::SplitByRowGroup does not preserve "complete\_metadata" status
+* [ARROW-10136](https://issues.apache.org/jira/browse/ARROW-10136) - [Rust][Arrow] Nulls are transformed into "" after filtering for StringArray
+* [ARROW-10137](https://issues.apache.org/jira/browse/ARROW-10137) - [R] Fix cpp helper that breaks if libarrow is not present
+* [ARROW-10147](https://issues.apache.org/jira/browse/ARROW-10147) - [Python] Constructing pandas metadata fails if an Index name is not JSON-serializable by default
+* [ARROW-10150](https://issues.apache.org/jira/browse/ARROW-10150) - [C++] Fix crashes on invalid Parquet file (OSS-Fuzz)
+* [ARROW-10169](https://issues.apache.org/jira/browse/ARROW-10169) - [Rust] Nulls should be rendered as "" rather than default value when pretty printing arrays
+* [ARROW-10175](https://issues.apache.org/jira/browse/ARROW-10175) - [CI] Nightly hdfs integration test job fails
+* [ARROW-10176](https://issues.apache.org/jira/browse/ARROW-10176) - [CI] Nightly valgrind job fails
+* [ARROW-10178](https://issues.apache.org/jira/browse/ARROW-10178) - [CI] Fix spark master integration test build setup
+* [ARROW-10179](https://issues.apache.org/jira/browse/ARROW-10179) - [Rust] Labeler is not labeling
+* [ARROW-10181](https://issues.apache.org/jira/browse/ARROW-10181) - [Rust] Arrow tests fail to compile on Raspberry Pi (32 bit)
+* [ARROW-10188](https://issues.apache.org/jira/browse/ARROW-10188) - [Rust] [DataFusion] Some examples are broken
+* [ARROW-10189](https://issues.apache.org/jira/browse/ARROW-10189) - [Doc] C data interface example for i32 uses \`l\`, not \`i\`, in the format
+* [ARROW-10192](https://issues.apache.org/jira/browse/ARROW-10192) - [C++][Python] Segfault when converting nested struct array with dictionary field to pandas series
+* [ARROW-10193](https://issues.apache.org/jira/browse/ARROW-10193) - [Python] Segfault when converting to fixed size binary array
+* [ARROW-10200](https://issues.apache.org/jira/browse/ARROW-10200) - [Java][CI] Fix failure of Java CI on s390x
+* [ARROW-10204](https://issues.apache.org/jira/browse/ARROW-10204) - [Rust] [DataFusion] Test failure in aggregate\_grouped\_empty with simd feature enabled
+* [ARROW-10214](https://issues.apache.org/jira/browse/ARROW-10214) - [Python] UnicodeDecodeError when printing schema with binary metadata
+* [ARROW-10226](https://issues.apache.org/jira/browse/ARROW-10226) - [Rust] [Parquet] Parquet reader reading wrong columns in some batches within a parquet file
+* [ARROW-10230](https://issues.apache.org/jira/browse/ARROW-10230) - [JS][Doc] JavaScript documentation fails to build
+* [ARROW-10232](https://issues.apache.org/jira/browse/ARROW-10232) - FixedSizeListArray is incorrectly written/read to/from parquet
+* [ARROW-10234](https://issues.apache.org/jira/browse/ARROW-10234) - [C++][Gandiva] Fix logic of round() for floats/decimals in Gandiva
+* [ARROW-10237](https://issues.apache.org/jira/browse/ARROW-10237) - [C++] Duplicate values in a dictionary result in corrupted parquet
+* [ARROW-10238](https://issues.apache.org/jira/browse/ARROW-10238) - [C\#] List<Struct\> is broken
+* [ARROW-10239](https://issues.apache.org/jira/browse/ARROW-10239) - [C++] aws-sdk-cpp apparently requires zlib too
+* [ARROW-10244](https://issues.apache.org/jira/browse/ARROW-10244) - [Python][Docs] Add docs on using pyarrow.dataset.parquet\_dataset
+* [ARROW-10248](https://issues.apache.org/jira/browse/ARROW-10248) - [C++][Dataset] Dataset writing does not write schema metadata
+* [ARROW-10262](https://issues.apache.org/jira/browse/ARROW-10262) - [C++] Some TypeClass in Scalar classes seem incorrect
+* [ARROW-10271](https://issues.apache.org/jira/browse/ARROW-10271) - [Rust] packed\_simd is broken and continued under a new project
+* [ARROW-10279](https://issues.apache.org/jira/browse/ARROW-10279) - [Release][Python] Fix verification script to align with the new macos wheel platform tags
+* [ARROW-10280](https://issues.apache.org/jira/browse/ARROW-10280) - [Packaging][Python] Fix macOS wheel artifact patterns
+* [ARROW-10281](https://issues.apache.org/jira/browse/ARROW-10281) - [Python] Fix warnings when running tests
+* [ARROW-10284](https://issues.apache.org/jira/browse/ARROW-10284) - [Python] Pyarrow is raising deprecation warning about filesystems on import
+* [ARROW-10285](https://issues.apache.org/jira/browse/ARROW-10285) - [Python] pyarrow.orc submodule is using deprecated functionality
+* [ARROW-10286](https://issues.apache.org/jira/browse/ARROW-10286) - [C++][Flight] Misleading CMake errors
+* [ARROW-10288](https://issues.apache.org/jira/browse/ARROW-10288) - [C++] Compilation fails on i386
+* [ARROW-10290](https://issues.apache.org/jira/browse/ARROW-10290) - [C++] List POP\_BACK is not available in older CMake versions
+
+
+## New Features and Improvements
+
+* [ARROW-983](https://issues.apache.org/jira/browse/ARROW-983) - [C++] Implement InputStream and OutputStream classes for interacting with socket connections
+* [ARROW-1105](https://issues.apache.org/jira/browse/ARROW-1105) - [C++] SQLite record batch reader
+* [ARROW-1509](https://issues.apache.org/jira/browse/ARROW-1509) - [Python] Write serialized object as a stream of encapsulated IPC messages
+* [ARROW-1669](https://issues.apache.org/jira/browse/ARROW-1669) - [C++] Consider adding Abseil (Google C++11 standard library extensions) to toolchain
+* [ARROW-1797](https://issues.apache.org/jira/browse/ARROW-1797) - [C++] Implement binary arithmetic kernels for numeric arrays
+* [ARROW-2164](https://issues.apache.org/jira/browse/ARROW-2164) - [C++] Clean up unnecessary decimal module refs
+* [ARROW-3080](https://issues.apache.org/jira/browse/ARROW-3080) - [Python] Unify Arrow to Python object conversion paths
+* [ARROW-3757](https://issues.apache.org/jira/browse/ARROW-3757) - [R] R bindings for Flight RPC client
+* [ARROW-3872](https://issues.apache.org/jira/browse/ARROW-3872) - [R] Add ad hoc test of feather compatibility
+* [ARROW-4046](https://issues.apache.org/jira/browse/ARROW-4046) - [Python/CI] Exercise large memory tests
+* [ARROW-4248](https://issues.apache.org/jira/browse/ARROW-4248) - [C++][Plasma] Build on Windows / Visual Studio
+* [ARROW-4685](https://issues.apache.org/jira/browse/ARROW-4685) - [C++] Update Boost to 1.69 in manylinux1 docker image
+* [ARROW-4927](https://issues.apache.org/jira/browse/ARROW-4927) - [Rust] Update top level README to describe current functionality
+* [ARROW-4957](https://issues.apache.org/jira/browse/ARROW-4957) - [Rust] [DataFusion] Implement get\_supertype correctly
+* [ARROW-4965](https://issues.apache.org/jira/browse/ARROW-4965) - [Python] Timestamp array type detection should use tzname of datetime.datetime objects
+* [ARROW-5034](https://issues.apache.org/jira/browse/ARROW-5034) - [C\#] ArrowStreamWriter should expose synchronous Write methods
+* [ARROW-5123](https://issues.apache.org/jira/browse/ARROW-5123) - [Rust] derive RecordWriter from struct definitions
+* [ARROW-6075](https://issues.apache.org/jira/browse/ARROW-6075) - [FlightRPC] Handle uncaught exceptions in middleware
+* [ARROW-6281](https://issues.apache.org/jira/browse/ARROW-6281) - [Python] Produce chunked arrays for nested types in pyarrow.array
+* [ARROW-6282](https://issues.apache.org/jira/browse/ARROW-6282) - [Format] Support lossy compression
+* [ARROW-6437](https://issues.apache.org/jira/browse/ARROW-6437) - [R] Add AWS SDK to system dependencies for macOS and Windows
+* [ARROW-6535](https://issues.apache.org/jira/browse/ARROW-6535) - [C++] Status::WithMessage should accept variadic parameters
+* [ARROW-6537](https://issues.apache.org/jira/browse/ARROW-6537) - [R] Pass column\_types to CSV reader
+* [ARROW-6972](https://issues.apache.org/jira/browse/ARROW-6972) - [C\#] Should support StructField arrays
+* [ARROW-6982](https://issues.apache.org/jira/browse/ARROW-6982) - [R] Add bindings for compare and boolean kernels
+* [ARROW-7136](https://issues.apache.org/jira/browse/ARROW-7136) - [Rust][CI] Pre-install the rust dependencies in the dockerfile
+* [ARROW-7218](https://issues.apache.org/jira/browse/ARROW-7218) - [Python] Conversion from boolean numpy scalars not working
+* [ARROW-7302](https://issues.apache.org/jira/browse/ARROW-7302) - [C++] CSV: allow converting a column to a specific dictionary type
+* [ARROW-7372](https://issues.apache.org/jira/browse/ARROW-7372) - [C++] Allow creating dictionary array from simple JSON
+* [ARROW-7871](https://issues.apache.org/jira/browse/ARROW-7871) - [Python] Expose more compute kernels
+* [ARROW-7960](https://issues.apache.org/jira/browse/ARROW-7960) - [C++][Parquet] Add support for schema translation from parquet nodes back to arrow for missing types
+* [ARROW-8001](https://issues.apache.org/jira/browse/ARROW-8001) - [R][Dataset] Bindings for dataset writing
+* [ARROW-8002](https://issues.apache.org/jira/browse/ARROW-8002) - [C++][Dataset] Dataset writing should let you (re)partition the data
+* [ARROW-8048](https://issues.apache.org/jira/browse/ARROW-8048) - [Python] Run memory leak tests nightly as follow up to ARROW-4120
+* [ARROW-8172](https://issues.apache.org/jira/browse/ARROW-8172) - [C++] ArrayFromJSON for dictionary arrays
+* [ARROW-8205](https://issues.apache.org/jira/browse/ARROW-8205) - [Rust] [DataFusion] DataFusion should enforce unique field names in a schema
+* [ARROW-8253](https://issues.apache.org/jira/browse/ARROW-8253) - [Rust] [DataFusion] Improve ergonomics of registering UDFs
+* [ARROW-8262](https://issues.apache.org/jira/browse/ARROW-8262) - [Rust] [DataFusion] Add example that uses LogicalPlanBuilder
+* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer
+* [ARROW-8296](https://issues.apache.org/jira/browse/ARROW-8296) - [C++][Dataset] IpcFileFormat should support writing files with compressed buffers
+* [ARROW-8355](https://issues.apache.org/jira/browse/ARROW-8355) - [Python] Reduce the number of pandas dependent test cases in test\_feather
+* [ARROW-8359](https://issues.apache.org/jira/browse/ARROW-8359) - [C++/Python] Enable aarch64/ppc64le build in conda recipes
+* [ARROW-8383](https://issues.apache.org/jira/browse/ARROW-8383) - [Rust] Easier random access to DictionaryArray keys and values
+* [ARROW-8402](https://issues.apache.org/jira/browse/ARROW-8402) - [Java] Support ValidateFull methods in Java
+* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata when writing parquet
+* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types
+* [ARROW-8493](https://issues.apache.org/jira/browse/ARROW-8493) - [C++] Create unified schema resolution code for Array reconstruction
+* [ARROW-8494](https://issues.apache.org/jira/browse/ARROW-8494) - [C++] Implement basic array-by-array reassembly logic
+* [ARROW-8581](https://issues.apache.org/jira/browse/ARROW-8581) - [C\#] Date32/64Array.Builder should accept DateTime, not DateTimeOffset
+* [ARROW-8601](https://issues.apache.org/jira/browse/ARROW-8601) - [Go][Flight] Implement Flight Writer interface
+* [ARROW-8618](https://issues.apache.org/jira/browse/ARROW-8618) - [C++] ASSIGN\_OR\_RAISE should move its argument
+* [ARROW-8678](https://issues.apache.org/jira/browse/ARROW-8678) - [C++][Parquet] Remove legacy Arrow to level translation
+* [ARROW-8712](https://issues.apache.org/jira/browse/ARROW-8712) - [R] Expose strptime timestamp parsing in read\_csv conversion options
+* [ARROW-8774](https://issues.apache.org/jira/browse/ARROW-8774) - [Rust] [DataFusion] Improve threading model
+* [ARROW-8810](https://issues.apache.org/jira/browse/ARROW-8810) - [R] Add documentation about Parquet format, appending to stream format
+* [ARROW-8824](https://issues.apache.org/jira/browse/ARROW-8824) - [Rust] [DataFusion] Implement new SQL parser
+* [ARROW-8828](https://issues.apache.org/jira/browse/ARROW-8828) - [Rust] Implement SQL tokenizer
+* [ARROW-8829](https://issues.apache.org/jira/browse/ARROW-8829) - [Rust] Implement SQL parser
+* [ARROW-9010](https://issues.apache.org/jira/browse/ARROW-9010) - [Java] Framework and interface changes for RecordBatch IPC buffer compression
+* [ARROW-9065](https://issues.apache.org/jira/browse/ARROW-9065) - [C++] Support parsing date32 in dataset partition folders
+* [ARROW-9068](https://issues.apache.org/jira/browse/ARROW-9068) - [C++][Dataset] Simplify Partitioning interface
+* [ARROW-9078](https://issues.apache.org/jira/browse/ARROW-9078) - [C++] Parquet writing of extension type with nested storage type fails
+* [ARROW-9104](https://issues.apache.org/jira/browse/ARROW-9104) - [C++] Parquet encryption tests should write files to a temporary directory instead of the testing submodule's directory
+* [ARROW-9107](https://issues.apache.org/jira/browse/ARROW-9107) - [C++][Dataset] Time-based types support
+* [ARROW-9147](https://issues.apache.org/jira/browse/ARROW-9147) - [C++][Dataset] Support null -\> other type promotion in Dataset scanning
+* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst
+* [ARROW-9266](https://issues.apache.org/jira/browse/ARROW-9266) - [Python][Packaging] Enable S3 support in macOS wheels
+* [ARROW-9271](https://issues.apache.org/jira/browse/ARROW-9271) - [R] Preserve data frame metadata in round trip
+* [ARROW-9286](https://issues.apache.org/jira/browse/ARROW-9286) - [C++] Add function "aliases" to compute::FunctionRegistry
+* [ARROW-9328](https://issues.apache.org/jira/browse/ARROW-9328) - [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string
+* [ARROW-9338](https://issues.apache.org/jira/browse/ARROW-9338) - [Rust] Add instructions for running clippy locally
+* [ARROW-9344](https://issues.apache.org/jira/browse/ARROW-9344) - [C++][Flight] measure latency quantile in flight benchmark
+* [ARROW-9358](https://issues.apache.org/jira/browse/ARROW-9358) - [Integration] Reconsider generated\_large\_batch.json
+* [ARROW-9371](https://issues.apache.org/jira/browse/ARROW-9371) - [Java] Run vector tests for both allocators
+* [ARROW-9377](https://issues.apache.org/jira/browse/ARROW-9377) - [Java] Support unsigned dictionary indices
+* [ARROW-9387](https://issues.apache.org/jira/browse/ARROW-9387) - [R] Use new C++ table select method
+* [ARROW-9388](https://issues.apache.org/jira/browse/ARROW-9388) - [C++] Division kernels
+* [ARROW-9394](https://issues.apache.org/jira/browse/ARROW-9394) - [Python] Support pickling of Scalars
+* [ARROW-9398](https://issues.apache.org/jira/browse/ARROW-9398) - [C++] Register the SIMD sum variants under a function instance instead of a SIMD function
+* [ARROW-9402](https://issues.apache.org/jira/browse/ARROW-9402) - [C++] Add portable wrappers for \_\_builtin\_add\_overflow and friends
+* [ARROW-9405](https://issues.apache.org/jira/browse/ARROW-9405) - [R] Switch to cpp11
+* [ARROW-9412](https://issues.apache.org/jira/browse/ARROW-9412) - [C++] Add non-BUNDLED dependencies to exported INSTALL\_INTERFACE\_LIBS of arrow\_static and test that it works
+* [ARROW-9429](https://issues.apache.org/jira/browse/ARROW-9429) - [Python] ChunkedArray.to\_numpy
+* [ARROW-9454](https://issues.apache.org/jira/browse/ARROW-9454) - [GLib] Add binding of some dictionary builders
+* [ARROW-9465](https://issues.apache.org/jira/browse/ARROW-9465) - [Python] Improve ergonomics of compute functions
+* [ARROW-9469](https://issues.apache.org/jira/browse/ARROW-9469) - [Python] Make more objects weakrefable
+* [ARROW-9487](https://issues.apache.org/jira/browse/ARROW-9487) - [Developer] Cover the archery release utilities with unittests
+* [ARROW-9488](https://issues.apache.org/jira/browse/ARROW-9488) - [Release] Use the new changelog generation when updating the website
+* [ARROW-9507](https://issues.apache.org/jira/browse/ARROW-9507) - [Rust] [DataFusion] PhysicalExpr should implement Display trait
+* [ARROW-9508](https://issues.apache.org/jira/browse/ARROW-9508) - [Release][APT][Yum] Enable verification for arm64 binaries
+* [ARROW-9516](https://issues.apache.org/jira/browse/ARROW-9516) - [Rust][DataFusion] Refactor physical expressions to not care about their names nor indexes
+* [ARROW-9517](https://issues.apache.org/jira/browse/ARROW-9517) - [C++][Python] Allow session\_token argument when initializing S3FileSystem
+* [ARROW-9518](https://issues.apache.org/jira/browse/ARROW-9518) - [Python] Deprecate pyarrow serialization
+* [ARROW-9521](https://issues.apache.org/jira/browse/ARROW-9521) - [Rust] CsvReadOptions should allow file extension to be specified
+* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel
+* [ARROW-9534](https://issues.apache.org/jira/browse/ARROW-9534) - [Rust] [DataFusion] Implement functions for creating literal expressions for all types
+* [ARROW-9550](https://issues.apache.org/jira/browse/ARROW-9550) - [Rust] [DataFusion] Remove Rc<RefCell<\_\>\> from hash aggregate operator
+* [ARROW-9553](https://issues.apache.org/jira/browse/ARROW-9553) - [Rust] Release script doesn't bump parquet crate's arrow dependency version
+* [ARROW-9557](https://issues.apache.org/jira/browse/ARROW-9557) - [R] Iterating over parquet columns is slow in R
+* [ARROW-9559](https://issues.apache.org/jira/browse/ARROW-9559) - [Rust] [DataFusion] Revert privatization of exprlist\_to\_fields
+* [ARROW-9563](https://issues.apache.org/jira/browse/ARROW-9563) - [Dev][Release] Use archery's changelog generator when creating release notes for the website
+* [ARROW-9568](https://issues.apache.org/jira/browse/ARROW-9568) - [CI] Use official msys action on GHA
+* [ARROW-9576](https://issues.apache.org/jira/browse/ARROW-9576) - [Python][Doc] Fix error in code example for extension types
+* [ARROW-9580](https://issues.apache.org/jira/browse/ARROW-9580) - [JS] Docs have superfluous ()
+* [ARROW-9581](https://issues.apache.org/jira/browse/ARROW-9581) - [Dev][Release] Bump next snapshot versions to 2.0.0
+* [ARROW-9582](https://issues.apache.org/jira/browse/ARROW-9582) - [Rust] Implement Array::memory\_size()
+* [ARROW-9585](https://issues.apache.org/jira/browse/ARROW-9585) - [Rust] Remove duplicated to-do line in DataFusion readme
+* [ARROW-9587](https://issues.apache.org/jira/browse/ARROW-9587) - [FlightRPC][Java] Clean up DoPut/FlightStream memory handling
+* [ARROW-9593](https://issues.apache.org/jira/browse/ARROW-9593) - [Python] Add custom pickle reducers for DictionaryScalar
+* [ARROW-9604](https://issues.apache.org/jira/browse/ARROW-9604) - [C++] Add benchmark for aggregate min/max compute kernels
+* [ARROW-9605](https://issues.apache.org/jira/browse/ARROW-9605) - [C++] Optimize performance for aggregate min/max compute kernels
+* [ARROW-9607](https://issues.apache.org/jira/browse/ARROW-9607) - [C++][Gandiva] Add bitwise\_and(), bitwise\_or() and bitwise\_not() functions for integers
+* [ARROW-9608](https://issues.apache.org/jira/browse/ARROW-9608) - [Rust] Remove arrow flight from parquet's feature gating
+* [ARROW-9615](https://issues.apache.org/jira/browse/ARROW-9615) - [Rust] Add kernel to compute length of string array
+* [ARROW-9617](https://issues.apache.org/jira/browse/ARROW-9617) - [Rust] [DataFusion] Add length of string array
+* [ARROW-9618](https://issues.apache.org/jira/browse/ARROW-9618) - [Rust] [DataFusion] Make it easier to write optimizers
+* [ARROW-9619](https://issues.apache.org/jira/browse/ARROW-9619) - [Rust] [DataFusion] Add predicate push-down
+* [ARROW-9632](https://issues.apache.org/jira/browse/ARROW-9632) - [Rust] Add a "new" method for ExecutionContextSchemaProvider
+* [ARROW-9638](https://issues.apache.org/jira/browse/ARROW-9638) - [C++][Compute] Implement mode(most frequent number) kernel
+* [ARROW-9639](https://issues.apache.org/jira/browse/ARROW-9639) - [Ruby] Add dependency version check
+* [ARROW-9640](https://issues.apache.org/jira/browse/ARROW-9640) - [C++][Gandiva] Implement round() for integers and long integers
+* [ARROW-9641](https://issues.apache.org/jira/browse/ARROW-9641) - [C++][Gandiva] Implement round() for floating point and double floating point numbers
+* [ARROW-9645](https://issues.apache.org/jira/browse/ARROW-9645) - [Python] Deprecate the legacy pyarrow.filesystem interface
+* [ARROW-9646](https://issues.apache.org/jira/browse/ARROW-9646) - [C++][Dataset] Add support for writing parquet datasets
+* [ARROW-9650](https://issues.apache.org/jira/browse/ARROW-9650) - [Packaging][APT] Drop support for Ubuntu 19.10
+* [ARROW-9654](https://issues.apache.org/jira/browse/ARROW-9654) - [Rust][DataFusion] Add an EXPLAIN command to the datafusion CLI
+* [ARROW-9656](https://issues.apache.org/jira/browse/ARROW-9656) - [Rust][DataFusion] Slightly confusing error message when unsupported type is provided to CREATE EXTERNAL TABLE
+* [ARROW-9658](https://issues.apache.org/jira/browse/ARROW-9658) - [Python][Dataset] Bindings for dataset writing
+* [ARROW-9665](https://issues.apache.org/jira/browse/ARROW-9665) - [R] head/tail/take for Datasets
+* [ARROW-9667](https://issues.apache.org/jira/browse/ARROW-9667) - [CI][Crossbow] Segfault in 2 nightly R builds
+* [ARROW-9671](https://issues.apache.org/jira/browse/ARROW-9671) - [C++] BasicDecimal128 constructor interprets uint64\_t integers with highest bit set as negative
+* [ARROW-9673](https://issues.apache.org/jira/browse/ARROW-9673) - [Rust] Add a param "dialect" for DFParser::parse\_sql
+* [ARROW-9678](https://issues.apache.org/jira/browse/ARROW-9678) - [Rust] [DataFusion] Improve projection push down to remove unused columns
+* [ARROW-9679](https://issues.apache.org/jira/browse/ARROW-9679) - [Rust] [DataFusion] HashAggregate walks map many times building final batch
+* [ARROW-9681](https://issues.apache.org/jira/browse/ARROW-9681) - [Java] Arrow Memory - Core fails on big-endian platforms
+* [ARROW-9683](https://issues.apache.org/jira/browse/ARROW-9683) - [Rust][DataFusion] Implement Debug for ExecutionPlan trait
+* [ARROW-9691](https://issues.apache.org/jira/browse/ARROW-9691) - [Rust] [DataFusion] Make sql\_statement\_to\_plan public
+* [ARROW-9695](https://issues.apache.org/jira/browse/ARROW-9695) - [Rust][DataFusion] Improve documentation on LogicalPlan variants
+* [ARROW-9699](https://issues.apache.org/jira/browse/ARROW-9699) - [C++][Compute] Improve mode kernel performance for small integer types
+* [ARROW-9701](https://issues.apache.org/jira/browse/ARROW-9701) - [Java][CI] Add a test job on s390x
+* [ARROW-9702](https://issues.apache.org/jira/browse/ARROW-9702) - [C++] Move bpacking simd to runtime path
+* [ARROW-9703](https://issues.apache.org/jira/browse/ARROW-9703) - [Developer][Archery] Restartable cherry-picking process for creating maintenance branches
+* [ARROW-9706](https://issues.apache.org/jira/browse/ARROW-9706) - [Java] Tests in TestLargeListVector fail on big-endian platforms
+* [ARROW-9710](https://issues.apache.org/jira/browse/ARROW-9710) - [C++] Generalize Decimal ToString in preparation for Decimal256
+* [ARROW-9711](https://issues.apache.org/jira/browse/ARROW-9711) - [Rust] Add benchmark based on TPC-H
+* [ARROW-9713](https://issues.apache.org/jira/browse/ARROW-9713) - [Rust][DataFusion] Remove explicit panics
+* [ARROW-9715](https://issues.apache.org/jira/browse/ARROW-9715) - [R] changelog/doc updates for 1.0.1
+* [ARROW-9718](https://issues.apache.org/jira/browse/ARROW-9718) - [Python] Make pyarrow.parquet work with the new filesystem interfaces
+* [ARROW-9721](https://issues.apache.org/jira/browse/ARROW-9721) - [Packaging][Python] Update wheel dependency files
+* [ARROW-9722](https://issues.apache.org/jira/browse/ARROW-9722) - [Rust]: Shorten key lifetime for reverse lookup for dictionary arrays
+* [ARROW-9723](https://issues.apache.org/jira/browse/ARROW-9723) - [C++] Expected behaviour of "mode" kernel with NaNs?
+* [ARROW-9725](https://issues.apache.org/jira/browse/ARROW-9725) - [Rust] [DataFusion] LimitExec and SortExec should use MergeExec
+* [ARROW-9737](https://issues.apache.org/jira/browse/ARROW-9737) - [C++][Gandiva] Add bitwise\_xor() for integers
+* [ARROW-9739](https://issues.apache.org/jira/browse/ARROW-9739) - [CI][Ruby] Don't install gem documents
+* [ARROW-9742](https://issues.apache.org/jira/browse/ARROW-9742) - [Rust] Create one standard DataFrame API
+* [ARROW-9751](https://issues.apache.org/jira/browse/ARROW-9751) - [Rust] [DataFusion] Extend UDFs to accept more than one type per argument
+* [ARROW-9752](https://issues.apache.org/jira/browse/ARROW-9752) - [Rust] [DataFusion] Add support for Aggregate UDFs
+* [ARROW-9753](https://issues.apache.org/jira/browse/ARROW-9753) - [Rust] [DataFusion] Remove the use of Mutex in ExecutionPlan trait
+* [ARROW-9754](https://issues.apache.org/jira/browse/ARROW-9754) - [Rust] [DataFusion] Implement async in DataFusion traits
+* [ARROW-9757](https://issues.apache.org/jira/browse/ARROW-9757) - [Rust] [DataFusion] Use "pub use" to expose a clean public API
+* [ARROW-9758](https://issues.apache.org/jira/browse/ARROW-9758) - [Rust] [DataFusion] Implement extension API for DataFusion
+* [ARROW-9759](https://issues.apache.org/jira/browse/ARROW-9759) - [Rust] [DataFusion] Implement DataFrame::sort
+* [ARROW-9760](https://issues.apache.org/jira/browse/ARROW-9760) - [Rust] [DataFusion] Implement DataFrame::explain
+* [ARROW-9761](https://issues.apache.org/jira/browse/ARROW-9761) - [C++] Add experimental pull-based iterator structures to C interface implementation
+* [ARROW-9762](https://issues.apache.org/jira/browse/ARROW-9762) - [Rust] [DataFusion] ExecutionContext::sql should return DataFrame
+* [ARROW-9769](https://issues.apache.org/jira/browse/ARROW-9769) - [Python] Remove skip for in-memory fsspec in test\_move\_file
+* [ARROW-9775](https://issues.apache.org/jira/browse/ARROW-9775) - [C++] Automatic S3 region selection
+* [ARROW-9781](https://issues.apache.org/jira/browse/ARROW-9781) - [C++] Fix uninitialized value warnings
+* [ARROW-9782](https://issues.apache.org/jira/browse/ARROW-9782) - [C++][Dataset] Ability to write ".feather" files with IpcFileFormat
+* [ARROW-9784](https://issues.apache.org/jira/browse/ARROW-9784) - [Rust] [DataFusion] Improve instructions for running TPC-H benchmark
+* [ARROW-9786](https://issues.apache.org/jira/browse/ARROW-9786) - [R] Unvendor cpp11 before release
+* [ARROW-9788](https://issues.apache.org/jira/browse/ARROW-9788) - Handle naming inconsistencies between SQL, DataFrame API and struct names
+* [ARROW-9792](https://issues.apache.org/jira/browse/ARROW-9792) - [Rust] [DataFusion] Logical aggregate functions should not return Result
+* [ARROW-9794](https://issues.apache.org/jira/browse/ARROW-9794) - [C++] Add functionality to cpu\_info to discriminate between Intel vs AMD x86
+* [ARROW-9795](https://issues.apache.org/jira/browse/ARROW-9795) - [C++][Gandiva] Implement castTIMESTAMP(int64) in Gandiva
+* [ARROW-9806](https://issues.apache.org/jira/browse/ARROW-9806) - [R] More compute kernel bindings
+* [ARROW-9807](https://issues.apache.org/jira/browse/ARROW-9807) - [R] News update/version bump post-1.0.1
+* [ARROW-9808](https://issues.apache.org/jira/browse/ARROW-9808) - [Python] parquet.read\_table docstring has a wrong use\_legacy\_dataset explanation
+* [ARROW-9811](https://issues.apache.org/jira/browse/ARROW-9811) - [C++] Unchecked floating point division by 0 should succeed
+* [ARROW-9813](https://issues.apache.org/jira/browse/ARROW-9813) - [C++] Disable semantic interposition
+* [ARROW-9819](https://issues.apache.org/jira/browse/ARROW-9819) - [C++] Bump mimalloc to 1.6.4
+* [ARROW-9821](https://issues.apache.org/jira/browse/ARROW-9821) - [Rust][DataFusion] User Defined PlanNode / Operator API
+* [ARROW-9823](https://issues.apache.org/jira/browse/ARROW-9823) - [CI][C++][MinGW] Enable S3
+* [ARROW-9832](https://issues.apache.org/jira/browse/ARROW-9832) - [Rust] [DataFusion] Refactor PhysicalPlan to remove Partition
+* [ARROW-9833](https://issues.apache.org/jira/browse/ARROW-9833) - [Rust] [DataFusion] Refactor TableProvider.scan to return ExecutionPlan
+* [ARROW-9834](https://issues.apache.org/jira/browse/ARROW-9834) - [Rust] [DataFusion] Remove Partition trait
+* [ARROW-9835](https://issues.apache.org/jira/browse/ARROW-9835) - [Rust] [DataFusion] Remove FunctionMeta
+* [ARROW-9836](https://issues.apache.org/jira/browse/ARROW-9836) - [Rust] [DataFusion] Improve API for usage of UDFs
+* [ARROW-9837](https://issues.apache.org/jira/browse/ARROW-9837) - [Rust] Add provider for variable
+* [ARROW-9838](https://issues.apache.org/jira/browse/ARROW-9838) - [Rust] [DataFusion] DefaultPhysicalPlanner should insert explicit MergeExec nodes
+* [ARROW-9839](https://issues.apache.org/jira/browse/ARROW-9839) - [Rust] [DataFusion] Add ability to downcast ExecutionPlan to specific operator
+* [ARROW-9841](https://issues.apache.org/jira/browse/ARROW-9841) - [Rust] Update checked-in flatbuffer files
+* [ARROW-9844](https://issues.apache.org/jira/browse/ARROW-9844) - [Go][CI] Add Travis CI job for Go on s390x
+* [ARROW-9845](https://issues.apache.org/jira/browse/ARROW-9845) - [Rust] [Parquet] serde\_json is only used in tests but isn't in dev-dependencies
+* [ARROW-9848](https://issues.apache.org/jira/browse/ARROW-9848) - [Rust] Implement changes to ensure flatbuffer alignment
+* [ARROW-9849](https://issues.apache.org/jira/browse/ARROW-9849) - [Rust] [DataFusion] Make UDFs not need a Field
+* [ARROW-9850](https://issues.apache.org/jira/browse/ARROW-9850) - [Go] Defer should not be used in the loop
+* [ARROW-9853](https://issues.apache.org/jira/browse/ARROW-9853) - [Rust] Implement "take" kernel for dictionary arrays
+* [ARROW-9854](https://issues.apache.org/jira/browse/ARROW-9854) - [R] Support reading/writing data to/from S3
+* [ARROW-9858](https://issues.apache.org/jira/browse/ARROW-9858) - [C++][Python][Docs] Expand user guide for FileSystem
+* [ARROW-9863](https://issues.apache.org/jira/browse/ARROW-9863) - [C++][Parquet] Optimize metadata recovery of ApplicationVersion
+* [ARROW-9867](https://issues.apache.org/jira/browse/ARROW-9867) - [C++][Dataset] FileSystemDataset should expose its filesystem
+* [ARROW-9868](https://issues.apache.org/jira/browse/ARROW-9868) - [C++] Provide utility for copying files between filesystems
+* [ARROW-9869](https://issues.apache.org/jira/browse/ARROW-9869) - [R] Implement full S3FileSystem/S3Options constructor
+* [ARROW-9870](https://issues.apache.org/jira/browse/ARROW-9870) - [R] Friendly interface for filesystems (S3)
+* [ARROW-9871](https://issues.apache.org/jira/browse/ARROW-9871) - [C++] Add uppercase support to ARROW\_USER\_SIMD\_LEVEL
+* [ARROW-9873](https://issues.apache.org/jira/browse/ARROW-9873) - [C++][Compute] Improve mode kernel for integers within a limited value range
+* [ARROW-9875](https://issues.apache.org/jira/browse/ARROW-9875) - [Python] Let FileSystem.get\_file\_info accept a single path
+* [ARROW-9884](https://issues.apache.org/jira/browse/ARROW-9884) - [R] Bindings for writing datasets to Parquet
+* [ARROW-9885](https://issues.apache.org/jira/browse/ARROW-9885) - [Rust] [DataFusion] Simplify code of type coercion for binary types
+* [ARROW-9886](https://issues.apache.org/jira/browse/ARROW-9886) - [Rust] [DataFusion] Simplify code to test cast
+* [ARROW-9887](https://issues.apache.org/jira/browse/ARROW-9887) - [Rust] [DataFusion] Add support for complex return types of built-in functions
+* [ARROW-9890](https://issues.apache.org/jira/browse/ARROW-9890) - [R] Add zstandard compression codec in macOS build
+* [ARROW-9891](https://issues.apache.org/jira/browse/ARROW-9891) - [Rust] [DataFusion] Make math functions support f32
+* [ARROW-9892](https://issues.apache.org/jira/browse/ARROW-9892) - [Rust] [DataFusion] Add support for concat
+* [ARROW-9893](https://issues.apache.org/jira/browse/ARROW-9893) - [Python] Bindings for writing datasets to Parquet
+* [ARROW-9895](https://issues.apache.org/jira/browse/ARROW-9895) - [Rust] Improve sort kernels
+* [ARROW-9899](https://issues.apache.org/jira/browse/ARROW-9899) - [Rust] [DataFusion] Switch from Box<Schema\> --\> SchemaRef (Arc<Schema\>) to be consistent with the rest of Arrow
+* [ARROW-9900](https://issues.apache.org/jira/browse/ARROW-9900) - [Rust][DataFusion] Use Arc<\> instead of Box<\> in LogicalPlan
+* [ARROW-9901](https://issues.apache.org/jira/browse/ARROW-9901) - [C++] Add hand-crafted Parquet to Arrow reconstruction test for nested reading
+* [ARROW-9902](https://issues.apache.org/jira/browse/ARROW-9902) - [Rust] [DataFusion] Add support for array()
+* [ARROW-9904](https://issues.apache.org/jira/browse/ARROW-9904) - [C++] Unroll the loop manually for CountSetBits
+* [ARROW-9908](https://issues.apache.org/jira/browse/ARROW-9908) - [Rust] Support temporal data types in JSON reader
+* [ARROW-9910](https://issues.apache.org/jira/browse/ARROW-9910) - [Rust] [DataFusion] Type coercion of Variadic is wrong
+* [ARROW-9914](https://issues.apache.org/jira/browse/ARROW-9914) - [Rust][DataFusion] Document the SQL -\> Arrow type mapping
+* [ARROW-9916](https://issues.apache.org/jira/browse/ARROW-9916) - [Rust] Avoid cloning ArrayData in several places
+* [ARROW-9917](https://issues.apache.org/jira/browse/ARROW-9917) - [Python][Compute] Add bindings for mode kernel
+* [ARROW-9919](https://issues.apache.org/jira/browse/ARROW-9919) - [Rust] [DataFusion] Math functions
+* [ARROW-9921](https://issues.apache.org/jira/browse/ARROW-9921) - [Rust] Add \`from(Vec<Option<&str\>\>)\` to [Large]StringArray
+* [ARROW-9925](https://issues.apache.org/jira/browse/ARROW-9925) - [GLib] Add low level value readers for GArrowListArray family
+* [ARROW-9926](https://issues.apache.org/jira/browse/ARROW-9926) - [GLib] Use placement new for GArrowRecordBatchFileReader
+* [ARROW-9928](https://issues.apache.org/jira/browse/ARROW-9928) - [C++] Speed up integer parsing slightly
+* [ARROW-9929](https://issues.apache.org/jira/browse/ARROW-9929) - [Developer] Autotune cmake-format
+* [ARROW-9933](https://issues.apache.org/jira/browse/ARROW-9933) - [Developer] Add drone as a CI provider for crossbow
+* [ARROW-9934](https://issues.apache.org/jira/browse/ARROW-9934) - [Rust] Shape and stride check in tensor
+* [ARROW-9941](https://issues.apache.org/jira/browse/ARROW-9941) - [Python] Better string representation for extension types
+* [ARROW-9944](https://issues.apache.org/jira/browse/ARROW-9944) - [Rust] Implement TO\_TIMESTAMP function
+* [ARROW-9949](https://issues.apache.org/jira/browse/ARROW-9949) - [C++] Generalize Decimal128::FromString for reuse in Decimal256
+* [ARROW-9950](https://issues.apache.org/jira/browse/ARROW-9950) - [Rust] [DataFusion] Allow UDF usage without registry
+* [ARROW-9952](https://issues.apache.org/jira/browse/ARROW-9952) - [Python] Use pyarrow.dataset writing for pq.write\_to\_dataset
+* [ARROW-9954](https://issues.apache.org/jira/browse/ARROW-9954) - [Rust] [DataFusion] Simplify code of aggregate planning
+* [ARROW-9956](https://issues.apache.org/jira/browse/ARROW-9956) - [C++][Gandiva] Implement Binary string function in Gandiva
+* [ARROW-9957](https://issues.apache.org/jira/browse/ARROW-9957) - [Rust] Remove unmaintained tempdir dependency
+* [ARROW-9961](https://issues.apache.org/jira/browse/ARROW-9961) - [Rust][DataFusion] to\_timestamp function parses timestamp without timezone offset as UTC rather than local
+* [ARROW-9964](https://issues.apache.org/jira/browse/ARROW-9964) - [C++] CSV date support
+* [ARROW-9965](https://issues.apache.org/jira/browse/ARROW-9965) - [Java] Buffer capacity calculations are slow for fixed-width vectors
+* [ARROW-9966](https://issues.apache.org/jira/browse/ARROW-9966) - [Rust] Speedup aggregate kernels
+* [ARROW-9967](https://issues.apache.org/jira/browse/ARROW-9967) - [Python] Add compute module docs
+* [ARROW-9971](https://issues.apache.org/jira/browse/ARROW-9971) - [Rust] Speedup take
+* [ARROW-9977](https://issues.apache.org/jira/browse/ARROW-9977) - [Rust] Add min/max for [Large]String
+* [ARROW-9979](https://issues.apache.org/jira/browse/ARROW-9979) - [Rust] Fix arrow crate clippy lints
+* [ARROW-9980](https://issues.apache.org/jira/browse/ARROW-9980) - [Rust] Fix parquet crate clippy lints
+* [ARROW-9981](https://issues.apache.org/jira/browse/ARROW-9981) - [Rust] Allow configuring flight IPC with IpcWriteOptions
+* [ARROW-9983](https://issues.apache.org/jira/browse/ARROW-9983) - [C++][Dataset][Python] Use larger default batch size than 32K for Datasets API
+* [ARROW-9984](https://issues.apache.org/jira/browse/ARROW-9984) - [Rust] [DataFusion] DRY of function to string
+* [ARROW-9986](https://issues.apache.org/jira/browse/ARROW-9986) - [Rust][DataFusion] TO\_TIMESTAMP function erroneously requires fractional seconds when no timezone is present
+* [ARROW-9987](https://issues.apache.org/jira/browse/ARROW-9987) - [Rust] [DataFusion] Improve docs of \`Expr\`.
+* [ARROW-9988](https://issues.apache.org/jira/browse/ARROW-9988) - [Rust] [DataFusion] Added std::ops to logical expressions
+* [ARROW-9992](https://issues.apache.org/jira/browse/ARROW-9992) - [C++][Python] Refactor python to arrow conversions based on a reusable conversion API
+* [ARROW-9998](https://issues.apache.org/jira/browse/ARROW-9998) - [Python] Support pickling DictionaryScalar
+* [ARROW-9999](https://issues.apache.org/jira/browse/ARROW-9999) - [Python] Support constructing dictionary array directly through pa.array() (see the sketch after this list)
+* [ARROW-10000](https://issues.apache.org/jira/browse/ARROW-10000) - [C++][Python] Support constructing StructArray from list of key-value pairs
+* [ARROW-10001](https://issues.apache.org/jira/browse/ARROW-10001) - [Rust] [DataFusion] Add developer guide to README
+* [ARROW-10010](https://issues.apache.org/jira/browse/ARROW-10010) - [Rust] Speedup arithmetic
+* [ARROW-10015](https://issues.apache.org/jira/browse/ARROW-10015) - [Rust] Implement SIMD for aggregate kernel sum
+* [ARROW-10016](https://issues.apache.org/jira/browse/ARROW-10016) - [Rust] [DataFusion] Implement IsNull and IsNotNull
+* [ARROW-10018](https://issues.apache.org/jira/browse/ARROW-10018) - [CI] Disable Sphinx and API documentation build since it takes 6 hours on master
+* [ARROW-10019](https://issues.apache.org/jira/browse/ARROW-10019) - [Rust] Add substring kernel
+* [ARROW-10023](https://issues.apache.org/jira/browse/ARROW-10023) - [Gandiva][C++] Implement Split part function in Gandiva
+* [ARROW-10024](https://issues.apache.org/jira/browse/ARROW-10024) - [C++][Parquet] Create nested reading benchmarks
+* [ARROW-10028](https://issues.apache.org/jira/browse/ARROW-10028) - [Rust] Simplify macro def\_numeric\_from\_vec
+* [ARROW-10030](https://issues.apache.org/jira/browse/ARROW-10030) - [Rust] Support fromIter and toIter
+* [ARROW-10035](https://issues.apache.org/jira/browse/ARROW-10035) - [C++] Bump versions of vendored code
+* [ARROW-10037](https://issues.apache.org/jira/browse/ARROW-10037) - [C++] Workaround to force find AWS SDK to look for shared libraries
+* [ARROW-10040](https://issues.apache.org/jira/browse/ARROW-10040) - [Rust] Create a way to slice unaligned offset buffers
+* [ARROW-10043](https://issues.apache.org/jira/browse/ARROW-10043) - [Rust] [DataFusion] Introduce support for DISTINCT by partially implementing COUNT(DISTINCT)
+* [ARROW-10044](https://issues.apache.org/jira/browse/ARROW-10044) - [Rust] Improve README
+* [ARROW-10046](https://issues.apache.org/jira/browse/ARROW-10046) - [Rust] [DataFusion] Made \`\*Iterator\` implement Iterator
+* [ARROW-10050](https://issues.apache.org/jira/browse/ARROW-10050) - [C++][Gandiva] Implement concat() in Gandiva for up to 10 arguments
+* [ARROW-10051](https://issues.apache.org/jira/browse/ARROW-10051) - [C++][Compute] Make aggregate kernel merge state mutable
+* [ARROW-10054](https://issues.apache.org/jira/browse/ARROW-10054) - [Python] Slice methods should return empty arrays instead of crashing
+* [ARROW-10055](https://issues.apache.org/jira/browse/ARROW-10055) - [Rust] Implement DoubleEndedIterator for NullableIter
+* [ARROW-10057](https://issues.apache.org/jira/browse/ARROW-10057) - [C++] Add Parquet-Arrow roundtrip tests for nested data
+* [ARROW-10058](https://issues.apache.org/jira/browse/ARROW-10058) - [C++] Investigate performance of LevelsToBitmap without BMI2
+* [ARROW-10059](https://issues.apache.org/jira/browse/ARROW-10059) - [R][Doc] Give more advice on how to set up C++ build
+* [ARROW-10063](https://issues.apache.org/jira/browse/ARROW-10063) - [Archery][CI] Fetch main branch in archery build only when it is a pull request
+* [ARROW-10064](https://issues.apache.org/jira/browse/ARROW-10064) - [C++] Resolve compile warnings on Apple Clang 12
+* [ARROW-10065](https://issues.apache.org/jira/browse/ARROW-10065) - [Rust] DRY downcasted Arrays
+* [ARROW-10066](https://issues.apache.org/jira/browse/ARROW-10066) - [C++] Make sure that default AWS region is respected
+* [ARROW-10068](https://issues.apache.org/jira/browse/ARROW-10068) - [C++] Add bundled external project for aws-sdk-cpp
+* [ARROW-10069](https://issues.apache.org/jira/browse/ARROW-10069) - [Java] Support running Java benchmarks from command line
+* [ARROW-10070](https://issues.apache.org/jira/browse/ARROW-10070) - [C++][Compute] Implement stdev aggregate kernel
+* [ARROW-10071](https://issues.apache.org/jira/browse/ARROW-10071) - [R] segfault with ArrowObject from previous session, or saved
+* [ARROW-10074](https://issues.apache.org/jira/browse/ARROW-10074) - [C++] Don't use string\_view.to\_string()
+* [ARROW-10075](https://issues.apache.org/jira/browse/ARROW-10075) - [C++] Don't use nonstd::nullopt; it breaks our vendoring abstraction.
+* [ARROW-10076](https://issues.apache.org/jira/browse/ARROW-10076) - [C++] Use TemporaryDir for all tests that don't already use it.
+* [ARROW-10077](https://issues.apache.org/jira/browse/ARROW-10077) - [C++] Potential overflow in bit\_stream\_utils.h multiplication.
+* [ARROW-10083](https://issues.apache.org/jira/browse/ARROW-10083) - [C++] Improve Parquet fuzz seed corpus
+* [ARROW-10084](https://issues.apache.org/jira/browse/ARROW-10084) - [Rust] [DataFusion] Add length of large string array
+* [ARROW-10086](https://issues.apache.org/jira/browse/ARROW-10086) - [Rust] Migrate min\_large\_string -\> min\_string kernels
+* [ARROW-10090](https://issues.apache.org/jira/browse/ARROW-10090) - [C++][Compute] Improve mode kernel
+* [ARROW-10092](https://issues.apache.org/jira/browse/ARROW-10092) - [Dev][Go] Add grpc generated go files to rat exclusion list
+* [ARROW-10093](https://issues.apache.org/jira/browse/ARROW-10093) - [R] Add ability to opt-out of int64 -\> int demotion
+* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes
+* [ARROW-10096](https://issues.apache.org/jira/browse/ARROW-10096) - [Rust] [DataFusion] Remove unused code
+* [ARROW-10099](https://issues.apache.org/jira/browse/ARROW-10099) - [C++][Dataset] Also allow integer partition fields to be dictionary encoded
+* [ARROW-10100](https://issues.apache.org/jira/browse/ARROW-10100) - [C++][Dataset] Ability to read/subset a ParquetFileFragment with given set of row group ids
+* [ARROW-10102](https://issues.apache.org/jira/browse/ARROW-10102) - [C++] Generalize BasicDecimal128::operator\*= for reuse in Decimal256
+* [ARROW-10103](https://issues.apache.org/jira/browse/ARROW-10103) - [Rust] Add a Contains kernel
+* [ARROW-10105](https://issues.apache.org/jira/browse/ARROW-10105) - [FlightRPC] Add client option to disable certificate validation with TLS
+* [ARROW-10120](https://issues.apache.org/jira/browse/ARROW-10120) - [C++][Parquet] Create reading benchmarks for 2-level nested data
+* [ARROW-10127](https://issues.apache.org/jira/browse/ARROW-10127) - [Format] Update specification to support 256-bit Decimal types
+* [ARROW-10129](https://issues.apache.org/jira/browse/ARROW-10129) - [Rust] Cargo build is rebuilding dependencies on arrow changes
+* [ARROW-10134](https://issues.apache.org/jira/browse/ARROW-10134) - [C++][Dataset] Add ParquetFileFragment::num\_row\_groups property
+* [ARROW-10139](https://issues.apache.org/jira/browse/ARROW-10139) - [C++] Add support for building arrow\_testing without building tests
+* [ARROW-10148](https://issues.apache.org/jira/browse/ARROW-10148) - [Rust] Add documentation to lib.rs
+* [ARROW-10151](https://issues.apache.org/jira/browse/ARROW-10151) - [Python] Add support for MapArray to\_pandas conversion
+* [ARROW-10155](https://issues.apache.org/jira/browse/ARROW-10155) - [Rust] [DataFusion] Add documentation to lib.rs
+* [ARROW-10156](https://issues.apache.org/jira/browse/ARROW-10156) - [Rust] Auto-label PRs
+* [ARROW-10157](https://issues.apache.org/jira/browse/ARROW-10157) - [Rust] Add more documentation about take
+* [ARROW-10160](https://issues.apache.org/jira/browse/ARROW-10160) - [Rust] Improve documentation of DictionaryType
+* [ARROW-10161](https://issues.apache.org/jira/browse/ARROW-10161) - [Rust] [DataFusion] Simplify expression tests
+* [ARROW-10162](https://issues.apache.org/jira/browse/ARROW-10162) - [Rust] Support display of DictionaryArrays in pretty printing
+* [ARROW-10164](https://issues.apache.org/jira/browse/ARROW-10164) - [Rust] Add support for DictionaryArray types to cast kernels
+* [ARROW-10167](https://issues.apache.org/jira/browse/ARROW-10167) - [Rust] Support display of DictionaryArrays in sql.rs
+* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields
+* [ARROW-10171](https://issues.apache.org/jira/browse/ARROW-10171) - [Rust] [DataFusion] Add \`ExecutionContext::from<ExecutionContextState\>\`
+* [ARROW-10190](https://issues.apache.org/jira/browse/ARROW-10190) - [Website] Add Jorge to list of committers
+* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip tests for single column batches
+* [ARROW-10196](https://issues.apache.org/jira/browse/ARROW-10196) - [C++] Add Future::DeferNotOk()
+* [ARROW-10199](https://issues.apache.org/jira/browse/ARROW-10199) - [Rust][Parquet] Release Parquet at crates.io to remove debug prints
+* [ARROW-10201](https://issues.apache.org/jira/browse/ARROW-10201) - [C++][CI] Disable S3 in arm64 job on Travis CI
+* [ARROW-10202](https://issues.apache.org/jira/browse/ARROW-10202) - [CI][Windows] Use sf.net mirror for MSYS2
+* [ARROW-10205](https://issues.apache.org/jira/browse/ARROW-10205) - [Java][FlightRPC] Add client option to disable server verification
+* [ARROW-10206](https://issues.apache.org/jira/browse/ARROW-10206) - [Python][C++][FlightRPC] Add client option to disable server validation
+* [ARROW-10215](https://issues.apache.org/jira/browse/ARROW-10215) - [Rust] [DataFusion] Rename "Source" typedef
+* [ARROW-10217](https://issues.apache.org/jira/browse/ARROW-10217) - [CI] Run fewer GitHub Actions jobs
+* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests
+* [ARROW-10227](https://issues.apache.org/jira/browse/ARROW-10227) - [Ruby] Use the table size as the default for parquet chunk\_size
+* [ARROW-10229](https://issues.apache.org/jira/browse/ARROW-10229) - [C++][Parquet] Remove left over ARROW\_LOG statement.
+* [ARROW-10231](https://issues.apache.org/jira/browse/ARROW-10231) - [CI] Unable to download minio in arm32v7 docker image
+* [ARROW-10233](https://issues.apache.org/jira/browse/ARROW-10233) - [Rust] Make array\_value\_to\_string available in all Arrow builds
+* [ARROW-10235](https://issues.apache.org/jira/browse/ARROW-10235) - [Rust][DataFusion] Improve documentation for type coercion
+* [ARROW-10240](https://issues.apache.org/jira/browse/ARROW-10240) - [Rust] [Datafusion] Optionally load tpch data into memory before running benchmark query
+* [ARROW-10251](https://issues.apache.org/jira/browse/ARROW-10251) - [Rust] [DataFusion] MemTable::load() should load partitions in parallel
+* [ARROW-10252](https://issues.apache.org/jira/browse/ARROW-10252) - [Python] Add option to skip inclusion of Arrow headers in Python installation
+* [ARROW-10256](https://issues.apache.org/jira/browse/ARROW-10256) - [C++][Flight] Disable -Werror carefully
+* [ARROW-10257](https://issues.apache.org/jira/browse/ARROW-10257) - [R] Prepare news/docs for 2.0 release
+* [ARROW-10260](https://issues.apache.org/jira/browse/ARROW-10260) - [Python] Missing MapType to Pandas dtype
+* [ARROW-10265](https://issues.apache.org/jira/browse/ARROW-10265) - [CI] Use smaller build when cache doesn't exist on Travis CI
+* [ARROW-10266](https://issues.apache.org/jira/browse/ARROW-10266) - [CI][macOS] Ensure using Python 3.8 with Homebrew
+* [ARROW-10267](https://issues.apache.org/jira/browse/ARROW-10267) - [Python] Skip flight test if disable\_server\_verification feature is not available
+* [ARROW-10272](https://issues.apache.org/jira/browse/ARROW-10272) - [Packaging][Python] Pin newer multibuild version to avoid updating homebrew
+* [ARROW-10273](https://issues.apache.org/jira/browse/ARROW-10273) - [CI][Homebrew] Fix "brew audit" usage
+* [ARROW-10287](https://issues.apache.org/jira/browse/ARROW-10287) - [C++] Avoid std::random\_device whenever possible
+* [PARQUET-1845](https://issues.apache.org/jira/browse/PARQUET-1845) - [C++] Int96 memory images in test cases assume only little-endian
+* [PARQUET-1878](https://issues.apache.org/jira/browse/PARQUET-1878) - [C++] lz4 codec is not compatible with Hadoop Lz4Codec
+* [PARQUET-1904](https://issues.apache.org/jira/browse/PARQUET-1904) - [C++] Export file\_offset in RowGroupMetaData
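+
+Several entries in the list above add user-visible APIs. As one concrete illustration, the sketch below shows what constructing a dictionary array directly through pa.array() (ARROW-9999) looks like; it is a minimal usage sketch assuming pyarrow 2.0.0 or later, not code taken from the release itself.
+
+```python
+# Hedged sketch of ARROW-9999: pa.array() can dictionary-encode input directly
+# when asked for a dictionary type (assumes pyarrow >= 2.0.0).
+import pyarrow as pa
+
+arr = pa.array(["red", "green", "red", "blue"],
+               type=pa.dictionary(pa.int8(), pa.string()))
+
+print(arr.type)        # dictionary<values=string, indices=int8, ordered=0>
+print(arr.dictionary)  # the unique values, in first-appearance order
+print(arr.indices)     # int8 indices pointing into the dictionary
+```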
+
+
+
+# Apache Arrow 1.0.0 (2020-07-20)
+
+## Bug Fixes
+
+* [ARROW-1692](https://issues.apache.org/jira/browse/ARROW-1692) - [Python, Java] UnionArray round trip not working
+* [ARROW-3329](https://issues.apache.org/jira/browse/ARROW-3329) - [Python] Error casting decimal(38, 4) to int64
+* [ARROW-3861](https://issues.apache.org/jira/browse/ARROW-3861) - [Python] ParquetDataset().read columns argument always returns partition column
+* [ARROW-4018](https://issues.apache.org/jira/browse/ARROW-4018) - [C++] RLE decoder may not be big-endian compatible
+* [ARROW-4309](https://issues.apache.org/jira/browse/ARROW-4309) - [Documentation] Add a docker-compose entry which builds the documentation with CUDA enabled
+* [ARROW-4600](https://issues.apache.org/jira/browse/ARROW-4600) - [Ruby] Arrow::DictionaryArray\#[] should return the item in the indices array
+* [ARROW-5158](https://issues.apache.org/jira/browse/ARROW-5158) - [Packaging][Wheel] Symlink libraries in wheels
+* [ARROW-5310](https://issues.apache.org/jira/browse/ARROW-5310) - [Python] Better error message on creating ParquetDataset from empty directory
+* [ARROW-5359](https://issues.apache.org/jira/browse/ARROW-5359) - [Python] timestamp\_as\_object support for pa.Table.to\_pandas in pyarrow
+* [ARROW-5572](https://issues.apache.org/jira/browse/ARROW-5572) - [Python] Raise an error when passing an invalid filter in parquet reading
+* [ARROW-5666](https://issues.apache.org/jira/browse/ARROW-5666) - [Python] Underscores in partition (string) values are dropped when reading dataset
+* [ARROW-5744](https://issues.apache.org/jira/browse/ARROW-5744) - [C++] Do not error in Table::CombineChunks for BinaryArray types that overflow 2GB limit
+* [ARROW-5875](https://issues.apache.org/jira/browse/ARROW-5875) - [FlightRPC] Test RPC features in integration tests
+* [ARROW-6235](https://issues.apache.org/jira/browse/ARROW-6235) - [R] Conversion from arrow::BinaryArray to R character vector not implemented
+* [ARROW-6523](https://issues.apache.org/jira/browse/ARROW-6523) - [C++][Dataset] arrow\_dataset target does not depend on anything
+* [ARROW-6848](https://issues.apache.org/jira/browse/ARROW-6848) - [C++] Specify -std=c++11 instead of -std=gnu++11 when building
+* [ARROW-7018](https://issues.apache.org/jira/browse/ARROW-7018) - [R] Non-UTF-8 data in Arrow <--\> R conversion
+* [ARROW-7028](https://issues.apache.org/jira/browse/ARROW-7028) - [R] Date roundtrip results in different R storage mode
+* [ARROW-7084](https://issues.apache.org/jira/browse/ARROW-7084) - [C++] ArrayRangeEquals should check for full type equality?
+* [ARROW-7173](https://issues.apache.org/jira/browse/ARROW-7173) - [Integration] Add test to verify Map field names can be arbitrary
+* [ARROW-7208](https://issues.apache.org/jira/browse/ARROW-7208) - [Python] Passing directory to ParquetFile class gives confusing error message
+* [ARROW-7273](https://issues.apache.org/jira/browse/ARROW-7273) - [Python] Non-nullable null field is allowed / crashes when writing to parquet
+* [ARROW-7480](https://issues.apache.org/jira/browse/ARROW-7480) - [Rust] [DataFusion] Query fails/incorrect when aggregated + grouped columns don't match the selected columns
+* [ARROW-7610](https://issues.apache.org/jira/browse/ARROW-7610) - [Java] Finish support for 64 bit int allocations
+* [ARROW-7654](https://issues.apache.org/jira/browse/ARROW-7654) - [Python] Ability to set column\_types to a Schema in csv.ConvertOptions is undocumented
+* [ARROW-7681](https://issues.apache.org/jira/browse/ARROW-7681) - [Rust] Explicitly seeking a BufReader will discard the internal buffer
+* [ARROW-7702](https://issues.apache.org/jira/browse/ARROW-7702) - [C++][Dataset] Provide (optional) deterministic order of batches
+* [ARROW-7782](https://issues.apache.org/jira/browse/ARROW-7782) - [Python] Losing index information when using write\_to\_dataset with partition\_cols
+* [ARROW-7840](https://issues.apache.org/jira/browse/ARROW-7840) - [Java] [Integration] Java executables fail
+* [ARROW-7925](https://issues.apache.org/jira/browse/ARROW-7925) - [C++][Documentation] Instructions about running IWYU and other tasks in cpp/development.rst have gone stale
+* [ARROW-7939](https://issues.apache.org/jira/browse/ARROW-7939) - [Python] crashes when reading parquet file compressed with snappy
+* [ARROW-7967](https://issues.apache.org/jira/browse/ARROW-7967) - [CI][Crossbow] Pin macOS version in autobrew job to match CRAN
+* [ARROW-8050](https://issues.apache.org/jira/browse/ARROW-8050) - [Python][Packaging] Do not include generated Cython source files in wheel packages
+* [ARROW-8078](https://issues.apache.org/jira/browse/ARROW-8078) - [Python] Missing links in the docs regarding field and schema DataTypes
+* [ARROW-8115](https://issues.apache.org/jira/browse/ARROW-8115) - [Python] Conversion when mixing NaT and datetime objects not working
+* [ARROW-8251](https://issues.apache.org/jira/browse/ARROW-8251) - [Python] pandas.ExtensionDtype does not survive round trip with write\_to\_dataset
+* [ARROW-8344](https://issues.apache.org/jira/browse/ARROW-8344) - [C\#] StringArray.Builder.Clear() corrupts subsequently-built array contents
+* [ARROW-8360](https://issues.apache.org/jira/browse/ARROW-8360) - [C++][Gandiva] Fixes date32 support for date/time functions
+* [ARROW-8374](https://issues.apache.org/jira/browse/ARROW-8374) - [R] Table to vector of DictionaryType will error when Arrays don't have the same Dictionary per array
+* [ARROW-8392](https://issues.apache.org/jira/browse/ARROW-8392) - [Java] Fix overflow related corner cases for vector value comparison
+* [ARROW-8448](https://issues.apache.org/jira/browse/ARROW-8448) - [Package] Can't build apt packages with ubuntu-focal
+* [ARROW-8455](https://issues.apache.org/jira/browse/ARROW-8455) - [Rust] [Parquet] Arrow column read on partially compatible files
+* [ARROW-8471](https://issues.apache.org/jira/browse/ARROW-8471) - [C++][Integration] Regression to /u?int64/ as JSON::number
+* [ARROW-8472](https://issues.apache.org/jira/browse/ARROW-8472) - [Go][Integration] Represent 64 bit integers as JSON::string
+* [ARROW-8473](https://issues.apache.org/jira/browse/ARROW-8473) - [Rust] "Statistics support" in rust/parquet readme is incorrect
+* [ARROW-8480](https://issues.apache.org/jira/browse/ARROW-8480) - [Rust] There is no check for allocation failure
+* [ARROW-8503](https://issues.apache.org/jira/browse/ARROW-8503) - [Packaging][deb] Can't build apache-arrow-archive-keyring for RC
+* [ARROW-8505](https://issues.apache.org/jira/browse/ARROW-8505) - [Release][C\#] "sourcelink test" is failed by Apache.Arrow.AssemblyInfo.cs
+* [ARROW-8508](https://issues.apache.org/jira/browse/ARROW-8508) - [Rust] ListBuilder of FixedSizeListBuilder creates wrong offsets
+* [ARROW-8510](https://issues.apache.org/jira/browse/ARROW-8510) - [C++] arrow/dataset/file\_base.cc fails to compile with internal compiler error with "Visual Studio 15 2017 Win64" generator
+* [ARROW-8511](https://issues.apache.org/jira/browse/ARROW-8511) - [Developer][Release] Windows release verification script does not halt if C++ compilation fails
+* [ARROW-8514](https://issues.apache.org/jira/browse/ARROW-8514) - [Developer] Windows wheel verification script does not check Python 3.5
+* [ARROW-8529](https://issues.apache.org/jira/browse/ARROW-8529) - [C++] Fix usage of NextCounts() in GetBatchWithDict[Spaced]
+* [ARROW-8535](https://issues.apache.org/jira/browse/ARROW-8535) - [Rust] Arrow crate does not specify arrow-flight version
+* [ARROW-8536](https://issues.apache.org/jira/browse/ARROW-8536) - [Rust] Failed to locate format/Flight.proto in any parent directory
+* [ARROW-8537](https://issues.apache.org/jira/browse/ARROW-8537) - [C++] Performance regression from ARROW-8523
+* [ARROW-8539](https://issues.apache.org/jira/browse/ARROW-8539) - [CI] "AMD64 MacOS 10.15 GLib & Ruby" fails
+* [ARROW-8554](https://issues.apache.org/jira/browse/ARROW-8554) - [C++][Benchmark] Fix building error "cannot bind lvalue"
+* [ARROW-8556](https://issues.apache.org/jira/browse/ARROW-8556) - [R] zstd symbol not found if there are multiple installations of zstd
+* [ARROW-8566](https://issues.apache.org/jira/browse/ARROW-8566) - [R] error when writing POSIXct to spark
+* [ARROW-8568](https://issues.apache.org/jira/browse/ARROW-8568) - [C++][Python] Crash on decimal cast in debug mode
+* [ARROW-8577](https://issues.apache.org/jira/browse/ARROW-8577) - [Plasma] PlasmaClient::Connect() of a CUDA-enabled build always fails on machines with no CUDA device
+* [ARROW-8583](https://issues.apache.org/jira/browse/ARROW-8583) - [C++][Doc] Undocumented parameter in Dataset namespace
+* [ARROW-8584](https://issues.apache.org/jira/browse/ARROW-8584) - [Packaging][C++] Protobuf link error in deb builds
+* [ARROW-8585](https://issues.apache.org/jira/browse/ARROW-8585) - [Packaging][Python] Windows wheels fail to build because of link error
+* [ARROW-8586](https://issues.apache.org/jira/browse/ARROW-8586) - [R] installation failure on CentOS 7
+* [ARROW-8587](https://issues.apache.org/jira/browse/ARROW-8587) - [C++] Compilation error when linking arrow-flight-perf-server
+* [ARROW-8592](https://issues.apache.org/jira/browse/ARROW-8592) - [C++] Docs still list LLVM 7 as compiler used
+* [ARROW-8593](https://issues.apache.org/jira/browse/ARROW-8593) - [C++] Parquet file\_serialize\_test.cc fails to build with musl libc
+* [ARROW-8598](https://issues.apache.org/jira/browse/ARROW-8598) - [Rust] simd\_compare\_op creates buffer of incorrect length when item count is not a multiple of T::lanes()
+* [ARROW-8602](https://issues.apache.org/jira/browse/ARROW-8602) - [CMake] Fix ws2\_32 link issue when cross-compiling on Linux
+* [ARROW-8603](https://issues.apache.org/jira/browse/ARROW-8603) - [Documentation] Fix Sphinx doxygen comment
+* [ARROW-8604](https://issues.apache.org/jira/browse/ARROW-8604) - [R][CI] Update CI to use R 4.0
+* [ARROW-8608](https://issues.apache.org/jira/browse/ARROW-8608) - [C++] Update vendored mpark/variant.h to latest to fix NVCC compilation issues
+* [ARROW-8609](https://issues.apache.org/jira/browse/ARROW-8609) - [C++] ORC JNI bridge crashed on null arrow buffer
+* [ARROW-8610](https://issues.apache.org/jira/browse/ARROW-8610) - [Rust] DivideByZero when running arrow crate when simd feature is disabled
+* [ARROW-8613](https://issues.apache.org/jira/browse/ARROW-8613) - [C++][Dataset] Raise error for unparsable partition value
+* [ARROW-8615](https://issues.apache.org/jira/browse/ARROW-8615) - [R] Improve the error and insist on RandomAccessFile in read\_feather
+* [ARROW-8617](https://issues.apache.org/jira/browse/ARROW-8617) - [Rust] simd\_load\_set\_invalid does not exist on aarch64
+* [ARROW-8632](https://issues.apache.org/jira/browse/ARROW-8632) - [C++] Fix conversion error warning in array\_union\_test.cc
+* [ARROW-8641](https://issues.apache.org/jira/browse/ARROW-8641) - [Python] Regression in feather: no longer supports permutation in column selection
+* [ARROW-8643](https://issues.apache.org/jira/browse/ARROW-8643) - [Python] Tests with pandas master failing due to freq assertion
+* [ARROW-8644](https://issues.apache.org/jira/browse/ARROW-8644) - [Python] Dask integration tests failing due to change in not including partition columns
+* [ARROW-8646](https://issues.apache.org/jira/browse/ARROW-8646) - [Java] Allow UnionListWriter to write null values
+* [ARROW-8649](https://issues.apache.org/jira/browse/ARROW-8649) - [Java] [Website] Java documentation on website is hidden
+* [ARROW-8657](https://issues.apache.org/jira/browse/ARROW-8657) - [Python][C++][Parquet] Forward compatibility issue from 0.16 to 0.17 when using version='2.0'
+* [ARROW-8663](https://issues.apache.org/jira/browse/ARROW-8663) - [Documentation] Small correction to building.rst
+* [ARROW-8680](https://issues.apache.org/jira/browse/ARROW-8680) - [Rust] ComplexObjectArrayReader incorrect null value shuffling
+* [ARROW-8684](https://issues.apache.org/jira/browse/ARROW-8684) - [Python] "SystemError: Bad call flags in \_PyMethodDef\_RawFastCallDict" in Python 3.7.7 on macOS when using pyarrow wheel
+* [ARROW-8689](https://issues.apache.org/jira/browse/ARROW-8689) - [C++] S3 benchmarks fail linking
+* [ARROW-8693](https://issues.apache.org/jira/browse/ARROW-8693) - [Python] Dataset.get\_fragments is missing an implicit cast when filtering
+* [ARROW-8694](https://issues.apache.org/jira/browse/ARROW-8694) - [Python][Parquet] parquet.read\_schema() fails when loading wide table created from Pandas DataFrame
+* [ARROW-8701](https://issues.apache.org/jira/browse/ARROW-8701) - [Rust] Unresolved import \`crate::compute::util::simd\_load\_set\_invalid\` on Raspberry Pi
+* [ARROW-8704](https://issues.apache.org/jira/browse/ARROW-8704) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz)
+* [ARROW-8705](https://issues.apache.org/jira/browse/ARROW-8705) - [Java] ComplexCopier is skipping null values
+* [ARROW-8706](https://issues.apache.org/jira/browse/ARROW-8706) - [C++][Parquet] Tracking JIRA for PARQUET-1857 (unencrypted INT16\_MAX Parquet row group limit)
+* [ARROW-8710](https://issues.apache.org/jira/browse/ARROW-8710) - [Rust] Continuation marker not written correctly in IPC writer, and stream not flushed
+* [ARROW-8722](https://issues.apache.org/jira/browse/ARROW-8722) - [Dev] "archery docker run -e" doesn't work
+* [ARROW-8726](https://issues.apache.org/jira/browse/ARROW-8726) - [C++][Dataset] Mis-specified DirectoryPartitioning incorrectly uses the file name as value
+* [ARROW-8728](https://issues.apache.org/jira/browse/ARROW-8728) - [C++] Bitmap operation may cause buffer overflow
+* [ARROW-8729](https://issues.apache.org/jira/browse/ARROW-8729) - [C++][Dataset] Only selecting a partition column results in empty table
+* [ARROW-8734](https://issues.apache.org/jira/browse/ARROW-8734) - [R] improve nightly build installation
+* [ARROW-8741](https://issues.apache.org/jira/browse/ARROW-8741) - [Python][Packaging] Keep VS2015 for the Windows wheels
+* [ARROW-8750](https://issues.apache.org/jira/browse/ARROW-8750) - [Python] pyarrow.feather.write\_feather does not default to lz4 compression if it's available
+* [ARROW-8768](https://issues.apache.org/jira/browse/ARROW-8768) - [R][CI] Fix nightly as-cran spurious failure
+* [ARROW-8775](https://issues.apache.org/jira/browse/ARROW-8775) - [C++][FlightRPC] Integration client doesn't run integration tests
+* [ARROW-8776](https://issues.apache.org/jira/browse/ARROW-8776) - [FlightRPC][C++] Flight/C++ middleware don't receive headers on failed calls to Java servers
+* [ARROW-8798](https://issues.apache.org/jira/browse/ARROW-8798) - [C++] Fix Parquet crashes on invalid input (OSS-Fuzz)
+* [ARROW-8799](https://issues.apache.org/jira/browse/ARROW-8799) - [C++][Dataset] Reading list column as nested dictionary segfaults
+* [ARROW-8801](https://issues.apache.org/jira/browse/ARROW-8801) - [Python] Memory leak on read from parquet file with UTC timestamps using pandas
+* [ARROW-8802](https://issues.apache.org/jira/browse/ARROW-8802) - [C++][Dataset] Schema metadata are lost when reading a subset of columns
+* [ARROW-8803](https://issues.apache.org/jira/browse/ARROW-8803) - [Java] Row count should be set before loading buffers in VectorLoader
+* [ARROW-8808](https://issues.apache.org/jira/browse/ARROW-8808) - [Rust] Divide by zero in arrays/builder.rs
+* [ARROW-8809](https://issues.apache.org/jira/browse/ARROW-8809) - [Rust] schema mismatch in integration test
+* [ARROW-8811](https://issues.apache.org/jira/browse/ARROW-8811) - [Java] Fix build on master
+* [ARROW-8820](https://issues.apache.org/jira/browse/ARROW-8820) - [C++][Gandiva] fix date\_trunc functions to return date types
+* [ARROW-8821](https://issues.apache.org/jira/browse/ARROW-8821) - [Rust] nested binary expression with Like, NotLike and Not operator results in type cast error
+* [ARROW-8825](https://issues.apache.org/jira/browse/ARROW-8825) - [C++] Compilation fails with -Wunused-parameter flag
+* [ARROW-8826](https://issues.apache.org/jira/browse/ARROW-8826) - [Crossbow] remote URL should always have .git
+* [ARROW-8832](https://issues.apache.org/jira/browse/ARROW-8832) - [Python] AttributeError: module 'pyarrow.fs' has no attribute 'S3FileSystem'
+* [ARROW-8848](https://issues.apache.org/jira/browse/ARROW-8848) - [CI][C/Glib] MinGW build error
+* [ARROW-8858](https://issues.apache.org/jira/browse/ARROW-8858) - [FlightRPC] Ensure headers are uniformly exposed
+* [ARROW-8860](https://issues.apache.org/jira/browse/ARROW-8860) - [C++] IPC/Feather decompression broken for nested arrays
+* [ARROW-8862](https://issues.apache.org/jira/browse/ARROW-8862) - [C++] NumericBuilder does not use MemoryPool passed to CTOR
+* [ARROW-8863](https://issues.apache.org/jira/browse/ARROW-8863) - [C++] Array subclass constructors must set ArrayData::null\_count to 0 when there is no validity bitmap
+* [ARROW-8869](https://issues.apache.org/jira/browse/ARROW-8869) - [Rust] [DataFusion] Type Coercion optimizer rule does not support new scan nodes
+* [ARROW-8871](https://issues.apache.org/jira/browse/ARROW-8871) - [C++] Gandiva build failure
+* [ARROW-8872](https://issues.apache.org/jira/browse/ARROW-8872) - [CI] Travis-CI jobs fail (can't open file 'ci/detect-changes.py')
+* [ARROW-8874](https://issues.apache.org/jira/browse/ARROW-8874) - [C++][Dataset] Scanner::ToTable race when ScanTask exit early with an error
+* [ARROW-8878](https://issues.apache.org/jira/browse/ARROW-8878) - [R] try\_download is confused when download.file.method isn't default
+* [ARROW-8882](https://issues.apache.org/jira/browse/ARROW-8882) - [C\#] Add .editorconfig to C\# code
+* [ARROW-8888](https://issues.apache.org/jira/browse/ARROW-8888) - [Python] Heuristic in dataframe\_to\_arrays that decides whether to multithread conversion causes slow conversions
+* [ARROW-8889](https://issues.apache.org/jira/browse/ARROW-8889) - [Python] Python 3.7 SIGSEGV when comparing RecordBatch to None
+* [ARROW-8892](https://issues.apache.org/jira/browse/ARROW-8892) - [C++][CI] CI builds for MSVC do not build benchmarks
+* [ARROW-8909](https://issues.apache.org/jira/browse/ARROW-8909) - [Java] Out of order writes using setSafe
+* [ARROW-8911](https://issues.apache.org/jira/browse/ARROW-8911) - [C++] Slicing a ChunkedArray with zero chunks segfaults
+* [ARROW-8924](https://issues.apache.org/jira/browse/ARROW-8924) - [C++][Gandiva] castDATE\_date32() may cause overflow
+* [ARROW-8925](https://issues.apache.org/jira/browse/ARROW-8925) - [Rust] [DataFusion] CsvExec::schema() returns incorrect results
+* [ARROW-8930](https://issues.apache.org/jira/browse/ARROW-8930) - [C++] libz.so linking error with liborc.a
+* [ARROW-8932](https://issues.apache.org/jira/browse/ARROW-8932) - [C++] symbol resolution failures with liborc.a
+* [ARROW-8946](https://issues.apache.org/jira/browse/ARROW-8946) - [Python] Add tests for parquet.write\_metadata metadata\_collector
+* [ARROW-8948](https://issues.apache.org/jira/browse/ARROW-8948) - [Java][Integration] enable duplicate field names integration tests
+* [ARROW-8951](https://issues.apache.org/jira/browse/ARROW-8951) - [C++] Fix compiler warning in compute/kernels/scalar\_cast\_temporal.cc
+* [ARROW-8954](https://issues.apache.org/jira/browse/ARROW-8954) - [Website] ca-certificates should be listed in installation instructions
+* [ARROW-8957](https://issues.apache.org/jira/browse/ARROW-8957) - [FlightRPC][C++] Fails to build due to IpcOptions
+* [ARROW-8959](https://issues.apache.org/jira/browse/ARROW-8959) - [Rust] Broken build due to new benchmark crate using old API
+* [ARROW-8962](https://issues.apache.org/jira/browse/ARROW-8962) - [C++] Linking failure with clang-4.0
+* [ARROW-8968](https://issues.apache.org/jira/browse/ARROW-8968) - [C++][Gandiva] Show link warning message on s390x
+* [ARROW-8975](https://issues.apache.org/jira/browse/ARROW-8975) - [FlightRPC][C++] Fix flaky MacOS tests
+* [ARROW-8977](https://issues.apache.org/jira/browse/ARROW-8977) - [R] Table$create with schema crashes with some dictionary index types
+* [ARROW-8978](https://issues.apache.org/jira/browse/ARROW-8978) - [C++][Compute] "Conditional jump or move depends on uninitialised value(s)" Valgrind warning
+* [ARROW-8980](https://issues.apache.org/jira/browse/ARROW-8980) - [Python] Metadata grows exponentially when using schema from disk
+* [ARROW-8982](https://issues.apache.org/jira/browse/ARROW-8982) - [CI] Remove allow\_failures for s390x in TravisCI
+* [ARROW-8986](https://issues.apache.org/jira/browse/ARROW-8986) - [Archery][ursabot] Fix benchmark diff checkout of origin/master
+* [ARROW-9000](https://issues.apache.org/jira/browse/ARROW-9000) - [Java] build crashes with JDK14
+* [ARROW-9009](https://issues.apache.org/jira/browse/ARROW-9009) - [C++][Dataset] ARROW:schema should be removed from schema's metadata when reading Parquet files
+* [ARROW-9013](https://issues.apache.org/jira/browse/ARROW-9013) - [C++] Validate enum-style CMake options
+* [ARROW-9020](https://issues.apache.org/jira/browse/ARROW-9020) - [Python] read\_json won't respect explicit\_schema in parse\_options
+* [ARROW-9024](https://issues.apache.org/jira/browse/ARROW-9024) - [C++/Python] Install anaconda-client in conda-clean job
+* [ARROW-9026](https://issues.apache.org/jira/browse/ARROW-9026) - [C++/Python] Force package removal from arrow-nightlies conda repository
+* [ARROW-9037](https://issues.apache.org/jira/browse/ARROW-9037) - [C++][C] Unable to import array with null count == -1 (which could be exported)
+* [ARROW-9057](https://issues.apache.org/jira/browse/ARROW-9057) - [Rust] Projection should work on InMemoryScan without error
+* [ARROW-9059](https://issues.apache.org/jira/browse/ARROW-9059) - [Rust] Documentation for slicing array data has the wrong sign
+* [ARROW-9066](https://issues.apache.org/jira/browse/ARROW-9066) - [Python] Raise correct error in isnull()
+* [ARROW-9071](https://issues.apache.org/jira/browse/ARROW-9071) - [C++] MakeArrayOfNull makes invalid ListArray
+* [ARROW-9077](https://issues.apache.org/jira/browse/ARROW-9077) - [C++] Fix aggregate/scalar-compare benchmark null\_percent calculation
+* [ARROW-9080](https://issues.apache.org/jira/browse/ARROW-9080) - [C++] arrow::AllocateBuffer returns a Result<unique\_ptr<Buffer\>\>
+* [ARROW-9082](https://issues.apache.org/jira/browse/ARROW-9082) - [Rust] Stream reader fails when stream is not ended with (optional) 0xFFFFFFFF 0x00000000
+* [ARROW-9084](https://issues.apache.org/jira/browse/ARROW-9084) - [C++] CMake is unable to find zstd target when ZSTD\_SOURCE=SYSTEM
+* [ARROW-9085](https://issues.apache.org/jira/browse/ARROW-9085) - [C++][CI] Appveyor CI test failures
+* [ARROW-9087](https://issues.apache.org/jira/browse/ARROW-9087) - [C++] Missing HDFS options parsing
+* [ARROW-9098](https://issues.apache.org/jira/browse/ARROW-9098) - RecordBatch::ToStructArray cannot handle record batches with 0 columns
+* [ARROW-9105](https://issues.apache.org/jira/browse/ARROW-9105) - [C++] ParquetFileFragment scanning doesn't handle filter on partition field
+* [ARROW-9120](https://issues.apache.org/jira/browse/ARROW-9120) - [C++] Lint and Format C++ files with "codegen" in file name
+* [ARROW-9121](https://issues.apache.org/jira/browse/ARROW-9121) - [C++] Do not wipe the filesystem when path is empty
+* [ARROW-9122](https://issues.apache.org/jira/browse/ARROW-9122) - [C++] Adapt ascii\_lower/ascii\_upper bulk transforms to work on sliced arrays
+* [ARROW-9126](https://issues.apache.org/jira/browse/ARROW-9126) - [C++] Trimmed Boost bundle fails to build on Windows
+* [ARROW-9127](https://issues.apache.org/jira/browse/ARROW-9127) - [Rust] Update thrift library dependencies
+* [ARROW-9134](https://issues.apache.org/jira/browse/ARROW-9134) - [Python] Parquet partitioning degrades Int32 to float64
+* [ARROW-9141](https://issues.apache.org/jira/browse/ARROW-9141) - [R] Update cross-package documentation links
+* [ARROW-9142](https://issues.apache.org/jira/browse/ARROW-9142) - [C++] random::RandomArrayGenerator::Boolean "probability" misdocumented / incorrect
+* [ARROW-9143](https://issues.apache.org/jira/browse/ARROW-9143) - [C++] RecordBatch::Slice erroneously sets non-nullable field's internal null\_count to unknown
+* [ARROW-9146](https://issues.apache.org/jira/browse/ARROW-9146) - [C++][Dataset] Scanning a Fragment with a filter + mismatching schema shouldn't abort
+* [ARROW-9151](https://issues.apache.org/jira/browse/ARROW-9151) - [R][CI] Fix Rtools 4.0 build: pacman sync
+* [ARROW-9160](https://issues.apache.org/jira/browse/ARROW-9160) - [C++] Implement string/binary contains for exact matches
+* [ARROW-9174](https://issues.apache.org/jira/browse/ARROW-9174) - [Go] Tests panic with 386 (x86) builds
+* [ARROW-9183](https://issues.apache.org/jira/browse/ARROW-9183) - [C++] Failed to build arrow-cpp with gcc 4.9.2
+* [ARROW-9184](https://issues.apache.org/jira/browse/ARROW-9184) - [Rust][Datafusion] table scan without projection should return all columns
+* [ARROW-9194](https://issues.apache.org/jira/browse/ARROW-9194) - [C++] Array::GetScalar not implemented for decimal type
+* [ARROW-9195](https://issues.apache.org/jira/browse/ARROW-9195) - [Java] Wrong usage of Unsafe.get from bytearray in ByteFunctionsHelper class
+* [ARROW-9209](https://issues.apache.org/jira/browse/ARROW-9209) - [C++] Benchmarks fail to build ARROW\_IPC=OFF and ARROW\_BUILD\_TESTS=OFF
+* [ARROW-9219](https://issues.apache.org/jira/browse/ARROW-9219) - [R] coerce\_timestamps in Parquet write options does not work
+* [ARROW-9221](https://issues.apache.org/jira/browse/ARROW-9221) - ArrowBuf\#setBytes(int, ByteBuffer) doesn't check the byte buffer's endianness
+* [ARROW-9223](https://issues.apache.org/jira/browse/ARROW-9223) - [Python] Fix to\_pandas() export for timestamps within structs
+* [ARROW-9230](https://issues.apache.org/jira/browse/ARROW-9230) - [FlightRPC][Python] flight.connect() doesn't pass through all arguments
+* [ARROW-9233](https://issues.apache.org/jira/browse/ARROW-9233) - [C++] is\_null on NullArray should be true for all values
+* [ARROW-9236](https://issues.apache.org/jira/browse/ARROW-9236) - [Rust] CSV WriterBuilder never writes header
+* [ARROW-9237](https://issues.apache.org/jira/browse/ARROW-9237) - [R] 0.17 install on Arch Linux
+* [ARROW-9238](https://issues.apache.org/jira/browse/ARROW-9238) - [C++][CI] Add test coverage for round-robin in IPC and Flight
+* [ARROW-9252](https://issues.apache.org/jira/browse/ARROW-9252) - [Integration] GitHub Actions integration test job does not test against "gold" 0.14.1 files in apache/arrow-testing
+* [ARROW-9260](https://issues.apache.org/jira/browse/ARROW-9260) - [CI] "ARM64v8 Ubuntu 20.04 C++" fails
+* [ARROW-9261](https://issues.apache.org/jira/browse/ARROW-9261) - [Python][Packaging] S3FileSystem curl errors in manylinux wheels
+* [ARROW-9274](https://issues.apache.org/jira/browse/ARROW-9274) - [Rust] [Integration Testing] Read i64 from json files as strings
+* [ARROW-9282](https://issues.apache.org/jira/browse/ARROW-9282) - [R] Remove usage of \_EXTPTR\_PTR
+* [ARROW-9284](https://issues.apache.org/jira/browse/ARROW-9284) - [Java] getMinorTypeForArrowType returns sparse minor type for dense union types
+* [ARROW-9288](https://issues.apache.org/jira/browse/ARROW-9288) - [C++][Dataset] Discovery of partition field as dictionary type segfaulting with HivePartitioning
+* [ARROW-9297](https://issues.apache.org/jira/browse/ARROW-9297) - [C++][Dataset] Dataset scanner cannot handle large binary column (\> 2 GB)
+* [ARROW-9298](https://issues.apache.org/jira/browse/ARROW-9298) - [C++] Fix crashes on invalid input (OSS-Fuzz)
+* [ARROW-9303](https://issues.apache.org/jira/browse/ARROW-9303) - [R] Linux static build should always bundle dependencies
+* [ARROW-9305](https://issues.apache.org/jira/browse/ARROW-9305) - [Python] Dependency load failure in Windows wheel build
+* [ARROW-9315](https://issues.apache.org/jira/browse/ARROW-9315) - [Java] Fix the failure of testAllocationManagerType
+* [ARROW-9317](https://issues.apache.org/jira/browse/ARROW-9317) - [Java] Add a few test cases for arrow-memory
+* [ARROW-9326](https://issues.apache.org/jira/browse/ARROW-9326) - [Python] Setuptools 49.1.0 appears to break our Python 3.6 builds
+* [ARROW-9330](https://issues.apache.org/jira/browse/ARROW-9330) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz)
+* [ARROW-9334](https://issues.apache.org/jira/browse/ARROW-9334) - [Dev][Archery] Push ancestor docker images
+* [ARROW-9336](https://issues.apache.org/jira/browse/ARROW-9336) - [Ruby] Creating RecordBatch with structs missing keys results in a malformed table
+* [ARROW-9343](https://issues.apache.org/jira/browse/ARROW-9343) - [C++][Gandiva] CastINT/Float functions from string should handle leading/trailing white spaces
+* [ARROW-9347](https://issues.apache.org/jira/browse/ARROW-9347) - [Python] Tests fail with latest fsspec
+* [ARROW-9350](https://issues.apache.org/jira/browse/ARROW-9350) - [C++][CI] Nightly valgrind job failures
+* [ARROW-9351](https://issues.apache.org/jira/browse/ARROW-9351) - [C++][CI] Nightly test-ubuntu-18.04-cpp-cmake32 fails
+* [ARROW-9353](https://issues.apache.org/jira/browse/ARROW-9353) - [Python][CI] Nightly dask integration jobs fail
+* [ARROW-9354](https://issues.apache.org/jira/browse/ARROW-9354) - [C++] Turbodbc latest fails to build in the integration tests
+* [ARROW-9355](https://issues.apache.org/jira/browse/ARROW-9355) - [R] Fix -Wimplicit-int-float-conversion
+* [ARROW-9360](https://issues.apache.org/jira/browse/ARROW-9360) - [CI][Crossbow] Nightly homebrew-cpp job times out
+* [ARROW-9363](https://issues.apache.org/jira/browse/ARROW-9363) - [C++][Dataset] ParquetDatasetFactory schema: pandas metadata is lost
+* [ARROW-9368](https://issues.apache.org/jira/browse/ARROW-9368) - [Python] Rename predicate argument to filter in split\_by\_row\_group()
+* [ARROW-9373](https://issues.apache.org/jira/browse/ARROW-9373) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz)
+* [ARROW-9380](https://issues.apache.org/jira/browse/ARROW-9380) - [C++] Segfaults in compute::CallFunction
+* [ARROW-9384](https://issues.apache.org/jira/browse/ARROW-9384) - [C++] Out-of-memory on invalid IPC input (OSS-Fuzz)
+* [ARROW-9385](https://issues.apache.org/jira/browse/ARROW-9385) - [Python] [CI] jpype integration failure
+* [ARROW-9389](https://issues.apache.org/jira/browse/ARROW-9389) - [C++] Can't call isin/match through CallFunction
+* [ARROW-9397](https://issues.apache.org/jira/browse/ARROW-9397) - [R] Pass CC/CXX to cmake when building libarrow in Linux build
+* [ARROW-9408](https://issues.apache.org/jira/browse/ARROW-9408) - [Integration] Tests do not run in Windows due to numpy 64-bit errors
+* [ARROW-9409](https://issues.apache.org/jira/browse/ARROW-9409) - [CI][Crossbow] Nightly conda-r fails
+* [ARROW-9410](https://issues.apache.org/jira/browse/ARROW-9410) - [CI][Crossbow] Fix homebrew-cpp again
+* [ARROW-9413](https://issues.apache.org/jira/browse/ARROW-9413) - [Rust] Fix clippy lint on master
+* [ARROW-9415](https://issues.apache.org/jira/browse/ARROW-9415) - [C++] Arrow does not compile on Power9
+* [ARROW-9416](https://issues.apache.org/jira/browse/ARROW-9416) - [Go] Add test cases for some datatypes
+* [ARROW-9417](https://issues.apache.org/jira/browse/ARROW-9417) - [C++][IPC] Size in message is written in native endianness
+* [ARROW-9418](https://issues.apache.org/jira/browse/ARROW-9418) - [R] nyc-taxi Parquet files not downloaded in binary mode on Windows
+* [ARROW-9419](https://issues.apache.org/jira/browse/ARROW-9419) - [C++] Test that "fill\_null" function works with sliced inputs, expand tests
+* [ARROW-9428](https://issues.apache.org/jira/browse/ARROW-9428) - [C++] Update documentation for buffer allocation functions
+* [ARROW-9436](https://issues.apache.org/jira/browse/ARROW-9436) - [C++][CI] Valgrind errors in fill\_null kernel tests
+* [ARROW-9438](https://issues.apache.org/jira/browse/ARROW-9438) - [CI] Spark integration tests are failing
+* [ARROW-9439](https://issues.apache.org/jira/browse/ARROW-9439) - [C++] Fix crash on invalid IPC input (OSS-Fuzz)
+* [ARROW-9440](https://issues.apache.org/jira/browse/ARROW-9440) - [Python] Expose Fill Null Compute Kernel in PyArrow (see the sketch after this list)
+* [ARROW-9443](https://issues.apache.org/jira/browse/ARROW-9443) - [C++] Bundled bz2 build should only build libbz2
+* [ARROW-9448](https://issues.apache.org/jira/browse/ARROW-9448) - [Java] Circular initialization between ArrowBuf and BaseAllocator leads to null HistoricalLog for empty buffer
+* [ARROW-9449](https://issues.apache.org/jira/browse/ARROW-9449) - [R] Strip arrow.so
+* [ARROW-9450](https://issues.apache.org/jira/browse/ARROW-9450) - [Python] "pytest pyarrow" takes over 10 seconds to collect tests and start executing
+* [ARROW-9456](https://issues.apache.org/jira/browse/ARROW-9456) - [Python] Dataset segfault when not importing pyarrow.parquet
+* [ARROW-9458](https://issues.apache.org/jira/browse/ARROW-9458) - [Python] Dataset Scanner is single-threaded only
+* [ARROW-9460](https://issues.apache.org/jira/browse/ARROW-9460) - [C++] BinaryContainsExact doesn't cope with double characters in the pattern
+* [ARROW-9461](https://issues.apache.org/jira/browse/ARROW-9461) - [Rust] Reading Date32 and Date64 errors - they are incorrectly converted to RecordBatch
+* [ARROW-9476](https://issues.apache.org/jira/browse/ARROW-9476) - [C++][Dataset] HivePartitioning discovery with dictionary types fails for multiple fields
+* [ARROW-9486](https://issues.apache.org/jira/browse/ARROW-9486) - [C++][Dataset] Support implicit casting InExpression::set\_ to dict
+* [ARROW-9497](https://issues.apache.org/jira/browse/ARROW-9497) - [C++][Parquet] Fix failure caused by malformed repetition/definition levels
+* [ARROW-9499](https://issues.apache.org/jira/browse/ARROW-9499) - [C++] AdaptiveIntBuilder::AppendNull does not increment the null count
+* [ARROW-9500](https://issues.apache.org/jira/browse/ARROW-9500) - [C++] Fix segfault with std::to\_string in -O3 builds on gcc 7.5.0
+* [ARROW-9501](https://issues.apache.org/jira/browse/ARROW-9501) - [C++][Gandiva] Add logic in timestampdiff() when end date is last day of a month
+* [ARROW-9503](https://issues.apache.org/jira/browse/ARROW-9503) - [Rust] Comparison of sliced arrays is wrong
+* [ARROW-9504](https://issues.apache.org/jira/browse/ARROW-9504) - [Python] Segmentation fault on ChunkedArray.take
+* [ARROW-9506](https://issues.apache.org/jira/browse/ARROW-9506) - [Packaging][Python] Fix macOS wheel build failures
+* [ARROW-9512](https://issues.apache.org/jira/browse/ARROW-9512) - [C++] Variadic template unpack inside lambda doesn't compile with gcc
+* [ARROW-9524](https://issues.apache.org/jira/browse/ARROW-9524) - [CI][Gandiva] C++ unit test arrow-ipc-read-write failing in gandiva nightly build
+* [ARROW-9527](https://issues.apache.org/jira/browse/ARROW-9527) - [Rust] Remove un-needed dev-dependencies
+* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow
+* [PARQUET-1839](https://issues.apache.org/jira/browse/PARQUET-1839) - [C++] values\_read not updated in ReadBatchSpaced
+* [PARQUET-1857](https://issues.apache.org/jira/browse/PARQUET-1857) - [C++][Parquet] ParquetFileReader unable to read files with more than 32767 row groups
+* [PARQUET-1865](https://issues.apache.org/jira/browse/PARQUET-1865) - [C++] Failure from C++17 feature used in parquet/encoding\_benchmark.cc
+* [PARQUET-1877](https://issues.apache.org/jira/browse/PARQUET-1877) - [C++] Reconcile container size with string size for memory issues
+* [PARQUET-1882](https://issues.apache.org/jira/browse/PARQUET-1882) - [C++] Writing an all-null column and then reading it with buffered\_stream aborts the process
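+
+As a concrete illustration of one of the fixes above, the sketch below shows the fill\_null compute kernel exposed in PyArrow (ARROW-9440); it is a minimal usage sketch assuming pyarrow 1.0.0 or later, not code taken from the release itself.
+
+```python
+# Hedged sketch of ARROW-9440: pyarrow.compute.fill_null replaces null slots
+# with a given scalar (assumes pyarrow >= 1.0.0).
+import pyarrow as pa
+import pyarrow.compute as pc
+
+arr = pa.array([1, None, 3, None], type=pa.int64())
+filled = pc.fill_null(arr, pa.scalar(0, type=pa.int64()))
+
+print(filled)  # -> [1, 0, 3, 0]
+```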
+
+
+## New Features and Improvements
+
+* [ARROW-300](https://issues.apache.org/jira/browse/ARROW-300) - [Format] Add body buffer compression option to IPC message protocol using LZ4 or ZSTD
+* [ARROW-842](https://issues.apache.org/jira/browse/ARROW-842) - [Python] Handle more kinds of null sentinel objects from pandas 0.x
+* [ARROW-971](https://issues.apache.org/jira/browse/ARROW-971) - [C++/Python] Implement Array.isvalid/notnull/isnull as scalar functions
+* [ARROW-974](https://issues.apache.org/jira/browse/ARROW-974) - [Website] Add Use Cases section to the website
+* [ARROW-1277](https://issues.apache.org/jira/browse/ARROW-1277) - Completing integration tests for major implemented data types
+* [ARROW-1567](https://issues.apache.org/jira/browse/ARROW-1567) - [C++] Implement "fill null" kernels that replace null values with some scalar replacement value
+* [ARROW-1570](https://issues.apache.org/jira/browse/ARROW-1570) - [C++] Define API for creating a kernel instance from function of scalar input and output with a particular signature
+* [ARROW-1682](https://issues.apache.org/jira/browse/ARROW-1682) - [Python] Add documentation / example for reading a directory of Parquet files on S3
+* [ARROW-1796](https://issues.apache.org/jira/browse/ARROW-1796) - [Python] RowGroup filtering on file level
+* [ARROW-2260](https://issues.apache.org/jira/browse/ARROW-2260) - [C++][Plasma] plasma\_store should show usage
+* [ARROW-2444](https://issues.apache.org/jira/browse/ARROW-2444) - [Python][C++] Better handle reading empty parquet files
+* [ARROW-2702](https://issues.apache.org/jira/browse/ARROW-2702) - [Python] Examine usages of Invalid and TypeError errors in numpy\_to\_arrow.cc to see if we are using the right error type in each instance
+* [ARROW-2714](https://issues.apache.org/jira/browse/ARROW-2714) - [C++/Python] Variable step size slicing for arrays
+* [ARROW-2912](https://issues.apache.org/jira/browse/ARROW-2912) - [Website] Build more detailed Community landing page a la Apache Spark
+* [ARROW-3089](https://issues.apache.org/jira/browse/ARROW-3089) - [Rust] Add ArrayBuilder for different Arrow arrays
+* [ARROW-3134](https://issues.apache.org/jira/browse/ARROW-3134) - [C++] Implement n-ary iterator for a collection of chunked arrays with possibly different chunking layouts
+* [ARROW-3154](https://issues.apache.org/jira/browse/ARROW-3154) - [Python][C++] Document how to write \_metadata, \_common\_metadata files with Parquet datasets
+* [ARROW-3244](https://issues.apache.org/jira/browse/ARROW-3244) - [Python] Multi-file parquet loading without scan
+* [ARROW-3275](https://issues.apache.org/jira/browse/ARROW-3275) - [Python] Add documentation about inspecting Parquet file metadata
+* [ARROW-3308](https://issues.apache.org/jira/browse/ARROW-3308) - [R] Convert R character vector with data exceeding 2GB to Large type
+* [ARROW-3317](https://issues.apache.org/jira/browse/ARROW-3317) - [R] Test/support conversions from data.frame with a single character column exceeding 2GB capacity of BinaryArray
+* [ARROW-3446](https://issues.apache.org/jira/browse/ARROW-3446) - [R] Document mapping of Arrow <-\> R types
+* [ARROW-3509](https://issues.apache.org/jira/browse/ARROW-3509) - [C++] Inconsistent child accessor naming
+* [ARROW-3520](https://issues.apache.org/jira/browse/ARROW-3520) - [C++] Implement List Flatten kernel
+* [ARROW-3688](https://issues.apache.org/jira/browse/ARROW-3688) - [Rust] Implement PrimitiveArrayBuilder<T\>.push\_values
+* [ARROW-3827](https://issues.apache.org/jira/browse/ARROW-3827) - [Rust] Implement UnionArray
+* [ARROW-4022](https://issues.apache.org/jira/browse/ARROW-4022) - [C++] Promote Datum variant out of compute namespace
+* [ARROW-4221](https://issues.apache.org/jira/browse/ARROW-4221) - [Format] Add canonical flag in COO sparse index
+* [ARROW-4390](https://issues.apache.org/jira/browse/ARROW-4390) - [R] Serialize "labeled" metadata in Feather files, IPC messages
+* [ARROW-4412](https://issues.apache.org/jira/browse/ARROW-4412) - [DOCUMENTATION] Add explicit version numbers to the arrow specification documents.
+* [ARROW-4427](https://issues.apache.org/jira/browse/ARROW-4427) - [Doc] Move Confluence Wiki pages to the Sphinx docs
+* [ARROW-4429](https://issues.apache.org/jira/browse/ARROW-4429) - [Doc] Add git rebase tips to the 'Contributing' page in the developer docs
+* [ARROW-5035](https://issues.apache.org/jira/browse/ARROW-5035) - [C\#] ArrowBuffer.Builder<bool\> is broken
+* [ARROW-5082](https://issues.apache.org/jira/browse/ARROW-5082) - [Python][Packaging] Reduce size of macOS and manylinux1 wheels
+* [ARROW-5143](https://issues.apache.org/jira/browse/ARROW-5143) - [Flight] Enable integration testing of batches with dictionaries
+* [ARROW-5279](https://issues.apache.org/jira/browse/ARROW-5279) - [C++] Support reading delta dictionaries in IPC streams
+* [ARROW-5377](https://issues.apache.org/jira/browse/ARROW-5377) - [C++] Make IpcPayload public and add GetPayloadSize
+* [ARROW-5489](https://issues.apache.org/jira/browse/ARROW-5489) - [C++] Normalize kernels and ChunkedArray behavior
+* [ARROW-5548](https://issues.apache.org/jira/browse/ARROW-5548) - [Documentation] http://arrow.apache.org/docs/latest/ is not latest
+* [ARROW-5649](https://issues.apache.org/jira/browse/ARROW-5649) - [Integration][C++] Create round trip integration test for extension types
+* [ARROW-5708](https://issues.apache.org/jira/browse/ARROW-5708) - [C\#] Null support for BooleanArray
+* [ARROW-5760](https://issues.apache.org/jira/browse/ARROW-5760) - [C++] Optimize Take implementation
+* [ARROW-5854](https://issues.apache.org/jira/browse/ARROW-5854) - [Python] Expose compare kernels on Array class
+* [ARROW-6052](https://issues.apache.org/jira/browse/ARROW-6052) - [C++] Divide up arrow/array.h,cc into files in arrow/array/ similar to builder files
+* [ARROW-6110](https://issues.apache.org/jira/browse/ARROW-6110) - [Java] Support LargeList Type and add integration test with C++
+* [ARROW-6111](https://issues.apache.org/jira/browse/ARROW-6111) - [Java] Support LargeVarChar and LargeBinary types and add integration test with C++
+* [ARROW-6439](https://issues.apache.org/jira/browse/ARROW-6439) - [R] Implement S3 file-system interface in R
+* [ARROW-6456](https://issues.apache.org/jira/browse/ARROW-6456) - [C++] Possible to reduce object code generated in compute/kernels/take.cc?
+* [ARROW-6501](https://issues.apache.org/jira/browse/ARROW-6501) - [C++] Remove non\_zero\_length field from SparseIndex
+* [ARROW-6521](https://issues.apache.org/jira/browse/ARROW-6521) - [C++] Add function to arrow:: namespace that returns the current ABI version
+* [ARROW-6543](https://issues.apache.org/jira/browse/ARROW-6543) - [R] Support LargeBinary and LargeString types
+* [ARROW-6602](https://issues.apache.org/jira/browse/ARROW-6602) - [Doc] Add feature / implementation matrix
+* [ARROW-6603](https://issues.apache.org/jira/browse/ARROW-6603) - [C\#] ArrayBuilder API to support writing nulls
+* [ARROW-6645](https://issues.apache.org/jira/browse/ARROW-6645) - [Python] Faster boundschecking of dictionary indices when converting to Categorical
+* [ARROW-6689](https://issues.apache.org/jira/browse/ARROW-6689) - [Rust] [DataFusion] Query execution enhancements for 1.0.0 release
+* [ARROW-6691](https://issues.apache.org/jira/browse/ARROW-6691) - [Rust] [DataFusion] Use tokio and Futures instead of spawning threads
+* [ARROW-6775](https://issues.apache.org/jira/browse/ARROW-6775) - [C++] [Python] Proposal for several Array utility functions
+* [ARROW-6776](https://issues.apache.org/jira/browse/ARROW-6776) - [Python] Need a lite version of pyarrow
+* [ARROW-6800](https://issues.apache.org/jira/browse/ARROW-6800) - [C++] Add CMake option to build libraries targeting a C++14 or C++17 toolchain environment
+* [ARROW-6839](https://issues.apache.org/jira/browse/ARROW-6839) - [Java] Add APIs to read and write "custom\_metadata" field of IPC file footer
+* [ARROW-6856](https://issues.apache.org/jira/browse/ARROW-6856) - [C++] Use ArrayData instead of Array for ArrayData::dictionary
+* [ARROW-6917](https://issues.apache.org/jira/browse/ARROW-6917) - [Archery][Release] Add support for JIRA curation, changelog generation and commit cherry-picking for maintenance releases
+* [ARROW-6945](https://issues.apache.org/jira/browse/ARROW-6945) - [Rust] Enable integration tests
+* [ARROW-6959](https://issues.apache.org/jira/browse/ARROW-6959) - [C++] Clarify what signatures are preferred for compute kernels
+* [ARROW-6978](https://issues.apache.org/jira/browse/ARROW-6978) - [R] Add bindings for sum and mean compute kernels
+* [ARROW-6979](https://issues.apache.org/jira/browse/ARROW-6979) - [R] Enable jemalloc in autobrew formula
+* [ARROW-7009](https://issues.apache.org/jira/browse/ARROW-7009) - [C++] Refactor filter/take kernels to use Datum instead of overloads
+* [ARROW-7010](https://issues.apache.org/jira/browse/ARROW-7010) - [C++] Support lossy casts from decimal128 to float32 and float64/double
+* [ARROW-7011](https://issues.apache.org/jira/browse/ARROW-7011) - [C++] Implement casts from float/double to decimal128
+* [ARROW-7012](https://issues.apache.org/jira/browse/ARROW-7012) - [C++] Clarify ChunkedArray chunking strategy and policy
+* [ARROW-7068](https://issues.apache.org/jira/browse/ARROW-7068) - [C++] Expose the offsets of a ListArray as an Int32Array
+* [ARROW-7075](https://issues.apache.org/jira/browse/ARROW-7075) - [C++] Boolean kernels should not allocate in Call()
+* [ARROW-7175](https://issues.apache.org/jira/browse/ARROW-7175) - [Website] Add a security page to track when vulnerabilities are patched
+* [ARROW-7229](https://issues.apache.org/jira/browse/ARROW-7229) - [C++] Unify ConcatenateTables APIs
+* [ARROW-7230](https://issues.apache.org/jira/browse/ARROW-7230) - [C++] Use vendored std::optional instead of boost::optional in Gandiva
+* [ARROW-7237](https://issues.apache.org/jira/browse/ARROW-7237) - [C++] Add Result<T\> to APIs to arrow/json
+* [ARROW-7243](https://issues.apache.org/jira/browse/ARROW-7243) - [Docs] Add common "implementation status" table to the README of each native language implementation, as well as top level README
+* [ARROW-7285](https://issues.apache.org/jira/browse/ARROW-7285) - [C++] Ensure C++ implementation meets clarified dictionary spec
+* [ARROW-7300](https://issues.apache.org/jira/browse/ARROW-7300) - [C++][Gandiva] Implement functions to cast from strings to integers/floats
+* [ARROW-7313](https://issues.apache.org/jira/browse/ARROW-7313) - [C++] Add function for retrieving a scalar from an array slot
+* [ARROW-7371](https://issues.apache.org/jira/browse/ARROW-7371) - [GLib] Add Datasets binding
+* [ARROW-7375](https://issues.apache.org/jira/browse/ARROW-7375) - [Python] Expose C++ MakeArrayOfNull
+* [ARROW-7391](https://issues.apache.org/jira/browse/ARROW-7391) - [Python] Remove unnecessary classes from the binding layer
+* [ARROW-7495](https://issues.apache.org/jira/browse/ARROW-7495) - [Java] Remove "empty" concept from ArrowBuf, replace with custom referencemanager
+* [ARROW-7605](https://issues.apache.org/jira/browse/ARROW-7605) - [C++] Create and install static library containing all dependencies built by Arrow
+* [ARROW-7607](https://issues.apache.org/jira/browse/ARROW-7607) - [C++] Add to cpp/examples minimal examples of using Arrow as a dependency of another CMake project
+* [ARROW-7673](https://issues.apache.org/jira/browse/ARROW-7673) - [C++][Dataset] Revisit File discovery failure mode
+* [ARROW-7676](https://issues.apache.org/jira/browse/ARROW-7676) - [Packaging][Python] Ensure that the static libraries are not built in the wheel scripts
+* [ARROW-7699](https://issues.apache.org/jira/browse/ARROW-7699) - [Java] Support concatenating dense union vectors in batch
+* [ARROW-7705](https://issues.apache.org/jira/browse/ARROW-7705) - [Rust] Initial sort implementation
+* [ARROW-7717](https://issues.apache.org/jira/browse/ARROW-7717) - [CI] Have nightly integration test for Spark's latest release
+* [ARROW-7759](https://issues.apache.org/jira/browse/ARROW-7759) - [C++][Dataset] Add CsvFileFormat for CSV support
+* [ARROW-7778](https://issues.apache.org/jira/browse/ARROW-7778) - [C++] Support nested dictionaries in JSON integration format
+* [ARROW-7784](https://issues.apache.org/jira/browse/ARROW-7784) - [C++] diff.cc is extremely slow to compile
+* [ARROW-7801](https://issues.apache.org/jira/browse/ARROW-7801) - [Developer] Add issue\_comment workflow to fix lint/style/codegen
+* [ARROW-7803](https://issues.apache.org/jira/browse/ARROW-7803) - [R][CI] Autobrew/homebrew tests should not always install from master
+* [ARROW-7831](https://issues.apache.org/jira/browse/ARROW-7831) - [Java] Unnecessary buffer allocation when calling splitAndTransferTo on variable width vectors
+* [ARROW-7902](https://issues.apache.org/jira/browse/ARROW-7902) - [Integration] Unskip nested dictionary integration tests
+* [ARROW-7910](https://issues.apache.org/jira/browse/ARROW-7910) - [C++] Provide function to query page size portably
+* [ARROW-7924](https://issues.apache.org/jira/browse/ARROW-7924) - [Rust] Add sort for float types
+* [ARROW-7950](https://issues.apache.org/jira/browse/ARROW-7950) - [Python] When initializing pandas API shim, inform user if their installed pandas version is too old
+* [ARROW-7955](https://issues.apache.org/jira/browse/ARROW-7955) - [Java] Support large buffer for file/stream IPC
+* [ARROW-8020](https://issues.apache.org/jira/browse/ARROW-8020) - [Java] Implement vector validate functionality
+* [ARROW-8023](https://issues.apache.org/jira/browse/ARROW-8023) - [Website] Write a blog post about the C data interface
+* [ARROW-8025](https://issues.apache.org/jira/browse/ARROW-8025) - [C++] Implement cast to Binary and FixedSizeBinary
+* [ARROW-8046](https://issues.apache.org/jira/browse/ARROW-8046) - [Developer][Integration] Makefile.docker's target names are broken
+* [ARROW-8062](https://issues.apache.org/jira/browse/ARROW-8062) - [C++][Dataset] Parquet Dataset factory from a \_metadata/\_common\_metadata file
+* [ARROW-8065](https://issues.apache.org/jira/browse/ARROW-8065) - [C++][Dataset] Untangle Dataset, Fragment and ScanOptions
+* [ARROW-8074](https://issues.apache.org/jira/browse/ARROW-8074) - [C++][Dataset] Support for file-like objects (buffers) in FileSystemDataset?
+* [ARROW-8108](https://issues.apache.org/jira/browse/ARROW-8108) - [Java] Extract a common interface for dictionary encoders
+* [ARROW-8111](https://issues.apache.org/jira/browse/ARROW-8111) - [C++][CSV] Support MM/DD/YYYY date format
+* [ARROW-8114](https://issues.apache.org/jira/browse/ARROW-8114) - [Java][Integration] Enable custom\_metadata integration test
+* [ARROW-8121](https://issues.apache.org/jira/browse/ARROW-8121) - [Java] Enhance code style checking for Java code (add space after commas, semi-colons and type casts)
+* [ARROW-8149](https://issues.apache.org/jira/browse/ARROW-8149) - [C++/Python] Enable CUDA Support in conda recipes
+* [ARROW-8157](https://issues.apache.org/jira/browse/ARROW-8157) - [C++][Gandiva] Support building with LLVM 9
+* [ARROW-8162](https://issues.apache.org/jira/browse/ARROW-8162) - [Format][Python] Add serialization for CSF sparse tensors
+* [ARROW-8169](https://issues.apache.org/jira/browse/ARROW-8169) - [Java] Improve the performance of JDBC adapter by allocating memory proactively
+* [ARROW-8171](https://issues.apache.org/jira/browse/ARROW-8171) - Consider pre-allocating memory for fixed-width vectors in the Avro adapter iterator
+* [ARROW-8190](https://issues.apache.org/jira/browse/ARROW-8190) - [C++][Flight] Allow setting IpcWriteOptions and IpcReadOptions in Flight IPC message reader and writer classes
+* [ARROW-8229](https://issues.apache.org/jira/browse/ARROW-8229) - [Java] Move ArrowBuf into the Arrow package
+* [ARROW-8230](https://issues.apache.org/jira/browse/ARROW-8230) - [Java] Move Netty memory manager into a separate module
+* [ARROW-8261](https://issues.apache.org/jira/browse/ARROW-8261) - [Rust] [DataFusion] LogicalPlanBuilder.limit() should take a literal argument
+* [ARROW-8263](https://issues.apache.org/jira/browse/ARROW-8263) - [Rust] [DataFusion] Add documentation for supported SQL functions
+* [ARROW-8281](https://issues.apache.org/jira/browse/ARROW-8281) - [R] Name collision of arrow.dll on Windows conda
+* [ARROW-8283](https://issues.apache.org/jira/browse/ARROW-8283) - [Python][Dataset] Non-existent files are silently dropped in pa.dataset.FileSystemDataset
+* [ARROW-8287](https://issues.apache.org/jira/browse/ARROW-8287) - [Rust] Arrow examples should use utility to print results
+* [ARROW-8293](https://issues.apache.org/jira/browse/ARROW-8293) - [Python] Run flake8 on python/examples also
+* [ARROW-8297](https://issues.apache.org/jira/browse/ARROW-8297) - [FlightRPC][C++] Implement Flight DoExchange for C++
+* [ARROW-8301](https://issues.apache.org/jira/browse/ARROW-8301) - [R] Handle ChunkedArray and Table in C data interface
+* [ARROW-8312](https://issues.apache.org/jira/browse/ARROW-8312) - [Java][Gandiva] Improve IN expression support
+* [ARROW-8314](https://issues.apache.org/jira/browse/ARROW-8314) - [Python] Provide a method to select a subset of columns of a Table
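+  A minimal sketch of the column-subset method this entry refers to (assuming pyarrow >= 1.0, where `Table.select` accepts column names or indices):
+
+  ```python
+  import pyarrow as pa
+
+  table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [0.1, 0.2, 0.3]})
+  subset = table.select(["a", "c"])  # keep only columns "a" and "c"
+  print(subset.column_names)         # ['a', 'c']
+  ```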
+* [ARROW-8318](https://issues.apache.org/jira/browse/ARROW-8318) - [C++][Dataset] Dataset should instantiate Fragment
+* [ARROW-8399](https://issues.apache.org/jira/browse/ARROW-8399) - [Rust] Extend memory alignments to include other architectures
+* [ARROW-8413](https://issues.apache.org/jira/browse/ARROW-8413) - [C++] Refactor DefLevelsToBitmap
+* [ARROW-8422](https://issues.apache.org/jira/browse/ARROW-8422) - [Rust] [Parquet] Implement function to convert Arrow schema to Parquet schema
+* [ARROW-8430](https://issues.apache.org/jira/browse/ARROW-8430) - [CI] Configure self-hosted runners for Github Actions
+* [ARROW-8434](https://issues.apache.org/jira/browse/ARROW-8434) - [C++] Ipc RecordBatchFileReader deserializes the Schema multiple times
+* [ARROW-8440](https://issues.apache.org/jira/browse/ARROW-8440) - [C++] Refine simd header files
+* [ARROW-8443](https://issues.apache.org/jira/browse/ARROW-8443) - [Gandiva][C++] Fix round/truncate to no-op for special cases
+* [ARROW-8447](https://issues.apache.org/jira/browse/ARROW-8447) - [C++][Dataset] Ensure Scanner::ToTable preserve ordering of ScanTasks
+* [ARROW-8467](https://issues.apache.org/jira/browse/ARROW-8467) - [C++] Test cases using ArrayFromJSON assume only a little-endian platform
+* [ARROW-8474](https://issues.apache.org/jira/browse/ARROW-8474) - [CI][Crossbow] Skip some nightlies we don't need to run
+* [ARROW-8477](https://issues.apache.org/jira/browse/ARROW-8477) - [C++] Enable reading and writing of long filenames for Windows
+* [ARROW-8481](https://issues.apache.org/jira/browse/ARROW-8481) - [Java] Provide an allocation manager based on Unsafe API
+* [ARROW-8483](https://issues.apache.org/jira/browse/ARROW-8483) - [Ruby] Arrow::Table documentation improvement
+* [ARROW-8485](https://issues.apache.org/jira/browse/ARROW-8485) - [Integration][Java] Implement extension types integration
+* [ARROW-8486](https://issues.apache.org/jira/browse/ARROW-8486) - [C++] arrow-utility-test causes failures on a big-endian platform
+* [ARROW-8487](https://issues.apache.org/jira/browse/ARROW-8487) - [FlightRPC][C++] Make it possible to target a specific payload size
+* [ARROW-8488](https://issues.apache.org/jira/browse/ARROW-8488) - [R] Replace VALUE\_OR\_STOP with ValueOrStop
+* [ARROW-8496](https://issues.apache.org/jira/browse/ARROW-8496) - [C++] Refine ByteStreamSplitDecodeScalar
+* [ARROW-8497](https://issues.apache.org/jira/browse/ARROW-8497) - [Archery] Add missing component to builds
+* [ARROW-8499](https://issues.apache.org/jira/browse/ARROW-8499) - [C++][Dataset] In ScannerBuilder, batch\_size will not work if projecter is not empty
+* [ARROW-8500](https://issues.apache.org/jira/browse/ARROW-8500) - [C++] Use selection vectors in Filter implementation for record batches, tables
+* [ARROW-8501](https://issues.apache.org/jira/browse/ARROW-8501) - [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6
+* [ARROW-8502](https://issues.apache.org/jira/browse/ARROW-8502) - [Release][APT][Yum] Ignore all arm64 verifications
+* [ARROW-8504](https://issues.apache.org/jira/browse/ARROW-8504) - [C++] Add Run Length Reader
+* [ARROW-8506](https://issues.apache.org/jira/browse/ARROW-8506) - [C++] Missing tests to verify expected\_buffer with bit\_width \> 8 in RLE
+* [ARROW-8507](https://issues.apache.org/jira/browse/ARROW-8507) - [Release] Detect .git directory automatically in changelog.py
+* [ARROW-8509](https://issues.apache.org/jira/browse/ARROW-8509) - [GLib] Add low level record batch read/write functions
+* [ARROW-8512](https://issues.apache.org/jira/browse/ARROW-8512) - [C++] Delete unused compute expr prototype code
+* [ARROW-8513](https://issues.apache.org/jira/browse/ARROW-8513) - [Python] Expose Take with Table input in Python
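+  A hedged sketch of table-level take as exposed in Python (assuming pyarrow >= 1.0):
+
+  ```python
+  import pyarrow as pa
+
+  table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+  # Select rows 2 and 0, in that order
+  print(table.take(pa.array([2, 0])))
+  ```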
+* [ARROW-8515](https://issues.apache.org/jira/browse/ARROW-8515) - [C++] Bitmap ToString should have an option of grouping by bytes
+* [ARROW-8516](https://issues.apache.org/jira/browse/ARROW-8516) - [Rust] Slow BufferBuilder<BooleanType\> inserts within PrimitiveBuilder::append\_slice
+* [ARROW-8517](https://issues.apache.org/jira/browse/ARROW-8517) - [Developer][Release] Update Crossbow RC verification setup for changes since 0.16.0
+* [ARROW-8520](https://issues.apache.org/jira/browse/ARROW-8520) - [Developer] Use .asf.yaml to direct GitHub notifications to e-mail lists and JIRA
+* [ARROW-8521](https://issues.apache.org/jira/browse/ARROW-8521) - [Developer] Group Sub-task, Task, Test, and Wish issue types as "Improvement" in Changelog
+* [ARROW-8522](https://issues.apache.org/jira/browse/ARROW-8522) - [Developer] Add environment variable option to toggle whether ephemeral NodeJS is installed in release verification script
+* [ARROW-8524](https://issues.apache.org/jira/browse/ARROW-8524) - [CI] Free up space on github actions
+* [ARROW-8526](https://issues.apache.org/jira/browse/ARROW-8526) - [Python] Fix non-deterministic row order failure in dataset tests
+* [ARROW-8531](https://issues.apache.org/jira/browse/ARROW-8531) - [C++] Deprecate ARROW\_USE\_SIMD CMake option
+* [ARROW-8538](https://issues.apache.org/jira/browse/ARROW-8538) - [Packaging] Remove boost from homebrew formula
+* [ARROW-8540](https://issues.apache.org/jira/browse/ARROW-8540) - [C++] Create memory allocation benchmark
+* [ARROW-8541](https://issues.apache.org/jira/browse/ARROW-8541) - [Release] Don't remove previous source releases automatically
+* [ARROW-8542](https://issues.apache.org/jira/browse/ARROW-8542) - [Release] Fix checksum url in the website post release script
+* [ARROW-8543](https://issues.apache.org/jira/browse/ARROW-8543) - [C++] IO: single pass coalescing algorithm
+* [ARROW-8544](https://issues.apache.org/jira/browse/ARROW-8544) - [CI][Crossbow] Add a status.json to the gh-pages summary of nightly builds to get around rate limiting
+* [ARROW-8548](https://issues.apache.org/jira/browse/ARROW-8548) - [Website] 0.17 release post
+* [ARROW-8549](https://issues.apache.org/jira/browse/ARROW-8549) - [R] Assorted post-0.17 release cleanups
+* [ARROW-8550](https://issues.apache.org/jira/browse/ARROW-8550) - [CI] Don't run cron GHA jobs on forks
+* [ARROW-8551](https://issues.apache.org/jira/browse/ARROW-8551) - [CI][Gandiva] Use LLVM 8 to build gandiva linux jar
+* [ARROW-8552](https://issues.apache.org/jira/browse/ARROW-8552) - [Rust] Support column iteration for Parquet rows
+* [ARROW-8553](https://issues.apache.org/jira/browse/ARROW-8553) - [C++] Optimize unaligned bitmap operations
+* [ARROW-8555](https://issues.apache.org/jira/browse/ARROW-8555) - [FlightRPC][Java] Implement Flight DoExchange for Java
+* [ARROW-8558](https://issues.apache.org/jira/browse/ARROW-8558) - [Rust] GitHub Actions missing rustfmt
+* [ARROW-8559](https://issues.apache.org/jira/browse/ARROW-8559) - [Rust] Consolidate Record Batch reader traits in main arrow crate
+* [ARROW-8560](https://issues.apache.org/jira/browse/ARROW-8560) - [Rust] Docs for MutableBuffer resize are incorrect
+* [ARROW-8561](https://issues.apache.org/jira/browse/ARROW-8561) - [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize()
+* [ARROW-8562](https://issues.apache.org/jira/browse/ARROW-8562) - [C++] IO: Parameterize I/O coalescing using S3 storage metrics
+* [ARROW-8563](https://issues.apache.org/jira/browse/ARROW-8563) - [Go] Minor change to make newBuilder public
+* [ARROW-8564](https://issues.apache.org/jira/browse/ARROW-8564) - [Website] Add Ubuntu 20.04 LTS to supported package list
+* [ARROW-8569](https://issues.apache.org/jira/browse/ARROW-8569) - [CI] Upgrade xcode version for testing homebrew formulae
+* [ARROW-8571](https://issues.apache.org/jira/browse/ARROW-8571) - [C++] Switch AppVeyor image to VS 2017
+* [ARROW-8572](https://issues.apache.org/jira/browse/ARROW-8572) - [Python] Expose UnionArray.array and other fields
+* [ARROW-8573](https://issues.apache.org/jira/browse/ARROW-8573) - [Rust] Upgrade to Rust 1.44 nightly
+* [ARROW-8574](https://issues.apache.org/jira/browse/ARROW-8574) - [Rust] Implement Debug for all plain types
+* [ARROW-8575](https://issues.apache.org/jira/browse/ARROW-8575) - [Developer] Add issue\_comment workflow to rebase a PR
+* [ARROW-8590](https://issues.apache.org/jira/browse/ARROW-8590) - [Rust] Use Arrow pretty print utility in DataFusion
+* [ARROW-8591](https://issues.apache.org/jira/browse/ARROW-8591) - [Rust] Reverse lookup for a key in DictionaryArray
+* [ARROW-8597](https://issues.apache.org/jira/browse/ARROW-8597) - [Rust] arrow crate lint and readability improvements
+* [ARROW-8606](https://issues.apache.org/jira/browse/ARROW-8606) - [CI] Don't trigger all builds on a change to any file in ci/
+* [ARROW-8607](https://issues.apache.org/jira/browse/ARROW-8607) - [R][CI] Unbreak builds following R 4.0 release
+* [ARROW-8611](https://issues.apache.org/jira/browse/ARROW-8611) - [R] Can't install arrow 0.17 on Ubuntu 18.04 R 3.6.3
+* [ARROW-8612](https://issues.apache.org/jira/browse/ARROW-8612) - [GLib] Add GArrowReadOptions and GArrowWriteOptions
+* [ARROW-8616](https://issues.apache.org/jira/browse/ARROW-8616) - [Rust] Turn explicit SIMD off by default
+* [ARROW-8619](https://issues.apache.org/jira/browse/ARROW-8619) - [C++] Use distinct Type::type values for interval types
+* [ARROW-8622](https://issues.apache.org/jira/browse/ARROW-8622) - [Rust] Parquet crate does not compile on aarch64
+* [ARROW-8623](https://issues.apache.org/jira/browse/ARROW-8623) - [C++][Gandiva] Reduce use of Boost, remove Boost headers from header files
+* [ARROW-8624](https://issues.apache.org/jira/browse/ARROW-8624) - [Website] Install page should mention arrow-dataset packages
+* [ARROW-8628](https://issues.apache.org/jira/browse/ARROW-8628) - [CI][Dev] Wrap docker-compose commands with archery
+* [ARROW-8629](https://issues.apache.org/jira/browse/ARROW-8629) - [Rust] Eliminate indirection of ZST allocations
+* [ARROW-8633](https://issues.apache.org/jira/browse/ARROW-8633) - [C++] Add ValidateAscii function
+* [ARROW-8634](https://issues.apache.org/jira/browse/ARROW-8634) - [Java] Create an example
+* [ARROW-8639](https://issues.apache.org/jira/browse/ARROW-8639) - [C++][Plasma] Require gflags
+* [ARROW-8645](https://issues.apache.org/jira/browse/ARROW-8645) - [C++] Missing gflags dependency for plasma
+* [ARROW-8647](https://issues.apache.org/jira/browse/ARROW-8647) - [C++][Dataset] Optionally encode partition field values as dictionary type
+* [ARROW-8648](https://issues.apache.org/jira/browse/ARROW-8648) - [Rust] Optimize Rust CI Build Times
+* [ARROW-8650](https://issues.apache.org/jira/browse/ARROW-8650) - [Rust] [Website] Add documentation to Arrow website
+* [ARROW-8651](https://issues.apache.org/jira/browse/ARROW-8651) - [Python][Dataset] Support pickling of Dataset objects
+* [ARROW-8655](https://issues.apache.org/jira/browse/ARROW-8655) - [C++][Dataset][Python][R] Preserve partitioning information for a discovered Dataset
+* [ARROW-8656](https://issues.apache.org/jira/browse/ARROW-8656) - [Python] Switch to VS2017 in the Windows wheel builds
+* [ARROW-8659](https://issues.apache.org/jira/browse/ARROW-8659) - [Rust] ListBuilder and FixedSizeListBuilder capacity
+* [ARROW-8660](https://issues.apache.org/jira/browse/ARROW-8660) - [C++][Gandiva] Reduce dependence on Boost
+* [ARROW-8662](https://issues.apache.org/jira/browse/ARROW-8662) - [CI] Consolidate appveyor scripts
+* [ARROW-8664](https://issues.apache.org/jira/browse/ARROW-8664) - [Java] Add skip null check to all Vector types
+* [ARROW-8668](https://issues.apache.org/jira/browse/ARROW-8668) - [Packaging][APT][Yum][ARM] Use Travis CI's ARM machine to build packages
+* [ARROW-8669](https://issues.apache.org/jira/browse/ARROW-8669) - [C++] Add IpcWriteOptions argument to GetRecordBatchSize()
+* [ARROW-8671](https://issues.apache.org/jira/browse/ARROW-8671) - [C++] Use IPC body compression metadata approved in ARROW-300
+* [ARROW-8682](https://issues.apache.org/jira/browse/ARROW-8682) - [Ruby][Parquet] Add support for column level compression
+* [ARROW-8687](https://issues.apache.org/jira/browse/ARROW-8687) - [Java] Finish move of io.netty.buffer.ArrowBuf
+* [ARROW-8690](https://issues.apache.org/jira/browse/ARROW-8690) - [Python] Clean up dataset+parquet tests now that order is deterministic
+* [ARROW-8692](https://issues.apache.org/jira/browse/ARROW-8692) - [C++] Avoid memory copies when downloading from S3
+* [ARROW-8695](https://issues.apache.org/jira/browse/ARROW-8695) - [Java] remove references to PlatformDependent in memory module
+* [ARROW-8696](https://issues.apache.org/jira/browse/ARROW-8696) - [Java] Convert tests to integration tests
+* [ARROW-8699](https://issues.apache.org/jira/browse/ARROW-8699) - [R] Fix automatic r\_to\_py conversion
+* [ARROW-8702](https://issues.apache.org/jira/browse/ARROW-8702) - [Packaging][C\#] Build NuGet packages in release process
+* [ARROW-8703](https://issues.apache.org/jira/browse/ARROW-8703) - [R] schema$metadata should be properly typed
+* [ARROW-8707](https://issues.apache.org/jira/browse/ARROW-8707) - [CI] Docker push fails because of wrong dockerhub credentials
+* [ARROW-8708](https://issues.apache.org/jira/browse/ARROW-8708) - [CI] Utilize github actions cache for docker-compose volumes
+* [ARROW-8711](https://issues.apache.org/jira/browse/ARROW-8711) - [Python] Expose strptime timestamp parsing in read\_csv conversion options
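+  A minimal sketch of custom timestamp parsing in the CSV reader (assuming pyarrow >= 1.0, where `ConvertOptions.timestamp_parsers` takes strptime-style format strings):
+
+  ```python
+  import io
+  import pyarrow.csv as pacsv
+
+  data = io.BytesIO(b"ts\n05/01/2020\n06/15/2020\n")
+  opts = pacsv.ConvertOptions(timestamp_parsers=["%m/%d/%Y"])
+  table = pacsv.read_csv(data, convert_options=opts)
+  print(table.schema)  # column "ts" is inferred as a timestamp type
+  ```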
+* [ARROW-8717](https://issues.apache.org/jira/browse/ARROW-8717) - [CI][Packaging] Add build dependency on boost to homebrew
+* [ARROW-8720](https://issues.apache.org/jira/browse/ARROW-8720) - [C++] Fix checked\_pointer\_cast
+* [ARROW-8721](https://issues.apache.org/jira/browse/ARROW-8721) - [CI] Fix R build matrix
+* [ARROW-8723](https://issues.apache.org/jira/browse/ARROW-8723) - [Rust] Remove SIMD specific benchmark code
+* [ARROW-8724](https://issues.apache.org/jira/browse/ARROW-8724) - [Packaging][deb][RPM] Use directory in host as build directory
+* [ARROW-8725](https://issues.apache.org/jira/browse/ARROW-8725) - [Rust] Redundant directory walk in Parquet datasource code
+* [ARROW-8727](https://issues.apache.org/jira/browse/ARROW-8727) - [C++] Do not require struct-initialization of StringConverter<T\> to parse strings to other types
+* [ARROW-8730](https://issues.apache.org/jira/browse/ARROW-8730) - [Rust] Use slice instead of &Vec for function arguments
+* [ARROW-8733](https://issues.apache.org/jira/browse/ARROW-8733) - [C++][Dataset][Python] ParquetFileFragment should provide access to parquet FileMetadata
+* [ARROW-8736](https://issues.apache.org/jira/browse/ARROW-8736) - [Rust] [DataFusion] Table API should provide a schema() method
+* [ARROW-8740](https://issues.apache.org/jira/browse/ARROW-8740) - [CI] Fix archery option in pandas master cron test
+* [ARROW-8742](https://issues.apache.org/jira/browse/ARROW-8742) - [C++][Python] Add flight client support for Mutual TLS
+* [ARROW-8743](https://issues.apache.org/jira/browse/ARROW-8743) - [C++][CI] Add a test job on s390x
+* [ARROW-8744](https://issues.apache.org/jira/browse/ARROW-8744) - [Rust] ParquetIterator's next method should be safe to call even after reaching the end of iteration
+* [ARROW-8745](https://issues.apache.org/jira/browse/ARROW-8745) - [C++] Bitmap.ToString causes failures on a big-endian platform
+* [ARROW-8747](https://issues.apache.org/jira/browse/ARROW-8747) - [C++] Feather tests with compression cause failure on big-endian platforms
+* [ARROW-8751](https://issues.apache.org/jira/browse/ARROW-8751) - [Rust] ParquetFileArrowReader should be able to read an empty Parquet file without error
+* [ARROW-8752](https://issues.apache.org/jira/browse/ARROW-8752) - [Rust] Remove unused hashmap
+* [ARROW-8753](https://issues.apache.org/jira/browse/ARROW-8753) - [C++][CI] Add a test job on ARM
+* [ARROW-8754](https://issues.apache.org/jira/browse/ARROW-8754) - [C++][CI] Enable tests for additional components on big-endian platforms
+* [ARROW-8756](https://issues.apache.org/jira/browse/ARROW-8756) - [C++] Bitmap word tests cause failures on a big-endian platform
+* [ARROW-8757](https://issues.apache.org/jira/browse/ARROW-8757) - [C++] Plasma header is written in native endian
+* [ARROW-8758](https://issues.apache.org/jira/browse/ARROW-8758) - [R] Updates for compatibility with dplyr 1.0
+* [ARROW-8759](https://issues.apache.org/jira/browse/ARROW-8759) - [C++] TestPlasmaSerialization.DeleteReply tests failure on big-endian platforms
+* [ARROW-8762](https://issues.apache.org/jira/browse/ARROW-8762) - [C++][Gandiva] Replace Gandiva's BitmapAnd with common implementation
+* [ARROW-8763](https://issues.apache.org/jira/browse/ARROW-8763) - [C++] Create RandomAccessFile::WillNeed-like API
+* [ARROW-8764](https://issues.apache.org/jira/browse/ARROW-8764) - [C++] Make ThreadPool configurable in ReadRangeCache
+* [ARROW-8766](https://issues.apache.org/jira/browse/ARROW-8766) - [Python] A FileSystem implementation based on Python callbacks
+* [ARROW-8769](https://issues.apache.org/jira/browse/ARROW-8769) - [C++] Add convenience methods to access fields by name in StructScalar
+* [ARROW-8770](https://issues.apache.org/jira/browse/ARROW-8770) - [C++][CI] Enable arrow-csv-test on s390x
+* [ARROW-8772](https://issues.apache.org/jira/browse/ARROW-8772) - [C++] Expand SumKernel benchmark to more types
+* [ARROW-8777](https://issues.apache.org/jira/browse/ARROW-8777) - [Rust] Parquet.rs does not support reading fixed-size binary fields
+* [ARROW-8778](https://issues.apache.org/jira/browse/ARROW-8778) - [C++][Gandiva] SelectionVector related test failed on big-endian platforms
+* [ARROW-8779](https://issues.apache.org/jira/browse/ARROW-8779) - [R] Implement conversion to List<Struct\>
+* [ARROW-8781](https://issues.apache.org/jira/browse/ARROW-8781) - [CI][C++] Enable ccache on GHA MinGW jobs
+* [ARROW-8782](https://issues.apache.org/jira/browse/ARROW-8782) - [Rust] [DataFusion] Add benchmarks based on NYC Taxi data set
+* [ARROW-8783](https://issues.apache.org/jira/browse/ARROW-8783) - [Rust] [DataFusion] Logical plan should have ParquetScan and CsvScan entries
+* [ARROW-8784](https://issues.apache.org/jira/browse/ARROW-8784) - [Rust] [DataFusion] Remove use of Arc from LogicalPlan
+* [ARROW-8785](https://issues.apache.org/jira/browse/ARROW-8785) - [Python][Packaging] Build the windows wheels with MIMALLOC enabled
+* [ARROW-8786](https://issues.apache.org/jira/browse/ARROW-8786) - [Packaging][rpm] Use bundled zstd in the CentOS 8 build
+* [ARROW-8788](https://issues.apache.org/jira/browse/ARROW-8788) - [C\#] Array builders to use bit-packed buffer builder rather than boolean array builder for validity map
+* [ARROW-8789](https://issues.apache.org/jira/browse/ARROW-8789) - [Rust] Add separate crate for integration test binaries
+* [ARROW-8790](https://issues.apache.org/jira/browse/ARROW-8790) - [C++][CI] Enable arrow-flight-test on s390x
+* [ARROW-8791](https://issues.apache.org/jira/browse/ARROW-8791) - [Rust] Creating StringDictionaryBuilder with existing dictionary values
+* [ARROW-8792](https://issues.apache.org/jira/browse/ARROW-8792) - [C++] Improved declarative compute function / kernel development framework, normalize calling conventions
+* [ARROW-8793](https://issues.apache.org/jira/browse/ARROW-8793) - [C++] BitUtil::SetBitsTo probably doesn't need to be inline
+* [ARROW-8794](https://issues.apache.org/jira/browse/ARROW-8794) - [C++] Expand benchmark coverage for arrow from parquet reading
+* [ARROW-8795](https://issues.apache.org/jira/browse/ARROW-8795) - [C++] Limited iOS support
+* [ARROW-8800](https://issues.apache.org/jira/browse/ARROW-8800) - [C++] Split arrow::ChunkedArray into arrow/chunked\_array.h
+* [ARROW-8804](https://issues.apache.org/jira/browse/ARROW-8804) - [R][CI] Followup to Rtools40 upgrade
+* [ARROW-8814](https://issues.apache.org/jira/browse/ARROW-8814) - [Dev][Release] Binary upload script keeps raising locale warnings
+* [ARROW-8815](https://issues.apache.org/jira/browse/ARROW-8815) - [Dev][Release] Binary upload script should retry on unexpected bintray request error
+* [ARROW-8818](https://issues.apache.org/jira/browse/ARROW-8818) - [Rust] Failing to build on master due to Flatbuffers/Union issues
+* [ARROW-8822](https://issues.apache.org/jira/browse/ARROW-8822) - [Rust] [DataFusion] Add MemoryScan variant to LogicalPlan
+* [ARROW-8827](https://issues.apache.org/jira/browse/ARROW-8827) - [Integration Testing] Initial skeleton for Rust integration tests
+* [ARROW-8830](https://issues.apache.org/jira/browse/ARROW-8830) - [GLib] Add support for Tell against non-seekable GIO output streams
+* [ARROW-8831](https://issues.apache.org/jira/browse/ARROW-8831) - [Rust] Incomplete SIMD implementation in simd\_compare\_op
+* [ARROW-8833](https://issues.apache.org/jira/browse/ARROW-8833) - [Rust] Implement VALIDATE mode in integration test binary
+* [ARROW-8834](https://issues.apache.org/jira/browse/ARROW-8834) - [Rust] Implement arrow-file-to-stream for integration testing
+* [ARROW-8835](https://issues.apache.org/jira/browse/ARROW-8835) - [Rust] Implement arrow-stream-to-file for integration testing
+* [ARROW-8836](https://issues.apache.org/jira/browse/ARROW-8836) - [Website] Update copyright end year automatically
+* [ARROW-8837](https://issues.apache.org/jira/browse/ARROW-8837) - [Rust] Add Null type
+* [ARROW-8838](https://issues.apache.org/jira/browse/ARROW-8838) - [Rust] File reader fails to read header from valid files
+* [ARROW-8839](https://issues.apache.org/jira/browse/ARROW-8839) - [Rust] DataFusion logical plan should support scanning CSV without a provided schema
+* [ARROW-8840](https://issues.apache.org/jira/browse/ARROW-8840) - [Rust] DataFusion ExecutionError should implement the std::error::Error trait
+* [ARROW-8841](https://issues.apache.org/jira/browse/ARROW-8841) - [C++] Add benchmark and unittest for PLAIN spaced
+* [ARROW-8843](https://issues.apache.org/jira/browse/ARROW-8843) - [C++] Optimize BitmapEquals unaligned case
+* [ARROW-8844](https://issues.apache.org/jira/browse/ARROW-8844) - [C++] Optimize TransferBitmap unaligned case
+* [ARROW-8846](https://issues.apache.org/jira/browse/ARROW-8846) - [Dev][Python] Autoformat Python sources with Archery
+* [ARROW-8847](https://issues.apache.org/jira/browse/ARROW-8847) - [C++] Pass task size / metrics in Executor API
+* [ARROW-8851](https://issues.apache.org/jira/browse/ARROW-8851) - [Python][Documentation] Fix FutureWarnings in Python Plasma docs
+* [ARROW-8852](https://issues.apache.org/jira/browse/ARROW-8852) - [R] Post-0.17.1 adjustments
+* [ARROW-8854](https://issues.apache.org/jira/browse/ARROW-8854) - [Rust] [Integration Testing] Show output from arrow-json-integration-test
+* [ARROW-8855](https://issues.apache.org/jira/browse/ARROW-8855) - [Rust] [Integration Testing] data type Date32(Day) not supported
+* [ARROW-8856](https://issues.apache.org/jira/browse/ARROW-8856) - [Rust] [Integration Testing] Return empty batch if MessageHeader is NONE
+* [ARROW-8864](https://issues.apache.org/jira/browse/ARROW-8864) - [R] Add methods to Table/RecordBatch for consistency with data.frame
+* [ARROW-8866](https://issues.apache.org/jira/browse/ARROW-8866) - [C++] Split Type::UNION into Type::SPARSE\_UNION and Type::DENSE\_UNION
+* [ARROW-8867](https://issues.apache.org/jira/browse/ARROW-8867) - [R] Support converting POSIXlt type
+* [ARROW-8875](https://issues.apache.org/jira/browse/ARROW-8875) - [C++] use AWS SDK SetResponseStreamFactory to avoid a copy of bytes
+* [ARROW-8877](https://issues.apache.org/jira/browse/ARROW-8877) - [Rust] add CSV read option struct to simplify datafusion interface
+* [ARROW-8880](https://issues.apache.org/jira/browse/ARROW-8880) - [R][Linux] Make R Binary Install Friendlier
+* [ARROW-8881](https://issues.apache.org/jira/browse/ARROW-8881) - [Rust] Add large list and binary support
+* [ARROW-8885](https://issues.apache.org/jira/browse/ARROW-8885) - [R] Don't include everything everywhere
+* [ARROW-8886](https://issues.apache.org/jira/browse/ARROW-8886) - [C\#] Decide and implement appropriate behaviour for Array builder resize to negative size
+* [ARROW-8887](https://issues.apache.org/jira/browse/ARROW-8887) - [Java] Buffer size for complex vectors increases rapidly in case of clear/write loop
+* [ARROW-8890](https://issues.apache.org/jira/browse/ARROW-8890) - [R] Fix C++ lint issue
+* [ARROW-8895](https://issues.apache.org/jira/browse/ARROW-8895) - [C++] Add C++ unit tests for filter and take functions on temporal type inputs, including timestamps
+* [ARROW-8896](https://issues.apache.org/jira/browse/ARROW-8896) - [C++] Reimplement dictionary unpacking in Cast kernels using Take
+* [ARROW-8899](https://issues.apache.org/jira/browse/ARROW-8899) - [R] Add R metadata like pandas metadata for round-trip fidelity
+* [ARROW-8901](https://issues.apache.org/jira/browse/ARROW-8901) - [C++] Reduce number of take kernels
+* [ARROW-8903](https://issues.apache.org/jira/browse/ARROW-8903) - [C++] Implement optimized "unsafe take" for use with selection vectors for kernel execution
+* [ARROW-8904](https://issues.apache.org/jira/browse/ARROW-8904) - [Python] Fix usages of deprecated C++ APIs related to child/field
+* [ARROW-8906](https://issues.apache.org/jira/browse/ARROW-8906) - [Rust] Support reading multiple CSV files for schema inference
+* [ARROW-8907](https://issues.apache.org/jira/browse/ARROW-8907) - [Rust] implement scalar comparison operations
+* [ARROW-8912](https://issues.apache.org/jira/browse/ARROW-8912) - [Ruby] Keep reference of Arrow::Buffer's data for GC
+* [ARROW-8913](https://issues.apache.org/jira/browse/ARROW-8913) - [Ruby] Use "field" instead of "child"
+* [ARROW-8914](https://issues.apache.org/jira/browse/ARROW-8914) - [C++][Gandiva] Decimal128 related test failed on big-endian platforms
+* [ARROW-8915](https://issues.apache.org/jira/browse/ARROW-8915) - [Dev][Archery] Require Click 7
+* [ARROW-8917](https://issues.apache.org/jira/browse/ARROW-8917) - [C++][Compute] Formalize "metafunction" concept
+* [ARROW-8918](https://issues.apache.org/jira/browse/ARROW-8918) - [C++] Add cast "metafunction" to FunctionRegistry that addresses dispatching to appropriate type-specific CastFunction
+* [ARROW-8922](https://issues.apache.org/jira/browse/ARROW-8922) - [C++] Implement example string scalar kernel function to assist with string kernels buildout per ARROW-555
+* [ARROW-8923](https://issues.apache.org/jira/browse/ARROW-8923) - [C++] Improve usability of arrow::compute::CallFunction by moving ExecContext\* argument to end and adding default
+* [ARROW-8926](https://issues.apache.org/jira/browse/ARROW-8926) - [C++] Improve docstrings in new public APIs in arrow/compute and fix miscellaneous typos
+* [ARROW-8927](https://issues.apache.org/jira/browse/ARROW-8927) - [C++] Support dictionary memos when reading/writing record batches using cuda IPC
+* [ARROW-8929](https://issues.apache.org/jira/browse/ARROW-8929) - [C++] Change compute::Arity:VarArgs min\_args default to 0
+* [ARROW-8931](https://issues.apache.org/jira/browse/ARROW-8931) - [Rust] Support lexical sort in arrow compute kernel
+* [ARROW-8933](https://issues.apache.org/jira/browse/ARROW-8933) - [C++] Reduce generated code in vector\_hash.cc
+* [ARROW-8934](https://issues.apache.org/jira/browse/ARROW-8934) - [C++] Add timestamp subtract kernel aliased to int64 subtract implementation
+* [ARROW-8937](https://issues.apache.org/jira/browse/ARROW-8937) - [C++] Add "parse\_strptime" function for string to timestamp conversions using the kernels framework
+* [ARROW-8938](https://issues.apache.org/jira/browse/ARROW-8938) - [R] Provide binding for arrow::compute::CallFunction
+* [ARROW-8940](https://issues.apache.org/jira/browse/ARROW-8940) - [Java] Fix the performance degradation of integration tests
+* [ARROW-8941](https://issues.apache.org/jira/browse/ARROW-8941) - [C++/Python] arrow-nightlies conda repository is full
+* [ARROW-8942](https://issues.apache.org/jira/browse/ARROW-8942) - [R] Detect compression in reading CSV/JSON
+* [ARROW-8943](https://issues.apache.org/jira/browse/ARROW-8943) - [C++][Dataset] Add support for Partitioning to ParquetDatasetFactory
+* [ARROW-8950](https://issues.apache.org/jira/browse/ARROW-8950) - [C++] Make head optional in s3fs
+* [ARROW-8958](https://issues.apache.org/jira/browse/ARROW-8958) - [FlightRPC][Python] Implement Flight DoExchange for Python
+* [ARROW-8960](https://issues.apache.org/jira/browse/ARROW-8960) - [MINOR] [FORMAT] Fix typos in comments
+* [ARROW-8961](https://issues.apache.org/jira/browse/ARROW-8961) - [C++] Add utf8proc library to toolchain
+* [ARROW-8963](https://issues.apache.org/jira/browse/ARROW-8963) - [C++][Parquet] Optimize memory allocation
+* [ARROW-8965](https://issues.apache.org/jira/browse/ARROW-8965) - [Python][Documentation] Pyarrow documentation for pip nightlies references 404'd location
+* [ARROW-8966](https://issues.apache.org/jira/browse/ARROW-8966) - [C++] Move arrow::ArrayData to a separate header file
+* [ARROW-8969](https://issues.apache.org/jira/browse/ARROW-8969) - [C++] Reduce generated code in compute/kernels/scalar\_compare.cc
+* [ARROW-8970](https://issues.apache.org/jira/browse/ARROW-8970) - [C++] Reduce shared library / binary code size (umbrella issue)
+* [ARROW-8972](https://issues.apache.org/jira/browse/ARROW-8972) - [Java] Support range value comparison for large varchar/varbinary vectors
+* [ARROW-8973](https://issues.apache.org/jira/browse/ARROW-8973) - [Java] Support batch value appending for large varchar/varbinary vectors
+* [ARROW-8974](https://issues.apache.org/jira/browse/ARROW-8974) - [C++] Refine TransferBitmap template parameters
+* [ARROW-8976](https://issues.apache.org/jira/browse/ARROW-8976) - [C++] compute::CallFunction can't Filter/Take with ChunkedArray
+* [ARROW-8979](https://issues.apache.org/jira/browse/ARROW-8979) - [C++] Implement bitmap word reader and writer
+* [ARROW-8984](https://issues.apache.org/jira/browse/ARROW-8984) - [R] Revise install guides now that Windows conda package exists
+* [ARROW-8985](https://issues.apache.org/jira/browse/ARROW-8985) - [Format] Add "byte width" field with default of 16 to Decimal Flatbuffers type for forward compatibility
+* [ARROW-8989](https://issues.apache.org/jira/browse/ARROW-8989) - [C++] Document available functions in compute::FunctionRegistry
+* [ARROW-8993](https://issues.apache.org/jira/browse/ARROW-8993) - [Rust] Support reading non-seekable sources in text readers
+* [ARROW-8994](https://issues.apache.org/jira/browse/ARROW-8994) - [C++] Disable include-what-you-use cpplint lint checks
+* [ARROW-8996](https://issues.apache.org/jira/browse/ARROW-8996) - [C++] Runtime SIMD path for Aggregate Sum/Mean kernel
+* [ARROW-8997](https://issues.apache.org/jira/browse/ARROW-8997) - [Archery] Benchmark formatter should have friendly units
+* [ARROW-9004](https://issues.apache.org/jira/browse/ARROW-9004) - [C++][Gandiva] Support building with LLVM 10
+* [ARROW-9005](https://issues.apache.org/jira/browse/ARROW-9005) - [Rust] [DataFusion] Support sort expression
+* [ARROW-9007](https://issues.apache.org/jira/browse/ARROW-9007) - [Rust] Support appending arrays by merging array data
+* [ARROW-9014](https://issues.apache.org/jira/browse/ARROW-9014) - [Packaging] Bump the minor part of the automatically generated version in crossbow
+* [ARROW-9015](https://issues.apache.org/jira/browse/ARROW-9015) - [Java] Make BaseAllocator package private
+* [ARROW-9016](https://issues.apache.org/jira/browse/ARROW-9016) - [Java] Remove direct references to Netty/Unsafe Allocators
+* [ARROW-9017](https://issues.apache.org/jira/browse/ARROW-9017) - [Python] Refactor the Scalar classes
+* [ARROW-9018](https://issues.apache.org/jira/browse/ARROW-9018) - [C++] Remove APIs that were deprecated in 0.17.x and prior
+* [ARROW-9021](https://issues.apache.org/jira/browse/ARROW-9021) - [Python] The filesystem keyword in parquet.read\_table is not documented
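+  The keyword in question, as a sketch (assuming pyarrow >= 1.0; `data.parquet` is a placeholder path):
+
+  ```python
+  import pyarrow.parquet as pq
+  from pyarrow import fs
+
+  local = fs.LocalFileSystem()
+  # The previously undocumented keyword: read through an explicit filesystem
+  table = pq.read_table("data.parquet", filesystem=local)
+  ```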
+* [ARROW-9022](https://issues.apache.org/jira/browse/ARROW-9022) - [C++] Add/Sub/Mul arithmetic kernels with overflow check
+* [ARROW-9029](https://issues.apache.org/jira/browse/ARROW-9029) - [C++] Implement BitBlockCounter interface for blockwise popcounts of validity bitmaps
+* [ARROW-9030](https://issues.apache.org/jira/browse/ARROW-9030) - [Python] Clean up some usages of pyarrow.compat, move some common functions/symbols to lib.pyx
+* [ARROW-9031](https://issues.apache.org/jira/browse/ARROW-9031) - [R] Implement conversion from Type::UINT64 to R vector
+* [ARROW-9032](https://issues.apache.org/jira/browse/ARROW-9032) - [C++] Split arrow/util/bit\_util.h into multiple header files
+* [ARROW-9034](https://issues.apache.org/jira/browse/ARROW-9034) - [C++] Implement binary (two bitmap) version of BitBlockCounter
+* [ARROW-9042](https://issues.apache.org/jira/browse/ARROW-9042) - [C++] Add Subtract and Multiply arithmetic kernels with wrap-around behavior
+* [ARROW-9043](https://issues.apache.org/jira/browse/ARROW-9043) - [Go] Temporarily copy LICENSE.txt to go/
+* [ARROW-9045](https://issues.apache.org/jira/browse/ARROW-9045) - [C++] Improve and expand Take/Filter benchmarks
+* [ARROW-9046](https://issues.apache.org/jira/browse/ARROW-9046) - [C++][R] Put more things in type\_fwds
+* [ARROW-9047](https://issues.apache.org/jira/browse/ARROW-9047) - [Rust] Setting 0-bits of a 0-length bitset segfaults
+* [ARROW-9050](https://issues.apache.org/jira/browse/ARROW-9050) - [Release] Use 1.0.0 as the next version
+* [ARROW-9051](https://issues.apache.org/jira/browse/ARROW-9051) - [GLib] Refer Array related objects from Array
+* [ARROW-9052](https://issues.apache.org/jira/browse/ARROW-9052) - [CI][MinGW] Enable Gandiva
+* [ARROW-9055](https://issues.apache.org/jira/browse/ARROW-9055) - [C++] Add sum/mean kernels for Boolean type
+* [ARROW-9058](https://issues.apache.org/jira/browse/ARROW-9058) - [Packaging][wheel] Boost download fails
+* [ARROW-9060](https://issues.apache.org/jira/browse/ARROW-9060) - [GLib] Add support for building Apache Arrow Datasets GLib with non-installed Apache Arrow Datasets
+* [ARROW-9061](https://issues.apache.org/jira/browse/ARROW-9061) - [Packaging][APT][Yum][GLib] Add Apache Arrow Datasets GLib
+* [ARROW-9062](https://issues.apache.org/jira/browse/ARROW-9062) - [Rust] Support to read JSON into dictionary type
+* [ARROW-9067](https://issues.apache.org/jira/browse/ARROW-9067) - [C++] Create reusable branchless / vectorized index boundschecking functions
+* [ARROW-9070](https://issues.apache.org/jira/browse/ARROW-9070) - [C++] StructScalar needs field accessor methods
+* [ARROW-9073](https://issues.apache.org/jira/browse/ARROW-9073) - [C++] RapidJSON include directory detection doesn't work with RapidJSONConfig.cmake
+* [ARROW-9074](https://issues.apache.org/jira/browse/ARROW-9074) - [GLib] Add missing arrow-json check
+* [ARROW-9075](https://issues.apache.org/jira/browse/ARROW-9075) - [C++] Optimize Filter implementation
+* [ARROW-9079](https://issues.apache.org/jira/browse/ARROW-9079) - [C++] Write benchmark for arithmetic kernels
+* [ARROW-9083](https://issues.apache.org/jira/browse/ARROW-9083) - [R] collect int64, uint32, uint64 as R integer type if not out of bounds
+* [ARROW-9086](https://issues.apache.org/jira/browse/ARROW-9086) - [CI][Homebrew] Enable Gandiva
+* [ARROW-9088](https://issues.apache.org/jira/browse/ARROW-9088) - [Rust] Recent version of arrow crate does not compile into wasm target
+* [ARROW-9089](https://issues.apache.org/jira/browse/ARROW-9089) - [Python] A PyFileSystem handler for fsspec-based filesystems
+* [ARROW-9090](https://issues.apache.org/jira/browse/ARROW-9090) - [C++] Bump versions of bundled libraries
+* [ARROW-9091](https://issues.apache.org/jira/browse/ARROW-9091) - [C++] Utilize function's default options when passing no options to CallFunction for a function that requires them
+* [ARROW-9093](https://issues.apache.org/jira/browse/ARROW-9093) - [FlightRPC][C++][Python] Allow setting gRPC client options
+* [ARROW-9094](https://issues.apache.org/jira/browse/ARROW-9094) - [Python] Bump versions of compiled dependencies in manylinux wheels
+* [ARROW-9095](https://issues.apache.org/jira/browse/ARROW-9095) - [Rust] Fix NullArray to comply with spec
+* [ARROW-9099](https://issues.apache.org/jira/browse/ARROW-9099) - [C++][Gandiva] Add TRIM function for string
+* [ARROW-9100](https://issues.apache.org/jira/browse/ARROW-9100) - [C++] Add ascii\_lower kernel
+* [ARROW-9101](https://issues.apache.org/jira/browse/ARROW-9101) - [Doc][C++][Python] Document encoding expected by CSV and JSON readers
+* [ARROW-9102](https://issues.apache.org/jira/browse/ARROW-9102) - [Packaging] Upload built manylinux docker images
+* [ARROW-9106](https://issues.apache.org/jira/browse/ARROW-9106) - [C++] Add C++ foundation to ease file transcoding
+* [ARROW-9108](https://issues.apache.org/jira/browse/ARROW-9108) - [C++][Dataset] Add Parquet Statistics conversion for timestamp columns
+* [ARROW-9109](https://issues.apache.org/jira/browse/ARROW-9109) - [Python][Packaging] Enable S3 support in manylinux wheels
+* [ARROW-9110](https://issues.apache.org/jira/browse/ARROW-9110) - [C++] Fix CPU cache size detection on macOS
+* [ARROW-9112](https://issues.apache.org/jira/browse/ARROW-9112) - [R] Update autobrew script location
+* [ARROW-9115](https://issues.apache.org/jira/browse/ARROW-9115) - [C++] Process data buffers in batch in ascii\_lower / ascii\_upper kernels rather than using string\_view value iteration
+* [ARROW-9116](https://issues.apache.org/jira/browse/ARROW-9116) - [C++] Add BinaryArray::total\_values\_length()
+* [ARROW-9118](https://issues.apache.org/jira/browse/ARROW-9118) - [C++] Add more general BoundsCheck function that also checks for arbitrary lower limits in integer arrays
+* [ARROW-9119](https://issues.apache.org/jira/browse/ARROW-9119) - [C++] Add support for building with system static gRPC
+* [ARROW-9123](https://issues.apache.org/jira/browse/ARROW-9123) - [Python][wheel] Use libzstd.a explicitly
+* [ARROW-9124](https://issues.apache.org/jira/browse/ARROW-9124) - [Rust][Datafusion] DFParser should consume the SQL query as &str instead of String
+* [ARROW-9125](https://issues.apache.org/jira/browse/ARROW-9125) - [C++] Add missing include for arrow::internal::ZeroMemory() for Valgrind
+* [ARROW-9129](https://issues.apache.org/jira/browse/ARROW-9129) - [Python][JPype] Test is failed with JPype 0.7.5
+* [ARROW-9130](https://issues.apache.org/jira/browse/ARROW-9130) - [Python] Add deprecated wrappers functions to a pyarrow/compat.py module for 1.0.0 that will be removed later
+* [ARROW-9131](https://issues.apache.org/jira/browse/ARROW-9131) - [C++] Faster ascii\_lower and ascii\_upper
+* [ARROW-9132](https://issues.apache.org/jira/browse/ARROW-9132) - [C++] Implement hash kernels for dictionary data with constant dictionaries
+* [ARROW-9133](https://issues.apache.org/jira/browse/ARROW-9133) - [C++] Add utf8\_upper and utf8\_lower
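+  These C++ kernels are also reachable from Python; a minimal sketch (assuming pyarrow >= 1.0):
+
+  ```python
+  import pyarrow as pa
+  import pyarrow.compute as pc
+
+  arr = pa.array(["Árvíz", "tükör"])
+  # Unicode-aware case conversion, unlike the ASCII-only ascii_upper/ascii_lower
+  print(pc.utf8_upper(arr))  # ["ÁRVÍZ", "TÜKÖR"]
+  print(pc.utf8_lower(arr))
+  ```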
+* [ARROW-9137](https://issues.apache.org/jira/browse/ARROW-9137) - [GLib][Ruby] Allow to read Parquet files in chunks (by RowGroup)
+* [ARROW-9138](https://issues.apache.org/jira/browse/ARROW-9138) - [Docs][Format] Make sure format version is hard coded in the docs
+* [ARROW-9139](https://issues.apache.org/jira/browse/ARROW-9139) - [Python] parquet read\_table should not use\_legacy\_dataset
+* [ARROW-9144](https://issues.apache.org/jira/browse/ARROW-9144) - [CI] OSS-Fuzz build fails because of recent changes in the Google repository
+* [ARROW-9145](https://issues.apache.org/jira/browse/ARROW-9145) - [C++] Add true\_count / false\_count methods to BooleanArray
+* [ARROW-9152](https://issues.apache.org/jira/browse/ARROW-9152) - [C++] Create specialized filter implementation for varbinary types
+* [ARROW-9153](https://issues.apache.org/jira/browse/ARROW-9153) - [Python] Add bindings for StructScalar
+* [ARROW-9154](https://issues.apache.org/jira/browse/ARROW-9154) - [Developer] Use GitHub issue templates better
+* [ARROW-9155](https://issues.apache.org/jira/browse/ARROW-9155) - [Archery] Less precise but faster default settings for "archery benchmark diff"
+* [ARROW-9156](https://issues.apache.org/jira/browse/ARROW-9156) - [C++] Reducing the code size of the tensor module
+* [ARROW-9157](https://issues.apache.org/jira/browse/ARROW-9157) - [Rust][Datafusion] Execution context's create\_physical\_plan should take self as an immutable reference
+* [ARROW-9158](https://issues.apache.org/jira/browse/ARROW-9158) - [Rust][Datafusion] Projection physical plan compilation should preserve nullability
+* [ARROW-9159](https://issues.apache.org/jira/browse/ARROW-9159) - [Python] Expose the isnull/isvalid kernels
+* [ARROW-9162](https://issues.apache.org/jira/browse/ARROW-9162) - [Python] Expose Add/Subtract/Multiply arithmetic kernels
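+  Covering this entry and ARROW-9159 above, a minimal sketch of the exposed kernels (assuming pyarrow >= 1.0; note that nulls propagate through arithmetic):
+
+  ```python
+  import pyarrow as pa
+  import pyarrow.compute as pc
+
+  a = pa.array([1, None, 3])
+  b = pa.array([10, 20, 30])
+  print(pc.is_null(a))      # [false, true, false]
+  print(pc.is_valid(a))     # [true, false, true]
+  print(pc.add(a, b))       # [11, null, 33]
+  print(pc.subtract(b, a))  # [9, null, 27]
+  print(pc.multiply(a, b))  # [10, null, 90]
+  ```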
+* [ARROW-9163](https://issues.apache.org/jira/browse/ARROW-9163) - [C++] Add methods to StringArray, LargeStringArray, to validate whether its values are all UTF-8
+* [ARROW-9166](https://issues.apache.org/jira/browse/ARROW-9166) - [Website] Add overview page
+* [ARROW-9167](https://issues.apache.org/jira/browse/ARROW-9167) - [Doc][Website] /docs/c\_glib/index.html is overwritten
+* [ARROW-9168](https://issues.apache.org/jira/browse/ARROW-9168) - [C++][Flight] allow flight benchmark to use separated TCP connections
+* [ARROW-9173](https://issues.apache.org/jira/browse/ARROW-9173) - [C++] Document how to use Arrow from a third-party CMake project
+* [ARROW-9175](https://issues.apache.org/jira/browse/ARROW-9175) - [FlightRPC][C++][Python] Expose connected peer
+* [ARROW-9176](https://issues.apache.org/jira/browse/ARROW-9176) - [Rust] Fix for memory leaks in Arrow allocator
+* [ARROW-9178](https://issues.apache.org/jira/browse/ARROW-9178) - [R] Improve documentation about CSV reader
+* [ARROW-9179](https://issues.apache.org/jira/browse/ARROW-9179) - [R] Replace usage of iris dataset in tests
+* [ARROW-9180](https://issues.apache.org/jira/browse/ARROW-9180) - [Developer] Remove usage of whitelist, blacklist, slave, etc.
+* [ARROW-9181](https://issues.apache.org/jira/browse/ARROW-9181) - [C++] Instantiate fewer templates in Cast kernel implementation
+* [ARROW-9182](https://issues.apache.org/jira/browse/ARROW-9182) - [C++] Use "applicator" namespace for kernel operator-to-kernel functors, streamline argument unboxing
+* [ARROW-9185](https://issues.apache.org/jira/browse/ARROW-9185) - [C++][Java][Gandiva] Make LLVM build optimisation configurable from Java
+* [ARROW-9188](https://issues.apache.org/jira/browse/ARROW-9188) - [C++] Do not always statically link Brotli libraries
+* [ARROW-9189](https://issues.apache.org/jira/browse/ARROW-9189) - [Website] Improve contributor guide
+* [ARROW-9190](https://issues.apache.org/jira/browse/ARROW-9190) - [Website][C++] Add blog post on efforts to make building lighter and easier
+* [ARROW-9191](https://issues.apache.org/jira/browse/ARROW-9191) - [Rust] Do not panic when int96 milliseconds are negative
+* [ARROW-9192](https://issues.apache.org/jira/browse/ARROW-9192) - [CI][Rust] Add support for running clippy
+* [ARROW-9193](https://issues.apache.org/jira/browse/ARROW-9193) - [C++] Add method to parse date from null-terminated string
+* [ARROW-9197](https://issues.apache.org/jira/browse/ARROW-9197) - [C++] Revamp numeric casts: faster performance and reduced binary size
+* [ARROW-9201](https://issues.apache.org/jira/browse/ARROW-9201) - [Archery] Render-human readable table when using "archery benchmark diff"
+* [ARROW-9202](https://issues.apache.org/jira/browse/ARROW-9202) - [GLib] Add GArrowDatum
+* [ARROW-9203](https://issues.apache.org/jira/browse/ARROW-9203) - [Packaging][deb] Add missing gir1.2-arrow-dataset-1.0.install
+* [ARROW-9204](https://issues.apache.org/jira/browse/ARROW-9204) - [C++][Flight] change records\_per\_stream to int64 in flight benchmark
+* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst
+* [ARROW-9206](https://issues.apache.org/jira/browse/ARROW-9206) - [C++][Flight] measure latency in flight benchmark
+* [ARROW-9207](https://issues.apache.org/jira/browse/ARROW-9207) - [Python][Dataset] Clean-up internal FileSource class
+* [ARROW-9210](https://issues.apache.org/jira/browse/ARROW-9210) - [C++] Use OptionalBitBlockCounter in ArrayDataInlineVisitor
+* [ARROW-9214](https://issues.apache.org/jira/browse/ARROW-9214) - [C++] Avoid util::optional in favor of separate inlineable functions in arrow/visitor\_inline.h
+* [ARROW-9216](https://issues.apache.org/jira/browse/ARROW-9216) - [C++][Parquet] Use BitBlockCounter for plain spaced encoding/decoding
+* [ARROW-9217](https://issues.apache.org/jira/browse/ARROW-9217) - [C++][Parquet] Cover 0.01% null for the plain spaced encoding/decoding benchmark
+* [ARROW-9220](https://issues.apache.org/jira/browse/ARROW-9220) - [C++] Disable relevant compute kernels if ARROW\_WITH\_UTF8PROC=OFF
+* [ARROW-9222](https://issues.apache.org/jira/browse/ARROW-9222) - [Format][Proposal] Remove validity bitmap from Union types
+* [ARROW-9224](https://issues.apache.org/jira/browse/ARROW-9224) - [Dev][Archery] Copy local repo on clone failure
+* [ARROW-9225](https://issues.apache.org/jira/browse/ARROW-9225) - [C++][Compute] Improve counting sort
+* [ARROW-9231](https://issues.apache.org/jira/browse/ARROW-9231) - [Format] Increment MetadataVersion from V4 to V5
+* [ARROW-9234](https://issues.apache.org/jira/browse/ARROW-9234) - [GLib][CUDA] Add support for dictionary memo on reading record batch from buffer
+* [ARROW-9241](https://issues.apache.org/jira/browse/ARROW-9241) - [C++] Add forward compatibility checks for Decimal::bitWidth
+* [ARROW-9242](https://issues.apache.org/jira/browse/ARROW-9242) - [Java] Add forward compatibility checks for Decimal::bitWidth
+* [ARROW-9247](https://issues.apache.org/jira/browse/ARROW-9247) - [Python] Expose BinaryArray::total\_values\_length in bindings
+* [ARROW-9248](https://issues.apache.org/jira/browse/ARROW-9248) - [C++] Add "list\_size" function that returns Int32Array/Int64Array giving list cell sizes
+* [ARROW-9249](https://issues.apache.org/jira/browse/ARROW-9249) - [C++] Implement "list\_parent\_indices" vector function
+* [ARROW-9250](https://issues.apache.org/jira/browse/ARROW-9250) - [C++] Compact generated code in compute/kernels/scalar\_set\_lookup.cc using same method as vector\_hash.cc
+* [ARROW-9251](https://issues.apache.org/jira/browse/ARROW-9251) - [C++] Move JSON testing code for integration tests to libarrow\_testing
+* [ARROW-9254](https://issues.apache.org/jira/browse/ARROW-9254) - [C++] Factor out some integer casting internals so it can be reused with temporal casts
+* [ARROW-9255](https://issues.apache.org/jira/browse/ARROW-9255) - [C++] Use CMake to build bundled Protobuf with CMake \>= 3.7
+* [ARROW-9256](https://issues.apache.org/jira/browse/ARROW-9256) - [C++] Incorrect variable name ARROW\_CXX\_FLAGS
+* [ARROW-9258](https://issues.apache.org/jira/browse/ARROW-9258) - [Format] Add V5 MetadataVersion
+* [ARROW-9259](https://issues.apache.org/jira/browse/ARROW-9259) - [Format] Permit unsigned dictionary indices in Columnar.rst
+* [ARROW-9262](https://issues.apache.org/jira/browse/ARROW-9262) - [Packaging][Linux][CI] Use Ubuntu 18.04 to build ARM64 packages on Travis CI
+* [ARROW-9263](https://issues.apache.org/jira/browse/ARROW-9263) - [C++] Benchmark: promote RegressionSetArgs size to L2
+* [ARROW-9264](https://issues.apache.org/jira/browse/ARROW-9264) - [C++] Cleanup Parquet Arrow Schema code
+* [ARROW-9265](https://issues.apache.org/jira/browse/ARROW-9265) - [C++] Add support for writing MetadataVersion::V4-compatible IPC messages for compatibility with library versions <= 0.17.1
+* [ARROW-9268](https://issues.apache.org/jira/browse/ARROW-9268) - [C++] Add is{alnum,alpha,...} kernels for strings
+* [ARROW-9272](https://issues.apache.org/jira/browse/ARROW-9272) - [C++][Python] Reduce complexity in python to arrow conversion
+* [ARROW-9276](https://issues.apache.org/jira/browse/ARROW-9276) - [Dev] Enable ARROW\_CUDA when generating API documentations
+* [ARROW-9277](https://issues.apache.org/jira/browse/ARROW-9277) - [C++] Fix documentation of Reading CSV files
+* [ARROW-9278](https://issues.apache.org/jira/browse/ARROW-9278) - [C++] Implement Union validity bitmap changes from ARROW-9222
+* [ARROW-9280](https://issues.apache.org/jira/browse/ARROW-9280) - [Rust] Write statistics to Parquet files
+* [ARROW-9281](https://issues.apache.org/jira/browse/ARROW-9281) - [R] Turn off utf8proc in R builds
+* [ARROW-9283](https://issues.apache.org/jira/browse/ARROW-9283) - [Python] Expose C++ build info
+* [ARROW-9287](https://issues.apache.org/jira/browse/ARROW-9287) - [C++] Implement support for unsigned dictionary indices
+* [ARROW-9289](https://issues.apache.org/jira/browse/ARROW-9289) - [R] Remove deprecated functions
+* [ARROW-9290](https://issues.apache.org/jira/browse/ARROW-9290) - [Rust] [Parquet] Add features to allow opting out of dependencies
+* [ARROW-9291](https://issues.apache.org/jira/browse/ARROW-9291) - [R] Support fixed size binary/list types
+* [ARROW-9292](https://issues.apache.org/jira/browse/ARROW-9292) - [Rust] Update feature matrix with passing tests
+* [ARROW-9294](https://issues.apache.org/jira/browse/ARROW-9294) - [GLib] Add GArrowFunction
+* [ARROW-9300](https://issues.apache.org/jira/browse/ARROW-9300) - [Java] Separate Netty Memory to its own module
+* [ARROW-9306](https://issues.apache.org/jira/browse/ARROW-9306) - [Ruby] Add support for Arrow::RecordBatch.new(raw\_table)
+* [ARROW-9307](https://issues.apache.org/jira/browse/ARROW-9307) - [Ruby] Add Arrow::RecordBatchIterator\#to\_a
+* [ARROW-9308](https://issues.apache.org/jira/browse/ARROW-9308) - [Format] Add Feature enum to schema.fbs for forward compatibility
+* [ARROW-9316](https://issues.apache.org/jira/browse/ARROW-9316) - [C++] Use "Dataset" instead of "Datasets"
+* [ARROW-9321](https://issues.apache.org/jira/browse/ARROW-9321) - [C++][Dataset] Allow "collecting" statistics for ParquetFragment row groups if not constructed from \_metadata
+* [ARROW-9322](https://issues.apache.org/jira/browse/ARROW-9322) - [R] Dataset documentation polishing
+* [ARROW-9323](https://issues.apache.org/jira/browse/ARROW-9323) - [Ruby] Add Red Arrow Dataset
+* [ARROW-9327](https://issues.apache.org/jira/browse/ARROW-9327) - Fix all clippy errors for arrow crate
+* [ARROW-9329](https://issues.apache.org/jira/browse/ARROW-9329) - [C++][Gandiva] Implement castTimestampToDate function
+* [ARROW-9331](https://issues.apache.org/jira/browse/ARROW-9331) - [C++] Improve the performance of Tensor-to-SparseTensor conversion
+* [ARROW-9333](https://issues.apache.org/jira/browse/ARROW-9333) - [Python] Expose more IPC write options in Python
+* [ARROW-9335](https://issues.apache.org/jira/browse/ARROW-9335) - [Website] Update website for 1.0
+* [ARROW-9337](https://issues.apache.org/jira/browse/ARROW-9337) - [R] On C++ library build failure, give an unambiguous message
+* [ARROW-9339](https://issues.apache.org/jira/browse/ARROW-9339) - [Rust] Comments on SIMD in Arrow README are incorrect
+* [ARROW-9340](https://issues.apache.org/jira/browse/ARROW-9340) - [R] Use CRAN version of decor package
+* [ARROW-9341](https://issues.apache.org/jira/browse/ARROW-9341) - [GLib] Use the arrow::Datum version of Take()
+* [ARROW-9345](https://issues.apache.org/jira/browse/ARROW-9345) - [C++][Dataset] Expression with dictionary type should work with operand of value type
+* [ARROW-9346](https://issues.apache.org/jira/browse/ARROW-9346) - [C++][Python][Dataset] Add total\_byte\_size metadata to RowGroupInfo
+* [ARROW-9362](https://issues.apache.org/jira/browse/ARROW-9362) - [Java] Add support for writing MetadataVersion::V4-compatible IPC messages for compatibility with library versions <= 0.17.1
+* [ARROW-9365](https://issues.apache.org/jira/browse/ARROW-9365) - [Go] Implement the rest of the typed array builders in NewBuilder
+* [ARROW-9370](https://issues.apache.org/jira/browse/ARROW-9370) - [Java] Bump Netty version
+* [ARROW-9374](https://issues.apache.org/jira/browse/ARROW-9374) - [C++][Python] Expose MakeArrayFromScalar
+* [ARROW-9379](https://issues.apache.org/jira/browse/ARROW-9379) - [Rust] Support unsigned dictionary indices
+* [ARROW-9383](https://issues.apache.org/jira/browse/ARROW-9383) - [Python] Support fsspec filesystems in Dataset API through fs handler
+* [ARROW-9386](https://issues.apache.org/jira/browse/ARROW-9386) - [Rust] RecordBatch.schema() should not return &Arc<Schema\>
+* [ARROW-9390](https://issues.apache.org/jira/browse/ARROW-9390) - [C++] Review compute function names
+* [ARROW-9391](https://issues.apache.org/jira/browse/ARROW-9391) - [Rust] Float32 values interpreted as zero when record batch has one row
+* [ARROW-9393](https://issues.apache.org/jira/browse/ARROW-9393) - [Doc] update supported types documentation for Java
+* [ARROW-9395](https://issues.apache.org/jira/browse/ARROW-9395) - [Python] Provide configurable MetadataVersion in IPC API and environment variable to set default to V4 when needed
+* [ARROW-9399](https://issues.apache.org/jira/browse/ARROW-9399) - [C++] Add forward compatibility checks for unrecognized future MetadataVersion
+* [ARROW-9403](https://issues.apache.org/jira/browse/ARROW-9403) - [Python] add .tolist as alias of .to\_pylist
+* [ARROW-9407](https://issues.apache.org/jira/browse/ARROW-9407) - [Python] Accept pd.NA as missing value in array constructor
+* [ARROW-9411](https://issues.apache.org/jira/browse/ARROW-9411) - [Rust] Update dependencies
+* [ARROW-9424](https://issues.apache.org/jira/browse/ARROW-9424) - [C++][Parquet] Disable writing files with LZ4 codec
+* [ARROW-9425](https://issues.apache.org/jira/browse/ARROW-9425) - [Rust][DataFusion] Make ExecutionContext sharable between threads
+* [ARROW-9427](https://issues.apache.org/jira/browse/ARROW-9427) - [Rust][DataFusion] Add pub fn ExecutionContext.tables()
+* [ARROW-9437](https://issues.apache.org/jira/browse/ARROW-9437) - [Python][Packaging] Homebrew fails to install build dependencies in the macOS wheel builds
+* [ARROW-9442](https://issues.apache.org/jira/browse/ARROW-9442) - [Python] Do not force Validate() to be called in pyarrow\_wrap\_table
+* [ARROW-9445](https://issues.apache.org/jira/browse/ARROW-9445) - [Python] Revert Array.equals changes + expose comparison ops in compute
+* [ARROW-9446](https://issues.apache.org/jira/browse/ARROW-9446) - [C++] Export compiler information in BuildInfo
+* [ARROW-9447](https://issues.apache.org/jira/browse/ARROW-9447) - [Rust][DataFusion] Allow closures as ScalarUDFs
+* [ARROW-9452](https://issues.apache.org/jira/browse/ARROW-9452) - [Rust] [DataFusion] Improve performance of parquet scan
+* [ARROW-9470](https://issues.apache.org/jira/browse/ARROW-9470) - [CI][Java] Run Maven in parallel
+* [ARROW-9472](https://issues.apache.org/jira/browse/ARROW-9472) - [R] Provide configurable MetadataVersion in IPC API and environment variable to set default to V4 when needed
+* [ARROW-9473](https://issues.apache.org/jira/browse/ARROW-9473) - [Doc] Polishing for 1.0
+* [ARROW-9478](https://issues.apache.org/jira/browse/ARROW-9478) - [C++] Improve error message on unsupported cast types
+* [ARROW-9484](https://issues.apache.org/jira/browse/ARROW-9484) - [Docs] Update is\* functions to be is\_\* in the compute docs
+* [ARROW-9485](https://issues.apache.org/jira/browse/ARROW-9485) - [R] Better shared library stripping
+* [ARROW-9493](https://issues.apache.org/jira/browse/ARROW-9493) - [Python][Dataset] Dictionary encode string partition columns by default
+* [ARROW-9509](https://issues.apache.org/jira/browse/ARROW-9509) - [Release] Don't test Gandiva in the windows wheel verification script
+* [ARROW-9511](https://issues.apache.org/jira/browse/ARROW-9511) - [Packaging][Release] Set conda packages' build number to 0
+* [ARROW-9519](https://issues.apache.org/jira/browse/ARROW-9519) - [Rust] Improve error message when getting a field by name from schema
+* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel
+* [ARROW-9529](https://issues.apache.org/jira/browse/ARROW-9529) - [Dev][Release] Improvements to release verification scripts
+* [ARROW-9531](https://issues.apache.org/jira/browse/ARROW-9531) - [Packaging][Release] Update conda forge dependency pins
+* [PARQUET-1820](https://issues.apache.org/jira/browse/PARQUET-1820) - [C++] Use a column filter hint to inform read prefetching in Arrow reads
+* [PARQUET-1843](https://issues.apache.org/jira/browse/PARQUET-1843) - [C++] Unnecessary assignment in DictDecoderImpl::Decode
+* [PARQUET-1855](https://issues.apache.org/jira/browse/PARQUET-1855) - [C++] Improve documentation on MetaData ownership
+* [PARQUET-1861](https://issues.apache.org/jira/browse/PARQUET-1861) - [Documentation][C++] Explain ReaderProperties.buffer\_stream\*
+
+
+
+# Apache Arrow 0.17.1 (2020-05-18)
+
+## Bug Fixes
+
+* [ARROW-8503](https://issues.apache.org/jira/browse/ARROW-8503) - [Packaging][deb] Can't build apache-arrow-archive-keyring for RC
+* [ARROW-8505](https://issues.apache.org/jira/browse/ARROW-8505) - [Release][C\#] "sourcelink test" fails on Apache.Arrow.AssemblyInfo.cs
+* [ARROW-8584](https://issues.apache.org/jira/browse/ARROW-8584) - [Packaging][C++] Protobuf link error in deb builds
+* [ARROW-8608](https://issues.apache.org/jira/browse/ARROW-8608) - [C++] Update vendored mpark/variant.h to latest to fix NVCC compilation issues
+* [ARROW-8609](https://issues.apache.org/jira/browse/ARROW-8609) - [C++] ORC JNI bridge crashed on null arrow buffer
+* [ARROW-8641](https://issues.apache.org/jira/browse/ARROW-8641) - [Python] Regression in feather: no longer supports permutation in column selection
+* [ARROW-8657](https://issues.apache.org/jira/browse/ARROW-8657) - [Python][C++][Parquet] Forward compatibility issue from 0.16 to 0.17 when using version='2.0'
+* [ARROW-8684](https://issues.apache.org/jira/browse/ARROW-8684) - [Python] "SystemError: Bad call flags in \_PyMethodDef\_RawFastCallDict" in Python 3.7.7 on macOS when using pyarrow wheel
+* [ARROW-8694](https://issues.apache.org/jira/browse/ARROW-8694) - [Python][Parquet] parquet.read\_schema() fails when loading wide table created from Pandas DataFrame
+* [ARROW-8704](https://issues.apache.org/jira/browse/ARROW-8704) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz)
+* [ARROW-8706](https://issues.apache.org/jira/browse/ARROW-8706) - [C++][Parquet] Tracking JIRA for PARQUET-1857 (unencrypted INT16\_MAX Parquet row group limit)
+* [ARROW-8728](https://issues.apache.org/jira/browse/ARROW-8728) - [C++] Bitmap operation may cause buffer overflow
+* [ARROW-8741](https://issues.apache.org/jira/browse/ARROW-8741) - [Python][Packaging] Keep VS2015 for the Windows wheels
+* [ARROW-8750](https://issues.apache.org/jira/browse/ARROW-8750) - [Python] pyarrow.feather.write\_feather does not default to lz4 compression if it's available
+* [PARQUET-1857](https://issues.apache.org/jira/browse/PARQUET-1857) - [C++][Parquet] ParquetFileReader unable to read files with more than 32767 row groups
+
+
+## New Features and Improvements
+
+* [ARROW-8501](https://issues.apache.org/jira/browse/ARROW-8501) - [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6
+* [ARROW-8549](https://issues.apache.org/jira/browse/ARROW-8549) - [R] Assorted post-0.17 release cleanups
+* [ARROW-8699](https://issues.apache.org/jira/browse/ARROW-8699) - [R] Fix automatic r\_to\_py conversion
+* [ARROW-8758](https://issues.apache.org/jira/browse/ARROW-8758) - [R] Updates for compatibility with dplyr 1.0
+* [ARROW-8786](https://issues.apache.org/jira/browse/ARROW-8786) - [Packaging][rpm] Use bundled zstd in the CentOS 8 build
+
+
+
+# Apache Arrow 0.17.0 (2020-04-20)
+
+## Bug Fixes
+
+* [ARROW-1907](https://issues.apache.org/jira/browse/ARROW-1907) - [C++/Python] Feather format cannot accommodate string columns containing more than a total of 2GB of data
+* [ARROW-2255](https://issues.apache.org/jira/browse/ARROW-2255) - [Developer][Integration] Serialize schema- and field-level custom metadata in integration test JSON format
+* [ARROW-2587](https://issues.apache.org/jira/browse/ARROW-2587) - [Python] Unable to write StructArrays with multiple children to parquet
+* [ARROW-3004](https://issues.apache.org/jira/browse/ARROW-3004) - [Documentation] Builds docs for master rather than a pinned commit
+* [ARROW-3543](https://issues.apache.org/jira/browse/ARROW-3543) - [R] Better support for timestamp format and time zones in R
+* [ARROW-5265](https://issues.apache.org/jira/browse/ARROW-5265) - [Python/CI] Add integration test with kartothek
+* [ARROW-5473](https://issues.apache.org/jira/browse/ARROW-5473) - [C++] Build failure on googletest\_ep on Windows when using Ninja
+* [ARROW-5981](https://issues.apache.org/jira/browse/ARROW-5981) - [C++] DictionaryBuilder<T\> initialization with Array can fail silently
+* [ARROW-6528](https://issues.apache.org/jira/browse/ARROW-6528) - [C++] Spurious Flight test failures (port allocation failure)
+* [ARROW-6547](https://issues.apache.org/jira/browse/ARROW-6547) - [C++] valgrind errors in diff-test
+* [ARROW-6738](https://issues.apache.org/jira/browse/ARROW-6738) - [Java] Fix problems with current union comparison logic
+* [ARROW-6757](https://issues.apache.org/jira/browse/ARROW-6757) - [Python] Creating csv.ParseOptions() causes "Windows fatal exception: access violation" with Visual Studio 2017
+* [ARROW-6871](https://issues.apache.org/jira/browse/ARROW-6871) - [Java] Enhance TransferPair related parameters check and tests
+* [ARROW-6872](https://issues.apache.org/jira/browse/ARROW-6872) - [C++][Python] Empty table with dictionary-columns raises ArrowNotImplementedError
+* [ARROW-6890](https://issues.apache.org/jira/browse/ARROW-6890) - [Rust] [Parquet] ArrowReader fails with seg fault
+* [ARROW-6895](https://issues.apache.org/jira/browse/ARROW-6895) - [C++][Parquet] parquet::arrow::ColumnReader: ByteArrayDictionaryRecordReader repeats returned values when calling \`NextBatch()\`
+* [ARROW-7008](https://issues.apache.org/jira/browse/ARROW-7008) - [Python] pyarrow.chunked\_array([array]) fails on array with all-None buffers
+* [ARROW-7049](https://issues.apache.org/jira/browse/ARROW-7049) - [C++] warnings building on mingw-w64
+* [ARROW-7301](https://issues.apache.org/jira/browse/ARROW-7301) - [Java] Sql type DATE should correspond to DateDayVector
+* [ARROW-7335](https://issues.apache.org/jira/browse/ARROW-7335) - [C++][Gandiva] Add castBIGINT, extractDay interval\_day functions in Gandiva
+* [ARROW-7390](https://issues.apache.org/jira/browse/ARROW-7390) - [C++][Dataset] Concurrency race in Projector::Project
+* [ARROW-7405](https://issues.apache.org/jira/browse/ARROW-7405) - [Java] ListVector isEmpty API is incorrect
+* [ARROW-7466](https://issues.apache.org/jira/browse/ARROW-7466) - [CI][Java] Fix gandiva-jar-osx nightly build failure
+* [ARROW-7467](https://issues.apache.org/jira/browse/ARROW-7467) - [Java] ComplexCopier does incorrect copy for Map nullable info
+* [ARROW-7507](https://issues.apache.org/jira/browse/ARROW-7507) - [Rust] Bump Thrift version to 0.13 in parquet-format and parquet
+* [ARROW-7520](https://issues.apache.org/jira/browse/ARROW-7520) - [R] Writing many batches causes a crash
+* [ARROW-7546](https://issues.apache.org/jira/browse/ARROW-7546) - [Java] Use new implementation to concatenate vector values in batch
+* [ARROW-7624](https://issues.apache.org/jira/browse/ARROW-7624) - [Rust] Soundness issues via \`Buffer\` methods
+* [ARROW-7628](https://issues.apache.org/jira/browse/ARROW-7628) - [Python] Better document some read\_csv corner cases
+* [ARROW-7631](https://issues.apache.org/jira/browse/ARROW-7631) - [C++][Gandiva] return zero if there is an overflow while converting a decimal to a lower precision/scale
+* [ARROW-7672](https://issues.apache.org/jira/browse/ARROW-7672) - [C++] NULL pointer dereference bug
+* [ARROW-7680](https://issues.apache.org/jira/browse/ARROW-7680) - [C++][Dataset] Partition discovery is not working with windows path
+* [ARROW-7701](https://issues.apache.org/jira/browse/ARROW-7701) - [C++] [CI] Flight test error on macOS
+* [ARROW-7713](https://issues.apache.org/jira/browse/ARROW-7713) - [Java] TestLeak was put at the wrong location
+* [ARROW-7722](https://issues.apache.org/jira/browse/ARROW-7722) - [Java][FlightRPC] Memory leak
+* [ARROW-7734](https://issues.apache.org/jira/browse/ARROW-7734) - [C++] Segfault when comparing status with and without detail
+* [ARROW-7740](https://issues.apache.org/jira/browse/ARROW-7740) - [C++] Array internals corruption in StructArray::Flatten
+* [ARROW-7755](https://issues.apache.org/jira/browse/ARROW-7755) - [Python] Windows wheel cannot be installed on Python 3.8
+* [ARROW-7758](https://issues.apache.org/jira/browse/ARROW-7758) - [Python] Wrong conversion of timestamps that are out of bounds for pandas (eg 0000-01-01)
+* [ARROW-7760](https://issues.apache.org/jira/browse/ARROW-7760) - [Release] Fix verify-release-candidate.sh since pip3 seems to no longer be in miniconda
+* [ARROW-7762](https://issues.apache.org/jira/browse/ARROW-7762) - [Python] Exceptions in ParquetWriter get ignored
+* [ARROW-7766](https://issues.apache.org/jira/browse/ARROW-7766) - [Python][Packaging] Windows py38 wheels are built with wrong ABI tag
+* [ARROW-7772](https://issues.apache.org/jira/browse/ARROW-7772) - [R][C++][Dataset] Unable to filter on date32 object with date64 scalar
+* [ARROW-7775](https://issues.apache.org/jira/browse/ARROW-7775) - [Rust] Don't let safe code arbitrarily transmute readers and writers
+* [ARROW-7777](https://issues.apache.org/jira/browse/ARROW-7777) - [Go] StructBuilder/ListBuilder index out of range panic
+* [ARROW-7780](https://issues.apache.org/jira/browse/ARROW-7780) - [Release] Fix Windows wheel RC verification script given lack of "m" ABI tag in Python 3.8
+* [ARROW-7781](https://issues.apache.org/jira/browse/ARROW-7781) - [C++][Dataset] Filtering on a non-existent column gives a segfault
+* [ARROW-7783](https://issues.apache.org/jira/browse/ARROW-7783) - [C++] ARROW\_DATASET should enable ARROW\_COMPUTE
+* [ARROW-7785](https://issues.apache.org/jira/browse/ARROW-7785) - [C++] sparse\_tensor.cc is extremely slow to compile
+* [ARROW-7786](https://issues.apache.org/jira/browse/ARROW-7786) - [R] Wire up check\_metadata in Table.Equals method
+* [ARROW-7789](https://issues.apache.org/jira/browse/ARROW-7789) - [R] Can't initialize arrow objects when R.oo package is loaded
+* [ARROW-7791](https://issues.apache.org/jira/browse/ARROW-7791) - [C++][Parquet] Fix building error "cannot bind lvalue"
+* [ARROW-7792](https://issues.apache.org/jira/browse/ARROW-7792) - [R] read\_\* functions should close connection to file
+* [ARROW-7793](https://issues.apache.org/jira/browse/ARROW-7793) - [Java] If there is a leak the base allocator should release the excess memory to parent before throwing exception
+* [ARROW-7794](https://issues.apache.org/jira/browse/ARROW-7794) - [Rust] cargo publish fails for arrow-flight due to relative path to Flight.proto
+* [ARROW-7797](https://issues.apache.org/jira/browse/ARROW-7797) - [Release][Rust] Fix arrow-flight's version in datafusion crate
+* [ARROW-7802](https://issues.apache.org/jira/browse/ARROW-7802) - [C++] Support for LargeBinary and LargeString in the hash kernel
+* [ARROW-7806](https://issues.apache.org/jira/browse/ARROW-7806) - [Python] Implement to\_pandas for lists of LargeBinary/String
+* [ARROW-7807](https://issues.apache.org/jira/browse/ARROW-7807) - [R] Installation on RHEL 7 Cannot call io\_\_\_MemoryMappedFile\_\_Open()
+* [ARROW-7809](https://issues.apache.org/jira/browse/ARROW-7809) - [R] vignette does not run on Win 10 or Ubuntu
+* [ARROW-7813](https://issues.apache.org/jira/browse/ARROW-7813) - [Rust] Fix undefined behaviour and remove unsafe
+* [ARROW-7815](https://issues.apache.org/jira/browse/ARROW-7815) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz)
+* [ARROW-7827](https://issues.apache.org/jira/browse/ARROW-7827) - [Python] conda-forge pyarrow package does not have s3 enabled
+* [ARROW-7832](https://issues.apache.org/jira/browse/ARROW-7832) - [R] Patches to 0.16.0 release
+* [ARROW-7836](https://issues.apache.org/jira/browse/ARROW-7836) - [Rust] "allocate\_aligned"/"reallocate" need to initialize memory to avoid UB
+* [ARROW-7837](https://issues.apache.org/jira/browse/ARROW-7837) - [Java] bug in BaseVariableWidthVector.copyFromSafe results in an index out of bounds exception
+* [ARROW-7838](https://issues.apache.org/jira/browse/ARROW-7838) - [C++] Installed plasma-store-server fails finding Boost
+* [ARROW-7841](https://issues.apache.org/jira/browse/ARROW-7841) - [C++] HADOOP\_HOME doesn't work to find libhdfs.so
+* [ARROW-7844](https://issues.apache.org/jira/browse/ARROW-7844) - [R] array\_to\_vector is not thread safe
+* [ARROW-7848](https://issues.apache.org/jira/browse/ARROW-7848) - Add doc for MapType
+* [ARROW-7852](https://issues.apache.org/jira/browse/ARROW-7852) - [Python] 0.16.0 wheels not compatible with older numpy
+* [ARROW-7857](https://issues.apache.org/jira/browse/ARROW-7857) - [Python] Failing test with pandas master for extension type conversion
+* [ARROW-7861](https://issues.apache.org/jira/browse/ARROW-7861) - [C++][Parquet] Add fuzz regression corpus for parquet reader
+* [ARROW-7884](https://issues.apache.org/jira/browse/ARROW-7884) - [C++][Python] Crash in pq.read\_table()
+* [ARROW-7887](https://issues.apache.org/jira/browse/ARROW-7887) - [Rust] Filter kernel does not support temporal types
+* [ARROW-7889](https://issues.apache.org/jira/browse/ARROW-7889) - [Rust] Datafusion CLI does not support registering Parquet files
+* [ARROW-7899](https://issues.apache.org/jira/browse/ARROW-7899) - [Integration][Java] null type integration test
+* [ARROW-7908](https://issues.apache.org/jira/browse/ARROW-7908) - [R] Can't install package without setting LIBARROW\_DOWNLOAD=true
+* [ARROW-7922](https://issues.apache.org/jira/browse/ARROW-7922) - [CI][Crossbow] Nightly macOS wheel builds fail (brew bundle edition)
+* [ARROW-7923](https://issues.apache.org/jira/browse/ARROW-7923) - [CI][Crossbow] macOS autobrew fails on homebrew-versions
+* [ARROW-7926](https://issues.apache.org/jira/browse/ARROW-7926) - [Developer] "archery lint" target is not ergonomic for running a single check like IWYU
+* [ARROW-7928](https://issues.apache.org/jira/browse/ARROW-7928) - [Python] Example of flight server and client not working
+* [ARROW-7931](https://issues.apache.org/jira/browse/ARROW-7931) - [C++] Fix crash on corrupt Map array input (OSS-Fuzz)
+* [ARROW-7936](https://issues.apache.org/jira/browse/ARROW-7936) - [Python] FileSystem.from\_uri test fails on python 3.5
+* [ARROW-7940](https://issues.apache.org/jira/browse/ARROW-7940) - [C++] Unable to generate cmake build with settings other than default
+* [ARROW-7944](https://issues.apache.org/jira/browse/ARROW-7944) - [Python] Test failures without Pandas
+* [ARROW-7956](https://issues.apache.org/jira/browse/ARROW-7956) - [Python] Memory leak in pyarrow functions .ipc.serialize\_pandas/deserialize\_pandas
+* [ARROW-7958](https://issues.apache.org/jira/browse/ARROW-7958) - [Java] Update Avro to version 1.9.2
+* [ARROW-7962](https://issues.apache.org/jira/browse/ARROW-7962) - [R][Dataset] Followup to "Consolidate Source and Dataset classes"
+* [ARROW-7968](https://issues.apache.org/jira/browse/ARROW-7968) - [C++] orc\_ep build fails on 64-bit Raspbian
+* [ARROW-7973](https://issues.apache.org/jira/browse/ARROW-7973) - [Developer][C++] ResourceWarnings in run\_cpplint.py
+* [ARROW-7974](https://issues.apache.org/jira/browse/ARROW-7974) - [Developer][C++] ResourceWarning in "make check-format"
+* [ARROW-7975](https://issues.apache.org/jira/browse/ARROW-7975) - [C++] Do not include padding bytes in "Buffer" IPC metadata accounting
+* [ARROW-7978](https://issues.apache.org/jira/browse/ARROW-7978) - [Developer] GitHub Actions "lint" task is running include-what-you-use and failing
+* [ARROW-7980](https://issues.apache.org/jira/browse/ARROW-7980) - [Python] Deserialization with pyarrow fails for certain Timestamp-based data frame
+* [ARROW-7981](https://issues.apache.org/jira/browse/ARROW-7981) - [C++][Dataset] Fails to compile on gcc 5.4
+* [ARROW-7985](https://issues.apache.org/jira/browse/ARROW-7985) - [C++] ListBuilder.Finish fails if underlying value builder is empty and .Reserve'd
+* [ARROW-7990](https://issues.apache.org/jira/browse/ARROW-7990) - [C++][Developer] Add "archery lint" option for running "iwyu.sh all"
+* [ARROW-7992](https://issues.apache.org/jira/browse/ARROW-7992) - [C++] MSVC warning causing Appveyor failure in sort\_to\_indices.cc
+* [ARROW-7996](https://issues.apache.org/jira/browse/ARROW-7996) - [Python] Error serializing empty pandas DataFrame with pyarrow
+* [ARROW-7997](https://issues.apache.org/jira/browse/ARROW-7997) - [Python] Schema equals method with inconsistent docs in pyarrow
+* [ARROW-7999](https://issues.apache.org/jira/browse/ARROW-7999) - [C++] Fix crash on corrupt Map array input (OSS-Fuzz)
+* [ARROW-8000](https://issues.apache.org/jira/browse/ARROW-8000) - [C++] gcc 4.8 build failures
+* [ARROW-8003](https://issues.apache.org/jira/browse/ARROW-8003) - [C++] -DBZip2\_SOURCE=BUNDLED fails when building with clang
+* [ARROW-8006](https://issues.apache.org/jira/browse/ARROW-8006) - [C++] Unsafe arrow dictionary recovered from parquet
+* [ARROW-8007](https://issues.apache.org/jira/browse/ARROW-8007) - [Python] Remove unused and defunct assert\_get\_object\_equal in plasma tests
+* [ARROW-8008](https://issues.apache.org/jira/browse/ARROW-8008) - [C++/Python] Framework Python is preferred even though not the activated one
+* [ARROW-8009](https://issues.apache.org/jira/browse/ARROW-8009) - [Java] Fix the hash code methods for BitVector
+* [ARROW-8011](https://issues.apache.org/jira/browse/ARROW-8011) - [C++] Some buffers not resized when reading from Parquet
+* [ARROW-8013](https://issues.apache.org/jira/browse/ARROW-8013) - [Python][Packaging] Fix manylinux wheels
+* [ARROW-8021](https://issues.apache.org/jira/browse/ARROW-8021) - [Python] Appveyor does not appear to be including pandas in test runs
+* [ARROW-8029](https://issues.apache.org/jira/browse/ARROW-8029) - [R] rstudio/r-base:3.6-centos7 GHA build failing on master
+* [ARROW-8036](https://issues.apache.org/jira/browse/ARROW-8036) - [C++] Compilation failure with gtest 1.10.0
+* [ARROW-8042](https://issues.apache.org/jira/browse/ARROW-8042) - [Python] pyarrow.ChunkedArray docstring is incorrect regarding zero-length ChunkedArray having no chunks
+* [ARROW-8057](https://issues.apache.org/jira/browse/ARROW-8057) - [Python] Don't check Schema metadata in \_\_eq\_\_ and \_\_ne\_\_
+* [ARROW-8070](https://issues.apache.org/jira/browse/ARROW-8070) - [C++] Cast segfaults on unsupported cast from list<binary\> to utf8
+* [ARROW-8071](https://issues.apache.org/jira/browse/ARROW-8071) - [GLib] Build error with configure
+* [ARROW-8075](https://issues.apache.org/jira/browse/ARROW-8075) - [R] Loading R.utils after arrow breaks some arrow functions
+* [ARROW-8088](https://issues.apache.org/jira/browse/ARROW-8088) - [C++][Dataset] Partition columns with specified dictionary type result in all nulls
+* [ARROW-8091](https://issues.apache.org/jira/browse/ARROW-8091) - [CI][Crossbow] Fix nightly homebrew and R failures
+* [ARROW-8092](https://issues.apache.org/jira/browse/ARROW-8092) - [CI][Crossbow] OSX wheels fail on bundled bzip2
+* [ARROW-8094](https://issues.apache.org/jira/browse/ARROW-8094) - [CI][Crossbow] Nightly valgrind test fails
+* [ARROW-8095](https://issues.apache.org/jira/browse/ARROW-8095) - [CI][Crossbow] Nightly turbodbc job fails
+* [ARROW-8098](https://issues.apache.org/jira/browse/ARROW-8098) - [Go] Checkptr Failures on Go 1.14
+* [ARROW-8099](https://issues.apache.org/jira/browse/ARROW-8099) - [Integration] archery integration --with-LANG flags don't work
+* [ARROW-8101](https://issues.apache.org/jira/browse/ARROW-8101) - [FlightRPC][Java] Can't read/write only an empty null array
+* [ARROW-8102](https://issues.apache.org/jira/browse/ARROW-8102) - [Dev] Crossbow's version detection doesn't work in the comment bot's scenario
+* [ARROW-8105](https://issues.apache.org/jira/browse/ARROW-8105) - [Python] pyarrow.array segfaults when passed masked array with shrunken mask
+* [ARROW-8106](https://issues.apache.org/jira/browse/ARROW-8106) - [Python] Builds on master broken by pandas 1.0.2 release
+* [ARROW-8110](https://issues.apache.org/jira/browse/ARROW-8110) - [C\#] BuildArrays fails if NestedType is included
+* [ARROW-8112](https://issues.apache.org/jira/browse/ARROW-8112) - [FlightRPC][C++] Some status codes don't round-trip through gRPC
+* [ARROW-8119](https://issues.apache.org/jira/browse/ARROW-8119) - [Dev] Make Yaml optional dependency for archery
+* [ARROW-8122](https://issues.apache.org/jira/browse/ARROW-8122) - [Python] Empty numpy arrays with shape cannot be deserialized
+* [ARROW-8125](https://issues.apache.org/jira/browse/ARROW-8125) - [C++] "arrow-tests" target broken with ninja build
+* [ARROW-8127](https://issues.apache.org/jira/browse/ARROW-8127) - [C++] [Parquet] Incorrect column chunk metadata for multipage batch writes
+* [ARROW-8128](https://issues.apache.org/jira/browse/ARROW-8128) - [C\#] NestedType children serialized on wrong length
+* [ARROW-8132](https://issues.apache.org/jira/browse/ARROW-8132) - [C++] arrow-s3fs-test failing on master
+* [ARROW-8133](https://issues.apache.org/jira/browse/ARROW-8133) - [CI] Github Actions sometimes fail to checkout Arrow
+* [ARROW-8136](https://issues.apache.org/jira/browse/ARROW-8136) - [C++][Python] Creating dataset from relative path no longer working
+* [ARROW-8138](https://issues.apache.org/jira/browse/ARROW-8138) - [C++] parquet::arrow::FileReader cannot read multiple RowGroup
+* [ARROW-8139](https://issues.apache.org/jira/browse/ARROW-8139) - [C++] FileSystem enum causes attributes warning
+* [ARROW-8142](https://issues.apache.org/jira/browse/ARROW-8142) - [C++] Casting a chunked array with 0 chunks critical failure
+* [ARROW-8144](https://issues.apache.org/jira/browse/ARROW-8144) - [CI] Cmake 3.2 nightly build fails
+* [ARROW-8154](https://issues.apache.org/jira/browse/ARROW-8154) - [Python] HDFS Filesystem does not set environment variables in pyarrow 0.16.0 release
+* [ARROW-8159](https://issues.apache.org/jira/browse/ARROW-8159) - [Python] pyarrow.Schema.from\_pandas doesn't support ExtensionDtype
+* [ARROW-8166](https://issues.apache.org/jira/browse/ARROW-8166) - [C++] AVX512 intrinsics fail to compile with clang-8 on Ubuntu 18.04
+* [ARROW-8176](https://issues.apache.org/jira/browse/ARROW-8176) - [FlightRPC][Integration] Have Flight services bind to port 0 in integration
+* [ARROW-8186](https://issues.apache.org/jira/browse/ARROW-8186) - [Python] Dataset expression != returns bool instead of expression for invalid value
+* [ARROW-8188](https://issues.apache.org/jira/browse/ARROW-8188) - [R] Adapt to latest checks in R-devel
+* [ARROW-8193](https://issues.apache.org/jira/browse/ARROW-8193) - [C++] arrow-future-test fails to compile on gcc 4.8
+* [ARROW-8197](https://issues.apache.org/jira/browse/ARROW-8197) - [Rust] DataFusion "create\_physical\_plan" returns incorrect schema?
+* [ARROW-8206](https://issues.apache.org/jira/browse/ARROW-8206) - [R] Minor fix for backwards compatibility on Linux installation
+* [ARROW-8209](https://issues.apache.org/jira/browse/ARROW-8209) - [Python] Accessing duplicate column of Table by name gives wrong error
+* [ARROW-8213](https://issues.apache.org/jira/browse/ARROW-8213) - [Python][Dataset] Opening a dataset with a local incorrect path gives confusing error message
+* [ARROW-8216](https://issues.apache.org/jira/browse/ARROW-8216) - [R][C++][Dataset] Filtering returns all-missing rows where the filtering column is missing
+* [ARROW-8217](https://issues.apache.org/jira/browse/ARROW-8217) - [R][C++] Fix crashing test in test-dataset.R on 32-bit Windows from ARROW-7979
+* [ARROW-8219](https://issues.apache.org/jira/browse/ARROW-8219) - [Rust] sqlparser crate needs to be bumped to version 0.2.5
+* [ARROW-8223](https://issues.apache.org/jira/browse/ARROW-8223) - [Python] Schema.from\_pandas breaks with pandas nullable integer dtype
+* [ARROW-8233](https://issues.apache.org/jira/browse/ARROW-8233) - [CI] Build timeouts on "AMD64 Windows MinGW 64 GLib & Ruby "
+* [ARROW-8234](https://issues.apache.org/jira/browse/ARROW-8234) - [CI] Build timeouts on "AMD64 Windows RTools 35"
+* [ARROW-8236](https://issues.apache.org/jira/browse/ARROW-8236) - [Rust] Linting GitHub Actions task failing
+* [ARROW-8237](https://issues.apache.org/jira/browse/ARROW-8237) - [Python] Review Developer build instructions for conda and non-conda users
+* [ARROW-8238](https://issues.apache.org/jira/browse/ARROW-8238) - [C++][Compute] Failed to build compute tests on windows with msvc2015
+* [ARROW-8239](https://issues.apache.org/jira/browse/ARROW-8239) - [Java] fix param checks in splitAndTransfer method
+* [ARROW-8245](https://issues.apache.org/jira/browse/ARROW-8245) - [Python][Parquet] Skip hidden directories when reading partitioned parquet files
+* [ARROW-8254](https://issues.apache.org/jira/browse/ARROW-8254) - [Rust] [DataFusion] CLI is not working as expected
+* [ARROW-8255](https://issues.apache.org/jira/browse/ARROW-8255) - [Rust] [DataFusion] COUNT(\*) results in confusing error
+* [ARROW-8259](https://issues.apache.org/jira/browse/ARROW-8259) - [Rust] [DataFusion] ProjectionPushDownRule does not rewrite LIMIT
+* [ARROW-8268](https://issues.apache.org/jira/browse/ARROW-8268) - [Ruby] Test failure due to lack of built ZSTD support
+* [ARROW-8269](https://issues.apache.org/jira/browse/ARROW-8269) - [Python] Failure in "nopandas" build in test\_parquet\_row\_group\_fragments
+* [ARROW-8270](https://issues.apache.org/jira/browse/ARROW-8270) - [Python][Flight] Example Flight server with TLS's certificate and key is not working
+* [ARROW-8272](https://issues.apache.org/jira/browse/ARROW-8272) - [CI][Python] Test failure on Ubuntu 16.04
+* [ARROW-8274](https://issues.apache.org/jira/browse/ARROW-8274) - [C++] Use LZ4 frame format for "LZ4" compression in IPC write
+* [ARROW-8276](https://issues.apache.org/jira/browse/ARROW-8276) - [C++][Dataset] Scanning a Fragment does not take into account the partition columns
+* [ARROW-8280](https://issues.apache.org/jira/browse/ARROW-8280) - [C++] MinGW builds failing due to CARES-related toolchain issue
+* [ARROW-8286](https://issues.apache.org/jira/browse/ARROW-8286) - [Python] Creating dataset from pathlib results in UnionDataset instead of FileSystemDataset
+* [ARROW-8298](https://issues.apache.org/jira/browse/ARROW-8298) - [C++][CI] MinGW builds fail building grpc
+* [ARROW-8303](https://issues.apache.org/jira/browse/ARROW-8303) - [Python] Fix test failure caused by non-deterministic dict key ordering on Python 3.5
+* [ARROW-8304](https://issues.apache.org/jira/browse/ARROW-8304) - [Flight][Python] Flight client with TLS root certificate is reporting error on do\_get()
+* [ARROW-8305](https://issues.apache.org/jira/browse/ARROW-8305) - [Java] ExtensionTypeVector should make sure underlyingVector not null
+* [ARROW-8310](https://issues.apache.org/jira/browse/ARROW-8310) - [C++] Minio's exceptions not recognized by IsConnectError()
+* [ARROW-8315](https://issues.apache.org/jira/browse/ARROW-8315) - [Python][Dataset] Don't rely on ordered dict keys in test\_dataset.py
+* [ARROW-8323](https://issues.apache.org/jira/browse/ARROW-8323) - [C++] Pin gRPC at v1.27 to avoid compilation error in its headers
+* [ARROW-8326](https://issues.apache.org/jira/browse/ARROW-8326) - [C++] Don't use deprecated TYPED\_TEST\_CASE
+* [ARROW-8327](https://issues.apache.org/jira/browse/ARROW-8327) - [FlightRPC][Java] gRPC trailers may be null
+* [ARROW-8331](https://issues.apache.org/jira/browse/ARROW-8331) - [C++] arrow-compute-filter-benchmark fails to compile
+* [ARROW-8333](https://issues.apache.org/jira/browse/ARROW-8333) - [C++][CI] Always compile benchmarks in some C++ CI entry
+* [ARROW-8334](https://issues.apache.org/jira/browse/ARROW-8334) - [C++] [Gandiva] Missing DATE32 in LLVM Types / Simple D32 Compute Functions
+* [ARROW-8342](https://issues.apache.org/jira/browse/ARROW-8342) - [Python] dask and kartothek integration tests are failing
+* [ARROW-8345](https://issues.apache.org/jira/browse/ARROW-8345) - [Python] feather.read\_table should not require pandas
+* [ARROW-8346](https://issues.apache.org/jira/browse/ARROW-8346) - [CI][Ruby] GLib/Ruby macOS build fails on zlib
+* [ARROW-8349](https://issues.apache.org/jira/browse/ARROW-8349) - [CI][NIGHTLY:gandiva-jar-osx] Use latest pygit2
+* [ARROW-8353](https://issues.apache.org/jira/browse/ARROW-8353) - [C++] is\_nullable may not be initialized in the Parquet writer
+* [ARROW-8354](https://issues.apache.org/jira/browse/ARROW-8354) - [R] Fix segfault in Table to Array conversion
+* [ARROW-8357](https://issues.apache.org/jira/browse/ARROW-8357) - [Rust] [DataFusion] Dockerfile for CLI is missing format dir
+* [ARROW-8358](https://issues.apache.org/jira/browse/ARROW-8358) - [C++] Fix -Wrange-loop-construct warnings in clang-11
+* [ARROW-8365](https://issues.apache.org/jira/browse/ARROW-8365) - [C++] Error when writing files to S3 larger than 5 GB
+* [ARROW-8366](https://issues.apache.org/jira/browse/ARROW-8366) - [Rust] Need to revert recent arrow-flight build change
+* [ARROW-8369](https://issues.apache.org/jira/browse/ARROW-8369) - [CI] Fix crossbow wildcard groups
+* [ARROW-8373](https://issues.apache.org/jira/browse/ARROW-8373) - [GLib] Problems resolving gobject-introspection, arrow in Meson builds
+* [ARROW-8380](https://issues.apache.org/jira/browse/ARROW-8380) - [Rust] StringDictionaryBuilder not publicly exported from arrow::array
+* [ARROW-8384](https://issues.apache.org/jira/browse/ARROW-8384) - [C++][Python] arrow/filesystem/hdfs.h and Python wrapper does not have an option for setting a path to a Kerberos ticket
+* [ARROW-8386](https://issues.apache.org/jira/browse/ARROW-8386) - [Python] pyarrow.jvm raises error for empty Arrays
+* [ARROW-8388](https://issues.apache.org/jira/browse/ARROW-8388) - [C++] GCC 4.8 fails to move on return
+* [ARROW-8397](https://issues.apache.org/jira/browse/ARROW-8397) - [C++] Fail to compile aggregate\_test.cc on Ubuntu 16.04
+* [ARROW-8406](https://issues.apache.org/jira/browse/ARROW-8406) - [Python] test\_fs fails when run from a different drive on Windows
+* [ARROW-8410](https://issues.apache.org/jira/browse/ARROW-8410) - [C++] CMake fails on aarch64 systems that do not support -march=armv8-a+crc+crypto
+* [ARROW-8414](https://issues.apache.org/jira/browse/ARROW-8414) - [Python] Non-deterministic row order failure in test\_parquet.py
+* [ARROW-8415](https://issues.apache.org/jira/browse/ARROW-8415) - [C++][Packaging] fix gandiva linux job
+* [ARROW-8416](https://issues.apache.org/jira/browse/ARROW-8416) - [Python] Provide a "feather" alias in the dataset API
+* [ARROW-8420](https://issues.apache.org/jira/browse/ARROW-8420) - [C++] CMake fails to configure on armv7l platform (e.g. Raspberry Pi 3)
+* [ARROW-8427](https://issues.apache.org/jira/browse/ARROW-8427) - [C++][Dataset] Do not ignore file paths with underscore/dot when full path was specified
+* [ARROW-8428](https://issues.apache.org/jira/browse/ARROW-8428) - [C++][NIGHTLY:gandiva-jar-trusty] GCC 4.8 failures in C++ unit tests
+* [ARROW-8429](https://issues.apache.org/jira/browse/ARROW-8429) - [C++] Fix Buffer::CopySlice on 0-sized buffer
+* [ARROW-8432](https://issues.apache.org/jira/browse/ARROW-8432) - [Python][CI] Failure to download Hadoop
+* [ARROW-8437](https://issues.apache.org/jira/browse/ARROW-8437) - [C++] Remove std::move return value from MakeRandomNullBitmap test utility
+* [ARROW-8438](https://issues.apache.org/jira/browse/ARROW-8438) - [C++] arrow-io-memory-benchmark crashes
+* [ARROW-8439](https://issues.apache.org/jira/browse/ARROW-8439) - [Python] Filesystem docs are outdated
+* [ARROW-8441](https://issues.apache.org/jira/browse/ARROW-8441) - [C++] Fix crashes on invalid input (OSS-Fuzz)
+* [ARROW-8442](https://issues.apache.org/jira/browse/ARROW-8442) - [Python] NullType.to\_pandas\_dtype inconsistent with dtype returned in to\_pandas/to\_numpy
+* [ARROW-8460](https://issues.apache.org/jira/browse/ARROW-8460) - [Packaging][deb] Ubuntu Focal build fails
+* [ARROW-8465](https://issues.apache.org/jira/browse/ARROW-8465) - [Packaging][Python] Windows py35 wheel build fails because of boost
+* [ARROW-8466](https://issues.apache.org/jira/browse/ARROW-8466) - [Packaging] The python unittests are not running in the windows wheel builds
+* [ARROW-8468](https://issues.apache.org/jira/browse/ARROW-8468) - [Document] Fix the incorrect null bits description
+* [ARROW-8469](https://issues.apache.org/jira/browse/ARROW-8469) - [Dev] Fix nightly docker tests on azure
+* [ARROW-8478](https://issues.apache.org/jira/browse/ARROW-8478) - [Java] Rollback contrib package changes.
+* [ARROW-8498](https://issues.apache.org/jira/browse/ARROW-8498) - [Python] Schema.from\_pandas fails on extension type, while Table.from\_pandas works
+* [PARQUET-1780](https://issues.apache.org/jira/browse/PARQUET-1780) - [C++] Set ColumnMetadata.encoding\_stats field
+* [PARQUET-1788](https://issues.apache.org/jira/browse/PARQUET-1788) - [C++] ColumnWriter has undefined behavior when writing arrow chunks
+* [PARQUET-1797](https://issues.apache.org/jira/browse/PARQUET-1797) - [C++] Fix fuzzing errors
+* [PARQUET-1799](https://issues.apache.org/jira/browse/PARQUET-1799) - [C++] Stream API: Relax schema checking when reading
+* [PARQUET-1810](https://issues.apache.org/jira/browse/PARQUET-1810) - [C++] Fix undefined behaviour on invalid enum values (OSS-Fuzz)
+* [PARQUET-1813](https://issues.apache.org/jira/browse/PARQUET-1813) - [C++] Remove logging statement in unit test
+* [PARQUET-1819](https://issues.apache.org/jira/browse/PARQUET-1819) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz)
+* [PARQUET-1823](https://issues.apache.org/jira/browse/PARQUET-1823) - [C++] Invalid RowGroup returned when reading with parquet::arrow::FileReader-\>RowGroup(i)-\>Column(j)
+* [PARQUET-1824](https://issues.apache.org/jira/browse/PARQUET-1824) - [C++] Fix crashes on invalid input (OSS-Fuzz)
+* [PARQUET-1829](https://issues.apache.org/jira/browse/PARQUET-1829) - [C++] Fix crashes on invalid input (OSS-Fuzz)
+* [PARQUET-1831](https://issues.apache.org/jira/browse/PARQUET-1831) - [C++] Fix crashes on invalid input (OSS-Fuzz)
+* [PARQUET-1835](https://issues.apache.org/jira/browse/PARQUET-1835) - [C++] Fix crashes on invalid input (OSS-Fuzz)
+
+
+## New Features and Improvements
+
+* [ARROW-590](https://issues.apache.org/jira/browse/ARROW-590) - [Integration] Add integration tests for Union types
+* [ARROW-1470](https://issues.apache.org/jira/browse/ARROW-1470) - [C++] Add BufferAllocator abstract interface
+* [ARROW-1560](https://issues.apache.org/jira/browse/ARROW-1560) - [C++] Kernel implementations for "match" function
+* [ARROW-1571](https://issues.apache.org/jira/browse/ARROW-1571) - [C++] Implement argsort kernels (sort indices) for integers using O(n) counting sort
+* [ARROW-1581](https://issues.apache.org/jira/browse/ARROW-1581) - [Packaging] Tooling to make nightly wheels available for install
+* [ARROW-1582](https://issues.apache.org/jira/browse/ARROW-1582) - [Python] Set up + document nightly conda builds for macOS
+* [ARROW-1636](https://issues.apache.org/jira/browse/ARROW-1636) - [Format] Integration tests for null type
+* [ARROW-2447](https://issues.apache.org/jira/browse/ARROW-2447) - [C++] Create a device abstraction
+* [ARROW-2882](https://issues.apache.org/jira/browse/ARROW-2882) - [C++][Python] Support AWS Firehose partition\_scheme implementation for Parquet datasets
+* [ARROW-3054](https://issues.apache.org/jira/browse/ARROW-3054) - [Packaging] Tooling to enable nightly conda packages to be updated to some anaconda.org channel
+* [ARROW-3410](https://issues.apache.org/jira/browse/ARROW-3410) - [C++][Dataset] Streaming CSV reader interface for memory-constrained environments
+* [ARROW-3750](https://issues.apache.org/jira/browse/ARROW-3750) - [R] Pass various wrapped Arrow objects created in Python into R with zero copy via reticulate
+* [ARROW-4120](https://issues.apache.org/jira/browse/ARROW-4120) - [Python] Define process for testing procedures that check for no macro-level memory leaks
+* [ARROW-4226](https://issues.apache.org/jira/browse/ARROW-4226) - [Format][C++] Add CSF sparse tensor support
+* [ARROW-4286](https://issues.apache.org/jira/browse/ARROW-4286) - [C++/R] Namespace vendored Boost
+* [ARROW-4304](https://issues.apache.org/jira/browse/ARROW-4304) - [Rust] Enhance documentation for arrow
+* [ARROW-4428](https://issues.apache.org/jira/browse/ARROW-4428) - [R] Feature flags for R build
+* [ARROW-4482](https://issues.apache.org/jira/browse/ARROW-4482) - [Website] Add blog archive page
+* [ARROW-4815](https://issues.apache.org/jira/browse/ARROW-4815) - [Rust] [DataFusion] Add support for \* in SQL projection
+* [ARROW-5357](https://issues.apache.org/jira/browse/ARROW-5357) - [Rust] Add capacity field in Buffer
+* [ARROW-5405](https://issues.apache.org/jira/browse/ARROW-5405) - [Documentation] Move integration testing documentation to Sphinx docs, add instructions for JavaScript
+* [ARROW-5497](https://issues.apache.org/jira/browse/ARROW-5497) - [Release] Build and publish R/Java/JS docs
+* [ARROW-5501](https://issues.apache.org/jira/browse/ARROW-5501) - [R] Reorganize read/write file/stream functions
+* [ARROW-5510](https://issues.apache.org/jira/browse/ARROW-5510) - [Format] Feather V2 based on Arrow IPC file format, with compression support
+* [ARROW-5563](https://issues.apache.org/jira/browse/ARROW-5563) - [Format] Update integration test JSON format documentation
+* [ARROW-5585](https://issues.apache.org/jira/browse/ARROW-5585) - [Go] rename arrow.TypeEquals to arrow.TypeEqual
+* [ARROW-5742](https://issues.apache.org/jira/browse/ARROW-5742) - [CI] Add daily / weekly Valgrind build
+* [ARROW-5757](https://issues.apache.org/jira/browse/ARROW-5757) - [Python] Stop supporting Python 2.7
+* [ARROW-5949](https://issues.apache.org/jira/browse/ARROW-5949) - [Rust] Implement DictionaryArray
+* [ARROW-6165](https://issues.apache.org/jira/browse/ARROW-6165) - [Integration] Use multiprocessing to run integration tests on multiple CPU cores
+* [ARROW-6176](https://issues.apache.org/jira/browse/ARROW-6176) - [Python] Allow to subclass ExtensionArray to attach to custom extension type
+* [ARROW-6275](https://issues.apache.org/jira/browse/ARROW-6275) - [C++] Deprecate RecordBatchReader::ReadNext
+* [ARROW-6393](https://issues.apache.org/jira/browse/ARROW-6393) - [C++] Add EqualOptions support in SparseTensor::Equals
+* [ARROW-6479](https://issues.apache.org/jira/browse/ARROW-6479) - [C++] inline errors from external projects' build logs
+* [ARROW-6510](https://issues.apache.org/jira/browse/ARROW-6510) - [Python][Filesystem] Expose nanosecond resolution mtime
+* [ARROW-6666](https://issues.apache.org/jira/browse/ARROW-6666) - [Rust] [DataFusion] Implement string literal expression
+* [ARROW-6724](https://issues.apache.org/jira/browse/ARROW-6724) - [C++] Add simpler static ctor for BufferOutputStream than the current Create function
+* [ARROW-6821](https://issues.apache.org/jira/browse/ARROW-6821) - [C++][Parquet] Do not require Thrift compiler when building (but still require library)
+* [ARROW-6823](https://issues.apache.org/jira/browse/ARROW-6823) - [C++][Python][R] Support metadata in the feather format?
+* [ARROW-6829](https://issues.apache.org/jira/browse/ARROW-6829) - [Docs] Migrate integration test docs to Sphinx, fix instructions after ARROW-6466
+* [ARROW-6837](https://issues.apache.org/jira/browse/ARROW-6837) - [C++/Python] access File Footer custom\_metadata
+* [ARROW-6841](https://issues.apache.org/jira/browse/ARROW-6841) - [C++] Upgrade to LLVM 8
+* [ARROW-6875](https://issues.apache.org/jira/browse/ARROW-6875) - [FlightRPC] Implement Criteria for ListFlights RPC / list\_flights method
+* [ARROW-6915](https://issues.apache.org/jira/browse/ARROW-6915) - [Developer] Do not overwrite minor release version with merge script, even if not specified by committer
+* [ARROW-6947](https://issues.apache.org/jira/browse/ARROW-6947) - [Rust] [DataFusion] Add support for scalar UDFs
+* [ARROW-6996](https://issues.apache.org/jira/browse/ARROW-6996) - [Python] Expose boolean filter kernel on Table
+* [ARROW-7044](https://issues.apache.org/jira/browse/ARROW-7044) - [Release] Create a post-release script for the Homebrew formulas
+* [ARROW-7048](https://issues.apache.org/jira/browse/ARROW-7048) - [Java] Support for combining multiple vectors under VectorSchemaRoot
+* [ARROW-7063](https://issues.apache.org/jira/browse/ARROW-7063) - [C++] Schema print method prints too much metadata
+* [ARROW-7073](https://issues.apache.org/jira/browse/ARROW-7073) - [Java] Support concatenating vector values in batch
+* [ARROW-7080](https://issues.apache.org/jira/browse/ARROW-7080) - [Python][Parquet][C++] Expose parquet field\_id in Schema objects
+* [ARROW-7091](https://issues.apache.org/jira/browse/ARROW-7091) - [C++] Move all factories to type\_fwd.h
+* [ARROW-7119](https://issues.apache.org/jira/browse/ARROW-7119) - [C++][CI] Use scripts/util\_coredump.sh to show automatic backtraces
+* [ARROW-7201](https://issues.apache.org/jira/browse/ARROW-7201) - [GLib][Gandiva] Add support for BooleanNode
+* [ARROW-7202](https://issues.apache.org/jira/browse/ARROW-7202) - [R][CI] Improve rwinlib building on CI to stop re-downloading dependencies
+* [ARROW-7222](https://issues.apache.org/jira/browse/ARROW-7222) - [Python][Release] Wipe any existing generated Python API documentation when updating website
+* [ARROW-7233](https://issues.apache.org/jira/browse/ARROW-7233) - [C++] Add Result<T\> APIs to IPC module
+* [ARROW-7256](https://issues.apache.org/jira/browse/ARROW-7256) - [C++] Remove ARROW\_MEMORY\_POOL\_DEFAULT macro
+* [ARROW-7330](https://issues.apache.org/jira/browse/ARROW-7330) - [C++] Add Result<T\> APIs to arrow/gpu
+* [ARROW-7332](https://issues.apache.org/jira/browse/ARROW-7332) - [C++][Parquet] Explicitly catch status exceptions in PARQUET\_CATCH\_NOT\_OK
+* [ARROW-7336](https://issues.apache.org/jira/browse/ARROW-7336) - [C++] Implement MinMax options to not skip nulls
+* [ARROW-7338](https://issues.apache.org/jira/browse/ARROW-7338) - [C++] Improve InMemoryDataSource to support generator instead of static list
+* [ARROW-7365](https://issues.apache.org/jira/browse/ARROW-7365) - [Python] Support FixedSizeList type in conversion to numpy/pandas
+* [ARROW-7373](https://issues.apache.org/jira/browse/ARROW-7373) - [C++][Dataset] Remove FileSource
+* [ARROW-7400](https://issues.apache.org/jira/browse/ARROW-7400) - [Java] Avoid the worst case for quick sort
+* [ARROW-7412](https://issues.apache.org/jira/browse/ARROW-7412) - [C++][Dataset] Ensure that dataset code is robust to schemas with duplicate field names
+* [ARROW-7419](https://issues.apache.org/jira/browse/ARROW-7419) - [Python] Support SparseCSCMatrix
+* [ARROW-7427](https://issues.apache.org/jira/browse/ARROW-7427) - [Python] Support SparseCSFTensor
+* [ARROW-7428](https://issues.apache.org/jira/browse/ARROW-7428) - [Format][C++] Add serialization for CSF sparse tensors
+* [ARROW-7444](https://issues.apache.org/jira/browse/ARROW-7444) - [GLib] Add LocalFileSystem support
+* [ARROW-7462](https://issues.apache.org/jira/browse/ARROW-7462) - [C++] Add CpuInfo detection for Arm64 Architecture
+* [ARROW-7491](https://issues.apache.org/jira/browse/ARROW-7491) - [Java] Improve the performance of aligning
+* [ARROW-7499](https://issues.apache.org/jira/browse/ARROW-7499) - [C++] CMake should collect libs when making a static build
+* [ARROW-7501](https://issues.apache.org/jira/browse/ARROW-7501) - [C++] CMake build\_thrift should build flex and bison if necessary
+* [ARROW-7515](https://issues.apache.org/jira/browse/ARROW-7515) - [C++] Rename nonexistent and non\_existent to not\_found
+* [ARROW-7524](https://issues.apache.org/jira/browse/ARROW-7524) - [C++][CI] Build parquet support in the VS2019 GitHub Actions job
+* [ARROW-7530](https://issues.apache.org/jira/browse/ARROW-7530) - [Developer] Do not include list of commits from PR in squashed summary message
+* [ARROW-7534](https://issues.apache.org/jira/browse/ARROW-7534) - [Java] Create a new java/contrib module
+* [ARROW-7547](https://issues.apache.org/jira/browse/ARROW-7547) - [C++] [Python] [Dataset] Additional reader options in ParquetFileFormat
+* [ARROW-7555](https://issues.apache.org/jira/browse/ARROW-7555) - [Python] Drop support for python 2.7
+* [ARROW-7587](https://issues.apache.org/jira/browse/ARROW-7587) - [C++][Compute] Add Top-k kernel
+* [ARROW-7608](https://issues.apache.org/jira/browse/ARROW-7608) - [C++][Dataset] Expose more informational properties
+* [ARROW-7615](https://issues.apache.org/jira/browse/ARROW-7615) - [CI][Gandiva] Ensure that the gandiva\_jni library has only a whitelisted set of shared dependencies as part of Travis CI job
+* [ARROW-7616](https://issues.apache.org/jira/browse/ARROW-7616) - [Java] Support comparing value ranges for dense union vector
+* [ARROW-7625](https://issues.apache.org/jira/browse/ARROW-7625) - [GLib] Parquet GLib and Red Parquet (Ruby) do not allow specifying compression type
+* [ARROW-7641](https://issues.apache.org/jira/browse/ARROW-7641) - [R] Make dataset vignette have executable code
+* [ARROW-7662](https://issues.apache.org/jira/browse/ARROW-7662) - [R] Support creating ListArray from R list
+* [ARROW-7664](https://issues.apache.org/jira/browse/ARROW-7664) - [C++] Extract localfs default from FileSystemFromUri
+* [ARROW-7675](https://issues.apache.org/jira/browse/ARROW-7675) - [R][CI] Move Windows CI from Appveyor to GHA
+* [ARROW-7679](https://issues.apache.org/jira/browse/ARROW-7679) - [R] Cleaner interface for creating UnionDataset
+* [ARROW-7684](https://issues.apache.org/jira/browse/ARROW-7684) - [Rust] Provide example of Flight server for DataFusion
+* [ARROW-7685](https://issues.apache.org/jira/browse/ARROW-7685) - [Developer] Add support for GitHub Actions to Crossbow
+* [ARROW-7691](https://issues.apache.org/jira/browse/ARROW-7691) - [C++] Verify missing fields when walking Flatbuffers data
+* [ARROW-7708](https://issues.apache.org/jira/browse/ARROW-7708) - [Release] Include PARQUET commits from git changelog in release changelogs
+* [ARROW-7712](https://issues.apache.org/jira/browse/ARROW-7712) - [CI][Crossbow] Fix or delete fuzzit jobs
+* [ARROW-7720](https://issues.apache.org/jira/browse/ARROW-7720) - [C++][Python] Add check\_metadata argument to Table.equals
+* [ARROW-7725](https://issues.apache.org/jira/browse/ARROW-7725) - [C++] Add infrastructure for unity builds and precompiled headers
+* [ARROW-7726](https://issues.apache.org/jira/browse/ARROW-7726) - [CI] [C++] Use boost binaries on Windows GHA build
+* [ARROW-7729](https://issues.apache.org/jira/browse/ARROW-7729) - [Python][CI] Pin pandas version to 0.25 in the dask integration test
+* [ARROW-7733](https://issues.apache.org/jira/browse/ARROW-7733) - [Developer] Install locally a new enough version of Go for release verification script
+* [ARROW-7735](https://issues.apache.org/jira/browse/ARROW-7735) - [Release] conda-forge channel is missing for verifying wheels
+* [ARROW-7736](https://issues.apache.org/jira/browse/ARROW-7736) - [Release] Binary verification sometimes fails with transient error
+* [ARROW-7739](https://issues.apache.org/jira/browse/ARROW-7739) - [GLib] Use placement new to initialize shared\_ptr object in private structs
+* [ARROW-7741](https://issues.apache.org/jira/browse/ARROW-7741) - [C++][Parquet] Incorporate new level generation logic in parquet write path with a flag to revert back to old logic
+* [ARROW-7742](https://issues.apache.org/jira/browse/ARROW-7742) - [GLib] Add support for MapArray
+* [ARROW-7745](https://issues.apache.org/jira/browse/ARROW-7745) - [Doc] [C++] Update Parquet documentation
+* [ARROW-7749](https://issues.apache.org/jira/browse/ARROW-7749) - [C++] Link some more tests together
+* [ARROW-7750](https://issues.apache.org/jira/browse/ARROW-7750) - [Release] Make the source release verification script restartable
+* [ARROW-7751](https://issues.apache.org/jira/browse/ARROW-7751) - [Release] macOS wheel verification also needs arrow-testing
+* [ARROW-7752](https://issues.apache.org/jira/browse/ARROW-7752) - [Release] Enable and test dataset in the verification script
+* [ARROW-7754](https://issues.apache.org/jira/browse/ARROW-7754) - [C++] Result<T\> is slow
+* [ARROW-7761](https://issues.apache.org/jira/browse/ARROW-7761) - [C++] Add S3 support to fs::FileSystemFromUri
+* [ARROW-7764](https://issues.apache.org/jira/browse/ARROW-7764) - [C++] Builders allocate a null bitmap buffer even if there are no nulls
+* [ARROW-7771](https://issues.apache.org/jira/browse/ARROW-7771) - [Developer] Use ARROW\_TMPDIR environment variable in the verification scripts instead of TMPDIR
+* [ARROW-7774](https://issues.apache.org/jira/browse/ARROW-7774) - [Packaging][Python] Update macos and windows wheel filenames
+* [ARROW-7787](https://issues.apache.org/jira/browse/ARROW-7787) - [Rust] Add collect to Table API
+* [ARROW-7788](https://issues.apache.org/jira/browse/ARROW-7788) - [C++] Add schema conversion support for map type
+* [ARROW-7790](https://issues.apache.org/jira/browse/ARROW-7790) - [Website] Update how to install Linux packages
+* [ARROW-7795](https://issues.apache.org/jira/browse/ARROW-7795) - [Rust - DataFusion] Support boolean negation (NOT)
+* [ARROW-7796](https://issues.apache.org/jira/browse/ARROW-7796) - [R] write\_\* functions should invisibly return their inputs
+* [ARROW-7799](https://issues.apache.org/jira/browse/ARROW-7799) - [R][CI] Remove flatbuffers from homebrew formulae
+* [ARROW-7804](https://issues.apache.org/jira/browse/ARROW-7804) - [C++][R] Compile error on macOS 10.11
+* [ARROW-7812](https://issues.apache.org/jira/browse/ARROW-7812) - [Packaging][Python] Upgrade LLVM in manylinux1 docker image
+* [ARROW-7817](https://issues.apache.org/jira/browse/ARROW-7817) - [CI] macOS R autobrew nightly failed on installing dependency from source
+* [ARROW-7819](https://issues.apache.org/jira/browse/ARROW-7819) - [C++][Gandiva] Add DumpIR to Filter/Projector classes
+* [ARROW-7824](https://issues.apache.org/jira/browse/ARROW-7824) - [C++][Dataset] Provide Dataset writing to IPC format
+* [ARROW-7828](https://issues.apache.org/jira/browse/ARROW-7828) - [Release] Remove SSH keys for internal use
+* [ARROW-7829](https://issues.apache.org/jira/browse/ARROW-7829) - [R] Test R bindings on clang
+* [ARROW-7833](https://issues.apache.org/jira/browse/ARROW-7833) - [R] Make install\_arrow() actually install arrow
+* [ARROW-7834](https://issues.apache.org/jira/browse/ARROW-7834) - [Release] Post release task for updating the documentations
+* [ARROW-7839](https://issues.apache.org/jira/browse/ARROW-7839) - [Python][Dataset] Add IPC format to python bindings
+* [ARROW-7846](https://issues.apache.org/jira/browse/ARROW-7846) - [Python][Dev] Remove last dependencies on six
+* [ARROW-7847](https://issues.apache.org/jira/browse/ARROW-7847) - [Website] Write a blog post about fuzzing
+* [ARROW-7849](https://issues.apache.org/jira/browse/ARROW-7849) - [Packaging][Python] Remove the remaining py27 crossbow wheel tasks from the nightlies
+* [ARROW-7858](https://issues.apache.org/jira/browse/ARROW-7858) - [C++][Python] Support casting an Extension type to its storage type
+* [ARROW-7859](https://issues.apache.org/jira/browse/ARROW-7859) - [R] Minor patches for CRAN submission 0.16.0.2
+* [ARROW-7860](https://issues.apache.org/jira/browse/ARROW-7860) - [C++] Support cast to/from halffloat
+* [ARROW-7862](https://issues.apache.org/jira/browse/ARROW-7862) - [R] Linux installation should run quieter by default
+* [ARROW-7863](https://issues.apache.org/jira/browse/ARROW-7863) - [C++][Python][CI] Ensure running HDFS related tests
+* [ARROW-7864](https://issues.apache.org/jira/browse/ARROW-7864) - [R] Make sure bundled installation works even if there are system packages
+* [ARROW-7865](https://issues.apache.org/jira/browse/ARROW-7865) - [R] Test builds on latest Linux versions
+* [ARROW-7868](https://issues.apache.org/jira/browse/ARROW-7868) - [Crossbow] Reduce GitHub API query parallelism
+* [ARROW-7869](https://issues.apache.org/jira/browse/ARROW-7869) - [Python] Boost::system and boost::filesystem not necessary anymore in Python wheels
+* [ARROW-7872](https://issues.apache.org/jira/browse/ARROW-7872) - [Python] Support conversion of list-of-struct in Array/Table.to\_pandas
+* [ARROW-7874](https://issues.apache.org/jira/browse/ARROW-7874) - [Python][Archery] Validate docstrings with numpydoc
+* [ARROW-7876](https://issues.apache.org/jira/browse/ARROW-7876) - [R] Installation fails in the documentation generation image
+* [ARROW-7877](https://issues.apache.org/jira/browse/ARROW-7877) - [Packaging] Fix crossbow deployment to github artifacts
+* [ARROW-7879](https://issues.apache.org/jira/browse/ARROW-7879) - [C++][Doc] Add doc for the Device API
+* [ARROW-7880](https://issues.apache.org/jira/browse/ARROW-7880) - [CI][R] R sanitizer job is not really working
+* [ARROW-7881](https://issues.apache.org/jira/browse/ARROW-7881) - [C++] Fix pedantic warnings
+* [ARROW-7882](https://issues.apache.org/jira/browse/ARROW-7882) - [C++][Gandiva] Optimise like function for substring pattern
+* [ARROW-7886](https://issues.apache.org/jira/browse/ARROW-7886) - [C++][Dataset] Consolidate Source and Dataset
+* [ARROW-7888](https://issues.apache.org/jira/browse/ARROW-7888) - [Python] Allow using a more modern version of jpype in pyarrow.jvm
+* [ARROW-7890](https://issues.apache.org/jira/browse/ARROW-7890) - [C++] Add Promise / Future implementation
+* [ARROW-7891](https://issues.apache.org/jira/browse/ARROW-7891) - [C++] RecordBatch-\>Equals should also have a check\_metadata argument
+* [ARROW-7892](https://issues.apache.org/jira/browse/ARROW-7892) - [Python] Expose FilesystemSource.format attribute
+* [ARROW-7895](https://issues.apache.org/jira/browse/ARROW-7895) - [Python] Remove more python 2.7 cruft
+* [ARROW-7896](https://issues.apache.org/jira/browse/ARROW-7896) - [C++] Refactor from \#include guards to \#pragma once
+* [ARROW-7897](https://issues.apache.org/jira/browse/ARROW-7897) - [Packaging] Temporarily disable artifact uploading until we fix the deployment issues
+* [ARROW-7898](https://issues.apache.org/jira/browse/ARROW-7898) - [Python] Reduce the number of docstring violations using numpydoc
+* [ARROW-7904](https://issues.apache.org/jira/browse/ARROW-7904) - [C++] Decide about Field/Schema metadata printing parameters and how much to show by default
+* [ARROW-7907](https://issues.apache.org/jira/browse/ARROW-7907) - [Python] Conversion to pandas of empty table with timestamp type aborts
+* [ARROW-7912](https://issues.apache.org/jira/browse/ARROW-7912) - [Format] C data interface
+* [ARROW-7913](https://issues.apache.org/jira/browse/ARROW-7913) - [C++][Python][R] C++ implementation of C data interface
+* [ARROW-7915](https://issues.apache.org/jira/browse/ARROW-7915) - [CI] [Python] Run tests with Python development mode enabled
+* [ARROW-7916](https://issues.apache.org/jira/browse/ARROW-7916) - [C++][Dataset] Project IPC record batches to materialized fields
+* [ARROW-7917](https://issues.apache.org/jira/browse/ARROW-7917) - [CMake] FindPythonInterp should check for python3
+* [ARROW-7919](https://issues.apache.org/jira/browse/ARROW-7919) - [R] install\_arrow() should conda install if appropriate
+* [ARROW-7920](https://issues.apache.org/jira/browse/ARROW-7920) - [R] Fill in some missing input validation
+* [ARROW-7921](https://issues.apache.org/jira/browse/ARROW-7921) - [Go] Add Reset method to various components and clean up comments
+* [ARROW-7927](https://issues.apache.org/jira/browse/ARROW-7927) - [C++] Fix 'cpu\_info.cc' compilation warning
+* [ARROW-7929](https://issues.apache.org/jira/browse/ARROW-7929) - [C++] CMake target names differ from upstream provided names
+* [ARROW-7930](https://issues.apache.org/jira/browse/ARROW-7930) - [Python][CI] Test jpype integration in CI
+* [ARROW-7932](https://issues.apache.org/jira/browse/ARROW-7932) - [Rust] [Parquet] Implement array reader for temporal types
+* [ARROW-7934](https://issues.apache.org/jira/browse/ARROW-7934) - [C++] Fix UriEscape for empty string
+* [ARROW-7935](https://issues.apache.org/jira/browse/ARROW-7935) - [Java] Remove Netty dependency for BufferAllocator and ReferenceManager
+* [ARROW-7937](https://issues.apache.org/jira/browse/ARROW-7937) - [Python][Packaging] Remove boost from the macos wheels
+* [ARROW-7941](https://issues.apache.org/jira/browse/ARROW-7941) - [Rust] [DataFusion] Logical plan should support unresolved column references
+* [ARROW-7943](https://issues.apache.org/jira/browse/ARROW-7943) - [C++][Parquet] Add a new level builder capable of handling nested data
+* [ARROW-7947](https://issues.apache.org/jira/browse/ARROW-7947) - [Rust] [Flight] [DataFusion] Implement example for get\_schema
+* [ARROW-7949](https://issues.apache.org/jira/browse/ARROW-7949) - [Developer] Update to '.gitignore' to not track user specific 'cpp/Brewfile.lock.json' file
+* [ARROW-7951](https://issues.apache.org/jira/browse/ARROW-7951) - [Python][Parquet] Expose BYTE\_STREAM\_SPLIT to pyarrow
+* [ARROW-7959](https://issues.apache.org/jira/browse/ARROW-7959) - [Ruby] Add support for Ruby 2.3 again
+* [ARROW-7963](https://issues.apache.org/jira/browse/ARROW-7963) - [C++][Python][Dataset] Expose listing fragments
+* [ARROW-7965](https://issues.apache.org/jira/browse/ARROW-7965) - [Python] Refine higher level dataset API
+* [ARROW-7966](https://issues.apache.org/jira/browse/ARROW-7966) - [Integration][Flight][C++] Client should verify each batch independently
+* [ARROW-7969](https://issues.apache.org/jira/browse/ARROW-7969) - [Packaging] Use cURL to upload artifacts
+* [ARROW-7970](https://issues.apache.org/jira/browse/ARROW-7970) - [Packaging][Python] Use system boost to build the macos wheels
+* [ARROW-7971](https://issues.apache.org/jira/browse/ARROW-7971) - [Rust] Create rowcount utility
+* [ARROW-7977](https://issues.apache.org/jira/browse/ARROW-7977) - [C++] Rename fs::FileStats to fs::FileInfo
+* [ARROW-7979](https://issues.apache.org/jira/browse/ARROW-7979) - [C++] Implement experimental buffer compression in IPC messages
+* [ARROW-7982](https://issues.apache.org/jira/browse/ARROW-7982) - [C++] Let ArrayDataVisitor accept void-returning functions
+* [ARROW-7983](https://issues.apache.org/jira/browse/ARROW-7983) - [CI][R] Nightly builds should be more verbose when they fail
+* [ARROW-7984](https://issues.apache.org/jira/browse/ARROW-7984) - [R] Check for valid inputs in more places
+* [ARROW-7986](https://issues.apache.org/jira/browse/ARROW-7986) - [Python] pa.Array.from\_pandas cannot convert pandas.Series containing pyspark.ml.linalg.SparseVector
+* [ARROW-7987](https://issues.apache.org/jira/browse/ARROW-7987) - [CI][R] Fix for verbose nightly builds
+* [ARROW-7988](https://issues.apache.org/jira/browse/ARROW-7988) - [R] Fix on.exit calls in reticulate bindings
+* [ARROW-7991](https://issues.apache.org/jira/browse/ARROW-7991) - [C++][Plasma] Allow option for evicting if full when creating an object
+* [ARROW-7993](https://issues.apache.org/jira/browse/ARROW-7993) - [Java] Support decimal type in ComplexCopier
+* [ARROW-7994](https://issues.apache.org/jira/browse/ARROW-7994) - [CI][C++] Move AppVeyor MinGW builds to GitHub Actions
+* [ARROW-7995](https://issues.apache.org/jira/browse/ARROW-7995) - [C++] IO: coalescing and caching read ranges
+* [ARROW-7998](https://issues.apache.org/jira/browse/ARROW-7998) - [C++][Plasma] Make Seal requests synchronous
+* [ARROW-8005](https://issues.apache.org/jira/browse/ARROW-8005) - [Website] Review and adjust any usages of Apache dist system from website / tools
+* [ARROW-8014](https://issues.apache.org/jira/browse/ARROW-8014) - [C++] Provide CMake targets to test only within a given label
+* [ARROW-8016](https://issues.apache.org/jira/browse/ARROW-8016) - [Developer] Fix deprecation warning in PR merge tool
+* [ARROW-8018](https://issues.apache.org/jira/browse/ARROW-8018) - [C++][Parquet] Parquet Modular Encryption
+* [ARROW-8024](https://issues.apache.org/jira/browse/ARROW-8024) - [R] Bindings for BinaryType and FixedBinaryType
+* [ARROW-8026](https://issues.apache.org/jira/browse/ARROW-8026) - [Python] Support memoryview in addition to string value types for constructing string and binary type arrays
+* [ARROW-8027](https://issues.apache.org/jira/browse/ARROW-8027) - [Developer][Integration] Add integration tests for duplicate field names
+* [ARROW-8028](https://issues.apache.org/jira/browse/ARROW-8028) - [Go] Allow duplicate field names in schemas and nested types
+* [ARROW-8030](https://issues.apache.org/jira/browse/ARROW-8030) - [C++][Plasma] Fix inconsistent comment style
+* [ARROW-8035](https://issues.apache.org/jira/browse/ARROW-8035) - [Developer][Integration] Add integration tests for extension types
+* [ARROW-8039](https://issues.apache.org/jira/browse/ARROW-8039) - [Python][Dataset] Support using dataset API in pyarrow.parquet with a minimal ParquetDataset shim
+* [ARROW-8044](https://issues.apache.org/jira/browse/ARROW-8044) - [CI][NIGHTLY:gandiva-jar-osx] pygit2 needs libgit2 v1.0.x
+* [ARROW-8055](https://issues.apache.org/jira/browse/ARROW-8055) - [GLib][Ruby] Add some metadata bindings to GArrowSchema
+* [ARROW-8058](https://issues.apache.org/jira/browse/ARROW-8058) - [C++][Python][Dataset] Provide an option to toggle validation and schema inference in FileSystemDatasetFactoryOptions
+* [ARROW-8059](https://issues.apache.org/jira/browse/ARROW-8059) - [Python] Make FileSystem objects serializable
+* [ARROW-8060](https://issues.apache.org/jira/browse/ARROW-8060) - [Python] Make dataset Expression objects serializable
+* [ARROW-8061](https://issues.apache.org/jira/browse/ARROW-8061) - [C++][Dataset] Ability to specify granularity of ParquetFileFragment (support row groups)
+* [ARROW-8063](https://issues.apache.org/jira/browse/ARROW-8063) - [Python] Add user guide documentation for Datasets API
+* [ARROW-8064](https://issues.apache.org/jira/browse/ARROW-8064) - [Dev] Implement Comment bot via Github actions
+* [ARROW-8069](https://issues.apache.org/jira/browse/ARROW-8069) - [C++] Should the default value of "check\_metadata" arguments of Equals methods be "true"?
+* [ARROW-8072](https://issues.apache.org/jira/browse/ARROW-8072) - [C++][Plasma] Add const constraint when parsing data
+* [ARROW-8077](https://issues.apache.org/jira/browse/ARROW-8077) - [Python] Add wheel build script and Crossbow configuration for Windows on Python 3.5
+* [ARROW-8079](https://issues.apache.org/jira/browse/ARROW-8079) - [Python] Implement a wrapper for KeyValueMetadata, duck-typing dict where relevant
+* [ARROW-8080](https://issues.apache.org/jira/browse/ARROW-8080) - [C++] Add AVX512 build option
+* [ARROW-8082](https://issues.apache.org/jira/browse/ARROW-8082) - [Java][Plasma] Add JNI list() interface
+* [ARROW-8083](https://issues.apache.org/jira/browse/ARROW-8083) - [GLib] Add support for Peek() to GIOInputStream
+* [ARROW-8086](https://issues.apache.org/jira/browse/ARROW-8086) - [Java] Support writing decimal from big endian byte array in UnionListWriter
+* [ARROW-8087](https://issues.apache.org/jira/browse/ARROW-8087) - [C++][Dataset] Order of keys with HivePartitioning is lost in resulting schema
+* [ARROW-8096](https://issues.apache.org/jira/browse/ARROW-8096) - [C++][Gandiva] Create null node of Interval type
+* [ARROW-8097](https://issues.apache.org/jira/browse/ARROW-8097) - [Dev] Comment bot's crossbow command acts on the master branch
+* [ARROW-8103](https://issues.apache.org/jira/browse/ARROW-8103) - [R] Make default Linux build more minimal
+* [ARROW-8104](https://issues.apache.org/jira/browse/ARROW-8104) - [C++] Don't install bundled Thrift
+* [ARROW-8107](https://issues.apache.org/jira/browse/ARROW-8107) - [Packaging][APT] Use HTTPS for LLVM APT repository for Debian GNU/Linux stretch
+* [ARROW-8109](https://issues.apache.org/jira/browse/ARROW-8109) - [Packaging][APT] Drop support for Ubuntu Disco
+* [ARROW-8117](https://issues.apache.org/jira/browse/ARROW-8117) - [Rust] [Datafusion] Allow CAST from number to timestamp
+* [ARROW-8118](https://issues.apache.org/jira/browse/ARROW-8118) - [R] dim method for FileSystemDataset
+* [ARROW-8120](https://issues.apache.org/jira/browse/ARROW-8120) - [Packaging][APT] Add support for Ubuntu Focal
+* [ARROW-8123](https://issues.apache.org/jira/browse/ARROW-8123) - [Rust] [DataFusion] Create LogicalPlanBuilder
+* [ARROW-8124](https://issues.apache.org/jira/browse/ARROW-8124) - [Rust] Update library dependencies
+* [ARROW-8126](https://issues.apache.org/jira/browse/ARROW-8126) - [C++][Compute] Add Top-K kernel benchmark
+* [ARROW-8129](https://issues.apache.org/jira/browse/ARROW-8129) - [C++][Compute] Refine compare sorting kernel
+* [ARROW-8130](https://issues.apache.org/jira/browse/ARROW-8130) - [C++][Gandiva] Fix Dex visitor in llvm\_generator to handle interval type
+* [ARROW-8140](https://issues.apache.org/jira/browse/ARROW-8140) - [Developer] Follow NullType -\> NullField change
+* [ARROW-8141](https://issues.apache.org/jira/browse/ARROW-8141) - [C++] Optimize BM\_PlainDecodingBoolean performance using AVX512 Intrinsics API
+* [ARROW-8145](https://issues.apache.org/jira/browse/ARROW-8145) - [C++] Rename GetTargetInfos
+* [ARROW-8146](https://issues.apache.org/jira/browse/ARROW-8146) - [C++] Add per-filesystem facility to sanitize a path
+* [ARROW-8150](https://issues.apache.org/jira/browse/ARROW-8150) - [Rust] Allow writing custom FileMetaData k/v pairs
+* [ARROW-8151](https://issues.apache.org/jira/browse/ARROW-8151) - [Benchmarking][Dataset] Benchmark Parquet read performance with S3File
+* [ARROW-8153](https://issues.apache.org/jira/browse/ARROW-8153) - [Packaging] Update the conda feedstock files and upload artifacts to Anaconda
+* [ARROW-8158](https://issues.apache.org/jira/browse/ARROW-8158) - [Java] Getting length of data buffer and base variable width vector
+* [ARROW-8164](https://issues.apache.org/jira/browse/ARROW-8164) - [C++][Dataset] Let datasets be viewable with non-identical schema
+* [ARROW-8165](https://issues.apache.org/jira/browse/ARROW-8165) - [Packaging] Make nightly wheels available on a PyPI server
+* [ARROW-8167](https://issues.apache.org/jira/browse/ARROW-8167) - [CI] Add support for skipping builds with skip pattern in pull request title
+* [ARROW-8168](https://issues.apache.org/jira/browse/ARROW-8168) - [Java][Plasma] Improve Java Plasma client off-heap memory usage
+* [ARROW-8177](https://issues.apache.org/jira/browse/ARROW-8177) - [Rust] Make schema\_to\_fb\_offset public
+* [ARROW-8178](https://issues.apache.org/jira/browse/ARROW-8178) - [C++] Upgrade to Flatbuffers 1.12
+* [ARROW-8179](https://issues.apache.org/jira/browse/ARROW-8179) - [R] Windows build script tweaking for nightly packaging on GHA
+* [ARROW-8181](https://issues.apache.org/jira/browse/ARROW-8181) - [Java][FlightRPC] Expose transport error metadata
+* [ARROW-8182](https://issues.apache.org/jira/browse/ARROW-8182) - [Packaging] Increment the version number detected from the latest git tag
+* [ARROW-8183](https://issues.apache.org/jira/browse/ARROW-8183) - [C++][FlightRPC] Expose transport error metadata
+* [ARROW-8184](https://issues.apache.org/jira/browse/ARROW-8184) - [Packaging] Use arrow-nightlies organization name on Anaconda and Gemfury to host the nightlies
+* [ARROW-8185](https://issues.apache.org/jira/browse/ARROW-8185) - [Packaging] Document the available nightly wheels and conda packages
+* [ARROW-8187](https://issues.apache.org/jira/browse/ARROW-8187) - [R] Make test assertions robust to i18n
+* [ARROW-8191](https://issues.apache.org/jira/browse/ARROW-8191) - [Packaging][APT] Fix cmake removal in Debian GNU/Linux Stretch
+* [ARROW-8192](https://issues.apache.org/jira/browse/ARROW-8192) - [C++] Script for unpacking AVX512 intrinsics code
+* [ARROW-8194](https://issues.apache.org/jira/browse/ARROW-8194) - [CI] Github Actions Windows job should run tests in parallel
+* [ARROW-8195](https://issues.apache.org/jira/browse/ARROW-8195) - [CI] Remove Boost download step in Github Actions
+* [ARROW-8198](https://issues.apache.org/jira/browse/ARROW-8198) - [C++] Diffing should handle null arrays
+* [ARROW-8200](https://issues.apache.org/jira/browse/ARROW-8200) - [GLib] Rename garrow\_file\_system\_target\_info{,s}() to ...\_file\_info{,s}()
+* [ARROW-8203](https://issues.apache.org/jira/browse/ARROW-8203) - [C\#] "dotnet pack" fails
+* [ARROW-8204](https://issues.apache.org/jira/browse/ARROW-8204) - [Rust] [DataFusion] Add support for aliased expressions in SQL
+* [ARROW-8207](https://issues.apache.org/jira/browse/ARROW-8207) - [Packaging][wheel] Use LLVM 8 in manylinux2010 and manylinux2014
+* [ARROW-8215](https://issues.apache.org/jira/browse/ARROW-8215) - [CI][GLib] Meson install fails in the macOS build
+* [ARROW-8218](https://issues.apache.org/jira/browse/ARROW-8218) - [C++] Parallelize decompression at field level in experimental IPC compression code
+* [ARROW-8220](https://issues.apache.org/jira/browse/ARROW-8220) - [Python] Make dataset FileFormat objects serializable
+* [ARROW-8222](https://issues.apache.org/jira/browse/ARROW-8222) - [C++] Use bcp to make a slim boost for bundled build
+* [ARROW-8224](https://issues.apache.org/jira/browse/ARROW-8224) - [C++] Remove APIs deprecated prior to 0.16.0
+* [ARROW-8225](https://issues.apache.org/jira/browse/ARROW-8225) - [Rust] IPC reader must respect continuation markers
+* [ARROW-8227](https://issues.apache.org/jira/browse/ARROW-8227) - [C++] Refine SIMD feature definitions
+* [ARROW-8231](https://issues.apache.org/jira/browse/ARROW-8231) - [Rust] Parse key\_value\_metadata from parquet FileMetaData into arrow schema metadata
+* [ARROW-8232](https://issues.apache.org/jira/browse/ARROW-8232) - [Python] Deprecate pa.open\_file and pa.open\_stream in favor of pa.ipc.open\_file/open\_stream
+* [ARROW-8235](https://issues.apache.org/jira/browse/ARROW-8235) - [C++][Compute] Filter out nulls by default
+* [ARROW-8241](https://issues.apache.org/jira/browse/ARROW-8241) - [Rust] Add convenience methods to Schema
+* [ARROW-8242](https://issues.apache.org/jira/browse/ARROW-8242) - [C++] Flight fails to compile on GCC 4.8
+* [ARROW-8243](https://issues.apache.org/jira/browse/ARROW-8243) - [Rust] [DataFusion] Fix inconsistent API in LogicalPlanBuilder
+* [ARROW-8244](https://issues.apache.org/jira/browse/ARROW-8244) - [Python][Parquet] Add \`write\_to\_dataset\` option to populate the "file\_path" metadata fields
+* [ARROW-8246](https://issues.apache.org/jira/browse/ARROW-8246) - [C++] Add -Wa,-mbig-obj when compiling with MinGW to avoid linking errors
+* [ARROW-8247](https://issues.apache.org/jira/browse/ARROW-8247) - [Python] Expose Parquet writing "engine" setting in pyarrow.parquet.write\_table
+* [ARROW-8249](https://issues.apache.org/jira/browse/ARROW-8249) - [Rust] [DataFusion] Make Table and LogicalPlanBuilder APIs more consistent
+* [ARROW-8252](https://issues.apache.org/jira/browse/ARROW-8252) - [CI][Ruby] Add Ubuntu 20.04
+* [ARROW-8256](https://issues.apache.org/jira/browse/ARROW-8256) - [Rust] [DataFusion] Update CLI documentation for 0.17.0 release
+* [ARROW-8264](https://issues.apache.org/jira/browse/ARROW-8264) - [Rust] [DataFusion] Create utility for printing record batches
+* [ARROW-8266](https://issues.apache.org/jira/browse/ARROW-8266) - [C++] Add backup mirrors for external project source downloads
+* [ARROW-8267](https://issues.apache.org/jira/browse/ARROW-8267) - [CI][GLib] Failed to build on Ubuntu 16.04
+* [ARROW-8271](https://issues.apache.org/jira/browse/ARROW-8271) - [Packaging] Allow wheel upload failures to gemfury
+* [ARROW-8275](https://issues.apache.org/jira/browse/ARROW-8275) - [Python][Docs] Review Feather + IPC file documentation per "Feather V2" changes
+* [ARROW-8277](https://issues.apache.org/jira/browse/ARROW-8277) - [Python] RecordBatch interface improvements
+* [ARROW-8279](https://issues.apache.org/jira/browse/ARROW-8279) - [C++] Do not export symbols from Codec implementations, remove need for PIMPL pattern
+* [ARROW-8288](https://issues.apache.org/jira/browse/ARROW-8288) - [Python] Expose with\_ modifiers on DataType
+* [ARROW-8290](https://issues.apache.org/jira/browse/ARROW-8290) - [Python][Dataset] Improve ergonomy of the FileSystemDataset constructor
+* [ARROW-8291](https://issues.apache.org/jira/browse/ARROW-8291) - [Packaging] Conda nightly builds can't locate Numpy
+* [ARROW-8292](https://issues.apache.org/jira/browse/ARROW-8292) - [Python][Dataset] Passthrough schema to Factory.finish() in dataset() function
+* [ARROW-8294](https://issues.apache.org/jira/browse/ARROW-8294) - [Format][Flight] Add DoExchange RPC to Flight protocol
+* [ARROW-8295](https://issues.apache.org/jira/browse/ARROW-8295) - [C++][Dataset] IpcFileFormat should explicitly push down column projection
+* [ARROW-8299](https://issues.apache.org/jira/browse/ARROW-8299) - [C++] Reusable "optional ParallelFor" function for optional use of multithreading
+* [ARROW-8300](https://issues.apache.org/jira/browse/ARROW-8300) - [R] Documentation and changelog updates for 0.17
+* [ARROW-8307](https://issues.apache.org/jira/browse/ARROW-8307) - [Python] Expose use\_memory\_map option in pyarrow.feather APIs
+* [ARROW-8308](https://issues.apache.org/jira/browse/ARROW-8308) - [Rust] [Flight] Implement DoExchange on examples
+* [ARROW-8309](https://issues.apache.org/jira/browse/ARROW-8309) - [CI] C++/Java/Rust workflows should trigger on changes to Flight.proto
+* [ARROW-8311](https://issues.apache.org/jira/browse/ARROW-8311) - [C++] Add push style stream format reader
+* [ARROW-8316](https://issues.apache.org/jira/browse/ARROW-8316) - [CI] Set docker-compose to use docker-cli instead of docker-py for building images
+* [ARROW-8319](https://issues.apache.org/jira/browse/ARROW-8319) - [CI] Install thrift compiler in the debian build
+* [ARROW-8320](https://issues.apache.org/jira/browse/ARROW-8320) - [Documentation][Format] Clarify (lack of) alignment requirements in C data interface
+* [ARROW-8321](https://issues.apache.org/jira/browse/ARROW-8321) - [CI] Use bundled thrift in Fedora 30 build
+* [ARROW-8322](https://issues.apache.org/jira/browse/ARROW-8322) - [CI] Fix C\# workflow file syntax
+* [ARROW-8325](https://issues.apache.org/jira/browse/ARROW-8325) - [R][CI] Stop including boost in R windows bundle
+* [ARROW-8329](https://issues.apache.org/jira/browse/ARROW-8329) - [Documentation][C++] Undocumented FilterOptions argument in Filter kernel
+* [ARROW-8330](https://issues.apache.org/jira/browse/ARROW-8330) - [Documentation] The post release script generates the documentation with a development version
+* [ARROW-8332](https://issues.apache.org/jira/browse/ARROW-8332) - [C++] Require Thrift compiler to use system libthrift for Parquet build
+* [ARROW-8335](https://issues.apache.org/jira/browse/ARROW-8335) - [Release] Add crossbow jobs to run release verification
+* [ARROW-8336](https://issues.apache.org/jira/browse/ARROW-8336) - [Packaging][deb] Use libthrift-dev on Debian 10 and Ubuntu 19.10 or later
+* [ARROW-8341](https://issues.apache.org/jira/browse/ARROW-8341) - [Packaging][deb] Fail to build by no disk space
+* [ARROW-8343](https://issues.apache.org/jira/browse/ARROW-8343) - [GLib] Add GArrowRecordBatchIterator
+* [ARROW-8347](https://issues.apache.org/jira/browse/ARROW-8347) - [C++] Add Result<T\> APIs to Array methods
+* [ARROW-8351](https://issues.apache.org/jira/browse/ARROW-8351) - [R][CI] Store the Rtools-built Arrow C++ library as a build artifact
+* [ARROW-8352](https://issues.apache.org/jira/browse/ARROW-8352) - [R] Add install\_pyarrow()
+* [ARROW-8356](https://issues.apache.org/jira/browse/ARROW-8356) - [Developer] Support \* wildcards with "crossbow submit" via GitHub actions
+* [ARROW-8361](https://issues.apache.org/jira/browse/ARROW-8361) - [C++] Add Result<T\> APIs to Buffer methods and functions
+* [ARROW-8362](https://issues.apache.org/jira/browse/ARROW-8362) - [Crossbow] Ensure that the locally generated version is used in the docker tasks
+* [ARROW-8367](https://issues.apache.org/jira/browse/ARROW-8367) - [C++] Deprecate Buffer::FromString(..., pool)
+* [ARROW-8368](https://issues.apache.org/jira/browse/ARROW-8368) - [Format] In C interface, clarify resource management for consumers needing only a subset of child fields in ArrowArray
+* [ARROW-8370](https://issues.apache.org/jira/browse/ARROW-8370) - [C++] Add Result<T\> to type / schema APIs
+* [ARROW-8371](https://issues.apache.org/jira/browse/ARROW-8371) - [Crossbow] Implement and exercise sanity checks for tasks.yml
+* [ARROW-8372](https://issues.apache.org/jira/browse/ARROW-8372) - [C++] Add Result<T\> to table / record batch APIs
+* [ARROW-8375](https://issues.apache.org/jira/browse/ARROW-8375) - [CI][R] Make Windows tests more verbose in case of segfault
+* [ARROW-8376](https://issues.apache.org/jira/browse/ARROW-8376) - [R] Add experimental interface to ScanTask/RecordBatch iterators
+* [ARROW-8387](https://issues.apache.org/jira/browse/ARROW-8387) - [Rust] Make schema\_to\_fb public
+* [ARROW-8389](https://issues.apache.org/jira/browse/ARROW-8389) - [Integration] Run tests in parallel
+* [ARROW-8390](https://issues.apache.org/jira/browse/ARROW-8390) - [R] Expose schema unification features
+* [ARROW-8393](https://issues.apache.org/jira/browse/ARROW-8393) - [C++][Gandiva] Make gandiva function registry case-insensitive
+* [ARROW-8396](https://issues.apache.org/jira/browse/ARROW-8396) - [Rust] Remove libc from dependencies
+* [ARROW-8398](https://issues.apache.org/jira/browse/ARROW-8398) - [Python] Remove deprecation warnings originating from python tests
+* [ARROW-8401](https://issues.apache.org/jira/browse/ARROW-8401) - [C++] Add AVX2/AVX512 version of ByteStreamSplitDecode/ByteStreamSplitEncode
+* [ARROW-8403](https://issues.apache.org/jira/browse/ARROW-8403) - [C++] Add ToString() to ChunkedArray, Table and RecordBatch
+* [ARROW-8407](https://issues.apache.org/jira/browse/ARROW-8407) - [Rust] Add rustdoc for Dictionary type
+* [ARROW-8408](https://issues.apache.org/jira/browse/ARROW-8408) - [Python] Add memory\_map= toggle to pyarrow.feather.read\_feather
+* [ARROW-8409](https://issues.apache.org/jira/browse/ARROW-8409) - [R] Add arrow::cpu\_count, arrow::set\_cpu\_count wrapper functions a la Python
+* [ARROW-8412](https://issues.apache.org/jira/browse/ARROW-8412) - [C++][Gandiva] Fix gandiva date\_diff function definitions
+* [ARROW-8433](https://issues.apache.org/jira/browse/ARROW-8433) - [R] Add feather alias for ipc format in dataset API
+* [ARROW-8444](https://issues.apache.org/jira/browse/ARROW-8444) - [Documentation] Fix spelling errors across the codebase
+* [ARROW-8449](https://issues.apache.org/jira/browse/ARROW-8449) - [R] Use CMAKE\_UNITY\_BUILD everywhere
+* [ARROW-8450](https://issues.apache.org/jira/browse/ARROW-8450) - [Integration][C++] Implement large list/binary/utf8 integration
+* [ARROW-8457](https://issues.apache.org/jira/browse/ARROW-8457) - [C++] bridge test does not take care of endianness
+* [ARROW-8458](https://issues.apache.org/jira/browse/ARROW-8458) - [C++] Prefer the original mirrors for the bundled thirdparty dependencies
+* [ARROW-8461](https://issues.apache.org/jira/browse/ARROW-8461) - [Packaging][deb] Use zstd package for Ubuntu Xenial
+* [ARROW-8463](https://issues.apache.org/jira/browse/ARROW-8463) - [CI] Balance the nightly test builds between CircleCI, Azure and Github
+* [ARROW-8679](https://issues.apache.org/jira/browse/ARROW-8679) - [Python] supporting pandas sparse series in pyarrow
+* [PARQUET-458](https://issues.apache.org/jira/browse/PARQUET-458) - [C++] Implement support for DataPageV2
+* [PARQUET-1663](https://issues.apache.org/jira/browse/PARQUET-1663) - [C++] Provide API to check the presence of complex data types
+* [PARQUET-1716](https://issues.apache.org/jira/browse/PARQUET-1716) - [C++] Add support for BYTE\_STREAM\_SPLIT encoding
+* [PARQUET-1770](https://issues.apache.org/jira/browse/PARQUET-1770) - [C++][CI] Add fuzz target for reading Parquet files
+* [PARQUET-1785](https://issues.apache.org/jira/browse/PARQUET-1785) - [C++] Improve code reusability in encoding-test.cc
+* [PARQUET-1786](https://issues.apache.org/jira/browse/PARQUET-1786) - [C++] Use simd to improve BYTE\_STREAM\_SPLIT decoding performance
+* [PARQUET-1806](https://issues.apache.org/jira/browse/PARQUET-1806) - [C++] [CI] Improve fuzzing seed corpus
+* [PARQUET-1825](https://issues.apache.org/jira/browse/PARQUET-1825) - [C++] Fix compilation error in column\_io\_benchmark.cc
+* [PARQUET-1828](https://issues.apache.org/jira/browse/PARQUET-1828) - [C++] Add a SSE2 path for the ByteStreamSplit encoder implementation
+* [PARQUET-1840](https://issues.apache.org/jira/browse/PARQUET-1840) - [C++] DecodeSpaced copies more values than necessary
+
+
+
+# Apache Arrow 0.16.0 (2020-02-07)
+
+## Bug Fixes
+
+* [ARROW-3783](https://issues.apache.org/jira/browse/ARROW-3783) - [R] Incorrect collection of float type
+* [ARROW-3962](https://issues.apache.org/jira/browse/ARROW-3962) - [Go] Support null values while reading a CSV file
+* [ARROW-4470](https://issues.apache.org/jira/browse/ARROW-4470) - [Python] Pyarrow uses considerably more memory when reading a partitioned Parquet file
+* [ARROW-4998](https://issues.apache.org/jira/browse/ARROW-4998) - [R] R package fails to install on OSX
+* [ARROW-5575](https://issues.apache.org/jira/browse/ARROW-5575) - [C++] arrowConfig.cmake includes uninstalled targets
+* [ARROW-5655](https://issues.apache.org/jira/browse/ARROW-5655) - [Python] Table.from\_pydict/from\_arrays not using types in specified schema correctly
+* [ARROW-5680](https://issues.apache.org/jira/browse/ARROW-5680) - [Rust] datafusion group-by tests depends on result set order
+* [ARROW-6157](https://issues.apache.org/jira/browse/ARROW-6157) - [Python][C++] UnionArray with invalid data passes validation / leads to segfaults
+* [ARROW-6195](https://issues.apache.org/jira/browse/ARROW-6195) - [C++] CMake fails with file not found error while bundling thrift if python is not installed
+* [ARROW-6298](https://issues.apache.org/jira/browse/ARROW-6298) - [Rust] [CI] Examples are not being tested in CI
+* [ARROW-6320](https://issues.apache.org/jira/browse/ARROW-6320) - [C++] Arrow utilities are linked statically
+* [ARROW-6429](https://issues.apache.org/jira/browse/ARROW-6429) - [CI][Crossbow] Nightly spark integration job fails
+* [ARROW-6445](https://issues.apache.org/jira/browse/ARROW-6445) - [CI][Crossbow] Nightly Gandiva jar trusty job fails
+* [ARROW-6567](https://issues.apache.org/jira/browse/ARROW-6567) - [Rust] [DataFusion] SQL aggregate query execution assume grouping expressions precede aggregate expressions
+* [ARROW-6581](https://issues.apache.org/jira/browse/ARROW-6581) - [C++] Fix fuzzit job submission
+* [ARROW-6704](https://issues.apache.org/jira/browse/ARROW-6704) - [C++] Cast from timestamp to higher resolution does not check out of bounds timestamps
+* [ARROW-6708](https://issues.apache.org/jira/browse/ARROW-6708) - [C++] "cannot find -lboost\_filesystem\_static"
+* [ARROW-6728](https://issues.apache.org/jira/browse/ARROW-6728) - [C\#] Support reading and writing Date32 and Date64 arrays
+* [ARROW-6736](https://issues.apache.org/jira/browse/ARROW-6736) - [Rust] [DataFusion] Aggregate expressions get evaluated repeatedly
+* [ARROW-6740](https://issues.apache.org/jira/browse/ARROW-6740) - [Python] Unable to delete closed MemoryMappedFile on Windows
+* [ARROW-6745](https://issues.apache.org/jira/browse/ARROW-6745) - [Rust] Fix a variety of typos
+* [ARROW-6749](https://issues.apache.org/jira/browse/ARROW-6749) - [Python] Conversion of non-ns timestamp array to numpy gives wrong values
+* [ARROW-6750](https://issues.apache.org/jira/browse/ARROW-6750) - [Python] Silence S3 error logs by default
+* [ARROW-6761](https://issues.apache.org/jira/browse/ARROW-6761) - [Rust] Travis CI builds not respecting rust-toolchain
+* [ARROW-6762](https://issues.apache.org/jira/browse/ARROW-6762) - [C++] JSON reader segfaults on newline
+* [ARROW-6785](https://issues.apache.org/jira/browse/ARROW-6785) - [JS] Remove superfluous child assignment
+* [ARROW-6786](https://issues.apache.org/jira/browse/ARROW-6786) - [C++] arrow-dataset-file-parquet-test is slow
+* [ARROW-6795](https://issues.apache.org/jira/browse/ARROW-6795) - [C\#] Reading large Arrow files in C\# results in an exception
+* [ARROW-6798](https://issues.apache.org/jira/browse/ARROW-6798) - [CI] [Rust] Improve build times by caching dependencies in the Docker image
+* [ARROW-6801](https://issues.apache.org/jira/browse/ARROW-6801) - [Rust] Arrow source release tarball is missing benchmarks
+* [ARROW-6806](https://issues.apache.org/jira/browse/ARROW-6806) - [C++] Segfault deserializing ListArray containing null/empty list
+* [ARROW-6808](https://issues.apache.org/jira/browse/ARROW-6808) - [Ruby] Ensure requiring suitable MSYS2 package
+* [ARROW-6809](https://issues.apache.org/jira/browse/ARROW-6809) - [Ruby] Gem does not install on macOS due to glib2 3.3.7 compilation failure
+* [ARROW-6812](https://issues.apache.org/jira/browse/ARROW-6812) - [Java] Remove Dremio Corp. from License Header
+* [ARROW-6813](https://issues.apache.org/jira/browse/ARROW-6813) - [Ruby] Arrow::Table.load with headers=true leads to exception in Arrow 0.15
+* [ARROW-6820](https://issues.apache.org/jira/browse/ARROW-6820) - [C++] [Doc] [Format] Map specification and implementation inconsistent
+* [ARROW-6834](https://issues.apache.org/jira/browse/ARROW-6834) - [C++] Pin gtest to 1.8.1 to triage failing Appveyor / MSVC build
+* [ARROW-6835](https://issues.apache.org/jira/browse/ARROW-6835) - [Archery][CMake] Restore ARROW\_LINT\_ONLY
+* [ARROW-6842](https://issues.apache.org/jira/browse/ARROW-6842) - [Website] Jekyll error building website
+* [ARROW-6844](https://issues.apache.org/jira/browse/ARROW-6844) - [C++][Parquet][Python] Reading List<scalar type\> columns broken with 0.15.0
+* [ARROW-6846](https://issues.apache.org/jira/browse/ARROW-6846) - [C++] Build failures with glog enabled
+* [ARROW-6857](https://issues.apache.org/jira/browse/ARROW-6857) - [Python][C++] Segfault for dictionary\_encode on empty chunked\_array (edge case)
+* [ARROW-6859](https://issues.apache.org/jira/browse/ARROW-6859) - [CI][Nightly] Disable docker layer caching for CircleCI tasks
+* [ARROW-6860](https://issues.apache.org/jira/browse/ARROW-6860) - [Python] Only link libarrow\_flight.so to pyarrow.\_flight
+* [ARROW-6861](https://issues.apache.org/jira/browse/ARROW-6861) - [Python] arrow-0.15.0 reading arrow-0.14.1-output Parquet dictionary column: Failure reading column: IOError: Arrow error: Invalid: Resize cannot downsize
+* [ARROW-6864](https://issues.apache.org/jira/browse/ARROW-6864) - [C++] bz2 / zstd tests not enabled
+* [ARROW-6867](https://issues.apache.org/jira/browse/ARROW-6867) - [FlightRPC][Java] Flight server can hang JVM on shutdown
+* [ARROW-6868](https://issues.apache.org/jira/browse/ARROW-6868) - [Go] slicing Struct array does not slice child fields
+* [ARROW-6869](https://issues.apache.org/jira/browse/ARROW-6869) - [C++] Dictionary "delta" building logic in builder\_dict.h produces invalid arrays
+* [ARROW-6873](https://issues.apache.org/jira/browse/ARROW-6873) - [Python] Stale CColumn reference break Cython cimport pyarrow
+* [ARROW-6874](https://issues.apache.org/jira/browse/ARROW-6874) - [Python] Memory leak in Table.to\_pandas() when conversion to object dtype
+* [ARROW-6876](https://issues.apache.org/jira/browse/ARROW-6876) - [Python] Reading parquet file with many columns becomes slow for 0.15.0
+* [ARROW-6877](https://issues.apache.org/jira/browse/ARROW-6877) - [C++] Boost not found from the correct environment
+* [ARROW-6878](https://issues.apache.org/jira/browse/ARROW-6878) - [Python] pa.array() does not handle list of dicts with bytes keys correctly under python3
+* [ARROW-6882](https://issues.apache.org/jira/browse/ARROW-6882) - [Python] cannot create a chunked\_array from dictionary\_encoding result
+* [ARROW-6885](https://issues.apache.org/jira/browse/ARROW-6885) - [Python] Remove superfluous skipped timedelta test
+* [ARROW-6886](https://issues.apache.org/jira/browse/ARROW-6886) - [C++] arrow::io header nvcc compiler warnings
+* [ARROW-6898](https://issues.apache.org/jira/browse/ARROW-6898) - [Java] Fix potential memory leak in ArrowWriter and several test classes
+* [ARROW-6899](https://issues.apache.org/jira/browse/ARROW-6899) - [Python] to\_pandas() not implemented on list<dictionary<values=string, indices=int32\>\>
+* [ARROW-6901](https://issues.apache.org/jira/browse/ARROW-6901) - [Rust][Parquet] SerializedFileWriter writes total\_num\_rows as zero
+* [ARROW-6903](https://issues.apache.org/jira/browse/ARROW-6903) - [Python] Wheels broken after ARROW-6860 changes
+* [ARROW-6905](https://issues.apache.org/jira/browse/ARROW-6905) - [Packaging][OSX] Nightly builds on MacOS are failing because of brew compile timeouts
+* [ARROW-6910](https://issues.apache.org/jira/browse/ARROW-6910) - [Python] pyarrow.parquet.read\_table(...) takes up lots of memory which is not released until program exits
+* [ARROW-6913](https://issues.apache.org/jira/browse/ARROW-6913) - [R] Potential bug in compute.cc
+* [ARROW-6914](https://issues.apache.org/jira/browse/ARROW-6914) - [CI] docker-clang-format nightly failing
+* [ARROW-6922](https://issues.apache.org/jira/browse/ARROW-6922) - [Python] Pandas master build is failing (MultiIndex.levels change)
+* [ARROW-6925](https://issues.apache.org/jira/browse/ARROW-6925) - [C++] Arrow fails to build on MacOS 10.13.6 using brew gcc 7 and 8
+* [ARROW-6929](https://issues.apache.org/jira/browse/ARROW-6929) - [C++] ValidateArray is out of sync with the ListArray IPC specification
+* [ARROW-6937](https://issues.apache.org/jira/browse/ARROW-6937) - [Packaging][Python] Fix conda linux and OSX wheel nightly builds
+* [ARROW-6938](https://issues.apache.org/jira/browse/ARROW-6938) - [Python] Windows wheel depends on zstd.dll and libbz2.dll, which are not bundled
+* [ARROW-6948](https://issues.apache.org/jira/browse/ARROW-6948) - [Rust] [Parquet] Fix bool array support in arrow reader
+* [ARROW-6950](https://issues.apache.org/jira/browse/ARROW-6950) - [C++][Dataset] Add example/benchmark for reading parquet files with dataset
+* [ARROW-6957](https://issues.apache.org/jira/browse/ARROW-6957) - [CI][Crossbow] Nightly R with sanitizers build fails installing dependencies
+* [ARROW-6962](https://issues.apache.org/jira/browse/ARROW-6962) - [C++] [CI] Stop compiling with -Weverything
+* [ARROW-6966](https://issues.apache.org/jira/browse/ARROW-6966) - [Go] 32bit memset is null
+* [ARROW-6977](https://issues.apache.org/jira/browse/ARROW-6977) - [C++] Only enable jemalloc background\_thread if feature is supported
+* [ARROW-6983](https://issues.apache.org/jira/browse/ARROW-6983) - [C++] Threaded task group crashes sometimes
+* [ARROW-6989](https://issues.apache.org/jira/browse/ARROW-6989) - [Python][C++] Assert is triggered when decimal type inference occurs on a value with out of range precision
+* [ARROW-6992](https://issues.apache.org/jira/browse/ARROW-6992) - [C++] Undefined Behavior sanitizer build option fails with GCC
+* [ARROW-6999](https://issues.apache.org/jira/browse/ARROW-6999) - [Python] KeyError: '\_\_index\_level\_0\_\_' passing Table.from\_pandas its own schema
+* [ARROW-7013](https://issues.apache.org/jira/browse/ARROW-7013) - [C++] arrow-dataset pkgconfig is incomplete
+* [ARROW-7020](https://issues.apache.org/jira/browse/ARROW-7020) - [Java] Fix the bugs when calculating vector hash code
+* [ARROW-7021](https://issues.apache.org/jira/browse/ARROW-7021) - [Java] UnionFixedSizeListWriter decimal type should check writer index
+* [ARROW-7022](https://issues.apache.org/jira/browse/ARROW-7022) - [Python] \_\_arrow\_array\_\_ does not work for ExtensionTypes in Table.from\_pandas
+* [ARROW-7023](https://issues.apache.org/jira/browse/ARROW-7023) - [Python] pa.array does not use "from\_pandas" semantics for pd.Index
+* [ARROW-7024](https://issues.apache.org/jira/browse/ARROW-7024) - [CI][R] Update R dependencies for Conda build
+* [ARROW-7027](https://issues.apache.org/jira/browse/ARROW-7027) - [Python] pa.table(..) returns instead of raises error if passing invalid object
+* [ARROW-7033](https://issues.apache.org/jira/browse/ARROW-7033) - [C++] Error in ./configure step for jemalloc when building on OSX 10.14.6
+* [ARROW-7045](https://issues.apache.org/jira/browse/ARROW-7045) - [R] Factor type not preserved in Parquet roundtrip
+* [ARROW-7050](https://issues.apache.org/jira/browse/ARROW-7050) - [R] Fix compiler warnings in R bindings
+* [ARROW-7053](https://issues.apache.org/jira/browse/ARROW-7053) - [Python] setuptools-scm produces incorrect version at apache-arrow-0.15.1 tag
+* [ARROW-7056](https://issues.apache.org/jira/browse/ARROW-7056) - [Python] Test errors without S3
+* [ARROW-7059](https://issues.apache.org/jira/browse/ARROW-7059) - [Python] Reading parquet file with many columns is much slower in 0.15.x versus 0.14.x
+* [ARROW-7074](https://issues.apache.org/jira/browse/ARROW-7074) - [C++] ASSERT\_OK\_AND\_ASSIGN crashes when failing
+* [ARROW-7077](https://issues.apache.org/jira/browse/ARROW-7077) - [C++] Unsupported Dict-\>T cast crashes instead of returning error
+* [ARROW-7087](https://issues.apache.org/jira/browse/ARROW-7087) - [Python] Table Metadata disappear when we write a partitioned dataset
+* [ARROW-7097](https://issues.apache.org/jira/browse/ARROW-7097) - [Rust][CI] Builds failing due to rust nightly formatting
+* [ARROW-7100](https://issues.apache.org/jira/browse/ARROW-7100) - [C++] libjvm.so not found on ubuntu 19.04 with openjdk-11
+* [ARROW-7105](https://issues.apache.org/jira/browse/ARROW-7105) - [CI][Crossbow] Nightly homebrew-cpp job fails
+* [ARROW-7106](https://issues.apache.org/jira/browse/ARROW-7106) - [Java] Fix the problem that flight perf test hangs endlessly
+* [ARROW-7117](https://issues.apache.org/jira/browse/ARROW-7117) - [C++][CI] Fix the hanging C++ tests in Windows 2019
+* [ARROW-7128](https://issues.apache.org/jira/browse/ARROW-7128) - [CI] Fedora cron jobs are failing because of wrong fedora version
+* [ARROW-7133](https://issues.apache.org/jira/browse/ARROW-7133) - [CI] Allow GH Actions to run on all branches
+* [ARROW-7142](https://issues.apache.org/jira/browse/ARROW-7142) - [C++] Compile error with GCC 5.4.0
+* [ARROW-7152](https://issues.apache.org/jira/browse/ARROW-7152) - [Java] Delete useless class DiffFunction
+* [ARROW-7157](https://issues.apache.org/jira/browse/ARROW-7157) - [R] Add validation, helpful error message to Object$new()
+* [ARROW-7158](https://issues.apache.org/jira/browse/ARROW-7158) - [C++][Visual Studio] Build config error on non-English version of Visual Studio
+* [ARROW-7163](https://issues.apache.org/jira/browse/ARROW-7163) - [Doc] Fix double-and typos
+* [ARROW-7164](https://issues.apache.org/jira/browse/ARROW-7164) - [CI] Dev cron github action is failing every 15 minutes
+* [ARROW-7167](https://issues.apache.org/jira/browse/ARROW-7167) - [CI][Python] Add nightly tests for older pandas versions to Github Actions
+* [ARROW-7168](https://issues.apache.org/jira/browse/ARROW-7168) - [Python] pa.array() doesn't respect specified dictionary type
+* [ARROW-7170](https://issues.apache.org/jira/browse/ARROW-7170) - [C++] Bundled ORC fails linking
+* [ARROW-7180](https://issues.apache.org/jira/browse/ARROW-7180) - [CI] Java builds are not triggered on the master branch
+* [ARROW-7181](https://issues.apache.org/jira/browse/ARROW-7181) - [Python][Nightly] Wheel builds could NOT find ArrowPython
+* [ARROW-7183](https://issues.apache.org/jira/browse/ARROW-7183) - [CI][Crossbow] Re-skip r-sanitizer nightly tests
+* [ARROW-7187](https://issues.apache.org/jira/browse/ARROW-7187) - [C++][Doc] doxygen broken on master because of @
+* [ARROW-7188](https://issues.apache.org/jira/browse/ARROW-7188) - [C++][Doc] doxygen broken on master: missing param implicit\_casts
+* [ARROW-7189](https://issues.apache.org/jira/browse/ARROW-7189) - [CI][Crossbow] Nightly conda osx builds fail
+* [ARROW-7194](https://issues.apache.org/jira/browse/ARROW-7194) - [Rust] CSV Writer causing recursion errors
+* [ARROW-7199](https://issues.apache.org/jira/browse/ARROW-7199) - [Java] ConcurrentModificationException in BaseAllocator::getChildAllocators
+* [ARROW-7200](https://issues.apache.org/jira/browse/ARROW-7200) - [C++][Flight] Running Arrow Flight benchmark on two hosts doesn't work
+* [ARROW-7209](https://issues.apache.org/jira/browse/ARROW-7209) - [Python] tests with pandas master are failing now that \_\_from\_arrow\_\_ support landed in pandas
+* [ARROW-7212](https://issues.apache.org/jira/browse/ARROW-7212) - "go test -bench=8192 -run=. ./math" fails
+* [ARROW-7214](https://issues.apache.org/jira/browse/ARROW-7214) - [Python] unpickling a pyarrow table with dictionary fields crashes
+* [ARROW-7217](https://issues.apache.org/jira/browse/ARROW-7217) - [CI][Python] Use correct Python version in Github Actions
+* [ARROW-7225](https://issues.apache.org/jira/browse/ARROW-7225) - [C++] \`\*std::move(Result<T\>)\` calls T copy constructor
+* [ARROW-7249](https://issues.apache.org/jira/browse/ARROW-7249) - [CI] Release test fails in master due to new arrow-flight Rust crate
+* [ARROW-7250](https://issues.apache.org/jira/browse/ARROW-7250) - [C++] Undefined symbols for StringToFloatConverter::Impl with clang 4.x
+* [ARROW-7253](https://issues.apache.org/jira/browse/ARROW-7253) - [CI] Fix master failure with release test
+* [ARROW-7254](https://issues.apache.org/jira/browse/ARROW-7254) - BaseVariableWidthVector\#setSafe appears to make value offsets inconsistent
+* [ARROW-7264](https://issues.apache.org/jira/browse/ARROW-7264) - [Java] RangeEqualsVisitor type check is not correct
+* [ARROW-7266](https://issues.apache.org/jira/browse/ARROW-7266) - [Python] dictionary\_encode() of a slice gives wrong result
+* [ARROW-7271](https://issues.apache.org/jira/browse/ARROW-7271) - [C++][Flight] Use the single parameter version of SetTotalBytesLimit
+* [ARROW-7281](https://issues.apache.org/jira/browse/ARROW-7281) - [C++] AdaptiveIntBuilder::length() does not consider pending\_pos\_.
+* [ARROW-7282](https://issues.apache.org/jira/browse/ARROW-7282) - [Python] IO functions should raise FileNotFoundError when appropriate
+* [ARROW-7291](https://issues.apache.org/jira/browse/ARROW-7291) - [Dev] Fix FORMAT\_DIR in update-flatbuffers.sh
+* [ARROW-7294](https://issues.apache.org/jira/browse/ARROW-7294) - [Python] converted\_type\_name\_from\_enum(): Incorrect name for INT\_64
+* [ARROW-7295](https://issues.apache.org/jira/browse/ARROW-7295) - [R] Fix bad test that causes failure on R < 3.5
+* [ARROW-7298](https://issues.apache.org/jira/browse/ARROW-7298) - [C++] cpp/thirdparty/download-dependencies.sh is broken
+* [ARROW-7314](https://issues.apache.org/jira/browse/ARROW-7314) - [Python] Compiler warning in pyarrow
+* [ARROW-7318](https://issues.apache.org/jira/browse/ARROW-7318) - [C\#] TimestampArray serialization failure
+* [ARROW-7320](https://issues.apache.org/jira/browse/ARROW-7320) - [C++] Target arrow-type-benchmark failed to be built on bullx Linux
+* [ARROW-7327](https://issues.apache.org/jira/browse/ARROW-7327) - [CI] Failing C GLib and R buildbot builders
+* [ARROW-7328](https://issues.apache.org/jira/browse/ARROW-7328) - [CI] GitHub Actions should trigger on changes to GitHub Actions configuration
+* [ARROW-7341](https://issues.apache.org/jira/browse/ARROW-7341) - [CI] Unbreak nightly Conda R job
+* [ARROW-7343](https://issues.apache.org/jira/browse/ARROW-7343) - [Java] Memory leak in Flight DoGet when client cancels
+* [ARROW-7349](https://issues.apache.org/jira/browse/ARROW-7349) - [C++] Fix the bug of parsing string hex values
+* [ARROW-7353](https://issues.apache.org/jira/browse/ARROW-7353) - [C++] Disable -Wmissing-braces when building with clang
+* [ARROW-7354](https://issues.apache.org/jira/browse/ARROW-7354) - [C++] TestHadoopFileSystem::ThreadSafety fails with sigabort
+* [ARROW-7355](https://issues.apache.org/jira/browse/ARROW-7355) - [CI] Environment variables are defined twice for the fuzzit builds
+* [ARROW-7358](https://issues.apache.org/jira/browse/ARROW-7358) - [CI] [Dev] [C++] ccache disabled on conda-python-hdfs
+* [ARROW-7359](https://issues.apache.org/jira/browse/ARROW-7359) - [C++][Gandiva] Don't throw error for locate function with start position exceeding string length, return 0 instead
+* [ARROW-7360](https://issues.apache.org/jira/browse/ARROW-7360) - [R] Can't use dplyr filter() with variables defined in parent scope
+* [ARROW-7361](https://issues.apache.org/jira/browse/ARROW-7361) - [Rust] Build directory is not passed to ci/scripts/rust\_test.sh
+* [ARROW-7362](https://issues.apache.org/jira/browse/ARROW-7362) - [Python] ListArray.flatten() should take care of slicing offsets
+* [ARROW-7374](https://issues.apache.org/jira/browse/ARROW-7374) - [Dev] [C++] cuda-cpp docker image fails compiling Arrow
+* [ARROW-7381](https://issues.apache.org/jira/browse/ARROW-7381) - [C++][Packaging] Iterator change broke manylinux1 wheels
+* [ARROW-7386](https://issues.apache.org/jira/browse/ARROW-7386) - [C\#] Array offset does not work properly
+* [ARROW-7388](https://issues.apache.org/jira/browse/ARROW-7388) - [Python] Skip HDFS tests if libhdfs cannot be located
+* [ARROW-7389](https://issues.apache.org/jira/browse/ARROW-7389) - [Python][Packaging] Remove pyarrow.s3fs import check from the recipe
+* [ARROW-7393](https://issues.apache.org/jira/browse/ARROW-7393) - [Plasma] Fix plasma executable name in build for Java
+* [ARROW-7395](https://issues.apache.org/jira/browse/ARROW-7395) - [C++] Logical "or" with constants is a Clang warning
+* [ARROW-7397](https://issues.apache.org/jira/browse/ARROW-7397) - [C++] JSON whitespace length detection error
+* [ARROW-7404](https://issues.apache.org/jira/browse/ARROW-7404) - [C++][Gandiva] Fix utf8 char length error on Arm64
+* [ARROW-7406](https://issues.apache.org/jira/browse/ARROW-7406) - [Java] NonNullableStructVector\#hashCode should pass hasher to child vectors
+* [ARROW-7407](https://issues.apache.org/jira/browse/ARROW-7407) - [Python] Failed to install pyarrow 0.15.1 on Python 3.8
+* [ARROW-7408](https://issues.apache.org/jira/browse/ARROW-7408) - [C++] Reference benchmarks fail compiling
+* [ARROW-7435](https://issues.apache.org/jira/browse/ARROW-7435) - Security issue: ValidateOffsets() does not prevent buffer over-read
+* [ARROW-7436](https://issues.apache.org/jira/browse/ARROW-7436) - [Archery] Fix benchmark default configuration
+* [ARROW-7437](https://issues.apache.org/jira/browse/ARROW-7437) - [Java] ReadChannel\#readFully does not set writer index correctly
+* [ARROW-7442](https://issues.apache.org/jira/browse/ARROW-7442) - [Ruby] Specifying column type as time causes segmentation fault
+* [ARROW-7447](https://issues.apache.org/jira/browse/ARROW-7447) - [Java] ComplexCopier does incorrect copy in some cases
+* [ARROW-7450](https://issues.apache.org/jira/browse/ARROW-7450) - [CI][C++] test-ubuntu-18.04-cpp-static failing with linking error in arrow-io-hdfs-test
+* [ARROW-7458](https://issues.apache.org/jira/browse/ARROW-7458) - [GLib] incorrect build dependency in Makefile
+* [ARROW-7471](https://issues.apache.org/jira/browse/ARROW-7471) - [Python] Cython flake8 failures
+* [ARROW-7472](https://issues.apache.org/jira/browse/ARROW-7472) - [Java] Fix some incorrect behavior in UnionListWriter
+* [ARROW-7478](https://issues.apache.org/jira/browse/ARROW-7478) - [Rust] [DataFusion] Group by expression ignored unless paired with aggregate expression
+* [ARROW-7492](https://issues.apache.org/jira/browse/ARROW-7492) - [CI][Crossbow] Nightly homebrew-cpp job fails on Python installation
+* [ARROW-7497](https://issues.apache.org/jira/browse/ARROW-7497) - [Python] Test asserts: pandas.util.testing is deprecated, use pandas.testing instead
+* [ARROW-7500](https://issues.apache.org/jira/browse/ARROW-7500) - [C++][Dataset] regex\_error in hive partition on centos7 and opensuse42
+* [ARROW-7503](https://issues.apache.org/jira/browse/ARROW-7503) - [Rust] Rust builds are failing on master
+* [ARROW-7506](https://issues.apache.org/jira/browse/ARROW-7506) - [Java] JMH benchmarks should be called from main methods
+* [ARROW-7508](https://issues.apache.org/jira/browse/ARROW-7508) - [C\#] DateTime32 Reading is Broken
+* [ARROW-7510](https://issues.apache.org/jira/browse/ARROW-7510) - [C++] Array::null\_count() is not thread-compatible
+* [ARROW-7516](https://issues.apache.org/jira/browse/ARROW-7516) - [C\#] .NET Benchmarks are broken
+* [ARROW-7518](https://issues.apache.org/jira/browse/ARROW-7518) - [Python] Use PYARROW\_WITH\_HDFS when building wheels, conda packages
+* [ARROW-7527](https://issues.apache.org/jira/browse/ARROW-7527) - [Python] pandas/feather tests failing on pandas master
+* [ARROW-7528](https://issues.apache.org/jira/browse/ARROW-7528) - [Python] The pandas.datetime class (import of datetime.datetime) and pandas.np are deprecated
+* [ARROW-7535](https://issues.apache.org/jira/browse/ARROW-7535) - [C++] ASAN failure in validation
+* [ARROW-7543](https://issues.apache.org/jira/browse/ARROW-7543) - [R] arrow::write\_parquet() code examples do not work
+* [ARROW-7545](https://issues.apache.org/jira/browse/ARROW-7545) - [C++] [Dataset] Scanning dataset with dictionary type hangs
+* [ARROW-7551](https://issues.apache.org/jira/browse/ARROW-7551) - [FlightRPC][C++] Flight test on macOS fails due to Homebrew gRPC
+* [ARROW-7552](https://issues.apache.org/jira/browse/ARROW-7552) - [C++] TestSlowInputStream is flaky
+* [ARROW-7554](https://issues.apache.org/jira/browse/ARROW-7554) - [C++] Unknown CMake command "externalproject\_add".
+* [ARROW-7559](https://issues.apache.org/jira/browse/ARROW-7559) - [Rust] Possibly incorrect index check assertion in StringArray and BinaryArray
+* [ARROW-7561](https://issues.apache.org/jira/browse/ARROW-7561) - [Doc][Python] fix conda environment command
+* [ARROW-7563](https://issues.apache.org/jira/browse/ARROW-7563) - [Rust] failed to select a version for \`byteorder\`
+* [ARROW-7582](https://issues.apache.org/jira/browse/ARROW-7582) - [Rust][Flight] Unable to compile arrow.flight.protocol.rs
+* [ARROW-7583](https://issues.apache.org/jira/browse/ARROW-7583) - [C++][Flight] Auth handler tests fragile on Windows
+* [ARROW-7591](https://issues.apache.org/jira/browse/ARROW-7591) - [Python] DictionaryArray.to\_numpy returns dict of parts instead of numpy array
+* [ARROW-7592](https://issues.apache.org/jira/browse/ARROW-7592) - [C++] Fix crashes on corrupt IPC input
+* [ARROW-7593](https://issues.apache.org/jira/browse/ARROW-7593) - [CI][Python] Python datasets failing on master / not run on CI
+* [ARROW-7595](https://issues.apache.org/jira/browse/ARROW-7595) - [R][CI] R appveyor job fails due to pacman compression change
+* [ARROW-7596](https://issues.apache.org/jira/browse/ARROW-7596) - [Python] Only apply zero-copy DataFrame block optimizations when split\_blocks=True
+* [ARROW-7599](https://issues.apache.org/jira/browse/ARROW-7599) - [Java] Fix build break due to change in RangeEqualsVisitor
+* [ARROW-7603](https://issues.apache.org/jira/browse/ARROW-7603) - [CI][Crossbow] Nightly centos 8 job fails
+* [ARROW-7611](https://issues.apache.org/jira/browse/ARROW-7611) - [Packaging][Python] Artifact patterns for wheel are wrong
+* [ARROW-7612](https://issues.apache.org/jira/browse/ARROW-7612) - [Packaging][Python] Artifact paths for Conda on Windows are wrong
+* [ARROW-7614](https://issues.apache.org/jira/browse/ARROW-7614) - [Python] Slow performance in test\_parquet.py::test\_set\_data\_page\_size
+* [ARROW-7618](https://issues.apache.org/jira/browse/ARROW-7618) - [C++] Fix crashes or undefined behaviour on corrupt IPC input
+* [ARROW-7620](https://issues.apache.org/jira/browse/ARROW-7620) - [Rust] Windows builds failing due to flatbuffer compile error
+* [ARROW-7621](https://issues.apache.org/jira/browse/ARROW-7621) - [Doc] Doc build fails
+* [ARROW-7634](https://issues.apache.org/jira/browse/ARROW-7634) - [Python] Dataset tests failing to parse file paths on Windows
+* [ARROW-7638](https://issues.apache.org/jira/browse/ARROW-7638) - [Python] Segfault when inspecting dataset.Source with invalid file/partitioning
+* [ARROW-7639](https://issues.apache.org/jira/browse/ARROW-7639) - [R] Cannot convert Dictionary Array to R when values aren't strings
+* [ARROW-7640](https://issues.apache.org/jira/browse/ARROW-7640) - [C++][Dataset] segfault when reading compressed Parquet files if build didn't include support for codec
+* [ARROW-7647](https://issues.apache.org/jira/browse/ARROW-7647) - [C++] JSON reader fails to read arrays with few values
+* [ARROW-7650](https://issues.apache.org/jira/browse/ARROW-7650) - [C++] Dataset tests not built on Windows
+* [ARROW-7651](https://issues.apache.org/jira/browse/ARROW-7651) - [CI][Crossbow] Nightly macOS wheel builds fail
+* [ARROW-7652](https://issues.apache.org/jira/browse/ARROW-7652) - [Python][Dataset] Insert implicit cast in ScannerBuilder.filter
+* [ARROW-7661](https://issues.apache.org/jira/browse/ARROW-7661) - [Python] Non-optimal CSV chunking when no newline at end
+* [ARROW-7689](https://issues.apache.org/jira/browse/ARROW-7689) - [C++] Sporadic Flight test crash on macOS
+* [ARROW-7690](https://issues.apache.org/jira/browse/ARROW-7690) - [R] Cannot write parquet to OutputStream
+* [ARROW-7693](https://issues.apache.org/jira/browse/ARROW-7693) - [CI] Fix test-conda-python-3.7-spark-master nightly errors
+* [ARROW-7709](https://issues.apache.org/jira/browse/ARROW-7709) - [Python] Conversion from Table Column to Pandas loses name for Timestamps
+* [ARROW-7714](https://issues.apache.org/jira/browse/ARROW-7714) - [Release] Variable expansion is missing
+* [ARROW-7718](https://issues.apache.org/jira/browse/ARROW-7718) - [Release] Fix auto-retry in the binary release script
+* [ARROW-7723](https://issues.apache.org/jira/browse/ARROW-7723) - [Python] StructArray timestamp type with timezone to\_pandas convert error
+* [ARROW-7727](https://issues.apache.org/jira/browse/ARROW-7727) - [Python] Unable to read a ParquetDataset when schema validation is on.
+* [ARROW-8135](https://issues.apache.org/jira/browse/ARROW-8135) - [Python] Problem importing PyArrow on a cluster
+* [ARROW-8638](https://issues.apache.org/jira/browse/ARROW-8638) - Arrow Cython API usage gives an error when calling CTable API endpoints
+* [PARQUET-1692](https://issues.apache.org/jira/browse/PARQUET-1692) - [C++] LogicalType::FromThrift error on Centos 7 RPM
+* [PARQUET-1693](https://issues.apache.org/jira/browse/PARQUET-1693) - [C++] Build examples don't account for CMake compression feature flags
+* [PARQUET-1702](https://issues.apache.org/jira/browse/PARQUET-1702) - [C++] Make BufferedRowGroupWriter compatible with parquet encryption
+* [PARQUET-1706](https://issues.apache.org/jira/browse/PARQUET-1706) - [C++] Wrong dictionary\_page\_offset when writing only data pages via BufferedPageWriter
+* [PARQUET-1707](https://issues.apache.org/jira/browse/PARQUET-1707) - [C++] parquet-arrow-test fails with undefined behaviour sanitizer
+* [PARQUET-1709](https://issues.apache.org/jira/browse/PARQUET-1709) - [C++] Avoid unnecessary temporary std::shared\_ptr copies
+* [PARQUET-1715](https://issues.apache.org/jira/browse/PARQUET-1715) - [C++] Add the Parquet code samples to CI + Refactor Parquet Encryption Samples
+* [PARQUET-1720](https://issues.apache.org/jira/browse/PARQUET-1720) - [C++] Parquet JSONPrint not showing version correctly
+* [PARQUET-1747](https://issues.apache.org/jira/browse/PARQUET-1747) - [C++] Access to ColumnChunkMetaData fails when encryption is on
+* [PARQUET-1766](https://issues.apache.org/jira/browse/PARQUET-1766) - [C++] parquet NaN/null double statistics can result in endless loop
+* [PARQUET-1772](https://issues.apache.org/jira/browse/PARQUET-1772) - [C++] ParquetFileWriter: Data overwritten when output stream opened in append mode
+
+
+## New Features and Improvements
+
+* [ARROW-412](https://issues.apache.org/jira/browse/ARROW-412) - [Format] Handling of buffer padding in the IPC metadata
+* [ARROW-501](https://issues.apache.org/jira/browse/ARROW-501) - [C++] Implement concurrent / buffering InputStream for streaming data use cases
+* [ARROW-772](https://issues.apache.org/jira/browse/ARROW-772) - [C++] Implement take kernel functions
+* [ARROW-843](https://issues.apache.org/jira/browse/ARROW-843) - [C++] Implement Schema unification, merging unequal but equivalent schemas
+* [ARROW-976](https://issues.apache.org/jira/browse/ARROW-976) - [C++][Python] Provide API for defining and reading Parquet datasets with more ad hoc partition schemes
+* [ARROW-1036](https://issues.apache.org/jira/browse/ARROW-1036) - [C++] Define abstract API for filtering Arrow streams (e.g. predicate evaluation)
+* [ARROW-1119](https://issues.apache.org/jira/browse/ARROW-1119) - [Python/C++] Implement NativeFile interfaces for Amazon S3
+* [ARROW-1175](https://issues.apache.org/jira/browse/ARROW-1175) - [Java] Implement/test dictionary-encoded subfields
+* [ARROW-1456](https://issues.apache.org/jira/browse/ARROW-1456) - [Python] Run s3fs unit tests in Travis CI
+* [ARROW-1562](https://issues.apache.org/jira/browse/ARROW-1562) - [C++] Numeric kernel implementations for add (+)
+* [ARROW-1638](https://issues.apache.org/jira/browse/ARROW-1638) - [Java] IPC roundtrip for null type
+* [ARROW-1900](https://issues.apache.org/jira/browse/ARROW-1900) - [C++] Add kernel functions for determining value range (maximum and minimum) of integer arrays
+* [ARROW-2428](https://issues.apache.org/jira/browse/ARROW-2428) - [Python] Add API to map Arrow types (including extension types) to pandas ExtensionArray instances for to\_pandas conversions
+* [ARROW-2602](https://issues.apache.org/jira/browse/ARROW-2602) - [Packaging] Automate build of development docker containers
+* [ARROW-2863](https://issues.apache.org/jira/browse/ARROW-2863) - [Python] Add context manager APIs to RecordBatch\*Writer/Reader classes
+* [ARROW-3085](https://issues.apache.org/jira/browse/ARROW-3085) - [Rust] Add an adapter for parquet.
+* [ARROW-3408](https://issues.apache.org/jira/browse/ARROW-3408) - [C++] Add option to CSV reader to dictionary encode individual columns or all string / binary columns
+* [ARROW-3444](https://issues.apache.org/jira/browse/ARROW-3444) - [Python] Table.nbytes attribute
+* [ARROW-3706](https://issues.apache.org/jira/browse/ARROW-3706) - [Rust] Add record batch reader trait.
+* [ARROW-3789](https://issues.apache.org/jira/browse/ARROW-3789) - [Python] Enable calling object in Table.to\_pandas to "self-destruct" for improved memory use
+* [ARROW-3808](https://issues.apache.org/jira/browse/ARROW-3808) - [R] Implement [.arrow::Array
+* [ARROW-3813](https://issues.apache.org/jira/browse/ARROW-3813) - [R] lower level construction of Dictionary Arrays
+* [ARROW-4059](https://issues.apache.org/jira/browse/ARROW-4059) - [Rust] Parquet/Arrow Integration
+* [ARROW-4091](https://issues.apache.org/jira/browse/ARROW-4091) - [C++] Curate default list of CSV null spellings
+* [ARROW-4208](https://issues.apache.org/jira/browse/ARROW-4208) - [CI/Python] Have automated tests for S3
+* [ARROW-4219](https://issues.apache.org/jira/browse/ARROW-4219) - [Rust] [Parquet] Implement ArrowReader
+* [ARROW-4223](https://issues.apache.org/jira/browse/ARROW-4223) - [Python] Support scipy.sparse integration
+* [ARROW-4224](https://issues.apache.org/jira/browse/ARROW-4224) - [Python] Support integration with pydata/sparse library
+* [ARROW-4225](https://issues.apache.org/jira/browse/ARROW-4225) - [Format][C++] Add CSC sparse matrix support
+* [ARROW-4722](https://issues.apache.org/jira/browse/ARROW-4722) - [C++] Implement Bitmap class to modularize handling of bitmaps
+* [ARROW-4748](https://issues.apache.org/jira/browse/ARROW-4748) - [Rust] [DataFusion] GROUP BY performance could be optimized
+* [ARROW-4930](https://issues.apache.org/jira/browse/ARROW-4930) - [Python] Remove LIBDIR assumptions in Python build
+* [ARROW-5180](https://issues.apache.org/jira/browse/ARROW-5180) - [Rust] IPC Support
+* [ARROW-5181](https://issues.apache.org/jira/browse/ARROW-5181) - [Rust] Create Arrow File reader
+* [ARROW-5182](https://issues.apache.org/jira/browse/ARROW-5182) - [Rust] Create Arrow File writer
+* [ARROW-5227](https://issues.apache.org/jira/browse/ARROW-5227) - [Rust] [DataFusion] Re-implement query execution with an extensible physical query plan
+* [ARROW-5277](https://issues.apache.org/jira/browse/ARROW-5277) - [C\#] MemoryAllocator.Allocate(length: 0) should not return null
+* [ARROW-5333](https://issues.apache.org/jira/browse/ARROW-5333) - [C++] Fit build option summary into narrower console
+* [ARROW-5366](https://issues.apache.org/jira/browse/ARROW-5366) - [Rust] Implement Duration and Interval Arrays
+* [ARROW-5400](https://issues.apache.org/jira/browse/ARROW-5400) - [Rust] Test/ensure that reader and writer support zero-length record batches
+* [ARROW-5445](https://issues.apache.org/jira/browse/ARROW-5445) - [Website] Remove language that encourages pinning a version
+* [ARROW-5454](https://issues.apache.org/jira/browse/ARROW-5454) - [C++] Implement Take on ChunkedArray for DataFrame use
+* [ARROW-5502](https://issues.apache.org/jira/browse/ARROW-5502) - [R] file readers should mmap
+* [ARROW-5508](https://issues.apache.org/jira/browse/ARROW-5508) - [C++] Create reusable Iterator<T\> interface
+* [ARROW-5523](https://issues.apache.org/jira/browse/ARROW-5523) - [Python] [Packaging] Use HTTPS consistently for downloading dependencies
+* [ARROW-5712](https://issues.apache.org/jira/browse/ARROW-5712) - [C++][Parquet] Arrow time32/time64/timestamp ConvertedType not being restored properly
+* [ARROW-5767](https://issues.apache.org/jira/browse/ARROW-5767) - [Format] Permit dictionary replacements in IPC protocol
+* [ARROW-5801](https://issues.apache.org/jira/browse/ARROW-5801) - [CI] Dockerize (add to docker-compose) all Travis CI Linux tasks
+* [ARROW-5802](https://issues.apache.org/jira/browse/ARROW-5802) - [CI] Dockerize "lint" Travis CI job
+* [ARROW-5804](https://issues.apache.org/jira/browse/ARROW-5804) - [C++] Dockerize C++ CI job with conda-forge toolchain, code coverage from Travis CI
+* [ARROW-5805](https://issues.apache.org/jira/browse/ARROW-5805) - [Python] Dockerize (add to docker-compose) Python Travis CI job
+* [ARROW-5806](https://issues.apache.org/jira/browse/ARROW-5806) - [CI] Dockerize (add to docker-compose) Integration tests Travis CI entry
+* [ARROW-5807](https://issues.apache.org/jira/browse/ARROW-5807) - [JS] Dockerize NodeJS Travis CI entry
+* [ARROW-5808](https://issues.apache.org/jira/browse/ARROW-5808) - [GLib][Ruby] Dockerize (add to docker-compose) current GLib + Ruby Travis CI entry
+* [ARROW-5809](https://issues.apache.org/jira/browse/ARROW-5809) - [Rust] Dockerize (add to docker-compose) Rust Travis CI build
+* [ARROW-5810](https://issues.apache.org/jira/browse/ARROW-5810) - [Go] Dockerize Travis CI Go build
+* [ARROW-5831](https://issues.apache.org/jira/browse/ARROW-5831) - [Release] Migrate and improve binary release verification script
+* [ARROW-5839](https://issues.apache.org/jira/browse/ARROW-5839) - [Python] Test manylinux2010 in CI
+* [ARROW-5855](https://issues.apache.org/jira/browse/ARROW-5855) - [Python] Add support for Duration type
+* [ARROW-5859](https://issues.apache.org/jira/browse/ARROW-5859) - [Python] Support ExtensionType on conversion to numpy/pandas
+* [ARROW-5971](https://issues.apache.org/jira/browse/ARROW-5971) - [Website] Blog post introducing Arrow Flight
+* [ARROW-5994](https://issues.apache.org/jira/browse/ARROW-5994) - [CI] [Rust] Create nightly releases of the Rust implementation
+* [ARROW-6003](https://issues.apache.org/jira/browse/ARROW-6003) - [C++] Better input validation and error messaging in CSV reader
+* [ARROW-6074](https://issues.apache.org/jira/browse/ARROW-6074) - [FlightRPC] Implement middleware
+* [ARROW-6091](https://issues.apache.org/jira/browse/ARROW-6091) - [Rust] [DataFusion] Implement parallel execution for limit
+* [ARROW-6109](https://issues.apache.org/jira/browse/ARROW-6109) - [Integration] Docker image for integration testing can't be built on Windows
+* [ARROW-6112](https://issues.apache.org/jira/browse/ARROW-6112) - [Java] Update APIs to support 64-bit address space
+* [ARROW-6184](https://issues.apache.org/jira/browse/ARROW-6184) - [Java] Provide hash table based dictionary encoder
+* [ARROW-6251](https://issues.apache.org/jira/browse/ARROW-6251) - [Developer] Add PR merge tool to apache/arrow-site
+* [ARROW-6257](https://issues.apache.org/jira/browse/ARROW-6257) - [C++] Add fnmatch compatible globbing function
+* [ARROW-6274](https://issues.apache.org/jira/browse/ARROW-6274) - [Rust] [DataFusion] Add support for writing results to CSV
+* [ARROW-6277](https://issues.apache.org/jira/browse/ARROW-6277) - [C++][Parquet] Support reading/writing other Parquet primitive types to DictionaryArray
+* [ARROW-6283](https://issues.apache.org/jira/browse/ARROW-6283) - [Rust] [DataFusion] Implement operator to write query results to partitioned CSV
+* [ARROW-6285](https://issues.apache.org/jira/browse/ARROW-6285) - [GLib] Add support for LargeBinary and LargeString types
+* [ARROW-6286](https://issues.apache.org/jira/browse/ARROW-6286) - [GLib] Add support for LargeList type
+* [ARROW-6299](https://issues.apache.org/jira/browse/ARROW-6299) - [C++] Simplify FileFormat classes to singletons
+* [ARROW-6321](https://issues.apache.org/jira/browse/ARROW-6321) - [Python] Ability to create ExtensionBlock on conversion to pandas
+* [ARROW-6340](https://issues.apache.org/jira/browse/ARROW-6340) - [R] Implements low-level bindings to Dataset classes
+* [ARROW-6341](https://issues.apache.org/jira/browse/ARROW-6341) - [Python] Implement low-level bindings for Dataset
+* [ARROW-6352](https://issues.apache.org/jira/browse/ARROW-6352) - [Java] Add implementation of DenseUnionVector.
+* [ARROW-6367](https://issues.apache.org/jira/browse/ARROW-6367) - [C++][Gandiva] Implement string reverse
+* [ARROW-6378](https://issues.apache.org/jira/browse/ARROW-6378) - [C++][Dataset] Implement TreeDataSource
+* [ARROW-6386](https://issues.apache.org/jira/browse/ARROW-6386) - [C++][Documentation] Explicit documentation of null slot interpretation
+* [ARROW-6394](https://issues.apache.org/jira/browse/ARROW-6394) - [Java] Support conversions between delta vector and partial sum vector
+* [ARROW-6396](https://issues.apache.org/jira/browse/ARROW-6396) - [C++] Add ResolveNullOptions to Logical kernels
+* [ARROW-6398](https://issues.apache.org/jira/browse/ARROW-6398) - [C++] Consolidate ScanOptions and ScanContext
+* [ARROW-6405](https://issues.apache.org/jira/browse/ARROW-6405) - [Python] Add std::move wrapper for use in Cython
+* [ARROW-6452](https://issues.apache.org/jira/browse/ARROW-6452) - [Java] Override ValueVector toString() method
+* [ARROW-6463](https://issues.apache.org/jira/browse/ARROW-6463) - [C++][Python] Rename arrow::fs::Selector to FileSelector
+* [ARROW-6466](https://issues.apache.org/jira/browse/ARROW-6466) - [Developer] Refactor integration/integration\_test.py into a proper Python package
+* [ARROW-6468](https://issues.apache.org/jira/browse/ARROW-6468) - [C++] Remove unused hashing routines
+* [ARROW-6473](https://issues.apache.org/jira/browse/ARROW-6473) - [Format] Clarify dictionary encoding edge cases
+* [ARROW-6503](https://issues.apache.org/jira/browse/ARROW-6503) - [C++] Add an argument of memory pool object to SparseTensorConverter
+* [ARROW-6508](https://issues.apache.org/jira/browse/ARROW-6508) - [C++] Add Tensor and SparseTensor factory function with validations
+* [ARROW-6515](https://issues.apache.org/jira/browse/ARROW-6515) - [C++] Clean type\_traits.h definitions
+* [ARROW-6578](https://issues.apache.org/jira/browse/ARROW-6578) - [C++] Casting int64 to string columns
+* [ARROW-6592](https://issues.apache.org/jira/browse/ARROW-6592) - [Java] Add support for skipping decoding of columns/field in Avro converter
+* [ARROW-6594](https://issues.apache.org/jira/browse/ARROW-6594) - [Java] Support logical type encodings from Avro
+* [ARROW-6598](https://issues.apache.org/jira/browse/ARROW-6598) - [Java] Sort the code for ApproxEqualsVisitor
+* [ARROW-6608](https://issues.apache.org/jira/browse/ARROW-6608) - [C++] Make ARROW\_HDFS default to OFF
+* [ARROW-6610](https://issues.apache.org/jira/browse/ARROW-6610) - [C++] Add ARROW\_FILESYSTEM=ON/OFF CMake configuration flag
+* [ARROW-6611](https://issues.apache.org/jira/browse/ARROW-6611) - [C++] Make ARROW\_JSON=OFF the default
+* [ARROW-6612](https://issues.apache.org/jira/browse/ARROW-6612) - [C++] Add ARROW\_CSV CMake build flag
+* [ARROW-6619](https://issues.apache.org/jira/browse/ARROW-6619) - [Ruby] Add support for building Gandiva::Expression by Arrow::Schema\#build\_expression
+* [ARROW-6624](https://issues.apache.org/jira/browse/ARROW-6624) - [C++] Add SparseTensor.ToTensor() method
+* [ARROW-6625](https://issues.apache.org/jira/browse/ARROW-6625) - [Python] Allow concat\_tables to null or default fill missing columns
+* [ARROW-6631](https://issues.apache.org/jira/browse/ARROW-6631) - [C++] Do not build with any compression library dependencies by default
+* [ARROW-6632](https://issues.apache.org/jira/browse/ARROW-6632) - [C++] Do not build with ARROW\_COMPUTE=on and ARROW\_DATASET=on by default
+* [ARROW-6633](https://issues.apache.org/jira/browse/ARROW-6633) - [C++] Do not require double-conversion for default build
+* [ARROW-6634](https://issues.apache.org/jira/browse/ARROW-6634) - [C++] Do not require flatbuffers or flatbuffers\_ep to build
+* [ARROW-6635](https://issues.apache.org/jira/browse/ARROW-6635) - [C++] Do not require glog for default build
+* [ARROW-6636](https://issues.apache.org/jira/browse/ARROW-6636) - [C++] Do not build C++ command line utilities by default
+* [ARROW-6637](https://issues.apache.org/jira/browse/ARROW-6637) - [C++] Zero-dependency default core build
+* [ARROW-6646](https://issues.apache.org/jira/browse/ARROW-6646) - [Go] Amend NullType IPC implementation to append no buffers in RecordBatch message
+* [ARROW-6650](https://issues.apache.org/jira/browse/ARROW-6650) - [Rust] [Integration] Create methods to test Arrow files against Integration JSON
+* [ARROW-6656](https://issues.apache.org/jira/browse/ARROW-6656) - [Rust] [DataFusion] Implement MIN and MAX aggregate expressions
+* [ARROW-6657](https://issues.apache.org/jira/browse/ARROW-6657) - [Rust] [DataFusion] Implement COUNT aggregate expression
+* [ARROW-6658](https://issues.apache.org/jira/browse/ARROW-6658) - [Rust] [DataFusion] Implement AVG aggregate expression
+* [ARROW-6659](https://issues.apache.org/jira/browse/ARROW-6659) - [Rust] [DataFusion] Refactor of HashAggregateExec to support custom merge
+* [ARROW-6662](https://issues.apache.org/jira/browse/ARROW-6662) - [Java] Implement equals/approxEquals API for VectorSchemaRoot
+* [ARROW-6671](https://issues.apache.org/jira/browse/ARROW-6671) - [C++] Sparse tensor naming
+* [ARROW-6672](https://issues.apache.org/jira/browse/ARROW-6672) - [Java] Extract a common interface for dictionary builders
+* [ARROW-6685](https://issues.apache.org/jira/browse/ARROW-6685) - [C++/Python] S3 FileStat object's base\_path and type depends on trailing slash
+* [ARROW-6686](https://issues.apache.org/jira/browse/ARROW-6686) - [CI] Pull and push docker images to speed up the nightly builds
+* [ARROW-6688](https://issues.apache.org/jira/browse/ARROW-6688) - [Packaging] Include s3 support in the conda packages
+* [ARROW-6690](https://issues.apache.org/jira/browse/ARROW-6690) - [Rust] [DataFusion] HashAggregate without GROUP BY should use SIMD
+* [ARROW-6692](https://issues.apache.org/jira/browse/ARROW-6692) - [Rust] [DataFusion] Update examples to use physical query plan
+* [ARROW-6693](https://issues.apache.org/jira/browse/ARROW-6693) - [Rust] [DataFusion] Update unit tests to use physical query plan
+* [ARROW-6694](https://issues.apache.org/jira/browse/ARROW-6694) - [Rust] [DataFusion] Update integration tests to use physical plan
+* [ARROW-6695](https://issues.apache.org/jira/browse/ARROW-6695) - [Rust] [DataFusion] Remove execution of logical plan
+* [ARROW-6696](https://issues.apache.org/jira/browse/ARROW-6696) - [Rust] [DataFusion] Implement simple math operations in physical query plan
+* [ARROW-6700](https://issues.apache.org/jira/browse/ARROW-6700) - [Rust] [DataFusion] Use new parquet arrow reader
+* [ARROW-6707](https://issues.apache.org/jira/browse/ARROW-6707) - [Java] Improve the performance of JDBC adapters by using nullable information
+* [ARROW-6710](https://issues.apache.org/jira/browse/ARROW-6710) - [Java] Add JDBC adapter test to cover cases which contains some null values
+* [ARROW-6711](https://issues.apache.org/jira/browse/ARROW-6711) - [C++] Consolidate Filter and Expression classes
+* [ARROW-6721](https://issues.apache.org/jira/browse/ARROW-6721) - [Java] Avro adapter benchmark only runs once in JMH
+* [ARROW-6722](https://issues.apache.org/jira/browse/ARROW-6722) - [Java] Provide a uniform way to get vector name
+* [ARROW-6729](https://issues.apache.org/jira/browse/ARROW-6729) - [C++] StlStringBuffer constructor is not zero-copy
+* [ARROW-6730](https://issues.apache.org/jira/browse/ARROW-6730) - [CI] Use GitHub Actions for "C++ with clang 7" docker image
+* [ARROW-6731](https://issues.apache.org/jira/browse/ARROW-6731) - [CI] [Rust] Set up GitHub Actions to run Rust tests
+* [ARROW-6732](https://issues.apache.org/jira/browse/ARROW-6732) - [Java] Implement quick sort in a non-recursive way to avoid stack overflow
+* [ARROW-6741](https://issues.apache.org/jira/browse/ARROW-6741) - [Release] Update changelog.py to use APACHE\_ prefixed JIRA\_USERNAME and JIRA\_PASSWORD environment variables
+* [ARROW-6742](https://issues.apache.org/jira/browse/ARROW-6742) - [C++] Remove usage of boost::filesystem::path from arrow/io/hdfs\_internal.cc
+* [ARROW-6743](https://issues.apache.org/jira/browse/ARROW-6743) - [C++] Completely remove usage of boost::filesystem (except in hdfs\_internal)
+* [ARROW-6744](https://issues.apache.org/jira/browse/ARROW-6744) - [Rust] Export JsonEqual trait in the array module
+* [ARROW-6754](https://issues.apache.org/jira/browse/ARROW-6754) - [C++] Merge arrow/allocator.h and arrow/stl.h, or rename allocator.h
+* [ARROW-6758](https://issues.apache.org/jira/browse/ARROW-6758) - [Release] Install ephemeral node/npm/npx in release verification script
+* [ARROW-6764](https://issues.apache.org/jira/browse/ARROW-6764) - [C++] Add readahead iterator
+* [ARROW-6767](https://issues.apache.org/jira/browse/ARROW-6767) - [JS] lazily bind batches in scan/scanReverse
+* [ARROW-6768](https://issues.apache.org/jira/browse/ARROW-6768) - [C++][Dataset] Implement dataset::Scan to Table helper function
+* [ARROW-6769](https://issues.apache.org/jira/browse/ARROW-6769) - [C++][Dataset] End to End dataset integration test case
+* [ARROW-6770](https://issues.apache.org/jira/browse/ARROW-6770) - [CI][Travis] Download Minio quietly
+* [ARROW-6777](https://issues.apache.org/jira/browse/ARROW-6777) - [GLib][CI] Unpin gobject-introspection gem
+* [ARROW-6778](https://issues.apache.org/jira/browse/ARROW-6778) - [C++] Support DurationType in Cast kernel
+* [ARROW-6782](https://issues.apache.org/jira/browse/ARROW-6782) - [C++] Build minimal core Arrow libraries without any Boost headers
+* [ARROW-6784](https://issues.apache.org/jira/browse/ARROW-6784) - [C++][R] Move filter and take code from Rcpp to C++ library
+* [ARROW-6787](https://issues.apache.org/jira/browse/ARROW-6787) - [CI] Decommission "C++ with clang 7 and system packages" Travis CI job
+* [ARROW-6788](https://issues.apache.org/jira/browse/ARROW-6788) - [CI] Migrate Travis CI lint job to GitHub Actions
+* [ARROW-6789](https://issues.apache.org/jira/browse/ARROW-6789) - [Python] Automatically box bytes/buffer-like values yielded from \`FlightServerBase.do\_action\` in Result values
+* [ARROW-6790](https://issues.apache.org/jira/browse/ARROW-6790) - [Release] Automatically disable integration test cases in release verification
+* [ARROW-6793](https://issues.apache.org/jira/browse/ARROW-6793) - [R] Arrow C++ binary packaging for Linux
+* [ARROW-6797](https://issues.apache.org/jira/browse/ARROW-6797) - [Release] Use a separately cloned arrow-site repository in the website post release script
+* [ARROW-6802](https://issues.apache.org/jira/browse/ARROW-6802) - [Packaging][deb][RPM] Update qemu-user-static package URL
+* [ARROW-6803](https://issues.apache.org/jira/browse/ARROW-6803) - [Rust] [DataFusion] Aggregate queries are slower with new physical query plan
+* [ARROW-6804](https://issues.apache.org/jira/browse/ARROW-6804) - [CI] [Rust] Migrate Travis Rust job to GitHub Actions
+* [ARROW-6807](https://issues.apache.org/jira/browse/ARROW-6807) - [Java][FlightRPC] Expose gRPC service
+* [ARROW-6810](https://issues.apache.org/jira/browse/ARROW-6810) - [Website] Add docs for R package 0.15 release
+* [ARROW-6811](https://issues.apache.org/jira/browse/ARROW-6811) - [R] Assorted post-0.15 release cleanups
+* [ARROW-6814](https://issues.apache.org/jira/browse/ARROW-6814) - [C++] Resolve compiler warnings occurred on release build
+* [ARROW-6822](https://issues.apache.org/jira/browse/ARROW-6822) - [Website] merge\_pr.py is published
+* [ARROW-6824](https://issues.apache.org/jira/browse/ARROW-6824) - [Plasma] Support batched create and seal requests for small objects
+* [ARROW-6825](https://issues.apache.org/jira/browse/ARROW-6825) - [C++] Rework CSV reader IO around readahead iterator
+* [ARROW-6831](https://issues.apache.org/jira/browse/ARROW-6831) - [R] Update R macOS/Windows builds for change in cmake compression defaults
+* [ARROW-6832](https://issues.apache.org/jira/browse/ARROW-6832) - [R] Implement Codec::IsAvailable
+* [ARROW-6833](https://issues.apache.org/jira/browse/ARROW-6833) - [R][CI] Add crossbow job for full R autobrew macOS build
+* [ARROW-6836](https://issues.apache.org/jira/browse/ARROW-6836) - [Format] add a custom\_metadata:[KeyValue] field to the Footer table in File.fbs
+* [ARROW-6843](https://issues.apache.org/jira/browse/ARROW-6843) - [Website] Disable deploy on pull request
+* [ARROW-6847](https://issues.apache.org/jira/browse/ARROW-6847) - [C++] Add a range\_expression interface to Iterator<\>
+* [ARROW-6850](https://issues.apache.org/jira/browse/ARROW-6850) - [Java] JDBC converter supports Null type
+* [ARROW-6852](https://issues.apache.org/jira/browse/ARROW-6852) - [C++] memory-benchmark build failed on Arm64
+* [ARROW-6853](https://issues.apache.org/jira/browse/ARROW-6853) - [Java] Support vector and dictionary encoders using different hashers for calculating hashCode
+* [ARROW-6855](https://issues.apache.org/jira/browse/ARROW-6855) - [C++][Python][Flight] Implement Flight middleware
+* [ARROW-6862](https://issues.apache.org/jira/browse/ARROW-6862) - [Developer] Check pull request title
+* [ARROW-6863](https://issues.apache.org/jira/browse/ARROW-6863) - [Java] Provide parallel searcher
+* [ARROW-6865](https://issues.apache.org/jira/browse/ARROW-6865) - [Java] Improve the performance of comparing an ArrowBuf against a byte array
+* [ARROW-6866](https://issues.apache.org/jira/browse/ARROW-6866) - [Java] Improve the performance of calculating hash code for struct vector
+* [ARROW-6879](https://issues.apache.org/jira/browse/ARROW-6879) - [Rust] Add explicit SIMD for sum kernel
+* [ARROW-6880](https://issues.apache.org/jira/browse/ARROW-6880) - [Rust] Add explicit SIMD for min/max kernel
+* [ARROW-6881](https://issues.apache.org/jira/browse/ARROW-6881) - [Rust] Remove "array\_ops" in favor of the "compute" sub-module
+* [ARROW-6884](https://issues.apache.org/jira/browse/ARROW-6884) - [Python][Flight] Make server-side RPC exceptions more friendly?
+* [ARROW-6887](https://issues.apache.org/jira/browse/ARROW-6887) - [Java] Create prose documentation for using ValueVectors
+* [ARROW-6888](https://issues.apache.org/jira/browse/ARROW-6888) - [Java] Support copy operation for vector value comparators
+* [ARROW-6889](https://issues.apache.org/jira/browse/ARROW-6889) - [Java] ComplexCopier: enable FixedSizeList type & fix RangeEqualsVisitor stack overflow
+* [ARROW-6891](https://issues.apache.org/jira/browse/ARROW-6891) - [Rust] [Parquet] Add Utf8 support to ArrowReader
+* [ARROW-6902](https://issues.apache.org/jira/browse/ARROW-6902) - [C++] Add String\*/Binary\* support for Compare kernels
+* [ARROW-6904](https://issues.apache.org/jira/browse/ARROW-6904) - [Python] Implement MapArray and MapType
+* [ARROW-6907](https://issues.apache.org/jira/browse/ARROW-6907) - [C++][Plasma] Allow Plasma store to batch notifications to clients
+* [ARROW-6911](https://issues.apache.org/jira/browse/ARROW-6911) - [Java] Provide composite comparator
+* [ARROW-6912](https://issues.apache.org/jira/browse/ARROW-6912) - [Java] Extract a common base class for avro converter consumers
+* [ARROW-6916](https://issues.apache.org/jira/browse/ARROW-6916) - [Developer] Alphabetize task names in nightly Crossbow report
+* [ARROW-6918](https://issues.apache.org/jira/browse/ARROW-6918) - [R] Make docker-compose setup faster
+* [ARROW-6919](https://issues.apache.org/jira/browse/ARROW-6919) - [Python] Expose more builders in Cython
+* [ARROW-6920](https://issues.apache.org/jira/browse/ARROW-6920) - [Python] Create manylinux wheels for Python 3.8
+* [ARROW-6926](https://issues.apache.org/jira/browse/ARROW-6926) - [Python] Support \_\_sizeof\_\_ protocol for Python objects
+* [ARROW-6927](https://issues.apache.org/jira/browse/ARROW-6927) - [C++] Add gRPC version check
+* [ARROW-6928](https://issues.apache.org/jira/browse/ARROW-6928) - [Rust] Add FixedSizeList type
+* [ARROW-6930](https://issues.apache.org/jira/browse/ARROW-6930) - [Java] Create utility class for populating vector values used for test purpose only
+* [ARROW-6932](https://issues.apache.org/jira/browse/ARROW-6932) - [Java] incorrect log on known extension type
+* [ARROW-6933](https://issues.apache.org/jira/browse/ARROW-6933) - [Java] Support linear dictionary encoder
+* [ARROW-6936](https://issues.apache.org/jira/browse/ARROW-6936) - [Python] Improve error message when object of wrong type is given
+* [ARROW-6942](https://issues.apache.org/jira/browse/ARROW-6942) - [Developer] Add support for Parquet in pull request check by GitHub Actions
+* [ARROW-6943](https://issues.apache.org/jira/browse/ARROW-6943) - [Website] Translate Apache Arrow Flight introduction to Japanese
+* [ARROW-6944](https://issues.apache.org/jira/browse/ARROW-6944) - [Rust] Add StringType
+* [ARROW-6949](https://issues.apache.org/jira/browse/ARROW-6949) - [Java] Fix promotable writer to handle NullVectors
+* [ARROW-6951](https://issues.apache.org/jira/browse/ARROW-6951) - [C++][Dataset] Ensure column projection is passed to ParquetDataFragment
+* [ARROW-6952](https://issues.apache.org/jira/browse/ARROW-6952) - [C++][Dataset] Ensure expression filter is passed to ParquetDataFragment
+* [ARROW-6954](https://issues.apache.org/jira/browse/ARROW-6954) - [Python] [CI] Add Python 3.8 to CI matrix
+* [ARROW-6960](https://issues.apache.org/jira/browse/ARROW-6960) - [R] Add support for more compression codecs in Windows build
+* [ARROW-6961](https://issues.apache.org/jira/browse/ARROW-6961) - [C++][Gandiva] Add lower\_utf8 function in Gandiva
+* [ARROW-6963](https://issues.apache.org/jira/browse/ARROW-6963) - [Packaging][Wheel][OSX] Use crossbow's command to deploy artifacts from travis builds
+* [ARROW-6964](https://issues.apache.org/jira/browse/ARROW-6964) - [C++][Dataset] Expose a nested parallel option for Scanner::ToTable
+* [ARROW-6965](https://issues.apache.org/jira/browse/ARROW-6965) - [C++][Dataset] Optionally expose partition keys as materialized columns
+* [ARROW-6967](https://issues.apache.org/jira/browse/ARROW-6967) - [C++] Add filter expressions for IN, IS\_VALID
+* [ARROW-6969](https://issues.apache.org/jira/browse/ARROW-6969) - [C++][Dataset] ParquetScanTask eagerly loads file
+* [ARROW-6970](https://issues.apache.org/jira/browse/ARROW-6970) - [Packaging][RPM] Add support for CentOS 8
+* [ARROW-6973](https://issues.apache.org/jira/browse/ARROW-6973) - [C++][ThreadPool] Use perfect forwarding in Submit
+* [ARROW-6975](https://issues.apache.org/jira/browse/ARROW-6975) - [C++] Put make\_unique in its own header
+* [ARROW-6980](https://issues.apache.org/jira/browse/ARROW-6980) - [R] dplyr backend for RecordBatch/Table
+* [ARROW-6984](https://issues.apache.org/jira/browse/ARROW-6984) - [C++] Update LZ4 to 1.9.2 for CVE-2019-17543
+* [ARROW-6986](https://issues.apache.org/jira/browse/ARROW-6986) - [R] Add basic Expression class
+* [ARROW-6987](https://issues.apache.org/jira/browse/ARROW-6987) - [CI] Travis OSX failing to install sdk headers
+* [ARROW-6991](https://issues.apache.org/jira/browse/ARROW-6991) - [Packaging][deb] Add support for Ubuntu 19.10
+* [ARROW-6994](https://issues.apache.org/jira/browse/ARROW-6994) - [C++] Research jemalloc memory page reclamation configuration on macOS when background\_thread option is unavailable
+* [ARROW-6997](https://issues.apache.org/jira/browse/ARROW-6997) - [Packaging] Add support for RHEL
+* [ARROW-7000](https://issues.apache.org/jira/browse/ARROW-7000) - [C++][Gandiva] Handle empty inputs in string lower, upper functions
+* [ARROW-7003](https://issues.apache.org/jira/browse/ARROW-7003) - [Format] [Rust] Generate flatbuffers files in build script
+* [ARROW-7004](https://issues.apache.org/jira/browse/ARROW-7004) - [Plasma] Make it possible to bump up object in LRU cache
+* [ARROW-7006](https://issues.apache.org/jira/browse/ARROW-7006) - [Rust] Bump flatbuffers version to avoid vulnerability
+* [ARROW-7007](https://issues.apache.org/jira/browse/ARROW-7007) - [C++] Enable mmap option for LocalFs
+* [ARROW-7014](https://issues.apache.org/jira/browse/ARROW-7014) - [Developer] Write script to verify Linux wheels given local environment with conda or virtualenv
+* [ARROW-7015](https://issues.apache.org/jira/browse/ARROW-7015) - [Developer] Write script to verify macOS wheels given local environment with conda or virtualenv
+* [ARROW-7016](https://issues.apache.org/jira/browse/ARROW-7016) - [Developer][Python] Write script to verify Windows wheels given local environment with conda
+* [ARROW-7019](https://issues.apache.org/jira/browse/ARROW-7019) - [Java] Improve the performance of loading validity buffers
+* [ARROW-7026](https://issues.apache.org/jira/browse/ARROW-7026) - [Java] Remove assertions in MessageSerializer/vector/writer/reader
+* [ARROW-7031](https://issues.apache.org/jira/browse/ARROW-7031) - [Python] Expose the offsets of a ListArray in python
+* [ARROW-7032](https://issues.apache.org/jira/browse/ARROW-7032) - [Release] Run the python unit tests in the release verification script
+* [ARROW-7034](https://issues.apache.org/jira/browse/ARROW-7034) - [CI][Crossbow] Skip known nightly failures
+* [ARROW-7035](https://issues.apache.org/jira/browse/ARROW-7035) - [R] Default arguments are unclear in write\_parquet docs
+* [ARROW-7036](https://issues.apache.org/jira/browse/ARROW-7036) - [C++] Upgrade ORC to avoid compile errors
+* [ARROW-7037](https://issues.apache.org/jira/browse/ARROW-7037) - [C++] Compile error on the combination of protobuf \>= 3.9 and clang
+* [ARROW-7039](https://issues.apache.org/jira/browse/ARROW-7039) - [Python] Typecheck expects pandas to be installed
+* [ARROW-7047](https://issues.apache.org/jira/browse/ARROW-7047) - [C++][Dataset] Filter expressions should not require exact type match
+* [ARROW-7052](https://issues.apache.org/jira/browse/ARROW-7052) - [C++] Datasets example fails to build with ARROW\_SHARED=OFF
+* [ARROW-7054](https://issues.apache.org/jira/browse/ARROW-7054) - [Docs] Add option to override displayed docs version with an environment variable
+* [ARROW-7057](https://issues.apache.org/jira/browse/ARROW-7057) - [C++] Add API to parse URI query strings
+* [ARROW-7058](https://issues.apache.org/jira/browse/ARROW-7058) - [C++] FileSystemDataSourceDiscovery should apply partition schemes relative to the base\_dir of its selector
+* [ARROW-7060](https://issues.apache.org/jira/browse/ARROW-7060) - [R] Post-0.15.1 cleanup
+* [ARROW-7061](https://issues.apache.org/jira/browse/ARROW-7061) - [C++][Dataset] FileSystemDiscovery with ParquetFileFormat should ignore files that aren't Parquet
+* [ARROW-7062](https://issues.apache.org/jira/browse/ARROW-7062) - [C++] Parquet file parse error messages should include the file name
+* [ARROW-7064](https://issues.apache.org/jira/browse/ARROW-7064) - [R] Implement null type
+* [ARROW-7066](https://issues.apache.org/jira/browse/ARROW-7066) - [Python] Support returning ChunkedArray from \_\_arrow\_array\_\_?
+* [ARROW-7067](https://issues.apache.org/jira/browse/ARROW-7067) - [CI] Disable code coverage on Travis-CI
+* [ARROW-7069](https://issues.apache.org/jira/browse/ARROW-7069) - [C++][Dataset] Replace ConstantPartitionScheme with PrefixDictionaryPartitionScheme
+* [ARROW-7070](https://issues.apache.org/jira/browse/ARROW-7070) - [Packaging][deb] Update package names for 1.0.0
+* [ARROW-7072](https://issues.apache.org/jira/browse/ARROW-7072) - [Java] Support concatenating validity bits efficiently
+* [ARROW-7082](https://issues.apache.org/jira/browse/ARROW-7082) - [Packaging][deb] Add apache-arrow-archive-keyring
+* [ARROW-7086](https://issues.apache.org/jira/browse/ARROW-7086) - [C++] Provide a wrapper for invoking factories to produce a Result
+* [ARROW-7092](https://issues.apache.org/jira/browse/ARROW-7092) - [R] Add vignette for dplyr and datasets
+* [ARROW-7093](https://issues.apache.org/jira/browse/ARROW-7093) - [R] Support creating ScalarExpressions for more data types
+* [ARROW-7094](https://issues.apache.org/jira/browse/ARROW-7094) - [C++] FileSystemDataSource should use an owning pointer for fs::FileSystem
+* [ARROW-7095](https://issues.apache.org/jira/browse/ARROW-7095) - [R] Better handling of unsupported filter and mutate expressions in dplyr methods
+* [ARROW-7096](https://issues.apache.org/jira/browse/ARROW-7096) - [C++] Add options structs for concatenation-with-promotion and schema unification
+* [ARROW-7098](https://issues.apache.org/jira/browse/ARROW-7098) - [Java] Improve the performance of comparing two memory blocks
+* [ARROW-7099](https://issues.apache.org/jira/browse/ARROW-7099) - [C++] Disambiguate function calls in csv parser test
+* [ARROW-7101](https://issues.apache.org/jira/browse/ARROW-7101) - [CI] Refactor docker-compose setup and use it with GitHub Actions
+* [ARROW-7103](https://issues.apache.org/jira/browse/ARROW-7103) - [R] Various minor cleanups
+* [ARROW-7107](https://issues.apache.org/jira/browse/ARROW-7107) - [C++][MinGW] Enable Flight on AppVeyor
+* [ARROW-7110](https://issues.apache.org/jira/browse/ARROW-7110) - [GLib] Add filter support for GArrowTable, GArrowChunkedArray, and GArrowRecordBatch
+* [ARROW-7111](https://issues.apache.org/jira/browse/ARROW-7111) - [GLib] Add take support for GArrowTable, GArrowChunkedArray, and GArrowRecordBatch
+* [ARROW-7113](https://issues.apache.org/jira/browse/ARROW-7113) - [Rust] Buffer should accept memory owned by others
+* [ARROW-7116](https://issues.apache.org/jira/browse/ARROW-7116) - [CI] Use the Docker repository provided by the Apache organisation
+* [ARROW-7120](https://issues.apache.org/jira/browse/ARROW-7120) - [C++][CI] Add .ccache to the docker-compose volume mounts
+* [ARROW-7146](https://issues.apache.org/jira/browse/ARROW-7146) - [R][CI] Various fixes and speedups for the R docker-compose setup
+* [ARROW-7147](https://issues.apache.org/jira/browse/ARROW-7147) - [C++][Dataset] Refactor dataset's API to use Result<T\>
+* [ARROW-7148](https://issues.apache.org/jira/browse/ARROW-7148) - [C++][Dataset] API cleanup
+* [ARROW-7149](https://issues.apache.org/jira/browse/ARROW-7149) - [C++] Remove experimental status on filesystem APIs
+* [ARROW-7155](https://issues.apache.org/jira/browse/ARROW-7155) - [Java][CI] Add Maven wrapper to simplify the setup process
+* [ARROW-7159](https://issues.apache.org/jira/browse/ARROW-7159) - [CI] Run HDFS tests as cron task
+* [ARROW-7160](https://issues.apache.org/jira/browse/ARROW-7160) - [C++] Update string\_view backport
+* [ARROW-7161](https://issues.apache.org/jira/browse/ARROW-7161) - [C++] Migrate filesystem layer from Status to Result
+* [ARROW-7162](https://issues.apache.org/jira/browse/ARROW-7162) - [C++] Cleanup warnings in cmake\_modules/SetupCxxFlags.cmake
+* [ARROW-7166](https://issues.apache.org/jira/browse/ARROW-7166) - [Java] Remove redundant code for JDBC adapters
+* [ARROW-7169](https://issues.apache.org/jira/browse/ARROW-7169) - [C++] Vendor uriparser library
+* [ARROW-7171](https://issues.apache.org/jira/browse/ARROW-7171) - [Ruby] Pass Array<Boolean\> for Arrow::Table\#filter
+* [ARROW-7172](https://issues.apache.org/jira/browse/ARROW-7172) - [C++][Dataset] Improve format of Expression::ToString
+* [ARROW-7176](https://issues.apache.org/jira/browse/ARROW-7176) - [C++] Fix arrow::ipc compiler warning
+* [ARROW-7178](https://issues.apache.org/jira/browse/ARROW-7178) - [C++] Vendor forward compatible std::optional
+* [ARROW-7185](https://issues.apache.org/jira/browse/ARROW-7185) - [R][Dataset] Add bindings for IN, IS\_VALID expressions
+* [ARROW-7186](https://issues.apache.org/jira/browse/ARROW-7186) - [R] Add inline comments to document the dplyr code
+* [ARROW-7192](https://issues.apache.org/jira/browse/ARROW-7192) - [Rust] Implement Flight crate
+* [ARROW-7193](https://issues.apache.org/jira/browse/ARROW-7193) - [Rust] Create Arrow stream reader
+* [ARROW-7195](https://issues.apache.org/jira/browse/ARROW-7195) - [Ruby] Improve \#filter, \#take, and \#is\_in
+* [ARROW-7196](https://issues.apache.org/jira/browse/ARROW-7196) - [Ruby] Remove needless BinaryArrayBuilder\#append\_values
+* [ARROW-7197](https://issues.apache.org/jira/browse/ARROW-7197) - [Ruby] Suppress keyword argument related warnings with Ruby 2.7
+* [ARROW-7204](https://issues.apache.org/jira/browse/ARROW-7204) - [C++][Dataset] In expression should not require exact type match
+* [ARROW-7206](https://issues.apache.org/jira/browse/ARROW-7206) - [Java] Avoid string concatenation when calling Preconditions\#checkArgument
+* [ARROW-7207](https://issues.apache.org/jira/browse/ARROW-7207) - [Rust] Update Generated Flatbuffer Files
+* [ARROW-7210](https://issues.apache.org/jira/browse/ARROW-7210) - [C++] Scalar cast should support time-based types
+* [ARROW-7211](https://issues.apache.org/jira/browse/ARROW-7211) - [Rust] [Parquet] Support writing to byte buffers
+* [ARROW-7216](https://issues.apache.org/jira/browse/ARROW-7216) - [Java] Improve the performance of setting/clearing individual bits
+* [ARROW-7219](https://issues.apache.org/jira/browse/ARROW-7219) - [CI][Python] Install pickle5 in the conda-python docker image for python version 3.6
+* [ARROW-7227](https://issues.apache.org/jira/browse/ARROW-7227) - [Python] Provide wrappers for ConcatenateWithPromotion()
+* [ARROW-7228](https://issues.apache.org/jira/browse/ARROW-7228) - [Python] Expose RecordBatch.FromStructArray in Python.
+* [ARROW-7235](https://issues.apache.org/jira/browse/ARROW-7235) - [C++] Add Result<T\> APIs to arrow/io
+* [ARROW-7236](https://issues.apache.org/jira/browse/ARROW-7236) - [C++] Add Result<T\> APIs to arrow/csv
+* [ARROW-7240](https://issues.apache.org/jira/browse/ARROW-7240) - [C++] Add Result<T\> APIs to arrow/util
+* [ARROW-7246](https://issues.apache.org/jira/browse/ARROW-7246) - [CI][Python] wheel can't be built due to SSL\_ST\_INIT error
+* [ARROW-7247](https://issues.apache.org/jira/browse/ARROW-7247) - [CI][Python] wheel can't be built due to wget and OpenSSL errors
+* [ARROW-7248](https://issues.apache.org/jira/browse/ARROW-7248) - [Rust] Automatically Regenerate IPC messages from Flatbuffers
+* [ARROW-7255](https://issues.apache.org/jira/browse/ARROW-7255) - [CI] Run source release test on pull request
+* [ARROW-7257](https://issues.apache.org/jira/browse/ARROW-7257) - [CI] Homebrew formula fails due to openssl formula name update
+* [ARROW-7258](https://issues.apache.org/jira/browse/ARROW-7258) - [CI] Fuzzit job fails due to a nonexistent directory
+* [ARROW-7259](https://issues.apache.org/jira/browse/ARROW-7259) - [Java] Support subfield encoders using different hashers
+* [ARROW-7260](https://issues.apache.org/jira/browse/ARROW-7260) - [CI] Ubuntu 14.04 test fails due to a user-defined literal
+* [ARROW-7261](https://issues.apache.org/jira/browse/ARROW-7261) - [Python] Python support for fixed size list type
+* [ARROW-7262](https://issues.apache.org/jira/browse/ARROW-7262) - [C++][Gandiva] Implement replace function in Gandiva
+* [ARROW-7263](https://issues.apache.org/jira/browse/ARROW-7263) - [C++][Gandiva] Implement locate and position functions
+* [ARROW-7268](https://issues.apache.org/jira/browse/ARROW-7268) - [Rust] Propagate \`custom\_metadata\` field from IPC message
+* [ARROW-7269](https://issues.apache.org/jira/browse/ARROW-7269) - [C++] Fix arrow::parquet compiler warning
+* [ARROW-7270](https://issues.apache.org/jira/browse/ARROW-7270) - [Go] preserve CSV reading behaviour, improve memory usage
+* [ARROW-7274](https://issues.apache.org/jira/browse/ARROW-7274) - [C++] Add Result<T\> APIs to Decimal class
+* [ARROW-7275](https://issues.apache.org/jira/browse/ARROW-7275) - [Ruby] Add support for Arrow::ListDataType.new(data\_type)
+* [ARROW-7276](https://issues.apache.org/jira/browse/ARROW-7276) - [Ruby] Add support for building Arrow::ListArray from [[...]]
+* [ARROW-7277](https://issues.apache.org/jira/browse/ARROW-7277) - [Document] Add discussion about vector lifecycle
+* [ARROW-7279](https://issues.apache.org/jira/browse/ARROW-7279) - [C++] Rename UnionArray::type\_ids to UnionArray::type\_codes
+* [ARROW-7284](https://issues.apache.org/jira/browse/ARROW-7284) - [Java] Ensure Java implementation meets clarified dictionary spec
+* [ARROW-7289](https://issues.apache.org/jira/browse/ARROW-7289) - [C\#] ListType constructor argument is redundant
+* [ARROW-7290](https://issues.apache.org/jira/browse/ARROW-7290) - [C\#] Implement ListArray Builder
+* [ARROW-7292](https://issues.apache.org/jira/browse/ARROW-7292) - [C++] [CI] [Dev] Add ASAN / UBSAN CI run
+* [ARROW-7293](https://issues.apache.org/jira/browse/ARROW-7293) - [Dev] [C++] Persist ccache in docker-compose build volumes
+* [ARROW-7296](https://issues.apache.org/jira/browse/ARROW-7296) - [Python] Add ORC API documentation
+* [ARROW-7299](https://issues.apache.org/jira/browse/ARROW-7299) - [GLib] Use Result instead of Status
+* [ARROW-7303](https://issues.apache.org/jira/browse/ARROW-7303) - [C++] Refactor benchmarks to use new Result APIs
+* [ARROW-7306](https://issues.apache.org/jira/browse/ARROW-7306) - [C++] Add Result-returning version of FileSystemFromUri
+* [ARROW-7307](https://issues.apache.org/jira/browse/ARROW-7307) - [CI][GLib] Documentation isn't generated
+* [ARROW-7309](https://issues.apache.org/jira/browse/ARROW-7309) - [Python] Support HDFS federation viewfs://
+* [ARROW-7310](https://issues.apache.org/jira/browse/ARROW-7310) - [Python] Expose HDFS implementation for pyarrow.fs
+* [ARROW-7311](https://issues.apache.org/jira/browse/ARROW-7311) - [Python] Return filesystem and path from URI
+* [ARROW-7312](https://issues.apache.org/jira/browse/ARROW-7312) - [Rust] ArrowError should implement std::error::Error
+* [ARROW-7317](https://issues.apache.org/jira/browse/ARROW-7317) - [C++] Migrate Iterator API to Result<T\>
+* [ARROW-7319](https://issues.apache.org/jira/browse/ARROW-7319) - [C++] Refactor Iterator<T\> to yield Result<T\>
+* [ARROW-7321](https://issues.apache.org/jira/browse/ARROW-7321) - [CI][GLib] Failed to build with GLib warning
+* [ARROW-7322](https://issues.apache.org/jira/browse/ARROW-7322) - [CI][Python] Fall back to arrowdev dockerhub organization for manylinux images
+* [ARROW-7323](https://issues.apache.org/jira/browse/ARROW-7323) - [CI][Rust] Nightly CI fails due to a different toolchain
+* [ARROW-7324](https://issues.apache.org/jira/browse/ARROW-7324) - [Rust] Add Timezone to Timestamp
+* [ARROW-7325](https://issues.apache.org/jira/browse/ARROW-7325) - [Rust] [Parquet] Update to parquet-format 2.6 and thrift 0.12
+* [ARROW-7329](https://issues.apache.org/jira/browse/ARROW-7329) - [Java] AllocationManager: Allow managing different types of memory other than those allocated using Netty
+* [ARROW-7333](https://issues.apache.org/jira/browse/ARROW-7333) - [CI][Rust] Remove duplicated nightly job
+* [ARROW-7334](https://issues.apache.org/jira/browse/ARROW-7334) - [CI][Python] macOS uses Python 2
+* [ARROW-7339](https://issues.apache.org/jira/browse/ARROW-7339) - [CMake] Thrift version not respected in CMake configuration version.txt
+* [ARROW-7340](https://issues.apache.org/jira/browse/ARROW-7340) - [CI] Prune defunct appveyor build setup
+* [ARROW-7344](https://issues.apache.org/jira/browse/ARROW-7344) - [Packaging][Python] Build manylinux2014 wheels
+* [ARROW-7346](https://issues.apache.org/jira/browse/ARROW-7346) - [CI] Explicit usage of ccache across the builds
+* [ARROW-7347](https://issues.apache.org/jira/browse/ARROW-7347) - [C++] Update bundled Boost to 1.71.0
+* [ARROW-7348](https://issues.apache.org/jira/browse/ARROW-7348) - [Rust] Add api to return references of buffer of null bitmap.
+* [ARROW-7351](https://issues.apache.org/jira/browse/ARROW-7351) - [Developer] Only suggest cpp-\* fix versions when merging Parquet patches
+* [ARROW-7357](https://issues.apache.org/jira/browse/ARROW-7357) - [Go] migrate from pkg/errors to x/xerrors
+* [ARROW-7366](https://issues.apache.org/jira/browse/ARROW-7366) - [C++][Dataset] Use PartitionSchemeDiscovery in DataSourceDiscovery
+* [ARROW-7367](https://issues.apache.org/jira/browse/ARROW-7367) - [Python] Use np.full instead of np.array.repeat in ParquetDatasetPiece
+* [ARROW-7368](https://issues.apache.org/jira/browse/ARROW-7368) - [Ruby] Use :arrow\_file and :arrow\_streaming for format name
+* [ARROW-7369](https://issues.apache.org/jira/browse/ARROW-7369) - [GLib] Add garrow\_table\_combine\_chunks
+* [ARROW-7370](https://issues.apache.org/jira/browse/ARROW-7370) - [C++] Build fails with old Protobuf when AUTO detection is used
+* [ARROW-7377](https://issues.apache.org/jira/browse/ARROW-7377) - [C++][Dataset] Simplify parquet column projection
+* [ARROW-7378](https://issues.apache.org/jira/browse/ARROW-7378) - [C++][Gandiva] Loop vectorization broken in IR optimization
+* [ARROW-7379](https://issues.apache.org/jira/browse/ARROW-7379) - [C++] Introduce SchemaBuilder companion class and Field::IsCompatibleWith
+* [ARROW-7380](https://issues.apache.org/jira/browse/ARROW-7380) - [C++][Dataset] Implement DatasetFactory
+* [ARROW-7382](https://issues.apache.org/jira/browse/ARROW-7382) - [C++][Dataset] Refactor FsDsDiscovery constructors
+* [ARROW-7387](https://issues.apache.org/jira/browse/ARROW-7387) - [C\#] Support ListType Serialization
+* [ARROW-7392](https://issues.apache.org/jira/browse/ARROW-7392) - [Packaging] Add conda packaging tasks for python 3.8
+* [ARROW-7398](https://issues.apache.org/jira/browse/ARROW-7398) - [Packaging][Python] Conda builds are failing on macOS
+* [ARROW-7399](https://issues.apache.org/jira/browse/ARROW-7399) - [C++][Gandiva] Gandiva does not pick runtime cpu features
+* [ARROW-7402](https://issues.apache.org/jira/browse/ARROW-7402) - [C++] Add more information on CUDA error
+* [ARROW-7403](https://issues.apache.org/jira/browse/ARROW-7403) - [C++][JSON] Enable Rapidjson on Arm64 Neon
+* [ARROW-7410](https://issues.apache.org/jira/browse/ARROW-7410) - [Python] [Doc] Document filesystem APIs
+* [ARROW-7411](https://issues.apache.org/jira/browse/ARROW-7411) - [C++][Flight] Incorrect Arrow Flight benchmark output
+* [ARROW-7413](https://issues.apache.org/jira/browse/ARROW-7413) - [Python][Dataset] Add tests for PartitionSchemeDiscovery
+* [ARROW-7414](https://issues.apache.org/jira/browse/ARROW-7414) - [R][Dataset] Implement PartitionSchemeDiscovery
+* [ARROW-7415](https://issues.apache.org/jira/browse/ARROW-7415) - [C++][Dataset] Implement IpcFormat for sources composed of ipc files
+* [ARROW-7416](https://issues.apache.org/jira/browse/ARROW-7416) - [R][Nightly] Fix macos-r-autobrew build on R 3.6.2
+* [ARROW-7417](https://issues.apache.org/jira/browse/ARROW-7417) - [C++] Add a docker-compose entry for CUDA 10.1
+* [ARROW-7418](https://issues.apache.org/jira/browse/ARROW-7418) - [C++] Can't build with g++ 5.4.0 on Ubuntu 16.04
+* [ARROW-7420](https://issues.apache.org/jira/browse/ARROW-7420) - [C++] Migrate tensor related APIs to Result-returning version
+* [ARROW-7429](https://issues.apache.org/jira/browse/ARROW-7429) - [Java] Enhance code style checking for Java code (remove consecutive spaces)
+* [ARROW-7430](https://issues.apache.org/jira/browse/ARROW-7430) - [Python] Add more docstrings to dataset bindings
+* [ARROW-7431](https://issues.apache.org/jira/browse/ARROW-7431) - [Python] Add dataset API to reference docs
+* [ARROW-7432](https://issues.apache.org/jira/browse/ARROW-7432) - [Python] Add higher-level datasets functions
+* [ARROW-7439](https://issues.apache.org/jira/browse/ARROW-7439) - [C++][Dataset] Remove dataset pointer aliases
+* [ARROW-7449](https://issues.apache.org/jira/browse/ARROW-7449) - [GLib] Make GObject Introspection optional
+* [ARROW-7452](https://issues.apache.org/jira/browse/ARROW-7452) - [GLib] Make GArrowTimeDataType abstract
+* [ARROW-7453](https://issues.apache.org/jira/browse/ARROW-7453) - [Ruby] Add support for Arrow::NullArray\#[]
+* [ARROW-7454](https://issues.apache.org/jira/browse/ARROW-7454) - [Ruby] Add support for saving/loading TSV
+* [ARROW-7455](https://issues.apache.org/jira/browse/ARROW-7455) - [Ruby] Use Arrow::DataType.resolve for all GArrowDataType input
+* [ARROW-7456](https://issues.apache.org/jira/browse/ARROW-7456) - [C++] Add support for YYYY-MM-DDThh and YYYY-MM-DDThh:mm timestamp formats
+* [ARROW-7457](https://issues.apache.org/jira/browse/ARROW-7457) - [Doc] Fix typos
+* [ARROW-7459](https://issues.apache.org/jira/browse/ARROW-7459) - [Python] Documentation lint fails
+* [ARROW-7460](https://issues.apache.org/jira/browse/ARROW-7460) - [Rust] Improve some kernels with autovectorisation
+* [ARROW-7461](https://issues.apache.org/jira/browse/ARROW-7461) - [Java] Fix typos and spelling
+* [ARROW-7463](https://issues.apache.org/jira/browse/ARROW-7463) - [Doc] Fix a broken link and typos
+* [ARROW-7464](https://issues.apache.org/jira/browse/ARROW-7464) - [C++] Refine CpuInfo singleton with std::call\_once
+* [ARROW-7465](https://issues.apache.org/jira/browse/ARROW-7465) - [C++] Add Arrow memory benchmark for Arm64
+* [ARROW-7468](https://issues.apache.org/jira/browse/ARROW-7468) - [Python] Fix typos
+* [ARROW-7469](https://issues.apache.org/jira/browse/ARROW-7469) - [C++] Improve division related bit operations
+* [ARROW-7470](https://issues.apache.org/jira/browse/ARROW-7470) - [JS] Fix typos
+* [ARROW-7474](https://issues.apache.org/jira/browse/ARROW-7474) - [Ruby] Save CSV files faster
+* [ARROW-7475](https://issues.apache.org/jira/browse/ARROW-7475) - [Rust] Create Arrow Stream writer
+* [ARROW-7477](https://issues.apache.org/jira/browse/ARROW-7477) - [FlightRPC][Java] Flight gRPC service is missing reflection info
+* [ARROW-7479](https://issues.apache.org/jira/browse/ARROW-7479) - [Rust][Ruby][R] Fix typos
+* [ARROW-7481](https://issues.apache.org/jira/browse/ARROW-7481) - [C\#] Fix typos
+* [ARROW-7482](https://issues.apache.org/jira/browse/ARROW-7482) - [C++] Fix typos
+* [ARROW-7484](https://issues.apache.org/jira/browse/ARROW-7484) - [C++][Gandiva] Fix typos
+* [ARROW-7485](https://issues.apache.org/jira/browse/ARROW-7485) - [C++][Plasma] Fix typos
+* [ARROW-7487](https://issues.apache.org/jira/browse/ARROW-7487) - [Developer] Fix typos
+* [ARROW-7488](https://issues.apache.org/jira/browse/ARROW-7488) - [GLib] Fix typos and broken links
+* [ARROW-7489](https://issues.apache.org/jira/browse/ARROW-7489) - [CI] Fix typos
+* [ARROW-7490](https://issues.apache.org/jira/browse/ARROW-7490) - [Java] Avro converter should convert attributes and props to FieldType metadata
+* [ARROW-7493](https://issues.apache.org/jira/browse/ARROW-7493) - [Python] Expose sum kernel in pyarrow.compute and support ChunkedArray inputs (see the sum-kernel sketch after this list)
+* [ARROW-7498](https://issues.apache.org/jira/browse/ARROW-7498) - [C++][Dataset] Rename DataFragment/DataSource/PartitionScheme
+* [ARROW-7502](https://issues.apache.org/jira/browse/ARROW-7502) - [Integration] Remove Spark Integration patch that is no longer needed
+* [ARROW-7513](https://issues.apache.org/jira/browse/ARROW-7513) - [JS] Arrow Tutorial: Common data types
+* [ARROW-7514](https://issues.apache.org/jira/browse/ARROW-7514) - [C\#] Make GetValueOffset Obsolete
+* [ARROW-7519](https://issues.apache.org/jira/browse/ARROW-7519) - [Python] Build wheels, conda packages with dataset support
+* [ARROW-7521](https://issues.apache.org/jira/browse/ARROW-7521) - [Rust] Remove tuple on FixedSizeList datatype
+* [ARROW-7523](https://issues.apache.org/jira/browse/ARROW-7523) - [Developer] Relax clang-tidy check
+* [ARROW-7526](https://issues.apache.org/jira/browse/ARROW-7526) - [C++][Compute] Optimize small integer sorting
+* [ARROW-7532](https://issues.apache.org/jira/browse/ARROW-7532) - [CI] Unskip brew test after Homebrew fixes it upstream
+* [ARROW-7537](https://issues.apache.org/jira/browse/ARROW-7537) - [CI][R] Nightly macOS autobrew job should be more verbose if it fails
+* [ARROW-7538](https://issues.apache.org/jira/browse/ARROW-7538) - Clarify actual and desired size in AllocationManager
+* [ARROW-7540](https://issues.apache.org/jira/browse/ARROW-7540) - [C++] License files aren't installed
+* [ARROW-7541](https://issues.apache.org/jira/browse/ARROW-7541) - [GLib] Install license files
+* [ARROW-7542](https://issues.apache.org/jira/browse/ARROW-7542) - [CI][C++] nproc isn't available on macOS
+* [ARROW-7549](https://issues.apache.org/jira/browse/ARROW-7549) - [Java] Reorganize Flight modules to keep top level clean/organized
+* [ARROW-7550](https://issues.apache.org/jira/browse/ARROW-7550) - [R][CI] Run donttest examples in CI
+* [ARROW-7557](https://issues.apache.org/jira/browse/ARROW-7557) - [C++][Compute] Validate sorting stability in random test
+* [ARROW-7558](https://issues.apache.org/jira/browse/ARROW-7558) - [Packaging][deb][RPM] Use the host owner and group for artifacts
+* [ARROW-7560](https://issues.apache.org/jira/browse/ARROW-7560) - [Rust] Reduce Rc/Refcell usage
+* [ARROW-7565](https://issues.apache.org/jira/browse/ARROW-7565) - [Website] Add support for download URL redirect
+* [ARROW-7566](https://issues.apache.org/jira/browse/ARROW-7566) - [CI] Use more recent Miniconda on AppVeyor
+* [ARROW-7567](https://issues.apache.org/jira/browse/ARROW-7567) - [Java] Bump Checkstyle from 6.19 to 8.18
+* [ARROW-7568](https://issues.apache.org/jira/browse/ARROW-7568) - [Java] Bump Apache Avro from 1.9.0 to 1.9.1
+* [ARROW-7569](https://issues.apache.org/jira/browse/ARROW-7569) - [Python] Add API to map Arrow types to pandas ExtensionDtypes for to\_pandas conversions
+* [ARROW-7570](https://issues.apache.org/jira/browse/ARROW-7570) - [Java] Fix high severity issues reported by LGTM
+* [ARROW-7571](https://issues.apache.org/jira/browse/ARROW-7571) - [Java] Correct minimal java version on README
+* [ARROW-7572](https://issues.apache.org/jira/browse/ARROW-7572) - [Java] Enforce Maven 3.3+ as mentioned in README
+* [ARROW-7573](https://issues.apache.org/jira/browse/ARROW-7573) - [Rust] Reduce boxing and cleanup
+* [ARROW-7575](https://issues.apache.org/jira/browse/ARROW-7575) - [R] Linux binary packaging followup
+* [ARROW-7576](https://issues.apache.org/jira/browse/ARROW-7576) - [C++][Dev] Improve fuzzing setup
+* [ARROW-7577](https://issues.apache.org/jira/browse/ARROW-7577) - [C++][CI] Check fuzzer setup in CI
+* [ARROW-7578](https://issues.apache.org/jira/browse/ARROW-7578) - [R] Add support for datasets with IPC files and with multiple sources
+* [ARROW-7580](https://issues.apache.org/jira/browse/ARROW-7580) - [Website] 0.16 release post
+* [ARROW-7581](https://issues.apache.org/jira/browse/ARROW-7581) - [R] Documentation/polishing for 0.16 release
+* [ARROW-7590](https://issues.apache.org/jira/browse/ARROW-7590) - [C++] Managed files in thirdparty/ are ignored
+* [ARROW-7597](https://issues.apache.org/jira/browse/ARROW-7597) - [C++] Improvements to CMake configuration console summary
+* [ARROW-7600](https://issues.apache.org/jira/browse/ARROW-7600) - [C++][Parquet] Add a basic disabled unit test to exercise nesting functionality
+* [ARROW-7601](https://issues.apache.org/jira/browse/ARROW-7601) - [Doc] [C++] Update fuzzing documentation
+* [ARROW-7602](https://issues.apache.org/jira/browse/ARROW-7602) - [Archery] Add more build options
+* [ARROW-7613](https://issues.apache.org/jira/browse/ARROW-7613) - [Rust] Remove redundant \`::\` prefixes
+* [ARROW-7622](https://issues.apache.org/jira/browse/ARROW-7622) - [Format] Mark Tensor and SparseTensor fields required
+* [ARROW-7623](https://issues.apache.org/jira/browse/ARROW-7623) - [C++] Update generated flatbuffers files
+* [ARROW-7626](https://issues.apache.org/jira/browse/ARROW-7626) - [Parquet][GLib] Add support for version macros
+* [ARROW-7627](https://issues.apache.org/jira/browse/ARROW-7627) - [C++][Gandiva] Optimize string truncate function
+* [ARROW-7629](https://issues.apache.org/jira/browse/ARROW-7629) - [C++][CI] Add fuzz regression files to arrow-testing
+* [ARROW-7630](https://issues.apache.org/jira/browse/ARROW-7630) - [C++][CI] Check fuzz crash regressions in CI
+* [ARROW-7632](https://issues.apache.org/jira/browse/ARROW-7632) - [C++] [CI] Improve fuzzing seed corpus
+* [ARROW-7635](https://issues.apache.org/jira/browse/ARROW-7635) - [C++] Add pkg-config support for each component
+* [ARROW-7636](https://issues.apache.org/jira/browse/ARROW-7636) - [Python] Clean-up the pyarrow.dataset.partitioning() API
+* [ARROW-7644](https://issues.apache.org/jira/browse/ARROW-7644) - Add vcpkg installation instructions
+* [ARROW-7645](https://issues.apache.org/jira/browse/ARROW-7645) - [Packaging][deb][RPM] arm64 build by crossbow is broken
+* [ARROW-7648](https://issues.apache.org/jira/browse/ARROW-7648) - [C++] Sanitize local paths on Windows
+* [ARROW-7658](https://issues.apache.org/jira/browse/ARROW-7658) - [R] Support dplyr filtering on date/time
+* [ARROW-7659](https://issues.apache.org/jira/browse/ARROW-7659) - [Rust] Reduce Rc usage
+* [ARROW-7660](https://issues.apache.org/jira/browse/ARROW-7660) - [C++][Gandiva] Optimise castVarchar(string, int) function for single byte characters
+* [ARROW-7665](https://issues.apache.org/jira/browse/ARROW-7665) - [R] linuxLibs.R should build in parallel
+* [ARROW-7666](https://issues.apache.org/jira/browse/ARROW-7666) - [Packaging][deb] Always use Ninja to reduce build time
+* [ARROW-7667](https://issues.apache.org/jira/browse/ARROW-7667) - [Packaging][deb] ubuntu-eoan is missing in nightly jobs
+* [ARROW-7668](https://issues.apache.org/jira/browse/ARROW-7668) - [Packaging][RPM] Use Ninja if possible to reduce build time
+* [ARROW-7670](https://issues.apache.org/jira/browse/ARROW-7670) - [Python][Dataset] Better ergonomics for the filter expressions
+* [ARROW-7671](https://issues.apache.org/jira/browse/ARROW-7671) - [Python][Dataset] Add bindings for the DatasetFactory
+* [ARROW-7674](https://issues.apache.org/jira/browse/ARROW-7674) - Add helpful message for captcha challenge in merge\_arrow\_pr.py
+* [ARROW-7682](https://issues.apache.org/jira/browse/ARROW-7682) - [Packaging][APT][Yum] Add support for arm64 APT/Yum repositories
+* [ARROW-7683](https://issues.apache.org/jira/browse/ARROW-7683) - [Packaging] Set 0.16.0 as the next version
+* [ARROW-7686](https://issues.apache.org/jira/browse/ARROW-7686) - [Packaging][deb][RPM] Include more arrow-\*.pc
+* [ARROW-7687](https://issues.apache.org/jira/browse/ARROW-7687) - [C++] C++ developer document links in README are broken
+* [ARROW-7692](https://issues.apache.org/jira/browse/ARROW-7692) - [Rust] Several pattern matches are hard to read
+* [ARROW-7694](https://issues.apache.org/jira/browse/ARROW-7694) - [Packaging][deb][RPM] Can't build repository packages for RC
+* [ARROW-7695](https://issues.apache.org/jira/browse/ARROW-7695) - [Release] Update java versions to 0.16-SNAPSHOT
+* [ARROW-7696](https://issues.apache.org/jira/browse/ARROW-7696) - [Release] Unit test on release branch fails
+* [ARROW-7697](https://issues.apache.org/jira/browse/ARROW-7697) - [Release] Add a test for updating Linux packages by 00-prepare.sh
+* [ARROW-7710](https://issues.apache.org/jira/browse/ARROW-7710) - [Release][C\#] .NET download URL is redirected
+* [ARROW-7711](https://issues.apache.org/jira/browse/ARROW-7711) - [C\#] Date32 test depends on system timezone
+* [ARROW-7715](https://issues.apache.org/jira/browse/ARROW-7715) - [Release][APT] Ignore some arm64 verifications
+* [ARROW-7716](https://issues.apache.org/jira/browse/ARROW-7716) - [Packaging][APT] Use the "main" component for Ubuntu 19.10
+* [ARROW-7719](https://issues.apache.org/jira/browse/ARROW-7719) - [Python][Dataset] Table equality check occasionally fails
+* [ARROW-7724](https://issues.apache.org/jira/browse/ARROW-7724) - [Release][Yum] Ignore some arm64 verifications
+* [ARROW-7743](https://issues.apache.org/jira/browse/ARROW-7743) - [Rust] [Parquet] Support reading timestamp micros
+* [ARROW-7768](https://issues.apache.org/jira/browse/ARROW-7768) - [Rust] Implement Length and TryClone traits for Cursor<Vec<u8\>\> in reader.rs
+* [ARROW-8015](https://issues.apache.org/jira/browse/ARROW-8015) - [Python] Build 0.16.0 wheel install for Windows + Python 3.5 and publish to PyPI
+* [PARQUET-517](https://issues.apache.org/jira/browse/PARQUET-517) - [C++] Use arrow::MemoryPool for all heap allocations
+* [PARQUET-1300](https://issues.apache.org/jira/browse/PARQUET-1300) - [C++] Parquet modular encryption
+* [PARQUET-1664](https://issues.apache.org/jira/browse/PARQUET-1664) - [C++] Provide API to return metadata string from FileMetadata.
+* [PARQUET-1678](https://issues.apache.org/jira/browse/PARQUET-1678) - [C++] Provide classes for reading/writing using input/output operators
+* [PARQUET-1688](https://issues.apache.org/jira/browse/PARQUET-1688) - [C++] StreamWriter/StreamReader can't be built with g++ 4.8.5 on CentOS 7
+* [PARQUET-1689](https://issues.apache.org/jira/browse/PARQUET-1689) - [C++] Stream API: Allow for columns/rows to be skipped when reading
+* [PARQUET-1701](https://issues.apache.org/jira/browse/PARQUET-1701) - [C++] Stream API: Add support for optional fields
+* [PARQUET-1704](https://issues.apache.org/jira/browse/PARQUET-1704) - [C++] Add re-usable encryption buffer to SerializedPageWriter
+* [PARQUET-1705](https://issues.apache.org/jira/browse/PARQUET-1705) - [C++] Disable shrink-to-fit on the re-usable decryption buffer
+* [PARQUET-1712](https://issues.apache.org/jira/browse/PARQUET-1712) - [C++] Stop using deprecated APIs in examples
+* [PARQUET-1721](https://issues.apache.org/jira/browse/PARQUET-1721) - [C++] Arrow dependency is missing in parquet.pc
+* [PARQUET-1734](https://issues.apache.org/jira/browse/PARQUET-1734) - [C++] Fix typos
+* [PARQUET-1769](https://issues.apache.org/jira/browse/PARQUET-1769) - [C++] Update to parquet-format 2.8.0
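+
+Several of the C++ entries above (ARROW-7306, ARROW-7317, ARROW-7319) migrate APIs from Status-plus-out-parameter signatures to Result<T\>-returning ones. A minimal sketch of the consuming side, assuming the arrow::Result API and the ARROW\_ASSIGN\_OR\_RAISE macro from arrow/result.h; ParseEvenNumber is a hypothetical helper, not an Arrow function:
+
+```cpp
+#include <arrow/result.h>
+#include <arrow/status.h>
+
+// Hypothetical helper illustrating the new style: return Result<T>
+// instead of taking a T* out-parameter and returning Status.
+arrow::Result<int> ParseEvenNumber(int x) {
+  if (x % 2 != 0) return arrow::Status::Invalid("not an even number: ", x);
+  return x;
+}
+
+arrow::Status Consume() {
+  // Unwraps the value on success, or returns the error Status to the caller.
+  ARROW_ASSIGN_OR_RAISE(int even, ParseEvenNumber(42));
+  (void)even;  // use the unwrapped value here
+  return arrow::Status::OK();
+}
+```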
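+
+ARROW-7493 above exposes the sum kernel in pyarrow.compute; the same kernel is reachable from C++ through arrow::compute. A sketch assuming the Result-returning arrow::compute::Sum and ArrayBuilder::Finish signatures from later releases (the 0.16-era C++ compute API still passed a FunctionContext\*):
+
+```cpp
+#include <arrow/api.h>
+#include <arrow/compute/api.h>
+
+arrow::Status SumDemo() {
+  arrow::Int64Builder builder;
+  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3, 4}));
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> values, builder.Finish());
+
+  // Sum returns a Datum wrapping a scalar (here an Int64Scalar holding 10).
+  ARROW_ASSIGN_OR_RAISE(arrow::Datum sum, arrow::compute::Sum(values));
+  (void)sum;
+  return arrow::Status::OK();
+}
+```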
+
+
+
+# Apache Arrow 0.15.1 (2019-11-01)
+
+## Bug Fixes
+
+* [ARROW-6464](https://issues.apache.org/jira/browse/ARROW-6464) - [Java] Refactor FixedSizeListVector\#splitAndTransfer with slice API
+* [ARROW-6728](https://issues.apache.org/jira/browse/ARROW-6728) - [C\#] Support reading and writing Date32 and Date64 arrays
+* [ARROW-6740](https://issues.apache.org/jira/browse/ARROW-6740) - [Python] Unable to delete closed MemoryMappedFile on Windows
+* [ARROW-6762](https://issues.apache.org/jira/browse/ARROW-6762) - [C++] JSON reader segfaults on newline
+* [ARROW-6795](https://issues.apache.org/jira/browse/ARROW-6795) - [C\#] Reading large Arrow files in C\# results in an exception
+* [ARROW-6806](https://issues.apache.org/jira/browse/ARROW-6806) - [C++] Segfault deserializing ListArray containing null/empty list
+* [ARROW-6809](https://issues.apache.org/jira/browse/ARROW-6809) - [Ruby] Gem does not install on macOS due to glib2 3.3.7 compilation failure
+* [ARROW-6813](https://issues.apache.org/jira/browse/ARROW-6813) - [Ruby] Arrow::Table.load with headers=true leads to exception in Arrow 0.15
+* [ARROW-6834](https://issues.apache.org/jira/browse/ARROW-6834) - [C++] Pin gtest to 1.8.1 to triage failing Appveyor / MSVC build
+* [ARROW-6844](https://issues.apache.org/jira/browse/ARROW-6844) - [C++][Parquet][Python] List<scalar type\> columns read broken with 0.15.0
+* [ARROW-6857](https://issues.apache.org/jira/browse/ARROW-6857) - [Python][C++] Segfault for dictionary\_encode on empty chunked\_array (edge case)
+* [ARROW-6860](https://issues.apache.org/jira/browse/ARROW-6860) - [Python] Only link libarrow\_flight.so to pyarrow.\_flight
+* [ARROW-6861](https://issues.apache.org/jira/browse/ARROW-6861) - [Python] arrow-0.15.0 reading arrow-0.14.1-output Parquet dictionary column: Failure reading column: IOError: Arrow error: Invalid: Resize cannot downsize
+* [ARROW-6869](https://issues.apache.org/jira/browse/ARROW-6869) - [C++] Dictionary "delta" building logic in builder\_dict.h produces invalid arrays
+* [ARROW-6873](https://issues.apache.org/jira/browse/ARROW-6873) - [Python] Stale CColumn reference breaks Cython cimport pyarrow
+* [ARROW-6874](https://issues.apache.org/jira/browse/ARROW-6874) - [Python] Memory leak in Table.to\_pandas() when converting to object dtype
+* [ARROW-6876](https://issues.apache.org/jira/browse/ARROW-6876) - [Python] Reading parquet file with many columns becomes slow for 0.15.0
+* [ARROW-6877](https://issues.apache.org/jira/browse/ARROW-6877) - [C++] Boost not found from the correct environment
+* [ARROW-6878](https://issues.apache.org/jira/browse/ARROW-6878) - [Python] pa.array() does not handle list of dicts with bytes keys correctly under python3
+* [ARROW-6882](https://issues.apache.org/jira/browse/ARROW-6882) - [Python] cannot create a chunked\_array from dictionary\_encoding result
+* [ARROW-6886](https://issues.apache.org/jira/browse/ARROW-6886) - [C++] arrow::io header nvcc compiler warnings
+* [ARROW-6898](https://issues.apache.org/jira/browse/ARROW-6898) - [Java] Fix potential memory leak in ArrowWriter and several test classes
+* [ARROW-6903](https://issues.apache.org/jira/browse/ARROW-6903) - [Python] Wheels broken after ARROW-6860 changes
+* [ARROW-6905](https://issues.apache.org/jira/browse/ARROW-6905) - [Packaging][OSX] Nightly builds on MacOS are failing because of brew compile timeouts
+* [ARROW-6910](https://issues.apache.org/jira/browse/ARROW-6910) - [Python] pyarrow.parquet.read\_table(...) takes up lots of memory which is not released until program exits
+* [ARROW-6922](https://issues.apache.org/jira/browse/ARROW-6922) - [Python] Pandas master build is failing (MultiIndex.levels change)
+* [ARROW-6937](https://issues.apache.org/jira/browse/ARROW-6937) - [Packaging][Python] Fix conda linux and OSX wheel nightly builds
+* [ARROW-6938](https://issues.apache.org/jira/browse/ARROW-6938) - [Python] Windows wheel depends on zstd.dll and libbz2.dll, which are not bundled
+* [ARROW-6962](https://issues.apache.org/jira/browse/ARROW-6962) - [C++] [CI] Stop compiling with -Weverything
+* [ARROW-6977](https://issues.apache.org/jira/browse/ARROW-6977) - [C++] Only enable jemalloc background\_thread if feature is supported
+* [ARROW-6983](https://issues.apache.org/jira/browse/ARROW-6983) - [C++] Threaded task group crashes sometimes
+* [ARROW-7422](https://issues.apache.org/jira/browse/ARROW-7422) - [Python] Improper CPU flags failing pyarrow install on ARM devices
+* [ARROW-7423](https://issues.apache.org/jira/browse/ARROW-7423) - Pyarrow ARM install fails from source with no clear error
+* [ARROW-9349](https://issues.apache.org/jira/browse/ARROW-9349) - [Python] parquet.read\_table causes crashes on Windows Server 2016 w/ Xeon Processor
+
+
+## New Features and Improvements
+
+* [ARROW-6610](https://issues.apache.org/jira/browse/ARROW-6610) - [C++] Add ARROW\_FILESYSTEM=ON/OFF CMake configuration flag
+* [ARROW-6661](https://issues.apache.org/jira/browse/ARROW-6661) - [Java] Implement APIs like slice to enhance VectorSchemaRoot
+* [ARROW-6777](https://issues.apache.org/jira/browse/ARROW-6777) - [GLib][CI] Unpin gobject-introspection gem
+* [ARROW-6852](https://issues.apache.org/jira/browse/ARROW-6852) - [C++] memory-benchmark build failed on Arm64
+* [ARROW-6927](https://issues.apache.org/jira/browse/ARROW-6927) - [C++] Add gRPC version check
+* [ARROW-6963](https://issues.apache.org/jira/browse/ARROW-6963) - [Packaging][Wheel][OSX] Use crossbow's command to deploy artifacts from travis builds
+
+
+
+# Apache Arrow 0.15.0 (2019-10-05)
+
+## New Features and Improvements
+
+* [ARROW-453](https://issues.apache.org/jira/browse/ARROW-453) - [C++] Add filesystem implementation for Amazon S3
+* [ARROW-517](https://issues.apache.org/jira/browse/ARROW-517) - [C++] Verbose Array::Equals
+* [ARROW-750](https://issues.apache.org/jira/browse/ARROW-750) - [Format] Add LargeBinary and LargeString types (see the large-type sketch below)
+* [ARROW-1324](https://issues.apache.org/jira/browse/ARROW-1324) - [C++] Support ARROW\_BOOST\_VENDORED on Windows / MSVC
+* [ARROW-1561](https://issues.apache.org/jira/browse/ARROW-1561) - [C++] Kernel implementations for "isin" (set containment)
+* [ARROW-1566](https://issues.apache.org/jira/browse/ARROW-1566) - [C++] Implement non-materializing sort kernels
+* [ARROW-1741](https://issues.apache.org/jira/browse/ARROW-1741) - [C++] Comparison function for DictionaryArray to determine if indices are "compatible"
+* [ARROW-1786](https://issues.apache.org/jira/browse/ARROW-1786) - [Format] List expected on-wire buffer layouts for each kind of Arrow physical type in specification
+* [ARROW-1789](https://issues.apache.org/jira/browse/ARROW-1789) - [Format] Consolidate specification documents and improve clarity for new implementation authors
+* [ARROW-1875](https://issues.apache.org/jira/browse/ARROW-1875) - [Java] Write 64-bit ints as strings in integration test JSON files
+* [ARROW-2006](https://issues.apache.org/jira/browse/ARROW-2006) - [C++] Add option to trim excess padding when writing IPC messages
+* [ARROW-2431](https://issues.apache.org/jira/browse/ARROW-2431) - [Rust] Schema fidelity
+* [ARROW-2769](https://issues.apache.org/jira/browse/ARROW-2769) - [C++][Python] Deprecate and rename add\_metadata methods
+* [ARROW-2931](https://issues.apache.org/jira/browse/ARROW-2931) - [Crossbow] Windows builds are attempting to run linux and osx packaging tasks
+* [ARROW-3032](https://issues.apache.org/jira/browse/ARROW-3032) - [Python] Clean up NumPy-related C++ headers
+* [ARROW-3204](https://issues.apache.org/jira/browse/ARROW-3204) - [R] Enable package to be made available on CRAN
+* [ARROW-3243](https://issues.apache.org/jira/browse/ARROW-3243) - [C++] Upgrade jemalloc to version 5
+* [ARROW-3246](https://issues.apache.org/jira/browse/ARROW-3246) - [Python][Parquet] direct reading/writing of pandas categoricals in parquet
+* [ARROW-3325](https://issues.apache.org/jira/browse/ARROW-3325) - [Python] Support reading Parquet binary/string columns directly as DictionaryArray
+* [ARROW-3531](https://issues.apache.org/jira/browse/ARROW-3531) - [Python] Deprecate Schema.field\_by\_name in favor of \_\_getitem\_\_
+* [ARROW-3538](https://issues.apache.org/jira/browse/ARROW-3538) - [Python] Ability to override the automated assignment of uuid for filenames when writing datasets
+* [ARROW-3579](https://issues.apache.org/jira/browse/ARROW-3579) - [Crossbow] Unintuitive error message when remote branch has not been pushed
+* [ARROW-3643](https://issues.apache.org/jira/browse/ARROW-3643) - [Rust] Optimize \`push\_slice\` of \`BufferBuilder<bool\>\`
+* [ARROW-3710](https://issues.apache.org/jira/browse/ARROW-3710) - [Crossbow][Python] Run nightly tests against pandas master
+* [ARROW-3772](https://issues.apache.org/jira/browse/ARROW-3772) - [C++] Read Parquet dictionary encoded ColumnChunks directly into an Arrow DictionaryArray
+* [ARROW-3777](https://issues.apache.org/jira/browse/ARROW-3777) - [C++] Implement a mock "high latency" filesystem
+* [ARROW-3817](https://issues.apache.org/jira/browse/ARROW-3817) - [R] $ method for RecordBatch
+* [ARROW-3829](https://issues.apache.org/jira/browse/ARROW-3829) - [Python] Support protocols to extract Arrow objects from third-party classes
+* [ARROW-3943](https://issues.apache.org/jira/browse/ARROW-3943) - [R] Write vignette for R package
+* [ARROW-4036](https://issues.apache.org/jira/browse/ARROW-4036) - [C++] Make status codes pluggable
+* [ARROW-4095](https://issues.apache.org/jira/browse/ARROW-4095) - [C++] Implement optimizations for dictionary unification where dictionaries are prefixes of the unified dictionary
+* [ARROW-4111](https://issues.apache.org/jira/browse/ARROW-4111) - [Python] Create time types from Python sequences of integers
+* [ARROW-4218](https://issues.apache.org/jira/browse/ARROW-4218) - [Rust] [Parquet] Implement ColumnReader
+* [ARROW-4220](https://issues.apache.org/jira/browse/ARROW-4220) - [Python] Add buffered input and output stream ASV benchmarks with simulated high latency IO
+* [ARROW-4365](https://issues.apache.org/jira/browse/ARROW-4365) - [Rust] [Parquet] Implement RecordReader
+* [ARROW-4398](https://issues.apache.org/jira/browse/ARROW-4398) - [Python] Add benchmarks for Arrow<\>Parquet BYTE\_ARRAY serialization (read and write)
+* [ARROW-4473](https://issues.apache.org/jira/browse/ARROW-4473) - [Website] Add instructions to do a test-deploy of Arrow website and fix bugs
+* [ARROW-4507](https://issues.apache.org/jira/browse/ARROW-4507) - [Format] Create outline and introduction for new document.
+* [ARROW-4508](https://issues.apache.org/jira/browse/ARROW-4508) - [Format] Copy content from Layout.rst to new document.
+* [ARROW-4509](https://issues.apache.org/jira/browse/ARROW-4509) - [Format] Copy content from Metadata.rst to new document.
+* [ARROW-4510](https://issues.apache.org/jira/browse/ARROW-4510) - [Format] Copy content from IPC.rst to new document.
+* [ARROW-4511](https://issues.apache.org/jira/browse/ARROW-4511) - [Format] Remove individual documents in favor of new document once all content is moved
+* [ARROW-4648](https://issues.apache.org/jira/browse/ARROW-4648) - [C++/Question] Naming/organizational inconsistencies in cpp codebase
+* [ARROW-4649](https://issues.apache.org/jira/browse/ARROW-4649) - [C++/CI/R] Add (nightly) job that builds \`brew install apache-arrow --HEAD\`
+* [ARROW-4752](https://issues.apache.org/jira/browse/ARROW-4752) - [Rust] Add explicit SIMD vectorization for the divide kernel
+* [ARROW-4810](https://issues.apache.org/jira/browse/ARROW-4810) - [Format][C++] Add "LargeList" type with 64-bit offsets
+* [ARROW-4841](https://issues.apache.org/jira/browse/ARROW-4841) - [C++] Persist CMake options in generated CMake config
+* [ARROW-4860](https://issues.apache.org/jira/browse/ARROW-4860) - [C++] Build AWS C++ SDK for Windows in conda-forge
+* [ARROW-5134](https://issues.apache.org/jira/browse/ARROW-5134) - [R][CI] Run nightly tests against multiple R versions
+* [ARROW-5211](https://issues.apache.org/jira/browse/ARROW-5211) - [Format] Missing documentation under \`Dictionary encoding\` section on MetaData page
+* [ARROW-5216](https://issues.apache.org/jira/browse/ARROW-5216) - [CI] Add Appveyor badge to README
+* [ARROW-5307](https://issues.apache.org/jira/browse/ARROW-5307) - [CI][GLib] Enable GTK-Doc
+* [ARROW-5337](https://issues.apache.org/jira/browse/ARROW-5337) - [C++] Add RecordBatch::field method, possibly deprecate "column"
+* [ARROW-5343](https://issues.apache.org/jira/browse/ARROW-5343) - [C++] Consider using Buffer for transpose maps in DictionaryType::Unify instead of std::vector
+* [ARROW-5344](https://issues.apache.org/jira/browse/ARROW-5344) - [C++] Use ArrayDataVisitor in implementation of dictionary unpacking in compute/kernels/cast.cc
+* [ARROW-5351](https://issues.apache.org/jira/browse/ARROW-5351) - [Rust] Add support for take kernel functions
+* [ARROW-5358](https://issues.apache.org/jira/browse/ARROW-5358) - [Rust] Implement equality check for ArrayData and Array
+* [ARROW-5380](https://issues.apache.org/jira/browse/ARROW-5380) - [C++] Fix and enable UBSan for unaligned accesses.
+* [ARROW-5439](https://issues.apache.org/jira/browse/ARROW-5439) - [Java] Utilize stream EOS in File format
+* [ARROW-5444](https://issues.apache.org/jira/browse/ARROW-5444) - [Release][Website] After 0.14 release, update what is an "official" release
+* [ARROW-5458](https://issues.apache.org/jira/browse/ARROW-5458) - [C++] ARMv8 parallel CRC32c computation optimization
+* [ARROW-5480](https://issues.apache.org/jira/browse/ARROW-5480) - [Python] Pandas categorical type doesn't survive a round-trip through parquet
+* [ARROW-5483](https://issues.apache.org/jira/browse/ARROW-5483) - [Java] add ValueVector constructors that take a Field object
+* [ARROW-5494](https://issues.apache.org/jira/browse/ARROW-5494) - [Python] Create FileSystem bindings
+* [ARROW-5505](https://issues.apache.org/jira/browse/ARROW-5505) - [R] Stop masking base R functions/rethink namespacing
+* [ARROW-5527](https://issues.apache.org/jira/browse/ARROW-5527) - [C++] HashTable/MemoTable should use Buffer(s)/Builder(s) for heap data
+* [ARROW-5558](https://issues.apache.org/jira/browse/ARROW-5558) - [C++] Support Array::View on arrays with non-zero offsets
+* [ARROW-5559](https://issues.apache.org/jira/browse/ARROW-5559) - [C++] Introduce IpcOptions struct object for better API-stability when adding new options
+* [ARROW-5564](https://issues.apache.org/jira/browse/ARROW-5564) - [C++] Add uriparser to conda-forge
+* [ARROW-5579](https://issues.apache.org/jira/browse/ARROW-5579) - [Java] shade flatbuffer dependency
+* [ARROW-5580](https://issues.apache.org/jira/browse/ARROW-5580) - [C++][Gandiva] Correct definitions of timestamp functions in Gandiva
+* [ARROW-5588](https://issues.apache.org/jira/browse/ARROW-5588) - [C++] Better support for building UnionArrays
+* [ARROW-5594](https://issues.apache.org/jira/browse/ARROW-5594) - [C++] add support for UnionArrays to Take and Filter
+* [ARROW-5610](https://issues.apache.org/jira/browse/ARROW-5610) - [Python] Define extension type API in Python to "receive" or "send" a foreign extension type
+* [ARROW-5646](https://issues.apache.org/jira/browse/ARROW-5646) - [Crossbow][Documentation] Move the user guide to the Sphinx documentation
+* [ARROW-5681](https://issues.apache.org/jira/browse/ARROW-5681) - [FlightRPC] Wrap gRPC exceptions/statuses
+* [ARROW-5686](https://issues.apache.org/jira/browse/ARROW-5686) - [R] Review R Windows CI build
+* [ARROW-5716](https://issues.apache.org/jira/browse/ARROW-5716) - [Developer] Improve merge PR script to acknowledge co-authors
+* [ARROW-5717](https://issues.apache.org/jira/browse/ARROW-5717) - [Python] Support dictionary unification when converting variable dictionaries to pandas
+* [ARROW-5719](https://issues.apache.org/jira/browse/ARROW-5719) - [Java] Support in-place vector sorting
+* [ARROW-5722](https://issues.apache.org/jira/browse/ARROW-5722) - [Rust] Implement std::fmt::Debug for ListArray, BinaryArray and StructArray
+* [ARROW-5734](https://issues.apache.org/jira/browse/ARROW-5734) - [Python] Dispatch to Table.from\_arrays from pyarrow.table factory function
+* [ARROW-5736](https://issues.apache.org/jira/browse/ARROW-5736) - [Format][C++] Support small bit-width indices in sparse tensor
+* [ARROW-5741](https://issues.apache.org/jira/browse/ARROW-5741) - [JS] Make numeric vector from functions consistent with TypedArray.from
+* [ARROW-5743](https://issues.apache.org/jira/browse/ARROW-5743) - [C++] Add CMake option to enable "large memory" unit tests
+* [ARROW-5746](https://issues.apache.org/jira/browse/ARROW-5746) - [Website] Move website source out of apache/arrow
+* [ARROW-5747](https://issues.apache.org/jira/browse/ARROW-5747) - [C++] Better column name and header support in CSV reader
+* [ARROW-5758](https://issues.apache.org/jira/browse/ARROW-5758) - [C++][Gandiva] Support casting decimals to varchar and vice versa
+* [ARROW-5762](https://issues.apache.org/jira/browse/ARROW-5762) - [Integration][JS] Integration Tests for Map Type
+* [ARROW-5777](https://issues.apache.org/jira/browse/ARROW-5777) - [C++] BasicDecimal128 is a small object; it doesn't always make sense to pass it by const ref
+* [ARROW-5778](https://issues.apache.org/jira/browse/ARROW-5778) - [Java] Extract the logic for vector data copying to the super classes
+* [ARROW-5784](https://issues.apache.org/jira/browse/ARROW-5784) - [Release][GLib] Replace c\_glib/ after running c\_glib/autogen.sh in dev/release/02-source.sh
+* [ARROW-5786](https://issues.apache.org/jira/browse/ARROW-5786) - [Release] Use arrow-jni profile in dev/release/01-prepare.sh
+* [ARROW-5788](https://issues.apache.org/jira/browse/ARROW-5788) - [Rust] Use { version = "...", path = "../..." } for arrow and parquet dependencies
+* [ARROW-5789](https://issues.apache.org/jira/browse/ARROW-5789) - [C++] Small Warning/Linkage cleanups
+* [ARROW-5792](https://issues.apache.org/jira/browse/ARROW-5792) - [Rust] [Parquet] A visitor trait for parquet types.
+* [ARROW-5798](https://issues.apache.org/jira/browse/ARROW-5798) - [Packaging][deb] Update doc architecture
+* [ARROW-5800](https://issues.apache.org/jira/browse/ARROW-5800) - [R] Dockerize R Travis CI tests so they can be run anywhere via docker-compose
+* [ARROW-5803](https://issues.apache.org/jira/browse/ARROW-5803) - [C++] Dockerize C++ with clang 7 Travis CI unit test logic
+* [ARROW-5812](https://issues.apache.org/jira/browse/ARROW-5812) - [Java] Refactor method name and param type in BaseIntVector
+* [ARROW-5813](https://issues.apache.org/jira/browse/ARROW-5813) - [C++] Support checking the equality of different contiguous tensors
+* [ARROW-5814](https://issues.apache.org/jira/browse/ARROW-5814) - [Java] Implement a <Object, int\> HashMap for DictionaryEncoder
+* [ARROW-5827](https://issues.apache.org/jira/browse/ARROW-5827) - [C++] Require c-ares CMake config
+* [ARROW-5828](https://issues.apache.org/jira/browse/ARROW-5828) - [C++] Add Protocol Buffers version check
+* [ARROW-5830](https://issues.apache.org/jira/browse/ARROW-5830) - [C++] Stop using memcmp in TensorEquals
+* [ARROW-5832](https://issues.apache.org/jira/browse/ARROW-5832) - [Java] Support search operations for vector data
+* [ARROW-5833](https://issues.apache.org/jira/browse/ARROW-5833) - [C++] Factor out status copying code from cast.cc
+* [ARROW-5834](https://issues.apache.org/jira/browse/ARROW-5834) - [Java] Apply new hash map in DictionaryEncoder
+* [ARROW-5835](https://issues.apache.org/jira/browse/ARROW-5835) - [Java] Support Dictionary Encoding for binary type
+* [ARROW-5841](https://issues.apache.org/jira/browse/ARROW-5841) - [Website] Add 0.14.0 release note
+* [ARROW-5842](https://issues.apache.org/jira/browse/ARROW-5842) - [Java] Revise the semantic of lastSet in ListVector
+* [ARROW-5843](https://issues.apache.org/jira/browse/ARROW-5843) - [Java] Improve the readability and performance of BitVectorHelper\#getNullCount
+* [ARROW-5844](https://issues.apache.org/jira/browse/ARROW-5844) - [Java] Support comparison & sort for more numeric types
+* [ARROW-5846](https://issues.apache.org/jira/browse/ARROW-5846) - [Java] Create Avro adapter module and add dependencies
+* [ARROW-5853](https://issues.apache.org/jira/browse/ARROW-5853) - [Python] Expose boolean filter kernel on Array
+* [ARROW-5861](https://issues.apache.org/jira/browse/ARROW-5861) - [Java] Initial implementation to convert Avro records with primitive types
+* [ARROW-5862](https://issues.apache.org/jira/browse/ARROW-5862) - [Java] Provide dictionary builder
+* [ARROW-5864](https://issues.apache.org/jira/browse/ARROW-5864) - [Python] simplify cython wrapping of Result
+* [ARROW-5865](https://issues.apache.org/jira/browse/ARROW-5865) - [Release] Helper script for rebasing open pull requests on master
+* [ARROW-5866](https://issues.apache.org/jira/browse/ARROW-5866) - [C++] Remove duplicate library in cpp/Brewfile
+* [ARROW-5867](https://issues.apache.org/jira/browse/ARROW-5867) - [C++][Gandiva] Add support for cast int to decimal
+* [ARROW-5872](https://issues.apache.org/jira/browse/ARROW-5872) - Support mod(double, double) method in Gandiva
+* [ARROW-5876](https://issues.apache.org/jira/browse/ARROW-5876) - [FlightRPC] Implement basic auth across all languages
+* [ARROW-5877](https://issues.apache.org/jira/browse/ARROW-5877) - [FlightRPC] Fix auth incompatibilities between Python/Java
+* [ARROW-5880](https://issues.apache.org/jira/browse/ARROW-5880) - [C++] Update arrow parquet writer to use TypedBufferBuilder
+* [ARROW-5881](https://issues.apache.org/jira/browse/ARROW-5881) - [Java] Provide functionalities to efficiently determine if a validity buffer has completely 1 bits/0 bits
+* [ARROW-5883](https://issues.apache.org/jira/browse/ARROW-5883) - [Java] Support dictionary encoding for List and Struct type
+* [ARROW-5888](https://issues.apache.org/jira/browse/ARROW-5888) - [Python][C++] Add metadata to store Arrow time zones in Parquet file metadata
+* [ARROW-5891](https://issues.apache.org/jira/browse/ARROW-5891) - [C++][Gandiva] Remove duplicates in function registries
+* [ARROW-5892](https://issues.apache.org/jira/browse/ARROW-5892) - [C++][Gandiva] Support function aliases
+* [ARROW-5893](https://issues.apache.org/jira/browse/ARROW-5893) - [C++] Remove arrow::Column class from C++ library
+* [ARROW-5897](https://issues.apache.org/jira/browse/ARROW-5897) - [Java] Remove duplicated logic in MapVector
+* [ARROW-5898](https://issues.apache.org/jira/browse/ARROW-5898) - [Java] Provide functionality to efficiently compute hash code for arbitrary memory segment
+* [ARROW-5900](https://issues.apache.org/jira/browse/ARROW-5900) - [Gandiva] [Java] Decimal precision, scale bounds check
+* [ARROW-5901](https://issues.apache.org/jira/browse/ARROW-5901) - [Rust] Implement PartialEq to compare array and json values
+* [ARROW-5902](https://issues.apache.org/jira/browse/ARROW-5902) - [Java] Implement hash table and equals & hashCode API for dictionary encoding
+* [ARROW-5903](https://issues.apache.org/jira/browse/ARROW-5903) - [Java] Set methods in DecimalVector are slow
+* [ARROW-5904](https://issues.apache.org/jira/browse/ARROW-5904) - [Java] [Plasma] Fix compilation of Plasma Java client
+* [ARROW-5906](https://issues.apache.org/jira/browse/ARROW-5906) - [CI] Set -DARROW\_VERBOSE\_THIRDPARTY\_BUILD=OFF in builds running in Travis CI, maybe all docker-compose builds by default
+* [ARROW-5908](https://issues.apache.org/jira/browse/ARROW-5908) - [C\#] ArrowStreamWriter doesn't align buffers to 8 bytes
+* [ARROW-5909](https://issues.apache.org/jira/browse/ARROW-5909) - [Java] Optimize ByteFunctionHelpers equals & compare logic
+* [ARROW-5911](https://issues.apache.org/jira/browse/ARROW-5911) - [Java] Make ListVector and MapVector create reader lazily
+* [ARROW-5917](https://issues.apache.org/jira/browse/ARROW-5917) - [Java] Redesign the dictionary encoder
+* [ARROW-5918](https://issues.apache.org/jira/browse/ARROW-5918) - [Java] Add get to BaseIntVector interface
+* [ARROW-5919](https://issues.apache.org/jira/browse/ARROW-5919) - [R] Add nightly tests for building r-arrow with dependencies from conda-forge
+* [ARROW-5920](https://issues.apache.org/jira/browse/ARROW-5920) - [Java] Support sort & compare for all variable width vectors
+* [ARROW-5924](https://issues.apache.org/jira/browse/ARROW-5924) - [C++][Plasma] It is not convenient to release a GPU object
+* [ARROW-5934](https://issues.apache.org/jira/browse/ARROW-5934) - [Python] Bundle arrow's LICENSE with the wheels
+* [ARROW-5937](https://issues.apache.org/jira/browse/ARROW-5937) - [Release] Stop parallel binary upload
+* [ARROW-5938](https://issues.apache.org/jira/browse/ARROW-5938) - [Release] Create branch for adding release note automatically
+* [ARROW-5939](https://issues.apache.org/jira/browse/ARROW-5939) - [Release] Add support for generating vote email template separately
+* [ARROW-5940](https://issues.apache.org/jira/browse/ARROW-5940) - [Release] Add support for re-uploading sign/checksum for binary artifacts
+* [ARROW-5941](https://issues.apache.org/jira/browse/ARROW-5941) - [Release] Avoid re-uploading already uploaded binary artifacts
+* [ARROW-5943](https://issues.apache.org/jira/browse/ARROW-5943) - [GLib][Gandiva] Add support for function aliases
+* [ARROW-5944](https://issues.apache.org/jira/browse/ARROW-5944) - [C++][Gandiva] Remove 'div' alias for 'divide'
+* [ARROW-5945](https://issues.apache.org/jira/browse/ARROW-5945) - [Rust] [DataFusion] Table trait should support building complete queries
+* [ARROW-5947](https://issues.apache.org/jira/browse/ARROW-5947) - [Rust] [DataFusion] Remove serde\_json dependency
+* [ARROW-5948](https://issues.apache.org/jira/browse/ARROW-5948) - [Rust] [DataFusion] create\_logical\_plan should not call optimizer
+* [ARROW-5955](https://issues.apache.org/jira/browse/ARROW-5955) - [Plasma] Support setting memory quotas per plasma client for better isolation
+* [ARROW-5957](https://issues.apache.org/jira/browse/ARROW-5957) - [C++][Gandiva] Implement div function in Gandiva
+* [ARROW-5958](https://issues.apache.org/jira/browse/ARROW-5958) - [Python] Link zlib statically in the wheels
+* [ARROW-5961](https://issues.apache.org/jira/browse/ARROW-5961) - [R] Be able to run R-only tests even without C++ library
+* [ARROW-5962](https://issues.apache.org/jira/browse/ARROW-5962) - [CI][Python] Do not test manylinux1 wheels in Travis CI
+* [ARROW-5967](https://issues.apache.org/jira/browse/ARROW-5967) - [Java] DateUtility\#timeZoneList is not correct
+* [ARROW-5970](https://issues.apache.org/jira/browse/ARROW-5970) - [Java] Provide pointer to Arrow buffer
+* [ARROW-5974](https://issues.apache.org/jira/browse/ARROW-5974) - [Python][C++] Enable CSV reader to read from concatenated gzip stream
+* [ARROW-5975](https://issues.apache.org/jira/browse/ARROW-5975) - [C++][Gandiva] Add method to cast Date(in Milliseconds) to timestamp
+* [ARROW-5976](https://issues.apache.org/jira/browse/ARROW-5976) - [C++] RETURN\_IF\_ERROR(ctx) should be namespaced
+* [ARROW-5977](https://issues.apache.org/jira/browse/ARROW-5977) - [C++] [Python] Method for read\_csv to limit which columns are read?
+* [ARROW-5979](https://issues.apache.org/jira/browse/ARROW-5979) - [FlightRPC] Expose (de)serialization of protocol types
+* [ARROW-5985](https://issues.apache.org/jira/browse/ARROW-5985) - [Developer] Do not suggest setting Fix Version for point releases in dev/merge\_arrow\_pr.py
+* [ARROW-5986](https://issues.apache.org/jira/browse/ARROW-5986) - [Java] Code cleanup for dictionary encoding
+* [ARROW-5988](https://issues.apache.org/jira/browse/ARROW-5988) - [Java] Avro adapter: implement simple Record type
+* [ARROW-5997](https://issues.apache.org/jira/browse/ARROW-5997) - [Java] Support dictionary encoding for Union type
+* [ARROW-5998](https://issues.apache.org/jira/browse/ARROW-5998) - [Java] Open a document to track the API changes
+* [ARROW-6000](https://issues.apache.org/jira/browse/ARROW-6000) - [Python] Expose LargeBinaryType and LargeStringType
+* [ARROW-6008](https://issues.apache.org/jira/browse/ARROW-6008) - [Release] Don't parallelize the bintray upload script
+* [ARROW-6009](https://issues.apache.org/jira/browse/ARROW-6009) - [Release][JS] Ignore NPM errors in the javascript release script
+* [ARROW-6013](https://issues.apache.org/jira/browse/ARROW-6013) - [Java] Support range searcher
+* [ARROW-6017](https://issues.apache.org/jira/browse/ARROW-6017) - [FlightRPC] Allow creating Locations with unknown schemes
+* [ARROW-6020](https://issues.apache.org/jira/browse/ARROW-6020) - [Java] Refactor ByteFunctionHelper\#hash with newly added ArrowBufHasher
+* [ARROW-6021](https://issues.apache.org/jira/browse/ARROW-6021) - [Java] Extract copyFrom and copyFromSafe methods to ValueVector interface
+* [ARROW-6022](https://issues.apache.org/jira/browse/ARROW-6022) - [Java] Support equals API in ValueVector to compare two vectors for equality
+* [ARROW-6023](https://issues.apache.org/jira/browse/ARROW-6023) - [C++][Gandiva] Add functions in Gandiva
+* [ARROW-6024](https://issues.apache.org/jira/browse/ARROW-6024) - [Java] Provide more hash algorithms
+* [ARROW-6026](https://issues.apache.org/jira/browse/ARROW-6026) - [Doc] Add CONTRIBUTING.md
+* [ARROW-6030](https://issues.apache.org/jira/browse/ARROW-6030) - [Java] Efficiently compute hash code for ArrowBufPointer
+* [ARROW-6031](https://issues.apache.org/jira/browse/ARROW-6031) - [Java] Support iterating a vector by ArrowBufPointer
+* [ARROW-6034](https://issues.apache.org/jira/browse/ARROW-6034) - [C++][Gandiva] Add string functions in Gandiva
+* [ARROW-6035](https://issues.apache.org/jira/browse/ARROW-6035) - [Java] Avro adapter: support converting nullable values
+* [ARROW-6036](https://issues.apache.org/jira/browse/ARROW-6036) - [GLib] Add support for skip rows and column\_names CSV read option
+* [ARROW-6037](https://issues.apache.org/jira/browse/ARROW-6037) - [GLib] Add a missing version macro
+* [ARROW-6039](https://issues.apache.org/jira/browse/ARROW-6039) - [GLib] Add garrow\_array\_filter()
+* [ARROW-6041](https://issues.apache.org/jira/browse/ARROW-6041) - [Website] Blog post announcing R package release
+* [ARROW-6042](https://issues.apache.org/jira/browse/ARROW-6042) - [C++] Implement alternative DictionaryBuilder that always yields int32 indices (see the dictionary-builder sketch below)
+* [ARROW-6045](https://issues.apache.org/jira/browse/ARROW-6045) - [C++] Benchmark for Parquet float and NaN encoding/decoding
+* [ARROW-6048](https://issues.apache.org/jira/browse/ARROW-6048) - [C++] Add ChunkedArray::View which calls to Array::View
+* [ARROW-6049](https://issues.apache.org/jira/browse/ARROW-6049) - [C++] Support using Array::View from compatible dictionary type to another
+* [ARROW-6053](https://issues.apache.org/jira/browse/ARROW-6053) - [Python] RecordBatchStreamReader::Open2 cdef type signature doesn't match C++
+* [ARROW-6063](https://issues.apache.org/jira/browse/ARROW-6063) - [FlightRPC] Implement "half-closed" semantics for DoPut
+* [ARROW-6065](https://issues.apache.org/jira/browse/ARROW-6065) - [C++] Reorganize parquet/arrow/reader.cc, remove code duplication, improve readability
+* [ARROW-6069](https://issues.apache.org/jira/browse/ARROW-6069) - [Rust] [Parquet] Implement Converter to convert record reader to arrow primitive array.
+* [ARROW-6070](https://issues.apache.org/jira/browse/ARROW-6070) - [Java] Avoid creating new schema before IPC sending
+* [ARROW-6077](https://issues.apache.org/jira/browse/ARROW-6077) - [C++][Parquet] Build logical schema tree mapping Arrow fields to Parquet schema levels
+* [ARROW-6078](https://issues.apache.org/jira/browse/ARROW-6078) - [Java] Implement dictionary-encoded subfields for List type
+* [ARROW-6079](https://issues.apache.org/jira/browse/ARROW-6079) - [Java] Implement/test UnionFixedSizeListWriter for FixedSizeListVector
+* [ARROW-6080](https://issues.apache.org/jira/browse/ARROW-6080) - [Java] Support compare and search operation for BaseRepeatedValueVector
+* [ARROW-6083](https://issues.apache.org/jira/browse/ARROW-6083) - [Java] Refactor Jdbc adapter consume logic
+* [ARROW-6084](https://issues.apache.org/jira/browse/ARROW-6084) - [Python] Support LargeList
+* [ARROW-6085](https://issues.apache.org/jira/browse/ARROW-6085) - [Rust] [DataFusion] Create traits for physical query plan
+* [ARROW-6086](https://issues.apache.org/jira/browse/ARROW-6086) - [Rust] [DataFusion] Implement parallel execution for parquet scan
+* [ARROW-6087](https://issues.apache.org/jira/browse/ARROW-6087) - [Rust] [DataFusion] Implement parallel execution for CSV scan
+* [ARROW-6088](https://issues.apache.org/jira/browse/ARROW-6088) - [Rust] [DataFusion] Implement parallel execution for projection
+* [ARROW-6089](https://issues.apache.org/jira/browse/ARROW-6089) - [Rust] [DataFusion] Implement parallel execution for selection
+* [ARROW-6090](https://issues.apache.org/jira/browse/ARROW-6090) - [Rust] [DataFusion] Implement parallel execution for hash aggregate
+* [ARROW-6093](https://issues.apache.org/jira/browse/ARROW-6093) - [Java] Reduce branches in algorithm for first match in VectorRangeSearcher
+* [ARROW-6094](https://issues.apache.org/jira/browse/ARROW-6094) - [Format][Flight] Add GetFlightSchema to Flight RPC
+* [ARROW-6096](https://issues.apache.org/jira/browse/ARROW-6096) - [C++] Conditionally depend on boost regex library
+* [ARROW-6097](https://issues.apache.org/jira/browse/ARROW-6097) - [Java] Avro adapter: implement union types
+* [ARROW-6100](https://issues.apache.org/jira/browse/ARROW-6100) - [Rust] Pin to specific Rust nightly release
+* [ARROW-6101](https://issues.apache.org/jira/browse/ARROW-6101) - [Rust] [DataFusion] Create physical plan from logical plan
+* [ARROW-6102](https://issues.apache.org/jira/browse/ARROW-6102) - [Testing] Add partitioned CSV file to arrow-testing repo
+* [ARROW-6104](https://issues.apache.org/jira/browse/ARROW-6104) - [Rust] [DataFusion] Don't allow bare\_trait\_objects
+* [ARROW-6105](https://issues.apache.org/jira/browse/ARROW-6105) - [C++][Parquet][Python] Add test case showing dictionary-encoded subfields in nested type
+* [ARROW-6113](https://issues.apache.org/jira/browse/ARROW-6113) - [Java] Support vector deduplicate function
+* [ARROW-6115](https://issues.apache.org/jira/browse/ARROW-6115) - [Python] support LargeList, LargeString, LargeBinary in conversion to pandas
+* [ARROW-6118](https://issues.apache.org/jira/browse/ARROW-6118) - [Java] Replace google Preconditions with Arrow Preconditions
+* [ARROW-6121](https://issues.apache.org/jira/browse/ARROW-6121) - [Tools] Improve merge tool CLI ergonomics
+* [ARROW-6125](https://issues.apache.org/jira/browse/ARROW-6125) - [Python] Remove any APIs deprecated prior to 0.14.x
+* [ARROW-6127](https://issues.apache.org/jira/browse/ARROW-6127) - [Website] Add favicons and meta tags
+* [ARROW-6128](https://issues.apache.org/jira/browse/ARROW-6128) - [C++] Can't build with g++ 8.3.0 due to class-memaccess warning
+* [ARROW-6130](https://issues.apache.org/jira/browse/ARROW-6130) - [Release] Use 0.15.0 as the next release
+* [ARROW-6134](https://issues.apache.org/jira/browse/ARROW-6134) - [C++][Gandiva] Add concat function in Gandiva
+* [ARROW-6137](https://issues.apache.org/jira/browse/ARROW-6137) - [C++][Gandiva] Change output format of castVARCHAR(timestamp) in Gandiva
+* [ARROW-6138](https://issues.apache.org/jira/browse/ARROW-6138) - [C++] Add a basic (single RecordBatch) implementation of Dataset
+* [ARROW-6139](https://issues.apache.org/jira/browse/ARROW-6139) - [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
+* [ARROW-6141](https://issues.apache.org/jira/browse/ARROW-6141) - [C++] Enable memory-mapping a file region that is offset from the beginning of the file
+* [ARROW-6142](https://issues.apache.org/jira/browse/ARROW-6142) - [R] Install instructions on linux could be clearer
+* [ARROW-6143](https://issues.apache.org/jira/browse/ARROW-6143) - [Java] Unify the copyFrom and copyFromSafe methods for all vectors
+* [ARROW-6144](https://issues.apache.org/jira/browse/ARROW-6144) - [C++][Gandiva] Implement random function in Gandiva
+* [ARROW-6155](https://issues.apache.org/jira/browse/ARROW-6155) - [Java] Extract a super interface for vectors whose elements reside in contiguous memory segments
+* [ARROW-6156](https://issues.apache.org/jira/browse/ARROW-6156) - [Java] Support compare semantics for ArrowBufPointer
+* [ARROW-6161](https://issues.apache.org/jira/browse/ARROW-6161) - [C++] Implements dataset::ParquetFile and associated Scan structures
+* [ARROW-6162](https://issues.apache.org/jira/browse/ARROW-6162) - [C++][Gandiva] Do not truncate string in castVARCHAR\_varchar when out\_len parameter is zero
+* [ARROW-6164](https://issues.apache.org/jira/browse/ARROW-6164) - [Docs][Format] Document project versioning schema and forward/backward compatibility policies
+* [ARROW-6172](https://issues.apache.org/jira/browse/ARROW-6172) - [Java] Provide benchmarks to set IntVector with different methods
+* [ARROW-6177](https://issues.apache.org/jira/browse/ARROW-6177) - [C++] Add Array::Validate()
+* [ARROW-6180](https://issues.apache.org/jira/browse/ARROW-6180) - [C++] Create InputStream that is an isolated reader of a segment of a RandomAccessFile
+* [ARROW-6181](https://issues.apache.org/jira/browse/ARROW-6181) - [R] Only allow R package to install without libarrow on linux
+* [ARROW-6183](https://issues.apache.org/jira/browse/ARROW-6183) - [R] Document that you don't have to use tidyselect if you don't want to
+* [ARROW-6185](https://issues.apache.org/jira/browse/ARROW-6185) - [Java] Provide hash table based dictionary builder
+* [ARROW-6187](https://issues.apache.org/jira/browse/ARROW-6187) - [C++] fallback to storage type when writing ExtensionType to Parquet
+* [ARROW-6188](https://issues.apache.org/jira/browse/ARROW-6188) - [GLib] Add garrow\_array\_is\_in()
+* [ARROW-6192](https://issues.apache.org/jira/browse/ARROW-6192) - [GLib] Use the same SO version as C++
+* [ARROW-6194](https://issues.apache.org/jira/browse/ARROW-6194) - [Java] Add non-static approach in DictionaryEncoder making it easy to extend and reuse
+* [ARROW-6196](https://issues.apache.org/jira/browse/ARROW-6196) - [Ruby] Add support for building Arrow::TimeNNArray by .new
+* [ARROW-6197](https://issues.apache.org/jira/browse/ARROW-6197) - [GLib] Add garrow\_decimal128\_rescale()
+* [ARROW-6199](https://issues.apache.org/jira/browse/ARROW-6199) - [Java] Avro adapter: avoid potential resource leak
+* [ARROW-6203](https://issues.apache.org/jira/browse/ARROW-6203) - [GLib] Add garrow\_array\_sort\_to\_indices()
+* [ARROW-6204](https://issues.apache.org/jira/browse/ARROW-6204) - [GLib] Add garrow\_array\_is\_in\_chunked\_array()
+* [ARROW-6206](https://issues.apache.org/jira/browse/ARROW-6206) - [Java][Docs] Document environment variables/Java properties
+* [ARROW-6209](https://issues.apache.org/jira/browse/ARROW-6209) - [Java] Extract set null method to the base class for fixed width vectors
+* [ARROW-6212](https://issues.apache.org/jira/browse/ARROW-6212) - [Java] Support vector rank operation
+* [ARROW-6216](https://issues.apache.org/jira/browse/ARROW-6216) - [C++] Allow user to select the compression level
+* [ARROW-6217](https://issues.apache.org/jira/browse/ARROW-6217) - [Website] Remove needless \_site/ directory
+* [ARROW-6219](https://issues.apache.org/jira/browse/ARROW-6219) - [Java] Add API for JDBC adapter that can convert less than the full result set at a time
+* [ARROW-6220](https://issues.apache.org/jira/browse/ARROW-6220) - [Java] Add API to Avro adapter to limit number of rows returned at a time
+* [ARROW-6225](https://issues.apache.org/jira/browse/ARROW-6225) - [Website] Update arrow-site/README and any other places to point website contributors in right direction
+* [ARROW-6229](https://issues.apache.org/jira/browse/ARROW-6229) - [C++] Add a DataSource implementation which scans a directory
+* [ARROW-6230](https://issues.apache.org/jira/browse/ARROW-6230) - [R] Reading Parquet files is 20x slower than reading fst files in R
+* [ARROW-6231](https://issues.apache.org/jira/browse/ARROW-6231) - [C++][Python] Consider assigning default column names when reading CSV file and header\_rows=0
+* [ARROW-6232](https://issues.apache.org/jira/browse/ARROW-6232) - [C++] Rename Argsort kernel to SortToIndices
+* [ARROW-6237](https://issues.apache.org/jira/browse/ARROW-6237) - [R] Add option to set CXXFLAGS when compiling R package with $ARROW\_R\_CXXFLAGS
+* [ARROW-6238](https://issues.apache.org/jira/browse/ARROW-6238) - [C++] Implement SimpleDataSource/SimpleDataFragment
+* [ARROW-6240](https://issues.apache.org/jira/browse/ARROW-6240) - [Ruby] Arrow::Decimal128Array returns BigDecimal
+* [ARROW-6242](https://issues.apache.org/jira/browse/ARROW-6242) - [C++] Implement basic Dataset/Scanner/ScannerBuilder
+* [ARROW-6243](https://issues.apache.org/jira/browse/ARROW-6243) - [C++] Implement basic Filter expression classes
+* [ARROW-6244](https://issues.apache.org/jira/browse/ARROW-6244) - [C++] Implement Partition DataSource
+* [ARROW-6246](https://issues.apache.org/jira/browse/ARROW-6246) - [Website] Add link to R documentation site
+* [ARROW-6247](https://issues.apache.org/jira/browse/ARROW-6247) - [Java] Provide a common interface for float4 and float8 vectors
+* [ARROW-6249](https://issues.apache.org/jira/browse/ARROW-6249) - [Java] Remove useless class ByteArrayWrapper
+* [ARROW-6250](https://issues.apache.org/jira/browse/ARROW-6250) - [Java] Implement ApproxEqualsVisitor for approximate floating point comparison
+* [ARROW-6252](https://issues.apache.org/jira/browse/ARROW-6252) - [Python] Add pyarrow.Array.diff method that exposes arrow::Diff
+* [ARROW-6253](https://issues.apache.org/jira/browse/ARROW-6253) - [Python] Expose "enable\_buffered\_stream" option from parquet::ReaderProperties in pyarrow.parquet.read\_table
+* [ARROW-6258](https://issues.apache.org/jira/browse/ARROW-6258) - [R] Add macOS build scripts
+* [ARROW-6260](https://issues.apache.org/jira/browse/ARROW-6260) - [Website] Use deploy key on Travis to build and push to asf-site
+* [ARROW-6262](https://issues.apache.org/jira/browse/ARROW-6262) - [Developer] Show JIRA issue before merging
+* [ARROW-6264](https://issues.apache.org/jira/browse/ARROW-6264) - [Java] There is no need to consider byte order in ArrowBufHasher
+* [ARROW-6265](https://issues.apache.org/jira/browse/ARROW-6265) - [Java] Avro adapter implement Array/Map/Fixed type
+* [ARROW-6267](https://issues.apache.org/jira/browse/ARROW-6267) - [Ruby] Add Arrow::Time for Arrow::Time{32,64}DataType value
+* [ARROW-6271](https://issues.apache.org/jira/browse/ARROW-6271) - [Rust] [DataFusion] Add example for running SQL against Parquet
+* [ARROW-6272](https://issues.apache.org/jira/browse/ARROW-6272) - [Rust] [DataFusion] Add register\_parquet convenience method to ExecutionContext
+* [ARROW-6278](https://issues.apache.org/jira/browse/ARROW-6278) - [R] Read parquet files from raw vector
+* [ARROW-6279](https://issues.apache.org/jira/browse/ARROW-6279) - [Python] Add Table.slice method or allow slices in \_\_getitem\_\_
+* [ARROW-6284](https://issues.apache.org/jira/browse/ARROW-6284) - [C++] Allow references in std::tuple when converting tuple to arrow array
+* [ARROW-6287](https://issues.apache.org/jira/browse/ARROW-6287) - [Rust] [DataFusion] Refactor TableProvider to return thread-safe BatchIterator
+* [ARROW-6288](https://issues.apache.org/jira/browse/ARROW-6288) - [Java] Implement TypeEqualsVisitor to compare vector types, considering names and metadata
+* [ARROW-6289](https://issues.apache.org/jira/browse/ARROW-6289) - [Java] Add empty() in UnionVector to create an instance
+* [ARROW-6292](https://issues.apache.org/jira/browse/ARROW-6292) - [C++] Add an option to build with mimalloc
+* [ARROW-6294](https://issues.apache.org/jira/browse/ARROW-6294) - [C++] Use hyphen for plasma-store-server executable
+* [ARROW-6295](https://issues.apache.org/jira/browse/ARROW-6295) - [Rust][DataFusion] ExecutionError Cannot compare Float32 with Float64
+* [ARROW-6296](https://issues.apache.org/jira/browse/ARROW-6296) - [Java] Cleanup JDBC interfaces and eliminate one memcopy for binary/varchar fields
+* [ARROW-6297](https://issues.apache.org/jira/browse/ARROW-6297) - [Java] Compare ArrowBufPointers by unsigned integers
+* [ARROW-6300](https://issues.apache.org/jira/browse/ARROW-6300) - [C++] Add io::OutputStream::Abort()
+* [ARROW-6303](https://issues.apache.org/jira/browse/ARROW-6303) - [Rust] Add a feature to disable SIMD
+* [ARROW-6304](https://issues.apache.org/jira/browse/ARROW-6304) - [Java] Add description to each maven artifact
+* [ARROW-6306](https://issues.apache.org/jira/browse/ARROW-6306) - [Java] Support stable sort by stable comparators
+* [ARROW-6310](https://issues.apache.org/jira/browse/ARROW-6310) - [C++] Write 64-bit integers as strings in JSON integration test files
+* [ARROW-6311](https://issues.apache.org/jira/browse/ARROW-6311) - [Java] Make ApproxEqualsVisitor accept DiffFunction to make it more flexible
+* [ARROW-6313](https://issues.apache.org/jira/browse/ARROW-6313) - [Format] Tracking for ensuring flatbuffer serialized values are aligned in stream/files
+* [ARROW-6314](https://issues.apache.org/jira/browse/ARROW-6314) - [C++] Implement changes to ensure flatbuffer alignment
+* [ARROW-6315](https://issues.apache.org/jira/browse/ARROW-6315) - [Java] Make change to ensure flatbuffer reads are aligned
+* [ARROW-6316](https://issues.apache.org/jira/browse/ARROW-6316) - [Go] Make change to ensure flatbuffer reads are aligned
+* [ARROW-6317](https://issues.apache.org/jira/browse/ARROW-6317) - [JS] Implement changes to ensure flatbuffer alignment
+* [ARROW-6318](https://issues.apache.org/jira/browse/ARROW-6318) - [Integration] Update integration test to use generated binaries to ensure backwards compatibility
+* [ARROW-6319](https://issues.apache.org/jira/browse/ARROW-6319) - [C++] Extract the core of NumericTensor<T\>::Value as Tensor::Value<T\>
+* [ARROW-6326](https://issues.apache.org/jira/browse/ARROW-6326) - [C++] Nullable fields when converting std::tuple to Table
+* [ARROW-6328](https://issues.apache.org/jira/browse/ARROW-6328) - Click.option decorators should have help text
+* [ARROW-6329](https://issues.apache.org/jira/browse/ARROW-6329) - [Format] Add 4-byte "stream continuation" to IPC message format to align Flatbuffers
+* [ARROW-6331](https://issues.apache.org/jira/browse/ARROW-6331) - [Java] Incorporate ErrorProne into the Java build
+* [ARROW-6334](https://issues.apache.org/jira/browse/ARROW-6334) - [Java] Improve the dictionary builder API to return the position of the value in the dictionary
+* [ARROW-6335](https://issues.apache.org/jira/browse/ARROW-6335) - [Java] Improve the performance of DictionaryHashTable
+* [ARROW-6336](https://issues.apache.org/jira/browse/ARROW-6336) - [Python] Clarify pyarrow.serialize/deserialize docstrings vis-à-vis their relationship with the Arrow IPC protocol
+* [ARROW-6337](https://issues.apache.org/jira/browse/ARROW-6337) - [R] as\_tibble in R API is a misnomer
+* [ARROW-6338](https://issues.apache.org/jira/browse/ARROW-6338) - [R] Type function names don't match type names
+* [ARROW-6342](https://issues.apache.org/jira/browse/ARROW-6342) - [Python] Add pyarrow.record\_batch factory function with same basic API / semantics as pyarrow.table
+* [ARROW-6346](https://issues.apache.org/jira/browse/ARROW-6346) - [GLib] Add garrow\_array\_view()
+* [ARROW-6347](https://issues.apache.org/jira/browse/ARROW-6347) - [GLib] Add garrow\_array\_diff\_unified()
+* [ARROW-6350](https://issues.apache.org/jira/browse/ARROW-6350) - [Ruby] Remove Arrow::Struct and use Hash instead
+* [ARROW-6351](https://issues.apache.org/jira/browse/ARROW-6351) - [Ruby] Improve Arrow\#values performance
+* [ARROW-6353](https://issues.apache.org/jira/browse/ARROW-6353) - [Python] Allow user to select compression level in pyarrow.parquet.write\_table
+* [ARROW-6355](https://issues.apache.org/jira/browse/ARROW-6355) - [Java] Make range equal visitor reusable
+* [ARROW-6356](https://issues.apache.org/jira/browse/ARROW-6356) - [Java] Avro adapter implement Enum type and nested Record type
+* [ARROW-6357](https://issues.apache.org/jira/browse/ARROW-6357) - [C++] S3: allow for background writes
+* [ARROW-6358](https://issues.apache.org/jira/browse/ARROW-6358) - [C++] FileSystem::DeleteDir should make it optional to delete the directory itself
+* [ARROW-6360](https://issues.apache.org/jira/browse/ARROW-6360) - [R] Update support for compression
+* [ARROW-6362](https://issues.apache.org/jira/browse/ARROW-6362) - [C++] S3: more flexible credential options
+* [ARROW-6365](https://issues.apache.org/jira/browse/ARROW-6365) - [R] Should be able to coerce numeric to integer with schema
+* [ARROW-6366](https://issues.apache.org/jira/browse/ARROW-6366) - [Java] Make field vectors final explicitly
+* [ARROW-6368](https://issues.apache.org/jira/browse/ARROW-6368) - [C++] Add RecordBatch projection functionality
+* [ARROW-6373](https://issues.apache.org/jira/browse/ARROW-6373) - [C++] Make FixedWidthBinaryBuilder consistent with other primitive fixed width builders
+* [ARROW-6375](https://issues.apache.org/jira/browse/ARROW-6375) - [C++] Extend ConversionTraits to allow efficiently appending list values in STL API
+* [ARROW-6379](https://issues.apache.org/jira/browse/ARROW-6379) - [C++] Do not append any buffers when serializing NullType for IPC
+* [ARROW-6381](https://issues.apache.org/jira/browse/ARROW-6381) - [C++] BufferOutputStream::Write is slow for many small writes
+* [ARROW-6383](https://issues.apache.org/jira/browse/ARROW-6383) - [Java] Report outstanding child allocators on parent allocator close
+* [ARROW-6384](https://issues.apache.org/jira/browse/ARROW-6384) - [C++] Bump dependencies
+* [ARROW-6385](https://issues.apache.org/jira/browse/ARROW-6385) - [C++] Investigate xxh3
+* [ARROW-6391](https://issues.apache.org/jira/browse/ARROW-6391) - [Python][Flight] Add built-in methods on FlightServerBase to start server and wait for it to be available
+* [ARROW-6397](https://issues.apache.org/jira/browse/ARROW-6397) - [C++][CI] Fix S3 minio failure
+* [ARROW-6401](https://issues.apache.org/jira/browse/ARROW-6401) - [Java] Implement dictionary-encoded subfields for Struct type
+* [ARROW-6402](https://issues.apache.org/jira/browse/ARROW-6402) - [C++] Suppress sign-compare warning with g++ 9.2.1
+* [ARROW-6403](https://issues.apache.org/jira/browse/ARROW-6403) - [Python] Expose FileReader::ReadRowGroups() to Python
+* [ARROW-6408](https://issues.apache.org/jira/browse/ARROW-6408) - [Rust] Use "if cfg!" pattern in SIMD kernel implementations
+* [ARROW-6413](https://issues.apache.org/jira/browse/ARROW-6413) - [R] Support autogenerating column names
+* [ARROW-6415](https://issues.apache.org/jira/browse/ARROW-6415) - [R] Remove usage of R CMD config CXXCPP
+* [ARROW-6416](https://issues.apache.org/jira/browse/ARROW-6416) - [Python] Confusing API & documentation regarding chunksizes
+* [ARROW-6417](https://issues.apache.org/jira/browse/ARROW-6417) - [C++][Parquet] Non-dictionary BinaryArray reads from Parquet format have slowed down since 0.11.x
+* [ARROW-6419](https://issues.apache.org/jira/browse/ARROW-6419) - [Website] Blog post about Parquet dictionary performance work coming in 0.15.x release
+* [ARROW-6422](https://issues.apache.org/jira/browse/ARROW-6422) - [Gandiva] Fix double-conversion linker issue
+* [ARROW-6426](https://issues.apache.org/jira/browse/ARROW-6426) - [FlightRPC] Expose gRPC configuration knobs in Flight
+* [ARROW-6427](https://issues.apache.org/jira/browse/ARROW-6427) - [GLib] Add support for column names autogeneration CSV read option
+* [ARROW-6438](https://issues.apache.org/jira/browse/ARROW-6438) - [R] Add bindings for filesystem API
+* [ARROW-6447](https://issues.apache.org/jira/browse/ARROW-6447) - [C++] Builds with ARROW\_JEMALLOC=ON wait until jemalloc\_ep is complete before building any libarrow .cc files
+* [ARROW-6450](https://issues.apache.org/jira/browse/ARROW-6450) - [C++] Use 2x reallocation strategy in arrow::BufferBuilder instead of 1.5x
+* [ARROW-6451](https://issues.apache.org/jira/browse/ARROW-6451) - [Format] Add clarifications to Columnar.rst about the contents of "null" slots in Varbinary or List arrays
+* [ARROW-6453](https://issues.apache.org/jira/browse/ARROW-6453) - [C++] More informative error messages from S3
+* [ARROW-6454](https://issues.apache.org/jira/browse/ARROW-6454) - [Developer] Add LLVM license to LICENSE.txt due to binary redistribution in packages
+* [ARROW-6458](https://issues.apache.org/jira/browse/ARROW-6458) - [Java] Remove value boxing/unboxing for ApproxEqualsVisitor
+* [ARROW-6460](https://issues.apache.org/jira/browse/ARROW-6460) - [Java] Add benchmark and large fake data UT for Avro adapter
+* [ARROW-6462](https://issues.apache.org/jira/browse/ARROW-6462) - [C++] Can't build with bundled double-conversion on CentOS 6 x86\_64
+* [ARROW-6465](https://issues.apache.org/jira/browse/ARROW-6465) - [Python] Improve Windows build instructions
+* [ARROW-6474](https://issues.apache.org/jira/browse/ARROW-6474) - [Python] Provide mechanism for Python to write out old format
+* [ARROW-6475](https://issues.apache.org/jira/browse/ARROW-6475) - [C++] Don't try to dictionary encode dictionary arrays
+* [ARROW-6477](https://issues.apache.org/jira/browse/ARROW-6477) - [Packaging][Crossbow] Use Azure Pipelines to build Linux packages
+* [ARROW-6480](https://issues.apache.org/jira/browse/ARROW-6480) - [Developer] Add command to generate and send e-mail report for a Crossbow run
+* [ARROW-6484](https://issues.apache.org/jira/browse/ARROW-6484) - [Java] Enable creating indexType for DictionaryEncoding according to dictionary value count
+* [ARROW-6487](https://issues.apache.org/jira/browse/ARROW-6487) - [Rust] [DataFusion] Create test utils module
+* [ARROW-6489](https://issues.apache.org/jira/browse/ARROW-6489) - [Developer][Documentation] Fix merge script and readme
+* [ARROW-6490](https://issues.apache.org/jira/browse/ARROW-6490) - [Java] Log error for leak in allocator close
+* [ARROW-6491](https://issues.apache.org/jira/browse/ARROW-6491) - [Java] Fix master build failure caused by ErrorProne
+* [ARROW-6494](https://issues.apache.org/jira/browse/ARROW-6494) - [C++][Dataset] Implement basic PartitionScheme
+* [ARROW-6504](https://issues.apache.org/jira/browse/ARROW-6504) - [Python][Packaging] Add mimalloc to conda packages for better performance
+* [ARROW-6505](https://issues.apache.org/jira/browse/ARROW-6505) - [Website] Add new committers
+* [ARROW-6518](https://issues.apache.org/jira/browse/ARROW-6518) - [Packaging][Python] Flight failing in OSX Python wheel builds
+* [ARROW-6519](https://issues.apache.org/jira/browse/ARROW-6519) - [Java] Use IPC continuation token to mark EOS
+* [ARROW-6524](https://issues.apache.org/jira/browse/ARROW-6524) - [Developer][Packaging] Nightly build report's subject should contain Arrow
+* [ARROW-6525](https://issues.apache.org/jira/browse/ARROW-6525) - [C++] CloseFromDestructor() should perhaps not crash
+* [ARROW-6526](https://issues.apache.org/jira/browse/ARROW-6526) - [C++] Poison data in PoolBuffer destructor
+* [ARROW-6527](https://issues.apache.org/jira/browse/ARROW-6527) - [C++] Add OutputStream::Write() variant taking an owned buffer
+* [ARROW-6531](https://issues.apache.org/jira/browse/ARROW-6531) - [Python] Add detach() method to buffered streams
+* [ARROW-6532](https://issues.apache.org/jira/browse/ARROW-6532) - [R] Write parquet files with compression
+* [ARROW-6533](https://issues.apache.org/jira/browse/ARROW-6533) - [R] Compression codec should take a "level"
+* [ARROW-6534](https://issues.apache.org/jira/browse/ARROW-6534) - [Java] Fix typos and spelling
+* [ARROW-6539](https://issues.apache.org/jira/browse/ARROW-6539) - [R] Provide mechanism to write out old format
+* [ARROW-6540](https://issues.apache.org/jira/browse/ARROW-6540) - [R] Add Validate() methods
+* [ARROW-6541](https://issues.apache.org/jira/browse/ARROW-6541) - [Format][C++] Use two-part EOS and amend Format documentation
+* [ARROW-6542](https://issues.apache.org/jira/browse/ARROW-6542) - [R] Add View() method to array types
+* [ARROW-6544](https://issues.apache.org/jira/browse/ARROW-6544) - [R] Documentation/polishing for 0.15 release
+* [ARROW-6545](https://issues.apache.org/jira/browse/ARROW-6545) - [Go] Update Go IPC writer to use two-part EOS per mailing list discussion
+* [ARROW-6546](https://issues.apache.org/jira/browse/ARROW-6546) - [C++] Add missing FlatBuffers source dependency
+* [ARROW-6549](https://issues.apache.org/jira/browse/ARROW-6549) - [C++] Switch back to latest jemalloc 5.x
+* [ARROW-6556](https://issues.apache.org/jira/browse/ARROW-6556) - [Python] Prepare for pandas release without SparseDataFrame
+* [ARROW-6557](https://issues.apache.org/jira/browse/ARROW-6557) - [Python] Always return pandas.Series from Array/ChunkedArray.to\_pandas, propagate field names to Series from RecordBatch, Table
+* [ARROW-6558](https://issues.apache.org/jira/browse/ARROW-6558) - [C++] Refactor Iterator to a type erased handle
+* [ARROW-6559](https://issues.apache.org/jira/browse/ARROW-6559) - [Developer][C++] Add "archery" option to specify system toolchain for C++ builds
+* [ARROW-6563](https://issues.apache.org/jira/browse/ARROW-6563) - [Rust] [DataFusion] Create "merge" execution plan
+* [ARROW-6569](https://issues.apache.org/jira/browse/ARROW-6569) - [Website] Add support for auto deployment by GitHub Actions
+* [ARROW-6570](https://issues.apache.org/jira/browse/ARROW-6570) - [Python] Use MemoryPool to allocate memory for NumPy arrays in to\_pandas calls
+* [ARROW-6580](https://issues.apache.org/jira/browse/ARROW-6580) - [Java] Support comparison for unsigned integers
+* [ARROW-6584](https://issues.apache.org/jira/browse/ARROW-6584) - [Python][Wheel] Bundle zlib again with the windows wheels
+* [ARROW-6588](https://issues.apache.org/jira/browse/ARROW-6588) - [C++] Suppress class-memaccess warning with g++ 9.2.1
+* [ARROW-6589](https://issues.apache.org/jira/browse/ARROW-6589) - [C++] Support BinaryType in MakeArrayOfNull
+* [ARROW-6590](https://issues.apache.org/jira/browse/ARROW-6590) - [C++] Do not require ARROW\_JSON=ON when ARROW\_IPC=ON
+* [ARROW-6591](https://issues.apache.org/jira/browse/ARROW-6591) - [R] Ignore .Rhistory files in source control
+* [ARROW-6599](https://issues.apache.org/jira/browse/ARROW-6599) - [Rust] [DataFusion] Implement SUM aggregate expression
+* [ARROW-6601](https://issues.apache.org/jira/browse/ARROW-6601) - [Java] Improve JDBC adapter performance & add benchmark
+* [ARROW-6605](https://issues.apache.org/jira/browse/ARROW-6605) - [C++] Add recursion depth control to fs::Selector
+* [ARROW-6606](https://issues.apache.org/jira/browse/ARROW-6606) - [C++] Construct tree structure from std::vector<fs::FileStats\>
+* [ARROW-6609](https://issues.apache.org/jira/browse/ARROW-6609) - [C++] Add minimal build Dockerfile example
+* [ARROW-6613](https://issues.apache.org/jira/browse/ARROW-6613) - [C++] Remove dependency on boost::filesystem
+* [ARROW-6614](https://issues.apache.org/jira/browse/ARROW-6614) - [C++][Dataset] Implement FileSystemDataSourceDiscovery
+* [ARROW-6616](https://issues.apache.org/jira/browse/ARROW-6616) - [Website] Release announcement blog post for 0.15
+* [ARROW-6621](https://issues.apache.org/jira/browse/ARROW-6621) - [Rust][DataFusion] Examples for DataFusion are not executed in CI
+* [ARROW-6629](https://issues.apache.org/jira/browse/ARROW-6629) - [Doc][C++] Document the FileSystem API
+* [ARROW-6630](https://issues.apache.org/jira/browse/ARROW-6630) - [Doc][C++] Document the file readers (CSV, JSON, Parquet, etc.)
+* [ARROW-6644](https://issues.apache.org/jira/browse/ARROW-6644) - [JS] Amend NullType IPC protocol to append no buffers
+* [ARROW-6647](https://issues.apache.org/jira/browse/ARROW-6647) - [C++] Can't build with g++ 4.8.5 on CentOS 7 due to member initializer for shared\_ptr
+* [ARROW-6648](https://issues.apache.org/jira/browse/ARROW-6648) - [Go] Expose the bitutil package
+* [ARROW-6649](https://issues.apache.org/jira/browse/ARROW-6649) - [R] print() methods for Table, RecordBatch, etc.
+* [ARROW-6653](https://issues.apache.org/jira/browse/ARROW-6653) - [Developer] Add support for auto JIRA link on pull request
+* [ARROW-6655](https://issues.apache.org/jira/browse/ARROW-6655) - [Python] Filesystem bindings for S3
+* [ARROW-6664](https://issues.apache.org/jira/browse/ARROW-6664) - [C++] Add option to build without SSE4.2
+* [ARROW-6665](https://issues.apache.org/jira/browse/ARROW-6665) - [Rust] [DataFusion] Implement numeric literal expressions
+* [ARROW-6667](https://issues.apache.org/jira/browse/ARROW-6667) - [Python] Avoid Reference Cycles in pyarrow.parquet
+* [ARROW-6668](https://issues.apache.org/jira/browse/ARROW-6668) - [Rust] [DataFusion] Implement CAST expression
+* [ARROW-6669](https://issues.apache.org/jira/browse/ARROW-6669) - [Rust] [DataFusion] Implement physical expression for binary expressions
+* [ARROW-6675](https://issues.apache.org/jira/browse/ARROW-6675) - [JS] Add scanReverse function to dataFrame and filteredDataframe
+* [ARROW-6683](https://issues.apache.org/jira/browse/ARROW-6683) - [Python] Add unit tests that validate cross-compatibility with pyarrow.parquet when fastparquet is installed
+* [ARROW-6725](https://issues.apache.org/jira/browse/ARROW-6725) - [CI] Disable 3rdparty fuzzit nightly builds
+* [ARROW-6735](https://issues.apache.org/jira/browse/ARROW-6735) - [C++] Suppress sign-compare warning with g++ 9.2.1
+* [ARROW-6752](https://issues.apache.org/jira/browse/ARROW-6752) - [Go] Implement Stringer for Null array
+* [ARROW-6755](https://issues.apache.org/jira/browse/ARROW-6755) - [Release] Improvements to Windows release verification script
+* [ARROW-6771](https://issues.apache.org/jira/browse/ARROW-6771) - [Packaging][Python] Missing pytest dependency from conda and wheel builds
+* [PARQUET-1468](https://issues.apache.org/jira/browse/PARQUET-1468) - [C++] Consolidate RecordReader, ColumnReader code paths
+
+
+## Bug Fixes
+
+* [ARROW-1184](https://issues.apache.org/jira/browse/ARROW-1184) - [Java] Dictionary.equals is not working correctly
+* [ARROW-2041](https://issues.apache.org/jira/browse/ARROW-2041) - [Python] pyarrow.serialize has high overhead for list of NumPy arrays
+* [ARROW-2248](https://issues.apache.org/jira/browse/ARROW-2248) - [Python] Nightly or on-demand HDFS test builds
+* [ARROW-2317](https://issues.apache.org/jira/browse/ARROW-2317) - [Python] Fix C linkage warning
+* [ARROW-2490](https://issues.apache.org/jira/browse/ARROW-2490) - [C++] Input stream locking is inconsistent
+* [ARROW-3176](https://issues.apache.org/jira/browse/ARROW-3176) - [Python] Overflow in Date32 column conversion to pandas
+* [ARROW-3203](https://issues.apache.org/jira/browse/ARROW-3203) - [C++] Build error on Debian Buster
+* [ARROW-3651](https://issues.apache.org/jira/browse/ARROW-3651) - [Python] Datetimes from non-DateTimeIndex cannot be deserialized
+* [ARROW-3652](https://issues.apache.org/jira/browse/ARROW-3652) - [Python] CategoricalIndex is lost after reading back
+* [ARROW-3762](https://issues.apache.org/jira/browse/ARROW-3762) - [C++] Parquet arrow::Table reads error when overflowing capacity of BinaryArray
+* [ARROW-3933](https://issues.apache.org/jira/browse/ARROW-3933) - [Python] Segfault reading Parquet files from GNOMAD
+* [ARROW-4187](https://issues.apache.org/jira/browse/ARROW-4187) - [C++] file-benchmark uses <poll.h\>
+* [ARROW-4746](https://issues.apache.org/jira/browse/ARROW-4746) - [C++/Python] PyDateTime\_Date wrongly cast to PyDateTime\_DateTime
+* [ARROW-4836](https://issues.apache.org/jira/browse/ARROW-4836) - [Python] "Cannot tell() a compressed stream" when using RecordBatchStreamWriter
+* [ARROW-4848](https://issues.apache.org/jira/browse/ARROW-4848) - [C++] Static libparquet not compiled with -DARROW\_STATIC on Windows
+* [ARROW-4880](https://issues.apache.org/jira/browse/ARROW-4880) - [Python] python/asv-build.sh is probably broken after CMake refactor
+* [ARROW-4883](https://issues.apache.org/jira/browse/ARROW-4883) - [Python] read\_csv() returns garbage if given file object in text mode
+* [ARROW-5028](https://issues.apache.org/jira/browse/ARROW-5028) - [Python][C++] Creating list<string\> with pyarrow.array can overflow child builder
+* [ARROW-5072](https://issues.apache.org/jira/browse/ARROW-5072) - [Python] write\_table fails silently on S3 errors
+* [ARROW-5085](https://issues.apache.org/jira/browse/ARROW-5085) - [Python/C++] Conversion of dict encoded null column fails in parquet writing when using RowGroups
+* [ARROW-5086](https://issues.apache.org/jira/browse/ARROW-5086) - [Python] Space leak in ParquetFile.read\_row\_group()
+* [ARROW-5089](https://issues.apache.org/jira/browse/ARROW-5089) - [C++/Python] Writing dictionary encoded columns to parquet is extremely slow when using chunk size
+* [ARROW-5103](https://issues.apache.org/jira/browse/ARROW-5103) - [Python] Segfault when using chunked\_array.to\_pandas on arrays of different types (edge case)
+* [ARROW-5125](https://issues.apache.org/jira/browse/ARROW-5125) - [Python] Cannot roundtrip extreme dates through pyarrow
+* [ARROW-5161](https://issues.apache.org/jira/browse/ARROW-5161) - [Python] Cannot convert struct type from Pandas object column
+* [ARROW-5220](https://issues.apache.org/jira/browse/ARROW-5220) - [Python] index / unknown columns in specified schema in Table.from\_pandas
+* [ARROW-5292](https://issues.apache.org/jira/browse/ARROW-5292) - [C++] Static libraries are built on AppVeyor
+* [ARROW-5300](https://issues.apache.org/jira/browse/ARROW-5300) - [C++] 0.13 fails to build with option -DARROW\_NO\_DEFAULT\_MEMORY\_POOL
+* [ARROW-5374](https://issues.apache.org/jira/browse/ARROW-5374) - [Python] Misleading error message when calling pyarrow.read\_record\_batch on a complete IPC stream
+* [ARROW-5414](https://issues.apache.org/jira/browse/ARROW-5414) - [C++] Using "Ninja" build system generator overrides default Release build type on Windows
+* [ARROW-5450](https://issues.apache.org/jira/browse/ARROW-5450) - [Python] TimestampArray.to\_pylist() fails with OverflowError: Python int too large to convert to C long
+* [ARROW-5471](https://issues.apache.org/jira/browse/ARROW-5471) - [C++][Gandiva] Array offset is ignored in Gandiva projector
+* [ARROW-5522](https://issues.apache.org/jira/browse/ARROW-5522) - [Packaging][Documentation] Comments out of date in python/manylinux1/build\_arrow.sh
+* [ARROW-5525](https://issues.apache.org/jira/browse/ARROW-5525) - [C++][CI] Enable continuous fuzzing
+* [ARROW-5560](https://issues.apache.org/jira/browse/ARROW-5560) - [C++][Plasma] Cannot create Plasma object after OutOfMemory error
+* [ARROW-5562](https://issues.apache.org/jira/browse/ARROW-5562) - [C++][Parquet] Parquet writer does not handle negative zero correctly
+* [ARROW-5630](https://issues.apache.org/jira/browse/ARROW-5630) - [Python][Parquet] Table of nested arrays doesn't round trip
+* [ARROW-5638](https://issues.apache.org/jira/browse/ARROW-5638) - [C++] CMake fails to generate Xcode project when Gandiva JNI bindings are enabled
+* [ARROW-5651](https://issues.apache.org/jira/browse/ARROW-5651) - [Python] Incorrect conversion from strided Numpy array when other type is specified
+* [ARROW-5682](https://issues.apache.org/jira/browse/ARROW-5682) - [Python] from\_pandas conversion casts values to string inconsistently
+* [ARROW-5731](https://issues.apache.org/jira/browse/ARROW-5731) - [CI] Turbodbc integration tests are failing
+* [ARROW-5753](https://issues.apache.org/jira/browse/ARROW-5753) - [Rust] Fix test failure in CI code coverage
+* [ARROW-5772](https://issues.apache.org/jira/browse/ARROW-5772) - [GLib][Plasma][CUDA] Plasma::Client\#refer\_object test fails
+* [ARROW-5775](https://issues.apache.org/jira/browse/ARROW-5775) - [C++] StructArray : cached boxed fields not thread-safe
+* [ARROW-5776](https://issues.apache.org/jira/browse/ARROW-5776) - [Gandiva][Crossbow] Revert template to have commit ids
+* [ARROW-5790](https://issues.apache.org/jira/browse/ARROW-5790) - [Python] Passing zero-dim numpy array to pa.array causes segfault
+* [ARROW-5817](https://issues.apache.org/jira/browse/ARROW-5817) - [Python] Use pytest marks for Flight test to avoid silently skipping unit tests due to import failures
+* [ARROW-5823](https://issues.apache.org/jira/browse/ARROW-5823) - [Rust] CI scripts miss --all-targets cargo argument
+* [ARROW-5824](https://issues.apache.org/jira/browse/ARROW-5824) - [Gandiva] [C++] Fix decimal null
+* [ARROW-5836](https://issues.apache.org/jira/browse/ARROW-5836) - [Java][OSX] Flight tests are failing: address already in use
+* [ARROW-5838](https://issues.apache.org/jira/browse/ARROW-5838) - [C++][Flight][OSX] Building 3rdparty grpc cannot find OpenSSL
+* [ARROW-5848](https://issues.apache.org/jira/browse/ARROW-5848) - [C++] SO versioning schema after release 1.0.0
+* [ARROW-5849](https://issues.apache.org/jira/browse/ARROW-5849) - [C++] Compiler warnings on mingw-w64
+* [ARROW-5850](https://issues.apache.org/jira/browse/ARROW-5850) - [CI][R] R appveyor job is broken after release
+* [ARROW-5851](https://issues.apache.org/jira/browse/ARROW-5851) - [C++] Compilation of reference benchmarks fails
+* [ARROW-5856](https://issues.apache.org/jira/browse/ARROW-5856) - [Python] Linking 3rd party Cython modules against pyarrow fails since 0.14.0
+* [ARROW-5860](https://issues.apache.org/jira/browse/ARROW-5860) - [Java] [Vector] Fix decimal byte setter
+* [ARROW-5863](https://issues.apache.org/jira/browse/ARROW-5863) - [Python] Segmentation Fault via pytest-runner
+* [ARROW-5868](https://issues.apache.org/jira/browse/ARROW-5868) - [Python] manylinux2010 wheels have shared library dependency on liblz4
+* [ARROW-5870](https://issues.apache.org/jira/browse/ARROW-5870) - [C++] Development compile instructions need to include "make"
+* [ARROW-5873](https://issues.apache.org/jira/browse/ARROW-5873) - [Python] Segmentation fault when comparing schema with None
+* [ARROW-5874](https://issues.apache.org/jira/browse/ARROW-5874) - [Python] pyarrow 0.14.0 macOS wheels depend on shared libs under /usr/local/opt
+* [ARROW-5878](https://issues.apache.org/jira/browse/ARROW-5878) - [Python][C++] Parquet reader not forward compatible for timestamps without timezone
+* [ARROW-5884](https://issues.apache.org/jira/browse/ARROW-5884) - [Java] Fix the get method of StructVector
+* [ARROW-5886](https://issues.apache.org/jira/browse/ARROW-5886) - [Python][Packaging] Manylinux1/2010 compliance issue with libz
+* [ARROW-5887](https://issues.apache.org/jira/browse/ARROW-5887) - [C\#] ArrowStreamWriter writes FieldNodes in the wrong order
+* [ARROW-5889](https://issues.apache.org/jira/browse/ARROW-5889) - [Python][C++] Parquet backwards compat for timestamps without timezone broken
+* [ARROW-5894](https://issues.apache.org/jira/browse/ARROW-5894) - [C++] libgandiva.so.14 is exporting libstdc++ symbols
+* [ARROW-5899](https://issues.apache.org/jira/browse/ARROW-5899) - [Python][Packaging] Bundle uriparser.dll in windows wheels
+* [ARROW-5910](https://issues.apache.org/jira/browse/ARROW-5910) - [Python] read\_tensor() fails on non-seekable streams
+* [ARROW-5921](https://issues.apache.org/jira/browse/ARROW-5921) - [C++][Fuzzing] Missing nullptr checks in IPC
+* [ARROW-5923](https://issues.apache.org/jira/browse/ARROW-5923) - [C++] Fix int96 comment
+* [ARROW-5925](https://issues.apache.org/jira/browse/ARROW-5925) - [Gandiva][C++] cast decimal to int should round up
+* [ARROW-5930](https://issues.apache.org/jira/browse/ARROW-5930) - [FlightRPC] [Python] Flight CI tests are failing
+* [ARROW-5935](https://issues.apache.org/jira/browse/ARROW-5935) - [C++] ArrayBuilders with mutable type are not robustly supported
+* [ARROW-5946](https://issues.apache.org/jira/browse/ARROW-5946) - [Rust] [DataFusion] Projection push down with aggregate producing incorrect results
+* [ARROW-5952](https://issues.apache.org/jira/browse/ARROW-5952) - [Python] Segfault when reading empty table with category as pandas dataframe
+* [ARROW-5959](https://issues.apache.org/jira/browse/ARROW-5959) - [C++][CI] Fuzzit does not know about branch + commit hash
+* [ARROW-5960](https://issues.apache.org/jira/browse/ARROW-5960) - [C++] Boost dependencies are specified in wrong order
+* [ARROW-5963](https://issues.apache.org/jira/browse/ARROW-5963) - [R] R Appveyor job does not test changes in the C++ library
+* [ARROW-5964](https://issues.apache.org/jira/browse/ARROW-5964) - [C++][Gandiva] Cast double to decimal with rounding returns 0
+* [ARROW-5965](https://issues.apache.org/jira/browse/ARROW-5965) - [Python] Regression: segfault when reading hive table with v0.14
+* [ARROW-5966](https://issues.apache.org/jira/browse/ARROW-5966) - [Python] Capacity error when converting large UTF32 numpy array to arrow array
+* [ARROW-5968](https://issues.apache.org/jira/browse/ARROW-5968) - [Java] Remove duplicate Preconditions check in JDBC adapter
+* [ARROW-5969](https://issues.apache.org/jira/browse/ARROW-5969) - [CI] [R] Lint failures
+* [ARROW-5973](https://issues.apache.org/jira/browse/ARROW-5973) - [Java] Variable width vectors' get methods should return null when the underlying data is null
+* [ARROW-5978](https://issues.apache.org/jira/browse/ARROW-5978) - [FlightRPC] [Java] Integration test client doesn't close buffers
+* [ARROW-5989](https://issues.apache.org/jira/browse/ARROW-5989) - [C++][Python] pyarrow.lib.ArrowIOError: Unable to load libjvm when using openjdk-8
+* [ARROW-5990](https://issues.apache.org/jira/browse/ARROW-5990) - [Python] RowGroupMetaData.column misses bounds check
+* [ARROW-5992](https://issues.apache.org/jira/browse/ARROW-5992) - [C++] Array::View fails for string/utf8 as binary
+* [ARROW-5993](https://issues.apache.org/jira/browse/ARROW-5993) - [Python] Reading a dictionary column from Parquet results in disproportionate memory usage
+* [ARROW-5996](https://issues.apache.org/jira/browse/ARROW-5996) - [Java] Avoid resource leak in flight service
+* [ARROW-5999](https://issues.apache.org/jira/browse/ARROW-5999) - [C++] Required header files missing when built with -DARROW\_DATASET=OFF
+* [ARROW-6002](https://issues.apache.org/jira/browse/ARROW-6002) - [C++][Gandiva] TestCastFunctions does not test int64 casting
+* [ARROW-6004](https://issues.apache.org/jira/browse/ARROW-6004) - [C++] CSV reader ignore\_empty\_lines option doesn't handle empty lines
+* [ARROW-6005](https://issues.apache.org/jira/browse/ARROW-6005) - [C++] parquet::arrow::FileReader::GetRecordBatchReader() does not behave as documented since ARROW-1012
+* [ARROW-6006](https://issues.apache.org/jira/browse/ARROW-6006) - [C++] Empty IPC streams containing a dictionary are corrupt
+* [ARROW-6012](https://issues.apache.org/jira/browse/ARROW-6012) - [C++] Fall back on known Apache mirror for Thrift downloads
+* [ARROW-6015](https://issues.apache.org/jira/browse/ARROW-6015) - [Python] pyarrow wheel: \`DLL load failed\` when importing on Windows
+* [ARROW-6016](https://issues.apache.org/jira/browse/ARROW-6016) - [Python] pyarrow get\_library\_dirs assertion error
+* [ARROW-6029](https://issues.apache.org/jira/browse/ARROW-6029) - [R] Improve R docs on how to fix library version mismatch
+* [ARROW-6032](https://issues.apache.org/jira/browse/ARROW-6032) - [C++] CountSetBits doesn't ensure 64-bit aligned accesses
+* [ARROW-6038](https://issues.apache.org/jira/browse/ARROW-6038) - [Python] pyarrow.Table.from\_batches produces corrupted table if any of the batches were empty
+* [ARROW-6040](https://issues.apache.org/jira/browse/ARROW-6040) - [Java] Dictionary entries are required in IPC streams even when empty
+* [ARROW-6046](https://issues.apache.org/jira/browse/ARROW-6046) - [C++] Slice RecordBatch of String array with offset 0 returns whole batch
+* [ARROW-6047](https://issues.apache.org/jira/browse/ARROW-6047) - [Rust] Rust nightly 1.38.0 builds failing
+* [ARROW-6050](https://issues.apache.org/jira/browse/ARROW-6050) - [Java] Update out-of-date java/flight/README.md
+* [ARROW-6054](https://issues.apache.org/jira/browse/ARROW-6054) - pyarrow.serialize should respect the values of numpy structured dtypes
+* [ARROW-6058](https://issues.apache.org/jira/browse/ARROW-6058) - [Python][Parquet] Failure when reading Parquet file from S3 with s3fs
+* [ARROW-6059](https://issues.apache.org/jira/browse/ARROW-6059) - [Python] Regression memory issue when calling pandas.read\_parquet
+* [ARROW-6060](https://issues.apache.org/jira/browse/ARROW-6060) - [Python] too large memory cost using pyarrow.parquet.read\_table with use\_threads=True
+* [ARROW-6061](https://issues.apache.org/jira/browse/ARROW-6061) - [C++] Cannot build libarrow without rapidjson
+* [ARROW-6066](https://issues.apache.org/jira/browse/ARROW-6066) - [Website] Fix blog post author header
+* [ARROW-6067](https://issues.apache.org/jira/browse/ARROW-6067) - [Python] Large memory test failures
+* [ARROW-6068](https://issues.apache.org/jira/browse/ARROW-6068) - [Python] Hypothesis test failure; add StructType::Make that accepts vector of fields
+* [ARROW-6073](https://issues.apache.org/jira/browse/ARROW-6073) - [C++] Decimal128Builder is not reset in Finish()
+* [ARROW-6082](https://issues.apache.org/jira/browse/ARROW-6082) - [Python] Creating pa.dictionary() type with non-integer indices type crashes
+* [ARROW-6092](https://issues.apache.org/jira/browse/ARROW-6092) - [C++] Python 2.7: arrow\_python\_test failure
+* [ARROW-6095](https://issues.apache.org/jira/browse/ARROW-6095) - [C++] Python subproject ignores ARROW\_TEST\_LINKAGE
+* [ARROW-6108](https://issues.apache.org/jira/browse/ARROW-6108) - [C++] Appveyor Build\_Debug configuration is hanging in C++ unit tests
+* [ARROW-6116](https://issues.apache.org/jira/browse/ARROW-6116) - [C++][Gandiva] Fix bug in TimedTestFilterAdd2
+* [ARROW-6117](https://issues.apache.org/jira/browse/ARROW-6117) - [Java] Fix the set method of FixedSizeBinaryVector
+* [ARROW-6119](https://issues.apache.org/jira/browse/ARROW-6119) - [Python] PyArrow wheel import fails on Windows Python 3.7
+* [ARROW-6120](https://issues.apache.org/jira/browse/ARROW-6120) - [C++][Gandiva] including some headers causes decimal\_test to fail
+* [ARROW-6126](https://issues.apache.org/jira/browse/ARROW-6126) - [C++] IPC stream reader handling of empty streams potentially not robust
+* [ARROW-6132](https://issues.apache.org/jira/browse/ARROW-6132) - [Python] ListArray.from\_arrays does not check validity of input arrays
+* [ARROW-6135](https://issues.apache.org/jira/browse/ARROW-6135) - [C++] KeyValueMetadata::Equals should not be order-sensitive
+* [ARROW-6136](https://issues.apache.org/jira/browse/ARROW-6136) - [FlightRPC][Java] Don't double-close response stream
+* [ARROW-6145](https://issues.apache.org/jira/browse/ARROW-6145) - [Java] UnionVector created by MinorType\#getNewVector could not keep field type info properly
+* [ARROW-6148](https://issues.apache.org/jira/browse/ARROW-6148) - [C++][Packaging] Improve aarch64 support
+* [ARROW-6152](https://issues.apache.org/jira/browse/ARROW-6152) - [C++][Parquet] Write arrow::Array directly into parquet::TypedColumnWriter<T\>
+* [ARROW-6153](https://issues.apache.org/jira/browse/ARROW-6153) - [R] Address parquet deprecation warning
+* [ARROW-6158](https://issues.apache.org/jira/browse/ARROW-6158) - [Python] Possible to create StructArray with type that conflicts with child array's types
+* [ARROW-6159](https://issues.apache.org/jira/browse/ARROW-6159) - [C++] PrettyPrint of arrow::Schema missing indentation for first line
+* [ARROW-6160](https://issues.apache.org/jira/browse/ARROW-6160) - [Java] AbstractStructVector\#getPrimitiveVectors fails to work with complex child vectors
+* [ARROW-6166](https://issues.apache.org/jira/browse/ARROW-6166) - [Go] Slice of slice causes index out of range panic
+* [ARROW-6167](https://issues.apache.org/jira/browse/ARROW-6167) - [R] macOS binary R packages on CRAN don't have arrow\_available
+* [ARROW-6168](https://issues.apache.org/jira/browse/ARROW-6168) - [C++] IWYU docker-compose job is broken
+* [ARROW-6170](https://issues.apache.org/jira/browse/ARROW-6170) - [R] "docker-compose build r" is slow
+* [ARROW-6171](https://issues.apache.org/jira/browse/ARROW-6171) - [R] "docker-compose run r" fails
+* [ARROW-6174](https://issues.apache.org/jira/browse/ARROW-6174) - [C++] Validate chunks in ChunkedArray::Validate
+* [ARROW-6175](https://issues.apache.org/jira/browse/ARROW-6175) - [Java] Fix MapVector\#getMinorType and extend AbstractContainerVector addOrGet complex vector API
+* [ARROW-6178](https://issues.apache.org/jira/browse/ARROW-6178) - [Developer] Don't fail in merge script on bad primary author input in multi-author PRs
+* [ARROW-6182](https://issues.apache.org/jira/browse/ARROW-6182) - [R] Add note to README about r-arrow conda installation
+* [ARROW-6186](https://issues.apache.org/jira/browse/ARROW-6186) - [Packaging][C++] Plasma headers not included for ubuntu-xenial libplasma-dev debian package
+* [ARROW-6190](https://issues.apache.org/jira/browse/ARROW-6190) - [C++] Define and declare functions regardless of NDEBUG
+* [ARROW-6193](https://issues.apache.org/jira/browse/ARROW-6193) - [GLib] Add missing require in test
+* [ARROW-6200](https://issues.apache.org/jira/browse/ARROW-6200) - [Java] Method getBufferSizeFor in BaseRepeatedValueVector/ListVector is not correct
+* [ARROW-6202](https://issues.apache.org/jira/browse/ARROW-6202) - [Java] Exception in thread "main" org.apache.arrow.memory.OutOfMemoryException: Unable to allocate buffer of size 4 due to memory limit. Current allocation: 2147483646
+* [ARROW-6205](https://issues.apache.org/jira/browse/ARROW-6205) - [C++] ARROW\_DEPRECATED warning when including io/interfaces.h from CUDA (.cu) source
+* [ARROW-6208](https://issues.apache.org/jira/browse/ARROW-6208) - [Java] Correct byte order before comparing in ByteFunctionHelpers
+* [ARROW-6210](https://issues.apache.org/jira/browse/ARROW-6210) - [Java] Remove equals API from ValueVector
+* [ARROW-6211](https://issues.apache.org/jira/browse/ARROW-6211) - [Java] Remove dependency on RangeEqualsVisitor from ValueVector interface
+* [ARROW-6214](https://issues.apache.org/jira/browse/ARROW-6214) - [R] Sanitizer errors triggered via R bindings
+* [ARROW-6215](https://issues.apache.org/jira/browse/ARROW-6215) - [Java] RangeEqualVisitor does not properly compare ZeroVector
+* [ARROW-6218](https://issues.apache.org/jira/browse/ARROW-6218) - [Java] Add UINT type test in integration to avoid potential overflow
+* [ARROW-6223](https://issues.apache.org/jira/browse/ARROW-6223) - [C++] Configuration error with Anaconda Python 3.7.4
+* [ARROW-6224](https://issues.apache.org/jira/browse/ARROW-6224) - [Python] Remaining usages of the 'data' attribute (from previous Column) cause warnings
+* [ARROW-6227](https://issues.apache.org/jira/browse/ARROW-6227) - [Python] pyarrow.array() shouldn't coerce np.nan to string
+* [ARROW-6234](https://issues.apache.org/jira/browse/ARROW-6234) - [Java] ListVector hashCode() is not correct
+* [ARROW-6241](https://issues.apache.org/jira/browse/ARROW-6241) - [Java] Failures on master
+* [ARROW-6255](https://issues.apache.org/jira/browse/ARROW-6255) - [Rust] [Parquet] Cannot use any published parquet crate due to parquet-format breaking change
+* [ARROW-6259](https://issues.apache.org/jira/browse/ARROW-6259) - [C++][CI] Flatbuffers-related failures in CI on macOS
+* [ARROW-6263](https://issues.apache.org/jira/browse/ARROW-6263) - [Python] RecordBatch.from\_arrays does not check array types against a passed schema
+* [ARROW-6266](https://issues.apache.org/jira/browse/ARROW-6266) - [Java] Resolve the ambiguous method overload in RangeEqualsVisitor
+* [ARROW-6268](https://issues.apache.org/jira/browse/ARROW-6268) - Empty buffer should have a valid address
+* [ARROW-6269](https://issues.apache.org/jira/browse/ARROW-6269) - [C++][Fuzzing] IPC reads do not check decimal precision
+* [ARROW-6270](https://issues.apache.org/jira/browse/ARROW-6270) - [C++][Fuzzing] IPC reads do not check buffer indices
+* [ARROW-6290](https://issues.apache.org/jira/browse/ARROW-6290) - [Rust] [DataFusion] sql\_csv example errors when running
+* [ARROW-6291](https://issues.apache.org/jira/browse/ARROW-6291) - [C++] CMake ignores ARROW\_PARQUET
+* [ARROW-6293](https://issues.apache.org/jira/browse/ARROW-6293) - [Rust] datafusion 0.15.0-SNAPSHOT error
+* [ARROW-6301](https://issues.apache.org/jira/browse/ARROW-6301) - [Python] atexit: pyarrow.lib.ArrowKeyError: 'No type extension with name arrow.py\_extension\_type found'
+* [ARROW-6302](https://issues.apache.org/jira/browse/ARROW-6302) - [Python][Parquet] Reading dictionary type with serialized Arrow schema does not restore "ordered" type property
+* [ARROW-6309](https://issues.apache.org/jira/browse/ARROW-6309) - [C++] Parquet tests and executables are linked statically
+* [ARROW-6323](https://issues.apache.org/jira/browse/ARROW-6323) - [R] Expand file paths when passing to readers
+* [ARROW-6325](https://issues.apache.org/jira/browse/ARROW-6325) - [Python] Wrong conversion of DataFrame with boolean values
+* [ARROW-6330](https://issues.apache.org/jira/browse/ARROW-6330) - [C++] Include missing headers in api.h
+* [ARROW-6332](https://issues.apache.org/jira/browse/ARROW-6332) - [Java][C++][Gandiva] Handle size of varchar vectors correctly
+* [ARROW-6339](https://issues.apache.org/jira/browse/ARROW-6339) - [Python][C++] Rowgroup statistics for pd.NaT array ill-defined
+* [ARROW-6343](https://issues.apache.org/jira/browse/ARROW-6343) - [Java] [Vector] Fix allocation helper
+* [ARROW-6344](https://issues.apache.org/jira/browse/ARROW-6344) - [C++][Gandiva] substring does not handle multibyte characters
+* [ARROW-6345](https://issues.apache.org/jira/browse/ARROW-6345) - [C++][Python] "ordered" flag seemingly not taken into account when comparing DictionaryType values for equality
+* [ARROW-6348](https://issues.apache.org/jira/browse/ARROW-6348) - [R] arrow::read\_csv\_arrow namespace error when package not loaded
+* [ARROW-6354](https://issues.apache.org/jira/browse/ARROW-6354) - [C++] Building without Parquet fails
+* [ARROW-6363](https://issues.apache.org/jira/browse/ARROW-6363) - [R] Segfault in Table\_\_from\_dots with unexpected schema
+* [ARROW-6364](https://issues.apache.org/jira/browse/ARROW-6364) - [R] Handling unexpected input to time64() et al
+* [ARROW-6369](https://issues.apache.org/jira/browse/ARROW-6369) - [Python] Support list-of-boolean in Array.to\_pandas conversion
+* [ARROW-6371](https://issues.apache.org/jira/browse/ARROW-6371) - [Doc] Row to columnar conversion example mentions arrow::Column in comments
+* [ARROW-6372](https://issues.apache.org/jira/browse/ARROW-6372) - [Rust][DataFusion] Casting from Unsigned to Signed Integers not supported
+* [ARROW-6376](https://issues.apache.org/jira/browse/ARROW-6376) - [Developer] PR merge script has "master" target ref hard-coded
+* [ARROW-6387](https://issues.apache.org/jira/browse/ARROW-6387) - [Archery] Errors with make
+* [ARROW-6392](https://issues.apache.org/jira/browse/ARROW-6392) - [Python][Flight] list\_actions Server RPC is not tested in test\_flight.py, nor is return value validated
+* [ARROW-6395](https://issues.apache.org/jira/browse/ARROW-6395) - [Python] Bug when using bool arrays with stride greater than 1
+* [ARROW-6406](https://issues.apache.org/jira/browse/ARROW-6406) - [C++] jemalloc\_ep fails for offline build
+* [ARROW-6411](https://issues.apache.org/jira/browse/ARROW-6411) - [C++][Parquet] DictEncoderImpl<T\>::PutIndicesTyped has bad performance on some systems
+* [ARROW-6412](https://issues.apache.org/jira/browse/ARROW-6412) - [C++] arrow-flight-test can crash because of port allocation
+* [ARROW-6418](https://issues.apache.org/jira/browse/ARROW-6418) - [C++] Plasma cmake targets are not exported
+* [ARROW-6423](https://issues.apache.org/jira/browse/ARROW-6423) - [Python] pyarrow.CompressedOutputStream() never completes with compression='snappy'
+* [ARROW-6424](https://issues.apache.org/jira/browse/ARROW-6424) - [C++][Fuzzing] Fuzzit nightly is broken
+* [ARROW-6425](https://issues.apache.org/jira/browse/ARROW-6425) - [C++] ValidateArray fails for slice of list array
+* [ARROW-6428](https://issues.apache.org/jira/browse/ARROW-6428) - [CI][Crossbow] Nightly turbodbc job fails
+* [ARROW-6430](https://issues.apache.org/jira/browse/ARROW-6430) - [CI][Crossbow] Nightly R docker job fails
+* [ARROW-6431](https://issues.apache.org/jira/browse/ARROW-6431) - [Python] Test suite fails without pandas installed
+* [ARROW-6432](https://issues.apache.org/jira/browse/ARROW-6432) - [CI][Crossbow] Remove alpine crossbow jobs
+* [ARROW-6433](https://issues.apache.org/jira/browse/ARROW-6433) - [CI][Crossbow] Nightly java docker job fails
+* [ARROW-6434](https://issues.apache.org/jira/browse/ARROW-6434) - [CI][Crossbow] Nightly HDFS integration job fails
+* [ARROW-6435](https://issues.apache.org/jira/browse/ARROW-6435) - [CI][Crossbow] Nightly dask integration job fails
+* [ARROW-6440](https://issues.apache.org/jira/browse/ARROW-6440) - [CI][Crossbow] Nightly Ubuntu, Debian, and CentOS package builds fail
+* [ARROW-6441](https://issues.apache.org/jira/browse/ARROW-6441) - [CI][Crossbow] Nightly CentOS 6 job fails
+* [ARROW-6442](https://issues.apache.org/jira/browse/ARROW-6442) - [CI][Crossbow] Nightly gandiva jar osx build fails
+* [ARROW-6443](https://issues.apache.org/jira/browse/ARROW-6443) - [CI][Crossbow] Nightly conda osx builds fail
+* [ARROW-6444](https://issues.apache.org/jira/browse/ARROW-6444) - [CI][Crossbow] Nightly conda Windows builds fail (time out)
+* [ARROW-6446](https://issues.apache.org/jira/browse/ARROW-6446) - [OSX][Python][Wheel] Turn off ORC feature in the wheel building scripts
+* [ARROW-6449](https://issues.apache.org/jira/browse/ARROW-6449) - [R] io "tell()" methods are inconsistently named and untested
+* [ARROW-6457](https://issues.apache.org/jira/browse/ARROW-6457) - [C++] CMake build locally fails with MSVC 2015 build generator
+* [ARROW-6461](https://issues.apache.org/jira/browse/ARROW-6461) - [Java] EchoServer can close socket before client has finished reading
+* [ARROW-6472](https://issues.apache.org/jira/browse/ARROW-6472) - [Java] ValueVector\#accept may have a potential cast exception
+* [ARROW-6476](https://issues.apache.org/jira/browse/ARROW-6476) - [Java][CI] Travis java all-jdks job is broken
+* [ARROW-6478](https://issues.apache.org/jira/browse/ARROW-6478) - [C++] Roll back to jemalloc stable-4 branch until performance issues in 5.2.x addressed
+* [ARROW-6481](https://issues.apache.org/jira/browse/ARROW-6481) - [Python][C++] Bad performance of read\_csv() with column\_types
+* [ARROW-6488](https://issues.apache.org/jira/browse/ARROW-6488) - [Python] pyarrow.NULL equals itself
+* [ARROW-6492](https://issues.apache.org/jira/browse/ARROW-6492) - [Python] File written with latest fastparquet cannot be read with latest pyarrow
+* [ARROW-6502](https://issues.apache.org/jira/browse/ARROW-6502) - [GLib][CI] MinGW failure in CI
+* [ARROW-6506](https://issues.apache.org/jira/browse/ARROW-6506) - [C++] Validation of ExtensionType with nested type fails
+* [ARROW-6509](https://issues.apache.org/jira/browse/ARROW-6509) - [C++][Gandiva] Re-enable Gandiva JNI tests and fix Travis CI failure
+* [ARROW-6520](https://issues.apache.org/jira/browse/ARROW-6520) - [Python] Segmentation fault on writing tables with fixed size binary fields
+* [ARROW-6522](https://issues.apache.org/jira/browse/ARROW-6522) - [Python] Test suite fails with pandas 0.23.4, pytest 3.8.1
+* [ARROW-6530](https://issues.apache.org/jira/browse/ARROW-6530) - [CI][Crossbow][R] Nightly R job doesn't install all dependencies
+* [ARROW-6550](https://issues.apache.org/jira/browse/ARROW-6550) - [C++] Filter expressions PR failing manylinux package builds
+* [ARROW-6551](https://issues.apache.org/jira/browse/ARROW-6551) - [Python] Dask Parquet integration test failure
+* [ARROW-6552](https://issues.apache.org/jira/browse/ARROW-6552) - [C++] boost::optional in STL test fails compiling in gcc 4.8.2
+* [ARROW-6560](https://issues.apache.org/jira/browse/ARROW-6560) - [Python] Failures in \*-nopandas integration tests
+* [ARROW-6561](https://issues.apache.org/jira/browse/ARROW-6561) - [Python] pandas-master integration test failure
+* [ARROW-6562](https://issues.apache.org/jira/browse/ARROW-6562) - [GLib] Fix wrong sliced data of GArrowBuffer
+* [ARROW-6564](https://issues.apache.org/jira/browse/ARROW-6564) - [Python] Do not require pandas for invoking Array.\_\_array\_\_
+* [ARROW-6565](https://issues.apache.org/jira/browse/ARROW-6565) - [Rust] [DataFusion] Intermittent test failure due to temp dir already existing
+* [ARROW-6568](https://issues.apache.org/jira/browse/ARROW-6568) - [C++][Python][Parquet] pyarrow.parquet crash writing zero-chunk dictionary-type column
+* [ARROW-6572](https://issues.apache.org/jira/browse/ARROW-6572) - [C++] Reading some Parquet data can return uninitialized memory
+* [ARROW-6573](https://issues.apache.org/jira/browse/ARROW-6573) - [Python] Segfault when writing to parquet
+* [ARROW-6576](https://issues.apache.org/jira/browse/ARROW-6576) - [R] Fix sparklyr integration tests
+* [ARROW-6586](https://issues.apache.org/jira/browse/ARROW-6586) - [Python][Packaging] Windows wheel builds failing with "DLL load failure"
+* [ARROW-6597](https://issues.apache.org/jira/browse/ARROW-6597) - [Python] Segfault in test\_pandas with Python 2.7
+* [ARROW-6618](https://issues.apache.org/jira/browse/ARROW-6618) - [Python] Reading a zero-size buffer can segfault
+* [ARROW-6620](https://issues.apache.org/jira/browse/ARROW-6620) - [Python][CI] pandas-master build failing due to removal of "to\_sparse" method
+* [ARROW-6622](https://issues.apache.org/jira/browse/ARROW-6622) - [C++][R] SubTreeFileSystem path error on Windows
+* [ARROW-6623](https://issues.apache.org/jira/browse/ARROW-6623) - [CI][Python] Dask docker integration test broken perhaps by statistics-related change
+* [ARROW-6639](https://issues.apache.org/jira/browse/ARROW-6639) - [Packaging][RPM] Add support for CentOS 7 on aarch64
+* [ARROW-6640](https://issues.apache.org/jira/browse/ARROW-6640) - [C++] Error when BufferedInputStream Peek more than bytes buffered
+* [ARROW-6641](https://issues.apache.org/jira/browse/ARROW-6641) - [C++] Remove Deprecated WriteableFile warning
+* [ARROW-6642](https://issues.apache.org/jira/browse/ARROW-6642) - [Python] chained access of ParquetDataset's metadata segfaults
+* [ARROW-6651](https://issues.apache.org/jira/browse/ARROW-6651) - [R] Fix R conda job
+* [ARROW-6652](https://issues.apache.org/jira/browse/ARROW-6652) - [Python] to\_pandas conversion removes timezone from type
+* [ARROW-6660](https://issues.apache.org/jira/browse/ARROW-6660) - [Rust] [DataFusion] Minor docs update for 0.15.0 release
+* [ARROW-6670](https://issues.apache.org/jira/browse/ARROW-6670) - [CI][R] Fix the fix for R nightly jobs
+* [ARROW-6674](https://issues.apache.org/jira/browse/ARROW-6674) - [Python] Fix or ignore the test warnings
+* [ARROW-6677](https://issues.apache.org/jira/browse/ARROW-6677) - [FlightRPC][C++] Document using Flight in C++
+* [ARROW-6678](https://issues.apache.org/jira/browse/ARROW-6678) - [C++] Regression in Parquet file compatibility introduced by ARROW-3246
+* [ARROW-6679](https://issues.apache.org/jira/browse/ARROW-6679) - [RELEASE] autobrew license in LICENSE.txt is not acceptable
+* [ARROW-6682](https://issues.apache.org/jira/browse/ARROW-6682) - [C\#] Arrow R/C++ hangs reading binary file generated by C\#
+* [ARROW-6687](https://issues.apache.org/jira/browse/ARROW-6687) - [Rust] [DataFusion] Query returns incorrect row count
+* [ARROW-6701](https://issues.apache.org/jira/browse/ARROW-6701) - [C++][R] Lint failing on R cpp code
+* [ARROW-6703](https://issues.apache.org/jira/browse/ARROW-6703) - [Packaging][Linux] Restore ARROW\_VERSION environment variable
+* [ARROW-6705](https://issues.apache.org/jira/browse/ARROW-6705) - [Rust] [DataFusion] README has invalid github URL
+* [ARROW-6709](https://issues.apache.org/jira/browse/ARROW-6709) - [Java] JDBC adapter currentIndex should increment when value is null
+* [ARROW-6714](https://issues.apache.org/jira/browse/ARROW-6714) - [R] Fix untested RecordBatchWriter case
+* [ARROW-6716](https://issues.apache.org/jira/browse/ARROW-6716) - [CI] [Rust] New 1.40.0 nightly causing builds to fail
+* [ARROW-6748](https://issues.apache.org/jira/browse/ARROW-6748) - [Ruby] gem compilation error
+* [ARROW-6751](https://issues.apache.org/jira/browse/ARROW-6751) - [CI] ccache doesn't cache on Travis-CI
+* [ARROW-6760](https://issues.apache.org/jira/browse/ARROW-6760) - [C++] JSON: improve error message when column changed type
+* [ARROW-6773](https://issues.apache.org/jira/browse/ARROW-6773) - [C++] Filter kernel returns invalid data when filtering with an Array slice
+* [ARROW-6796](https://issues.apache.org/jira/browse/ARROW-6796) - Certain moderately-sized (\~100MB) default-Snappy-compressed Parquet files take enormous memory and a long time to load with pyarrow.parquet.read\_table
+* [ARROW-7112](https://issues.apache.org/jira/browse/ARROW-7112) - Wrong contents when initializing a pyarrow.Table from a boolean DataFrame
+* [PARQUET-1623](https://issues.apache.org/jira/browse/PARQUET-1623) - [C++] Invalid memory access with a magic number of records
+* [PARQUET-1631](https://issues.apache.org/jira/browse/PARQUET-1631) - [C++] ParquetInputWrapper::GetSize always returns 0
+* [PARQUET-1640](https://issues.apache.org/jira/browse/PARQUET-1640) - [C++] parquet-encoding-benchmark crashes
+
+
+
+# Apache Arrow 0.14.1 (2019-07-22)
+
+## Bug Fixes
+
+* [ARROW-5775](https://issues.apache.org/jira/browse/ARROW-5775) - [C++] StructArray : cached boxed fields not thread-safe
+* [ARROW-5790](https://issues.apache.org/jira/browse/ARROW-5790) - [Python] Passing zero-dim numpy array to pa.array causes segfault
+* [ARROW-5791](https://issues.apache.org/jira/browse/ARROW-5791) - [Python] pyarrow.csv.read\_csv hangs + eats all RAM
+* [ARROW-5816](https://issues.apache.org/jira/browse/ARROW-5816) - [Release] Parallel curl does not work reliably in verify-release-candidate.sh
+* [ARROW-5836](https://issues.apache.org/jira/browse/ARROW-5836) - [Java][OSX] Flight tests are failing: address already in use
+* [ARROW-5838](https://issues.apache.org/jira/browse/ARROW-5838) - [C++][Flight][OSX] Building 3rdparty grpc cannot find OpenSSL
+* [ARROW-5849](https://issues.apache.org/jira/browse/ARROW-5849) - [C++] Compiler warnings on mingw-w64
+* [ARROW-5850](https://issues.apache.org/jira/browse/ARROW-5850) - [CI][R] R appveyor job is broken after release
+* [ARROW-5851](https://issues.apache.org/jira/browse/ARROW-5851) - [C++] Compilation of reference benchmarks fails
+* [ARROW-5856](https://issues.apache.org/jira/browse/ARROW-5856) - [Python] linking 3rd party cython modules against pyarrow fails since 0.14.0
+* [ARROW-5863](https://issues.apache.org/jira/browse/ARROW-5863) - [Python] Segmentation Fault via pytest-runner
+* [ARROW-5868](https://issues.apache.org/jira/browse/ARROW-5868) - [Python] manylinux2010 wheels have shared library dependency on liblz4
+* [ARROW-5873](https://issues.apache.org/jira/browse/ARROW-5873) - [Python] Segmentation fault when comparing schema with None
+* [ARROW-5874](https://issues.apache.org/jira/browse/ARROW-5874) - [Python] pyarrow 0.14.0 macOS wheels depend on shared libs under /usr/local/opt
+* [ARROW-5878](https://issues.apache.org/jira/browse/ARROW-5878) - [Python][C++] Parquet reader not forward compatible for timestamps without timezone
+* [ARROW-5886](https://issues.apache.org/jira/browse/ARROW-5886) - [Python][Packaging] Manylinux1/2010 compliance issue with libz
+* [ARROW-5887](https://issues.apache.org/jira/browse/ARROW-5887) - [C\#] ArrowStreamWriter writes FieldNodes in wrong order
+* [ARROW-5889](https://issues.apache.org/jira/browse/ARROW-5889) - [Python][C++] Parquet backwards compat for timestamps without timezone broken
+* [ARROW-5899](https://issues.apache.org/jira/browse/ARROW-5899) - [Python][Packaging] Bundle uriparser.dll in windows wheels
+* [ARROW-5921](https://issues.apache.org/jira/browse/ARROW-5921) - [C++][Fuzzing] Missing nullptr checks in IPC
+* [PARQUET-1623](https://issues.apache.org/jira/browse/PARQUET-1623) - [C++] Invalid memory access with a magic number of records
+
+
+## New Features and Improvements
+
+* [ARROW-5101](https://issues.apache.org/jira/browse/ARROW-5101) - [Packaging] Avoid bundling static libraries in Windows conda packages
+* [ARROW-5380](https://issues.apache.org/jira/browse/ARROW-5380) - [C++] Fix and enable UBSan for unaligned accesses.
+* [ARROW-5564](https://issues.apache.org/jira/browse/ARROW-5564) - [C++] Add uriparser to conda-forge
+* [ARROW-5609](https://issues.apache.org/jira/browse/ARROW-5609) - [C++] Set CMP0068 CMake policy to avoid macOS warnings
+* [ARROW-5784](https://issues.apache.org/jira/browse/ARROW-5784) - [Release][GLib] Replace c\_glib/ after running c\_glib/autogen.sh in dev/release/02-source.sh
+* [ARROW-5785](https://issues.apache.org/jira/browse/ARROW-5785) - [Rust] Rust datafusion implementation should not depend on rustyline
+* [ARROW-5787](https://issues.apache.org/jira/browse/ARROW-5787) - [Release][Rust] Use local modules to verify RC
+* [ARROW-5793](https://issues.apache.org/jira/browse/ARROW-5793) - [Release] Avoid duplicate known host SSH error in dev/release/03-binary.sh
+* [ARROW-5794](https://issues.apache.org/jira/browse/ARROW-5794) - [Release] Skip uploading already uploaded binaries
+* [ARROW-5795](https://issues.apache.org/jira/browse/ARROW-5795) - [Release] Add missing waits on uploading binaries
+* [ARROW-5796](https://issues.apache.org/jira/browse/ARROW-5796) - [Release][APT] Update expected package list
+* [ARROW-5797](https://issues.apache.org/jira/browse/ARROW-5797) - [Release][APT] Update supported distributions
+* [ARROW-5820](https://issues.apache.org/jira/browse/ARROW-5820) - [Release] Remove undefined variable check from verify script
+* [ARROW-5827](https://issues.apache.org/jira/browse/ARROW-5827) - [C++] Require c-ares CMake config
+* [ARROW-5828](https://issues.apache.org/jira/browse/ARROW-5828) - [C++] Add Protocol Buffers version check
+* [ARROW-5866](https://issues.apache.org/jira/browse/ARROW-5866) - [C++] Remove duplicate library in cpp/Brewfile
+* [ARROW-5877](https://issues.apache.org/jira/browse/ARROW-5877) - [FlightRPC] Fix auth incompatibilities between Python/Java
+* [ARROW-5904](https://issues.apache.org/jira/browse/ARROW-5904) - [Java] [Plasma] Fix compilation of Plasma Java client
+* [ARROW-5908](https://issues.apache.org/jira/browse/ARROW-5908) - [C\#] ArrowStreamWriter doesn't align buffers to 8 bytes
+* [ARROW-5934](https://issues.apache.org/jira/browse/ARROW-5934) - [Python] Bundle arrow's LICENSE with the wheels
+* [ARROW-5937](https://issues.apache.org/jira/browse/ARROW-5937) - [Release] Stop parallel binary upload
+* [ARROW-5938](https://issues.apache.org/jira/browse/ARROW-5938) - [Release] Create branch for adding release note automatically
+* [ARROW-5939](https://issues.apache.org/jira/browse/ARROW-5939) - [Release] Add support for generating vote email template separately
+* [ARROW-5940](https://issues.apache.org/jira/browse/ARROW-5940) - [Release] Add support for re-uploading sign/checksum for binary artifacts
+* [ARROW-5941](https://issues.apache.org/jira/browse/ARROW-5941) - [Release] Avoid re-uploading already uploaded binary artifacts
+* [ARROW-5958](https://issues.apache.org/jira/browse/ARROW-5958) - [Python] Link zlib statically in the wheels
+
+
+
+# Apache Arrow 0.14.0 (2019-07-04)
+
+## New Features and Improvements
+
+* [ARROW-258](https://issues.apache.org/jira/browse/ARROW-258) - [Format] clarify definition of Buffer in context of RPC, IPC, File
+* [ARROW-653](https://issues.apache.org/jira/browse/ARROW-653) - [Python / C++] Add debugging function to print an array's buffer contents in hexadecimal
+* [ARROW-767](https://issues.apache.org/jira/browse/ARROW-767) - [C++] Adopt FileSystem abstraction
+* [ARROW-835](https://issues.apache.org/jira/browse/ARROW-835) - [Format] Add Timedelta type to describe time intervals
+* [ARROW-840](https://issues.apache.org/jira/browse/ARROW-840) - [Python] Provide Python API for creating user-defined data types that can survive Arrow IPC
+* [ARROW-973](https://issues.apache.org/jira/browse/ARROW-973) - [Website] Add FAQ page about project
+* [ARROW-1012](https://issues.apache.org/jira/browse/ARROW-1012) - [C++] Create a configurable implementation of RecordBatchReader that reads from Apache Parquet files
+* [ARROW-1207](https://issues.apache.org/jira/browse/ARROW-1207) - [C++] Implement Map logical type
+* [ARROW-1261](https://issues.apache.org/jira/browse/ARROW-1261) - [Java] Add container type for Map logical type
+* [ARROW-1278](https://issues.apache.org/jira/browse/ARROW-1278) - Integration tests for Fixed Size List type
+* [ARROW-1279](https://issues.apache.org/jira/browse/ARROW-1279) - [Integration][Java] Integration tests for Map type
+* [ARROW-1280](https://issues.apache.org/jira/browse/ARROW-1280) - [C++] Implement Fixed Size List type
+* [ARROW-1349](https://issues.apache.org/jira/browse/ARROW-1349) - [Packaging] Provide APT and Yum repositories
+* [ARROW-1496](https://issues.apache.org/jira/browse/ARROW-1496) - [JS] Upload coverage data to codecov.io
+* [ARROW-1558](https://issues.apache.org/jira/browse/ARROW-1558) - [C++] Implement boolean selection kernels
+* [ARROW-1587](https://issues.apache.org/jira/browse/ARROW-1587) - [Format] Add metadata for user-defined logical types
+* [ARROW-1774](https://issues.apache.org/jira/browse/ARROW-1774) - [C++] Add "view" function to create zero-copy views for compatible types, if supported
+* [ARROW-1833](https://issues.apache.org/jira/browse/ARROW-1833) - [Java] Add accessor methods for data buffers that skip null checking
+* [ARROW-1957](https://issues.apache.org/jira/browse/ARROW-1957) - [Python] Write nanosecond timestamps using new NANO LogicalType Parquet unit
+* [ARROW-1983](https://issues.apache.org/jira/browse/ARROW-1983) - [Python] Add ability to write parquet \`\_metadata\` file
+* [ARROW-2057](https://issues.apache.org/jira/browse/ARROW-2057) - [Python] Configure size of data pages in pyarrow.parquet.write\_table
+* [ARROW-2102](https://issues.apache.org/jira/browse/ARROW-2102) - [C++] Implement take kernel functions - primitive value type
+* [ARROW-2103](https://issues.apache.org/jira/browse/ARROW-2103) - [C++] Implement take kernel functions - string/binary value type
+* [ARROW-2104](https://issues.apache.org/jira/browse/ARROW-2104) - [C++] Implement take kernel functions - nested array value type
+* [ARROW-2105](https://issues.apache.org/jira/browse/ARROW-2105) - [C++] Implement take kernel functions - properly handle special indices
+* [ARROW-2186](https://issues.apache.org/jira/browse/ARROW-2186) - [C++] Clean up architecture specific compiler flags
+* [ARROW-2217](https://issues.apache.org/jira/browse/ARROW-2217) - [C++] Add option to use dynamic linking for compression library dependencies
+* [ARROW-2298](https://issues.apache.org/jira/browse/ARROW-2298) - [Python] Add option to not consider NaN to be null when converting to an integer Arrow type
+* [ARROW-2412](https://issues.apache.org/jira/browse/ARROW-2412) - [Integration] Add nested dictionary integration test
+* [ARROW-2467](https://issues.apache.org/jira/browse/ARROW-2467) - [Rust] Generate code using Flatbuffers
+* [ARROW-2517](https://issues.apache.org/jira/browse/ARROW-2517) - [Java] Add list<decimal\> writer
+* [ARROW-2618](https://issues.apache.org/jira/browse/ARROW-2618) - [Rust] Bitmap constructor should accept a flag for default state (0 or 1)
+* [ARROW-2667](https://issues.apache.org/jira/browse/ARROW-2667) - [C++/Python] Add pandas-like take method to Array
+* [ARROW-2707](https://issues.apache.org/jira/browse/ARROW-2707) - [C++] Implement Table::Slice methods using Column::Slice
+* [ARROW-2709](https://issues.apache.org/jira/browse/ARROW-2709) - [Python] write\_to\_dataset poor performance when splitting
+* [ARROW-2730](https://issues.apache.org/jira/browse/ARROW-2730) - [C++] Set up CMAKE\_C\_FLAGS more thoughtfully instead of using CMAKE\_CXX\_FLAGS
+* [ARROW-2796](https://issues.apache.org/jira/browse/ARROW-2796) - [C++] Simplify symbols.map file, use when building libarrow\_python
+* [ARROW-2818](https://issues.apache.org/jira/browse/ARROW-2818) - [Python] Better error message when passing SparseDataFrame into Table.from\_pandas
+* [ARROW-2835](https://issues.apache.org/jira/browse/ARROW-2835) - [C++] ReadAt/WriteAt are inconsistent with moving the files position
+* [ARROW-2969](https://issues.apache.org/jira/browse/ARROW-2969) - [R] Convert between StructArray and "nested" data.frame column containing data frame in each cell
+* [ARROW-2981](https://issues.apache.org/jira/browse/ARROW-2981) - [C++] Support scripts / documentation for running clang-tidy on codebase
+* [ARROW-2984](https://issues.apache.org/jira/browse/ARROW-2984) - [JS] Refactor release verification script to share code with main source release verification script
+* [ARROW-3040](https://issues.apache.org/jira/browse/ARROW-3040) - [Go] add support for comparing Arrays
+* [ARROW-3041](https://issues.apache.org/jira/browse/ARROW-3041) - [Go] add support for TimeArray
+* [ARROW-3052](https://issues.apache.org/jira/browse/ARROW-3052) - [C++] Detect ORC system packages
+* [ARROW-3087](https://issues.apache.org/jira/browse/ARROW-3087) - [C++] Add kernels for comparison operations to scalars
+* [ARROW-3144](https://issues.apache.org/jira/browse/ARROW-3144) - [C++] Move "dictionary" member from DictionaryType to ArrayData to allow for changing dictionaries between Array chunks
+* [ARROW-3150](https://issues.apache.org/jira/browse/ARROW-3150) - [Python] Ship Flight-enabled Python wheels on Linux and Windows
+* [ARROW-3166](https://issues.apache.org/jira/browse/ARROW-3166) - [C++] Consolidate IO interfaces used in arrow/io and parquet-cpp
+* [ARROW-3191](https://issues.apache.org/jira/browse/ARROW-3191) - [Java] Add support for ArrowBuf to point to arbitrary memory.
+* [ARROW-3200](https://issues.apache.org/jira/browse/ARROW-3200) - [C++] Add support for reading Flight streams with dictionaries
+* [ARROW-3290](https://issues.apache.org/jira/browse/ARROW-3290) - [C++] Toolchain support for secure gRPC
+* [ARROW-3294](https://issues.apache.org/jira/browse/ARROW-3294) - [C++] Test Flight RPC on Windows / Appveyor
+* [ARROW-3314](https://issues.apache.org/jira/browse/ARROW-3314) - [R] Set -rpath using pkg-config when building
+* [ARROW-3330](https://issues.apache.org/jira/browse/ARROW-3330) - [C++] Spawn multiple Flight performance servers in flight-benchmark to test parallel get performance
+* [ARROW-3419](https://issues.apache.org/jira/browse/ARROW-3419) - [C++] Run include-what-you-use checks as nightly build
+* [ARROW-3459](https://issues.apache.org/jira/browse/ARROW-3459) - [C++][Gandiva] Add support for variable length output vectors
+* [ARROW-3475](https://issues.apache.org/jira/browse/ARROW-3475) - [C++] Int64Builder.Finish(NumericArray<Int64Type\>)
+* [ARROW-3570](https://issues.apache.org/jira/browse/ARROW-3570) - [Packaging] Don't bundle test data files with python wheels
+* [ARROW-3572](https://issues.apache.org/jira/browse/ARROW-3572) - [Packaging] Correctly handle ssh origin urls for crossbow
+* [ARROW-3671](https://issues.apache.org/jira/browse/ARROW-3671) - [Go] implement Interval array
+* [ARROW-3676](https://issues.apache.org/jira/browse/ARROW-3676) - [Go] implement Decimal128 array
+* [ARROW-3679](https://issues.apache.org/jira/browse/ARROW-3679) - [Go] implement IPC protocol
+* [ARROW-3680](https://issues.apache.org/jira/browse/ARROW-3680) - [Go] implement Float16 array
+* [ARROW-3686](https://issues.apache.org/jira/browse/ARROW-3686) - [Python] Support for masked arrays in to/from numpy
+* [ARROW-3702](https://issues.apache.org/jira/browse/ARROW-3702) - [R] POSIXct mapped to DateType not TimestampType?
+* [ARROW-3714](https://issues.apache.org/jira/browse/ARROW-3714) - [CI] Run RAT checks in pre-commit hooks
+* [ARROW-3729](https://issues.apache.org/jira/browse/ARROW-3729) - [C++] Support for writing TIMESTAMP\_NANOS Parquet metadata
+* [ARROW-3732](https://issues.apache.org/jira/browse/ARROW-3732) - [R] Add functions to write RecordBatch or Schema to Message value, then read back
+* [ARROW-3758](https://issues.apache.org/jira/browse/ARROW-3758) - [R] Build R library on Windows, document build instructions for Windows developers
+* [ARROW-3759](https://issues.apache.org/jira/browse/ARROW-3759) - [R][CI] Build and test on Windows in Appveyor
+* [ARROW-3767](https://issues.apache.org/jira/browse/ARROW-3767) - [C++] Add cast for Null to any type
+* [ARROW-3780](https://issues.apache.org/jira/browse/ARROW-3780) - [R] Failed to fetch data: invalid data when collecting int16
+* [ARROW-3791](https://issues.apache.org/jira/browse/ARROW-3791) - [C++] Add type inference for boolean values in CSV files
+* [ARROW-3794](https://issues.apache.org/jira/browse/ARROW-3794) - [R] Consider mapping INT8 to integer() not raw()
+* [ARROW-3804](https://issues.apache.org/jira/browse/ARROW-3804) - [R] Consider lowering required R runtime
+* [ARROW-3810](https://issues.apache.org/jira/browse/ARROW-3810) - [R] type= argument for Array and ChunkedArray
+* [ARROW-3811](https://issues.apache.org/jira/browse/ARROW-3811) - [R] struct arrays inference
+* [ARROW-3814](https://issues.apache.org/jira/browse/ARROW-3814) - [R] RecordBatch$from\_arrays()
+* [ARROW-3815](https://issues.apache.org/jira/browse/ARROW-3815) - [R] refine record batch factory
+* [ARROW-3848](https://issues.apache.org/jira/browse/ARROW-3848) - [R] allow nbytes to be missing in RandomAccessFile$Read()
+* [ARROW-3897](https://issues.apache.org/jira/browse/ARROW-3897) - [MATLAB] Add MATLAB support for writing numeric datatypes to a Feather file
+* [ARROW-3904](https://issues.apache.org/jira/browse/ARROW-3904) - [C++/Python] Validate scale and precision of decimal128 type
+* [ARROW-4013](https://issues.apache.org/jira/browse/ARROW-4013) - [Documentation][C++] Document how to build Apache Arrow on MSYS2
+* [ARROW-4020](https://issues.apache.org/jira/browse/ARROW-4020) - [Release] Remove source artifacts from dev dist system after release vote passes
+* [ARROW-4047](https://issues.apache.org/jira/browse/ARROW-4047) - [Python] Document use of int96 timestamps and options in Parquet docs
+* [ARROW-4086](https://issues.apache.org/jira/browse/ARROW-4086) - [Java] Add apis to debug alloc failures
+* [ARROW-4121](https://issues.apache.org/jira/browse/ARROW-4121) - [C++] Refactor memory allocation from InvertKernel
+* [ARROW-4159](https://issues.apache.org/jira/browse/ARROW-4159) - [C++] Check for -Wdocumentation issues
+* [ARROW-4194](https://issues.apache.org/jira/browse/ARROW-4194) - [Format] Metadata.rst does not specify timezone for Timestamp type
+* [ARROW-4302](https://issues.apache.org/jira/browse/ARROW-4302) - [C++] Add OpenSSL to C++ build toolchain
+* [ARROW-4337](https://issues.apache.org/jira/browse/ARROW-4337) - [C\#] Array / RecordBatch Builder Fluent API
+* [ARROW-4343](https://issues.apache.org/jira/browse/ARROW-4343) - [C++] Add as complete as possible Ubuntu Trusty / 14.04 build to docker-compose setup
+* [ARROW-4356](https://issues.apache.org/jira/browse/ARROW-4356) - [CI] Add integration (docker) test for turbodbc
+* [ARROW-4369](https://issues.apache.org/jira/browse/ARROW-4369) - [Packaging] Release verification script should test linux packages via docker
+* [ARROW-4452](https://issues.apache.org/jira/browse/ARROW-4452) - [Python] Serializing sparse torch tensors
+* [ARROW-4453](https://issues.apache.org/jira/browse/ARROW-4453) - [Python] Create Cython wrappers for SparseTensor
+* [ARROW-4467](https://issues.apache.org/jira/browse/ARROW-4467) - [Rust] [DataFusion] Create a REPL & Dockerfile for DataFusion
+* [ARROW-4503](https://issues.apache.org/jira/browse/ARROW-4503) - [C\#] ArrowStreamReader allocates and copies data excessively
+* [ARROW-4504](https://issues.apache.org/jira/browse/ARROW-4504) - [C++] Reduce the number of unit test executables
+* [ARROW-4505](https://issues.apache.org/jira/browse/ARROW-4505) - [C++] Nicer PrettyPrint for date32
+* [ARROW-4566](https://issues.apache.org/jira/browse/ARROW-4566) - [C++][Flight] Add option to run arrow-flight-benchmark against a perf server running on a different host
+* [ARROW-4596](https://issues.apache.org/jira/browse/ARROW-4596) - [Rust] [DataFusion] Implement COUNT aggregate function
+* [ARROW-4622](https://issues.apache.org/jira/browse/ARROW-4622) - [C++] [Python] MakeDense and MakeSparse in UnionArray should accept a vector of Field
+* [ARROW-4625](https://issues.apache.org/jira/browse/ARROW-4625) - [Flight] Wrap server busy-wait methods
+* [ARROW-4626](https://issues.apache.org/jira/browse/ARROW-4626) - [Flight] Add application metadata field to DoGet
+* [ARROW-4627](https://issues.apache.org/jira/browse/ARROW-4627) - [Flight] Add application metadata field to DoPut
+* [ARROW-4701](https://issues.apache.org/jira/browse/ARROW-4701) - [C++] Add JSON chunker benchmarks
+* [ARROW-4702](https://issues.apache.org/jira/browse/ARROW-4702) - [C++] Upgrade dependency versions
+* [ARROW-4708](https://issues.apache.org/jira/browse/ARROW-4708) - [C++] Add multithreaded JSON reader
+* [ARROW-4714](https://issues.apache.org/jira/browse/ARROW-4714) - [C++][Java] Providing JNI interface to Read ORC file via Arrow C++
+* [ARROW-4717](https://issues.apache.org/jira/browse/ARROW-4717) - [C\#] Consider exposing ValueTask instead of Task
+* [ARROW-4719](https://issues.apache.org/jira/browse/ARROW-4719) - [C\#] Implement ChunkedArray, Column and Table in C\#
+* [ARROW-4741](https://issues.apache.org/jira/browse/ARROW-4741) - [Java] Add documentation to all classes and enable checkstyle for class javadocs
+* [ARROW-4787](https://issues.apache.org/jira/browse/ARROW-4787) - [C++] Include "null" values (perhaps with an option to toggle on/off) in hash kernel actions
+* [ARROW-4788](https://issues.apache.org/jira/browse/ARROW-4788) - [C++] Develop less verbose API for constructing StructArray
+* [ARROW-4800](https://issues.apache.org/jira/browse/ARROW-4800) - [C++] Create/port a StatusOr implementation to be able to return a status or a type
+* [ARROW-4805](https://issues.apache.org/jira/browse/ARROW-4805) - [Rust] Write temporal arrays to CSV
+* [ARROW-4806](https://issues.apache.org/jira/browse/ARROW-4806) - [Rust] Support casting temporal arrays in cast kernels
+* [ARROW-4824](https://issues.apache.org/jira/browse/ARROW-4824) - [Python] read\_csv should accept io.StringIO objects
+* [ARROW-4827](https://issues.apache.org/jira/browse/ARROW-4827) - [C++] Implement benchmark comparison between two git revisions
+* [ARROW-4847](https://issues.apache.org/jira/browse/ARROW-4847) - [Python] Add pyarrow.table factory function that dispatches to various ctors based on type of input
+* [ARROW-4904](https://issues.apache.org/jira/browse/ARROW-4904) - [C++] Move implementations in arrow/ipc/test-common.h into libarrow\_testing
+* [ARROW-4911](https://issues.apache.org/jira/browse/ARROW-4911) - [R] Support for building package for Windows
+* [ARROW-4912](https://issues.apache.org/jira/browse/ARROW-4912) - [C++, Python] Allow specifying column names to CSV reader
+* [ARROW-4913](https://issues.apache.org/jira/browse/ARROW-4913) - [Java][Memory] Limit number of ledgers and arrowbufs
+* [ARROW-4945](https://issues.apache.org/jira/browse/ARROW-4945) - [Flight] Enable Flight integration tests in Travis
+* [ARROW-4956](https://issues.apache.org/jira/browse/ARROW-4956) - [C\#] Allow ArrowBuffers to wrap external Memory in C\#
+* [ARROW-4959](https://issues.apache.org/jira/browse/ARROW-4959) - [Gandiva][Crossbow] Builds broken
+* [ARROW-4968](https://issues.apache.org/jira/browse/ARROW-4968) - [Rust] StructArray builder and From<\> methods should check that field types match schema
+* [ARROW-4971](https://issues.apache.org/jira/browse/ARROW-4971) - [Go] DataType equality
+* [ARROW-4972](https://issues.apache.org/jira/browse/ARROW-4972) - [Go] Array equality
+* [ARROW-4973](https://issues.apache.org/jira/browse/ARROW-4973) - [Go] Slice Array equality
+* [ARROW-4974](https://issues.apache.org/jira/browse/ARROW-4974) - [Go] Array approx equality
+* [ARROW-4990](https://issues.apache.org/jira/browse/ARROW-4990) - [C++] Kernel to compare array with array
+* [ARROW-4993](https://issues.apache.org/jira/browse/ARROW-4993) - [C++] Display summary at the end of CMake configuration
+* [ARROW-5000](https://issues.apache.org/jira/browse/ARROW-5000) - [Python] Fix deprecation warning from setup.py
+* [ARROW-5007](https://issues.apache.org/jira/browse/ARROW-5007) - [C++] Move DCHECK out of sse-utils
+* [ARROW-5020](https://issues.apache.org/jira/browse/ARROW-5020) - [C++][Gandiva] Split Gandiva-related conda packages for builds into separate .yml conda env file
+* [ARROW-5027](https://issues.apache.org/jira/browse/ARROW-5027) - [Python] Add JSON Reader
+* [ARROW-5037](https://issues.apache.org/jira/browse/ARROW-5037) - [Rust] [DataFusion] Refactor aggregate module
+* [ARROW-5038](https://issues.apache.org/jira/browse/ARROW-5038) - [Rust] [DataFusion] Implement AVG aggregate function
+* [ARROW-5039](https://issues.apache.org/jira/browse/ARROW-5039) - [Rust] [DataFusion] Fix bugs in CAST support
+* [ARROW-5040](https://issues.apache.org/jira/browse/ARROW-5040) - [C++] ArrayFromJSON can't parse Timestamp from strings
+* [ARROW-5045](https://issues.apache.org/jira/browse/ARROW-5045) - [Rust] Code coverage silently failing in CI
+* [ARROW-5053](https://issues.apache.org/jira/browse/ARROW-5053) - [Rust] [DataFusion] Use env var for location of arrow test data
+* [ARROW-5054](https://issues.apache.org/jira/browse/ARROW-5054) - [C++][Release] Test Flight in verify-release-candidate.sh
+* [ARROW-5056](https://issues.apache.org/jira/browse/ARROW-5056) - [Packaging] Adjust conda recipes to use ORC conda-forge package on unix systems
+* [ARROW-5061](https://issues.apache.org/jira/browse/ARROW-5061) - [Release] Improve 03-binary performance
+* [ARROW-5062](https://issues.apache.org/jira/browse/ARROW-5062) - [Java] Shade Java Guava dependency for Flight
+* [ARROW-5063](https://issues.apache.org/jira/browse/ARROW-5063) - [Java] FlightClient should not create a child allocator
+* [ARROW-5064](https://issues.apache.org/jira/browse/ARROW-5064) - [Release] Pass PKG\_CONFIG\_PATH to glib in the verification script
+* [ARROW-5066](https://issues.apache.org/jira/browse/ARROW-5066) - [Integration] Add flags to enable/disable implementations in integration/integration\_test.py
+* [ARROW-5071](https://issues.apache.org/jira/browse/ARROW-5071) - [Benchmarking] Perform a benchmark run with archery
+* [ARROW-5076](https://issues.apache.org/jira/browse/ARROW-5076) - [Packaging] Improve post binary upload performance
+* [ARROW-5077](https://issues.apache.org/jira/browse/ARROW-5077) - [Rust] Release process should change Cargo.toml to use release versions
+* [ARROW-5078](https://issues.apache.org/jira/browse/ARROW-5078) - [Documentation] Sphinx build fails with RemovedInSphinx30Warning
+* [ARROW-5079](https://issues.apache.org/jira/browse/ARROW-5079) - [Release] Add a script to release C\# package
+* [ARROW-5080](https://issues.apache.org/jira/browse/ARROW-5080) - [Release] Add a script to release Rust packages
+* [ARROW-5081](https://issues.apache.org/jira/browse/ARROW-5081) - [C++] Consistently use PATH\_SUFFIXES in CMake config
+* [ARROW-5083](https://issues.apache.org/jira/browse/ARROW-5083) - [Developer] In merge\_arrow\_pr.py script, allow user to set a released Fix Version
+* [ARROW-5088](https://issues.apache.org/jira/browse/ARROW-5088) - [C++] Do not set -Werror when using BUILD\_WARNING\_LEVEL=CHECKIN in release mode
+* [ARROW-5091](https://issues.apache.org/jira/browse/ARROW-5091) - [Flight] Rename FlightGetInfo message to FlightInfo
+* [ARROW-5093](https://issues.apache.org/jira/browse/ARROW-5093) - [Packaging] Add support for selective binary upload
+* [ARROW-5094](https://issues.apache.org/jira/browse/ARROW-5094) - [Packaging] Add APT/Yum verification scripts
+* [ARROW-5102](https://issues.apache.org/jira/browse/ARROW-5102) - [C++] Reduce header dependencies
+* [ARROW-5108](https://issues.apache.org/jira/browse/ARROW-5108) - [Go] implement reading primitive arrays from Arrow file
+* [ARROW-5109](https://issues.apache.org/jira/browse/ARROW-5109) - [Go] implement reading binary/string arrays from Arrow file
+* [ARROW-5110](https://issues.apache.org/jira/browse/ARROW-5110) - [Go] implement reading struct arrays from Arrow file
+* [ARROW-5111](https://issues.apache.org/jira/browse/ARROW-5111) - [Go] implement reading list arrays from Arrow file
+* [ARROW-5112](https://issues.apache.org/jira/browse/ARROW-5112) - [Go] implement writing arrays to Arrow file
+* [ARROW-5113](https://issues.apache.org/jira/browse/ARROW-5113) - [C++][Flight] Unit tests in C++ for DoPut
+* [ARROW-5115](https://issues.apache.org/jira/browse/ARROW-5115) - [JS] Implement the Vector Builders
+* [ARROW-5116](https://issues.apache.org/jira/browse/ARROW-5116) - [Rust] move kernel related files under compute/kernels
+* [ARROW-5124](https://issues.apache.org/jira/browse/ARROW-5124) - [C++] Add support for Parquet in MinGW build
+* [ARROW-5126](https://issues.apache.org/jira/browse/ARROW-5126) - [Rust] [Parquet] Convert parquet column desc to arrow data type
+* [ARROW-5127](https://issues.apache.org/jira/browse/ARROW-5127) - [Rust] [Parquet] Add page iterator
+* [ARROW-5136](https://issues.apache.org/jira/browse/ARROW-5136) - [Flight] Implement call options (timeouts)
+* [ARROW-5137](https://issues.apache.org/jira/browse/ARROW-5137) - [Flight] Implement authentication APIs
+* [ARROW-5145](https://issues.apache.org/jira/browse/ARROW-5145) - [C++] Release mode lacks convenience input validation
+* [ARROW-5150](https://issues.apache.org/jira/browse/ARROW-5150) - [Ruby] Add Arrow::Table\#raw\_records
+* [ARROW-5155](https://issues.apache.org/jira/browse/ARROW-5155) - [GLib][Ruby] Add support for building union arrays from data type
+* [ARROW-5157](https://issues.apache.org/jira/browse/ARROW-5157) - [Website] Add MATLAB to powered by Apache Arrow page
+* [ARROW-5162](https://issues.apache.org/jira/browse/ARROW-5162) - [Rust] [Parquet] Rename mod reader to arrow.
+* [ARROW-5163](https://issues.apache.org/jira/browse/ARROW-5163) - [Gandiva] Cast timestamp/date incorrectly evaluates year 0097 as 1997
+* [ARROW-5164](https://issues.apache.org/jira/browse/ARROW-5164) - [Gandiva] [C++] Introduce 32bit hash functions
+* [ARROW-5165](https://issues.apache.org/jira/browse/ARROW-5165) - [Python][Documentation] Build docs don't suggest assigning $ARROW\_BUILD\_TYPE
+* [ARROW-5168](https://issues.apache.org/jira/browse/ARROW-5168) - [GLib] Add garrow\_array\_take()
+* [ARROW-5171](https://issues.apache.org/jira/browse/ARROW-5171) - [C++] Use LESS instead of LOWER in compare enum option.
+* [ARROW-5172](https://issues.apache.org/jira/browse/ARROW-5172) - [Go] implement reading fixed-size binary arrays from Arrow file
+* [ARROW-5178](https://issues.apache.org/jira/browse/ARROW-5178) - [Python] Allow creating Table from Python dict
+* [ARROW-5179](https://issues.apache.org/jira/browse/ARROW-5179) - [Python] Return plain dicts, not OrderedDict, on Python 3.7+
+* [ARROW-5185](https://issues.apache.org/jira/browse/ARROW-5185) - [C++] Add support for Boost with CMake configuration file
+* [ARROW-5187](https://issues.apache.org/jira/browse/ARROW-5187) - [Rust] Ability to flatten StructArray into a RecordBatch
+* [ARROW-5188](https://issues.apache.org/jira/browse/ARROW-5188) - [Rust] Add temporal builders for StructArray
+* [ARROW-5189](https://issues.apache.org/jira/browse/ARROW-5189) - [Rust] [Parquet] Format individual fields within a parquet row
+* [ARROW-5190](https://issues.apache.org/jira/browse/ARROW-5190) - [R] Discussion: tibble dependency in R package
+* [ARROW-5191](https://issues.apache.org/jira/browse/ARROW-5191) - [Rust] Expose CSV and JSON reader schemas
+* [ARROW-5203](https://issues.apache.org/jira/browse/ARROW-5203) - [GLib] Add support for Compare filter
+* [ARROW-5204](https://issues.apache.org/jira/browse/ARROW-5204) - [C++] Improve BufferBuilder performance
+* [ARROW-5212](https://issues.apache.org/jira/browse/ARROW-5212) - [Go] Array BinaryBuilder in Go library has no access to resize the values buffer
+* [ARROW-5218](https://issues.apache.org/jira/browse/ARROW-5218) - [C++] Improve build when third-party library locations are specified
+* [ARROW-5219](https://issues.apache.org/jira/browse/ARROW-5219) - [C++] Build protobuf\_ep in parallel when using Ninja
+* [ARROW-5222](https://issues.apache.org/jira/browse/ARROW-5222) - [Python] Issues with installing pyarrow for development on MacOS
+* [ARROW-5225](https://issues.apache.org/jira/browse/ARROW-5225) - [Java] Improve performance of BaseValueVector\#getValidityBufferSizeFromCount
+* [ARROW-5226](https://issues.apache.org/jira/browse/ARROW-5226) - [Gandiva] support compare operators for decimal
+* [ARROW-5238](https://issues.apache.org/jira/browse/ARROW-5238) - [Python] Improve usability of pyarrow.dictionary function
+* [ARROW-5241](https://issues.apache.org/jira/browse/ARROW-5241) - [Python] Add option to disable writing statistics to parquet file
+* [ARROW-5250](https://issues.apache.org/jira/browse/ARROW-5250) - [Java] remove javadoc suppression on methods.
+* [ARROW-5252](https://issues.apache.org/jira/browse/ARROW-5252) - [C++] Change variant implementation
+* [ARROW-5256](https://issues.apache.org/jira/browse/ARROW-5256) - [Packaging][deb] Failed to build with LLVM 7.1.0
+* [ARROW-5257](https://issues.apache.org/jira/browse/ARROW-5257) - [Website] Update site to use "official" Apache Arrow logo, add clearly marked links to logo
+* [ARROW-5258](https://issues.apache.org/jira/browse/ARROW-5258) - [C++/Python] Expose file metadata of dataset pieces to caller
+* [ARROW-5261](https://issues.apache.org/jira/browse/ARROW-5261) - [C++] Finish implementation of scalar types for Duration and Interval
+* [ARROW-5262](https://issues.apache.org/jira/browse/ARROW-5262) - [Python] Fix typo
+* [ARROW-5264](https://issues.apache.org/jira/browse/ARROW-5264) - [Java] Allow enabling/disabling boundary checking by environmental variable
+* [ARROW-5266](https://issues.apache.org/jira/browse/ARROW-5266) - [Go] implement read/write IPC for Float16
+* [ARROW-5268](https://issues.apache.org/jira/browse/ARROW-5268) - [GLib] Add GArrowJSONReader
+* [ARROW-5269](https://issues.apache.org/jira/browse/ARROW-5269) - [C++] Whitelist benchmarks candidates for regression checks
+* [ARROW-5275](https://issues.apache.org/jira/browse/ARROW-5275) - [C++] Write generic filesystem tests
+* [ARROW-5281](https://issues.apache.org/jira/browse/ARROW-5281) - [Rust] [Parquet] Move DataPageBuilder to test\_common
+* [ARROW-5284](https://issues.apache.org/jira/browse/ARROW-5284) - [Rust] Replace libc with std::alloc for memory allocation
+* [ARROW-5286](https://issues.apache.org/jira/browse/ARROW-5286) - [Python] support Structs in Table.from\_pandas given a known schema
+* [ARROW-5288](https://issues.apache.org/jira/browse/ARROW-5288) - [Documentation] Enrich the contribution guidelines
+* [ARROW-5289](https://issues.apache.org/jira/browse/ARROW-5289) - [C++] Move arrow/util/concatenate.h to arrow/array/
+* [ARROW-5290](https://issues.apache.org/jira/browse/ARROW-5290) - [Java] Provide a flag to enable/disable null-checking in vectors' get methods
+* [ARROW-5291](https://issues.apache.org/jira/browse/ARROW-5291) - [Python] Add wrapper for "take" kernel on Array
+* [ARROW-5298](https://issues.apache.org/jira/browse/ARROW-5298) - [Rust] Add debug implementation for Buffer
+* [ARROW-5299](https://issues.apache.org/jira/browse/ARROW-5299) - [C++] ListArray comparison is incorrect
+* [ARROW-5309](https://issues.apache.org/jira/browse/ARROW-5309) - [Python] Add clarifications to Python "append" methods that return new objects
+* [ARROW-5311](https://issues.apache.org/jira/browse/ARROW-5311) - [C++] Return more specific invalid Status in Take kernel
+* [ARROW-5313](https://issues.apache.org/jira/browse/ARROW-5313) - [Format] Comments on Field table are a bit confusing
+* [ARROW-5317](https://issues.apache.org/jira/browse/ARROW-5317) - [Rust] [Parquet] impl IntoIterator for SerializedFileReader
+* [ARROW-5319](https://issues.apache.org/jira/browse/ARROW-5319) - [CI] Enable ccache with MinGW builds
+* [ARROW-5321](https://issues.apache.org/jira/browse/ARROW-5321) - [Gandiva][C++] add isnull and isnotnull for utf8 and binary types
+* [ARROW-5323](https://issues.apache.org/jira/browse/ARROW-5323) - [CI] Use compression with clcache
+* [ARROW-5328](https://issues.apache.org/jira/browse/ARROW-5328) - [R] Add shell scripts to do a full package rebuild and test locally
+* [ARROW-5329](https://issues.apache.org/jira/browse/ARROW-5329) - Add support for building MATLAB interface to Feather directly within MATLAB
+* [ARROW-5334](https://issues.apache.org/jira/browse/ARROW-5334) - [C++] Add "Type" to names of arrow::Integer, arrow::FloatingPoint classes for consistency
+* [ARROW-5335](https://issues.apache.org/jira/browse/ARROW-5335) - [Python] Raise on variable dictionaries when converting to pandas
+* [ARROW-5339](https://issues.apache.org/jira/browse/ARROW-5339) - [C++] Add jemalloc to thirdparty dependency download script
+* [ARROW-5341](https://issues.apache.org/jira/browse/ARROW-5341) - [C++] Add instructions about fixing and testing for -Wdocumentation clang warnings locally
+* [ARROW-5342](https://issues.apache.org/jira/browse/ARROW-5342) - [Format] Formalize extension type metadata in IPC protocol
+* [ARROW-5346](https://issues.apache.org/jira/browse/ARROW-5346) - [C++] Revert changes to qualify duration in vendored date code
+* [ARROW-5349](https://issues.apache.org/jira/browse/ARROW-5349) - [Python/C++] Provide a way to specify the file path in parquet ColumnChunkMetaData
+* [ARROW-5361](https://issues.apache.org/jira/browse/ARROW-5361) - [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144
+* [ARROW-5363](https://issues.apache.org/jira/browse/ARROW-5363) - [GLib] Fix coding styles
+* [ARROW-5364](https://issues.apache.org/jira/browse/ARROW-5364) - [C++] Use ASCII rather than UTF-8 in BuildUtils.cmake comment
+* [ARROW-5365](https://issues.apache.org/jira/browse/ARROW-5365) - [C++][CI] Add UBSan and ASAN into CI
+* [ARROW-5368](https://issues.apache.org/jira/browse/ARROW-5368) - [C++] Disable jemalloc by default with MinGW
+* [ARROW-5369](https://issues.apache.org/jira/browse/ARROW-5369) - [C++] Add support for glog on Windows
+* [ARROW-5370](https://issues.apache.org/jira/browse/ARROW-5370) - [C++] Detect system uriparser by default
+* [ARROW-5372](https://issues.apache.org/jira/browse/ARROW-5372) - [GLib] Add support for null/boolean values CSV read option
+* [ARROW-5378](https://issues.apache.org/jira/browse/ARROW-5378) - [C++] Add local FileSystem implementation
+* [ARROW-5384](https://issues.apache.org/jira/browse/ARROW-5384) - [Go] add FixedSizeList array
+* [ARROW-5389](https://issues.apache.org/jira/browse/ARROW-5389) - [C++] Add an internal temporary directory API
+* [ARROW-5392](https://issues.apache.org/jira/browse/ARROW-5392) - [C++][CI][MinGW] Disable static library build on AppVeyor
+* [ARROW-5393](https://issues.apache.org/jira/browse/ARROW-5393) - [R] Add tests and example for read\_parquet()
+* [ARROW-5395](https://issues.apache.org/jira/browse/ARROW-5395) - [C++] Utilize stream EOS in File format
+* [ARROW-5396](https://issues.apache.org/jira/browse/ARROW-5396) - [JS] Ensure reader and writer support files and streams with no RecordBatches
+* [ARROW-5401](https://issues.apache.org/jira/browse/ARROW-5401) - [CI] [C++] Print ccache statistics on Travis-CI
+* [ARROW-5404](https://issues.apache.org/jira/browse/ARROW-5404) - [C++] nonstd::string\_view conflicts with std::string\_view in C++17
+* [ARROW-5407](https://issues.apache.org/jira/browse/ARROW-5407) - [C++] Integration test Travis CI entry builds many unnecessary targets
+* [ARROW-5413](https://issues.apache.org/jira/browse/ARROW-5413) - [C++] CSV reader doesn't remove BOM
+* [ARROW-5415](https://issues.apache.org/jira/browse/ARROW-5415) - [Release] Release script should update R version everywhere
+* [ARROW-5416](https://issues.apache.org/jira/browse/ARROW-5416) - [Website] Add Homebrew to project installation page
+* [ARROW-5418](https://issues.apache.org/jira/browse/ARROW-5418) - [CI][R] Run code coverage and report to codecov.io
+* [ARROW-5420](https://issues.apache.org/jira/browse/ARROW-5420) - [Java] Implement or remove getCurrentSizeInBytes in VariableWidthVector
+* [ARROW-5427](https://issues.apache.org/jira/browse/ARROW-5427) - [Python] RangeIndex serialization change implications
+* [ARROW-5428](https://issues.apache.org/jira/browse/ARROW-5428) - [C++] Add option to set "read extent" in arrow::io::BufferedInputStream
+* [ARROW-5429](https://issues.apache.org/jira/browse/ARROW-5429) - [Java] Provide alternative buffer allocation policy
+* [ARROW-5432](https://issues.apache.org/jira/browse/ARROW-5432) - [Python] Add 'read\_at' method to pyarrow.NativeFile
+* [ARROW-5433](https://issues.apache.org/jira/browse/ARROW-5433) - [C++][Parquet] improve parquet-reader columns information
+* [ARROW-5434](https://issues.apache.org/jira/browse/ARROW-5434) - [Java] Introduce wrappers for backward compatibility for ArrowBuf changes in ARROW-3191
+* [ARROW-5436](https://issues.apache.org/jira/browse/ARROW-5436) - [Python] expose filters argument in parquet.read\_table
+* [ARROW-5438](https://issues.apache.org/jira/browse/ARROW-5438) - [JS] Utilize stream EOS in File format
+* [ARROW-5441](https://issues.apache.org/jira/browse/ARROW-5441) - [C++] Implement FindArrowFlight.cmake
+* [ARROW-5442](https://issues.apache.org/jira/browse/ARROW-5442) - [Website] Clarify what makes a release artifact "official"
+* [ARROW-5443](https://issues.apache.org/jira/browse/ARROW-5443) - [Gandiva][Crossbow] Turn parquet encryption off
+* [ARROW-5447](https://issues.apache.org/jira/browse/ARROW-5447) - [CI] [Ruby] CI is failed on AppVeyor
+* [ARROW-5449](https://issues.apache.org/jira/browse/ARROW-5449) - [C++] Local filesystem implementation: investigate Windows UNC paths
+* [ARROW-5451](https://issues.apache.org/jira/browse/ARROW-5451) - [C++][Gandiva] Add round functions for decimals
+* [ARROW-5452](https://issues.apache.org/jira/browse/ARROW-5452) - [R] Add documentation website (pkgdown)
+* [ARROW-5461](https://issues.apache.org/jira/browse/ARROW-5461) - [Java] Add micro-benchmarks for Float8Vector and allocators
+* [ARROW-5463](https://issues.apache.org/jira/browse/ARROW-5463) - [Rust] Implement AsRef for Buffer
+* [ARROW-5464](https://issues.apache.org/jira/browse/ARROW-5464) - [Archery] Bad --benchmark-filter default
+* [ARROW-5465](https://issues.apache.org/jira/browse/ARROW-5465) - [Crossbow] Support writing submitted job definition yaml to a file
+* [ARROW-5466](https://issues.apache.org/jira/browse/ARROW-5466) - [Java] Dockerize Java builds in Travis CI, run multiple JDKs in single entry
+* [ARROW-5467](https://issues.apache.org/jira/browse/ARROW-5467) - [Go] implement read/write IPC for Time32/Time64 arrays
+* [ARROW-5468](https://issues.apache.org/jira/browse/ARROW-5468) - [Go] implement read/write IPC for Timestamp arrays
+* [ARROW-5469](https://issues.apache.org/jira/browse/ARROW-5469) - [Go] implement read/write IPC for Date32/Date64 arrays
+* [ARROW-5470](https://issues.apache.org/jira/browse/ARROW-5470) - [CI] C++ local filesystem patch breaks Travis R job
+* [ARROW-5472](https://issues.apache.org/jira/browse/ARROW-5472) - [Development] Add warning to PR merge tool if no JIRA component is set
+* [ARROW-5474](https://issues.apache.org/jira/browse/ARROW-5474) - [C++] Document required Boost version
+* [ARROW-5475](https://issues.apache.org/jira/browse/ARROW-5475) - [Python] Add Python binding for arrow::Concatenate
+* [ARROW-5476](https://issues.apache.org/jira/browse/ARROW-5476) - [Java][Memory] Fix Netty ArrowBuf Slice
+* [ARROW-5477](https://issues.apache.org/jira/browse/ARROW-5477) - [C++] Check required RapidJSON version
+* [ARROW-5478](https://issues.apache.org/jira/browse/ARROW-5478) - [Packaging] Drop Ubuntu 14.04 support
+* [ARROW-5481](https://issues.apache.org/jira/browse/ARROW-5481) - [GLib] garrow\_seekable\_input\_stream\_peek() misses "error" parameter document
+* [ARROW-5485](https://issues.apache.org/jira/browse/ARROW-5485) - [Gandiva][Crossbow] OSX builds failing
+* [ARROW-5486](https://issues.apache.org/jira/browse/ARROW-5486) - [GLib] Add binding of gandiva::FunctionRegistry and related things
+* [ARROW-5488](https://issues.apache.org/jira/browse/ARROW-5488) - [R] Workaround when C++ lib not available
+* [ARROW-5490](https://issues.apache.org/jira/browse/ARROW-5490) - [C++] Remove ARROW\_BOOST\_HEADER\_ONLY
+* [ARROW-5491](https://issues.apache.org/jira/browse/ARROW-5491) - [C++] Remove unnecessary semicolons following MACRO definitions
+* [ARROW-5492](https://issues.apache.org/jira/browse/ARROW-5492) - [R] Add "col\_select" argument to read\_\* functions to read subset of columns
+* [ARROW-5495](https://issues.apache.org/jira/browse/ARROW-5495) - [C++] Use HTTPS consistently for downloading dependencies
+* [ARROW-5496](https://issues.apache.org/jira/browse/ARROW-5496) - [R][CI] Fix relative paths in R codecov.io reporting
+* [ARROW-5498](https://issues.apache.org/jira/browse/ARROW-5498) - [C++] Build failure with Flatbuffers 1.11.0 and MinGW
+* [ARROW-5499](https://issues.apache.org/jira/browse/ARROW-5499) - [R] Alternate bindings for when libarrow is not found
+* [ARROW-5500](https://issues.apache.org/jira/browse/ARROW-5500) - [R] read\_csv\_arrow() signature should match readr::read\_csv()
+* [ARROW-5503](https://issues.apache.org/jira/browse/ARROW-5503) - [R] add read\_json()
+* [ARROW-5504](https://issues.apache.org/jira/browse/ARROW-5504) - [R] move use\_threads argument to global option
+* [ARROW-5509](https://issues.apache.org/jira/browse/ARROW-5509) - [R] write\_parquet()
+* [ARROW-5511](https://issues.apache.org/jira/browse/ARROW-5511) - [Packaging] Enable Flight in Conda packages
+* [ARROW-5512](https://issues.apache.org/jira/browse/ARROW-5512) - [C++] Draft initial public APIs for Datasets project
+* [ARROW-5513](https://issues.apache.org/jira/browse/ARROW-5513) - [Java] Refactor method name for getstartOffset to use camel case
+* [ARROW-5516](https://issues.apache.org/jira/browse/ARROW-5516) - [Python] Development page for pyarrow is missing a dependency when using pip
+* [ARROW-5518](https://issues.apache.org/jira/browse/ARROW-5518) - [Java] Set VectorSchemaRoot rowCount to 0 on allocateNew and clear
+* [ARROW-5524](https://issues.apache.org/jira/browse/ARROW-5524) - [C++] Turn off PARQUET\_BUILD\_ENCRYPTION in CMake if OpenSSL not found
+* [ARROW-5526](https://issues.apache.org/jira/browse/ARROW-5526) - [Developer] Add more prominent notice to GitHub issue template to direct bug reports to JIRA
+* [ARROW-5529](https://issues.apache.org/jira/browse/ARROW-5529) - [Flight] Allow serving with multiple TLS certificates
+* [ARROW-5531](https://issues.apache.org/jira/browse/ARROW-5531) - [Python] Support binary, utf8, and nested types in Array.from\_buffers
+* [ARROW-5533](https://issues.apache.org/jira/browse/ARROW-5533) - [Plasma] Plasma client should be thread-safe
+* [ARROW-5534](https://issues.apache.org/jira/browse/ARROW-5534) - [GLib] Add garrow\_table\_concatenate()
+* [ARROW-5535](https://issues.apache.org/jira/browse/ARROW-5535) - [GLib] Add garrow\_table\_slice()
+* [ARROW-5537](https://issues.apache.org/jira/browse/ARROW-5537) - [JS] Support delta dictionaries in RecordBatchWriter and DictionaryBuilder
+* [ARROW-5538](https://issues.apache.org/jira/browse/ARROW-5538) - [C++] Restrict minimum OpenSSL version to 1.0.2
+* [ARROW-5541](https://issues.apache.org/jira/browse/ARROW-5541) - [R] casts from negative int32 to uint32 and uint64 are now safe
+* [ARROW-5544](https://issues.apache.org/jira/browse/ARROW-5544) - [Archery] should not return non-zero in \`benchmark diff\` sub command on regression
+* [ARROW-5545](https://issues.apache.org/jira/browse/ARROW-5545) - [C++][Docs] Clarify expectation of UTC values for timestamps with time zones in C++ API docs
+* [ARROW-5547](https://issues.apache.org/jira/browse/ARROW-5547) - [C++][FlightRPC] arrow-flight.pc isn't provided
+* [ARROW-5552](https://issues.apache.org/jira/browse/ARROW-5552) - [Go] make Schema and Field implement Stringer
+* [ARROW-5554](https://issues.apache.org/jira/browse/ARROW-5554) - Add a Python wrapper for arrow::Concatenate
+* [ARROW-5555](https://issues.apache.org/jira/browse/ARROW-5555) - [R] Add install\_arrow() function to assist the user in obtaining C++ runtime libraries
+* [ARROW-5556](https://issues.apache.org/jira/browse/ARROW-5556) - [Doc] Document JSON reader
+* [ARROW-5557](https://issues.apache.org/jira/browse/ARROW-5557) - [C++] Investigate performance of VisitBitsUnrolled on different platforms
+* [ARROW-5565](https://issues.apache.org/jira/browse/ARROW-5565) - [Python] Document how to use gdb when working on pyarrow
+* [ARROW-5567](https://issues.apache.org/jira/browse/ARROW-5567) - [C++] Fix build error of memory-benchmark
+* [ARROW-5571](https://issues.apache.org/jira/browse/ARROW-5571) - [R] Rework handling of ARROW\_R\_WITH\_PARQUET
+* [ARROW-5574](https://issues.apache.org/jira/browse/ARROW-5574) - [R] documentation error for read\_arrow()
+* [ARROW-5581](https://issues.apache.org/jira/browse/ARROW-5581) - [Java] Provide interfaces and initial implementations for vector sorting
+* [ARROW-5582](https://issues.apache.org/jira/browse/ARROW-5582) - [Go] add support for comparing Records
+* [ARROW-5586](https://issues.apache.org/jira/browse/ARROW-5586) - [R] convert Array of LIST type to R lists
+* [ARROW-5587](https://issues.apache.org/jira/browse/ARROW-5587) - [Java] Add more maven style check for Java code
+* [ARROW-5590](https://issues.apache.org/jira/browse/ARROW-5590) - [R] Run "no libarrow" R build in the same CI entry if possible
+* [ARROW-5591](https://issues.apache.org/jira/browse/ARROW-5591) - [Go] implement read/write IPC for Duration & Intervals
+* [ARROW-5597](https://issues.apache.org/jira/browse/ARROW-5597) - [Packaging][deb] Add Flight packages
+* [ARROW-5600](https://issues.apache.org/jira/browse/ARROW-5600) - [R] R package namespace cleanup
+* [ARROW-5602](https://issues.apache.org/jira/browse/ARROW-5602) - [Java][Gandiva] Add test for decimal round functions
+* [ARROW-5604](https://issues.apache.org/jira/browse/ARROW-5604) - [Go] improve test coverage of type-traits
+* [ARROW-5609](https://issues.apache.org/jira/browse/ARROW-5609) - [C++] Set CMP0068 CMake policy to avoid macOS warnings
+* [ARROW-5612](https://issues.apache.org/jira/browse/ARROW-5612) - [Python][Documentation] Clarify date\_as\_object option behavior
+* [ARROW-5621](https://issues.apache.org/jira/browse/ARROW-5621) - [Go] implement read/write IPC for Decimal128 arrays
+* [ARROW-5622](https://issues.apache.org/jira/browse/ARROW-5622) - [C++][Dataset] arrow-dataset.pc isn't provided
+* [ARROW-5625](https://issues.apache.org/jira/browse/ARROW-5625) - [R] convert Array of struct type to data frame columns
+* [ARROW-5632](https://issues.apache.org/jira/browse/ARROW-5632) - [Doc] Add some documentation describing compile/debug workflow on macOS with Xcode IDE
+* [ARROW-5633](https://issues.apache.org/jira/browse/ARROW-5633) - [Python] Enable bz2 in Linux wheels
+* [ARROW-5635](https://issues.apache.org/jira/browse/ARROW-5635) - [C++] Support "compacting" a table
+* [ARROW-5637](https://issues.apache.org/jira/browse/ARROW-5637) - [Gandiva][Java] Complete IN Expression
+* [ARROW-5639](https://issues.apache.org/jira/browse/ARROW-5639) - [Java] Remove floating point computation from getOffsetBufferValueCapacity
+* [ARROW-5641](https://issues.apache.org/jira/browse/ARROW-5641) - [GLib] Remove enums files generated by GNU Autotools from Git targets
+* [ARROW-5643](https://issues.apache.org/jira/browse/ARROW-5643) - [Flight] Add ability to override hostname checking
+* [ARROW-5650](https://issues.apache.org/jira/browse/ARROW-5650) - [Python] Update manylinux dependency versions
+* [ARROW-5652](https://issues.apache.org/jira/browse/ARROW-5652) - [CI] Fix iwyu docker image
+* [ARROW-5653](https://issues.apache.org/jira/browse/ARROW-5653) - [CI] Fix cpp docker image
+* [ARROW-5656](https://issues.apache.org/jira/browse/ARROW-5656) - [Python] Enable Flight wheels on macOS
+* [ARROW-5659](https://issues.apache.org/jira/browse/ARROW-5659) - [C++] Add support for finding OpenSSL installed by Homebrew
+* [ARROW-5660](https://issues.apache.org/jira/browse/ARROW-5660) - [GLib][CI] Use the latest macOS image and all Homebrew based libraries
+* [ARROW-5661](https://issues.apache.org/jira/browse/ARROW-5661) - Support hash functions for decimal in Gandiva
+* [ARROW-5662](https://issues.apache.org/jira/browse/ARROW-5662) - [C++] Add support for BOOST\_SOURCE=AUTO|BUNDLED|SYSTEM
+* [ARROW-5663](https://issues.apache.org/jira/browse/ARROW-5663) - [Packaging][RPM] Update CentOS packages for 0.14.0
+* [ARROW-5664](https://issues.apache.org/jira/browse/ARROW-5664) - [Crossbow] Execute nightly crossbow tests on CircleCI instead of Travis
+* [ARROW-5668](https://issues.apache.org/jira/browse/ARROW-5668) - [Python] Display "not null" in Schema.\_\_repr\_\_ for non-nullable fields
+* [ARROW-5669](https://issues.apache.org/jira/browse/ARROW-5669) - [Crossbow] manylinux1 wheel building failing
+* [ARROW-5670](https://issues.apache.org/jira/browse/ARROW-5670) - [Crossbow] get\_apache\_mirror.py fails with TLS error on macOS with Python 3.5
+* [ARROW-5671](https://issues.apache.org/jira/browse/ARROW-5671) - [Crossbow] macOS Python wheels failing
+* [ARROW-5672](https://issues.apache.org/jira/browse/ARROW-5672) - [Java] Refactor redundant method modifier
+* [ARROW-5683](https://issues.apache.org/jira/browse/ARROW-5683) - [R] Add snappy to Rtools Windows builds
+* [ARROW-5684](https://issues.apache.org/jira/browse/ARROW-5684) - [Packaging][deb] Add support for Ubuntu 19.04
+* [ARROW-5685](https://issues.apache.org/jira/browse/ARROW-5685) - [Packaging][deb] Add support for Apache Arrow Datasets
+* [ARROW-5687](https://issues.apache.org/jira/browse/ARROW-5687) - [C++] Remove remaining uses of ARROW\_BOOST\_VENDORED
+* [ARROW-5690](https://issues.apache.org/jira/browse/ARROW-5690) - [Packaging][Python] macOS wheels broken: libprotobuf.18.dylib missing
+* [ARROW-5694](https://issues.apache.org/jira/browse/ARROW-5694) - [Python] Lists of decimals are not supported when converting to pandas
+* [ARROW-5695](https://issues.apache.org/jira/browse/ARROW-5695) - [C\#][Release] Run sourcelink test in verify-release-candidate.sh
+* [ARROW-5696](https://issues.apache.org/jira/browse/ARROW-5696) - [Gandiva] [C++] Introduce castVarcharVarchar
+* [ARROW-5699](https://issues.apache.org/jira/browse/ARROW-5699) - [C++] Optimize parsing of Decimal128 in CSV
+* [ARROW-5701](https://issues.apache.org/jira/browse/ARROW-5701) - [C++][Gandiva] Build expressions only for the required selection vector types
+* [ARROW-5702](https://issues.apache.org/jira/browse/ARROW-5702) - [C++] parquet::arrow::FileReader::GetSchema()
+* [ARROW-5704](https://issues.apache.org/jira/browse/ARROW-5704) - [C++] Stop using ARROW\_TEMPLATE\_EXPORT for SparseTensorImpl class
+* [ARROW-5705](https://issues.apache.org/jira/browse/ARROW-5705) - [Java] Optimize BaseValueVector\#computeCombinedBufferSize logic
+* [ARROW-5706](https://issues.apache.org/jira/browse/ARROW-5706) - [Java] Remove type conversion in getValidityBufferValueCapacity
+* [ARROW-5707](https://issues.apache.org/jira/browse/ARROW-5707) - [Java] Improve the performance and code structure for ArrowRecordBatch
+* [ARROW-5710](https://issues.apache.org/jira/browse/ARROW-5710) - [C++] Allow compiling Gandiva with Ninja on Windows
+* [ARROW-5715](https://issues.apache.org/jira/browse/ARROW-5715) - [Release] Verify Ubuntu 19.04 APT repository
+* [ARROW-5718](https://issues.apache.org/jira/browse/ARROW-5718) - [R] auto splice data frames in record\_batch() and table()
+* [ARROW-5720](https://issues.apache.org/jira/browse/ARROW-5720) - [C++] Create benchmarks for decimal-related classes
+* [ARROW-5721](https://issues.apache.org/jira/browse/ARROW-5721) - [Rust] Move array related code into a separate module
+* [ARROW-5724](https://issues.apache.org/jira/browse/ARROW-5724) - [R] [CI] AppVeyor build should use ccache
+* [ARROW-5725](https://issues.apache.org/jira/browse/ARROW-5725) - [Crossbow] Port conda recipes to azure pipelines
+* [ARROW-5726](https://issues.apache.org/jira/browse/ARROW-5726) - [Java] Implement a common interface for int vectors
+* [ARROW-5727](https://issues.apache.org/jira/browse/ARROW-5727) - [Python] [CI] Install pytest-faulthandler before running tests
+* [ARROW-5748](https://issues.apache.org/jira/browse/ARROW-5748) - [Packaging][deb] Add support for Debian GNU/Linux buster
+* [ARROW-5749](https://issues.apache.org/jira/browse/ARROW-5749) - [Python] Add Python binding for Table::CombineChunks()
+* [ARROW-5751](https://issues.apache.org/jira/browse/ARROW-5751) - [Packaging][Python] Python macOS wheels have dynamic dependency on libcares
+* [ARROW-5752](https://issues.apache.org/jira/browse/ARROW-5752) - [Java] Improve the performance of ArrowBuf\#setZero
+* [ARROW-5755](https://issues.apache.org/jira/browse/ARROW-5755) - [Rust] [Parquet] Add derived clone for Type
+* [ARROW-5768](https://issues.apache.org/jira/browse/ARROW-5768) - [Release] There are needless newlines at the end of CHANGELOG.md
+* [ARROW-5773](https://issues.apache.org/jira/browse/ARROW-5773) - [R] Clean up documentation before release
+* [ARROW-5780](https://issues.apache.org/jira/browse/ARROW-5780) - [C++] Add benchmark for Decimal128 operations
+* [ARROW-5782](https://issues.apache.org/jira/browse/ARROW-5782) - [Release] Setup test data for Flight in dev/release/01-perform.sh
+* [ARROW-5783](https://issues.apache.org/jira/browse/ARROW-5783) - [Release][C\#] Exclude dummy.git from RAT check
+* [ARROW-5785](https://issues.apache.org/jira/browse/ARROW-5785) - [Rust] Rust DataFusion implementation should not depend on rustyline
+* [ARROW-5787](https://issues.apache.org/jira/browse/ARROW-5787) - [Release][Rust] Use local modules to verify RC
+* [ARROW-5793](https://issues.apache.org/jira/browse/ARROW-5793) - [Release] Avoid duplicate known host SSH error in dev/release/03-binary.sh
+* [ARROW-5794](https://issues.apache.org/jira/browse/ARROW-5794) - [Release] Skip uploading already uploaded binaries
+* [ARROW-5795](https://issues.apache.org/jira/browse/ARROW-5795) - [Release] Add missing waits on uploading binaries
+* [ARROW-5796](https://issues.apache.org/jira/browse/ARROW-5796) - [Release][APT] Update expected package list
+* [ARROW-5797](https://issues.apache.org/jira/browse/ARROW-5797) - [Release][APT] Update supported distributions
+* [ARROW-5818](https://issues.apache.org/jira/browse/ARROW-5818) - [Java][Gandiva] support varlen output vectors
+* [ARROW-5820](https://issues.apache.org/jira/browse/ARROW-5820) - [Release] Remove undefined variable check from verify script
+* [ARROW-5826](https://issues.apache.org/jira/browse/ARROW-5826) - [Website] Blog post for 0.14.0 release announcement
+* [PARQUET-1243](https://issues.apache.org/jira/browse/PARQUET-1243) - [C++] Improve quality of error message for zero-length files, otherwise corrupted files
+* [PARQUET-1411](https://issues.apache.org/jira/browse/PARQUET-1411) - [C++] Upgrade to use LogicalType annotations instead of ConvertedType
+* [PARQUET-1422](https://issues.apache.org/jira/browse/PARQUET-1422) - [C++] Use Arrow IO interfaces natively rather than current parquet:: wrappers
+* [PARQUET-1517](https://issues.apache.org/jira/browse/PARQUET-1517) - [C++] Update cpp crypto package to match signed-off specification
+* [PARQUET-1523](https://issues.apache.org/jira/browse/PARQUET-1523) - [C++] Vectorize comparator interface
+* [PARQUET-1569](https://issues.apache.org/jira/browse/PARQUET-1569) - [C++] Consolidate testing header files
+* [PARQUET-1582](https://issues.apache.org/jira/browse/PARQUET-1582) - [C++] Add ToString method to ColumnDescriptor
+* [PARQUET-1583](https://issues.apache.org/jira/browse/PARQUET-1583) - [C++] Remove parquet::Vector class
+* [PARQUET-1586](https://issues.apache.org/jira/browse/PARQUET-1586) - [C++] Add --dump options to parquet-reader tool to dump def/rep levels
+* [PARQUET-1603](https://issues.apache.org/jira/browse/PARQUET-1603) - [C++] rename parquet::LogicalType to parquet::ConvertedType
+
+
+## Bug Fixes
+
+* [ARROW-61](https://issues.apache.org/jira/browse/ARROW-61) - [Java] Method can return the value bigger than long MAX\_VALUE
+* [ARROW-352](https://issues.apache.org/jira/browse/ARROW-352) - [Format] Interval(DAY\_TIME) has no unit
+* [ARROW-1837](https://issues.apache.org/jira/browse/ARROW-1837) - [Java] Unable to read unsigned integers outside signed range for bit width in integration tests
+* [ARROW-2119](https://issues.apache.org/jira/browse/ARROW-2119) - [C++][Java] Handle Arrow stream with zero record batch
+* [ARROW-2136](https://issues.apache.org/jira/browse/ARROW-2136) - [Python] Non-nullable schema fields not checked in conversions from pandas
+* [ARROW-2256](https://issues.apache.org/jira/browse/ARROW-2256) - [C++] Fuzzer builds fail out of the box on Ubuntu 16.04 using LLVM apt repos
+* [ARROW-2461](https://issues.apache.org/jira/browse/ARROW-2461) - [Python] Build wheels for manylinux2010 tag
+* [ARROW-2590](https://issues.apache.org/jira/browse/ARROW-2590) - [Python] Pyspark python\_udf serialization error on grouped map (Amazon EMR)
+* [ARROW-3344](https://issues.apache.org/jira/browse/ARROW-3344) - [Python] test\_plasma.py fails (in test\_plasma\_list)
+* [ARROW-3399](https://issues.apache.org/jira/browse/ARROW-3399) - [Python] Cannot serialize numpy matrix object
+* [ARROW-3650](https://issues.apache.org/jira/browse/ARROW-3650) - [Python] Mixed column indexes are read back as strings
+* [ARROW-3801](https://issues.apache.org/jira/browse/ARROW-3801) - [Python] Pandas-Arrow roundtrip makes pd categorical index not writeable
+* [ARROW-4021](https://issues.apache.org/jira/browse/ARROW-4021) - [Ruby] Error building red-arrow on msys2
+* [ARROW-4076](https://issues.apache.org/jira/browse/ARROW-4076) - [Python] schema validation and filters
+* [ARROW-4139](https://issues.apache.org/jira/browse/ARROW-4139) - [Python] Cast Parquet column statistics to unicode if UTF8 ConvertedType is set
+* [ARROW-4301](https://issues.apache.org/jira/browse/ARROW-4301) - [Java][Gandiva] Maven snapshot version update does not seem to update Gandiva submodule
+* [ARROW-4324](https://issues.apache.org/jira/browse/ARROW-4324) - [Python] Array dtype inference incorrect when created from list of mixed numpy scalars
+* [ARROW-4350](https://issues.apache.org/jira/browse/ARROW-4350) - [Python] dtype=object arrays cannot be converted to a list-of-list ListArray
+* [ARROW-4433](https://issues.apache.org/jira/browse/ARROW-4433) - [R] Segmentation fault when instantiating arrow::table from data frame
+* [ARROW-4447](https://issues.apache.org/jira/browse/ARROW-4447) - [C++] Investigate dynamic linking for libthrift
+* [ARROW-4516](https://issues.apache.org/jira/browse/ARROW-4516) - [Python] Error while creating a ParquetDataset on a path without \`\_common\_dataset\` but with an empty \`\_tempfile\`
+* [ARROW-4523](https://issues.apache.org/jira/browse/ARROW-4523) - [JS] Add row proxy generation benchmark
+* [ARROW-4651](https://issues.apache.org/jira/browse/ARROW-4651) - [Format] Flight Location should be more flexible than a (host, port) pair
+* [ARROW-4665](https://issues.apache.org/jira/browse/ARROW-4665) - [C++] With glog activated, DCHECK macros are redefined
+* [ARROW-4675](https://issues.apache.org/jira/browse/ARROW-4675) - [Python] Error serializing bool ndarray in py2 and deserializing in py3
+* [ARROW-4694](https://issues.apache.org/jira/browse/ARROW-4694) - [CI] detect-changes.py is inconsistent
+* [ARROW-4723](https://issues.apache.org/jira/browse/ARROW-4723) - [Python] Skip \_files when reading a directory containing parquet files
+* [ARROW-4725](https://issues.apache.org/jira/browse/ARROW-4725) - [C++] Dictionary tests disabled under MinGW builds
+* [ARROW-4823](https://issues.apache.org/jira/browse/ARROW-4823) - [Python] read\_csv shouldn't close file handles it doesn't own
+* [ARROW-4832](https://issues.apache.org/jira/browse/ARROW-4832) - [Python] pandas Index metadata for RangeIndex is incorrect
+* [ARROW-4845](https://issues.apache.org/jira/browse/ARROW-4845) - [R] Compiler warnings on Windows MingW64
+* [ARROW-4851](https://issues.apache.org/jira/browse/ARROW-4851) - [Java] BoundsChecking.java defaulting behavior for old drill parameter seems off
+* [ARROW-4877](https://issues.apache.org/jira/browse/ARROW-4877) - [Plasma] CI failure in test\_plasma\_list
+* [ARROW-4884](https://issues.apache.org/jira/browse/ARROW-4884) - [C++] conda-forge thrift-cpp package not available via pkg-config or cmake
+* [ARROW-4885](https://issues.apache.org/jira/browse/ARROW-4885) - [Python] read\_csv() can't handle decimal128 columns
+* [ARROW-4886](https://issues.apache.org/jira/browse/ARROW-4886) - [Rust] Inconsistent behaviour with casting sliced primitive array to list array
+* [ARROW-4923](https://issues.apache.org/jira/browse/ARROW-4923) - Expose setters for Decimal vector that take long and double inputs
+* [ARROW-4934](https://issues.apache.org/jira/browse/ARROW-4934) - [Python] Address deprecation notice that will be a bug in Python 3.8
+* [ARROW-5019](https://issues.apache.org/jira/browse/ARROW-5019) - [C\#] ArrowStreamWriter doesn't work on a non-seekable stream
+* [ARROW-5049](https://issues.apache.org/jira/browse/ARROW-5049) - [Python] org/apache/hadoop/fs/FileSystem class not found when pyarrow FileSystem used in spark
+* [ARROW-5051](https://issues.apache.org/jira/browse/ARROW-5051) - [GLib][Gandiva] Test failure in release verification script
+* [ARROW-5055](https://issues.apache.org/jira/browse/ARROW-5055) - [Ruby][MSYS2] libparquet needs to be installed in MSYS2 for ruby
+* [ARROW-5058](https://issues.apache.org/jira/browse/ARROW-5058) - [Release] 02-source.sh generates e-mail template with wrong links
+* [ARROW-5059](https://issues.apache.org/jira/browse/ARROW-5059) - [C++][Gandiva] cbrt\_\* floating point tests can fail due to exact comparisons
+* [ARROW-5065](https://issues.apache.org/jira/browse/ARROW-5065) - [Rust] cast kernel does not support casting from Int64
+* [ARROW-5068](https://issues.apache.org/jira/browse/ARROW-5068) - [Gandiva][Packaging] Fix gandiva nightly builds after the CMake refactor
+* [ARROW-5090](https://issues.apache.org/jira/browse/ARROW-5090) - Parquet linking fails on macOS due to @rpath in dylib
+* [ARROW-5092](https://issues.apache.org/jira/browse/ARROW-5092) - [C\#] Source Link doesn't work with the C\# release script
+* [ARROW-5095](https://issues.apache.org/jira/browse/ARROW-5095) - [Flight][C++] Flight DoGet doesn't expose server error message
+* [ARROW-5096](https://issues.apache.org/jira/browse/ARROW-5096) - [Packaging][deb] plasma-store-server packages are missing
+* [ARROW-5097](https://issues.apache.org/jira/browse/ARROW-5097) - [Packaging][CentOS6] arrow-lib has unresolvable dependencies
+* [ARROW-5098](https://issues.apache.org/jira/browse/ARROW-5098) - [Website] Update APT install document for 0.13.0
+* [ARROW-5100](https://issues.apache.org/jira/browse/ARROW-5100) - [JS] Writer swaps byte order if buffers share the same underlying ArrayBuffer
+* [ARROW-5117](https://issues.apache.org/jira/browse/ARROW-5117) - [Go] Panic when appending zero slices after initializing a builder
+* [ARROW-5119](https://issues.apache.org/jira/browse/ARROW-5119) - [Go] invalid Stringer implementation for array.Boolean
+* [ARROW-5122](https://issues.apache.org/jira/browse/ARROW-5122) - [Python] pyarrow.parquet.read\_table raises non-file path error when given a Windows path to a directory
+* [ARROW-5128](https://issues.apache.org/jira/browse/ARROW-5128) - [Packaging][CentOS][Conda] Numpy not found in nightly builds
+* [ARROW-5129](https://issues.apache.org/jira/browse/ARROW-5129) - [Rust][Parquet] Column writer bug: check dictionary encoder when adding a new data page
+* [ARROW-5130](https://issues.apache.org/jira/browse/ARROW-5130) - [Python] Segfault when importing TensorFlow after Pyarrow
+* [ARROW-5132](https://issues.apache.org/jira/browse/ARROW-5132) - [Java] Errors on building gandiva\_jni.dll on Windows with Visual Studio 2017
+* [ARROW-5138](https://issues.apache.org/jira/browse/ARROW-5138) - [Python/C++] Row group retrieval doesn't restore index properly
+* [ARROW-5140](https://issues.apache.org/jira/browse/ARROW-5140) - [Bug?][Parquet] Can write a jagged array column of strings to disk, but hit \`ArrowNotImplementedError\` on read
+* [ARROW-5142](https://issues.apache.org/jira/browse/ARROW-5142) - [CI] Fix conda calls in AppVeyor scripts
+* [ARROW-5144](https://issues.apache.org/jira/browse/ARROW-5144) - [Python] ParquetDataset and ParquetPiece not serializable
+* [ARROW-5146](https://issues.apache.org/jira/browse/ARROW-5146) - [Dev] Merge script imposes directory name
+* [ARROW-5147](https://issues.apache.org/jira/browse/ARROW-5147) - [C++] Build error: Could NOT find DoubleConversion
+* [ARROW-5148](https://issues.apache.org/jira/browse/ARROW-5148) - [CI] [C++] LLVM-related compile errors
+* [ARROW-5149](https://issues.apache.org/jira/browse/ARROW-5149) - [Packaging][Wheel] Pin LLVM to version 7 in windows builds
+* [ARROW-5152](https://issues.apache.org/jira/browse/ARROW-5152) - [Python] CMake warnings when building
+* [ARROW-5159](https://issues.apache.org/jira/browse/ARROW-5159) - Unable to build benches in arrow crate
+* [ARROW-5160](https://issues.apache.org/jira/browse/ARROW-5160) - [C++] ABORT\_NOT\_OK evaluates expression twice
+* [ARROW-5166](https://issues.apache.org/jira/browse/ARROW-5166) - [Python][Parquet] Statistics for uint64 columns may overflow
+* [ARROW-5167](https://issues.apache.org/jira/browse/ARROW-5167) - [C++] Upgrade string-view-light to latest
+* [ARROW-5169](https://issues.apache.org/jira/browse/ARROW-5169) - [Python] non-nullable fields are converted to nullable in \`Table.from\_pandas\`
+* [ARROW-5173](https://issues.apache.org/jira/browse/ARROW-5173) - [Go] handle multiple concatenated streams back-to-back
+* [ARROW-5174](https://issues.apache.org/jira/browse/ARROW-5174) - [Go] implement Stringer for DataTypes
+* [ARROW-5177](https://issues.apache.org/jira/browse/ARROW-5177) - [Python] ParquetReader.read\_column() doesn't check bounds
+* [ARROW-5183](https://issues.apache.org/jira/browse/ARROW-5183) - [CI] MinGW build failures on AppVeyor
+* [ARROW-5184](https://issues.apache.org/jira/browse/ARROW-5184) - [Rust] Broken links and other documentation warnings
+* [ARROW-5186](https://issues.apache.org/jira/browse/ARROW-5186) - [Plasma] Crash on deleting CUDA memory
+* [ARROW-5194](https://issues.apache.org/jira/browse/ARROW-5194) - [C++][Plasma] TEST(PlasmaSerialization, GetReply) is failing
+* [ARROW-5195](https://issues.apache.org/jira/browse/ARROW-5195) - [Python] read\_csv ignores null\_values on string types
+* [ARROW-5201](https://issues.apache.org/jira/browse/ARROW-5201) - [Python] Import ABCs from collections is deprecated in Python 3.7
+* [ARROW-5208](https://issues.apache.org/jira/browse/ARROW-5208) - [Python] Inconsistent resulting type during casting in pa.array() when mask is present
+* [ARROW-5214](https://issues.apache.org/jira/browse/ARROW-5214) - [C++] Offline dependency downloader misses some libraries
+* [ARROW-5217](https://issues.apache.org/jira/browse/ARROW-5217) - [Rust] [CI] DataFusion test failure
+* [ARROW-5232](https://issues.apache.org/jira/browse/ARROW-5232) - [Java] value vector size increases rapidly in case of clear/setSafe loop
+* [ARROW-5233](https://issues.apache.org/jira/browse/ARROW-5233) - [Go] migrate to new flatbuffers-v1.11.0
+* [ARROW-5237](https://issues.apache.org/jira/browse/ARROW-5237) - [Python] pandas\_version key in pandas metadata no longer populated
+* [ARROW-5240](https://issues.apache.org/jira/browse/ARROW-5240) - [C++][CI] cmake\_format 0.5.0 appears to fail the build
+* [ARROW-5242](https://issues.apache.org/jira/browse/ARROW-5242) - [C++] Arrow doesn't compile cleanly with Visual Studio 2017 Update 9 or later due to narrowing
+* [ARROW-5243](https://issues.apache.org/jira/browse/ARROW-5243) - [Java][Gandiva] Add test for decimal compare functions
+* [ARROW-5245](https://issues.apache.org/jira/browse/ARROW-5245) - [C++][CI] Unpin cmake\_format
+* [ARROW-5246](https://issues.apache.org/jira/browse/ARROW-5246) - [Go] use Go-1.12 in CI
+* [ARROW-5249](https://issues.apache.org/jira/browse/ARROW-5249) - [Java] Flight client doesn't handle auth correctly in some cases
+* [ARROW-5253](https://issues.apache.org/jira/browse/ARROW-5253) - [C++] external Snappy fails on Alpine
+* [ARROW-5254](https://issues.apache.org/jira/browse/ARROW-5254) - [Flight][Java] DoAction does not support result streams
+* [ARROW-5255](https://issues.apache.org/jira/browse/ARROW-5255) - [Java] Implement user-defined data types API
+* [ARROW-5260](https://issues.apache.org/jira/browse/ARROW-5260) - [Python][C++] Crash when deserializing from components in a fresh new process
+* [ARROW-5274](https://issues.apache.org/jira/browse/ARROW-5274) - [JavaScript] Wrong array type for countBy
+* [ARROW-5283](https://issues.apache.org/jira/browse/ARROW-5283) - [C++][Plasma] Server crash when creating an aborted object 3 times
+* [ARROW-5285](https://issues.apache.org/jira/browse/ARROW-5285) - [C++][Plasma] GpuProcessHandle is not released when GPU object deleted
+* [ARROW-5293](https://issues.apache.org/jira/browse/ARROW-5293) - [C++] Take kernel on DictionaryArray does not preserve ordered flag
+* [ARROW-5294](https://issues.apache.org/jira/browse/ARROW-5294) - [CI] setuptools\_scm failures
+* [ARROW-5296](https://issues.apache.org/jira/browse/ARROW-5296) - [Java] Sporadic Flight test failures
+* [ARROW-5301](https://issues.apache.org/jira/browse/ARROW-5301) - [Python] parquet documentation outdated on nthreads argument
+* [ARROW-5304](https://issues.apache.org/jira/browse/ARROW-5304) - [C++] CudaDeviceManager::GetInstance is not thread-safe
+* [ARROW-5306](https://issues.apache.org/jira/browse/ARROW-5306) - [CI] [GLib] Disable GTK-Doc
+* [ARROW-5308](https://issues.apache.org/jira/browse/ARROW-5308) - [Go] remove deprecated Feather format
+* [ARROW-5314](https://issues.apache.org/jira/browse/ARROW-5314) - [Go] Incorrect Printing for String Arrays with Offsets
+* [ARROW-5318](https://issues.apache.org/jira/browse/ARROW-5318) - [Python] pyarrow hdfs reader overrequests
+* [ARROW-5325](https://issues.apache.org/jira/browse/ARROW-5325) - [Archery][Benchmark] Output properly formatted jsonlines from benchmark diff cli command
+* [ARROW-5330](https://issues.apache.org/jira/browse/ARROW-5330) - [Python] [CI] Run Python Flight tests on Travis-CI
+* [ARROW-5332](https://issues.apache.org/jira/browse/ARROW-5332) - [R] R package fails to build/install: error in dyn.load()
+* [ARROW-5348](https://issues.apache.org/jira/browse/ARROW-5348) - [CI] [Java] Gandiva checkstyle failure
+* [ARROW-5360](https://issues.apache.org/jira/browse/ARROW-5360) - [Rust] Builds are broken by rustyline on nightly 2019-05-16+
+* [ARROW-5362](https://issues.apache.org/jira/browse/ARROW-5362) - [C++] Compression round trip test can cause some sanitizers to fail
+* [ARROW-5371](https://issues.apache.org/jira/browse/ARROW-5371) - [Release] Add tests for dev/release/00-prepare.sh
+* [ARROW-5373](https://issues.apache.org/jira/browse/ARROW-5373) - [Java] Add missing details for Gandiva Java Build
+* [ARROW-5376](https://issues.apache.org/jira/browse/ARROW-5376) - [C++] Compile failure on gcc 5.4.0
+* [ARROW-5383](https://issues.apache.org/jira/browse/ARROW-5383) - [Go] update IPC flatbuf (new Duration type)
+* [ARROW-5387](https://issues.apache.org/jira/browse/ARROW-5387) - [Go] properly handle sub-slice of List
+* [ARROW-5388](https://issues.apache.org/jira/browse/ARROW-5388) - [Go] use arrow.TypeEqual in array.NewChunked
+* [ARROW-5390](https://issues.apache.org/jira/browse/ARROW-5390) - [CI] Job time limit exceeded on Travis
+* [ARROW-5397](https://issues.apache.org/jira/browse/ARROW-5397) - Test Flight TLS support
+* [ARROW-5398](https://issues.apache.org/jira/browse/ARROW-5398) - [Python] Flight tests broken by URI changes
+* [ARROW-5403](https://issues.apache.org/jira/browse/ARROW-5403) - [C++] Test failures not propagated in Windows shared builds
+* [ARROW-5411](https://issues.apache.org/jira/browse/ARROW-5411) - [C++][Python] Build error when building on macOS Mojave
+* [ARROW-5412](https://issues.apache.org/jira/browse/ARROW-5412) - [Java] Integration test fails with UnsupportedOperationException
+* [ARROW-5419](https://issues.apache.org/jira/browse/ARROW-5419) - [C++] CSV strings\_can\_be\_null option doesn't respect all null\_values
+* [ARROW-5421](https://issues.apache.org/jira/browse/ARROW-5421) - [Packaging][Crossbow] Duplicated key in nightly test configuration
+* [ARROW-5422](https://issues.apache.org/jira/browse/ARROW-5422) - [CI] [C++] Build failure with Google Benchmark
+* [ARROW-5430](https://issues.apache.org/jira/browse/ARROW-5430) - [Python] Can read but not write parquet partitioned on large ints
+* [ARROW-5435](https://issues.apache.org/jira/browse/ARROW-5435) - [Java] add test for IntervalYearVector\#getAsStringBuilder
+* [ARROW-5437](https://issues.apache.org/jira/browse/ARROW-5437) - [Python] Missing pandas pytest marker from parquet tests
+* [ARROW-5446](https://issues.apache.org/jira/browse/ARROW-5446) - [C++] Use cmake header install directory instead of include
+* [ARROW-5448](https://issues.apache.org/jira/browse/ARROW-5448) - [CI] MinGW build failures on AppVeyor
+* [ARROW-5453](https://issues.apache.org/jira/browse/ARROW-5453) - [C++] Just-released cmake-format 0.5.2 breaks the build
+* [ARROW-5455](https://issues.apache.org/jira/browse/ARROW-5455) - [Rust] Build broken by 2019-05-30 Rust nightly
+* [ARROW-5456](https://issues.apache.org/jira/browse/ARROW-5456) - [GLib][Plasma] Installed plasma-glib may be used when building documentation
+* [ARROW-5457](https://issues.apache.org/jira/browse/ARROW-5457) - [GLib][Plasma] Environment variable name for test is wrong
+* [ARROW-5459](https://issues.apache.org/jira/browse/ARROW-5459) - [Go] implement Stringer for Float16 DataType
+* [ARROW-5462](https://issues.apache.org/jira/browse/ARROW-5462) - [Go] support writing zero-length List
+* [ARROW-5479](https://issues.apache.org/jira/browse/ARROW-5479) - [Rust] [DataFusion] Use ARROW\_TEST\_DATA instead of relative path for testing
+* [ARROW-5487](https://issues.apache.org/jira/browse/ARROW-5487) - [CI] [Python] Failure in docs build
+* [ARROW-5493](https://issues.apache.org/jira/browse/ARROW-5493) - [Integration/Go] add Go support for IPC integration tests
+* [ARROW-5507](https://issues.apache.org/jira/browse/ARROW-5507) - [Plasma] [CUDA] Compile error
+* [ARROW-5514](https://issues.apache.org/jira/browse/ARROW-5514) - [C++] Printer for uint64 shows wrong values
+* [ARROW-5517](https://issues.apache.org/jira/browse/ARROW-5517) - [C++] Header collection CMake logic should only consider filename without directory included
+* [ARROW-5520](https://issues.apache.org/jira/browse/ARROW-5520) - [C++][Packaging] No NVIDIA CUDA toolkit on AArch64
+* [ARROW-5521](https://issues.apache.org/jira/browse/ARROW-5521) - [Packaging] License check fails with Apache RAT 0.13
+* [ARROW-5528](https://issues.apache.org/jira/browse/ARROW-5528) - Concatenate() crashes when concatenating empty binary arrays
+* [ARROW-5532](https://issues.apache.org/jira/browse/ARROW-5532) - [JS] Field Metadata Not Read
+* [ARROW-5551](https://issues.apache.org/jira/browse/ARROW-5551) - [Go] invalid FixedSizeArray representation
+* [ARROW-5553](https://issues.apache.org/jira/browse/ARROW-5553) - [Ruby] red-arrow gem does not compile on ruby:2.5 docker image
+* [ARROW-5576](https://issues.apache.org/jira/browse/ARROW-5576) - [C++] Flaky thrift\_ep tarball downloads
+* [ARROW-5577](https://issues.apache.org/jira/browse/ARROW-5577) - [C++] Link failure due to googletest shared library on Alpine Linux
+* [ARROW-5583](https://issues.apache.org/jira/browse/ARROW-5583) - [Java] When the isSet of a NullableValueHolder is 0, the buffer field should not be used
+* [ARROW-5584](https://issues.apache.org/jira/browse/ARROW-5584) - [Java] Add import for link reference in FieldReader javadoc
+* [ARROW-5589](https://issues.apache.org/jira/browse/ARROW-5589) - [C++][Fuzzing] arrow-ipc-fuzzing-test crash 2354085db0125113f04f7bd23f54b85cca104713
+* [ARROW-5592](https://issues.apache.org/jira/browse/ARROW-5592) - [Go] implement Duration array
+* [ARROW-5596](https://issues.apache.org/jira/browse/ARROW-5596) - [Python] Flight tests failing on Python 2.7
+* [ARROW-5601](https://issues.apache.org/jira/browse/ARROW-5601) - [Gandiva] Error when building a projector with a string field
+* [ARROW-5603](https://issues.apache.org/jira/browse/ARROW-5603) - [Python] register pytest markers to avoid warnings
+* [ARROW-5605](https://issues.apache.org/jira/browse/ARROW-5605) - [C++][Fuzzing] arrow-ipc-fuzzing-test crash 74aec871d14bb6b07c72ea8f0e8c9f72cbe6b73c
+* [ARROW-5606](https://issues.apache.org/jira/browse/ARROW-5606) - [Python] pandas.RangeIndex.\_start/\_stop/\_step are deprecated
+* [ARROW-5608](https://issues.apache.org/jira/browse/ARROW-5608) - [C++][parquet] Invalid memory access when using parquet::arrow::ColumnReader
+* [ARROW-5615](https://issues.apache.org/jira/browse/ARROW-5615) - [C++] Compilation error due to C++11 string literals on gcc 5.4.0 Ubuntu 16.04
+* [ARROW-5616](https://issues.apache.org/jira/browse/ARROW-5616) - [Python] C++ build failure against Python 2.7 headers
+* [ARROW-5617](https://issues.apache.org/jira/browse/ARROW-5617) - [C++] thrift\_ep 0.12.0 fails to build when using ARROW\_BOOST\_VENDORED=ON
+* [ARROW-5619](https://issues.apache.org/jira/browse/ARROW-5619) - [C++] get\_apache\_mirror.py doesn't work with Python 3.5
+* [ARROW-5623](https://issues.apache.org/jira/browse/ARROW-5623) - [CI][GLib] Failed on macOS
+* [ARROW-5624](https://issues.apache.org/jira/browse/ARROW-5624) - [C++] -Duriparser\_SOURCE=BUNDLED is broken
+* [ARROW-5626](https://issues.apache.org/jira/browse/ARROW-5626) - [C++][Gandiva] Expression cache should consider precision and scale too
+* [ARROW-5629](https://issues.apache.org/jira/browse/ARROW-5629) - [C++] Fix Coverity issues
+* [ARROW-5631](https://issues.apache.org/jira/browse/ARROW-5631) - [C++] CMake 3.2 build is broken
+* [ARROW-5644](https://issues.apache.org/jira/browse/ARROW-5644) - [Python] test\_flight.py::test\_tls\_do\_get appears to hang
+* [ARROW-5647](https://issues.apache.org/jira/browse/ARROW-5647) - [Python] Accessing a file from Databricks using pandas read\_parquet with the pyarrow engine fails with: Passed non-file path: /mnt/aa/example.parquet
+* [ARROW-5648](https://issues.apache.org/jira/browse/ARROW-5648) - [C++] Build fails on mingw without codecvt
+* [ARROW-5654](https://issues.apache.org/jira/browse/ARROW-5654) - [C++] ChunkedArray should validate the types of the arrays
+* [ARROW-5657](https://issues.apache.org/jira/browse/ARROW-5657) - [C++] "docker-compose run cpp" broken in master
+* [ARROW-5674](https://issues.apache.org/jira/browse/ARROW-5674) - [Python] Missing pandas pytest markers from test\_parquet.py
+* [ARROW-5675](https://issues.apache.org/jira/browse/ARROW-5675) - [Doc] Fix typo in documentation describing compile/debug workflow on macOS with Xcode IDE
+* [ARROW-5678](https://issues.apache.org/jira/browse/ARROW-5678) - [R][Lint] Fix hadolint docker linting error
+* [ARROW-5693](https://issues.apache.org/jira/browse/ARROW-5693) - [Go] skip IPC integration test for Decimal128
+* [ARROW-5697](https://issues.apache.org/jira/browse/ARROW-5697) - [GLib] c\_glib/Dockerfile is broken
+* [ARROW-5698](https://issues.apache.org/jira/browse/ARROW-5698) - [R] r/Dockerfile docker-compose build is broken
+* [ARROW-5709](https://issues.apache.org/jira/browse/ARROW-5709) - [C++] gandiva-date\_time\_test failure on Windows
+* [ARROW-5714](https://issues.apache.org/jira/browse/ARROW-5714) - [JS] Inconsistent behavior in Int64Builder with/without BigNum
+* [ARROW-5723](https://issues.apache.org/jira/browse/ARROW-5723) - [Gandiva][Crossbow] Builds failing
+* [ARROW-5728](https://issues.apache.org/jira/browse/ARROW-5728) - [Python] [CI] Travis-CI failures in test\_jvm.py
+* [ARROW-5729](https://issues.apache.org/jira/browse/ARROW-5729) - [Python][Java] ArrowType.Int object has no attribute 'isSigned'
+* [ARROW-5730](https://issues.apache.org/jira/browse/ARROW-5730) - [Python][CI] Selectively skip test cases in the dask integration test
+* [ARROW-5732](https://issues.apache.org/jira/browse/ARROW-5732) - [C++] macOS builds failing idiosyncratically on master with warnings from pmmintrin.h
+* [ARROW-5735](https://issues.apache.org/jira/browse/ARROW-5735) - [C++] Appveyor builds failing persistently in thrift\_ep build
+* [ARROW-5737](https://issues.apache.org/jira/browse/ARROW-5737) - [C++][Gandiva] Gandiva not building in manylinux
+* [ARROW-5738](https://issues.apache.org/jira/browse/ARROW-5738) - [Crossbow][Conda] OSX package builds are failing with missing intrinsics
+* [ARROW-5739](https://issues.apache.org/jira/browse/ARROW-5739) - [CI] Fix docker python build
+* [ARROW-5750](https://issues.apache.org/jira/browse/ARROW-5750) - [Java] Java compilation failures on master
+* [ARROW-5754](https://issues.apache.org/jira/browse/ARROW-5754) - [C++] Missing override for \~GrpcStreamWriter?
+* [ARROW-5765](https://issues.apache.org/jira/browse/ARROW-5765) - [C++] TestDictionary.Validate test is crashed with release build
+* [ARROW-5769](https://issues.apache.org/jira/browse/ARROW-5769) - [Java] org.apache.arrow.flight.TestTls fails via dev/release/00-prepare.sh
+* [ARROW-5770](https://issues.apache.org/jira/browse/ARROW-5770) - [C++] Fix -Wpessimizing-move in result.h
+* [ARROW-5771](https://issues.apache.org/jira/browse/ARROW-5771) - [Python] Docker python-nopandas job fails
+* [ARROW-5774](https://issues.apache.org/jira/browse/ARROW-5774) - [Java][Documentation] Document the need to checkout git submodules for flight
+* [ARROW-5781](https://issues.apache.org/jira/browse/ARROW-5781) - [Archery] Ensure benchmark clone accepts remotes in revision
+* [ARROW-5791](https://issues.apache.org/jira/browse/ARROW-5791) - [Python] pyarrow.csv.read\_csv hangs + eats all RAM
+* [ARROW-5816](https://issues.apache.org/jira/browse/ARROW-5816) - [Release] Parallel curl does not work reliably in verify-release-candidate.sh
+* [ARROW-5922](https://issues.apache.org/jira/browse/ARROW-5922) - [Python] Unable to connect to HDFS from a worker/data node on a Kerberized cluster using pyarrow's hdfs API
+* [PARQUET-1402](https://issues.apache.org/jira/browse/PARQUET-1402) - [C++] incorrect calculation of column start offset for files created by parquet-mr 1.8.1
+* [PARQUET-1405](https://issues.apache.org/jira/browse/PARQUET-1405) - [C++] 'Couldn't deserialize thrift' error when reading large binary column
+* [PARQUET-1565](https://issues.apache.org/jira/browse/PARQUET-1565) - [C++] SEGV in FromParquetSchema with corrupt file from PARQUET-1481
+* [PARQUET-1571](https://issues.apache.org/jira/browse/PARQUET-1571) - [C++] Can't read data from parquet file in C++ library
+* [PARQUET-1574](https://issues.apache.org/jira/browse/PARQUET-1574) - [C++] parquet-encoding-test failed with msvc
+* [PARQUET-1581](https://issues.apache.org/jira/browse/PARQUET-1581) - [C++] Fix undefined behavior in encoding.cc when num\_dictionary\_values is 0
+
+
+
+# Apache Arrow 0.13.0 (2019-04-01)
+
+## Bug Fixes
+
+* [ARROW-295](https://issues.apache.org/jira/browse/ARROW-295) - Create DOAP File
+* [ARROW-1171](https://issues.apache.org/jira/browse/ARROW-1171) - [C++] Segmentation faults on Fedora 24 with pyarrow-manylinux1 and self-compiled turbodbc
+* [ARROW-2392](https://issues.apache.org/jira/browse/ARROW-2392) - [Python] pyarrow RecordBatchStreamWriter allows writing batches with different schemas
+* [ARROW-2399](https://issues.apache.org/jira/browse/ARROW-2399) - [Rust] Builder\<T\> should not provide a set() method
+* [ARROW-2598](https://issues.apache.org/jira/browse/ARROW-2598) - [Python] table.to\_pandas segfault
+* [ARROW-3086](https://issues.apache.org/jira/browse/ARROW-3086) - [GLib] GISCAN fails due to conda-shipped openblas
+* [ARROW-3096](https://issues.apache.org/jira/browse/ARROW-3096) - [Python] Update Python source build instructions given Anaconda/conda-forge toolchain migration
+* [ARROW-3133](https://issues.apache.org/jira/browse/ARROW-3133) - [C++] Logical boolean kernels in kernels/boolean.cc cannot write into preallocated memory
+* [ARROW-3208](https://issues.apache.org/jira/browse/ARROW-3208) - [C++] Segmentation fault when casting dictionary to numeric with nullptr valid\_bitmap
+* [ARROW-3426](https://issues.apache.org/jira/browse/ARROW-3426) - [CI] Java integration test very verbose
+* [ARROW-3564](https://issues.apache.org/jira/browse/ARROW-3564) - [Python] writing version 2.0 parquet format with dictionary encoding enabled
+* [ARROW-3578](https://issues.apache.org/jira/browse/ARROW-3578) - [Release] Address spurious Apache RAT failures in source release script
+* [ARROW-3593](https://issues.apache.org/jira/browse/ARROW-3593) - [R] CI builds failing due to GitHub API rate limits
+* [ARROW-3606](https://issues.apache.org/jira/browse/ARROW-3606) - [Python] flake8 fails on Crossbow
+* [ARROW-3669](https://issues.apache.org/jira/browse/ARROW-3669) - [Python] Convert big-endian numbers or raise error in pyarrow.array
+* [ARROW-3843](https://issues.apache.org/jira/browse/ARROW-3843) - [Python] Writing Parquet file from empty table created with Table.from\_pandas(..., preserve\_index=False) fails
+* [ARROW-3923](https://issues.apache.org/jira/browse/ARROW-3923) - [Java] JDBC-to-Arrow Conversion: Unnecessary Calendar Requirement
+* [ARROW-4007](https://issues.apache.org/jira/browse/ARROW-4007) - [Java][Plasma] Plasma JNI tests failing
+* [ARROW-4050](https://issues.apache.org/jira/browse/ARROW-4050) - [Python][Parquet] core dump on reading parquet file
+* [ARROW-4081](https://issues.apache.org/jira/browse/ARROW-4081) - [Go] Sum methods on Mac OS X panic when the array is empty
+* [ARROW-4104](https://issues.apache.org/jira/browse/ARROW-4104) - [Java] race in AllocationManager during release
+* [ARROW-4108](https://issues.apache.org/jira/browse/ARROW-4108) - [Python/Java] Spark integration tests do not work
+* [ARROW-4117](https://issues.apache.org/jira/browse/ARROW-4117) - [Python] "asv dev" command fails with latest revision
+* [ARROW-4140](https://issues.apache.org/jira/browse/ARROW-4140) - [C++][Gandiva] Compiled LLVM bitcode file path may result in libraries being non-relocatable
+* [ARROW-4145](https://issues.apache.org/jira/browse/ARROW-4145) - [C++] Find Windows-compatible strptime implementation
+* [ARROW-4181](https://issues.apache.org/jira/browse/ARROW-4181) - [Python] TestConvertStructTypes.test\_from\_numpy\_large failing
+* [ARROW-4192](https://issues.apache.org/jira/browse/ARROW-4192) - "./dev/run\_docker\_compose.sh" is out of date
+* [ARROW-4213](https://issues.apache.org/jira/browse/ARROW-4213) - [Flight] C++ and Java implementations are incompatible
+* [ARROW-4244](https://issues.apache.org/jira/browse/ARROW-4244) - Clarify language around padding/alignment
+* [ARROW-4250](https://issues.apache.org/jira/browse/ARROW-4250) - [C++][Gandiva] Use approximate comparisons for floating point numbers in gandiva-projector-test
+* [ARROW-4252](https://issues.apache.org/jira/browse/ARROW-4252) - [C++] Status error context strings missing lines of code
+* [ARROW-4253](https://issues.apache.org/jira/browse/ARROW-4253) - [GLib] Cannot use non-system Boost specified with $BOOST\_ROOT
+* [ARROW-4254](https://issues.apache.org/jira/browse/ARROW-4254) - [C++] Gandiva tests fail to compile with Boost in Ubuntu 14.04 apt
+* [ARROW-4255](https://issues.apache.org/jira/browse/ARROW-4255) - [C++] Schema::GetFieldIndex is not thread-safe
+* [ARROW-4261](https://issues.apache.org/jira/browse/ARROW-4261) - [C++] CMake paths for IPC, Flight, Thrift, and Plasma don't support using Arrow as a subproject
+* [ARROW-4264](https://issues.apache.org/jira/browse/ARROW-4264) - [C++] Document why DCHECKs are used in kernels
+* [ARROW-4267](https://issues.apache.org/jira/browse/ARROW-4267) - [Python/C++][Parquet] Segfault when reading rowgroups with duplicated columns
+* [ARROW-4274](https://issues.apache.org/jira/browse/ARROW-4274) - [Gandiva] static jni library broken after decimal changes
+* [ARROW-4275](https://issues.apache.org/jira/browse/ARROW-4275) - [C++] gandiva-decimal\_single\_test extremely slow
+* [ARROW-4280](https://issues.apache.org/jira/browse/ARROW-4280) - [C++][Documentation] It looks like flex and bison are required for parquet
+* [ARROW-4282](https://issues.apache.org/jira/browse/ARROW-4282) - [Rust] builder benchmark is broken
+* [ARROW-4284](https://issues.apache.org/jira/browse/ARROW-4284) - [C\#] File / Stream serialization fails due to type mismatch / missing footer
+* [ARROW-4295](https://issues.apache.org/jira/browse/ARROW-4295) - [Plasma] Incorrect log message when evicting objects
+* [ARROW-4296](https://issues.apache.org/jira/browse/ARROW-4296) - [Plasma] Starting Plasma store with use\_one\_memory\_mapped\_file enabled crashes due to improper memory alignment
+* [ARROW-4308](https://issues.apache.org/jira/browse/ARROW-4308) - [Python] pyarrow has a hard dependency on pandas
+* [ARROW-4311](https://issues.apache.org/jira/browse/ARROW-4311) - [Python] Regression on pq.ParquetWriter incorrectly handling source string
+* [ARROW-4312](https://issues.apache.org/jira/browse/ARROW-4312) - [C++] Lint doesn't work anymore ("[Errno 24] Too many open files")
+* [ARROW-4319](https://issues.apache.org/jira/browse/ARROW-4319) - plasma/store.h pulls in flatbuffer dependency
+* [ARROW-4320](https://issues.apache.org/jira/browse/ARROW-4320) - [C++] Add tests for non-contiguous tensors
+* [ARROW-4322](https://issues.apache.org/jira/browse/ARROW-4322) - [CI] docker nightlies fails after conda-forge compiler migration
+* [ARROW-4323](https://issues.apache.org/jira/browse/ARROW-4323) - [Packaging] Fix failing OSX clang conda-forge builds
+* [ARROW-4326](https://issues.apache.org/jira/browse/ARROW-4326) - [C++] Development instructions in python/development.rst will not work for many Linux distros with new conda-forge toolchain
+* [ARROW-4327](https://issues.apache.org/jira/browse/ARROW-4327) - [Python] Add requirements-build.txt file to simplify setting up Python build environment
+* [ARROW-4328](https://issues.apache.org/jira/browse/ARROW-4328) - Make R build compatible with -DARROW\_TENSORFLOW=ON
+* [ARROW-4329](https://issues.apache.org/jira/browse/ARROW-4329) - Python should include the parquet headers
+* [ARROW-4342](https://issues.apache.org/jira/browse/ARROW-4342) - [Gandiva][Java] spurious failures in projector cache test
+* [ARROW-4347](https://issues.apache.org/jira/browse/ARROW-4347) - [Python] Run Python Travis CI unit tests on Linux when Java codebase changed
+* [ARROW-4349](https://issues.apache.org/jira/browse/ARROW-4349) - [C++] Build all benchmarks on Windows without failing
+* [ARROW-4351](https://issues.apache.org/jira/browse/ARROW-4351) - [C++] Fail to build with static parquet
+* [ARROW-4355](https://issues.apache.org/jira/browse/ARROW-4355) - [C++] test-util functions are no longer part of libarrow
+* [ARROW-4360](https://issues.apache.org/jira/browse/ARROW-4360) - [C++] Query homebrew for Thrift
+* [ARROW-4364](https://issues.apache.org/jira/browse/ARROW-4364) - [C++] Fix -weverything -wextra compilation errors
+* [ARROW-4366](https://issues.apache.org/jira/browse/ARROW-4366) - [Docs] Change extension from format/README.md to format/README.rst
+* [ARROW-4367](https://issues.apache.org/jira/browse/ARROW-4367) - [C++] StringDictionaryBuilder segfaults on Finish with only null entries
+* [ARROW-4368](https://issues.apache.org/jira/browse/ARROW-4368) - Bintray repository signature verification fails
+* [ARROW-4370](https://issues.apache.org/jira/browse/ARROW-4370) - [Python] Table to pandas conversion fails for list of bool
+* [ARROW-4374](https://issues.apache.org/jira/browse/ARROW-4374) - [C++] DictionaryBuilder does not correctly report length and null\_count
+* [ARROW-4381](https://issues.apache.org/jira/browse/ARROW-4381) - [Docker] docker-compose build lint fails
+* [ARROW-4382](https://issues.apache.org/jira/browse/ARROW-4382) - [C++] Improve new cpplint output readability
+* [ARROW-4384](https://issues.apache.org/jira/browse/ARROW-4384) - [C++] Running "format" target on new Windows 10 install opens "how do you want to open this file" dialog
+* [ARROW-4385](https://issues.apache.org/jira/browse/ARROW-4385) - [Python] default\_version of a release should not include SNAPSHOT
+* [ARROW-4389](https://issues.apache.org/jira/browse/ARROW-4389) - [R] Installing clang-tools in CI is failing on trusty
+* [ARROW-4395](https://issues.apache.org/jira/browse/ARROW-4395) - ts-node throws type error running \`bin/arrow2csv.js\`
+* [ARROW-4400](https://issues.apache.org/jira/browse/ARROW-4400) - [CI] install of clang tools failing
+* [ARROW-4403](https://issues.apache.org/jira/browse/ARROW-4403) - [Rust] CI fails due to formatting errors
+* [ARROW-4404](https://issues.apache.org/jira/browse/ARROW-4404) - [CI] AppVeyor toolchain build does not build anything
+* [ARROW-4407](https://issues.apache.org/jira/browse/ARROW-4407) - [C++] ExternalProject\_Add does not capture CC/CXX correctly
+* [ARROW-4410](https://issues.apache.org/jira/browse/ARROW-4410) - [C++] Fix InvertKernel edge cases
+* [ARROW-4413](https://issues.apache.org/jira/browse/ARROW-4413) - [Python] pyarrow.hdfs.connect() failing
+* [ARROW-4414](https://issues.apache.org/jira/browse/ARROW-4414) - [C++] Stop using cmake COMMAND\_EXPAND\_LISTS because it breaks package builds for older distros
+* [ARROW-4417](https://issues.apache.org/jira/browse/ARROW-4417) - [C++] Doc build broken
+* [ARROW-4420](https://issues.apache.org/jira/browse/ARROW-4420) - [INTEGRATION] Make spark integration test pass and test against spark's master branch
+* [ARROW-4421](https://issues.apache.org/jira/browse/ARROW-4421) - [Flight][C++] Handle large Flight data messages
+* [ARROW-4434](https://issues.apache.org/jira/browse/ARROW-4434) - [Python] Cannot create empty StructArray via pa.StructArray.from\_arrays
+* [ARROW-4440](https://issues.apache.org/jira/browse/ARROW-4440) - [C++] Fix flatbuffers build using msvc
+* [ARROW-4457](https://issues.apache.org/jira/browse/ARROW-4457) - [Python] Cannot create Decimal128 array using integers
+* [ARROW-4469](https://issues.apache.org/jira/browse/ARROW-4469) - [Python][C++] CI Failing for Python 2.7 and 3.6 with valgrind
+* [ARROW-4471](https://issues.apache.org/jira/browse/ARROW-4471) - [C++] Pass AR and RANLIB to all external projects
+* [ARROW-4474](https://issues.apache.org/jira/browse/ARROW-4474) - [Flight] FlightInfo should use signed integer types for payload size
+* [ARROW-4480](https://issues.apache.org/jira/browse/ARROW-4480) - [Python] Drive letter removed when writing parquet file
+* [ARROW-4487](https://issues.apache.org/jira/browse/ARROW-4487) - [C++] Appveyor toolchain build does not actually build the project
+* [ARROW-4494](https://issues.apache.org/jira/browse/ARROW-4494) - [Java] arrow-jdbc JAR is not uploaded on release
+* [ARROW-4496](https://issues.apache.org/jira/browse/ARROW-4496) - [CI] CI failing for python Xcode 7.3
+* [ARROW-4498](https://issues.apache.org/jira/browse/ARROW-4498) - [Plasma] Plasma fails building with CUDA enabled
+* [ARROW-4500](https://issues.apache.org/jira/browse/ARROW-4500) - [C++] librt and pthread hacks can cause linking problems
+* [ARROW-4501](https://issues.apache.org/jira/browse/ARROW-4501) - [C++] Unique returns non-unique strings
+* [ARROW-4525](https://issues.apache.org/jira/browse/ARROW-4525) - [Rust] [Parquet] Convert ArrowError to ParquetError
+* [ARROW-4527](https://issues.apache.org/jira/browse/ARROW-4527) - [Packaging] Update linux packaging tasks to align with the LLVM 7 migration
+* [ARROW-4532](https://issues.apache.org/jira/browse/ARROW-4532) - [Java] varchar value buffer much larger than expected
+* [ARROW-4533](https://issues.apache.org/jira/browse/ARROW-4533) - [Python] Document how to run hypothesis tests
+* [ARROW-4535](https://issues.apache.org/jira/browse/ARROW-4535) - [C++] Fix MakeBuilder to preserve ListType's field name
+* [ARROW-4536](https://issues.apache.org/jira/browse/ARROW-4536) - Add data\_type argument in garrow\_list\_array\_new
+* [ARROW-4538](https://issues.apache.org/jira/browse/ARROW-4538) - [PYTHON] Remove index column from subschema in write\_to\_dataframe
+* [ARROW-4549](https://issues.apache.org/jira/browse/ARROW-4549) - [C++] Can't build benchmark code on CUDA enabled build
+* [ARROW-4550](https://issues.apache.org/jira/browse/ARROW-4550) - [JS] Fix AMD pattern
+* [ARROW-4559](https://issues.apache.org/jira/browse/ARROW-4559) - [Python] pyarrow can't read/write filenames with special characters
+* [ARROW-4563](https://issues.apache.org/jira/browse/ARROW-4563) - [Python] pa.decimal128 should validate inputs
+* [ARROW-4571](https://issues.apache.org/jira/browse/ARROW-4571) - [Format] Tensor.fbs file has multiple root\_type declarations
+* [ARROW-4573](https://issues.apache.org/jira/browse/ARROW-4573) - [Python] Add Flight unit tests
+* [ARROW-4576](https://issues.apache.org/jira/browse/ARROW-4576) - [Python] Benchmark failures
+* [ARROW-4577](https://issues.apache.org/jira/browse/ARROW-4577) - [C++] Interface link libraries declared on arrow\_shared target that are actually non-interface
+* [ARROW-4581](https://issues.apache.org/jira/browse/ARROW-4581) - [C++] gbenchmark\_ep is a dependency of unit tests when ARROW\_BUILD\_BENCHMARKS=ON
+* [ARROW-4582](https://issues.apache.org/jira/browse/ARROW-4582) - [C++/Python] Memory corruption on Pandas-\>Arrow conversion
+* [ARROW-4584](https://issues.apache.org/jira/browse/ARROW-4584) - [Python] Add built wheel to manylinux1 dockerignore
+* [ARROW-4585](https://issues.apache.org/jira/browse/ARROW-4585) - [C++] Dependency of Flight C++ sources on generated protobuf is not respected
+* [ARROW-4587](https://issues.apache.org/jira/browse/ARROW-4587) - Flight C++ DoPut segfaults
+* [ARROW-4597](https://issues.apache.org/jira/browse/ARROW-4597) - [C++] Targets for system Google Mock shared library are missing
+* [ARROW-4601](https://issues.apache.org/jira/browse/ARROW-4601) - [Python] Master build is broken due to missing license for .dockerignore
+* [ARROW-4606](https://issues.apache.org/jira/browse/ARROW-4606) - [Rust] [DataFusion] FilterRelation created RecordBatch with empty schema
+* [ARROW-4608](https://issues.apache.org/jira/browse/ARROW-4608) - [C++] cmake script assumes that double-conversion installs static libs
+* [ARROW-4617](https://issues.apache.org/jira/browse/ARROW-4617) - [C++] Support double-conversion<3.1
+* [ARROW-4624](https://issues.apache.org/jira/browse/ARROW-4624) - [C++] Linker errors when building benchmarks
+* [ARROW-4629](https://issues.apache.org/jira/browse/ARROW-4629) - [Python] Pandas to arrow conversion slowed down by local imports
+* [ARROW-4635](https://issues.apache.org/jira/browse/ARROW-4635) - [Java] StructVector growing validity buffer unnecessarily
+* [ARROW-4639](https://issues.apache.org/jira/browse/ARROW-4639) - [CI] Crossbow build failing for Gandiva jars
+* [ARROW-4641](https://issues.apache.org/jira/browse/ARROW-4641) - [C++] Flight builds complain of -Wstrict-aliasing
+* [ARROW-4642](https://issues.apache.org/jira/browse/ARROW-4642) - [R] Change \`f\` to \`file\` in \`read\_parquet\_file()\`
+* [ARROW-4653](https://issues.apache.org/jira/browse/ARROW-4653) - [C++] decimal multiply broken when both args are negative
+* [ARROW-4654](https://issues.apache.org/jira/browse/ARROW-4654) - [C++] Implicit Flight target dependencies cause compilation failure
+* [ARROW-4657](https://issues.apache.org/jira/browse/ARROW-4657) - [Release] gbenchmark should not be needed for verification
+* [ARROW-4658](https://issues.apache.org/jira/browse/ARROW-4658) - [C++] Shared gflags is also a run-time conda requirement
+* [ARROW-4659](https://issues.apache.org/jira/browse/ARROW-4659) - [CI] ubuntu/debian nightlies fail because of missing gandiva files
+* [ARROW-4660](https://issues.apache.org/jira/browse/ARROW-4660) - [C++] gflags fails to build due to CMake error
+* [ARROW-4664](https://issues.apache.org/jira/browse/ARROW-4664) - [C++] DCHECK macro conditions are evaluated in release builds
+* [ARROW-4669](https://issues.apache.org/jira/browse/ARROW-4669) - [Java] No Bounds checking on ArrowBuf.slice
+* [ARROW-4672](https://issues.apache.org/jira/browse/ARROW-4672) - [C++] clang-7 matrix entry is built using gcc
+* [ARROW-4680](https://issues.apache.org/jira/browse/ARROW-4680) - [CI] [Rust] Travis CI builds fail with latest Rust 1.34.0-nightly (2019-02-25)
+* [ARROW-4684](https://issues.apache.org/jira/browse/ARROW-4684) - [Python] CI failures in test\_cython.py
+* [ARROW-4687](https://issues.apache.org/jira/browse/ARROW-4687) - [Python] FlightServerBase.run should exit on Ctrl-C
+* [ARROW-4688](https://issues.apache.org/jira/browse/ARROW-4688) - [C++][Parquet] 16MB limit on (nested) column chunk prevents tuning row\_group\_size
+* [ARROW-4696](https://issues.apache.org/jira/browse/ARROW-4696) - Verify release script is overly optimistic with CUDA detection
+* [ARROW-4699](https://issues.apache.org/jira/browse/ARROW-4699) - [C++] json parser should not rely on null terminated buffers
+* [ARROW-4704](https://issues.apache.org/jira/browse/ARROW-4704) - [CI][GLib] Plasma test is flaky
+* [ARROW-4710](https://issues.apache.org/jira/browse/ARROW-4710) - [C++][R] New linting script skips files with "cpp" extension
+* [ARROW-4712](https://issues.apache.org/jira/browse/ARROW-4712) - [C++][CI] Clang7 Valgrind complains when shared\_ptr is not moved
+* [ARROW-4721](https://issues.apache.org/jira/browse/ARROW-4721) - [Rust] [DataFusion] Propagate schema in filter
+* [ARROW-4724](https://issues.apache.org/jira/browse/ARROW-4724) - [C++] Python not being built nor tested under MinGW builds
+* [ARROW-4728](https://issues.apache.org/jira/browse/ARROW-4728) - [JS] Failing test Table\#assign with a zero-length Null column round-trips through serialization
+* [ARROW-4737](https://issues.apache.org/jira/browse/ARROW-4737) - [C\#] tests are not running in CI
+* [ARROW-4744](https://issues.apache.org/jira/browse/ARROW-4744) - [CI][C++] Mingw32 builds failing
+* [ARROW-4750](https://issues.apache.org/jira/browse/ARROW-4750) - [C++] RapidJSON triggers Wclass-memaccess on GCC 8+
+* [ARROW-4760](https://issues.apache.org/jira/browse/ARROW-4760) - [C++] protobuf 3.7 defines EXPECT\_OK that clashes with Arrow's macro
+* [ARROW-4766](https://issues.apache.org/jira/browse/ARROW-4766) - [C++] Casting empty boolean array causes segfault
+* [ARROW-4767](https://issues.apache.org/jira/browse/ARROW-4767) - [C\#] ArrowStreamReader crashes while reading the end of a stream
+* [ARROW-4768](https://issues.apache.org/jira/browse/ARROW-4768) - [C++][CI] arrow-test-array sometimes gets stuck in MinGW build
+* [ARROW-4774](https://issues.apache.org/jira/browse/ARROW-4774) - [C++][Parquet] Call Table::Validate when writing a table
+* [ARROW-4775](https://issues.apache.org/jira/browse/ARROW-4775) - [Website] Site navbar cannot be expanded
+* [ARROW-4783](https://issues.apache.org/jira/browse/ARROW-4783) - [C++][CI] Mingw32 builds sometimes timeout
+* [ARROW-4793](https://issues.apache.org/jira/browse/ARROW-4793) - [Ruby] Suppress unused variable warning
+* [ARROW-4796](https://issues.apache.org/jira/browse/ARROW-4796) - [Flight][Python] segfault in simple server implementation
+* [ARROW-4802](https://issues.apache.org/jira/browse/ARROW-4802) - [Python] Hadoop classpath discovery broken when HADOOP\_HOME is a symlink
+* [ARROW-4807](https://issues.apache.org/jira/browse/ARROW-4807) - [Rust] Fix csv\_writer benchmark
+* [ARROW-4811](https://issues.apache.org/jira/browse/ARROW-4811) - [C++] An incorrect dependency leads "ninja" to re-evaluate steps unnecessarily on subsequent calls
+* [ARROW-4813](https://issues.apache.org/jira/browse/ARROW-4813) - [Ruby] Add tests for \#== and \#!=
+* [ARROW-4820](https://issues.apache.org/jira/browse/ARROW-4820) - [Python] Derived Hadoop classpath is not correct
+* [ARROW-4822](https://issues.apache.org/jira/browse/ARROW-4822) - [C++/Python] pyarrow.Table.equals segmentation fault on None
+* [ARROW-4828](https://issues.apache.org/jira/browse/ARROW-4828) - [Python] manylinux1 docker-compose context should be python/manylinux1
+* [ARROW-4850](https://issues.apache.org/jira/browse/ARROW-4850) - [CI] Integration test failures do not fail the Travis CI build
+* [ARROW-4853](https://issues.apache.org/jira/browse/ARROW-4853) - [Rust] Array slice doesn't work on ListArray and StructArray
+* [ARROW-4857](https://issues.apache.org/jira/browse/ARROW-4857) - [C++/Python/CI] docker-compose in manylinux1 crossbow jobs too old
+* [ARROW-4866](https://issues.apache.org/jira/browse/ARROW-4866) - [C++] zstd ExternalProject failing on Windows
+* [ARROW-4867](https://issues.apache.org/jira/browse/ARROW-4867) - [Python] Table.from\_pandas() column order not respected
+* [ARROW-4869](https://issues.apache.org/jira/browse/ARROW-4869) - [C++] Use of gmock fails in compute/kernels/util-internal-test.cc
+* [ARROW-4870](https://issues.apache.org/jira/browse/ARROW-4870) - [Ruby] gemspec has wrong msys2 dependency listed
+* [ARROW-4871](https://issues.apache.org/jira/browse/ARROW-4871) - [Flight][Java] Handle large Flight messages
+* [ARROW-4872](https://issues.apache.org/jira/browse/ARROW-4872) - [Python] Keep backward compatibility for ParquetDatasetPiece
+* [ARROW-4879](https://issues.apache.org/jira/browse/ARROW-4879) - [C++] cmake can't use conda's flatbuffers
+* [ARROW-4881](https://issues.apache.org/jira/browse/ARROW-4881) - [Python] bundle\_zlib CMake function still uses ARROW\_BUILD\_TOOLCHAIN
+* [ARROW-4900](https://issues.apache.org/jira/browse/ARROW-4900) - mingw-w64 < 5 does not have \_\_cpuidex
+* [ARROW-4903](https://issues.apache.org/jira/browse/ARROW-4903) - [C++] Building tests using only static libs not possible
+* [ARROW-4906](https://issues.apache.org/jira/browse/ARROW-4906) - [Format] Fix document to describe that SparseMatrixIndexCSR assumes indptr is sorted for each row
+* [ARROW-4918](https://issues.apache.org/jira/browse/ARROW-4918) - [C++] Add cmake-format to pre-commit
+* [ARROW-4928](https://issues.apache.org/jira/browse/ARROW-4928) - [Python] Hypothesis test failures
+* [ARROW-4931](https://issues.apache.org/jira/browse/ARROW-4931) - [C++] CMake fails on gRPC ExternalProject
+* [ARROW-4938](https://issues.apache.org/jira/browse/ARROW-4938) - [GLib] Undefined symbols error when generating GIR file
+* [ARROW-4942](https://issues.apache.org/jira/browse/ARROW-4942) - [Ruby] Remove needless omits
+* [ARROW-4948](https://issues.apache.org/jira/browse/ARROW-4948) - [JS] Nightly test failing with "Cannot assign to read only property"
+* [ARROW-4950](https://issues.apache.org/jira/browse/ARROW-4950) - [C++] Thirdparty CMake error get\_target\_property() called with non-existent target LZ4::lz4
+* [ARROW-4952](https://issues.apache.org/jira/browse/ARROW-4952) - [C++] Equals / ApproxEquals behaviour undefined on FP NaNs
+* [ARROW-4953](https://issues.apache.org/jira/browse/ARROW-4953) - [Ruby] Not loading libarrow-glib
+* [ARROW-4954](https://issues.apache.org/jira/browse/ARROW-4954) - [Python] test failure with Flight enabled
+* [ARROW-4958](https://issues.apache.org/jira/browse/ARROW-4958) - [C++] Purely static linking broken
+* [ARROW-4961](https://issues.apache.org/jira/browse/ARROW-4961) - [C++][Python] Add GTest\_SOURCE=BUNDLED to relevant build docs that use conda-forge toolchain
+* [ARROW-4962](https://issues.apache.org/jira/browse/ARROW-4962) - [C++] Warning level set to CHECKIN can't compile on modern GCC
+* [ARROW-4976](https://issues.apache.org/jira/browse/ARROW-4976) - [JS] RecordBatchReader should reset its Node/DOM streams
+* [ARROW-4982](https://issues.apache.org/jira/browse/ARROW-4982) - [GLib][CI] Run tests on AppVeyor
+* [ARROW-4984](https://issues.apache.org/jira/browse/ARROW-4984) - [Flight][C++] Flight server segfaults when port is in use
+* [ARROW-4986](https://issues.apache.org/jira/browse/ARROW-4986) - [CI] Travis fails to install llvm@7
+* [ARROW-4989](https://issues.apache.org/jira/browse/ARROW-4989) - [C++] Build fails to find Ubuntu-packaged re2 library
+* [ARROW-4991](https://issues.apache.org/jira/browse/ARROW-4991) - [CI] Bump travis node version to 11.12
+* [ARROW-4997](https://issues.apache.org/jira/browse/ARROW-4997) - [C\#] ArrowStreamReader doesn't consume whole stream and doesn't implement sync read
+* [ARROW-5009](https://issues.apache.org/jira/browse/ARROW-5009) - [C++] Clean up using-declarations for std::\* in files
+* [ARROW-5010](https://issues.apache.org/jira/browse/ARROW-5010) - [Release] Fix release script with llvm-7
+* [ARROW-5012](https://issues.apache.org/jira/browse/ARROW-5012) - [C++] "testing" headers not installed
+* [ARROW-5023](https://issues.apache.org/jira/browse/ARROW-5023) - [Release] Default value syntax in shell is wrong
+* [ARROW-5024](https://issues.apache.org/jira/browse/ARROW-5024) - [Release] crossbow.py --arrow-version causes missing variable error
+* [ARROW-5025](https://issues.apache.org/jira/browse/ARROW-5025) - [Python][Packaging] wheels for Windows are broken
+* [ARROW-5026](https://issues.apache.org/jira/browse/ARROW-5026) - [Python][Packaging] conda package on non-Windows is broken
+* [ARROW-5029](https://issues.apache.org/jira/browse/ARROW-5029) - [C++] Compilation warnings in release mode
+* [ARROW-5031](https://issues.apache.org/jira/browse/ARROW-5031) - [Dev] Release verification script does not run CUDA tests in Python
+* [ARROW-5042](https://issues.apache.org/jira/browse/ARROW-5042) - [Release] Wrong ARROW\_DEPENDENCY\_SOURCE in verification script
+* [ARROW-5043](https://issues.apache.org/jira/browse/ARROW-5043) - [Release][Ruby] red-arrow dependency can't be resolved in verification script
+* [ARROW-5044](https://issues.apache.org/jira/browse/ARROW-5044) - [Release][Rust] Format error in verification script
+* [ARROW-5046](https://issues.apache.org/jira/browse/ARROW-5046) - [Release][C++] Plasma test is fragile in verification script
+* [ARROW-5047](https://issues.apache.org/jira/browse/ARROW-5047) - [Release] Always set up parquet-testing in verification script
+* [ARROW-5048](https://issues.apache.org/jira/browse/ARROW-5048) - [Release][Rust] arrow-testing is missing in verification script
+* [ARROW-5050](https://issues.apache.org/jira/browse/ARROW-5050) - [C++] cares\_ep should build before grpc\_ep
+* [ARROW-5087](https://issues.apache.org/jira/browse/ARROW-5087) - [Debian] APT repository no longer contains libarrow-dev
+* [ARROW-5658](https://issues.apache.org/jira/browse/ARROW-5658) - [JAVA] Provide ability to resync VectorSchemaRoot if types change
+* [PARQUET-1482](https://issues.apache.org/jira/browse/PARQUET-1482) - [C++] Unable to read data from parquet file generated with parquetjs
+* [PARQUET-1494](https://issues.apache.org/jira/browse/PARQUET-1494) - [C++] Can't access parquet statistics on binary columns
+* [PARQUET-1532](https://issues.apache.org/jira/browse/PARQUET-1532) - [C++] Can't build column reader test with MinGW
+
+
+## New Features and Improvements
+
+* [ARROW-47](https://issues.apache.org/jira/browse/ARROW-47) - [C++] Consider adding a scalar type object model
+* [ARROW-331](https://issues.apache.org/jira/browse/ARROW-331) - [Python] Timeline for dropping Python 2.7 support
+* [ARROW-549](https://issues.apache.org/jira/browse/ARROW-549) - [C++] Add function to concatenate like-typed arrays
+* [ARROW-572](https://issues.apache.org/jira/browse/ARROW-572) - [C++] Apply visitor pattern in IPC metadata
+* [ARROW-585](https://issues.apache.org/jira/browse/ARROW-585) - [C++] Define public API for user-defined data types
+* [ARROW-694](https://issues.apache.org/jira/browse/ARROW-694) - [C++] Build JSON "scanner" for reading record batches from line-delimited JSON files
+* [ARROW-1425](https://issues.apache.org/jira/browse/ARROW-1425) - [Python] Document semantic differences between Spark timestamps and Arrow timestamps
+* [ARROW-1572](https://issues.apache.org/jira/browse/ARROW-1572) - [C++] Implement "value counts" kernels for tabulating value frequencies
+* [ARROW-1639](https://issues.apache.org/jira/browse/ARROW-1639) - [Python] More efficient serialization for RangeIndex in serialize\_pandas
+* [ARROW-1642](https://issues.apache.org/jira/browse/ARROW-1642) - [GLib] Build GLib using Meson in Appveyor
+* [ARROW-1807](https://issues.apache.org/jira/browse/ARROW-1807) - [JAVA] Reduce Heap Usage (Phase 3): consolidate buffers
+* [ARROW-1896](https://issues.apache.org/jira/browse/ARROW-1896) - [C++] Do not allocate memory for primitive outputs in CastKernel::Call implementation
+* [ARROW-2015](https://issues.apache.org/jira/browse/ARROW-2015) - [Java] Use Java Time and Date APIs instead of JodaTime
+* [ARROW-2022](https://issues.apache.org/jira/browse/ARROW-2022) - [Format] Add custom metadata field specific to a RecordBatch message
+* [ARROW-2112](https://issues.apache.org/jira/browse/ARROW-2112) - [C++] Enable cpplint to be run on Windows
+* [ARROW-2243](https://issues.apache.org/jira/browse/ARROW-2243) - [C++] Enable IPO/LTO
+* [ARROW-2409](https://issues.apache.org/jira/browse/ARROW-2409) - [Rust] Test for build warnings, remove current warnings
+* [ARROW-2460](https://issues.apache.org/jira/browse/ARROW-2460) - [Rust] Schema and DataType::Struct should use Vec<Rc<Field\>\>
+* [ARROW-2487](https://issues.apache.org/jira/browse/ARROW-2487) - [C++] Provide a variant of AppendValues that takes bytemaps for the nullability
+* [ARROW-2523](https://issues.apache.org/jira/browse/ARROW-2523) - [Rust] Implement CAST operations for arrays
+* [ARROW-2620](https://issues.apache.org/jira/browse/ARROW-2620) - [Rust] Integrate memory pool abstraction with rest of codebase
+* [ARROW-2627](https://issues.apache.org/jira/browse/ARROW-2627) - [Python] Add option (or some equivalent) to toggle memory mapping functionality when using parquet.ParquetFile or other read entry points
+* [ARROW-2904](https://issues.apache.org/jira/browse/ARROW-2904) - [C++] Use FirstTimeBitmapWriter instead of SetBit functions in builder.h/cc
+* [ARROW-3066](https://issues.apache.org/jira/browse/ARROW-3066) - [Wiki] Add "How to contribute" to developer wiki
+* [ARROW-3084](https://issues.apache.org/jira/browse/ARROW-3084) - [Python] Do we need to build both unicode variants of pyarrow wheels?
+* [ARROW-3107](https://issues.apache.org/jira/browse/ARROW-3107) - [C++] arrow::PrettyPrint for Column instances
+* [ARROW-3121](https://issues.apache.org/jira/browse/ARROW-3121) - [C++] Mean kernel aggregate
+* [ARROW-3123](https://issues.apache.org/jira/browse/ARROW-3123) - [C++] Incremental Count, Count Not Null aggregator
+* [ARROW-3135](https://issues.apache.org/jira/browse/ARROW-3135) - [C++] Add helper functions for validity bitmap propagation in kernel context
+* [ARROW-3149](https://issues.apache.org/jira/browse/ARROW-3149) - [C++] Use gRPC (when it exists) from conda-forge for CI builds
+* [ARROW-3162](https://issues.apache.org/jira/browse/ARROW-3162) - [Python] Enable Flight servers to be implemented in pure Python
+* [ARROW-3239](https://issues.apache.org/jira/browse/ARROW-3239) - [C++] Improve random data generation functions
+* [ARROW-3255](https://issues.apache.org/jira/browse/ARROW-3255) - [C++/Python] Migrate Travis CI jobs off Xcode 6.4
+* [ARROW-3289](https://issues.apache.org/jira/browse/ARROW-3289) - [C++] Implement DoPut command for Flight on client and server side
+* [ARROW-3292](https://issues.apache.org/jira/browse/ARROW-3292) - [C++] Test Flight RPC in Travis CI
+* [ARROW-3295](https://issues.apache.org/jira/browse/ARROW-3295) - [Packaging] Package gRPC libraries in conda-forge for use in builds, packaging
+* [ARROW-3297](https://issues.apache.org/jira/browse/ARROW-3297) - [Python] Python bindings for Flight C++ client
+* [ARROW-3311](https://issues.apache.org/jira/browse/ARROW-3311) - [R] Functions for deserializing IPC components from arrow::Buffer or from IO interface
+* [ARROW-3328](https://issues.apache.org/jira/browse/ARROW-3328) - [Flight] Allow for optional unique flight identifier to be sent with FlightGetInfo
+* [ARROW-3361](https://issues.apache.org/jira/browse/ARROW-3361) - [R] Run cpp/build-support/cpplint.py on C++ source files
+* [ARROW-3364](https://issues.apache.org/jira/browse/ARROW-3364) - [Doc] Document docker compose setup
+* [ARROW-3367](https://issues.apache.org/jira/browse/ARROW-3367) - [INTEGRATION] Port Spark integration test to the docker-compose setup
+* [ARROW-3422](https://issues.apache.org/jira/browse/ARROW-3422) - [C++] Add "toolchain" target to ensure that all required toolchain libraries are built
+* [ARROW-3434](https://issues.apache.org/jira/browse/ARROW-3434) - [Packaging] Add Apache ORC C++ library to conda-forge
+* [ARROW-3435](https://issues.apache.org/jira/browse/ARROW-3435) - [C++] Add option to use dynamic linking with re2
+* [ARROW-3511](https://issues.apache.org/jira/browse/ARROW-3511) - [Gandiva] Support input selection vectors for both projector and filter
+* [ARROW-3532](https://issues.apache.org/jira/browse/ARROW-3532) - [Python] Schema, StructType, StructArray field retrieval by name should raise warning or exception for multiple matches
+* [ARROW-3550](https://issues.apache.org/jira/browse/ARROW-3550) - [C++] Use kUnknownNullCount in NumericArray constructor
+* [ARROW-3554](https://issues.apache.org/jira/browse/ARROW-3554) - [C++] Reverse traits for C++
+* [ARROW-3594](https://issues.apache.org/jira/browse/ARROW-3594) - [Packaging] Build "cares" library in conda-forge
+* [ARROW-3595](https://issues.apache.org/jira/browse/ARROW-3595) - [Packaging] Build boringssl in conda-forge
+* [ARROW-3596](https://issues.apache.org/jira/browse/ARROW-3596) - [Packaging] Build gRPC in conda-forge
+* [ARROW-3619](https://issues.apache.org/jira/browse/ARROW-3619) - [R] Expose global thread pool options
+* [ARROW-3631](https://issues.apache.org/jira/browse/ARROW-3631) - [C\#] Add Appveyor build for C\#
+* [ARROW-3653](https://issues.apache.org/jira/browse/ARROW-3653) - [Python/C++] Support data copying between different GPU devices
+* [ARROW-3735](https://issues.apache.org/jira/browse/ARROW-3735) - [Python] Proper error handling in \_ensure\_type
+* [ARROW-3761](https://issues.apache.org/jira/browse/ARROW-3761) - [R] Bindings for CompressedInputStream, CompressedOutputStream
+* [ARROW-3763](https://issues.apache.org/jira/browse/ARROW-3763) - [C++] Write Parquet ByteArray / FixedLenByteArray reader batches directly into arrow::BinaryBuilder
+* [ARROW-3769](https://issues.apache.org/jira/browse/ARROW-3769) - [C++] Support reading non-dictionary encoded binary Parquet columns directly as DictionaryArray
+* [ARROW-3770](https://issues.apache.org/jira/browse/ARROW-3770) - [C++] Validate or add option to validate arrow::Table schema in parquet::arrow::FileWriter::WriteTable
+* [ARROW-3816](https://issues.apache.org/jira/browse/ARROW-3816) - [R] nrow.RecordBatch method
+* [ARROW-3824](https://issues.apache.org/jira/browse/ARROW-3824) - [R] Document developer workflow for building project, running unit tests in r/README.md
+* [ARROW-3838](https://issues.apache.org/jira/browse/ARROW-3838) - [Rust] Implement CSV Writer
+* [ARROW-3846](https://issues.apache.org/jira/browse/ARROW-3846) - [Gandiva] Build on Windows
+* [ARROW-3882](https://issues.apache.org/jira/browse/ARROW-3882) - [Rust] PrimitiveArray<T\> should support cast operations
+* [ARROW-3903](https://issues.apache.org/jira/browse/ARROW-3903) - [Python] Random array generator for Arrow conversion and Parquet testing
+* [ARROW-3926](https://issues.apache.org/jira/browse/ARROW-3926) - [Python] Add Gandiva bindings to Python wheels
+* [ARROW-3951](https://issues.apache.org/jira/browse/ARROW-3951) - [Go] Implement a CSV writer
+* [ARROW-3954](https://issues.apache.org/jira/browse/ARROW-3954) - [Rust] Add Slice to Array and ArrayData
+* [ARROW-3965](https://issues.apache.org/jira/browse/ARROW-3965) - [Java] JDBC-to-Arrow Conversion: Configuration Object
+* [ARROW-3966](https://issues.apache.org/jira/browse/ARROW-3966) - [Java] JDBC-to-Arrow Conversion: JDBC Metadata in Schema Fields
+* [ARROW-3972](https://issues.apache.org/jira/browse/ARROW-3972) - [C++] Update to LLVM and Clang bits to 7.0
+* [ARROW-3981](https://issues.apache.org/jira/browse/ARROW-3981) - [C++] Rename json.h
+* [ARROW-3985](https://issues.apache.org/jira/browse/ARROW-3985) - [C++] Pass -C option when compiling with ccache to avoid some warnings
+* [ARROW-4012](https://issues.apache.org/jira/browse/ARROW-4012) - [Documentation][C++] Document how to install Apache Arrow on MSYS2
+* [ARROW-4014](https://issues.apache.org/jira/browse/ARROW-4014) - [C++] Fix "LIBCMT" warnings on MSVC
+* [ARROW-4023](https://issues.apache.org/jira/browse/ARROW-4023) - [Gandiva] Address long CI times in macOS builds
+* [ARROW-4024](https://issues.apache.org/jira/browse/ARROW-4024) - [Python] Cython compilation error on cython==0.27.3
+* [ARROW-4031](https://issues.apache.org/jira/browse/ARROW-4031) - [C++] Refactor ArrayBuilder bitmap logic into TypedBufferBuilder<bool\>
+* [ARROW-4040](https://issues.apache.org/jira/browse/ARROW-4040) - [Rust] Add array\_ops method for filtering an array
+* [ARROW-4056](https://issues.apache.org/jira/browse/ARROW-4056) - [C++] Upgrade to boost-cpp 1.69.0 again
+* [ARROW-4061](https://issues.apache.org/jira/browse/ARROW-4061) - [Rust] [Parquet] Implement "spaced" version for non-dictionary encoding/decoding
+* [ARROW-4068](https://issues.apache.org/jira/browse/ARROW-4068) - [Gandiva] Support building with Xcode 6.4
+* [ARROW-4071](https://issues.apache.org/jira/browse/ARROW-4071) - [Rust] Add rustfmt as a pre-commit hook
+* [ARROW-4072](https://issues.apache.org/jira/browse/ARROW-4072) - [Rust] Set default value for PARQUET\_TEST\_DATA
+* [ARROW-4092](https://issues.apache.org/jira/browse/ARROW-4092) - [Rust] Implement common Reader / DataSource trait for CSV and Parquet
+* [ARROW-4094](https://issues.apache.org/jira/browse/ARROW-4094) - [Python] Store RangeIndex in Parquet files as metadata rather than a physical data column
+* [ARROW-4110](https://issues.apache.org/jira/browse/ARROW-4110) - [C++] Do not generate distinct cast kernels when input and output type are the same
+* [ARROW-4123](https://issues.apache.org/jira/browse/ARROW-4123) - [C++] Improve linting workflow and documentation for Windows-based developers
+* [ARROW-4124](https://issues.apache.org/jira/browse/ARROW-4124) - [C++] Abstract aggregation kernel API
+* [ARROW-4142](https://issues.apache.org/jira/browse/ARROW-4142) - [Java] JDBC-to-Arrow: JDBC Arrays
+* [ARROW-4165](https://issues.apache.org/jira/browse/ARROW-4165) - [C++] Port cpp/apidoc/Windows.md and other files to Sphinx / rst
+* [ARROW-4180](https://issues.apache.org/jira/browse/ARROW-4180) - [Java] Reduce verbose logging of ArrowBuf creation events?
+* [ARROW-4196](https://issues.apache.org/jira/browse/ARROW-4196) - [Rust] Add explicit SIMD vectorization for arithmetic ops in "array\_ops"
+* [ARROW-4198](https://issues.apache.org/jira/browse/ARROW-4198) - [Gandiva] Add support to cast timestamp
+* [ARROW-4204](https://issues.apache.org/jira/browse/ARROW-4204) - [Gandiva] Implement decimal subtract
+* [ARROW-4205](https://issues.apache.org/jira/browse/ARROW-4205) - [Gandiva] Implement decimal multiply
+* [ARROW-4206](https://issues.apache.org/jira/browse/ARROW-4206) - [Gandiva] Implement decimal divide
+* [ARROW-4212](https://issues.apache.org/jira/browse/ARROW-4212) - [Python] [CUDA] Creating a CUDA buffer from Numba device array should be easier
+* [ARROW-4230](https://issues.apache.org/jira/browse/ARROW-4230) - [C++] Enable building flight against system gRPC
+* [ARROW-4232](https://issues.apache.org/jira/browse/ARROW-4232) - [C++] Follow conda-forge compiler ABI migration
+* [ARROW-4234](https://issues.apache.org/jira/browse/ARROW-4234) - [C++] Add memory bandwidth benchmarks to arrow/util/machine-benchmark.cc
+* [ARROW-4235](https://issues.apache.org/jira/browse/ARROW-4235) - [GLib] Use "column\_builder" in GArrowRecordBatchBuilder
+* [ARROW-4236](https://issues.apache.org/jira/browse/ARROW-4236) - [JAVA] Distinct plasma client create exceptions
+* [ARROW-4245](https://issues.apache.org/jira/browse/ARROW-4245) - [Rust] Add Rustdoc header to each source file
+* [ARROW-4247](https://issues.apache.org/jira/browse/ARROW-4247) - [Packaging] Update verify script for 0.12.0
+* [ARROW-4251](https://issues.apache.org/jira/browse/ARROW-4251) - [C++] Add option to use vendored Boost in verify-release-candidate.sh
+* [ARROW-4262](https://issues.apache.org/jira/browse/ARROW-4262) - [Website] Blog post to give preview into using R and Arrow with Apache Spark
+* [ARROW-4263](https://issues.apache.org/jira/browse/ARROW-4263) - [Rust] Donate DataFusion
+* [ARROW-4265](https://issues.apache.org/jira/browse/ARROW-4265) - [C++] Automatic conversion between Table and std::vector<std::tuple<..\>\>
+* [ARROW-4268](https://issues.apache.org/jira/browse/ARROW-4268) - [C++] Add C primitive to Arrow::Type compile time in TypeTraits
+* [ARROW-4271](https://issues.apache.org/jira/browse/ARROW-4271) - [Rust] Move Parquet specific info to Parquet Readme
+* [ARROW-4273](https://issues.apache.org/jira/browse/ARROW-4273) - [Release] Fix verification script to use cf201901 conda-forge label
+* [ARROW-4277](https://issues.apache.org/jira/browse/ARROW-4277) - [C++] Add gmock to toolchain
+* [ARROW-4281](https://issues.apache.org/jira/browse/ARROW-4281) - [CI] Use Ubuntu Xenial (16.04) VMs on Travis-CI
+* [ARROW-4285](https://issues.apache.org/jira/browse/ARROW-4285) - [Python] Use proper builder interface for serialization
+* [ARROW-4287](https://issues.apache.org/jira/browse/ARROW-4287) - [C++] Ensure minimal bison version on OSX for Thrift
+* [ARROW-4289](https://issues.apache.org/jira/browse/ARROW-4289) - [C++] Forward AR and RANLIB to thirdparty builds
+* [ARROW-4290](https://issues.apache.org/jira/browse/ARROW-4290) - [C++/Gandiva] Support detecting correct LLVM version in Homebrew
+* [ARROW-4291](https://issues.apache.org/jira/browse/ARROW-4291) - [Dev] Support selecting features in release scripts
+* [ARROW-4294](https://issues.apache.org/jira/browse/ARROW-4294) - [Plasma] Add support for evicting objects to external store
+* [ARROW-4297](https://issues.apache.org/jira/browse/ARROW-4297) - [C++] Fix build for 32-bit MSYS2
+* [ARROW-4298](https://issues.apache.org/jira/browse/ARROW-4298) - [Java] Building Flight fails with OpenJDK 11
+* [ARROW-4299](https://issues.apache.org/jira/browse/ARROW-4299) - [Ruby] Depend on the same version as Red Arrow
+* [ARROW-4300](https://issues.apache.org/jira/browse/ARROW-4300) - [C++] Restore apache-arrow Homebrew recipe and define process for maintaining and updating for releases
+* [ARROW-4303](https://issues.apache.org/jira/browse/ARROW-4303) - [Gandiva/Python] Build LLVM with RTTI in manylinux1 container
+* [ARROW-4305](https://issues.apache.org/jira/browse/ARROW-4305) - [Rust] Fix parquet version number in README
+* [ARROW-4307](https://issues.apache.org/jira/browse/ARROW-4307) - [C++] Fix doxygen warnings, include doxygen warning checks in CI linting
+* [ARROW-4310](https://issues.apache.org/jira/browse/ARROW-4310) - [Website] Update install document for 0.12.0
+* [ARROW-4313](https://issues.apache.org/jira/browse/ARROW-4313) - Define general benchmark database schema
+* [ARROW-4315](https://issues.apache.org/jira/browse/ARROW-4315) - [Website] Home page of https://arrow.apache.org/ does not mention Go or Rust
+* [ARROW-4318](https://issues.apache.org/jira/browse/ARROW-4318) - [C++] Add Tensor::CountNonZero
+* [ARROW-4321](https://issues.apache.org/jira/browse/ARROW-4321) - [CI] Setup conda-forge channel globally in docker containers
+* [ARROW-4330](https://issues.apache.org/jira/browse/ARROW-4330) - [C++] Use FindThreads.cmake to handle -pthread compiler/link options
+* [ARROW-4331](https://issues.apache.org/jira/browse/ARROW-4331) - [C++] Extend Scalar Datum to support more types
+* [ARROW-4332](https://issues.apache.org/jira/browse/ARROW-4332) - [Website] Instructions and scripts for publishing web site appear to be incorrect
+* [ARROW-4334](https://issues.apache.org/jira/browse/ARROW-4334) - [CI] Setup conda-forge channel globally in travis builds
+* [ARROW-4335](https://issues.apache.org/jira/browse/ARROW-4335) - [C++] Better document sparse tensor support
+* [ARROW-4336](https://issues.apache.org/jira/browse/ARROW-4336) - [C++] Default BUILD\_WARNING\_LEVEL to CHECKIN
+* [ARROW-4339](https://issues.apache.org/jira/browse/ARROW-4339) - [C++] Rewrite cpp/README to be shorter, with a separate contribution guide
+* [ARROW-4340](https://issues.apache.org/jira/browse/ARROW-4340) - [C++] Update IWYU version in the \`lint\` dockerfile
+* [ARROW-4341](https://issues.apache.org/jira/browse/ARROW-4341) - [C++] Use TypedBufferBuilder<bool\> in BooleanBuilder
+* [ARROW-4344](https://issues.apache.org/jira/browse/ARROW-4344) - [Java] Further cleanup maven output
+* [ARROW-4345](https://issues.apache.org/jira/browse/ARROW-4345) - [C++] Add Apache 2.0 license file to the Parquet-testing repository
+* [ARROW-4346](https://issues.apache.org/jira/browse/ARROW-4346) - [C++] Fix compiler warnings with gcc 8.2.0
+* [ARROW-4352](https://issues.apache.org/jira/browse/ARROW-4352) - [C++] Add support for system Google Test
+* [ARROW-4353](https://issues.apache.org/jira/browse/ARROW-4353) - [CI] Add jobs for 32-bit and 64-bit MinGW
+* [ARROW-4358](https://issues.apache.org/jira/browse/ARROW-4358) - [Gandiva][Crossbow] Trusty build broken
+* [ARROW-4361](https://issues.apache.org/jira/browse/ARROW-4361) - [Website] Update committers list
+* [ARROW-4362](https://issues.apache.org/jira/browse/ARROW-4362) - [Java] Test OpenJDK 11 in CI
+* [ARROW-4363](https://issues.apache.org/jira/browse/ARROW-4363) - [C++] Add CMake format checks
+* [ARROW-4372](https://issues.apache.org/jira/browse/ARROW-4372) - [C++] Embed precompiled bitcode in the gandiva library
+* [ARROW-4373](https://issues.apache.org/jira/browse/ARROW-4373) - [Packaging] Travis fails to deploy conda packages on OSX
+* [ARROW-4375](https://issues.apache.org/jira/browse/ARROW-4375) - [CI] Sphinx dependencies were removed from docs conda environment
+* [ARROW-4376](https://issues.apache.org/jira/browse/ARROW-4376) - [Rust] Implement from\_buf\_reader for csv::Reader
+* [ARROW-4377](https://issues.apache.org/jira/browse/ARROW-4377) - [Rust] Implement std::fmt::Debug for all PrimitiveArrays
+* [ARROW-4379](https://issues.apache.org/jira/browse/ARROW-4379) - Register pyarrow serializers for collections.Counter and collections.deque
+* [ARROW-4383](https://issues.apache.org/jira/browse/ARROW-4383) - [C++] Use the CMake's standard find features
+* [ARROW-4386](https://issues.apache.org/jira/browse/ARROW-4386) - [Rust] Implement Date and Time Arrays
+* [ARROW-4388](https://issues.apache.org/jira/browse/ARROW-4388) - [Go] Add DimNames() method to tensor Interface?
+* [ARROW-4393](https://issues.apache.org/jira/browse/ARROW-4393) - [Rust] coding style: apply 90 characters per line limit
+* [ARROW-4396](https://issues.apache.org/jira/browse/ARROW-4396) - Update Typedoc to support TypeScript 3.2
+* [ARROW-4397](https://issues.apache.org/jira/browse/ARROW-4397) - [C++] dim\_names in Tensor and SparseTensor
+* [ARROW-4399](https://issues.apache.org/jira/browse/ARROW-4399) - [C++] Remove usage of "extern template class" from NumericArray<T\>
+* [ARROW-4401](https://issues.apache.org/jira/browse/ARROW-4401) - [Python] Alpine dockerfile fails to build because pandas requires numpy as build dependency
+* [ARROW-4406](https://issues.apache.org/jira/browse/ARROW-4406) - Ignore "\*\_$folder$" files on S3
+* [ARROW-4408](https://issues.apache.org/jira/browse/ARROW-4408) - [CPP/Doc] Remove outdated Parquet documentation
+* [ARROW-4422](https://issues.apache.org/jira/browse/ARROW-4422) - [Plasma] Enforce memory limit in plasma, rather than relying on dlmalloc\_set\_footprint\_limit
+* [ARROW-4423](https://issues.apache.org/jira/browse/ARROW-4423) - [C++] Update version of vendored gtest to 1.8.1
+* [ARROW-4424](https://issues.apache.org/jira/browse/ARROW-4424) - [Python] Manylinux CI builds failing
+* [ARROW-4425](https://issues.apache.org/jira/browse/ARROW-4425) - Add link to 'Contributing' page in the top-level Arrow README
+* [ARROW-4430](https://issues.apache.org/jira/browse/ARROW-4430) - [C++] Add unit test for currently unused append method
+* [ARROW-4431](https://issues.apache.org/jira/browse/ARROW-4431) - [C++] Build gRPC as ExternalProject without allowing it to build its vendored dependencies
+* [ARROW-4435](https://issues.apache.org/jira/browse/ARROW-4435) - [C\#] Add .sln file and minor .csproj fix ups
+* [ARROW-4436](https://issues.apache.org/jira/browse/ARROW-4436) - [Documentation] Clarify instructions for building documentation
+* [ARROW-4442](https://issues.apache.org/jira/browse/ARROW-4442) - [JS] Overly broad type annotation for Chunked typeId leading to type mismatches in generated typing
+* [ARROW-4444](https://issues.apache.org/jira/browse/ARROW-4444) - [Testing] Add DataFusion test files to arrow-testing repo
+* [ARROW-4445](https://issues.apache.org/jira/browse/ARROW-4445) - [C++][Gandiva] Run Gandiva-LLVM tests in Appveyor
+* [ARROW-4446](https://issues.apache.org/jira/browse/ARROW-4446) - [Python] Run Gandiva tests on Windows and Appveyor
+* [ARROW-4448](https://issues.apache.org/jira/browse/ARROW-4448) - [JAVA][Flight] Flaky Flight java test
+* [ARROW-4449](https://issues.apache.org/jira/browse/ARROW-4449) - [Rust] Convert File to T: Read + Seek for schema inference
+* [ARROW-4454](https://issues.apache.org/jira/browse/ARROW-4454) - [C++] Fix unused parameter warnings
+* [ARROW-4455](https://issues.apache.org/jira/browse/ARROW-4455) - [Plasma] g++ 8 reports class-memaccess warnings
+* [ARROW-4459](https://issues.apache.org/jira/browse/ARROW-4459) - [Testing] Add git submodule for arrow-testing data files
+* [ARROW-4460](https://issues.apache.org/jira/browse/ARROW-4460) - [Website] Write blog post to announce DataFusion donation
+* [ARROW-4461](https://issues.apache.org/jira/browse/ARROW-4461) - [C++] Expose bit-util methods for binary boolean operations that don't allocate
+* [ARROW-4462](https://issues.apache.org/jira/browse/ARROW-4462) - [C++] Upgrade LZ4 v1.7.5 to v1.8.3 to compile with VS2017
+* [ARROW-4464](https://issues.apache.org/jira/browse/ARROW-4464) - [Rust] [DataFusion] Add support for LIMIT
+* [ARROW-4466](https://issues.apache.org/jira/browse/ARROW-4466) - [Rust] [DataFusion] Add support for Parquet data sources
+* [ARROW-4468](https://issues.apache.org/jira/browse/ARROW-4468) - [Rust] Implement BitAnd/BitOr for &Buffer (with SIMD)
+* [ARROW-4472](https://issues.apache.org/jira/browse/ARROW-4472) - [Website][Python] Blog post about Python string memory use improvements in 0.12
+* [ARROW-4475](https://issues.apache.org/jira/browse/ARROW-4475) - [Python] Serializing objects that contain themselves
+* [ARROW-4476](https://issues.apache.org/jira/browse/ARROW-4476) - [Rust] [DataFusion] Post donation clean up tasks
+* [ARROW-4481](https://issues.apache.org/jira/browse/ARROW-4481) - [Website] Instructions for publishing web site are missing a step
+* [ARROW-4483](https://issues.apache.org/jira/browse/ARROW-4483) - [Website] Fix broken link (author) in DataFusion blog post
+* [ARROW-4485](https://issues.apache.org/jira/browse/ARROW-4485) - [CI] Determine maintenance approach to pinned conda-forge binutils package
+* [ARROW-4486](https://issues.apache.org/jira/browse/ARROW-4486) - [Python][CUDA] pyarrow.cuda.Context.foreign\_buffer should have a \`base=None\` argument
+* [ARROW-4488](https://issues.apache.org/jira/browse/ARROW-4488) - [Rust] From AsRef<[u8]\> for Buffer does not ensure correct padding
+* [ARROW-4489](https://issues.apache.org/jira/browse/ARROW-4489) - [Rust] PrimitiveArray.value\_slice performs bounds checking when it should not
+* [ARROW-4490](https://issues.apache.org/jira/browse/ARROW-4490) - [Rust] Add explicit SIMD vectorization for boolean ops in "array\_ops"
+* [ARROW-4491](https://issues.apache.org/jira/browse/ARROW-4491) - [Python] Remove usage of std::to\_string and std::stoi
+* [ARROW-4499](https://issues.apache.org/jira/browse/ARROW-4499) - [Python][CI] Upgrade to latest flake8 3.7.5 in travis\_lint.sh
+* [ARROW-4502](https://issues.apache.org/jira/browse/ARROW-4502) - [C\#] Add support for zero-copy reads
+* [ARROW-4506](https://issues.apache.org/jira/browse/ARROW-4506) - [Ruby] Add Arrow::RecordBatch\#raw\_records
+* [ARROW-4513](https://issues.apache.org/jira/browse/ARROW-4513) - [Rust] Implement BitAnd/BitOr for &Bitmap
+* [ARROW-4517](https://issues.apache.org/jira/browse/ARROW-4517) - [JS] Remove version number as it is not used
+* [ARROW-4518](https://issues.apache.org/jira/browse/ARROW-4518) - [JS] Add jsdelivr to package.json
+* [ARROW-4528](https://issues.apache.org/jira/browse/ARROW-4528) - [C++] Update lint docker container to LLVM-7
+* [ARROW-4529](https://issues.apache.org/jira/browse/ARROW-4529) - [C++] Add test coverage for BitUtils::RoundDown
+* [ARROW-4531](https://issues.apache.org/jira/browse/ARROW-4531) - [C++] Handling of non-aligned slices in Sum kernel
+* [ARROW-4537](https://issues.apache.org/jira/browse/ARROW-4537) - [CI] Suppress shell warning on travis-ci
+* [ARROW-4539](https://issues.apache.org/jira/browse/ARROW-4539) - [Java] List vector child value count not set correctly
+* [ARROW-4540](https://issues.apache.org/jira/browse/ARROW-4540) - [Rust] Add basic JSON reader
+* [ARROW-4543](https://issues.apache.org/jira/browse/ARROW-4543) - [C\#] Update Flat Buffers code to latest version
+* [ARROW-4546](https://issues.apache.org/jira/browse/ARROW-4546) - [C++] LICENSE.txt should be updated
+* [ARROW-4547](https://issues.apache.org/jira/browse/ARROW-4547) - [Python][Documentation] Update python/development.rst with instructions for CUDA-enabled builds
+* [ARROW-4556](https://issues.apache.org/jira/browse/ARROW-4556) - [Rust] Preserve order of JSON inferred schema
+* [ARROW-4558](https://issues.apache.org/jira/browse/ARROW-4558) - [C++][Flight] Avoid undefined behavior with gRPC memory optimizations
+* [ARROW-4560](https://issues.apache.org/jira/browse/ARROW-4560) - [R] array() needs to take single input, not ...
+* [ARROW-4562](https://issues.apache.org/jira/browse/ARROW-4562) - [C++][Flight] Create outgoing composite grpc::ByteBuffer instead of allocating contiguous slice and copying IpcPayload into it
+* [ARROW-4564](https://issues.apache.org/jira/browse/ARROW-4564) - [C++] IWYU docker image silently fails
+* [ARROW-4565](https://issues.apache.org/jira/browse/ARROW-4565) - [R] Reading records with all non-null decimals SEGFAULTs
+* [ARROW-4568](https://issues.apache.org/jira/browse/ARROW-4568) - [C++] Add version macros to headers
+* [ARROW-4572](https://issues.apache.org/jira/browse/ARROW-4572) - [C++] Remove memory zeroing from PrimitiveAllocatingUnaryKernel
+* [ARROW-4583](https://issues.apache.org/jira/browse/ARROW-4583) - [Plasma] There are bugs reported by code scan tool
+* [ARROW-4586](https://issues.apache.org/jira/browse/ARROW-4586) - [Rust] Remove arrow/mod.rs as it is not needed
+* [ARROW-4589](https://issues.apache.org/jira/browse/ARROW-4589) - [Rust] [DataFusion] Implement projection push down query optimizer rule
+* [ARROW-4590](https://issues.apache.org/jira/browse/ARROW-4590) - [Rust] Add explicit SIMD vectorization for comparison ops in "array\_ops"
+* [ARROW-4592](https://issues.apache.org/jira/browse/ARROW-4592) - [GLib] Stop configure immediately when GLib isn't available
+* [ARROW-4593](https://issues.apache.org/jira/browse/ARROW-4593) - [Ruby] Arrow::Array\#[out\_of\_range] returns nil
+* [ARROW-4594](https://issues.apache.org/jira/browse/ARROW-4594) - [Ruby] Arrow::StructArray\#[] returns Arrow::Struct instead of Arrow::Array
+* [ARROW-4595](https://issues.apache.org/jira/browse/ARROW-4595) - [Rust] [DataFusion] Implement DataFrame style API
+* [ARROW-4598](https://issues.apache.org/jira/browse/ARROW-4598) - [CI] Remove needless LLVM\_DIR for macOS
+* [ARROW-4599](https://issues.apache.org/jira/browse/ARROW-4599) - [C++] Add support for system GFlags
+* [ARROW-4602](https://issues.apache.org/jira/browse/ARROW-4602) - [Rust] [DataFusion] Integrate query optimizer with ExecutionContext
+* [ARROW-4603](https://issues.apache.org/jira/browse/ARROW-4603) - [Rust] [DataFusion] Execution context should allow in-memory data sources to be registered
+* [ARROW-4604](https://issues.apache.org/jira/browse/ARROW-4604) - [Rust] [DataFusion] Add benchmarks for SQL query execution
+* [ARROW-4605](https://issues.apache.org/jira/browse/ARROW-4605) - [Rust] Move filter and limit code from DataFusion into compute module
+* [ARROW-4609](https://issues.apache.org/jira/browse/ARROW-4609) - [C++] Use google benchmark from toolchain
+* [ARROW-4610](https://issues.apache.org/jira/browse/ARROW-4610) - [Plasma] Avoid JNI from crashing
+* [ARROW-4611](https://issues.apache.org/jira/browse/ARROW-4611) - [C++] Rework CMake third-party logic
+* [ARROW-4612](https://issues.apache.org/jira/browse/ARROW-4612) - [Python] Use cython from PyPI for windows wheels build
+* [ARROW-4613](https://issues.apache.org/jira/browse/ARROW-4613) - [C++] Alpine build failing as libgtestd.so is not found
+* [ARROW-4614](https://issues.apache.org/jira/browse/ARROW-4614) - [C++/CI] Activate flight build in ci/docker\_build\_cpp.sh
+* [ARROW-4615](https://issues.apache.org/jira/browse/ARROW-4615) - [C++] Add checked\_pointer\_cast
+* [ARROW-4616](https://issues.apache.org/jira/browse/ARROW-4616) - [C++] Log message in BuildUtils as STATUS
+* [ARROW-4618](https://issues.apache.org/jira/browse/ARROW-4618) - [Docker] Makefile to build dependent docker images
+* [ARROW-4619](https://issues.apache.org/jira/browse/ARROW-4619) - [R] Fix the autobrew script
+* [ARROW-4620](https://issues.apache.org/jira/browse/ARROW-4620) - [C\#] Add unit tests for "Types" in arrow/csharp
+* [ARROW-4623](https://issues.apache.org/jira/browse/ARROW-4623) - [R] Update Rcpp dependency
+* [ARROW-4628](https://issues.apache.org/jira/browse/ARROW-4628) - [Rust] [DataFusion] Implement type coercion query optimizer rule
+* [ARROW-4632](https://issues.apache.org/jira/browse/ARROW-4632) - [Ruby] Add BigDecimal\#to\_arrow
+* [ARROW-4634](https://issues.apache.org/jira/browse/ARROW-4634) - [Rust] [Parquet] Reorganize test\_common mod to allow more test util code
+* [ARROW-4637](https://issues.apache.org/jira/browse/ARROW-4637) - [Python] Avoid importing Pandas unless necessary
+* [ARROW-4638](https://issues.apache.org/jira/browse/ARROW-4638) - [R] Install instructions using brew
+* [ARROW-4640](https://issues.apache.org/jira/browse/ARROW-4640) - [Python] Add docker-compose configuration to build and test the project without pandas installed
+* [ARROW-4643](https://issues.apache.org/jira/browse/ARROW-4643) - [C++] Add compiler diagnostic color when using Ninja
+* [ARROW-4644](https://issues.apache.org/jira/browse/ARROW-4644) - [C++/Docker] Build Gandiva in the docker containers
+* [ARROW-4645](https://issues.apache.org/jira/browse/ARROW-4645) - [C++/Packaging] Ship Gandiva with OSX and Windows wheels
+* [ARROW-4646](https://issues.apache.org/jira/browse/ARROW-4646) - [C++/Packaging] Ship gandiva with the conda-forge packages
+* [ARROW-4655](https://issues.apache.org/jira/browse/ARROW-4655) - [Packaging] Parallelize binary upload
+* [ARROW-4662](https://issues.apache.org/jira/browse/ARROW-4662) - [Python] Add type\_codes property in UnionType
+* [ARROW-4667](https://issues.apache.org/jira/browse/ARROW-4667) - [C++] Suppress unused function warnings with MinGW
+* [ARROW-4670](https://issues.apache.org/jira/browse/ARROW-4670) - [Rust] compute::sum performance issue
+* [ARROW-4671](https://issues.apache.org/jira/browse/ARROW-4671) - [C++] MakeBuilder doesn't support Type::DICTIONARY
+* [ARROW-4673](https://issues.apache.org/jira/browse/ARROW-4673) - [C++] Implement AssertDatumEquals
+* [ARROW-4676](https://issues.apache.org/jira/browse/ARROW-4676) - [C++] Add support for debug build with MinGW
+* [ARROW-4678](https://issues.apache.org/jira/browse/ARROW-4678) - [Rust] Minimize unstable feature usage
+* [ARROW-4679](https://issues.apache.org/jira/browse/ARROW-4679) - [Rust] [DataFusion] Implement in-memory DataSource
+* [ARROW-4681](https://issues.apache.org/jira/browse/ARROW-4681) - [Rust] [DataFusion] Implement parallel query execution using threads
+* [ARROW-4686](https://issues.apache.org/jira/browse/ARROW-4686) - Only accept 'y' or 'n' in merge\_arrow\_pr.py prompts
+* [ARROW-4689](https://issues.apache.org/jira/browse/ARROW-4689) - [Go] Add support for WASM
+* [ARROW-4690](https://issues.apache.org/jira/browse/ARROW-4690) - [Python] Building TensorFlow compatible wheels for Arrow
+* [ARROW-4692](https://issues.apache.org/jira/browse/ARROW-4692) - [Format][Documentation] Add more details about "sidecar" to flight proto
+* [ARROW-4693](https://issues.apache.org/jira/browse/ARROW-4693) - [CI] Build boost library with multi precision
+* [ARROW-4697](https://issues.apache.org/jira/browse/ARROW-4697) - [C++] Add URI parsing facility
+* [ARROW-4703](https://issues.apache.org/jira/browse/ARROW-4703) - [C++] Upgrade dependency versions
+* [ARROW-4705](https://issues.apache.org/jira/browse/ARROW-4705) - [Rust] CSV reader should show line number and error message when failing to parse a line
+* [ARROW-4707](https://issues.apache.org/jira/browse/ARROW-4707) - [C++] Move BitsetStack to bit-util.h
+* [ARROW-4718](https://issues.apache.org/jira/browse/ARROW-4718) - Add ArrowStreamWriter/Reader ctors that leave open the underlying Stream
+* [ARROW-4727](https://issues.apache.org/jira/browse/ARROW-4727) - [Rust] Implement ability to check if two schemas are the same
+* [ARROW-4730](https://issues.apache.org/jira/browse/ARROW-4730) - [C++] Add docker-compose entry for testing Fedora build with system packages
+* [ARROW-4731](https://issues.apache.org/jira/browse/ARROW-4731) - [C++] Add docker-compose entry for testing Ubuntu Xenial build with system packages
+* [ARROW-4732](https://issues.apache.org/jira/browse/ARROW-4732) - [C++] Add docker-compose entry for testing Debian Testing build with system packages
+* [ARROW-4733](https://issues.apache.org/jira/browse/ARROW-4733) - [C++] Add CI entry that builds without the conda-forge toolchain but with system packages
+* [ARROW-4734](https://issues.apache.org/jira/browse/ARROW-4734) - [Go] Add option to write a header for CSV writer
+* [ARROW-4735](https://issues.apache.org/jira/browse/ARROW-4735) - [Go] Benchmark strconv.Format vs. fmt.Sprintf for CSV writer
+* [ARROW-4739](https://issues.apache.org/jira/browse/ARROW-4739) - [Rust] [DataFusion] It should be possible to share a logical plan between threads
+* [ARROW-4740](https://issues.apache.org/jira/browse/ARROW-4740) - [Java] Upgrade to JUnit 5
+* [ARROW-4743](https://issues.apache.org/jira/browse/ARROW-4743) - [Java] Fix documentation in arrow memory module
+* [ARROW-4745](https://issues.apache.org/jira/browse/ARROW-4745) - [C++][Documentation] Document process for replicating static\_crt builds on windows
+* [ARROW-4749](https://issues.apache.org/jira/browse/ARROW-4749) - [Rust] RecordBatch::new() should return result instead of panicking
+* [ARROW-4751](https://issues.apache.org/jira/browse/ARROW-4751) - [C++] Add pkg-config to conda\_env\_cpp.yml
+* [ARROW-4754](https://issues.apache.org/jira/browse/ARROW-4754) - [CI][Java] Flaky TestAuth Flight test
+* [ARROW-4756](https://issues.apache.org/jira/browse/ARROW-4756) - [CI] Document the procedure to update docker image for manylinux1 builds
+* [ARROW-4758](https://issues.apache.org/jira/browse/ARROW-4758) - [Flight] Build fails on Mac due to missing Schema\_generated.h
+* [ARROW-4769](https://issues.apache.org/jira/browse/ARROW-4769) - [Rust] Improve array limit function where max records \> len
+* [ARROW-4772](https://issues.apache.org/jira/browse/ARROW-4772) - Provide new ORC adapter interface that allows the user to specify row number
+* [ARROW-4776](https://issues.apache.org/jira/browse/ARROW-4776) - [C++] DictionaryBuilder should support bootstrapping from an existing dict type
+* [ARROW-4777](https://issues.apache.org/jira/browse/ARROW-4777) - [C++/Python] manylinux1: Update lz4 to 1.8.3
+* [ARROW-4778](https://issues.apache.org/jira/browse/ARROW-4778) - [C++/Python] manylinux1: Update Thrift to 0.12.0
+* [ARROW-4782](https://issues.apache.org/jira/browse/ARROW-4782) - [C++] Prototype scalar and array expression types for developing deferred operator algebra
+* [ARROW-4786](https://issues.apache.org/jira/browse/ARROW-4786) - [C++/Python] Support better parallelisation in manylinux1 base build
+* [ARROW-4789](https://issues.apache.org/jira/browse/ARROW-4789) - [C++] Deprecate and later remove arrow::io::ReadableFileInterface
+* [ARROW-4790](https://issues.apache.org/jira/browse/ARROW-4790) - [Python/Packaging] Update manylinux docker image in crossbow task
+* [ARROW-4791](https://issues.apache.org/jira/browse/ARROW-4791) - Unused dependencies in arrow and datafusion
+* [ARROW-4794](https://issues.apache.org/jira/browse/ARROW-4794) - [Python] Make pandas an optional test dependency
+* [ARROW-4797](https://issues.apache.org/jira/browse/ARROW-4797) - [Plasma] Avoid store crash if not enough memory is available
+* [ARROW-4801](https://issues.apache.org/jira/browse/ARROW-4801) - [GLib] Suppress pkgconfig.generate() warnings
+* [ARROW-4808](https://issues.apache.org/jira/browse/ARROW-4808) - [Java][Vector] Convenience methods for setting decimal vector
+* [ARROW-4812](https://issues.apache.org/jira/browse/ARROW-4812) - [Rust] [DataFusion] Table.scan() should return one iterator per partition
+* [ARROW-4817](https://issues.apache.org/jira/browse/ARROW-4817) - [Rust] [DataFusion] Small re-org of modules
+* [ARROW-4818](https://issues.apache.org/jira/browse/ARROW-4818) - [Rust] [DataFusion] Parquet data source does not support null values
+* [ARROW-4826](https://issues.apache.org/jira/browse/ARROW-4826) - [Go] Export Flush method for CSV writer
+* [ARROW-4831](https://issues.apache.org/jira/browse/ARROW-4831) - [C++] CMAKE\_AR is not passed to ZSTD thirdparty dependency
+* [ARROW-4833](https://issues.apache.org/jira/browse/ARROW-4833) - [Release] Document how to update the brew formula in the release management guide
+* [ARROW-4834](https://issues.apache.org/jira/browse/ARROW-4834) - [R] Feature flag to disable parquet
+* [ARROW-4835](https://issues.apache.org/jira/browse/ARROW-4835) - [GLib] Add boolean operations
+* [ARROW-4837](https://issues.apache.org/jira/browse/ARROW-4837) - [C++] Support c++filt on a custom path in the run-test.sh script
+* [ARROW-4839](https://issues.apache.org/jira/browse/ARROW-4839) - [C\#] Add NuGet support
+* [ARROW-4843](https://issues.apache.org/jira/browse/ARROW-4843) - [Rust] [DataFusion] Parquet data source should support DATE
+* [ARROW-4846](https://issues.apache.org/jira/browse/ARROW-4846) - [Java] Update Jackson to 2.9.8
+* [ARROW-4849](https://issues.apache.org/jira/browse/ARROW-4849) - [C++] Add docker-compose entry for testing Ubuntu Bionic build with system packages
+* [ARROW-4854](https://issues.apache.org/jira/browse/ARROW-4854) - [Rust] Use Array Slice for limit kernel
+* [ARROW-4855](https://issues.apache.org/jira/browse/ARROW-4855) - [Packaging] Generate default package version based on cpp tags in crossbow.py
+* [ARROW-4858](https://issues.apache.org/jira/browse/ARROW-4858) - [Flight][Python] Enable custom FlightDataStream in Python
+* [ARROW-4859](https://issues.apache.org/jira/browse/ARROW-4859) - [GLib] Add garrow\_numeric\_array\_mean()
+* [ARROW-4862](https://issues.apache.org/jira/browse/ARROW-4862) - [GLib] Add GArrowCastOptions::allow-invalid-utf8 property
+* [ARROW-4865](https://issues.apache.org/jira/browse/ARROW-4865) - [Rust] Support casting lists and primitives to lists
+* [ARROW-4873](https://issues.apache.org/jira/browse/ARROW-4873) - [C++] Clarify documentation about how to use external ARROW\_PACKAGE\_PREFIX while also using CONDA dependency resolution
+* [ARROW-4878](https://issues.apache.org/jira/browse/ARROW-4878) - [C++] ARROW\_DEPENDENCY\_SOURCE=CONDA does not work properly with MSVC
+* [ARROW-4882](https://issues.apache.org/jira/browse/ARROW-4882) - [GLib] Add "Sum" functions
+* [ARROW-4887](https://issues.apache.org/jira/browse/ARROW-4887) - [GLib] Add garrow\_array\_count()
+* [ARROW-4889](https://issues.apache.org/jira/browse/ARROW-4889) - [C++] Add STATUS messages for Protobuf in CMake
+* [ARROW-4891](https://issues.apache.org/jira/browse/ARROW-4891) - [C++] ZLIB include directories not added
+* [ARROW-4892](https://issues.apache.org/jira/browse/ARROW-4892) - [Rust] [DataFusion] Move SQL parser and planner into sql package
+* [ARROW-4893](https://issues.apache.org/jira/browse/ARROW-4893) - [C++] conda packages should use $PREFIX inside of conda-build
+* [ARROW-4894](https://issues.apache.org/jira/browse/ARROW-4894) - [Rust] [DataFusion] Remove all uses of panic! from aggregate.rs
+* [ARROW-4895](https://issues.apache.org/jira/browse/ARROW-4895) - [Rust] [DataFusion] Move error.rs to top level package
+* [ARROW-4896](https://issues.apache.org/jira/browse/ARROW-4896) - [Rust] [DataFusion] Remove all uses of panic! from tests
+* [ARROW-4897](https://issues.apache.org/jira/browse/ARROW-4897) - [Rust] [DataFusion] Improve Rustdoc
+* [ARROW-4898](https://issues.apache.org/jira/browse/ARROW-4898) - [C++] Old versions of FindProtobuf.cmake use ALL-CAPS for variables
+* [ARROW-4899](https://issues.apache.org/jira/browse/ARROW-4899) - [Rust] [DataFusion] Remove all uses of panic! from expression.rs
+* [ARROW-4901](https://issues.apache.org/jira/browse/ARROW-4901) - [Go] Run tests in Appveyor
+* [ARROW-4905](https://issues.apache.org/jira/browse/ARROW-4905) - [C++][Plasma] Remove dlmalloc from client library
+* [ARROW-4907](https://issues.apache.org/jira/browse/ARROW-4907) - [CI] Add docker container to inspect docker context
+* [ARROW-4908](https://issues.apache.org/jira/browse/ARROW-4908) - [Rust] [DataFusion] Add support for parquet date/time in int32/64 encoding
+* [ARROW-4909](https://issues.apache.org/jira/browse/ARROW-4909) - [CI] Use hadolint to lint Dockerfiles
+* [ARROW-4910](https://issues.apache.org/jira/browse/ARROW-4910) - [Rust] [DataFusion] Remove all uses of unimplemented!
+* [ARROW-4915](https://issues.apache.org/jira/browse/ARROW-4915) - [GLib] Add support for arrow::NullBuilder
+* [ARROW-4922](https://issues.apache.org/jira/browse/ARROW-4922) - [Packaging] Use system libraries for .deb and .rpm
+* [ARROW-4924](https://issues.apache.org/jira/browse/ARROW-4924) - [Ruby] Add Decimal128\#to\_s(scale=nil)
+* [ARROW-4925](https://issues.apache.org/jira/browse/ARROW-4925) - [Rust] [DataFusion] Remove duplicate implementations of collect\_expr
+* [ARROW-4926](https://issues.apache.org/jira/browse/ARROW-4926) - [Rust] [DataFusion] Update README for 0.13.0 release
+* [ARROW-4929](https://issues.apache.org/jira/browse/ARROW-4929) - [GLib] Add garrow\_array\_count\_values()
+* [ARROW-4932](https://issues.apache.org/jira/browse/ARROW-4932) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE macro
+* [ARROW-4933](https://issues.apache.org/jira/browse/ARROW-4933) - [R] Autodetect Parquet support using pkg-config
+* [ARROW-4937](https://issues.apache.org/jira/browse/ARROW-4937) - [R] Clean pkg-config related logic
+* [ARROW-4939](https://issues.apache.org/jira/browse/ARROW-4939) - [Python] Add wrapper for "sum" kernel
+* [ARROW-4940](https://issues.apache.org/jira/browse/ARROW-4940) - [Rust] Enhance documentation for datafusion
+* [ARROW-4944](https://issues.apache.org/jira/browse/ARROW-4944) - [C++] Raise minimal required thrift-cpp to 0.11 in conda environment
+* [ARROW-4946](https://issues.apache.org/jira/browse/ARROW-4946) - [C++] Support detection of flatbuffers without FlatbuffersConfig.cmake
+* [ARROW-4947](https://issues.apache.org/jira/browse/ARROW-4947) - [Flight][C++/Python] Remove redundant schema parameter in DoGet
+* [ARROW-4951](https://issues.apache.org/jira/browse/ARROW-4951) - [C++] Turn off cpp benchmarks in cpp docker images
+* [ARROW-4955](https://issues.apache.org/jira/browse/ARROW-4955) - [GLib] Add garrow\_file\_is\_closed()
+* [ARROW-4964](https://issues.apache.org/jira/browse/ARROW-4964) - [Ruby] Add closed check if available on auto close
+* [ARROW-4969](https://issues.apache.org/jira/browse/ARROW-4969) - [C++] Set RPATH in correct order for test executables on OSX
+* [ARROW-4977](https://issues.apache.org/jira/browse/ARROW-4977) - [Ruby] Add support for building on Windows
+* [ARROW-4978](https://issues.apache.org/jira/browse/ARROW-4978) - [Ruby] Fix wrong internal variable name for table data
+* [ARROW-4979](https://issues.apache.org/jira/browse/ARROW-4979) - [GLib] Add missing lock to garrow::GIOInputStream
+* [ARROW-4980](https://issues.apache.org/jira/browse/ARROW-4980) - [GLib] Use GInputStream as the parent of GArrowInputStream
+* [ARROW-4981](https://issues.apache.org/jira/browse/ARROW-4981) - [Ruby] Add support for CSV data encoding conversion
+* [ARROW-4983](https://issues.apache.org/jira/browse/ARROW-4983) - [Plasma] Unmap memory when the client is destroyed
+* [ARROW-4994](https://issues.apache.org/jira/browse/ARROW-4994) - [Website] Update details for ptgoetz
+* [ARROW-4995](https://issues.apache.org/jira/browse/ARROW-4995) - [R] Make sure winbuilder tests pass for package
+* [ARROW-4996](https://issues.apache.org/jira/browse/ARROW-4996) - [Plasma] There are many log files in /tmp
+* [ARROW-5003](https://issues.apache.org/jira/browse/ARROW-5003) - [R] Remove dependency on withr
+* [ARROW-5006](https://issues.apache.org/jira/browse/ARROW-5006) - [R] parquet.cpp does not include enough Rcpp
+* [ARROW-5011](https://issues.apache.org/jira/browse/ARROW-5011) - [Release] Add support in the source release script for custom hash
+* [ARROW-5013](https://issues.apache.org/jira/browse/ARROW-5013) - [Rust] [DataFusion] Refactor runtime expression support
+* [ARROW-5014](https://issues.apache.org/jira/browse/ARROW-5014) - [Java] Fix typos in Flight module
+* [ARROW-5018](https://issues.apache.org/jira/browse/ARROW-5018) - [Release] Include JavaScript implementation
+* [ARROW-5032](https://issues.apache.org/jira/browse/ARROW-5032) - [C++] Headers in vendored/datetime directory aren't installed
+* [ARROW-5041](https://issues.apache.org/jira/browse/ARROW-5041) - [Release][C++] use bundled gtest and gmock in verify-release-candidate.bat
+* [ARROW-5075](https://issues.apache.org/jira/browse/ARROW-5075) - [Release] Add 0.13.0 release note
+* [ARROW-5084](https://issues.apache.org/jira/browse/ARROW-5084) - [Website] Blog post / release announcement for 0.13.0
+* [PARQUET-1477](https://issues.apache.org/jira/browse/PARQUET-1477) - Thrift crypto updates
+* [PARQUET-1508](https://issues.apache.org/jira/browse/PARQUET-1508) - [C++] Enable reading from ByteArray and FixedLenByteArray decoders directly into arrow::BinaryBuilder or arrow::BinaryDictionaryBuilder
+* [PARQUET-1519](https://issues.apache.org/jira/browse/PARQUET-1519) - [C++] Remove use of "extern template class" from parquet/column\_reader.h
+* [PARQUET-1521](https://issues.apache.org/jira/browse/PARQUET-1521) - [C++] Do not use "extern template class" with parquet::ColumnWriter
+* [PARQUET-1525](https://issues.apache.org/jira/browse/PARQUET-1525) - [C++] Remove dependency on getopt in parquet tools
+
+
+
+# Apache Arrow 0.12.1 (2019-02-25)
+
+## Bug Fixes
+
+* [ARROW-3564](https://issues.apache.org/jira/browse/ARROW-3564) - [Python] Writing version 2.0 Parquet format with dictionary encoding enabled
+* [ARROW-4255](https://issues.apache.org/jira/browse/ARROW-4255) - [C++] Schema::GetFieldIndex is not thread-safe
+* [ARROW-4267](https://issues.apache.org/jira/browse/ARROW-4267) - [Python/C++][Parquet] Segfault when reading rowgroups with duplicated columns
+* [ARROW-4323](https://issues.apache.org/jira/browse/ARROW-4323) - [Packaging] Fix failing OSX clang conda forge builds
+* [ARROW-4367](https://issues.apache.org/jira/browse/ARROW-4367) - [C++] StringDictionaryBuilder segfaults on Finish with only null entries
+* [ARROW-4374](https://issues.apache.org/jira/browse/ARROW-4374) - [C++] DictionaryBuilder does not correctly report length and null\_count
+* [ARROW-4492](https://issues.apache.org/jira/browse/ARROW-4492) - [Python] Failure reading Parquet column as pandas Categorical in 0.12
+* [ARROW-4501](https://issues.apache.org/jira/browse/ARROW-4501) - [C++] Unique returns non-unique strings
+* [ARROW-4582](https://issues.apache.org/jira/browse/ARROW-4582) - [C++/Python] Memory corruption on Pandas-\>Arrow conversion
+* [ARROW-4629](https://issues.apache.org/jira/browse/ARROW-4629) - [Python] Pandas to Arrow conversion slowed down by local imports
+* [ARROW-4636](https://issues.apache.org/jira/browse/ARROW-4636) - [Python/Packaging] Crossbow builds for conda-osx fail on upload with Ruby linkage errors
+* [ARROW-4647](https://issues.apache.org/jira/browse/ARROW-4647) - [Packaging] dev/release/00-prepare.sh fails for minor version changes
+
+
+## New Features and Improvements
+
+* [ARROW-4291](https://issues.apache.org/jira/browse/ARROW-4291) - [Dev] Support selecting features in release scripts
+* [ARROW-4298](https://issues.apache.org/jira/browse/ARROW-4298) - [Java] Building Flight fails with OpenJDK 11
+* [ARROW-4373](https://issues.apache.org/jira/browse/ARROW-4373) - [Packaging] Travis fails to deploy conda packages on OSX
+
+
+
+# Apache Arrow 0.12.0 (2019-01-20)
+
+## New Features and Improvements
+
+* [ARROW-45](https://issues.apache.org/jira/browse/ARROW-45) - [Python] Add unnest/flatten function for List types
+* [ARROW-536](https://issues.apache.org/jira/browse/ARROW-536) - [C++] Provide non-SSE4 versions of functions that use CPU intrinsics for older processors
+* [ARROW-554](https://issues.apache.org/jira/browse/ARROW-554) - [C++] Implement functions to conform unequal dictionaries amongst multiple Arrow arrays
+* [ARROW-766](https://issues.apache.org/jira/browse/ARROW-766) - [C++] Introduce zero-copy "StringPiece" type
+* [ARROW-854](https://issues.apache.org/jira/browse/ARROW-854) - [Format] Support sparse tensor
+* [ARROW-912](https://issues.apache.org/jira/browse/ARROW-912) - [Python] Account for multiarch systems in development.rst
+* [ARROW-1019](https://issues.apache.org/jira/browse/ARROW-1019) - [C++] Implement input stream and output stream with Gzip codec
+* [ARROW-1055](https://issues.apache.org/jira/browse/ARROW-1055) - [C++] GPU support library development
+* [ARROW-1262](https://issues.apache.org/jira/browse/ARROW-1262) - [Packaging] Packaging automation in arrow-dist
+* [ARROW-1423](https://issues.apache.org/jira/browse/ARROW-1423) - [C++] Create non-owned CudaContext from context handle provided by thirdparty user
+* [ARROW-1492](https://issues.apache.org/jira/browse/ARROW-1492) - [C++] Type casting function kernel suite
+* [ARROW-1688](https://issues.apache.org/jira/browse/ARROW-1688) - [Java] Fail build on checkstyle warnings
+* [ARROW-1696](https://issues.apache.org/jira/browse/ARROW-1696) - [C++] Add codec benchmarks
+* [ARROW-1822](https://issues.apache.org/jira/browse/ARROW-1822) - [C++] Add SSE4.2-accelerated hash kernels and use if host CPU supports
+* [ARROW-1993](https://issues.apache.org/jira/browse/ARROW-1993) - [Python] Add function for determining implied Arrow schema from pandas.DataFrame
+* [ARROW-1994](https://issues.apache.org/jira/browse/ARROW-1994) - [Python] Test against Pandas master
+* [ARROW-2183](https://issues.apache.org/jira/browse/ARROW-2183) - [C++] Add helper CMake function for globbing the right header files
+* [ARROW-2211](https://issues.apache.org/jira/browse/ARROW-2211) - [C++] Use simpler hash functions for integers
+* [ARROW-2216](https://issues.apache.org/jira/browse/ARROW-2216) - [CI] CI descriptions and envars are misleading
+* [ARROW-2337](https://issues.apache.org/jira/browse/ARROW-2337) - [Scripts] Windows release verification script should use boost DSOs instead of static linkage
+* [ARROW-2374](https://issues.apache.org/jira/browse/ARROW-2374) - [Rust] Add support for array of List<T\>
+* [ARROW-2475](https://issues.apache.org/jira/browse/ARROW-2475) - [Format] Confusing array length description
+* [ARROW-2476](https://issues.apache.org/jira/browse/ARROW-2476) - [Python/Question] Maximum length of an Array created from ndarray
+* [ARROW-2483](https://issues.apache.org/jira/browse/ARROW-2483) - [Rust] use bit-packing for boolean vectors
+* [ARROW-2504](https://issues.apache.org/jira/browse/ARROW-2504) - [Website] Add ApacheCon NA link
+* [ARROW-2535](https://issues.apache.org/jira/browse/ARROW-2535) - [Python] Provide pre-commit hooks that check flake8
+* [ARROW-2560](https://issues.apache.org/jira/browse/ARROW-2560) - [Rust] The Rust README should include Rust-specific information on contributing
+* [ARROW-2624](https://issues.apache.org/jira/browse/ARROW-2624) - [Python] Random schema and data generator for Arrow conversion and Parquet testing
+* [ARROW-2637](https://issues.apache.org/jira/browse/ARROW-2637) - [C++/Python] Build support and instructions for development on Alpine Linux
+* [ARROW-2648](https://issues.apache.org/jira/browse/ARROW-2648) - [Packaging] Follow up packaging tasks
+* [ARROW-2653](https://issues.apache.org/jira/browse/ARROW-2653) - [C++] Refactor hash table support
+* [ARROW-2670](https://issues.apache.org/jira/browse/ARROW-2670) - [C++/Python] Add Ubuntu 18.04 / gcc7 as a nightly build
+* [ARROW-2673](https://issues.apache.org/jira/browse/ARROW-2673) - [Python] Add documentation + docstring for ARROW-2661
+* [ARROW-2684](https://issues.apache.org/jira/browse/ARROW-2684) - [Python] Various documentation improvements
+* [ARROW-2712](https://issues.apache.org/jira/browse/ARROW-2712) - [C\#] Initial C\# .NET library
+* [ARROW-2720](https://issues.apache.org/jira/browse/ARROW-2720) - [C++] Clean up cmake CXX\_STANDARD and PIC flag setting
+* [ARROW-2759](https://issues.apache.org/jira/browse/ARROW-2759) - Export notification socket of Plasma
+* [ARROW-2803](https://issues.apache.org/jira/browse/ARROW-2803) - [C++] Put hashing function into src/arrow/util
+* [ARROW-2807](https://issues.apache.org/jira/browse/ARROW-2807) - [Python] Enable memory-mapping to be toggled in get\_reader when reading Parquet files
+* [ARROW-2808](https://issues.apache.org/jira/browse/ARROW-2808) - [Python] Add unit tests for ProxyMemoryPool, enable new default MemoryPool to be constructed
+* [ARROW-2919](https://issues.apache.org/jira/browse/ARROW-2919) - [C++] Improve error message when listing empty HDFS file
+* [ARROW-2968](https://issues.apache.org/jira/browse/ARROW-2968) - [R] Multi-threaded conversion from Arrow table to R data.frame
+* [ARROW-2995](https://issues.apache.org/jira/browse/ARROW-2995) - [CI] Build Python libraries in same run when running C++ unit tests so project does not need to be rebuilt again right away
+* [ARROW-3020](https://issues.apache.org/jira/browse/ARROW-3020) - [Python] Addition of option to allow empty Parquet row groups
+* [ARROW-3038](https://issues.apache.org/jira/browse/ARROW-3038) - [Go] add support for StringArray
+* [ARROW-3063](https://issues.apache.org/jira/browse/ARROW-3063) - [Go] move list of supported/TODO features to confluence
+* [ARROW-3070](https://issues.apache.org/jira/browse/ARROW-3070) - [Release] Host binary artifacts for RCs and releases on ASF Bintray account instead of dist/mirror system
+* [ARROW-3108](https://issues.apache.org/jira/browse/ARROW-3108) - [C++] arrow::PrettyPrint for Table instances
+* [ARROW-3126](https://issues.apache.org/jira/browse/ARROW-3126) - [Python] Make Buffered\* IO classes available to Python, incorporate into input\_stream, output\_stream factory functions
+* [ARROW-3131](https://issues.apache.org/jira/browse/ARROW-3131) - [Go] add test for Go-1.11
+* [ARROW-3161](https://issues.apache.org/jira/browse/ARROW-3161) - [Packaging] Ensure to run pyarrow unit tests in conda and wheel builds
+* [ARROW-3169](https://issues.apache.org/jira/browse/ARROW-3169) - [C++] Break array-test.cc and array.cc into multiple compilation units
+* [ARROW-3184](https://issues.apache.org/jira/browse/ARROW-3184) - [C++] Add modular build targets, "all" target, and require explicit target when invoking make or ninja
+* [ARROW-3194](https://issues.apache.org/jira/browse/ARROW-3194) - [Java] Fix setValueCount in spitAndTransfer for variable width vectors
+* [ARROW-3199](https://issues.apache.org/jira/browse/ARROW-3199) - [Plasma] Check for EAGAIN in recvmsg and sendmsg
+* [ARROW-3209](https://issues.apache.org/jira/browse/ARROW-3209) - [C++] Rename libarrow\_gpu to libarrow\_cuda
+* [ARROW-3230](https://issues.apache.org/jira/browse/ARROW-3230) - [Python] Missing comparisons on ChunkedArray, Table
+* [ARROW-3233](https://issues.apache.org/jira/browse/ARROW-3233) - [Python] Sphinx documentation for pyarrow.cuda GPU support
+* [ARROW-3248](https://issues.apache.org/jira/browse/ARROW-3248) - [C++] Arrow tests should have label "arrow"
+* [ARROW-3254](https://issues.apache.org/jira/browse/ARROW-3254) - [C++] Add option to ADD\_ARROW\_TEST to compose a test executable from multiple .cc files containing unit tests
+* [ARROW-3260](https://issues.apache.org/jira/browse/ARROW-3260) - [CI] Make linting a separate job
+* [ARROW-3272](https://issues.apache.org/jira/browse/ARROW-3272) - [Java] Document checkstyle deviations from Google style guide
+* [ARROW-3273](https://issues.apache.org/jira/browse/ARROW-3273) - [Java] checkstyle - fix javadoc style
+* [ARROW-3278](https://issues.apache.org/jira/browse/ARROW-3278) - [Python] Retrieve StructType's and StructArray's field by name
+* [ARROW-3291](https://issues.apache.org/jira/browse/ARROW-3291) - [C++] Convenience API for constructing arrow::io::BufferReader from std::string
+* [ARROW-3293](https://issues.apache.org/jira/browse/ARROW-3293) - [C++] Test Flight RPC in Travis CI
+* [ARROW-3296](https://issues.apache.org/jira/browse/ARROW-3296) - [Python] Add Flight support to manylinux1 wheels
+* [ARROW-3303](https://issues.apache.org/jira/browse/ARROW-3303) - [C++] Enable example arrays to be written with a simplified JSON representation
+* [ARROW-3306](https://issues.apache.org/jira/browse/ARROW-3306) - [R] Objects and support functions for different kinds of arrow::Buffer
+* [ARROW-3307](https://issues.apache.org/jira/browse/ARROW-3307) - [R] Convert chunked arrow::Column to R vector
+* [ARROW-3310](https://issues.apache.org/jira/browse/ARROW-3310) - [R] Create wrapper classes for various Arrow IO interfaces
+* [ARROW-3312](https://issues.apache.org/jira/browse/ARROW-3312) - [R] Use same .clang-format file for both R binding C++ code and main C++ codebase
+* [ARROW-3315](https://issues.apache.org/jira/browse/ARROW-3315) - [R] Support for multi-threaded conversions from RecordBatch, Table to R data.frame
+* [ARROW-3318](https://issues.apache.org/jira/browse/ARROW-3318) - [C++] Convenience method for reading all batches from an IPC stream or file as arrow::Table
+* [ARROW-3323](https://issues.apache.org/jira/browse/ARROW-3323) - [Java] checkstyle - fix naming
+* [ARROW-3331](https://issues.apache.org/jira/browse/ARROW-3331) - [C++] Add re2 to ThirdpartyToolchain
+* [ARROW-3340](https://issues.apache.org/jira/browse/ARROW-3340) - [R] support for dates and time classes
+* [ARROW-3347](https://issues.apache.org/jira/browse/ARROW-3347) - [Rust] Implement PrimitiveArrayBuilder
+* [ARROW-3353](https://issues.apache.org/jira/browse/ARROW-3353) - [Packaging] Build python 3.7 wheels
+* [ARROW-3355](https://issues.apache.org/jira/browse/ARROW-3355) - [R] Support for factors
+* [ARROW-3358](https://issues.apache.org/jira/browse/ARROW-3358) - [Gandiva][C++] Replace usages of gandiva/status.h with arrow/status.h
+* [ARROW-3362](https://issues.apache.org/jira/browse/ARROW-3362) - [R] Guard against null buffers
+* [ARROW-3366](https://issues.apache.org/jira/browse/ARROW-3366) - [R] Dockerfile for docker-compose setup
+* [ARROW-3368](https://issues.apache.org/jira/browse/ARROW-3368) - [Integration/CI/Python] Add dask integration test to docker-compose setup
+* [ARROW-3380](https://issues.apache.org/jira/browse/ARROW-3380) - [Python] Support reading CSV files and more from a gzipped file
+* [ARROW-3381](https://issues.apache.org/jira/browse/ARROW-3381) - [C++] Implement InputStream for bz2 files
+* [ARROW-3383](https://issues.apache.org/jira/browse/ARROW-3383) - [Java] Run Gandiva tests in Travis CI
+* [ARROW-3384](https://issues.apache.org/jira/browse/ARROW-3384) - [Gandiva] Sync remaining commits from gandiva repo
+* [ARROW-3385](https://issues.apache.org/jira/browse/ARROW-3385) - [Java] [Gandiva] Deploy gandiva snapshot jars automatically
+* [ARROW-3387](https://issues.apache.org/jira/browse/ARROW-3387) - [C++] Function to cast binary to string/utf8 with UTF8 validation
+* [ARROW-3398](https://issues.apache.org/jira/browse/ARROW-3398) - [Rust] Update existing Builder to use MutableBuffer internally
+* [ARROW-3402](https://issues.apache.org/jira/browse/ARROW-3402) - [Gandiva][C++] Utilize common bitmap operation implementations in precompiled IR routines
+* [ARROW-3407](https://issues.apache.org/jira/browse/ARROW-3407) - [C++] Add UTF8 conversion modes in CSV reader conversion options
+* [ARROW-3409](https://issues.apache.org/jira/browse/ARROW-3409) - [C++] Add streaming compression interfaces
+* [ARROW-3421](https://issues.apache.org/jira/browse/ARROW-3421) - [C++] Add include-what-you-use setup to primary docker-compose.yml
+* [ARROW-3427](https://issues.apache.org/jira/browse/ARROW-3427) - [C++] Add Windows support, Unix static libs for double-conversion package in conda-forge
+* [ARROW-3429](https://issues.apache.org/jira/browse/ARROW-3429) - [Packaging] Add a script to release binaries that use source archive at dist.apache.org
+* [ARROW-3430](https://issues.apache.org/jira/browse/ARROW-3430) - [Packaging] Add workaround to verify 0.11.0
+* [ARROW-3431](https://issues.apache.org/jira/browse/ARROW-3431) - [GLib] Include Gemfile to archive
+* [ARROW-3432](https://issues.apache.org/jira/browse/ARROW-3432) - [Packaging] Variables aren't expanded in Subversion commit message
+* [ARROW-3433](https://issues.apache.org/jira/browse/ARROW-3433) - [C++] Validate re2 with Windows toolchain, EP
+* [ARROW-3439](https://issues.apache.org/jira/browse/ARROW-3439) - [R] R language bindings for Feather format
+* [ARROW-3440](https://issues.apache.org/jira/browse/ARROW-3440) - [Gandiva][C++] Remove outdated cpp/src/gandiva/README.md, add build documentation to cpp/README.md
+* [ARROW-3441](https://issues.apache.org/jira/browse/ARROW-3441) - [Gandiva][C++] Produce fewer test executables
+* [ARROW-3442](https://issues.apache.org/jira/browse/ARROW-3442) - [C++] Use dynamic linking for unit tests, ensure coverage working properly with clang
+* [ARROW-3450](https://issues.apache.org/jira/browse/ARROW-3450) - [R] Wrap MemoryMappedFile class
+* [ARROW-3451](https://issues.apache.org/jira/browse/ARROW-3451) - [Python] Allocate CUDA memory from a CUcontext created by numba.cuda
+* [ARROW-3455](https://issues.apache.org/jira/browse/ARROW-3455) - [Gandiva][C++] Support pkg-config for Gandiva
+* [ARROW-3456](https://issues.apache.org/jira/browse/ARROW-3456) - [CI] Reuse docker images and optimize docker-compose containers
+* [ARROW-3460](https://issues.apache.org/jira/browse/ARROW-3460) - [Packaging] Add a script to rebase master on local release branch
+* [ARROW-3461](https://issues.apache.org/jira/browse/ARROW-3461) - [Packaging] Add a script to upload RC artifacts as the official release
+* [ARROW-3462](https://issues.apache.org/jira/browse/ARROW-3462) - [Packaging] Update CHANGELOG for 0.11.0
+* [ARROW-3463](https://issues.apache.org/jira/browse/ARROW-3463) - [Website] Update for 0.11.0
+* [ARROW-3464](https://issues.apache.org/jira/browse/ARROW-3464) - [Packaging] Build shared libraries for gandiva fat JAR via crossbow
+* [ARROW-3465](https://issues.apache.org/jira/browse/ARROW-3465) - [Documentation] Fix gen\_apidocs' docker image
+* [ARROW-3469](https://issues.apache.org/jira/browse/ARROW-3469) - [Gandiva] add travis entry for gandiva on OSX
+* [ARROW-3472](https://issues.apache.org/jira/browse/ARROW-3472) - [Gandiva] remove gandiva helpers library
+* [ARROW-3473](https://issues.apache.org/jira/browse/ARROW-3473) - [Format] Update Layout.md document to clarify use of 64-bit array lengths
+* [ARROW-3474](https://issues.apache.org/jira/browse/ARROW-3474) - [GLib] Extend gparquet API with get\_schema and read\_column
+* [ARROW-3479](https://issues.apache.org/jira/browse/ARROW-3479) - [R] Support to write record\_batch as stream
+* [ARROW-3482](https://issues.apache.org/jira/browse/ARROW-3482) - [C++] Build with JEMALLOC by default
+* [ARROW-3487](https://issues.apache.org/jira/browse/ARROW-3487) - [Gandiva] simplify NULL\_IF\_NULL functions that can return errors
+* [ARROW-3488](https://issues.apache.org/jira/browse/ARROW-3488) - [Packaging] Separate crossbow task definition files for packaging and tests
+* [ARROW-3489](https://issues.apache.org/jira/browse/ARROW-3489) - [Gandiva] Support for in expressions
+* [ARROW-3490](https://issues.apache.org/jira/browse/ARROW-3490) - [R] streaming arrow objects to output streams
+* [ARROW-3492](https://issues.apache.org/jira/browse/ARROW-3492) - [C++] Build jemalloc in parallel
+* [ARROW-3493](https://issues.apache.org/jira/browse/ARROW-3493) - [Java] Document BOUNDS\_CHECKING\_ENABLED
+* [ARROW-3499](https://issues.apache.org/jira/browse/ARROW-3499) - [R] Expose arrow::ipc::Message type
+* [ARROW-3501](https://issues.apache.org/jira/browse/ARROW-3501) - [Gandiva] Enable building with gcc 4.8.x on Ubuntu Trusty, similar distros
+* [ARROW-3504](https://issues.apache.org/jira/browse/ARROW-3504) - [Plasma] Add support for Plasma Client to put/get raw bytes without pyarrow serialization.
+* [ARROW-3505](https://issues.apache.org/jira/browse/ARROW-3505) - [R] Read record batch and table
+* [ARROW-3506](https://issues.apache.org/jira/browse/ARROW-3506) - [Packaging] Nightly tests for docker-compose images
+* [ARROW-3508](https://issues.apache.org/jira/browse/ARROW-3508) - [C++] Build against double-conversion from conda-forge
+* [ARROW-3515](https://issues.apache.org/jira/browse/ARROW-3515) - Introduce NumericTensor class
+* [ARROW-3518](https://issues.apache.org/jira/browse/ARROW-3518) - [C++] Detect HOMEBREW\_PREFIX automatically
+* [ARROW-3519](https://issues.apache.org/jira/browse/ARROW-3519) - [Gandiva] Add support for functions that can return variable len output
+* [ARROW-3521](https://issues.apache.org/jira/browse/ARROW-3521) - [GLib] Run Python using find\_program in meson.build
+* [ARROW-3529](https://issues.apache.org/jira/browse/ARROW-3529) - [Ruby] Import Red Parquet
+* [ARROW-3530](https://issues.apache.org/jira/browse/ARROW-3530) - [Java/Python] Add conversion for pyarrow.Schema from org.apache…pojo.Schema
+* [ARROW-3533](https://issues.apache.org/jira/browse/ARROW-3533) - [Python/Documentation] Use sphinx\_rtd\_theme instead of Bootstrap
+* [ARROW-3536](https://issues.apache.org/jira/browse/ARROW-3536) - [C++] Fast UTF8 validation functions
+* [ARROW-3537](https://issues.apache.org/jira/browse/ARROW-3537) - [Rust] Implement Tensor Type
+* [ARROW-3539](https://issues.apache.org/jira/browse/ARROW-3539) - [CI/Packaging] Update scripts to build against vendored jemalloc
+* [ARROW-3540](https://issues.apache.org/jira/browse/ARROW-3540) - [Rust] Incorporate BooleanArray into PrimitiveArray
+* [ARROW-3542](https://issues.apache.org/jira/browse/ARROW-3542) - [C++] Use unsafe appends when building array from CSV
+* [ARROW-3545](https://issues.apache.org/jira/browse/ARROW-3545) - [C++/Python] Normalize child/field terminology with StructType
+* [ARROW-3547](https://issues.apache.org/jira/browse/ARROW-3547) - [R] Protect against Null crash when reading from RecordBatch
+* [ARROW-3548](https://issues.apache.org/jira/browse/ARROW-3548) - Speed up storing small objects in the object store.
+* [ARROW-3551](https://issues.apache.org/jira/browse/ARROW-3551) - Change MapD to OmniSci on Powered By page
+* [ARROW-3553](https://issues.apache.org/jira/browse/ARROW-3553) - [R] Error when losing data on int64, uint64 conversions to double
+* [ARROW-3555](https://issues.apache.org/jira/browse/ARROW-3555) - [Plasma] Unify plasma client get function using metadata.
+* [ARROW-3556](https://issues.apache.org/jira/browse/ARROW-3556) - [CI] Disable optimizations on Windows
+* [ARROW-3557](https://issues.apache.org/jira/browse/ARROW-3557) - [Python] Set language\_level in Cython sources
+* [ARROW-3558](https://issues.apache.org/jira/browse/ARROW-3558) - [Plasma] Remove fatal error when plasma client calls get on an unsealed object that it created.
+* [ARROW-3559](https://issues.apache.org/jira/browse/ARROW-3559) - Statically link libraries for plasma\_store\_server executable.
+* [ARROW-3562](https://issues.apache.org/jira/browse/ARROW-3562) - [R] Disallow creation of objects with null shared\_ptr<T\>
+* [ARROW-3563](https://issues.apache.org/jira/browse/ARROW-3563) - [C++] Declare public link dependencies so arrow\_static, plasma\_static automatically pull in transitive dependencies
+* [ARROW-3566](https://issues.apache.org/jira/browse/ARROW-3566) - Clarify that the type of dictionary encoded field should be the encoded(index) type
+* [ARROW-3567](https://issues.apache.org/jira/browse/ARROW-3567) - [Gandiva] [GLib] Add GLib bindings of Gandiva
+* [ARROW-3568](https://issues.apache.org/jira/browse/ARROW-3568) - [Packaging] Run pyarrow unittests for windows wheels
+* [ARROW-3569](https://issues.apache.org/jira/browse/ARROW-3569) - [Packaging] Run pyarrow unittests when building conda package
+* [ARROW-3574](https://issues.apache.org/jira/browse/ARROW-3574) - Fix remaining bug with plasma static versus shared libraries.
+* [ARROW-3575](https://issues.apache.org/jira/browse/ARROW-3575) - [Python] New documentation page for CSV reader
+* [ARROW-3576](https://issues.apache.org/jira/browse/ARROW-3576) - [Python] Expose compressed file readers as NativeFile
+* [ARROW-3577](https://issues.apache.org/jira/browse/ARROW-3577) - [Go] add support for ChunkedArray
+* [ARROW-3581](https://issues.apache.org/jira/browse/ARROW-3581) - [Gandiva][C++] ARROW\_PROTOBUF\_USE\_SHARED isn't used
+* [ARROW-3582](https://issues.apache.org/jira/browse/ARROW-3582) - [CI] Gandiva C++ build is always triggered
+* [ARROW-3583](https://issues.apache.org/jira/browse/ARROW-3583) - [Python/Java] Create RecordBatch from VectorSchemaRoot
+* [ARROW-3584](https://issues.apache.org/jira/browse/ARROW-3584) - [Go] add support for Table
+* [ARROW-3587](https://issues.apache.org/jira/browse/ARROW-3587) - [Python] Efficient serialization for Arrow Objects (array, table, tensor, etc)
+* [ARROW-3588](https://issues.apache.org/jira/browse/ARROW-3588) - [Java] checkstyle - fix license
+* [ARROW-3589](https://issues.apache.org/jira/browse/ARROW-3589) - [Gandiva] Make it possible to compile gandiva without JNI
+* [ARROW-3591](https://issues.apache.org/jira/browse/ARROW-3591) - [R] Support to collect decimal type
+* [ARROW-3592](https://issues.apache.org/jira/browse/ARROW-3592) - [Python] Get BinaryArray value as zero copy memory view
+* [ARROW-3597](https://issues.apache.org/jira/browse/ARROW-3597) - [Gandiva] gandiva should integrate with ADD\_ARROW\_TEST for tests
+* [ARROW-3600](https://issues.apache.org/jira/browse/ARROW-3600) - [Packaging] Support Ubuntu 18.10
+* [ARROW-3601](https://issues.apache.org/jira/browse/ARROW-3601) - [Rust] Release 0.11.0
+* [ARROW-3602](https://issues.apache.org/jira/browse/ARROW-3602) - [Gandiva] [Python] Add preliminary Cython bindings for Gandiva
+* [ARROW-3603](https://issues.apache.org/jira/browse/ARROW-3603) - [Gandiva][C++] Can't build with vendored Boost
+* [ARROW-3605](https://issues.apache.org/jira/browse/ARROW-3605) - Remove AE library from plasma header files.
+* [ARROW-3607](https://issues.apache.org/jira/browse/ARROW-3607) - [Java] delete() method via JNI for plasma
+* [ARROW-3608](https://issues.apache.org/jira/browse/ARROW-3608) - [R] Support for time32 and time64 array types
+* [ARROW-3609](https://issues.apache.org/jira/browse/ARROW-3609) - [Gandiva] Move benchmark tests out of unit test
+* [ARROW-3610](https://issues.apache.org/jira/browse/ARROW-3610) - [C++] Add interface to turn stl\_allocator into arrow::MemoryPool
+* [ARROW-3611](https://issues.apache.org/jira/browse/ARROW-3611) - Give error more quickly when pyarrow serialization context is used incorrectly.
+* [ARROW-3612](https://issues.apache.org/jira/browse/ARROW-3612) - [Go] implement RecordBatch and RecordBatchReader
+* [ARROW-3615](https://issues.apache.org/jira/browse/ARROW-3615) - [R] Support for NaN
+* [ARROW-3616](https://issues.apache.org/jira/browse/ARROW-3616) - [Java] checkstyle - fix remaining coding checks
+* [ARROW-3618](https://issues.apache.org/jira/browse/ARROW-3618) - [Packaging/Documentation] Add \`-c conda-forge\` option to avoid PackagesNotFoundError
+* [ARROW-3620](https://issues.apache.org/jira/browse/ARROW-3620) - [Python] Document multithreading options in Sphinx and add to api.rst
+* [ARROW-3621](https://issues.apache.org/jira/browse/ARROW-3621) - [Go] implement TableBatchReader
+* [ARROW-3622](https://issues.apache.org/jira/browse/ARROW-3622) - [Go] implement Schema.Equal
+* [ARROW-3623](https://issues.apache.org/jira/browse/ARROW-3623) - [Go] implement Field.Equal
+* [ARROW-3624](https://issues.apache.org/jira/browse/ARROW-3624) - [Python/C++] Support for zero-sized device buffers
+* [ARROW-3625](https://issues.apache.org/jira/browse/ARROW-3625) - [Go] add examples for Table, Record and {Table,Record}Reader
+* [ARROW-3626](https://issues.apache.org/jira/browse/ARROW-3626) - [Go] add a CSV TableReader
+* [ARROW-3627](https://issues.apache.org/jira/browse/ARROW-3627) - [Go] add RecordBatchBuilder
+* [ARROW-3629](https://issues.apache.org/jira/browse/ARROW-3629) - [Python] Add write\_to\_dataset to Python Sphinx API listing
+* [ARROW-3630](https://issues.apache.org/jira/browse/ARROW-3630) - [Plasma] [GLib] Add GLib bindings of Plasma
+* [ARROW-3632](https://issues.apache.org/jira/browse/ARROW-3632) - [Packaging] Update deb names in dev/tasks/tasks.yml in dev/release/00-prepare.sh
+* [ARROW-3633](https://issues.apache.org/jira/browse/ARROW-3633) - [Packaging] Update deb names in dev/tasks/tasks.yml for 0.12.0
+* [ARROW-3636](https://issues.apache.org/jira/browse/ARROW-3636) - [C++/Python] Update arrow/python/pyarrow\_api.h
+* [ARROW-3638](https://issues.apache.org/jira/browse/ARROW-3638) - [C++][Python] Move reading from Feather as Table feature to C++ from Python
+* [ARROW-3639](https://issues.apache.org/jira/browse/ARROW-3639) - [Packaging] Run gandiva nightly packaging tasks
+* [ARROW-3640](https://issues.apache.org/jira/browse/ARROW-3640) - [Go] add support for Tensors
+* [ARROW-3641](https://issues.apache.org/jira/browse/ARROW-3641) - [C++/Python] remove public keyword from Cython api functions
+* [ARROW-3642](https://issues.apache.org/jira/browse/ARROW-3642) - [C++] Add arrowConfig.cmake generation
+* [ARROW-3644](https://issues.apache.org/jira/browse/ARROW-3644) - [Rust] Implement ListArrayBuilder
+* [ARROW-3645](https://issues.apache.org/jira/browse/ARROW-3645) - [Python] Document compression support in Sphinx
+* [ARROW-3646](https://issues.apache.org/jira/browse/ARROW-3646) - [Python] Add convenience factories to create IO streams
+* [ARROW-3647](https://issues.apache.org/jira/browse/ARROW-3647) - [R] Crash after unloading bit64 package
+* [ARROW-3648](https://issues.apache.org/jira/browse/ARROW-3648) - [Plasma] Add API to get metadata and data at the same time
+* [ARROW-3649](https://issues.apache.org/jira/browse/ARROW-3649) - [Rust] Refactor MutableBuffer's resize
+* [ARROW-3656](https://issues.apache.org/jira/browse/ARROW-3656) - [C++] Allow whitespace in numeric CSV fields
+* [ARROW-3657](https://issues.apache.org/jira/browse/ARROW-3657) - [R] Require bit64 package
+* [ARROW-3659](https://issues.apache.org/jira/browse/ARROW-3659) - [C++] Clang Travis build (matrix entry 2) might not actually be using clang
+* [ARROW-3660](https://issues.apache.org/jira/browse/ARROW-3660) - [C++] Don't unnecessarily lock MemoryMappedFile for resizing in readonly files
+* [ARROW-3661](https://issues.apache.org/jira/browse/ARROW-3661) - [Gandiva][GLib] Improve constant name
+* [ARROW-3662](https://issues.apache.org/jira/browse/ARROW-3662) - [C++] Add a const overload to MemoryMappedFile::GetSize
+* [ARROW-3664](https://issues.apache.org/jira/browse/ARROW-3664) - [Rust] Add benchmark for PrimitiveArrayBuilder
+* [ARROW-3665](https://issues.apache.org/jira/browse/ARROW-3665) - [Rust] Implement StructArrayBuilder
+* [ARROW-3666](https://issues.apache.org/jira/browse/ARROW-3666) - [C++] Improve CSV parser performance
+* [ARROW-3672](https://issues.apache.org/jira/browse/ARROW-3672) - [Go] implement Time32 array
+* [ARROW-3673](https://issues.apache.org/jira/browse/ARROW-3673) - [Go] implement Time64 array
+* [ARROW-3674](https://issues.apache.org/jira/browse/ARROW-3674) - [Go] implement Date32 array
+* [ARROW-3675](https://issues.apache.org/jira/browse/ARROW-3675) - [Go] implement Date64 array
+* [ARROW-3677](https://issues.apache.org/jira/browse/ARROW-3677) - [Go] implement FixedSizedBinary array
+* [ARROW-3681](https://issues.apache.org/jira/browse/ARROW-3681) - [Go] add benchmarks for CSV reader
+* [ARROW-3682](https://issues.apache.org/jira/browse/ARROW-3682) - [Go] unexport encoding/csv.Reader from CSV reader
+* [ARROW-3683](https://issues.apache.org/jira/browse/ARROW-3683) - [Go] add functional-option style to CSV reader
+* [ARROW-3684](https://issues.apache.org/jira/browse/ARROW-3684) - [Go] add chunk size option to CSV reader
+* [ARROW-3692](https://issues.apache.org/jira/browse/ARROW-3692) - [Gandiva] [Ruby] Add Ruby bindings of Gandiva
+* [ARROW-3693](https://issues.apache.org/jira/browse/ARROW-3693) - [R] Invalid buffer for empty characters with null data
+* [ARROW-3694](https://issues.apache.org/jira/browse/ARROW-3694) - [Java] Avoid superfluous string creation when logging level is disabled
+* [ARROW-3695](https://issues.apache.org/jira/browse/ARROW-3695) - [Gandiva] Use add\_arrow\_lib()
+* [ARROW-3696](https://issues.apache.org/jira/browse/ARROW-3696) - [C++] Add feather::TableWriter::Write(table)
+* [ARROW-3697](https://issues.apache.org/jira/browse/ARROW-3697) - [Ruby] Add schema\#[]
+* [ARROW-3701](https://issues.apache.org/jira/browse/ARROW-3701) - [Gandiva] Add support for decimal operations
+* [ARROW-3708](https://issues.apache.org/jira/browse/ARROW-3708) - [Packaging] Nightly CentOS builds are failing
+* [ARROW-3713](https://issues.apache.org/jira/browse/ARROW-3713) - [Rust] Implement BinaryArrayBuilder
+* [ARROW-3718](https://issues.apache.org/jira/browse/ARROW-3718) - [Gandiva] Remove spurious gtest include
+* [ARROW-3719](https://issues.apache.org/jira/browse/ARROW-3719) - [GLib] Support read/write table to/from Feather
+* [ARROW-3720](https://issues.apache.org/jira/browse/ARROW-3720) - [GLib] Use "indices" instead of "indexes"
+* [ARROW-3721](https://issues.apache.org/jira/browse/ARROW-3721) - [Gandiva] [Python] Support all Gandiva literals
+* [ARROW-3722](https://issues.apache.org/jira/browse/ARROW-3722) - [C++] Allow specifying column types to CSV reader
+* [ARROW-3723](https://issues.apache.org/jira/browse/ARROW-3723) - [Plasma] [Ruby] Add Ruby bindings of Plasma
+* [ARROW-3724](https://issues.apache.org/jira/browse/ARROW-3724) - [GLib] Update gitignore
+* [ARROW-3725](https://issues.apache.org/jira/browse/ARROW-3725) - [GLib] Add field readers to GArrowStructDataType
+* [ARROW-3726](https://issues.apache.org/jira/browse/ARROW-3726) - [Rust] CSV Reader & Writer
+* [ARROW-3727](https://issues.apache.org/jira/browse/ARROW-3727) - [Python] Document use of pyarrow.foreign\_buffer, cuda.foreign\_buffer in Sphinx
+* [ARROW-3731](https://issues.apache.org/jira/browse/ARROW-3731) - [R] R API for reading and writing Parquet files
+* [ARROW-3733](https://issues.apache.org/jira/browse/ARROW-3733) - [GLib] Add to\_string() to GArrowTable and GArrowColumn
+* [ARROW-3736](https://issues.apache.org/jira/browse/ARROW-3736) - [CI/Docker] Ninja test in docker-compose run cpp hangs
+* [ARROW-3738](https://issues.apache.org/jira/browse/ARROW-3738) - [C++] Add CSV conversion option to parse ISO8601-like timestamp strings
+* [ARROW-3741](https://issues.apache.org/jira/browse/ARROW-3741) - [R] Add support for arrow::compute::Cast to convert Arrow arrays from one type to another
+* [ARROW-3743](https://issues.apache.org/jira/browse/ARROW-3743) - [Ruby] Add support for saving/loading Feather
+* [ARROW-3744](https://issues.apache.org/jira/browse/ARROW-3744) - [Ruby] Use garrow\_table\_to\_string() in Arrow::Table\#to\_s
+* [ARROW-3746](https://issues.apache.org/jira/browse/ARROW-3746) - [Gandiva] [Python] Make it possible to list all functions registered with Gandiva
+* [ARROW-3747](https://issues.apache.org/jira/browse/ARROW-3747) - [C++] Flip order of data members in arrow::Decimal128
+* [ARROW-3748](https://issues.apache.org/jira/browse/ARROW-3748) - [GLib] Add GArrowCSVReader
+* [ARROW-3749](https://issues.apache.org/jira/browse/ARROW-3749) - [GLib] Typos in documentation and test case name
+* [ARROW-3751](https://issues.apache.org/jira/browse/ARROW-3751) - [Python] Add more cython bindings for gandiva
+* [ARROW-3752](https://issues.apache.org/jira/browse/ARROW-3752) - [C++] Remove unused status::ArrowError
+* [ARROW-3753](https://issues.apache.org/jira/browse/ARROW-3753) - [Gandiva] Remove debug print
+* [ARROW-3755](https://issues.apache.org/jira/browse/ARROW-3755) - [GLib] Support for CompressedInputStream, CompressedOutputStream
+* [ARROW-3760](https://issues.apache.org/jira/browse/ARROW-3760) - [R] Support Arrow CSV reader
+* [ARROW-3773](https://issues.apache.org/jira/browse/ARROW-3773) - [C++] Remove duplicated AssertArraysEqual code in parquet/arrow/arrow-reader-writer-test.cc
+* [ARROW-3778](https://issues.apache.org/jira/browse/ARROW-3778) - [C++] Don't put implementations in test-util.h
+* [ARROW-3781](https://issues.apache.org/jira/browse/ARROW-3781) - [C++] Configure buffer size in arrow::io::BufferedOutputStream
+* [ARROW-3782](https://issues.apache.org/jira/browse/ARROW-3782) - [C++] Implement BufferedReader for C++
+* [ARROW-3784](https://issues.apache.org/jira/browse/ARROW-3784) - [R] Array with type fails with x is not a vector
+* [ARROW-3785](https://issues.apache.org/jira/browse/ARROW-3785) - [C++] Use double-conversion conda package in CI toolchain
+* [ARROW-3787](https://issues.apache.org/jira/browse/ARROW-3787) - Implement From<ListArray\> for BinaryArray
+* [ARROW-3788](https://issues.apache.org/jira/browse/ARROW-3788) - [Ruby] Add support for CSV parser written in C++
+* [ARROW-3795](https://issues.apache.org/jira/browse/ARROW-3795) - [R] Support for retrieving NAs from INT64 arrays
+* [ARROW-3796](https://issues.apache.org/jira/browse/ARROW-3796) - [Rust] Add Example for PrimitiveArrayBuilder
+* [ARROW-3798](https://issues.apache.org/jira/browse/ARROW-3798) - [GLib] Add support for column type CSV read options
+* [ARROW-3800](https://issues.apache.org/jira/browse/ARROW-3800) - [C++] Vendor a string\_view backport
+* [ARROW-3803](https://issues.apache.org/jira/browse/ARROW-3803) - [C++/Python] Split C++ and Python unit test Travis CI jobs, run all C++ tests (including Gandiva) together
+* [ARROW-3807](https://issues.apache.org/jira/browse/ARROW-3807) - [R] Missing Field API
+* [ARROW-3819](https://issues.apache.org/jira/browse/ARROW-3819) - [Packaging] Update conda variant files to conform with feedstock after compiler migration
+* [ARROW-3821](https://issues.apache.org/jira/browse/ARROW-3821) - [Format/Documentation]: Fix typos and grammar issues in Flight.proto comments
+* [ARROW-3823](https://issues.apache.org/jira/browse/ARROW-3823) - [R] + buffer.complex
+* [ARROW-3825](https://issues.apache.org/jira/browse/ARROW-3825) - [Python] The Python README.md does not show how to run the unit test suite
+* [ARROW-3826](https://issues.apache.org/jira/browse/ARROW-3826) - [C++] Determine if using ccache caching in Travis CI actually improves build times
+* [ARROW-3830](https://issues.apache.org/jira/browse/ARROW-3830) - [GLib] Add GArrowCodec
+* [ARROW-3834](https://issues.apache.org/jira/browse/ARROW-3834) - [Doc] Merge Python & C++ and move to top-level
+* [ARROW-3836](https://issues.apache.org/jira/browse/ARROW-3836) - [C++] Add PREFIX option to ADD\_ARROW\_BENCHMARK
+* [ARROW-3839](https://issues.apache.org/jira/browse/ARROW-3839) - [Rust] Add ability to infer schema in CSV reader
+* [ARROW-3841](https://issues.apache.org/jira/browse/ARROW-3841) - [C++] warning: catching polymorphic type by value
+* [ARROW-3842](https://issues.apache.org/jira/browse/ARROW-3842) - [R] RecordBatchStreamWriter api
+* [ARROW-3844](https://issues.apache.org/jira/browse/ARROW-3844) - [C++] Remove ARROW\_USE\_SSE and ARROW\_SSE3
+* [ARROW-3845](https://issues.apache.org/jira/browse/ARROW-3845) - [Gandiva] [GLib] Add GGandivaNode
+* [ARROW-3847](https://issues.apache.org/jira/browse/ARROW-3847) - [GLib] Remove unnecessary “\”.
+* [ARROW-3849](https://issues.apache.org/jira/browse/ARROW-3849) - Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64.
+* [ARROW-3851](https://issues.apache.org/jira/browse/ARROW-3851) - [C++] "make check-format" is slow
+* [ARROW-3852](https://issues.apache.org/jira/browse/ARROW-3852) - [C++] used uninitialized warning
+* [ARROW-3853](https://issues.apache.org/jira/browse/ARROW-3853) - [C++] Implement string to timestamp cast
+* [ARROW-3854](https://issues.apache.org/jira/browse/ARROW-3854) - [GLib] Deprecate garrow\_gio\_{input,output}\_stream\_get\_raw()
+* [ARROW-3855](https://issues.apache.org/jira/browse/ARROW-3855) - [Rust] Schema/Field/Datatype should implement serde traits
+* [ARROW-3856](https://issues.apache.org/jira/browse/ARROW-3856) - [Ruby] Support compressed CSV save/load
+* [ARROW-3858](https://issues.apache.org/jira/browse/ARROW-3858) - [GLib] Use {class\_name}\_get\_instance\_private
+* [ARROW-3859](https://issues.apache.org/jira/browse/ARROW-3859) - [Java] Fix ComplexWriter backward incompatible change
+* [ARROW-3860](https://issues.apache.org/jira/browse/ARROW-3860) - [Gandiva] [C++] Add option to use -static-libstdc++ when building libgandiva\_jni.so
+* [ARROW-3862](https://issues.apache.org/jira/browse/ARROW-3862) - [C++] Improve dependencies download script
+* [ARROW-3863](https://issues.apache.org/jira/browse/ARROW-3863) - [GLib] Use travis\_retry with brew bundle command
+* [ARROW-3864](https://issues.apache.org/jira/browse/ARROW-3864) - [GLib] Add support for allow-float-truncate cast option
+* [ARROW-3865](https://issues.apache.org/jira/browse/ARROW-3865) - [Packaging] Add double-conversion dependency to conda forge recipes and the windows wheel build
+* [ARROW-3867](https://issues.apache.org/jira/browse/ARROW-3867) - [Documentation] Uploading binary release artifacts to Bintray
+* [ARROW-3868](https://issues.apache.org/jira/browse/ARROW-3868) - [Rust] Build against nightly Rust in CI
+* [ARROW-3870](https://issues.apache.org/jira/browse/ARROW-3870) - [C++] Add Peek to InputStream API
+* [ARROW-3871](https://issues.apache.org/jira/browse/ARROW-3871) - [R] Replace usages of C++ GetValuesSafely with new methods on ArrayData
+* [ARROW-3878](https://issues.apache.org/jira/browse/ARROW-3878) - [Rust] Improve primitive types
+* [ARROW-3880](https://issues.apache.org/jira/browse/ARROW-3880) - [Rust] PrimitiveArray<T\> should support simple math operations
+* [ARROW-3881](https://issues.apache.org/jira/browse/ARROW-3881) - [Rust] PrimitiveArray<T\> should support comparison operators
+* [ARROW-3883](https://issues.apache.org/jira/browse/ARROW-3883) - [Rust] Update Rust README to reflect new functionality
+* [ARROW-3884](https://issues.apache.org/jira/browse/ARROW-3884) - [Python] Add LLVM6 to manylinux1 base image
+* [ARROW-3885](https://issues.apache.org/jira/browse/ARROW-3885) - [Rust] Update version to 0.12.0 and update release instructions on wiki
+* [ARROW-3886](https://issues.apache.org/jira/browse/ARROW-3886) - [C++] Additional test cases for ARROW-3831
+* [ARROW-3891](https://issues.apache.org/jira/browse/ARROW-3891) - [Java] Remove Long.bitCount with simple bitmap operations
+* [ARROW-3893](https://issues.apache.org/jira/browse/ARROW-3893) - [C++] Improve adaptive int builder performance
+* [ARROW-3895](https://issues.apache.org/jira/browse/ARROW-3895) - [Rust] CSV reader should return Result<Option<\>\> not Option<Result<\>\>
+* [ARROW-3899](https://issues.apache.org/jira/browse/ARROW-3899) - [Python] Table.to\_pandas converts Arrow date32[day] to pandas datetime64[ns]
+* [ARROW-3900](https://issues.apache.org/jira/browse/ARROW-3900) - [GLib] Add garrow\_mutable\_buffer\_set\_data()
+* [ARROW-3905](https://issues.apache.org/jira/browse/ARROW-3905) - [Ruby] Add StructDataType\#[]
+* [ARROW-3906](https://issues.apache.org/jira/browse/ARROW-3906) - [C++] Break builder.cc into multiple compilation units
+* [ARROW-3908](https://issues.apache.org/jira/browse/ARROW-3908) - [Rust] Update rust dockerfile to use nightly toolchain
+* [ARROW-3910](https://issues.apache.org/jira/browse/ARROW-3910) - [Python] Set date\_as\_object to True in \*.to\_pandas as default after deduplicating logic implemented
+* [ARROW-3911](https://issues.apache.org/jira/browse/ARROW-3911) - [Python] Deduplicate datetime.date objects in Table.to\_pandas internals
+* [ARROW-3912](https://issues.apache.org/jira/browse/ARROW-3912) - [Plasma][GLib] Add support for creating and referring objects
+* [ARROW-3913](https://issues.apache.org/jira/browse/ARROW-3913) - [Gandiva] [GLib] Add GGandivaLiteralNode
+* [ARROW-3914](https://issues.apache.org/jira/browse/ARROW-3914) - [C++/Python/Packaging] Docker-compose setup for Alpine linux
+* [ARROW-3916](https://issues.apache.org/jira/browse/ARROW-3916) - [Python] Support caller-provided filesystem in \`ParquetWriter\` constructor
+* [ARROW-3921](https://issues.apache.org/jira/browse/ARROW-3921) - [CI][GLib] Log Homebrew output
+* [ARROW-3922](https://issues.apache.org/jira/browse/ARROW-3922) - [C++] improve the performance of bitmap operations
+* [ARROW-3924](https://issues.apache.org/jira/browse/ARROW-3924) - [Packaging][Plasma] Add support for Plasma deb/rpm packages
+* [ARROW-3925](https://issues.apache.org/jira/browse/ARROW-3925) - [Python] Include autoconf in Linux/macOS dependencies in conda environment
+* [ARROW-3928](https://issues.apache.org/jira/browse/ARROW-3928) - [Python] Add option to deduplicate PyBytes / PyString / PyUnicode objects in Table.to\_pandas conversion path
+* [ARROW-3929](https://issues.apache.org/jira/browse/ARROW-3929) - [Go] improve memory usage of CSV reader to improve runtime performances
+* [ARROW-3930](https://issues.apache.org/jira/browse/ARROW-3930) - [C++] Random test data generation is slow
+* [ARROW-3932](https://issues.apache.org/jira/browse/ARROW-3932) - [Python/Documentation] Include Benchmarks.md in Sphinx docs
+* [ARROW-3934](https://issues.apache.org/jira/browse/ARROW-3934) - [Gandiva] Don't compile precompiled tests if ARROW\_GANDIVA\_BUILD\_TESTS=off
+* [ARROW-3938](https://issues.apache.org/jira/browse/ARROW-3938) - [Packaging] Stop referring to java/pom.xml to get version information
+* [ARROW-3939](https://issues.apache.org/jira/browse/ARROW-3939) - [Rust] Remove macro definition for ListArrayBuilder
+* [ARROW-3945](https://issues.apache.org/jira/browse/ARROW-3945) - [Website] Blog post about Gandiva code donation
+* [ARROW-3946](https://issues.apache.org/jira/browse/ARROW-3946) - [GLib] Add support for union
+* [ARROW-3948](https://issues.apache.org/jira/browse/ARROW-3948) - [CI][GLib] Set timeout to Homebrew
+* [ARROW-3950](https://issues.apache.org/jira/browse/ARROW-3950) - [Plasma] Don't force loading the TensorFlow op on import
+* [ARROW-3952](https://issues.apache.org/jira/browse/ARROW-3952) - [Rust] Specify edition="2018" in Cargo.toml
+* [ARROW-3958](https://issues.apache.org/jira/browse/ARROW-3958) - [Plasma] Reduce number of IPCs
+* [ARROW-3959](https://issues.apache.org/jira/browse/ARROW-3959) - [Rust] Time and Timestamp Support
+* [ARROW-3960](https://issues.apache.org/jira/browse/ARROW-3960) - [Rust] remove extern crate for Rust 2018
+* [ARROW-3963](https://issues.apache.org/jira/browse/ARROW-3963) - [Packaging/Docker] Nightly test for building sphinx documentations
+* [ARROW-3964](https://issues.apache.org/jira/browse/ARROW-3964) - [Go] More readable example for csv.Reader
+* [ARROW-3967](https://issues.apache.org/jira/browse/ARROW-3967) - [Gandiva] [C++] Make gandiva/node.h public
+* [ARROW-3970](https://issues.apache.org/jira/browse/ARROW-3970) - [Gandiva][C++] Remove unnecessary boost dependencies
+* [ARROW-3971](https://issues.apache.org/jira/browse/ARROW-3971) - [Python] Remove APIs deprecated in 0.11 and prior
+* [ARROW-3974](https://issues.apache.org/jira/browse/ARROW-3974) - [C++] Combine field\_builders\_ and children\_ members in array/builder.h
+* [ARROW-3982](https://issues.apache.org/jira/browse/ARROW-3982) - [C++] Allow "binary" input in simple JSON format
+* [ARROW-3983](https://issues.apache.org/jira/browse/ARROW-3983) - [Gandiva][Crossbow] Use static boost while packaging
+* [ARROW-3984](https://issues.apache.org/jira/browse/ARROW-3984) - [C++] Exit with error if user hits zstd ExternalProject path
+* [ARROW-3986](https://issues.apache.org/jira/browse/ARROW-3986) - [C++] Write prose documentation
+* [ARROW-3987](https://issues.apache.org/jira/browse/ARROW-3987) - [Java] Benchmark results for ARROW-1807
+* [ARROW-3988](https://issues.apache.org/jira/browse/ARROW-3988) - [C++] Do not build unit tests by default in build system
+* [ARROW-3993](https://issues.apache.org/jira/browse/ARROW-3993) - [JS] CI Jobs Failing
+* [ARROW-3994](https://issues.apache.org/jira/browse/ARROW-3994) - [C++] Remove ARROW\_GANDIVA\_BUILD\_TESTS option
+* [ARROW-3995](https://issues.apache.org/jira/browse/ARROW-3995) - [CI] Use understandable names in Travis Matrix
+* [ARROW-3997](https://issues.apache.org/jira/browse/ARROW-3997) - [C++] [Doc] Clarify dictionary encoding integer signedness (and width?)
+* [ARROW-4002](https://issues.apache.org/jira/browse/ARROW-4002) - [C++][Gandiva] Remove CMake version check
+* [ARROW-4004](https://issues.apache.org/jira/browse/ARROW-4004) - [GLib] Replace GPU with CUDA
+* [ARROW-4005](https://issues.apache.org/jira/browse/ARROW-4005) - [Plasma] [GLib] Add gplasma\_client\_disconnect()
+* [ARROW-4006](https://issues.apache.org/jira/browse/ARROW-4006) - Add CODE\_OF\_CONDUCT.md
+* [ARROW-4009](https://issues.apache.org/jira/browse/ARROW-4009) - [CI] Run Valgrind and C++ code coverage in different builds
+* [ARROW-4010](https://issues.apache.org/jira/browse/ARROW-4010) - [C++] Enable Travis CI scripts to build and install only certain targets
+* [ARROW-4015](https://issues.apache.org/jira/browse/ARROW-4015) - [Plasma] remove legacy interfaces for plasma manager
+* [ARROW-4017](https://issues.apache.org/jira/browse/ARROW-4017) - [C++] Check and update vendored libraries
+* [ARROW-4026](https://issues.apache.org/jira/browse/ARROW-4026) - [C++] Use separate modular $COMPONENT-test targets for unit tests
+* [ARROW-4028](https://issues.apache.org/jira/browse/ARROW-4028) - [Rust] Merge parquet-rs codebase
+* [ARROW-4029](https://issues.apache.org/jira/browse/ARROW-4029) - [C++] Define and document naming convention for internal / private header files not to be installed
+* [ARROW-4030](https://issues.apache.org/jira/browse/ARROW-4030) - [CI] Use travis\_terminate to halt builds when a step fails
+* [ARROW-4035](https://issues.apache.org/jira/browse/ARROW-4035) - [Ruby] Support msys2 mingw dependencies
+* [ARROW-4037](https://issues.apache.org/jira/browse/ARROW-4037) - [Packaging] Remove workaround to verify 0.11.0
+* [ARROW-4038](https://issues.apache.org/jira/browse/ARROW-4038) - [Rust] Add array\_ops methods for boolean AND, OR, NOT
+* [ARROW-4039](https://issues.apache.org/jira/browse/ARROW-4039) - [Python] Update link to 'development.rst' page from Python README.md
+* [ARROW-4042](https://issues.apache.org/jira/browse/ARROW-4042) - [Rust] Inconsistent method naming between BinaryArray and PrimitiveArray
+* [ARROW-4043](https://issues.apache.org/jira/browse/ARROW-4043) - [Packaging/Docker] Python tests on alpine miss pytest dependency
+* [ARROW-4044](https://issues.apache.org/jira/browse/ARROW-4044) - [Packaging/Python] Add hypothesis test dependency to pyarrow conda recipe
+* [ARROW-4045](https://issues.apache.org/jira/browse/ARROW-4045) - [Packaging/Python] Add hypothesis test dependency to wheel crossbow tests
+* [ARROW-4048](https://issues.apache.org/jira/browse/ARROW-4048) - [GLib] Return ChunkedArray instead of Array in gparquet\_arrow\_file\_reader\_read\_column
+* [ARROW-4051](https://issues.apache.org/jira/browse/ARROW-4051) - [Gandiva] [GLib] Add support for null literal
+* [ARROW-4054](https://issues.apache.org/jira/browse/ARROW-4054) - [Python] Update gtest, flatbuffers and OpenSSL in manylinux1 base image
+* [ARROW-4060](https://issues.apache.org/jira/browse/ARROW-4060) - [Rust] Add Parquet/Arrow schema converter
+* [ARROW-4069](https://issues.apache.org/jira/browse/ARROW-4069) - [Python] Add tests for casting from binary to utf8
+* [ARROW-4075](https://issues.apache.org/jira/browse/ARROW-4075) - [Rust] Reuse array builder after calling finish()
+* [ARROW-4079](https://issues.apache.org/jira/browse/ARROW-4079) - [C++] Add machine benchmarks
+* [ARROW-4080](https://issues.apache.org/jira/browse/ARROW-4080) - [Rust] Improving lengthy build times in Appveyor
+* [ARROW-4082](https://issues.apache.org/jira/browse/ARROW-4082) - [C++] CMake tweaks: allow RelWithDebInfo, improve FindClangTools
+* [ARROW-4084](https://issues.apache.org/jira/browse/ARROW-4084) - [C++] Simplify Status and stringstream boilerplate
+* [ARROW-4085](https://issues.apache.org/jira/browse/ARROW-4085) - [GLib] Use "field" for struct data type
+* [ARROW-4087](https://issues.apache.org/jira/browse/ARROW-4087) - [C++] Make CSV nulls configurable
+* [ARROW-4093](https://issues.apache.org/jira/browse/ARROW-4093) - [C++] Deprecated method suggests wrong method
+* [ARROW-4098](https://issues.apache.org/jira/browse/ARROW-4098) - [Python] Deprecate pyarrow.open\_stream,open\_file in favor of pa.ipc.open\_stream/open\_file
+* [ARROW-4100](https://issues.apache.org/jira/browse/ARROW-4100) - [Gandiva][C++] Fix regex to ignore "." character
+* [ARROW-4102](https://issues.apache.org/jira/browse/ARROW-4102) - [C++] FixedSizeBinary identity cast not implemented
+* [ARROW-4103](https://issues.apache.org/jira/browse/ARROW-4103) - [Documentation] Add README to docs/ root
+* [ARROW-4105](https://issues.apache.org/jira/browse/ARROW-4105) - Add rust-toolchain to enforce the use of the nightly toolchain for building
+* [ARROW-4107](https://issues.apache.org/jira/browse/ARROW-4107) - [Python] Use ninja in pyarrow manylinux1 build
+* [ARROW-4112](https://issues.apache.org/jira/browse/ARROW-4112) - [Packaging][Gandiva] Add support for deb packages
+* [ARROW-4116](https://issues.apache.org/jira/browse/ARROW-4116) - [Python] Clarify in development.rst that virtualenv cannot be used with miniconda/Anaconda
+* [ARROW-4122](https://issues.apache.org/jira/browse/ARROW-4122) - [C++] Initialize some uninitialized class members
+* [ARROW-4127](https://issues.apache.org/jira/browse/ARROW-4127) - [Documentation] Add Docker build instructions
+* [ARROW-4129](https://issues.apache.org/jira/browse/ARROW-4129) - [Python] Fix syntax problem in benchmark docs
+* [ARROW-4132](https://issues.apache.org/jira/browse/ARROW-4132) - [GLib] Add more GArrowTable constructors
+* [ARROW-4141](https://issues.apache.org/jira/browse/ARROW-4141) - [Ruby] Add support for creating schema from raw Ruby objects
+* [ARROW-4148](https://issues.apache.org/jira/browse/ARROW-4148) - [CI/Python] Disable ORC on nightly Alpine builds
+* [ARROW-4150](https://issues.apache.org/jira/browse/ARROW-4150) - [C++] Do not return buffers containing nullptr from internal allocations
+* [ARROW-4151](https://issues.apache.org/jira/browse/ARROW-4151) - [Rust] Restructure project directories
+* [ARROW-4152](https://issues.apache.org/jira/browse/ARROW-4152) - [GLib] Remove an example to show Torch integration
+* [ARROW-4153](https://issues.apache.org/jira/browse/ARROW-4153) - [GLib] Add builder\_append\_value() for consistency
+* [ARROW-4154](https://issues.apache.org/jira/browse/ARROW-4154) - [GLib] Add GArrowDecimal128DataType
+* [ARROW-4155](https://issues.apache.org/jira/browse/ARROW-4155) - [Rust] Implement array\_ops::sum() for PrimitiveArray<T\>
+* [ARROW-4156](https://issues.apache.org/jira/browse/ARROW-4156) - [C++] xcodebuild failure for cmake generated project
+* [ARROW-4158](https://issues.apache.org/jira/browse/ARROW-4158) - [Dev] Allow maintainers to use a GitHub API token when merging pull requests
+* [ARROW-4160](https://issues.apache.org/jira/browse/ARROW-4160) - [Rust] Add README and executable files to parquet
+* [ARROW-4161](https://issues.apache.org/jira/browse/ARROW-4161) - [GLib] Add GPlasmaClientOptions
+* [ARROW-4162](https://issues.apache.org/jira/browse/ARROW-4162) - [Ruby] Add support for creating data types from description
+* [ARROW-4166](https://issues.apache.org/jira/browse/ARROW-4166) - [Ruby] Add support for saving to and loading from buffer
+* [ARROW-4167](https://issues.apache.org/jira/browse/ARROW-4167) - [Gandiva] switch to arrow/util/variant
+* [ARROW-4168](https://issues.apache.org/jira/browse/ARROW-4168) - [GLib] Use property to keep GArrowDataType passed in garrow\_field\_new()
+* [ARROW-4172](https://issues.apache.org/jira/browse/ARROW-4172) - [Rust] more consistent naming in array builders
+* [ARROW-4174](https://issues.apache.org/jira/browse/ARROW-4174) - [Ruby] Add support for building composite array from raw Ruby objects
+* [ARROW-4175](https://issues.apache.org/jira/browse/ARROW-4175) - [GLib] Add support for decimal compare operators
+* [ARROW-4177](https://issues.apache.org/jira/browse/ARROW-4177) - [C++] Add ThreadPool and TaskGroup microbenchmarks
+* [ARROW-4183](https://issues.apache.org/jira/browse/ARROW-4183) - [Ruby] Add Arrow::Struct as an element of Arrow::StructArray
+* [ARROW-4184](https://issues.apache.org/jira/browse/ARROW-4184) - [Ruby] Add Arrow::RecordBatch\#to\_table
+* [ARROW-4191](https://issues.apache.org/jira/browse/ARROW-4191) - [C++] Use same CC and AR for jemalloc as for the main sources
+* [ARROW-4199](https://issues.apache.org/jira/browse/ARROW-4199) - [GLib] Add garrow\_seekable\_input\_stream\_peek()
+* [ARROW-4207](https://issues.apache.org/jira/browse/ARROW-4207) - [Gandiva] [GLib] Add support for IfNode
+* [ARROW-4210](https://issues.apache.org/jira/browse/ARROW-4210) - [Python] Mention boost-cpp directly in the conda meta.yaml for pyarrow
+* [ARROW-4211](https://issues.apache.org/jira/browse/ARROW-4211) - [GLib] Add GArrowFixedSizeBinaryDataType
+* [ARROW-4214](https://issues.apache.org/jira/browse/ARROW-4214) - [Ruby] Add support for building RecordBatch from raw Ruby objects
+* [ARROW-4216](https://issues.apache.org/jira/browse/ARROW-4216) - [Python] Add CUDA API docs
+* [ARROW-4228](https://issues.apache.org/jira/browse/ARROW-4228) - [GLib] Add garrow\_list\_data\_type\_get\_field()
+* [ARROW-4229](https://issues.apache.org/jira/browse/ARROW-4229) - [Packaging] Set crossbow target explicitly to enable building arbitrary arrow repo
+* [ARROW-4233](https://issues.apache.org/jira/browse/ARROW-4233) - [Packaging] Create a Dockerfile to build source archive
+* [ARROW-4239](https://issues.apache.org/jira/browse/ARROW-4239) - [Release] Updating .deb package names in the prepare script failed to run on OSX
+* [ARROW-4240](https://issues.apache.org/jira/browse/ARROW-4240) - [Packaging] Documents for Plasma GLib and Gandiva GLib are missing in source archive
+* [ARROW-4241](https://issues.apache.org/jira/browse/ARROW-4241) - [Packaging] Disable crossbow conda OSX clang builds
+* [ARROW-4243](https://issues.apache.org/jira/browse/ARROW-4243) - [Python] Test failure with pandas 0.24.0rc1
+* [ARROW-4249](https://issues.apache.org/jira/browse/ARROW-4249) - [Plasma] Remove reference to logging.h from plasma/common.h
+* [ARROW-4257](https://issues.apache.org/jira/browse/ARROW-4257) - [Release] Update release verification script to check binaries on Bintray
+* [ARROW-4266](https://issues.apache.org/jira/browse/ARROW-4266) - [Python][CI] Disable ORC tests in dask integration test
+* [ARROW-4269](https://issues.apache.org/jira/browse/ARROW-4269) - [Python] AttributeError: module 'pandas.core' has no attribute 'arrays'
+* [ARROW-4270](https://issues.apache.org/jira/browse/ARROW-4270) - [Packaging][Conda] Update xcode version and remove toolchain builds
+* [ARROW-4276](https://issues.apache.org/jira/browse/ARROW-4276) - [Release] Remove needless Bintray authentication from binaries verify script
+* [ARROW-4306](https://issues.apache.org/jira/browse/ARROW-4306) - [Release] Update website and add blog post announcing 0.12.0 release
+* [PARQUET-690](https://issues.apache.org/jira/browse/PARQUET-690) - [C++] Investigate / improve performance of Thrift utilities
+* [PARQUET-1271](https://issues.apache.org/jira/browse/PARQUET-1271) - [C++] "parquet\_reader" should be "parquet-reader"
+* [PARQUET-1439](https://issues.apache.org/jira/browse/PARQUET-1439) - [C++] Parquet build fails when PARQUET\_ARROW\_LINKAGE is static
+* [PARQUET-1449](https://issues.apache.org/jira/browse/PARQUET-1449) - [C++] Can't build with ARROW\_BOOST\_VENDORED=ON
+* [PARQUET-1463](https://issues.apache.org/jira/browse/PARQUET-1463) - [C++] Utilize revamped common hashing machinery for dictionary encoding
+* [PARQUET-1467](https://issues.apache.org/jira/browse/PARQUET-1467) - [C++] Remove ChunkedAllocator code, now unused
+* [PARQUET-1473](https://issues.apache.org/jira/browse/PARQUET-1473) - [C++] Add helper function that converts ParquetVersion to human-friendly string
+* [PARQUET-1484](https://issues.apache.org/jira/browse/PARQUET-1484) - [C++] Improve memory usage of FileMetaDataBuilder
+
+
+## Bug Fixes
+
+* [ARROW-1847](https://issues.apache.org/jira/browse/ARROW-1847) - [Doc] Document the difference between RecordBatch and Table in an FAQ fashion
+* [ARROW-2026](https://issues.apache.org/jira/browse/ARROW-2026) - [Python] Cast all timestamp resolutions to INT96 when use\_deprecated\_int96\_timestamps=True
+* [ARROW-2038](https://issues.apache.org/jira/browse/ARROW-2038) - [Python] Follow-up bug fixes for s3fs Parquet support
+* [ARROW-2113](https://issues.apache.org/jira/browse/ARROW-2113) - [Python] Incomplete CLASSPATH with "hadoop" contained in it can fool the classpath-setting HDFS logic
+* [ARROW-2591](https://issues.apache.org/jira/browse/ARROW-2591) - [Python] Segmentation fault when writing empty ListType column to Parquet
+* [ARROW-2592](https://issues.apache.org/jira/browse/ARROW-2592) - [Python] Error reading old Parquet file due to metadata backwards compatibility issue
+* [ARROW-2654](https://issues.apache.org/jira/browse/ARROW-2654) - [Python] Error with errno 22 when loading 3.6 GB Parquet file
+* [ARROW-2708](https://issues.apache.org/jira/browse/ARROW-2708) - [C++] Internal GetValues function in arrow::compute should check for nullptr
+* [ARROW-2831](https://issues.apache.org/jira/browse/ARROW-2831) - [Plasma] MemoryError in teardown
+* [ARROW-2970](https://issues.apache.org/jira/browse/ARROW-2970) - [Python] NumPyConverter::Visit for Binary/String/FixedSizeBinary can overflow
+* [ARROW-2987](https://issues.apache.org/jira/browse/ARROW-2987) - [Python] test\_cython\_api can fail if run in an environment where vcvarsall.bat has been run more than once
+* [ARROW-3048](https://issues.apache.org/jira/browse/ARROW-3048) - [Python] Import pyarrow fails if scikit-learn is installed from conda (boost-cpp / libboost issue)
+* [ARROW-3058](https://issues.apache.org/jira/browse/ARROW-3058) - [Python] Feather reads fail with unintuitive error when conversion from pandas yields ChunkedArray
+* [ARROW-3186](https://issues.apache.org/jira/browse/ARROW-3186) - [GLib] mesonbuild failures in Travis CI
+* [ARROW-3202](https://issues.apache.org/jira/browse/ARROW-3202) - [C++] Build does not succeed on Alpine Linux
+* [ARROW-3225](https://issues.apache.org/jira/browse/ARROW-3225) - [C++/Python] Pandas object conversion of ListType<DateType\> and ListType<TimeType\>
+* [ARROW-3324](https://issues.apache.org/jira/browse/ARROW-3324) - [Parquet] Free more internal resources when writing multiple row groups
+* [ARROW-3343](https://issues.apache.org/jira/browse/ARROW-3343) - [Java] Java tests fail non-deterministically with memory leak from Flight tests
+* [ARROW-3405](https://issues.apache.org/jira/browse/ARROW-3405) - [Python] Document CSV reader
+* [ARROW-3428](https://issues.apache.org/jira/browse/ARROW-3428) - [Python] from\_pandas gives incorrect results when converting floating point to bool
+* [ARROW-3436](https://issues.apache.org/jira/browse/ARROW-3436) - [C++] Boost version required by Gandiva is too new for Ubuntu 14.04
+* [ARROW-3437](https://issues.apache.org/jira/browse/ARROW-3437) - [Gandiva][C++] Configure static linking of libgcc, libstdc++ with LDFLAGS
+* [ARROW-3438](https://issues.apache.org/jira/browse/ARROW-3438) - [Packaging] Escaped bullet points in changelog
+* [ARROW-3445](https://issues.apache.org/jira/browse/ARROW-3445) - [GLib] Parquet GLib doesn't link Arrow GLib
+* [ARROW-3449](https://issues.apache.org/jira/browse/ARROW-3449) - [C++] Support CMake 3.2 for "out of the box" builds
+* [ARROW-3466](https://issues.apache.org/jira/browse/ARROW-3466) - [Python] Crash when importing tensorflow and pyarrow
+* [ARROW-3467](https://issues.apache.org/jira/browse/ARROW-3467) - Building against external double conversion is broken
+* [ARROW-3470](https://issues.apache.org/jira/browse/ARROW-3470) - [C++] Row-wise conversion tutorial has fallen out of date
+* [ARROW-3477](https://issues.apache.org/jira/browse/ARROW-3477) - [C++] Testsuite fails on 32 bit arch
+* [ARROW-3480](https://issues.apache.org/jira/browse/ARROW-3480) - [Website] Install document for Ubuntu is broken
+* [ARROW-3483](https://issues.apache.org/jira/browse/ARROW-3483) - [CI] Python 3.6 build failure on Travis-CI
+* [ARROW-3485](https://issues.apache.org/jira/browse/ARROW-3485) - [C++] Examples fail with Protobuf error
+* [ARROW-3494](https://issues.apache.org/jira/browse/ARROW-3494) - [C++] re2 conda-forge package not working in toolchain
+* [ARROW-3498](https://issues.apache.org/jira/browse/ARROW-3498) - [R] Make IPC APIs consistent
+* [ARROW-3516](https://issues.apache.org/jira/browse/ARROW-3516) - [C++] Use unsigned type for difference of pointers in parallel\_memcpy
+* [ARROW-3517](https://issues.apache.org/jira/browse/ARROW-3517) - [C++] MinGW 32bit build causes g++ segv
+* [ARROW-3524](https://issues.apache.org/jira/browse/ARROW-3524) - [C++] Fix compiler warnings from ARROW-3409 on clang-6
+* [ARROW-3527](https://issues.apache.org/jira/browse/ARROW-3527) - [R] Unused variables in R-package C++ code
+* [ARROW-3528](https://issues.apache.org/jira/browse/ARROW-3528) - [R] Typo in R documentation
+* [ARROW-3535](https://issues.apache.org/jira/browse/ARROW-3535) - [Python] pip install tensorflow installs too new a numpy in manylinux1 build
+* [ARROW-3541](https://issues.apache.org/jira/browse/ARROW-3541) - [Rust] Update BufferBuilder to allow for new bit-packed BooleanArray
+* [ARROW-3544](https://issues.apache.org/jira/browse/ARROW-3544) - [Gandiva] Populate function registry in multiple compilation units to mitigate long compile times in release mode
+* [ARROW-3549](https://issues.apache.org/jira/browse/ARROW-3549) - [Rust] Replace i64 with usize for some bit utility functions
+* [ARROW-3573](https://issues.apache.org/jira/browse/ARROW-3573) - [Rust] with\_bitset does not set valid bits correctly
+* [ARROW-3580](https://issues.apache.org/jira/browse/ARROW-3580) - [Gandiva][C++] Build error with g++ 8.2.0
+* [ARROW-3586](https://issues.apache.org/jira/browse/ARROW-3586) - [Python] Segmentation fault when converting empty table to pandas with categoricals
+* [ARROW-3598](https://issues.apache.org/jira/browse/ARROW-3598) - [Plasma] plasma\_store\_server fails linking with GPU enabled
+* [ARROW-3613](https://issues.apache.org/jira/browse/ARROW-3613) - [Go] Resize does not correctly update the length
+* [ARROW-3614](https://issues.apache.org/jira/browse/ARROW-3614) - [R] Handle Type::TIMESTAMP from Arrow to R
+* [ARROW-3634](https://issues.apache.org/jira/browse/ARROW-3634) - [GLib] cuda.cpp compile error
+* [ARROW-3637](https://issues.apache.org/jira/browse/ARROW-3637) - [Go] Implement Stringer for arrays
+* [ARROW-3658](https://issues.apache.org/jira/browse/ARROW-3658) - [Rust] validation of offsets buffer is incorrect for \`List<T\>\`
+* [ARROW-3670](https://issues.apache.org/jira/browse/ARROW-3670) - [C++] Use FindBacktrace to find execinfo.h support
+* [ARROW-3687](https://issues.apache.org/jira/browse/ARROW-3687) - [Rust] Anything measuring array slots should be \`usize\`
+* [ARROW-3698](https://issues.apache.org/jira/browse/ARROW-3698) - [C++] Segmentation fault when using a large table in Gandiva
+* [ARROW-3700](https://issues.apache.org/jira/browse/ARROW-3700) - [C++] CSV parser should allow ignoring empty lines
+* [ARROW-3703](https://issues.apache.org/jira/browse/ARROW-3703) - [Python] DataFrame.to\_parquet crashes if datetime column has time zones
+* [ARROW-3704](https://issues.apache.org/jira/browse/ARROW-3704) - [Gandiva] Can't build with g++ 8.2.0
+* [ARROW-3707](https://issues.apache.org/jira/browse/ARROW-3707) - [C++] test failure with zstd 1.3.7
+* [ARROW-3711](https://issues.apache.org/jira/browse/ARROW-3711) - [C++] Don't pass CXX\_FLAGS to C\_FLAGS
+* [ARROW-3712](https://issues.apache.org/jira/browse/ARROW-3712) - [CI] License check regression (RAT failure)
+* [ARROW-3715](https://issues.apache.org/jira/browse/ARROW-3715) - [C++] gflags\_ep fails to build with CMake 3.13
+* [ARROW-3716](https://issues.apache.org/jira/browse/ARROW-3716) - [R] Missing cases for ChunkedArray conversion
+* [ARROW-3728](https://issues.apache.org/jira/browse/ARROW-3728) - [Python] Merging Parquet Files - Pandas Meta in Schema Mismatch
+* [ARROW-3734](https://issues.apache.org/jira/browse/ARROW-3734) - [C++] Linking static zstd library fails on Arch x86-64
+* [ARROW-3740](https://issues.apache.org/jira/browse/ARROW-3740) - [C++] Calling ArrayBuilder::Resize with length smaller than current appended length results in invalid state
+* [ARROW-3742](https://issues.apache.org/jira/browse/ARROW-3742) - Fix pyarrow.types & gandiva cython bindings
+* [ARROW-3745](https://issues.apache.org/jira/browse/ARROW-3745) - [C++] CMake passes static libraries multiple times to linker
+* [ARROW-3754](https://issues.apache.org/jira/browse/ARROW-3754) - [Packaging] Zstd configure error on linux package builds
+* [ARROW-3756](https://issues.apache.org/jira/browse/ARROW-3756) - [CI/Docker/Java] Java tests are failing in docker-compose setup
+* [ARROW-3765](https://issues.apache.org/jira/browse/ARROW-3765) - [Gandiva] Segfault when the validity bitmap has not been allocated
+* [ARROW-3766](https://issues.apache.org/jira/browse/ARROW-3766) - [Python] pa.Table.from\_pandas doesn't use schema ordering
+* [ARROW-3768](https://issues.apache.org/jira/browse/ARROW-3768) - [Python] set classpath to hdfs not hadoop executable
+* [ARROW-3775](https://issues.apache.org/jira/browse/ARROW-3775) - [C++] Handling Parquet Arrow reads that overflow a BinaryArray capacity
+* [ARROW-3790](https://issues.apache.org/jira/browse/ARROW-3790) - [C++] Signed to unsigned integer cast yields incorrect results when type sizes are the same
+* [ARROW-3792](https://issues.apache.org/jira/browse/ARROW-3792) - [Python] Segmentation fault when writing empty RecordBatches to Parquet
+* [ARROW-3793](https://issues.apache.org/jira/browse/ARROW-3793) - [C++] TestScalarAppendUnsafe is not testing unsafe appends
+* [ARROW-3797](https://issues.apache.org/jira/browse/ARROW-3797) - [Rust] BinaryArray::value\_offset incorrect in offset case
+* [ARROW-3805](https://issues.apache.org/jira/browse/ARROW-3805) - [Gandiva] handle null validity bitmap in if-else expressions
+* [ARROW-3831](https://issues.apache.org/jira/browse/ARROW-3831) - [C++] arrow::util::Codec::Decompress() doesn't return decompressed data size
+* [ARROW-3835](https://issues.apache.org/jira/browse/ARROW-3835) - [C++] arrow::io::CompressedOutputStream::raw() implementation is missing
+* [ARROW-3837](https://issues.apache.org/jira/browse/ARROW-3837) - [C++] gflags link errors on Windows
+* [ARROW-3866](https://issues.apache.org/jira/browse/ARROW-3866) - [Python] Column metadata is not transferred to tables in pyarrow
+* [ARROW-3869](https://issues.apache.org/jira/browse/ARROW-3869) - [Rust] "invalid fastbin errors" since Rust nightly-2018-11-03
+* [ARROW-3874](https://issues.apache.org/jira/browse/ARROW-3874) - [Gandiva] Cannot build: LLVM not detected correctly
+* [ARROW-3879](https://issues.apache.org/jira/browse/ARROW-3879) - [C++] cuda-test failure
+* [ARROW-3888](https://issues.apache.org/jira/browse/ARROW-3888) - [C++] Compilation warnings with gcc 7.3.0
+* [ARROW-3889](https://issues.apache.org/jira/browse/ARROW-3889) - [Python] creating schema with invalid parameters causes segmentation fault
+* [ARROW-3890](https://issues.apache.org/jira/browse/ARROW-3890) - [Python] Creating Array with explicit string type fails on Python 2.7
+* [ARROW-3894](https://issues.apache.org/jira/browse/ARROW-3894) - [Python] Error reading IPC file with no record batches
+* [ARROW-3898](https://issues.apache.org/jira/browse/ARROW-3898) - parquet-arrow example has compilation errors
+* [ARROW-3909](https://issues.apache.org/jira/browse/ARROW-3909) - [Python] Table.from\_pandas call that seemingly should zero copy does not
+* [ARROW-3918](https://issues.apache.org/jira/browse/ARROW-3918) - [Python] ParquetWriter.write\_table doesn't support coerce\_timestamps or allow\_truncated\_timestamps
+* [ARROW-3920](https://issues.apache.org/jira/browse/ARROW-3920) - Plasma reference counting not properly done in TensorFlow custom operator.
+* [ARROW-3931](https://issues.apache.org/jira/browse/ARROW-3931) - [C++] Make it possible to build regardless of LANG
+* [ARROW-3936](https://issues.apache.org/jira/browse/ARROW-3936) - Add \_O\_NOINHERIT to the file open flags on Windows
+* [ARROW-3937](https://issues.apache.org/jira/browse/ARROW-3937) - [Rust] Rust nightly build is failing
+* [ARROW-3940](https://issues.apache.org/jira/browse/ARROW-3940) - [Python/Documentation] Add required packages to the development instruction
+* [ARROW-3941](https://issues.apache.org/jira/browse/ARROW-3941) - [R] RecordBatchStreamReader$schema
+* [ARROW-3942](https://issues.apache.org/jira/browse/ARROW-3942) - [R] Feather api fixes
+* [ARROW-3953](https://issues.apache.org/jira/browse/ARROW-3953) - Compat with pandas 0.24 rename of MultiIndex labels -\> codes
+* [ARROW-3955](https://issues.apache.org/jira/browse/ARROW-3955) - [GLib] Add (transfer full) to free when no longer needed
+* [ARROW-3957](https://issues.apache.org/jira/browse/ARROW-3957) - [Python] Better error message when user connects to HDFS cluster with wrong port
+* [ARROW-3961](https://issues.apache.org/jira/browse/ARROW-3961) - [Python/Documentation] Fix wrong path in the pyarrow README
+* [ARROW-3969](https://issues.apache.org/jira/browse/ARROW-3969) - [Rust] CI build broken because rustfmt not available on nightly toolchain
+* [ARROW-3976](https://issues.apache.org/jira/browse/ARROW-3976) - [Ruby] Homebrew donation solicitation on CLI breaking CI builds
+* [ARROW-3977](https://issues.apache.org/jira/browse/ARROW-3977) - [Gandiva] gandiva cpp tests not running in CI
+* [ARROW-3979](https://issues.apache.org/jira/browse/ARROW-3979) - [Gandiva] fix all valgrind reported errors
+* [ARROW-3980](https://issues.apache.org/jira/browse/ARROW-3980) - [C++] Fix CRTP use in json-simple.cc
+* [ARROW-3989](https://issues.apache.org/jira/browse/ARROW-3989) - [Rust] CSV reader should handle case sensitivity for boolean values
+* [ARROW-3996](https://issues.apache.org/jira/browse/ARROW-3996) - [C++] Insufficient description on build
+* [ARROW-4008](https://issues.apache.org/jira/browse/ARROW-4008) - [C++] Integration test executable failure
+* [ARROW-4011](https://issues.apache.org/jira/browse/ARROW-4011) - [Gandiva] Refer irhelpers.bc in build directory
+* [ARROW-4019](https://issues.apache.org/jira/browse/ARROW-4019) - [C++] Fix coverity issues
+* [ARROW-4033](https://issues.apache.org/jira/browse/ARROW-4033) - [C++] thirdparty/download\_dependencies.sh uses tools or options not available in older Linuxes
+* [ARROW-4034](https://issues.apache.org/jira/browse/ARROW-4034) - [Ruby] Interface for FileOutputStream doesn't respect append=True
+* [ARROW-4041](https://issues.apache.org/jira/browse/ARROW-4041) - [CI] Python 2.7 run uses Python 3.6
+* [ARROW-4049](https://issues.apache.org/jira/browse/ARROW-4049) - [C++] Arrow never uses glog even though glog is linked.
+* [ARROW-4052](https://issues.apache.org/jira/browse/ARROW-4052) - [C++] Linker errors with glog and gflags
+* [ARROW-4053](https://issues.apache.org/jira/browse/ARROW-4053) - [Python/Integration] HDFS Tests failing with I/O operation on closed file
+* [ARROW-4055](https://issues.apache.org/jira/browse/ARROW-4055) - [Python] Fails to convert pytz.utc with versions 2018.3 and earlier
+* [ARROW-4058](https://issues.apache.org/jira/browse/ARROW-4058) - [C++] arrow-io-hdfs-test fails when run against HDFS cluster from docker-compose
+* [ARROW-4065](https://issues.apache.org/jira/browse/ARROW-4065) - [C++] arrowTargets.cmake is broken
+* [ARROW-4066](https://issues.apache.org/jira/browse/ARROW-4066) - Instructions to create Sphinx documentation
+* [ARROW-4070](https://issues.apache.org/jira/browse/ARROW-4070) - [C++] ARROW\_BOOST\_VENDORED doesn't work properly with ninja build
+* [ARROW-4073](https://issues.apache.org/jira/browse/ARROW-4073) - [Python] Parquet test failures on AppVeyor
+* [ARROW-4074](https://issues.apache.org/jira/browse/ARROW-4074) - [Python] test\_get\_library\_dirs\_win32 fails if libraries installed someplace different from conda or wheel packages
+* [ARROW-4078](https://issues.apache.org/jira/browse/ARROW-4078) - [CI] Run Travis job where documentation is built when docs/ is changed
+* [ARROW-4088](https://issues.apache.org/jira/browse/ARROW-4088) - [Python] Table.from\_batches() fails when passed a schema with metadata
+* [ARROW-4089](https://issues.apache.org/jira/browse/ARROW-4089) - [Plasma] The tutorial is wrong regarding the parameter type of PlasmaClient.Create
+* [ARROW-4101](https://issues.apache.org/jira/browse/ARROW-4101) - [C++] Binary identity cast not implemented
+* [ARROW-4106](https://issues.apache.org/jira/browse/ARROW-4106) - [Python] Tests fail to run because hypothesis update broke its API
+* [ARROW-4109](https://issues.apache.org/jira/browse/ARROW-4109) - [Packaging] Missing glog dependency from arrow-cpp conda recipe
+* [ARROW-4113](https://issues.apache.org/jira/browse/ARROW-4113) - [R] Version number patch broke build
+* [ARROW-4114](https://issues.apache.org/jira/browse/ARROW-4114) - [C++][DOCUMENTATION] Add "python" to Linux build instructions
+* [ARROW-4115](https://issues.apache.org/jira/browse/ARROW-4115) - [Gandiva] valgrind complains that boolean output data buffer has uninited data
+* [ARROW-4118](https://issues.apache.org/jira/browse/ARROW-4118) - [Python] Error with "asv run"
+* [ARROW-4125](https://issues.apache.org/jira/browse/ARROW-4125) - [Python] ASV benchmarks fail to run if Plasma extension is not built (e.g. on Windows)
+* [ARROW-4126](https://issues.apache.org/jira/browse/ARROW-4126) - [Go] offset not used when accessing boolean array
+* [ARROW-4128](https://issues.apache.org/jira/browse/ARROW-4128) - [C++][DOCUMENTATION] Update style guide to reflect some more exceptions
+* [ARROW-4130](https://issues.apache.org/jira/browse/ARROW-4130) - [Go] offset not used when accessing binary array
+* [ARROW-4134](https://issues.apache.org/jira/browse/ARROW-4134) - [Packaging] Properly setup timezone in docker tests to prevent ORC adapter's abort
+* [ARROW-4135](https://issues.apache.org/jira/browse/ARROW-4135) - [Python] Can't reload a pandas dataframe containing a list of datetime.time
+* [ARROW-4137](https://issues.apache.org/jira/browse/ARROW-4137) - [Rust] Move parquet code into a separate crate
+* [ARROW-4138](https://issues.apache.org/jira/browse/ARROW-4138) - [Python] setuptools\_scm customization does not work for versions above 0.9.0 on Windows
+* [ARROW-4147](https://issues.apache.org/jira/browse/ARROW-4147) - [Java] Reduce heap usage for variable width vectors
+* [ARROW-4149](https://issues.apache.org/jira/browse/ARROW-4149) - [CI/C++] Parquet test misses ZSTD compression codec in CMake 3.2 nightly builds
+* [ARROW-4157](https://issues.apache.org/jira/browse/ARROW-4157) - [C++] -Wdocumentation failures with clang 6.0 on Ubuntu 18.04
+* [ARROW-4171](https://issues.apache.org/jira/browse/ARROW-4171) - [Rust] fix parquet crate release version
+* [ARROW-4173](https://issues.apache.org/jira/browse/ARROW-4173) - JIRA library name is wrong in error message of dev/merge\_arrow\_pr.py
+* [ARROW-4178](https://issues.apache.org/jira/browse/ARROW-4178) - [C++] Fix TSan and UBSan errors
+* [ARROW-4179](https://issues.apache.org/jira/browse/ARROW-4179) - [Python] Tests crashing on all platforms in CI
+* [ARROW-4182](https://issues.apache.org/jira/browse/ARROW-4182) - [Python][CI] SEGV frequency
+* [ARROW-4185](https://issues.apache.org/jira/browse/ARROW-4185) - [Rust] AppVeyor builds are broken
+* [ARROW-4186](https://issues.apache.org/jira/browse/ARROW-4186) - [C++] BitmapWriters clobber the first byte when length=0
+* [ARROW-4188](https://issues.apache.org/jira/browse/ARROW-4188) - [Rust] There should be a README in the top level rust directory
+* [ARROW-4197](https://issues.apache.org/jira/browse/ARROW-4197) - [C++] Emscripten compiler fails building Arrow
+* [ARROW-4200](https://issues.apache.org/jira/browse/ARROW-4200) - [C++] conda\_env\_\* files cannot be used to create a fresh conda environment on Windows
+* [ARROW-4209](https://issues.apache.org/jira/browse/ARROW-4209) - [Gandiva] returning IR structs causes issues with Windows
+* [ARROW-4215](https://issues.apache.org/jira/browse/ARROW-4215) - [GLib] Fix typos in documentation
+* [ARROW-4227](https://issues.apache.org/jira/browse/ARROW-4227) - [GLib] Field in composite data type returns wrong data type
+* [ARROW-4237](https://issues.apache.org/jira/browse/ARROW-4237) - [Packaging] Fix CMAKE\_INSTALL\_LIBDIR in release verification script
+* [ARROW-4238](https://issues.apache.org/jira/browse/ARROW-4238) - [Packaging] Fix RC version conflict between crossbow and rake
+* [ARROW-4246](https://issues.apache.org/jira/browse/ARROW-4246) - [Plasma][Python] PlasmaClient.list doesn't work with CUDA enabled Plasma
+* [ARROW-4256](https://issues.apache.org/jira/browse/ARROW-4256) - [Release] Update Windows verification script for 0.12 release
+* [ARROW-4258](https://issues.apache.org/jira/browse/ARROW-4258) - [Python] Safe cast fails from numpy float64 array with nans to integer
+* [ARROW-4260](https://issues.apache.org/jira/browse/ARROW-4260) - [Python] test\_serialize\_deserialize\_pandas is failing in multiple build entries
+* [PARQUET-1426](https://issues.apache.org/jira/browse/PARQUET-1426) - [C++] parquet-dump-schema has poor usability
+* [PARQUET-1458](https://issues.apache.org/jira/browse/PARQUET-1458) - [C++] parquet::CompressionToString not recognizing brotli compression
+* [PARQUET-1469](https://issues.apache.org/jira/browse/PARQUET-1469) - [C++] DefinitionLevelsToBitmap can overwrite prior decoded data
+* [PARQUET-1471](https://issues.apache.org/jira/browse/PARQUET-1471) - [C++] Out of bounds access in statistics UpdateSpaced when writing optional list with null list slots
+* [PARQUET-1481](https://issues.apache.org/jira/browse/PARQUET-1481) - [C++] SEGV when reading corrupt parquet file
+
+
+
+# Apache Arrow 0.11.1 (2018-10-23)
+
+## New Features and Improvements
+
+* [ARROW-3353](https://issues.apache.org/jira/browse/ARROW-3353) - [Packaging] Build python 3.7 wheels
+* [ARROW-3534](https://issues.apache.org/jira/browse/ARROW-3534) - [Python] Update zlib library in manylinux1 image
+* [ARROW-3546](https://issues.apache.org/jira/browse/ARROW-3546) - [Python] Provide testing setup to verify wheel binaries work in one or more common Linux distributions
+* [ARROW-3565](https://issues.apache.org/jira/browse/ARROW-3565) - [Python] Pin tensorflow to 1.11.0 in manylinux1 container
+
+
+## Bug Fixes
+
+* [ARROW-3514](https://issues.apache.org/jira/browse/ARROW-3514) - [Python] zlib deflate exception when writing Parquet file
+* [ARROW-3907](https://issues.apache.org/jira/browse/ARROW-3907) - [Python] from\_pandas errors when schemas are used with lower resolution timestamps
+
+
+
+# Apache Arrow 0.11.0 (2018-10-08)
+
+## New Features and Improvements
+
+* [ARROW-25](https://issues.apache.org/jira/browse/ARROW-25) - [C++] Implement delimited file scanner / CSV reader
+* [ARROW-249](https://issues.apache.org/jira/browse/ARROW-249) - [Flight] Define GRPC IDL / wire protocol for messaging with Arrow data
+* [ARROW-614](https://issues.apache.org/jira/browse/ARROW-614) - [C++] Use glog (or some other tool) to print stack traces in debug builds on errors
+* [ARROW-1325](https://issues.apache.org/jira/browse/ARROW-1325) - [R] Bootstrap R bindings subproject
+* [ARROW-1424](https://issues.apache.org/jira/browse/ARROW-1424) - [Python] Initial bindings for libarrow\_gpu
+* [ARROW-1491](https://issues.apache.org/jira/browse/ARROW-1491) - [C++] Add casting implementations from strings to numbers or boolean
+* [ARROW-1521](https://issues.apache.org/jira/browse/ARROW-1521) - [C++] Add Reset method to BufferOutputStream to enable object reuse
+* [ARROW-1563](https://issues.apache.org/jira/browse/ARROW-1563) - [C++] Implement logical unary and binary kernels for boolean arrays
+* [ARROW-1860](https://issues.apache.org/jira/browse/ARROW-1860) - [C++] Add data structure to "stage" a sequence of IPC messages from in-memory data
+* [ARROW-1949](https://issues.apache.org/jira/browse/ARROW-1949) - [Python/C++] Add option to Array.from\_pandas and pyarrow.array to perform unsafe casts
+* [ARROW-1963](https://issues.apache.org/jira/browse/ARROW-1963) - [C++/Python] Create Array from sequence of numpy.datetime64
+* [ARROW-1968](https://issues.apache.org/jira/browse/ARROW-1968) - [Python] Unit testing setup for ORC files
+* [ARROW-2165](https://issues.apache.org/jira/browse/ARROW-2165) - enhance AllocatorListener to listen for child allocator addition and removal
+* [ARROW-2338](https://issues.apache.org/jira/browse/ARROW-2338) - [Scripts] Windows release verification script should create a conda environment
+* [ARROW-2352](https://issues.apache.org/jira/browse/ARROW-2352) - [C++/Python] Test OSX packaging in Travis matrix
+* [ARROW-2519](https://issues.apache.org/jira/browse/ARROW-2519) - [Rust] Implement min/max for primitive arrays
+* [ARROW-2520](https://issues.apache.org/jira/browse/ARROW-2520) - [Rust] CI should also build against nightly Rust
+* [ARROW-2555](https://issues.apache.org/jira/browse/ARROW-2555) - [Python] Provide an option to convert on coerce\_timestamps instead of error
+* [ARROW-2583](https://issues.apache.org/jira/browse/ARROW-2583) - [Rust] Buffer should be typeless
+* [ARROW-2617](https://issues.apache.org/jira/browse/ARROW-2617) - [Rust] Schema should contain fields not columns
+* [ARROW-2687](https://issues.apache.org/jira/browse/ARROW-2687) - [JS] Example usage in README is outdated
+* [ARROW-2734](https://issues.apache.org/jira/browse/ARROW-2734) - [Python] Cython api example doesn't work by default on macOS
+* [ARROW-2750](https://issues.apache.org/jira/browse/ARROW-2750) - [MATLAB] Add MATLAB support for reading numeric types from Feather files
+* [ARROW-2799](https://issues.apache.org/jira/browse/ARROW-2799) - [Python] Add safe option to Table.from\_pandas to avoid unsafe casts
+* [ARROW-2813](https://issues.apache.org/jira/browse/ARROW-2813) - [C++] Strip uninformative lcov output from Travis CI logs
+* [ARROW-2817](https://issues.apache.org/jira/browse/ARROW-2817) - [C++] Enable libraries to be installed in msys2 on Windows
+* [ARROW-2840](https://issues.apache.org/jira/browse/ARROW-2840) - [C++] See if stream alignment logic can be simplified
+* [ARROW-2865](https://issues.apache.org/jira/browse/ARROW-2865) - [C++/Python] Reduce some duplicated code in python/builtin\_convert.cc
+* [ARROW-2889](https://issues.apache.org/jira/browse/ARROW-2889) - [C++] Add optional argument to ADD\_ARROW\_TEST CMake function to add unit test prefix
+* [ARROW-2900](https://issues.apache.org/jira/browse/ARROW-2900) - [Python] Improve performance of appending nested NumPy arrays in builtin\_convert.cc
+* [ARROW-2936](https://issues.apache.org/jira/browse/ARROW-2936) - [Python] Implement Table.cast for casting from one schema to another (if possible)
+* [ARROW-2948](https://issues.apache.org/jira/browse/ARROW-2948) - [Packaging] Generate changelog with crossbow
+* [ARROW-2950](https://issues.apache.org/jira/browse/ARROW-2950) - [C++] Clean up util/bit-util.h
+* [ARROW-2952](https://issues.apache.org/jira/browse/ARROW-2952) - [C++] Dockerfile for running include-what-you-use checks
+* [ARROW-2958](https://issues.apache.org/jira/browse/ARROW-2958) - [C++] Flatbuffers EP fails to compile with GCC 8.1
+* [ARROW-2960](https://issues.apache.org/jira/browse/ARROW-2960) - [Packaging] Fix verify-release-candidate for binary packages and fix release cutting script for lib64 cmake issue
+* [ARROW-2964](https://issues.apache.org/jira/browse/ARROW-2964) - [Go] wire all currently implemented array types in array.MakeFromData
+* [ARROW-2971](https://issues.apache.org/jira/browse/ARROW-2971) - [Python] Give more descriptive names to python\_to\_arrow.cc/arrow\_to\_python.cc
+* [ARROW-2972](https://issues.apache.org/jira/browse/ARROW-2972) - [Python] Implement inference logic for uint64 conversions in builtin\_convert.cc
+* [ARROW-2975](https://issues.apache.org/jira/browse/ARROW-2975) - [Plasma] TensorFlow op: Compilation only working if arrow found by pkg-config
+* [ARROW-2976](https://issues.apache.org/jira/browse/ARROW-2976) - [Python] Directory in pyarrow.get\_library\_dirs() on Travis doesn't contain libarrow.so
+* [ARROW-2979](https://issues.apache.org/jira/browse/ARROW-2979) - [GLib] Add operator functions in GArrowDecimal128
+* [ARROW-2983](https://issues.apache.org/jira/browse/ARROW-2983) - [Packaging] Verify source release and binary artifacts in different scripts
+* [ARROW-2989](https://issues.apache.org/jira/browse/ARROW-2989) - [C++] Remove deprecated APIs in 0.10.0 and below
+* [ARROW-2991](https://issues.apache.org/jira/browse/ARROW-2991) - [CI] Cut down number of AppVeyor jobs
+* [ARROW-2994](https://issues.apache.org/jira/browse/ARROW-2994) - [C++] Only include Python C header directories for Python-related compilation units
+* [ARROW-2996](https://issues.apache.org/jira/browse/ARROW-2996) - [C++] Fix typo in cpp/.clang-tidy
+* [ARROW-2998](https://issues.apache.org/jira/browse/ARROW-2998) - [C++] Add variants of AllocateBuffer, AllocateResizeableBuffer that return unique\_ptr<Buffer\>
+* [ARROW-2999](https://issues.apache.org/jira/browse/ARROW-2999) - [Python] Do not run ASV benchmarks in every Travis CI build to improve runtimes
+* [ARROW-3000](https://issues.apache.org/jira/browse/ARROW-3000) - [Python] Do not build unit tests other than python-test in travis\_script\_python.sh
+* [ARROW-3001](https://issues.apache.org/jira/browse/ARROW-3001) - [Packaging] Don't modify PATH during rust release verification
+* [ARROW-3002](https://issues.apache.org/jira/browse/ARROW-3002) - [Python] Implement better DataType hash function
+* [ARROW-3003](https://issues.apache.org/jira/browse/ARROW-3003) - [Doc] Enable Java doc in dev/gen\_apidocs/create\_documents.sh
+* [ARROW-3005](https://issues.apache.org/jira/browse/ARROW-3005) - [Website] Update website and write blog post for 0.10.0 release announcement
+* [ARROW-3008](https://issues.apache.org/jira/browse/ARROW-3008) - [Packaging] Verify GPU related modules if available
+* [ARROW-3009](https://issues.apache.org/jira/browse/ARROW-3009) - [Python] pyarrow.orc uses APIs now prohibited in 0.10.0
+* [ARROW-3010](https://issues.apache.org/jira/browse/ARROW-3010) - [GLib] Update README to use Bundler
+* [ARROW-3017](https://issues.apache.org/jira/browse/ARROW-3017) - [C++] Don't throw exception in arrow/util/thread-pool.h
+* [ARROW-3018](https://issues.apache.org/jira/browse/ARROW-3018) - [Plasma] Improve random ObjectID generation
+* [ARROW-3019](https://issues.apache.org/jira/browse/ARROW-3019) - [Packaging] Use Bundler to verify Arrow GLib
+* [ARROW-3021](https://issues.apache.org/jira/browse/ARROW-3021) - [Go] support for List
+* [ARROW-3022](https://issues.apache.org/jira/browse/ARROW-3022) - [Go] support for Struct
+* [ARROW-3023](https://issues.apache.org/jira/browse/ARROW-3023) - [C++] Use gold linker in builds if it is available
+* [ARROW-3024](https://issues.apache.org/jira/browse/ARROW-3024) - [C++] Replace usages of std::mutex with atomics in memory\_pool.cc
+* [ARROW-3025](https://issues.apache.org/jira/browse/ARROW-3025) - [C++] Add option to switch between dynamic and static linking in unit test executables
+* [ARROW-3026](https://issues.apache.org/jira/browse/ARROW-3026) - [Plasma] Only run Plasma Python unit tests under valgrind once instead of twice in CI
+* [ARROW-3027](https://issues.apache.org/jira/browse/ARROW-3027) - [Ruby] Stop running "git tag" from "rake release"
+* [ARROW-3028](https://issues.apache.org/jira/browse/ARROW-3028) - [Python] Trim unneeded work from documentation build in Travis CI
+* [ARROW-3029](https://issues.apache.org/jira/browse/ARROW-3029) - [Python] pkg\_resources is slow
+* [ARROW-3031](https://issues.apache.org/jira/browse/ARROW-3031) - [Go] Streamline release of Arrays and Builders
+* [ARROW-3033](https://issues.apache.org/jira/browse/ARROW-3033) - [Dev] docker-compose test tooling does not seem to cache built Docker images
+* [ARROW-3034](https://issues.apache.org/jira/browse/ARROW-3034) - [Packaging] Source archive can't be extracted by bsdtar on MSYS2
+* [ARROW-3035](https://issues.apache.org/jira/browse/ARROW-3035) - [Rust] Examples in README.md do not run
+* [ARROW-3036](https://issues.apache.org/jira/browse/ARROW-3036) - [Go] add support for slicing Arrays
+* [ARROW-3037](https://issues.apache.org/jira/browse/ARROW-3037) - [Go] add support for NullArray
+* [ARROW-3042](https://issues.apache.org/jira/browse/ARROW-3042) - [Go] add badge to GoDoc in the Go-Arrow README
+* [ARROW-3043](https://issues.apache.org/jira/browse/ARROW-3043) - [C++] pthread doesn't exist on MinGW
+* [ARROW-3044](https://issues.apache.org/jira/browse/ARROW-3044) - [Python] Remove all occurrences of cython's legacy property definition syntax
+* [ARROW-3045](https://issues.apache.org/jira/browse/ARROW-3045) - [Python] Remove nullcheck from ipc Message and MessageReader
+* [ARROW-3046](https://issues.apache.org/jira/browse/ARROW-3046) - [GLib] Use rubyish method in test-orc-file-reader.rb
+* [ARROW-3050](https://issues.apache.org/jira/browse/ARROW-3050) - [C++] Adopt HiveServer2 client C++ codebase
+* [ARROW-3051](https://issues.apache.org/jira/browse/ARROW-3051) - [C++] Status performance optimization from Impala/Kudu
+* [ARROW-3057](https://issues.apache.org/jira/browse/ARROW-3057) - [INTEGRATION] Fix spark and hdfs dockerfiles
+* [ARROW-3059](https://issues.apache.org/jira/browse/ARROW-3059) - [C++] Streamline namespace array::test
+* [ARROW-3060](https://issues.apache.org/jira/browse/ARROW-3060) - [C++] Factor out parsing routines
+* [ARROW-3062](https://issues.apache.org/jira/browse/ARROW-3062) - [Python] Extend fast libtensorflow\_framework.so compatibility workaround to Python 2.7
+* [ARROW-3064](https://issues.apache.org/jira/browse/ARROW-3064) - [C++] Add option to ADD\_ARROW\_TEST to indicate additional dependencies for particular unit test executables
+* [ARROW-3067](https://issues.apache.org/jira/browse/ARROW-3067) - [Packaging] Support dev/rc/release .deb/.rpm builds
+* [ARROW-3068](https://issues.apache.org/jira/browse/ARROW-3068) - [Packaging] Bump version to 0.11.0-SNAPSHOT
+* [ARROW-3069](https://issues.apache.org/jira/browse/ARROW-3069) - [Release] Stop using SHA1 checksums per ASF policy
+* [ARROW-3072](https://issues.apache.org/jira/browse/ARROW-3072) - [C++] Use ARROW\_RETURN\_NOT\_OK instead of RETURN\_NOT\_OK in header files
+* [ARROW-3075](https://issues.apache.org/jira/browse/ARROW-3075) - [C++] Incorporate apache/parquet-cpp codebase into Arrow C++ codebase and build system
+* [ARROW-3076](https://issues.apache.org/jira/browse/ARROW-3076) - [Website] Add Google Analytics tags to C++, Python API docs
+* [ARROW-3088](https://issues.apache.org/jira/browse/ARROW-3088) - [Rust] Use internal \`Result<T\>\` type instead of \`Result<T, ArrowError\>\`
+* [ARROW-3090](https://issues.apache.org/jira/browse/ARROW-3090) - [Rust] Accompany error messages with assertions
+* [ARROW-3094](https://issues.apache.org/jira/browse/ARROW-3094) - [Python] Allow lighter construction of pa.Schema / pa.StructType
+* [ARROW-3099](https://issues.apache.org/jira/browse/ARROW-3099) - [C++] Add benchmark for number parsing
+* [ARROW-3105](https://issues.apache.org/jira/browse/ARROW-3105) - [Plasma] Improve flushing error message
+* [ARROW-3106](https://issues.apache.org/jira/browse/ARROW-3106) - [Website] Update committers and PMC roster on website
+* [ARROW-3109](https://issues.apache.org/jira/browse/ARROW-3109) - [Python] Add Python 3.7 virtualenvs to manylinux1 container
+* [ARROW-3110](https://issues.apache.org/jira/browse/ARROW-3110) - [C++] Compilation warnings with gcc 7.3.0
+* [ARROW-3111](https://issues.apache.org/jira/browse/ARROW-3111) - [Java] Enable changing default logging level when running tests
+* [ARROW-3114](https://issues.apache.org/jira/browse/ARROW-3114) - [Website] Add information about user@ mailing list to website / Community page
+* [ARROW-3115](https://issues.apache.org/jira/browse/ARROW-3115) - [Java] Style Checks - Fix import ordering
+* [ARROW-3116](https://issues.apache.org/jira/browse/ARROW-3116) - [Plasma] Add "ls" to object store
+* [ARROW-3117](https://issues.apache.org/jira/browse/ARROW-3117) - [GLib] Add garrow\_chunked\_array\_to\_string()
+* [ARROW-3119](https://issues.apache.org/jira/browse/ARROW-3119) - [Packaging] Nightly packaging script fails
+* [ARROW-3127](https://issues.apache.org/jira/browse/ARROW-3127) - [C++] Add Tutorial about Sending Tensor from C++ to Python
+* [ARROW-3128](https://issues.apache.org/jira/browse/ARROW-3128) - [C++] Support system shared zlib
+* [ARROW-3129](https://issues.apache.org/jira/browse/ARROW-3129) - [Packaging] Stop using deprecated BuildRoot and Group in .rpm
+* [ARROW-3130](https://issues.apache.org/jira/browse/ARROW-3130) - [Go] add initial support for Go modules
+* [ARROW-3136](https://issues.apache.org/jira/browse/ARROW-3136) - [C++] Clean up arrow:: public API
+* [ARROW-3142](https://issues.apache.org/jira/browse/ARROW-3142) - [C++] Fetch all libs from toolchain environment
+* [ARROW-3143](https://issues.apache.org/jira/browse/ARROW-3143) - [C++] CopyBitmap into existing memory
+* [ARROW-3146](https://issues.apache.org/jira/browse/ARROW-3146) - [C++] Barebones Flight RPC server and client implementations
+* [ARROW-3147](https://issues.apache.org/jira/browse/ARROW-3147) - [C++] MSVC version isn't detected in code page 932
+* [ARROW-3148](https://issues.apache.org/jira/browse/ARROW-3148) - [C++] MSVC shows C4819 warning on code page 932
+* [ARROW-3152](https://issues.apache.org/jira/browse/ARROW-3152) - [C++][Packaging] Use dynamic linking for zlib in conda recipes
+* [ARROW-3153](https://issues.apache.org/jira/browse/ARROW-3153) - [Packaging] Fix broken nightly package builds introduced with recent cmake changes and orc tests
+* [ARROW-3157](https://issues.apache.org/jira/browse/ARROW-3157) - [C++] Improve buffer creation for typed data
+* [ARROW-3158](https://issues.apache.org/jira/browse/ARROW-3158) - [C++] Handle float truncation during casting
+* [ARROW-3160](https://issues.apache.org/jira/browse/ARROW-3160) - [Python] Improve pathlib.Path support in parquet and filesystem modules
+* [ARROW-3163](https://issues.apache.org/jira/browse/ARROW-3163) - [Python] Cython dependency is missing in non wheel package
+* [ARROW-3167](https://issues.apache.org/jira/browse/ARROW-3167) - [CI] Limit clcache cache size
+* [ARROW-3168](https://issues.apache.org/jira/browse/ARROW-3168) - [C++] Restore pkgconfig for Parquet C++ libraries
+* [ARROW-3170](https://issues.apache.org/jira/browse/ARROW-3170) - [C++] Implement "readahead spooler" class for background input buffering
+* [ARROW-3171](https://issues.apache.org/jira/browse/ARROW-3171) - [Java] checkstyle - fix line length and indentation
+* [ARROW-3172](https://issues.apache.org/jira/browse/ARROW-3172) - [Rust] Update documentation for datatypes.rs
+* [ARROW-3174](https://issues.apache.org/jira/browse/ARROW-3174) - [Rust] run examples as part of CI
+* [ARROW-3177](https://issues.apache.org/jira/browse/ARROW-3177) - [Rust] Update expected error messages for tests that 'should panic'
+* [ARROW-3180](https://issues.apache.org/jira/browse/ARROW-3180) - [C++] Add docker-compose setup to simulate Travis CI run locally
+* [ARROW-3181](https://issues.apache.org/jira/browse/ARROW-3181) - [Packaging] Adjust conda package scripts to account for Parquet codebase migration
+* [ARROW-3182](https://issues.apache.org/jira/browse/ARROW-3182) - [C++] Merge Gandiva codebase
+* [ARROW-3187](https://issues.apache.org/jira/browse/ARROW-3187) - [Plasma] Make Plasma Log pluggable with glog
+* [ARROW-3195](https://issues.apache.org/jira/browse/ARROW-3195) - [C++] NumPy initialization error check is missing in test
+* [ARROW-3196](https://issues.apache.org/jira/browse/ARROW-3196) - Enable merge\_arrow\_pr.py script to merge Parquet patches and set fix versions
+* [ARROW-3197](https://issues.apache.org/jira/browse/ARROW-3197) - [C++] Add instructions to cpp/README.md about Parquet-only development and Arrow+Parquet
+* [ARROW-3198](https://issues.apache.org/jira/browse/ARROW-3198) - [Website] Blog post for 0.11 release
+* [ARROW-3211](https://issues.apache.org/jira/browse/ARROW-3211) - [C++] gold linker doesn't work with MinGW-w64
+* [ARROW-3212](https://issues.apache.org/jira/browse/ARROW-3212) - [C++] Create deterministic IPC metadata
+* [ARROW-3213](https://issues.apache.org/jira/browse/ARROW-3213) - [C++] Use CMake to build vendored Snappy on Windows
+* [ARROW-3214](https://issues.apache.org/jira/browse/ARROW-3214) - [C++] Disable insecure warnings with MinGW build
+* [ARROW-3215](https://issues.apache.org/jira/browse/ARROW-3215) - [C++] Add support for finding libpython on MSYS2
+* [ARROW-3216](https://issues.apache.org/jira/browse/ARROW-3216) - [C++] libpython isn't linked to libarrow\_python in MinGW build
+* [ARROW-3217](https://issues.apache.org/jira/browse/ARROW-3217) - [C++] ARROW\_STATIC definition is missing in MinGW build
+* [ARROW-3218](https://issues.apache.org/jira/browse/ARROW-3218) - [C++] Utilities has needless pthread link in MinGW build
+* [ARROW-3219](https://issues.apache.org/jira/browse/ARROW-3219) - [C++] Use Win32 API in MinGW
+* [ARROW-3223](https://issues.apache.org/jira/browse/ARROW-3223) - [GLib] Use the same shared object versioning rule in C++
+* [ARROW-3229](https://issues.apache.org/jira/browse/ARROW-3229) - [Packaging] Adjust wheel package scripts to account for Parquet codebase migration
+* [ARROW-3234](https://issues.apache.org/jira/browse/ARROW-3234) - [C++] Link order is wrong when ARROW\_ORC=on and ARROW\_PROTOBUF\_USE\_SHARED=ON
+* [ARROW-3235](https://issues.apache.org/jira/browse/ARROW-3235) - [Packaging] Update deb names
+* [ARROW-3236](https://issues.apache.org/jira/browse/ARROW-3236) - [C++] OutputStream bookkeeping logic when writing IPC file format is incorrect
+* [ARROW-3240](https://issues.apache.org/jira/browse/ARROW-3240) - [GLib] Add build instructions using Meson
+* [ARROW-3242](https://issues.apache.org/jira/browse/ARROW-3242) - [C++] Use coarser-grained dispatch to SIMD hash functions
+* [ARROW-3249](https://issues.apache.org/jira/browse/ARROW-3249) - [Python] Run flake8 on integration\_test.py and crossbow.py
+* [ARROW-3250](https://issues.apache.org/jira/browse/ARROW-3250) - [C++] Create Buffer implementation that takes ownership for the memory from a std::string via std::move
+* [ARROW-3252](https://issues.apache.org/jira/browse/ARROW-3252) - [C++] Do not hard code the "v" part of versions in thirdparty toolchain
+* [ARROW-3257](https://issues.apache.org/jira/browse/ARROW-3257) - [C++] Stop using IMPORTED\_LINK\_INTERFACE\_LIBRARIES
+* [ARROW-3258](https://issues.apache.org/jira/browse/ARROW-3258) - [GLib] CI fails on macOS
+* [ARROW-3259](https://issues.apache.org/jira/browse/ARROW-3259) - [GLib] Rename "writeable" to "writable"
+* [ARROW-3261](https://issues.apache.org/jira/browse/ARROW-3261) - [Python] Add "field" method to select fields from StructArray
+* [ARROW-3262](https://issues.apache.org/jira/browse/ARROW-3262) - [Python] Implement \_\_getitem\_\_ with integers on pyarrow.Column
+* [ARROW-3264](https://issues.apache.org/jira/browse/ARROW-3264) - [Java] checkstyle - fix whitespace
+* [ARROW-3267](https://issues.apache.org/jira/browse/ARROW-3267) - [Python] Create empty table from schema
+* [ARROW-3268](https://issues.apache.org/jira/browse/ARROW-3268) - [CI] Reduce conda times on AppVeyor
+* [ARROW-3269](https://issues.apache.org/jira/browse/ARROW-3269) - [Python] Fix warnings in unit test suite
+* [ARROW-3270](https://issues.apache.org/jira/browse/ARROW-3270) - [Release] Adjust release verification scripts to recent parquet migration
+* [ARROW-3274](https://issues.apache.org/jira/browse/ARROW-3274) - [Packaging] Missing glog dependency from conda-forge recipes
+* [ARROW-3276](https://issues.apache.org/jira/browse/ARROW-3276) - [Packaging] Add support for Parquet-related Linux packages
+* [ARROW-3281](https://issues.apache.org/jira/browse/ARROW-3281) - [Java] Make sure that WritableByteChannel in WriteChannel writes out complete bytes
+* [ARROW-3282](https://issues.apache.org/jira/browse/ARROW-3282) - [R] initial R functionality
+* [ARROW-3284](https://issues.apache.org/jira/browse/ARROW-3284) - [R] Adding R Error in Status
+* [ARROW-3285](https://issues.apache.org/jira/browse/ARROW-3285) - [GLib] Add arrow\_cpp\_build\_type and arrow\_cpp\_build\_dir Meson options
+* [ARROW-3286](https://issues.apache.org/jira/browse/ARROW-3286) - [C++] ARROW\_EXPORT for RecordBatchBuilder is missing
+* [ARROW-3287](https://issues.apache.org/jira/browse/ARROW-3287) - [C++] "redeclared without dllimport attribute after being referenced with dll linkage" with MinGW
+* [ARROW-3288](https://issues.apache.org/jira/browse/ARROW-3288) - [GLib] Add new API index for 0.11.0
+* [ARROW-3300](https://issues.apache.org/jira/browse/ARROW-3300) - [Release] Update .deb package names in preparation
+* [ARROW-3301](https://issues.apache.org/jira/browse/ARROW-3301) - [Website] Update Jekyll and Bootstrap 4
+* [ARROW-3305](https://issues.apache.org/jira/browse/ARROW-3305) - [JS] Incorrect development documentation link in javascript readme
+* [ARROW-3309](https://issues.apache.org/jira/browse/ARROW-3309) - [JS] Missing links from DEVELOP.md
+* [ARROW-3313](https://issues.apache.org/jira/browse/ARROW-3313) - [R] Run clang-format, cpplint checks on R C++ code
+* [ARROW-3319](https://issues.apache.org/jira/browse/ARROW-3319) - [GLib] Expose AlignStream methods in InputStream, OutputStream classes
+* [ARROW-3320](https://issues.apache.org/jira/browse/ARROW-3320) - [C++] Improve float parsing performance
+* [ARROW-3321](https://issues.apache.org/jira/browse/ARROW-3321) - [C++] Improve integer parsing performance
+* [ARROW-3334](https://issues.apache.org/jira/browse/ARROW-3334) - [Python] Update conda packages to new numpy requirement
+* [ARROW-3335](https://issues.apache.org/jira/browse/ARROW-3335) - [Python] Add ccache to manylinux1 container
+* [ARROW-3339](https://issues.apache.org/jira/browse/ARROW-3339) - [R] Support for character vectors
+* [ARROW-3341](https://issues.apache.org/jira/browse/ARROW-3341) - [R] Support for logical vector
+* [ARROW-3349](https://issues.apache.org/jira/browse/ARROW-3349) - [C++] Use aligned API in MinGW
+* [ARROW-3350](https://issues.apache.org/jira/browse/ARROW-3350) - [Website] Fix powered by links
+* [ARROW-3352](https://issues.apache.org/jira/browse/ARROW-3352) - [Packaging] Fix recently failing wheel builds
+* [ARROW-3356](https://issues.apache.org/jira/browse/ARROW-3356) - [Python] Document parameters of Table.to\_pandas method
+* [ARROW-3357](https://issues.apache.org/jira/browse/ARROW-3357) - [Rust] Add a mutable buffer implementation
+* [ARROW-3360](https://issues.apache.org/jira/browse/ARROW-3360) - [GLib] Import Parquet bindings
+* [ARROW-3363](https://issues.apache.org/jira/browse/ARROW-3363) - [C++/Python] Add helper functions to detect scalar Python types
+* [ARROW-3371](https://issues.apache.org/jira/browse/ARROW-3371) - [Python] Remove check\_metadata argument for Field.equals docstring
+* [ARROW-3375](https://issues.apache.org/jira/browse/ARROW-3375) - [Rust] Remove memory\_pool.rs
+* [ARROW-3376](https://issues.apache.org/jira/browse/ARROW-3376) - [C++] Add double-conversion to cpp/thirdparty/download\_dependencies.sh
+* [ARROW-3377](https://issues.apache.org/jira/browse/ARROW-3377) - [Gandiva][C++] Remove If statement from bit map set function
+* [ARROW-3382](https://issues.apache.org/jira/browse/ARROW-3382) - [C++] Run Gandiva tests in Travis CI
+* [ARROW-3392](https://issues.apache.org/jira/browse/ARROW-3392) - [Python] Support filters in disjunctive normal form in ParquetDataset
+* [ARROW-3395](https://issues.apache.org/jira/browse/ARROW-3395) - [C++/Python] Add docker container for linting
+* [ARROW-3397](https://issues.apache.org/jira/browse/ARROW-3397) - [C++] Use relative CMake path for modules
+* [ARROW-3400](https://issues.apache.org/jira/browse/ARROW-3400) - [Packaging] Add support for Parquet GLib-related Linux packages
+* [ARROW-3404](https://issues.apache.org/jira/browse/ARROW-3404) - [C++] Make CSV chunker faster
+* [ARROW-3411](https://issues.apache.org/jira/browse/ARROW-3411) - [Packaging] dev/release/01-perform.sh doesn't have executable bit
+* [ARROW-3412](https://issues.apache.org/jira/browse/ARROW-3412) - [Packaging] rat failure in dev/release/02-source.sh
+* [ARROW-3413](https://issues.apache.org/jira/browse/ARROW-3413) - [Packaging] dev/release/02-source.sh doesn't generate Parquet GLib document
+* [ARROW-3415](https://issues.apache.org/jira/browse/ARROW-3415) - [Packaging] dev/release/verify-release-candidate.sh fails in "conda activate arrow-test"
+* [ARROW-3416](https://issues.apache.org/jira/browse/ARROW-3416) - [Packaging] dev/release/02-source.sh must use SHA512 instead of SHA1
+* [ARROW-3417](https://issues.apache.org/jira/browse/ARROW-3417) - [Packaging] dev/release/verify-release-candidate.sh fails Parquet C++ test
+* [ARROW-3418](https://issues.apache.org/jira/browse/ARROW-3418) - [C++] Update Parquet snapshot version for release
+* [ARROW-3423](https://issues.apache.org/jira/browse/ARROW-3423) - [Packaging] Remove RC information from deb/rpm
+* [ARROW-3443](https://issues.apache.org/jira/browse/ARROW-3443) - [Java] Flight reports memory leaks in TestBasicOperation
+* [PARQUET-169](https://issues.apache.org/jira/browse/PARQUET-169) - Parquet-cpp: Implement support for bulk reading and writing repetition/definition levels.
+* [PARQUET-267](https://issues.apache.org/jira/browse/PARQUET-267) - Detach thirdparty code from build configuration.
+* [PARQUET-416](https://issues.apache.org/jira/browse/PARQUET-416) - C++11, cpplint cleanup, package target and header installation
+* [PARQUET-418](https://issues.apache.org/jira/browse/PARQUET-418) - Add a utility to print contents of a Parquet file to stdout
+* [PARQUET-428](https://issues.apache.org/jira/browse/PARQUET-428) - Support INT96 and FIXED\_LEN\_BYTE\_ARRAY types
+* [PARQUET-434](https://issues.apache.org/jira/browse/PARQUET-434) - Add a ParquetFileReader class to encapsulate some low-level details of interacting with Parquet files
+* [PARQUET-435](https://issues.apache.org/jira/browse/PARQUET-435) - Provide vectorized ColumnReader interface
+* [PARQUET-436](https://issues.apache.org/jira/browse/PARQUET-436) - Implement ParquetFileWriter class entry point for generating new Parquet files
+* [PARQUET-437](https://issues.apache.org/jira/browse/PARQUET-437) - Incorporate googletest thirdparty dependency and add cmake tools (ADD\_PARQUET\_TEST) to simplify adding new unit tests
+* [PARQUET-438](https://issues.apache.org/jira/browse/PARQUET-438) - Update RLE encoder/decoder modules from Impala upstream changes and adapt unit tests
+* [PARQUET-439](https://issues.apache.org/jira/browse/PARQUET-439) - Conform all copyright headers to ASF requirements
+* [PARQUET-442](https://issues.apache.org/jira/browse/PARQUET-442) - Convert flat SchemaElement vector to implied nested schema data structure
+* [PARQUET-448](https://issues.apache.org/jira/browse/PARQUET-448) - Add cmake option to skip building the unit tests
+* [PARQUET-449](https://issues.apache.org/jira/browse/PARQUET-449) - Update to latest parquet.thrift
+* [PARQUET-451](https://issues.apache.org/jira/browse/PARQUET-451) - Add a RowGroup reader interface class
+* [PARQUET-456](https://issues.apache.org/jira/browse/PARQUET-456) - Add zlib codec support
+* [PARQUET-463](https://issues.apache.org/jira/browse/PARQUET-463) - Add DCHECK\* macros for assertions in debug builds
+* [PARQUET-468](https://issues.apache.org/jira/browse/PARQUET-468) - Add a cmake option to generate the Parquet thrift headers with the thriftc in the environment
+* [PARQUET-477](https://issues.apache.org/jira/browse/PARQUET-477) - Enable clang-format check during the Travis CI build
+* [PARQUET-482](https://issues.apache.org/jira/browse/PARQUET-482) - Organize src code file structure to have a very clear folder with public headers.
+* [PARQUET-485](https://issues.apache.org/jira/browse/PARQUET-485) - Decouple data page delimiting from column reader / scanner classes, create test fixtures
+* [PARQUET-488](https://issues.apache.org/jira/browse/PARQUET-488) - Add SSE-related cmake options to manage compiler flags
+* [PARQUET-489](https://issues.apache.org/jira/browse/PARQUET-489) - Add visibility macros to be used for public and internal APIs of libparquet
+* [PARQUET-494](https://issues.apache.org/jira/browse/PARQUET-494) - Implement PLAIN\_DICTIONARY encoding and decoding
+* [PARQUET-496](https://issues.apache.org/jira/browse/PARQUET-496) - Fix cpplint configuration to be more restrictive
+* [PARQUET-497](https://issues.apache.org/jira/browse/PARQUET-497) - Decouple Parquet physical file structure from FileReader class
+* [PARQUET-499](https://issues.apache.org/jira/browse/PARQUET-499) - Complete PlainEncoder implementation for all primitive types and test end to end
+* [PARQUET-501](https://issues.apache.org/jira/browse/PARQUET-501) - Add an OutputStream abstraction (capable of memory allocation) for Encoder public API
+* [PARQUET-503](https://issues.apache.org/jira/browse/PARQUET-503) - Re-enable parquet 2.0 encodings
+* [PARQUET-508](https://issues.apache.org/jira/browse/PARQUET-508) - Add ParquetFilePrinter
+* [PARQUET-512](https://issues.apache.org/jira/browse/PARQUET-512) - Add optional google/benchmark 3rd-party dependency for performance testing
+* [PARQUET-515](https://issues.apache.org/jira/browse/PARQUET-515) - Add "Reset" to LevelEncoder and LevelDecoder
+* [PARQUET-518](https://issues.apache.org/jira/browse/PARQUET-518) - Review usages of size\_t and unsigned integers generally per Google style guide
+* [PARQUET-519](https://issues.apache.org/jira/browse/PARQUET-519) - Disable compiler warning suppressions and fix all DEBUG build warnings
+* [PARQUET-520](https://issues.apache.org/jira/browse/PARQUET-520) - Add version of LocalFileSource that uses memory-mapping for zero-copy reads
+* [PARQUET-533](https://issues.apache.org/jira/browse/PARQUET-533) - Simplify RandomAccessSource API to combine Seek/Read
+* [PARQUET-538](https://issues.apache.org/jira/browse/PARQUET-538) - Improve ColumnReader Tests
+* [PARQUET-542](https://issues.apache.org/jira/browse/PARQUET-542) - Support memory allocation from external memory
+* [PARQUET-545](https://issues.apache.org/jira/browse/PARQUET-545) - Improve API to support Decimal type
+* [PARQUET-547](https://issues.apache.org/jira/browse/PARQUET-547) - Refactor most templates to use DataType structs rather than the Type::type enum
+* [PARQUET-551](https://issues.apache.org/jira/browse/PARQUET-551) - Handle compiler warnings due to disabled DCHECKs in release builds
+* [PARQUET-556](https://issues.apache.org/jira/browse/PARQUET-556) - Extend RowGroupStatistics to include "min" "max" statistics
+* [PARQUET-559](https://issues.apache.org/jira/browse/PARQUET-559) - Enable InputStream as a source to the ParquetFileReader
+* [PARQUET-564](https://issues.apache.org/jira/browse/PARQUET-564) - Add option to run unit tests with valgrind --tool=memcheck
+* [PARQUET-566](https://issues.apache.org/jira/browse/PARQUET-566) - Add method to retrieve the full column path
+* [PARQUET-568](https://issues.apache.org/jira/browse/PARQUET-568) - Read only specified top-level columns in DebugPrint
+* [PARQUET-572](https://issues.apache.org/jira/browse/PARQUET-572) - Rename parquet\_cpp namespace to parquet
+* [PARQUET-573](https://issues.apache.org/jira/browse/PARQUET-573) - C++: Create a public API for reading and writing file metadata
+* [PARQUET-582](https://issues.apache.org/jira/browse/PARQUET-582) - Conversion functions for Parquet enums to Thrift enums
+* [PARQUET-583](https://issues.apache.org/jira/browse/PARQUET-583) - Implement Parquet to Thrift schema conversion
+* [PARQUET-587](https://issues.apache.org/jira/browse/PARQUET-587) - Implement BufferReader::Read(int64\_t, uint8\_t\*)
+* [PARQUET-589](https://issues.apache.org/jira/browse/PARQUET-589) - Implement Chunked InMemoryInputStream for better memory usage
+* [PARQUET-592](https://issues.apache.org/jira/browse/PARQUET-592) - Support compressed writes
+* [PARQUET-593](https://issues.apache.org/jira/browse/PARQUET-593) - Add API for writing Page statistics
+* [PARQUET-595](https://issues.apache.org/jira/browse/PARQUET-595) - Add API for key-value metadata
+* [PARQUET-597](https://issues.apache.org/jira/browse/PARQUET-597) - Add data rates to benchmark output
+* [PARQUET-598](https://issues.apache.org/jira/browse/PARQUET-598) - [C++] Test writing all primitive data types
+* [PARQUET-600](https://issues.apache.org/jira/browse/PARQUET-600) - Add benchmarks for RLE-Level encoding
+* [PARQUET-603](https://issues.apache.org/jira/browse/PARQUET-603) - Implement missing information in schema descriptor
+* [PARQUET-605](https://issues.apache.org/jira/browse/PARQUET-605) - Expose schema node in ColumnDescriptor
+* [PARQUET-607](https://issues.apache.org/jira/browse/PARQUET-607) - Public Writer header
+* [PARQUET-610](https://issues.apache.org/jira/browse/PARQUET-610) - Print ColumnMetaData for each RowGroup
+* [PARQUET-616](https://issues.apache.org/jira/browse/PARQUET-616) - C++: WriteBatch should accept const arrays
+* [PARQUET-619](https://issues.apache.org/jira/browse/PARQUET-619) - C++: Add OutputStream for local files
+* [PARQUET-625](https://issues.apache.org/jira/browse/PARQUET-625) - Improve RLE read performance
+* [PARQUET-633](https://issues.apache.org/jira/browse/PARQUET-633) - Add version to WriterProperties
+* [PARQUET-634](https://issues.apache.org/jira/browse/PARQUET-634) - Consistent private linking of dependencies
+* [PARQUET-636](https://issues.apache.org/jira/browse/PARQUET-636) - Expose selection for different encodings
+* [PARQUET-641](https://issues.apache.org/jira/browse/PARQUET-641) - Instantiate stringstream only if needed in SerializedPageReader::NextPage
+* [PARQUET-646](https://issues.apache.org/jira/browse/PARQUET-646) - [C++] Enable easier 3rd-party toolchain clang builds on Linux
+* [PARQUET-666](https://issues.apache.org/jira/browse/PARQUET-666) - PLAIN\_DICTIONARY write support
+* [PARQUET-671](https://issues.apache.org/jira/browse/PARQUET-671) - Improve performance of RLE/bit-packed decoding in parquet-cpp
+* [PARQUET-679](https://issues.apache.org/jira/browse/PARQUET-679) - [C++] Build and unit tests support for MSVC on Windows
+* [PARQUET-681](https://issues.apache.org/jira/browse/PARQUET-681) - Add tool to scan a parquet file
+* [PARQUET-687](https://issues.apache.org/jira/browse/PARQUET-687) - C++: Switch to PLAIN encoding if dictionary grows too large
+* [PARQUET-689](https://issues.apache.org/jira/browse/PARQUET-689) - C++: Compress DataPages eagerly
+* [PARQUET-699](https://issues.apache.org/jira/browse/PARQUET-699) - Update parquet.thrift from https://github.com/apache/parquet-format
+* [PARQUET-712](https://issues.apache.org/jira/browse/PARQUET-712) - C++: Read into Arrow memory
+* [PARQUET-721](https://issues.apache.org/jira/browse/PARQUET-721) - Performance benchmarks for reading into Arrow structures
+* [PARQUET-724](https://issues.apache.org/jira/browse/PARQUET-724) - Test more advanced properties setting
+* [PARQUET-728](https://issues.apache.org/jira/browse/PARQUET-728) - [C++] Bring parquet::arrow up to date with API changes in arrow::io
+* [PARQUET-731](https://issues.apache.org/jira/browse/PARQUET-731) - [CPP] Add API to return metadata size and Skip reading values
+* [PARQUET-737](https://issues.apache.org/jira/browse/PARQUET-737) - Use absolute namespace in macros
+* [PARQUET-752](https://issues.apache.org/jira/browse/PARQUET-752) - [C++] Conform parquet\_arrow to upstream API changes
+* [PARQUET-762](https://issues.apache.org/jira/browse/PARQUET-762) - C++: Use optimistic allocation instead of Arrow Builders
+* [PARQUET-763](https://issues.apache.org/jira/browse/PARQUET-763) - C++: Expose ParquetFileReader through Arrow reader
+* [PARQUET-769](https://issues.apache.org/jira/browse/PARQUET-769) - C++: Add support for Brotli Compression
+* [PARQUET-778](https://issues.apache.org/jira/browse/PARQUET-778) - Standardize the schema output to match the parquet-mr format
+* [PARQUET-782](https://issues.apache.org/jira/browse/PARQUET-782) - C++: Support writing to Arrow sinks
+* [PARQUET-785](https://issues.apache.org/jira/browse/PARQUET-785) - C++: List conversion for Arrow Schemas
+* [PARQUET-805](https://issues.apache.org/jira/browse/PARQUET-805) - C++: Read Int96 into Arrow Timestamp(ns)
+* [PARQUET-807](https://issues.apache.org/jira/browse/PARQUET-807) - [C++] Add API to read file metadata only from a file handle
+* [PARQUET-809](https://issues.apache.org/jira/browse/PARQUET-809) - [C++] Add API to determine if two files' schemas are compatible
+* [PARQUET-813](https://issues.apache.org/jira/browse/PARQUET-813) - C++: Build dependencies using CMake External project
+* [PARQUET-820](https://issues.apache.org/jira/browse/PARQUET-820) - C++: Decoders should directly emit arrays with spacing for null entries
+* [PARQUET-829](https://issues.apache.org/jira/browse/PARQUET-829) - C++: Make use of ARROW-469
+* [PARQUET-830](https://issues.apache.org/jira/browse/PARQUET-830) - [C++] Add additional configuration options to parquet::arrow::OpenFile
+* [PARQUET-833](https://issues.apache.org/jira/browse/PARQUET-833) - C++: Provide API to write spaced arrays (e.g. Arrow)
+* [PARQUET-834](https://issues.apache.org/jira/browse/PARQUET-834) - C++: Support r/w of arrow::ListArray
+* [PARQUET-835](https://issues.apache.org/jira/browse/PARQUET-835) - [C++] Add option to parquet::arrow to read columns in parallel using a thread pool
+* [PARQUET-836](https://issues.apache.org/jira/browse/PARQUET-836) - [C++] Add column selection to parquet::arrow::FileReader
+* [PARQUET-844](https://issues.apache.org/jira/browse/PARQUET-844) - [C++] Consolidate encodings, schema, and compression subdirectories into fewer files
+* [PARQUET-848](https://issues.apache.org/jira/browse/PARQUET-848) - [C++] Consolidate libparquet\_thrift subcomponent
+* [PARQUET-857](https://issues.apache.org/jira/browse/PARQUET-857) - [C++] Flatten parquet/encodings directory
+* [PARQUET-858](https://issues.apache.org/jira/browse/PARQUET-858) - [C++] Flatten parquet/column directory, consolidate related code
+* [PARQUET-859](https://issues.apache.org/jira/browse/PARQUET-859) - [C++] Flatten parquet/file directory
+* [PARQUET-862](https://issues.apache.org/jira/browse/PARQUET-862) - Provide default cache size values if CPU info probing is not available
+* [PARQUET-866](https://issues.apache.org/jira/browse/PARQUET-866) - [C++] Account for API changes in ARROW-33
+* [PARQUET-867](https://issues.apache.org/jira/browse/PARQUET-867) - [C++] Support writing sliced Arrow arrays
+* [PARQUET-874](https://issues.apache.org/jira/browse/PARQUET-874) - [C++] Use default memory allocator from Arrow
+* [PARQUET-877](https://issues.apache.org/jira/browse/PARQUET-877) - C++: Update Arrow Hash, update Version in metadata.
+* [PARQUET-882](https://issues.apache.org/jira/browse/PARQUET-882) - [CPP] Improve Application Version parsing
+* [PARQUET-890](https://issues.apache.org/jira/browse/PARQUET-890) - C++: Support I/O of DATE columns in parquet\_arrow
+* [PARQUET-894](https://issues.apache.org/jira/browse/PARQUET-894) - Fix compilation warning
+* [PARQUET-897](https://issues.apache.org/jira/browse/PARQUET-897) - [C++] Only use designated public headers from libarrow
+* [PARQUET-903](https://issues.apache.org/jira/browse/PARQUET-903) - C++: Add option to set RPATH to ORIGIN
+* [PARQUET-909](https://issues.apache.org/jira/browse/PARQUET-909) - [CPP]: Reduce buffer allocations (mallocs) on critical path
+* [PARQUET-911](https://issues.apache.org/jira/browse/PARQUET-911) - C++: Support nested structs in parquet\_arrow
+* [PARQUET-928](https://issues.apache.org/jira/browse/PARQUET-928) - [C++] Support pkg-config
+* [PARQUET-929](https://issues.apache.org/jira/browse/PARQUET-929) - [C++] Handle arrow::DictionaryArray when writing Arrow data
+* [PARQUET-930](https://issues.apache.org/jira/browse/PARQUET-930) - [C++] Account for all Arrow date/time types
+* [PARQUET-934](https://issues.apache.org/jira/browse/PARQUET-934) - [C++] Support multiarch on Debian
+* [PARQUET-935](https://issues.apache.org/jira/browse/PARQUET-935) - [C++] Set shared library version for .deb packages
+* [PARQUET-946](https://issues.apache.org/jira/browse/PARQUET-946) - [C++] Refactoring in parquet::arrow::FileReader to be able to read a single row group
+* [PARQUET-953](https://issues.apache.org/jira/browse/PARQUET-953) - [C++] Change arrow::FileWriter API to be initialized from a Schema, and provide for writing multiple tables
+* [PARQUET-967](https://issues.apache.org/jira/browse/PARQUET-967) - [C++] Combine libparquet/libparquet\_arrow libraries
+* [PARQUET-970](https://issues.apache.org/jira/browse/PARQUET-970) - Add Lz4 and Zstd compression codecs
+* [PARQUET-978](https://issues.apache.org/jira/browse/PARQUET-978) - [C++] Minimizing footer reads for small(ish) metadata
+* [PARQUET-984](https://issues.apache.org/jira/browse/PARQUET-984) - C++: Add abi and so version to pkg-config
+* [PARQUET-991](https://issues.apache.org/jira/browse/PARQUET-991) - [C++] Fix compiler warnings on MSVC and build with /WX in Appveyor
+* [PARQUET-999](https://issues.apache.org/jira/browse/PARQUET-999) - Improve MSVC build - Enable PARQUET\_BUILD\_BENCHMARKS
+* [PARQUET-1008](https://issues.apache.org/jira/browse/PARQUET-1008) - Update TypedColumnReader::ReadBatch method to accept batch\_size as int64\_t
+* [PARQUET-1035](https://issues.apache.org/jira/browse/PARQUET-1035) - Write Int96 from Arrow Timestamp(ns)
+* [PARQUET-1037](https://issues.apache.org/jira/browse/PARQUET-1037) - Allow final RowGroup to be unfilled
+* [PARQUET-1041](https://issues.apache.org/jira/browse/PARQUET-1041) - C++: Support Arrow's NullArray
+* [PARQUET-1043](https://issues.apache.org/jira/browse/PARQUET-1043) - [C++] Raise minimum supported CMake version to 3.2
+* [PARQUET-1044](https://issues.apache.org/jira/browse/PARQUET-1044) - [C++] Use compression libraries from Apache Arrow
+* [PARQUET-1045](https://issues.apache.org/jira/browse/PARQUET-1045) - [C++] Refactor to account for computational utility code migration in ARROW-1154
+* [PARQUET-1053](https://issues.apache.org/jira/browse/PARQUET-1053) - Fix unused result warnings due to unchecked Statuses
+* [PARQUET-1068](https://issues.apache.org/jira/browse/PARQUET-1068) - [C++] Use more vanilla Google C++ code formatting
+* [PARQUET-1072](https://issues.apache.org/jira/browse/PARQUET-1072) - [C++] Add ARROW\_NO\_DEPRECATED\_API to CI to check for deprecated API use
+* [PARQUET-1078](https://issues.apache.org/jira/browse/PARQUET-1078) - [C++] Add Arrow writer option to coerce timestamps to milliseconds or microseconds
+* [PARQUET-1079](https://issues.apache.org/jira/browse/PARQUET-1079) - [C++] Account for Arrow API change in ARROW-1335
+* [PARQUET-1083](https://issues.apache.org/jira/browse/PARQUET-1083) - [C++] Refactor core logic in parquet-scan.cc so that it can be used as a library function for benchmarking
+* [PARQUET-1086](https://issues.apache.org/jira/browse/PARQUET-1086) - [C++] Remove usage of arrow/util/compiler-util.h after 1.3.0 release
+* [PARQUET-1087](https://issues.apache.org/jira/browse/PARQUET-1087) - [C++] Add wrapper for ScanFileContents in parquet::arrow that catches exceptions
+* [PARQUET-1092](https://issues.apache.org/jira/browse/PARQUET-1092) - [C++] Write Arrow tables with chunked columns
+* [PARQUET-1093](https://issues.apache.org/jira/browse/PARQUET-1093) - C++: Improve Arrow level generation error message
+* [PARQUET-1094](https://issues.apache.org/jira/browse/PARQUET-1094) - C++: Add benchmark for boolean Arrow column I/O
+* [PARQUET-1095](https://issues.apache.org/jira/browse/PARQUET-1095) - [C++] Read and write Arrow decimal values
+* [PARQUET-1104](https://issues.apache.org/jira/browse/PARQUET-1104) - [C++] Upgrade to Apache Arrow 0.7.0 RC0
+* [PARQUET-1150](https://issues.apache.org/jira/browse/PARQUET-1150) - C++: Hide statically linked boost symbols
+* [PARQUET-1160](https://issues.apache.org/jira/browse/PARQUET-1160) - [C++] Implement BYTE\_ARRAY-backed Decimal reads
+* [PARQUET-1164](https://issues.apache.org/jira/browse/PARQUET-1164) - [C++] Follow API changes in ARROW-1808
+* [PARQUET-1165](https://issues.apache.org/jira/browse/PARQUET-1165) - [C++] Pin clang-format version to 4.0
+* [PARQUET-1166](https://issues.apache.org/jira/browse/PARQUET-1166) - [API Proposal] Add GetRecordBatchReader in parquet/arrow/reader.h
+* [PARQUET-1177](https://issues.apache.org/jira/browse/PARQUET-1177) - [C++] Add more extensive compiler warnings when using Clang
+* [PARQUET-1196](https://issues.apache.org/jira/browse/PARQUET-1196) - [C++] Provide a parquet\_arrow example project incl. CMake setup
+* [PARQUET-1200](https://issues.apache.org/jira/browse/PARQUET-1200) - [C++] Support reading a single Arrow column from a Parquet file
+* [PARQUET-1218](https://issues.apache.org/jira/browse/PARQUET-1218) - [C++] More informative error message on too short pages
+* [PARQUET-1225](https://issues.apache.org/jira/browse/PARQUET-1225) - NaN values may lead to incorrect filtering under certain circumstances
+* [PARQUET-1227](https://issues.apache.org/jira/browse/PARQUET-1227) - Thrift crypto metadata structures
+* [PARQUET-1256](https://issues.apache.org/jira/browse/PARQUET-1256) - [C++] Add --print-key-value-metadata option to parquet\_reader tool
+* [PARQUET-1267](https://issues.apache.org/jira/browse/PARQUET-1267) - replace "unsafe" std::equal by std::memcmp
+* [PARQUET-1276](https://issues.apache.org/jira/browse/PARQUET-1276) - [C++] Reduce the amount of memory used for writing null decimal values
+* [PARQUET-1279](https://issues.apache.org/jira/browse/PARQUET-1279) - Use ASSERT\_NO\_FATAL\_FAILURE in C++ unit tests
+* [PARQUET-1301](https://issues.apache.org/jira/browse/PARQUET-1301) - [C++] Crypto package in parquet-cpp
+* [PARQUET-1308](https://issues.apache.org/jira/browse/PARQUET-1308) - [C++] parquet::arrow should use thread pool, not ParallelFor
+* [PARQUET-1323](https://issues.apache.org/jira/browse/PARQUET-1323) - [C++] Fix compiler warnings with clang-6.0
+* [PARQUET-1332](https://issues.apache.org/jira/browse/PARQUET-1332) - [C++] Add bloom filter utility class
+* [PARQUET-1340](https://issues.apache.org/jira/browse/PARQUET-1340) - [C++] Fix Travis Ci valgrind errors related to std::random\_device
+* [PARQUET-1346](https://issues.apache.org/jira/browse/PARQUET-1346) - [C++] Protect against null values data in empty Arrow array
+* [PARQUET-1348](https://issues.apache.org/jira/browse/PARQUET-1348) - [C++] Allow Arrow FileWriter To Write FileMetaData
+* [PARQUET-1350](https://issues.apache.org/jira/browse/PARQUET-1350) - [C++] Use abstract ResizableBuffer instead of concrete PoolBuffer
+* [PARQUET-1360](https://issues.apache.org/jira/browse/PARQUET-1360) - [C++] Minor API + style changes follow up to PARQUET-1348
+* [PARQUET-1366](https://issues.apache.org/jira/browse/PARQUET-1366) - [C++] Streamline use of Arrow bit-util.h
+* [PARQUET-1372](https://issues.apache.org/jira/browse/PARQUET-1372) - [C++] Add an API to allow writing RowGroups based on their size rather than num\_rows
+* [PARQUET-1378](https://issues.apache.org/jira/browse/PARQUET-1378) - [c++] Allow RowGroups with zero rows to be written
+* [PARQUET-1382](https://issues.apache.org/jira/browse/PARQUET-1382) - [C++] Prepare for arrow::test namespace removal
+* [PARQUET-1392](https://issues.apache.org/jira/browse/PARQUET-1392) - [C++] Supply row group indices to parquet::arrow::FileReader::ReadTable
+* [PARQUET-1398](https://issues.apache.org/jira/browse/PARQUET-1398) - Separate iv\_prefix for GCM and CTR modes
+* [PARQUET-1401](https://issues.apache.org/jira/browse/PARQUET-1401) - RowGroup offset and total compressed size fields
+* [PARQUET-1427](https://issues.apache.org/jira/browse/PARQUET-1427) - [C++] Move example executables and CLI tools to Apache Arrow repo
+* [PARQUET-1431](https://issues.apache.org/jira/browse/PARQUET-1431) - [C++] Automatically set thrift to use boost for thrift versions before 0.11
+
+
+## Bug Fixes
+
+* [ARROW-1380](https://issues.apache.org/jira/browse/ARROW-1380) - [C++] Fix "still reachable" valgrind warnings when PLASMA\_VALGRIND=1
+* [ARROW-1661](https://issues.apache.org/jira/browse/ARROW-1661) - [Python] Python 3.7 support
+* [ARROW-1799](https://issues.apache.org/jira/browse/ARROW-1799) - [Plasma C++] Make unittest does not create plasma store executable
+* [ARROW-1996](https://issues.apache.org/jira/browse/ARROW-1996) - [Python] pyarrow.read\_serialized cannot read concatenated records
+* [ARROW-2027](https://issues.apache.org/jira/browse/ARROW-2027) - [C++] ipc::Message::SerializeTo does not pad the message body
+* [ARROW-2220](https://issues.apache.org/jira/browse/ARROW-2220) - Change default fix version in merge tool to be the next mainline release version
+* [ARROW-2310](https://issues.apache.org/jira/browse/ARROW-2310) - Source release scripts fail with Java8
+* [ARROW-2646](https://issues.apache.org/jira/browse/ARROW-2646) - [C++/Python] Pandas roundtrip for date objects
+* [ARROW-2775](https://issues.apache.org/jira/browse/ARROW-2775) - [Python] ccache error when building manylinux1 wheels
+* [ARROW-2776](https://issues.apache.org/jira/browse/ARROW-2776) - [C++] Do not pass -Wno-noexcept-type for compilers that do not support it
+* [ARROW-2782](https://issues.apache.org/jira/browse/ARROW-2782) - [Python] Ongoing Travis CI failures in Plasma unit tests
+* [ARROW-2785](https://issues.apache.org/jira/browse/ARROW-2785) - [C++] Crash in json-integration-test
+* [ARROW-2814](https://issues.apache.org/jira/browse/ARROW-2814) - [Python] Unify PyObject\* sequence conversion paths for built-in sequences, NumPy arrays
+* [ARROW-2854](https://issues.apache.org/jira/browse/ARROW-2854) - [C++/Python] Casting float NaN to int should raise an error on safe cast
+* [ARROW-2925](https://issues.apache.org/jira/browse/ARROW-2925) - [JS] Documentation failing in docker container
+* [ARROW-2965](https://issues.apache.org/jira/browse/ARROW-2965) - [Python] Possible uint64 overflow issues in python\_to\_arrow.cc
+* [ARROW-2966](https://issues.apache.org/jira/browse/ARROW-2966) - [Python] Data type conversion error
+* [ARROW-2973](https://issues.apache.org/jira/browse/ARROW-2973) - [Python] pitrou/asv.git@customize\_commands does not work with the "new" way of activating conda
+* [ARROW-2974](https://issues.apache.org/jira/browse/ARROW-2974) - [Python] Replace usages of "source activate" with "conda activate" in CI scripts
+* [ARROW-2986](https://issues.apache.org/jira/browse/ARROW-2986) - [C++] /EHsc possibly needed for Visual Studio 2015 builds
+* [ARROW-2992](https://issues.apache.org/jira/browse/ARROW-2992) - [Python] Parquet benchmark failure
+* [ARROW-3006](https://issues.apache.org/jira/browse/ARROW-3006) - [GLib] .gir/.typelib for GPU aren't installed
+* [ARROW-3007](https://issues.apache.org/jira/browse/ARROW-3007) - [Packaging] libarrow-gpu10 deb for Ubuntu 18.04 has broken dependencies
+* [ARROW-3011](https://issues.apache.org/jira/browse/ARROW-3011) - [CI] Remove Slack notification
+* [ARROW-3012](https://issues.apache.org/jira/browse/ARROW-3012) - [Python] Installation crashes with setuptools\_scm error
+* [ARROW-3013](https://issues.apache.org/jira/browse/ARROW-3013) - [Website] Fix download links on website for tarballs, checksums
+* [ARROW-3015](https://issues.apache.org/jira/browse/ARROW-3015) - [Python] Fix documentation typo for pa.uint8
+* [ARROW-3047](https://issues.apache.org/jira/browse/ARROW-3047) - [C++] cmake downloads and builds ORC even though it's installed
+* [ARROW-3049](https://issues.apache.org/jira/browse/ARROW-3049) - [C++/Python] ORC reader fails on empty file
+* [ARROW-3053](https://issues.apache.org/jira/browse/ARROW-3053) - [Python] Pandas decimal conversion segfault
+* [ARROW-3056](https://issues.apache.org/jira/browse/ARROW-3056) - [Python] Indicate in NativeFile docstrings methods that are part of the RawIOBase API but not implemented
+* [ARROW-3061](https://issues.apache.org/jira/browse/ARROW-3061) - [Java] headroom does not take into account reservation
+* [ARROW-3065](https://issues.apache.org/jira/browse/ARROW-3065) - [Python] concat\_tables() failing from bad Pandas Metadata
+* [ARROW-3083](https://issues.apache.org/jira/browse/ARROW-3083) - [Python] Version in manylinux1 wheel builds is wrong
+* [ARROW-3093](https://issues.apache.org/jira/browse/ARROW-3093) - [C++] Linking errors with ORC enabled
+* [ARROW-3095](https://issues.apache.org/jira/browse/ARROW-3095) - [Python] test\_plasma.py fails
+* [ARROW-3098](https://issues.apache.org/jira/browse/ARROW-3098) - [Python] BufferReader doesn't adhere to the seek protocol
+* [ARROW-3100](https://issues.apache.org/jira/browse/ARROW-3100) - [CI] C/glib build broken on OS X
+* [ARROW-3125](https://issues.apache.org/jira/browse/ARROW-3125) - [Python] Update ASV instructions
+* [ARROW-3132](https://issues.apache.org/jira/browse/ARROW-3132) - Regenerate 0.10.0 changelog
+* [ARROW-3137](https://issues.apache.org/jira/browse/ARROW-3137) - [Python] pyarrow 0.10 requires newer version of numpy than specified in requirements
+* [ARROW-3140](https://issues.apache.org/jira/browse/ARROW-3140) - [Plasma] Plasma fails building with GPU enabled
+* [ARROW-3141](https://issues.apache.org/jira/browse/ARROW-3141) - [Python] Tensorflow support in pyarrow wheels pins numpy\>=1.14
+* [ARROW-3145](https://issues.apache.org/jira/browse/ARROW-3145) - [C++] Thrift compiler reruns in arrow/dbi/hiveserver2/thrift when using Ninja build
+* [ARROW-3173](https://issues.apache.org/jira/browse/ARROW-3173) - [Rust] dynamic\_types example does not run
+* [ARROW-3175](https://issues.apache.org/jira/browse/ARROW-3175) - [Java] Upgrade to official FlatBuffers release (Flatbuffers incompatibility)
+* [ARROW-3183](https://issues.apache.org/jira/browse/ARROW-3183) - [Python] get\_library\_dirs on Windows can give the wrong directory
+* [ARROW-3188](https://issues.apache.org/jira/browse/ARROW-3188) - [Python] Table.from\_arrays segfaults if lists and schema are passed
+* [ARROW-3190](https://issues.apache.org/jira/browse/ARROW-3190) - [C++] "WriteableFile" is misspelled, should be renamed "WritableFile" with deprecation for old name
+* [ARROW-3206](https://issues.apache.org/jira/browse/ARROW-3206) - [C++] Building with ARROW\_HIVESERVER2=ON with unit tests disabled causes error
+* [ARROW-3227](https://issues.apache.org/jira/browse/ARROW-3227) - [Python] NativeFile.write shouldn't accept unicode strings
+* [ARROW-3228](https://issues.apache.org/jira/browse/ARROW-3228) - [Python] Immutability of bytes is ignored
+* [ARROW-3231](https://issues.apache.org/jira/browse/ARROW-3231) - [Python] Sphinx's autodoc\_default\_flags is now deprecated
+* [ARROW-3237](https://issues.apache.org/jira/browse/ARROW-3237) - [CI] Update linux packaging filenames in rat exclusion list
+* [ARROW-3241](https://issues.apache.org/jira/browse/ARROW-3241) - [Plasma] test\_plasma\_list test failure on Ubuntu 14.04
+* [ARROW-3251](https://issues.apache.org/jira/browse/ARROW-3251) - [C++] Conversion warnings in cast.cc
+* [ARROW-3256](https://issues.apache.org/jira/browse/ARROW-3256) - [JS] File footer and message metadata is inconsistent
+* [ARROW-3271](https://issues.apache.org/jira/browse/ARROW-3271) - [Python] Manylinux1 builds timing out in Travis CI
+* [ARROW-3279](https://issues.apache.org/jira/browse/ARROW-3279) - [C++] Allow linking Arrow tests dynamically on Windows
+* [ARROW-3299](https://issues.apache.org/jira/browse/ARROW-3299) - [C++] Appveyor builds failing
+* [ARROW-3322](https://issues.apache.org/jira/browse/ARROW-3322) - [CI] Rust job always runs on AppVeyor
+* [ARROW-3327](https://issues.apache.org/jira/browse/ARROW-3327) - [Python] manylinux container confusing
+* [ARROW-3338](https://issues.apache.org/jira/browse/ARROW-3338) - [Python] Crash when schema and columns do not match
+* [ARROW-3342](https://issues.apache.org/jira/browse/ARROW-3342) - Appveyor builds have stopped triggering on GitHub
+* [ARROW-3348](https://issues.apache.org/jira/browse/ARROW-3348) - Plasma store dies when an object that a dead client is waiting for gets created.
+* [ARROW-3354](https://issues.apache.org/jira/browse/ARROW-3354) - [Python] read\_record\_batch interfaces differ in pyarrow and pyarrow.cuda
+* [ARROW-3369](https://issues.apache.org/jira/browse/ARROW-3369) - [Packaging] Wheel builds are failing due to wheel 0.32 release
+* [ARROW-3370](https://issues.apache.org/jira/browse/ARROW-3370) - [Packaging] Centos 6 build is failing
+* [ARROW-3373](https://issues.apache.org/jira/browse/ARROW-3373) - Fix bug in which plasma store can die when client gets multiple objects and object becomes available.
+* [ARROW-3374](https://issues.apache.org/jira/browse/ARROW-3374) - [Python] Dictionary has out-of-bound index when creating DictionaryArray from Pandas with NaN
+* [ARROW-3390](https://issues.apache.org/jira/browse/ARROW-3390) - [C++] cmake file under windows msys2 system doesn't work
+* [ARROW-3393](https://issues.apache.org/jira/browse/ARROW-3393) - [C++] Fix compiler warning in util/task-group.cc on clang 6
+* [ARROW-3394](https://issues.apache.org/jira/browse/ARROW-3394) - [Java] Remove duplicate dependency entry in Flight
+* [ARROW-3403](https://issues.apache.org/jira/browse/ARROW-3403) - [Website] Source tarball link missing from install page
+* [ARROW-3420](https://issues.apache.org/jira/browse/ARROW-3420) - [C++] Fix outstanding include-what-you-use issues in src/arrow, src/parquet codebases
+* [PARQUET-232](https://issues.apache.org/jira/browse/PARQUET-232) - minor compilation issue
+* [PARQUET-446](https://issues.apache.org/jira/browse/PARQUET-446) - Hide thrift dependency in parquet-cpp
+* [PARQUET-454](https://issues.apache.org/jira/browse/PARQUET-454) - Address inconsistencies in boolean decoding
+* [PARQUET-455](https://issues.apache.org/jira/browse/PARQUET-455) - Fix compiler warnings on OS X / Clang
+* [PARQUET-457](https://issues.apache.org/jira/browse/PARQUET-457) - Add compressed data page unit tests
+* [PARQUET-469](https://issues.apache.org/jira/browse/PARQUET-469) - Roll back Thrift bindings to 0.9.0
+* [PARQUET-472](https://issues.apache.org/jira/browse/PARQUET-472) - Clean up InputStream ownership semantics in ColumnReader
+* [PARQUET-505](https://issues.apache.org/jira/browse/PARQUET-505) - Column reader: automatically handle large data pages
+* [PARQUET-507](https://issues.apache.org/jira/browse/PARQUET-507) - Improve runtime of rle-test.cc
+* [PARQUET-513](https://issues.apache.org/jira/browse/PARQUET-513) - Valgrind errors are not failing the Travis CI build
+* [PARQUET-525](https://issues.apache.org/jira/browse/PARQUET-525) - Test coverage for malformed file failure modes on the read path
+* [PARQUET-537](https://issues.apache.org/jira/browse/PARQUET-537) - LocalFileSource leaks resources
+* [PARQUET-549](https://issues.apache.org/jira/browse/PARQUET-549) - Add scanner and column reader tests for dictionary data pages
+* [PARQUET-555](https://issues.apache.org/jira/browse/PARQUET-555) - Dictionary page metadata handling inconsistencies
+* [PARQUET-561](https://issues.apache.org/jira/browse/PARQUET-561) - ParquetFileReader::Contents PIMPL missing a virtual destructor
+* [PARQUET-599](https://issues.apache.org/jira/browse/PARQUET-599) - ColumnWriter::RleEncodeLevels' size estimation might be wrong
+* [PARQUET-604](https://issues.apache.org/jira/browse/PARQUET-604) - Install writer.h headers
+* [PARQUET-614](https://issues.apache.org/jira/browse/PARQUET-614) - C++: Remove unneeded LZ4-related code
+* [PARQUET-620](https://issues.apache.org/jira/browse/PARQUET-620) - C++: Duplicate calls to ParquetFileWriter::Close cause duplicate metadata writes
+* [PARQUET-621](https://issues.apache.org/jira/browse/PARQUET-621) - C++: Uninitialised DecimalMetadata is read
+* [PARQUET-629](https://issues.apache.org/jira/browse/PARQUET-629) - RowGroupSerializer should only close itself once
+* [PARQUET-639](https://issues.apache.org/jira/browse/PARQUET-639) - Do not export DCHECK in public headers
+* [PARQUET-643](https://issues.apache.org/jira/browse/PARQUET-643) - Add const modifier to schema pointer reference in ParquetFileWriter
+* [PARQUET-657](https://issues.apache.org/jira/browse/PARQUET-657) - [C++] Don't define DISALLOW\_COPY\_AND\_ASSIGN if already defined
+* [PARQUET-658](https://issues.apache.org/jira/browse/PARQUET-658) - ColumnReader has no virtual destructor
+* [PARQUET-659](https://issues.apache.org/jira/browse/PARQUET-659) - [C++] Instantiated template visibility is broken on clang / OS X
+* [PARQUET-662](https://issues.apache.org/jira/browse/PARQUET-662) - [C++] ParquetException must be explicitly exported in dynamic libraries
+* [PARQUET-676](https://issues.apache.org/jira/browse/PARQUET-676) - MAX\_VALUES\_PER\_LITERAL\_RUN causes RLE encoding failure
+* [PARQUET-691](https://issues.apache.org/jira/browse/PARQUET-691) - [C++] Write ColumnChunk metadata after each column chunk in the file
+* [PARQUET-694](https://issues.apache.org/jira/browse/PARQUET-694) - C++: Revert default data page size back to 1M
+* [PARQUET-700](https://issues.apache.org/jira/browse/PARQUET-700) - C++: Disable dictionary encoding for boolean columns
+* [PARQUET-701](https://issues.apache.org/jira/browse/PARQUET-701) - C++: Dictionary is written multiple times if close is called multiple times.
+* [PARQUET-702](https://issues.apache.org/jira/browse/PARQUET-702) - Add a writer + reader example with detailed comments
+* [PARQUET-703](https://issues.apache.org/jira/browse/PARQUET-703) - [C++] Validate num\_values metadata for columns with nulls
+* [PARQUET-704](https://issues.apache.org/jira/browse/PARQUET-704) - [C++] scan-all.h is not being installed
+* [PARQUET-708](https://issues.apache.org/jira/browse/PARQUET-708) - [C++] RleEncoder does not account for "worst case scenario" in MaxBufferSize for bit\_width \> 1
+* [PARQUET-710](https://issues.apache.org/jira/browse/PARQUET-710) - Remove unneeded private member variables from RowGroupReader ABI
+* [PARQUET-711](https://issues.apache.org/jira/browse/PARQUET-711) - Use metadata builders in parquet writer
+* [PARQUET-718](https://issues.apache.org/jira/browse/PARQUET-718) - Reading boolean pages written by parquet-cpp fails
+* [PARQUET-719](https://issues.apache.org/jira/browse/PARQUET-719) - Fix WriterBatch API to handle NULL values
+* [PARQUET-720](https://issues.apache.org/jira/browse/PARQUET-720) - Parquet-cpp fails to link when included in multiple TUs
+* [PARQUET-739](https://issues.apache.org/jira/browse/PARQUET-739) - Rle-decoding uses static buffer that is shared across threads
+* [PARQUET-741](https://issues.apache.org/jira/browse/PARQUET-741) - compression\_buffer\_ is reused although it shouldn't be
+* [PARQUET-742](https://issues.apache.org/jira/browse/PARQUET-742) - Add missing license headers
+* [PARQUET-745](https://issues.apache.org/jira/browse/PARQUET-745) - TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType
+* [PARQUET-747](https://issues.apache.org/jira/browse/PARQUET-747) - [C++] TypedRowGroupStatistics are not being exported in libparquet.so
+* [PARQUET-759](https://issues.apache.org/jira/browse/PARQUET-759) - Cannot store columns consisting of empty strings
+* [PARQUET-760](https://issues.apache.org/jira/browse/PARQUET-760) - On switching from dictionary to the fallback encoding, an incorrect encoding is set
+* [PARQUET-764](https://issues.apache.org/jira/browse/PARQUET-764) - [CPP] Parquet Writer does not write Boolean values correctly
+* [PARQUET-766](https://issues.apache.org/jira/browse/PARQUET-766) - C++: Expose ParquetFileReader through Arrow reader as const
+* [PARQUET-775](https://issues.apache.org/jira/browse/PARQUET-775) - C++: TrackingAllocator is not thread-safe
+* [PARQUET-779](https://issues.apache.org/jira/browse/PARQUET-779) - Export TypedRowGroupStatistics in libparquet
+* [PARQUET-780](https://issues.apache.org/jira/browse/PARQUET-780) - WriterBatch API does not properly handle NULL values for byte array types
+* [PARQUET-789](https://issues.apache.org/jira/browse/PARQUET-789) - [C++] Catch and translate ParquetException in parquet::arrow::FileReader::{ReadFlatColumn, ReadFlatTable}
+* [PARQUET-793](https://issues.apache.org/jira/browse/PARQUET-793) - [CPP] Do not return incorrect statistics
+* [PARQUET-797](https://issues.apache.org/jira/browse/PARQUET-797) - [C++] Update for API changes in ARROW-418
+* [PARQUET-799](https://issues.apache.org/jira/browse/PARQUET-799) - concurrent usage of the file reader API
+* [PARQUET-812](https://issues.apache.org/jira/browse/PARQUET-812) - [C++] Failure reading BYTE\_ARRAY data from file in parquet-compatibility project
+* [PARQUET-816](https://issues.apache.org/jira/browse/PARQUET-816) - [C++] Failure decoding sample dict-encoded file from parquet-compatibility project
+* [PARQUET-818](https://issues.apache.org/jira/browse/PARQUET-818) - [C++] Refactor library to share IO, Buffer, and memory management abstractions with Apache Arrow
+* [PARQUET-819](https://issues.apache.org/jira/browse/PARQUET-819) - C++: Trying to install non-existing parquet/arrow/utils.h
+* [PARQUET-827](https://issues.apache.org/jira/browse/PARQUET-827) - [C++] Incorporate addition of arrow::MemoryPool::Reallocate
+* [PARQUET-828](https://issues.apache.org/jira/browse/PARQUET-828) - [C++] "version" field set improperly in file metadata
+* [PARQUET-837](https://issues.apache.org/jira/browse/PARQUET-837) - [C++] SerializedFile::ParseMetaData uses Seek, followed by Read, and could have race conditions
+* [PARQUET-841](https://issues.apache.org/jira/browse/PARQUET-841) - [C++] Writing wrong format version when using ParquetVersion::PARQUET\_1\_0
+* [PARQUET-842](https://issues.apache.org/jira/browse/PARQUET-842) - [C++] Impala rejects DOUBLE columns if decimal metadata is set
+* [PARQUET-843](https://issues.apache.org/jira/browse/PARQUET-843) - [C++] Impala unable to read files created by parquet-cpp
+* [PARQUET-846](https://issues.apache.org/jira/browse/PARQUET-846) - [CPP] CpuInfo::Init() is not thread safe
+* [PARQUET-880](https://issues.apache.org/jira/browse/PARQUET-880) - [CPP] Prevent destructors from throwing
+* [PARQUET-888](https://issues.apache.org/jira/browse/PARQUET-888) - C++ Memory leak in RowGroupSerializer
+* [PARQUET-889](https://issues.apache.org/jira/browse/PARQUET-889) - Fix compilation when PARQUET\_USE\_SSE is on
+* [PARQUET-892](https://issues.apache.org/jira/browse/PARQUET-892) - [C++] Clean up link library targets in CMake files
+* [PARQUET-895](https://issues.apache.org/jira/browse/PARQUET-895) - Reading of nested columns is broken
+* [PARQUET-898](https://issues.apache.org/jira/browse/PARQUET-898) - [C++] Change Travis CI OS X image to Xcode 6.4 and fix our thirdparty build
+* [PARQUET-908](https://issues.apache.org/jira/browse/PARQUET-908) - Fix for PARQUET-890 introduces undefined symbol in libparquet\_arrow.so
+* [PARQUET-914](https://issues.apache.org/jira/browse/PARQUET-914) - [C++] Throw more informative exception when user writes too many values to a column in a row group
+* [PARQUET-915](https://issues.apache.org/jira/browse/PARQUET-915) - Support Arrow Time Types in Schema
+* [PARQUET-918](https://issues.apache.org/jira/browse/PARQUET-918) - FromParquetSchema API crashes on nested schemas
+* [PARQUET-919](https://issues.apache.org/jira/browse/PARQUET-919) - [C++] Account for API changes in ARROW-683
+* [PARQUET-923](https://issues.apache.org/jira/browse/PARQUET-923) - [C++] Account for Time metadata changes in ARROW-686
+* [PARQUET-933](https://issues.apache.org/jira/browse/PARQUET-933) - [C++] Account for Arrow Table API changes coming in ARROW-728
+* [PARQUET-936](https://issues.apache.org/jira/browse/PARQUET-936) - [C++] parquet::arrow::WriteTable can enter infinite loop if chunk\_size is 0
+* [PARQUET-943](https://issues.apache.org/jira/browse/PARQUET-943) - [C++] Overflow build error on x86
+* [PARQUET-947](https://issues.apache.org/jira/browse/PARQUET-947) - [C++] Refactor to account for ARROW-795 Arrow core library consolidation
+* [PARQUET-958](https://issues.apache.org/jira/browse/PARQUET-958) - [C++] Print Parquet metadata in JSON format
+* [PARQUET-963](https://issues.apache.org/jira/browse/PARQUET-963) - [C++] Disallow reading struct types in Arrow reader for now
+* [PARQUET-965](https://issues.apache.org/jira/browse/PARQUET-965) - [C++] FIXED\_LEN\_BYTE\_ARRAY types are unhandled in the Arrow reader
+* [PARQUET-979](https://issues.apache.org/jira/browse/PARQUET-979) - [C++] Limit size of min, max or disable stats for long binary types
+* [PARQUET-992](https://issues.apache.org/jira/browse/PARQUET-992) - [C++] parquet/compression.h leaks zlib.h
+* [PARQUET-995](https://issues.apache.org/jira/browse/PARQUET-995) - [C++] Int96 reader in parquet\_arrow uses size of Int96Type instead of Int96
+* [PARQUET-997](https://issues.apache.org/jira/browse/PARQUET-997) - Fix override compiler warnings
+* [PARQUET-1002](https://issues.apache.org/jira/browse/PARQUET-1002) - [C++] Compute statistics based on Logical Types
+* [PARQUET-1003](https://issues.apache.org/jira/browse/PARQUET-1003) - [C++] Modify DEFAULT\_CREATED\_BY value for every new release version
+* [PARQUET-1007](https://issues.apache.org/jira/browse/PARQUET-1007) - [C++] Update parquet.thrift from https://github.com/apache/parquet-format
+* [PARQUET-1029](https://issues.apache.org/jira/browse/PARQUET-1029) - [C++] TypedColumnReader/TypeColumnWriter symbols are no longer being exported
+* [PARQUET-1033](https://issues.apache.org/jira/browse/PARQUET-1033) - Mismatched Read and Write
+* [PARQUET-1038](https://issues.apache.org/jira/browse/PARQUET-1038) - Key value metadata should be nullptr if not set
+* [PARQUET-1040](https://issues.apache.org/jira/browse/PARQUET-1040) - Missing writer method implementations
+* [PARQUET-1042](https://issues.apache.org/jira/browse/PARQUET-1042) - C++: Compilation breaks on GCC 4.8
+* [PARQUET-1048](https://issues.apache.org/jira/browse/PARQUET-1048) - [C++] Static linking of libarrow is no longer supported
+* [PARQUET-1054](https://issues.apache.org/jira/browse/PARQUET-1054) - [C++] Account for Arrow API changes in ARROW-1199
+* [PARQUET-1071](https://issues.apache.org/jira/browse/PARQUET-1071) - [C++] parquet::arrow::FileWriter::Close is not idempotent
+* [PARQUET-1085](https://issues.apache.org/jira/browse/PARQUET-1085) - [C++] Backwards compatibility from macro cleanup in transitive dependencies in ARROW-1452
+* [PARQUET-1088](https://issues.apache.org/jira/browse/PARQUET-1088) - [CPP] remove parquet\_version.h from version control since it gets auto generated
+* [PARQUET-1090](https://issues.apache.org/jira/browse/PARQUET-1090) - [C++] Fix int32 overflow in Arrow table writer, add max row group size property
+* [PARQUET-1098](https://issues.apache.org/jira/browse/PARQUET-1098) - [C++] Install new header in parquet/util
+* [PARQUET-1100](https://issues.apache.org/jira/browse/PARQUET-1100) - [C++] Reading repeated types should decode number of records rather than number of values
+* [PARQUET-1108](https://issues.apache.org/jira/browse/PARQUET-1108) - [C++] Fix Int96 comparators
+* [PARQUET-1114](https://issues.apache.org/jira/browse/PARQUET-1114) - Apply fix for ARROW-1601 and ARROW-1611 to parquet-cpp
+* [PARQUET-1121](https://issues.apache.org/jira/browse/PARQUET-1121) - C++: DictionaryArrays of NullType cannot be written
+* [PARQUET-1123](https://issues.apache.org/jira/browse/PARQUET-1123) - [C++] Update parquet-cpp to use Arrow's AssertArraysEqual
+* [PARQUET-1138](https://issues.apache.org/jira/browse/PARQUET-1138) - [C++] Fix compilation with Arrow 0.7.1
+* [PARQUET-1167](https://issues.apache.org/jira/browse/PARQUET-1167) - [C++] FieldToNode function should return a status when throwing an exception
+* [PARQUET-1175](https://issues.apache.org/jira/browse/PARQUET-1175) - [C++] Fix usage of deprecated Arrow API
+* [PARQUET-1179](https://issues.apache.org/jira/browse/PARQUET-1179) - [C++] Support Apache Thrift 0.11
+* [PARQUET-1180](https://issues.apache.org/jira/browse/PARQUET-1180) - C++: Fix behaviour of num\_children element of primitive nodes
+* [PARQUET-1193](https://issues.apache.org/jira/browse/PARQUET-1193) - [CPP] Implement ColumnOrder to support min\_value and max\_value
+* [PARQUET-1226](https://issues.apache.org/jira/browse/PARQUET-1226) - [C++] Fix new build warnings with clang 5.0
+* [PARQUET-1233](https://issues.apache.org/jira/browse/PARQUET-1233) - [CPP] Enable option to switch between stl classes and boost classes for thrift header
+* [PARQUET-1245](https://issues.apache.org/jira/browse/PARQUET-1245) - [C++] Segfault when writing Arrow table with duplicate columns
+* [PARQUET-1255](https://issues.apache.org/jira/browse/PARQUET-1255) - [C++] Exceptions thrown in some tests
+* [PARQUET-1265](https://issues.apache.org/jira/browse/PARQUET-1265) - Segfault on static ApplicationVersion initialization
+* [PARQUET-1268](https://issues.apache.org/jira/browse/PARQUET-1268) - [C++] Conversion of Arrow null list columns fails
+* [PARQUET-1270](https://issues.apache.org/jira/browse/PARQUET-1270) - [C++] Executable tools do not get installed
+* [PARQUET-1272](https://issues.apache.org/jira/browse/PARQUET-1272) - [C++] ScanFileContents reports wrong row count for nested columns
+* [PARQUET-1273](https://issues.apache.org/jira/browse/PARQUET-1273) - [Python] Error writing to partitioned Parquet dataset
+* [PARQUET-1274](https://issues.apache.org/jira/browse/PARQUET-1274) - [Python] SegFault in pyarrow.parquet.write\_table with specific options
+* [PARQUET-1283](https://issues.apache.org/jira/browse/PARQUET-1283) - [C++] FormatStatValue appends trailing space to string and int96
+* [PARQUET-1307](https://issues.apache.org/jira/browse/PARQUET-1307) - [C++] memory-test fails with latest Arrow
+* [PARQUET-1315](https://issues.apache.org/jira/browse/PARQUET-1315) - [C++] ColumnChunkMetaData.has\_dictionary\_page() should return bool, not int64\_t
+* [PARQUET-1333](https://issues.apache.org/jira/browse/PARQUET-1333) - [C++] Reading of files with dictionary size 0 fails on Windows with bad\_alloc
+* [PARQUET-1334](https://issues.apache.org/jira/browse/PARQUET-1334) - [C++] memory\_map parameter seems misleading in parquet file opener
+* [PARQUET-1357](https://issues.apache.org/jira/browse/PARQUET-1357) - [C++] FormatStatValue truncates binary statistics on zero character
+* [PARQUET-1358](https://issues.apache.org/jira/browse/PARQUET-1358) - [C++] index\_page\_offset should be unset as it is not supported.
+* [PARQUET-1369](https://issues.apache.org/jira/browse/PARQUET-1369) - [Python] Unavailable Parquet column statistics from Spark-generated file
+* [PARQUET-1384](https://issues.apache.org/jira/browse/PARQUET-1384) - [C++] Clang compiler warnings in bloom\_filter-test.cc
+
+
+
+# Apache Arrow 0.10.0 (2018-08-06)
+
+## Bug Fixes
+
+* [ARROW-198](https://issues.apache.org/jira/browse/ARROW-198) - [Java] OutOfMemoryError for vector test case
+* [ARROW-640](https://issues.apache.org/jira/browse/ARROW-640) - [Python] Arrow scalar values should have a sensible \_\_hash\_\_ and comparison
+* [ARROW-2020](https://issues.apache.org/jira/browse/ARROW-2020) - [Python] Parquet segfaults if coercing ns timestamps and writing 96-bit timestamps
+* [ARROW-2059](https://issues.apache.org/jira/browse/ARROW-2059) - [Python] Possible performance regression in Feather read/write path
+* [ARROW-2101](https://issues.apache.org/jira/browse/ARROW-2101) - [Python] from\_pandas reads 'str' type as binary Arrow data with Python 2
+* [ARROW-2122](https://issues.apache.org/jira/browse/ARROW-2122) - [Python] Pyarrow fails to serialize dataframe with timestamp.
+* [ARROW-2182](https://issues.apache.org/jira/browse/ARROW-2182) - [Python] ASV benchmark setup does not account for C++ library changing
+* [ARROW-2189](https://issues.apache.org/jira/browse/ARROW-2189) - [C++] Seg. fault on make\_shared<PoolBuffer\>
+* [ARROW-2193](https://issues.apache.org/jira/browse/ARROW-2193) - [Plasma] plasma\_store has runtime dependency on Boost shared libraries when ARROW\_BOOST\_USE\_SHARED=on
+* [ARROW-2195](https://issues.apache.org/jira/browse/ARROW-2195) - [Plasma] Segfault when retrieving RecordBatch from plasma store
+* [ARROW-2247](https://issues.apache.org/jira/browse/ARROW-2247) - [Python] Statically-linking boost\_regex in both libarrow and libparquet results in segfault
+* [ARROW-2273](https://issues.apache.org/jira/browse/ARROW-2273) - Cannot deserialize pandas SparseDataFrame
+* [ARROW-2300](https://issues.apache.org/jira/browse/ARROW-2300) - [Python] python/testing/test\_hdfs.sh no longer works
+* [ARROW-2305](https://issues.apache.org/jira/browse/ARROW-2305) - [Python] Cython 0.25.2 compilation failure
+* [ARROW-2314](https://issues.apache.org/jira/browse/ARROW-2314) - [Python] Union array slicing is defective
+* [ARROW-2326](https://issues.apache.org/jira/browse/ARROW-2326) - [Python] cannot import pip installed pyarrow on OS X (10.9)
+* [ARROW-2328](https://issues.apache.org/jira/browse/ARROW-2328) - Writing a slice with feather ignores the offset
+* [ARROW-2331](https://issues.apache.org/jira/browse/ARROW-2331) - [Python] Fix indexing implementations
+* [ARROW-2333](https://issues.apache.org/jira/browse/ARROW-2333) - [Python] boost bundling fails in setup.py
+* [ARROW-2342](https://issues.apache.org/jira/browse/ARROW-2342) - [Python] Aware timestamp type fails pickling
+* [ARROW-2346](https://issues.apache.org/jira/browse/ARROW-2346) - [Python] PYARROW\_CXXFLAGS doesn't accept multiple options
+* [ARROW-2349](https://issues.apache.org/jira/browse/ARROW-2349) - [Python] Boost shared library bundling is broken for MSVC
+* [ARROW-2351](https://issues.apache.org/jira/browse/ARROW-2351) - [C++] StringBuilder::append(vector<string\>...) not implemented
+* [ARROW-2354](https://issues.apache.org/jira/browse/ARROW-2354) - [C++] PyDecimal\_Check() is much too slow
+* [ARROW-2355](https://issues.apache.org/jira/browse/ARROW-2355) - [Python] Unable to import pyarrow [0.9.0] OSX
+* [ARROW-2357](https://issues.apache.org/jira/browse/ARROW-2357) - Benchmark PandasObjectIsNull
+* [ARROW-2368](https://issues.apache.org/jira/browse/ARROW-2368) - DecimalVector\#setBigEndian is not padding correctly for negative values
+* [ARROW-2369](https://issues.apache.org/jira/browse/ARROW-2369) - Large (\>\~20 GB) files written to Parquet via PyArrow are corrupted
+* [ARROW-2370](https://issues.apache.org/jira/browse/ARROW-2370) - [GLib] include path is wrong on Meson build
+* [ARROW-2371](https://issues.apache.org/jira/browse/ARROW-2371) - [GLib] gio-2.0 isn't required on GNU Autotools build
+* [ARROW-2372](https://issues.apache.org/jira/browse/ARROW-2372) - [Python] ArrowIOError: Invalid argument when reading Parquet file
+* [ARROW-2375](https://issues.apache.org/jira/browse/ARROW-2375) - [Rust] Buffer should release memory when dropped
+* [ARROW-2377](https://issues.apache.org/jira/browse/ARROW-2377) - [GLib] Travis-CI failures
+* [ARROW-2380](https://issues.apache.org/jira/browse/ARROW-2380) - [Python] Correct issues in numpy\_to\_arrow conversion routines
+* [ARROW-2382](https://issues.apache.org/jira/browse/ARROW-2382) - [Rust] List<T\> was not using memory safely
+* [ARROW-2383](https://issues.apache.org/jira/browse/ARROW-2383) - [C++] Debian packages need to depend on libprotobuf
+* [ARROW-2387](https://issues.apache.org/jira/browse/ARROW-2387) - [Python] negative decimal values get spurious rescaling error
+* [ARROW-2391](https://issues.apache.org/jira/browse/ARROW-2391) - [Python] Segmentation fault from PyArrow when mapping Pandas datetime column to pyarrow.date64
+* [ARROW-2393](https://issues.apache.org/jira/browse/ARROW-2393) - [C++] arrow/status.h does not define ARROW\_CHECK needed for ARROW\_CHECK\_OK
+* [ARROW-2403](https://issues.apache.org/jira/browse/ARROW-2403) - [C++] arrow::CpuInfo::model\_name\_ destructed twice on exit
+* [ARROW-2405](https://issues.apache.org/jira/browse/ARROW-2405) - [C++] <functional\> is missing in plasma/client.h
+* [ARROW-2418](https://issues.apache.org/jira/browse/ARROW-2418) - [Rust] List builder fails due to memory not being reserved correctly
+* [ARROW-2419](https://issues.apache.org/jira/browse/ARROW-2419) - [Site] Website generation depends on local timezone
+* [ARROW-2420](https://issues.apache.org/jira/browse/ARROW-2420) - [Rust] Memory is never released
+* [ARROW-2421](https://issues.apache.org/jira/browse/ARROW-2421) - [C++] Update LLVM version in cpp README
+* [ARROW-2423](https://issues.apache.org/jira/browse/ARROW-2423) - [Python] PyArrow datatypes raise ValueError on equality checks against non-PyArrow objects
+* [ARROW-2424](https://issues.apache.org/jira/browse/ARROW-2424) - [Rust] Missing import causing broken build
+* [ARROW-2425](https://issues.apache.org/jira/browse/ARROW-2425) - [Rust] Array::from missing mapping for u8 type
+* [ARROW-2426](https://issues.apache.org/jira/browse/ARROW-2426) - [CI] glib build failure
+* [ARROW-2432](https://issues.apache.org/jira/browse/ARROW-2432) - [Python] from\_pandas fails when converting decimals if have None values
+* [ARROW-2437](https://issues.apache.org/jira/browse/ARROW-2437) - [C++] Change of arrow::ipc::ReadMessage signature breaks ABI compatibility
+* [ARROW-2438](https://issues.apache.org/jira/browse/ARROW-2438) - [Rust] memory\_pool.rs misses license header
+* [ARROW-2441](https://issues.apache.org/jira/browse/ARROW-2441) - [Rust] Builder<T\>::slice\_mut assertions are too strict
+* [ARROW-2443](https://issues.apache.org/jira/browse/ARROW-2443) - [Python] Conversion from pandas of empty categorical fails with ArrowInvalid
+* [ARROW-2450](https://issues.apache.org/jira/browse/ARROW-2450) - [Python] Saving to parquet fails for empty lists
+* [ARROW-2452](https://issues.apache.org/jira/browse/ARROW-2452) - [TEST] Spark integration test fails with permission error
+* [ARROW-2454](https://issues.apache.org/jira/browse/ARROW-2454) - [Python] Empty chunked array slice crashes
+* [ARROW-2455](https://issues.apache.org/jira/browse/ARROW-2455) - [C++] The bytes\_allocated\_ in CudaContextImpl isn't initialized
+* [ARROW-2457](https://issues.apache.org/jira/browse/ARROW-2457) - garrow\_array\_builder\_append\_values() won't work for large arrays
+* [ARROW-2459](https://issues.apache.org/jira/browse/ARROW-2459) - pyarrow: Segfault with pyarrow.deserialize\_pandas
+* [ARROW-2462](https://issues.apache.org/jira/browse/ARROW-2462) - [C++] Segfault when writing a parquet table containing a dictionary column from Record Batch Stream
+* [ARROW-2465](https://issues.apache.org/jira/browse/ARROW-2465) - [Plasma] plasma\_store fails to find libarrow\_gpu.so
+* [ARROW-2466](https://issues.apache.org/jira/browse/ARROW-2466) - [C++] misleading "append" flag to FileOutputStream
+* [ARROW-2468](https://issues.apache.org/jira/browse/ARROW-2468) - [Rust] Builder::slice\_mut should take mut self
+* [ARROW-2471](https://issues.apache.org/jira/browse/ARROW-2471) - [Rust] Assertion when pushing value to Builder/ListBuilder with zero capacity
+* [ARROW-2473](https://issues.apache.org/jira/browse/ARROW-2473) - [Rust] List assertion error with list of zero length
+* [ARROW-2474](https://issues.apache.org/jira/browse/ARROW-2474) - [Rust] Add windows support for memory pool abstraction
+* [ARROW-2489](https://issues.apache.org/jira/browse/ARROW-2489) - [Plasma] test\_plasma.py crashes
+* [ARROW-2491](https://issues.apache.org/jira/browse/ARROW-2491) - [Python] Array.from\_buffers does not work for ListArray
+* [ARROW-2492](https://issues.apache.org/jira/browse/ARROW-2492) - [Python] Prevent segfault on accidental call of pyarrow.Array
+* [ARROW-2500](https://issues.apache.org/jira/browse/ARROW-2500) - [Java] IPC Writers/readers are not always setting validity bits correctly
+* [ARROW-2502](https://issues.apache.org/jira/browse/ARROW-2502) - [Rust] Restore Windows Compatibility
+* [ARROW-2503](https://issues.apache.org/jira/browse/ARROW-2503) - [Python] Trailing space character in RowGroup statistics of pyarrow.parquet.ParquetFile
+* [ARROW-2509](https://issues.apache.org/jira/browse/ARROW-2509) - [CI] Intermittent npm failures
+* [ARROW-2510](https://issues.apache.org/jira/browse/ARROW-2510) - [Python] Segmentation fault when converting empty column as categorical
+* [ARROW-2511](https://issues.apache.org/jira/browse/ARROW-2511) - BaseVariableWidthVector.allocateNew is not throwing OOM when it can't allocate memory
+* [ARROW-2514](https://issues.apache.org/jira/browse/ARROW-2514) - [Python] Inferring / converting nested Numpy array is very slow
+* [ARROW-2515](https://issues.apache.org/jira/browse/ARROW-2515) - Errors with DictionaryArray inside of ListArray or other DictionaryArray
+* [ARROW-2518](https://issues.apache.org/jira/browse/ARROW-2518) - [Java] Restore Java unit tests and javadoc test to CI matrix
+* [ARROW-2530](https://issues.apache.org/jira/browse/ARROW-2530) - [GLib] Out-of-source build is failed
+* [ARROW-2534](https://issues.apache.org/jira/browse/ARROW-2534) - [C++] libarrow.so leaks zlib symbols
+* [ARROW-2545](https://issues.apache.org/jira/browse/ARROW-2545) - [Python] Arrow fails linking against statically-compiled Python
+* [ARROW-2554](https://issues.apache.org/jira/browse/ARROW-2554) - pa.array type inference bug when using NS-timestamp
+* [ARROW-2557](https://issues.apache.org/jira/browse/ARROW-2557) - [Rust] Add badge for code coverage in README
+* [ARROW-2561](https://issues.apache.org/jira/browse/ARROW-2561) - [C++] Crash in cuda-test shutdown with coverage enabled
+* [ARROW-2564](https://issues.apache.org/jira/browse/ARROW-2564) - [C++] Rowwise Tutorial is out of date
+* [ARROW-2565](https://issues.apache.org/jira/browse/ARROW-2565) - [Plasma] new subscriber cannot receive notifications about existing objects
+* [ARROW-2570](https://issues.apache.org/jira/browse/ARROW-2570) - [Python] Add support for writing parquet files with LZ4 compression
+* [ARROW-2571](https://issues.apache.org/jira/browse/ARROW-2571) - [C++] Lz4Codec doesn't properly handle empty data
+* [ARROW-2575](https://issues.apache.org/jira/browse/ARROW-2575) - [Python] Exclude hidden files when reading Parquet dataset
+* [ARROW-2578](https://issues.apache.org/jira/browse/ARROW-2578) - [Plasma] Valgrind errors related to std::random\_device
+* [ARROW-2589](https://issues.apache.org/jira/browse/ARROW-2589) - [Python] test\_parquet.py regression with Pandas 0.23.0
+* [ARROW-2593](https://issues.apache.org/jira/browse/ARROW-2593) - [Python] TypeError: data type "mixed-integer" not understood
+* [ARROW-2594](https://issues.apache.org/jira/browse/ARROW-2594) - [Java] Vector reallocation does not properly clear reused buffers
+* [ARROW-2599](https://issues.apache.org/jira/browse/ARROW-2599) - [Python] pip install is not working without Arrow C++ being installed
+* [ARROW-2601](https://issues.apache.org/jira/browse/ARROW-2601) - [Python] MemoryPool bytes\_allocated causes seg
+* [ARROW-2603](https://issues.apache.org/jira/browse/ARROW-2603) - [Python] from pandas raises ArrowInvalid for date(time) subclasses
+* [ARROW-2615](https://issues.apache.org/jira/browse/ARROW-2615) - [Rust] Refactor introduced a bug around Arrays of String
+* [ARROW-2622](https://issues.apache.org/jira/browse/ARROW-2622) - [C++] Array methods IsNull and IsValid are not complementary
+* [ARROW-2629](https://issues.apache.org/jira/browse/ARROW-2629) - [Plasma] Iterator invalidation for pending\_notifications\_
+* [ARROW-2630](https://issues.apache.org/jira/browse/ARROW-2630) - [Java] Typo in the document
+* [ARROW-2632](https://issues.apache.org/jira/browse/ARROW-2632) - [Java] ArrowStreamWriter accumulates ArrowBlock but does not use them
+* [ARROW-2640](https://issues.apache.org/jira/browse/ARROW-2640) - JS Writer should serialize schema metadata
+* [ARROW-2642](https://issues.apache.org/jira/browse/ARROW-2642) - [Python] Fail building parquet binding on Windows
+* [ARROW-2643](https://issues.apache.org/jira/browse/ARROW-2643) - [C++] Travis-CI build failure with cpp toolchain enabled
+* [ARROW-2644](https://issues.apache.org/jira/browse/ARROW-2644) - [Python] parquet binding fails building on AppVeyor
+* [ARROW-2655](https://issues.apache.org/jira/browse/ARROW-2655) - [C++] Failure with -Werror=conversion on gcc 7.3.0
+* [ARROW-2657](https://issues.apache.org/jira/browse/ARROW-2657) - Segfault when importing TensorFlow after Pyarrow
+* [ARROW-2668](https://issues.apache.org/jira/browse/ARROW-2668) - [C++] -Wnull-pointer-arithmetic warning with dlmalloc.c on clang 6.0, Ubuntu 14.04
+* [ARROW-2669](https://issues.apache.org/jira/browse/ARROW-2669) - [C++] EP\_CXX\_FLAGS not passed on when building gbenchmark
+* [ARROW-2675](https://issues.apache.org/jira/browse/ARROW-2675) - Arrow build error with clang-10 (Apple Clang / LLVM)
+* [ARROW-2683](https://issues.apache.org/jira/browse/ARROW-2683) - [Python] Resource Warning (Unclosed File) when using pyarrow.parquet.read\_table()
+* [ARROW-2690](https://issues.apache.org/jira/browse/ARROW-2690) - [C++] Plasma does not follow style conventions for variable and function names
+* [ARROW-2691](https://issues.apache.org/jira/browse/ARROW-2691) - [Rust] Travis fails due to formatting diff
+* [ARROW-2693](https://issues.apache.org/jira/browse/ARROW-2693) - [Python] pa.chunked\_array causes a segmentation fault on empty input (see the sketch after this list)
+* [ARROW-2694](https://issues.apache.org/jira/browse/ARROW-2694) - [Python] ArrayValue string conversion returns the representation instead of the converted python object string
+* [ARROW-2698](https://issues.apache.org/jira/browse/ARROW-2698) - [Python] Exception when passing a string to Table.column
+* [ARROW-2711](https://issues.apache.org/jira/browse/ARROW-2711) - [Python/C++] Pandas-Arrow doesn't roundtrip when column of lists has empty first element
+* [ARROW-2715](https://issues.apache.org/jira/browse/ARROW-2715) - Address apt flakiness with launchpad.net
+* [ARROW-2716](https://issues.apache.org/jira/browse/ARROW-2716) - [Python] Make manylinux1 base image independent of Python patch releases
+* [ARROW-2721](https://issues.apache.org/jira/browse/ARROW-2721) - [C++] Link error with Arrow C++ build with -DARROW\_ORC=ON on CentOS 7
+* [ARROW-2722](https://issues.apache.org/jira/browse/ARROW-2722) - [Python] ndarray to arrow conversion fails when downcasted from pandas to\_numeric
+* [ARROW-2723](https://issues.apache.org/jira/browse/ARROW-2723) - [C++] arrow-orc.pc is missing
+* [ARROW-2726](https://issues.apache.org/jira/browse/ARROW-2726) - [C++] The latest Boost version is wrong
+* [ARROW-2727](https://issues.apache.org/jira/browse/ARROW-2727) - [Java] Unable to build java/adapters module
+* [ARROW-2741](https://issues.apache.org/jira/browse/ARROW-2741) - [Python] pa.array from np.datetime64[D] and type=pa.date64 produces invalid results
+* [ARROW-2744](https://issues.apache.org/jira/browse/ARROW-2744) - [Python] Writing to parquet crashes when writing a ListArray of empty lists
+* [ARROW-2745](https://issues.apache.org/jira/browse/ARROW-2745) - [C++] ORC ExternalProject needs to declare dependency on vendored protobuf
+* [ARROW-2747](https://issues.apache.org/jira/browse/ARROW-2747) - [CI] [Plasma] huge tables test failure on Travis
+* [ARROW-2754](https://issues.apache.org/jira/browse/ARROW-2754) - [Python] When installing pyarrow via pip, a debug build is created
+* [ARROW-2770](https://issues.apache.org/jira/browse/ARROW-2770) - [Packaging] Account for conda-forge compiler migration in conda recipes
+* [ARROW-2773](https://issues.apache.org/jira/browse/ARROW-2773) - [Python] Corrected parquet docs partition\_cols parameter name
+* [ARROW-2781](https://issues.apache.org/jira/browse/ARROW-2781) - [Python] Download boost using curl in manylinux1 image
+* [ARROW-2787](https://issues.apache.org/jira/browse/ARROW-2787) - [Python] Memory Issue passing table from python to c++ via cython
+* [ARROW-2795](https://issues.apache.org/jira/browse/ARROW-2795) - [Python] Run TensorFlow import workaround only on Linux
+* [ARROW-2806](https://issues.apache.org/jira/browse/ARROW-2806) - [Python] Inconsistent handling of np.nan
+* [ARROW-2810](https://issues.apache.org/jira/browse/ARROW-2810) - [Plasma] Plasma public headers leak flatbuffers.h
+* [ARROW-2812](https://issues.apache.org/jira/browse/ARROW-2812) - [Ruby] StructArray\#[] raises NoMethodError
+* [ARROW-2820](https://issues.apache.org/jira/browse/ARROW-2820) - [Python] RecordBatch.from\_arrays does not validate array lengths are all equal
+* [ARROW-2823](https://issues.apache.org/jira/browse/ARROW-2823) - [C++] Search for flatbuffers in <root\>/lib64
+* [ARROW-2841](https://issues.apache.org/jira/browse/ARROW-2841) - [Go] Fix recent Go build failures in Travis CI
+* [ARROW-2850](https://issues.apache.org/jira/browse/ARROW-2850) - [C++/Python] PARQUET\_RPATH\_ORIGIN=ON missing in manylinux1 build
+* [ARROW-2851](https://issues.apache.org/jira/browse/ARROW-2851) - [C++] Update RAT excludes for new install file names
+* [ARROW-2852](https://issues.apache.org/jira/browse/ARROW-2852) - [Rust] Mark Array as Sync and Send
+* [ARROW-2856](https://issues.apache.org/jira/browse/ARROW-2856) - [Python/C++] Array constructor should not truncate floats when casting to int
+* [ARROW-2862](https://issues.apache.org/jira/browse/ARROW-2862) - [C++] Ensure thirdparty download directory has been created in thirdparty/download\_thirdparty.sh
+* [ARROW-2867](https://issues.apache.org/jira/browse/ARROW-2867) - [Python] Incorrect example for Cython usage
+* [ARROW-2871](https://issues.apache.org/jira/browse/ARROW-2871) - [Python] Array.to\_numpy is invalid for boolean arrays
+* [ARROW-2872](https://issues.apache.org/jira/browse/ARROW-2872) - [Python] Add pytest mark to opt into TensorFlow-related unit tests
+* [ARROW-2876](https://issues.apache.org/jira/browse/ARROW-2876) - [Packaging] Crossbow builds can hang if you cloned using SSH
+* [ARROW-2877](https://issues.apache.org/jira/browse/ARROW-2877) - [Packaging] crossbow submit results in duplicate Travis CI build
+* [ARROW-2878](https://issues.apache.org/jira/browse/ARROW-2878) - [Packaging] README.md does not mention setting GitHub API token in user's crossbow repo settings
+* [ARROW-2883](https://issues.apache.org/jira/browse/ARROW-2883) - [Plasma] Compilation warnings
+* [ARROW-2891](https://issues.apache.org/jira/browse/ARROW-2891) - [Python] Preserve schema in write\_to\_dataset
+* [ARROW-2894](https://issues.apache.org/jira/browse/ARROW-2894) - [Glib] Format tests broken due to recent refactor
+* [ARROW-2895](https://issues.apache.org/jira/browse/ARROW-2895) - [Ruby] CI isn't run when C++ is changed
+* [ARROW-2896](https://issues.apache.org/jira/browse/ARROW-2896) - [GLib] exports are missing
+* [ARROW-2901](https://issues.apache.org/jira/browse/ARROW-2901) - [Java] Build is failing on Java9
+* [ARROW-2902](https://issues.apache.org/jira/browse/ARROW-2902) - [Python] HDFS Docker integration tests leave around files created by root
+* [ARROW-2903](https://issues.apache.org/jira/browse/ARROW-2903) - [C++] Setting -DARROW\_HDFS=OFF breaks arrow build when linking against boost libraries
+* [ARROW-2911](https://issues.apache.org/jira/browse/ARROW-2911) - [Python] Parquet binary statistics that end in '\0' truncate last byte
+* [ARROW-2917](https://issues.apache.org/jira/browse/ARROW-2917) - [Python] Tensor requiring gradient cannot be serialized with pyarrow.serialize
+* [ARROW-2920](https://issues.apache.org/jira/browse/ARROW-2920) - [Python] Segfault with pytorch 0.4
+* [ARROW-2926](https://issues.apache.org/jira/browse/ARROW-2926) - [Python] ParquetWriter segfaults in example where passed schema and table schema do not match
+* [ARROW-2930](https://issues.apache.org/jira/browse/ARROW-2930) - [C++] Trying to set target properties on a non-existent CMake target
+* [ARROW-2940](https://issues.apache.org/jira/browse/ARROW-2940) - [Python] Import error with pytorch 0.3
+* [ARROW-2945](https://issues.apache.org/jira/browse/ARROW-2945) - [Packaging] Update argument check for 02-source.sh
+* [ARROW-2955](https://issues.apache.org/jira/browse/ARROW-2955) - [Python] Typo in pyarrow's HDFS API result
+* [ARROW-2963](https://issues.apache.org/jira/browse/ARROW-2963) - [Python] Deadlock during fork-join and use\_threads=True
+* [ARROW-2978](https://issues.apache.org/jira/browse/ARROW-2978) - [Rust] Travis CI build is failing
+* [ARROW-2982](https://issues.apache.org/jira/browse/ARROW-2982) - The "--show-progress" option is only supported in wget 1.16 and higher
+* [ARROW-3210](https://issues.apache.org/jira/browse/ARROW-3210) - [Python] Creating ParquetDataset creates partitioned ParquetFiles with mismatched Parquet schemas
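+
+As noted in the ARROW-2693 entry above, several of these fixes concern crashes on empty input (see also ARROW-2454 and ARROW-2510). A minimal pyarrow sketch of that case, assuming a build that includes these fixes; passing an explicit `type` sidesteps type inference on empty input:
+
+```python
+import pyarrow as pa
+
+# Before the ARROW-2693 fix, empty input could segfault;
+# with the fix it yields an empty ChunkedArray.
+empty = pa.chunked_array([], type=pa.int64())
+assert len(empty) == 0
+assert empty.type == pa.int64()
+```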
+
+
+## New Features and Improvements
+
+* [ARROW-530](https://issues.apache.org/jira/browse/ARROW-530) - C++/Python: Provide subpools for better memory allocation tracking
+* [ARROW-564](https://issues.apache.org/jira/browse/ARROW-564) - [Python] Add methods to return vanilla NumPy arrays (plus boolean mask array if there are nulls)
+* [ARROW-665](https://issues.apache.org/jira/browse/ARROW-665) - C++: Move zeroing logic for (re)allocations to the Allocator
+* [ARROW-889](https://issues.apache.org/jira/browse/ARROW-889) - [C++] Implement arrow::PrettyPrint for ChunkedArray
+* [ARROW-902](https://issues.apache.org/jira/browse/ARROW-902) - [C++] Build C++ project including thirdparty dependencies from local tarballs
+* [ARROW-906](https://issues.apache.org/jira/browse/ARROW-906) - [C++] Serialize Field metadata to IPC metadata
+* [ARROW-1018](https://issues.apache.org/jira/browse/ARROW-1018) - [C++] Add option to create FileOutputStream, ReadableFile from OS file descriptor
+* [ARROW-1163](https://issues.apache.org/jira/browse/ARROW-1163) - [Plasma][Java] Java client for Plasma
+* [ARROW-1388](https://issues.apache.org/jira/browse/ARROW-1388) - [Python] Add Table.drop method for removing columns
+* [ARROW-1454](https://issues.apache.org/jira/browse/ARROW-1454) - [Python] More informative error message when attempting to write an unsupported Arrow type to Parquet format
+* [ARROW-1715](https://issues.apache.org/jira/browse/ARROW-1715) - [Python] Implement pickling for Column, ChunkedArray, RecordBatch, Table
+* [ARROW-1722](https://issues.apache.org/jira/browse/ARROW-1722) - [C++] Add linting script to look for C++/CLI issues
+* [ARROW-1731](https://issues.apache.org/jira/browse/ARROW-1731) - [Python] Provide for selecting a subset of columns to convert in RecordBatch/Table.from\_pandas
+* [ARROW-1744](https://issues.apache.org/jira/browse/ARROW-1744) - [Plasma] Provide TensorFlow operator to read tensors from plasma
+* [ARROW-1780](https://issues.apache.org/jira/browse/ARROW-1780) - [Java] JDBC Adapter for Apache Arrow
+* [ARROW-1858](https://issues.apache.org/jira/browse/ARROW-1858) - [Python] Add documentation about parquet.write\_to\_dataset and related methods
+* [ARROW-1868](https://issues.apache.org/jira/browse/ARROW-1868) - [Java] Change vector getMinorType to use MinorType instead of Types.MinorType
+* [ARROW-1886](https://issues.apache.org/jira/browse/ARROW-1886) - [Python] Add function to "flatten" structs within tables
+* [ARROW-1913](https://issues.apache.org/jira/browse/ARROW-1913) - [Java] Fix Javadoc generation bugs with JDK8
+* [ARROW-1928](https://issues.apache.org/jira/browse/ARROW-1928) - [C++] Add benchmarks comparing performance of internal::BitmapReader/Writer with naive approaches
+* [ARROW-1954](https://issues.apache.org/jira/browse/ARROW-1954) - [Python] Add metadata accessor to pyarrow.Field
+* [ARROW-1964](https://issues.apache.org/jira/browse/ARROW-1964) - [Python] Expose Builder classes
+* [ARROW-2014](https://issues.apache.org/jira/browse/ARROW-2014) - [Python] Document read\_pandas method in pyarrow.parquet
+* [ARROW-2055](https://issues.apache.org/jira/browse/ARROW-2055) - [Java] Upgrade to Java 8
+* [ARROW-2060](https://issues.apache.org/jira/browse/ARROW-2060) - [Python] Documentation for creating StructArray using from\_arrays or a sequence of dicts
+* [ARROW-2061](https://issues.apache.org/jira/browse/ARROW-2061) - [C++] Run ASAN builds in Travis CI
+* [ARROW-2074](https://issues.apache.org/jira/browse/ARROW-2074) - [Python] Allow type inference for struct arrays
+* [ARROW-2097](https://issues.apache.org/jira/browse/ARROW-2097) - [Python] Suppress valgrind stdout/stderr in Travis CI builds when there are no errors
+* [ARROW-2100](https://issues.apache.org/jira/browse/ARROW-2100) - [Python] Drop Python 3.4 support
+* [ARROW-2140](https://issues.apache.org/jira/browse/ARROW-2140) - [Python] Conversion from Numpy float16 array unimplemented
+* [ARROW-2141](https://issues.apache.org/jira/browse/ARROW-2141) - [Python] Conversion from Numpy object array to varsize binary unimplemented
+* [ARROW-2147](https://issues.apache.org/jira/browse/ARROW-2147) - [Python] Type inference doesn't work on lists of Numpy arrays
+* [ARROW-2207](https://issues.apache.org/jira/browse/ARROW-2207) - [GLib] Support decimal type
+* [ARROW-2222](https://issues.apache.org/jira/browse/ARROW-2222) - [C++] Add option to validate Flatbuffers messages
+* [ARROW-2224](https://issues.apache.org/jira/browse/ARROW-2224) - [C++] Get rid of boost regex usage
+* [ARROW-2241](https://issues.apache.org/jira/browse/ARROW-2241) - [Python] Simple script for running all current ASV benchmarks at a commit or tag
+* [ARROW-2264](https://issues.apache.org/jira/browse/ARROW-2264) - [Python] Efficiently serialize numpy arrays with dtype of unicode fixed length string
+* [ARROW-2267](https://issues.apache.org/jira/browse/ARROW-2267) - Rust bindings
+* [ARROW-2276](https://issues.apache.org/jira/browse/ARROW-2276) - [Python] Tensor could implement the buffer protocol
+* [ARROW-2281](https://issues.apache.org/jira/browse/ARROW-2281) - [Python] Expose MakeArray to construct arrays from buffers
+* [ARROW-2285](https://issues.apache.org/jira/browse/ARROW-2285) - [Python] Can't convert Numpy string arrays
+* [ARROW-2286](https://issues.apache.org/jira/browse/ARROW-2286) - [Python] Allow subscripting pyarrow.lib.StructValue
+* [ARROW-2287](https://issues.apache.org/jira/browse/ARROW-2287) - [Python] chunked array not iterable, not indexable
+* [ARROW-2299](https://issues.apache.org/jira/browse/ARROW-2299) - [Go] Go language implementation
+* [ARROW-2301](https://issues.apache.org/jira/browse/ARROW-2301) - [Python] Add source distribution publishing instructions to package / release management documentation
+* [ARROW-2302](https://issues.apache.org/jira/browse/ARROW-2302) - [GLib] Run autotools and meson Linux builds in same Travis CI build entry
+* [ARROW-2308](https://issues.apache.org/jira/browse/ARROW-2308) - Serialized tensor data should be 64-byte aligned.
+* [ARROW-2315](https://issues.apache.org/jira/browse/ARROW-2315) - [C++/Python] Add method to flatten a struct array
+* [ARROW-2319](https://issues.apache.org/jira/browse/ARROW-2319) - [C++] Add buffered output class implementing OutputStream interface
+* [ARROW-2322](https://issues.apache.org/jira/browse/ARROW-2322) - Document requirements to run dev/release/01-perform.sh
+* [ARROW-2325](https://issues.apache.org/jira/browse/ARROW-2325) - [Python] Update setup.py to use Markdown project description
+* [ARROW-2330](https://issues.apache.org/jira/browse/ARROW-2330) - [C++] Optimize delta buffer creation with partially finishable array builders
+* [ARROW-2332](https://issues.apache.org/jira/browse/ARROW-2332) - [Python] Provide API for reading multiple Feather files
+* [ARROW-2334](https://issues.apache.org/jira/browse/ARROW-2334) - [C++] Update boost to 1.66.0
+* [ARROW-2335](https://issues.apache.org/jira/browse/ARROW-2335) - [Go] Move Go README one directory higher
+* [ARROW-2340](https://issues.apache.org/jira/browse/ARROW-2340) - [Website] Add blog post about Go codebase donation
+* [ARROW-2341](https://issues.apache.org/jira/browse/ARROW-2341) - [Python] pa.union() mode argument unintuitive
+* [ARROW-2343](https://issues.apache.org/jira/browse/ARROW-2343) - [Java/Packaging] Run mvn clean in API doc builds
+* [ARROW-2344](https://issues.apache.org/jira/browse/ARROW-2344) - [Go] Run Go unit tests in Travis CI
+* [ARROW-2345](https://issues.apache.org/jira/browse/ARROW-2345) - [Documentation] Fix bundle exec and set sphinx nosidebar to True
+* [ARROW-2348](https://issues.apache.org/jira/browse/ARROW-2348) - [GLib] Remove Go example
+* [ARROW-2350](https://issues.apache.org/jira/browse/ARROW-2350) - Shrink size of spark\_integration Docker container
+* [ARROW-2353](https://issues.apache.org/jira/browse/ARROW-2353) - Test correctness of built wheel on AppVeyor
+* [ARROW-2361](https://issues.apache.org/jira/browse/ARROW-2361) - [Rust] Start native Rust Implementation
+* [ARROW-2364](https://issues.apache.org/jira/browse/ARROW-2364) - [Plasma] PlasmaClient::Get() could take vector of object ids
+* [ARROW-2376](https://issues.apache.org/jira/browse/ARROW-2376) - [Rust] Travis should run tests for Rust library
+* [ARROW-2378](https://issues.apache.org/jira/browse/ARROW-2378) - [Rust] Use rustfmt to format source code
+* [ARROW-2381](https://issues.apache.org/jira/browse/ARROW-2381) - [Rust] Buffer<T\> should have an Iterator
+* [ARROW-2384](https://issues.apache.org/jira/browse/ARROW-2384) - Rust: Use Traits rather than defining methods directly
+* [ARROW-2385](https://issues.apache.org/jira/browse/ARROW-2385) - [Rust] Implement to\_json() for Field and DataType
+* [ARROW-2388](https://issues.apache.org/jira/browse/ARROW-2388) - [C++] Arrow::StringBuilder::Append() uses null\_bytes not valid\_bytes
+* [ARROW-2389](https://issues.apache.org/jira/browse/ARROW-2389) - [C++] Add StatusCode::OverflowError
+* [ARROW-2390](https://issues.apache.org/jira/browse/ARROW-2390) - [C++/Python] CheckPyError() could inspect exception type
+* [ARROW-2394](https://issues.apache.org/jira/browse/ARROW-2394) - [Python] Correct flake8 errors in benchmarks
+* [ARROW-2395](https://issues.apache.org/jira/browse/ARROW-2395) - [Python] Correct flake8 errors outside of pyarrow/ directory
+* [ARROW-2396](https://issues.apache.org/jira/browse/ARROW-2396) - Unify Rust Errors
+* [ARROW-2397](https://issues.apache.org/jira/browse/ARROW-2397) - Document changes in Tensor encoding in IPC.md.
+* [ARROW-2398](https://issues.apache.org/jira/browse/ARROW-2398) - [Rust] Provide a zero-copy builder for type-safe Buffer<T\>
+* [ARROW-2400](https://issues.apache.org/jira/browse/ARROW-2400) - [C++] Status destructor is expensive
+* [ARROW-2401](https://issues.apache.org/jira/browse/ARROW-2401) - Support filters on Hive partitioned Parquet files
+* [ARROW-2402](https://issues.apache.org/jira/browse/ARROW-2402) - [C++] FixedSizeBinaryBuilder::Append lacks "const char\*" overload
+* [ARROW-2404](https://issues.apache.org/jira/browse/ARROW-2404) - Fix "declaration of 'type\_id' hides class member" warning in MSVC build
+* [ARROW-2407](https://issues.apache.org/jira/browse/ARROW-2407) - [GLib] Add garrow\_string\_array\_builder\_append\_values()
+* [ARROW-2408](https://issues.apache.org/jira/browse/ARROW-2408) - [Rust] It should be possible to get a &mut[T] from Builder<T\>
+* [ARROW-2411](https://issues.apache.org/jira/browse/ARROW-2411) - [C++] Add method to append batches of null-terminated strings to StringBuilder
+* [ARROW-2413](https://issues.apache.org/jira/browse/ARROW-2413) - [Rust] Remove useless use of \`format!\`
+* [ARROW-2414](https://issues.apache.org/jira/browse/ARROW-2414) - [Documentation] Fix miscellaneous documentation typos
+* [ARROW-2415](https://issues.apache.org/jira/browse/ARROW-2415) - [Rust] Fix using references in pattern matching
+* [ARROW-2416](https://issues.apache.org/jira/browse/ARROW-2416) - [C++] Support system libprotobuf
+* [ARROW-2417](https://issues.apache.org/jira/browse/ARROW-2417) - [Rust] Review APIs for safety
+* [ARROW-2422](https://issues.apache.org/jira/browse/ARROW-2422) - [Python] Support more filter operators on Hive partitioned Parquet files
+* [ARROW-2427](https://issues.apache.org/jira/browse/ARROW-2427) - [C++] ReadAt implementations suboptimal
+* [ARROW-2430](https://issues.apache.org/jira/browse/ARROW-2430) - MVP for branch based packaging automation
+* [ARROW-2433](https://issues.apache.org/jira/browse/ARROW-2433) - [Rust] Add Builder.push\_slice(&[T])
+* [ARROW-2434](https://issues.apache.org/jira/browse/ARROW-2434) - [Rust] Add windows support
+* [ARROW-2435](https://issues.apache.org/jira/browse/ARROW-2435) - [Rust] Add memory pool abstraction.
+* [ARROW-2436](https://issues.apache.org/jira/browse/ARROW-2436) - [Rust] Add windows CI
+* [ARROW-2439](https://issues.apache.org/jira/browse/ARROW-2439) - [Rust] Run license header checks also in Rust CI entry
+* [ARROW-2440](https://issues.apache.org/jira/browse/ARROW-2440) - [Rust] Implement ListBuilder<T\>
+* [ARROW-2442](https://issues.apache.org/jira/browse/ARROW-2442) - [C++] Disambiguate Builder::Append overloads
+* [ARROW-2445](https://issues.apache.org/jira/browse/ARROW-2445) - [Rust] Add documentation and make some fields private
+* [ARROW-2448](https://issues.apache.org/jira/browse/ARROW-2448) - Segfault when plasma client goes out of scope before buffer.
+* [ARROW-2451](https://issues.apache.org/jira/browse/ARROW-2451) - Handle more dtypes efficiently in custom numpy array serializer.
+* [ARROW-2453](https://issues.apache.org/jira/browse/ARROW-2453) - [Python] Improve Table column access
+* [ARROW-2458](https://issues.apache.org/jira/browse/ARROW-2458) - [Plasma] PlasmaClient uses global variable
+* [ARROW-2463](https://issues.apache.org/jira/browse/ARROW-2463) - [C++] Update flatbuffers to 1.9.0
+* [ARROW-2464](https://issues.apache.org/jira/browse/ARROW-2464) - [Python] Use a python\_version marker instead of a condition
+* [ARROW-2469](https://issues.apache.org/jira/browse/ARROW-2469) - Make out arguments last in ReadMessage API.
+* [ARROW-2470](https://issues.apache.org/jira/browse/ARROW-2470) - [C++] FileGetSize() should not seek
+* [ARROW-2472](https://issues.apache.org/jira/browse/ARROW-2472) - [Rust] The Schema and Fields types should not have public attributes
+* [ARROW-2477](https://issues.apache.org/jira/browse/ARROW-2477) - [Rust] Set up code coverage in CI
+* [ARROW-2478](https://issues.apache.org/jira/browse/ARROW-2478) - [C++] Introduce a checked\_cast function that performs a dynamic\_cast in debug mode
+* [ARROW-2479](https://issues.apache.org/jira/browse/ARROW-2479) - [C++] Have a global thread pool
+* [ARROW-2480](https://issues.apache.org/jira/browse/ARROW-2480) - [C++] Enable casting the value of a decimal to int32\_t or int64\_t
+* [ARROW-2481](https://issues.apache.org/jira/browse/ARROW-2481) - [Rust] Move calls to free() into memory.rs
+* [ARROW-2482](https://issues.apache.org/jira/browse/ARROW-2482) - [Rust] support nested types
+* [ARROW-2484](https://issues.apache.org/jira/browse/ARROW-2484) - [C++] Document ABI compliance checking
+* [ARROW-2485](https://issues.apache.org/jira/browse/ARROW-2485) - [C++] Output diff when run\_clang\_format.py reports a change
+* [ARROW-2486](https://issues.apache.org/jira/browse/ARROW-2486) - [C++/Python] Provide a Docker image that contains all dependencies for development
+* [ARROW-2488](https://issues.apache.org/jira/browse/ARROW-2488) - [C++] List Boost 1.67 as supported version
+* [ARROW-2493](https://issues.apache.org/jira/browse/ARROW-2493) - [Python] Add support for pickling to buffers and arrays (example after this list)
+* [ARROW-2494](https://issues.apache.org/jira/browse/ARROW-2494) - Return status codes from PlasmaClient::Seal
+* [ARROW-2498](https://issues.apache.org/jira/browse/ARROW-2498) - [Java] Upgrade to JDK 1.8
+* [ARROW-2499](https://issues.apache.org/jira/browse/ARROW-2499) - [C++] Add iterator facility for Python sequences
+* [ARROW-2505](https://issues.apache.org/jira/browse/ARROW-2505) - [C++] Disable MSVC warning C4800
+* [ARROW-2506](https://issues.apache.org/jira/browse/ARROW-2506) - [Plasma] Build error on macOS
+* [ARROW-2507](https://issues.apache.org/jira/browse/ARROW-2507) - [Rust] Don't take a reference when not needed
+* [ARROW-2508](https://issues.apache.org/jira/browse/ARROW-2508) - [Python] pytest API changes make tests fail
+* [ARROW-2513](https://issues.apache.org/jira/browse/ARROW-2513) - [Python] DictionaryType should give access to index type and dictionary array
+* [ARROW-2516](https://issues.apache.org/jira/browse/ARROW-2516) - AppVeyor Build Matrix should be specific to the changes made in a PR
+* [ARROW-2521](https://issues.apache.org/jira/browse/ARROW-2521) - [Rust] Refactor Rust API to use traits and generics
+* [ARROW-2522](https://issues.apache.org/jira/browse/ARROW-2522) - [C++] Version shared library files
+* [ARROW-2525](https://issues.apache.org/jira/browse/ARROW-2525) - [GLib] Add garrow\_struct\_array\_flatten()
+* [ARROW-2526](https://issues.apache.org/jira/browse/ARROW-2526) - [GLib] Update .gitignore
+* [ARROW-2527](https://issues.apache.org/jira/browse/ARROW-2527) - [GLib] Enable GPU document
+* [ARROW-2528](https://issues.apache.org/jira/browse/ARROW-2528) - [Rust] Add trait bounds for T in Buffer/List
+* [ARROW-2529](https://issues.apache.org/jira/browse/ARROW-2529) - [C++] Update mention of clang-format to 5.0 in the docs
+* [ARROW-2531](https://issues.apache.org/jira/browse/ARROW-2531) - [C++] Update clang bits to 6.0
+* [ARROW-2533](https://issues.apache.org/jira/browse/ARROW-2533) - [CI] Fast finish failing AppVeyor builds
+* [ARROW-2536](https://issues.apache.org/jira/browse/ARROW-2536) - [Rust] ListBuilder uses wrong initial size for offset builder
+* [ARROW-2537](https://issues.apache.org/jira/browse/ARROW-2537) - [Ruby] Import
+* [ARROW-2539](https://issues.apache.org/jira/browse/ARROW-2539) - [Plasma] Use unique\_ptr instead of raw pointer
+* [ARROW-2540](https://issues.apache.org/jira/browse/ARROW-2540) - [Plasma] add constructor/destructor to make sure dlfree is called automatically
+* [ARROW-2541](https://issues.apache.org/jira/browse/ARROW-2541) - [Plasma] Clean up macro usage
+* [ARROW-2543](https://issues.apache.org/jira/browse/ARROW-2543) - [Rust] CI should cache dependencies for faster builds
+* [ARROW-2544](https://issues.apache.org/jira/browse/ARROW-2544) - [CI] Run C++ tests with two jobs on Travis-CI
+* [ARROW-2547](https://issues.apache.org/jira/browse/ARROW-2547) - [Format] Fix off-by-one in List<List<byte\>\> example
+* [ARROW-2548](https://issues.apache.org/jira/browse/ARROW-2548) - [Format] Clarify \`List<Char\>\` Array example
+* [ARROW-2549](https://issues.apache.org/jira/browse/ARROW-2549) - [GLib] Apply arrow::StatusCodes changes to GArrowError
+* [ARROW-2550](https://issues.apache.org/jira/browse/ARROW-2550) - [C++] Add missing status codes into arrow::StatusCode::CodeAsString()
+* [ARROW-2551](https://issues.apache.org/jira/browse/ARROW-2551) - [Plasma] Improve notification logic
+* [ARROW-2552](https://issues.apache.org/jira/browse/ARROW-2552) - [Plasma] Unit tests are flaky
+* [ARROW-2553](https://issues.apache.org/jira/browse/ARROW-2553) - [Python] Set MACOSX\_DEPLOYMENT\_TARGET in wheel build
+* [ARROW-2558](https://issues.apache.org/jira/browse/ARROW-2558) - [Plasma] avoid walking through all the objects when a client disconnects
+* [ARROW-2562](https://issues.apache.org/jira/browse/ARROW-2562) - [C++] Upload coverage data to codecov.io
+* [ARROW-2563](https://issues.apache.org/jira/browse/ARROW-2563) - [Rust] Poor caching in Travis-CI
+* [ARROW-2566](https://issues.apache.org/jira/browse/ARROW-2566) - [CI] Add codecov.io badge to README
+* [ARROW-2567](https://issues.apache.org/jira/browse/ARROW-2567) - [C++/Python] Unit is ignored on comparison of TimestampArrays
+* [ARROW-2568](https://issues.apache.org/jira/browse/ARROW-2568) - [Python] Expose thread pool size setting to Python, and deprecate "nthreads"
+* [ARROW-2569](https://issues.apache.org/jira/browse/ARROW-2569) - [C++] Improve thread pool size heuristic
+* [ARROW-2574](https://issues.apache.org/jira/browse/ARROW-2574) - [CI] Collect and publish Python coverage
+* [ARROW-2576](https://issues.apache.org/jira/browse/ARROW-2576) - [GLib] Add abs functions for Decimal128.
+* [ARROW-2577](https://issues.apache.org/jira/browse/ARROW-2577) - [Plasma] Add ASV benchmarks
+* [ARROW-2580](https://issues.apache.org/jira/browse/ARROW-2580) - [GLib] Fix abs functions for Decimal128
+* [ARROW-2582](https://issues.apache.org/jira/browse/ARROW-2582) - [GLib] Add negate functions for Decimal128
+* [ARROW-2585](https://issues.apache.org/jira/browse/ARROW-2585) - [C++] Add Decimal128::FromBigEndian
+* [ARROW-2586](https://issues.apache.org/jira/browse/ARROW-2586) - [C++] Make child builders of ListBuilder and StructBuilder shared\_ptr's
+* [ARROW-2595](https://issues.apache.org/jira/browse/ARROW-2595) - [Plasma] operator[] creates entries in map
+* [ARROW-2596](https://issues.apache.org/jira/browse/ARROW-2596) - [GLib] Use the default value of GTK-Doc
+* [ARROW-2597](https://issues.apache.org/jira/browse/ARROW-2597) - [Plasma] remove UniqueIDHasher
+* [ARROW-2604](https://issues.apache.org/jira/browse/ARROW-2604) - [Java] Add method overload for VarCharVector.set(int,String)
+* [ARROW-2608](https://issues.apache.org/jira/browse/ARROW-2608) - [Java/Python] Add pyarrow.{Array,Field}.from\_jvm / jvm\_buffer
+* [ARROW-2611](https://issues.apache.org/jira/browse/ARROW-2611) - [Python] Python 2 integer serialization
+* [ARROW-2612](https://issues.apache.org/jira/browse/ARROW-2612) - [Plasma] Fix deprecated PLASMA\_DEFAULT\_RELEASE\_DELAY
+* [ARROW-2613](https://issues.apache.org/jira/browse/ARROW-2613) - [Docs] Update the gen\_apidocs docker script
+* [ARROW-2614](https://issues.apache.org/jira/browse/ARROW-2614) - [CI] Remove 'group: deprecated' in Travis
+* [ARROW-2626](https://issues.apache.org/jira/browse/ARROW-2626) - [Python] pandas ArrowInvalid message should include failing column name
+* [ARROW-2634](https://issues.apache.org/jira/browse/ARROW-2634) - [Go] Add LICENSE additions for Go subproject
+* [ARROW-2635](https://issues.apache.org/jira/browse/ARROW-2635) - [Ruby] LICENSE.txt isn't suitable
+* [ARROW-2636](https://issues.apache.org/jira/browse/ARROW-2636) - [Ruby] "Unofficial" package note is missing
+* [ARROW-2638](https://issues.apache.org/jira/browse/ARROW-2638) - [Python] Prevent calling extension class constructors directly
+* [ARROW-2639](https://issues.apache.org/jira/browse/ARROW-2639) - [Python] Remove unnecessary \_check\_nullptr methods
+* [ARROW-2641](https://issues.apache.org/jira/browse/ARROW-2641) - [C++] Investigate spurious memset() calls
+* [ARROW-2645](https://issues.apache.org/jira/browse/ARROW-2645) - [Java] ArrowStreamWriter accumulates DictionaryBatch ArrowBlocks
+* [ARROW-2649](https://issues.apache.org/jira/browse/ARROW-2649) - [C++] Add std::generate()-like function for faster bitmap writing
+* [ARROW-2656](https://issues.apache.org/jira/browse/ARROW-2656) - [Python] Improve ParquetManifest creation time
+* [ARROW-2660](https://issues.apache.org/jira/browse/ARROW-2660) - [Python] Experiment with zero-copy pickling
+* [ARROW-2661](https://issues.apache.org/jira/browse/ARROW-2661) - [Python/C++] Allow passing HDFS Config values via map/dict instead of needing an hdfs-site.xml file
+* [ARROW-2662](https://issues.apache.org/jira/browse/ARROW-2662) - [Python] Add to\_pandas / to\_numpy to ChunkedArray
+* [ARROW-2663](https://issues.apache.org/jira/browse/ARROW-2663) - [Python] Make dictionary\_encode and unique accessible on Column / ChunkedArray
+* [ARROW-2664](https://issues.apache.org/jira/browse/ARROW-2664) - [Python] Implement \_\_getitem\_\_ / slicing on Buffer (example after this list)
+* [ARROW-2666](https://issues.apache.org/jira/browse/ARROW-2666) - [Python] numpy.asarray should trigger to\_pandas on Array/ChunkedArray
+* [ARROW-2672](https://issues.apache.org/jira/browse/ARROW-2672) - [Python] Build ORC extension in manylinux1 wheels
+* [ARROW-2674](https://issues.apache.org/jira/browse/ARROW-2674) - [Packaging] Start building nightlies
+* [ARROW-2676](https://issues.apache.org/jira/browse/ARROW-2676) - [Packaging] Deploy build artifacts to github releases
+* [ARROW-2677](https://issues.apache.org/jira/browse/ARROW-2677) - [Python] Expose Parquet ZSTD compression
+* [ARROW-2678](https://issues.apache.org/jira/browse/ARROW-2678) - [GLib] Add extra information to common build problems on macOS
+* [ARROW-2680](https://issues.apache.org/jira/browse/ARROW-2680) - [Python] Add documentation about type inference in Table.from\_pandas
+* [ARROW-2682](https://issues.apache.org/jira/browse/ARROW-2682) - [CI] Notify in Slack about broken builds
+* [ARROW-2689](https://issues.apache.org/jira/browse/ARROW-2689) - [Python] Remove references to timestamps\_to\_ms argument from documentation
+* [ARROW-2692](https://issues.apache.org/jira/browse/ARROW-2692) - [Python] Add test for writing dictionary encoded columns to chunked Parquet files
+* [ARROW-2695](https://issues.apache.org/jira/browse/ARROW-2695) - [Python] Prevent calling scalar constructors directly
+* [ARROW-2696](https://issues.apache.org/jira/browse/ARROW-2696) - [JAVA] enhance AllocationListener with an onFailedAllocation() call
+* [ARROW-2699](https://issues.apache.org/jira/browse/ARROW-2699) - [C++/Python] Add Table method that replaces a column with a new supplied column
+* [ARROW-2700](https://issues.apache.org/jira/browse/ARROW-2700) - [Python] Add simple examples to Array.cast docstring
+* [ARROW-2701](https://issues.apache.org/jira/browse/ARROW-2701) - [C++] Make MemoryMappedFile resizable
+* [ARROW-2704](https://issues.apache.org/jira/browse/ARROW-2704) - [Java] IPC stream handling should be more friendly to low level processing
+* [ARROW-2713](https://issues.apache.org/jira/browse/ARROW-2713) - [Packaging] Fix linux package builds
+* [ARROW-2717](https://issues.apache.org/jira/browse/ARROW-2717) - [Packaging] Postfix conda artifacts with target arch
+* [ARROW-2718](https://issues.apache.org/jira/browse/ARROW-2718) - [Packaging] GPG sign downloaded artifacts
+* [ARROW-2724](https://issues.apache.org/jira/browse/ARROW-2724) - [Packaging] Determine whether all the expected artifacts are uploaded
+* [ARROW-2725](https://issues.apache.org/jira/browse/ARROW-2725) - [JAVA] make Accountant.AllocationOutcome publicly visible
+* [ARROW-2729](https://issues.apache.org/jira/browse/ARROW-2729) - [GLib] Add decimal128 array builder
+* [ARROW-2731](https://issues.apache.org/jira/browse/ARROW-2731) - Allow usage of external ORC library
+* [ARROW-2732](https://issues.apache.org/jira/browse/ARROW-2732) - Update brew packages for macOS
+* [ARROW-2733](https://issues.apache.org/jira/browse/ARROW-2733) - [GLib] Cast garrow\_decimal128 to gint64
+* [ARROW-2738](https://issues.apache.org/jira/browse/ARROW-2738) - [GLib] Use Brewfile on installation process
+* [ARROW-2739](https://issues.apache.org/jira/browse/ARROW-2739) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE for GArrowDecimalDataType and GArrowDecimal128ArrayBuilder
+* [ARROW-2740](https://issues.apache.org/jira/browse/ARROW-2740) - [Python] Add address property to Buffer
+* [ARROW-2742](https://issues.apache.org/jira/browse/ARROW-2742) - [Python] Allow Table.from\_batches to use Iterator of ArrowRecordBatches
+* [ARROW-2748](https://issues.apache.org/jira/browse/ARROW-2748) - [GLib] Add garrow\_decimal\_data\_type\_get\_scale() (and \_precision())
+* [ARROW-2749](https://issues.apache.org/jira/browse/ARROW-2749) - [GLib] Rename \*garrow\_decimal128\_array\_get\_value to \*garrow\_decimal128\_array\_format\_value
+* [ARROW-2751](https://issues.apache.org/jira/browse/ARROW-2751) - [GLib] Add garrow\_table\_replace\_column()
+* [ARROW-2752](https://issues.apache.org/jira/browse/ARROW-2752) - [GLib] Document garrow\_decimal\_data\_type\_new()
+* [ARROW-2753](https://issues.apache.org/jira/browse/ARROW-2753) - [GLib] Add garrow\_schema\_\*\_field()
+* [ARROW-2755](https://issues.apache.org/jira/browse/ARROW-2755) - [Python] Allow using Ninja to build extension
+* [ARROW-2756](https://issues.apache.org/jira/browse/ARROW-2756) - [Python] Remove redundant imports and minor fixes in parquet tests
+* [ARROW-2758](https://issues.apache.org/jira/browse/ARROW-2758) - [Plasma] Use Scope enum in Plasma
+* [ARROW-2760](https://issues.apache.org/jira/browse/ARROW-2760) - [Python] Remove legacy property definition syntax from parquet module and test them
+* [ARROW-2761](https://issues.apache.org/jira/browse/ARROW-2761) - Support set filter operators on Hive partitioned Parquet files
+* [ARROW-2763](https://issues.apache.org/jira/browse/ARROW-2763) - [Python] Make parquet \_metadata file accessible from ParquetDataset
+* [ARROW-2780](https://issues.apache.org/jira/browse/ARROW-2780) - [Go] Run code coverage analysis
+* [ARROW-2784](https://issues.apache.org/jira/browse/ARROW-2784) - [C++] MemoryMappedFile::WriteAt: allow writing past the end
+* [ARROW-2790](https://issues.apache.org/jira/browse/ARROW-2790) - [C++] Buffers contain uninitialized memory
+* [ARROW-2791](https://issues.apache.org/jira/browse/ARROW-2791) - [Packaging] Build Ubuntu 18.04 packages
+* [ARROW-2792](https://issues.apache.org/jira/browse/ARROW-2792) - [Packaging] Consider uploading tarballs to avoid naming conflicts
+* [ARROW-2794](https://issues.apache.org/jira/browse/ARROW-2794) - [Plasma] Add Delete method for multiple objects
+* [ARROW-2798](https://issues.apache.org/jira/browse/ARROW-2798) - [Plasma] Use hashing function that takes into account all UniqueID bytes
+* [ARROW-2802](https://issues.apache.org/jira/browse/ARROW-2802) - [Docs] Move release management guide to project wiki
+* [ARROW-2804](https://issues.apache.org/jira/browse/ARROW-2804) - [Website] Link to Developer wiki (Confluence) from front page
+* [ARROW-2805](https://issues.apache.org/jira/browse/ARROW-2805) - [Python] TensorFlow import workaround not working with tensorflow-gpu if CUDA is not installed
+* [ARROW-2809](https://issues.apache.org/jira/browse/ARROW-2809) - [C++] Decrease verbosity of lint checks in Travis CI
+* [ARROW-2811](https://issues.apache.org/jira/browse/ARROW-2811) - [Python] Test serialization for determinism
+* [ARROW-2815](https://issues.apache.org/jira/browse/ARROW-2815) - [CI] Suppress DEBUG logging when building Java library in C++ CI entries
+* [ARROW-2816](https://issues.apache.org/jira/browse/ARROW-2816) - [Python] Add \_\_iter\_\_ method to NativeFile
+* [ARROW-2821](https://issues.apache.org/jira/browse/ARROW-2821) - [C++] Only zero memory in BooleanBuilder in one place
+* [ARROW-2822](https://issues.apache.org/jira/browse/ARROW-2822) - [C++] Zero padding bytes in PoolBuffer::Resize
+* [ARROW-2824](https://issues.apache.org/jira/browse/ARROW-2824) - [GLib] Add garrow\_decimal128\_array\_get\_value()
+* [ARROW-2825](https://issues.apache.org/jira/browse/ARROW-2825) - [C++] Need AllocateBuffer / AllocateResizableBuffer variant with default memory pool
+* [ARROW-2826](https://issues.apache.org/jira/browse/ARROW-2826) - [C++] Clarification needed between ArrayBuilder::Init(), Resize() and Reserve()
+* [ARROW-2827](https://issues.apache.org/jira/browse/ARROW-2827) - [C++] LZ4 and Zstd builds may fail in a parallel build
+* [ARROW-2829](https://issues.apache.org/jira/browse/ARROW-2829) - [GLib] Add GArrowORCFileReader
+* [ARROW-2830](https://issues.apache.org/jira/browse/ARROW-2830) - [Packaging] Enable parallel build for deb package build again
+* [ARROW-2832](https://issues.apache.org/jira/browse/ARROW-2832) - [Python] Pretty-print schema metadata in Schema.\_\_repr\_\_
+* [ARROW-2833](https://issues.apache.org/jira/browse/ARROW-2833) - [Python] Column.\_\_repr\_\_ will lock up Jupyter with large datasets
+* [ARROW-2834](https://issues.apache.org/jira/browse/ARROW-2834) - [GLib] Remove "enable\_" prefix from Meson options
+* [ARROW-2836](https://issues.apache.org/jira/browse/ARROW-2836) - [Packaging] Expand build matrices to multiple tasks
+* [ARROW-2837](https://issues.apache.org/jira/browse/ARROW-2837) - [C++] ArrayBuilder::null\_bitmap returns PoolBuffer
+* [ARROW-2838](https://issues.apache.org/jira/browse/ARROW-2838) - [Python] Speed up null testing with Pandas semantics
+* [ARROW-2844](https://issues.apache.org/jira/browse/ARROW-2844) - [Packaging] Test OSX wheels after build
+* [ARROW-2845](https://issues.apache.org/jira/browse/ARROW-2845) - [Packaging] Upload additional debian artifacts
+* [ARROW-2846](https://issues.apache.org/jira/browse/ARROW-2846) - [Packaging] Update nightly build in crossbow as well as the sample configuration
+* [ARROW-2847](https://issues.apache.org/jira/browse/ARROW-2847) - [Packaging] Fix artifact name matching for conda forge packages
+* [ARROW-2848](https://issues.apache.org/jira/browse/ARROW-2848) - [Packaging] lib\*.deb package name doesn't match so version
+* [ARROW-2849](https://issues.apache.org/jira/browse/ARROW-2849) - [Ruby] Arrow::Table\#load supports ORC
+* [ARROW-2855](https://issues.apache.org/jira/browse/ARROW-2855) - [C++] Blog post that outlines the benefits of using jemalloc
+* [ARROW-2859](https://issues.apache.org/jira/browse/ARROW-2859) - [Python] Handle objects exporting the buffer protocol in open\_stream, open\_file, and RecordBatch\*Reader APIs
+* [ARROW-2861](https://issues.apache.org/jira/browse/ARROW-2861) - [Python] Add extra tips about using Parquet to store index-less pandas data
+* [ARROW-2864](https://issues.apache.org/jira/browse/ARROW-2864) - [Plasma] Add deletion cache to delete objects later
+* [ARROW-2868](https://issues.apache.org/jira/browse/ARROW-2868) - [Packaging] Fix centos-7 build
+* [ARROW-2869](https://issues.apache.org/jira/browse/ARROW-2869) - [Python] Add documentation for Array.to\_numpy
+* [ARROW-2874](https://issues.apache.org/jira/browse/ARROW-2874) - [Packaging] Pass job prefix when putting on Queue
+* [ARROW-2875](https://issues.apache.org/jira/browse/ARROW-2875) - [Packaging] Don't attempt to download arrow archive in linux builds
+* [ARROW-2881](https://issues.apache.org/jira/browse/ARROW-2881) - [Website] Add Community tab to website
+* [ARROW-2884](https://issues.apache.org/jira/browse/ARROW-2884) - [Packaging] Options to build packages from apache source archive
+* [ARROW-2886](https://issues.apache.org/jira/browse/ARROW-2886) - [Release] An unused variable exists
+* [ARROW-2890](https://issues.apache.org/jira/browse/ARROW-2890) - [Plasma] Make Python PlasmaClient.release private
+* [ARROW-2893](https://issues.apache.org/jira/browse/ARROW-2893) - [C++] Remove PoolBuffer class from public API and hide implementation details behind factory functions
+* [ARROW-2897](https://issues.apache.org/jira/browse/ARROW-2897) - Organize supported Ubuntu versions
+* [ARROW-2898](https://issues.apache.org/jira/browse/ARROW-2898) - [Packaging] Setuptools\_scm just shipped a new version which fails to parse \`apache-arrow-<version\>\` tag
+* [ARROW-2906](https://issues.apache.org/jira/browse/ARROW-2906) - [Website] Remove the link to slack channel
+* [ARROW-2907](https://issues.apache.org/jira/browse/ARROW-2907) - [GitHub] Improve "How to contribute patches"
+* [ARROW-2908](https://issues.apache.org/jira/browse/ARROW-2908) - [Rust] Update version to 0.10.0
+* [ARROW-2914](https://issues.apache.org/jira/browse/ARROW-2914) - [Integration] Add WindowPandasUDFTests to Spark Integration
+* [ARROW-2915](https://issues.apache.org/jira/browse/ARROW-2915) - [Packaging] Remove artifact from ubuntu-trusty build
+* [ARROW-2918](https://issues.apache.org/jira/browse/ARROW-2918) - [C++] Improve formatting of Struct pretty prints
+* [ARROW-2921](https://issues.apache.org/jira/browse/ARROW-2921) - [Release] Update .deb/.rpm changelogs in preparation
+* [ARROW-2922](https://issues.apache.org/jira/browse/ARROW-2922) - [Release] Make python command name customizable
+* [ARROW-2923](https://issues.apache.org/jira/browse/ARROW-2923) - [Doc] Add instructions for running Spark integration tests
+* [ARROW-2924](https://issues.apache.org/jira/browse/ARROW-2924) - [Java] mvn release fails when an older maven javadoc plugin is installed
+* [ARROW-2927](https://issues.apache.org/jira/browse/ARROW-2927) - [Packaging] AppVeyor wheel task is failing on initial checkout
+* [ARROW-2928](https://issues.apache.org/jira/browse/ARROW-2928) - [Packaging] AppVeyor crossbow conda builds are picking up boost 1.63.0 instead of the installed version
+* [ARROW-2929](https://issues.apache.org/jira/browse/ARROW-2929) - [C++] ARROW-2826 Breaks parquet-cpp 1.4.0 builds
+* [ARROW-2934](https://issues.apache.org/jira/browse/ARROW-2934) - [Packaging] Add checksums creation to sign subcommand
+* [ARROW-2935](https://issues.apache.org/jira/browse/ARROW-2935) - [Packaging] Add verify\_binary\_artifacts function to verify-release-candidate.sh
+* [ARROW-2937](https://issues.apache.org/jira/browse/ARROW-2937) - [Java] Follow-up changes to ARROW-2704
+* [ARROW-2943](https://issues.apache.org/jira/browse/ARROW-2943) - [C++] Implement BufferedOutputStream::Flush
+* [ARROW-2944](https://issues.apache.org/jira/browse/ARROW-2944) - [Format] Arrow columnar format docs mention VectorLayout, which no longer exists
+* [ARROW-2946](https://issues.apache.org/jira/browse/ARROW-2946) - [Packaging] Stop using PWD in debian/rules
+* [ARROW-2947](https://issues.apache.org/jira/browse/ARROW-2947) - [Packaging] Remove Ubuntu Artful
+* [ARROW-2949](https://issues.apache.org/jira/browse/ARROW-2949) - [CI] repo.continuum.io can be flaky in builds
+* [ARROW-2951](https://issues.apache.org/jira/browse/ARROW-2951) - [CI] Changes in format/ should cause Appveyor builds to run
+* [ARROW-2953](https://issues.apache.org/jira/browse/ARROW-2953) - [Plasma] Store memory usage
+* [ARROW-2954](https://issues.apache.org/jira/browse/ARROW-2954) - [Plasma] Store object\_id only once in object table
+* [ARROW-2962](https://issues.apache.org/jira/browse/ARROW-2962) - [Packaging] Bintray descriptor files are no longer needed
+* [ARROW-2977](https://issues.apache.org/jira/browse/ARROW-2977) - [Packaging] Release verification script should check rust too
+* [ARROW-2985](https://issues.apache.org/jira/browse/ARROW-2985) - [Ruby] Run unit tests in verify-release-candidate.sh
+* [ARROW-2988](https://issues.apache.org/jira/browse/ARROW-2988) - [Release] More automated release verification on Windows
+* [ARROW-2990](https://issues.apache.org/jira/browse/ARROW-2990) - [GLib] Fails to build with rpath-ed Arrow C++ on macOS
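+
+As referenced in the ARROW-2493 and ARROW-2664 entries above, a short sketch (not the project's own test code) of the Python-facing behavior those improvements enable, assuming a pyarrow version that ships them:
+
+```python
+import pickle
+
+import pyarrow as pa
+
+# ARROW-2664: Buffer implements __getitem__ / slicing.
+buf = pa.py_buffer(b"arrow-data")
+assert buf[0:5].to_pybytes() == b"arrow"
+
+# ARROW-2493: buffers and arrays round-trip through pickle.
+arr = pa.array([1, 2, None])
+assert pickle.loads(pickle.dumps(arr)).equals(arr)
+```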
+
+
+
+# Apache Arrow 0.9.0 (2018-03-19)
+
+## New Features and Improvements
+
+* [ARROW-232](https://issues.apache.org/jira/browse/ARROW-232) - C++/Parquet: Support writing chunked arrays as part of a table
+* [ARROW-633](https://issues.apache.org/jira/browse/ARROW-633) - [Java] Add support for FixedSizeBinary type
+* [ARROW-634](https://issues.apache.org/jira/browse/ARROW-634) - Add integration tests for FixedSizeBinary
+* [ARROW-760](https://issues.apache.org/jira/browse/ARROW-760) - [Python] document differences w.r.t. fastparquet
+* [ARROW-764](https://issues.apache.org/jira/browse/ARROW-764) - [C++] Improve performance of CopyBitmap, add benchmarks
+* [ARROW-969](https://issues.apache.org/jira/browse/ARROW-969) - [C++/Python] Add add/remove field functions for RecordBatch
+* [ARROW-1021](https://issues.apache.org/jira/browse/ARROW-1021) - [Python] Add documentation about using pyarrow from other Cython and C++ projects
+* [ARROW-1035](https://issues.apache.org/jira/browse/ARROW-1035) - [Python] Add ASV benchmarks for streaming columnar deserialization
+* [ARROW-1394](https://issues.apache.org/jira/browse/ARROW-1394) - [Plasma] Add optional extension for allocating memory on GPUs
+* [ARROW-1463](https://issues.apache.org/jira/browse/ARROW-1463) - [JAVA] Restructure ValueVector hierarchy to minimize compile-time generated code
+* [ARROW-1579](https://issues.apache.org/jira/browse/ARROW-1579) - [Java] Add dockerized test setup to validate Spark integration
+* [ARROW-1580](https://issues.apache.org/jira/browse/ARROW-1580) - [Python] Instructions for setting up nightly builds on Linux
+* [ARROW-1623](https://issues.apache.org/jira/browse/ARROW-1623) - [C++] Add convenience method to construct Buffer from a string that owns its memory
+* [ARROW-1632](https://issues.apache.org/jira/browse/ARROW-1632) - [Python] Permit categorical conversions in Table.to\_pandas on a per-column basis
+* [ARROW-1643](https://issues.apache.org/jira/browse/ARROW-1643) - [Python] Accept hdfs:// prefixes in parquet.read\_table and attempt to connect to HDFS
+* [ARROW-1705](https://issues.apache.org/jira/browse/ARROW-1705) - [Python] Create StructArray from sequence of dicts given a known data type
+* [ARROW-1706](https://issues.apache.org/jira/browse/ARROW-1706) - [Python] StructArray.from\_arrays should handle sequences that are coercible to arrays
+* [ARROW-1712](https://issues.apache.org/jira/browse/ARROW-1712) - [C++] Add method to BinaryBuilder to reserve space for value data
+* [ARROW-1757](https://issues.apache.org/jira/browse/ARROW-1757) - [C++] Add DictionaryArray::FromArrays alternate ctor that can check or sanitize "untrusted" indices
+* [ARROW-1815](https://issues.apache.org/jira/browse/ARROW-1815) - [Java] Rename MapVector to StructVector
+* [ARROW-1832](https://issues.apache.org/jira/browse/ARROW-1832) - [JS] Implement JSON reader for integration tests
+* [ARROW-1835](https://issues.apache.org/jira/browse/ARROW-1835) - [C++] Create Arrow schema from std::tuple types
+* [ARROW-1861](https://issues.apache.org/jira/browse/ARROW-1861) - [Python] Fix up ASV setup, add developer instructions for writing new benchmarks and running benchmark suite locally
+* [ARROW-1872](https://issues.apache.org/jira/browse/ARROW-1872) - [Website] Populate hard-coded fields for current release from a YAML file
+* [ARROW-1899](https://issues.apache.org/jira/browse/ARROW-1899) - [Python] Refactor handling of null sentinels in python/numpy\_to\_arrow.cc
+* [ARROW-1920](https://issues.apache.org/jira/browse/ARROW-1920) - Add support for reading ORC files
+* [ARROW-1926](https://issues.apache.org/jira/browse/ARROW-1926) - [GLib] Add garrow\_timestamp\_data\_type\_get\_unit()
+* [ARROW-1927](https://issues.apache.org/jira/browse/ARROW-1927) - [Plasma] Implement delete function
+* [ARROW-1929](https://issues.apache.org/jira/browse/ARROW-1929) - [C++] Move various Arrow testing utility code from Parquet to Arrow codebase
+* [ARROW-1930](https://issues.apache.org/jira/browse/ARROW-1930) - [C++] Implement Slice for ChunkedArray and Column
+* [ARROW-1931](https://issues.apache.org/jira/browse/ARROW-1931) - [C++] w4996 warning due to std::tr1 failing builds on Visual Studio 2017
+* [ARROW-1937](https://issues.apache.org/jira/browse/ARROW-1937) - [Python] Add documentation for different forms of constructing nested arrays from Python data structures
+* [ARROW-1942](https://issues.apache.org/jira/browse/ARROW-1942) - [C++] Hash table specializations for small integers
+* [ARROW-1947](https://issues.apache.org/jira/browse/ARROW-1947) - [Plasma] Change Client Create and Get to use Buffers
+* [ARROW-1951](https://issues.apache.org/jira/browse/ARROW-1951) - Add memcopy\_threads to serialization context
+* [ARROW-1962](https://issues.apache.org/jira/browse/ARROW-1962) - [Java] Add reset() to ValueVector interface
+* [ARROW-1965](https://issues.apache.org/jira/browse/ARROW-1965) - [GLib] Add garrow\_array\_builder\_get\_value\_data\_type() and garrow\_array\_builder\_get\_value\_type()
+* [ARROW-1969](https://issues.apache.org/jira/browse/ARROW-1969) - [C++] Do not build ORC adapter by default
+* [ARROW-1970](https://issues.apache.org/jira/browse/ARROW-1970) - [GLib] Add garrow\_chunked\_array\_get\_value\_data\_type() and garrow\_chunked\_array\_get\_value\_type()
+* [ARROW-1977](https://issues.apache.org/jira/browse/ARROW-1977) - [C++] Update windows dev docs
+* [ARROW-1978](https://issues.apache.org/jira/browse/ARROW-1978) - [Website] Add more visible link to "Powered By" page to front page, simplify Powered By
+* [ARROW-2004](https://issues.apache.org/jira/browse/ARROW-2004) - [C++] Add shrink\_to\_fit option in BufferBuilder::Resize
+* [ARROW-2007](https://issues.apache.org/jira/browse/ARROW-2007) - [Python] Sequence converter for float32 not implemented
+* [ARROW-2011](https://issues.apache.org/jira/browse/ARROW-2011) - Allow setting the pickler to use in pyarrow serialization.
+* [ARROW-2012](https://issues.apache.org/jira/browse/ARROW-2012) - [GLib] Support "make distclean"
+* [ARROW-2018](https://issues.apache.org/jira/browse/ARROW-2018) - [C++] Build instructions for macOS and Homebrew are incomplete
+* [ARROW-2019](https://issues.apache.org/jira/browse/ARROW-2019) - Control the memory allocated for inner vector in LIST
+* [ARROW-2024](https://issues.apache.org/jira/browse/ARROW-2024) - [Python] Remove global SerializationContext variables
+* [ARROW-2028](https://issues.apache.org/jira/browse/ARROW-2028) - [Python] extra\_cmake\_args needs to be passed through shlex.split
+* [ARROW-2031](https://issues.apache.org/jira/browse/ARROW-2031) - HadoopFileSystem isn't pickleable
+* [ARROW-2035](https://issues.apache.org/jira/browse/ARROW-2035) - [C++] Update vendored cpplint.py to a Py3-compatible one
+* [ARROW-2036](https://issues.apache.org/jira/browse/ARROW-2036) - NativeFile should support standard IOBase methods
+* [ARROW-2042](https://issues.apache.org/jira/browse/ARROW-2042) - [Plasma] Revert API change of plasma::Create to output a MutableBuffer
+* [ARROW-2043](https://issues.apache.org/jira/browse/ARROW-2043) - [C++] Change description from OS X to macOS
+* [ARROW-2046](https://issues.apache.org/jira/browse/ARROW-2046) - [Python] Add support for PEP519 - pathlib and similar objects
+* [ARROW-2048](https://issues.apache.org/jira/browse/ARROW-2048) - [Python/C++] Update Thrift pin to 0.11
+* [ARROW-2050](https://issues.apache.org/jira/browse/ARROW-2050) - Support \`setup.py pytest\` to automatically fetch the test dependencies
+* [ARROW-2052](https://issues.apache.org/jira/browse/ARROW-2052) - Unify OwnedRef and ScopedRef
+* [ARROW-2053](https://issues.apache.org/jira/browse/ARROW-2053) - [C++] Build instruction is incomplete
+* [ARROW-2054](https://issues.apache.org/jira/browse/ARROW-2054) - Compilation warnings
+* [ARROW-2064](https://issues.apache.org/jira/browse/ARROW-2064) - [GLib] Add common build problems link to the install section
+* [ARROW-2065](https://issues.apache.org/jira/browse/ARROW-2065) - Fix bug in SerializationContext.clone().
+* [ARROW-2066](https://issues.apache.org/jira/browse/ARROW-2066) - [Python] Document reading Parquet files from Azure Blob Store
+* [ARROW-2068](https://issues.apache.org/jira/browse/ARROW-2068) - [Python] Expose Array's buffers to Python users
+* [ARROW-2069](https://issues.apache.org/jira/browse/ARROW-2069) - [Python] Document that Plasma is not (yet) supported on Windows
+* [ARROW-2071](https://issues.apache.org/jira/browse/ARROW-2071) - [Python] Reduce runtime of builds in Travis CI
+* [ARROW-2073](https://issues.apache.org/jira/browse/ARROW-2073) - [Python] Create StructArray from sequence of tuples given a known data type
+* [ARROW-2076](https://issues.apache.org/jira/browse/ARROW-2076) - [Python] Display slowest test durations
+* [ARROW-2083](https://issues.apache.org/jira/browse/ARROW-2083) - Support skipping builds
+* [ARROW-2084](https://issues.apache.org/jira/browse/ARROW-2084) - [C++] Support newer Brotli static library names
+* [ARROW-2086](https://issues.apache.org/jira/browse/ARROW-2086) - [Python] Shrink size of arrow\_manylinux1\_x86\_64\_base docker image
+* [ARROW-2087](https://issues.apache.org/jira/browse/ARROW-2087) - [Python] Binaries of 3rdparty are not stripped in manylinux1 base image
+* [ARROW-2088](https://issues.apache.org/jira/browse/ARROW-2088) - [GLib] Add GArrowNumericArray
+* [ARROW-2089](https://issues.apache.org/jira/browse/ARROW-2089) - [GLib] Rename to GARROW\_TYPE\_BOOLEAN for consistency
+* [ARROW-2090](https://issues.apache.org/jira/browse/ARROW-2090) - [Python] Add context manager methods to ParquetWriter
+* [ARROW-2093](https://issues.apache.org/jira/browse/ARROW-2093) - [Python] Possibly do not test pytorch serialization in Travis CI
+* [ARROW-2094](https://issues.apache.org/jira/browse/ARROW-2094) - [Python] Use toolchain libraries and PROTOBUF\_HOME for protocol buffers
+* [ARROW-2095](https://issues.apache.org/jira/browse/ARROW-2095) - [C++] Suppress ORC EP build logging by default
+* [ARROW-2096](https://issues.apache.org/jira/browse/ARROW-2096) - [C++] Turn off Boost\_DEBUG to trim build output
+* [ARROW-2099](https://issues.apache.org/jira/browse/ARROW-2099) - [Python] Support DictionaryArray::FromArrays in Python bindings
+* [ARROW-2107](https://issues.apache.org/jira/browse/ARROW-2107) - [GLib] Follow arrow::gpu::CudaIpcMemHandle API change
+* [ARROW-2108](https://issues.apache.org/jira/browse/ARROW-2108) - [Python] Update instructions for ASV
+* [ARROW-2110](https://issues.apache.org/jira/browse/ARROW-2110) - [Python] Only require pytest-runner on test commands
+* [ARROW-2111](https://issues.apache.org/jira/browse/ARROW-2111) - [C++] Linting could be faster
+* [ARROW-2114](https://issues.apache.org/jira/browse/ARROW-2114) - [Python] Pull latest docker manylinux1 image
+* [ARROW-2117](https://issues.apache.org/jira/browse/ARROW-2117) - [C++] Pin clang to version 5.0
+* [ARROW-2118](https://issues.apache.org/jira/browse/ARROW-2118) - [Python] Improve error message when calling parquet.read\_table on an empty file
+* [ARROW-2120](https://issues.apache.org/jira/browse/ARROW-2120) - Add possibility to use empty \_MSVC\_STATIC\_LIB\_SUFFIX for Thirdparties
+* [ARROW-2121](https://issues.apache.org/jira/browse/ARROW-2121) - [Python] Consider special casing object arrays in pandas serializers.
+* [ARROW-2123](https://issues.apache.org/jira/browse/ARROW-2123) - [JS] Upgrade to TS 2.7.1
+* [ARROW-2132](https://issues.apache.org/jira/browse/ARROW-2132) - [Doc] Add links / mentions of Plasma store to main README
+* [ARROW-2134](https://issues.apache.org/jira/browse/ARROW-2134) - [CI] Make Travis commit inspection more robust
+* [ARROW-2137](https://issues.apache.org/jira/browse/ARROW-2137) - [Python] Don't print paths that are ignored when reading Parquet files
+* [ARROW-2138](https://issues.apache.org/jira/browse/ARROW-2138) - [C++] Have FatalLog abort instead of exiting
+* [ARROW-2142](https://issues.apache.org/jira/browse/ARROW-2142) - [Python] Conversion from Numpy struct array unimplemented
+* [ARROW-2143](https://issues.apache.org/jira/browse/ARROW-2143) - [Python] Provide a manylinux1 wheel for cp27m
+* [ARROW-2146](https://issues.apache.org/jira/browse/ARROW-2146) - [GLib] Implement Slice for ChunkedArray
+* [ARROW-2149](https://issues.apache.org/jira/browse/ARROW-2149) - [Python] reorganize test\_convert\_pandas.py
+* [ARROW-2154](https://issues.apache.org/jira/browse/ARROW-2154) - [Python] \_\_eq\_\_ unimplemented on Buffer
+* [ARROW-2155](https://issues.apache.org/jira/browse/ARROW-2155) - [Python] pa.frombuffer(bytearray) returns immutable Buffer
+* [ARROW-2156](https://issues.apache.org/jira/browse/ARROW-2156) - [CI] Isolate Sphinx dependencies
+* [ARROW-2163](https://issues.apache.org/jira/browse/ARROW-2163) - Install apt dependencies separately from built-in Travis commands, retry on flakiness
+* [ARROW-2166](https://issues.apache.org/jira/browse/ARROW-2166) - [GLib] Implement Slice for Column
+* [ARROW-2168](https://issues.apache.org/jira/browse/ARROW-2168) - [C++] Build toolchain builds with jemalloc
+* [ARROW-2169](https://issues.apache.org/jira/browse/ARROW-2169) - [C++] MSVC is complaining about uncaptured variables
+* [ARROW-2174](https://issues.apache.org/jira/browse/ARROW-2174) - [JS] Export format and schema enums
+* [ARROW-2176](https://issues.apache.org/jira/browse/ARROW-2176) - [C++] Extend DictionaryBuilder to support delta dictionaries
+* [ARROW-2177](https://issues.apache.org/jira/browse/ARROW-2177) - [C++] Remove support for specifying negative scale values in DecimalType
+* [ARROW-2180](https://issues.apache.org/jira/browse/ARROW-2180) - [C++] Remove APIs deprecated in 0.8.0 release
+* [ARROW-2181](https://issues.apache.org/jira/browse/ARROW-2181) - [Python] Add concat\_tables to API reference, add documentation on use
+* [ARROW-2184](https://issues.apache.org/jira/browse/ARROW-2184) - [C++] Add static constructor for FileOutputStream returning shared\_ptr to base OutputStream
+* [ARROW-2185](https://issues.apache.org/jira/browse/ARROW-2185) - Remove CI directives from squashed commit messages
+* [ARROW-2190](https://issues.apache.org/jira/browse/ARROW-2190) - [GLib] Add add/remove field functions for RecordBatch.
+* [ARROW-2191](https://issues.apache.org/jira/browse/ARROW-2191) - [C++] Only use specific version of jemalloc
+* [ARROW-2197](https://issues.apache.org/jira/browse/ARROW-2197) - Document "undefined symbol" issue and workaround
+* [ARROW-2198](https://issues.apache.org/jira/browse/ARROW-2198) - [Python] Docstring for parquet.read\_table is misleading or incorrect
+* [ARROW-2199](https://issues.apache.org/jira/browse/ARROW-2199) - [JAVA] Follow-up fixes for ARROW-2019. Ensure density-driven capacity is never less than 1 and propagate density throughout the vector tree
+* [ARROW-2203](https://issues.apache.org/jira/browse/ARROW-2203) - [C++] StderrStream class
+* [ARROW-2204](https://issues.apache.org/jira/browse/ARROW-2204) - [C++] Build fails with TLS error on parquet-cpp clone
+* [ARROW-2205](https://issues.apache.org/jira/browse/ARROW-2205) - [Python] Option for integer object nulls
+* [ARROW-2206](https://issues.apache.org/jira/browse/ARROW-2206) - [JS] Add Perspective as a community project
+* [ARROW-2218](https://issues.apache.org/jira/browse/ARROW-2218) - [Python] PythonFile should infer mode when not given
+* [ARROW-2231](https://issues.apache.org/jira/browse/ARROW-2231) - [CI] Use clcache on AppVeyor
+* [ARROW-2238](https://issues.apache.org/jira/browse/ARROW-2238) - [C++] Detect clcache in cmake configuration
+* [ARROW-2239](https://issues.apache.org/jira/browse/ARROW-2239) - [C++] Update build docs for Windows
+* [ARROW-2250](https://issues.apache.org/jira/browse/ARROW-2250) - plasma\_store process should clean up on INT and TERM signals
+* [ARROW-2252](https://issues.apache.org/jira/browse/ARROW-2252) - [Python] Create buffer from address, size and base
+* [ARROW-2253](https://issues.apache.org/jira/browse/ARROW-2253) - [Python] Support \_\_eq\_\_ on scalar values
+* [ARROW-2257](https://issues.apache.org/jira/browse/ARROW-2257) - [C++] Add high-level option to toggle CXX11 ABI
+* [ARROW-2261](https://issues.apache.org/jira/browse/ARROW-2261) - [GLib] Can't share the same memory in GArrowBuffer safely
+* [ARROW-2262](https://issues.apache.org/jira/browse/ARROW-2262) - [Python] Support slicing on pyarrow.ChunkedArray
+* [ARROW-2279](https://issues.apache.org/jira/browse/ARROW-2279) - [Python] Better error message if lib cannot be found
+* [ARROW-2282](https://issues.apache.org/jira/browse/ARROW-2282) - [Python] Create StringArray from buffers
+* [ARROW-2283](https://issues.apache.org/jira/browse/ARROW-2283) - [C++] Support Arrow C++ installed in /usr detection by pkg-config
+* [ARROW-2289](https://issues.apache.org/jira/browse/ARROW-2289) - [GLib] Add Numeric, Integer and FloatingPoint data types
+* [ARROW-2291](https://issues.apache.org/jira/browse/ARROW-2291) - [C++] README missing instructions for libboost-regex-dev
+* [ARROW-2292](https://issues.apache.org/jira/browse/ARROW-2292) - [Python] More consistent / intuitive name for pyarrow.frombuffer
+* [ARROW-2309](https://issues.apache.org/jira/browse/ARROW-2309) - [C++] Use std::make\_unsigned
+* [ARROW-2321](https://issues.apache.org/jira/browse/ARROW-2321) - [C++] Release verification script fails if CMAKE\_INSTALL\_LIBDIR is not $ARROW\_HOME/lib
+* [ARROW-2329](https://issues.apache.org/jira/browse/ARROW-2329) - [Website] 0.9.0 release update
+* [ARROW-2336](https://issues.apache.org/jira/browse/ARROW-2336) - [Website] Blog post for 0.9.0 release
+* [ARROW-2768](https://issues.apache.org/jira/browse/ARROW-2768) - [Packaging] Support Ubuntu 18.04
+* [ARROW-2783](https://issues.apache.org/jira/browse/ARROW-2783) - Importing conda-forge pyarrow fails
+
+
+## Bug Fixes
+
+* [ARROW-1345](https://issues.apache.org/jira/browse/ARROW-1345) - [Python] Conversion from nested NumPy arrays fails on integers other than int64, float32
+* [ARROW-1589](https://issues.apache.org/jira/browse/ARROW-1589) - [C++] Fuzzing for certain input formats
+* [ARROW-1646](https://issues.apache.org/jira/browse/ARROW-1646) - [Python] pyarrow.array cannot handle NumPy scalar types
+* [ARROW-1856](https://issues.apache.org/jira/browse/ARROW-1856) - [Python] Auto-detect Parquet ABI version when using PARQUET\_HOME
+* [ARROW-1909](https://issues.apache.org/jira/browse/ARROW-1909) - [C++] Bug: Build fails on windows with "-DARROW\_BUILD\_BENCHMARKS=ON"
+* [ARROW-1912](https://issues.apache.org/jira/browse/ARROW-1912) - [Website] Add org affiliations to committers.html
+* [ARROW-1919](https://issues.apache.org/jira/browse/ARROW-1919) - Plasma hanging if object id is not 20 bytes
+* [ARROW-1924](https://issues.apache.org/jira/browse/ARROW-1924) - [Python] Bring back pickle=True option for serialization
+* [ARROW-1933](https://issues.apache.org/jira/browse/ARROW-1933) - [GLib] Build failure with --with-arrow-cpp-build-dir and GPU enabled Arrow C++
+* [ARROW-1940](https://issues.apache.org/jira/browse/ARROW-1940) - [Python] Extra metadata gets added after multiple conversions between pd.DataFrame and pa.Table
+* [ARROW-1941](https://issues.apache.org/jira/browse/ARROW-1941) - Table <-\> DataFrame roundtrip failing
+* [ARROW-1943](https://issues.apache.org/jira/browse/ARROW-1943) - Handle setInitialCapacity() for deeply nested lists of lists
+* [ARROW-1944](https://issues.apache.org/jira/browse/ARROW-1944) - FindArrow has wrong ARROW\_STATIC\_LIB
+* [ARROW-1945](https://issues.apache.org/jira/browse/ARROW-1945) - [C++] Fix doxygen documentation of array.h
+* [ARROW-1946](https://issues.apache.org/jira/browse/ARROW-1946) - Add APIs to decimal vector for writing big endian data
+* [ARROW-1948](https://issues.apache.org/jira/browse/ARROW-1948) - [Java] ListVector does not handle ipc with all non-null values with none set
+* [ARROW-1950](https://issues.apache.org/jira/browse/ARROW-1950) - [Python] pandas\_type in pandas metadata incorrect for List types
+* [ARROW-1953](https://issues.apache.org/jira/browse/ARROW-1953) - [JS] JavaScript builds broken on master
+* [ARROW-1955](https://issues.apache.org/jira/browse/ARROW-1955) - MSVC generates "attempting to reference a deleted function" during build.
+* [ARROW-1958](https://issues.apache.org/jira/browse/ARROW-1958) - [Python] Error in pandas conversion for datetimetz row index
+* [ARROW-1961](https://issues.apache.org/jira/browse/ARROW-1961) - [Python] Writing Parquet file with flavor='spark' loses pandas schema metadata
+* [ARROW-1966](https://issues.apache.org/jira/browse/ARROW-1966) - [C++] Support JAVA\_HOME paths in HDFS libjvm loading that include the jre directory
+* [ARROW-1967](https://issues.apache.org/jira/browse/ARROW-1967) - Python: AssertionError w.r.t Pandas conversion on Parquet files in 0.8.0 dev version
+* [ARROW-1971](https://issues.apache.org/jira/browse/ARROW-1971) - [Python] Add pandas serialization to the default serialization context
+* [ARROW-1972](https://issues.apache.org/jira/browse/ARROW-1972) - Deserialization of buffer objects (and pandas dataframes) segfaults on different processes.
+* [ARROW-1973](https://issues.apache.org/jira/browse/ARROW-1973) - [Python] Memory leak when converting Arrow tables with array columns to Pandas dataframes.
+* [ARROW-1976](https://issues.apache.org/jira/browse/ARROW-1976) - [Python] Handling unicode pandas columns on parquet.read\_table
+* [ARROW-1979](https://issues.apache.org/jira/browse/ARROW-1979) - [JS] JS builds hanging in es2015:umd tests
+* [ARROW-1980](https://issues.apache.org/jira/browse/ARROW-1980) - [Python] Race condition in \`write\_to\_dataset\`
+* [ARROW-1982](https://issues.apache.org/jira/browse/ARROW-1982) - [Python] Return parquet statistics min/max as values instead of strings
+* [ARROW-1986](https://issues.apache.org/jira/browse/ARROW-1986) - [Python] HadoopFileSystem is not picklable and cannot currently be used with multiprocessing
+* [ARROW-1991](https://issues.apache.org/jira/browse/ARROW-1991) - [GLib] Docker-based documentation build is broken
+* [ARROW-1992](https://issues.apache.org/jira/browse/ARROW-1992) - [Python] to\_pandas crashes when using strings\_to\_categoricals on empty string cols on 0.8.0
+* [ARROW-1997](https://issues.apache.org/jira/browse/ARROW-1997) - [Python] to\_pandas with strings\_to\_categorical fails
+* [ARROW-1998](https://issues.apache.org/jira/browse/ARROW-1998) - [Python] Table.from\_pandas crashes when data frame is empty
+* [ARROW-1999](https://issues.apache.org/jira/browse/ARROW-1999) - [Python] from\_numpy\_dtype returns wrong types
+* [ARROW-2000](https://issues.apache.org/jira/browse/ARROW-2000) - Deduplicate file descriptors when plasma store replies to get request.
+* [ARROW-2002](https://issues.apache.org/jira/browse/ARROW-2002) - Downloading a file with pyarrow sometimes raises queue.Full exceptions
+* [ARROW-2003](https://issues.apache.org/jira/browse/ARROW-2003) - [Python] Do not use deprecated kwarg in pandas.core.internals.make\_block
+* [ARROW-2005](https://issues.apache.org/jira/browse/ARROW-2005) - [Python] pyflakes warnings on Cython files not failing build
+* [ARROW-2008](https://issues.apache.org/jira/browse/ARROW-2008) - [Python] Type inference for int32 NumPy arrays (expecting list<int32\>) returns int64 and then conversion fails
+* [ARROW-2010](https://issues.apache.org/jira/browse/ARROW-2010) - [C++] Compiler warnings with CHECKIN warning level in ORC adapter
+* [ARROW-2017](https://issues.apache.org/jira/browse/ARROW-2017) - Array initialization with large (\>2\*\*31-1) uint64 values fails
+* [ARROW-2023](https://issues.apache.org/jira/browse/ARROW-2023) - [C++] Test opening IPC stream reader or file reader on an empty InputStream
+* [ARROW-2025](https://issues.apache.org/jira/browse/ARROW-2025) - [Python/C++] HDFS Client disconnect closes all open clients
+* [ARROW-2029](https://issues.apache.org/jira/browse/ARROW-2029) - [Python] Program crash on \`HdfsFile.tell\` if file is closed
+* [ARROW-2032](https://issues.apache.org/jira/browse/ARROW-2032) - [C++] ORC ep installs on each call to ninja build (even if no work to do)
+* [ARROW-2033](https://issues.apache.org/jira/browse/ARROW-2033) - pa.array() doesn't work with iterators
+* [ARROW-2039](https://issues.apache.org/jira/browse/ARROW-2039) - [Python] pyarrow.Buffer().to\_pybytes() segfaults
+* [ARROW-2040](https://issues.apache.org/jira/browse/ARROW-2040) - [Python] Deserialized Numpy array must keep ref to underlying tensor
+* [ARROW-2047](https://issues.apache.org/jira/browse/ARROW-2047) - [Python] test\_serialization.py uses a python executable in PATH rather than that used for a test run
+* [ARROW-2049](https://issues.apache.org/jira/browse/ARROW-2049) - [Python] Use python -m cython to run Cython, instead of CYTHON\_EXECUTABLE
+* [ARROW-2062](https://issues.apache.org/jira/browse/ARROW-2062) - [C++] Stalled builds in test\_serialization.py in Travis CI
+* [ARROW-2070](https://issues.apache.org/jira/browse/ARROW-2070) - [Python] chdir logic in setup.py buggy
+* [ARROW-2072](https://issues.apache.org/jira/browse/ARROW-2072) - [Python] decimal128.byte\_width crashes
+* [ARROW-2080](https://issues.apache.org/jira/browse/ARROW-2080) - [Python] Update documentation after ARROW-2024
+* [ARROW-2085](https://issues.apache.org/jira/browse/ARROW-2085) - HadoopFileSystem.isdir and .isfile should return False if the path doesn't exist
+* [ARROW-2106](https://issues.apache.org/jira/browse/ARROW-2106) - [Python] pyarrow.array can't take a pandas Series of python datetime objects.
+* [ARROW-2109](https://issues.apache.org/jira/browse/ARROW-2109) - [C++] Boost 1.66 compilation fails on Windows on linkage stage
+* [ARROW-2124](https://issues.apache.org/jira/browse/ARROW-2124) - [Python] ArrowInvalid raised if the first item of a nested list of numpy arrays is empty
+* [ARROW-2128](https://issues.apache.org/jira/browse/ARROW-2128) - [Python] Cannot serialize array of empty lists
+* [ARROW-2129](https://issues.apache.org/jira/browse/ARROW-2129) - [Python] Segmentation fault on conversion of empty array to Pandas
+* [ARROW-2131](https://issues.apache.org/jira/browse/ARROW-2131) - [Python] Serialization test fails on Windows when library has been built in place / not installed
+* [ARROW-2133](https://issues.apache.org/jira/browse/ARROW-2133) - [Python] Segmentation fault on conversion of empty nested arrays to Pandas
+* [ARROW-2135](https://issues.apache.org/jira/browse/ARROW-2135) - [Python] NaN values silently casted to int64 when passing explicit schema for conversion in Table.from\_pandas
+* [ARROW-2139](https://issues.apache.org/jira/browse/ARROW-2139) - [Python] Address Sphinx deprecation warning when building docs
+* [ARROW-2145](https://issues.apache.org/jira/browse/ARROW-2145) - [Python] Decimal conversion not working for NaN values
+* [ARROW-2150](https://issues.apache.org/jira/browse/ARROW-2150) - [Python] array equality defaults to identity
+* [ARROW-2151](https://issues.apache.org/jira/browse/ARROW-2151) - [Python] Error when converting from list of uint64 arrays
+* [ARROW-2153](https://issues.apache.org/jira/browse/ARROW-2153) - [C++/Python] Decimal conversion not working for exponential notation
+* [ARROW-2157](https://issues.apache.org/jira/browse/ARROW-2157) - [Python] Decimal arrays cannot be constructed from Python lists
+* [ARROW-2158](https://issues.apache.org/jira/browse/ARROW-2158) - [Python] Construction of Decimal array with None or np.nan fails
+* [ARROW-2160](https://issues.apache.org/jira/browse/ARROW-2160) - [C++/Python] Fix decimal precision inference
+* [ARROW-2161](https://issues.apache.org/jira/browse/ARROW-2161) - [Python] Skip test\_cython\_api if ARROW\_HOME isn't defined
+* [ARROW-2162](https://issues.apache.org/jira/browse/ARROW-2162) - [Python/C++] Decimal Values with too-high precision are multiplied by 100
+* [ARROW-2167](https://issues.apache.org/jira/browse/ARROW-2167) - [C++] Building Orc extensions fails with the default BUILD\_WARNING\_LEVEL=Production
+* [ARROW-2170](https://issues.apache.org/jira/browse/ARROW-2170) - [Python] construct\_metadata fails on reading files where no index was preserved
+* [ARROW-2171](https://issues.apache.org/jira/browse/ARROW-2171) - [Python] OwnedRef is fragile
+* [ARROW-2172](https://issues.apache.org/jira/browse/ARROW-2172) - [Python] Incorrect conversion from Numpy array when stride % itemsize != 0
+* [ARROW-2173](https://issues.apache.org/jira/browse/ARROW-2173) - [Python] NumPyBuffer destructor should hold the GIL
+* [ARROW-2175](https://issues.apache.org/jira/browse/ARROW-2175) - [Python] arrow\_ep build is triggering during parquet-cpp build in Travis CI
+* [ARROW-2178](https://issues.apache.org/jira/browse/ARROW-2178) - [JS] Fix JS html FileReader example
+* [ARROW-2179](https://issues.apache.org/jira/browse/ARROW-2179) - [C++] arrow/util/io-util.h missing from libarrow-dev
+* [ARROW-2192](https://issues.apache.org/jira/browse/ARROW-2192) - Commits to master should run all builds in CI matrix
+* [ARROW-2194](https://issues.apache.org/jira/browse/ARROW-2194) - [Python] Pandas columns metadata incorrect for empty string columns
+* [ARROW-2208](https://issues.apache.org/jira/browse/ARROW-2208) - [Python] install issues with jemalloc
+* [ARROW-2209](https://issues.apache.org/jira/browse/ARROW-2209) - [Python] Partition columns are not correctly loaded in schema of ParquetDataset
+* [ARROW-2210](https://issues.apache.org/jira/browse/ARROW-2210) - [C++] TestBuffer\_ResizeOOM has a memory leak with jemalloc
+* [ARROW-2212](https://issues.apache.org/jira/browse/ARROW-2212) - [C++/Python] Build Protobuf in base manylinux 1 docker image
+* [ARROW-2223](https://issues.apache.org/jira/browse/ARROW-2223) - [JS] installing umd release throws an error
+* [ARROW-2227](https://issues.apache.org/jira/browse/ARROW-2227) - [Python] Table.from\_pandas does not create chunked\_arrays.
+* [ARROW-2228](https://issues.apache.org/jira/browse/ARROW-2228) - [Python] Unsigned int type for arrow Table not supported
+* [ARROW-2230](https://issues.apache.org/jira/browse/ARROW-2230) - [Python] JS version number is sometimes picked up
+* [ARROW-2232](https://issues.apache.org/jira/browse/ARROW-2232) - [Python] pyarrow.Tensor constructor segfaults
+* [ARROW-2234](https://issues.apache.org/jira/browse/ARROW-2234) - [JS] Read timestamp low bits as Uint32s
+* [ARROW-2240](https://issues.apache.org/jira/browse/ARROW-2240) - [Python] Array initialization with leading numpy nan fails with exception
+* [ARROW-2244](https://issues.apache.org/jira/browse/ARROW-2244) - [C++] Slicing NullArray should not cause the null count on the internal data to be unknown
+* [ARROW-2245](https://issues.apache.org/jira/browse/ARROW-2245) - [Python] Revert static linkage of parquet-cpp in manylinux1 wheel
+* [ARROW-2246](https://issues.apache.org/jira/browse/ARROW-2246) - [Python] Use namespaced boost in manylinux1 package
+* [ARROW-2251](https://issues.apache.org/jira/browse/ARROW-2251) - [GLib] Destroying a GArrowBuffer while a GArrowTensor still uses it causes a crash
+* [ARROW-2254](https://issues.apache.org/jira/browse/ARROW-2254) - [Python] Local in-place dev versions picking up JS tags
+* [ARROW-2258](https://issues.apache.org/jira/browse/ARROW-2258) - [C++] Appveyor builds failing on master
+* [ARROW-2263](https://issues.apache.org/jira/browse/ARROW-2263) - [Python] test\_cython.py fails if pyarrow is not in import path (e.g. with inplace builds)
+* [ARROW-2265](https://issues.apache.org/jira/browse/ARROW-2265) - [Python] Serializing subclasses of np.ndarray returns a np.ndarray.
+* [ARROW-2268](https://issues.apache.org/jira/browse/ARROW-2268) - Remove MD5 checksums from release process
+* [ARROW-2269](https://issues.apache.org/jira/browse/ARROW-2269) - [Python] Cannot build bdist\_wheel for Python
+* [ARROW-2270](https://issues.apache.org/jira/browse/ARROW-2270) - [Python] ForeignBuffer doesn't tie Python object lifetime to C++ buffer lifetime
+* [ARROW-2272](https://issues.apache.org/jira/browse/ARROW-2272) - [Python] test\_plasma spams /tmp
+* [ARROW-2275](https://issues.apache.org/jira/browse/ARROW-2275) - [C++] Buffer::mutable\_data\_ member uninitialized
+* [ARROW-2280](https://issues.apache.org/jira/browse/ARROW-2280) - [Python] pyarrow.Array.buffers should also include the offsets
+* [ARROW-2284](https://issues.apache.org/jira/browse/ARROW-2284) - [Python] test\_plasma error on plasma\_store error
+* [ARROW-2288](https://issues.apache.org/jira/browse/ARROW-2288) - [Python] slicing logic defective
+* [ARROW-2297](https://issues.apache.org/jira/browse/ARROW-2297) - [JS] babel-jest is not listed as a dev dependency
+* [ARROW-2304](https://issues.apache.org/jira/browse/ARROW-2304) - [C++] MultipleClients test in io-hdfs-test fails on trunk
+* [ARROW-2306](https://issues.apache.org/jira/browse/ARROW-2306) - [Python] HDFS test failures
+* [ARROW-2307](https://issues.apache.org/jira/browse/ARROW-2307) - [Python] Unable to read arrow stream containing 0 record batches
+* [ARROW-2311](https://issues.apache.org/jira/browse/ARROW-2311) - [Python] Struct array slicing defective
+* [ARROW-2312](https://issues.apache.org/jira/browse/ARROW-2312) - [JS] verify-release-candidate.sh must be updated to include JS in integration tests
+* [ARROW-2313](https://issues.apache.org/jira/browse/ARROW-2313) - [GLib] Release builds must define NDEBUG
+* [ARROW-2316](https://issues.apache.org/jira/browse/ARROW-2316) - [C++] Revert Buffer::mutable\_data member to always inline
+* [ARROW-2318](https://issues.apache.org/jira/browse/ARROW-2318) - [C++] TestPlasmaStore.MultipleClientTest is flaky (hangs) in release builds
+* [ARROW-2320](https://issues.apache.org/jira/browse/ARROW-2320) - [C++] Vendored Boost build does not build regex library
+* [ARROW-2406](https://issues.apache.org/jira/browse/ARROW-2406) - [Python] Segfault when creating PyArrow table from Pandas for empty string column when schema provided
+
+
+
+# Apache Arrow 0.8.0 (2017-12-18)
+
+## Bug Fixes
+
+* [ARROW-226](https://issues.apache.org/jira/browse/ARROW-226) - [C++] libhdfs: feedback to help determining cause of failure in opening file path
+* [ARROW-641](https://issues.apache.org/jira/browse/ARROW-641) - [C++] Do not build/run io-hdfs-test if ARROW\_HDFS=off
+* [ARROW-1282](https://issues.apache.org/jira/browse/ARROW-1282) - Large memory reallocation by Arrow causes hang in jemalloc
+* [ARROW-1298](https://issues.apache.org/jira/browse/ARROW-1298) - C++: Add prefix to jemalloc functions to guard against issues when using multiple allocators in the same process
+* [ARROW-1341](https://issues.apache.org/jira/browse/ARROW-1341) - [C++] Deprecate arrow::MakeTable in favor of new ctor from ARROW-1334
+* [ARROW-1347](https://issues.apache.org/jira/browse/ARROW-1347) - [JAVA] List null type should use consistent name for inner field
+* [ARROW-1398](https://issues.apache.org/jira/browse/ARROW-1398) - [Python] No support for reading columns of type decimal(19,4)
+* [ARROW-1409](https://issues.apache.org/jira/browse/ARROW-1409) - [Format] Use for "page" attribute in Buffer in metadata
+* [ARROW-1431](https://issues.apache.org/jira/browse/ARROW-1431) - [Java] JsonFileReader doesn't initialize some vectors appropriately
+* [ARROW-1436](https://issues.apache.org/jira/browse/ARROW-1436) - PyArrow Timestamps written to Parquet as INT96 appear in Spark as 'bigint'
+* [ARROW-1540](https://issues.apache.org/jira/browse/ARROW-1540) - [C++] Fix valgrind warnings in cuda-test if possible
+* [ARROW-1541](https://issues.apache.org/jira/browse/ARROW-1541) - [C++] Race condition with arrow\_gpu
+* [ARROW-1543](https://issues.apache.org/jira/browse/ARROW-1543) - [C++] row\_wise\_conversion example doesn't correspond to ListBuilder constructor arguments
+* [ARROW-1549](https://issues.apache.org/jira/browse/ARROW-1549) - [JS] Integrate auto-generated Arrow test files
+* [ARROW-1555](https://issues.apache.org/jira/browse/ARROW-1555) - [Python] write\_to\_dataset on s3
+* [ARROW-1584](https://issues.apache.org/jira/browse/ARROW-1584) - [PYTHON] serialize\_pandas on empty dataframe
+* [ARROW-1585](https://issues.apache.org/jira/browse/ARROW-1585) - serialize\_pandas round trip fails on integer columns
+* [ARROW-1586](https://issues.apache.org/jira/browse/ARROW-1586) - [PYTHON] serialize\_pandas roundtrip loses column names
+* [ARROW-1609](https://issues.apache.org/jira/browse/ARROW-1609) - Plasma: Build fails with Xcode 9.0
+* [ARROW-1615](https://issues.apache.org/jira/browse/ARROW-1615) - CXX flags for development are more permissive than Travis CI builds
+* [ARROW-1617](https://issues.apache.org/jira/browse/ARROW-1617) - [Python] Do not use symlinks in python/cmake\_modules
+* [ARROW-1620](https://issues.apache.org/jira/browse/ARROW-1620) - Python: Download Boost in manylinux1 build from bintray
+* [ARROW-1622](https://issues.apache.org/jira/browse/ARROW-1622) - [Plasma] Plasma doesn't compile with XCode 9
+* [ARROW-1624](https://issues.apache.org/jira/browse/ARROW-1624) - [C++] Follow up fixes / tweaks to compiler warnings for Plasma / LLVM 4.0, add to readme
+* [ARROW-1625](https://issues.apache.org/jira/browse/ARROW-1625) - [Serialization] Support OrderedDict properly
+* [ARROW-1629](https://issues.apache.org/jira/browse/ARROW-1629) - [C++] Fix problematic code paths identified by infer tool
+* [ARROW-1633](https://issues.apache.org/jira/browse/ARROW-1633) - [Python] numpy "unicode" arrays not understood
+* [ARROW-1640](https://issues.apache.org/jira/browse/ARROW-1640) - Resolve OpenSSL issues in Travis CI
+* [ARROW-1647](https://issues.apache.org/jira/browse/ARROW-1647) - [Plasma] Potential bug when reading/writing messages.
+* [ARROW-1653](https://issues.apache.org/jira/browse/ARROW-1653) - [Plasma] Use static cast to avoid compiler warning.
+* [ARROW-1655](https://issues.apache.org/jira/browse/ARROW-1655) - [Java] Add Scale and Precision to ValueVectorTypes.tdd for Decimals
+* [ARROW-1656](https://issues.apache.org/jira/browse/ARROW-1656) - [C++] Endianness Macro is Incorrect on Windows And Mac
+* [ARROW-1657](https://issues.apache.org/jira/browse/ARROW-1657) - [C++] Multithreaded Read Test Failing on Arch Linux
+* [ARROW-1658](https://issues.apache.org/jira/browse/ARROW-1658) - [Python] Out of bounds dictionary indices causes segfault after converting to pandas
+* [ARROW-1663](https://issues.apache.org/jira/browse/ARROW-1663) - [Java] Follow up on ARROW-1347 and make schema backward compatible
+* [ARROW-1670](https://issues.apache.org/jira/browse/ARROW-1670) - [Python] Speed up deserialization code path
+* [ARROW-1672](https://issues.apache.org/jira/browse/ARROW-1672) - [Python] Failure to write Feather bytes column
+* [ARROW-1673](https://issues.apache.org/jira/browse/ARROW-1673) - [Python] NumPy boolean arrays get converted to uint8 arrays on NdarrayToTensor roundtrip
+* [ARROW-1676](https://issues.apache.org/jira/browse/ARROW-1676) - [C++] Correctly truncate oversized validity bitmaps when writing Feather format
+* [ARROW-1678](https://issues.apache.org/jira/browse/ARROW-1678) - [Python] Incorrect serialization of numpy.float16
+* [ARROW-1680](https://issues.apache.org/jira/browse/ARROW-1680) - [Python] Timestamp unit change not done in from\_pandas() conversion
+* [ARROW-1681](https://issues.apache.org/jira/browse/ARROW-1681) - [Python] Error writing with nulls in lists
+* [ARROW-1686](https://issues.apache.org/jira/browse/ARROW-1686) - Documentation generation script creates "apidocs" directory under site/java
+* [ARROW-1693](https://issues.apache.org/jira/browse/ARROW-1693) - [JS] Error reading dictionary-encoded integration test files
+* [ARROW-1694](https://issues.apache.org/jira/browse/ARROW-1694) - [Java] Unclosed VectorSchemaRoot in JsonFileReader\#readDictionaryBatches()
+* [ARROW-1695](https://issues.apache.org/jira/browse/ARROW-1695) - [Serialization] Fix reference counting of numpy arrays created in custom serializer
+* [ARROW-1698](https://issues.apache.org/jira/browse/ARROW-1698) - [JS] File reader attempts to load the same dictionary batch more than once
+* [ARROW-1704](https://issues.apache.org/jira/browse/ARROW-1704) - [GLib] Go example in test suite is broken
+* [ARROW-1708](https://issues.apache.org/jira/browse/ARROW-1708) - [JS] Linter problem breaks master build
+* [ARROW-1709](https://issues.apache.org/jira/browse/ARROW-1709) - [C++] Decimal.ToString is incorrect for negative scale
+* [ARROW-1711](https://issues.apache.org/jira/browse/ARROW-1711) - [Python] flake8 checks still not failing builds
+* [ARROW-1714](https://issues.apache.org/jira/browse/ARROW-1714) - [Python] Unnamed pd.Series has its name serialized as u'None'
+* [ARROW-1720](https://issues.apache.org/jira/browse/ARROW-1720) - [Python] Segmentation fault while trying to access an out-of-bound chunk
+* [ARROW-1723](https://issues.apache.org/jira/browse/ARROW-1723) - Windows: \_\_declspec(dllexport) specified when building arrow static library
+* [ARROW-1730](https://issues.apache.org/jira/browse/ARROW-1730) - [Python] Incorrect result from pyarrow.array when passing timestamp type
+* [ARROW-1732](https://issues.apache.org/jira/browse/ARROW-1732) - [Python] RecordBatch.from\_pandas fails on DataFrame with no columns when preserve\_index=False
+* [ARROW-1735](https://issues.apache.org/jira/browse/ARROW-1735) - [C++] Cast kernels cannot write into sliced output array
+* [ARROW-1738](https://issues.apache.org/jira/browse/ARROW-1738) - [Python] Wrong datetime conversion when calling pa.array with a unit
+* [ARROW-1739](https://issues.apache.org/jira/browse/ARROW-1739) - [Python] Fix usages of assertRaises causing broken build
+* [ARROW-1742](https://issues.apache.org/jira/browse/ARROW-1742) - C++: clang-format is not detected correctly on OSX anymore
+* [ARROW-1743](https://issues.apache.org/jira/browse/ARROW-1743) - [Python] Table to\_pandas fails when index contains categorical column
+* [ARROW-1745](https://issues.apache.org/jira/browse/ARROW-1745) - Compilation failure on Mac OS in plasma tests
+* [ARROW-1749](https://issues.apache.org/jira/browse/ARROW-1749) - [C++] Handle range of Decimal128 values that require 39 digits to be displayed
+* [ARROW-1751](https://issues.apache.org/jira/browse/ARROW-1751) - [Python] Pandas 0.21.0 introduces a breaking API change for MultiIndex construction
+* [ARROW-1754](https://issues.apache.org/jira/browse/ARROW-1754) - [Python] Fix buggy Parquet roundtrip when an index name is the same as a column name
+* [ARROW-1756](https://issues.apache.org/jira/browse/ARROW-1756) - [Python] Observed int32 overflow in Feather write/read path
+* [ARROW-1762](https://issues.apache.org/jira/browse/ARROW-1762) - [C++] unittest failure for language environment
+* [ARROW-1764](https://issues.apache.org/jira/browse/ARROW-1764) - [Python] Add -c conda-forge for Windows dev installation instructions
+* [ARROW-1766](https://issues.apache.org/jira/browse/ARROW-1766) - [GLib] Fix failing builds on OSX
+* [ARROW-1768](https://issues.apache.org/jira/browse/ARROW-1768) - [Python] Fix suppressed exception in ParquetWriter.\_\_del\_\_
+* [ARROW-1769](https://issues.apache.org/jira/browse/ARROW-1769) - Python: pyarrow.parquet.write\_to\_dataset creates cyclic references
+* [ARROW-1770](https://issues.apache.org/jira/browse/ARROW-1770) - [GLib] Fix GLib compiler warning
+* [ARROW-1771](https://issues.apache.org/jira/browse/ARROW-1771) - [C++] ARROW-1749 Breaks Public API test in parquet-cpp
+* [ARROW-1776](https://issues.apache.org/jira/browse/ARROW-1776) - [C++] arrow::gpu::CudaContext::bytes\_allocated() isn't defined
+* [ARROW-1778](https://issues.apache.org/jira/browse/ARROW-1778) - [Python] Link parquet-cpp statically, privately in manylinux1 wheels
+* [ARROW-1781](https://issues.apache.org/jira/browse/ARROW-1781) - [CI] OSX Builds on Travis-CI time out often
+* [ARROW-1788](https://issues.apache.org/jira/browse/ARROW-1788) - Plasma store crashes when trying to abort objects for disconnected client
+* [ARROW-1791](https://issues.apache.org/jira/browse/ARROW-1791) - Integration tests generate date[DAY] values outside of reasonable range
+* [ARROW-1793](https://issues.apache.org/jira/browse/ARROW-1793) - [Integration] Fix a typo in README.md
+* [ARROW-1800](https://issues.apache.org/jira/browse/ARROW-1800) - [C++] Fix and simplify random\_decimals
+* [ARROW-1805](https://issues.apache.org/jira/browse/ARROW-1805) - [Python] ignore non-parquet files when exploring dataset
+* [ARROW-1811](https://issues.apache.org/jira/browse/ARROW-1811) - [C++/Python] Rename all Decimal based APIs to Decimal128
+* [ARROW-1812](https://issues.apache.org/jira/browse/ARROW-1812) - Plasma store modifies hash table while iterating during client disconnect
+* [ARROW-1813](https://issues.apache.org/jira/browse/ARROW-1813) - Enforce checkstyle failure in JAVA build and fix all checkstyle violations
+* [ARROW-1821](https://issues.apache.org/jira/browse/ARROW-1821) - Add integration test case to explicitly check for optional validity buffer
+* [ARROW-1829](https://issues.apache.org/jira/browse/ARROW-1829) - [Plasma] Clean up eviction policy bookkeeping
+* [ARROW-1830](https://issues.apache.org/jira/browse/ARROW-1830) - [Python] Error when loading all the files in a directory
+* [ARROW-1831](https://issues.apache.org/jira/browse/ARROW-1831) - [Python] Docker-based documentation build does not properly set LD\_LIBRARY\_PATH
+* [ARROW-1836](https://issues.apache.org/jira/browse/ARROW-1836) - [C++] Fix C4996 warning from arrow/util/variant.h on MSVC builds
+* [ARROW-1839](https://issues.apache.org/jira/browse/ARROW-1839) - [C++/Python] Add Decimal Parquet Read/Write Tests
+* [ARROW-1840](https://issues.apache.org/jira/browse/ARROW-1840) - [Website] The installation command fails in a Windows 10 Anaconda environment
+* [ARROW-1845](https://issues.apache.org/jira/browse/ARROW-1845) - [Python] Expose Decimal128Type
+* [ARROW-1852](https://issues.apache.org/jira/browse/ARROW-1852) - [Plasma] Make retrieving manager file descriptor const
+* [ARROW-1853](https://issues.apache.org/jira/browse/ARROW-1853) - [Plasma] Fix off-by-one error in retry processing
+* [ARROW-1863](https://issues.apache.org/jira/browse/ARROW-1863) - [Python] PyObjectStringify could render bytes-like output for more types of objects
+* [ARROW-1865](https://issues.apache.org/jira/browse/ARROW-1865) - [C++] Adding a column to an empty Table fails
+* [ARROW-1869](https://issues.apache.org/jira/browse/ARROW-1869) - Fix typo in LowCostIdentityHashMap
+* [ARROW-1871](https://issues.apache.org/jira/browse/ARROW-1871) - [Python/C++] Appending Python Decimals with different scales requires rescaling
+* [ARROW-1873](https://issues.apache.org/jira/browse/ARROW-1873) - [Python] Segmentation fault when loading total 2GB of parquet files
+* [ARROW-1877](https://issues.apache.org/jira/browse/ARROW-1877) - Incorrect comparison in JsonStringArrayList.equals
+* [ARROW-1879](https://issues.apache.org/jira/browse/ARROW-1879) - [Python] Dask integration tests are not skipped if dask is not installed
+* [ARROW-1881](https://issues.apache.org/jira/browse/ARROW-1881) - [Python] setuptools\_scm picks up JS version tags
+* [ARROW-1882](https://issues.apache.org/jira/browse/ARROW-1882) - [C++] Reintroduce DictionaryBuilder
+* [ARROW-1883](https://issues.apache.org/jira/browse/ARROW-1883) - [Python] BUG: Table.to\_pandas metadata checking fails if columns are not present
+* [ARROW-1889](https://issues.apache.org/jira/browse/ARROW-1889) - [Python] --exclude is not available in older git versions
+* [ARROW-1890](https://issues.apache.org/jira/browse/ARROW-1890) - [Python] Masking for date32 arrays not working
+* [ARROW-1891](https://issues.apache.org/jira/browse/ARROW-1891) - [Python] NaT date32 values are only converted to nulls if from\_pandas is used
+* [ARROW-1892](https://issues.apache.org/jira/browse/ARROW-1892) - [Python] Unknown list item type: binary
+* [ARROW-1893](https://issues.apache.org/jira/browse/ARROW-1893) - [Python] test\_primitive\_serialization fails on Python 2.7.3
+* [ARROW-1895](https://issues.apache.org/jira/browse/ARROW-1895) - [Python] Add field\_name to pandas index metadata
+* [ARROW-1897](https://issues.apache.org/jira/browse/ARROW-1897) - [Python] Incorrect numpy\_type for pandas metadata of Categoricals
+* [ARROW-1904](https://issues.apache.org/jira/browse/ARROW-1904) - [C++] Deprecate PrimitiveArray::raw\_values
+* [ARROW-1906](https://issues.apache.org/jira/browse/ARROW-1906) - [Python] Creating a pyarrow.Array with a timestamp of a different unit is not cast
+* [ARROW-1908](https://issues.apache.org/jira/browse/ARROW-1908) - [Python] Construction of arrow table from pandas DataFrame with duplicate column names crashes
+* [ARROW-1910](https://issues.apache.org/jira/browse/ARROW-1910) - CPP README Brewfile link incorrect
+* [ARROW-1914](https://issues.apache.org/jira/browse/ARROW-1914) - [C++] make -j may fail to build with -DARROW\_GPU=on
+* [ARROW-1915](https://issues.apache.org/jira/browse/ARROW-1915) - [Python] Parquet tests should be optional
+* [ARROW-1916](https://issues.apache.org/jira/browse/ARROW-1916) - [Java] Do not exclude java/dev/checkstyle from source releases
+* [ARROW-1917](https://issues.apache.org/jira/browse/ARROW-1917) - [GLib] Must set GI\_TYPELIB\_PATH in verify-release-candidate.sh
+* [ARROW-1935](https://issues.apache.org/jira/browse/ARROW-1935) - Download page must not link to snapshots / nightly builds
+* [ARROW-1936](https://issues.apache.org/jira/browse/ARROW-1936) - Broken links to signatures/hashes etc
+* [ARROW-1939](https://issues.apache.org/jira/browse/ARROW-1939) - Correct links in release 0.8 blog post
+
+
+## New Features and Improvements
+
+* [ARROW-480](https://issues.apache.org/jira/browse/ARROW-480) - [Python] Add accessors for Parquet column statistics
+* [ARROW-504](https://issues.apache.org/jira/browse/ARROW-504) - [Python] Add adapter to write pandas.DataFrame in user-selected chunk size to streaming format
+* [ARROW-507](https://issues.apache.org/jira/browse/ARROW-507) - [C++/Python] Construct List container from offsets and values subarrays
+* [ARROW-541](https://issues.apache.org/jira/browse/ARROW-541) - [JS] Implement JavaScript-compatible implementation
+* [ARROW-571](https://issues.apache.org/jira/browse/ARROW-571) - [Python] Add APIs to build Parquet files incrementally from Arrow tables
+* [ARROW-587](https://issues.apache.org/jira/browse/ARROW-587) - Add JIRA fix version to merge tool
+* [ARROW-609](https://issues.apache.org/jira/browse/ARROW-609) - [C++] Function for casting from days since UNIX epoch to int64 date
+* [ARROW-838](https://issues.apache.org/jira/browse/ARROW-838) - [Python] Efficient construction of arrays from non-pandas 1D NumPy arrays
+* [ARROW-905](https://issues.apache.org/jira/browse/ARROW-905) - [Docs] Add Dockerfile for reproducible documentation generation
+* [ARROW-911](https://issues.apache.org/jira/browse/ARROW-911) - [Python] Expand development.rst with build instructions without conda
+* [ARROW-942](https://issues.apache.org/jira/browse/ARROW-942) - Support integration testing on Python 2.7
+* [ARROW-950](https://issues.apache.org/jira/browse/ARROW-950) - [Site] Add Google Analytics tag
+* [ARROW-972](https://issues.apache.org/jira/browse/ARROW-972) - [Python] Add test cases and basic APIs for UnionArray
+* [ARROW-1032](https://issues.apache.org/jira/browse/ARROW-1032) - [JS] Support custom\_metadata
+* [ARROW-1047](https://issues.apache.org/jira/browse/ARROW-1047) - [Java] Add generalized stream writer and reader interfaces that are decoupled from IO / message framing
+* [ARROW-1087](https://issues.apache.org/jira/browse/ARROW-1087) - [Python] add get\_include to expose directory containing header files
+* [ARROW-1114](https://issues.apache.org/jira/browse/ARROW-1114) - [C++] Create Record Batch Builder class as a reusable and efficient way to transpose row-by-row data to columns
+* [ARROW-1134](https://issues.apache.org/jira/browse/ARROW-1134) - [C++] Allow C++/CLI projects to build with Arrow
+* [ARROW-1178](https://issues.apache.org/jira/browse/ARROW-1178) - [Python] Create alternative to Table.from\_pandas that yields a list of RecordBatch objects with a given chunk size
+* [ARROW-1226](https://issues.apache.org/jira/browse/ARROW-1226) - [C++] Improve / correct doxygen function documentation in arrow::ipc
+* [ARROW-1250](https://issues.apache.org/jira/browse/ARROW-1250) - [Python] Define API for user type checking of array types
+* [ARROW-1362](https://issues.apache.org/jira/browse/ARROW-1362) - [Integration] Validate vector type layout in IPC messages
+* [ARROW-1367](https://issues.apache.org/jira/browse/ARROW-1367) - [Website] Divide CHANGELOG issues by component and add subheaders
+* [ARROW-1369](https://issues.apache.org/jira/browse/ARROW-1369) - Support boolean types in the JavaScript Arrow reader library
+* [ARROW-1371](https://issues.apache.org/jira/browse/ARROW-1371) - [Website] Add "Powered By" page to the website
+* [ARROW-1455](https://issues.apache.org/jira/browse/ARROW-1455) - [Python] Add Dockerfile for validating Dask integration outside of usual CI
+* [ARROW-1471](https://issues.apache.org/jira/browse/ARROW-1471) - [JAVA] Document requirements and non-requirements for ValueVector updates
+* [ARROW-1472](https://issues.apache.org/jira/browse/ARROW-1472) - [JAVA] Design updated ValueVector Object Hierarchy
+* [ARROW-1473](https://issues.apache.org/jira/browse/ARROW-1473) - [JAVA] Create Prototype Code Hierarchy (Implementation Phase 1)
+* [ARROW-1474](https://issues.apache.org/jira/browse/ARROW-1474) - [JAVA] ValueVector hierarchy (Implementation Phase 2)
+* [ARROW-1476](https://issues.apache.org/jira/browse/ARROW-1476) - [JAVA] Implement final ValueVector updates
+* [ARROW-1482](https://issues.apache.org/jira/browse/ARROW-1482) - [C++] Implement casts between date32 and date64
+* [ARROW-1483](https://issues.apache.org/jira/browse/ARROW-1483) - [C++] Implement casts between time32 and time64
+* [ARROW-1484](https://issues.apache.org/jira/browse/ARROW-1484) - [C++] Implement (safe and unsafe) casts between timestamps and times of different units
+* [ARROW-1485](https://issues.apache.org/jira/browse/ARROW-1485) - [C++] Implement union-like data type for accommodating kernel arguments which may be scalars or arrays
+* [ARROW-1486](https://issues.apache.org/jira/browse/ARROW-1486) - [C++] Decide if arrow::RecordBatch needs to be copyable
+* [ARROW-1487](https://issues.apache.org/jira/browse/ARROW-1487) - [C++] Implement casts from List<A\> to List<B\>, where a cast function is defined from any A to B
+* [ARROW-1488](https://issues.apache.org/jira/browse/ARROW-1488) - [C++] Implement ArrayBuilder::Finish in terms of internal::ArrayData
+* [ARROW-1498](https://issues.apache.org/jira/browse/ARROW-1498) - [GitHub] Add CONTRIBUTING.md and ISSUE\_TEMPLATE.md
+* [ARROW-1503](https://issues.apache.org/jira/browse/ARROW-1503) - [Python] Add serialization callbacks for pandas objects in pyarrow.serialize
+* [ARROW-1522](https://issues.apache.org/jira/browse/ARROW-1522) - [C++] Support pyarrow.Buffer as built-in type in pyarrow.serialize
+* [ARROW-1523](https://issues.apache.org/jira/browse/ARROW-1523) - [C++] Add helper data struct with methods for reading a validity bitmap possibly having a non-zero offset
+* [ARROW-1524](https://issues.apache.org/jira/browse/ARROW-1524) - [C++] More graceful solution for handling non-zero offsets on inputs and outputs in compute library
+* [ARROW-1525](https://issues.apache.org/jira/browse/ARROW-1525) - [C++] Change functions in arrow/compare.h to not return Status
+* [ARROW-1526](https://issues.apache.org/jira/browse/ARROW-1526) - [Python] Unit tests to exercise code path in PARQUET-1100
+* [ARROW-1535](https://issues.apache.org/jira/browse/ARROW-1535) - [Python] Enable sdist source tarballs to build assuming that Arrow C++ libraries are available on the host system
+* [ARROW-1538](https://issues.apache.org/jira/browse/ARROW-1538) - [C++] Support Ubuntu 14.04 in .deb packaging automation
+* [ARROW-1539](https://issues.apache.org/jira/browse/ARROW-1539) - [C++] Remove functions deprecated as of 0.7.0 and prior releases
+* [ARROW-1556](https://issues.apache.org/jira/browse/ARROW-1556) - [C++] Incorporate AssertArraysEqual function from PARQUET-1100 patch
+* [ARROW-1559](https://issues.apache.org/jira/browse/ARROW-1559) - [C++] Kernel implementations for "unique" (compute distinct elements of array)
+* [ARROW-1573](https://issues.apache.org/jira/browse/ARROW-1573) - [C++] Implement stateful kernel function that uses DictionaryBuilder to compute dictionary indices
+* [ARROW-1575](https://issues.apache.org/jira/browse/ARROW-1575) - [Python] Add pyarrow.column factory function
+* [ARROW-1576](https://issues.apache.org/jira/browse/ARROW-1576) - [Python] Add utility functions (or a richer type hierarchy) for checking whether data type instances are members of various type classes
+* [ARROW-1577](https://issues.apache.org/jira/browse/ARROW-1577) - [JS] Package release script for NPM modules
+* [ARROW-1588](https://issues.apache.org/jira/browse/ARROW-1588) - [C++/Format] Harden Decimal Format
+* [ARROW-1593](https://issues.apache.org/jira/browse/ARROW-1593) - [PYTHON] serialize\_pandas should pass through the preserve\_index keyword
+* [ARROW-1594](https://issues.apache.org/jira/browse/ARROW-1594) - [Python] Enable multi-threaded conversions in Table.from\_pandas
+* [ARROW-1600](https://issues.apache.org/jira/browse/ARROW-1600) - [C++] Zero-copy Buffer constructor from std::string
+* [ARROW-1602](https://issues.apache.org/jira/browse/ARROW-1602) - [C++] Add IsValid/IsNotNull method to arrow::Array
+* [ARROW-1603](https://issues.apache.org/jira/browse/ARROW-1603) - [C++] Add BinaryArray method to get a value as a std::string
+* [ARROW-1604](https://issues.apache.org/jira/browse/ARROW-1604) - [Python] Support common type aliases in cast(...) and various type= arguments
+* [ARROW-1605](https://issues.apache.org/jira/browse/ARROW-1605) - [Python] pyarrow.array should be able to yield smaller integer types without an explicit cast
+* [ARROW-1607](https://issues.apache.org/jira/browse/ARROW-1607) - [C++] Implement DictionaryBuilder for Decimals
+* [ARROW-1613](https://issues.apache.org/jira/browse/ARROW-1613) - [Java] ArrowReader should not close the input ReadChannel
+* [ARROW-1616](https://issues.apache.org/jira/browse/ARROW-1616) - [Python] Add "write" method to RecordBatchStreamWriter that dispatches to write\_table/write\_batch as appropriate
+* [ARROW-1626](https://issues.apache.org/jira/browse/ARROW-1626) - Add make targets to run the inter-procedural static analysis tool called "infer".
+* [ARROW-1627](https://issues.apache.org/jira/browse/ARROW-1627) - [JAVA] Reduce heap usage (Phase 2) - memory footprint in AllocationManager.BufferLedger
+* [ARROW-1630](https://issues.apache.org/jira/browse/ARROW-1630) - [Serialization] Support Python datetime objects
+* [ARROW-1631](https://issues.apache.org/jira/browse/ARROW-1631) - [C++] Add GRPC to ThirdpartyToolchain.cmake
+* [ARROW-1635](https://issues.apache.org/jira/browse/ARROW-1635) - Add release management guide for PMCs
+* [ARROW-1637](https://issues.apache.org/jira/browse/ARROW-1637) - [C++] IPC round-trip for null type
+* [ARROW-1641](https://issues.apache.org/jira/browse/ARROW-1641) - [C++] Do not include <mutex\> in public headers
+* [ARROW-1648](https://issues.apache.org/jira/browse/ARROW-1648) - C++: Add cast from Dictionary[NullType] to NullType
+* [ARROW-1649](https://issues.apache.org/jira/browse/ARROW-1649) - C++: Print number of nulls in PrettyPrint for NullArray
+* [ARROW-1651](https://issues.apache.org/jira/browse/ARROW-1651) - [JS] Lazy row accessor in Table
+* [ARROW-1652](https://issues.apache.org/jira/browse/ARROW-1652) - [JS] Separate Vector into BatchVector and CompositeVector
+* [ARROW-1654](https://issues.apache.org/jira/browse/ARROW-1654) - [Python] pa.DataType cannot be pickled
+* [ARROW-1662](https://issues.apache.org/jira/browse/ARROW-1662) - Move OSX Dependency management into brew bundle Brewfiles
+* [ARROW-1665](https://issues.apache.org/jira/browse/ARROW-1665) - [Serialization] Support more custom datatypes in the default serialization context
+* [ARROW-1666](https://issues.apache.org/jira/browse/ARROW-1666) - [GLib] Enable gtk-doc on Travis CI Mac environment
+* [ARROW-1667](https://issues.apache.org/jira/browse/ARROW-1667) - [GLib] Support Meson
+* [ARROW-1671](https://issues.apache.org/jira/browse/ARROW-1671) - [C++] Change arrow::MakeArray to not return Status
+* [ARROW-1675](https://issues.apache.org/jira/browse/ARROW-1675) - [Python] Use RecordBatch.from\_pandas in FeatherWriter.write
+* [ARROW-1677](https://issues.apache.org/jira/browse/ARROW-1677) - [Blog] Add blog post on Ray and Arrow Python serialization
+* [ARROW-1679](https://issues.apache.org/jira/browse/ARROW-1679) - [GLib] Add garrow\_record\_batch\_reader\_read\_next()
+* [ARROW-1683](https://issues.apache.org/jira/browse/ARROW-1683) - [Python] Restore "TimestampType" to pyarrow namespace
+* [ARROW-1684](https://issues.apache.org/jira/browse/ARROW-1684) - [Python] Simplify user API for reading nested Parquet columns
+* [ARROW-1685](https://issues.apache.org/jira/browse/ARROW-1685) - [GLib] Add GArrowTableReader
+* [ARROW-1687](https://issues.apache.org/jira/browse/ARROW-1687) - [Python] Expose UnionArray to pyarrow
+* [ARROW-1689](https://issues.apache.org/jira/browse/ARROW-1689) - [Python] Categorical Indices Should Be Zero-Copy
+* [ARROW-1690](https://issues.apache.org/jira/browse/ARROW-1690) - [GLib] Add garrow\_array\_is\_valid()
+* [ARROW-1691](https://issues.apache.org/jira/browse/ARROW-1691) - [Java] Conform Java Decimal type implementation to format decisions in ARROW-1588
+* [ARROW-1697](https://issues.apache.org/jira/browse/ARROW-1697) - [GitHub] Add ISSUE\_TEMPLATE.md
+* [ARROW-1701](https://issues.apache.org/jira/browse/ARROW-1701) - [Serialization] Support zero copy PyTorch Tensor serialization
+* [ARROW-1702](https://issues.apache.org/jira/browse/ARROW-1702) - Update jemalloc in manylinux1 build
+* [ARROW-1703](https://issues.apache.org/jira/browse/ARROW-1703) - [C++] Vendor exact version of jemalloc we depend on
+* [ARROW-1707](https://issues.apache.org/jira/browse/ARROW-1707) - Update dev README after movement to GitBox
+* [ARROW-1710](https://issues.apache.org/jira/browse/ARROW-1710) - [Java] Remove non-nullable vectors in new vector class hierarchy
+* [ARROW-1716](https://issues.apache.org/jira/browse/ARROW-1716) - [Format/JSON] Use string integer value for Decimals in JSON
+* [ARROW-1717](https://issues.apache.org/jira/browse/ARROW-1717) - [Java] Remove public static helper method in vector classes for JSONReader/Writer
+* [ARROW-1718](https://issues.apache.org/jira/browse/ARROW-1718) - [Python] Implement casts from timestamp to date32/date64 and support in Array.from\_pandas
+* [ARROW-1719](https://issues.apache.org/jira/browse/ARROW-1719) - [Java] Remove accessor/mutator
+* [ARROW-1721](https://issues.apache.org/jira/browse/ARROW-1721) - [Python] Support null mask in places where it isn't supported in numpy\_to\_arrow.cc
+* [ARROW-1724](https://issues.apache.org/jira/browse/ARROW-1724) - [Packaging] Support Ubuntu 17.10
+* [ARROW-1725](https://issues.apache.org/jira/browse/ARROW-1725) - [Packaging] Upload .deb for Ubuntu 17.10
+* [ARROW-1726](https://issues.apache.org/jira/browse/ARROW-1726) - [GLib] Add setup description to verify C GLib build
+* [ARROW-1727](https://issues.apache.org/jira/browse/ARROW-1727) - [Format] Expand Arrow streaming format to permit new dictionaries and deltas / additions to existing dictionaries
+* [ARROW-1728](https://issues.apache.org/jira/browse/ARROW-1728) - [C++] Run clang-format checks in Travis CI
+* [ARROW-1734](https://issues.apache.org/jira/browse/ARROW-1734) - C++/Python: Add cast function on Column-level
+* [ARROW-1736](https://issues.apache.org/jira/browse/ARROW-1736) - [GLib] Add GArrowCastOptions:allow-time-truncate
+* [ARROW-1737](https://issues.apache.org/jira/browse/ARROW-1737) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE
+* [ARROW-1740](https://issues.apache.org/jira/browse/ARROW-1740) - C++: Kernel to get unique values of an Array/Column
+* [ARROW-1746](https://issues.apache.org/jira/browse/ARROW-1746) - [Python] Add build dependencies for Arch Linux
+* [ARROW-1747](https://issues.apache.org/jira/browse/ARROW-1747) - [C++] Don't export symbols of statically linked libraries
+* [ARROW-1748](https://issues.apache.org/jira/browse/ARROW-1748) - [GLib] Add GArrowRecordBatchBuilder
+* [ARROW-1750](https://issues.apache.org/jira/browse/ARROW-1750) - [C++] Remove the need for arrow/util/random.h
+* [ARROW-1752](https://issues.apache.org/jira/browse/ARROW-1752) - [Packaging] Add GPU packages for Debian and Ubuntu
+* [ARROW-1753](https://issues.apache.org/jira/browse/ARROW-1753) - [Python] Provide for matching subclasses with register\_type in serialization context
+* [ARROW-1755](https://issues.apache.org/jira/browse/ARROW-1755) - [C++] Add build options for MSVC to use static runtime libraries
+* [ARROW-1758](https://issues.apache.org/jira/browse/ARROW-1758) - [Python] Remove pickle=True option for object serialization
+* [ARROW-1759](https://issues.apache.org/jira/browse/ARROW-1759) - [Python] Add function / property to get implied Arrow schema from Parquet file
+* [ARROW-1763](https://issues.apache.org/jira/browse/ARROW-1763) - [Python] DataType should be hashable
+* [ARROW-1765](https://issues.apache.org/jira/browse/ARROW-1765) - [Doc] Use dependencies from conda in C++ docker build
+* [ARROW-1767](https://issues.apache.org/jira/browse/ARROW-1767) - [C++] Support file reads and writes over 2GB on Windows
+* [ARROW-1772](https://issues.apache.org/jira/browse/ARROW-1772) - [C++] Add public-api-test module in style of parquet-cpp
+* [ARROW-1773](https://issues.apache.org/jira/browse/ARROW-1773) - [C++] Add casts from date/time types to compatible signed integers
+* [ARROW-1775](https://issues.apache.org/jira/browse/ARROW-1775) - Ability to abort created but unsealed Plasma objects
+* [ARROW-1777](https://issues.apache.org/jira/browse/ARROW-1777) - [C++] Add static ctor ArrayData::Make for nicer syntax in places
+* [ARROW-1779](https://issues.apache.org/jira/browse/ARROW-1779) - [Java] Integration test breaks without zeroing out validity vectors
+* [ARROW-1782](https://issues.apache.org/jira/browse/ARROW-1782) - [Python] Expose compressors as pyarrow.compress, pyarrow.decompress
+* [ARROW-1783](https://issues.apache.org/jira/browse/ARROW-1783) - [Python] Convert SerializedPyObject to/from sequence of component buffers with minimal memory allocation / copying
+* [ARROW-1784](https://issues.apache.org/jira/browse/ARROW-1784) - [Python] Read and write pandas.DataFrame in pyarrow.serialize by decomposing the BlockManager rather than coercing to Arrow format
+* [ARROW-1785](https://issues.apache.org/jira/browse/ARROW-1785) - [Format/C++/Java] Remove VectorLayout metadata from Flatbuffers metadata
+* [ARROW-1787](https://issues.apache.org/jira/browse/ARROW-1787) - [Python] Support reading parquet files into DataFrames in a backward compatible way
+* [ARROW-1794](https://issues.apache.org/jira/browse/ARROW-1794) - [C++/Python] Rename DecimalArray to Decimal128Array
+* [ARROW-1795](https://issues.apache.org/jira/browse/ARROW-1795) - [Plasma C++] Change eviction policy
+* [ARROW-1801](https://issues.apache.org/jira/browse/ARROW-1801) - [Docs] Update install instructions to use red-data-tools repos
+* [ARROW-1802](https://issues.apache.org/jira/browse/ARROW-1802) - [GLib] Add Arrow GPU support
+* [ARROW-1806](https://issues.apache.org/jira/browse/ARROW-1806) - [GLib] Add garrow\_record\_batch\_writer\_write\_table()
+* [ARROW-1808](https://issues.apache.org/jira/browse/ARROW-1808) - [C++] Make RecordBatch interface virtual to permit record batches that lazy-materialize columns
+* [ARROW-1809](https://issues.apache.org/jira/browse/ARROW-1809) - [GLib] Use .xml instead of .sgml for GTK-Doc main file
+* [ARROW-1810](https://issues.apache.org/jira/browse/ARROW-1810) - [Plasma] Remove test shell scripts
+* [ARROW-1816](https://issues.apache.org/jira/browse/ARROW-1816) - [Java] Resolve new vector classes structure for timestamp, date and maybe interval
+* [ARROW-1817](https://issues.apache.org/jira/browse/ARROW-1817) - Configure JsonFileReader to read NaN for floats
+* [ARROW-1818](https://issues.apache.org/jira/browse/ARROW-1818) - Examine Java Dependencies
+* [ARROW-1819](https://issues.apache.org/jira/browse/ARROW-1819) - [Java] Remove legacy vector classes
+* [ARROW-1820](https://issues.apache.org/jira/browse/ARROW-1820) - [C++] Create arrow\_compute shared library subcomponent
+* [ARROW-1826](https://issues.apache.org/jira/browse/ARROW-1826) - [JAVA] Avoid branching at cell level (copyFrom)
+* [ARROW-1827](https://issues.apache.org/jira/browse/ARROW-1827) - [Java] Add checkstyle config file and header file
+* [ARROW-1828](https://issues.apache.org/jira/browse/ARROW-1828) - [C++] Implement hash kernel specialization for BooleanType
+* [ARROW-1834](https://issues.apache.org/jira/browse/ARROW-1834) - [Doc] Build documentation in separate build folders
+* [ARROW-1838](https://issues.apache.org/jira/browse/ARROW-1838) - [C++] Use compute::Datum uniformly for input argument to kernels
+* [ARROW-1841](https://issues.apache.org/jira/browse/ARROW-1841) - [JS] Update text-encoding-utf-8 and tslib for node ESModules support
+* [ARROW-1844](https://issues.apache.org/jira/browse/ARROW-1844) - [C++] Basic benchmark suite for hash kernels
+* [ARROW-1849](https://issues.apache.org/jira/browse/ARROW-1849) - [GLib] Add input checks to GArrowRecordBatch
+* [ARROW-1850](https://issues.apache.org/jira/browse/ARROW-1850) - [C++] Use const void\* in Writable::Write instead of const uint8\_t\*
+* [ARROW-1854](https://issues.apache.org/jira/browse/ARROW-1854) - [Python] Improve performance of serializing object dtype ndarrays
+* [ARROW-1855](https://issues.apache.org/jira/browse/ARROW-1855) - [GLib] Add workaround for build failure on macOS
+* [ARROW-1857](https://issues.apache.org/jira/browse/ARROW-1857) - [Python] Add switch for boost linkage with static parquet in wheels
+* [ARROW-1859](https://issues.apache.org/jira/browse/ARROW-1859) - [GLib] Add GArrowDictionaryDataType
+* [ARROW-1862](https://issues.apache.org/jira/browse/ARROW-1862) - [GLib] Add GArrowDictionaryArray
+* [ARROW-1864](https://issues.apache.org/jira/browse/ARROW-1864) - [Java] Upgrade Netty to 4.1.x
+* [ARROW-1866](https://issues.apache.org/jira/browse/ARROW-1866) - [Java] Combine MapVector and NonNullableMapVector Classes
+* [ARROW-1867](https://issues.apache.org/jira/browse/ARROW-1867) - [Java] Add BitVector APIs from old vector class
+* [ARROW-1874](https://issues.apache.org/jira/browse/ARROW-1874) - [GLib] Add garrow\_array\_unique()
+* [ARROW-1878](https://issues.apache.org/jira/browse/ARROW-1878) - [GLib] Add garrow\_array\_dictionary\_encode()
+* [ARROW-1884](https://issues.apache.org/jira/browse/ARROW-1884) - [C++] Make JsonReader/JsonWriter classes internal APIs
+* [ARROW-1885](https://issues.apache.org/jira/browse/ARROW-1885) - [Java] Restore previous MapVector class names
+* [ARROW-1901](https://issues.apache.org/jira/browse/ARROW-1901) - [Python] Support recursive mkdir for DaskFilesystem
+* [ARROW-1902](https://issues.apache.org/jira/browse/ARROW-1902) - [Python] Remove mkdir race condition from write\_to\_dataset
+* [ARROW-1905](https://issues.apache.org/jira/browse/ARROW-1905) - [Python] Add more functions for checking exact types in pyarrow.types
+* [ARROW-1911](https://issues.apache.org/jira/browse/ARROW-1911) - Add Graphistry to Arrow JS proof points
+* [ARROW-1922](https://issues.apache.org/jira/browse/ARROW-1922) - Blog post on recent improvements/changes in JAVA Vectors
+* [ARROW-1932](https://issues.apache.org/jira/browse/ARROW-1932) - [Website] Update site for 0.8.0
+* [ARROW-1934](https://issues.apache.org/jira/browse/ARROW-1934) - [Website] Blog post summarizing highlights of 0.8.0 release
+
+
+
+# Apache Arrow 0.7.1 (2017-10-01)
+
+## New Features and Improvements
+
+* [ARROW-559](https://issues.apache.org/jira/browse/ARROW-559) - Script to easily verify release in all languages
+* [ARROW-1464](https://issues.apache.org/jira/browse/ARROW-1464) - [GLib] Documentation for troubleshooting of build errors
+* [ARROW-1537](https://issues.apache.org/jira/browse/ARROW-1537) - [C++] Support building with full path install\_name on macOS
+* [ARROW-1546](https://issues.apache.org/jira/browse/ARROW-1546) - [GLib] Support GLib 2.40 again
+* [ARROW-1548](https://issues.apache.org/jira/browse/ARROW-1548) - [GLib] Support build append in builder
+* [ARROW-1578](https://issues.apache.org/jira/browse/ARROW-1578) - [C++/Python] Run lint checks in Travis CI to fail for linting issues as early as possible
+* [ARROW-1592](https://issues.apache.org/jira/browse/ARROW-1592) - [GLib] Add GArrowUIntArrayBuilder
+* [ARROW-1608](https://issues.apache.org/jira/browse/ARROW-1608) - Support Release verification script on macOS
+* [ARROW-1612](https://issues.apache.org/jira/browse/ARROW-1612) - [GLib] Add how to install on macOS to README
+* [ARROW-1618](https://issues.apache.org/jira/browse/ARROW-1618) - [JAVA] Reduce Heap Usage(Phase 1): move release listener logic to Allocation Manager
+* [ARROW-1634](https://issues.apache.org/jira/browse/ARROW-1634) - [Website] Updates for 0.7.1 release
+
+
+## Bug Fixes
+
+* [ARROW-1497](https://issues.apache.org/jira/browse/ARROW-1497) - [Java] JsonFileReader doesn't set value count for some vectors
+* [ARROW-1500](https://issues.apache.org/jira/browse/ARROW-1500) - [C++] Result of ftruncate ignored in MemoryMappedFile::Create
+* [ARROW-1529](https://issues.apache.org/jira/browse/ARROW-1529) - [GLib] Fix failure on macOS on Travis CI
+* [ARROW-1533](https://issues.apache.org/jira/browse/ARROW-1533) - [JAVA] realloc should consider the existing buffer capacity for computing target memory requirement
+* [ARROW-1536](https://issues.apache.org/jira/browse/ARROW-1536) - [C++] Do not transitively depend on libboost\_system
+* [ARROW-1542](https://issues.apache.org/jira/browse/ARROW-1542) - [C++] Windows release verification script should not modify conda environment
+* [ARROW-1544](https://issues.apache.org/jira/browse/ARROW-1544) - [JS] Export Vector type definitions
+* [ARROW-1545](https://issues.apache.org/jira/browse/ARROW-1545) - Int64Builder should not need int64() as arg
+* [ARROW-1547](https://issues.apache.org/jira/browse/ARROW-1547) - [JAVA] Fix 8x memory over-allocation in BitVector
+* [ARROW-1550](https://issues.apache.org/jira/browse/ARROW-1550) - [Python] Fix flaky test on Windows
+* [ARROW-1553](https://issues.apache.org/jira/browse/ARROW-1553) - [JAVA] Implement setInitialCapacity for MapWriter and pass on this capacity during lazy creation of child vectors
+* [ARROW-1554](https://issues.apache.org/jira/browse/ARROW-1554) - [Python] Document that pip wheels depend on MSVC14 runtime
+* [ARROW-1557](https://issues.apache.org/jira/browse/ARROW-1557) - [PYTHON] pyarrow.Table.from\_arrays doesn't validate names length
+* [ARROW-1590](https://issues.apache.org/jira/browse/ARROW-1590) - Flow TS Table method generics
+* [ARROW-1591](https://issues.apache.org/jira/browse/ARROW-1591) - C++: Xcode 9 is not correctly detected
+* [ARROW-1595](https://issues.apache.org/jira/browse/ARROW-1595) - [Python] Fix package dependency issues causing build failures
+* [ARROW-1598](https://issues.apache.org/jira/browse/ARROW-1598) - [C++/Tutorials] Mismatch between code comment and actual code about Object ID
+* [ARROW-1601](https://issues.apache.org/jira/browse/ARROW-1601) - [C++] READ\_NEXT\_BITSET reads one byte past the last byte on last iteration
+* [ARROW-1606](https://issues.apache.org/jira/browse/ARROW-1606) - Python: Windows wheels don't include .lib files.
+* [ARROW-1610](https://issues.apache.org/jira/browse/ARROW-1610) - C++/Python: Only call python-prefix if the default PYTHON\_LIBRARY is not present
+* [ARROW-1611](https://issues.apache.org/jira/browse/ARROW-1611) - Crash in BitmapReader when length is zero
+* [ARROW-1619](https://issues.apache.org/jira/browse/ARROW-1619) - [Java] Correctly set "lastSet" for variable vectors in JsonReader
+
+
+
+# Apache Arrow 0.7.0 (2017-09-17)
+
+## Bug Fixes
+
+* [ARROW-12](https://issues.apache.org/jira/browse/ARROW-12) - Get Github activity mirrored to JIRA
+* [ARROW-248](https://issues.apache.org/jira/browse/ARROW-248) - UnionVector.close() should call clear()
+* [ARROW-269](https://issues.apache.org/jira/browse/ARROW-269) - UnionVector getBuffers method does not include typevector
+* [ARROW-407](https://issues.apache.org/jira/browse/ARROW-407) - BitVector.copyFromSafe() should re-allocate if necessary instead of returning false
+* [ARROW-801](https://issues.apache.org/jira/browse/ARROW-801) - [JAVA] Provide direct access to underlying buffer memory addresses in a consistent way without generating garbage or a large number of indirections
+* [ARROW-1302](https://issues.apache.org/jira/browse/ARROW-1302) - C++: ${MAKE} variable sometimes not set on older macOS installations
+* [ARROW-1332](https://issues.apache.org/jira/browse/ARROW-1332) - [Packaging] Building Windows wheels in Apache repos
+* [ARROW-1354](https://issues.apache.org/jira/browse/ARROW-1354) - [Python] Segfault in Table.from\_pandas with Mixed-Type Categories
+* [ARROW-1357](https://issues.apache.org/jira/browse/ARROW-1357) - [Python] Data corruption in reading multi-file parquet dataset
+* [ARROW-1363](https://issues.apache.org/jira/browse/ARROW-1363) - [C++] IPC writer sends buffer layout for dictionary rather than indices
+* [ARROW-1365](https://issues.apache.org/jira/browse/ARROW-1365) - [Python] Remove usage of removed jemalloc\_memory\_pool in Python API docs
+* [ARROW-1373](https://issues.apache.org/jira/browse/ARROW-1373) - [Java] Implement get<type\>Buffer() methods at the ValueVector interface
+* [ARROW-1375](https://issues.apache.org/jira/browse/ARROW-1375) - [C++] Visual Studio 2017 Appveyor builds failing
+* [ARROW-1378](https://issues.apache.org/jira/browse/ARROW-1378) - [Python] whl is not a supported wheel on this platform on Debian/Jessie
+* [ARROW-1379](https://issues.apache.org/jira/browse/ARROW-1379) - [Java] maven dependency issues - both unused and undeclared
+* [ARROW-1390](https://issues.apache.org/jira/browse/ARROW-1390) - [Python] Extend tests for python serialization
+* [ARROW-1407](https://issues.apache.org/jira/browse/ARROW-1407) - Dictionaries can only hold a maximum of 4096 indices
+* [ARROW-1411](https://issues.apache.org/jira/browse/ARROW-1411) - [Python] Booleans in Float Columns cause Segfault
+* [ARROW-1414](https://issues.apache.org/jira/browse/ARROW-1414) - [GLib] Cast after status check
+* [ARROW-1421](https://issues.apache.org/jira/browse/ARROW-1421) - [Python] pyarrow.serialize cannot serialize a Python dict input
+* [ARROW-1426](https://issues.apache.org/jira/browse/ARROW-1426) - [Website] The title element of the top page is empty
+* [ARROW-1429](https://issues.apache.org/jira/browse/ARROW-1429) - [Python] Error loading parquet file with \_metadata from HDFS
+* [ARROW-1430](https://issues.apache.org/jira/browse/ARROW-1430) - [Python] flake8 warnings are not failing CI builds
+* [ARROW-1434](https://issues.apache.org/jira/browse/ARROW-1434) - [C++/Python] pyarrow.Array.from\_pandas does not support datetime64[D] arrays
+* [ARROW-1435](https://issues.apache.org/jira/browse/ARROW-1435) - [Python] PyArrow not propagating timezone information from Parquet to Python
+* [ARROW-1437](https://issues.apache.org/jira/browse/ARROW-1437) - [Python] pa.Array.from\_pandas segfaults when given a mixed-type array
+* [ARROW-1439](https://issues.apache.org/jira/browse/ARROW-1439) - [Packaging] Automate updating RPM in RPM build
+* [ARROW-1443](https://issues.apache.org/jira/browse/ARROW-1443) - [Java] Bug on ArrowBuf.setBytes with unsliced ByteBuffers
+* [ARROW-1444](https://issues.apache.org/jira/browse/ARROW-1444) - [JAVA] BitVector.splitAndTransfer copies last byte incorrectly
+* [ARROW-1446](https://issues.apache.org/jira/browse/ARROW-1446) - Python: Writing more than 2^31 rows from a pandas DataFrame causes row count overflow error
+* [ARROW-1450](https://issues.apache.org/jira/browse/ARROW-1450) - [Python] Raise proper error if custom serialization handler fails
+* [ARROW-1452](https://issues.apache.org/jira/browse/ARROW-1452) - [C++] Make UNUSED macro name more unique so it does not conflict with thirdparty projects
+* [ARROW-1453](https://issues.apache.org/jira/browse/ARROW-1453) - [Python] Implement WriteTensor for non-contiguous tensors
+* [ARROW-1457](https://issues.apache.org/jira/browse/ARROW-1457) - [C++] Optimize strided WriteTensor
+* [ARROW-1458](https://issues.apache.org/jira/browse/ARROW-1458) - [Python] Document that HadoopFileSystem.mkdir with create\_parents=False has no effect
+* [ARROW-1459](https://issues.apache.org/jira/browse/ARROW-1459) - [Python] PyArrow fails to load partitioned parquet files with non-primitive types
+* [ARROW-1461](https://issues.apache.org/jira/browse/ARROW-1461) - [C++] Disable builds using LLVM apt packages temporarily
+* [ARROW-1467](https://issues.apache.org/jira/browse/ARROW-1467) - [JAVA] Fix reset() and allocateNew() in Nullable Value Vectors template
+* [ARROW-1469](https://issues.apache.org/jira/browse/ARROW-1469) - Segfault when serializing a Pandas series with mixed object types
+* [ARROW-1490](https://issues.apache.org/jira/browse/ARROW-1490) - [Java] Allow Travis CI failures for JDK9 for now
+* [ARROW-1493](https://issues.apache.org/jira/browse/ARROW-1493) - [C++] Flush the output stream at the end of each PrettyPrint function
+* [ARROW-1495](https://issues.apache.org/jira/browse/ARROW-1495) - [C++] Store shared\_ptr to boxed arrays in RecordBatch
+* [ARROW-1507](https://issues.apache.org/jira/browse/ARROW-1507) - [C++] arrow/compute/api.h can't be used without arrow/array.h
+* [ARROW-1512](https://issues.apache.org/jira/browse/ARROW-1512) - [Docs] NumericArray has no member named 'raw\_data'
+* [ARROW-1514](https://issues.apache.org/jira/browse/ARROW-1514) - [C++] Fix a typo in document
+* [ARROW-1527](https://issues.apache.org/jira/browse/ARROW-1527) - Fix Travis JDK9 build
+* [ARROW-1531](https://issues.apache.org/jira/browse/ARROW-1531) - [C++] Return ToBytes by value from Decimal128
+* [ARROW-1532](https://issues.apache.org/jira/browse/ARROW-1532) - [Python] Referencing an Empty Schema causes a SegFault
+
+
+## New Features and Improvements
+
+* [ARROW-34](https://issues.apache.org/jira/browse/ARROW-34) - C++: establish a basic function evaluation model
+* [ARROW-229](https://issues.apache.org/jira/browse/ARROW-229) - [C++] Implement safe casts for primitive types
+* [ARROW-592](https://issues.apache.org/jira/browse/ARROW-592) - [C++] Provide .deb and .rpm packages
+* [ARROW-594](https://issues.apache.org/jira/browse/ARROW-594) - [Python] Provide interface to write pyarrow.Table to a stream
+* [ARROW-695](https://issues.apache.org/jira/browse/ARROW-695) - Integration tests for Decimal types
+* [ARROW-696](https://issues.apache.org/jira/browse/ARROW-696) - [C++] Add JSON read/write support for decimals for integration tests
+* [ARROW-759](https://issues.apache.org/jira/browse/ARROW-759) - [Python] Implement a transient list serialization function that can handle a mix of scalars, lists, ndarrays, dicts
+* [ARROW-786](https://issues.apache.org/jira/browse/ARROW-786) - [Format] In-memory format for 128-bit Decimals, handling of sign bit
+* [ARROW-837](https://issues.apache.org/jira/browse/ARROW-837) - [Python] Expose buffer allocation, FixedSizeBufferWriter
+* [ARROW-941](https://issues.apache.org/jira/browse/ARROW-941) - [Docs] Improve "cold start" integration testing instructions
+* [ARROW-989](https://issues.apache.org/jira/browse/ARROW-989) - [Python] Write pyarrow.Table to FileWriter or StreamWriter
+* [ARROW-1156](https://issues.apache.org/jira/browse/ARROW-1156) - [Python] pyarrow.Array.from\_pandas should take a type parameter
+* [ARROW-1238](https://issues.apache.org/jira/browse/ARROW-1238) - [Java] Add JSON read/write support for decimals for integration tests
+* [ARROW-1286](https://issues.apache.org/jira/browse/ARROW-1286) - PYTHON: support Categorical serialization to/from parquet
+* [ARROW-1307](https://issues.apache.org/jira/browse/ARROW-1307) - [Python] Add pandas serialization section + Feather API to Sphinx docs
+* [ARROW-1317](https://issues.apache.org/jira/browse/ARROW-1317) - [Python] Add function to set Hadoop CLASSPATH
+* [ARROW-1331](https://issues.apache.org/jira/browse/ARROW-1331) - [Java] Refactor tests
+* [ARROW-1339](https://issues.apache.org/jira/browse/ARROW-1339) - [C++] Use boost::filesystem for handling of platform-specific file path encodings
+* [ARROW-1344](https://issues.apache.org/jira/browse/ARROW-1344) - [C++] Calling BufferOutputStream::Write after calling Finish crashes
+* [ARROW-1348](https://issues.apache.org/jira/browse/ARROW-1348) - [C++/Python] Add release verification script for Windows
+* [ARROW-1351](https://issues.apache.org/jira/browse/ARROW-1351) - Automate updating CHANGELOG.md as part of release scripts
+* [ARROW-1352](https://issues.apache.org/jira/browse/ARROW-1352) - [Integration] Improve print formatting for producer, consumer line
+* [ARROW-1355](https://issues.apache.org/jira/browse/ARROW-1355) - Make arrow buildable with java9
+* [ARROW-1356](https://issues.apache.org/jira/browse/ARROW-1356) - [Website] Add new committers
+* [ARROW-1358](https://issues.apache.org/jira/browse/ARROW-1358) - Update source release scripts to account for new SHA checksum policy
+* [ARROW-1359](https://issues.apache.org/jira/browse/ARROW-1359) - [Python] Add Parquet writer option to normalize field names for use in Spark
+* [ARROW-1364](https://issues.apache.org/jira/browse/ARROW-1364) - [C++] IPC reader and writer specialized for GPU device memory
+* [ARROW-1366](https://issues.apache.org/jira/browse/ARROW-1366) - [Python] Add instructions for starting the Plasma store when installing pyarrow from wheels
+* [ARROW-1372](https://issues.apache.org/jira/browse/ARROW-1372) - [Plasma] Support for storing data in huge pages
+* [ARROW-1376](https://issues.apache.org/jira/browse/ARROW-1376) - [C++] RecordBatchStreamReader::Open API is inconsistent with writer
+* [ARROW-1377](https://issues.apache.org/jira/browse/ARROW-1377) - [Python] Add function to assist with benchmarking Parquet scan performance
+* [ARROW-1381](https://issues.apache.org/jira/browse/ARROW-1381) - [Python] Improve performance of SerializedPyObject.to\_buffer
+* [ARROW-1383](https://issues.apache.org/jira/browse/ARROW-1383) - [C++] Support std::vector<bool\> in builder vector appends
+* [ARROW-1384](https://issues.apache.org/jira/browse/ARROW-1384) - [C++] Add convenience function for serializing a record batch to an IPC message
+* [ARROW-1386](https://issues.apache.org/jira/browse/ARROW-1386) - [C++] Unpin CMake version in MSVC build toolchain
+* [ARROW-1387](https://issues.apache.org/jira/browse/ARROW-1387) - [C++] Set up GPU leaf library build toolchain
+* [ARROW-1392](https://issues.apache.org/jira/browse/ARROW-1392) - [C++] Implement reader and writer IO interfaces for GPU buffers
+* [ARROW-1395](https://issues.apache.org/jira/browse/ARROW-1395) - [C++] Remove APIs deprecated as of 0.5.0 and later versions
+* [ARROW-1396](https://issues.apache.org/jira/browse/ARROW-1396) - [C++] Add PrettyPrint function for Schemas, which also outputs any dictionaries
+* [ARROW-1397](https://issues.apache.org/jira/browse/ARROW-1397) - [Packaging] Use Docker instead of Vagrant
+* [ARROW-1399](https://issues.apache.org/jira/browse/ARROW-1399) - [C++] Add CUDA build version in a public header to help prevent ABI conflicts
+* [ARROW-1400](https://issues.apache.org/jira/browse/ARROW-1400) - [Python] Ability to create partitions when writing to Parquet
+* [ARROW-1401](https://issues.apache.org/jira/browse/ARROW-1401) - [C++] Add extra debugging context to failures in RETURN\_NOT\_OK in debug builds
+* [ARROW-1402](https://issues.apache.org/jira/browse/ARROW-1402) - [C++] Possibly deprecate public APIs that use MutableBuffer
+* [ARROW-1404](https://issues.apache.org/jira/browse/ARROW-1404) - [Packaging] Build .deb and .rpm on Travis CI
+* [ARROW-1405](https://issues.apache.org/jira/browse/ARROW-1405) - [Python] Add logging option for verbose memory allocations
+* [ARROW-1406](https://issues.apache.org/jira/browse/ARROW-1406) - [Python] Harden user API for generating serialized schema and record batch messages as memoryview-compatible objects
+* [ARROW-1408](https://issues.apache.org/jira/browse/ARROW-1408) - [C++] Refactor and make IPC read / write APIs more consistent, add appropriate deprecations
+* [ARROW-1410](https://issues.apache.org/jira/browse/ARROW-1410) - Plasma object store occasionally pauses for a long time
+* [ARROW-1412](https://issues.apache.org/jira/browse/ARROW-1412) - [Plasma] Add higher level API for putting and getting Python objects
+* [ARROW-1413](https://issues.apache.org/jira/browse/ARROW-1413) - [C++] Add include-what-you-use configuration
+* [ARROW-1415](https://issues.apache.org/jira/browse/ARROW-1415) - [GLib] Support date32 and date64
+* [ARROW-1416](https://issues.apache.org/jira/browse/ARROW-1416) - [Format] Clarify example array in memory layout documentation
+* [ARROW-1417](https://issues.apache.org/jira/browse/ARROW-1417) - [Python] Allow more generic filesystem objects to be passed to ParquetDataset
+* [ARROW-1418](https://issues.apache.org/jira/browse/ARROW-1418) - [Python] Introduce SerializationContext to register custom serialization callbacks
+* [ARROW-1419](https://issues.apache.org/jira/browse/ARROW-1419) - [GLib] Suppress sign-conversion warning on Clang
+* [ARROW-1427](https://issues.apache.org/jira/browse/ARROW-1427) - [GLib] Add a link to readme of Arrow GLib
+* [ARROW-1428](https://issues.apache.org/jira/browse/ARROW-1428) - [C++] Append steps to clone source code to README.md
+* [ARROW-1432](https://issues.apache.org/jira/browse/ARROW-1432) - [C++] Build bundled jemalloc functions with private prefix
+* [ARROW-1433](https://issues.apache.org/jira/browse/ARROW-1433) - [C++] Simplify implementation of Array::Slice
+* [ARROW-1438](https://issues.apache.org/jira/browse/ARROW-1438) - [Plasma] Pull SerializationContext through PlasmaClient put and get
+* [ARROW-1441](https://issues.apache.org/jira/browse/ARROW-1441) - [Site] Add Ruby to Flexible section
+* [ARROW-1442](https://issues.apache.org/jira/browse/ARROW-1442) - [Website] Add pointer to nightly conda packages on /install
+* [ARROW-1447](https://issues.apache.org/jira/browse/ARROW-1447) - [C++] Round of include-what-you-use include cleanups
+* [ARROW-1448](https://issues.apache.org/jira/browse/ARROW-1448) - [Packaging] Support uploading built .deb and .rpm to Bintray
+* [ARROW-1449](https://issues.apache.org/jira/browse/ARROW-1449) - Implement Decimal using only Int128
+* [ARROW-1451](https://issues.apache.org/jira/browse/ARROW-1451) - [C++] Create arrow/io/api.h
+* [ARROW-1460](https://issues.apache.org/jira/browse/ARROW-1460) - [C++] Upgrade clang-format used to LLVM 4.0
+* [ARROW-1462](https://issues.apache.org/jira/browse/ARROW-1462) - [GLib] Support time array
+* [ARROW-1466](https://issues.apache.org/jira/browse/ARROW-1466) - [C++] Support DecimalArray in arrow::PrettyPrint
+* [ARROW-1468](https://issues.apache.org/jira/browse/ARROW-1468) - [C++] Append to PrimitiveBuilder from std::vector<CTYPE\>
+* [ARROW-1479](https://issues.apache.org/jira/browse/ARROW-1479) - [JS] Expand JavaScript implementation
+* [ARROW-1480](https://issues.apache.org/jira/browse/ARROW-1480) - [Python] Improve performance of serializing sets
+* [ARROW-1481](https://issues.apache.org/jira/browse/ARROW-1481) - [C++] Expose type casts as generic callable object that can write into pre-allocated memory
+* [ARROW-1494](https://issues.apache.org/jira/browse/ARROW-1494) - [C++] Document that shared\_ptr returned by RecordBatch::column needs to be retained
+* [ARROW-1499](https://issues.apache.org/jira/browse/ARROW-1499) - [Python] Consider adding option to parquet.write\_table that sets options for maximum Spark compatibility
+* [ARROW-1504](https://issues.apache.org/jira/browse/ARROW-1504) - [GLib] Support timestamp
+* [ARROW-1505](https://issues.apache.org/jira/browse/ARROW-1505) - [GLib] Simplify arguments check
+* [ARROW-1506](https://issues.apache.org/jira/browse/ARROW-1506) - [C++] Support pkg-config for compute modules
+* [ARROW-1508](https://issues.apache.org/jira/browse/ARROW-1508) - C++: Add support for FixedSizeBinaryType in DictionaryBuilder
+* [ARROW-1510](https://issues.apache.org/jira/browse/ARROW-1510) - [C++] Support cast
+* [ARROW-1511](https://issues.apache.org/jira/browse/ARROW-1511) - [C++] Deprecate arrow::MakePrimitiveArray
+* [ARROW-1513](https://issues.apache.org/jira/browse/ARROW-1513) - C++: Add cast from Dictionary to plain arrays
+* [ARROW-1515](https://issues.apache.org/jira/browse/ARROW-1515) - [GLib] Detect version directly
+* [ARROW-1516](https://issues.apache.org/jira/browse/ARROW-1516) - [GLib] Update document
+* [ARROW-1517](https://issues.apache.org/jira/browse/ARROW-1517) - Remove unnecessary temporary in DecimalUtil::ToString function
+* [ARROW-1519](https://issues.apache.org/jira/browse/ARROW-1519) - [C++] Move DecimalUtil functions to methods on the Int128 class
+* [ARROW-1528](https://issues.apache.org/jira/browse/ARROW-1528) - [GLib] Resolve include dependency
+* [ARROW-1530](https://issues.apache.org/jira/browse/ARROW-1530) - [C++] Install arrow/util/parallel.h
+* [ARROW-1551](https://issues.apache.org/jira/browse/ARROW-1551) - [Website] Updates for 0.7.0 release
+* [ARROW-1597](https://issues.apache.org/jira/browse/ARROW-1597) - [Packaging] arrow-compute.pc is missing in .deb/.rpm file list
+
+
+
+# Apache Arrow 0.6.0 (2017-08-14)
+
+## Bug Fixes
+
+* [ARROW-187](https://issues.apache.org/jira/browse/ARROW-187) - [C++] Decide on how pedantic we want to be about exceptions
+* [ARROW-276](https://issues.apache.org/jira/browse/ARROW-276) - [JAVA] Nullable Value Vectors should extend BaseValueVector instead of BaseDataValueVector
+* [ARROW-573](https://issues.apache.org/jira/browse/ARROW-573) - [Python/C++] Support ordered dictionary data, pandas Categorical
+* [ARROW-884](https://issues.apache.org/jira/browse/ARROW-884) - [C++] Exclude internal classes from documentation
+* [ARROW-932](https://issues.apache.org/jira/browse/ARROW-932) - [Python] Fix compiler warnings on MSVC
+* [ARROW-968](https://issues.apache.org/jira/browse/ARROW-968) - [Python] RecordBatch [i:j] syntax is incomplete
+* [ARROW-1192](https://issues.apache.org/jira/browse/ARROW-1192) - [JAVA] Improve splitAndTransfer performance for List and Union vectors
+* [ARROW-1195](https://issues.apache.org/jira/browse/ARROW-1195) - [C++] CpuInfo doesn't get cache size on Windows
+* [ARROW-1204](https://issues.apache.org/jira/browse/ARROW-1204) - [C++] lz4 ExternalProject fails in Visual Studio 2015
+* [ARROW-1225](https://issues.apache.org/jira/browse/ARROW-1225) - [Python] pyarrow.array does not attempt to convert bytes to UTF8 when passed a StringType
+* [ARROW-1237](https://issues.apache.org/jira/browse/ARROW-1237) - [JAVA] Expose the ability to set lastSet
+* [ARROW-1239](https://issues.apache.org/jira/browse/ARROW-1239) - issue with current version of git-commit-id-plugin
+* [ARROW-1240](https://issues.apache.org/jira/browse/ARROW-1240) - security: upgrade logback to address CVE-2017-5929
+* [ARROW-1241](https://issues.apache.org/jira/browse/ARROW-1241) - [C++] Visual Studio 2017 Appveyor build job
+* [ARROW-1242](https://issues.apache.org/jira/browse/ARROW-1242) - [Java] security - upgrade Jackson to mitigate 3 CVE vulnerabilities
+* [ARROW-1245](https://issues.apache.org/jira/browse/ARROW-1245) - [Integration] Java Integration Tests Disabled
+* [ARROW-1248](https://issues.apache.org/jira/browse/ARROW-1248) - [Python] C linkage warnings in Clang with public Cython API
+* [ARROW-1249](https://issues.apache.org/jira/browse/ARROW-1249) - [JAVA] Expose the fillEmpties function from Nullable<Varlength\>Vector.mutator
+* [ARROW-1263](https://issues.apache.org/jira/browse/ARROW-1263) - [C++] CpuInfo should be able to get CPU features on Windows
+* [ARROW-1265](https://issues.apache.org/jira/browse/ARROW-1265) - [Plasma] Plasma store memory leak warnings in Python test suite
+* [ARROW-1267](https://issues.apache.org/jira/browse/ARROW-1267) - [Java] Handle zero length case in BitVector.splitAndTransfer
+* [ARROW-1269](https://issues.apache.org/jira/browse/ARROW-1269) - [Packaging] Add Windows wheel build scripts from ARROW-1068 to arrow-dist
+* [ARROW-1275](https://issues.apache.org/jira/browse/ARROW-1275) - [C++] Default static library prefix for Snappy should be "\_static"
+* [ARROW-1276](https://issues.apache.org/jira/browse/ARROW-1276) - Cannot serialize empty DataFrame to parquet
+* [ARROW-1283](https://issues.apache.org/jira/browse/ARROW-1283) - [Java] VectorSchemaRoot should be able to be closed() more than once
+* [ARROW-1285](https://issues.apache.org/jira/browse/ARROW-1285) - PYTHON: NotImplemented exception creates empty parquet file
+* [ARROW-1287](https://issues.apache.org/jira/browse/ARROW-1287) - [Python] Emulate "whence" argument of seek in NativeFile
+* [ARROW-1290](https://issues.apache.org/jira/browse/ARROW-1290) - [C++] Use array capacity doubling in arrow::BufferBuilder
+* [ARROW-1291](https://issues.apache.org/jira/browse/ARROW-1291) - [Python] pa.RecordBatch.from\_pandas doesn't accept DataFrame with numeric column names
+* [ARROW-1294](https://issues.apache.org/jira/browse/ARROW-1294) - [C++] New Appveyor build failures
+* [ARROW-1296](https://issues.apache.org/jira/browse/ARROW-1296) - [Java] templates/FixValueVectors reset() method doesn't set allocationSizeInBytes correctly
+* [ARROW-1300](https://issues.apache.org/jira/browse/ARROW-1300) - [JAVA] Fix ListVector Tests
+* [ARROW-1306](https://issues.apache.org/jira/browse/ARROW-1306) - [Python] Encoding? issue with error reporting for parquet.read\_table
+* [ARROW-1308](https://issues.apache.org/jira/browse/ARROW-1308) - [C++] ld tries to link 'arrow\_static' even when -DARROW\_BUILD\_STATIC=off
+* [ARROW-1309](https://issues.apache.org/jira/browse/ARROW-1309) - [Python] Error inferring List type in Array.from\_pandas when inner values are all None
+* [ARROW-1310](https://issues.apache.org/jira/browse/ARROW-1310) - [JAVA] Revert ARROW-886
+* [ARROW-1311](https://issues.apache.org/jira/browse/ARROW-1311) - Python hangs after writing a few parquet tables
+* [ARROW-1312](https://issues.apache.org/jira/browse/ARROW-1312) - [C++] Set default value to ARROW\_JEMALLOC to OFF until ARROW-1282 is resolved
+* [ARROW-1326](https://issues.apache.org/jira/browse/ARROW-1326) - [Python] Fix Sphinx build in Travis CI
+* [ARROW-1327](https://issues.apache.org/jira/browse/ARROW-1327) - [Python] Failing to release GIL in MemoryMappedFile.\_open causes deadlock
+* [ARROW-1328](https://issues.apache.org/jira/browse/ARROW-1328) - [Python] pyarrow.Table.from\_pandas option timestamps\_to\_ms changes column values
+* [ARROW-1330](https://issues.apache.org/jira/browse/ARROW-1330) - [Plasma] Turn on plasma tests on manylinux1
+* [ARROW-1335](https://issues.apache.org/jira/browse/ARROW-1335) - [C++] PrimitiveArray::raw\_values has inconsistent semantics re: offsets compared with subclasses
+* [ARROW-1338](https://issues.apache.org/jira/browse/ARROW-1338) - [Python] Investigate non-deterministic core dump on Python 2.7, Travis CI builds
+* [ARROW-1340](https://issues.apache.org/jira/browse/ARROW-1340) - [Java] NullableMapVector field doesn't maintain metadata
+* [ARROW-1342](https://issues.apache.org/jira/browse/ARROW-1342) - [Python] Support strided array of lists
+* [ARROW-1343](https://issues.apache.org/jira/browse/ARROW-1343) - [Format/Java/C++] Ensuring encapsulated stream / IPC message sizes are always a multiple of 8
+* [ARROW-1350](https://issues.apache.org/jira/browse/ARROW-1350) - [C++] Include Plasma source tree in source distribution
+
+
+## New Features and Improvements
+
+* [ARROW-439](https://issues.apache.org/jira/browse/ARROW-439) - [Python] Add option in "to\_pandas" conversions to yield Categorical from String/Binary arrays
+* [ARROW-622](https://issues.apache.org/jira/browse/ARROW-622) - [Python] Investigate alternatives to timestamps\_to\_ms argument in pandas conversion
+* [ARROW-1076](https://issues.apache.org/jira/browse/ARROW-1076) - [Python] Handle nanosecond timestamps more gracefully when writing to Parquet format
+* [ARROW-1093](https://issues.apache.org/jira/browse/ARROW-1093) - [Python] Fail Python builds if flake8 yields warnings
+* [ARROW-1104](https://issues.apache.org/jira/browse/ARROW-1104) - Integrate in-memory object store from Ray
+* [ARROW-1116](https://issues.apache.org/jira/browse/ARROW-1116) - [Python] Create single external GitHub repo for building wheels for all platforms in one shot
+* [ARROW-1121](https://issues.apache.org/jira/browse/ARROW-1121) - [C++] Improve error message when opening OS file fails
+* [ARROW-1140](https://issues.apache.org/jira/browse/ARROW-1140) - [C++] Allow optional build of plasma
+* [ARROW-1149](https://issues.apache.org/jira/browse/ARROW-1149) - [Plasma] Create Cython client library for Plasma
+* [ARROW-1173](https://issues.apache.org/jira/browse/ARROW-1173) - [Plasma] Blog post for Plasma
+* [ARROW-1211](https://issues.apache.org/jira/browse/ARROW-1211) - [C++] Consider making default\_memory\_pool() the default for builder classes
+* [ARROW-1213](https://issues.apache.org/jira/browse/ARROW-1213) - [Python] Enable s3fs to be used with ParquetDataset and reader/writer functions
+* [ARROW-1219](https://issues.apache.org/jira/browse/ARROW-1219) - [C++] Use more vanilla Google C++ formatting
+* [ARROW-1224](https://issues.apache.org/jira/browse/ARROW-1224) - [Format] Clarify language around buffer padding and alignment in IPC
+* [ARROW-1230](https://issues.apache.org/jira/browse/ARROW-1230) - [Plasma] Install libraries and headers
+* [ARROW-1243](https://issues.apache.org/jira/browse/ARROW-1243) - [Java] security: upgrade all libraries to latest stable versions
+* [ARROW-1246](https://issues.apache.org/jira/browse/ARROW-1246) - [Format] Add Map logical type to metadata
+* [ARROW-1251](https://issues.apache.org/jira/browse/ARROW-1251) - [Python/C++] Revise build documentation to account for latest build toolchain
+* [ARROW-1253](https://issues.apache.org/jira/browse/ARROW-1253) - [C++] Use pre-built toolchain libraries where prudent to speed up CI builds
+* [ARROW-1255](https://issues.apache.org/jira/browse/ARROW-1255) - [Plasma] Check plasma flatbuffer messages with the flatbuffer verifier
+* [ARROW-1256](https://issues.apache.org/jira/browse/ARROW-1256) - [Plasma] Fix compile warnings on macOS
+* [ARROW-1257](https://issues.apache.org/jira/browse/ARROW-1257) - [Plasma] Plasma documentation
+* [ARROW-1258](https://issues.apache.org/jira/browse/ARROW-1258) - [C++] Suppress dlmalloc warnings on Clang
+* [ARROW-1259](https://issues.apache.org/jira/browse/ARROW-1259) - [Plasma] Speed up Plasma tests
+* [ARROW-1260](https://issues.apache.org/jira/browse/ARROW-1260) - [Plasma] Use factory method to create Python PlasmaClient
+* [ARROW-1264](https://issues.apache.org/jira/browse/ARROW-1264) - [Plasma] Don't exit the Python interpreter if the plasma client can't connect to the store
+* [ARROW-1268](https://issues.apache.org/jira/browse/ARROW-1268) - [Website] Blog post on Arrow integration with Spark
+* [ARROW-1270](https://issues.apache.org/jira/browse/ARROW-1270) - [Packaging] Add Python wheel build scripts for macOS to arrow-dist
+* [ARROW-1272](https://issues.apache.org/jira/browse/ARROW-1272) - [Python] Add script to arrow-dist to generate and upload manylinux1 Python wheels
+* [ARROW-1273](https://issues.apache.org/jira/browse/ARROW-1273) - [Python] Add convenience functions for reading only Parquet metadata or effective Arrow schema from a particular Parquet file
+* [ARROW-1274](https://issues.apache.org/jira/browse/ARROW-1274) - [C++] add\_compiler\_export\_flags() throws warning with CMake \>= 3.3
+* [ARROW-1281](https://issues.apache.org/jira/browse/ARROW-1281) - [C++/Python] Add Docker setup for running HDFS tests and other tests we may not run in Travis CI
+* [ARROW-1288](https://issues.apache.org/jira/browse/ARROW-1288) - Clean up many ASF license headers
+* [ARROW-1289](https://issues.apache.org/jira/browse/ARROW-1289) - [Python] Add PYARROW\_BUILD\_PLASMA option like Parquet
+* [ARROW-1297](https://issues.apache.org/jira/browse/ARROW-1297) - 0.6.0 Release
+* [ARROW-1301](https://issues.apache.org/jira/browse/ARROW-1301) - [C++/Python] Add remaining supported libhdfs UNIX-like filesystem APIs
+* [ARROW-1303](https://issues.apache.org/jira/browse/ARROW-1303) - [C++] Support downloading Boost
+* [ARROW-1304](https://issues.apache.org/jira/browse/ARROW-1304) - [Java] Fix checkstyle checks warning
+* [ARROW-1305](https://issues.apache.org/jira/browse/ARROW-1305) - [GLib] Add GArrowIntArrayBuilder
+* [ARROW-1315](https://issues.apache.org/jira/browse/ARROW-1315) - [GLib] Status check of arrow::ArrayBuilder::Finish() is missing
+* [ARROW-1323](https://issues.apache.org/jira/browse/ARROW-1323) - [GLib] Add garrow\_boolean\_array\_get\_values()
+* [ARROW-1333](https://issues.apache.org/jira/browse/ARROW-1333) - [Plasma] Sorting example for DataFrames in plasma
+* [ARROW-1334](https://issues.apache.org/jira/browse/ARROW-1334) - [C++] Instantiate arrow::Table from vector of Array objects (instead of Columns)
+* [ARROW-1336](https://issues.apache.org/jira/browse/ARROW-1336) - [C++] Add arrow::schema factory function
+* [ARROW-1353](https://issues.apache.org/jira/browse/ARROW-1353) - [Website] Updates + blog post for 0.6.0 release
+
+
+
+# Apache Arrow 0.5.0 (2017-07-23)
+
+## New Features and Improvements
+
+* [ARROW-111](https://issues.apache.org/jira/browse/ARROW-111) - [C++] Add static analyzer to tool chain to verify checking of Status returns
+* [ARROW-195](https://issues.apache.org/jira/browse/ARROW-195) - [C++] Upgrade clang bits to clang-3.8 and move back to trusty.
+* [ARROW-460](https://issues.apache.org/jira/browse/ARROW-460) - [C++] Implement JSON round trip for DictionaryArray
+* [ARROW-462](https://issues.apache.org/jira/browse/ARROW-462) - [C++] Implement in-memory conversions between non-nested primitive types and DictionaryArray equivalent
+* [ARROW-575](https://issues.apache.org/jira/browse/ARROW-575) - Python: Auto-detect nested lists and nested numpy arrays in Pandas
+* [ARROW-597](https://issues.apache.org/jira/browse/ARROW-597) - [Python] Add convenience function to yield DataFrame from any object that a StreamReader or FileReader can read from
+* [ARROW-599](https://issues.apache.org/jira/browse/ARROW-599) - [C++] Add LZ4 codec to 3rd-party toolchain
+* [ARROW-600](https://issues.apache.org/jira/browse/ARROW-600) - [C++] Add ZSTD codec to 3rd-party toolchain
+* [ARROW-692](https://issues.apache.org/jira/browse/ARROW-692) - Java<-\>C++ Integration tests for dictionary-encoded vectors
+* [ARROW-693](https://issues.apache.org/jira/browse/ARROW-693) - [Java] Add JSON support for dictionary vectors
+* [ARROW-742](https://issues.apache.org/jira/browse/ARROW-742) - Handling exceptions during execution of std::wstring\_convert
+* [ARROW-834](https://issues.apache.org/jira/browse/ARROW-834) - [Python] Support creating Arrow arrays from Python iterables
+* [ARROW-915](https://issues.apache.org/jira/browse/ARROW-915) - Limited support for Struct Array reads
+* [ARROW-935](https://issues.apache.org/jira/browse/ARROW-935) - [Java] Build Javadoc in Travis CI
+* [ARROW-960](https://issues.apache.org/jira/browse/ARROW-960) - [Python] Add source build guide for macOS + Homebrew
+* [ARROW-962](https://issues.apache.org/jira/browse/ARROW-962) - [Python] Add schema attribute to FileReader
+* [ARROW-964](https://issues.apache.org/jira/browse/ARROW-964) - [Python] Improve api docs
+* [ARROW-966](https://issues.apache.org/jira/browse/ARROW-966) - [Python] pyarrow.list\_ should also accept Field instance
+* [ARROW-978](https://issues.apache.org/jira/browse/ARROW-978) - [Python] Use sphinx-bootstrap-theme for Sphinx documentation
+* [ARROW-1041](https://issues.apache.org/jira/browse/ARROW-1041) - [Python] Support read\_pandas on a directory of Parquet files
+* [ARROW-1048](https://issues.apache.org/jira/browse/ARROW-1048) - Allow user LD\_LIBRARY\_PATH to be used with source release script
+* [ARROW-1052](https://issues.apache.org/jira/browse/ARROW-1052) - Arrow 0.5.0 release
+* [ARROW-1071](https://issues.apache.org/jira/browse/ARROW-1071) - [Python] RecordBatchFileReader does not have a schema property
+* [ARROW-1073](https://issues.apache.org/jira/browse/ARROW-1073) - C++: Adaptive integer builder
+* [ARROW-1095](https://issues.apache.org/jira/browse/ARROW-1095) - [Website] Add Arrow icon asset
+* [ARROW-1100](https://issues.apache.org/jira/browse/ARROW-1100) - [Python] Add "mode" property to NativeFile instances
+* [ARROW-1102](https://issues.apache.org/jira/browse/ARROW-1102) - Make MessageSerializer.serializeMessage() public
+* [ARROW-1120](https://issues.apache.org/jira/browse/ARROW-1120) - [Python] Write support for int96
+* [ARROW-1122](https://issues.apache.org/jira/browse/ARROW-1122) - [Website] Guest blog post on Arrow + ODBC from turbodbc
+* [ARROW-1123](https://issues.apache.org/jira/browse/ARROW-1123) - C++: Make jemalloc the default allocator
+* [ARROW-1135](https://issues.apache.org/jira/browse/ARROW-1135) - Upgrade Travis CI clang builds to use LLVM 4.0
+* [ARROW-1137](https://issues.apache.org/jira/browse/ARROW-1137) - Python: Ensure Pandas roundtrip of all-None column
+* [ARROW-1142](https://issues.apache.org/jira/browse/ARROW-1142) - [C++] Move over compression library toolchain from parquet-cpp
+* [ARROW-1145](https://issues.apache.org/jira/browse/ARROW-1145) - [GLib] Add get\_values()
+* [ARROW-1146](https://issues.apache.org/jira/browse/ARROW-1146) - Add .gitignore for \*\_generated.h files in src/plasma/format
+* [ARROW-1148](https://issues.apache.org/jira/browse/ARROW-1148) - [C++] Raise minimum CMake version to 3.2
+* [ARROW-1151](https://issues.apache.org/jira/browse/ARROW-1151) - [C++] Add gcc branch prediction to status check macro
+* [ARROW-1154](https://issues.apache.org/jira/browse/ARROW-1154) - [C++] Migrate more computational utility code from parquet-cpp
+* [ARROW-1160](https://issues.apache.org/jira/browse/ARROW-1160) - C++: Implement DictionaryBuilder
+* [ARROW-1165](https://issues.apache.org/jira/browse/ARROW-1165) - [C++] Refactor PythonDecimalToArrowDecimal to not use templates
+* [ARROW-1172](https://issues.apache.org/jira/browse/ARROW-1172) - [C++] Use unique\_ptr with array builder classes
+* [ARROW-1183](https://issues.apache.org/jira/browse/ARROW-1183) - [Python] Implement time type conversions in to\_pandas
+* [ARROW-1185](https://issues.apache.org/jira/browse/ARROW-1185) - [C++] Clean up arrow::Status implementation, add warn\_unused\_result attribute for clang
+* [ARROW-1187](https://issues.apache.org/jira/browse/ARROW-1187) - Serialize a DataFrame with None column
+* [ARROW-1193](https://issues.apache.org/jira/browse/ARROW-1193) - [C++] Support pkg-config for arrow\_python.so
+* [ARROW-1196](https://issues.apache.org/jira/browse/ARROW-1196) - [C++] Appveyor separate jobs for Debug/Release builds from sources; Build with conda toolchain; Build with NMake Makefiles Generator
+* [ARROW-1198](https://issues.apache.org/jira/browse/ARROW-1198) - Python: Add public C++ API to unwrap PyArrow object
+* [ARROW-1199](https://issues.apache.org/jira/browse/ARROW-1199) - [C++] Introduce mutable POD struct for generic array data
+* [ARROW-1202](https://issues.apache.org/jira/browse/ARROW-1202) - Remove semicolons from status macros
+* [ARROW-1212](https://issues.apache.org/jira/browse/ARROW-1212) - [GLib] Add garrow\_binary\_array\_get\_offsets\_buffer()
+* [ARROW-1214](https://issues.apache.org/jira/browse/ARROW-1214) - [Python] Add classes / functions to enable stream message components to be handled outside of the stream reader class
+* [ARROW-1217](https://issues.apache.org/jira/browse/ARROW-1217) - [GLib] Add GInputStream based arrow::io::RandomAccessFile
+* [ARROW-1220](https://issues.apache.org/jira/browse/ARROW-1220) - [C++] Standardize usage of \*\_HOME CMake script variables for 3rd party libs
+* [ARROW-1221](https://issues.apache.org/jira/browse/ARROW-1221) - [C++] Pin clang-format version
+* [ARROW-1227](https://issues.apache.org/jira/browse/ARROW-1227) - [GLib] Support GOutputStream
+* [ARROW-1229](https://issues.apache.org/jira/browse/ARROW-1229) - [GLib] Follow Reader API change (get -\> read)
+* [ARROW-1244](https://issues.apache.org/jira/browse/ARROW-1244) - [C++] Do not include cpp/src/plasma in source release pending IP clearance
+* [ARROW-1252](https://issues.apache.org/jira/browse/ARROW-1252) - [Website] Update for 0.5.0 release, add blog post summarizing changes from 0.4.x
+
+
+## Bug Fixes
+
+* [ARROW-288](https://issues.apache.org/jira/browse/ARROW-288) - Implement Arrow adapter for Spark Datasets
+* [ARROW-601](https://issues.apache.org/jira/browse/ARROW-601) - Some logical types not supported when loading Parquet
+* [ARROW-784](https://issues.apache.org/jira/browse/ARROW-784) - Cleaning up thirdparty toolchain support in Arrow on Windows
+* [ARROW-785](https://issues.apache.org/jira/browse/ARROW-785) - Possible issue when writing parquet via pyarrow that is subsequently read in Hive
+* [ARROW-924](https://issues.apache.org/jira/browse/ARROW-924) - Setting GTEST\_HOME Fails on CMake run
+* [ARROW-992](https://issues.apache.org/jira/browse/ARROW-992) - [Python] In place development builds do not have a \_\_version\_\_
+* [ARROW-1043](https://issues.apache.org/jira/browse/ARROW-1043) - [Python] Make sure pandas metadata created by arrow conforms to the pandas spec
+* [ARROW-1074](https://issues.apache.org/jira/browse/ARROW-1074) - from\_pandas doesn't convert ndarray to list
+* [ARROW-1079](https://issues.apache.org/jira/browse/ARROW-1079) - [Python] Empty "private" directories should be ignored by Parquet interface
+* [ARROW-1081](https://issues.apache.org/jira/browse/ARROW-1081) - C++: arrow::test::TestBase::MakePrimitive doesn't fill null\_bitmap
+* [ARROW-1096](https://issues.apache.org/jira/browse/ARROW-1096) - [C++] Memory mapping file over 4GB fails on Windows
+* [ARROW-1097](https://issues.apache.org/jira/browse/ARROW-1097) - Reading tensor needs file to be opened in writeable mode
+* [ARROW-1098](https://issues.apache.org/jira/browse/ARROW-1098) - Document Error?
+* [ARROW-1101](https://issues.apache.org/jira/browse/ARROW-1101) - UnionListWriter is not implementing all methods on interface ScalarWriter
+* [ARROW-1103](https://issues.apache.org/jira/browse/ARROW-1103) - [Python] Utilize pandas metadata from common \_metadata Parquet file if it exists
+* [ARROW-1107](https://issues.apache.org/jira/browse/ARROW-1107) - [JAVA] NullableMapVector getField() should return nullable type
+* [ARROW-1108](https://issues.apache.org/jira/browse/ARROW-1108) - Check if ArrowBuf is empty buffer in getActualConsumedMemory() and getPossibleConsumedMemory()
+* [ARROW-1109](https://issues.apache.org/jira/browse/ARROW-1109) - [JAVA] transferOwnership fails when readerIndex is not 0
+* [ARROW-1110](https://issues.apache.org/jira/browse/ARROW-1110) - [JAVA] make union vector naming consistent
+* [ARROW-1111](https://issues.apache.org/jira/browse/ARROW-1111) - [JAVA] Make aligning buffers optional, and allow -1 for unknown null count
+* [ARROW-1112](https://issues.apache.org/jira/browse/ARROW-1112) - [JAVA] Set lastSet for VarLength and List vectors when loading
+* [ARROW-1113](https://issues.apache.org/jira/browse/ARROW-1113) - [C++] gflags EP build gets triggered (as a no-op) on subsequent calls to make or ninja build
+* [ARROW-1115](https://issues.apache.org/jira/browse/ARROW-1115) - [C++] Use absolute path for ccache
+* [ARROW-1117](https://issues.apache.org/jira/browse/ARROW-1117) - [Docs] Minor issues in GLib README
+* [ARROW-1124](https://issues.apache.org/jira/browse/ARROW-1124) - [Python] pyarrow needs to depend on numpy\>=1.10 (not 1.9)
+* [ARROW-1125](https://issues.apache.org/jira/browse/ARROW-1125) - Python: Table.from\_pandas doesn't work anymore on partial schemas
+* [ARROW-1128](https://issues.apache.org/jira/browse/ARROW-1128) - [Docs] command to build a wheel is not properly rendered
+* [ARROW-1129](https://issues.apache.org/jira/browse/ARROW-1129) - [C++] Fix Linux toolchain build regression from ARROW-742
+* [ARROW-1130](https://issues.apache.org/jira/browse/ARROW-1130) - io-hdfs-test failure
+* [ARROW-1131](https://issues.apache.org/jira/browse/ARROW-1131) - Python: Parquet unit tests are always skipped
+* [ARROW-1132](https://issues.apache.org/jira/browse/ARROW-1132) - [Python] Unable to write pandas DataFrame w/MultiIndex containing duplicate values to parquet
+* [ARROW-1136](https://issues.apache.org/jira/browse/ARROW-1136) - [C++/Python] Segfault on empty stream
+* [ARROW-1138](https://issues.apache.org/jira/browse/ARROW-1138) - Travis: Use OpenJDK7 instead of OracleJDK7
+* [ARROW-1139](https://issues.apache.org/jira/browse/ARROW-1139) - [C++] dlmalloc doesn't allow arrow to be built with clang 4 or gcc 7.1.1
+* [ARROW-1141](https://issues.apache.org/jira/browse/ARROW-1141) - On import, get libjemalloc.so.2: cannot allocate memory in static TLS block
+* [ARROW-1143](https://issues.apache.org/jira/browse/ARROW-1143) - C++: Fix comparison of NullArray
+* [ARROW-1144](https://issues.apache.org/jira/browse/ARROW-1144) - [C++] Remove unused variable
+* [ARROW-1147](https://issues.apache.org/jira/browse/ARROW-1147) - [C++] Allow optional vendoring of flatbuffers in plasma
+* [ARROW-1150](https://issues.apache.org/jira/browse/ARROW-1150) - [C++] AdaptiveIntBuilder compiler warning on MSVC
+* [ARROW-1152](https://issues.apache.org/jira/browse/ARROW-1152) - [Cython] read\_tensor should work with a readable file
+* [ARROW-1153](https://issues.apache.org/jira/browse/ARROW-1153) - All non-Pandas column throws NotImplemented: unhandled type
+* [ARROW-1155](https://issues.apache.org/jira/browse/ARROW-1155) - Segmentation fault when running pa.Int16Value()
+* [ARROW-1157](https://issues.apache.org/jira/browse/ARROW-1157) - C++/Python: Decimal templates are not correctly exported on OSX
+* [ARROW-1159](https://issues.apache.org/jira/browse/ARROW-1159) - [C++] Static data members cannot be accessed from inline functions in Arrow headers by thirdparty users
+* [ARROW-1162](https://issues.apache.org/jira/browse/ARROW-1162) - Transfer Between Empty Lists Should Not Invoke Callback
+* [ARROW-1164](https://issues.apache.org/jira/browse/ARROW-1164) - C++: Templated functions need ARROW\_EXPORT instead of ARROW\_TEMPLATE\_EXPORT
+* [ARROW-1166](https://issues.apache.org/jira/browse/ARROW-1166) - Errors in Struct type's example and missing reference in Layout.md
+* [ARROW-1167](https://issues.apache.org/jira/browse/ARROW-1167) - [Python] Create chunked BinaryArray in Table.from\_pandas when a column's data exceeds 2GB
+* [ARROW-1168](https://issues.apache.org/jira/browse/ARROW-1168) - [Python] pandas metadata may contain "mixed" data types
+* [ARROW-1169](https://issues.apache.org/jira/browse/ARROW-1169) - C++: jemalloc externalproject doesn't build with CMake's ninja generator
+* [ARROW-1170](https://issues.apache.org/jira/browse/ARROW-1170) - C++: ARROW\_JEMALLOC=OFF breaks linking on unittest
+* [ARROW-1174](https://issues.apache.org/jira/browse/ARROW-1174) - [GLib] Investigate root cause of ListArray glib test failure
+* [ARROW-1177](https://issues.apache.org/jira/browse/ARROW-1177) - [C++] Detect int32 overflow in ListBuilder::Append
+* [ARROW-1179](https://issues.apache.org/jira/browse/ARROW-1179) - C++: Add missing virtual destructors
+* [ARROW-1180](https://issues.apache.org/jira/browse/ARROW-1180) - [GLib] garrow\_tensor\_get\_dimension\_name() returns invalid address
+* [ARROW-1181](https://issues.apache.org/jira/browse/ARROW-1181) - [Python] Parquet test fail if not enabled
+* [ARROW-1182](https://issues.apache.org/jira/browse/ARROW-1182) - C++: Specify BUILD\_BYPRODUCTS for zlib and zstd
+* [ARROW-1186](https://issues.apache.org/jira/browse/ARROW-1186) - [C++] Enable option to build arrow with minimal dependencies needed to build Parquet library
+* [ARROW-1188](https://issues.apache.org/jira/browse/ARROW-1188) - Segfault when trying to serialize a DataFrame with Null-only Categorical Column
+* [ARROW-1190](https://issues.apache.org/jira/browse/ARROW-1190) - VectorLoader corrupts vectors with duplicate names
+* [ARROW-1191](https://issues.apache.org/jira/browse/ARROW-1191) - [JAVA] Implement getField() method for the complex readers
+* [ARROW-1194](https://issues.apache.org/jira/browse/ARROW-1194) - Getting record batch size with pa.get\_record\_batch\_size returns a size that is too small for a pandas DataFrame
+* [ARROW-1197](https://issues.apache.org/jira/browse/ARROW-1197) - [GLib] record\_batch.hpp Inclusion is missing
+* [ARROW-1200](https://issues.apache.org/jira/browse/ARROW-1200) - [C++] DictionaryBuilder should use signed integers for indices
+* [ARROW-1201](https://issues.apache.org/jira/browse/ARROW-1201) - [Python] Incomplete Python types cause a core dump when repr-ing
+* [ARROW-1203](https://issues.apache.org/jira/browse/ARROW-1203) - [C++] Disallow BinaryBuilder to append byte strings larger than the maximum value of int32\_t
+* [ARROW-1205](https://issues.apache.org/jira/browse/ARROW-1205) - C++: Reference to type objects in ArrayLoader may cause segmentation faults
+* [ARROW-1206](https://issues.apache.org/jira/browse/ARROW-1206) - [C++] Enable MSVC builds to work with some compression library support disabled
+* [ARROW-1208](https://issues.apache.org/jira/browse/ARROW-1208) - [C++] Toolchain build with ZSTD library from conda-forge failure
+* [ARROW-1215](https://issues.apache.org/jira/browse/ARROW-1215) - [Python] Class methods in API reference
+* [ARROW-1216](https://issues.apache.org/jira/browse/ARROW-1216) - Numpy arrays cannot be created from Arrow Buffers on Python 2
+* [ARROW-1218](https://issues.apache.org/jira/browse/ARROW-1218) - Arrow doesn't compile if all compression libraries are deactivated
+* [ARROW-1222](https://issues.apache.org/jira/browse/ARROW-1222) - [Python] pyarrow.array returns NullArray for array of unsupported Python objects
+* [ARROW-1223](https://issues.apache.org/jira/browse/ARROW-1223) - [GLib] Fix function name that returns wrapped object
+* [ARROW-1228](https://issues.apache.org/jira/browse/ARROW-1228) - [GLib] Test file name should be the same name as target class
+* [ARROW-1233](https://issues.apache.org/jira/browse/ARROW-1233) - [C++] Validate cmake script resolving of 3rd party linked libs from correct location in toolchain build
+* [ARROW-1235](https://issues.apache.org/jira/browse/ARROW-1235) - [C++] macOS linker failure with operator<< and std::ostream
+* [ARROW-1236](https://issues.apache.org/jira/browse/ARROW-1236) - Library paths in exported pkg-config file are incorrect
+* [ARROW-1284](https://issues.apache.org/jira/browse/ARROW-1284) - Windows can't install pyarrow 0.4.1 and 0.5.0
+
+
+
+# Apache Arrow 0.4.1 (2017-06-09)
+
+## Bug Fixes
+
+* [ARROW-424](https://issues.apache.org/jira/browse/ARROW-424) - [C++] Threadsafety in arrow/io/hdfs.h
+* [ARROW-1039](https://issues.apache.org/jira/browse/ARROW-1039) - Python: pyarrow.Filesystem.read\_parquet causing error if nthreads\>1
+* [ARROW-1050](https://issues.apache.org/jira/browse/ARROW-1050) - [C++] Export arrow::ValidateArray
+* [ARROW-1051](https://issues.apache.org/jira/browse/ARROW-1051) - [Python] If pyarrow.parquet fails to import due to a shared library ABI conflict, the test\_parquet.py tests silently do not run
+* [ARROW-1056](https://issues.apache.org/jira/browse/ARROW-1056) - [Python] Parquet+HDFS test failure due to writing pandas index
+* [ARROW-1057](https://issues.apache.org/jira/browse/ARROW-1057) - Fix cmake warning and msvc debug asserts
+* [ARROW-1060](https://issues.apache.org/jira/browse/ARROW-1060) - [Python] Add unit test for ARROW-1053
+* [ARROW-1062](https://issues.apache.org/jira/browse/ARROW-1062) - [GLib] Examples use old API
+* [ARROW-1066](https://issues.apache.org/jira/browse/ARROW-1066) - remove warning on feather for pandas \>= 0.20.1
+* [ARROW-1070](https://issues.apache.org/jira/browse/ARROW-1070) - [C++] Feather files for date/time types should be written with the physical types
+* [ARROW-1075](https://issues.apache.org/jira/browse/ARROW-1075) - [GLib] Build error on macOS
+* [ARROW-1082](https://issues.apache.org/jira/browse/ARROW-1082) - [GLib] Add CI on macOS
+* [ARROW-1085](https://issues.apache.org/jira/browse/ARROW-1085) - [Java] Follow-up on template cleanup: missing method for IntervalYear
+* [ARROW-1086](https://issues.apache.org/jira/browse/ARROW-1086) - [Python] pyarrow 0.4.0 on pypi is missing pxd files
+* [ARROW-1088](https://issues.apache.org/jira/browse/ARROW-1088) - [Python] test\_unicode\_filename test fails when unicode filenames aren't supported by system
+* [ARROW-1090](https://issues.apache.org/jira/browse/ARROW-1090) - [Python] build\_ext usability
+* [ARROW-1091](https://issues.apache.org/jira/browse/ARROW-1091) - Decimal scale and precision are flipped
+* [ARROW-1092](https://issues.apache.org/jira/browse/ARROW-1092) - More follow-up on flipped Decimal scale and precision
+* [ARROW-1094](https://issues.apache.org/jira/browse/ARROW-1094) - [C++] Incomplete buffer reads in arrow::io::ReadableFile should exactly truncate returned buffer
+* [ARROW-1127](https://issues.apache.org/jira/browse/ARROW-1127) - pyarrow 4.1 import failure on Travis
+
+
+## New Features and Improvements
+
+* [ARROW-897](https://issues.apache.org/jira/browse/ARROW-897) - [GLib] Build arrow-glib as a separate build in the Travis CI build matrix
+* [ARROW-986](https://issues.apache.org/jira/browse/ARROW-986) - [Format] Update IPC.md to account for dictionary batches
+* [ARROW-990](https://issues.apache.org/jira/browse/ARROW-990) - [JS] Add tslint support for linting TypeScript
+* [ARROW-1020](https://issues.apache.org/jira/browse/ARROW-1020) - [Format] Add additional language to Schema.fbs to clarify naive vs. localized Timestamp values
+* [ARROW-1034](https://issues.apache.org/jira/browse/ARROW-1034) - [Python] Enable creation of binary wheels on Windows / MSVC
+* [ARROW-1049](https://issues.apache.org/jira/browse/ARROW-1049) - [Java] Vector template cleanup
+* [ARROW-1063](https://issues.apache.org/jira/browse/ARROW-1063) - [Website] Blog post and website updates for 0.4.0 release
+* [ARROW-1068](https://issues.apache.org/jira/browse/ARROW-1068) - [Python] Create external repo with appveyor.yml configured for building Python wheel installers
+* [ARROW-1069](https://issues.apache.org/jira/browse/ARROW-1069) - Add instructions for publishing maven artifacts
+* [ARROW-1078](https://issues.apache.org/jira/browse/ARROW-1078) - [Python] Account for PARQUET-967
+* [ARROW-1080](https://issues.apache.org/jira/browse/ARROW-1080) - C++: Add tutorial about converting to/from row-wise representation
+* [ARROW-1084](https://issues.apache.org/jira/browse/ARROW-1084) - Implementations of BufferAllocator should handle Netty's OutOfDirectMemoryError
+* [ARROW-1118](https://issues.apache.org/jira/browse/ARROW-1118) - [Website] Site updates for 0.4.1
+
+
+
+# Apache Arrow 0.4.0 (2017-05-22)
+
+## Bug Fixes
+
+* [ARROW-813](https://issues.apache.org/jira/browse/ARROW-813) - [Python] setup.py sdist must also bundle dependent cmake modules
+* [ARROW-824](https://issues.apache.org/jira/browse/ARROW-824) - Date and Time Vectors should reflect timezone-less semantics
+* [ARROW-856](https://issues.apache.org/jira/browse/ARROW-856) - CMake error caused by unknown compiler
+* [ARROW-909](https://issues.apache.org/jira/browse/ARROW-909) - libjemalloc.so.2: cannot open shared object file:
+* [ARROW-939](https://issues.apache.org/jira/browse/ARROW-939) - Fix division by zero for zero-dimensional Tensors
+* [ARROW-940](https://issues.apache.org/jira/browse/ARROW-940) - [JS] Generate multiple sets of artifacts
+* [ARROW-944](https://issues.apache.org/jira/browse/ARROW-944) - Python: Compat broken for pandas==0.18.1
+* [ARROW-948](https://issues.apache.org/jira/browse/ARROW-948) - [GLib] Update C++ header file list
+* [ARROW-952](https://issues.apache.org/jira/browse/ARROW-952) - Compilation error on macOS with clang-802.0.42
+* [ARROW-958](https://issues.apache.org/jira/browse/ARROW-958) - [Python] Conda build guide still needs ARROW\_HOME, PARQUET\_HOME
+* [ARROW-979](https://issues.apache.org/jira/browse/ARROW-979) - [Python] Fix setuptools\_scm version when release tag is not in the master timeline
+* [ARROW-991](https://issues.apache.org/jira/browse/ARROW-991) - [Python] PyArray\_SimpleNew should not be used with NPY\_DATETIME
+* [ARROW-995](https://issues.apache.org/jira/browse/ARROW-995) - [Website] 0.3 release announce has a typo in reference
+* [ARROW-998](https://issues.apache.org/jira/browse/ARROW-998) - [Doc] File format documents incorrect schema location
+* [ARROW-1003](https://issues.apache.org/jira/browse/ARROW-1003) - [C++] Hdfs and java dlls fail to load when built for Windows with MSVC
+* [ARROW-1004](https://issues.apache.org/jira/browse/ARROW-1004) - ArrowInvalid: Invalid: Python object of type float is not None and is not a string, bool, or date object
+* [ARROW-1017](https://issues.apache.org/jira/browse/ARROW-1017) - Python: Table.to\_pandas leaks memory
+* [ARROW-1023](https://issues.apache.org/jira/browse/ARROW-1023) - Python: Fix bundling of arrow-cpp for macOS
+* [ARROW-1033](https://issues.apache.org/jira/browse/ARROW-1033) - [Python] pytest discovers scripts/test\_leak.py
+* [ARROW-1045](https://issues.apache.org/jira/browse/ARROW-1045) - [JAVA] Add support for custom metadata in org.apache.arrow.vector.types.pojo.\*
+* [ARROW-1046](https://issues.apache.org/jira/browse/ARROW-1046) - [Python] Conform DataFrame metadata to pandas spec
+* [ARROW-1053](https://issues.apache.org/jira/browse/ARROW-1053) - [Python] Memory leak with RecordBatchFileReader
+* [ARROW-1054](https://issues.apache.org/jira/browse/ARROW-1054) - [Python] Test suite fails on pandas 0.19.2
+* [ARROW-1061](https://issues.apache.org/jira/browse/ARROW-1061) - [C++] Harden decimal parsing against invalid strings
+* [ARROW-1064](https://issues.apache.org/jira/browse/ARROW-1064) - ModuleNotFoundError: No module named 'pyarrow.\_parquet'
+
+
+## New Features and Improvements
+
+* [ARROW-29](https://issues.apache.org/jira/browse/ARROW-29) - C++: Add re2 as optional 3rd-party toolchain dependency
+* [ARROW-182](https://issues.apache.org/jira/browse/ARROW-182) - [C++] Remove Array::Validate virtual function and make a separate method
+* [ARROW-376](https://issues.apache.org/jira/browse/ARROW-376) - Python: Convert non-range Pandas indices (optionally) to Arrow
+* [ARROW-446](https://issues.apache.org/jira/browse/ARROW-446) - [Python] Document NativeFile interfaces, HDFS client in Sphinx
+* [ARROW-482](https://issues.apache.org/jira/browse/ARROW-482) - [Java] Provide API access to "custom\_metadata" Field attribute in IPC setting
+* [ARROW-532](https://issues.apache.org/jira/browse/ARROW-532) - [Python] Expand pyarrow.parquet documentation for 0.3 release
+* [ARROW-579](https://issues.apache.org/jira/browse/ARROW-579) - Python: Provide redistributable pyarrow wheels on OSX
+* [ARROW-596](https://issues.apache.org/jira/browse/ARROW-596) - [Python] Add convenience function to convert pandas.DataFrame to pyarrow.Buffer containing a file or stream representation
+* [ARROW-629](https://issues.apache.org/jira/browse/ARROW-629) - [JS] Add unit test suite
+* [ARROW-714](https://issues.apache.org/jira/browse/ARROW-714) - [C++] Add import\_pyarrow C API in the style of NumPy for thirdparty C++ users
+* [ARROW-819](https://issues.apache.org/jira/browse/ARROW-819) - [Python] Define public Cython API
+* [ARROW-872](https://issues.apache.org/jira/browse/ARROW-872) - [JS] Read streaming format
+* [ARROW-873](https://issues.apache.org/jira/browse/ARROW-873) - [JS] Implement fixed width list type
+* [ARROW-874](https://issues.apache.org/jira/browse/ARROW-874) - [JS] Read dictionary-encoded vectors
+* [ARROW-881](https://issues.apache.org/jira/browse/ARROW-881) - [Python] Reconstruct Pandas DataFrame indexes using custom\_metadata
+* [ARROW-891](https://issues.apache.org/jira/browse/ARROW-891) - [Python] Expand Windows build instructions to not require looking at separate C++ docs
+* [ARROW-899](https://issues.apache.org/jira/browse/ARROW-899) - [Docs] Add CHANGELOG for 0.3.0
+* [ARROW-901](https://issues.apache.org/jira/browse/ARROW-901) - [Python] Write FixedSizeBinary to Parquet
+* [ARROW-913](https://issues.apache.org/jira/browse/ARROW-913) - [Python] Only link jemalloc to the Cython extension where it's needed
+* [ARROW-923](https://issues.apache.org/jira/browse/ARROW-923) - [Docs] Generate Changelog for website with JIRA links
+* [ARROW-929](https://issues.apache.org/jira/browse/ARROW-929) - Move KEYS file to SVN, remove from git
+* [ARROW-943](https://issues.apache.org/jira/browse/ARROW-943) - [GLib] Support running unit tests with source archive
+* [ARROW-945](https://issues.apache.org/jira/browse/ARROW-945) - [GLib] Add a Lua example to show Torch integration
+* [ARROW-946](https://issues.apache.org/jira/browse/ARROW-946) - [GLib] Use "new" instead of "open" for constructor name
+* [ARROW-947](https://issues.apache.org/jira/browse/ARROW-947) - [Python] Improve execution time of manylinux1 build
+* [ARROW-953](https://issues.apache.org/jira/browse/ARROW-953) - Use cmake / curl from conda-forge in CI builds
+* [ARROW-954](https://issues.apache.org/jira/browse/ARROW-954) - Make it possible to compile Arrow with header-only boost
+* [ARROW-956](https://issues.apache.org/jira/browse/ARROW-956) - remove pandas pre-0.20.0 compat
+* [ARROW-957](https://issues.apache.org/jira/browse/ARROW-957) - [Doc] Add HDFS and Windows documents to doxygen output
+* [ARROW-961](https://issues.apache.org/jira/browse/ARROW-961) - [Python] Rename InMemoryOutputStream to BufferOutputStream
+* [ARROW-963](https://issues.apache.org/jira/browse/ARROW-963) - [GLib] Add equal
+* [ARROW-967](https://issues.apache.org/jira/browse/ARROW-967) - [GLib] Support initializing array with buffer
+* [ARROW-970](https://issues.apache.org/jira/browse/ARROW-970) - [Python] Accidentally calling pyarrow.Table() should not segfault process
+* [ARROW-977](https://issues.apache.org/jira/browse/ARROW-977) - [Java] Add timezone-aware timestamp vectors
+* [ARROW-980](https://issues.apache.org/jira/browse/ARROW-980) - Fix detection of "msvc" COMPILER\_FAMILY
+* [ARROW-982](https://issues.apache.org/jira/browse/ARROW-982) - [Website] Improve website front copy to highlight serialization efficiency benefits
+* [ARROW-984](https://issues.apache.org/jira/browse/ARROW-984) - [GLib] Add Go examples
+* [ARROW-985](https://issues.apache.org/jira/browse/ARROW-985) - [GLib] Update package information
+* [ARROW-988](https://issues.apache.org/jira/browse/ARROW-988) - [JS] Add entry to Travis CI matrix
+* [ARROW-993](https://issues.apache.org/jira/browse/ARROW-993) - [GLib] Add missing error checks in Go examples
+* [ARROW-996](https://issues.apache.org/jira/browse/ARROW-996) - [Website] Add 0.3 release announce in Japanese
+* [ARROW-997](https://issues.apache.org/jira/browse/ARROW-997) - [Java] Implement transfer in FixedSizeListVector
+* [ARROW-1000](https://issues.apache.org/jira/browse/ARROW-1000) - [GLib] Move install document to Website
+* [ARROW-1001](https://issues.apache.org/jira/browse/ARROW-1001) - [GLib] Unify writer files
+* [ARROW-1002](https://issues.apache.org/jira/browse/ARROW-1002) - [C++] It is not necessary to add padding after the magic header in the FileWriter implementation
+* [ARROW-1008](https://issues.apache.org/jira/browse/ARROW-1008) - [C++] Define abstract interface for stream iteration
+* [ARROW-1010](https://issues.apache.org/jira/browse/ARROW-1010) - [Website] Only show English posts in /blog/
+* [ARROW-1011](https://issues.apache.org/jira/browse/ARROW-1011) - [Format] Clarify requirements around buffer padding in validity bitmaps
+* [ARROW-1014](https://issues.apache.org/jira/browse/ARROW-1014) - 0.4.0 release
+* [ARROW-1015](https://issues.apache.org/jira/browse/ARROW-1015) - [Java] Implement schema-level metadata
+* [ARROW-1016](https://issues.apache.org/jira/browse/ARROW-1016) - Python: Include C++ headers (optionally) in wheels
+* [ARROW-1022](https://issues.apache.org/jira/browse/ARROW-1022) - [Python] Add nthreads option to Feather read method
+* [ARROW-1024](https://issues.apache.org/jira/browse/ARROW-1024) - Python: Update build time numpy version to 1.10.1
+* [ARROW-1025](https://issues.apache.org/jira/browse/ARROW-1025) - [Website] Improve changelog on website
+* [ARROW-1027](https://issues.apache.org/jira/browse/ARROW-1027) - [Python] Allow negative indexing in fields/columns on pyarrow Table and Schema objects
+* [ARROW-1028](https://issues.apache.org/jira/browse/ARROW-1028) - [Python] Documentation updates after ARROW-1008
+* [ARROW-1029](https://issues.apache.org/jira/browse/ARROW-1029) - [Python] Fix --with-parquet build on Windows, add unit tests to Appveyor
+* [ARROW-1030](https://issues.apache.org/jira/browse/ARROW-1030) - Python: Account for library versioning in parquet-cpp
+* [ARROW-1031](https://issues.apache.org/jira/browse/ARROW-1031) - [GLib] Support pretty print
+* [ARROW-1037](https://issues.apache.org/jira/browse/ARROW-1037) - [GLib] Follow reader name change
+* [ARROW-1038](https://issues.apache.org/jira/browse/ARROW-1038) - [GLib] Follow writer name change
+* [ARROW-1040](https://issues.apache.org/jira/browse/ARROW-1040) - [GLib] Follow tensor IO
+* [ARROW-1044](https://issues.apache.org/jira/browse/ARROW-1044) - [GLib] Support Feather
+* [ARROW-1126](https://issues.apache.org/jira/browse/ARROW-1126) - Python: Add function to convert NumPy/Pandas dtypes to Arrow DataTypes
+
+
+
+# Apache Arrow 0.3.0 (2017-05-05)
+
+## Bug Fixes
+
+* [ARROW-109](https://issues.apache.org/jira/browse/ARROW-109) - [C++] Investigate recursive data types limit in flatbuffers
+* [ARROW-208](https://issues.apache.org/jira/browse/ARROW-208) - Add checkstyle policy to java project
+* [ARROW-347](https://issues.apache.org/jira/browse/ARROW-347) - Add method to pass CallBack when creating a transfer pair
+* [ARROW-413](https://issues.apache.org/jira/browse/ARROW-413) - DATE type is not specified clearly
+* [ARROW-431](https://issues.apache.org/jira/browse/ARROW-431) - [Python] Review GIL release and acquisition in to\_pandas conversion
+* [ARROW-443](https://issues.apache.org/jira/browse/ARROW-443) - [Python] Support for converting from strided pandas data in Table.from\_pandas
+* [ARROW-451](https://issues.apache.org/jira/browse/ARROW-451) - [C++] Override DataType::Equals for other types with additional metadata
+* [ARROW-454](https://issues.apache.org/jira/browse/ARROW-454) - pojo.Field doesn't implement hashCode()
+* [ARROW-526](https://issues.apache.org/jira/browse/ARROW-526) - [Format] Update IPC.md to account for File format changes and Streaming format
+* [ARROW-565](https://issues.apache.org/jira/browse/ARROW-565) - [C++] Examine "Field::dictionary" member
+* [ARROW-570](https://issues.apache.org/jira/browse/ARROW-570) - Determine Java tools JAR location from project metadata
+* [ARROW-584](https://issues.apache.org/jira/browse/ARROW-584) - [C++] Fix compiler warnings exposed with -Wconversion
+* [ARROW-586](https://issues.apache.org/jira/browse/ARROW-586) - Problem with reading parquet files saved by Apache Spark
+* [ARROW-588](https://issues.apache.org/jira/browse/ARROW-588) - [C++] Fix compiler warnings on 32-bit platforms
+* [ARROW-595](https://issues.apache.org/jira/browse/ARROW-595) - [Python] StreamReader.schema returns None
+* [ARROW-604](https://issues.apache.org/jira/browse/ARROW-604) - Python: boxed Field instances are missing the reference to DataType
+* [ARROW-611](https://issues.apache.org/jira/browse/ARROW-611) - [Java] TimeVector TypeLayout is incorrectly specified as 64 bit width
+* [ARROW-613](https://issues.apache.org/jira/browse/ARROW-613) - [JS] Implement random-access file format
+* [ARROW-617](https://issues.apache.org/jira/browse/ARROW-617) - Time type is not specified clearly
+* [ARROW-619](https://issues.apache.org/jira/browse/ARROW-619) - Python: Fix typos in setup.py args and LD\_LIBRARY\_PATH
+* [ARROW-623](https://issues.apache.org/jira/browse/ARROW-623) - segfault with \_\_repr\_\_ of empty Field
+* [ARROW-624](https://issues.apache.org/jira/browse/ARROW-624) - [C++] Restore MakePrimitiveArray function
+* [ARROW-627](https://issues.apache.org/jira/browse/ARROW-627) - [C++] Compatibility macros for exported extern template class declarations
+* [ARROW-628](https://issues.apache.org/jira/browse/ARROW-628) - [Python] Install nomkl metapackage when building parquet-cpp for faster Travis builds
+* [ARROW-630](https://issues.apache.org/jira/browse/ARROW-630) - [C++] IPC unloading for BooleanArray does not account for offset
+* [ARROW-636](https://issues.apache.org/jira/browse/ARROW-636) - [C++] Add Boost / other system requirements to C++ README
+* [ARROW-639](https://issues.apache.org/jira/browse/ARROW-639) - [C++] Invalid offset in slices
+* [ARROW-642](https://issues.apache.org/jira/browse/ARROW-642) - [Java] Remove temporary file in java/tools
+* [ARROW-644](https://issues.apache.org/jira/browse/ARROW-644) - Python: Cython should be a setup-only requirement
+* [ARROW-652](https://issues.apache.org/jira/browse/ARROW-652) - Remove trailing f in merge script output
+* [ARROW-654](https://issues.apache.org/jira/browse/ARROW-654) - [C++] Support timezone metadata in file/stream formats
+* [ARROW-666](https://issues.apache.org/jira/browse/ARROW-666) - [Python] Error in DictionaryArray \_\_repr\_\_
+* [ARROW-667](https://issues.apache.org/jira/browse/ARROW-667) - build of arrow-master/cpp fails with altivec error?
+* [ARROW-668](https://issues.apache.org/jira/browse/ARROW-668) - [Python] Convert nanosecond timestamps to pandas.Timestamp when converting from TimestampValue
+* [ARROW-671](https://issues.apache.org/jira/browse/ARROW-671) - [GLib] License file isn't installed
+* [ARROW-673](https://issues.apache.org/jira/browse/ARROW-673) - [Java] Support additional Time metadata
+* [ARROW-677](https://issues.apache.org/jira/browse/ARROW-677) - [Java] Fix checkstyle jcl-over-slf4j conflict issue
+* [ARROW-678](https://issues.apache.org/jira/browse/ARROW-678) - [GLib] Fix dependencies
+* [ARROW-680](https://issues.apache.org/jira/browse/ARROW-680) - [C++] Multiarch support impacts user-supplied install prefix
+* [ARROW-682](https://issues.apache.org/jira/browse/ARROW-682) - Add self-validation checks in integration tests
+* [ARROW-683](https://issues.apache.org/jira/browse/ARROW-683) - [C++] Support date32 (DateUnit::DAY) in IPC metadata, rename date to date64
+* [ARROW-685](https://issues.apache.org/jira/browse/ARROW-685) - [GLib] AX\_CXX\_COMPILE\_STDCXX\_11 error running ./configure
+* [ARROW-686](https://issues.apache.org/jira/browse/ARROW-686) - [C++] Account for time metadata changes, add time32 and time64 types
+* [ARROW-689](https://issues.apache.org/jira/browse/ARROW-689) - [GLib] Install header files and documents to wrong directories
+* [ARROW-691](https://issues.apache.org/jira/browse/ARROW-691) - [Java] Encode dictionary Int type in message format
+* [ARROW-697](https://issues.apache.org/jira/browse/ARROW-697) - [Java] Raise appropriate exceptions when encountering large (\> INT32\_MAX) record batches
+* [ARROW-699](https://issues.apache.org/jira/browse/ARROW-699) - [C++] Arrow dynamic libraries are missed on run of unit tests on Windows
+* [ARROW-702](https://issues.apache.org/jira/browse/ARROW-702) - Fix BitVector.copyFromSafe to reAllocate instead of returning false
+* [ARROW-703](https://issues.apache.org/jira/browse/ARROW-703) - Fix issue where setValueCount(0) doesn’t work in the case that we’ve shipped vectors across the wire
+* [ARROW-704](https://issues.apache.org/jira/browse/ARROW-704) - Fix bad import caused by conflicting changes
+* [ARROW-709](https://issues.apache.org/jira/browse/ARROW-709) - [C++] Restore type comparator for DecimalType
+* [ARROW-713](https://issues.apache.org/jira/browse/ARROW-713) - [C++] Fix linking issue with ipc benchmark
+* [ARROW-715](https://issues.apache.org/jira/browse/ARROW-715) - Python: Explicit pandas import makes it a hard requirement
+* [ARROW-716](https://issues.apache.org/jira/browse/ARROW-716) - error building arrow/python
+* [ARROW-720](https://issues.apache.org/jira/browse/ARROW-720) - [Java] Arrow should not have a dependency on slf4j bridges in compile scope
+* [ARROW-723](https://issues.apache.org/jira/browse/ARROW-723) - Arrow freezes on write if chunk\_size=0
+* [ARROW-726](https://issues.apache.org/jira/browse/ARROW-726) - [C++] PyBuffer dtor may segfault if constructor passed an object not exporting buffer protocol
+* [ARROW-732](https://issues.apache.org/jira/browse/ARROW-732) - Schema comparison bugs in struct and union types
+* [ARROW-736](https://issues.apache.org/jira/browse/ARROW-736) - [Python] Mixed-type object DataFrame columns should not silently coerce to an Arrow type by default
+* [ARROW-738](https://issues.apache.org/jira/browse/ARROW-738) - [Python] Fix manylinux1 packaging
+* [ARROW-739](https://issues.apache.org/jira/browse/ARROW-739) - Parallel build fails non-deterministically
+* [ARROW-740](https://issues.apache.org/jira/browse/ARROW-740) - FileReader fails for large objects
+* [ARROW-747](https://issues.apache.org/jira/browse/ARROW-747) - [C++] Fix spurious warning caused by passing dl to add\_dependencies
+* [ARROW-749](https://issues.apache.org/jira/browse/ARROW-749) - [Python] Delete incomplete binary files when writing fails
+* [ARROW-753](https://issues.apache.org/jira/browse/ARROW-753) - [Python] Unit tests in arrow/python fail to link on some OS X platforms
+* [ARROW-756](https://issues.apache.org/jira/browse/ARROW-756) - [C++] Do not pass -fPIC when compiling with MSVC
+* [ARROW-757](https://issues.apache.org/jira/browse/ARROW-757) - [C++] MSVC build fails on googletest when using NMake
+* [ARROW-762](https://issues.apache.org/jira/browse/ARROW-762) - Kerberos Problem with PyArrow
+* [ARROW-776](https://issues.apache.org/jira/browse/ARROW-776) - [GLib] Cast type is wrong
+* [ARROW-777](https://issues.apache.org/jira/browse/ARROW-777) - [Java] Resolve getObject behavior per changes / discussion in ARROW-729
+* [ARROW-778](https://issues.apache.org/jira/browse/ARROW-778) - Modify merge tool to work on Windows
+* [ARROW-780](https://issues.apache.org/jira/browse/ARROW-780) - PYTHON\_EXECUTABLE Required to be set during build
+* [ARROW-781](https://issues.apache.org/jira/browse/ARROW-781) - [Python/C++] Increase reference count for base object?
+* [ARROW-783](https://issues.apache.org/jira/browse/ARROW-783) - Integration tests fail for length-0 record batch
+* [ARROW-787](https://issues.apache.org/jira/browse/ARROW-787) - [GLib] Fix compilation errors caused by ARROW-758
+* [ARROW-789](https://issues.apache.org/jira/browse/ARROW-789) - Fix issue where setValueCount(0) doesn’t work in the case that we’ve shipped vectors across the wire
+* [ARROW-793](https://issues.apache.org/jira/browse/ARROW-793) - [GLib] Wrong indent
+* [ARROW-794](https://issues.apache.org/jira/browse/ARROW-794) - [C++] Check whether data is contiguous in ipc::WriteTensor
+* [ARROW-796](https://issues.apache.org/jira/browse/ARROW-796) - [Java] Checkstyle additions causing build failure in some environments
+* [ARROW-797](https://issues.apache.org/jira/browse/ARROW-797) - [Python] Add updated pyarrow.\* public API listing in Sphinx docs
+* [ARROW-800](https://issues.apache.org/jira/browse/ARROW-800) - [C++] Boost headers being transitively included in pyarrow
+* [ARROW-805](https://issues.apache.org/jira/browse/ARROW-805) - Listing an empty HDFS directory returns an error instead of an empty list
+* [ARROW-809](https://issues.apache.org/jira/browse/ARROW-809) - C++: Writing sliced record batch to IPC writes the entire array
+* [ARROW-812](https://issues.apache.org/jira/browse/ARROW-812) - pip install pyarrow on macOS fails
+* [ARROW-817](https://issues.apache.org/jira/browse/ARROW-817) - [C++] Fix incorrect code comment from ARROW-722
+* [ARROW-821](https://issues.apache.org/jira/browse/ARROW-821) - [Python] Extra file \_table\_api.h generated during Python build process
+* [ARROW-822](https://issues.apache.org/jira/browse/ARROW-822) - [Python] StreamWriter fails to open with socket as sink
+* [ARROW-826](https://issues.apache.org/jira/browse/ARROW-826) - Compilation error on Mac with -DARROW\_PYTHON=on
+* [ARROW-829](https://issues.apache.org/jira/browse/ARROW-829) - Python: Parquet: Dictionary encoding is deactivated if column-wise compression is selected
+* [ARROW-830](https://issues.apache.org/jira/browse/ARROW-830) - Python: jemalloc is no longer publicly exposed
+* [ARROW-836](https://issues.apache.org/jira/browse/ARROW-836) - Test for timedelta compat with pandas
+* [ARROW-839](https://issues.apache.org/jira/browse/ARROW-839) - [C++] Portable alternative to PyDate\_to\_ms function
+* [ARROW-847](https://issues.apache.org/jira/browse/ARROW-847) - C++: BUILD\_BYPRODUCTS not specified anymore for gtest
+* [ARROW-852](https://issues.apache.org/jira/browse/ARROW-852) - Python: Also set Arrow Library PATHS when detection was done through pkg-config
+* [ARROW-853](https://issues.apache.org/jira/browse/ARROW-853) - [Python] It is no longer necessary to modify the RPATH of the Cython extensions on many environments
+* [ARROW-858](https://issues.apache.org/jira/browse/ARROW-858) - Remove dependency on boost regex
+* [ARROW-866](https://issues.apache.org/jira/browse/ARROW-866) - [Python] Error from file object destructor
+* [ARROW-867](https://issues.apache.org/jira/browse/ARROW-867) - [Python] Miscellaneous pyarrow MSVC fixes
+* [ARROW-875](https://issues.apache.org/jira/browse/ARROW-875) - Nullable variable length vector fillEmpties() fills an extra value
+* [ARROW-879](https://issues.apache.org/jira/browse/ARROW-879) - compat with pandas 0.20.0
+* [ARROW-882](https://issues.apache.org/jira/browse/ARROW-882) - [C++] On Windows statically built lib file overwrites lib file of shared build
+* [ARROW-883](https://issues.apache.org/jira/browse/ARROW-883) - [JAVA] Introduction of new types has shifted Enumerations
+* [ARROW-885](https://issues.apache.org/jira/browse/ARROW-885) - [Python/C++] Decimal test failure on MSVC
+* [ARROW-886](https://issues.apache.org/jira/browse/ARROW-886) - VariableLengthVectors don't reAlloc offsets
+* [ARROW-887](https://issues.apache.org/jira/browse/ARROW-887) - [format] For backward compatibility, new unit fields must have default values matching previous implied unit
+* [ARROW-888](https://issues.apache.org/jira/browse/ARROW-888) - BitVector transfer() does not transfer ownership
+* [ARROW-895](https://issues.apache.org/jira/browse/ARROW-895) - Nullable variable length vector lastSet not set correctly
+* [ARROW-900](https://issues.apache.org/jira/browse/ARROW-900) - [Python] UnboundLocalError in ParquetDatasetPiece
+* [ARROW-903](https://issues.apache.org/jira/browse/ARROW-903) - [GLib] Remove a needless "."
+* [ARROW-914](https://issues.apache.org/jira/browse/ARROW-914) - [C++/Python] Fix Decimal ToBytes
+* [ARROW-922](https://issues.apache.org/jira/browse/ARROW-922) - Allow Flatbuffers and RapidJSON to be used locally on Windows
+* [ARROW-927](https://issues.apache.org/jira/browse/ARROW-927) - C++/Python: Add manylinux1 builds to Travis matrix
+* [ARROW-928](https://issues.apache.org/jira/browse/ARROW-928) - Update CMAKE script to detect unsupported msvc compilers versions
+* [ARROW-933](https://issues.apache.org/jira/browse/ARROW-933) - [Python] arrow\_python bindings have debug print statement
+* [ARROW-934](https://issues.apache.org/jira/browse/ARROW-934) - [GLib] Glib sources missing from result of 02-source.sh
+* [ARROW-936](https://issues.apache.org/jira/browse/ARROW-936) - Fix release README
+* [ARROW-938](https://issues.apache.org/jira/browse/ARROW-938) - Fix Apache Rat errors from source release build
+
+
+## New Features and Improvements
+
+* [ARROW-6](https://issues.apache.org/jira/browse/ARROW-6) - Hope to add development document
+* [ARROW-39](https://issues.apache.org/jira/browse/ARROW-39) - C++: Logical chunked arrays / columns: conforming to fixed chunk sizes
+* [ARROW-52](https://issues.apache.org/jira/browse/ARROW-52) - Set up project blog
+* [ARROW-95](https://issues.apache.org/jira/browse/ARROW-95) - Scaffold Main Documentation using asciidoc
+* [ARROW-98](https://issues.apache.org/jira/browse/ARROW-98) - Java: API documentation
+* [ARROW-99](https://issues.apache.org/jira/browse/ARROW-99) - C++: Explore if RapidCheck may be helpful for testing / worth adding to toolchain
+* [ARROW-183](https://issues.apache.org/jira/browse/ARROW-183) - C++: Add storage type to DecimalType
+* [ARROW-231](https://issues.apache.org/jira/browse/ARROW-231) - C++: Add typed Resize to PoolBuffer
+* [ARROW-281](https://issues.apache.org/jira/browse/ARROW-281) - [C++] IPC/RPC support on Win32 platforms
+* [ARROW-316](https://issues.apache.org/jira/browse/ARROW-316) - Finalize Date type
+* [ARROW-341](https://issues.apache.org/jira/browse/ARROW-341) - [Python] Making libpyarrow available to third parties
+* [ARROW-452](https://issues.apache.org/jira/browse/ARROW-452) - [C++/Python] Merge "Feather" file format implementation
+* [ARROW-459](https://issues.apache.org/jira/browse/ARROW-459) - [C++] Implement IPC round trip for DictionaryArray, dictionaries shared across record batches
+* [ARROW-483](https://issues.apache.org/jira/browse/ARROW-483) - [C++/Python] Provide access to "custom\_metadata" Field attribute in IPC setting
+* [ARROW-491](https://issues.apache.org/jira/browse/ARROW-491) - [C++] Add FixedWidthBinary type
+* [ARROW-492](https://issues.apache.org/jira/browse/ARROW-492) - [C++] Add arrow/arrow.h public API
+* [ARROW-493](https://issues.apache.org/jira/browse/ARROW-493) - [C++] Allow in-memory array over 2^31 -1 elements but require splitting at IPC / RPC boundaries
+* [ARROW-502](https://issues.apache.org/jira/browse/ARROW-502) - [C++/Python] Add MemoryPool implementation that logs allocation activity to std::cout
+* [ARROW-510](https://issues.apache.org/jira/browse/ARROW-510) - Add integration tests for date and time types
+* [ARROW-518](https://issues.apache.org/jira/browse/ARROW-518) - C++: Make Status::OK method constexpr
+* [ARROW-520](https://issues.apache.org/jira/browse/ARROW-520) - [C++] Add STL-compliant allocator that hooks into an arrow::MemoryPool
+* [ARROW-528](https://issues.apache.org/jira/browse/ARROW-528) - [Python] Support \_metadata or \_common\_metadata files when reading Parquet directories
+* [ARROW-534](https://issues.apache.org/jira/browse/ARROW-534) - [C++] Add IPC tests for date/time types
+* [ARROW-539](https://issues.apache.org/jira/browse/ARROW-539) - [Python] Support reading Parquet datasets with standard partition directory schemes
+* [ARROW-542](https://issues.apache.org/jira/browse/ARROW-542) - [Java] Implement dictionaries in stream/file encoding
+* [ARROW-550](https://issues.apache.org/jira/browse/ARROW-550) - [Format] Add a TensorMessage type
+* [ARROW-552](https://issues.apache.org/jira/browse/ARROW-552) - [Python] Add scalar value support for Dictionary type
+* [ARROW-557](https://issues.apache.org/jira/browse/ARROW-557) - [Python] Explicitly opt in to HDFS unit tests
+* [ARROW-563](https://issues.apache.org/jira/browse/ARROW-563) - C++: Support non-standard gcc version strings
+* [ARROW-566](https://issues.apache.org/jira/browse/ARROW-566) - Python: Deterministic position of libarrow in manylinux1 wheels
+* [ARROW-568](https://issues.apache.org/jira/browse/ARROW-568) - [C++] Add default implementations for TypeVisitor, ArrayVisitor methods that return NotImplemented
+* [ARROW-569](https://issues.apache.org/jira/browse/ARROW-569) - [C++] Set version for \*.pc
+* [ARROW-574](https://issues.apache.org/jira/browse/ARROW-574) - Python: Add support for nested Python lists in Pandas conversion
+* [ARROW-576](https://issues.apache.org/jira/browse/ARROW-576) - [C++] Complete round trip Union file/stream IPC tests
+* [ARROW-577](https://issues.apache.org/jira/browse/ARROW-577) - [C++] Refactor StreamWriter and FileWriter to have private implementations
+* [ARROW-578](https://issues.apache.org/jira/browse/ARROW-578) - [C++] Add CMake option to add custom $CXXFLAGS
+* [ARROW-580](https://issues.apache.org/jira/browse/ARROW-580) - C++: Also provide jemalloc\_X targets if only a static or shared version is found
+* [ARROW-582](https://issues.apache.org/jira/browse/ARROW-582) - [Java] Add Date/Time Support to JSON File
+* [ARROW-589](https://issues.apache.org/jira/browse/ARROW-589) - C++: Use system provided shared jemalloc if static is unavailable
+* [ARROW-591](https://issues.apache.org/jira/browse/ARROW-591) - [C++] Add round trip testing fixture for JSON format
+* [ARROW-593](https://issues.apache.org/jira/browse/ARROW-593) - [C++] Rename ReadableFileInterface to RandomAccessFile
+* [ARROW-598](https://issues.apache.org/jira/browse/ARROW-598) - [Python] Add support for converting pyarrow.Buffer to a memoryview with zero copy
+* [ARROW-603](https://issues.apache.org/jira/browse/ARROW-603) - [C++] Add RecordBatch::Validate method that at least checks that schema matches the array metadata
+* [ARROW-605](https://issues.apache.org/jira/browse/ARROW-605) - [C++] Refactor generic ArrayLoader class, support work for Feather merge
+* [ARROW-606](https://issues.apache.org/jira/browse/ARROW-606) - [C++] Upgrade to flatbuffers 1.6.0
+* [ARROW-608](https://issues.apache.org/jira/browse/ARROW-608) - [Format] Days since epoch date type
+* [ARROW-610](https://issues.apache.org/jira/browse/ARROW-610) - [C++] Win32 compatibility in file.cc
+* [ARROW-612](https://issues.apache.org/jira/browse/ARROW-612) - [Java] Field toString should show nullable flag status
+* [ARROW-615](https://issues.apache.org/jira/browse/ARROW-615) - Move ByteArrayReadableSeekableByteChannel to vector.util package
+* [ARROW-616](https://issues.apache.org/jira/browse/ARROW-616) - [C++] Remove -g flag in release builds
+* [ARROW-618](https://issues.apache.org/jira/browse/ARROW-618) - [Python] Implement support for DatetimeTZ custom type from pandas
+* [ARROW-620](https://issues.apache.org/jira/browse/ARROW-620) - [C++] Add date/time support to JSON reader/writer for integration testing
+* [ARROW-621](https://issues.apache.org/jira/browse/ARROW-621) - [C++] Implement an "inline visitor" template that enables visitor-pattern-like code without virtual function dispatch
+* [ARROW-625](https://issues.apache.org/jira/browse/ARROW-625) - [C++] Add time unit to TimeType::ToString
+* [ARROW-626](https://issues.apache.org/jira/browse/ARROW-626) - [Python] Enable pyarrow.BufferReader to read from any Python object implementing the buffer/memoryview protocol
+* [ARROW-631](https://issues.apache.org/jira/browse/ARROW-631) - [GLib] Import C API (C++ API wrapper) based on GLib from https://github.com/kou/arrow-glib
+* [ARROW-632](https://issues.apache.org/jira/browse/ARROW-632) - [Python] Add support for FixedWidthBinary type
+* [ARROW-635](https://issues.apache.org/jira/browse/ARROW-635) - [C++] Add JSON read/write support for FixedWidthBinary
+* [ARROW-637](https://issues.apache.org/jira/browse/ARROW-637) - [Format] Add time zone metadata to Timestamp type
+* [ARROW-646](https://issues.apache.org/jira/browse/ARROW-646) - Cache miniconda packages
+* [ARROW-647](https://issues.apache.org/jira/browse/ARROW-647) - [C++] Don't require Boost static libraries to support CentOS 7
+* [ARROW-648](https://issues.apache.org/jira/browse/ARROW-648) - [C++] Support multiarch on Debian
+* [ARROW-650](https://issues.apache.org/jira/browse/ARROW-650) - [GLib] Follow ReadableFileInterface -\> RandomAccessFile change
+* [ARROW-651](https://issues.apache.org/jira/browse/ARROW-651) - [C++] Set shared library version for .deb packages
+* [ARROW-655](https://issues.apache.org/jira/browse/ARROW-655) - Implement DecimalArray
+* [ARROW-656](https://issues.apache.org/jira/browse/ARROW-656) - [C++] Implement IO interface that can read and write to a fixed-size mutable buffer
+* [ARROW-657](https://issues.apache.org/jira/browse/ARROW-657) - [Python] Write and read tensors (with zero copy) into shared memory
+* [ARROW-658](https://issues.apache.org/jira/browse/ARROW-658) - [C++] Implement in-memory arrow::Tensor objects
+* [ARROW-659](https://issues.apache.org/jira/browse/ARROW-659) - [C++] Add multithreaded memcpy implementation (for hardware where it helps)
+* [ARROW-660](https://issues.apache.org/jira/browse/ARROW-660) - [C++] Restore function that can read a complete encapsulated record batch message
+* [ARROW-661](https://issues.apache.org/jira/browse/ARROW-661) - [C++] Add a Flatbuffer metadata type that supports array data over 2^31 - 1 elements
+* [ARROW-662](https://issues.apache.org/jira/browse/ARROW-662) - [Format] Factor Flatbuffer schema metadata into a Schema.fbs
+* [ARROW-663](https://issues.apache.org/jira/browse/ARROW-663) - [Java] Support additional Time metadata + vector value accessors
+* [ARROW-664](https://issues.apache.org/jira/browse/ARROW-664) - Make C++ Arrow serialization deterministic
+* [ARROW-669](https://issues.apache.org/jira/browse/ARROW-669) - [Python] Attach proper tzinfo when computing boxed scalars for TimestampArray
+* [ARROW-670](https://issues.apache.org/jira/browse/ARROW-670) - Arrow 0.3 release
+* [ARROW-672](https://issues.apache.org/jira/browse/ARROW-672) - [Format] Bump metadata version for 0.3 release
+* [ARROW-674](https://issues.apache.org/jira/browse/ARROW-674) - [Java] Support additional Timestamp timezone metadata
+* [ARROW-675](https://issues.apache.org/jira/browse/ARROW-675) - [GLib] Update package metadata
+* [ARROW-676](https://issues.apache.org/jira/browse/ARROW-676) - [Java] Move from MinorType to FieldType in ValueVectors to carry all the relevant type bits
+* [ARROW-679](https://issues.apache.org/jira/browse/ARROW-679) - [Format] Change RecordBatch and Field length members from int to long
+* [ARROW-681](https://issues.apache.org/jira/browse/ARROW-681) - [C++] Build Arrow on Windows with dynamically linked boost
+* [ARROW-684](https://issues.apache.org/jira/browse/ARROW-684) - Python: More informative message when parquet-cpp but not parquet-arrow is available
+* [ARROW-687](https://issues.apache.org/jira/browse/ARROW-687) - [C++] Build and run full test suite in Appveyor
+* [ARROW-688](https://issues.apache.org/jira/browse/ARROW-688) - [C++] Use CMAKE\_INSTALL\_INCLUDEDIR for consistency
+* [ARROW-690](https://issues.apache.org/jira/browse/ARROW-690) - Only send JIRA updates to [email protected]
+* [ARROW-698](https://issues.apache.org/jira/browse/ARROW-698) - [C++] Add options to StreamWriter/FileWriter to permit large record batches
+* [ARROW-700](https://issues.apache.org/jira/browse/ARROW-700) - Add headroom interface for allocator
+* [ARROW-701](https://issues.apache.org/jira/browse/ARROW-701) - [Java] Support additional Date metadata
+* [ARROW-706](https://issues.apache.org/jira/browse/ARROW-706) - [GLib] Add package install document
+* [ARROW-707](https://issues.apache.org/jira/browse/ARROW-707) - Python: All-None Pandas columns should be converted to NullArray
+* [ARROW-708](https://issues.apache.org/jira/browse/ARROW-708) - [C++] Some IPC code simplification, perf analysis
+* [ARROW-710](https://issues.apache.org/jira/browse/ARROW-710) - [Python] Enable Feather APIs to read and write using Python file-like objects
+* [ARROW-711](https://issues.apache.org/jira/browse/ARROW-711) - [C++] Remove extern template declarations for NumericArray<T\> types
+* [ARROW-712](https://issues.apache.org/jira/browse/ARROW-712) - [C++] Implement Array::Accept as inline visitor
+* [ARROW-717](https://issues.apache.org/jira/browse/ARROW-717) - [C++] IPC zero-copy round trips for arrow::Tensor
+* [ARROW-718](https://issues.apache.org/jira/browse/ARROW-718) - [Python] Expose arrow::Tensor with conversions to/from NumPy arrays
+* [ARROW-719](https://issues.apache.org/jira/browse/ARROW-719) - [GLib] Support prepared source archive release
+* [ARROW-722](https://issues.apache.org/jira/browse/ARROW-722) - [Python] pandas conversions for new date and time types/metadata
+* [ARROW-724](https://issues.apache.org/jira/browse/ARROW-724) - Add "How to Contribute" section to README
+* [ARROW-725](https://issues.apache.org/jira/browse/ARROW-725) - [Format] Constant length list type
+* [ARROW-727](https://issues.apache.org/jira/browse/ARROW-727) - [Python] Write memoryview-compatible objects in NativeFile.write with zero copy
+* [ARROW-728](https://issues.apache.org/jira/browse/ARROW-728) - [C++/Python] Add arrow::Table function for removing a column
+* [ARROW-729](https://issues.apache.org/jira/browse/ARROW-729) - [Java] Add vector type for 32-bit date as days since UNIX epoch
+* [ARROW-731](https://issues.apache.org/jira/browse/ARROW-731) - [C++] Add shared library related versions to .pc
+* [ARROW-733](https://issues.apache.org/jira/browse/ARROW-733) - [C++/Format] Change name of Fixed Width Binary to Fixed \*Size\* Binary for consistency
+* [ARROW-734](https://issues.apache.org/jira/browse/ARROW-734) - [Python] Support for pyarrow on Windows / MSVC
+* [ARROW-735](https://issues.apache.org/jira/browse/ARROW-735) - [C++] Developer instruction document for MSVC on Windows
+* [ARROW-737](https://issues.apache.org/jira/browse/ARROW-737) - [C++] Support obtaining mutable slices of mutable buffers
+* [ARROW-741](https://issues.apache.org/jira/browse/ARROW-741) - [Python] Add Python 3.6 to Travis CI
+* [ARROW-743](https://issues.apache.org/jira/browse/ARROW-743) - [C++] Consolidate unit tests for code in array.h
+* [ARROW-744](https://issues.apache.org/jira/browse/ARROW-744) - [GLib] Re-add an assertion to garrow\_table\_new() test
+* [ARROW-745](https://issues.apache.org/jira/browse/ARROW-745) - [C++] Allow use of system cpplint
+* [ARROW-746](https://issues.apache.org/jira/browse/ARROW-746) - [GLib] Add garrow\_array\_get\_data\_type()
+* [ARROW-748](https://issues.apache.org/jira/browse/ARROW-748) - [Python] Pin runtime library versions in conda-forge packages to force upgrades
+* [ARROW-751](https://issues.apache.org/jira/browse/ARROW-751) - [Python] Rename all Cython extensions to "private" status with leading underscore
+* [ARROW-752](https://issues.apache.org/jira/browse/ARROW-752) - [Python] Construct pyarrow.DictionaryArray from boxed pyarrow array objects
+* [ARROW-754](https://issues.apache.org/jira/browse/ARROW-754) - [GLib] Add garrow\_array\_is\_null()
+* [ARROW-755](https://issues.apache.org/jira/browse/ARROW-755) - [GLib] Add garrow\_array\_get\_value\_type()
+* [ARROW-758](https://issues.apache.org/jira/browse/ARROW-758) - [C++] Fix compiler warnings on MSVC x64
+* [ARROW-761](https://issues.apache.org/jira/browse/ARROW-761) - [Python] Add function to compute the total size of tensor payloads, including metadata and padding
+* [ARROW-763](https://issues.apache.org/jira/browse/ARROW-763) - C++: Use \`python-config\` to find libpythonX.X.dylib
+* [ARROW-765](https://issues.apache.org/jira/browse/ARROW-765) - [Python] Make generic ArrowException subclass value error
+* [ARROW-768](https://issues.apache.org/jira/browse/ARROW-768) - [Java] Change the "boxed" object representation of date and time types
+* [ARROW-769](https://issues.apache.org/jira/browse/ARROW-769) - [GLib] Support building without installed Arrow C++
+* [ARROW-770](https://issues.apache.org/jira/browse/ARROW-770) - [C++] Move clang-tidy/format config files back to C++ source tree
+* [ARROW-771](https://issues.apache.org/jira/browse/ARROW-771) - [Python] Add APIs for reading individual Parquet row groups
+* [ARROW-773](https://issues.apache.org/jira/browse/ARROW-773) - [C++] Add function to create arrow::Table with column appended to existing table
+* [ARROW-774](https://issues.apache.org/jira/browse/ARROW-774) - [GLib] Remove needless LICENSE.txt copy
+* [ARROW-775](https://issues.apache.org/jira/browse/ARROW-775) - [Java] Add simple constructors to value vectors
+* [ARROW-779](https://issues.apache.org/jira/browse/ARROW-779) - [C++/Python] Raise exception if old metadata encountered
+* [ARROW-782](https://issues.apache.org/jira/browse/ARROW-782) - [C++] Change struct to class for objects that meet the criteria in the Google style guide
+* [ARROW-788](https://issues.apache.org/jira/browse/ARROW-788) - Possible nondeterminism in Tensor serialization code
+* [ARROW-795](https://issues.apache.org/jira/browse/ARROW-795) - [C++] Combine libarrow/libarrow\_io/libarrow\_ipc
+* [ARROW-798](https://issues.apache.org/jira/browse/ARROW-798) - [Docs] Publish Format Markdown documents somehow on arrow.apache.org
+* [ARROW-802](https://issues.apache.org/jira/browse/ARROW-802) - [GLib] Add read examples
+* [ARROW-803](https://issues.apache.org/jira/browse/ARROW-803) - [GLib] Update package repository URL
+* [ARROW-804](https://issues.apache.org/jira/browse/ARROW-804) - [GLib] Update build document
+* [ARROW-806](https://issues.apache.org/jira/browse/ARROW-806) - [GLib] Support add/remove a column from table
+* [ARROW-807](https://issues.apache.org/jira/browse/ARROW-807) - [GLib] Update "Since" tag
+* [ARROW-808](https://issues.apache.org/jira/browse/ARROW-808) - [GLib] Remove needless ignore entries
+* [ARROW-810](https://issues.apache.org/jira/browse/ARROW-810) - [GLib] Remove io/ipc prefix
+* [ARROW-811](https://issues.apache.org/jira/browse/ARROW-811) - [GLib] Add GArrowBuffer
+* [ARROW-815](https://issues.apache.org/jira/browse/ARROW-815) - [Java] Allow for expanding underlying buffer size after allocation
+* [ARROW-816](https://issues.apache.org/jira/browse/ARROW-816) - [C++] Use conda packages for RapidJSON, Flatbuffers to speed up builds
+* [ARROW-818](https://issues.apache.org/jira/browse/ARROW-818) - [Python] Review public pyarrow.\* API completeness and update docs
+* [ARROW-820](https://issues.apache.org/jira/browse/ARROW-820) - [C++] Build dependencies for Parquet library without arrow support
+* [ARROW-825](https://issues.apache.org/jira/browse/ARROW-825) - [Python] Generalize pyarrow.from\_pylist to accept any object implementing the PySequence protocol
+* [ARROW-827](https://issues.apache.org/jira/browse/ARROW-827) - [Python] Variety of Parquet improvements to support Dask integration
+* [ARROW-828](https://issues.apache.org/jira/browse/ARROW-828) - [C++] Document new requirement (libboost-regex-dev) in README.md
+* [ARROW-831](https://issues.apache.org/jira/browse/ARROW-831) - Switch from boost::regex to std::regex
+* [ARROW-832](https://issues.apache.org/jira/browse/ARROW-832) - [C++] Upgrade thirdparty gtest to 1.8.0
+* [ARROW-833](https://issues.apache.org/jira/browse/ARROW-833) - [Python] "Quickstart" build / environment setup guide for Python developers
+* [ARROW-841](https://issues.apache.org/jira/browse/ARROW-841) - [Python] Add pyarrow build to Appveyor
+* [ARROW-844](https://issues.apache.org/jira/browse/ARROW-844) - [Format] Revise format/README.md to reflect progress reaching a more complete specification
+* [ARROW-845](https://issues.apache.org/jira/browse/ARROW-845) - [Python] Sync FindArrow.cmake changes from parquet-cpp
+* [ARROW-846](https://issues.apache.org/jira/browse/ARROW-846) - [GLib] Add GArrowTensor, GArrowInt8Tensor and GArrowUInt8Tensor
+* [ARROW-848](https://issues.apache.org/jira/browse/ARROW-848) - [Python] Improvements / fixes to conda quickstart guide
+* [ARROW-849](https://issues.apache.org/jira/browse/ARROW-849) - [C++] Add optional $ARROW\_BUILD\_TOOLCHAIN environment variable option for configuring build environment
+* [ARROW-857](https://issues.apache.org/jira/browse/ARROW-857) - [Python] Automate publishing Python documentation to arrow-site
+* [ARROW-859](https://issues.apache.org/jira/browse/ARROW-859) - [C++] Do not build unit tests by default?
+* [ARROW-860](https://issues.apache.org/jira/browse/ARROW-860) - [C++] Decide if typed Tensor subclasses are worthwhile
+* [ARROW-861](https://issues.apache.org/jira/browse/ARROW-861) - [Python] Move DEVELOPMENT.md to Sphinx docs
+* [ARROW-862](https://issues.apache.org/jira/browse/ARROW-862) - [Python] Improve source build instructions in README
+* [ARROW-863](https://issues.apache.org/jira/browse/ARROW-863) - [GLib] Use GBytes to implement zero-copy
+* [ARROW-864](https://issues.apache.org/jira/browse/ARROW-864) - [GLib] Unify Array files
+* [ARROW-865](https://issues.apache.org/jira/browse/ARROW-865) - [Python] Verify Parquet roundtrips for new date/time types
+* [ARROW-868](https://issues.apache.org/jira/browse/ARROW-868) - [GLib] Use GBytes to reduce copy
+* [ARROW-869](https://issues.apache.org/jira/browse/ARROW-869) - [JS] Rename directory to js/
+* [ARROW-871](https://issues.apache.org/jira/browse/ARROW-871) - [GLib] Unify DataType files
+* [ARROW-876](https://issues.apache.org/jira/browse/ARROW-876) - [GLib] Unify ArrayBuffer files
+* [ARROW-877](https://issues.apache.org/jira/browse/ARROW-877) - [GLib] Add garrow\_array\_get\_null\_bitmap()
+* [ARROW-878](https://issues.apache.org/jira/browse/ARROW-878) - [GLib] Add garrow\_binary\_array\_get\_buffer()
+* [ARROW-880](https://issues.apache.org/jira/browse/ARROW-880) - [GLib] Add garrow\_primitive\_array\_get\_buffer()
+* [ARROW-890](https://issues.apache.org/jira/browse/ARROW-890) - [GLib] Add GArrowMutableBuffer
+* [ARROW-892](https://issues.apache.org/jira/browse/ARROW-892) - [GLib] Fix GArrowTensor document
+* [ARROW-893](https://issues.apache.org/jira/browse/ARROW-893) - Add GLib document to Web site
+* [ARROW-894](https://issues.apache.org/jira/browse/ARROW-894) - [GLib] Add GArrowPoolBuffer
+* [ARROW-896](https://issues.apache.org/jira/browse/ARROW-896) - [Docs] Add Jekyll plugin for including rendered Jupyter notebooks on website
+* [ARROW-898](https://issues.apache.org/jira/browse/ARROW-898) - [C++] Expand metadata support to field level, provide for sharing instances of KeyValueMetadata
+* [ARROW-904](https://issues.apache.org/jira/browse/ARROW-904) - [GLib] Simplify error check codes
+* [ARROW-907](https://issues.apache.org/jira/browse/ARROW-907) - C++: Convenience construction of Table from schema and arrays
+* [ARROW-908](https://issues.apache.org/jira/browse/ARROW-908) - [GLib] Unify OutputStream files
+* [ARROW-910](https://issues.apache.org/jira/browse/ARROW-910) - [C++] Write 0-length EOS indicator at end of stream
+* [ARROW-916](https://issues.apache.org/jira/browse/ARROW-916) - [GLib] Add GArrowBufferOutputStream
+* [ARROW-917](https://issues.apache.org/jira/browse/ARROW-917) - [GLib] Add GArrowBufferReader
+* [ARROW-918](https://issues.apache.org/jira/browse/ARROW-918) - [GLib] Use GArrowBuffer for read
+* [ARROW-919](https://issues.apache.org/jira/browse/ARROW-919) - [GLib] Use "id" to get type enum value from GArrowDataType
+* [ARROW-920](https://issues.apache.org/jira/browse/ARROW-920) - [GLib] Add Lua examples
+* [ARROW-925](https://issues.apache.org/jira/browse/ARROW-925) - [GLib] Fix GArrowBufferReader test
+* [ARROW-926](https://issues.apache.org/jira/browse/ARROW-926) - Update KEYS to include wesm
+* [ARROW-930](https://issues.apache.org/jira/browse/ARROW-930) - Javadoc generation fails with Java 8
+* [ARROW-931](https://issues.apache.org/jira/browse/ARROW-931) - [GLib] Reconstruct input stream
+* [ARROW-965](https://issues.apache.org/jira/browse/ARROW-965) - Website updates for 0.3.0 release
+
+
+
+# Apache Arrow 0.2.0 (2017-02-18)
+
+## Bug Fixes
+
+* [ARROW-112](https://issues.apache.org/jira/browse/ARROW-112) - [C++] Style fix for constants/enums
+* [ARROW-202](https://issues.apache.org/jira/browse/ARROW-202) - [C++] Integrate with appveyor ci for windows support and get arrow building on windows
+* [ARROW-220](https://issues.apache.org/jira/browse/ARROW-220) - [C++] Build conda artifacts in a build environment with better cross-linux ABI compatibility
+* [ARROW-224](https://issues.apache.org/jira/browse/ARROW-224) - [C++] Address static linking of boost dependencies
+* [ARROW-230](https://issues.apache.org/jira/browse/ARROW-230) - Python: Do not name modules like native ones (i.e. rename pyarrow.io)
+* [ARROW-239](https://issues.apache.org/jira/browse/ARROW-239) - [Python] HdfsFile.read called with no arguments should read remainder of file
+* [ARROW-261](https://issues.apache.org/jira/browse/ARROW-261) - [C++] Refactor BinaryArray/StringArray classes to not inherit from ListArray
+* [ARROW-273](https://issues.apache.org/jira/browse/ARROW-273) - Lists use unsigned offset vectors instead of signed (as defined in the spec)
+* [ARROW-275](https://issues.apache.org/jira/browse/ARROW-275) - Add tests for UnionVector in Arrow File
+* [ARROW-294](https://issues.apache.org/jira/browse/ARROW-294) - [C++] Do not use fopen / fclose / etc. methods for memory mapped file implementation
+* [ARROW-322](https://issues.apache.org/jira/browse/ARROW-322) - [C++] Do not build HDFS IO interface optionally
+* [ARROW-323](https://issues.apache.org/jira/browse/ARROW-323) - [Python] Opt-in to PyArrow parquet build rather than skipping silently on failure
+* [ARROW-334](https://issues.apache.org/jira/browse/ARROW-334) - [Python] OS X rpath issues on some configurations
+* [ARROW-337](https://issues.apache.org/jira/browse/ARROW-337) - UnionListWriter.list() is doing more than it should, this can cause data corruption
+* [ARROW-339](https://issues.apache.org/jira/browse/ARROW-339) - Make merge\_arrow\_pr script work with Python 3
+* [ARROW-340](https://issues.apache.org/jira/browse/ARROW-340) - [C++] Opening a writeable file on disk that already exists does not truncate to zero
+* [ARROW-342](https://issues.apache.org/jira/browse/ARROW-342) - Set Python version on release
+* [ARROW-345](https://issues.apache.org/jira/browse/ARROW-345) - libhdfs integration doesn't work for Mac
+* [ARROW-346](https://issues.apache.org/jira/browse/ARROW-346) - Python API Documentation
+* [ARROW-348](https://issues.apache.org/jira/browse/ARROW-348) - [Python] CMake build type should be configurable on the command line
+* [ARROW-349](https://issues.apache.org/jira/browse/ARROW-349) - Six is missing as a requirement in the python setup.py
+* [ARROW-351](https://issues.apache.org/jira/browse/ARROW-351) - Time type has no unit
+* [ARROW-354](https://issues.apache.org/jira/browse/ARROW-354) - Cannot compare an array of empty strings to another
+* [ARROW-357](https://issues.apache.org/jira/browse/ARROW-357) - Default Parquet chunk\_size of 64k is too small
+* [ARROW-358](https://issues.apache.org/jira/browse/ARROW-358) - [C++] libhdfs can be in non-standard locations in some Hadoop distributions
+* [ARROW-362](https://issues.apache.org/jira/browse/ARROW-362) - Python: Calling to\_pandas on a table read from Parquet leaks memory
+* [ARROW-371](https://issues.apache.org/jira/browse/ARROW-371) - Python: Table with null timestamp becomes float in pandas
+* [ARROW-375](https://issues.apache.org/jira/browse/ARROW-375) - columns parameter in parquet.read\_table() raises KeyError for valid column
+* [ARROW-384](https://issues.apache.org/jira/browse/ARROW-384) - Align Java and C++ RecordBatch data and metadata layout
+* [ARROW-386](https://issues.apache.org/jira/browse/ARROW-386) - [Java] Respect case of struct / map field names
+* [ARROW-387](https://issues.apache.org/jira/browse/ARROW-387) - [C++] arrow::io::BufferReader does not permit shared memory ownership in zero-copy reads
+* [ARROW-390](https://issues.apache.org/jira/browse/ARROW-390) - C++: CMake fails on json-integration-test with ARROW\_BUILD\_TESTS=OFF
+* [ARROW-392](https://issues.apache.org/jira/browse/ARROW-392) - Fix string/binary integration tests
+* [ARROW-393](https://issues.apache.org/jira/browse/ARROW-393) - [JAVA] JSON file reader fails to set the buffer size on String data vector
+* [ARROW-395](https://issues.apache.org/jira/browse/ARROW-395) - Arrow file format writes record batches in reverse order.
+* [ARROW-398](https://issues.apache.org/jira/browse/ARROW-398) - [Java] Java file format requires bitmaps of all 1's to be written when there are no nulls
+* [ARROW-399](https://issues.apache.org/jira/browse/ARROW-399) - [Java] ListVector.loadFieldBuffers ignores the ArrowFieldNode length metadata
+* [ARROW-400](https://issues.apache.org/jira/browse/ARROW-400) - [Java] ArrowWriter writes length 0 for Struct types
+* [ARROW-401](https://issues.apache.org/jira/browse/ARROW-401) - [Java] Floating point vectors should do an approximate comparison in integration tests
+* [ARROW-402](https://issues.apache.org/jira/browse/ARROW-402) - [Java] "refCnt gone negative" error in integration tests
+* [ARROW-403](https://issues.apache.org/jira/browse/ARROW-403) - [JAVA] UnionVector: Creating a transfer pair doesn't transfer the schema to destination vector
+* [ARROW-404](https://issues.apache.org/jira/browse/ARROW-404) - [Python] Closing an HdfsClient while there are still open file handles results in a crash
+* [ARROW-405](https://issues.apache.org/jira/browse/ARROW-405) - [C++] Be less stringent about finding include/hdfs.h in HADOOP\_HOME
+* [ARROW-406](https://issues.apache.org/jira/browse/ARROW-406) - [C++] Large HDFS reads must utilize the set file buffer size when making RPCs
+* [ARROW-408](https://issues.apache.org/jira/browse/ARROW-408) - [C++/Python] Remove defunct conda recipes
+* [ARROW-414](https://issues.apache.org/jira/browse/ARROW-414) - [Java] "Buffer too large to resize to ..." error
+* [ARROW-420](https://issues.apache.org/jira/browse/ARROW-420) - Align Date implementation between Java and C++
+* [ARROW-421](https://issues.apache.org/jira/browse/ARROW-421) - [Python] Zero-copy buffers read by pyarrow::PyBytesReader must retain a reference to the parent PyBytes to avoid premature garbage collection issues
+* [ARROW-422](https://issues.apache.org/jira/browse/ARROW-422) - C++: IPC should depend on rapidjson\_ep if RapidJSON is vendored
+* [ARROW-429](https://issues.apache.org/jira/browse/ARROW-429) - git-archive SHA-256 checksums are changing
+* [ARROW-433](https://issues.apache.org/jira/browse/ARROW-433) - [Python] Date conversion is locale-dependent
+* [ARROW-434](https://issues.apache.org/jira/browse/ARROW-434) - Segfaults and encoding issues in Python Parquet reads
+* [ARROW-435](https://issues.apache.org/jira/browse/ARROW-435) - C++: Spelling mistake in if(RAPIDJSON\_VENDORED)
+* [ARROW-437](https://issues.apache.org/jira/browse/ARROW-437) - [C++] clang compiler warnings from overridden virtual functions
+* [ARROW-445](https://issues.apache.org/jira/browse/ARROW-445) - C++: arrow\_ipc is built before arrow/ipc/Message\_generated.h was generated
+* [ARROW-447](https://issues.apache.org/jira/browse/ARROW-447) - Python: Align scalar/pylist string encoding with that of pandas.
+* [ARROW-455](https://issues.apache.org/jira/browse/ARROW-455) - [C++] BufferOutputStream dtor does not call Close()
+* [ARROW-469](https://issues.apache.org/jira/browse/ARROW-469) - C++: Add option so that resize doesn't decrease the capacity
+* [ARROW-481](https://issues.apache.org/jira/browse/ARROW-481) - [Python] Fix Python 2.7 regression in patch for PARQUET-472
+* [ARROW-486](https://issues.apache.org/jira/browse/ARROW-486) - [C++] arrow::io::MemoryMappedFile can't be casted to arrow::io::FileInterface
+* [ARROW-487](https://issues.apache.org/jira/browse/ARROW-487) - Python: ConvertTableToPandas segfaults if ObjectBlock::Write fails
+* [ARROW-494](https://issues.apache.org/jira/browse/ARROW-494) - [C++] When MemoryMappedFile is destructed, memory is unmapped even if buffer references still exist
+* [ARROW-499](https://issues.apache.org/jira/browse/ARROW-499) - Update file serialization to use streaming serialization format
+* [ARROW-505](https://issues.apache.org/jira/browse/ARROW-505) - [C++] Fix compiler warnings in release mode
+* [ARROW-511](https://issues.apache.org/jira/browse/ARROW-511) - [Python] List[T] conversions not implemented for single arrays
+* [ARROW-513](https://issues.apache.org/jira/browse/ARROW-513) - [C++] Fix Appveyor build
+* [ARROW-516](https://issues.apache.org/jira/browse/ARROW-516) - Building pyarrow with parquet
+* [ARROW-519](https://issues.apache.org/jira/browse/ARROW-519) - [C++] Missing vtable in libarrow.dylib on Xcode 6.4
+* [ARROW-523](https://issues.apache.org/jira/browse/ARROW-523) - Python: Account for changes in PARQUET-834
+* [ARROW-533](https://issues.apache.org/jira/browse/ARROW-533) - [C++] arrow::TimestampArray / TimeArray has a broken constructor
+* [ARROW-535](https://issues.apache.org/jira/browse/ARROW-535) - [Python] Add type mapping for NPY\_LONGLONG
+* [ARROW-537](https://issues.apache.org/jira/browse/ARROW-537) - [C++] StringArray/BinaryArray comparisons may be incorrect when values with non-zero length are null
+* [ARROW-540](https://issues.apache.org/jira/browse/ARROW-540) - [C++] Fix build in aftermath of ARROW-33
+* [ARROW-543](https://issues.apache.org/jira/browse/ARROW-543) - C++: Lazily computed null\_counts counts the number of non-null entries
+* [ARROW-544](https://issues.apache.org/jira/browse/ARROW-544) - [C++] ArrayLoader::LoadBinary fails for length-0 arrays
+* [ARROW-545](https://issues.apache.org/jira/browse/ARROW-545) - [Python] Ignore files without .parq or .parquet prefix when reading directory of files
+* [ARROW-548](https://issues.apache.org/jira/browse/ARROW-548) - [Python] Add nthreads option to pyarrow.Filesystem.read\_parquet
+* [ARROW-551](https://issues.apache.org/jira/browse/ARROW-551) - C++: Construction of Column with nullptr Array segfaults
+* [ARROW-556](https://issues.apache.org/jira/browse/ARROW-556) - [Integration] Cannot run Integration tests if using a different cpp build path
+* [ARROW-561](https://issues.apache.org/jira/browse/ARROW-561) - Update java & python dependencies to improve downstream packaging experience
+* [ARROW-562](https://issues.apache.org/jira/browse/ARROW-562) - Mockito should be in test scope
+
+
+## New Features and Improvements
+
+* [ARROW-33](https://issues.apache.org/jira/browse/ARROW-33) - C++: Implement zero-copy array slicing
+* [ARROW-81](https://issues.apache.org/jira/browse/ARROW-81) - [Format] Add a Category logical type (distinct from dictionary-encoding)
+* [ARROW-96](https://issues.apache.org/jira/browse/ARROW-96) - C++: API documentation using Doxygen
+* [ARROW-97](https://issues.apache.org/jira/browse/ARROW-97) - Python: API documentation via sphinx-apidoc
+* [ARROW-108](https://issues.apache.org/jira/browse/ARROW-108) - [C++] Add IPC round trip for union types
+* [ARROW-189](https://issues.apache.org/jira/browse/ARROW-189) - C++: Use ExternalProject to build thirdparty dependencies
+* [ARROW-191](https://issues.apache.org/jira/browse/ARROW-191) - Python: Provide infrastructure for manylinux1 wheels
+* [ARROW-221](https://issues.apache.org/jira/browse/ARROW-221) - Add switch for writing Parquet 1.0 compatible logical types
+* [ARROW-227](https://issues.apache.org/jira/browse/ARROW-227) - [C++/Python] Hook arrow\_io generic reader / writer interface into arrow\_parquet
+* [ARROW-228](https://issues.apache.org/jira/browse/ARROW-228) - [Python] Create an Arrow-cpp-compatible interface for reading bytes from Python file-like objects
+* [ARROW-240](https://issues.apache.org/jira/browse/ARROW-240) - Installation instructions for pyarrow
+* [ARROW-243](https://issues.apache.org/jira/browse/ARROW-243) - [C++] Add "driver" option to HdfsClient to choose between libhdfs and libhdfs3 at runtime
+* [ARROW-268](https://issues.apache.org/jira/browse/ARROW-268) - [C++] Flesh out union implementation to have all required methods for IPC
+* [ARROW-303](https://issues.apache.org/jira/browse/ARROW-303) - [C++] Also build static libraries for leaf libraries
+* [ARROW-312](https://issues.apache.org/jira/browse/ARROW-312) - [Python] Provide Python API to read/write the Arrow IPC file format
+* [ARROW-317](https://issues.apache.org/jira/browse/ARROW-317) - [C++] Implement zero-copy Slice method on arrow::Buffer that retains reference to parent
+* [ARROW-327](https://issues.apache.org/jira/browse/ARROW-327) - [Python] Remove conda builds from Travis CI processes
+* [ARROW-328](https://issues.apache.org/jira/browse/ARROW-328) - [C++] Return shared\_ptr by value instead of const-ref?
+* [ARROW-330](https://issues.apache.org/jira/browse/ARROW-330) - [C++] CMake functions to simplify shared / static library configuration
+* [ARROW-332](https://issues.apache.org/jira/browse/ARROW-332) - [Python] Add helper function to convert RecordBatch to pandas.DataFrame
+* [ARROW-333](https://issues.apache.org/jira/browse/ARROW-333) - Make writers update their internal schema even when no data is written.
+* [ARROW-335](https://issues.apache.org/jira/browse/ARROW-335) - Improve Type APIs and toString() by encapsulating flatbuffers better
+* [ARROW-336](https://issues.apache.org/jira/browse/ARROW-336) - Run Apache Rat in Travis builds
+* [ARROW-338](https://issues.apache.org/jira/browse/ARROW-338) - [C++] Refactor IPC vector "loading" and "unloading" to be based on cleaner visitor pattern
+* [ARROW-344](https://issues.apache.org/jira/browse/ARROW-344) - Instructions for building with conda
+* [ARROW-350](https://issues.apache.org/jira/browse/ARROW-350) - Add Kerberos support to HDFS shim
+* [ARROW-353](https://issues.apache.org/jira/browse/ARROW-353) - Arrow release 0.2
+* [ARROW-355](https://issues.apache.org/jira/browse/ARROW-355) - Add tests for serialising arrays of empty strings to Parquet
+* [ARROW-356](https://issues.apache.org/jira/browse/ARROW-356) - Add documentation about reading Parquet
+* [ARROW-359](https://issues.apache.org/jira/browse/ARROW-359) - Need to document ARROW\_LIBHDFS\_DIR
+* [ARROW-360](https://issues.apache.org/jira/browse/ARROW-360) - C++: Add method to shrink PoolBuffer using realloc
+* [ARROW-361](https://issues.apache.org/jira/browse/ARROW-361) - Python: Support reading a column-selection from Parquet files
+* [ARROW-363](https://issues.apache.org/jira/browse/ARROW-363) - Set up Java/C++ integration test harness
+* [ARROW-365](https://issues.apache.org/jira/browse/ARROW-365) - Python: Provide Array.to\_pandas()
+* [ARROW-366](https://issues.apache.org/jira/browse/ARROW-366) - [java] implement Dictionary vector
+* [ARROW-367](https://issues.apache.org/jira/browse/ARROW-367) - [java] CSV/JSON <=\> Arrow file format converter for Integration tests
+* [ARROW-368](https://issues.apache.org/jira/browse/ARROW-368) - Document use of LD\_LIBRARY\_PATH when using Python
+* [ARROW-369](https://issues.apache.org/jira/browse/ARROW-369) - [Python] Add ability to convert multiple record batches at once to pandas
+* [ARROW-370](https://issues.apache.org/jira/browse/ARROW-370) - Python: Pandas conversion from \`datetime.date\` columns
+* [ARROW-372](https://issues.apache.org/jira/browse/ARROW-372) - Create JSON arrow file format for integration tests
+* [ARROW-373](https://issues.apache.org/jira/browse/ARROW-373) - [C++] Implement C++ version of JSON file format for testing
+* [ARROW-374](https://issues.apache.org/jira/browse/ARROW-374) - Python: clarify unicode vs. binary in API
+* [ARROW-377](https://issues.apache.org/jira/browse/ARROW-377) - Python: Add support for conversion of Pandas.Categorical
+* [ARROW-379](https://issues.apache.org/jira/browse/ARROW-379) - Python: Use setuptools\_scm/setuptools\_scm\_git\_archive to provide the version number
+* [ARROW-380](https://issues.apache.org/jira/browse/ARROW-380) - [Java] optimize null count when serializing vectors.
+* [ARROW-381](https://issues.apache.org/jira/browse/ARROW-381) - [C++] Simplify primitive array type builders to use a default type singleton
+* [ARROW-382](https://issues.apache.org/jira/browse/ARROW-382) - Python: Extend API documentation
+* [ARROW-383](https://issues.apache.org/jira/browse/ARROW-383) - [C++] Implement C++ version of ARROW-367 integration test validator
+* [ARROW-389](https://issues.apache.org/jira/browse/ARROW-389) - Python: Write Parquet files to pyarrow.io.NativeFile objects
+* [ARROW-394](https://issues.apache.org/jira/browse/ARROW-394) - Add integration tests for boolean, list, struct, and other basic types
+* [ARROW-396](https://issues.apache.org/jira/browse/ARROW-396) - Python: Add pyarrow.schema.Schema.equals
+* [ARROW-409](https://issues.apache.org/jira/browse/ARROW-409) - Python: Change pyarrow.Table.dataframe\_from\_batches API to create Table instead
+* [ARROW-410](https://issues.apache.org/jira/browse/ARROW-410) - [C++] Add Flush method to arrow::io::OutputStream
+* [ARROW-411](https://issues.apache.org/jira/browse/ARROW-411) - [Java] Move Integration.compare and Integration.compareSchemas to a public utils class
+* [ARROW-415](https://issues.apache.org/jira/browse/ARROW-415) - C++: Add Equals implementation to compare Tables
+* [ARROW-416](https://issues.apache.org/jira/browse/ARROW-416) - C++: Add Equals implementation to compare Columns
+* [ARROW-417](https://issues.apache.org/jira/browse/ARROW-417) - C++: Add Equals implementation to compare ChunkedArrays
+* [ARROW-418](https://issues.apache.org/jira/browse/ARROW-418) - [C++] Consolidate array container and builder code, remove arrow/types
+* [ARROW-419](https://issues.apache.org/jira/browse/ARROW-419) - [C++] Promote util/{status.h, buffer.h, memory-pool.h} to top level of arrow/ source directory
+* [ARROW-423](https://issues.apache.org/jira/browse/ARROW-423) - C++: Define BUILD\_BYPRODUCTS in external project to support non-make CMake generators
+* [ARROW-425](https://issues.apache.org/jira/browse/ARROW-425) - Python: Expose a C function to convert arrow::Table to pyarrow.Table
+* [ARROW-426](https://issues.apache.org/jira/browse/ARROW-426) - Python: Conversion from pyarrow.Array to a Python list
+* [ARROW-427](https://issues.apache.org/jira/browse/ARROW-427) - [C++] Implement dictionary-encoded array container
+* [ARROW-428](https://issues.apache.org/jira/browse/ARROW-428) - [Python] Deserialize from Arrow record batches to pandas in parallel using a thread pool
+* [ARROW-430](https://issues.apache.org/jira/browse/ARROW-430) - Python: Better version handling
+* [ARROW-432](https://issues.apache.org/jira/browse/ARROW-432) - [Python] Avoid unnecessary memory copy in to\_pandas conversion by using low-level pandas internals APIs
+* [ARROW-438](https://issues.apache.org/jira/browse/ARROW-438) - [Python] Concatenate Table instances with equal schemas
+* [ARROW-440](https://issues.apache.org/jira/browse/ARROW-440) - [C++] Support pkg-config
+* [ARROW-441](https://issues.apache.org/jira/browse/ARROW-441) - [Python] Expose Arrow's file and memory map classes as NativeFile subclasses
+* [ARROW-442](https://issues.apache.org/jira/browse/ARROW-442) - [Python] Add public Python API to inspect Parquet file metadata
+* [ARROW-444](https://issues.apache.org/jira/browse/ARROW-444) - [Python] Avoid unnecessary memory copies from use of PyBytes\_\* C APIs
+* [ARROW-449](https://issues.apache.org/jira/browse/ARROW-449) - Python: Conversion from pyarrow.{Table,RecordBatch} to a Python dict
+* [ARROW-450](https://issues.apache.org/jira/browse/ARROW-450) - Python: Fixes for PARQUET-818
+* [ARROW-456](https://issues.apache.org/jira/browse/ARROW-456) - C++: Add jemalloc based MemoryPool
+* [ARROW-457](https://issues.apache.org/jira/browse/ARROW-457) - Python: Better control over memory pool
+* [ARROW-458](https://issues.apache.org/jira/browse/ARROW-458) - [Python] Expose jemalloc MemoryPool
+* [ARROW-461](https://issues.apache.org/jira/browse/ARROW-461) - [Python] Implement conversion between arrow::DictionaryArray and pandas.Categorical
+* [ARROW-463](https://issues.apache.org/jira/browse/ARROW-463) - C++: Support jemalloc 4.x
+* [ARROW-466](https://issues.apache.org/jira/browse/ARROW-466) - C++: ExternalProject for jemalloc
+* [ARROW-467](https://issues.apache.org/jira/browse/ARROW-467) - [Python] Run parquet-cpp unit tests in Travis CI
+* [ARROW-468](https://issues.apache.org/jira/browse/ARROW-468) - Python: Conversion of nested data in pd.DataFrames to/from Arrow structures
+* [ARROW-470](https://issues.apache.org/jira/browse/ARROW-470) - [Python] Add "FileSystem" abstraction to access directories of files in a uniform way
+* [ARROW-471](https://issues.apache.org/jira/browse/ARROW-471) - [Python] Enable ParquetFile to pass down separately-obtained file metadata
+* [ARROW-472](https://issues.apache.org/jira/browse/ARROW-472) - [Python] Expose parquet::{SchemaDescriptor, ColumnDescriptor}::Equals
+* [ARROW-474](https://issues.apache.org/jira/browse/ARROW-474) - Create an Arrow streaming file format
+* [ARROW-475](https://issues.apache.org/jira/browse/ARROW-475) - [Python] High level support for reading directories of Parquet files (as a single Arrow table) from supported file system interfaces
+* [ARROW-476](https://issues.apache.org/jira/browse/ARROW-476) - [Integration] Add integration tests for Binary / Varbytes type
+* [ARROW-477](https://issues.apache.org/jira/browse/ARROW-477) - [Java] Add support for second/microsecond/nanosecond timestamps in-memory and in IPC/JSON layer
+* [ARROW-478](https://issues.apache.org/jira/browse/ARROW-478) - [Python] Accept a PyBytes object in the pyarrow.io.BufferReader ctor
+* [ARROW-479](https://issues.apache.org/jira/browse/ARROW-479) - Python: Test for expected schema in Pandas conversion
+* [ARROW-484](https://issues.apache.org/jira/browse/ARROW-484) - Add more detail to the README about what technology can be found in the Arrow implementations
+* [ARROW-485](https://issues.apache.org/jira/browse/ARROW-485) - [Java] Users are required to initialize VariableLengthVectors.offsetVector before calling VariableLengthVectors.mutator.getSafe
+* [ARROW-490](https://issues.apache.org/jira/browse/ARROW-490) - Python: Update manylinux1 build scripts
+* [ARROW-495](https://issues.apache.org/jira/browse/ARROW-495) - [C++] Add C++ implementation of streaming serialized format
+* [ARROW-497](https://issues.apache.org/jira/browse/ARROW-497) - [Java] Integration test harness for streaming format
+* [ARROW-498](https://issues.apache.org/jira/browse/ARROW-498) - [C++] Integration test harness for streaming format
+* [ARROW-503](https://issues.apache.org/jira/browse/ARROW-503) - [Python] Interface to streaming binary format
+* [ARROW-506](https://issues.apache.org/jira/browse/ARROW-506) - Implement Arrow Echo server for integration testing
+* [ARROW-508](https://issues.apache.org/jira/browse/ARROW-508) - [C++] Make file/memory-mapped file interfaces threadsafe
+* [ARROW-509](https://issues.apache.org/jira/browse/ARROW-509) - [Python] Add support for PARQUET-835 (parallel column reads)
+* [ARROW-512](https://issues.apache.org/jira/browse/ARROW-512) - C++: Add method to check for primitive types
+* [ARROW-514](https://issues.apache.org/jira/browse/ARROW-514) - [Python] Accept pyarrow.io.Buffer as input to StreamReader, FileReader classes
+* [ARROW-515](https://issues.apache.org/jira/browse/ARROW-515) - [Python] Add StreamReader/FileReader methods that read all record batches as a Table
+* [ARROW-521](https://issues.apache.org/jira/browse/ARROW-521) - [C++/Python] Track peak memory use in default MemoryPool
+* [ARROW-524](https://issues.apache.org/jira/browse/ARROW-524) - [java] provide APIs to access nested vectors and buffers
+* [ARROW-525](https://issues.apache.org/jira/browse/ARROW-525) - Python: Add more documentation to the package
+* [ARROW-527](https://issues.apache.org/jira/browse/ARROW-527) - clean drill-module.conf file
+* [ARROW-529](https://issues.apache.org/jira/browse/ARROW-529) - Python: Add jemalloc and Python 3.6 to manylinux1 build
+* [ARROW-531](https://issues.apache.org/jira/browse/ARROW-531) - Python: Document jemalloc, extend Pandas section, add Getting Involved
+* [ARROW-538](https://issues.apache.org/jira/browse/ARROW-538) - [C++] Set up AddressSanitizer (ASAN) builds
+* [ARROW-546](https://issues.apache.org/jira/browse/ARROW-546) - Python: Account for changes in PARQUET-867
+* [ARROW-547](https://issues.apache.org/jira/browse/ARROW-547) - [Python] Expose Array::Slice and RecordBatch::Slice
+* [ARROW-553](https://issues.apache.org/jira/browse/ARROW-553) - C++: Faster valid bitmap building
+* [ARROW-558](https://issues.apache.org/jira/browse/ARROW-558) - Add KEYS files
+
+
+
+# Apache Arrow 0.1.0 (2016-10-10)
+
+## New Features and Improvements
+
+* [ARROW-1](https://issues.apache.org/jira/browse/ARROW-1) - Import Initial Codebase
+* [ARROW-2](https://issues.apache.org/jira/browse/ARROW-2) - Post Simple Website
+* [ARROW-3](https://issues.apache.org/jira/browse/ARROW-3) - Post Initial Arrow Format Spec
+* [ARROW-4](https://issues.apache.org/jira/browse/ARROW-4) - Initial Arrow CPP Implementation
+* [ARROW-7](https://issues.apache.org/jira/browse/ARROW-7) - Add Python library build toolchain
+* [ARROW-8](https://issues.apache.org/jira/browse/ARROW-8) - Set up Travis CI
+* [ARROW-9](https://issues.apache.org/jira/browse/ARROW-9) - Rename some unchanged "Drill" to "Arrow"
+* [ARROW-10](https://issues.apache.org/jira/browse/ARROW-10) - Fix mismatch of javadoc names and method parameters
+* [ARROW-11](https://issues.apache.org/jira/browse/ARROW-11) - Mirror JIRA activity to [email protected]
+* [ARROW-13](https://issues.apache.org/jira/browse/ARROW-13) - Add PR merge tool similar to that used in Parquet
+* [ARROW-14](https://issues.apache.org/jira/browse/ARROW-14) - Add JIRA components
+* [ARROW-15](https://issues.apache.org/jira/browse/ARROW-15) - Fix a naming typo for memory.AllocationManager.AllocationOutcome
+* [ARROW-19](https://issues.apache.org/jira/browse/ARROW-19) - C++: Externalize memory allocations and add a MemoryPool abstract interface to builder classes
+* [ARROW-20](https://issues.apache.org/jira/browse/ARROW-20) - C++: Add null count member to Array containers, remove nullable member
+* [ARROW-21](https://issues.apache.org/jira/browse/ARROW-21) - C++: Add in-memory schema metadata container
+* [ARROW-22](https://issues.apache.org/jira/browse/ARROW-22) - C++: Add schema adapter routines for converting flat Parquet schemas to in-memory Arrow schemas
+* [ARROW-23](https://issues.apache.org/jira/browse/ARROW-23) - C++: Add logical "Column" container for chunked data
+* [ARROW-24](https://issues.apache.org/jira/browse/ARROW-24) - C++: Add logical "Table" container
+* [ARROW-26](https://issues.apache.org/jira/browse/ARROW-26) - C++: Add developer instructions for building parquet-cpp integration
+* [ARROW-28](https://issues.apache.org/jira/browse/ARROW-28) - C++: Add google/benchmark to the 3rd-party build toolchain
+* [ARROW-30](https://issues.apache.org/jira/browse/ARROW-30) - Python: pandas/NumPy to/from Arrow conversion routines
+* [ARROW-31](https://issues.apache.org/jira/browse/ARROW-31) - Python: basic PyList <-\> Arrow marshaling code
+* [ARROW-35](https://issues.apache.org/jira/browse/ARROW-35) - Add a short call-to-action / how-to-get-involved to the main README.md
+* [ARROW-37](https://issues.apache.org/jira/browse/ARROW-37) - C++: Represent boolean array data in bit-packed form
+* [ARROW-42](https://issues.apache.org/jira/browse/ARROW-42) - Python: Add to Travis CI build
+* [ARROW-43](https://issues.apache.org/jira/browse/ARROW-43) - Python: Add rudimentary console \_\_repr\_\_ for array types
+* [ARROW-44](https://issues.apache.org/jira/browse/ARROW-44) - Python: Implement basic object model for scalar values (i.e. results of arrow\_arr[i])
+* [ARROW-48](https://issues.apache.org/jira/browse/ARROW-48) - Python: Add Schema object wrapper
+* [ARROW-49](https://issues.apache.org/jira/browse/ARROW-49) - Python: Add Column and Table wrapper interface
+* [ARROW-50](https://issues.apache.org/jira/browse/ARROW-50) - C++: Enable library builds for 3rd-party users without having to build thirdparty googletest
+* [ARROW-53](https://issues.apache.org/jira/browse/ARROW-53) - Python: Fix RPATH and add source installation instructions
+* [ARROW-54](https://issues.apache.org/jira/browse/ARROW-54) - Python: rename package to "pyarrow"
+* [ARROW-56](https://issues.apache.org/jira/browse/ARROW-56) - Format: Specify LSB bit ordering in bit arrays
+* [ARROW-57](https://issues.apache.org/jira/browse/ARROW-57) - Format: Draft data headers IDL for data interchange
+* [ARROW-58](https://issues.apache.org/jira/browse/ARROW-58) - Format: Draft type metadata ("schemas") IDL
+* [ARROW-59](https://issues.apache.org/jira/browse/ARROW-59) - Python: Boolean data support for builtin data structures
+* [ARROW-60](https://issues.apache.org/jira/browse/ARROW-60) - C++: Struct type builder API
+* [ARROW-64](https://issues.apache.org/jira/browse/ARROW-64) - Add zsh support to C++ build scripts
+* [ARROW-66](https://issues.apache.org/jira/browse/ARROW-66) - Maybe some missing steps in installation guide
+* [ARROW-67](https://issues.apache.org/jira/browse/ARROW-67) - C++: Draft type metadata conversion to/from IPC representation
+* [ARROW-68](https://issues.apache.org/jira/browse/ARROW-68) - Update setup\_build\_env and third-party script to be more user-friendly
+* [ARROW-70](https://issues.apache.org/jira/browse/ARROW-70) - C++: Add "lite" DCHECK macros used in parquet-cpp
+* [ARROW-71](https://issues.apache.org/jira/browse/ARROW-71) - C++: Add script to run clang-tidy on codebase
+* [ARROW-73](https://issues.apache.org/jira/browse/ARROW-73) - Support CMake 2.8
+* [ARROW-76](https://issues.apache.org/jira/browse/ARROW-76) - Revise format document to include null count, defer non-nullable arrays to the domain of metadata
+* [ARROW-78](https://issues.apache.org/jira/browse/ARROW-78) - C++: Add constructor for DecimalType
+* [ARROW-79](https://issues.apache.org/jira/browse/ARROW-79) - Python: Add benchmarks
+* [ARROW-82](https://issues.apache.org/jira/browse/ARROW-82) - C++: Implement IPC exchange for List types
+* [ARROW-85](https://issues.apache.org/jira/browse/ARROW-85) - C++: memcmp can be avoided in Equal when comparing with the same Buffer
+* [ARROW-86](https://issues.apache.org/jira/browse/ARROW-86) - Python: Implement zero-copy Arrow-to-Pandas conversion
+* [ARROW-87](https://issues.apache.org/jira/browse/ARROW-87) - Implement Decimal schema conversion for all ways supported in Parquet
+* [ARROW-89](https://issues.apache.org/jira/browse/ARROW-89) - Python: Add benchmarks for Arrow<-\>Pandas conversion
+* [ARROW-90](https://issues.apache.org/jira/browse/ARROW-90) - Apache Arrow cpp code does not support the POWER architecture
+* [ARROW-91](https://issues.apache.org/jira/browse/ARROW-91) - C++: First draft of an adapter class for parquet-cpp's ParquetFileReader that produces Arrow table/row batch objects
+* [ARROW-92](https://issues.apache.org/jira/browse/ARROW-92) - C++: Arrow to Parquet Schema conversion
+* [ARROW-100](https://issues.apache.org/jira/browse/ARROW-100) - [C++] Computing RowBatch size
+* [ARROW-101](https://issues.apache.org/jira/browse/ARROW-101) - Fix java warnings emitted by java compiler
+* [ARROW-102](https://issues.apache.org/jira/browse/ARROW-102) - travis-ci support for java project
+* [ARROW-106](https://issues.apache.org/jira/browse/ARROW-106) - Add IPC round trip for string types (string, char, varchar, binary)
+* [ARROW-107](https://issues.apache.org/jira/browse/ARROW-107) - [C++] add ipc round trip for struct types
+* [ARROW-190](https://issues.apache.org/jira/browse/ARROW-190) - Python: Provide installable sdist builds
+* [ARROW-196](https://issues.apache.org/jira/browse/ARROW-196) - [C++] Add conda dev recipe for libarrow and libarrow\_parquet
+* [ARROW-197](https://issues.apache.org/jira/browse/ARROW-197) - [Python] Add conda dev recipe for pyarrow
+* [ARROW-199](https://issues.apache.org/jira/browse/ARROW-199) - [C++] Refine third party dependency
+* [ARROW-201](https://issues.apache.org/jira/browse/ARROW-201) - C++: Initial ParquetWriter implementation
+* [ARROW-203](https://issues.apache.org/jira/browse/ARROW-203) - Python: Basic filename based Parquet read/write
+* [ARROW-204](https://issues.apache.org/jira/browse/ARROW-204) - [Python] Automate uploading conda build artifacts for libarrow and pyarrow
+* [ARROW-206](https://issues.apache.org/jira/browse/ARROW-206) - [C++] Expose an equality API for arrays that compares a range of slots on two arrays
+* [ARROW-207](https://issues.apache.org/jira/browse/ARROW-207) - Extend BufferAllocator interface to allow decorators around BufferAllocator
+* [ARROW-212](https://issues.apache.org/jira/browse/ARROW-212) - [C++] Clarify the fact that PrimitiveArray is now an abstract class
+* [ARROW-213](https://issues.apache.org/jira/browse/ARROW-213) - Exposing static arrow build
+* [ARROW-214](https://issues.apache.org/jira/browse/ARROW-214) - C++: Add String support to Parquet I/O
+* [ARROW-215](https://issues.apache.org/jira/browse/ARROW-215) - C++: Support other integer types in Parquet I/O
+* [ARROW-218](https://issues.apache.org/jira/browse/ARROW-218) - Add option to use GitHub API token via environment variable when merging PRs
+* [ARROW-222](https://issues.apache.org/jira/browse/ARROW-222) - [C++] Create prototype file-like interface to HDFS (via libhdfs) and begin defining more general IO interface for Arrow data adapters
+* [ARROW-233](https://issues.apache.org/jira/browse/ARROW-233) - [C++] Add visibility defines for limiting shared library symbol visibility
+* [ARROW-234](https://issues.apache.org/jira/browse/ARROW-234) - [C++] Build with libhdfs support in arrow\_io in conda builds
+* [ARROW-236](https://issues.apache.org/jira/browse/ARROW-236) - [Python] Enable Parquet read/write to work with HDFS file objects
+* [ARROW-237](https://issues.apache.org/jira/browse/ARROW-237) - [C++] Create Arrow specializations of Parquet allocator and read interfaces
+* [ARROW-238](https://issues.apache.org/jira/browse/ARROW-238) - C++: InternalMemoryPool::Free() should throw an error when there is insufficient allocated memory
+* [ARROW-242](https://issues.apache.org/jira/browse/ARROW-242) - C++/Python: Support Timestamp Data Type
+* [ARROW-245](https://issues.apache.org/jira/browse/ARROW-245) - [Format] Clarify Arrow's relationship with big endian platforms
+* [ARROW-251](https://issues.apache.org/jira/browse/ARROW-251) - [C++] Expose APIs for getting code and message of the status
+* [ARROW-252](https://issues.apache.org/jira/browse/ARROW-252) - Add implementation guidelines to the documentation
+* [ARROW-253](https://issues.apache.org/jira/browse/ARROW-253) - Int types should only have width of 8\*2^n (8, 16, 32, 64)
+* [ARROW-254](https://issues.apache.org/jira/browse/ARROW-254) - Remove Bit type as it is redundant with boolean
+* [ARROW-255](https://issues.apache.org/jira/browse/ARROW-255) - Finalize Dictionary representation
+* [ARROW-256](https://issues.apache.org/jira/browse/ARROW-256) - Add versioning to the arrow spec.
+* [ARROW-257](https://issues.apache.org/jira/browse/ARROW-257) - Add a typeids Vector to Union type
+* [ARROW-262](https://issues.apache.org/jira/browse/ARROW-262) - [Format] Add a new format document for metadata and logical types for messaging and IPC / on-wire/file representations
+* [ARROW-264](https://issues.apache.org/jira/browse/ARROW-264) - Create an Arrow File format
+* [ARROW-267](https://issues.apache.org/jira/browse/ARROW-267) - [C++] C++ implementation of file-like layout for RPC / IPC
+* [ARROW-270](https://issues.apache.org/jira/browse/ARROW-270) - [Format] Define more generic Interval logical type
+* [ARROW-271](https://issues.apache.org/jira/browse/ARROW-271) - Update Field structure to be more explicit
+* [ARROW-272](https://issues.apache.org/jira/browse/ARROW-272) - Arrow release 0.1
+* [ARROW-279](https://issues.apache.org/jira/browse/ARROW-279) - rename vector module to arrow-vector for consistency
+* [ARROW-280](https://issues.apache.org/jira/browse/ARROW-280) - [C++] Consolidate file and shared memory IO interfaces
+* [ARROW-282](https://issues.apache.org/jira/browse/ARROW-282) - Make parquet-cpp an optional dependency of pyarrow
+* [ARROW-285](https://issues.apache.org/jira/browse/ARROW-285) - Allow for custom flatc compiler
+* [ARROW-286](https://issues.apache.org/jira/browse/ARROW-286) - Build thirdparty dependencies in parallel
+* [ARROW-289](https://issues.apache.org/jira/browse/ARROW-289) - Install test-util.h
+* [ARROW-290](https://issues.apache.org/jira/browse/ARROW-290) - Specialize alloc() in ArrowBuf
+* [ARROW-291](https://issues.apache.org/jira/browse/ARROW-291) - [Python] Update NOTICE file for Python codebase
+* [ARROW-292](https://issues.apache.org/jira/browse/ARROW-292) - [Java] Upgrade Netty to 4.041
+* [ARROW-293](https://issues.apache.org/jira/browse/ARROW-293) - [C++] Implementations of IO interfaces for operating system files
+* [ARROW-296](https://issues.apache.org/jira/browse/ARROW-296) - [C++] Remove arrow\_parquet C++ module and related parts of build system
+* [ARROW-298](https://issues.apache.org/jira/browse/ARROW-298) - create release scripts
+* [ARROW-299](https://issues.apache.org/jira/browse/ARROW-299) - Use absolute namespace in macros
+* [ARROW-301](https://issues.apache.org/jira/browse/ARROW-301) - [Format] Add some form of user field metadata to IPC schemas
+* [ARROW-302](https://issues.apache.org/jira/browse/ARROW-302) - [Python] Add support to use the Arrow file format with file-like objects
+* [ARROW-305](https://issues.apache.org/jira/browse/ARROW-305) - Add compression and use\_dictionary options to Parquet interface
+* [ARROW-306](https://issues.apache.org/jira/browse/ARROW-306) - Add option to pass cmake arguments via environment variable
+* [ARROW-315](https://issues.apache.org/jira/browse/ARROW-315) - Finalize timestamp type
+* [ARROW-318](https://issues.apache.org/jira/browse/ARROW-318) - [Python] Revise README to reflect current state of project
+* [ARROW-319](https://issues.apache.org/jira/browse/ARROW-319) - Add canonical Arrow Schema json representation
+* [ARROW-324](https://issues.apache.org/jira/browse/ARROW-324) - Update arrow metadata diagram
+* [ARROW-325](https://issues.apache.org/jira/browse/ARROW-325) - make TestArrowFile not dependent on timezone
+
+
+## Bug Fixes
+
+* [ARROW-5](https://issues.apache.org/jira/browse/ARROW-5) - Error when run maven install
+* [ARROW-16](https://issues.apache.org/jira/browse/ARROW-16) - Building cpp issues on XCode 7.2.1
+* [ARROW-17](https://issues.apache.org/jira/browse/ARROW-17) - Set some vector fields to default access level for Drill compatibility
+* [ARROW-18](https://issues.apache.org/jira/browse/ARROW-18) - Fix bug with decimal precision and scale
+* [ARROW-36](https://issues.apache.org/jira/browse/ARROW-36) - Remove fixVersions from patch tool (until we have them)
+* [ARROW-46](https://issues.apache.org/jira/browse/ARROW-46) - Port DRILL-4410 to Arrow
+* [ARROW-51](https://issues.apache.org/jira/browse/ARROW-51) - Move ValueVector test from Drill project
+* [ARROW-55](https://issues.apache.org/jira/browse/ARROW-55) - Python: fix legacy Python (2.7) tests and add to Travis CI
+* [ARROW-62](https://issues.apache.org/jira/browse/ARROW-62) - Format: Are the null bits 0 or 1 for null values?
+* [ARROW-63](https://issues.apache.org/jira/browse/ARROW-63) - C++: ctest fails if Python 3 is the active Python interpreter
+* [ARROW-65](https://issues.apache.org/jira/browse/ARROW-65) - Python: FindPythonLibsNew does not work in a virtualenv
+* [ARROW-69](https://issues.apache.org/jira/browse/ARROW-69) - Change permissions for assignable users
+* [ARROW-72](https://issues.apache.org/jira/browse/ARROW-72) - FindParquet searches for non-existent header
+* [ARROW-75](https://issues.apache.org/jira/browse/ARROW-75) - C++: Fix handling of empty strings
+* [ARROW-77](https://issues.apache.org/jira/browse/ARROW-77) - C++: conform null bit interpretation to match ARROW-62
+* [ARROW-80](https://issues.apache.org/jira/browse/ARROW-80) - Segmentation fault on len(Array) for empty arrays
+* [ARROW-83](https://issues.apache.org/jira/browse/ARROW-83) - Add basic test infrastructure for DecimalType
+* [ARROW-84](https://issues.apache.org/jira/browse/ARROW-84) - C++: separate test code
+* [ARROW-88](https://issues.apache.org/jira/browse/ARROW-88) - C++: Refactor given PARQUET-572
+* [ARROW-93](https://issues.apache.org/jira/browse/ARROW-93) - XCode 7.3 breaks builds
+* [ARROW-94](https://issues.apache.org/jira/browse/ARROW-94) - Expand list example to clarify null vs empty list
+* [ARROW-103](https://issues.apache.org/jira/browse/ARROW-103) - Missing patterns from .gitignore
+* [ARROW-104](https://issues.apache.org/jira/browse/ARROW-104) - Update Layout.md based on discussion on the mailing list
+* [ARROW-105](https://issues.apache.org/jira/browse/ARROW-105) - Unit tests fail if assertions are disabled
+* [ARROW-113](https://issues.apache.org/jira/browse/ARROW-113) - TestValueVector test fails if cannot allocate 2GB of memory
+* [ARROW-185](https://issues.apache.org/jira/browse/ARROW-185) - [C++] Make sure alignment and memory padding conform to spec
+* [ARROW-188](https://issues.apache.org/jira/browse/ARROW-188) - Python: Add numpy as install requirement
+* [ARROW-193](https://issues.apache.org/jira/browse/ARROW-193) - For the instruction, typos "int his" should be "in this"
+* [ARROW-194](https://issues.apache.org/jira/browse/ARROW-194) - C++: Allow read-only memory mapped source
+* [ARROW-200](https://issues.apache.org/jira/browse/ARROW-200) - [Python] Convert Values String looks like it has incorrect error handling
+* [ARROW-205](https://issues.apache.org/jira/browse/ARROW-205) - builds failing on master branch with apt-get error
+* [ARROW-209](https://issues.apache.org/jira/browse/ARROW-209) - [C++] Broken builds: llvm.org apt repos are unavailable
+* [ARROW-210](https://issues.apache.org/jira/browse/ARROW-210) - [C++] Tidy up the type system a little bit
+* [ARROW-211](https://issues.apache.org/jira/browse/ARROW-211) - Several typos/errors in Layout.md examples
+* [ARROW-217](https://issues.apache.org/jira/browse/ARROW-217) - Fix Travis w.r.t conda 4.1.0 changes
+* [ARROW-219](https://issues.apache.org/jira/browse/ARROW-219) - [C++] Passed CMAKE\_CXX\_FLAGS are being dropped, fix compiler warnings
+* [ARROW-223](https://issues.apache.org/jira/browse/ARROW-223) - Do not link against libpython
+* [ARROW-225](https://issues.apache.org/jira/browse/ARROW-225) - [C++/Python] master Travis CI build is broken
+* [ARROW-244](https://issues.apache.org/jira/browse/ARROW-244) - [C++] Some global APIs of IPC module should be visible to the outside
+* [ARROW-246](https://issues.apache.org/jira/browse/ARROW-246) - [Java] UnionVector doesn't call allocateNew() when creating its vectorType
+* [ARROW-247](https://issues.apache.org/jira/browse/ARROW-247) - [C++] Missing explicit destructor in RowBatchReader causes an incomplete type error
+* [ARROW-250](https://issues.apache.org/jira/browse/ARROW-250) - Fix for ARROW-246 may cause memory leaks
+* [ARROW-259](https://issues.apache.org/jira/browse/ARROW-259) - Use flatbuffer fields in java implementation
+* [ARROW-260](https://issues.apache.org/jira/browse/ARROW-260) - TestValueVector.testFixedVectorReallocation and testVariableVectorReallocation are flaky
+* [ARROW-265](https://issues.apache.org/jira/browse/ARROW-265) - Negative decimal values have wrong padding
+* [ARROW-266](https://issues.apache.org/jira/browse/ARROW-266) - [C++] Fix the broken build
+* [ARROW-274](https://issues.apache.org/jira/browse/ARROW-274) - Make the MapVector nullable
+* [ARROW-277](https://issues.apache.org/jira/browse/ARROW-277) - Flatbuf serialization fails for Timestamp type
+* [ARROW-278](https://issues.apache.org/jira/browse/ARROW-278) - [Format] Struct type name consistency in implementations and metadata
+* [ARROW-283](https://issues.apache.org/jira/browse/ARROW-283) - [C++] Update arrow\_parquet to account for API changes in PARQUET-573
+* [ARROW-284](https://issues.apache.org/jira/browse/ARROW-284) - [C++] Triage builds by disabling Arrow-Parquet module
+* [ARROW-287](https://issues.apache.org/jira/browse/ARROW-287) - [java] Make nullable vectors use a BitVector instead of UInt1Vector for bits
+* [ARROW-297](https://issues.apache.org/jira/browse/ARROW-297) - Fix Arrow pom for release
+* [ARROW-304](https://issues.apache.org/jira/browse/ARROW-304) - NullableMapReaderImpl.isSet() always returns true
+* [ARROW-308](https://issues.apache.org/jira/browse/ARROW-308) - UnionListWriter.setPosition() should not call startList()
+* [ARROW-309](https://issues.apache.org/jira/browse/ARROW-309) - Types.getMinorTypeForArrowType() does not work for Union type
+* [ARROW-313](https://issues.apache.org/jira/browse/ARROW-313) - XCode 8.0 breaks builds
+* [ARROW-314](https://issues.apache.org/jira/browse/ARROW-314) - JSONScalar is unnecessary and unused.
+* [ARROW-320](https://issues.apache.org/jira/browse/ARROW-320) - ComplexCopier.copy(FieldReader, FieldWriter) should not start a list if reader is not set
+* [ARROW-321](https://issues.apache.org/jira/browse/ARROW-321) - Fix Arrow licences
+* [ARROW-855](https://issues.apache.org/jira/browse/ARROW-855) - Arrow Memory Leak
+
+
diff --git a/contrib/libs/apache/arrow/CODE_OF_CONDUCT.md b/contrib/libs/apache/arrow/CODE_OF_CONDUCT.md
index 2efe740b77c..5138a55ae09 100644
--- a/contrib/libs/apache/arrow/CODE_OF_CONDUCT.md
+++ b/contrib/libs/apache/arrow/CODE_OF_CONDUCT.md
@@ -1,24 +1,24 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Code of Conduct
-
-* [Code of Conduct for The Apache Software Foundation][1]
-
-[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Code of Conduct
+
+* [Code of Conduct for The Apache Software Foundation][1]
+
+[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file
diff --git a/contrib/libs/apache/arrow/CONTRIBUTING.md b/contrib/libs/apache/arrow/CONTRIBUTING.md
index 3e636d9cd2f..e304528359b 100644
--- a/contrib/libs/apache/arrow/CONTRIBUTING.md
+++ b/contrib/libs/apache/arrow/CONTRIBUTING.md
@@ -1,77 +1,77 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# How to contribute to Apache Arrow
-
-## Did you find a bug?
-
-The Arrow project uses JIRA as a bug tracker. To report a bug, you'll have
-to first create an account on the
-[Apache Foundation JIRA](https://issues.apache.org/jira/). The JIRA server
-hosts bugs and issues for multiple Apache projects. The JIRA project name
-for Arrow is "ARROW".
-
-To be assigned to an issue, ask an Arrow JIRA admin to go to
-[Arrow Roles](https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/roles),
-click "Add users to a role," and add you to the "Contributor" role. Most
-committers are authorized to do this; if you're a committer and aren't
-able to load that project admin page, have someone else add you to the
-necessary role.
-
-Before you create a new bug entry, we recommend you first
-[search](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-5140?filter=allopenissues)
-among existing Arrow issues.
-
-When you create a new JIRA entry, please don't forget to fill the "Component"
-field. Arrow has many subcomponents and this helps triaging and filtering
-tremendously. Also, we conventionally prefix the issue title with the component
-name in brackets, such as "[C++] Crash in Array::Frobnicate()", so as to make
-lists easier to navigate, and we'd be grateful if you did the same.
-
-## Did you write a patch that fixes a bug or brings an improvement?
-
-First create a JIRA entry as described above. Then, submit your changes
-as a GitHub Pull Request. We'll ask you to prefix the pull request title
-with the JIRA issue number and the component name in brackets.
-(for example: "ARROW-2345: [C++] Fix crash in Array::Frobnicate()").
-Respecting this convention makes it easier for us to process the backlog
-of submitted Pull Requests.
-
-### Minor Fixes
-
-Any functionality change should have a JIRA opened. For minor changes that
-affect documentation, you do not need to open up a JIRA. Instead, you can
-prefix the title of your PR with "MINOR: " if it meets the following guidelines:
-
-* Grammar, usage and spelling fixes that affect no more than 2 files
-* Documentation updates affecting no more than 2 files and not more
- than 500 words.
-
-## Do you want to propose a significant new feature or an important refactoring?
-
-We ask that all discussions about major changes in the codebase happen
-publicly on the [arrow-dev mailing-list](https://mail-archives.apache.org/mod_mbox/arrow-dev/).
-
-## Do you have questions about the source code, the build procedure or the development process?
-
-You can also ask on the mailing-list; see above.
-
-## Further information
-
-Please read our [development documentation](https://arrow.apache.org/docs/developers/contributing.html).
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# How to contribute to Apache Arrow
+
+## Did you find a bug?
+
+The Arrow project uses JIRA as a bug tracker. To report a bug, you'll have
+to first create an account on the
+[Apache Foundation JIRA](https://issues.apache.org/jira/). The JIRA server
+hosts bugs and issues for multiple Apache projects. The JIRA project name
+for Arrow is "ARROW".
+
+To be assigned to an issue, ask an Arrow JIRA admin to go to
+[Arrow Roles](https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/roles),
+click "Add users to a role," and add you to the "Contributor" role. Most
+committers are authorized to do this; if you're a committer and aren't
+able to load that project admin page, have someone else add you to the
+necessary role.
+
+Before you create a new bug entry, we recommend you first
+[search](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-5140?filter=allopenissues)
+among existing Arrow issues.
+
+When you create a new JIRA entry, please don't forget to fill the "Component"
+field. Arrow has many subcomponents and this helps triaging and filtering
+tremendously. Also, we conventionally prefix the issue title with the component
+name in brackets, such as "[C++] Crash in Array::Frobnicate()", so as to make
+lists easier to navigate, and we'd be grateful if you did the same.
+
+## Did you write a patch that fixes a bug or brings an improvement?
+
+First create a JIRA entry as described above. Then, submit your changes
+as a GitHub Pull Request. We'll ask you to prefix the pull request title
+with the JIRA issue number and the component name in brackets.
+(for example: "ARROW-2345: [C++] Fix crash in Array::Frobnicate()").
+Respecting this convention makes it easier for us to process the backlog
+of submitted Pull Requests.
+
+### Minor Fixes
+
+Any functionality change should have a JIRA opened. For minor changes that
+affect documentation, you do not need to open up a JIRA. Instead, you can
+prefix the title of your PR with "MINOR: " if it meets the following guidelines:
+
+* Grammar, usage and spelling fixes that affect no more than 2 files
+* Documentation updates affecting no more than 2 files and not more
+ than 500 words.
+
+## Do you want to propose a significant new feature or an important refactoring?
+
+We ask that all discussions about major changes in the codebase happen
+publicly on the [arrow-dev mailing-list](https://mail-archives.apache.org/mod_mbox/arrow-dev/).
+
+## Do you have questions about the source code, the build procedure or the development process?
+
+You can also ask on the mailing-list; see above.
+
+## Further information
+
+Please read our [development documentation](https://arrow.apache.org/docs/developers/contributing.html).
diff --git a/contrib/libs/apache/arrow/LICENSE.txt b/contrib/libs/apache/arrow/LICENSE.txt
index 5d4de206545..619079f034d 100644
--- a/contrib/libs/apache/arrow/LICENSE.txt
+++ b/contrib/libs/apache/arrow/LICENSE.txt
@@ -1,2242 +1,2242 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
---------------------------------------------------------------------------------
-
-src/plasma/fling.cc and src/plasma/fling.h: Apache 2.0
-
-Copyright 2013 Sharvil Nanavati
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-src/plasma/thirdparty/ae: Modified / 3-Clause BSD
-
-Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of Redis nor the names of its contributors may be used
- to endorse or promote products derived from this software without
- specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-src/plasma/thirdparty/dlmalloc.c: CC0
-
-This is a version (aka dlmalloc) of malloc/free/realloc written by
-Doug Lea and released to the public domain, as explained at
-http://creativecommons.org/publicdomain/zero/1.0/ Send questions,
-comments, complaints, performance data, etc to [email protected]
-
---------------------------------------------------------------------------------
-
-src/plasma/common.cc (some portions)
-
-Copyright (c) Austin Appleby (aappleby (AT) gmail)
-
-Some portions of this file are derived from code in the MurmurHash project
-
-All code is released to the public domain. For business purposes, Murmurhash is
-under the MIT license.
-
-https://sites.google.com/site/murmurhash/
-
---------------------------------------------------------------------------------
-
-src/arrow/util (some portions): Apache 2.0, and 3-clause BSD
-
-Some portions of this module are derived from code in the Chromium project,
-copyright (c) Google inc and (c) The Chromium Authors and licensed under the
-Apache 2.0 License or under the 3-clause BSD license:
-
- Copyright (c) 2013 The Chromium Authors. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following disclaimer
- in the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Google Inc. nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-This project includes code from Daniel Lemire's FrameOfReference project.
-
-https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp
-
-Copyright: 2013 Daniel Lemire
-Home page: http://lemire.me/en/
-Project page: https://github.com/lemire/FrameOfReference
-License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0
-
---------------------------------------------------------------------------------
-
-This project includes code from the TensorFlow project
-
-Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-This project includes code from the NumPy project.
-
-https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910
-
-https://github.com/numpy/numpy/blob/68fd82271b9ea5a9e50d4e761061dfcca851382a/numpy/core/src/multiarray/datetime.c
-
-Copyright (c) 2005-2017, NumPy Developers.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- * Neither the name of the NumPy Developers nor the names of any
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-This project includes code from the Boost project
-
-Boost Software License - Version 1.0 - August 17th, 2003
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-This project includes code from the FlatBuffers project
-
-Copyright 2014 Google Inc.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-This project includes code from the tslib project
-
-Copyright 2015 Microsoft Corporation. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-This project includes code from the jemalloc project
-
-https://github.com/jemalloc/jemalloc
-
-Copyright (C) 2002-2017 Jason Evans <[email protected]>.
-All rights reserved.
-Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved.
-Copyright (C) 2009-2017 Facebook, Inc. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-1. Redistributions of source code must retain the above copyright notice(s),
- this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright notice(s),
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
-OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
-INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------------------
-
-This project includes code from the Go project, BSD 3-clause license + PATENTS
-weak patent termination clause
-(https://github.com/golang/go/blob/master/PATENTS).
-
-Copyright (c) 2009 The Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
- * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-This project includes code from the hs2client
-
-https://github.com/cloudera/hs2client
-
-Copyright 2016 Cloudera Inc.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-The script ci/scripts/util_wait_for_it.sh has the following license
-
-Copyright (c) 2016 Giles Hall
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
---------------------------------------------------------------------------------
-
-The script r/configure has the following license (MIT)
-
-Copyright (c) 2017, Jeroen Ooms and Jim Hester
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
---------------------------------------------------------------------------------
-
-cpp/src/arrow/util/logging.cc, cpp/src/arrow/util/logging.h and
-cpp/src/arrow/util/logging-test.cc are adapted from
-Ray Project (https://github.com/ray-project/ray) (Apache 2.0).
-
-Copyright (c) 2016 Ray Project (https://github.com/ray-project/ray)
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-The files cpp/src/arrow/vendored/datetime/date.h, cpp/src/arrow/vendored/datetime/tz.h,
-cpp/src/arrow/vendored/datetime/tz_private.h, cpp/src/arrow/vendored/datetime/ios.h,
-cpp/src/arrow/vendored/datetime/ios.mm,
-cpp/src/arrow/vendored/datetime/tz.cpp are adapted from
-Howard Hinnant's date library (https://github.com/HowardHinnant/date).
-It is licensed under the MIT license.
-
-The MIT License (MIT)
-Copyright (c) 2015, 2016, 2017 Howard Hinnant
-Copyright (c) 2016 Adrian Colomitchi
-Copyright (c) 2017 Florian Dang
-Copyright (c) 2017 Paul Thompson
-Copyright (c) 2018 Tomasz Kamiński
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
---------------------------------------------------------------------------------
-
-The file cpp/src/arrow/util/utf8.h includes code adapted from the page
- https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
-with the following license (MIT)
-
-Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
---------------------------------------------------------------------------------
-
-The file cpp/src/arrow/vendored/string_view.hpp has the following license
-
-Boost Software License - Version 1.0 - August 17th, 2003
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-The files in cpp/src/arrow/vendored/xxhash/ have the following license
-(BSD 2-Clause License)
-
-xxHash Library
-Copyright (c) 2012-2014, Yann Collet
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice, this
- list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-You can contact the author at :
-- xxHash homepage: http://www.xxhash.com
-- xxHash source repository : https://github.com/Cyan4973/xxHash
-
---------------------------------------------------------------------------------
-
-The files in cpp/src/arrow/vendored/double-conversion/ have the following license
-(BSD 3-Clause License)
-
-Copyright 2006-2011, the V8 project authors. All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
- * Neither the name of Google Inc. nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-The files in cpp/src/arrow/vendored/uriparser/ have the following license
-(BSD 3-Clause License)
-
-uriparser - RFC 3986 URI parsing library
-
-Copyright (C) 2007, Weijia Song <[email protected]>
-Copyright (C) 2007, Sebastian Pipping <[email protected]>
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
- * Redistributions of source code must retain the above
- copyright notice, this list of conditions and the following
- disclaimer.
-
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- * Neither the name of the <ORGANIZATION> nor the names of its
- contributors may be used to endorse or promote products
- derived from this software without specific prior written
- permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-The files under dev/tasks/conda-recipes have the following license
-
-BSD 3-clause license
-Copyright (c) 2015-2018, conda-forge
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors
- may be used to endorse or promote products derived from this software without
- specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
-TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
-THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-The files in cpp/src/arrow/vendored/utfcpp/ have the following license
-
-Copyright 2006-2018 Nemanja Trifunovic
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-This project includes code from Apache Kudu.
-
- * cpp/cmake_modules/CompilerInfo.cmake is based on Kudu's cmake_modules/CompilerInfo.cmake
-
-Copyright: 2016 The Apache Software Foundation.
-Home page: https://kudu.apache.org/
-License: http://www.apache.org/licenses/LICENSE-2.0
-
---------------------------------------------------------------------------------
-
-This project includes code from Apache Impala (incubating), formerly
-Impala. The Impala code and rights were donated to the ASF as part of the
-Incubator process after the initial code imports into Apache Parquet.
-
-Copyright: 2012 Cloudera, Inc.
-Copyright: 2016 The Apache Software Foundation.
-Home page: http://impala.apache.org/
-License: http://www.apache.org/licenses/LICENSE-2.0
-
---------------------------------------------------------------------------------
-
-This project includes code from Apache Aurora.
-
-* dev/release/{release,changelog,release-candidate} are based on the scripts from
- Apache Aurora
-
-Copyright: 2016 The Apache Software Foundation.
-Home page: https://aurora.apache.org/
-License: http://www.apache.org/licenses/LICENSE-2.0
-
---------------------------------------------------------------------------------
-
-This project includes code from the Google styleguide.
-
-* cpp/build-support/cpplint.py is based on the scripts from the Google styleguide.
-
-Copyright: 2009 Google Inc. All rights reserved.
-Homepage: https://github.com/google/styleguide
-License: 3-clause BSD
-
---------------------------------------------------------------------------------
-
-This project includes code from Snappy.
-
-* cpp/cmake_modules/{SnappyCMakeLists.txt,SnappyConfig.h} are based on code
- from Google's Snappy project.
-
-Copyright: 2009 Google Inc. All rights reserved.
-Homepage: https://github.com/google/snappy
-License: 3-clause BSD
-
---------------------------------------------------------------------------------
-
-This project includes code from the manylinux project.
-
-* python/manylinux1/scripts/{build_python.sh,python-tag-abi-tag.py,
- requirements.txt} are based on code from the manylinux project.
-
-Copyright: 2016 manylinux
-Homepage: https://github.com/pypa/manylinux
-License: The MIT License (MIT)
-
---------------------------------------------------------------------------------
-
-This project includes code from the cymove project:
-
-* python/pyarrow/includes/common.pxd includes code from the cymove project
-
-The MIT License (MIT)
-Copyright (c) 2019 Omer Ozarslan
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
-DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
-OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-OR OTHER DEALINGS IN THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-This project includes code from the Ursabot project under the dev/archery
-directory.
-
-License: BSD 2-Clause
-
-Copyright 2019 RStudio, Inc.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-This project includes code from mingw-w64.
-
-* cpp/src/arrow/util/cpu-info.cc has a polyfill for mingw-w64 < 5
-
-Copyright (c) 2009 - 2013 by the mingw-w64 project
-Homepage: https://mingw-w64.org
-License: Zope Public License (ZPL) Version 2.1.
-
----------------------------------------------------------------------------------
-
-This project includes code from Google's Asylo project.
-
-* cpp/src/arrow/result.h is based on status_or.h
-
-Copyright (c) Copyright 2017 Asylo authors
-Homepage: https://asylo.dev/
-License: Apache 2.0
-
---------------------------------------------------------------------------------
-
-This project includes code from Google's protobuf project
-
-* cpp/src/arrow/result.h ARROW_ASSIGN_OR_RAISE is based off ASSIGN_OR_RETURN
-
-Copyright 2008 Google Inc. All rights reserved.
-Homepage: https://developers.google.com/protocol-buffers/
-License:
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
- * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Code generated by the Protocol Buffer compiler is owned by the owner
-of the input file used when generating it. This code is not
-standalone and requires a support library to be linked with it. This
-support library is itself covered by the above license.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency LLVM is statically linked in certain binary distributions.
-Additionally some sections of source code have been derived from sources in LLVM
-and have been clearly labeled as such. LLVM has the following license:
-
-==============================================================================
-The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
-==============================================================================
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-
----- LLVM Exceptions to the Apache 2.0 License ----
-
-As an exception, if, as a result of your compiling your source code, portions
-of this Software are embedded into an Object form of such source code, you
-may redistribute such embedded portions in such Object form without complying
-with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
-
-In addition, if you combine or link compiled forms of this Software with
-software that is licensed under the GPLv2 ("Combined Software") and if a
-court of competent jurisdiction determines that the patent provision (Section
-3), the indemnity provision (Section 9) or other Section of the License
-conflicts with the conditions of the GPLv2, you may retroactively and
-prospectively choose to deem waived or otherwise exclude such Section(s) of
-the License, but only in their entirety and only with respect to the Combined
-Software.
-
-==============================================================================
-Software from third parties included in the LLVM Project:
-==============================================================================
-The LLVM Project contains third party software which is under different license
-terms. All such code will be identified clearly using at least one of two
-mechanisms:
-1) It will be in a separate directory tree with its own `LICENSE.txt` or
- `LICENSE` file at the top containing the specific license and restrictions
- which apply to that software, or
-2) It will contain specific license and restriction terms at the top of every
- file.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency gRPC is statically linked in certain binary
-distributions, like the python wheels. gRPC has the following license:
-
-Copyright 2014 gRPC authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency Apache Thrift is statically linked in certain binary
-distributions, like the python wheels. Apache Thrift has the following license:
-
-Apache Thrift
-Copyright (C) 2006 - 2019, The Apache Software Foundation
-
-This product includes software developed at
-The Apache Software Foundation (http://www.apache.org/).
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency Apache ORC is statically linked in certain binary
-distributions, like the python wheels. Apache ORC has the following license:
-
-Apache ORC
-Copyright 2013-2019 The Apache Software Foundation
-
-This product includes software developed by The Apache Software
-Foundation (http://www.apache.org/).
-
-This product includes software developed by Hewlett-Packard:
-(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency zstd is statically linked in certain binary
-distributions, like the python wheels. ZSTD has the following license:
-
-BSD License
-
-For Zstandard software
-
-Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name Facebook nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific
- prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency lz4 is statically linked in certain binary
-distributions, like the python wheels. lz4 has the following license:
-
-LZ4 Library
-Copyright (c) 2011-2016, Yann Collet
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice, this
- list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency Brotli is statically linked in certain binary
-distributions, like the python wheels. Brotli has the following license:
-
-Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency rapidjson is statically linked in certain binary
-distributions, like the python wheels. rapidjson and its dependencies have the
-following licenses:
-
-Tencent is pleased to support the open source community by making RapidJSON
-available.
-
-Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
-All rights reserved.
-
-If you have downloaded a copy of the RapidJSON binary from Tencent, please note
-that the RapidJSON binary is licensed under the MIT License.
-If you have downloaded a copy of the RapidJSON source code from Tencent, please
-note that RapidJSON source code is licensed under the MIT License, except for
-the third-party components listed below which are subject to different license
-terms. Your integration of RapidJSON into your own projects may require
-compliance with the MIT License, as well as the other licenses applicable to
-the third-party components included within RapidJSON. To avoid the problematic
-JSON license in your own projects, it's sufficient to exclude the
-bin/jsonchecker/ directory, as it's the only code under the JSON license.
-A copy of the MIT License is included in this file.
-
-Other dependencies and licenses:
-
- Open Source Software Licensed Under the BSD License:
- --------------------------------------------------------------------
-
- The msinttypes r29
- Copyright (c) 2006-2013 Alexander Chemeris
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of copyright holder nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- DAMAGE.
-
- Open Source Software Licensed Under the JSON License:
- --------------------------------------------------------------------
-
- json.org
- Copyright (c) 2002 JSON.org
- All Rights Reserved.
-
- JSON_checker
- Copyright (c) 2002 JSON.org
- All Rights Reserved.
-
-
- Terms of the JSON License:
- ---------------------------------------------------
-
- Permission is hereby granted, free of charge, to any person obtaining a
- copy of this software and associated documentation files (the "Software"),
- to deal in the Software without restriction, including without limitation
- the rights to use, copy, modify, merge, publish, distribute, sublicense,
- and/or sell copies of the Software, and to permit persons to whom the
- Software is furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- The Software shall be used for Good, not Evil.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- DEALINGS IN THE SOFTWARE.
-
-
- Terms of the MIT License:
- --------------------------------------------------------------------
-
- Permission is hereby granted, free of charge, to any person obtaining a
- copy of this software and associated documentation files (the "Software"),
- to deal in the Software without restriction, including without limitation
- the rights to use, copy, modify, merge, publish, distribute, sublicense,
- and/or sell copies of the Software, and to permit persons to whom the
- Software is furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- DEALINGS IN THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency snappy is statically linked in certain binary
-distributions, like the python wheels. snappy has the following license:
-
-Copyright 2011, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of Google Inc. nor the names of its contributors may be
- used to endorse or promote products derived from this software without
- specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-===
-
-Some of the benchmark data in testdata/ is licensed differently:
-
- - fireworks.jpeg is Copyright 2013 Steinar H. Gunderson, and
- is licensed under the Creative Commons Attribution 3.0 license
- (CC-BY-3.0). See https://creativecommons.org/licenses/by/3.0/
- for more information.
-
- - kppkn.gtb is taken from the Gaviota chess tablebase set, and
- is licensed under the MIT License. See
- https://sites.google.com/site/gaviotachessengine/Home/endgame-tablebases-1
- for more information.
-
- - paper-100k.pdf is an excerpt (bytes 92160 to 194560) from the paper
- “Combinatorial Modeling of Chromatin Features Quantitatively Predicts DNA
- Replication Timing in _Drosophila_” by Federico Comoglio and Renato Paro,
- which is licensed under the CC-BY license. See
-   http://www.ploscompbiol.org/static/license for more information.
-
- - alice29.txt, asyoulik.txt, plrabn12.txt and lcet10.txt are from Project
- Gutenberg. The first three have expired copyrights and are in the public
- domain; the latter does not have expired copyright, but is still in the
- public domain according to the license information
- (http://www.gutenberg.org/ebooks/53).
-
---------------------------------------------------------------------------------
-
-3rdparty dependency gflags is statically linked in certain binary
-distributions, like the python wheels. gflags has the following license:
-
-Copyright (c) 2006, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
- * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency glog is statically linked in certain binary
-distributions, like the python wheels. glog has the following license:
-
-Copyright (c) 2008, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
- * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-A function gettimeofday in utilities.cc is based on
-
-http://www.google.com/codesearch/p?hl=en#dR3YEbitojA/COPYING&q=GetSystemTimeAsFileTime%20license:bsd
-
-The license of this code is:
-
-Copyright (c) 2003-2008, Jouni Malinen <[email protected]> and contributors
-All Rights Reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-3. Neither the name(s) of the above-listed copyright holder(s) nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency re2 is statically linked in certain binary
-distributions, like the python wheels. re2 has the following license:
-
-Copyright (c) 2009 The RE2 Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
- * Neither the name of Google Inc. nor the names of its contributors
- may be used to endorse or promote products derived from this
- software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency c-ares is statically linked in certain binary
-distributions, like the python wheels. c-ares has the following license:
-
-# c-ares license
-
-Copyright (c) 2007 - 2018, Daniel Stenberg with many contributors, see AUTHORS
-file.
-
-Copyright 1998 by the Massachusetts Institute of Technology.
-
-Permission to use, copy, modify, and distribute this software and its
-documentation for any purpose and without fee is hereby granted, provided that
-the above copyright notice appear in all copies and that both that copyright
-notice and this permission notice appear in supporting documentation, and that
-the name of M.I.T. not be used in advertising or publicity pertaining to
-distribution of the software without specific, written prior permission.
-M.I.T. makes no representations about the suitability of this software for any
-purpose. It is provided "as is" without express or implied warranty.
-
---------------------------------------------------------------------------------
-
-3rdparty dependency zlib is redistributed as a dynamically linked shared
-library in certain binary distributions, like the python wheels. In the future
-this will likely change to static linkage. zlib has the following license:
-
-zlib.h -- interface of the 'zlib' general purpose compression library
- version 1.2.11, January 15th, 2017
-
- Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
-
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the authors be held liable for any damages
- arising from the use of this software.
-
- Permission is granted to anyone to use this software for any purpose,
- including commercial applications, and to alter it and redistribute it
- freely, subject to the following restrictions:
-
- 1. The origin of this software must not be misrepresented; you must not
- claim that you wrote the original software. If you use this software
- in a product, an acknowledgment in the product documentation would be
- appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source distribution.
-
- Jean-loup Gailly Mark Adler
-
---------------------------------------------------------------------------------
-
-3rdparty dependency openssl is redistributed as a dynamically linked shared
-library in certain binary distributions, like the python wheels. openssl
-prior to version 3 has the following license:
-
- LICENSE ISSUES
- ==============
-
- The OpenSSL toolkit stays under a double license, i.e. both the conditions of
- the OpenSSL License and the original SSLeay license apply to the toolkit.
- See below for the actual license texts.
-
- OpenSSL License
- ---------------
-
-/* ====================================================================
- * Copyright (c) 1998-2019 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- * This product includes cryptographic software written by Eric Young
- * ([email protected]). This product includes software written by Tim
- * Hudson ([email protected]).
- *
- */
-
- Original SSLeay License
- -----------------------
-
-/* Copyright (C) 1995-1998 Eric Young ([email protected])
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young ([email protected]).
- * The implementation was written so as to conform with Netscape's SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are adhered to.  The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson ([email protected]).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young ([email protected])"
- * The word 'cryptographic' can be left out if the routines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson ([email protected])"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
---------------------------------------------------------------------------------
-
-This project includes code from the rtools-backports project.
-
-* ci/scripts/PKGBUILD and ci/scripts/r_windows_build.sh are based on code
- from the rtools-backports project.
-
-Copyright: Copyright (c) 2013 - 2019, Алексей and Jeroen Ooms.
-All rights reserved.
-Homepage: https://github.com/r-windows/rtools-backports
-License: 3-clause BSD
-
---------------------------------------------------------------------------------
-
-Some code from pandas has been adapted for the pyarrow codebase. pandas is
-available under the 3-clause BSD license, which follows:
-
-pandas license
-==============
-
-Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
-All rights reserved.
-
-Copyright (c) 2008-2011 AQR Capital Management, LLC
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- * Neither the name of the copyright holder nor the names of any
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-Some bits from DyND, in particular aspects of the build system, have been
-adapted from libdynd and dynd-python under the terms of the BSD 2-clause
-license:
-
-The BSD 2-Clause License
-
- Copyright (C) 2011-12, Dynamic NDArray Developers
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Dynamic NDArray Developers list:
-
- * Mark Wiebe
- * Continuum Analytics
-
---------------------------------------------------------------------------------
-
-Some source code from Ibis (https://github.com/cloudera/ibis) has been adapted
-for PyArrow. Ibis is released under the Apache License, Version 2.0.
-
---------------------------------------------------------------------------------
-
-This project includes code from the autobrew project.
-
-* r/tools/autobrew and dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb
- are based on code from the autobrew project.
-
-Copyright (c) 2019, Jeroen Ooms
-License: MIT
-Homepage: https://github.com/jeroen/autobrew
-
---------------------------------------------------------------------------------
-
-dev/tasks/homebrew-formulae/apache-arrow.rb has the following license:
-
-BSD 2-Clause License
-
-Copyright (c) 2009-present, Homebrew contributors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------------------------
-
-cpp/src/arrow/vendored/base64.cpp has the following license
-
-ZLIB License
-
-Copyright (C) 2004-2017 René Nyffenegger
-
-This source code is provided 'as-is', without any express or implied
-warranty. In no event will the author be held liable for any damages arising
-from the use of this software.
-
-Permission is granted to anyone to use this software for any purpose, including
-commercial applications, and to alter it and redistribute it freely, subject to
-the following restrictions:
-
-1. The origin of this source code must not be misrepresented; you must not
- claim that you wrote the original source code. If you use this source code
- in a product, an acknowledgment in the product documentation would be
- appreciated but is not required.
-
-2. Altered source versions must be plainly marked as such, and must not be
- misrepresented as being the original source code.
-
-3. This notice may not be removed or altered from any source distribution.
-
-René Nyffenegger [email protected]
-
---------------------------------------------------------------------------------
-
-The file cpp/src/arrow/vendored/optional.hpp has the following license
-
-Boost Software License - Version 1.0 - August 17th, 2003
-
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-This project includes code from Folly.
-
- * cpp/src/arrow/vendored/ProducerConsumerQueue.h
-
-is based on Folly's
-
- * folly/Portability.h
- * folly/lang/Align.h
- * folly/ProducerConsumerQueue.h
-
-Copyright: Copyright (c) Facebook, Inc. and its affiliates.
-Home page: https://github.com/facebook/folly
-License: http://www.apache.org/licenses/LICENSE-2.0
-
---------------------------------------------------------------------------------
-
-The file cpp/src/arrow/vendored/musl/strptime.c has the following license
-
-Copyright © 2005-2020 Rich Felker, et al.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
---------------------------------------------------------------------------------
-
-The file cpp/cmake_modules/BuildUtils.cmake contains code from
-
-https://gist.github.com/cristianadam/ef920342939a89fae3e8a85ca9459b49
-
-which is made available under the MIT license
-
-Copyright (c) 2019 Cristian Adam
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
---------------------------------------------------------------------------------
-
-The files in cpp/src/arrow/vendored/portable-snippets/ contain code from
-
-https://github.com/nemequ/portable-snippets
-
-and have the following copyright notice:
-
-Each source file contains a preamble explaining the license situation
-for that file, which takes priority over this file. With the
-exception of some code pulled in from other repositories (such as
-µnit, an MIT-licensed project which is used for testing), the code is
-public domain, released using the CC0 1.0 Universal dedication (*).
-
-(*) https://creativecommons.org/publicdomain/zero/1.0/legalcode
-
---------------------------------------------------------------------------------
-
-The files in cpp/src/arrow/vendored/fast_float/ contain code from
-
-https://github.com/lemire/fast_float
-
-which is made available under the Apache License 2.0.
-
---------------------------------------------------------------------------------
-
-The file python/pyarrow/vendored/version.py contains code from
-
-https://github.com/pypa/packaging/
-
-which is made available under both the Apache license v2.0 and the
-BSD 2-clause license.
-
---------------------------------------------------------------------------------
-
-The files in cpp/src/arrow/vendored/pcg contain code from
-
-https://github.com/imneme/pcg-cpp
-
-and have the following copyright notice:
-
-Copyright 2014-2019 Melissa O'Neill <[email protected]>,
- and the PCG Project contributors.
-
-SPDX-License-Identifier: (Apache-2.0 OR MIT)
-
-Licensed under the Apache License, Version 2.0 (provided in
-LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0)
-or under the MIT license (provided in LICENSE-MIT.txt and at
-http://opensource.org/licenses/MIT), at your option. This file may not
-be copied, modified, or distributed except according to those terms.
-
-Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either
-express or implied. See your chosen license for details.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+--------------------------------------------------------------------------------
+
+src/plasma/fling.cc and src/plasma/fling.h: Apache 2.0
+
+Copyright 2013 Sharvil Nanavati
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+src/plasma/thirdparty/ae: Modified / 3-Clause BSD
+
+Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Redis nor the names of its contributors may be used
+ to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+src/plasma/thirdparty/dlmalloc.c: CC0
+
+This is a version (aka dlmalloc) of malloc/free/realloc written by
+Doug Lea and released to the public domain, as explained at
+http://creativecommons.org/publicdomain/zero/1.0/ Send questions,
+comments, complaints, performance data, etc to [email protected]
+
+--------------------------------------------------------------------------------
+
+src/plasma/common.cc (some portions)
+
+Copyright (c) Austin Appleby (aappleby (AT) gmail)
+
+Some portions of this file are derived from code in the MurmurHash project
+
+All code is released to the public domain. For business purposes, Murmurhash is
+under the MIT license.
+
+https://sites.google.com/site/murmurhash/
+
+--------------------------------------------------------------------------------
+
+src/arrow/util (some portions): Apache 2.0 and 3-clause BSD
+
+Some portions of this module are derived from code in the Chromium project,
+copyright (c) Google Inc. and (c) The Chromium Authors, and licensed under the
+Apache 2.0 License or under the 3-clause BSD license:
+
+ Copyright (c) 2013 The Chromium Authors. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from Daniel Lemire's FrameOfReference project.
+
+https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp
+
+Copyright: 2013 Daniel Lemire
+Home page: http://lemire.me/en/
+Project page: https://github.com/lemire/FrameOfReference
+License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This project includes code from the TensorFlow project
+
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the NumPy project.
+
+https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910
+
+https://github.com/numpy/numpy/blob/68fd82271b9ea5a9e50d4e761061dfcca851382a/numpy/core/src/multiarray/datetime.c
+
+Copyright (c) 2005-2017, NumPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the NumPy Developers nor the names of any
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the Boost project
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the FlatBuffers project
+
+Copyright 2014 Google Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the tslib project
+
+Copyright 2015 Microsoft Corporation. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the jemalloc project
+
+https://github.com/jemalloc/jemalloc
+
+Copyright (C) 2002-2017 Jason Evans <[email protected]>.
+All rights reserved.
+Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved.
+Copyright (C) 2009-2017 Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice(s),
+ this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice(s),
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the Go project (BSD 3-clause license, plus the
+PATENTS weak patent termination clause:
+https://github.com/golang/go/blob/master/PATENTS).
+
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the hs2client project
+
+https://github.com/cloudera/hs2client
+
+Copyright 2016 Cloudera Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+The script ci/scripts/util_wait_for_it.sh has the following license
+
+Copyright (c) 2016 Giles Hall
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The script r/configure has the following license (MIT)
+
+Copyright (c) 2017, Jeroen Ooms and Jim Hester
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+cpp/src/arrow/util/logging.cc, cpp/src/arrow/util/logging.h and
+cpp/src/arrow/util/logging-test.cc are adapted from the
+Ray Project (https://github.com/ray-project/ray) (Apache 2.0).
+
+Copyright (c) 2016 Ray Project (https://github.com/ray-project/ray)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+The files cpp/src/arrow/vendored/datetime/date.h, cpp/src/arrow/vendored/datetime/tz.h,
+cpp/src/arrow/vendored/datetime/tz_private.h, cpp/src/arrow/vendored/datetime/ios.h,
+cpp/src/arrow/vendored/datetime/ios.mm, and
+cpp/src/arrow/vendored/datetime/tz.cpp are adapted from
+Howard Hinnant's date library (https://github.com/HowardHinnant/date).
+It is licensed under the MIT license.
+
+The MIT License (MIT)
+Copyright (c) 2015, 2016, 2017 Howard Hinnant
+Copyright (c) 2016 Adrian Colomitchi
+Copyright (c) 2017 Florian Dang
+Copyright (c) 2017 Paul Thompson
+Copyright (c) 2018 Tomasz Kamiński
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The file cpp/src/arrow/util/utf8.h includes code adapted from the page
+ https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+with the following license (MIT)
+
+Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The file cpp/src/arrow/vendored/string_view.hpp has the following license
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/xxhash/ have the following license
+(BSD 2-Clause License)
+
+xxHash Library
+Copyright (c) 2012-2014, Yann Collet
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash homepage: http://www.xxhash.com
+- xxHash source repository : https://github.com/Cyan4973/xxHash
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/double-conversion/ have the following license
+(BSD 3-Clause License)
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/uriparser/ have the following license
+(BSD 3-Clause License)
+
+uriparser - RFC 3986 URI parsing library
+
+Copyright (C) 2007, Weijia Song <[email protected]>
+Copyright (C) 2007, Sebastian Pipping <[email protected]>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the following
+ disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+ * Neither the name of the <ORGANIZATION> nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+The files under dev/tasks/conda-recipes have the following license
+
+BSD 3-clause license
+Copyright (c) 2015-2018, conda-forge
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/utfcpp/ have the following license
+
+Copyright 2006-2018 Nemanja Trifunovic
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from Apache Kudu.
+
+ * cpp/cmake_modules/CompilerInfo.cmake is based on Kudu's cmake_modules/CompilerInfo.cmake
+
+Copyright: 2016 The Apache Software Foundation.
+Home page: https://kudu.apache.org/
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This project includes code from Apache Impala (incubating), formerly
+Impala. The Impala code and rights were donated to the ASF as part of the
+Incubator process after the initial code imports into Apache Parquet.
+
+Copyright: 2012 Cloudera, Inc.
+Copyright: 2016 The Apache Software Foundation.
+Home page: http://impala.apache.org/
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This project includes code from Apache Aurora.
+
+* dev/release/{release,changelog,release-candidate} are based on the scripts from
+ Apache Aurora
+
+Copyright: 2016 The Apache Software Foundation.
+Home page: https://aurora.apache.org/
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This project includes code from the Google styleguide.
+
+* cpp/build-support/cpplint.py is based on the scripts from the Google styleguide.
+
+Copyright: 2009 Google Inc. All rights reserved.
+Homepage: https://github.com/google/styleguide
+License: 3-clause BSD
+
+--------------------------------------------------------------------------------
+
+This project includes code from Snappy.
+
+* cpp/cmake_modules/{SnappyCMakeLists.txt,SnappyConfig.h} are based on code
+ from Google's Snappy project.
+
+Copyright: 2009 Google Inc. All rights reserved.
+Homepage: https://github.com/google/snappy
+License: 3-clause BSD
+
+--------------------------------------------------------------------------------
+
+This project includes code from the manylinux project.
+
+* python/manylinux1/scripts/{build_python.sh,python-tag-abi-tag.py,
+ requirements.txt} are based on code from the manylinux project.
+
+Copyright: 2016 manylinux
+Homepage: https://github.com/pypa/manylinux
+License: The MIT License (MIT)
+
+--------------------------------------------------------------------------------
+
+This project includes code from the cymove project:
+
+* python/pyarrow/includes/common.pxd includes code from the cymove project
+
+The MIT License (MIT)
+Copyright (c) 2019 Omer Ozarslan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the Ursabot project under the dev/archery
+directory.
+
+License: BSD 2-Clause
+
+Copyright 2019 RStudio, Inc.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from mingw-w64.
+
+* cpp/src/arrow/util/cpu-info.cc has a polyfill for mingw-w64 < 5
+
+Copyright (c) 2009 - 2013 by the mingw-w64 project
+Homepage: https://mingw-w64.org
+License: Zope Public License (ZPL) Version 2.1.
+
+--------------------------------------------------------------------------------
+
+This project includes code from Google's Asylo project.
+
+* cpp/src/arrow/result.h is based on status_or.h
+
+Copyright (c) 2017 Asylo authors
+Homepage: https://asylo.dev/
+License: Apache 2.0
+
+--------------------------------------------------------------------------------
+
+This project includes code from Google's protobuf project
+
+* cpp/src/arrow/result.h: ARROW_ASSIGN_OR_RAISE is based on ASSIGN_OR_RETURN
+
+Copyright 2008 Google Inc. All rights reserved.
+Homepage: https://developers.google.com/protocol-buffers/
+License:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Code generated by the Protocol Buffer compiler is owned by the owner
+of the input file used when generating it. This code is not
+standalone and requires a support library to be linked with it. This
+support library is itself covered by the above license.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency LLVM is statically linked in certain binary distributions.
+Additionally, some sections of source code have been derived from sources in LLVM
+and have been clearly labeled as such. LLVM has the following license:
+
+==============================================================================
+The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
+==============================================================================
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+
+---- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+
+==============================================================================
+Software from third parties included in the LLVM Project:
+==============================================================================
+The LLVM Project contains third party software which is under different license
+terms. All such code will be identified clearly using at least one of two
+mechanisms:
+1) It will be in a separate directory tree with its own `LICENSE.txt` or
+ `LICENSE` file at the top containing the specific license and restrictions
+ which apply to that software, or
+2) It will contain specific license and restriction terms at the top of every
+ file.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency gRPC is statically linked in certain binary
+distributions, like the python wheels. gRPC has the following license:
+
+Copyright 2014 gRPC authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency Apache Thrift is statically linked in certain binary
+distributions, like the python wheels. Apache Thrift has the following license:
+
+Apache Thrift
+Copyright (C) 2006 - 2019, The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency Apache ORC is statically linked in certain binary
+distributions, like the python wheels. Apache ORC has the following license:
+
+Apache ORC
+Copyright 2013-2019 The Apache Software Foundation
+
+This product includes software developed by The Apache Software
+Foundation (http://www.apache.org/).
+
+This product includes software developed by Hewlett-Packard:
+(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency zstd is statically linked in certain binary
+distributions, like the python wheels. ZSTD has the following license:
+
+BSD License
+
+For Zstandard software
+
+Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+ endorse or promote products derived from this software without specific
+ prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency lz4 is statically linked in certain binary
+distributions, like the python wheels. lz4 has the following license:
+
+LZ4 Library
+Copyright (c) 2011-2016, Yann Collet
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency Brotli is statically linked in certain binary
+distributions, like the python wheels. Brotli has the following license:
+
+Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency rapidjson is statically linked in certain binary
+distributions, like the python wheels. rapidjson and its dependencies have the
+following licenses:
+
+Tencent is pleased to support the open source community by making RapidJSON
+available.
+
+Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.
+All rights reserved.
+
+If you have downloaded a copy of the RapidJSON binary from Tencent, please note
+that the RapidJSON binary is licensed under the MIT License.
+If you have downloaded a copy of the RapidJSON source code from Tencent, please
+note that RapidJSON source code is licensed under the MIT License, except for
+the third-party components listed below which are subject to different license
+terms. Your integration of RapidJSON into your own projects may require
+compliance with the MIT License, as well as the other licenses applicable to
+the third-party components included within RapidJSON. To avoid the problematic
+JSON license in your own projects, it's sufficient to exclude the
+bin/jsonchecker/ directory, as it's the only code under the JSON license.
+A copy of the MIT License is included in this file.
+
+Other dependencies and licenses:
+
+ Open Source Software Licensed Under the BSD License:
+ --------------------------------------------------------------------
+
+ The msinttypes r29
+ Copyright (c) 2006-2013 Alexander Chemeris
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ DAMAGE.
+
+ Open Source Software Licensed Under the JSON License:
+ --------------------------------------------------------------------
+
+ json.org
+ Copyright (c) 2002 JSON.org
+ All Rights Reserved.
+
+ JSON_checker
+ Copyright (c) 2002 JSON.org
+ All Rights Reserved.
+
+
+ Terms of the JSON License:
+ ---------------------------------------------------
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ The Software shall be used for Good, not Evil.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+
+
+ Terms of the MIT License:
+ --------------------------------------------------------------------
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency snappy is statically linked in certain binary
+distributions, like the python wheels. snappy has the following license:
+
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of Google Inc. nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+===
+
+Some of the benchmark data in testdata/ is licensed differently:
+
+ - fireworks.jpeg is Copyright 2013 Steinar H. Gunderson, and
+ is licensed under the Creative Commons Attribution 3.0 license
+ (CC-BY-3.0). See https://creativecommons.org/licenses/by/3.0/
+ for more information.
+
+ - kppkn.gtb is taken from the Gaviota chess tablebase set, and
+ is licensed under the MIT License. See
+ https://sites.google.com/site/gaviotachessengine/Home/endgame-tablebases-1
+ for more information.
+
+ - paper-100k.pdf is an excerpt (bytes 92160 to 194560) from the paper
+ “Combinatorial Modeling of Chromatin Features Quantitatively Predicts DNA
+ Replication Timing in _Drosophila_” by Federico Comoglio and Renato Paro,
+ which is licensed under the CC-BY license. See
+   http://www.ploscompbiol.org/static/license for more information.
+
+ - alice29.txt, asyoulik.txt, plrabn12.txt and lcet10.txt are from Project
+ Gutenberg. The first three have expired copyrights and are in the public
+ domain; the latter does not have expired copyright, but is still in the
+ public domain according to the license information
+ (http://www.gutenberg.org/ebooks/53).
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency gflags is statically linked in certain binary
+distributions, like the python wheels. gflags has the following license:
+
+Copyright (c) 2006, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency glog is statically linked in certain binary
+distributions, like the python wheels. glog has the following license:
+
+Copyright (c) 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+A function gettimeofday in utilities.cc is based on
+
+http://www.google.com/codesearch/p?hl=en#dR3YEbitojA/COPYING&q=GetSystemTimeAsFileTime%20license:bsd
+
+The license of this code is:
+
+Copyright (c) 2003-2008, Jouni Malinen <[email protected]> and contributors
+All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. Neither the name(s) of the above-listed copyright holder(s) nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency re2 is statically linked in certain binary
+distributions, like the python wheels. re2 has the following license:
+
+Copyright (c) 2009 The RE2 Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of Google Inc. nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency c-ares is statically linked in certain binary
+distributions, like the python wheels. c-ares has the following license:
+
+# c-ares license
+
+Copyright (c) 2007 - 2018, Daniel Stenberg with many contributors, see AUTHORS
+file.
+
+Copyright 1998 by the Massachusetts Institute of Technology.
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted, provided that
+the above copyright notice appear in all copies and that both that copyright
+notice and this permission notice appear in supporting documentation, and that
+the name of M.I.T. not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior permission.
+M.I.T. makes no representations about the suitability of this software for any
+purpose. It is provided "as is" without express or implied warranty.
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency zlib is redistributed as a dynamically linked shared
+library in certain binary distributions, like the python wheels. In the future
+this will likely change to static linkage. zlib has the following license:
+
+zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.11, January 15th, 2017
+
+ Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+
+--------------------------------------------------------------------------------
+
+3rdparty dependency openssl is redistributed as a dynamically linked shared
+library in certain binary distributions, like the python wheels. openssl
+preceding version 3 has the following license:
+
+ LICENSE ISSUES
+ ==============
+
+ The OpenSSL toolkit stays under a double license, i.e. both the conditions of
+ the OpenSSL License and the original SSLeay license apply to the toolkit.
+ See below for the actual license texts.
+
+ OpenSSL License
+ ---------------
+
+/* ====================================================================
+ * Copyright (c) 1998-2019 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * ([email protected]). This product includes software written by Tim
+ * Hudson ([email protected]).
+ *
+ */
+
+ Original SSLeay License
+ -----------------------
+
+/* Copyright (C) 1995-1998 Eric Young ([email protected])
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young ([email protected]).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson ([email protected]).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young ([email protected])"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson ([email protected])"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+--------------------------------------------------------------------------------
+
+This project includes code from the rtools-backports project.
+
+* ci/scripts/PKGBUILD and ci/scripts/r_windows_build.sh are based on code
+ from the rtools-backports project.
+
+Copyright: Copyright (c) 2013 - 2019, Алексей and Jeroen Ooms.
+All rights reserved.
+Homepage: https://github.com/r-windows/rtools-backports
+License: 3-clause BSD
+
+--------------------------------------------------------------------------------
+
+Some code from pandas has been adapted for the pyarrow codebase. pandas is
+available under the 3-clause BSD license, which follows:
+
+pandas license
+==============
+
+Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
+All rights reserved.
+
+Copyright (c) 2008-2011 AQR Capital Management, LLC
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the copyright holder nor the names of any
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
+Some bits from DyND, in particular aspects of the build system, have been
+adapted from libdynd and dynd-python under the terms of the BSD 2-clause
+license
+
+The BSD 2-Clause License
+
+ Copyright (C) 2011-12, Dynamic NDArray Developers
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Dynamic NDArray Developers list:
+
+ * Mark Wiebe
+ * Continuum Analytics
+
+--------------------------------------------------------------------------------
+
+Some source code from Ibis (https://github.com/cloudera/ibis) has been adapted
+for PyArrow. Ibis is released under the Apache License, Version 2.0.
+
+--------------------------------------------------------------------------------
+
+This project includes code from the autobrew project.
+
+* r/tools/autobrew and dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb
+ are based on code from the autobrew project.
+
+Copyright (c) 2019, Jeroen Ooms
+License: MIT
+Homepage: https://github.com/jeroen/autobrew
+
+--------------------------------------------------------------------------------
+
+dev/tasks/homebrew-formulae/apache-arrow.rb has the following license:
+
+BSD 2-Clause License
+
+Copyright (c) 2009-present, Homebrew contributors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------------------------------------------------------------
+
+cpp/src/arrow/vendored/base64.cpp has the following license
+
+ZLIB License
+
+Copyright (C) 2004-2017 René Nyffenegger
+
+This source code is provided 'as-is', without any express or implied
+warranty. In no event will the author be held liable for any damages arising
+from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose, including
+commercial applications, and to alter it and redistribute it freely, subject to
+the following restrictions:
+
+1. The origin of this source code must not be misrepresented; you must not
+ claim that you wrote the original source code. If you use this source code
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original source code.
+
+3. This notice may not be removed or altered from any source distribution.
+
+René Nyffenegger [email protected]
+
+--------------------------------------------------------------------------------
+
+The file cpp/src/arrow/vendored/optional.hpp has the following license
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+This project includes code from Folly.
+
+ * cpp/src/arrow/vendored/ProducerConsumerQueue.h
+
+is based on Folly's
+
+ * folly/Portability.h
+ * folly/lang/Align.h
+ * folly/ProducerConsumerQueue.h
+
+Copyright: Copyright (c) Facebook, Inc. and its affiliates.
+Home page: https://github.com/facebook/folly
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+The file cpp/src/arrow/vendored/musl/strptime.c has the following license
+
+Copyright © 2005-2020 Rich Felker, et al.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The file cpp/cmake_modules/BuildUtils.cmake contains code from
+
+https://gist.github.com/cristianadam/ef920342939a89fae3e8a85ca9459b49
+
+which is made available under the MIT license
+
+Copyright (c) 2019 Cristian Adam
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/portable-snippets/ contain code from
+
+https://github.com/nemequ/portable-snippets
+
+and have the following copyright notice:
+
+Each source file contains a preamble explaining the license situation
+for that file, which takes priority over this file. With the
+exception of some code pulled in from other repositories (such as
+µnit, an MIT-licensed project which is used for testing), the code is
+public domain, released using the CC0 1.0 Universal dedication (*).
+
+(*) https://creativecommons.org/publicdomain/zero/1.0/legalcode
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/fast_float/ contain code from
+
+https://github.com/lemire/fast_float
+
+which is made available under the Apache License 2.0.
+
+--------------------------------------------------------------------------------
+
+The file python/pyarrow/vendored/version.py contains code from
+
+https://github.com/pypa/packaging/
+
+which is made available under both the Apache license v2.0 and the
+BSD 2-clause license.
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/vendored/pcg contain code from
+
+https://github.com/imneme/pcg-cpp
+
+and have the following copyright notice:
+
+Copyright 2014-2019 Melissa O'Neill <[email protected]>,
+ and the PCG Project contributors.
+
+SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+Licensed under the Apache License, Version 2.0 (provided in
+LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0)
+or under the MIT license (provided in LICENSE-MIT.txt and at
+http://opensource.org/licenses/MIT), at your option. This file may not
+be copied, modified, or distributed except according to those terms.
+
+Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either
+express or implied. See your chosen license for details.
diff --git a/contrib/libs/apache/arrow/NOTICE.txt b/contrib/libs/apache/arrow/NOTICE.txt
index a609791374c..649a6c21c40 100644
--- a/contrib/libs/apache/arrow/NOTICE.txt
+++ b/contrib/libs/apache/arrow/NOTICE.txt
@@ -1,84 +1,84 @@
-Apache Arrow
-Copyright 2016-2019 The Apache Software Foundation
-
-This product includes software developed at
-The Apache Software Foundation (http://www.apache.org/).
-
-This product includes software from the SFrame project (BSD, 3-clause).
-* Copyright (C) 2015 Dato, Inc.
-* Copyright (c) 2009 Carnegie Mellon University.
-
-This product includes software from the Feather project (Apache 2.0)
-https://github.com/wesm/feather
-
-This product includes software from the DyND project (BSD 2-clause)
-https://github.com/libdynd
-
-This product includes software from the LLVM project
- * distributed under the University of Illinois Open Source
-
-This product includes software from the google-lint project
- * Copyright (c) 2009 Google Inc. All rights reserved.
-
-This product includes software from the mman-win32 project
- * Copyright https://code.google.com/p/mman-win32/
- * Licensed under the MIT License;
-
-This product includes software from the LevelDB project
- * Copyright (c) 2011 The LevelDB Authors. All rights reserved.
- * Use of this source code is governed by a BSD-style license that can be
- * Moved from Kudu http://github.com/cloudera/kudu
-
-This product includes software from the CMake project
- * Copyright 2001-2009 Kitware, Inc.
- * Copyright 2012-2014 Continuum Analytics, Inc.
- * All rights reserved.
-
-This product includes software from https://github.com/matthew-brett/multibuild (BSD 2-clause)
- * Copyright (c) 2013-2016, Matt Terry and Matthew Brett; all rights reserved.
-
-This product includes software from the Ibis project (Apache 2.0)
- * Copyright (c) 2015 Cloudera, Inc.
- * https://github.com/cloudera/ibis
-
-This product includes software from Dremio (Apache 2.0)
- * Copyright (C) 2017-2018 Dremio Corporation
- * https://github.com/dremio/dremio-oss
-
-This product includes software from Google Guava (Apache 2.0)
- * Copyright (C) 2007 The Guava Authors
- * https://github.com/google/guava
-
-This product includes software from CMake (BSD 3-Clause)
- * CMake - Cross Platform Makefile Generator
- * Copyright 2000-2019 Kitware, Inc. and Contributors
-
-The web site includes files generated by Jekyll.
-
---------------------------------------------------------------------------------
-
-This product includes code from Apache Kudu, which includes the following in
-its NOTICE file:
-
- Apache Kudu
- Copyright 2016 The Apache Software Foundation
-
- This product includes software developed at
- The Apache Software Foundation (http://www.apache.org/).
-
- Portions of this software were developed at
- Cloudera, Inc (http://www.cloudera.com/).
-
---------------------------------------------------------------------------------
-
-This product includes code from Apache ORC, which includes the following in
-its NOTICE file:
-
- Apache ORC
- Copyright 2013-2019 The Apache Software Foundation
-
- This product includes software developed by The Apache Software
- Foundation (http://www.apache.org/).
-
- This product includes software developed by Hewlett-Packard:
- (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P
+Apache Arrow
+Copyright 2016-2019 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+This product includes software from the SFrame project (BSD, 3-clause).
+* Copyright (C) 2015 Dato, Inc.
+* Copyright (c) 2009 Carnegie Mellon University.
+
+This product includes software from the Feather project (Apache 2.0)
+https://github.com/wesm/feather
+
+This product includes software from the DyND project (BSD 2-clause)
+https://github.com/libdynd
+
+This product includes software from the LLVM project
+ * distributed under the University of Illinois Open Source
+
+This product includes software from the google-lint project
+ * Copyright (c) 2009 Google Inc. All rights reserved.
+
+This product includes software from the mman-win32 project
+ * Copyright https://code.google.com/p/mman-win32/
+ * Licensed under the MIT License;
+
+This product includes software from the LevelDB project
+ * Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * Moved from Kudu http://github.com/cloudera/kudu
+
+This product includes software from the CMake project
+ * Copyright 2001-2009 Kitware, Inc.
+ * Copyright 2012-2014 Continuum Analytics, Inc.
+ * All rights reserved.
+
+This product includes software from https://github.com/matthew-brett/multibuild (BSD 2-clause)
+ * Copyright (c) 2013-2016, Matt Terry and Matthew Brett; all rights reserved.
+
+This product includes software from the Ibis project (Apache 2.0)
+ * Copyright (c) 2015 Cloudera, Inc.
+ * https://github.com/cloudera/ibis
+
+This product includes software from Dremio (Apache 2.0)
+ * Copyright (C) 2017-2018 Dremio Corporation
+ * https://github.com/dremio/dremio-oss
+
+This product includes software from Google Guava (Apache 2.0)
+ * Copyright (C) 2007 The Guava Authors
+ * https://github.com/google/guava
+
+This product includes software from CMake (BSD 3-Clause)
+ * CMake - Cross Platform Makefile Generator
+ * Copyright 2000-2019 Kitware, Inc. and Contributors
+
+The web site includes files generated by Jekyll.
+
+--------------------------------------------------------------------------------
+
+This product includes code from Apache Kudu, which includes the following in
+its NOTICE file:
+
+ Apache Kudu
+ Copyright 2016 The Apache Software Foundation
+
+ This product includes software developed at
+ The Apache Software Foundation (http://www.apache.org/).
+
+ Portions of this software were developed at
+ Cloudera, Inc (http://www.cloudera.com/).
+
+--------------------------------------------------------------------------------
+
+This product includes code from Apache ORC, which includes the following in
+its NOTICE file:
+
+ Apache ORC
+ Copyright 2013-2019 The Apache Software Foundation
+
+ This product includes software developed by The Apache Software
+ Foundation (http://www.apache.org/).
+
+ This product includes software developed by Hewlett-Packard:
+ (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P
diff --git a/contrib/libs/apache/arrow/README.md b/contrib/libs/apache/arrow/README.md
index 7d10b81c6e4..17d4537953a 100644
--- a/contrib/libs/apache/arrow/README.md
+++ b/contrib/libs/apache/arrow/README.md
@@ -17,88 +17,88 @@
under the License.
-->
-# Apache Arrow
-
-[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow)
-[![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/master/LICENSE.txt)
-[![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow)
-
-## Powering In-Memory Analytics
-
-Apache Arrow is a development platform for in-memory analytics. It contains a
-set of technologies that enable big data systems to process and move data fast.
-
-Major components of the project include:
-
- - [The Arrow Columnar In-Memory Format](https://github.com/apache/arrow/blob/master/docs/source/format/Columnar.rst):
- a standard and efficient in-memory representation of various datatypes, plain or nested
- - [The Arrow IPC Format](https://github.com/apache/arrow/blob/master/docs/source/format/Columnar.rst#serialization-and-interprocess-communication-ipc):
- an efficient serialization of the Arrow format and associated metadata,
- for communication between processes and heterogeneous environments
- - [The Arrow Flight RPC protocol](https://github.com/apache/arrow/tree/master/format/Flight.proto):
- based on the Arrow IPC format, a building block for remote services exchanging
- Arrow data with application-defined semantics (for example a storage server or a database)
- - [C++ libraries](https://github.com/apache/arrow/tree/master/cpp)
- - [C bindings using GLib](https://github.com/apache/arrow/tree/master/c_glib)
- - [C# .NET libraries](https://github.com/apache/arrow/tree/master/csharp)
- - [Gandiva](https://github.com/apache/arrow/tree/master/cpp/src/gandiva):
- an [LLVM](https://llvm.org)-based Arrow expression compiler, part of the C++ codebase
- - [Go libraries](https://github.com/apache/arrow/tree/master/go)
- - [Java libraries](https://github.com/apache/arrow/tree/master/java)
- - [JavaScript libraries](https://github.com/apache/arrow/tree/master/js)
- - [Plasma Object Store](https://github.com/apache/arrow/tree/master/cpp/src/plasma):
- a shared-memory blob store, part of the C++ codebase
- - [Python libraries](https://github.com/apache/arrow/tree/master/python)
- - [R libraries](https://github.com/apache/arrow/tree/master/r)
- - [Ruby libraries](https://github.com/apache/arrow/tree/master/ruby)
- - [Rust libraries](https://github.com/apache/arrow-rs)
-
-Arrow is an [Apache Software Foundation](https://www.apache.org) project. Learn more at
-[arrow.apache.org](https://arrow.apache.org).
-
-## What's in the Arrow libraries?
-
-The reference Arrow libraries contain many distinct software components:
-
-- Columnar vector and table-like containers (similar to data frames) supporting
- flat or nested types
-- Fast, language agnostic metadata messaging layer (using Google's Flatbuffers
- library)
-- Reference-counted off-heap buffer memory management, for zero-copy memory
- sharing and handling memory-mapped files
-- IO interfaces to local and remote filesystems
-- Self-describing binary wire formats (streaming and batch/file-like) for
- remote procedure calls (RPC) and interprocess communication (IPC)
-- Integration tests for verifying binary compatibility between the
- implementations (e.g. sending data from Java to C++)
-- Conversions to and from other in-memory data structures
-- Readers and writers for various widely-used file formats (such as Parquet, CSV)
-
-## Implementation status
-
-The official Arrow libraries in this repository are in different stages of
-implementing the Arrow format and related features. See our current
-[feature matrix](https://github.com/apache/arrow/blob/master/docs/source/status.rst)
-on git master.
-
-## How to Contribute
-
-Please read our latest [project contribution guide][5].
-
-## Getting involved
-
-Even if you do not plan to contribute to Apache Arrow itself or Arrow
-integrations in other projects, we'd be happy to have you involved:
-
-- Join the mailing list: send an email to
- [[email protected]][1]. Share your ideas and use cases for the
- project.
-- [Follow our activity on JIRA][3]
-- [Learn the format][2]
-- Contribute code to one of the reference implementations
-
-[1]: mailto:[email protected]
-[2]: https://github.com/apache/arrow/tree/master/format
-[3]: https://issues.apache.org/jira/browse/ARROW
-[4]: https://github.com/apache/arrow
-[5]: https://github.com/apache/arrow/blob/master/docs/source/developers/contributing.rst
+# Apache Arrow
+
+[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow)
+[![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/master/LICENSE.txt)
+[![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow)
+
+## Powering In-Memory Analytics
+
+Apache Arrow is a development platform for in-memory analytics. It contains a
+set of technologies that enable big data systems to process and move data fast.
+
+Major components of the project include:
+
+ - [The Arrow Columnar In-Memory Format](https://github.com/apache/arrow/blob/master/docs/source/format/Columnar.rst):
+ a standard and efficient in-memory representation of various datatypes, plain or nested
+ - [The Arrow IPC Format](https://github.com/apache/arrow/blob/master/docs/source/format/Columnar.rst#serialization-and-interprocess-communication-ipc):
+ an efficient serialization of the Arrow format and associated metadata,
+ for communication between processes and heterogeneous environments
+ - [The Arrow Flight RPC protocol](https://github.com/apache/arrow/tree/master/format/Flight.proto):
+ based on the Arrow IPC format, a building block for remote services exchanging
+ Arrow data with application-defined semantics (for example a storage server or a database)
+ - [C++ libraries](https://github.com/apache/arrow/tree/master/cpp)
+ - [C bindings using GLib](https://github.com/apache/arrow/tree/master/c_glib)
+ - [C# .NET libraries](https://github.com/apache/arrow/tree/master/csharp)
+ - [Gandiva](https://github.com/apache/arrow/tree/master/cpp/src/gandiva):
+ an [LLVM](https://llvm.org)-based Arrow expression compiler, part of the C++ codebase
+ - [Go libraries](https://github.com/apache/arrow/tree/master/go)
+ - [Java libraries](https://github.com/apache/arrow/tree/master/java)
+ - [JavaScript libraries](https://github.com/apache/arrow/tree/master/js)
+ - [Plasma Object Store](https://github.com/apache/arrow/tree/master/cpp/src/plasma):
+ a shared-memory blob store, part of the C++ codebase
+ - [Python libraries](https://github.com/apache/arrow/tree/master/python)
+ - [R libraries](https://github.com/apache/arrow/tree/master/r)
+ - [Ruby libraries](https://github.com/apache/arrow/tree/master/ruby)
+ - [Rust libraries](https://github.com/apache/arrow-rs)
+
+Arrow is an [Apache Software Foundation](https://www.apache.org) project. Learn more at
+[arrow.apache.org](https://arrow.apache.org).
+
+## What's in the Arrow libraries?
+
+The reference Arrow libraries contain many distinct software components:
+
+- Columnar vector and table-like containers (similar to data frames) supporting
+ flat or nested types (see the sketch after this list)
+- Fast, language-agnostic metadata messaging layer (using Google's Flatbuffers
+ library)
+- Reference-counted off-heap buffer memory management, for zero-copy memory
+ sharing and handling memory-mapped files
+- IO interfaces to local and remote filesystems
+- Self-describing binary wire formats (streaming and batch/file-like) for
+ remote procedure calls (RPC) and interprocess communication (IPC)
+- Integration tests for verifying binary compatibility between the
+ implementations (e.g. sending data from Java to C++)
+- Conversions to and from other in-memory data structures
+- Readers and writers for various widely-used file formats (such as Parquet, CSV)
+
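To make the container and builder layers concrete, here is a minimal sketch using the C++ libraries (the function name and values are illustrative, not part of the codebase):

```cpp
#include <memory>

#include "arrow/api.h"  // umbrella header for the core C++ API

// Build a columnar int64 array with a builder, then wrap it in a
// table-like container with a named schema.
arrow::Status BuildExample() {
  arrow::Int64Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
  std::shared_ptr<arrow::Array> array;
  ARROW_RETURN_NOT_OK(builder.Finish(&array));

  auto schema = arrow::schema({arrow::field("x", arrow::int64())});
  std::shared_ptr<arrow::Table> table = arrow::Table::Make(schema, {array});
  return arrow::Status::OK();
}
```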
+## Implementation status
+
+The official Arrow libraries in this repository are in different stages of
+implementing the Arrow format and related features. See our current
+[feature matrix](https://github.com/apache/arrow/blob/master/docs/source/status.rst)
+on git master.
+
+## How to Contribute
+
+Please read our latest [project contribution guide][5].
+
+## Getting involved
+
+Even if you do not plan to contribute to Apache Arrow itself or Arrow
+integrations in other projects, we'd be happy to have you involved:
+
+- Join the mailing list: send an email to
+ [[email protected]][1]. Share your ideas and use cases for the
+ project.
+- [Follow our activity on JIRA][3]
+- [Learn the format][2]
+- Contribute code to one of the reference implementations
+
+[1]: mailto:[email protected]
+[2]: https://github.com/apache/arrow/tree/master/format
+[3]: https://issues.apache.org/jira/browse/ARROW
+[4]: https://github.com/apache/arrow
+[5]: https://github.com/apache/arrow/blob/master/docs/source/developers/contributing.rst
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
index 2f74b40e40d..33ac56ff816 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
@@ -1,595 +1,595 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/adapters/orc/adapter.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <functional>
-#include <list>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/adapters/orc/adapter_util.h"
-#include "arrow/buffer.h"
-#include "arrow/builder.h"
-#include "arrow/io/interfaces.h"
-#include "arrow/memory_pool.h"
-#include "arrow/record_batch.h"
-#include "arrow/status.h"
-#include "arrow/table.h"
-#include "arrow/table_builder.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/decimal.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/range.h"
-#include "arrow/util/visibility.h"
-#include "orc/Exceptions.hh"
-
-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
-#define ORC_THROW_NOT_OK(s) \
- do { \
- Status _s = (s); \
- if (!_s.ok()) { \
- std::stringstream ss; \
- ss << "Arrow error: " << _s.ToString(); \
- throw liborc::ParseError(ss.str()); \
- } \
- } while (0)
-
-#define ORC_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
- auto status_name = (rexpr); \
- ORC_THROW_NOT_OK(status_name.status()); \
- lhs = std::move(status_name).ValueOrDie();
-
-#define ORC_ASSIGN_OR_THROW(lhs, rexpr) \
- ORC_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
- lhs, rexpr);
-
-#define ORC_BEGIN_CATCH_NOT_OK try {
-#define ORC_END_CATCH_NOT_OK \
- } \
- catch (const liborc::ParseError& e) { \
- return Status::IOError(e.what()); \
- } \
- catch (const liborc::InvalidArgument& e) { \
- return Status::Invalid(e.what()); \
- } \
- catch (const liborc::NotImplementedYet& e) { \
- return Status::NotImplemented(e.what()); \
- }
-
-#define ORC_CATCH_NOT_OK(_s) \
- ORC_BEGIN_CATCH_NOT_OK(_s); \
- ORC_END_CATCH_NOT_OK
-
-namespace arrow {
-namespace adapters {
-namespace orc {
-
-namespace {
-
-// The following are required by ORC to be uint64_t
-constexpr uint64_t kOrcWriterBatchSize = 128 * 1024;
-constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024;
-
-using internal::checked_cast;
-
-class ArrowInputFile : public liborc::InputStream {
- public:
- explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)
- : file_(file) {}
-
- uint64_t getLength() const override {
- ORC_ASSIGN_OR_THROW(int64_t size, file_->GetSize());
- return static_cast<uint64_t>(size);
- }
-
- uint64_t getNaturalReadSize() const override { return 128 * 1024; }
-
- void read(void* buf, uint64_t length, uint64_t offset) override {
- ORC_ASSIGN_OR_THROW(int64_t bytes_read, file_->ReadAt(offset, length, buf));
-
- if (static_cast<uint64_t>(bytes_read) != length) {
- throw liborc::ParseError("Short read from arrow input file");
- }
- }
-
- const std::string& getName() const override {
- static const std::string filename("ArrowInputFile");
- return filename;
- }
-
- private:
- std::shared_ptr<io::RandomAccessFile> file_;
-};
-
-struct StripeInformation {
- uint64_t offset;
- uint64_t length;
- uint64_t num_rows;
- uint64_t first_row_of_stripe;
-};
-
-// The number of rows to read in a ColumnVectorBatch
-constexpr int64_t kReadRowsBatch = 1000;
-
-class OrcStripeReader : public RecordBatchReader {
- public:
- OrcStripeReader(std::unique_ptr<liborc::RowReader> row_reader,
- std::shared_ptr<Schema> schema, int64_t batch_size, MemoryPool* pool)
- : row_reader_(std::move(row_reader)),
- schema_(schema),
- pool_(pool),
- batch_size_{batch_size} {}
-
- std::shared_ptr<Schema> schema() const override { return schema_; }
-
- Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
- std::unique_ptr<liborc::ColumnVectorBatch> batch;
- ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
-
- const liborc::Type& type = row_reader_->getSelectedType();
- if (!row_reader_->next(*batch)) {
- out->reset();
- return Status::OK();
- }
-
- std::unique_ptr<RecordBatchBuilder> builder;
- RETURN_NOT_OK(RecordBatchBuilder::Make(schema_, pool_, batch->numElements, &builder));
-
- // The top-level type must be a struct to read into an arrow table
- const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
-
- for (int i = 0; i < builder->num_fields(); i++) {
- RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
- batch->numElements, builder->GetField(i)));
- }
-
- RETURN_NOT_OK(builder->Flush(out));
- return Status::OK();
- }
-
- private:
- std::unique_ptr<liborc::RowReader> row_reader_;
- std::shared_ptr<Schema> schema_;
- MemoryPool* pool_;
- int64_t batch_size_;
-};
-
-} // namespace
-
-class ORCFileReader::Impl {
- public:
- Impl() {}
- ~Impl() {}
-
- Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
- std::unique_ptr<ArrowInputFile> io_wrapper(new ArrowInputFile(file));
- liborc::ReaderOptions options;
- std::unique_ptr<liborc::Reader> liborc_reader;
- ORC_CATCH_NOT_OK(liborc_reader = createReader(std::move(io_wrapper), options));
- pool_ = pool;
- reader_ = std::move(liborc_reader);
- current_row_ = 0;
-
- return Init();
- }
-
- Status Init() {
- int64_t nstripes = reader_->getNumberOfStripes();
- stripes_.resize(nstripes);
- std::unique_ptr<liborc::StripeInformation> stripe;
- uint64_t first_row_of_stripe = 0;
- for (int i = 0; i < nstripes; ++i) {
- stripe = reader_->getStripe(i);
- stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
- stripe->getNumberOfRows(), first_row_of_stripe});
- first_row_of_stripe += stripe->getNumberOfRows();
- }
- return Status::OK();
- }
-
- int64_t NumberOfStripes() { return stripes_.size(); }
-
- int64_t NumberOfRows() { return reader_->getNumberOfRows(); }
-
- Status ReadSchema(std::shared_ptr<Schema>* out) {
- const liborc::Type& type = reader_->getType();
- return GetArrowSchema(type, out);
- }
-
- Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr<Schema>* out) {
- std::unique_ptr<liborc::RowReader> row_reader;
- ORC_CATCH_NOT_OK(row_reader = reader_->createRowReader(opts));
- const liborc::Type& type = row_reader->getSelectedType();
- return GetArrowSchema(type, out);
- }
-
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() {
- const std::list<std::string> keys = reader_->getMetadataKeys();
- auto metadata = std::make_shared<KeyValueMetadata>();
- for (const auto& key : keys) {
- metadata->Append(key, reader_->getMetadataValue(key));
- }
- return std::const_pointer_cast<const KeyValueMetadata>(metadata);
- }
-
- Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* out) {
- if (type.getKind() != liborc::STRUCT) {
- return Status::NotImplemented(
- "Only ORC files with a top-level struct "
- "can be handled");
- }
- int size = static_cast<int>(type.getSubtypeCount());
- std::vector<std::shared_ptr<Field>> fields;
- for (int child = 0; child < size; ++child) {
- std::shared_ptr<DataType> elemtype;
- RETURN_NOT_OK(GetArrowType(type.getSubtype(child), &elemtype));
- std::string name = type.getFieldName(child);
- fields.push_back(field(name, elemtype));
- }
- ARROW_ASSIGN_OR_RAISE(auto metadata, ReadMetadata());
- *out = std::make_shared<Schema>(std::move(fields), std::move(metadata));
- return Status::OK();
- }
-
- Status Read(std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadTable(opts, schema, out);
- }
-
- Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- return ReadTable(opts, schema, out);
- }
-
- Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadTable(opts, schema, out);
- }
-
- Status Read(const std::shared_ptr<Schema>& schema,
- const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- return ReadTable(opts, schema, out);
- }
-
- Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectStripe(&opts, stripe));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
- }
-
- Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatch>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- RETURN_NOT_OK(SelectStripe(&opts, stripe));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
- }
-
- Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
- ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
- Status::Invalid("Out of bounds stripe: ", stripe));
-
- opts->range(stripes_[stripe].offset, stripes_[stripe].length);
- return Status::OK();
- }
-
- Status SelectStripeWithRowNumber(liborc::RowReaderOptions* opts, int64_t row_number,
- StripeInformation* out) {
- ARROW_RETURN_IF(row_number >= NumberOfRows(),
- Status::Invalid("Out of bounds row number: ", row_number));
-
- for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
- if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
- static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
- opts->range(it->offset, it->length);
- *out = *it;
- return Status::OK();
- }
- }
-
- return Status::Invalid("Invalid row number", row_number);
- }
-
- Status SelectIndices(liborc::RowReaderOptions* opts,
- const std::vector<int>& include_indices) {
- std::list<uint64_t> include_indices_list;
- for (auto it = include_indices.begin(); it != include_indices.end(); ++it) {
- ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
- include_indices_list.push_back(*it);
- }
- opts->includeTypes(include_indices_list);
- return Status::OK();
- }
-
- Status ReadTable(const liborc::RowReaderOptions& row_opts,
- const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts(row_opts);
- std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
- for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
- opts.range(stripes_[stripe].offset, stripes_[stripe].length);
- RETURN_NOT_OK(ReadBatch(opts, schema, stripes_[stripe].num_rows, &batches[stripe]));
- }
- return Table::FromRecordBatches(schema, std::move(batches)).Value(out);
- }
-
- Status ReadBatch(const liborc::RowReaderOptions& opts,
- const std::shared_ptr<Schema>& schema, int64_t nrows,
- std::shared_ptr<RecordBatch>* out) {
- std::unique_ptr<liborc::RowReader> row_reader;
- std::unique_ptr<liborc::ColumnVectorBatch> batch;
-
- ORC_BEGIN_CATCH_NOT_OK
- row_reader = reader_->createRowReader(opts);
- batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
- ORC_END_CATCH_NOT_OK
-
- std::unique_ptr<RecordBatchBuilder> builder;
- RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
-
- // The top-level type must be a struct to read into an arrow table
- const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
-
- const liborc::Type& type = row_reader->getSelectedType();
- while (row_reader->next(*batch)) {
- for (int i = 0; i < builder->num_fields(); i++) {
- RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
- batch->numElements, builder->GetField(i)));
- }
- }
- RETURN_NOT_OK(builder->Flush(out));
- return Status::OK();
- }
-
- Status Seek(int64_t row_number) {
- ARROW_RETURN_IF(row_number >= NumberOfRows(),
- Status::Invalid("Out of bounds row number: ", row_number));
-
- current_row_ = row_number;
- return Status::OK();
- }
-
- Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- if (current_row_ >= NumberOfRows()) {
- out->reset();
- return Status::OK();
- }
-
- liborc::RowReaderOptions opts;
- if (!include_indices.empty()) {
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- }
- StripeInformation stripe_info({0, 0, 0, 0});
- RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- std::unique_ptr<liborc::RowReader> row_reader;
-
- ORC_BEGIN_CATCH_NOT_OK
- row_reader = reader_->createRowReader(opts);
- row_reader->seekToRow(current_row_);
- current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
- ORC_END_CATCH_NOT_OK
-
- *out = std::shared_ptr<RecordBatchReader>(
- new OrcStripeReader(std::move(row_reader), schema, batch_size, pool_));
- return Status::OK();
- }
-
- Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
- return NextStripeReader(batch_size, {}, out);
- }
-
- private:
- MemoryPool* pool_;
- std::unique_ptr<liborc::Reader> reader_;
- std::vector<StripeInformation> stripes_;
- int64_t current_row_;
-};
-
-ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); }
-
-ORCFileReader::~ORCFileReader() {}
-
-Status ORCFileReader::Open(const std::shared_ptr<io::RandomAccessFile>& file,
- MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader) {
- auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader());
- RETURN_NOT_OK(result->impl_->Open(file, pool));
- *reader = std::move(result);
- return Status::OK();
-}
-
-Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
- return impl_->ReadMetadata();
-}
-
-Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
- return impl_->ReadSchema(out);
-}
-
-Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }
-
-Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
- std::shared_ptr<Table>* out) {
- return impl_->Read(schema, out);
-}
-
-Status ORCFileReader::Read(const std::vector<int>& include_indices,
- std::shared_ptr<Table>* out) {
- return impl_->Read(include_indices, out);
-}
-
-Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
- const std::vector<int>& include_indices,
- std::shared_ptr<Table>* out) {
- return impl_->Read(schema, include_indices, out);
-}
-
-Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
- return impl_->ReadStripe(stripe, out);
-}
-
-Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatch>* out) {
- return impl_->ReadStripe(stripe, include_indices, out);
-}
-
-Status ORCFileReader::Seek(int64_t row_number) { return impl_->Seek(row_number); }
-
-Status ORCFileReader::NextStripeReader(int64_t batch_sizes,
- std::shared_ptr<RecordBatchReader>* out) {
- return impl_->NextStripeReader(batch_sizes, out);
-}
-
-Status ORCFileReader::NextStripeReader(int64_t batch_size,
- const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- return impl_->NextStripeReader(batch_size, include_indices, out);
-}
-
-int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
-
-int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
-
-namespace {
-
-class ArrowOutputStream : public liborc::OutputStream {
- public:
- explicit ArrowOutputStream(arrow::io::OutputStream& output_stream)
- : output_stream_(output_stream), length_(0) {}
-
- uint64_t getLength() const override { return length_; }
-
- uint64_t getNaturalWriteSize() const override { return kOrcNaturalWriteSize; }
-
- void write(const void* buf, size_t length) override {
- ORC_THROW_NOT_OK(output_stream_.Write(buf, static_cast<int64_t>(length)));
- length_ += static_cast<int64_t>(length);
- }
-
- // Mandatory due to us implementing an ORC virtual class.
- // Used by ORC for error messages, not used by Arrow
- const std::string& getName() const override {
- static const std::string filename("ArrowOutputFile");
- return filename;
- }
-
- void close() override {
- if (!output_stream_.closed()) {
- ORC_THROW_NOT_OK(output_stream_.Close());
- }
- }
-
- void set_length(int64_t length) { length_ = length; }
-
- private:
- arrow::io::OutputStream& output_stream_;
- int64_t length_;
-};
-
-} // namespace
-
-class ORCFileWriter::Impl {
- public:
- Status Open(arrow::io::OutputStream* output_stream) {
- out_stream_ = std::unique_ptr<liborc::OutputStream>(
- checked_cast<liborc::OutputStream*>(new ArrowOutputStream(*output_stream)));
- return Status::OK();
- }
-
- Status Write(const Table& table) {
- std::unique_ptr<liborc::WriterOptions> orc_options =
- std::unique_ptr<liborc::WriterOptions>(new liborc::WriterOptions());
- ARROW_ASSIGN_OR_RAISE(auto orc_schema, GetOrcType(*(table.schema())));
- ORC_CATCH_NOT_OK(
- writer_ = liborc::createWriter(*orc_schema, out_stream_.get(), *orc_options))
-
- int64_t num_rows = table.num_rows();
- const int num_cols_ = table.num_columns();
- std::vector<int64_t> arrow_index_offset(num_cols_, 0);
- std::vector<int> arrow_chunk_offset(num_cols_, 0);
- std::unique_ptr<liborc::ColumnVectorBatch> batch =
- writer_->createRowBatch(kOrcWriterBatchSize);
- liborc::StructVectorBatch* root =
- internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
- while (num_rows > 0) {
- for (int i = 0; i < num_cols_; i++) {
- RETURN_NOT_OK(adapters::orc::WriteBatch(
- *(table.column(i)), kOrcWriterBatchSize, &(arrow_chunk_offset[i]),
- &(arrow_index_offset[i]), (root->fields)[i]));
- }
- root->numElements = (root->fields)[0]->numElements;
- writer_->add(*batch);
- batch->clear();
- num_rows -= kOrcWriterBatchSize;
- }
- return Status::OK();
- }
-
- Status Close() {
- writer_->close();
- return Status::OK();
- }
-
- private:
- std::unique_ptr<liborc::Writer> writer_;
- std::unique_ptr<liborc::OutputStream> out_stream_;
-};
-
-ORCFileWriter::~ORCFileWriter() {}
-
-ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); }
-
-Result<std::unique_ptr<ORCFileWriter>> ORCFileWriter::Open(
- io::OutputStream* output_stream) {
- std::unique_ptr<ORCFileWriter> result =
- std::unique_ptr<ORCFileWriter>(new ORCFileWriter());
- Status status = result->impl_->Open(output_stream);
- RETURN_NOT_OK(status);
- return std::move(result);
-}
-
-Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); }
-
-Status ORCFileWriter::Close() { return impl_->Close(); }
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/adapters/orc/adapter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <list>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/adapters/orc/adapter_util.h"
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/table_builder.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/range.h"
+#include "arrow/util/visibility.h"
+#include "orc/Exceptions.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
+
+#define ORC_THROW_NOT_OK(s) \
+ do { \
+ Status _s = (s); \
+ if (!_s.ok()) { \
+ std::stringstream ss; \
+ ss << "Arrow error: " << _s.ToString(); \
+ throw liborc::ParseError(ss.str()); \
+ } \
+ } while (0)
+
+#define ORC_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
+ auto status_name = (rexpr); \
+ ORC_THROW_NOT_OK(status_name.status()); \
+ lhs = std::move(status_name).ValueOrDie();
+
+#define ORC_ASSIGN_OR_THROW(lhs, rexpr) \
+ ORC_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+ lhs, rexpr);
+
+#define ORC_BEGIN_CATCH_NOT_OK try {
+#define ORC_END_CATCH_NOT_OK \
+ } \
+ catch (const liborc::ParseError& e) { \
+ return Status::IOError(e.what()); \
+ } \
+ catch (const liborc::InvalidArgument& e) { \
+ return Status::Invalid(e.what()); \
+ } \
+ catch (const liborc::NotImplementedYet& e) { \
+ return Status::NotImplemented(e.what()); \
+ }
+
+#define ORC_CATCH_NOT_OK(_s) \
+ ORC_BEGIN_CATCH_NOT_OK(_s); \
+ ORC_END_CATCH_NOT_OK
+
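These macros bridge liborc's exception-based error handling and Arrow's Status-based model. As a sketch of the definitions above (the wrapped statement is hypothetical), `ORC_CATCH_NOT_OK(stmt)` expands to roughly:

```cpp
// Hypothetical expansion of ORC_CATCH_NOT_OK(batch = row_reader->createRowBatch(n));
try {
  (batch = row_reader->createRowBatch(n));
}
catch (const liborc::ParseError& e) { return Status::IOError(e.what()); }
catch (const liborc::InvalidArgument& e) { return Status::Invalid(e.what()); }
catch (const liborc::NotImplementedYet& e) { return Status::NotImplemented(e.what()); }
```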
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+namespace {
+
+// The following are required by ORC to be uint64_t
+constexpr uint64_t kOrcWriterBatchSize = 128 * 1024;
+constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024;
+
+using internal::checked_cast;
+
+class ArrowInputFile : public liborc::InputStream {
+ public:
+ explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)
+ : file_(file) {}
+
+ uint64_t getLength() const override {
+ ORC_ASSIGN_OR_THROW(int64_t size, file_->GetSize());
+ return static_cast<uint64_t>(size);
+ }
+
+ uint64_t getNaturalReadSize() const override { return 128 * 1024; }
+
+ void read(void* buf, uint64_t length, uint64_t offset) override {
+ ORC_ASSIGN_OR_THROW(int64_t bytes_read, file_->ReadAt(offset, length, buf));
+
+ if (static_cast<uint64_t>(bytes_read) != length) {
+ throw liborc::ParseError("Short read from arrow input file");
+ }
+ }
+
+ const std::string& getName() const override {
+ static const std::string filename("ArrowInputFile");
+ return filename;
+ }
+
+ private:
+ std::shared_ptr<io::RandomAccessFile> file_;
+};
+
+struct StripeInformation {
+ uint64_t offset;
+ uint64_t length;
+ uint64_t num_rows;
+ uint64_t first_row_of_stripe;
+};
+
+// The number of rows to read in a ColumnVectorBatch
+constexpr int64_t kReadRowsBatch = 1000;
+
+class OrcStripeReader : public RecordBatchReader {
+ public:
+ OrcStripeReader(std::unique_ptr<liborc::RowReader> row_reader,
+ std::shared_ptr<Schema> schema, int64_t batch_size, MemoryPool* pool)
+ : row_reader_(std::move(row_reader)),
+ schema_(schema),
+ pool_(pool),
+ batch_size_{batch_size} {}
+
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
+ std::unique_ptr<liborc::ColumnVectorBatch> batch;
+ ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
+
+ const liborc::Type& type = row_reader_->getSelectedType();
+ if (!row_reader_->next(*batch)) {
+ out->reset();
+ return Status::OK();
+ }
+
+ std::unique_ptr<RecordBatchBuilder> builder;
+ RETURN_NOT_OK(RecordBatchBuilder::Make(schema_, pool_, batch->numElements, &builder));
+
+ // The top-level type must be a struct to read into an arrow table
+ const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
+
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
+ batch->numElements, builder->GetField(i)));
+ }
+
+ RETURN_NOT_OK(builder->Flush(out));
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<liborc::RowReader> row_reader_;
+ std::shared_ptr<Schema> schema_;
+ MemoryPool* pool_;
+ int64_t batch_size_;
+};
+
+} // namespace
+
+class ORCFileReader::Impl {
+ public:
+ Impl() {}
+ ~Impl() {}
+
+ Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
+ std::unique_ptr<ArrowInputFile> io_wrapper(new ArrowInputFile(file));
+ liborc::ReaderOptions options;
+ std::unique_ptr<liborc::Reader> liborc_reader;
+ ORC_CATCH_NOT_OK(liborc_reader = createReader(std::move(io_wrapper), options));
+ pool_ = pool;
+ reader_ = std::move(liborc_reader);
+ current_row_ = 0;
+
+ return Init();
+ }
+
+ Status Init() {
+ int64_t nstripes = reader_->getNumberOfStripes();
+ stripes_.resize(nstripes);
+ std::unique_ptr<liborc::StripeInformation> stripe;
+ uint64_t first_row_of_stripe = 0;
+ for (int i = 0; i < nstripes; ++i) {
+ stripe = reader_->getStripe(i);
+ stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
+ stripe->getNumberOfRows(), first_row_of_stripe});
+ first_row_of_stripe += stripe->getNumberOfRows();
+ }
+ return Status::OK();
+ }
+
+ int64_t NumberOfStripes() { return stripes_.size(); }
+
+ int64_t NumberOfRows() { return reader_->getNumberOfRows(); }
+
+ Status ReadSchema(std::shared_ptr<Schema>* out) {
+ const liborc::Type& type = reader_->getType();
+ return GetArrowSchema(type, out);
+ }
+
+ Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr<Schema>* out) {
+ std::unique_ptr<liborc::RowReader> row_reader;
+ ORC_CATCH_NOT_OK(row_reader = reader_->createRowReader(opts));
+ const liborc::Type& type = row_reader->getSelectedType();
+ return GetArrowSchema(type, out);
+ }
+
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() {
+ const std::list<std::string> keys = reader_->getMetadataKeys();
+ auto metadata = std::make_shared<KeyValueMetadata>();
+ for (const auto& key : keys) {
+ metadata->Append(key, reader_->getMetadataValue(key));
+ }
+ return std::const_pointer_cast<const KeyValueMetadata>(metadata);
+ }
+
+ Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* out) {
+ if (type.getKind() != liborc::STRUCT) {
+ return Status::NotImplemented(
+ "Only ORC files with a top-level struct "
+ "can be handled");
+ }
+ int size = static_cast<int>(type.getSubtypeCount());
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int child = 0; child < size; ++child) {
+ std::shared_ptr<DataType> elemtype;
+ RETURN_NOT_OK(GetArrowType(type.getSubtype(child), &elemtype));
+ std::string name = type.getFieldName(child);
+ fields.push_back(field(name, elemtype));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto metadata, ReadMetadata());
+ *out = std::make_shared<Schema>(std::move(fields), std::move(metadata));
+ return Status::OK();
+ }
+
+ Status Read(std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectStripe(&opts, stripe));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
+ }
+
+ Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ RETURN_NOT_OK(SelectStripe(&opts, stripe));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
+ }
+
+ Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
+ ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
+ Status::Invalid("Out of bounds stripe: ", stripe));
+
+ opts->range(stripes_[stripe].offset, stripes_[stripe].length);
+ return Status::OK();
+ }
+
+ Status SelectStripeWithRowNumber(liborc::RowReaderOptions* opts, int64_t row_number,
+ StripeInformation* out) {
+ ARROW_RETURN_IF(row_number >= NumberOfRows(),
+ Status::Invalid("Out of bounds row number: ", row_number));
+
+ for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
+ if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
+ static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
+ opts->range(it->offset, it->length);
+ *out = *it;
+ return Status::OK();
+ }
+ }
+
+ return Status::Invalid("Invalid row number: ", row_number);
+ }
+
+ Status SelectIndices(liborc::RowReaderOptions* opts,
+ const std::vector<int>& include_indices) {
+ std::list<uint64_t> include_indices_list;
+ for (auto it = include_indices.begin(); it != include_indices.end(); ++it) {
+ ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
+ include_indices_list.push_back(*it);
+ }
+ opts->includeTypes(include_indices_list);
+ return Status::OK();
+ }
+
+ Status ReadTable(const liborc::RowReaderOptions& row_opts,
+ const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts(row_opts);
+ std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
+ for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
+ opts.range(stripes_[stripe].offset, stripes_[stripe].length);
+ RETURN_NOT_OK(ReadBatch(opts, schema, stripes_[stripe].num_rows, &batches[stripe]));
+ }
+ return Table::FromRecordBatches(schema, std::move(batches)).Value(out);
+ }
+
+ Status ReadBatch(const liborc::RowReaderOptions& opts,
+ const std::shared_ptr<Schema>& schema, int64_t nrows,
+ std::shared_ptr<RecordBatch>* out) {
+ std::unique_ptr<liborc::RowReader> row_reader;
+ std::unique_ptr<liborc::ColumnVectorBatch> batch;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
+ ORC_END_CATCH_NOT_OK
+
+ std::unique_ptr<RecordBatchBuilder> builder;
+ RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
+
+ // The top-level type must be a struct to read into an arrow table
+ const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
+
+ const liborc::Type& type = row_reader->getSelectedType();
+ while (row_reader->next(*batch)) {
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
+ batch->numElements, builder->GetField(i)));
+ }
+ }
+ RETURN_NOT_OK(builder->Flush(out));
+ return Status::OK();
+ }
+
+ Status Seek(int64_t row_number) {
+ ARROW_RETURN_IF(row_number >= NumberOfRows(),
+ Status::Invalid("Out of bounds row number: ", row_number));
+
+ current_row_ = row_number;
+ return Status::OK();
+ }
+
+ Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ if (current_row_ >= NumberOfRows()) {
+ out->reset();
+ return Status::OK();
+ }
+
+ liborc::RowReaderOptions opts;
+ if (!include_indices.empty()) {
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ std::unique_ptr<liborc::RowReader> row_reader;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+
+ *out = std::shared_ptr<RecordBatchReader>(
+ new OrcStripeReader(std::move(row_reader), schema, batch_size, pool_));
+ return Status::OK();
+ }
+
+ Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
+ return NextStripeReader(batch_size, {}, out);
+ }
+
+ private:
+ MemoryPool* pool_;
+ std::unique_ptr<liborc::Reader> reader_;
+ std::vector<StripeInformation> stripes_;
+ int64_t current_row_;
+};
+
+ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); }
+
+ORCFileReader::~ORCFileReader() {}
+
+Status ORCFileReader::Open(const std::shared_ptr<io::RandomAccessFile>& file,
+ MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader) {
+ auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader());
+ RETURN_NOT_OK(result->impl_->Open(file, pool));
+ *reader = std::move(result);
+ return Status::OK();
+}
+
+Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
+ return impl_->ReadMetadata();
+}
+
+Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
+ return impl_->ReadSchema(out);
+}
+
+Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }
+
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, out);
+}
+
+Status ORCFileReader::Read(const std::vector<int>& include_indices,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(include_indices, out);
+}
+
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, include_indices, out);
+}
+
+Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
+ return impl_->ReadStripe(stripe, out);
+}
+
+Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out) {
+ return impl_->ReadStripe(stripe, include_indices, out);
+}
+
+Status ORCFileReader::Seek(int64_t row_number) { return impl_->Seek(row_number); }
+
+Status ORCFileReader::NextStripeReader(int64_t batch_size,
+ std::shared_ptr<RecordBatchReader>* out) {
+ return impl_->NextStripeReader(batch_size, out);
+}
+
+Status ORCFileReader::NextStripeReader(int64_t batch_size,
+ const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ return impl_->NextStripeReader(batch_size, include_indices, out);
+}
+
+int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
+
+int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
+
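The public reader API above composes into a simple open-and-read flow. A minimal usage sketch (the file name and wrapper function are illustrative):

```cpp
#include <memory>

#include "arrow/adapters/orc/adapter.h"
#include "arrow/io/file.h"
#include "arrow/result.h"
#include "arrow/table.h"

// Open an ORC file and read it into a Table (one record batch per stripe).
arrow::Status ReadOrcFile() {
  ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open("data.orc"));
  std::unique_ptr<arrow::adapters::orc::ORCFileReader> reader;
  ARROW_RETURN_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(
      input, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::Table> table;
  ARROW_RETURN_NOT_OK(reader->Read(&table));
  return arrow::Status::OK();
}
```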
+namespace {
+
+class ArrowOutputStream : public liborc::OutputStream {
+ public:
+ explicit ArrowOutputStream(arrow::io::OutputStream& output_stream)
+ : output_stream_(output_stream), length_(0) {}
+
+ uint64_t getLength() const override { return length_; }
+
+ uint64_t getNaturalWriteSize() const override { return kOrcNaturalWriteSize; }
+
+ void write(const void* buf, size_t length) override {
+ ORC_THROW_NOT_OK(output_stream_.Write(buf, static_cast<int64_t>(length)));
+ length_ += static_cast<int64_t>(length);
+ }
+
+ // Mandatory because we implement an ORC virtual class.
+ // Used by ORC for error messages; not used by Arrow.
+ const std::string& getName() const override {
+ static const std::string filename("ArrowOutputFile");
+ return filename;
+ }
+
+ void close() override {
+ if (!output_stream_.closed()) {
+ ORC_THROW_NOT_OK(output_stream_.Close());
+ }
+ }
+
+ void set_length(int64_t length) { length_ = length; }
+
+ private:
+ arrow::io::OutputStream& output_stream_;
+ int64_t length_;
+};
+
+} // namespace
+
+class ORCFileWriter::Impl {
+ public:
+ Status Open(arrow::io::OutputStream* output_stream) {
+ out_stream_ = std::unique_ptr<liborc::OutputStream>(
+ checked_cast<liborc::OutputStream*>(new ArrowOutputStream(*output_stream)));
+ return Status::OK();
+ }
+
+ Status Write(const Table& table) {
+ std::unique_ptr<liborc::WriterOptions> orc_options =
+ std::unique_ptr<liborc::WriterOptions>(new liborc::WriterOptions());
+ ARROW_ASSIGN_OR_RAISE(auto orc_schema, GetOrcType(*(table.schema())));
+ ORC_CATCH_NOT_OK(
+ writer_ = liborc::createWriter(*orc_schema, out_stream_.get(), *orc_options))
+
+ int64_t num_rows = table.num_rows();
+ const int num_cols_ = table.num_columns();
+ std::vector<int64_t> arrow_index_offset(num_cols_, 0);
+ std::vector<int> arrow_chunk_offset(num_cols_, 0);
+ std::unique_ptr<liborc::ColumnVectorBatch> batch =
+ writer_->createRowBatch(kOrcWriterBatchSize);
+ liborc::StructVectorBatch* root =
+ internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
+ while (num_rows > 0) {
+ for (int i = 0; i < num_cols_; i++) {
+ RETURN_NOT_OK(adapters::orc::WriteBatch(
+ *(table.column(i)), kOrcWriterBatchSize, &(arrow_chunk_offset[i]),
+ &(arrow_index_offset[i]), (root->fields)[i]));
+ }
+ root->numElements = (root->fields)[0]->numElements;
+ writer_->add(*batch);
+ batch->clear();
+ num_rows -= kOrcWriterBatchSize;
+ }
+ return Status::OK();
+ }
+
+ Status Close() {
+ writer_->close();
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<liborc::Writer> writer_;
+ std::unique_ptr<liborc::OutputStream> out_stream_;
+};
+
+ORCFileWriter::~ORCFileWriter() {}
+
+ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); }
+
+Result<std::unique_ptr<ORCFileWriter>> ORCFileWriter::Open(
+ io::OutputStream* output_stream) {
+ std::unique_ptr<ORCFileWriter> result =
+ std::unique_ptr<ORCFileWriter>(new ORCFileWriter());
+ Status status = result->impl_->Open(output_stream);
+ RETURN_NOT_OK(status);
+ return std::move(result);
+}
+
+Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); }
+
+Status ORCFileWriter::Close() { return impl_->Close(); }
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
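The writer side mirrors this flow. A sketch under the same caveats (output path illustrative; `table` built elsewhere):

```cpp
#include "arrow/adapters/orc/adapter.h"
#include "arrow/io/file.h"
#include "arrow/result.h"
#include "arrow/table.h"

// Write a Table to a new ORC file; Write() internally chunks the table
// into batches of kOrcWriterBatchSize (128K) rows.
arrow::Status WriteOrcFile(const arrow::Table& table) {
  ARROW_ASSIGN_OR_RAISE(auto output,
                        arrow::io::FileOutputStream::Open("data.orc"));
  ARROW_ASSIGN_OR_RAISE(auto writer,
                        arrow::adapters::orc::ORCFileWriter::Open(output.get()));
  ARROW_RETURN_NOT_OK(writer->Write(table));
  return writer->Close();
}
```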
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
index 012c1701980..e6e406068a9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
@@ -1,181 +1,181 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "arrow/io/interfaces.h"
-#include "arrow/memory_pool.h"
-#include "arrow/record_batch.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-namespace adapters {
-namespace orc {
-
-/// \class ORCFileReader
-/// \brief Read an Arrow Table or RecordBatch from an ORC file.
-class ARROW_EXPORT ORCFileReader {
- public:
- ~ORCFileReader();
-
- /// \brief Creates a new ORC reader.
- ///
- /// \param[in] file the data source
- /// \param[in] pool a MemoryPool to use for buffer allocations
- /// \param[out] reader the returned reader object
- /// \return Status
- static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
- std::unique_ptr<ORCFileReader>* reader);
-
- /// \brief Return the metadata read from the ORC file
- ///
- /// \return A KeyValueMetadata object containing the ORC metadata
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
-
- /// \brief Return the schema read from the ORC file
- ///
- /// \param[out] out the returned Schema object
- Status ReadSchema(std::shared_ptr<Schema>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[out] out the returned Table
- Status Read(std::shared_ptr<Table>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[in] schema the Table schema
- /// \param[out] out the returned Table
- Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned Table
- Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[in] schema the Table schema
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned Table
- Status Read(const std::shared_ptr<Schema>& schema,
- const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
-
- /// \brief Read a single stripe as a RecordBatch
- ///
- /// \param[in] stripe the stripe index
- /// \param[out] out the returned RecordBatch
- Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);
-
- /// \brief Read a single stripe as a RecordBatch
- ///
- /// \param[in] stripe the stripe index
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned RecordBatch
- Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatch>* out);
-
- /// \brief Seek to designated row. Invoke NextStripeReader() after seek
- /// will return stripe reader starting from designated row.
- ///
- /// \param[in] row_number the rows number to seek
- Status Seek(int64_t row_number);
-
- /// \brief Get a stripe level record batch iterator with specified row count
- /// in each record batch. NextStripeReader serves as a fine grain
- /// alternative to ReadStripe which may cause OOM issue by loading
- /// the whole stripes into memory.
- ///
- /// \param[in] batch_size the number of rows each record batch contains in
- /// record batch iteration.
- /// \param[out] out the returned stripe reader
- Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out);
-
- /// \brief Get a stripe level record batch iterator with specified row count
- /// in each record batch. NextStripeReader serves as a fine grain
- /// alternative to ReadStripe which may cause OOM issue by loading
- /// the whole stripes into memory.
- ///
- /// \param[in] batch_size Get a stripe level record batch iterator with specified row
- /// count in each record batch.
- ///
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned stripe reader
- Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatchReader>* out);
-
- /// \brief The number of stripes in the file
- int64_t NumberOfStripes();
-
- /// \brief The number of rows in the file
- int64_t NumberOfRows();
-
- private:
- class Impl;
- std::unique_ptr<Impl> impl_;
- ORCFileReader();
-};
-
-/// \class ORCFileWriter
-/// \brief Write an Arrow Table or RecordBatch to an ORC file.
-class ARROW_EXPORT ORCFileWriter {
- public:
- ~ORCFileWriter();
- /// \brief Creates a new ORC writer.
- ///
- /// \param[in] output_stream a pointer to the io::OutputStream to write into
- /// \return the returned writer object
- static Result<std::unique_ptr<ORCFileWriter>> Open(io::OutputStream* output_stream);
-
- /// \brief Write a table
- ///
- /// \param[in] table the Arrow table from which data is extracted
- /// \return Status
- Status Write(const Table& table);
-
- /// \brief Close an ORC writer (orc::Writer)
- ///
- /// \return Status
- Status Close();
-
- private:
- class Impl;
- std::unique_ptr<Impl> impl_;
-
- private:
- ORCFileWriter();
-};
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+/// \class ORCFileReader
+/// \brief Read an Arrow Table or RecordBatch from an ORC file.
+class ARROW_EXPORT ORCFileReader {
+ public:
+ ~ORCFileReader();
+
+ /// \brief Creates a new ORC reader.
+ ///
+ /// \param[in] file the data source
+ /// \param[in] pool a MemoryPool to use for buffer allocations
+ /// \param[out] reader the returned reader object
+ /// \return Status
+ static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
+ std::unique_ptr<ORCFileReader>* reader);
+
+ /// \brief Return the metadata read from the ORC file
+ ///
+ /// \return A KeyValueMetadata object containing the ORC metadata
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ /// \brief Return the schema read from the ORC file
+ ///
+ /// \param[out] out the returned Schema object
+ Status ReadSchema(std::shared_ptr<Schema>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[out] out the returned Table
+ Status Read(std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] schema the Table schema
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned Table
+ Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] schema the Table schema
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
+ /// \brief Read a single stripe as a RecordBatch
+ ///
+ /// \param[in] stripe the stripe index
+ /// \param[out] out the returned RecordBatch
+ Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);
+
+ /// \brief Read a single stripe as a RecordBatch
+ ///
+ /// \param[in] stripe the stripe index
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned RecordBatch
+ Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out);
+
+ /// \brief Seek to the designated row. Invoking NextStripeReader() after a
+ /// seek will return a stripe reader starting from the designated row.
+ ///
+ /// \param[in] row_number the row number to seek to
+ Status Seek(int64_t row_number);
+
+ /// \brief Get a stripe-level record batch iterator with a specified row count
+ /// in each record batch. NextStripeReader serves as a fine-grained
+ /// alternative to ReadStripe, which can cause OOM issues by loading whole
+ /// stripes into memory.
+ ///
+ /// \param[in] batch_size the number of rows each record batch contains in
+ /// record batch iteration.
+ /// \param[out] out the returned stripe reader
+ Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out);
+
+ /// \brief Get a stripe-level record batch iterator with a specified row count
+ /// in each record batch. NextStripeReader serves as a fine-grained
+ /// alternative to ReadStripe, which can cause OOM issues by loading whole
+ /// stripes into memory.
+ ///
+ /// \param[in] batch_size the number of rows each record batch contains in
+ /// record batch iteration.
+ ///
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned stripe reader
+ Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out);
+
+ /// \brief The number of stripes in the file
+ int64_t NumberOfStripes();
+
+ /// \brief The number of rows in the file
+ int64_t NumberOfRows();
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+ ORCFileReader();
+};
+
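The Seek/NextStripeReader pair documented above supports incremental reading without materializing whole stripes. A usage sketch (the batch size of 1024 is an arbitrary illustrative choice):

```cpp
#include <memory>

#include "arrow/adapters/orc/adapter.h"
#include "arrow/record_batch.h"

// Stream a file stripe by stripe. Both out-parameters are reset to null
// at end of data, which is what terminates the two loops.
arrow::Status StreamStripes(arrow::adapters::orc::ORCFileReader* reader) {
  ARROW_RETURN_NOT_OK(reader->Seek(0));
  std::shared_ptr<arrow::RecordBatchReader> stripe_reader;
  ARROW_RETURN_NOT_OK(reader->NextStripeReader(1024, &stripe_reader));
  while (stripe_reader) {
    std::shared_ptr<arrow::RecordBatch> batch;
    ARROW_RETURN_NOT_OK(stripe_reader->ReadNext(&batch));
    while (batch) {
      // ... consume batch->num_rows() rows ...
      ARROW_RETURN_NOT_OK(stripe_reader->ReadNext(&batch));
    }
    ARROW_RETURN_NOT_OK(reader->NextStripeReader(1024, &stripe_reader));
  }
  return arrow::Status::OK();
}
```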
+/// \class ORCFileWriter
+/// \brief Write an Arrow Table or RecordBatch to an ORC file.
+class ARROW_EXPORT ORCFileWriter {
+ public:
+ ~ORCFileWriter();
+ /// \brief Creates a new ORC writer.
+ ///
+ /// \param[in] output_stream a pointer to the io::OutputStream to write into
+ /// \return the returned writer object
+ static Result<std::unique_ptr<ORCFileWriter>> Open(io::OutputStream* output_stream);
+
+ /// \brief Write a table
+ ///
+ /// \param[in] table the Arrow table from which data is extracted
+ /// \return Status
+ Status Write(const Table& table);
+
+ /// \brief Close an ORC writer (orc::Writer)
+ ///
+ /// \return Status
+ Status Close();
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+
+ ORCFileWriter();
+};
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
index f956a6f6217..cbd29b3741b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
@@ -1,1069 +1,1069 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/adapters/orc/adapter_util.h"
-
-#include <cmath>
-#include <string>
-#include <vector>
-
-#include "arrow/array/builder_base.h"
-#include "arrow/builder.h"
-#include "arrow/chunked_array.h"
-#include "arrow/scalar.h"
-#include "arrow/status.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/decimal.h"
-#include "arrow/util/range.h"
-#include "arrow/util/string_view.h"
-#include "arrow/visitor_inline.h"
-#include "orc/Exceptions.hh"
-#include "orc/MemoryPool.hh"
-#include "orc/OrcFile.hh"
-
-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
-namespace arrow {
-
-using internal::checked_cast;
-
-namespace adapters {
-namespace orc {
-
-namespace {
-
-// Conversion factors between seconds, milliseconds, microseconds and nanoseconds
-constexpr int64_t kOneSecondMillis = 1000LL;
-constexpr int64_t kOneMicroNanos = 1000LL;
-constexpr int64_t kOneSecondMicros = 1000000LL;
-constexpr int64_t kOneMilliNanos = 1000000LL;
-constexpr int64_t kOneSecondNanos = 1000000000LL;
-
-Status AppendStructBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<StructBuilder*>(abuilder);
- auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- RETURN_NOT_OK(builder->AppendValues(length, valid_bytes));
-
- for (int i = 0; i < builder->num_fields(); i++) {
- RETURN_NOT_OK(AppendBatch(type->getSubtype(i), batch->fields[i], offset, length,
- builder->field_builder(i)));
- }
- return Status::OK();
-}
-
-Status AppendListBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<ListBuilder*>(abuilder);
- auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* elements = batch->elements.get();
- const liborc::Type* elemtype = type->getSubtype(0);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- int64_t start = batch->offsets[i];
- int64_t end = batch->offsets[i + 1];
- RETURN_NOT_OK(builder->Append());
- RETURN_NOT_OK(
- AppendBatch(elemtype, elements, start, end - start, builder->value_builder()));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-Status AppendMapBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<MapBuilder*>(abuilder);
- auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* keys = batch->keys.get();
- liborc::ColumnVectorBatch* items = batch->elements.get();
- const liborc::Type* key_type = type->getSubtype(0);
- const liborc::Type* item_type = type->getSubtype(1);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- int64_t start = batch->offsets[i];
- int64_t end = batch->offsets[i + 1];
- RETURN_NOT_OK(builder->Append());
- RETURN_NOT_OK(
- AppendBatch(key_type, keys, start, end - start, builder->key_builder()));
- RETURN_NOT_OK(
- AppendBatch(item_type, items, start, end - start, builder->item_builder()));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-template <class BuilderType, class BatchType, class ElemType>
-Status AppendNumericBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BuilderType*>(abuilder);
- auto batch = checked_cast<BatchType*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- const ElemType* source = batch->data.data() + offset;
- RETURN_NOT_OK(builder->AppendValues(source, length, valid_bytes));
- return Status::OK();
-}
-
-template <class BuilderType, class TargetType, class BatchType, class SourceType>
-Status AppendNumericBatchCast(liborc::ColumnVectorBatch* column_vector_batch,
- int64_t offset, int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BuilderType*>(abuilder);
- auto batch = checked_cast<BatchType*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- const SourceType* source = batch->data.data() + offset;
- auto cast_iter = internal::MakeLazyRange(
- [&source](int64_t index) { return static_cast<TargetType>(source[index]); },
- length);
-
- RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
-
- return Status::OK();
-}
-
-Status AppendBoolBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BooleanBuilder*>(abuilder);
- auto batch = checked_cast<liborc::LongVectorBatch*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- const int64_t* source = batch->data.data() + offset;
-
- auto cast_iter = internal::MakeLazyRange(
- [&source](int64_t index) { return static_cast<bool>(source[index]); }, length);
-
- RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
-
- return Status::OK();
-}
-
-Status AppendTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch,
- int64_t offset, int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<TimestampBuilder*>(abuilder);
- auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
-
- const int64_t* seconds = batch->data.data() + offset;
- const int64_t* nanos = batch->nanoseconds.data() + offset;
-
- auto transform_timestamp = [seconds, nanos](int64_t index) {
- return seconds[index] * kOneSecondNanos + nanos[index];
- };
-
- auto transform_range = internal::MakeLazyRange(transform_timestamp, length);
-
- RETURN_NOT_OK(
- builder->AppendValues(transform_range.begin(), transform_range.end(), valid_bytes));
- return Status::OK();
-}
-
-template <class BuilderType>
-Status AppendBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BuilderType*>(abuilder);
- auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(
- builder->Append(batch->data[i], static_cast<int32_t>(batch->length[i])));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
- int64_t offset, int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<FixedSizeBinaryBuilder*>(abuilder);
- auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(builder->Append(batch->data[i]));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-Status AppendDecimalBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<Decimal128Builder*>(abuilder);
-
- const bool has_nulls = column_vector_batch->hasNulls;
- if (type->getPrecision() == 0 || type->getPrecision() > 18) {
- auto batch = checked_cast<liborc::Decimal128VectorBatch*>(column_vector_batch);
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(builder->Append(
- Decimal128(batch->values[i].getHighBits(), batch->values[i].getLowBits())));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- } else {
- auto batch = checked_cast<liborc::Decimal64VectorBatch*>(column_vector_batch);
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i])));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- }
- return Status::OK();
-}
-
-} // namespace
-
-Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
- int64_t offset, int64_t length, ArrayBuilder* builder) {
- if (type == nullptr) {
- return Status::OK();
- }
- liborc::TypeKind kind = type->getKind();
- switch (kind) {
- case liborc::STRUCT:
- return AppendStructBatch(type, batch, offset, length, builder);
- case liborc::LIST:
- return AppendListBatch(type, batch, offset, length, builder);
- case liborc::MAP:
- return AppendMapBatch(type, batch, offset, length, builder);
- case liborc::LONG:
- return AppendNumericBatch<Int64Builder, liborc::LongVectorBatch, int64_t>(
- batch, offset, length, builder);
- case liborc::INT:
- return AppendNumericBatchCast<Int32Builder, int32_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::SHORT:
- return AppendNumericBatchCast<Int16Builder, int16_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::BYTE:
- return AppendNumericBatchCast<Int8Builder, int8_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::DOUBLE:
- return AppendNumericBatch<DoubleBuilder, liborc::DoubleVectorBatch, double>(
- batch, offset, length, builder);
- case liborc::FLOAT:
- return AppendNumericBatchCast<FloatBuilder, float, liborc::DoubleVectorBatch,
- double>(batch, offset, length, builder);
- case liborc::BOOLEAN:
- return AppendBoolBatch(batch, offset, length, builder);
- case liborc::VARCHAR:
- case liborc::STRING:
- return AppendBinaryBatch<StringBuilder>(batch, offset, length, builder);
- case liborc::BINARY:
- return AppendBinaryBatch<BinaryBuilder>(batch, offset, length, builder);
- case liborc::CHAR:
- return AppendFixedBinaryBatch(batch, offset, length, builder);
- case liborc::DATE:
- return AppendNumericBatchCast<Date32Builder, int32_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::TIMESTAMP:
- return AppendTimestampBatch(batch, offset, length, builder);
- case liborc::DECIMAL:
- return AppendDecimalBatch(type, batch, offset, length, builder);
- default:
- return Status::NotImplemented("Not implemented type kind: ", kind);
- }
-}
-
-namespace {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-Status WriteBatch(const Array& parray, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch);
-
-// Make sure the children of a StructArray carry appropriate null bitmaps
-// (the parent's validity ANDed into each child's).
-Result<std::shared_ptr<Array>> NormalizeArray(const std::shared_ptr<Array>& array) {
- Type::type kind = array->type_id();
- switch (kind) {
- case Type::type::STRUCT: {
- if (array->null_count() == 0) {
- return array;
- } else {
- auto struct_array = checked_pointer_cast<StructArray>(array);
- const std::shared_ptr<Buffer> bitmap = struct_array->null_bitmap();
- std::shared_ptr<DataType> struct_type = struct_array->type();
- std::size_t size = struct_type->fields().size();
- std::vector<std::shared_ptr<Array>> new_children(size, nullptr);
- for (std::size_t i = 0; i < size; i++) {
- std::shared_ptr<Array> child = struct_array->field(i);
- const std::shared_ptr<Buffer> child_bitmap = child->null_bitmap();
- std::shared_ptr<Buffer> final_child_bitmap;
- if (child_bitmap == nullptr) {
- final_child_bitmap = bitmap;
- } else {
- ARROW_ASSIGN_OR_RAISE(
- final_child_bitmap,
- internal::BitmapAnd(default_memory_pool(), bitmap->data(), 0,
- child_bitmap->data(), 0, struct_array->length(), 0));
- }
- std::shared_ptr<ArrayData> child_array_data = child->data();
- std::vector<std::shared_ptr<Buffer>> child_buffers = child_array_data->buffers;
- child_buffers[0] = final_child_bitmap;
- std::shared_ptr<ArrayData> new_child_array_data =
- ArrayData::Make(child->type(), child->length(), child_buffers,
- child_array_data->child_data, child_array_data->dictionary);
- ARROW_ASSIGN_OR_RAISE(new_children[i],
- NormalizeArray(MakeArray(new_child_array_data)));
- }
- return std::make_shared<StructArray>(struct_type, struct_array->length(),
- new_children, bitmap);
- }
- }
- case Type::type::LIST: {
- auto list_array = checked_pointer_cast<ListArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
- return std::make_shared<ListArray>(list_array->type(), list_array->length(),
- list_array->value_offsets(), value_array,
- list_array->null_bitmap());
- }
- case Type::type::LARGE_LIST: {
- auto list_array = checked_pointer_cast<LargeListArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
- return std::make_shared<LargeListArray>(list_array->type(), list_array->length(),
- list_array->value_offsets(), value_array,
- list_array->null_bitmap());
- }
- case Type::type::FIXED_SIZE_LIST: {
- auto list_array = checked_pointer_cast<FixedSizeListArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
- return std::make_shared<FixedSizeListArray>(list_array->type(),
- list_array->length(), value_array,
- list_array->null_bitmap());
- }
- case Type::type::MAP: {
- auto map_array = checked_pointer_cast<MapArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto key_array, NormalizeArray(map_array->keys()));
- ARROW_ASSIGN_OR_RAISE(auto item_array, NormalizeArray(map_array->items()));
- return std::make_shared<MapArray>(map_array->type(), map_array->length(),
- map_array->value_offsets(), key_array, item_array,
- map_array->null_bitmap());
- }
- default: {
- return array;
- }
- }
-}
-
-template <class DataType, class BatchType, typename Enable = void>
-struct Appender {};
-
-// Types for long/double-like Appender, that is, numeric, boolean or date32
-template <typename T>
-using is_generic_type =
- std::integral_constant<bool, is_number_type<T>::value ||
- std::is_same<Date32Type, T>::value ||
- is_boolean_type<T>::value>;
-template <typename T, typename R = void>
-using enable_if_generic = enable_if_t<is_generic_type<T>::value, R>;
-
-// Number-like
-template <class DataType, class BatchType>
-struct Appender<DataType, BatchType, enable_if_generic<DataType>> {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- using ValueType = typename TypeTraits<DataType>::CType;
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(ValueType v) {
- batch->data[running_orc_offset] = array.Value(running_arrow_offset);
- batch->notNull[running_orc_offset] = true;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const ArrayType& array;
- BatchType* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-// Binary
-template <class DataType>
-struct Appender<DataType, liborc::StringVectorBatch> {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- using COffsetType = typename TypeTraits<DataType>::OffsetType::c_type;
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- COffsetType data_length = 0;
- batch->data[running_orc_offset] = reinterpret_cast<char*>(
- const_cast<uint8_t*>(array.GetValue(running_arrow_offset, &data_length)));
- batch->length[running_orc_offset] = data_length;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const ArrayType& array;
- liborc::StringVectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-// Decimal
-template <>
-struct Appender<Decimal128Type, liborc::Decimal64VectorBatch> {
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- const Decimal128 dec_value(array.GetValue(running_arrow_offset));
- batch->values[running_orc_offset] = static_cast<int64_t>(dec_value.low_bits());
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const Decimal128Array& array;
- liborc::Decimal64VectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-template <>
-struct Appender<Decimal128Type, liborc::Decimal128VectorBatch> {
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- const Decimal128 dec_value(array.GetValue(running_arrow_offset));
- batch->values[running_orc_offset] =
- liborc::Int128(dec_value.high_bits(), dec_value.low_bits());
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const Decimal128Array& array;
- liborc::Decimal128VectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-// Date64 and Timestamp
-template <class DataType>
-struct TimestampAppender {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(int64_t v) {
- int64_t data = array.Value(running_arrow_offset);
- batch->notNull[running_orc_offset] = true;
- batch->data[running_orc_offset] =
- static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
- batch->nanoseconds[running_orc_offset] =
- (data - conversion_factor_from_second * batch->data[running_orc_offset]) *
- conversion_factor_to_nano;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const ArrayType& array;
- liborc::TimestampVectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
- int64_t conversion_factor_from_second, conversion_factor_to_nano;
-};
-
-// FSB
-struct FixedSizeBinaryAppender {
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- batch->data[running_orc_offset] = reinterpret_cast<char*>(
- const_cast<uint8_t*>(array.GetValue(running_arrow_offset)));
- batch->length[running_orc_offset] = data_length;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const FixedSizeBinaryArray& array;
- liborc::StringVectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
- const int32_t data_length;
-};
-
-// static_cast from int64_t or double to itself shouldn't introduce overhead
-// Please see
-// https://stackoverflow.com/questions/19106826/
-// can-static-cast-to-same-type-introduce-runtime-overhead
-template <class DataType, class BatchType>
-Status WriteGenericBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- const ArrayType& array_(checked_cast<const ArrayType&>(array));
- auto batch = checked_cast<BatchType*>(column_vector_batch);
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- Appender<DataType, BatchType> appender{array_, batch, orc_offset, 0};
- ArrayDataVisitor<DataType> visitor;
- RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
- return Status::OK();
-}
-
-template <class DataType>
-Status WriteTimestampBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch,
- const int64_t& conversion_factor_from_second,
- const int64_t& conversion_factor_to_nano) {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- const ArrayType& array_(checked_cast<const ArrayType&>(array));
- auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- TimestampAppender<DataType> appender{array_,
- batch,
- orc_offset,
- 0,
- conversion_factor_from_second,
- conversion_factor_to_nano};
- ArrayDataVisitor<DataType> visitor;
- RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
- return Status::OK();
-}
-
-Status WriteFixedSizeBinaryBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- const FixedSizeBinaryArray& array_(checked_cast<const FixedSizeBinaryArray&>(array));
- auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- FixedSizeBinaryAppender appender{array_, batch, orc_offset, 0, array_.byte_width()};
- ArrayDataVisitor<FixedSizeBinaryType> visitor;
- RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
- return Status::OK();
-}
-
-Status WriteStructBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- std::shared_ptr<Array> array_ = MakeArray(array.data());
- std::shared_ptr<StructArray> struct_array(checked_pointer_cast<StructArray>(array_));
- auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
- std::size_t size = array.type()->fields().size();
- int64_t arrow_length = array.length();
- int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
- // First fill fields of ColumnVectorBatch
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- for (; running_arrow_offset < arrow_length;
- running_orc_offset++, running_arrow_offset++) {
- if (array.IsNull(running_arrow_offset)) {
- batch->notNull[running_orc_offset] = false;
- } else {
- batch->notNull[running_orc_offset] = true;
- }
- }
- // Fill the fields
- for (std::size_t i = 0; i < size; i++) {
- batch->fields[i]->resize(orc_offset + arrow_length);
- RETURN_NOT_OK(WriteBatch(*(struct_array->field(i)), orc_offset, batch->fields[i]));
- }
- return Status::OK();
-}
-
-template <class ArrayType>
-Status WriteListBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- const ArrayType& list_array(checked_cast<const ArrayType&>(array));
- auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
- int64_t arrow_length = array.length();
- int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
- if (orc_offset == 0) {
- batch->offsets[0] = 0;
- }
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- for (; running_arrow_offset < arrow_length;
- running_orc_offset++, running_arrow_offset++) {
- if (array.IsNull(running_arrow_offset)) {
- batch->notNull[running_orc_offset] = false;
- batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
- } else {
- batch->notNull[running_orc_offset] = true;
- batch->offsets[running_orc_offset + 1] =
- batch->offsets[running_orc_offset] +
- list_array.value_offset(running_arrow_offset + 1) -
- list_array.value_offset(running_arrow_offset);
- element_batch->resize(batch->offsets[running_orc_offset + 1]);
- int64_t subarray_arrow_offset = list_array.value_offset(running_arrow_offset),
- subarray_orc_offset = batch->offsets[running_orc_offset],
- subarray_orc_length =
- batch->offsets[running_orc_offset + 1] - subarray_orc_offset;
- RETURN_NOT_OK(WriteBatch(
- *(list_array.values()->Slice(subarray_arrow_offset, subarray_orc_length)),
- subarray_orc_offset, element_batch));
- }
- }
- return Status::OK();
-}
-
-Status WriteMapBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- const MapArray& map_array(checked_cast<const MapArray&>(array));
- auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* key_batch = (batch->keys).get();
- liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
- std::shared_ptr<Array> key_array = map_array.keys();
- std::shared_ptr<Array> element_array = map_array.items();
- int64_t arrow_length = array.length();
- int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
- if (orc_offset == 0) {
- batch->offsets[0] = 0;
- }
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- for (; running_arrow_offset < arrow_length;
- running_orc_offset++, running_arrow_offset++) {
- if (array.IsNull(running_arrow_offset)) {
- batch->notNull[running_orc_offset] = false;
- batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
- } else {
- batch->notNull[running_orc_offset] = true;
- batch->offsets[running_orc_offset + 1] =
- batch->offsets[running_orc_offset] +
- map_array.value_offset(running_arrow_offset + 1) -
- map_array.value_offset(running_arrow_offset);
- int64_t subarray_arrow_offset = map_array.value_offset(running_arrow_offset),
- subarray_orc_offset = batch->offsets[running_orc_offset],
- new_subarray_orc_offset = batch->offsets[running_orc_offset + 1],
- subarray_orc_length = new_subarray_orc_offset - subarray_orc_offset;
- key_batch->resize(new_subarray_orc_offset);
- element_batch->resize(new_subarray_orc_offset);
- RETURN_NOT_OK(
- WriteBatch(*(key_array->Slice(subarray_arrow_offset, subarray_orc_length)),
- subarray_orc_offset, key_batch));
- RETURN_NOT_OK(
- WriteBatch(*(element_array->Slice(subarray_arrow_offset, subarray_orc_length)),
- subarray_orc_offset, element_batch));
- }
- }
- return Status::OK();
-}
-
-Status WriteBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- Type::type kind = array.type_id();
- column_vector_batch->numElements = orc_offset;
- switch (kind) {
- case Type::type::BOOL:
- return WriteGenericBatch<BooleanType, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT8:
- return WriteGenericBatch<Int8Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT16:
- return WriteGenericBatch<Int16Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT32:
- return WriteGenericBatch<Int32Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT64:
- return WriteGenericBatch<Int64Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::FLOAT:
- return WriteGenericBatch<FloatType, liborc::DoubleVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::DOUBLE:
- return WriteGenericBatch<DoubleType, liborc::DoubleVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::BINARY:
- return WriteGenericBatch<BinaryType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::LARGE_BINARY:
- return WriteGenericBatch<LargeBinaryType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::STRING:
- return WriteGenericBatch<StringType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::LARGE_STRING:
- return WriteGenericBatch<LargeStringType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::FIXED_SIZE_BINARY:
- return WriteFixedSizeBinaryBatch(array, orc_offset, column_vector_batch);
- case Type::type::DATE32:
- return WriteGenericBatch<Date32Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::DATE64:
- return WriteTimestampBatch<Date64Type>(array, orc_offset, column_vector_batch,
- kOneSecondMillis, kOneMilliNanos);
- case Type::type::TIMESTAMP: {
- switch (internal::checked_pointer_cast<TimestampType>(array.type())->unit()) {
- case TimeUnit::type::SECOND:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, 1, kOneSecondNanos);
- case TimeUnit::type::MILLI:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, kOneSecondMillis, kOneMilliNanos);
- case TimeUnit::type::MICRO:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, kOneSecondMicros, kOneMicroNanos);
- case TimeUnit::type::NANO:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, kOneSecondNanos, 1);
- default:
- return Status::TypeError("Unknown or unsupported Arrow type: ",
- array.type()->ToString());
- }
- }
- case Type::type::DECIMAL128: {
- int32_t precision = checked_pointer_cast<Decimal128Type>(array.type())->precision();
- if (precision > 18) {
- return WriteGenericBatch<Decimal128Type, liborc::Decimal128VectorBatch>(
- array, orc_offset, column_vector_batch);
- } else {
- return WriteGenericBatch<Decimal128Type, liborc::Decimal64VectorBatch>(
- array, orc_offset, column_vector_batch);
- }
- }
- case Type::type::STRUCT:
- return WriteStructBatch(array, orc_offset, column_vector_batch);
- case Type::type::LIST:
- return WriteListBatch<ListArray>(array, orc_offset, column_vector_batch);
- case Type::type::LARGE_LIST:
- return WriteListBatch<LargeListArray>(array, orc_offset, column_vector_batch);
- case Type::type::FIXED_SIZE_LIST:
- return WriteListBatch<FixedSizeListArray>(array, orc_offset, column_vector_batch);
- case Type::type::MAP:
- return WriteMapBatch(array, orc_offset, column_vector_batch);
- default: {
- return Status::NotImplemented("Unknown or unsupported Arrow type: ",
- array.type()->ToString());
- }
- }
- return Status::OK();
-}
-
-Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const DataType& type) {
- Type::type kind = type.id();
- switch (kind) {
- case Type::type::BOOL:
- return liborc::createPrimitiveType(liborc::TypeKind::BOOLEAN);
- case Type::type::INT8:
- return liborc::createPrimitiveType(liborc::TypeKind::BYTE);
- case Type::type::INT16:
- return liborc::createPrimitiveType(liborc::TypeKind::SHORT);
- case Type::type::INT32:
- return liborc::createPrimitiveType(liborc::TypeKind::INT);
- case Type::type::INT64:
- return liborc::createPrimitiveType(liborc::TypeKind::LONG);
- case Type::type::FLOAT:
- return liborc::createPrimitiveType(liborc::TypeKind::FLOAT);
- case Type::type::DOUBLE:
- return liborc::createPrimitiveType(liborc::TypeKind::DOUBLE);
- // Use STRING instead of VARCHAR for now, both use UTF-8
- case Type::type::STRING:
- case Type::type::LARGE_STRING:
- return liborc::createPrimitiveType(liborc::TypeKind::STRING);
- case Type::type::BINARY:
- case Type::type::LARGE_BINARY:
- case Type::type::FIXED_SIZE_BINARY:
- return liborc::createPrimitiveType(liborc::TypeKind::BINARY);
- case Type::type::DATE32:
- return liborc::createPrimitiveType(liborc::TypeKind::DATE);
- case Type::type::DATE64:
- case Type::type::TIMESTAMP:
- return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP);
- case Type::type::DECIMAL128: {
- const uint64_t precision =
- static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).precision());
- const uint64_t scale =
- static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).scale());
- return liborc::createDecimalType(precision, scale);
- }
- case Type::type::LIST:
- case Type::type::FIXED_SIZE_LIST:
- case Type::type::LARGE_LIST: {
- std::shared_ptr<DataType> arrow_child_type =
- checked_cast<const BaseListType&>(type).value_type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- return liborc::createListType(std::move(orc_subtype));
- }
- case Type::type::STRUCT: {
- ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
- std::vector<std::shared_ptr<Field>> arrow_fields =
- checked_cast<const StructType&>(type).fields();
- for (std::vector<std::shared_ptr<Field>>::iterator it = arrow_fields.begin();
- it != arrow_fields.end(); ++it) {
- std::string field_name = (*it)->name();
- std::shared_ptr<DataType> arrow_child_type = (*it)->type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- out_type->addStructField(field_name, std::move(orc_subtype));
- }
- return std::move(out_type);
- }
- case Type::type::MAP: {
- std::shared_ptr<DataType> key_arrow_type =
- checked_cast<const MapType&>(type).key_type();
- std::shared_ptr<DataType> item_arrow_type =
- checked_cast<const MapType&>(type).item_type();
- ARROW_ASSIGN_OR_RAISE(auto key_orc_type, GetOrcType(*key_arrow_type));
- ARROW_ASSIGN_OR_RAISE(auto item_orc_type, GetOrcType(*item_arrow_type));
- return liborc::createMapType(std::move(key_orc_type), std::move(item_orc_type));
- }
- case Type::type::DENSE_UNION:
- case Type::type::SPARSE_UNION: {
- ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createUnionType();
- std::vector<std::shared_ptr<Field>> arrow_fields =
- checked_cast<const UnionType&>(type).fields();
- for (std::vector<std::shared_ptr<Field>>::iterator it = arrow_fields.begin();
- it != arrow_fields.end(); ++it) {
- std::string field_name = (*it)->name();
- std::shared_ptr<DataType> arrow_child_type = (*it)->type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- out_type->addUnionChild(std::move(orc_subtype));
- }
- return std::move(out_type);
- }
- default: {
- return Status::NotImplemented("Unknown or unsupported Arrow type: ",
- type.ToString());
- }
- }
-}
-
-} // namespace
-
-Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
- int* arrow_chunk_offset, int64_t* arrow_index_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- int num_batch = chunked_array.num_chunks();
- int64_t orc_offset = 0;
- while (*arrow_chunk_offset < num_batch && orc_offset < length) {
- ARROW_ASSIGN_OR_RAISE(auto array,
- NormalizeArray(chunked_array.chunk(*arrow_chunk_offset)));
- int64_t num_written_elements =
- std::min(length - orc_offset, array->length() - *arrow_index_offset);
- if (num_written_elements > 0) {
- RETURN_NOT_OK(WriteBatch(*(array->Slice(*arrow_index_offset, num_written_elements)),
- orc_offset, column_vector_batch));
- orc_offset += num_written_elements;
- *arrow_index_offset += num_written_elements;
- }
- if (orc_offset < length) { // Another Arrow Array done
- *arrow_index_offset = 0;
- (*arrow_chunk_offset)++;
- }
- }
- column_vector_batch->numElements = orc_offset;
- return Status::OK();
-}
-
-Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
- // When subselecting fields on read, liborc will set some nodes to nullptr,
- // so we need to check for nullptr before progressing
- if (type == nullptr) {
- *out = null();
- return Status::OK();
- }
- liborc::TypeKind kind = type->getKind();
- const int subtype_count = static_cast<int>(type->getSubtypeCount());
-
- switch (kind) {
- case liborc::BOOLEAN:
- *out = boolean();
- break;
- case liborc::BYTE:
- *out = int8();
- break;
- case liborc::SHORT:
- *out = int16();
- break;
- case liborc::INT:
- *out = int32();
- break;
- case liborc::LONG:
- *out = int64();
- break;
- case liborc::FLOAT:
- *out = float32();
- break;
- case liborc::DOUBLE:
- *out = float64();
- break;
- case liborc::VARCHAR:
- case liborc::STRING:
- *out = utf8();
- break;
- case liborc::BINARY:
- *out = binary();
- break;
- case liborc::CHAR:
- *out = fixed_size_binary(static_cast<int>(type->getMaximumLength()));
- break;
- case liborc::TIMESTAMP:
- *out = timestamp(TimeUnit::NANO);
- break;
- case liborc::DATE:
- *out = date32();
- break;
- case liborc::DECIMAL: {
- const int precision = static_cast<int>(type->getPrecision());
- const int scale = static_cast<int>(type->getScale());
- if (precision == 0) {
- // In HIVE 0.11/0.12 precision is set as 0, but means max precision
- *out = decimal128(38, 6);
- } else {
- *out = decimal128(precision, scale);
- }
- break;
- }
- case liborc::LIST: {
- if (subtype_count != 1) {
- return Status::TypeError("Invalid Orc List type");
- }
- std::shared_ptr<DataType> elemtype;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype));
- *out = list(elemtype);
- break;
- }
- case liborc::MAP: {
- if (subtype_count != 2) {
- return Status::TypeError("Invalid Orc Map type");
- }
- std::shared_ptr<DataType> key_type, item_type;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &key_type));
- RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &item_type));
- *out = map(key_type, item_type);
- break;
- }
- case liborc::STRUCT: {
- std::vector<std::shared_ptr<Field>> fields;
- for (int child = 0; child < subtype_count; ++child) {
- std::shared_ptr<DataType> elem_type;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
- std::string name = type->getFieldName(child);
- fields.push_back(field(name, elem_type));
- }
- *out = struct_(fields);
- break;
- }
- case liborc::UNION: {
- std::vector<std::shared_ptr<Field>> fields;
- std::vector<int8_t> type_codes;
- for (int child = 0; child < subtype_count; ++child) {
- std::shared_ptr<DataType> elem_type;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
- fields.push_back(field("_union_" + std::to_string(child), elem_type));
- type_codes.push_back(static_cast<int8_t>(child));
- }
- *out = sparse_union(fields, type_codes);
- break;
- }
- default: {
- return Status::TypeError("Unknown Orc type kind: ", type->toString());
- }
- }
- return Status::OK();
-}
-
-Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema) {
- int numFields = schema.num_fields();
- ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
- for (int i = 0; i < numFields; i++) {
- std::shared_ptr<Field> field = schema.field(i);
- std::string field_name = field->name();
- std::shared_ptr<DataType> arrow_child_type = field->type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- out_type->addStructField(field_name, std::move(orc_subtype));
- }
- return std::move(out_type);
-}
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/adapters/orc/adapter_util.h"
+
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/range.h"
+#include "arrow/util/string_view.h"
+#include "arrow/visitor_inline.h"
+#include "orc/Exceptions.hh"
+#include "orc/MemoryPool.hh"
+#include "orc/OrcFile.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace adapters {
+namespace orc {
+
+namespace {
+
+// Conversion factors between seconds, milliseconds, microseconds and nanoseconds
+constexpr int64_t kOneSecondMillis = 1000LL;
+constexpr int64_t kOneMicroNanos = 1000LL;
+constexpr int64_t kOneSecondMicros = 1000000LL;
+constexpr int64_t kOneMilliNanos = 1000000LL;
+constexpr int64_t kOneSecondNanos = 1000000000LL;
+
+Status AppendStructBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<StructBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ RETURN_NOT_OK(builder->AppendValues(length, valid_bytes));
+
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type->getSubtype(i), batch->fields[i], offset, length,
+ builder->field_builder(i)));
+ }
+ return Status::OK();
+}
+
+Status AppendListBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<ListBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* elements = batch->elements.get();
+ const liborc::Type* elemtype = type->getSubtype(0);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ int64_t start = batch->offsets[i];
+ int64_t end = batch->offsets[i + 1];
+ RETURN_NOT_OK(builder->Append());
+ RETURN_NOT_OK(
+ AppendBatch(elemtype, elements, start, end - start, builder->value_builder()));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendMapBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<MapBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* keys = batch->keys.get();
+ liborc::ColumnVectorBatch* items = batch->elements.get();
+ const liborc::Type* key_type = type->getSubtype(0);
+ const liborc::Type* item_type = type->getSubtype(1);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ int64_t start = batch->offsets[i];
+ int64_t end = batch->offsets[i + 1];
+ RETURN_NOT_OK(builder->Append());
+ RETURN_NOT_OK(
+ AppendBatch(key_type, keys, start, end - start, builder->key_builder()));
+ RETURN_NOT_OK(
+ AppendBatch(item_type, items, start, end - start, builder->item_builder()));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+template <class BuilderType, class BatchType, class ElemType>
+Status AppendNumericBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const ElemType* source = batch->data.data() + offset;
+ RETURN_NOT_OK(builder->AppendValues(source, length, valid_bytes));
+ return Status::OK();
+}
+
+template <class BuilderType, class TargetType, class BatchType, class SourceType>
+Status AppendNumericBatchCast(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const SourceType* source = batch->data.data() + offset;
+ auto cast_iter = internal::MakeLazyRange(
+ [&source](int64_t index) { return static_cast<TargetType>(source[index]); },
+ length);
+
+ RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
+
+ return Status::OK();
+}
+
+Status AppendBoolBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BooleanBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::LongVectorBatch*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const int64_t* source = batch->data.data() + offset;
+
+ auto cast_iter = internal::MakeLazyRange(
+ [&source](int64_t index) { return static_cast<bool>(source[index]); }, length);
+
+ RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
+
+ return Status::OK();
+}
+
+Status AppendTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<TimestampBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+
+ const int64_t* seconds = batch->data.data() + offset;
+ const int64_t* nanos = batch->nanoseconds.data() + offset;
+
+ auto transform_timestamp = [seconds, nanos](int64_t index) {
+ return seconds[index] * kOneSecondNanos + nanos[index];
+ };
+
+ auto transform_range = internal::MakeLazyRange(transform_timestamp, length);
+
+ RETURN_NOT_OK(
+ builder->AppendValues(transform_range.begin(), transform_range.end(), valid_bytes));
+ return Status::OK();
+}
+
+template <class BuilderType>
+Status AppendBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(
+ builder->Append(batch->data[i], static_cast<int32_t>(batch->length[i])));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<FixedSizeBinaryBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(batch->data[i]));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
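+// Note: decimals with precision <= 18 arrive in a Decimal64VectorBatch (one
+// int64 per value) while wider decimals use Decimal128VectorBatch; both are
+// widened to Arrow Decimal128 below. Precision 0 (HIVE 0.11/0.12 files,
+// where it means maximum precision) takes the 128-bit path.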
+Status AppendDecimalBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<Decimal128Builder*>(abuilder);
+
+ const bool has_nulls = column_vector_batch->hasNulls;
+ if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+ auto batch = checked_cast<liborc::Decimal128VectorBatch*>(column_vector_batch);
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(
+ Decimal128(batch->values[i].getHighBits(), batch->values[i].getLowBits())));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ } else {
+ auto batch = checked_cast<liborc::Decimal64VectorBatch*>(column_vector_batch);
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i])));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
+ int64_t offset, int64_t length, ArrayBuilder* builder) {
+ if (type == nullptr) {
+ return Status::OK();
+ }
+ liborc::TypeKind kind = type->getKind();
+ switch (kind) {
+ case liborc::STRUCT:
+ return AppendStructBatch(type, batch, offset, length, builder);
+ case liborc::LIST:
+ return AppendListBatch(type, batch, offset, length, builder);
+ case liborc::MAP:
+ return AppendMapBatch(type, batch, offset, length, builder);
+ case liborc::LONG:
+ return AppendNumericBatch<Int64Builder, liborc::LongVectorBatch, int64_t>(
+ batch, offset, length, builder);
+ case liborc::INT:
+ return AppendNumericBatchCast<Int32Builder, int32_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::SHORT:
+ return AppendNumericBatchCast<Int16Builder, int16_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::BYTE:
+ return AppendNumericBatchCast<Int8Builder, int8_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::DOUBLE:
+ return AppendNumericBatch<DoubleBuilder, liborc::DoubleVectorBatch, double>(
+ batch, offset, length, builder);
+ case liborc::FLOAT:
+ return AppendNumericBatchCast<FloatBuilder, float, liborc::DoubleVectorBatch,
+ double>(batch, offset, length, builder);
+ case liborc::BOOLEAN:
+ return AppendBoolBatch(batch, offset, length, builder);
+ case liborc::VARCHAR:
+ case liborc::STRING:
+ return AppendBinaryBatch<StringBuilder>(batch, offset, length, builder);
+ case liborc::BINARY:
+ return AppendBinaryBatch<BinaryBuilder>(batch, offset, length, builder);
+ case liborc::CHAR:
+ return AppendFixedBinaryBatch(batch, offset, length, builder);
+ case liborc::DATE:
+ return AppendNumericBatchCast<Date32Builder, int32_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::TIMESTAMP:
+ return AppendTimestampBatch(batch, offset, length, builder);
+ case liborc::DECIMAL:
+ return AppendDecimalBatch(type, batch, offset, length, builder);
+ default:
+ return Status::NotImplemented("Not implemented type kind: ", kind);
+ }
+}
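+
+// An illustrative decoding loop (assumed names, error handling elided):
+// given a liborc::RowReader `row_reader` and a matching ArrayBuilder
+// `builder`, each ORC batch is appended recursively via AppendBatch:
+//
+//   ORC_UNIQUE_PTR<liborc::ColumnVectorBatch> cvb =
+//       row_reader->createRowBatch(1024);
+//   while (row_reader->next(*cvb)) {
+//     RETURN_NOT_OK(AppendBatch(&row_reader->getSelectedType(), cvb.get(),
+//                               0, cvb->numElements, builder.get()));
+//   }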
+
+namespace {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+Status WriteBatch(const Array& parray, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch);
+
+// Make sure the children of a StructArray carry appropriate null bitmaps
+// (the parent's validity ANDed into each child's).
+Result<std::shared_ptr<Array>> NormalizeArray(const std::shared_ptr<Array>& array) {
+ Type::type kind = array->type_id();
+ switch (kind) {
+ case Type::type::STRUCT: {
+ if (array->null_count() == 0) {
+ return array;
+ } else {
+ auto struct_array = checked_pointer_cast<StructArray>(array);
+ const std::shared_ptr<Buffer> bitmap = struct_array->null_bitmap();
+ std::shared_ptr<DataType> struct_type = struct_array->type();
+ std::size_t size = struct_type->fields().size();
+ std::vector<std::shared_ptr<Array>> new_children(size, nullptr);
+ for (std::size_t i = 0; i < size; i++) {
+ std::shared_ptr<Array> child = struct_array->field(i);
+ const std::shared_ptr<Buffer> child_bitmap = child->null_bitmap();
+ std::shared_ptr<Buffer> final_child_bitmap;
+ if (child_bitmap == nullptr) {
+ final_child_bitmap = bitmap;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ final_child_bitmap,
+ internal::BitmapAnd(default_memory_pool(), bitmap->data(), 0,
+ child_bitmap->data(), 0, struct_array->length(), 0));
+ }
+ std::shared_ptr<ArrayData> child_array_data = child->data();
+ std::vector<std::shared_ptr<Buffer>> child_buffers = child_array_data->buffers;
+ child_buffers[0] = final_child_bitmap;
+ std::shared_ptr<ArrayData> new_child_array_data =
+ ArrayData::Make(child->type(), child->length(), child_buffers,
+ child_array_data->child_data, child_array_data->dictionary);
+ ARROW_ASSIGN_OR_RAISE(new_children[i],
+ NormalizeArray(MakeArray(new_child_array_data)));
+ }
+ return std::make_shared<StructArray>(struct_type, struct_array->length(),
+ new_children, bitmap);
+ }
+ }
+ case Type::type::LIST: {
+ auto list_array = checked_pointer_cast<ListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<ListArray>(list_array->type(), list_array->length(),
+ list_array->value_offsets(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::LARGE_LIST: {
+ auto list_array = checked_pointer_cast<LargeListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<LargeListArray>(list_array->type(), list_array->length(),
+ list_array->value_offsets(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::FIXED_SIZE_LIST: {
+ auto list_array = checked_pointer_cast<FixedSizeListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<FixedSizeListArray>(list_array->type(),
+ list_array->length(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::MAP: {
+ auto map_array = checked_pointer_cast<MapArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto key_array, NormalizeArray(map_array->keys()));
+ ARROW_ASSIGN_OR_RAISE(auto item_array, NormalizeArray(map_array->items()));
+ return std::make_shared<MapArray>(map_array->type(), map_array->length(),
+ map_array->value_offsets(), key_array, item_array,
+ map_array->null_bitmap());
+ }
+ default: {
+ return array;
+ }
+ }
+}
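+
+// Illustrative example of the normalization above: for a struct<a: int32>
+// whose parent validity is {1, 0, 1} and whose child validity is {1, 1, 0},
+// the child is rebuilt with validity {1, 0, 0} (the bitwise AND), so the
+// writer never emits a "valid" child value under a null parent slot.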
+
+template <class DataType, class BatchType, typename Enable = void>
+struct Appender {};
+
+// Types for long/double-like Appender, that is, numeric, boolean or date32
+template <typename T>
+using is_generic_type =
+ std::integral_constant<bool, is_number_type<T>::value ||
+ std::is_same<Date32Type, T>::value ||
+ is_boolean_type<T>::value>;
+template <typename T, typename R = void>
+using enable_if_generic = enable_if_t<is_generic_type<T>::value, R>;
+
+// Number-like
+template <class DataType, class BatchType>
+struct Appender<DataType, BatchType, enable_if_generic<DataType>> {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ using ValueType = typename TypeTraits<DataType>::CType;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(ValueType v) {
+ batch->data[running_orc_offset] = array.Value(running_arrow_offset);
+ batch->notNull[running_orc_offset] = true;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ BatchType* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Binary
+template <class DataType>
+struct Appender<DataType, liborc::StringVectorBatch> {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ using COffsetType = typename TypeTraits<DataType>::OffsetType::c_type;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ COffsetType data_length = 0;
+ batch->data[running_orc_offset] = reinterpret_cast<char*>(
+ const_cast<uint8_t*>(array.GetValue(running_arrow_offset, &data_length)));
+ batch->length[running_orc_offset] = data_length;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ liborc::StringVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
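+
+// Note: the char* stored into batch->data above aliases the Arrow value
+// buffer directly (no copy is made); the const_cast is only needed because
+// liborc's StringVectorBatch holds non-const char* pointers.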
+
+// Decimal
+template <>
+struct Appender<Decimal128Type, liborc::Decimal64VectorBatch> {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ const Decimal128 dec_value(array.GetValue(running_arrow_offset));
+ batch->values[running_orc_offset] = static_cast<int64_t>(dec_value.low_bits());
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const Decimal128Array& array;
+ liborc::Decimal64VectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+template <>
+struct Appender<Decimal128Type, liborc::Decimal128VectorBatch> {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ const Decimal128 dec_value(array.GetValue(running_arrow_offset));
+ batch->values[running_orc_offset] =
+ liborc::Int128(dec_value.high_bits(), dec_value.low_bits());
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const Decimal128Array& array;
+ liborc::Decimal128VectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Date64 and Timestamp
+template <class DataType>
+struct TimestampAppender {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(int64_t v) {
+ int64_t data = array.Value(running_arrow_offset);
+ batch->notNull[running_orc_offset] = true;
+ batch->data[running_orc_offset] =
+ static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
+ batch->nanoseconds[running_orc_offset] =
+ (data - conversion_factor_from_second * batch->data[running_orc_offset]) *
+ conversion_factor_to_nano;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ liborc::TimestampVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+ int64_t conversion_factor_from_second, conversion_factor_to_nano;
+};
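A worked example of the decomposition above, using the MILLI-unit factors that WriteBatch passes further below (kOneSecondMillis = 1000 and kOneMilliNanos = 1000000 are assumed from the surrounding file):

// For data = 1234 (milliseconds):
//   batch->data[i]        = 1234 / 1000                 = 1 second
//   batch->nanoseconds[i] = (1234 - 1000 * 1) * 1000000 = 234000000 ns
// Note the division is integral (both operands are int64_t), so std::floor
// only sees the already-truncated quotient; for non-negative inputs,
// truncation and floor agree.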
+
+// FSB (fixed-size binary)
+struct FixedSizeBinaryAppender {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ batch->data[running_orc_offset] = reinterpret_cast<char*>(
+ const_cast<uint8_t*>(array.GetValue(running_arrow_offset)));
+ batch->length[running_orc_offset] = data_length;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const FixedSizeBinaryArray& array;
+ liborc::StringVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+ const int32_t data_length;
+};
+
+// static_cast from int64_t or double to itself shouldn't introduce overhead
+// Please see
+// https://stackoverflow.com/questions/19106826/
+// can-static-cast-to-same-type-introduce-runtime-overhead
+template <class DataType, class BatchType>
+Status WriteGenericBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ const ArrayType& array_(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ Appender<DataType, BatchType> appender{array_, batch, orc_offset, 0};
+ ArrayDataVisitor<DataType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
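// How the appenders are driven (a descriptive note): ArrayDataVisitor walks
// the array's validity bitmap and calls appender.VisitNull() for null slots
// and appender.VisitValue(v) for valid ones, so each Appender only has to
// fill notNull/data at its running ORC offset.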
+
+template <class DataType>
+Status WriteTimestampBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch,
+ const int64_t& conversion_factor_from_second,
+ const int64_t& conversion_factor_to_nano) {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ const ArrayType& array_(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ TimestampAppender<DataType> appender{array_,
+ batch,
+ orc_offset,
+ 0,
+ conversion_factor_from_second,
+ conversion_factor_to_nano};
+ ArrayDataVisitor<DataType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+Status WriteFixedSizeBinaryBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const FixedSizeBinaryArray& array_(checked_cast<const FixedSizeBinaryArray&>(array));
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ FixedSizeBinaryAppender appender{array_, batch, orc_offset, 0, array_.byte_width()};
+ ArrayDataVisitor<FixedSizeBinaryType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+Status WriteStructBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ std::shared_ptr<Array> array_ = MakeArray(array.data());
+ std::shared_ptr<StructArray> struct_array(checked_pointer_cast<StructArray>(array_));
+ auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
+ std::size_t size = array.type()->fields().size();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ // First fill the notNull flags of the ColumnVectorBatch
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ }
+ }
+ // Fill the fields
+ for (std::size_t i = 0; i < size; i++) {
+ batch->fields[i]->resize(orc_offset + arrow_length);
+ RETURN_NOT_OK(WriteBatch(*(struct_array->field(i)), orc_offset, batch->fields[i]));
+ }
+ return Status::OK();
+}
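An illustration of the resizing above (freely chosen numbers): appending a 3-row struct<a: int32> array at orc_offset == 5 gives

// batch->fields[0]->resize(5 + 3);   // room for 8 values
// notNull[5..7] filled from the struct's own validity bitmap
// WriteBatch(child "a", /*orc_offset=*/5, batch->fields[0]) recurses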
+
+template <class ArrayType>
+Status WriteListBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const ArrayType& list_array(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ if (orc_offset == 0) {
+ batch->offsets[0] = 0;
+ }
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ batch->offsets[running_orc_offset + 1] =
+ batch->offsets[running_orc_offset] +
+ list_array.value_offset(running_arrow_offset + 1) -
+ list_array.value_offset(running_arrow_offset);
+ element_batch->resize(batch->offsets[running_orc_offset + 1]);
+ int64_t subarray_arrow_offset = list_array.value_offset(running_arrow_offset),
+ subarray_orc_offset = batch->offsets[running_orc_offset],
+ subarray_orc_length =
+ batch->offsets[running_orc_offset + 1] - subarray_orc_offset;
+ RETURN_NOT_OK(WriteBatch(
+ *(list_array.values()->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, element_batch));
+ }
+ }
+ return Status::OK();
+}
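Offset bookkeeping sketch for the loop above (illustrative values): writing the Arrow list array [[1, 2], null, [3]] at orc_offset == 0 yields

// notNull = {1, 0, 1}
// offsets = {0, 2, 2, 3}   (a null row repeats the previous offset)
// The flattened values {1, 2, 3} land in batch->elements through the
// recursive WriteBatch calls on each non-null slice.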
+
+Status WriteMapBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const MapArray& map_array(checked_cast<const MapArray&>(array));
+ auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* key_batch = (batch->keys).get();
+ liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
+ std::shared_ptr<Array> key_array = map_array.keys();
+ std::shared_ptr<Array> element_array = map_array.items();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ if (orc_offset == 0) {
+ batch->offsets[0] = 0;
+ }
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ batch->offsets[running_orc_offset + 1] =
+ batch->offsets[running_orc_offset] +
+ map_array.value_offset(running_arrow_offset + 1) -
+ map_array.value_offset(running_arrow_offset);
+ int64_t subarray_arrow_offset = map_array.value_offset(running_arrow_offset),
+ subarray_orc_offset = batch->offsets[running_orc_offset],
+ new_subarray_orc_offset = batch->offsets[running_orc_offset + 1],
+ subarray_orc_length = new_subarray_orc_offset - subarray_orc_offset;
+ key_batch->resize(new_subarray_orc_offset);
+ element_batch->resize(new_subarray_orc_offset);
+ RETURN_NOT_OK(
+ WriteBatch(*(key_array->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, key_batch));
+ RETURN_NOT_OK(
+ WriteBatch(*(element_array->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, element_batch));
+ }
+ }
+ return Status::OK();
+}
+
+Status WriteBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ Type::type kind = array.type_id();
+ column_vector_batch->numElements = orc_offset;
+ switch (kind) {
+ case Type::type::BOOL:
+ return WriteGenericBatch<BooleanType, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT8:
+ return WriteGenericBatch<Int8Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT16:
+ return WriteGenericBatch<Int16Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT32:
+ return WriteGenericBatch<Int32Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT64:
+ return WriteGenericBatch<Int64Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::FLOAT:
+ return WriteGenericBatch<FloatType, liborc::DoubleVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::DOUBLE:
+ return WriteGenericBatch<DoubleType, liborc::DoubleVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::BINARY:
+ return WriteGenericBatch<BinaryType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_BINARY:
+ return WriteGenericBatch<LargeBinaryType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::STRING:
+ return WriteGenericBatch<StringType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_STRING:
+ return WriteGenericBatch<LargeStringType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::FIXED_SIZE_BINARY:
+ return WriteFixedSizeBinaryBatch(array, orc_offset, column_vector_batch);
+ case Type::type::DATE32:
+ return WriteGenericBatch<Date32Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::DATE64:
+ return WriteTimestampBatch<Date64Type>(array, orc_offset, column_vector_batch,
+ kOneSecondMillis, kOneMilliNanos);
+ case Type::type::TIMESTAMP: {
+ switch (internal::checked_pointer_cast<TimestampType>(array.type())->unit()) {
+ case TimeUnit::type::SECOND:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, 1, kOneSecondNanos);
+ case TimeUnit::type::MILLI:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondMillis, kOneMilliNanos);
+ case TimeUnit::type::MICRO:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondMicros, kOneMicroNanos);
+ case TimeUnit::type::NANO:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondNanos, 1);
+ default:
+ return Status::TypeError("Unknown or unsupported Arrow type: ",
+ array.type()->ToString());
+ }
+ }
+ case Type::type::DECIMAL128: {
+ int32_t precision = checked_pointer_cast<Decimal128Type>(array.type())->precision();
+ if (precision > 18) {
+ return WriteGenericBatch<Decimal128Type, liborc::Decimal128VectorBatch>(
+ array, orc_offset, column_vector_batch);
+ } else {
+ return WriteGenericBatch<Decimal128Type, liborc::Decimal64VectorBatch>(
+ array, orc_offset, column_vector_batch);
+ }
+ }
+ case Type::type::STRUCT:
+ return WriteStructBatch(array, orc_offset, column_vector_batch);
+ case Type::type::LIST:
+ return WriteListBatch<ListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_LIST:
+ return WriteListBatch<LargeListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::FIXED_SIZE_LIST:
+ return WriteListBatch<FixedSizeListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::MAP:
+ return WriteMapBatch(array, orc_offset, column_vector_batch);
+ default: {
+ return Status::NotImplemented("Unknown or unsupported Arrow type: ",
+ array.type()->ToString());
+ }
+ }
+ return Status::OK();
+}
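A hedged note on the DECIMAL128 dispatch above:

// liborc materializes decimal columns of precision 18 or less as
// Decimal64VectorBatch (the digits fit in an int64_t) and wider ones as
// Decimal128VectorBatch, which is what the precision > 18 branch mirrors.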
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const DataType& type) {
+ Type::type kind = type.id();
+ switch (kind) {
+ case Type::type::BOOL:
+ return liborc::createPrimitiveType(liborc::TypeKind::BOOLEAN);
+ case Type::type::INT8:
+ return liborc::createPrimitiveType(liborc::TypeKind::BYTE);
+ case Type::type::INT16:
+ return liborc::createPrimitiveType(liborc::TypeKind::SHORT);
+ case Type::type::INT32:
+ return liborc::createPrimitiveType(liborc::TypeKind::INT);
+ case Type::type::INT64:
+ return liborc::createPrimitiveType(liborc::TypeKind::LONG);
+ case Type::type::FLOAT:
+ return liborc::createPrimitiveType(liborc::TypeKind::FLOAT);
+ case Type::type::DOUBLE:
+ return liborc::createPrimitiveType(liborc::TypeKind::DOUBLE);
+ // Use STRING instead of VARCHAR for now, both use UTF-8
+ case Type::type::STRING:
+ case Type::type::LARGE_STRING:
+ return liborc::createPrimitiveType(liborc::TypeKind::STRING);
+ case Type::type::BINARY:
+ case Type::type::LARGE_BINARY:
+ case Type::type::FIXED_SIZE_BINARY:
+ return liborc::createPrimitiveType(liborc::TypeKind::BINARY);
+ case Type::type::DATE32:
+ return liborc::createPrimitiveType(liborc::TypeKind::DATE);
+ case Type::type::DATE64:
+ case Type::type::TIMESTAMP:
+ return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP);
+ case Type::type::DECIMAL128: {
+ const uint64_t precision =
+ static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).precision());
+ const uint64_t scale =
+ static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).scale());
+ return liborc::createDecimalType(precision, scale);
+ }
+ case Type::type::LIST:
+ case Type::type::FIXED_SIZE_LIST:
+ case Type::type::LARGE_LIST: {
+ std::shared_ptr<DataType> arrow_child_type =
+ checked_cast<const BaseListType&>(type).value_type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ return liborc::createListType(std::move(orc_subtype));
+ }
+ case Type::type::STRUCT: {
+ ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
+ std::vector<std::shared_ptr<Field>> arrow_fields =
+ checked_cast<const StructType&>(type).fields();
+ for (std::vector<std::shared_ptr<Field>>::iterator it = arrow_fields.begin();
+ it != arrow_fields.end(); ++it) {
+ std::string field_name = (*it)->name();
+ std::shared_ptr<DataType> arrow_child_type = (*it)->type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ out_type->addStructField(field_name, std::move(orc_subtype));
+ }
+ return std::move(out_type);
+ }
+ case Type::type::MAP: {
+ std::shared_ptr<DataType> key_arrow_type =
+ checked_cast<const MapType&>(type).key_type();
+ std::shared_ptr<DataType> item_arrow_type =
+ checked_cast<const MapType&>(type).item_type();
+ ARROW_ASSIGN_OR_RAISE(auto key_orc_type, GetOrcType(*key_arrow_type));
+ ARROW_ASSIGN_OR_RAISE(auto item_orc_type, GetOrcType(*item_arrow_type));
+ return liborc::createMapType(std::move(key_orc_type), std::move(item_orc_type));
+ }
+ case Type::type::DENSE_UNION:
+ case Type::type::SPARSE_UNION: {
+ ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createUnionType();
+ std::vector<std::shared_ptr<Field>> arrow_fields =
+ checked_cast<const UnionType&>(type).fields();
+ for (std::vector<std::shared_ptr<Field>>::iterator it = arrow_fields.begin();
+ it != arrow_fields.end(); ++it) {
+ std::string field_name = (*it)->name();
+ std::shared_ptr<DataType> arrow_child_type = (*it)->type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ out_type->addUnionChild(std::move(orc_subtype));
+ }
+ return std::move(out_type);
+ }
+ default: {
+ return Status::NotImplemented("Unknown or unsupported Arrow type: ",
+ type.ToString());
+ }
+ }
+}
+
+} // namespace
+
+Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
+ int* arrow_chunk_offset, int64_t* arrow_index_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ int num_batch = chunked_array.num_chunks();
+ int64_t orc_offset = 0;
+ while (*arrow_chunk_offset < num_batch && orc_offset < length) {
+ ARROW_ASSIGN_OR_RAISE(auto array,
+ NormalizeArray(chunked_array.chunk(*arrow_chunk_offset)));
+ int64_t num_written_elements =
+ std::min(length - orc_offset, array->length() - *arrow_index_offset);
+ if (num_written_elements > 0) {
+ RETURN_NOT_OK(WriteBatch(*(array->Slice(*arrow_index_offset, num_written_elements)),
+ orc_offset, column_vector_batch));
+ orc_offset += num_written_elements;
+ *arrow_index_offset += num_written_elements;
+ }
+ if (orc_offset < length) { // Another Arrow Array done
+ *arrow_index_offset = 0;
+ (*arrow_chunk_offset)++;
+ }
+ }
+ column_vector_batch->numElements = orc_offset;
+ return Status::OK();
+}
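A minimal caller sketch for the chunked WriteBatch above (hypothetical names: kBatchSize is the writer's batch size and column_batch a liborc::ColumnVectorBatch*; neither comes from this change):

int chunk_offset = 0;
int64_t index_offset = 0;
while (chunk_offset < chunked_array.num_chunks()) {
  // Each call fills at most kBatchSize values and advances both offsets,
  // so the next call resumes exactly where this one stopped.
  RETURN_NOT_OK(WriteBatch(chunked_array, kBatchSize, &chunk_offset,
                           &index_offset, column_batch));
  // ... hand column_batch to the ORC writer here ...
}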
+
+Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
+ // When subselecting fields on read, liborc will set some nodes to nullptr,
+ // so we need to check for nullptr before proceeding
+ if (type == nullptr) {
+ *out = null();
+ return Status::OK();
+ }
+ liborc::TypeKind kind = type->getKind();
+ const int subtype_count = static_cast<int>(type->getSubtypeCount());
+
+ switch (kind) {
+ case liborc::BOOLEAN:
+ *out = boolean();
+ break;
+ case liborc::BYTE:
+ *out = int8();
+ break;
+ case liborc::SHORT:
+ *out = int16();
+ break;
+ case liborc::INT:
+ *out = int32();
+ break;
+ case liborc::LONG:
+ *out = int64();
+ break;
+ case liborc::FLOAT:
+ *out = float32();
+ break;
+ case liborc::DOUBLE:
+ *out = float64();
+ break;
+ case liborc::VARCHAR:
+ case liborc::STRING:
+ *out = utf8();
+ break;
+ case liborc::BINARY:
+ *out = binary();
+ break;
+ case liborc::CHAR:
+ *out = fixed_size_binary(static_cast<int>(type->getMaximumLength()));
+ break;
+ case liborc::TIMESTAMP:
+ *out = timestamp(TimeUnit::NANO);
+ break;
+ case liborc::DATE:
+ *out = date32();
+ break;
+ case liborc::DECIMAL: {
+ const int precision = static_cast<int>(type->getPrecision());
+ const int scale = static_cast<int>(type->getScale());
+ if (precision == 0) {
+ // In HIVE 0.11/0.12 the precision is set to 0, which means the maximum precision
+ *out = decimal128(38, 6);
+ } else {
+ *out = decimal128(precision, scale);
+ }
+ break;
+ }
+ case liborc::LIST: {
+ if (subtype_count != 1) {
+ return Status::TypeError("Invalid Orc List type");
+ }
+ std::shared_ptr<DataType> elemtype;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype));
+ *out = list(elemtype);
+ break;
+ }
+ case liborc::MAP: {
+ if (subtype_count != 2) {
+ return Status::TypeError("Invalid Orc Map type");
+ }
+ std::shared_ptr<DataType> key_type, item_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &key_type));
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &item_type));
+ *out = map(key_type, item_type);
+ break;
+ }
+ case liborc::STRUCT: {
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int child = 0; child < subtype_count; ++child) {
+ std::shared_ptr<DataType> elem_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
+ std::string name = type->getFieldName(child);
+ fields.push_back(field(name, elem_type));
+ }
+ *out = struct_(fields);
+ break;
+ }
+ case liborc::UNION: {
+ std::vector<std::shared_ptr<Field>> fields;
+ std::vector<int8_t> type_codes;
+ for (int child = 0; child < subtype_count; ++child) {
+ std::shared_ptr<DataType> elem_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
+ fields.push_back(field("_union_" + std::to_string(child), elem_type));
+ type_codes.push_back(static_cast<int8_t>(child));
+ }
+ *out = sparse_union(fields, type_codes);
+ break;
+ }
+ default: {
+ return Status::TypeError("Unknown Orc type kind: ", type->toString());
+ }
+ }
+ return Status::OK();
+}
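A minimal read-side sketch (hypothetical caller; orc_type points at a liborc::Type taken from a reader's selected schema):

std::shared_ptr<DataType> arrow_type;
RETURN_NOT_OK(GetArrowType(orc_type, &arrow_type));
// e.g. liborc LONG maps to int64(), DECIMAL(0, s) to decimal128(38, 6),
// and a nullptr node (subselected away) maps to null().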
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema) {
+ int numFields = schema.num_fields();
+ ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
+ for (int i = 0; i < numFields; i++) {
+ std::shared_ptr<Field> field = schema.field(i);
+ std::string field_name = field->name();
+ std::shared_ptr<DataType> arrow_child_type = field->type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ out_type->addStructField(field_name, std::move(orc_subtype));
+ }
+ return std::move(out_type);
+}
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
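A short usage sketch for the schema-level GetOrcType (hypothetical call site; the schema below is freely chosen):

auto schema = ::arrow::schema({field("a", int32()), field("b", utf8())});
ARROW_ASSIGN_OR_RAISE(auto orc_type, GetOrcType(*schema));
// orc_type->toString() now describes struct<a:int,b:string>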
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
index 3e6d0fcc660..8176715aa51 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
@@ -1,57 +1,57 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "arrow/array/builder_base.h"
-#include "arrow/status.h"
-#include "orc/OrcFile.hh"
-
-namespace liborc = orc;
-
-namespace arrow {
-
-namespace adapters {
-
-namespace orc {
-
-Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out);
-
-Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema);
-
-Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
- int64_t offset, int64_t length, arrow::ArrayBuilder* builder);
-
-/// \brief Write a chunked array to an orc::ColumnVectorBatch
-///
-/// \param[in] chunked_array the chunked array
-/// \param[in] length the orc::ColumnVectorBatch size limit
-/// \param[in,out] arrow_chunk_offset The current chunk being processed
-/// \param[in,out] arrow_index_offset The value index within the current chunk;
-/// read on entry and updated to where writing stopped
-/// \param[in,out] column_vector_batch the orc::ColumnVectorBatch to be filled
-/// \return Status
-Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
- int* arrow_chunk_offset, int64_t* arrow_index_offset,
- liborc::ColumnVectorBatch* column_vector_batch);
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/status.h"
+#include "orc/OrcFile.hh"
+
+namespace liborc = orc;
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out);
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema);
+
+Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
+ int64_t offset, int64_t length, arrow::ArrayBuilder* builder);
+
+/// \brief Write a chunked array to an orc::ColumnVectorBatch
+///
+/// \param[in] chunked_array the chunked array
+/// \param[in] length the orc::ColumnVectorBatch size limit
+/// \param[in,out] arrow_chunk_offset The current chunk being processed
+/// \param[in,out] arrow_index_offset The value index within the current chunk;
+/// read on entry and updated to where writing stopped
+/// \param[in,out] column_vector_batch the orc::ColumnVectorBatch to be filled
+/// \return Status
+Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
+ int* arrow_chunk_offset, int64_t* arrow_index_offset,
+ liborc::ColumnVectorBatch* column_vector_batch);
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
index 67c5ca84e1f..5d731baa777 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
@@ -73,10 +73,10 @@ struct ScalarFromArraySlotImpl {
return Finish(Decimal128(a.GetValue(index_)));
}
- Status Visit(const Decimal256Array& a) {
- return Finish(Decimal256(a.GetValue(index_)));
- }
-
+ Status Visit(const Decimal256Array& a) {
+ return Finish(Decimal256(a.GetValue(index_)));
+ }
+
template <typename T>
Status Visit(const BaseBinaryArray<T>& a) {
return Finish(a.GetString(index_));
@@ -222,31 +222,31 @@ bool Array::ApproxEquals(const std::shared_ptr<Array>& arr,
}
bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx, const EqualOptions& opts) const {
- return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
+ int64_t other_start_idx, const EqualOptions& opts) const {
+ return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
}
bool Array::RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
- int64_t end_idx, int64_t other_start_idx,
- const EqualOptions& opts) const {
+ int64_t end_idx, int64_t other_start_idx,
+ const EqualOptions& opts) const {
if (!other) {
return false;
}
- return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
+ return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
}
bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const Array& other, const EqualOptions& opts) const {
- return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
+ const Array& other, const EqualOptions& opts) const {
+ return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
}
bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const std::shared_ptr<Array>& other,
- const EqualOptions& opts) const {
+ const std::shared_ptr<Array>& other,
+ const EqualOptions& opts) const {
if (!other) {
return false;
}
- return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
+ return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
}
std::shared_ptr<Array> Array::Slice(int64_t offset, int64_t length) const {
@@ -302,7 +302,7 @@ Status Array::Validate() const { return internal::ValidateArray(*this); }
Status Array::ValidateFull() const {
RETURN_NOT_OK(internal::ValidateArray(*this));
- return internal::ValidateArrayFull(*this);
+ return internal::ValidateArrayFull(*this);
}
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
index 2add572e7a4..469ae94d2eb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
@@ -56,17 +56,17 @@ class ARROW_EXPORT Array {
/// \brief Return true if value at index is null. Does not boundscheck
bool IsNull(int64_t i) const {
- return null_bitmap_data_ != NULLPTR
- ? !BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
- : data_->null_count == data_->length;
+ return null_bitmap_data_ != NULLPTR
+ ? !BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
+ : data_->null_count == data_->length;
}
/// \brief Return true if value at index is valid (not null). Does not
/// boundscheck
bool IsValid(int64_t i) const {
- return null_bitmap_data_ != NULLPTR
- ? BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
- : data_->null_count != data_->length;
+ return null_bitmap_data_ != NULLPTR
+ ? BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
+ : data_->null_count != data_->length;
}
/// \brief Return a Scalar containing the value of this array at i
@@ -93,7 +93,7 @@ class ARROW_EXPORT Array {
///
/// Note that for `null_count == 0` or for null type, this will be null.
/// This buffer does not account for any slice offset
- const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
+ const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
/// Raw pointer to the null bitmap.
///
@@ -121,17 +121,17 @@ class ARROW_EXPORT Array {
/// Compare if the range of slots specified are equal for the given array and
/// this array. end_idx exclusive. This methods does not bounds check.
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const Array& other,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ const Array& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const std::shared_ptr<Array>& other,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ const std::shared_ptr<Array>& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
- int64_t end_idx, int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ int64_t end_idx, int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults()) const;
Status Accept(ArrayVisitor* visitor) const;
@@ -162,7 +162,7 @@ class ARROW_EXPORT Array {
/// Input-checking variant of Array::Slice
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
- const std::shared_ptr<ArrayData>& data() const { return data_; }
+ const std::shared_ptr<ArrayData>& data() const { return data_; }
int num_fields() const { return static_cast<int>(data_->child_data.size()); }
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
index 9466b5a48f9..14a3a8ef961 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
@@ -21,9 +21,9 @@
#include <memory>
#include "arrow/array/array_base.h"
-#include "arrow/array/validate.h"
+#include "arrow/array/validate.h"
#include "arrow/type.h"
-#include "arrow/type_traits.h"
+#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
@@ -32,7 +32,7 @@ namespace arrow {
using internal::checked_cast;
BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
- ARROW_CHECK(is_binary_like(data->type->id()));
+ ARROW_CHECK(is_binary_like(data->type->id()));
SetData(data);
}
@@ -45,7 +45,7 @@ BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_of
}
LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) {
- ARROW_CHECK(is_large_binary_like(data->type->id()));
+ ARROW_CHECK(is_large_binary_like(data->type->id()));
SetData(data);
}
@@ -71,7 +71,7 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of
offset));
}
-Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
@@ -87,7 +87,7 @@ LargeStringArray::LargeStringArray(int64_t length,
null_count, offset));
}
-Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
index f8e8c4f8a44..735042f4a09 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
@@ -28,7 +28,7 @@
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
-#include "arrow/stl_iterator.h"
+#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
@@ -47,7 +47,7 @@ class BaseBinaryArray : public FlatArray {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
- using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
+ using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
/// Return the pointer to the given elements bytes
// XXX should GetValue(int64_t i) return a string_view?
@@ -71,13 +71,13 @@ class BaseBinaryArray : public FlatArray {
raw_value_offsets_[i + 1] - pos);
}
- /// \brief Get binary value as a string_view
- /// Provided for consistency with other arrays.
- ///
- /// \param i the value index
- /// \return the view over the selected value
- util::string_view Value(int64_t i) const { return GetView(i); }
-
+ /// \brief Get binary value as a string_view
+ /// Provided for consistency with other arrays.
+ ///
+ /// \param i the value index
+ /// \return the view over the selected value
+ util::string_view Value(int64_t i) const { return GetView(i); }
+
/// \brief Get binary value as a std::string
///
/// \param i the value index
@@ -124,13 +124,13 @@ class BaseBinaryArray : public FlatArray {
}
}
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
// For subclasses
- BaseBinaryArray() = default;
+ BaseBinaryArray() = default;
// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
@@ -139,8 +139,8 @@ class BaseBinaryArray : public FlatArray {
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
}
- const offset_type* raw_value_offsets_ = NULLPTR;
- const uint8_t* raw_data_ = NULLPTR;
+ const offset_type* raw_value_offsets_ = NULLPTR;
+ const uint8_t* raw_data_ = NULLPTR;
};
/// Concrete Array class for variable-size binary data
@@ -216,7 +216,7 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
public:
using TypeClass = FixedSizeBinaryType;
- using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
+ using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
@@ -238,10 +238,10 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
void SetData(const std::shared_ptr<ArrayData>& data) {
this->PrimitiveArray::SetData(data);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
index d65f6ee5356..58852a7b6c5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
@@ -33,11 +33,11 @@ namespace arrow {
using internal::checked_cast;
// ----------------------------------------------------------------------
-// Decimal128
+// Decimal128
Decimal128Array::Decimal128Array(const std::shared_ptr<ArrayData>& data)
: FixedSizeBinaryArray(data) {
- ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL128);
+ ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL128);
}
std::string Decimal128Array::FormatValue(int64_t i) const {
@@ -46,18 +46,18 @@ std::string Decimal128Array::FormatValue(int64_t i) const {
return value.ToString(type_.scale());
}
-// ----------------------------------------------------------------------
-// Decimal256
-
-Decimal256Array::Decimal256Array(const std::shared_ptr<ArrayData>& data)
- : FixedSizeBinaryArray(data) {
- ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL256);
-}
-
-std::string Decimal256Array::FormatValue(int64_t i) const {
- const auto& type_ = checked_cast<const Decimal256Type&>(*type());
- const Decimal256 value(GetValue(i));
- return value.ToString(type_.scale());
-}
-
+// ----------------------------------------------------------------------
+// Decimal256
+
+Decimal256Array::Decimal256Array(const std::shared_ptr<ArrayData>& data)
+ : FixedSizeBinaryArray(data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL256);
+}
+
+std::string Decimal256Array::FormatValue(int64_t i) const {
+ const auto& type_ = checked_cast<const Decimal256Type&>(*type());
+ const Decimal256 value(GetValue(i));
+ return value.ToString(type_.scale());
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
index 8d7d1c59cd0..e32b9d26a35 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
@@ -47,20 +47,20 @@ class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
// Backward compatibility
using DecimalArray = Decimal128Array;
-// ----------------------------------------------------------------------
-// Decimal256Array
-
-/// Concrete Array class for 256-bit decimal data
-class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
- public:
- using TypeClass = Decimal256Type;
-
- using FixedSizeBinaryArray::FixedSizeBinaryArray;
-
- /// \brief Construct Decimal256Array from ArrayData instance
- explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
-
- std::string FormatValue(int64_t i) const;
-};
-
+// ----------------------------------------------------------------------
+// Decimal256Array
+
+/// Concrete Array class for 256-bit decimal data
+class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
+ public:
+ using TypeClass = Decimal256Type;
+
+ using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+ /// \brief Construct Decimal256Array from ArrayData instance
+ explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
+
+ std::string FormatValue(int64_t i) const;
+};
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
index 2fa95e9a176..ddb44b470f6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
@@ -29,10 +29,10 @@
#include "arrow/array/dict_internal.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
-#include "arrow/chunked_array.h"
-#include "arrow/datum.h"
+#include "arrow/chunked_array.h"
+#include "arrow/datum.h"
#include "arrow/status.h"
-#include "arrow/table.h"
+#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
@@ -144,88 +144,88 @@ bool DictionaryArray::CanCompareIndices(const DictionaryArray& other) const {
}
// ----------------------------------------------------------------------
-// Dictionary transposition
-
-namespace {
-
-inline bool IsTrivialTransposition(const int32_t* transpose_map,
- int64_t input_dict_size) {
- for (int64_t i = 0; i < input_dict_size; ++i) {
- if (transpose_map[i] != i) {
- return false;
- }
- }
- return true;
-}
-
-Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
- const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& in_type,
- const std::shared_ptr<DataType>& out_type,
- const std::shared_ptr<ArrayData>& dictionary, const int32_t* transpose_map,
- MemoryPool* pool) {
- // Note that in_type may be different from data->type if data is of type ExtensionType
- if (in_type->id() != Type::DICTIONARY || out_type->id() != Type::DICTIONARY) {
- return Status::TypeError("Expected dictionary type");
- }
- const int64_t in_dict_len = data->dictionary->length;
- const auto& in_dict_type = checked_cast<const DictionaryType&>(*in_type);
- const auto& out_dict_type = checked_cast<const DictionaryType&>(*out_type);
-
- const auto& in_index_type = *in_dict_type.index_type();
- const auto& out_index_type =
- checked_cast<const FixedWidthType&>(*out_dict_type.index_type());
-
- if (in_index_type.id() == out_index_type.id() &&
- IsTrivialTransposition(transpose_map, in_dict_len)) {
- // Index type and values will be identical => we can simply reuse
- // the existing buffers.
- auto out_data =
- ArrayData::Make(out_type, data->length, {data->buffers[0], data->buffers[1]},
- data->null_count, data->offset);
- out_data->dictionary = dictionary;
- return out_data;
- }
-
- // Default path: compute a buffer of transposed indices.
- ARROW_ASSIGN_OR_RAISE(
- auto out_buffer,
- AllocateBuffer(data->length * (out_index_type.bit_width() / CHAR_BIT), pool));
-
- // Shift null buffer if the original offset is non-zero
- std::shared_ptr<Buffer> null_bitmap;
- if (data->offset != 0 && data->null_count != 0) {
- ARROW_ASSIGN_OR_RAISE(null_bitmap, CopyBitmap(pool, data->buffers[0]->data(),
- data->offset, data->length));
- } else {
- null_bitmap = data->buffers[0];
- }
-
- auto out_data = ArrayData::Make(out_type, data->length,
- {null_bitmap, std::move(out_buffer)}, data->null_count);
- out_data->dictionary = dictionary;
- RETURN_NOT_OK(internal::TransposeInts(
- in_index_type, out_index_type, data->GetValues<uint8_t>(1, 0),
- out_data->GetMutableValues<uint8_t>(1, 0), data->offset, out_data->offset,
- data->length, transpose_map));
- return out_data;
-}
-
-} // namespace
-
-Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
- const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
- const int32_t* transpose_map, MemoryPool* pool) const {
- ARROW_ASSIGN_OR_RAISE(auto transposed,
- TransposeDictIndices(data_, data_->type, type, dictionary->data(),
- transpose_map, pool));
- return MakeArray(std::move(transposed));
-}
-
-// ----------------------------------------------------------------------
-// Dictionary unification
-
-namespace {
-
+// Dictionary transposition
+
+namespace {
+
+inline bool IsTrivialTransposition(const int32_t* transpose_map,
+ int64_t input_dict_size) {
+ for (int64_t i = 0; i < input_dict_size; ++i) {
+ if (transpose_map[i] != i) {
+ return false;
+ }
+ }
+ return true;
+}
+
+Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
+ const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& in_type,
+ const std::shared_ptr<DataType>& out_type,
+ const std::shared_ptr<ArrayData>& dictionary, const int32_t* transpose_map,
+ MemoryPool* pool) {
+ // Note that in_type may be different from data->type if data is of type ExtensionType
+ if (in_type->id() != Type::DICTIONARY || out_type->id() != Type::DICTIONARY) {
+ return Status::TypeError("Expected dictionary type");
+ }
+ const int64_t in_dict_len = data->dictionary->length;
+ const auto& in_dict_type = checked_cast<const DictionaryType&>(*in_type);
+ const auto& out_dict_type = checked_cast<const DictionaryType&>(*out_type);
+
+ const auto& in_index_type = *in_dict_type.index_type();
+ const auto& out_index_type =
+ checked_cast<const FixedWidthType&>(*out_dict_type.index_type());
+
+ if (in_index_type.id() == out_index_type.id() &&
+ IsTrivialTransposition(transpose_map, in_dict_len)) {
+ // Index type and values will be identical => we can simply reuse
+ // the existing buffers.
+ auto out_data =
+ ArrayData::Make(out_type, data->length, {data->buffers[0], data->buffers[1]},
+ data->null_count, data->offset);
+ out_data->dictionary = dictionary;
+ return out_data;
+ }
+
+ // Default path: compute a buffer of transposed indices.
+ ARROW_ASSIGN_OR_RAISE(
+ auto out_buffer,
+ AllocateBuffer(data->length * (out_index_type.bit_width() / CHAR_BIT), pool));
+
+ // Shift null buffer if the original offset is non-zero
+ std::shared_ptr<Buffer> null_bitmap;
+ if (data->offset != 0 && data->null_count != 0) {
+ ARROW_ASSIGN_OR_RAISE(null_bitmap, CopyBitmap(pool, data->buffers[0]->data(),
+ data->offset, data->length));
+ } else {
+ null_bitmap = data->buffers[0];
+ }
+
+ auto out_data = ArrayData::Make(out_type, data->length,
+ {null_bitmap, std::move(out_buffer)}, data->null_count);
+ out_data->dictionary = dictionary;
+ RETURN_NOT_OK(internal::TransposeInts(
+ in_index_type, out_index_type, data->GetValues<uint8_t>(1, 0),
+ out_data->GetMutableValues<uint8_t>(1, 0), data->offset, out_data->offset,
+ data->length, transpose_map));
+ return out_data;
+}
+
+} // namespace
+
+Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+ const int32_t* transpose_map, MemoryPool* pool) const {
+ ARROW_ASSIGN_OR_RAISE(auto transposed,
+ TransposeDictIndices(data_, data_->type, type, dictionary->data(),
+ transpose_map, pool));
+ return MakeArray(std::move(transposed));
+}
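A worked example of the transposition contract (illustrative values): with old dictionary ["a", "b", "c"], indices [0, 2, 1], and transpose_map {2, 0, 1} into the new dictionary ["b", "c", "a"]:

// old index 0 ("a") -> transpose_map[0] = 2 -> new dict[2] = "a"
// old index 2 ("c") -> transpose_map[2] = 1 -> new dict[1] = "c"
// old index 1 ("b") -> transpose_map[1] = 0 -> new dict[0] = "b"
// The transposed indices are [2, 1, 0]; every slot decodes to the same value.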
+
+// ----------------------------------------------------------------------
+// Dictionary unification
+
+namespace {
+
template <typename T>
class DictionaryUnifierImpl : public DictionaryUnifier {
public:
@@ -288,23 +288,23 @@ class DictionaryUnifierImpl : public DictionaryUnifier {
return Status::OK();
}
- Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
- std::shared_ptr<Array>* out_dict) override {
- int64_t dict_length = memo_table_.size();
- if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) {
- return Status::Invalid(
- "These dictionaries cannot be combined. The unified dictionary requires a "
- "larger index type.");
- }
-
- // Build unified dictionary array
- std::shared_ptr<ArrayData> data;
- RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
- 0 /* start_offset */, &data));
- *out_dict = MakeArray(data);
- return Status::OK();
- }
-
+ Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+ std::shared_ptr<Array>* out_dict) override {
+ int64_t dict_length = memo_table_.size();
+ if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) {
+ return Status::Invalid(
+ "These dictionaries cannot be combined. The unified dictionary requires a "
+ "larger index type.");
+ }
+
+ // Build unified dictionary array
+ std::shared_ptr<ArrayData> data;
+ RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
+ 0 /* start_offset */, &data));
+ *out_dict = MakeArray(data);
+ return Status::OK();
+ }
+
private:
MemoryPool* pool_;
std::shared_ptr<DataType> value_type_;
@@ -322,7 +322,7 @@ struct MakeUnifier {
template <typename T>
enable_if_no_memoize<T, Status> Visit(const T&) {
// Default implementation for non-dictionary-supported datatypes
- return Status::NotImplemented("Unification of ", *value_type,
+ return Status::NotImplemented("Unification of ", *value_type,
" dictionaries is not implemented");
}
@@ -333,110 +333,110 @@ struct MakeUnifier {
}
};
-struct RecursiveUnifier {
- MemoryPool* pool;
-
- // Return true if any of the arrays was changed (including descendants)
- Result<bool> Unify(std::shared_ptr<DataType> type, ArrayDataVector* chunks) {
- DCHECK(!chunks->empty());
- bool changed = false;
- std::shared_ptr<DataType> ext_type = nullptr;
-
- if (type->id() == Type::EXTENSION) {
- ext_type = std::move(type);
- type = checked_cast<const ExtensionType&>(*ext_type).storage_type();
+struct RecursiveUnifier {
+ MemoryPool* pool;
+
+ // Return true if any of the arrays was changed (including descendants)
+ Result<bool> Unify(std::shared_ptr<DataType> type, ArrayDataVector* chunks) {
+ DCHECK(!chunks->empty());
+ bool changed = false;
+ std::shared_ptr<DataType> ext_type = nullptr;
+
+ if (type->id() == Type::EXTENSION) {
+ ext_type = std::move(type);
+ type = checked_cast<const ExtensionType&>(*ext_type).storage_type();
+ }
+
+ // Unify all child dictionaries (if any)
+ if (type->num_fields() > 0) {
+ ArrayDataVector children(chunks->size());
+ for (int i = 0; i < type->num_fields(); ++i) {
+ std::transform(chunks->begin(), chunks->end(), children.begin(),
+ [i](const std::shared_ptr<ArrayData>& array) {
+ return array->child_data[i];
+ });
+ ARROW_ASSIGN_OR_RAISE(bool child_changed,
+ Unify(type->field(i)->type(), &children));
+ if (child_changed) {
+ // Only do this when unification actually occurred
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ (*chunks)[j]->child_data[i] = std::move(children[j]);
+ }
+ changed = true;
+ }
+ }
}
- // Unify all child dictionaries (if any)
- if (type->num_fields() > 0) {
- ArrayDataVector children(chunks->size());
- for (int i = 0; i < type->num_fields(); ++i) {
- std::transform(chunks->begin(), chunks->end(), children.begin(),
- [i](const std::shared_ptr<ArrayData>& array) {
- return array->child_data[i];
- });
- ARROW_ASSIGN_OR_RAISE(bool child_changed,
- Unify(type->field(i)->type(), &children));
- if (child_changed) {
- // Only do this when unification actually occurred
- for (size_t j = 0; j < chunks->size(); ++j) {
- (*chunks)[j]->child_data[i] = std::move(children[j]);
- }
- changed = true;
- }
- }
- }
-
- // Unify this dictionary
- if (type->id() == Type::DICTIONARY) {
- const auto& dict_type = checked_cast<const DictionaryType&>(*type);
- // XXX Ideally, we should unify dictionaries nested in value_type first,
- // but DictionaryUnifier doesn't supported nested dictionaries anyway,
- // so this will fail.
- ARROW_ASSIGN_OR_RAISE(auto unifier,
- DictionaryUnifier::Make(dict_type.value_type(), this->pool));
- // Unify all dictionary array chunks
- BufferVector transpose_maps(chunks->size());
- for (size_t j = 0; j < chunks->size(); ++j) {
- DCHECK_NE((*chunks)[j]->dictionary, nullptr);
- RETURN_NOT_OK(
- unifier->Unify(*MakeArray((*chunks)[j]->dictionary), &transpose_maps[j]));
- }
- std::shared_ptr<Array> dictionary;
- RETURN_NOT_OK(unifier->GetResultWithIndexType(dict_type.index_type(), &dictionary));
- for (size_t j = 0; j < chunks->size(); ++j) {
- ARROW_ASSIGN_OR_RAISE(
- (*chunks)[j],
- TransposeDictIndices(
- (*chunks)[j], type, type, dictionary->data(),
- reinterpret_cast<const int32_t*>(transpose_maps[j]->data()), this->pool));
- if (ext_type) {
- (*chunks)[j]->type = ext_type;
- }
- }
- changed = true;
- }
-
- return changed;
+ // Unify this dictionary
+ if (type->id() == Type::DICTIONARY) {
+ const auto& dict_type = checked_cast<const DictionaryType&>(*type);
+ // XXX Ideally, we should unify dictionaries nested in value_type first,
+ // but DictionaryUnifier doesn't support nested dictionaries anyway,
+ // so this will fail.
+ ARROW_ASSIGN_OR_RAISE(auto unifier,
+ DictionaryUnifier::Make(dict_type.value_type(), this->pool));
+ // Unify all dictionary array chunks
+ BufferVector transpose_maps(chunks->size());
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ DCHECK_NE((*chunks)[j]->dictionary, nullptr);
+ RETURN_NOT_OK(
+ unifier->Unify(*MakeArray((*chunks)[j]->dictionary), &transpose_maps[j]));
+ }
+ std::shared_ptr<Array> dictionary;
+ RETURN_NOT_OK(unifier->GetResultWithIndexType(dict_type.index_type(), &dictionary));
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ ARROW_ASSIGN_OR_RAISE(
+ (*chunks)[j],
+ TransposeDictIndices(
+ (*chunks)[j], type, type, dictionary->data(),
+ reinterpret_cast<const int32_t*>(transpose_maps[j]->data()), this->pool));
+ if (ext_type) {
+ (*chunks)[j]->type = ext_type;
+ }
+ }
+ changed = true;
+ }
+
+ return changed;
}
-};
+};
-} // namespace
+} // namespace
-Result<std::unique_ptr<DictionaryUnifier>> DictionaryUnifier::Make(
- std::shared_ptr<DataType> value_type, MemoryPool* pool) {
- MakeUnifier maker(pool, value_type);
- RETURN_NOT_OK(VisitTypeInline(*value_type, &maker));
- return std::move(maker.result);
-}
+Result<std::unique_ptr<DictionaryUnifier>> DictionaryUnifier::Make(
+ std::shared_ptr<DataType> value_type, MemoryPool* pool) {
+ MakeUnifier maker(pool, value_type);
+ RETURN_NOT_OK(VisitTypeInline(*value_type, &maker));
+ return std::move(maker.result);
+}
-Result<std::shared_ptr<ChunkedArray>> DictionaryUnifier::UnifyChunkedArray(
- const std::shared_ptr<ChunkedArray>& array, MemoryPool* pool) {
- if (array->num_chunks() <= 1) {
- return array;
+Result<std::shared_ptr<ChunkedArray>> DictionaryUnifier::UnifyChunkedArray(
+ const std::shared_ptr<ChunkedArray>& array, MemoryPool* pool) {
+ if (array->num_chunks() <= 1) {
+ return array;
}
- ArrayDataVector data_chunks(array->num_chunks());
- std::transform(array->chunks().begin(), array->chunks().end(), data_chunks.begin(),
- [](const std::shared_ptr<Array>& array) { return array->data(); });
- ARROW_ASSIGN_OR_RAISE(bool changed,
- RecursiveUnifier{pool}.Unify(array->type(), &data_chunks));
- if (!changed) {
- return array;
+ ArrayDataVector data_chunks(array->num_chunks());
+ std::transform(array->chunks().begin(), array->chunks().end(), data_chunks.begin(),
+ [](const std::shared_ptr<Array>& array) { return array->data(); });
+ ARROW_ASSIGN_OR_RAISE(bool changed,
+ RecursiveUnifier{pool}.Unify(array->type(), &data_chunks));
+ if (!changed) {
+ return array;
}
- ArrayVector chunks(array->num_chunks());
- std::transform(data_chunks.begin(), data_chunks.end(), chunks.begin(),
- [](const std::shared_ptr<ArrayData>& data) { return MakeArray(data); });
- return std::make_shared<ChunkedArray>(std::move(chunks), array->type());
-}
-
-Result<std::shared_ptr<Table>> DictionaryUnifier::UnifyTable(const Table& table,
- MemoryPool* pool) {
- ChunkedArrayVector columns = table.columns();
- for (auto& col : columns) {
- ARROW_ASSIGN_OR_RAISE(col, DictionaryUnifier::UnifyChunkedArray(col, pool));
+ ArrayVector chunks(array->num_chunks());
+ std::transform(data_chunks.begin(), data_chunks.end(), chunks.begin(),
+ [](const std::shared_ptr<ArrayData>& data) { return MakeArray(data); });
+ return std::make_shared<ChunkedArray>(std::move(chunks), array->type());
+}
+
+Result<std::shared_ptr<Table>> DictionaryUnifier::UnifyTable(const Table& table,
+ MemoryPool* pool) {
+ ChunkedArrayVector columns = table.columns();
+ for (auto& col : columns) {
+ ARROW_ASSIGN_OR_RAISE(col, DictionaryUnifier::UnifyChunkedArray(col, pool));
}
- return Table::Make(table.schema(), std::move(columns), table.num_rows());
+ return Table::Make(table.schema(), std::move(columns), table.num_rows());
}
} // namespace arrow
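A minimal usage sketch for UnifyChunkedArray above (hypothetical input; only dictionaries with primitive value types are supported, per the doc comments in array_dict.h below):

// chunked: std::shared_ptr<ChunkedArray> whose chunks are dictionary arrays
// that may carry different dictionaries.
ARROW_ASSIGN_OR_RAISE(auto unified,
                      DictionaryUnifier::UnifyChunkedArray(chunked));
// Every chunk of `unified` now shares one dictionary; the indices were
// transposed rather than the values re-encoded.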
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
index 8791eaa07db..eb039331b51 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
@@ -120,61 +120,61 @@ class ARROW_EXPORT DictionaryArray : public Array {
mutable std::shared_ptr<Array> dictionary_;
};
-/// \brief Helper class for incremental dictionary unification
-class ARROW_EXPORT DictionaryUnifier {
- public:
- virtual ~DictionaryUnifier() = default;
-
- /// \brief Construct a DictionaryUnifier
- /// \param[in] value_type the data type of the dictionaries
- /// \param[in] pool MemoryPool to use for memory allocations
- static Result<std::unique_ptr<DictionaryUnifier>> Make(
- std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
-
- /// \brief Unify dictionaries accross array chunks
- ///
- /// The dictionaries in the array chunks will be unified, their indices
- /// accordingly transposed.
- ///
- /// Only dictionaries with a primitive value type are currently supported.
- /// However, dictionaries nested inside a more complex type are correctly unified.
- static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
- const std::shared_ptr<ChunkedArray>& array,
- MemoryPool* pool = default_memory_pool());
-
- /// \brief Unify dictionaries accross the chunks of each table column
- ///
- /// The dictionaries in each table column will be unified, their indices
- /// accordingly transposed.
- ///
- /// Only dictionaries with a primitive value type are currently supported.
- /// However, dictionaries nested inside a more complex type are correctly unified.
- static Result<std::shared_ptr<Table>> UnifyTable(
- const Table& table, MemoryPool* pool = default_memory_pool());
-
- /// \brief Append dictionary to the internal memo
- virtual Status Unify(const Array& dictionary) = 0;
-
- /// \brief Append dictionary and compute transpose indices
- /// \param[in] dictionary the dictionary values to unify
- /// \param[out] out_transpose a Buffer containing computed transpose indices
- /// as int32_t values equal in length to the passed dictionary. The value in
- /// each slot corresponds to the new index value for each original index
- /// for a DictionaryArray with the old dictionary
- virtual Status Unify(const Array& dictionary,
- std::shared_ptr<Buffer>* out_transpose) = 0;
-
- /// \brief Return a result DictionaryType with the smallest possible index
- /// type to accommodate the unified dictionary. The unifier cannot be used
- /// after this is called
- virtual Status GetResult(std::shared_ptr<DataType>* out_type,
- std::shared_ptr<Array>* out_dict) = 0;
-
- /// \brief Return a unified dictionary with the given index type. If
- /// the index type is not large enough then an invalid status will be returned.
- /// The unifier cannot be used after this is called
- virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
- std::shared_ptr<Array>* out_dict) = 0;
-};
-
+/// \brief Helper class for incremental dictionary unification
+class ARROW_EXPORT DictionaryUnifier {
+ public:
+ virtual ~DictionaryUnifier() = default;
+
+ /// \brief Construct a DictionaryUnifier
+ /// \param[in] value_type the data type of the dictionaries
+ /// \param[in] pool MemoryPool to use for memory allocations
+ static Result<std::unique_ptr<DictionaryUnifier>> Make(
+ std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
+
+  /// \brief Unify dictionaries across array chunks
+ ///
+ /// The dictionaries in the array chunks will be unified, their indices
+ /// accordingly transposed.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
+ const std::shared_ptr<ChunkedArray>& array,
+ MemoryPool* pool = default_memory_pool());
+
+  /// \brief Unify dictionaries across the chunks of each table column
+ ///
+ /// The dictionaries in each table column will be unified, their indices
+ /// accordingly transposed.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<Table>> UnifyTable(
+ const Table& table, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Append dictionary to the internal memo
+ virtual Status Unify(const Array& dictionary) = 0;
+
+ /// \brief Append dictionary and compute transpose indices
+ /// \param[in] dictionary the dictionary values to unify
+ /// \param[out] out_transpose a Buffer containing computed transpose indices
+ /// as int32_t values equal in length to the passed dictionary. The value in
+ /// each slot corresponds to the new index value for each original index
+ /// for a DictionaryArray with the old dictionary
+ virtual Status Unify(const Array& dictionary,
+ std::shared_ptr<Buffer>* out_transpose) = 0;
+
+ /// \brief Return a result DictionaryType with the smallest possible index
+ /// type to accommodate the unified dictionary. The unifier cannot be used
+ /// after this is called
+ virtual Status GetResult(std::shared_ptr<DataType>* out_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+
+ /// \brief Return a unified dictionary with the given index type. If
+ /// the index type is not large enough then an invalid status will be returned.
+ /// The unifier cannot be used after this is called
+ virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+};
+
} // namespace arrow
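
The header above also exposes the lower-level incremental API. A short sketch under the same caveats (illustrative names; both dictionaries are assumed to be utf8 arrays): each Unify() call memoizes one dictionary and hands back int32 transpose indices, and GetResult() then consumes the unifier.

    #include <memory>
    #include "arrow/api.h"

    arrow::Status IncrementalUnify(const arrow::Array& dict_a,
                                   const arrow::Array& dict_b) {
      ARROW_ASSIGN_OR_RAISE(auto unifier,
                            arrow::DictionaryUnifier::Make(arrow::utf8()));

      // Each transpose buffer holds int32_t values, one per entry of the
      // corresponding input dictionary, mapping old positions to positions
      // in the eventual unified dictionary.
      std::shared_ptr<arrow::Buffer> transpose_a, transpose_b;
      ARROW_RETURN_NOT_OK(unifier->Unify(dict_a, &transpose_a));
      ARROW_RETURN_NOT_OK(unifier->Unify(dict_b, &transpose_b));

      // Picks the smallest index type that can address the unified
      // dictionary; the unifier must not be reused afterwards.
      std::shared_ptr<arrow::DataType> out_type;
      std::shared_ptr<arrow::Array> out_dict;
      return unifier->GetResult(&out_type, &out_dict);
    }
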
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
index f967127c5f1..fdbc0eb8f3d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
@@ -70,8 +70,8 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool,
ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
AllocateBuffer(num_offsets * sizeof(offset_type), pool));
- // Copy valid bits, ignoring the final offset (since for a length N list array,
- // we have N + 1 offsets)
+ // Copy valid bits, ignoring the final offset (since for a length N list array,
+ // we have N + 1 offsets)
ARROW_ASSIGN_OR_RAISE(
auto clean_valid_bits,
offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1)));
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
index b601eb770c3..3b8f769b7dc 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
@@ -25,7 +25,7 @@
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
-#include "arrow/stl_iterator.h"
+#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/type_traits.h"
@@ -41,7 +41,7 @@ class NumericArray : public PrimitiveArray {
public:
using TypeClass = TYPE;
using value_type = typename TypeClass::c_type;
- using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
+ using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}
@@ -64,10 +64,10 @@ class NumericArray : public PrimitiveArray {
// For API compatibility with BinaryArray etc.
value_type GetView(int64_t i) const { return Value(i); }
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
using PrimitiveArray::PrimitiveArray;
};
@@ -76,7 +76,7 @@ class NumericArray : public PrimitiveArray {
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
public:
using TypeClass = BooleanType;
- using IteratorType = stl::ArrayIterator<BooleanArray>;
+ using IteratorType = stl::ArrayIterator<BooleanArray>;
explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
@@ -99,10 +99,10 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
/// values. Result is not cached.
int64_t true_count() const;
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
using PrimitiveArray::PrimitiveArray;
};
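
The begin()/end() members restored above make numeric and boolean arrays usable with range-for and STL algorithms. A sketch, assuming the iterator of the vendored arrow/stl_iterator.h yields an optional-like value that is empty for null slots:

    #include <iostream>
    #include <memory>
    #include "arrow/api.h"

    void PrintInt64s(const std::shared_ptr<arrow::Array>& array) {
      const auto& typed = static_cast<const arrow::Int64Array&>(*array);
      for (auto maybe_value : typed) {  // begin()/end() from NumericArray
        if (maybe_value.has_value()) {
          std::cout << *maybe_value << '\n';
        } else {
          std::cout << "null\n";
        }
      }
    }
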
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
index c0df797256d..0c2782e7466 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
@@ -64,26 +64,26 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(CommitPendingData());
- ARROW_RETURN_NOT_OK(Reserve(length));
- memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
- UnsafeSetNotNull(length);
- return Status::OK();
- }
-
- Status AppendEmptyValue() final {
- pending_data_[pending_pos_] = 0;
- pending_valid_[pending_pos_] = 1;
- ++pending_pos_;
- ++length_;
-
- if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
- return CommitPendingData();
- }
- return Status::OK();
- }
-
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(CommitPendingData());
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ pending_data_[pending_pos_] = 0;
+ pending_valid_[pending_pos_] = 1;
+ ++pending_pos_;
+ ++length_;
+
+ if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+ return CommitPendingData();
+ }
+ return Status::OK();
+ }
+
void Reset() override;
Status Resize(int64_t capacity) override;
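
For context on the two members restored above: AppendEmptyValue()/AppendEmptyValues() write zero-initialized slots that are marked valid, which is what parent nested builders use to pad children. A sketch, assuming AdaptiveIntBuilder's int64 Append overload:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status BuildAdaptive() {
      arrow::AdaptiveIntBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Append(7));           // valid value 7
      ARROW_RETURN_NOT_OK(builder.AppendEmptyValue());  // valid value 0
      ARROW_RETURN_NOT_OK(builder.AppendNull());        // null slot
      std::shared_ptr<arrow::Array> out;
      // Finishes with the narrowest integer type that fits the values.
      return builder.Finish(&out);
    }
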
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
index c892e3d664b..ff11984790c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
@@ -24,11 +24,11 @@
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
-#include "arrow/builder.h"
-#include "arrow/scalar.h"
+#include "arrow/builder.h"
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/logging.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
@@ -95,162 +95,162 @@ Status ArrayBuilder::Advance(int64_t elements) {
return null_bitmap_builder_.Advance(elements);
}
-namespace {
-struct AppendScalarImpl {
- template <typename T>
- enable_if_t<has_c_type<T>::value || is_decimal_type<T>::value ||
- is_fixed_size_binary_type<T>::value,
- Status>
- Visit(const T&) {
- auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
- RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
-
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
- raw++) {
- auto scalar =
- internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
- if (scalar->is_valid) {
- builder->UnsafeAppend(scalar->value);
- } else {
- builder->UnsafeAppendNull();
- }
- }
- }
- return Status::OK();
- }
-
- template <typename T>
- enable_if_base_binary<T, Status> Visit(const T&) {
- int64_t data_size = 0;
- for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
- raw++) {
- auto scalar =
- internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
- if (scalar->is_valid) {
- data_size += scalar->value->size();
- }
- }
-
- auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
- RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
- RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size));
-
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
- raw++) {
- auto scalar =
- internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
- if (scalar->is_valid) {
- builder->UnsafeAppend(util::string_view{*scalar->value});
- } else {
- builder->UnsafeAppendNull();
- }
- }
- }
- return Status::OK();
- }
-
- template <typename T>
- enable_if_list_like<T, Status> Visit(const T&) {
- auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
- int64_t num_children = 0;
- for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
- scalar++) {
- if (!(*scalar)->is_valid) continue;
- num_children +=
- internal::checked_cast<const BaseListScalar&>(**scalar).value->length();
- }
- RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_));
-
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
- scalar++) {
- if ((*scalar)->is_valid) {
- RETURN_NOT_OK(builder->Append());
- const Array& list =
- *internal::checked_cast<const BaseListScalar&>(**scalar).value;
- for (int64_t i = 0; i < list.length(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
- RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
- }
- } else {
- RETURN_NOT_OK(builder_->AppendNull());
- }
- }
- }
- return Status::OK();
- }
-
- Status Visit(const StructType& type) {
- auto* builder = internal::checked_cast<StructBuilder*>(builder_);
- auto count = n_repeats_ * (scalars_end_ - scalars_begin_);
- RETURN_NOT_OK(builder->Reserve(count));
- for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
- RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count));
- }
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* s = scalars_begin_; s != scalars_end_; s++) {
- const auto& scalar = internal::checked_cast<const StructScalar&>(**s);
- for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
- if (!scalar.is_valid || !scalar.value[field_index]) {
- RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
- } else {
- RETURN_NOT_OK(builder->field_builder(field_index)
- ->AppendScalar(*scalar.value[field_index]));
- }
- }
- RETURN_NOT_OK(builder->Append(scalar.is_valid));
- }
- }
- return Status::OK();
- }
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("AppendScalar for type ", type);
- }
-
- Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); }
-
- const std::shared_ptr<Scalar>* scalars_begin_;
- const std::shared_ptr<Scalar>* scalars_end_;
- int64_t n_repeats_;
- ArrayBuilder* builder_;
-};
-} // namespace
-
-Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
- if (!scalar.type->Equals(type())) {
- return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
- " to builder for type ", type()->ToString());
- }
- std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
- return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert();
-}
-
-Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
- if (!scalar.type->Equals(type())) {
- return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
- " to builder for type ", type()->ToString());
- }
- std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
- return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert();
-}
-
-Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
- if (scalars.empty()) return Status::OK();
- const auto ty = type();
- for (const auto& scalar : scalars) {
- if (!scalar->type->Equals(ty)) {
- return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
- " to builder for type ", type()->ToString());
- }
- }
- return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(),
- /*n_repeats=*/1, this}
- .Convert();
-}
-
+namespace {
+struct AppendScalarImpl {
+ template <typename T>
+ enable_if_t<has_c_type<T>::value || is_decimal_type<T>::value ||
+ is_fixed_size_binary_type<T>::value,
+ Status>
+ Visit(const T&) {
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(scalar->value);
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T&) {
+ int64_t data_size = 0;
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ data_size += scalar->value->size();
+ }
+ }
+
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
+ RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(util::string_view{*scalar->value});
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_list_like<T, Status> Visit(const T&) {
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ int64_t num_children = 0;
+ for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if (!(*scalar)->is_valid) continue;
+ num_children +=
+ internal::checked_cast<const BaseListScalar&>(**scalar).value->length();
+ }
+ RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if ((*scalar)->is_valid) {
+ RETURN_NOT_OK(builder->Append());
+ const Array& list =
+ *internal::checked_cast<const BaseListScalar&>(**scalar).value;
+ for (int64_t i = 0; i < list.length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
+ RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
+ }
+ } else {
+ RETURN_NOT_OK(builder_->AppendNull());
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ auto* builder = internal::checked_cast<StructBuilder*>(builder_);
+ auto count = n_repeats_ * (scalars_end_ - scalars_begin_);
+ RETURN_NOT_OK(builder->Reserve(count));
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count));
+ }
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* s = scalars_begin_; s != scalars_end_; s++) {
+ const auto& scalar = internal::checked_cast<const StructScalar&>(**s);
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ if (!scalar.is_valid || !scalar.value[field_index]) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->field_builder(field_index)
+ ->AppendScalar(*scalar.value[field_index]));
+ }
+ }
+ RETURN_NOT_OK(builder->Append(scalar.is_valid));
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("AppendScalar for type ", type);
+ }
+
+ Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); }
+
+ const std::shared_ptr<Scalar>* scalars_begin_;
+ const std::shared_ptr<Scalar>* scalars_end_;
+ int64_t n_repeats_;
+ ArrayBuilder* builder_;
+};
+} // namespace
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
+ return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
+ return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
+ if (scalars.empty()) return Status::OK();
+ const auto ty = type();
+ for (const auto& scalar : scalars) {
+ if (!scalar->type->Equals(ty)) {
+ return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ }
+ return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(),
+ /*n_repeats=*/1, this}
+ .Convert();
+}
+
Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) {
std::shared_ptr<ArrayData> internal_data;
RETURN_NOT_OK(FinishInternal(&internal_data));
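
The AppendScalar entry points restored above type-check the scalar against the builder and reserve space up front. A minimal sketch:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status AppendSomeScalars() {
      arrow::Int64Builder builder;
      std::shared_ptr<arrow::Scalar> five = arrow::MakeScalar(int64_t{5});
      // A scalar whose type differs from the builder's yields Status::Invalid.
      ARROW_RETURN_NOT_OK(builder.AppendScalar(*five));                   // one 5
      ARROW_RETURN_NOT_OK(builder.AppendScalar(*five, /*n_repeats=*/3));  // three more
      std::shared_ptr<arrow::Array> out;
      return builder.Finish(&out);  // [5, 5, 5, 5]
    }
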
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
index 905b3c1b491..7a1ad81998a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
@@ -29,7 +29,7 @@
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
+#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -51,7 +51,7 @@ class ARROW_EXPORT ArrayBuilder {
explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
virtual ~ArrayBuilder() = default;
- ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
+ ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
/// For nested types. Since the objects are owned by this class instance, we
/// skip shared pointers and just return a raw pointer
@@ -98,30 +98,30 @@ class ARROW_EXPORT ArrayBuilder {
/// Reset the builder.
virtual void Reset();
- /// \brief Append a null value to builder
+ /// \brief Append a null value to builder
virtual Status AppendNull() = 0;
- /// \brief Append a number of null values to builder
+ /// \brief Append a number of null values to builder
virtual Status AppendNulls(int64_t length) = 0;
- /// \brief Append a non-null value to builder
- ///
- /// The appended value is an implementation detail, but the corresponding
- /// memory slot is guaranteed to be initialized.
- /// This method is useful when appending a null value to a parent nested type.
- virtual Status AppendEmptyValue() = 0;
-
- /// \brief Append a number of non-null values to builder
- ///
- /// The appended values are an implementation detail, but the corresponding
- /// memory slot is guaranteed to be initialized.
- /// This method is useful when appending null values to a parent nested type.
- virtual Status AppendEmptyValues(int64_t length) = 0;
-
- /// \brief Append a value from a scalar
- Status AppendScalar(const Scalar& scalar);
- Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
- Status AppendScalars(const ScalarVector& scalars);
-
+ /// \brief Append a non-null value to builder
+ ///
+ /// The appended value is an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending a null value to a parent nested type.
+ virtual Status AppendEmptyValue() = 0;
+
+ /// \brief Append a number of non-null values to builder
+ ///
+ /// The appended values are an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending null values to a parent nested type.
+ virtual Status AppendEmptyValues(int64_t length) = 0;
+
+ /// \brief Append a value from a scalar
+ Status AppendScalar(const Scalar& scalar);
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
+ Status AppendScalars(const ScalarVector& scalars);
+
/// For cases where raw data was memcpy'd into the internal buffers, allows us
/// to advance the length of the builder. It is your responsibility to use
/// this function responsibly.
@@ -253,24 +253,24 @@ class ARROW_EXPORT ArrayBuilder {
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};
-/// \brief Construct an empty ArrayBuilder corresponding to the data
-/// type
-/// \param[in] pool the MemoryPool to use for allocations
-/// \param[in] type the data type to create the builder for
-/// \param[out] out the created ArrayBuilder
-ARROW_EXPORT
-Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
- std::unique_ptr<ArrayBuilder>* out);
-
-/// \brief Construct an empty DictionaryBuilder initialized optionally
-/// with a pre-existing dictionary
-/// \param[in] pool the MemoryPool to use for allocations
-/// \param[in] type the dictionary type to create the builder for
-/// \param[in] dictionary the initial dictionary, if any. May be nullptr
-/// \param[out] out the created ArrayBuilder
-ARROW_EXPORT
-Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
- const std::shared_ptr<Array>& dictionary,
- std::unique_ptr<ArrayBuilder>* out);
-
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the data type to create the builder for
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ std::unique_ptr<ArrayBuilder>* out);
+
+/// \brief Construct an empty DictionaryBuilder initialized optionally
+/// with a pre-existing dictionary
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the dictionary type to create the builder for
+/// \param[in] dictionary the initial dictionary, if any. May be nullptr
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& dictionary,
+ std::unique_ptr<ArrayBuilder>* out);
+
} // namespace arrow
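
MakeBuilder, declared above, is the usual entry point for schema-driven code that does not know the concrete builder type at compile time. A sketch:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status BuildFromType(const std::shared_ptr<arrow::DataType>& type) {
      std::unique_ptr<arrow::ArrayBuilder> builder;
      ARROW_RETURN_NOT_OK(
          arrow::MakeBuilder(arrow::default_memory_pool(), type, &builder));
      ARROW_RETURN_NOT_OK(builder->AppendNull());        // null slot
      ARROW_RETURN_NOT_OK(builder->AppendEmptyValue());  // valid, zero-like slot
      std::shared_ptr<arrow::Array> out;
      return builder->Finish(&out);
    }
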
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
index 6822dc89903..26d6a7129f6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
@@ -73,20 +73,20 @@ Status FixedSizeBinaryBuilder::AppendNulls(int64_t length) {
return Status::OK();
}
-Status FixedSizeBinaryBuilder::AppendEmptyValue() {
- RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(true);
- byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
- return Status::OK();
-}
-
-Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) {
- RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, true);
- byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0);
- return Status::OK();
-}
-
+Status FixedSizeBinaryBuilder::AppendEmptyValue() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
+ return Status::OK();
+}
+
+Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0);
+ return Status::OK();
+}
+
void FixedSizeBinaryBuilder::Reset() {
ArrayBuilder::Reset();
byte_builder_.Reset();
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
index 62edc69fb8e..346e90d25a0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
@@ -61,7 +61,7 @@ class BaseBinaryBuilder : public ArrayBuilder {
ARROW_RETURN_NOT_OK(AppendNextOffset());
// Safety check for UBSAN.
if (ARROW_PREDICT_TRUE(length > 0)) {
- ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+ ARROW_RETURN_NOT_OK(ValidateOverflow(length));
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
}
@@ -77,23 +77,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Append(value.data(), static_cast<offset_type>(value.size()));
}
- /// Extend the last appended value by appending more data at the end
- ///
- /// Unlike Append, this does not create a new offset.
- Status ExtendCurrent(const uint8_t* value, offset_type length) {
- // Safety check for UBSAN.
- if (ARROW_PREDICT_TRUE(length > 0)) {
- ARROW_RETURN_NOT_OK(ValidateOverflow(length));
- ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
- }
- return Status::OK();
- }
-
- Status ExtendCurrent(util::string_view value) {
- return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
- static_cast<offset_type>(value.size()));
- }
-
+ /// Extend the last appended value by appending more data at the end
+ ///
+ /// Unlike Append, this does not create a new offset.
+ Status ExtendCurrent(const uint8_t* value, offset_type length) {
+ // Safety check for UBSAN.
+ if (ARROW_PREDICT_TRUE(length > 0)) {
+ ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
+ }
+ return Status::OK();
+ }
+
+ Status ExtendCurrent(util::string_view value) {
+ return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<offset_type>(value.size()));
+ }
+
Status AppendNulls(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
ARROW_RETURN_NOT_OK(Reserve(length));
@@ -111,23 +111,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValue() final {
- ARROW_RETURN_NOT_OK(AppendNextOffset());
- ARROW_RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(true);
- return Status::OK();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- const int64_t num_bytes = value_data_builder_.length();
- ARROW_RETURN_NOT_OK(Reserve(length));
- for (int64_t i = 0; i < length; ++i) {
- offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
- }
- UnsafeAppendToBitmap(length, true);
- return Status::OK();
- }
-
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ const int64_t num_bytes = value_data_builder_.length();
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ }
+ UnsafeAppendToBitmap(length, true);
+ return Status::OK();
+ }
+
/// \brief Append without checking capacity
///
/// Offsets and data should have been presized using Reserve() and
@@ -150,28 +150,28 @@ class BaseBinaryBuilder : public ArrayBuilder {
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
}
- /// Like ExtendCurrent, but do not check capacity
- void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
- value_data_builder_.UnsafeAppend(value, length);
- }
-
- void UnsafeExtendCurrent(util::string_view value) {
- UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
- static_cast<offset_type>(value.size()));
- }
-
+ /// Like ExtendCurrent, but do not check capacity
+ void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
+ value_data_builder_.UnsafeAppend(value, length);
+ }
+
+ void UnsafeExtendCurrent(util::string_view value) {
+ UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<offset_type>(value.size()));
+ }
+
void UnsafeAppendNull() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(false);
}
- void UnsafeAppendEmptyValue() {
- const int64_t num_bytes = value_data_builder_.length();
- offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
- UnsafeAppendToBitmap(true);
- }
-
+ void UnsafeAppendEmptyValue() {
+ const int64_t num_bytes = value_data_builder_.length();
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ UnsafeAppendToBitmap(true);
+ }
+
/// \brief Append a sequence of strings in one shot.
///
/// \param[in] values a vector of strings
@@ -467,14 +467,14 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
return Status::OK();
}
- Status Append(const Buffer& s) {
- ARROW_RETURN_NOT_OK(Reserve(1));
- UnsafeAppend(util::string_view(s));
- return Status::OK();
- }
-
- Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
-
+ Status Append(const Buffer& s) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(util::string_view(s));
+ return Status::OK();
+ }
+
+ Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
+
template <size_t NBYTES>
Status Append(const std::array<uint8_t, NBYTES>& value) {
ARROW_RETURN_NOT_OK(Reserve(1));
@@ -489,9 +489,9 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
Status AppendNull() final;
Status AppendNulls(int64_t length) final;
- Status AppendEmptyValue() final;
- Status AppendEmptyValues(int64_t length) final;
-
+ Status AppendEmptyValue() final;
+ Status AppendEmptyValues(int64_t length) final;
+
void UnsafeAppend(const uint8_t* value) {
UnsafeAppendToBitmap(true);
if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
@@ -510,10 +510,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
}
- void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
-
- void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
-
+ void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
+
+ void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
+
void UnsafeAppendNull() {
UnsafeAppendToBitmap(false);
byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
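
ExtendCurrent, restored above, appends more bytes to the most recently appended value without opening a new offset, so one logical value can be written piecewise. A sketch with StringBuilder:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status BuildPiecewise() {
      arrow::StringBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Append("hello "));
      ARROW_RETURN_NOT_OK(builder.ExtendCurrent("world"));  // still one element
      std::shared_ptr<arrow::Array> out;
      return builder.Finish(&out);  // ["hello world"]
    }
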
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
index bd7615a7309..34c81f76c6f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
@@ -67,39 +67,39 @@ Status Decimal128Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
return Status::OK();
}
-// ----------------------------------------------------------------------
-// Decimal256Builder
-
-Decimal256Builder::Decimal256Builder(const std::shared_ptr<DataType>& type,
- MemoryPool* pool)
- : FixedSizeBinaryBuilder(type, pool),
- decimal_type_(internal::checked_pointer_cast<Decimal256Type>(type)) {}
-
-Status Decimal256Builder::Append(const Decimal256& value) {
- RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
- UnsafeAppend(value);
- return Status::OK();
-}
-
-void Decimal256Builder::UnsafeAppend(const Decimal256& value) {
- value.ToBytes(GetMutableValue(length()));
- byte_builder_.UnsafeAdvance(32);
- UnsafeAppendToBitmap(true);
-}
-
-void Decimal256Builder::UnsafeAppend(util::string_view value) {
- FixedSizeBinaryBuilder::UnsafeAppend(value);
-}
-
-Status Decimal256Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- std::shared_ptr<Buffer> data;
- RETURN_NOT_OK(byte_builder_.Finish(&data));
- std::shared_ptr<Buffer> null_bitmap;
- RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
-
- *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
- capacity_ = length_ = null_count_ = 0;
- return Status::OK();
-}
-
+// ----------------------------------------------------------------------
+// Decimal256Builder
+
+Decimal256Builder::Decimal256Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool)
+ : FixedSizeBinaryBuilder(type, pool),
+ decimal_type_(internal::checked_pointer_cast<Decimal256Type>(type)) {}
+
+Status Decimal256Builder::Append(const Decimal256& value) {
+ RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
+ UnsafeAppend(value);
+ return Status::OK();
+}
+
+void Decimal256Builder::UnsafeAppend(const Decimal256& value) {
+ value.ToBytes(GetMutableValue(length()));
+ byte_builder_.UnsafeAdvance(32);
+ UnsafeAppendToBitmap(true);
+}
+
+void Decimal256Builder::UnsafeAppend(util::string_view value) {
+ FixedSizeBinaryBuilder::UnsafeAppend(value);
+}
+
+Status Decimal256Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(byte_builder_.Finish(&data));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
index f48392ed001..7fee4ab4c73 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
@@ -32,7 +32,7 @@ namespace arrow {
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal128Type;
- using ValueType = Decimal128;
+ using ValueType = Decimal128;
explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
@@ -59,36 +59,36 @@ class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
std::shared_ptr<Decimal128Type> decimal_type_;
};
-class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
- public:
- using TypeClass = Decimal256Type;
- using ValueType = Decimal256;
-
- explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
- MemoryPool* pool = default_memory_pool());
-
- using FixedSizeBinaryBuilder::Append;
- using FixedSizeBinaryBuilder::AppendValues;
- using FixedSizeBinaryBuilder::Reset;
-
- Status Append(const Decimal256& val);
- void UnsafeAppend(const Decimal256& val);
- void UnsafeAppend(util::string_view val);
-
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-
- /// \cond FALSE
- using ArrayBuilder::Finish;
- /// \endcond
-
- Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
-
- std::shared_ptr<DataType> type() const override { return decimal_type_; }
-
- protected:
- std::shared_ptr<Decimal256Type> decimal_type_;
-};
-
+class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
+ public:
+ using TypeClass = Decimal256Type;
+ using ValueType = Decimal256;
+
+ explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool = default_memory_pool());
+
+ using FixedSizeBinaryBuilder::Append;
+ using FixedSizeBinaryBuilder::AppendValues;
+ using FixedSizeBinaryBuilder::Reset;
+
+ Status Append(const Decimal256& val);
+ void UnsafeAppend(const Decimal256& val);
+ void UnsafeAppend(util::string_view val);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+ std::shared_ptr<Decimal256Type> decimal_type_;
+};
+
using DecimalBuilder = Decimal128Builder;
} // namespace arrow
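
Decimal256Builder mirrors the existing Decimal128Builder. A sketch, assuming Decimal256's integer constructor; with scale 2, the unscaled value 12345 represents 123.45:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status BuildDecimal256() {
      auto type = arrow::decimal256(/*precision=*/40, /*scale=*/2);
      arrow::Decimal256Builder builder(type);
      ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal256(12345)));  // 123.45
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      std::shared_ptr<arrow::Decimal256Array> out;
      return builder.Finish(&out);
    }
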
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
index b13f6a2db34..7bbb6b25499 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
@@ -45,7 +45,7 @@ class DictionaryMemoTable::DictionaryMemoTableImpl {
template <typename T>
enable_if_no_memoize<T, Status> Visit(const T&) {
- return Status::NotImplemented("Initialization of ", value_type_->ToString(),
+ return Status::NotImplemented("Initialization of ", value_type_->ToString(),
" memo table is not implemented");
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
index eb96482dbf7..d5541db2e7c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
@@ -29,7 +29,7 @@
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array/data.h"
#include "arrow/array/util.h"
-#include "arrow/scalar.h"
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
@@ -241,20 +241,20 @@ class DictionaryBuilderBase : public ArrayBuilder {
/// \brief Append a decimal (only for Decimal128Type)
template <typename T1 = T>
- enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
+ enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
uint8_t data[16];
value.ToBytes(data);
return Append(data, 16);
}
- /// \brief Append a decimal (only for Decimal128Type)
- template <typename T1 = T>
- enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
- uint8_t data[32];
- value.ToBytes(data);
- return Append(data, 32);
- }
-
+ /// \brief Append a decimal (only for Decimal128Type)
+ template <typename T1 = T>
+ enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
+ uint8_t data[32];
+ value.ToBytes(data);
+ return Append(data, 32);
+ }
+
/// \brief Append a scalar null value
Status AppendNull() final {
length_ += 1;
@@ -270,18 +270,18 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendNulls(length);
}
- Status AppendEmptyValue() final {
- length_ += 1;
-
- return indices_builder_.AppendEmptyValue();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- length_ += length;
-
- return indices_builder_.AppendEmptyValues(length);
- }
-
+ Status AppendEmptyValue() final {
+ length_ += 1;
+
+ return indices_builder_.AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ length_ += length;
+
+ return indices_builder_.AppendEmptyValues(length);
+ }
+
/// \brief Insert values into the dictionary's memo, but do not append any
/// indices. Can be used to initialize a new builder with known dictionary
/// values
@@ -458,18 +458,18 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
return indices_builder_.AppendNulls(length);
}
- Status AppendEmptyValue() final {
- length_ += 1;
-
- return indices_builder_.AppendEmptyValue();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- length_ += length;
-
- return indices_builder_.AppendEmptyValues(length);
- }
-
+ Status AppendEmptyValue() final {
+ length_ += 1;
+
+ return indices_builder_.AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ length_ += length;
+
+ return indices_builder_.AppendEmptyValues(length);
+ }
+
/// \brief Append a whole dense array to the builder
Status AppendArray(const Array& array) {
#ifndef NDEBUG
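
The dictionary builders above memoize values so repeated appends share one dictionary entry, while AppendEmptyValue()/AppendEmptyValues() only advance the index column with valid zero slots. A sketch with StringDictionaryBuilder:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status BuildDictionary() {
      arrow::StringDictionaryBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Append("red"));
      ARROW_RETURN_NOT_OK(builder.Append("blue"));
      ARROW_RETURN_NOT_OK(builder.Append("red"));  // reuses the first entry
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      std::shared_ptr<arrow::Array> out;
      return builder.Finish(&out);  // dictionary is ["red", "blue"]
    }
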
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
index a3bcde0381a..b49741d365f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
@@ -123,24 +123,24 @@ Status MapBuilder::AppendNulls(int64_t length) {
return Status::OK();
}
-Status MapBuilder::AppendEmptyValue() {
- DCHECK_EQ(item_builder_->length(), key_builder_->length());
- RETURN_NOT_OK(AdjustStructBuilderLength());
- RETURN_NOT_OK(list_builder_->AppendEmptyValue());
- length_ = list_builder_->length();
- null_count_ = list_builder_->null_count();
- return Status::OK();
-}
-
-Status MapBuilder::AppendEmptyValues(int64_t length) {
- DCHECK_EQ(item_builder_->length(), key_builder_->length());
- RETURN_NOT_OK(AdjustStructBuilderLength());
- RETURN_NOT_OK(list_builder_->AppendEmptyValues(length));
- length_ = list_builder_->length();
- null_count_ = list_builder_->null_count();
- return Status::OK();
-}
-
+Status MapBuilder::AppendEmptyValue() {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendEmptyValue());
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
+Status MapBuilder::AppendEmptyValues(int64_t length) {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendEmptyValues(length));
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
Status MapBuilder::AdjustStructBuilderLength() {
// If key/item builders have been appended, adjust struct builder length
// to match. Struct and key are non-nullable, append all valid values.
@@ -213,18 +213,18 @@ Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) {
return Status::OK();
}
-Status FixedSizeListBuilder::AppendEmptyValue() {
- RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(true);
- return value_builder_->AppendEmptyValues(list_size_);
-}
-
-Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) {
- RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, true);
- return value_builder_->AppendEmptyValues(list_size_ * length);
-}
-
+Status FixedSizeListBuilder::AppendEmptyValue() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ return value_builder_->AppendEmptyValues(list_size_);
+}
+
+Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ return value_builder_->AppendEmptyValues(list_size_ * length);
+}
+
Status FixedSizeListBuilder::Resize(int64_t capacity) {
RETURN_NOT_OK(CheckCapacity(capacity));
return ArrayBuilder::Resize(capacity);
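
MapBuilder keeps the struct of key/item children aligned with the list of map slots, which is what AdjustStructBuilderLength() above enforces. A sketch, assuming the (pool, key builder, item builder) constructor:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status BuildMap() {
      auto* pool = arrow::default_memory_pool();
      auto key_builder = std::make_shared<arrow::StringBuilder>(pool);
      auto item_builder = std::make_shared<arrow::Int32Builder>(pool);
      arrow::MapBuilder builder(pool, key_builder, item_builder);

      ARROW_RETURN_NOT_OK(builder.Append());  // start the map {"a": 1}
      ARROW_RETURN_NOT_OK(key_builder->Append("a"));
      ARROW_RETURN_NOT_OK(item_builder->Append(1));
      ARROW_RETURN_NOT_OK(builder.AppendEmptyValue());  // valid empty map {}
      ARROW_RETURN_NOT_OK(builder.AppendNull());        // null map slot

      std::shared_ptr<arrow::Array> out;
      return builder.Finish(&out);
    }
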
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
index 12b999b786e..3acf421ef3e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
@@ -109,19 +109,19 @@ class BaseListBuilder : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValue() final { return Append(true); }
-
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(Reserve(length));
- ARROW_RETURN_NOT_OK(ValidateOverflow(0));
- UnsafeAppendToBitmap(length, true);
- const int64_t num_values = value_builder_->length();
- for (int64_t i = 0; i < length; ++i) {
- offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
- }
- return Status::OK();
- }
-
+ Status AppendEmptyValue() final { return Append(true); }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ ARROW_RETURN_NOT_OK(ValidateOverflow(0));
+ UnsafeAppendToBitmap(length, true);
+ const int64_t num_values = value_builder_->length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
+ }
+ return Status::OK();
+ }
+
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(AppendNextOffset());
@@ -271,10 +271,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
Status AppendNulls(int64_t length) final;
- Status AppendEmptyValue() final;
-
- Status AppendEmptyValues(int64_t length) final;
-
+ Status AppendEmptyValue() final;
+
+ Status AppendEmptyValues(int64_t length) final;
+
/// \brief Get builder to append keys.
///
/// Append a key with this builder should be followed by appending
@@ -370,10 +370,10 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
Status ValidateOverflow(int64_t new_elements);
- Status AppendEmptyValue() final;
-
- Status AppendEmptyValues(int64_t length) final;
-
+ Status AppendEmptyValue() final;
+
+ Status AppendEmptyValues(int64_t length) final;
+
ArrayBuilder* value_builder() const { return value_builder_.get(); }
std::shared_ptr<DataType> type() const override {
@@ -431,42 +431,42 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
return Status::OK();
}
- /// \brief Append a null value. Automatically appends an empty value to each child
+ /// \brief Append a null value. Automatically appends an empty value to each child
/// builder.
Status AppendNull() final {
for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
}
return Append(false);
}
- /// \brief Append multiple null values. Automatically appends empty values to each
+ /// \brief Append multiple null values. Automatically appends empty values to each
/// child builder.
- Status AppendNulls(int64_t length) final {
- for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
- }
- ARROW_RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, false);
- return Status::OK();
- }
-
- Status AppendEmptyValue() final {
- for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
- }
- return Append(true);
- }
-
- Status AppendEmptyValues(int64_t length) final {
- for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
- }
- ARROW_RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, true);
- return Status::OK();
- }
-
+ Status AppendNulls(int64_t length) final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, false);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+ }
+ return Append(true);
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ return Status::OK();
+ }
+
void Reset() override;
ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
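
The StructBuilder semantics spelled out above (AppendNull() pads every child with an empty value so child lengths stay in sync) look like this in use, assuming the (type, pool, field builders) constructor:

    #include <memory>
    #include <vector>
    #include "arrow/api.h"

    arrow::Status BuildStruct() {
      auto* pool = arrow::default_memory_pool();
      auto f0 = std::make_shared<arrow::Int32Builder>(pool);
      auto f1 = std::make_shared<arrow::StringBuilder>(pool);
      auto type = arrow::struct_({arrow::field("id", arrow::int32()),
                                  arrow::field("name", arrow::utf8())});
      arrow::StructBuilder builder(type, pool, {f0, f1});

      ARROW_RETURN_NOT_OK(f0->Append(1));
      ARROW_RETURN_NOT_OK(f1->Append("one"));
      ARROW_RETURN_NOT_OK(builder.Append());      // one valid struct slot
      ARROW_RETURN_NOT_OK(builder.AppendNull());  // children padded for us

      std::shared_ptr<arrow::Array> out;
      return builder.Finish(&out);
    }
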
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
index e403c42411d..ef5c4d14f7f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
@@ -65,8 +65,8 @@ Status BooleanBuilder::Resize(int64_t capacity) {
}
Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_));
- ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
*out = ArrayData::Make(boolean(), length_, {null_bitmap, data}, null_count_);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
index 80cfc4061bb..3dd2370cddb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
@@ -23,7 +23,7 @@
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
-#include "arrow/result.h"
+#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
@@ -47,10 +47,10 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder {
/// \brief Append a single null element
Status AppendNull() final { return AppendNulls(1); }
- Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }
-
- Status AppendEmptyValue() final { return AppendEmptyValues(1); }
-
+ Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }
+
+ Status AppendEmptyValue() final { return AppendEmptyValues(1); }
+
Status Append(std::nullptr_t) { return AppendNull(); }
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
@@ -105,22 +105,22 @@ class NumericBuilder : public ArrayBuilder {
return Status::OK();
}
- /// \brief Append a empty element
- Status AppendEmptyValue() final {
- ARROW_RETURN_NOT_OK(Reserve(1));
- data_builder_.UnsafeAppend(value_type{}); // zero
- UnsafeAppendToBitmap(true);
- return Status::OK();
- }
-
- /// \brief Append several empty elements
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(Reserve(length));
- data_builder_.UnsafeAppend(length, value_type{}); // zero
- UnsafeSetNotNull(length);
- return Status::OK();
- }
-
+  /// \brief Append an empty element
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ data_builder_.UnsafeAppend(value_type{}); // zero
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+ }
+
+ /// \brief Append several empty elements
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, value_type{}); // zero
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
void Reset() override { data_builder_.Reset(); }
@@ -186,9 +186,9 @@ class NumericBuilder : public ArrayBuilder {
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
- null_bitmap_builder_.FinishWithLength(length_));
- ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
+ null_bitmap_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
*out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
capacity_ = length_ = null_count_ = 0;
return Status::OK();
@@ -318,20 +318,20 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValue() final {
- ARROW_RETURN_NOT_OK(Reserve(1));
- data_builder_.UnsafeAppend(false);
- UnsafeSetNotNull(1);
- return Status::OK();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(Reserve(length));
- data_builder_.UnsafeAppend(length, false);
- UnsafeSetNotNull(length);
- return Status::OK();
- }
-
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ data_builder_.UnsafeAppend(false);
+ UnsafeSetNotNull(1);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, false);
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
/// Scalar append
Status Append(const bool val) {
ARROW_RETURN_NOT_OK(Reserve(1));
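
The two bulk paths restored above differ only in the validity bitmap: AppendNulls() yields null slots, AppendEmptyValues() yields valid zero-initialized slots. A sketch:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status EmptyVersusNull() {
      arrow::DoubleBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Append(1.5));
      ARROW_RETURN_NOT_OK(builder.AppendNulls(2));        // two null slots
      ARROW_RETURN_NOT_OK(builder.AppendEmptyValues(2));  // two valid 0.0 slots
      std::shared_ptr<arrow::Array> out;
      return builder.Finish(&out);  // [1.5, null, null, 0.0, 0.0]
    }
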
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
index 8617cb73fce..0168646cf48 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
@@ -65,8 +65,8 @@ BasicUnionBuilder::BasicUnionBuilder(
children_ = children;
type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr);
- DCHECK_LE(
- type_id_to_children_.size() - 1,
+ DCHECK_LE(
+ type_id_to_children_.size() - 1,
static_cast<decltype(type_id_to_children_)::size_type>(UnionType::kMaxTypeCode));
for (size_t i = 0; i < children.size(); ++i) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
index 060be474fb8..979b3f1effc 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
@@ -117,26 +117,26 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
return child_builder->AppendNull();
}
- Status AppendEmptyValue() final {
- const int8_t first_child_code = type_codes_[0];
- ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
- ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
- ARROW_RETURN_NOT_OK(
- offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
- // Append an empty value arbitrarily to the first child
- return child_builder->AppendEmptyValue();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- const int8_t first_child_code = type_codes_[0];
- ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
- ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
- ARROW_RETURN_NOT_OK(
- offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
- // Append just a single empty value to the first child
- return child_builder->AppendEmptyValue();
- }
-
+ Status AppendEmptyValue() final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
+ // Append an empty value arbitrarily to the first child
+ return child_builder->AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
+ // Append just a single empty value to the first child
+ return child_builder->AppendEmptyValue();
+ }
+
/// \brief Append an element to the UnionArray. This must be followed
/// by an append to the appropriate child builder.
///
@@ -179,45 +179,45 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
const std::shared_ptr<DataType>& type)
: BasicUnionBuilder(pool, children, type) {}
- /// \brief Append a null value.
- ///
- /// A null is appended to the first child, empty values to the other children.
+ /// \brief Append a null value.
+ ///
+ /// A null is appended to the first child, empty values to the other children.
Status AppendNull() final {
- const auto first_child_code = type_codes_[0];
- ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
- ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
- for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
- ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
- }
- return Status::OK();
- }
-
- /// \brief Append multiple null values.
- ///
- /// Nulls are appended to the first child, empty values to the other children.
- Status AppendNulls(int64_t length) final {
- const auto first_child_code = type_codes_[0];
- ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
- ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
- for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
- ARROW_RETURN_NOT_OK(
- type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
- }
- return Status::OK();
- }
-
- Status AppendEmptyValue() final {
+ const auto first_child_code = type_codes_[0];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
+ for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+ ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
+ }
+ return Status::OK();
+ }
+
+ /// \brief Append multiple null values.
+ ///
+ /// Nulls are appended to the first child, empty values to the other children.
+ Status AppendNulls(int64_t length) final {
+ const auto first_child_code = type_codes_[0];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
+ for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+ ARROW_RETURN_NOT_OK(
+ type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
+ }
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
for (int8_t code : type_codes_) {
- ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
+ ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
}
return Status::OK();
}
- Status AppendEmptyValues(int64_t length) final {
+ Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
for (int8_t code : type_codes_) {
- ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
}
return Status::OK();
}
@@ -228,7 +228,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
/// \param[in] next_type type_id of the child to which the next value will be appended.
///
/// The corresponding child builder must be appended to independently after this method
- /// is called, and all other child builders must have null or empty value appended.
+ /// is called, and all other child builders must have a null or empty value appended.
Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
};
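A sketch of the SparseUnionBuilder contract documented above, assuming a two-child sparse union with default type codes 0 and 1 (all names here are illustrative): every child must grow by one slot per appended union value, so siblings receive a null or empty value.

#include <arrow/builder.h>

arrow::Status AppendOneInt(arrow::SparseUnionBuilder* builder,
                           arrow::Int32Builder* ints,
                           arrow::StringBuilder* strings) {
  ARROW_RETURN_NOT_OK(builder->Append(0));  // select the int32 child (code 0)
  ARROW_RETURN_NOT_OK(ints->Append(42));    // real value for the chosen child
  return strings->AppendEmptyValue();       // keep the sibling in step
}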
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
index 32478783394..be9b5c3258c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
@@ -36,7 +36,7 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
-#include "arrow/util/int_util.h"
+#include "arrow/util/int_util.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
@@ -45,7 +45,7 @@ namespace arrow {
using internal::SafeSignedAdd;
-namespace {
+namespace {
/// offset, length pair for representing a Range of a buffer or array
struct Range {
int64_t offset = -1, length = 0;
@@ -68,8 +68,8 @@ struct Bitmap {
};
// Allocate a buffer and concatenate bitmaps into it.
-Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
- std::shared_ptr<Buffer>* out) {
+Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
int64_t out_length = 0;
for (const auto& bitmap : bitmaps) {
if (internal::AddWithOverflow(out_length, bitmap.range.length, &out_length)) {
@@ -96,15 +96,15 @@ Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
// Write offsets in src into dst, adjusting them such that first_offset
// will be the first offset written.
template <typename Offset>
-Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
- Range* values_range);
+Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
+ Range* values_range);
// Concatenate buffers holding offsets into a single buffer of offsets,
// also computing the ranges of values spanned by each buffer of offsets.
template <typename Offset>
-Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
- std::shared_ptr<Buffer>* out,
- std::vector<Range>* values_ranges) {
+Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out,
+ std::vector<Range>* values_ranges) {
values_ranges->resize(buffers.size());
// allocate output buffer
@@ -132,8 +132,8 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
}
template <typename Offset>
-Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
- Range* values_range) {
+Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
+ Range* values_range) {
if (src->size() == 0) {
// It's allowed to have an empty offsets buffer for a 0-length array
// (see Array::Validate)
@@ -167,7 +167,7 @@ Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offse
class ConcatenateImpl {
public:
- ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool)
+ ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool)
: in_(std::move(in)), pool_(pool), out_(std::make_shared<ArrayData>()) {
out_->type = in[0]->type;
for (size_t i = 0; i < in_.size(); ++i) {
@@ -202,7 +202,7 @@ class ConcatenateImpl {
}
Status Visit(const FixedWidthType& fixed) {
- // Handles numbers, decimal128, decimal256, fixed_size_binary
+ // Handles numbers, decimal128, decimal256, fixed_size_binary
ARROW_ASSIGN_OR_RAISE(auto buffers, Buffers(1, fixed));
return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]);
}
@@ -243,8 +243,8 @@ class ConcatenateImpl {
return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]);
}
- Status Visit(const FixedSizeListType& fixed_size_list) {
- ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size()));
+ Status Visit(const FixedSizeListType& fixed_size_list) {
+ ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size()));
return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]);
}
@@ -256,47 +256,47 @@ class ConcatenateImpl {
return Status::OK();
}
- Result<BufferVector> UnifyDictionaries(const DictionaryType& d) {
- BufferVector new_index_lookup;
- ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(d.value_type()));
- new_index_lookup.resize(in_.size());
- for (size_t i = 0; i < in_.size(); i++) {
- auto item = in_[i];
- auto dictionary_array = MakeArray(item->dictionary);
- RETURN_NOT_OK(unifier->Unify(*dictionary_array, &new_index_lookup[i]));
- }
- std::shared_ptr<Array> out_dictionary;
- RETURN_NOT_OK(unifier->GetResultWithIndexType(d.index_type(), &out_dictionary));
- out_->dictionary = out_dictionary->data();
- return new_index_lookup;
- }
-
- // Transpose and concatenate dictionary indices
- Result<std::shared_ptr<Buffer>> ConcatenateDictionaryIndices(
- const DataType& index_type, const BufferVector& index_transpositions) {
- const auto index_width =
- internal::checked_cast<const FixedWidthType&>(index_type).bit_width() / 8;
- int64_t out_length = 0;
- for (const auto& data : in_) {
- out_length += data->length;
- }
- ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length * index_width, pool_));
- uint8_t* out_data = out->mutable_data();
- for (size_t i = 0; i < in_.size(); i++) {
- const auto& data = in_[i];
- auto transpose_map =
- reinterpret_cast<const int32_t*>(index_transpositions[i]->data());
- RETURN_NOT_OK(internal::TransposeInts(index_type, index_type,
- /*src=*/data->GetValues<uint8_t>(1, 0),
- /*dest=*/out_data,
- /*src_offset=*/data->offset,
- /*dest_offset=*/0, /*length=*/data->length,
- transpose_map));
- out_data += data->length * index_width;
- }
- return std::move(out);
- }
-
+ Result<BufferVector> UnifyDictionaries(const DictionaryType& d) {
+ BufferVector new_index_lookup;
+ ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(d.value_type()));
+ new_index_lookup.resize(in_.size());
+ for (size_t i = 0; i < in_.size(); i++) {
+ auto item = in_[i];
+ auto dictionary_array = MakeArray(item->dictionary);
+ RETURN_NOT_OK(unifier->Unify(*dictionary_array, &new_index_lookup[i]));
+ }
+ std::shared_ptr<Array> out_dictionary;
+ RETURN_NOT_OK(unifier->GetResultWithIndexType(d.index_type(), &out_dictionary));
+ out_->dictionary = out_dictionary->data();
+ return new_index_lookup;
+ }
+
+ // Transpose and concatenate dictionary indices
+ Result<std::shared_ptr<Buffer>> ConcatenateDictionaryIndices(
+ const DataType& index_type, const BufferVector& index_transpositions) {
+ const auto index_width =
+ internal::checked_cast<const FixedWidthType&>(index_type).bit_width() / 8;
+ int64_t out_length = 0;
+ for (const auto& data : in_) {
+ out_length += data->length;
+ }
+ ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length * index_width, pool_));
+ uint8_t* out_data = out->mutable_data();
+ for (size_t i = 0; i < in_.size(); i++) {
+ const auto& data = in_[i];
+ auto transpose_map =
+ reinterpret_cast<const int32_t*>(index_transpositions[i]->data());
+ RETURN_NOT_OK(internal::TransposeInts(index_type, index_type,
+ /*src=*/data->GetValues<uint8_t>(1, 0),
+ /*dest=*/out_data,
+ /*src_offset=*/data->offset,
+ /*dest_offset=*/0, /*length=*/data->length,
+ transpose_map));
+ out_data += data->length * index_width;
+ }
+ return std::move(out);
+ }
+
Status Visit(const DictionaryType& d) {
auto fixed = internal::checked_cast<const FixedWidthType*>(d.index_type().get());
@@ -311,15 +311,15 @@ class ConcatenateImpl {
}
}
- ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed));
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed));
if (dictionaries_same) {
out_->dictionary = in_[0]->dictionary;
return ConcatenateBuffers(index_buffers, pool_).Value(&out_->buffers[1]);
} else {
- ARROW_ASSIGN_OR_RAISE(auto index_lookup, UnifyDictionaries(d));
- ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
- ConcatenateDictionaryIndices(*fixed, index_lookup));
- return Status::OK();
+ ARROW_ASSIGN_OR_RAISE(auto index_lookup, UnifyDictionaries(d));
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+ ConcatenateDictionaryIndices(*fixed, index_lookup));
+ return Status::OK();
}
}
@@ -344,7 +344,7 @@ class ConcatenateImpl {
Result<BufferVector> Buffers(size_t index) {
BufferVector buffers;
buffers.reserve(in_.size());
- for (const auto& array_data : in_) {
+ for (const auto& array_data : in_) {
const auto& buffer = array_data->buffers[index];
if (buffer != nullptr) {
ARROW_ASSIGN_OR_RAISE(
@@ -386,7 +386,7 @@ class ConcatenateImpl {
Result<BufferVector> Buffers(size_t index, int byte_width) {
BufferVector buffers;
buffers.reserve(in_.size());
- for (const auto& array_data : in_) {
+ for (const auto& array_data : in_) {
const auto& buffer = array_data->buffers[index];
if (buffer != nullptr) {
ARROW_ASSIGN_OR_RAISE(auto sliced_buffer,
@@ -421,8 +421,8 @@ class ConcatenateImpl {
// Gather the index-th child_data of each input into a vector.
// Elements are sliced with that input's offset and length.
- Result<ArrayDataVector> ChildData(size_t index) {
- ArrayDataVector child_data(in_.size());
+ Result<ArrayDataVector> ChildData(size_t index) {
+ ArrayDataVector child_data(in_.size());
for (size_t i = 0; i < in_.size(); ++i) {
ARROW_ASSIGN_OR_RAISE(child_data[i], in_[i]->child_data[index]->SliceSafe(
in_[i]->offset, in_[i]->length));
@@ -431,22 +431,22 @@ class ConcatenateImpl {
}
// Gather the index-th child_data of each input into a vector.
- // Elements are sliced with that input's offset and length multiplied by multiplier.
- Result<ArrayDataVector> ChildData(size_t index, size_t multiplier) {
- ArrayDataVector child_data(in_.size());
- for (size_t i = 0; i < in_.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- child_data[i], in_[i]->child_data[index]->SliceSafe(
- in_[i]->offset * multiplier, in_[i]->length * multiplier));
- }
- return child_data;
- }
-
- // Gather the index-th child_data of each input into a vector.
+ // Elements are sliced with that input's offset and length multiplied by multiplier.
+ Result<ArrayDataVector> ChildData(size_t index, size_t multiplier) {
+ ArrayDataVector child_data(in_.size());
+ for (size_t i = 0; i < in_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ child_data[i], in_[i]->child_data[index]->SliceSafe(
+ in_[i]->offset * multiplier, in_[i]->length * multiplier));
+ }
+ return child_data;
+ }
+
+ // Gather the index-th child_data of each input into a vector.
// Elements are sliced with the explicitly passed ranges.
- Result<ArrayDataVector> ChildData(size_t index, const std::vector<Range>& ranges) {
+ Result<ArrayDataVector> ChildData(size_t index, const std::vector<Range>& ranges) {
DCHECK_EQ(in_.size(), ranges.size());
- ArrayDataVector child_data(in_.size());
+ ArrayDataVector child_data(in_.size());
for (size_t i = 0; i < in_.size(); ++i) {
ARROW_ASSIGN_OR_RAISE(child_data[i], in_[i]->child_data[index]->SliceSafe(
ranges[i].offset, ranges[i].length));
@@ -454,20 +454,20 @@ class ConcatenateImpl {
return child_data;
}
- const ArrayDataVector& in_;
+ const ArrayDataVector& in_;
MemoryPool* pool_;
std::shared_ptr<ArrayData> out_;
};
-} // namespace
-
+} // namespace
+
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, MemoryPool* pool) {
if (arrays.size() == 0) {
return Status::Invalid("Must pass at least one array");
}
// gather ArrayData of input arrays
- ArrayDataVector data(arrays.size());
+ ArrayDataVector data(arrays.size());
for (size_t i = 0; i < arrays.size(); ++i) {
if (!arrays[i]->type()->Equals(*arrays[0]->type())) {
return Status::Invalid("arrays to be concatenated must be identically typed, but ",
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
index 5a214473972..be30ff7d685 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
@@ -37,13 +37,13 @@ namespace arrow {
using internal::CountSetBits;
-static inline void AdjustNonNullable(Type::type type_id, int64_t length,
+static inline void AdjustNonNullable(Type::type type_id, int64_t length,
std::vector<std::shared_ptr<Buffer>>* buffers,
int64_t* null_count) {
- if (type_id == Type::NA) {
- *null_count = length;
- (*buffers)[0] = nullptr;
- } else if (internal::HasValidityBitmap(type_id)) {
+ if (type_id == Type::NA) {
+ *null_count = length;
+ (*buffers)[0] = nullptr;
+ } else if (internal::HasValidityBitmap(type_id)) {
if (*null_count == 0) {
// In case there are no nulls, don't keep an allocated null bitmap around
(*buffers)[0] = nullptr;
@@ -56,39 +56,39 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length,
}
}
-std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
+std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count, int64_t offset) {
- AdjustNonNullable(type->id(), length, &buffers, &null_count);
- return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
- null_count, offset);
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ null_count, offset);
}
std::shared_ptr<ArrayData> ArrayData::Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data, int64_t null_count,
int64_t offset) {
- AdjustNonNullable(type->id(), length, &buffers, &null_count);
- return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
std::move(child_data), null_count, offset);
}
std::shared_ptr<ArrayData> ArrayData::Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
std::shared_ptr<ArrayData> dictionary, int64_t null_count, int64_t offset) {
- AdjustNonNullable(type->id(), length, &buffers, &null_count);
- auto data = std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ auto data = std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
std::move(child_data), null_count, offset);
data->dictionary = std::move(dictionary);
return data;
}
-std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
- int64_t null_count, int64_t offset) {
- return std::make_shared<ArrayData>(std::move(type), length, null_count, offset);
+std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
+ int64_t null_count, int64_t offset) {
+ return std::make_shared<ArrayData>(std::move(type), length, null_count, offset);
}
std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
@@ -213,7 +213,7 @@ struct ViewDataImpl {
Status MakeDataView(const std::shared_ptr<Field>& out_field,
std::shared_ptr<ArrayData>* out) {
- const auto& out_type = out_field->type();
+ const auto& out_type = out_field->type();
const auto out_layout = out_type->layout();
AdjustInputPointer();
@@ -249,11 +249,11 @@ struct ViewDataImpl {
} else {
// No null bitmap in input, append no-nulls bitmap
out_buffers.push_back(nullptr);
- if (out_type->id() == Type::NA) {
- out_null_count = out_length;
- } else {
- out_null_count = 0;
- }
+ if (out_type->id() == Type::NA) {
+ out_null_count = out_length;
+ } else {
+ out_null_count = 0;
+ }
}
// Process other buffers in output layout
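The AdjustNonNullable change above preserves the normalization that ArrayData::Make applies; a short sketch of the observable effect (validity_bitmap and values_buffer are hypothetical, pre-built buffers):

// With null_count == 0 the allocated bitmap is dropped:
auto data = arrow::ArrayData::Make(arrow::int32(), /*length=*/3,
                                   {validity_bitmap, values_buffer},
                                   /*null_count=*/0);
// data->buffers[0] == nullptr afterwards; for Type::NA, null_count is
// forced to length and the bitmap slot is cleared as well.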
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
index 418d09def6b..db166ffaa27 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
@@ -71,47 +71,47 @@ constexpr int64_t kUnknownNullCount = -1;
/// input array and replace them with newly-allocated data, changing the output
/// data type as well.
struct ARROW_EXPORT ArrayData {
- ArrayData() = default;
+ ArrayData() = default;
- ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
+ : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
- ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : ArrayData(std::move(type), length, null_count, offset) {
+ : ArrayData(std::move(type), length, null_count, offset) {
this->buffers = std::move(buffers);
}
- ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : ArrayData(std::move(type), length, null_count, offset) {
+ : ArrayData(std::move(type), length, null_count, offset) {
this->buffers = std::move(buffers);
this->child_data = std::move(child_data);
}
- static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+ static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
static std::shared_ptr<ArrayData> Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
static std::shared_ptr<ArrayData> Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
- static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+ static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
@@ -230,11 +230,11 @@ struct ARROW_EXPORT ArrayData {
}
std::shared_ptr<DataType> type;
- int64_t length = 0;
- mutable std::atomic<int64_t> null_count{0};
+ int64_t length = 0;
+ mutable std::atomic<int64_t> null_count{0};
// The logical start point into the physical buffers (in values, not bytes).
// Note that, for child data, this must be *added* to the child data's own offset.
- int64_t offset = 0;
+ int64_t offset = 0;
std::vector<std::shared_ptr<Buffer>> buffers;
std::vector<std::shared_ptr<ArrayData>> child_data;
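A brief sketch of the offset semantics noted in the comment above (offsets count values, not bytes, and slices compose additively); arr is an assumed, existing arrow::Array:

std::shared_ptr<arrow::Array> sliced = arr->Slice(/*offset=*/2, /*length=*/5);
// sliced->data()->offset == arr->data()->offset + 2; a child array's own
// offset must still be added on top when reading child buffers.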
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
index ed26ecff4e0..41d7242a44f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
@@ -41,7 +41,7 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
@@ -52,7 +52,7 @@ using internal::checked_cast;
// ----------------------------------------------------------------------
// Loading from ArrayData
-namespace {
+namespace {
class ArrayDataWrapper {
public:
@@ -75,209 +75,209 @@ class ArrayDataWrapper {
std::shared_ptr<Array>* out_;
};
-class ArrayDataEndianSwapper {
- public:
- ArrayDataEndianSwapper(const std::shared_ptr<ArrayData>& data, int64_t length)
- : data_(data), length_(length) {
- out_ = data->Copy();
- }
-
- Status SwapType(const DataType& type) {
- RETURN_NOT_OK(VisitTypeInline(type, this));
- RETURN_NOT_OK(SwapChildren(type.fields()));
- if (internal::HasValidityBitmap(type.id())) {
- // Copy null bitmap
- out_->buffers[0] = data_->buffers[0];
- }
- return Status::OK();
- }
-
- Status SwapChildren(const FieldVector& child_fields) {
- for (size_t i = 0; i < child_fields.size(); i++) {
- ARROW_ASSIGN_OR_RAISE(out_->child_data[i],
- internal::SwapEndianArrayData(data_->child_data[i]));
- }
- return Status::OK();
- }
-
- template <typename T>
- Result<std::shared_ptr<Buffer>> ByteSwapBuffer(
- const std::shared_ptr<Buffer>& in_buffer) {
- if (sizeof(T) == 1) {
- // if data size is 1, element is not swapped. We can use the original buffer
- return in_buffer;
- }
- auto in_data = reinterpret_cast<const T*>(in_buffer->data());
- ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size()));
- auto out_data = reinterpret_cast<T*>(out_buffer->mutable_data());
- int64_t length = in_buffer->size() / sizeof(T);
- for (int64_t i = 0; i < length; i++) {
- out_data[i] = BitUtil::ByteSwap(in_data[i]);
- }
- return std::move(out_buffer);
- }
-
- template <typename VALUE_TYPE>
- Status SwapOffsets(int index) {
- if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) {
- out_->buffers[index] = data_->buffers[index];
- return Status::OK();
- }
- // Except union, offset has one more element rather than data->length
- ARROW_ASSIGN_OR_RAISE(out_->buffers[index],
- ByteSwapBuffer<VALUE_TYPE>(data_->buffers[index]));
- return Status::OK();
- }
-
- template <typename T>
- enable_if_t<std::is_base_of<FixedWidthType, T>::value &&
- !std::is_base_of<FixedSizeBinaryType, T>::value &&
- !std::is_base_of<DictionaryType, T>::value,
- Status>
- Visit(const T& type) {
- using value_type = typename T::c_type;
- ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
- ByteSwapBuffer<value_type>(data_->buffers[1]));
- return Status::OK();
- }
-
- Status Visit(const Decimal128Type& type) {
- auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
- ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
- auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
- int64_t length = length_;
- length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2);
- for (int64_t i = 0; i < length; i++) {
- uint64_t tmp;
- auto idx = i * 2;
-#if ARROW_LITTLE_ENDIAN
- tmp = BitUtil::FromBigEndian(data[idx]);
- new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]);
- new_data[idx + 1] = tmp;
-#else
- tmp = BitUtil::FromLittleEndian(data[idx]);
- new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]);
- new_data[idx + 1] = tmp;
-#endif
- }
- out_->buffers[1] = std::move(new_buffer);
- return Status::OK();
- }
-
- Status Visit(const Decimal256Type& type) {
- auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
- ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
- auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
- int64_t length = length_;
- length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4);
- for (int64_t i = 0; i < length; i++) {
- uint64_t tmp0, tmp1, tmp2;
- auto idx = i * 4;
-#if ARROW_LITTLE_ENDIAN
- tmp0 = BitUtil::FromBigEndian(data[idx]);
- tmp1 = BitUtil::FromBigEndian(data[idx + 1]);
- tmp2 = BitUtil::FromBigEndian(data[idx + 2]);
- new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]);
- new_data[idx + 1] = tmp2;
- new_data[idx + 2] = tmp1;
- new_data[idx + 3] = tmp0;
-#else
- tmp0 = BitUtil::FromLittleEndian(data[idx]);
- tmp1 = BitUtil::FromLittleEndian(data[idx + 1]);
- tmp2 = BitUtil::FromLittleEndian(data[idx + 2]);
- new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]);
- new_data[idx + 1] = tmp2;
- new_data[idx + 2] = tmp1;
- new_data[idx + 3] = tmp0;
-#endif
- }
- out_->buffers[1] = std::move(new_buffer);
- return Status::OK();
- }
-
- Status Visit(const DayTimeIntervalType& type) {
- ARROW_ASSIGN_OR_RAISE(out_->buffers[1], ByteSwapBuffer<uint32_t>(data_->buffers[1]));
- return Status::OK();
- }
-
- Status Visit(const NullType& type) { return Status::OK(); }
- Status Visit(const BooleanType& type) { return Status::OK(); }
- Status Visit(const Int8Type& type) { return Status::OK(); }
- Status Visit(const UInt8Type& type) { return Status::OK(); }
- Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); }
- Status Visit(const FixedSizeListType& type) { return Status::OK(); }
- Status Visit(const StructType& type) { return Status::OK(); }
- Status Visit(const UnionType& type) {
- out_->buffers[1] = data_->buffers[1];
- if (type.mode() == UnionMode::DENSE) {
- RETURN_NOT_OK(SwapOffsets<int32_t>(2));
- }
- return Status::OK();
- }
-
- template <typename T>
- enable_if_t<std::is_same<BinaryType, T>::value || std::is_same<StringType, T>::value,
- Status>
- Visit(const T& type) {
- RETURN_NOT_OK(SwapOffsets<int32_t>(1));
- out_->buffers[2] = data_->buffers[2];
- return Status::OK();
- }
-
- template <typename T>
- enable_if_t<std::is_same<LargeBinaryType, T>::value ||
- std::is_same<LargeStringType, T>::value,
- Status>
- Visit(const T& type) {
- RETURN_NOT_OK(SwapOffsets<int64_t>(1));
- out_->buffers[2] = data_->buffers[2];
- return Status::OK();
- }
-
- Status Visit(const ListType& type) {
- RETURN_NOT_OK(SwapOffsets<int32_t>(1));
- return Status::OK();
- }
- Status Visit(const LargeListType& type) {
- RETURN_NOT_OK(SwapOffsets<int64_t>(1));
- return Status::OK();
- }
-
- Status Visit(const DictionaryType& type) {
- // dictionary was already swapped in ReadDictionary() in ipc/reader.cc
- RETURN_NOT_OK(SwapType(*type.index_type()));
- return Status::OK();
- }
-
- Status Visit(const ExtensionType& type) {
- RETURN_NOT_OK(SwapType(*type.storage_type()));
- return Status::OK();
- }
-
- const std::shared_ptr<ArrayData>& data_;
- int64_t length_;
- std::shared_ptr<ArrayData> out_;
-};
-
-} // namespace
-
-namespace internal {
-
-Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
- const std::shared_ptr<ArrayData>& data) {
- if (data->offset != 0) {
- return Status::Invalid("Unsupported data format: data.offset != 0");
- }
- ArrayDataEndianSwapper swapper(data, data->length);
- RETURN_NOT_OK(swapper.SwapType(*data->type));
- return std::move(swapper.out_);
-}
-
+class ArrayDataEndianSwapper {
+ public:
+ ArrayDataEndianSwapper(const std::shared_ptr<ArrayData>& data, int64_t length)
+ : data_(data), length_(length) {
+ out_ = data->Copy();
+ }
+
+ Status SwapType(const DataType& type) {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ RETURN_NOT_OK(SwapChildren(type.fields()));
+ if (internal::HasValidityBitmap(type.id())) {
+ // Copy null bitmap
+ out_->buffers[0] = data_->buffers[0];
+ }
+ return Status::OK();
+ }
+
+ Status SwapChildren(const FieldVector& child_fields) {
+ for (size_t i = 0; i < child_fields.size(); i++) {
+ ARROW_ASSIGN_OR_RAISE(out_->child_data[i],
+ internal::SwapEndianArrayData(data_->child_data[i]));
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ Result<std::shared_ptr<Buffer>> ByteSwapBuffer(
+ const std::shared_ptr<Buffer>& in_buffer) {
+ if (sizeof(T) == 1) {
+ // if the element size is 1 byte, no swap is needed; we can reuse the original buffer
+ return in_buffer;
+ }
+ auto in_data = reinterpret_cast<const T*>(in_buffer->data());
+ ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size()));
+ auto out_data = reinterpret_cast<T*>(out_buffer->mutable_data());
+ int64_t length = in_buffer->size() / sizeof(T);
+ for (int64_t i = 0; i < length; i++) {
+ out_data[i] = BitUtil::ByteSwap(in_data[i]);
+ }
+ return std::move(out_buffer);
+ }
+
+ template <typename VALUE_TYPE>
+ Status SwapOffsets(int index) {
+ if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) {
+ out_->buffers[index] = data_->buffers[index];
+ return Status::OK();
+ }
+ // Except for unions, the offsets buffer has one more element than data->length
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[index],
+ ByteSwapBuffer<VALUE_TYPE>(data_->buffers[index]));
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_base_of<FixedWidthType, T>::value &&
+ !std::is_base_of<FixedSizeBinaryType, T>::value &&
+ !std::is_base_of<DictionaryType, T>::value,
+ Status>
+ Visit(const T& type) {
+ using value_type = typename T::c_type;
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+ ByteSwapBuffer<value_type>(data_->buffers[1]));
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal128Type& type) {
+ auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+ ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+ auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+ int64_t length = length_;
+ length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2);
+ for (int64_t i = 0; i < length; i++) {
+ uint64_t tmp;
+ auto idx = i * 2;
+#if ARROW_LITTLE_ENDIAN
+ tmp = BitUtil::FromBigEndian(data[idx]);
+ new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]);
+ new_data[idx + 1] = tmp;
+#else
+ tmp = BitUtil::FromLittleEndian(data[idx]);
+ new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]);
+ new_data[idx + 1] = tmp;
+#endif
+ }
+ out_->buffers[1] = std::move(new_buffer);
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal256Type& type) {
+ auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+ ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+ auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+ int64_t length = length_;
+ length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4);
+ for (int64_t i = 0; i < length; i++) {
+ uint64_t tmp0, tmp1, tmp2;
+ auto idx = i * 4;
+#if ARROW_LITTLE_ENDIAN
+ tmp0 = BitUtil::FromBigEndian(data[idx]);
+ tmp1 = BitUtil::FromBigEndian(data[idx + 1]);
+ tmp2 = BitUtil::FromBigEndian(data[idx + 2]);
+ new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]);
+ new_data[idx + 1] = tmp2;
+ new_data[idx + 2] = tmp1;
+ new_data[idx + 3] = tmp0;
+#else
+ tmp0 = BitUtil::FromLittleEndian(data[idx]);
+ tmp1 = BitUtil::FromLittleEndian(data[idx + 1]);
+ tmp2 = BitUtil::FromLittleEndian(data[idx + 2]);
+ new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]);
+ new_data[idx + 1] = tmp2;
+ new_data[idx + 2] = tmp1;
+ new_data[idx + 3] = tmp0;
+#endif
+ }
+ out_->buffers[1] = std::move(new_buffer);
+ return Status::OK();
+ }
+
+ Status Visit(const DayTimeIntervalType& type) {
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1], ByteSwapBuffer<uint32_t>(data_->buffers[1]));
+ return Status::OK();
+ }
+
+ Status Visit(const NullType& type) { return Status::OK(); }
+ Status Visit(const BooleanType& type) { return Status::OK(); }
+ Status Visit(const Int8Type& type) { return Status::OK(); }
+ Status Visit(const UInt8Type& type) { return Status::OK(); }
+ Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); }
+ Status Visit(const FixedSizeListType& type) { return Status::OK(); }
+ Status Visit(const StructType& type) { return Status::OK(); }
+ Status Visit(const UnionType& type) {
+ out_->buffers[1] = data_->buffers[1];
+ if (type.mode() == UnionMode::DENSE) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(2));
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_same<BinaryType, T>::value || std::is_same<StringType, T>::value,
+ Status>
+ Visit(const T& type) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(1));
+ out_->buffers[2] = data_->buffers[2];
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_same<LargeBinaryType, T>::value ||
+ std::is_same<LargeStringType, T>::value,
+ Status>
+ Visit(const T& type) {
+ RETURN_NOT_OK(SwapOffsets<int64_t>(1));
+ out_->buffers[2] = data_->buffers[2];
+ return Status::OK();
+ }
+
+ Status Visit(const ListType& type) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(1));
+ return Status::OK();
+ }
+ Status Visit(const LargeListType& type) {
+ RETURN_NOT_OK(SwapOffsets<int64_t>(1));
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+ // dictionary was already swapped in ReadDictionary() in ipc/reader.cc
+ RETURN_NOT_OK(SwapType(*type.index_type()));
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType& type) {
+ RETURN_NOT_OK(SwapType(*type.storage_type()));
+ return Status::OK();
+ }
+
+ const std::shared_ptr<ArrayData>& data_;
+ int64_t length_;
+ std::shared_ptr<ArrayData> out_;
+};
+
+} // namespace
+
+namespace internal {
+
+Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
+ const std::shared_ptr<ArrayData>& data) {
+ if (data->offset != 0) {
+ return Status::Invalid("Unsupported data format: data.offset != 0");
+ }
+ ArrayDataEndianSwapper swapper(data, data->length);
+ RETURN_NOT_OK(swapper.SwapType(*data->type));
+ return std::move(swapper.out_);
+}
+
} // namespace internal
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data) {
std::shared_ptr<Array> out;
- ArrayDataWrapper wrapper_visitor(data, &out);
+ ArrayDataWrapper wrapper_visitor(data, &out);
DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor));
DCHECK(out);
return out;
@@ -286,7 +286,7 @@ std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data) {
// ----------------------------------------------------------------------
// Misc APIs
-namespace {
+namespace {
// get the maximum buffer length required, then allocate a single zeroed buffer
// to use anywhere a buffer is required
@@ -496,9 +496,9 @@ class RepeatedArrayFactory {
return out_;
}
- Status Visit(const NullType& type) {
- DCHECK(false); // already forwarded to MakeArrayOfNull
- return Status::OK();
+ Status Visit(const NullType& type) {
+ DCHECK(false); // already forwarded to MakeArrayOfNull
+ return Status::OK();
}
Status Visit(const BooleanType&) {
@@ -510,29 +510,29 @@ class RepeatedArrayFactory {
}
template <typename T>
- enable_if_t<is_number_type<T>::value || is_temporal_type<T>::value, Status> Visit(
- const T&) {
+ enable_if_t<is_number_type<T>::value || is_temporal_type<T>::value, Status> Visit(
+ const T&) {
auto value = checked_cast<const typename TypeTraits<T>::ScalarType&>(scalar_).value;
return FinishFixedWidth(&value, sizeof(value));
}
- Status Visit(const FixedSizeBinaryType& type) {
- auto value = checked_cast<const FixedSizeBinaryScalar&>(scalar_).value;
- return FinishFixedWidth(value->data(), type.byte_width());
- }
-
- template <typename T>
- enable_if_decimal<T, Status> Visit(const T&) {
- using ScalarType = typename TypeTraits<T>::ScalarType;
- auto value = checked_cast<const ScalarType&>(scalar_).value.ToBytes();
- return FinishFixedWidth(value.data(), value.size());
- }
-
- Status Visit(const Decimal256Type&) {
- auto value = checked_cast<const Decimal256Scalar&>(scalar_).value.ToBytes();
+ Status Visit(const FixedSizeBinaryType& type) {
+ auto value = checked_cast<const FixedSizeBinaryScalar&>(scalar_).value;
+ return FinishFixedWidth(value->data(), type.byte_width());
+ }
+
+ template <typename T>
+ enable_if_decimal<T, Status> Visit(const T&) {
+ using ScalarType = typename TypeTraits<T>::ScalarType;
+ auto value = checked_cast<const ScalarType&>(scalar_).value.ToBytes();
return FinishFixedWidth(value.data(), value.size());
}
+ Status Visit(const Decimal256Type&) {
+ auto value = checked_cast<const Decimal256Scalar&>(scalar_).value.ToBytes();
+ return FinishFixedWidth(value.data(), value.size());
+ }
+
template <typename T>
enable_if_base_binary<T, Status> Visit(const T&) {
std::shared_ptr<Buffer> value =
@@ -613,18 +613,18 @@ class RepeatedArrayFactory {
return Status::OK();
}
- Status Visit(const ExtensionType& type) {
- return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
- }
-
- Status Visit(const DenseUnionType& type) {
- return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
- }
-
- Status Visit(const SparseUnionType& type) {
- return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
- }
-
+ Status Visit(const ExtensionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
+ Status Visit(const DenseUnionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
+ Status Visit(const SparseUnionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
template <typename OffsetType>
Status CreateOffsetsBuffer(OffsetType value_length, std::shared_ptr<Buffer>* out) {
TypedBufferBuilder<OffsetType> builder(pool_);
@@ -660,11 +660,11 @@ class RepeatedArrayFactory {
std::shared_ptr<Array> out_;
};
-} // namespace
+} // namespace
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
int64_t length, MemoryPool* pool) {
- ARROW_ASSIGN_OR_RAISE(auto data, NullArrayFactory(pool, type, length).Create());
+ ARROW_ASSIGN_OR_RAISE(auto data, NullArrayFactory(pool, type, length).Create());
return MakeArray(data);
}
@@ -673,7 +673,7 @@ Result<std::shared_ptr<Array>> MakeArrayFromScalar(const Scalar& scalar, int64_t
if (!scalar.is_valid) {
return MakeArrayOfNull(scalar.type, length, pool);
}
- return RepeatedArrayFactory(pool, scalar, length).Create();
+ return RepeatedArrayFactory(pool, scalar, length).Create();
}
namespace internal {
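The two factories above pair up: MakeArrayFromScalar delegates invalid scalars to MakeArrayOfNull. A minimal sketch, assuming the public arrow::MakeArrayFromScalar API:

#include <arrow/array/util.h>
#include <arrow/scalar.h>

arrow::Result<std::shared_ptr<arrow::Array>> Repeat42(int64_t n) {
  arrow::Int32Scalar value(42);
  // A null scalar would instead produce n null slots via MakeArrayOfNull.
  return arrow::MakeArrayFromScalar(value, n);
}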
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
index 3ef4e08828f..ac71c6d8570 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
@@ -56,17 +56,17 @@ Result<std::shared_ptr<Array>> MakeArrayFromScalar(
namespace internal {
-/// \brief Swap endian of each element in a generic ArrayData
-///
-/// As dictionaries are often shared between different arrays, dictionaries
-/// are not swapped by this function and should be handled separately.
-///
-/// \param[in] data the array contents
-/// \return the resulting ArrayData whose elements were swapped
-ARROW_EXPORT
-Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
- const std::shared_ptr<ArrayData>& data);
-
+/// \brief Swap the endianness of each element in a generic ArrayData
+///
+/// As dictionaries are often shared between different arrays, dictionaries
+/// are not swapped by this function and should be handled separately.
+///
+/// \param[in] data the array contents
+/// \return the resulting ArrayData whose elements were swapped
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
+ const std::shared_ptr<ArrayData>& data);
+
/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array. Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically. It is mandatory that
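A sketch of calling the internal helper documented above (SwapEndianArrayData lives in arrow::internal; data is an assumed std::shared_ptr<arrow::ArrayData> with offset == 0, since non-zero offsets are rejected):

ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ArrayData> swapped,
                      arrow::internal::SwapEndianArrayData(data));
// Fixed-width values are byte-swapped into a fresh buffer; validity
// bitmaps are shared, and dictionaries must be swapped separately.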
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
index 5cc3bacf282..5adc18bd495 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
@@ -23,12 +23,12 @@
#include "arrow/extension_type.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
-#include "arrow/util/utf8.h"
+#include "arrow/util/utf8.h"
#include "arrow/visitor_inline.h"
namespace arrow {
@@ -39,172 +39,172 @@ namespace internal {
namespace {
-struct ValidateArrayImpl {
- const ArrayData& data;
+struct ValidateArrayImpl {
+ const ArrayData& data;
- Status Validate() { return ValidateWithType(*data.type); }
-
- Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
-
- Status Visit(const NullType&) {
- if (data.null_count != data.length) {
- return Status::Invalid("Null array null_count unequal to its length");
+ Status Validate() { return ValidateWithType(*data.type); }
+
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Visit(const NullType&) {
+ if (data.null_count != data.length) {
+ return Status::Invalid("Null array null_count unequal to its length");
}
return Status::OK();
}
- Status Visit(const FixedWidthType&) {
- if (data.length > 0) {
- if (!IsBufferValid(1)) {
- return Status::Invalid("Missing values buffer in non-empty array");
- }
+ Status Visit(const FixedWidthType&) {
+ if (data.length > 0) {
+ if (!IsBufferValid(1)) {
+ return Status::Invalid("Missing values buffer in non-empty array");
+ }
}
return Status::OK();
}
- Status Visit(const StringType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const StringType& type) { return ValidateBinaryLike(type); }
- Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
- Status Visit(const LargeStringType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const LargeStringType& type) { return ValidateBinaryLike(type); }
- Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
- Status Visit(const ListType& type) { return ValidateListLike(type); }
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
- Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
- Status Visit(const MapType& type) { return ValidateListLike(type); }
-
- Status Visit(const FixedSizeListType& type) {
- const ArrayData& values = *data.child_data[0];
- const int64_t list_size = type.list_size();
- if (list_size < 0) {
- return Status::Invalid("Fixed size list has negative list size");
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& values = *data.child_data[0];
+ const int64_t list_size = type.list_size();
+ if (list_size < 0) {
+ return Status::Invalid("Fixed size list has negative list size");
}
int64_t expected_values_length = -1;
- if (MultiplyWithOverflow(data.length, list_size, &expected_values_length) ||
- values.length != expected_values_length) {
- return Status::Invalid("Values length (", values.length,
- ") is not equal to the length (", data.length,
- ") multiplied by the value size (", list_size, ")");
- }
-
- const Status child_valid = ValidateArray(values);
- if (!child_valid.ok()) {
- return Status::Invalid("Fixed size list child array invalid: ",
- child_valid.ToString());
- }
-
+ if (MultiplyWithOverflow(data.length, list_size, &expected_values_length) ||
+ values.length != expected_values_length) {
+ return Status::Invalid("Values length (", values.length,
+ ") is not equal to the length (", data.length,
+ ") multiplied by the value size (", list_size, ")");
+ }
+
+ const Status child_valid = ValidateArray(values);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+
return Status::OK();
}
- Status Visit(const StructType& type) {
- for (int i = 0; i < type.num_fields(); ++i) {
- const auto& field_data = *data.child_data[i];
+ Status Visit(const StructType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ const auto& field_data = *data.child_data[i];
- // Validate child first, to catch nonsensical length / offset etc.
- const Status field_valid = ValidateArray(field_data);
- if (!field_valid.ok()) {
+ // Validate child first, to catch nonsensical length / offset etc.
+ const Status field_valid = ValidateArray(field_data);
+ if (!field_valid.ok()) {
return Status::Invalid("Struct child array #", i,
- " invalid: ", field_valid.ToString());
+ " invalid: ", field_valid.ToString());
}
- if (field_data.length < data.length + data.offset) {
+ if (field_data.length < data.length + data.offset) {
return Status::Invalid("Struct child array #", i,
- " has length smaller than expected for struct array (",
- field_data.length, " < ", data.length + data.offset, ")");
+ " has length smaller than expected for struct array (",
+ field_data.length, " < ", data.length + data.offset, ")");
}
- const auto& field_type = type.field(i)->type();
- if (!field_data.type->Equals(*field_type)) {
- return Status::Invalid("Struct child array #", i, " does not match type field: ",
- field_data.type->ToString(), " vs ",
- field_type->ToString());
+ const auto& field_type = type.field(i)->type();
+ if (!field_data.type->Equals(*field_type)) {
+ return Status::Invalid("Struct child array #", i, " does not match type field: ",
+ field_data.type->ToString(), " vs ",
+ field_type->ToString());
}
}
return Status::OK();
}
- Status Visit(const UnionType& type) {
- for (int i = 0; i < type.num_fields(); ++i) {
- const auto& field_data = *data.child_data[i];
-
- // Validate child first, to catch nonsensical length / offset etc.
- const Status field_valid = ValidateArray(field_data);
- if (!field_valid.ok()) {
- return Status::Invalid("Union child array #", i,
- " invalid: ", field_valid.ToString());
+ Status Visit(const UnionType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ const auto& field_data = *data.child_data[i];
+
+ // Validate child first, to catch nonsensical length / offset etc.
+ const Status field_valid = ValidateArray(field_data);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Union child array #", i,
+ " invalid: ", field_valid.ToString());
}
- if (type.mode() == UnionMode::SPARSE &&
- field_data.length < data.length + data.offset) {
+ if (type.mode() == UnionMode::SPARSE &&
+ field_data.length < data.length + data.offset) {
return Status::Invalid("Sparse union child array #", i,
- " has length smaller than expected for union array (",
- field_data.length, " < ", data.length + data.offset, ")");
+ " has length smaller than expected for union array (",
+ field_data.length, " < ", data.length + data.offset, ")");
}
- const auto& field_type = type.field(i)->type();
- if (!field_data.type->Equals(*field_type)) {
- return Status::Invalid("Union child array #", i, " does not match type field: ",
- field_data.type->ToString(), " vs ",
- field_type->ToString());
+ const auto& field_type = type.field(i)->type();
+ if (!field_data.type->Equals(*field_type)) {
+ return Status::Invalid("Union child array #", i, " does not match type field: ",
+ field_data.type->ToString(), " vs ",
+ field_type->ToString());
}
}
return Status::OK();
}
- Status Visit(const DictionaryType& type) {
- Type::type index_type_id = type.index_type()->id();
+ Status Visit(const DictionaryType& type) {
+ Type::type index_type_id = type.index_type()->id();
if (!is_integer(index_type_id)) {
return Status::Invalid("Dictionary indices must be integer type");
}
- if (!data.dictionary) {
+ if (!data.dictionary) {
return Status::Invalid("Dictionary values must be non-null");
}
- const Status dict_valid = ValidateArray(*data.dictionary);
+ const Status dict_valid = ValidateArray(*data.dictionary);
if (!dict_valid.ok()) {
return Status::Invalid("Dictionary array invalid: ", dict_valid.ToString());
}
- // Visit indices
- return ValidateWithType(*type.index_type());
- }
-
- Status Visit(const ExtensionType& type) {
- // Visit storage
- return ValidateWithType(*type.storage_type());
+ // Visit indices
+ return ValidateWithType(*type.index_type());
}
- private:
- bool IsBufferValid(int index) { return IsBufferValid(data, index); }
+ Status Visit(const ExtensionType& type) {
+ // Visit storage
+ return ValidateWithType(*type.storage_type());
+ }
- static bool IsBufferValid(const ArrayData& data, int index) {
- return data.buffers[index] != nullptr && data.buffers[index]->address() != 0;
+ private:
+ bool IsBufferValid(int index) { return IsBufferValid(data, index); }
+
+ static bool IsBufferValid(const ArrayData& data, int index) {
+ return data.buffers[index] != nullptr && data.buffers[index]->address() != 0;
}
- template <typename BinaryType>
- Status ValidateBinaryLike(const BinaryType& type) {
- if (!IsBufferValid(2)) {
- return Status::Invalid("Value data buffer is null");
+ template <typename BinaryType>
+ Status ValidateBinaryLike(const BinaryType& type) {
+ if (!IsBufferValid(2)) {
+ return Status::Invalid("Value data buffer is null");
}
- // First validate offsets, to make sure the accesses below are valid
- RETURN_NOT_OK(ValidateOffsets(type));
-
- if (data.length > 0 && data.buffers[1]->is_cpu()) {
- using offset_type = typename BinaryType::offset_type;
+ // First validate offsets, to make sure the accesses below are valid
+ RETURN_NOT_OK(ValidateOffsets(type));
- const auto offsets = data.GetValues<offset_type>(1);
- const Buffer& values = *data.buffers[2];
-
- const auto first_offset = offsets[0];
- const auto last_offset = offsets[data.length];
+ if (data.length > 0 && data.buffers[1]->is_cpu()) {
+ using offset_type = typename BinaryType::offset_type;
+
+ const auto offsets = data.GetValues<offset_type>(1);
+ const Buffer& values = *data.buffers[2];
+
+ const auto first_offset = offsets[0];
+ const auto last_offset = offsets[data.length];
// This early test avoids undefined behaviour when computing `data_extent`
if (first_offset < 0 || last_offset < 0) {
return Status::Invalid("Negative offsets in binary array");
}
const auto data_extent = last_offset - first_offset;
- const auto values_length = values.size();
+ const auto values_length = values.size();
if (values_length < data_extent) {
return Status::Invalid("Length spanned by binary offsets (", data_extent,
") larger than values array (size ", values_length, ")");
@@ -221,27 +221,27 @@ struct ValidateArrayImpl {
return Status::OK();
}
- template <typename ListType>
- Status ValidateListLike(const ListType& type) {
+ template <typename ListType>
+ Status ValidateListLike(const ListType& type) {
// First validate offsets, to make sure the accesses below are valid
- RETURN_NOT_OK(ValidateOffsets(type));
-
- const ArrayData& values = *data.child_data[0];
+ RETURN_NOT_OK(ValidateOffsets(type));
+ const ArrayData& values = *data.child_data[0];
+
// An empty list array can have 0 offsets
- if (data.length > 0 && data.buffers[1]->is_cpu()) {
- using offset_type = typename ListType::offset_type;
-
- const auto offsets = data.GetValues<offset_type>(1);
-
- const auto first_offset = offsets[0];
- const auto last_offset = offsets[data.length];
+ if (data.length > 0 && data.buffers[1]->is_cpu()) {
+ using offset_type = typename ListType::offset_type;
+
+ const auto offsets = data.GetValues<offset_type>(1);
+
+ const auto first_offset = offsets[0];
+ const auto last_offset = offsets[data.length];
// This early test avoids undefined behaviour when computing `data_extent`
if (first_offset < 0 || last_offset < 0) {
return Status::Invalid("Negative offsets in list array");
}
const auto data_extent = last_offset - first_offset;
- const auto values_length = values.length;
+ const auto values_length = values.length;
if (values_length < data_extent) {
return Status::Invalid("Length spanned by list offsets (", data_extent,
") larger than values array (length ", values_length, ")");
@@ -256,32 +256,32 @@ struct ValidateArrayImpl {
}
}
- const Status child_valid = ValidateArray(values);
+ const Status child_valid = ValidateArray(values);
if (!child_valid.ok()) {
return Status::Invalid("List child array invalid: ", child_valid.ToString());
}
return Status::OK();
}
- template <typename TypeClass>
- Status ValidateOffsets(const TypeClass& type) {
- using offset_type = typename TypeClass::offset_type;
+ template <typename TypeClass>
+ Status ValidateOffsets(const TypeClass& type) {
+ using offset_type = typename TypeClass::offset_type;
- const Buffer* offsets = data.buffers[1].get();
- if (offsets == nullptr) {
- // For length 0, an empty offsets buffer seems accepted as a special case
- // (ARROW-544)
- if (data.length > 0) {
- return Status::Invalid("Non-empty array but offsets are null");
+ const Buffer* offsets = data.buffers[1].get();
+ if (offsets == nullptr) {
+ // For length 0, an empty offsets buffer seems accepted as a special case
+ // (ARROW-544)
+ if (data.length > 0) {
+ return Status::Invalid("Non-empty array but offsets are null");
}
return Status::OK();
}
// An empty list array can have 0 offsets
- auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0;
- if (offsets->size() / static_cast<int32_t>(sizeof(offset_type)) < required_offsets) {
- return Status::Invalid("Offsets buffer size (bytes): ", offsets->size(),
- " isn't large enough for length: ", data.length);
+ auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0;
+ if (offsets->size() / static_cast<int32_t>(sizeof(offset_type)) < required_offsets) {
+ return Status::Invalid("Offsets buffer size (bytes): ", offsets->size(),
+ " isn't large enough for length: ", data.length);
}
return Status::OK();
@@ -291,12 +291,12 @@ struct ValidateArrayImpl {
} // namespace
ARROW_EXPORT
-Status ValidateArray(const ArrayData& data) {
- // First check the data layout conforms to the spec
- const DataType& type = *data.type;
+Status ValidateArray(const ArrayData& data) {
+ // First check the data layout conforms to the spec
+ const DataType& type = *data.type;
const auto layout = type.layout();
- if (data.length < 0) {
+ if (data.length < 0) {
return Status::Invalid("Array length is negative");
}
@@ -306,14 +306,14 @@ Status ValidateArray(const ArrayData& data) {
"of type ",
type.ToString(), ", got ", data.buffers.size());
}
-
+
// This check is required to avoid addition overflow below
int64_t length_plus_offset = -1;
- if (AddWithOverflow(data.length, data.offset, &length_plus_offset)) {
+ if (AddWithOverflow(data.length, data.offset, &length_plus_offset)) {
return Status::Invalid("Array of type ", type.ToString(),
" has impossibly large length and offset");
}
-
+
for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) {
const auto& buffer = data.buffers[i];
const auto& spec = layout.buffers[i];
@@ -340,7 +340,7 @@ Status ValidateArray(const ArrayData& data) {
}
if (buffer->size() < min_buffer_size) {
return Status::Invalid("Buffer #", i, " too small in array of type ",
- type.ToString(), " and length ", data.length,
+ type.ToString(), " and length ", data.length,
": expected at least ", min_buffer_size, " byte(s), got ",
buffer->size());
}
@@ -352,12 +352,12 @@ Status ValidateArray(const ArrayData& data) {
// Check null_count() *after* validating the buffer sizes, to avoid
// reading out of bounds.
- if (data.null_count > data.length) {
+ if (data.null_count > data.length) {
return Status::Invalid("Null count exceeds array length");
}
- if (data.null_count < 0 && data.null_count != kUnknownNullCount) {
- return Status::Invalid("Negative null count");
- }
+ if (data.null_count < 0 && data.null_count != kUnknownNullCount) {
+ return Status::Invalid("Negative null count");
+ }
if (type.id() != Type::EXTENSION) {
if (data.child_data.size() != static_cast<size_t>(type.num_fields())) {
@@ -376,142 +376,142 @@ Status ValidateArray(const ArrayData& data) {
type.ToString());
}
- ValidateArrayImpl validator{data};
- return validator.Validate();
+ ValidateArrayImpl validator{data};
+ return validator.Validate();
}
-ARROW_EXPORT
-Status ValidateArray(const Array& array) { return ValidateArray(*array.data()); }
-
+ARROW_EXPORT
+Status ValidateArray(const Array& array) { return ValidateArray(*array.data()); }
+
///////////////////////////////////////////////////////////////////////////
-// ValidateArrayFull: expensive validation checks
+// ValidateArrayFull: expensive validation checks
namespace {
-struct UTF8DataValidator {
- const ArrayData& data;
+struct UTF8DataValidator {
+ const ArrayData& data;
- Status Visit(const DataType&) {
+ Status Visit(const DataType&) {
// Default, should be unreachable
return Status::NotImplemented("");
}
- template <typename StringType>
- enable_if_string<StringType, Status> Visit(const StringType&) {
- util::InitializeUTF8();
-
- int64_t i = 0;
- return VisitArrayDataInline<StringType>(
- data,
- [&](util::string_view v) {
- if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
- return Status::Invalid("Invalid UTF8 sequence at string index ", i);
- }
- ++i;
- return Status::OK();
- },
- [&]() {
- ++i;
- return Status::OK();
- });
+ template <typename StringType>
+ enable_if_string<StringType, Status> Visit(const StringType&) {
+ util::InitializeUTF8();
+
+ int64_t i = 0;
+ return VisitArrayDataInline<StringType>(
+ data,
+ [&](util::string_view v) {
+ if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ", i);
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
};
-struct BoundsChecker {
- const ArrayData& data;
- int64_t min_value;
- int64_t max_value;
-
- Status Visit(const DataType&) {
- // Default, should be unreachable
- return Status::NotImplemented("");
- }
-
- template <typename IntegerType>
- enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
- using c_type = typename IntegerType::c_type;
-
- int64_t i = 0;
- return VisitArrayDataInline<IntegerType>(
- data,
- [&](c_type value) {
- const auto v = static_cast<int64_t>(value);
- if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
- return Status::Invalid("Value at position ", i, " out of bounds: ", v,
- " (should be in [", min_value, ", ", max_value, "])");
- }
- ++i;
- return Status::OK();
- },
- [&]() {
- ++i;
- return Status::OK();
- });
- }
-};
-
-struct ValidateArrayFullImpl {
- const ArrayData& data;
-
- Status Validate() { return ValidateWithType(*data.type); }
-
- Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
-
- Status Visit(const NullType& type) { return Status::OK(); }
-
- Status Visit(const FixedWidthType& type) { return Status::OK(); }
-
- Status Visit(const StringType& type) {
- RETURN_NOT_OK(ValidateBinaryLike(type));
- return ValidateUTF8(data);
- }
-
- Status Visit(const LargeStringType& type) {
- RETURN_NOT_OK(ValidateBinaryLike(type));
- return ValidateUTF8(data);
- }
-
- Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
-
- Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
-
- Status Visit(const ListType& type) { return ValidateListLike(type); }
-
- Status Visit(const LargeListType& type) { return ValidateListLike(type); }
-
- Status Visit(const MapType& type) { return ValidateListLike(type); }
-
- Status Visit(const FixedSizeListType& type) {
- const ArrayData& child = *data.child_data[0];
- const Status child_valid = ValidateArrayFull(child);
- if (!child_valid.ok()) {
- return Status::Invalid("Fixed size list child array invalid: ",
- child_valid.ToString());
- }
- return Status::OK();
- }
-
- Status Visit(const StructType& type) {
- // Validate children
- for (int64_t i = 0; i < type.num_fields(); ++i) {
- const ArrayData& field = *data.child_data[i];
- const Status field_valid = ValidateArrayFull(field);
- if (!field_valid.ok()) {
- return Status::Invalid("Struct child array #", i,
- " invalid: ", field_valid.ToString());
+struct BoundsChecker {
+ const ArrayData& data;
+ int64_t min_value;
+ int64_t max_value;
+
+ Status Visit(const DataType&) {
+ // Default, should be unreachable
+ return Status::NotImplemented("");
+ }
+
+ template <typename IntegerType>
+ enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
+ using c_type = typename IntegerType::c_type;
+
+ int64_t i = 0;
+ return VisitArrayDataInline<IntegerType>(
+ data,
+ [&](c_type value) {
+ const auto v = static_cast<int64_t>(value);
+ if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
+ return Status::Invalid("Value at position ", i, " out of bounds: ", v,
+ " (should be in [", min_value, ", ", max_value, "])");
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
+ }
+};
+
+struct ValidateArrayFullImpl {
+ const ArrayData& data;
+
+ Status Validate() { return ValidateWithType(*data.type); }
+
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Visit(const NullType& type) { return Status::OK(); }
+
+ Status Visit(const FixedWidthType& type) { return Status::OK(); }
+
+ Status Visit(const StringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
+
+ Status Visit(const LargeStringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
+
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Struct child array #", i,
+ " invalid: ", field_valid.ToString());
}
- }
- return Status::OK();
- }
-
- Status Visit(const UnionType& type) {
- const auto& child_ids = type.child_ids();
- const auto& type_codes_map = type.type_codes();
-
- const int8_t* type_codes = data.GetValues<int8_t>(1);
-
- for (int64_t i = 0; i < data.length; ++i) {
- // Note that union arrays never have top-level nulls
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const auto& type_codes_map = type.type_codes();
+
+ const int8_t* type_codes = data.GetValues<int8_t>(1);
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ // Note that union arrays never have top-level nulls
const int32_t code = type_codes[i];
if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) {
return Status::Invalid("Union value at position ", i, " has invalid type id ",
@@ -519,17 +519,17 @@ struct ValidateArrayFullImpl {
}
}
- if (type.mode() == UnionMode::DENSE) {
+ if (type.mode() == UnionMode::DENSE) {
// Map logical type id to child length
std::vector<int64_t> child_lengths(256);
- for (int child_id = 0; child_id < type.num_fields(); ++child_id) {
- child_lengths[type_codes_map[child_id]] = data.child_data[child_id]->length;
+ for (int child_id = 0; child_id < type.num_fields(); ++child_id) {
+ child_lengths[type_codes_map[child_id]] = data.child_data[child_id]->length;
}
- // Check offsets are in bounds
- std::vector<int64_t> last_child_offsets(256, 0);
- const int32_t* offsets = data.GetValues<int32_t>(2);
- for (int64_t i = 0; i < data.length; ++i) {
+ // Check offsets are in bounds
+ std::vector<int64_t> last_child_offsets(256, 0);
+ const int32_t* offsets = data.GetValues<int32_t>(2);
+ for (int64_t i = 0; i < data.length; ++i) {
const int32_t code = type_codes[i];
const int32_t offset = offsets[i];
if (offset < 0) {
@@ -542,78 +542,78 @@ struct ValidateArrayFullImpl {
"than child length (",
offset, " >= ", child_lengths[code], ")");
}
- if (offset < last_child_offsets[code]) {
- return Status::Invalid("Union value at position ", i,
- " has non-monotonic offset ", offset);
- }
- last_child_offsets[code] = offset;
- }
- }
-
- // Validate children
- for (int64_t i = 0; i < type.num_fields(); ++i) {
- const ArrayData& field = *data.child_data[i];
- const Status field_valid = ValidateArrayFull(field);
- if (!field_valid.ok()) {
- return Status::Invalid("Union child array #", i,
- " invalid: ", field_valid.ToString());
+ if (offset < last_child_offsets[code]) {
+ return Status::Invalid("Union value at position ", i,
+ " has non-monotonic offset ", offset);
+ }
+ last_child_offsets[code] = offset;
}
}
+
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Union child array #", i,
+ " invalid: ", field_valid.ToString());
+ }
+ }
return Status::OK();
}
- Status Visit(const DictionaryType& type) {
+ Status Visit(const DictionaryType& type) {
const Status indices_status =
- CheckBounds(*type.index_type(), 0, data.dictionary->length - 1);
+ CheckBounds(*type.index_type(), 0, data.dictionary->length - 1);
if (!indices_status.ok()) {
return Status::Invalid("Dictionary indices invalid: ", indices_status.ToString());
}
- return ValidateArrayFull(*data.dictionary);
+ return ValidateArrayFull(*data.dictionary);
}
- Status Visit(const ExtensionType& type) {
- return ValidateWithType(*type.storage_type());
+ Status Visit(const ExtensionType& type) {
+ return ValidateWithType(*type.storage_type());
}
protected:
- template <typename BinaryType>
- Status ValidateBinaryLike(const BinaryType& type) {
- const auto& data_buffer = data.buffers[2];
- if (data_buffer == nullptr) {
- return Status::Invalid("Binary data buffer is null");
+ template <typename BinaryType>
+ Status ValidateBinaryLike(const BinaryType& type) {
+ const auto& data_buffer = data.buffers[2];
+ if (data_buffer == nullptr) {
+ return Status::Invalid("Binary data buffer is null");
}
- return ValidateOffsets(type, data_buffer->size());
+ return ValidateOffsets(type, data_buffer->size());
}
- template <typename ListType>
- Status ValidateListLike(const ListType& type) {
- const ArrayData& child = *data.child_data[0];
- const Status child_valid = ValidateArrayFull(child);
+ template <typename ListType>
+ Status ValidateListLike(const ListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
if (!child_valid.ok()) {
return Status::Invalid("List child array invalid: ", child_valid.ToString());
}
- return ValidateOffsets(type, child.offset + child.length);
+ return ValidateOffsets(type, child.offset + child.length);
}
- template <typename TypeClass>
- Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
- using offset_type = typename TypeClass::offset_type;
- if (data.length == 0) {
+ template <typename TypeClass>
+ Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+ using offset_type = typename TypeClass::offset_type;
+ if (data.length == 0) {
return Status::OK();
}
-
- const offset_type* offsets = data.GetValues<offset_type>(1);
- if (offsets == nullptr) {
- return Status::Invalid("Non-empty array but offsets are null");
+
+ const offset_type* offsets = data.GetValues<offset_type>(1);
+ if (offsets == nullptr) {
+ return Status::Invalid("Non-empty array but offsets are null");
}
- auto prev_offset = offsets[0];
+ auto prev_offset = offsets[0];
if (prev_offset < 0) {
- return Status::Invalid("Offset invariant failure: array starts at negative offset ",
- prev_offset);
+ return Status::Invalid("Offset invariant failure: array starts at negative offset ",
+ prev_offset);
}
- for (int64_t i = 1; i <= data.length; ++i) {
- const auto current_offset = offsets[i];
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
if (current_offset < prev_offset) {
return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ",
i, ": ", current_offset, " < ", prev_offset);
@@ -627,31 +627,31 @@ struct ValidateArrayFullImpl {
return Status::OK();
}
- Status CheckBounds(const DataType& type, int64_t min_value, int64_t max_value) {
- BoundsChecker checker{data, min_value, max_value};
- return VisitTypeInline(type, &checker);
+ Status CheckBounds(const DataType& type, int64_t min_value, int64_t max_value) {
+ BoundsChecker checker{data, min_value, max_value};
+ return VisitTypeInline(type, &checker);
}
};
} // namespace
ARROW_EXPORT
-Status ValidateArrayFull(const ArrayData& data) {
- return ValidateArrayFullImpl{data}.Validate();
-}
-
-ARROW_EXPORT
-Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.data()); }
-
-ARROW_EXPORT
-Status ValidateUTF8(const ArrayData& data) {
- DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING);
- UTF8DataValidator validator{data};
- return VisitTypeInline(*data.type, &validator);
+Status ValidateArrayFull(const ArrayData& data) {
+ return ValidateArrayFullImpl{data}.Validate();
}
-ARROW_EXPORT
-Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
-
+ARROW_EXPORT
+Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.data()); }
+
+ARROW_EXPORT
+Status ValidateUTF8(const ArrayData& data) {
+ DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING);
+ UTF8DataValidator validator{data};
+ return VisitTypeInline(*data.type, &validator);
+}
+
+ARROW_EXPORT
+Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
+
} // namespace internal
} // namespace arrow
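
The validate.cc hunks above restore Arrow's two-tier array validation: cheap O(1) metadata checks (buffer counts and sizes, offset sanity) in ValidateArray(), and O(N) data checks (UTF-8 well-formedness, offset monotonicity, dense-union offsets, dictionary index bounds) in ValidateArrayFull(). A minimal sketch of the binary-offsets check firing, assuming the public upstream entry points (Buffer::FromString, ArrayData::Make, MakeArray, Array::Validate) behave as in stock Arrow:

#include <arrow/api.h>

#include <cstdint>
#include <iostream>
#include <string>

int main() {
  // Offsets [0, 3, 10] claim 10 bytes of character data, but the values
  // buffer holds only 5, so validation must reject the array.
  const int32_t offsets[] = {0, 3, 10};
  auto offsets_buf = arrow::Buffer::FromString(
      std::string(reinterpret_cast<const char*>(offsets), sizeof(offsets)));
  auto values_buf = arrow::Buffer::FromString("abcde");

  auto data = arrow::ArrayData::Make(arrow::utf8(), /*length=*/2,
                                     {nullptr, offsets_buf, values_buf});
  // Expected (per ValidateBinaryLike above): "Length spanned by binary
  // offsets (10) larger than values array (size 5)".
  std::cout << arrow::MakeArray(data)->Validate().ToString() << std::endl;
  return 0;
}
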
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
index cae3e16b3c5..7e07100e1fc 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
@@ -18,7 +18,7 @@
#pragma once
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
+#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -26,30 +26,30 @@ namespace internal {
// Internal functions implementing Array::Validate() and friends.
-// O(1) array metadata validation
-
+// O(1) array metadata validation
+
ARROW_EXPORT
Status ValidateArray(const Array& array);
ARROW_EXPORT
-Status ValidateArray(const ArrayData& data);
-
-// O(N) array data validation.
-// Note the "full" routines don't validate metadata. It should be done
-// beforehand using ValidateArray(), otherwise invalid memory accesses
-// may occur.
-
-ARROW_EXPORT
-Status ValidateArrayFull(const Array& array);
-
-ARROW_EXPORT
-Status ValidateArrayFull(const ArrayData& data);
-
-ARROW_EXPORT
-Status ValidateUTF8(const Array& array);
-
-ARROW_EXPORT
-Status ValidateUTF8(const ArrayData& data);
-
+Status ValidateArray(const ArrayData& data);
+
+// O(N) array data validation.
+// Note the "full" routines don't validate metadata. It should be done
+// beforehand using ValidateArray(), otherwise invalid memory accesses
+// may occur.
+
+ARROW_EXPORT
+Status ValidateArrayFull(const Array& array);
+
+ARROW_EXPORT
+Status ValidateArrayFull(const ArrayData& data);
+
+ARROW_EXPORT
+Status ValidateUTF8(const Array& array);
+
+ARROW_EXPORT
+Status ValidateUTF8(const ArrayData& data);
+
} // namespace internal
} // namespace arrow
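
The header comments spell out the contract between the two tiers: the "full" routines assume the metadata was already validated. A sketch of the intended call order for these internal entry points (illustrative only; whether arrow::internal symbols are reachable depends on how the library is consumed):

#include <arrow/array/validate.h>

// O(1) structural checks first; the O(N) pass assumes valid metadata and,
// per the comment above, may otherwise perform invalid memory accesses.
arrow::Status CheckThoroughly(const arrow::Array& array) {
  ARROW_RETURN_NOT_OK(arrow::internal::ValidateArray(array));
  return arrow::internal::ValidateArrayFull(array);
}
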
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
index 6c47a464b1d..7d71846d9ab 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
@@ -56,13 +56,13 @@ class ARROW_EXPORT Buffer {
///
/// \note The passed memory must be kept alive through some other means
Buffer(const uint8_t* data, int64_t size)
- : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) {
+ : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) {
SetMemoryManager(default_cpu_memory_manager());
}
Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
std::shared_ptr<Buffer> parent = NULLPTR)
- : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) {
+ : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) {
SetMemoryManager(std::move(mm));
}
@@ -121,7 +121,7 @@ class ARROW_EXPORT Buffer {
#endif
// A zero-capacity buffer can have a null data pointer
if (capacity_ != 0) {
- memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
+ memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
}
}
@@ -195,8 +195,8 @@ class ARROW_EXPORT Buffer {
CheckCPU();
CheckMutable();
#endif
- return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
- : NULLPTR;
+ return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
+ : NULLPTR;
}
/// \brief Return the device address of the buffer's data
@@ -210,7 +210,7 @@ class ARROW_EXPORT Buffer {
#ifndef NDEBUG
CheckMutable();
#endif
- return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
+ return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
}
/// \brief Return the buffer's size in bytes
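
The buffer.h hunks touch the non-owning constructors and the mutable accessors. A small sketch of the view semantics, relying only on what the constructor comment above states (the caller keeps the bytes alive):

#include <arrow/buffer.h>

#include <cstdint>
#include <iostream>

int main() {
  static const uint8_t bytes[] = {'a', 'r', 'r', 'o', 'w'};
  arrow::Buffer view(bytes, sizeof(bytes));  // non-owning, immutable view
  std::cout << view.size() << " bytes, mutable: " << view.is_mutable()
            << std::endl;
  // mutable_data() would return NULLPTR here (and assert in debug builds),
  // since the buffer was constructed immutable.
  return 0;
}
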
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
index c6250ae2b76..cebaa5db510 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
@@ -64,10 +64,10 @@ class ARROW_EXPORT BufferBuilder {
/// \brief Resize the buffer to the nearest multiple of 64 bytes
///
  /// \param new_capacity the new capacity of the builder. Will be
- /// rounded up to a multiple of 64 bytes for padding
- /// \param shrink_to_fit if new capacity is smaller than the existing,
- /// reallocate internal buffer. Set to false to avoid reallocations when
- /// shrinking the builder.
+ /// rounded up to a multiple of 64 bytes for padding
+ /// \param shrink_to_fit if new capacity is smaller than the existing,
+ /// reallocate internal buffer. Set to false to avoid reallocations when
+ /// shrinking the builder.
/// \return Status
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
if (buffer_ == NULLPTR) {
@@ -159,23 +159,23 @@ class ARROW_EXPORT BufferBuilder {
return Status::OK();
}
- Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
- std::shared_ptr<Buffer> out;
- ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
- return out;
- }
-
- /// \brief Like Finish, but override the final buffer size
- ///
- /// This is useful after writing data directly into the builder memory
- /// without calling the Append methods (basically, when using BufferBuilder
- /// mostly for memory allocation).
- Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
- bool shrink_to_fit = true) {
- size_ = final_length;
- return Finish(shrink_to_fit);
- }
-
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using BufferBuilder
+ /// mostly for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ size_ = final_length;
+ return Finish(shrink_to_fit);
+ }
+
void Reset() {
buffer_ = NULLPTR;
capacity_ = size_ = 0;
@@ -216,11 +216,11 @@ class TypedBufferBuilder<
MemoryPool* pool = default_memory_pool())
: bytes_builder_(std::move(buffer), pool) {}
- explicit TypedBufferBuilder(BufferBuilder builder)
- : bytes_builder_(std::move(builder)) {}
-
- BufferBuilder* bytes_builder() { return &bytes_builder_; }
-
+ explicit TypedBufferBuilder(BufferBuilder builder)
+ : bytes_builder_(std::move(builder)) {}
+
+ BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
Status Append(T value) {
return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
}
@@ -275,22 +275,22 @@ class TypedBufferBuilder<
return bytes_builder_.Finish(out, shrink_to_fit);
}
- Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
- std::shared_ptr<Buffer> out;
- ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
- return out;
- }
-
- /// \brief Like Finish, but override the final buffer size
- ///
- /// This is useful after writing data directly into the builder memory
- /// without calling the Append methods (basically, when using TypedBufferBuilder
- /// only for memory allocation).
- Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
- bool shrink_to_fit = true) {
- return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
- }
-
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using TypedBufferBuilder
+ /// only for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
+ }
+
void Reset() { bytes_builder_.Reset(); }
int64_t length() const { return bytes_builder_.length() / sizeof(T); }
@@ -309,11 +309,11 @@ class TypedBufferBuilder<bool> {
explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool())
: bytes_builder_(pool) {}
- explicit TypedBufferBuilder(BufferBuilder builder)
- : bytes_builder_(std::move(builder)) {}
-
- BufferBuilder* bytes_builder() { return &bytes_builder_; }
-
+ explicit TypedBufferBuilder(BufferBuilder builder)
+ : bytes_builder_(std::move(builder)) {}
+
+ BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
Status Append(bool value) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(value);
@@ -411,25 +411,25 @@ class TypedBufferBuilder<bool> {
return bytes_builder_.Finish(out, shrink_to_fit);
}
- Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
- std::shared_ptr<Buffer> out;
- ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
- return out;
- }
-
- /// \brief Like Finish, but override the final buffer size
- ///
- /// This is useful after writing data directly into the builder memory
- /// without calling the Append methods (basically, when using TypedBufferBuilder
- /// only for memory allocation).
- Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
- bool shrink_to_fit = true) {
- const auto final_byte_length = BitUtil::BytesForBits(final_length);
- bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
- bit_length_ = false_count_ = 0;
- return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
- }
-
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using TypedBufferBuilder
+ /// only for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ const auto final_byte_length = BitUtil::BytesForBits(final_length);
+ bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
+ bit_length_ = false_count_ = 0;
+ return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
+ }
+
void Reset() {
bytes_builder_.Reset();
bit_length_ = false_count_ = 0;
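
FinishWithLength() above serves the pattern where bytes are written into the builder's memory directly instead of through Append(). A sketch, assuming BufferBuilder exposes its storage via mutable_data() as in upstream Arrow (sizes illustrative):

#include <arrow/buffer_builder.h>
#include <arrow/result.h>

#include <cstdint>
#include <cstring>
#include <memory>

arrow::Result<std::shared_ptr<arrow::Buffer>> FillDirect(int64_t nbytes) {
  arrow::BufferBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Resize(nbytes));  // rounds up to a 64-byte multiple
  std::memset(builder.mutable_data(), 0xAB, static_cast<size_t>(nbytes));
  // Nothing was Append()ed, so the builder still reports size 0; override
  // the final length while finishing.
  return builder.FinishWithLength(nbytes);
}
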
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
index f22228a4588..e46661b4b42 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
@@ -51,7 +51,7 @@ struct DictionaryBuilderCase {
}
Status Visit(const FixedSizeBinaryType&) { return CreateFor<FixedSizeBinaryType>(); }
Status Visit(const Decimal128Type&) { return CreateFor<Decimal128Type>(); }
- Status Visit(const Decimal256Type&) { return CreateFor<Decimal256Type>(); }
+ Status Visit(const Decimal256Type&) { return CreateFor<Decimal256Type>(); }
Status Visit(const DataType& value_type) { return NotImplemented(value_type); }
Status Visit(const HalfFloatType& value_type) { return NotImplemented(value_type); }
@@ -139,7 +139,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(LargeBinary);
BUILDER_CASE(FixedSizeBinary);
BUILDER_CASE(Decimal128);
- BUILDER_CASE(Decimal256);
+ BUILDER_CASE(Decimal256);
case Type::DICTIONARY: {
const auto& dict_type = static_cast<const DictionaryType&>(*type);
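
The builder.cc change extends the type dispatch in MakeBuilder() and the dictionary-builder factory to Decimal256. A sketch of resolving a builder from a runtime type through that dispatch (precision and scale values are illustrative):

#include <arrow/api.h>

#include <memory>

arrow::Result<std::shared_ptr<arrow::Array>> BuildEmptyDecimal256() {
  std::unique_ptr<arrow::ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(arrow::MakeBuilder(arrow::default_memory_pool(),
                                         arrow::decimal256(/*precision=*/40,
                                                           /*scale=*/4),
                                         &builder));
  return builder->Finish();  // empty Decimal256 array
}
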
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
index a43bf8104f2..ccd780fa687 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
@@ -304,16 +304,16 @@ struct SchemaExporter {
return SetFormat("w:" + std::to_string(type.byte_width()));
}
- Status Visit(const DecimalType& type) {
- if (type.bit_width() == 128) {
- // 128 is the default bit-width
- return SetFormat("d:" + std::to_string(type.precision()) + "," +
- std::to_string(type.scale()));
- } else {
- return SetFormat("d:" + std::to_string(type.precision()) + "," +
- std::to_string(type.scale()) + "," +
- std::to_string(type.bit_width()));
- }
+ Status Visit(const DecimalType& type) {
+ if (type.bit_width() == 128) {
+ // 128 is the default bit-width
+ return SetFormat("d:" + std::to_string(type.precision()) + "," +
+ std::to_string(type.scale()));
+ } else {
+ return SetFormat("d:" + std::to_string(type.precision()) + "," +
+ std::to_string(type.scale()) + "," +
+ std::to_string(type.bit_width()));
+ }
}
Status Visit(const BinaryType& type) { return SetFormat("z"); }
@@ -980,20 +980,20 @@ struct SchemaImporter {
Status ProcessDecimal() {
RETURN_NOT_OK(f_parser_.CheckNext(':'));
ARROW_ASSIGN_OR_RAISE(auto prec_scale, f_parser_.ParseInts(f_parser_.Rest()));
- // 3 elements indicates bit width was communicated as well.
- if (prec_scale.size() != 2 && prec_scale.size() != 3) {
+ // 3 elements indicates bit width was communicated as well.
+ if (prec_scale.size() != 2 && prec_scale.size() != 3) {
return f_parser_.Invalid();
}
- if (prec_scale[0] <= 0) {
- return f_parser_.Invalid();
- }
- if (prec_scale.size() == 2 || prec_scale[2] == 128) {
- type_ = decimal128(prec_scale[0], prec_scale[1]);
- } else if (prec_scale[2] == 256) {
- type_ = decimal256(prec_scale[0], prec_scale[1]);
- } else {
+ if (prec_scale[0] <= 0) {
return f_parser_.Invalid();
}
+ if (prec_scale.size() == 2 || prec_scale[2] == 128) {
+ type_ = decimal128(prec_scale[0], prec_scale[1]);
+ } else if (prec_scale[2] == 256) {
+ type_ = decimal256(prec_scale[0], prec_scale[1]);
+ } else {
+ return f_parser_.Invalid();
+ }
return Status::OK();
}
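
Exporter and importer above agree on the C-data-interface decimal format string: "d:precision,scale", with the 128-bit width implied and any other width appended as a third field. A round-trip sketch of the exporter side (assumes the public bridge API as in upstream Arrow):

#include <arrow/api.h>
#include <arrow/c/bridge.h>

#include <iostream>

int main() {
  struct ArrowSchema schema;
  if (arrow::ExportType(*arrow::decimal256(76, 10), &schema).ok()) {
    // 128-bit decimals omit the width ("d:38,9"); 256-bit ones spell it
    // out, so this should print "d:76,10,256".
    std::cout << schema.format << std::endl;
    schema.release(&schema);
  }
  return 0;
}
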
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
index 142bd0d8c89..20c63c78959 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
@@ -118,33 +118,33 @@ bool ChunkedArray::Equals(const std::shared_ptr<ChunkedArray>& other) const {
return Equals(*other.get());
}
-bool ChunkedArray::ApproxEquals(const ChunkedArray& other,
- const EqualOptions& equal_options) const {
- if (length_ != other.length()) {
- return false;
- }
- if (null_count_ != other.null_count()) {
- return false;
- }
- // We cannot toggle check_metadata here yet, so we don't check it
- if (!type_->Equals(*other.type_, /*check_metadata=*/false)) {
- return false;
- }
-
- // Check contents of the underlying arrays. This checks for equality of
- // the underlying data independently of the chunk size.
- return internal::ApplyBinaryChunked(
- *this, other,
- [&](const Array& left_piece, const Array& right_piece,
- int64_t ARROW_ARG_UNUSED(position)) {
- if (!left_piece.ApproxEquals(right_piece, equal_options)) {
- return Status::Invalid("Unequal piece");
- }
- return Status::OK();
- })
- .ok();
-}
-
+bool ChunkedArray::ApproxEquals(const ChunkedArray& other,
+ const EqualOptions& equal_options) const {
+ if (length_ != other.length()) {
+ return false;
+ }
+ if (null_count_ != other.null_count()) {
+ return false;
+ }
+ // We cannot toggle check_metadata here yet, so we don't check it
+ if (!type_->Equals(*other.type_, /*check_metadata=*/false)) {
+ return false;
+ }
+
+ // Check contents of the underlying arrays. This checks for equality of
+ // the underlying data independently of the chunk size.
+ return internal::ApplyBinaryChunked(
+ *this, other,
+ [&](const Array& left_piece, const Array& right_piece,
+ int64_t ARROW_ARG_UNUSED(position)) {
+ if (!left_piece.ApproxEquals(right_piece, equal_options)) {
+ return Status::Invalid("Unequal piece");
+ }
+ return Status::OK();
+ })
+ .ok();
+}
+
std::shared_ptr<ChunkedArray> ChunkedArray::Slice(int64_t offset, int64_t length) const {
ARROW_CHECK_LE(offset, length_) << "Slice offset greater than array length";
bool offset_equals_length = offset == length_;
@@ -246,7 +246,7 @@ Status ChunkedArray::ValidateFull() const {
RETURN_NOT_OK(Validate());
for (size_t i = 0; i < chunks_.size(); ++i) {
const Array& chunk = *chunks_[i];
- const Status st = internal::ValidateArrayFull(chunk);
+ const Status st = internal::ValidateArrayFull(chunk);
if (!st.ok()) {
return Status::Invalid("In chunk ", i, ": ", st.ToString());
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
index 2ace045c2bf..892ae637545 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
@@ -23,7 +23,7 @@
#include <utility>
#include <vector>
-#include "arrow/compare.h"
+#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
@@ -73,9 +73,9 @@ class ARROW_EXPORT ChunkedArray {
/// data type.
explicit ChunkedArray(ArrayVector chunks);
- ChunkedArray(ChunkedArray&&) = default;
- ChunkedArray& operator=(ChunkedArray&&) = default;
-
+ ChunkedArray(ChunkedArray&&) = default;
+ ChunkedArray& operator=(ChunkedArray&&) = default;
+
/// \brief Construct a chunked array from a single Array
explicit ChunkedArray(std::shared_ptr<Array> chunk)
: ChunkedArray(ArrayVector{std::move(chunk)}) {}
@@ -137,9 +137,9 @@ class ARROW_EXPORT ChunkedArray {
bool Equals(const ChunkedArray& other) const;
/// \brief Determine if two chunked arrays are equal.
bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
- /// \brief Determine if two chunked arrays are approximately equal
- bool ApproxEquals(const ChunkedArray& other,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ /// \brief Determine if two chunked arrays are approximately equal
+ bool ApproxEquals(const ChunkedArray& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
/// \return PrettyPrint representation suitable for debugging
std::string ToString() const;
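
ApproxEquals() above delegates to ApplyBinaryChunked, so it compares the underlying values independently of how they are chunked. A sketch (assumes the usual builder API; the tolerance is illustrative):

#include <arrow/api.h>

#include <iostream>
#include <vector>

arrow::Result<std::shared_ptr<arrow::Array>> MakeDoubles(
    const std::vector<double>& values) {
  arrow::DoubleBuilder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues(values));
  return builder.Finish();
}

int main() {
  // Same logical values, different chunk layouts.
  arrow::ChunkedArray left({MakeDoubles({1.0, 2.0}).ValueOrDie(),
                            MakeDoubles({3.0}).ValueOrDie()});
  arrow::ChunkedArray right({MakeDoubles({1.0, 2.0, 3.0 + 1e-12}).ValueOrDie()});

  // Prints 1: the 1e-12 discrepancy is within atol. nans_equal() can be
  // toggled on the options the same way.
  std::cout << left.ApproxEquals(right,
                                 arrow::EqualOptions::Defaults().atol(1e-9))
            << std::endl;
  return 0;
}
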
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
index 4c6f97faf95..51fec14e768 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
@@ -38,10 +38,10 @@
#include "arrow/tensor.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_reader.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
@@ -51,499 +51,499 @@
namespace arrow {
using internal::BitmapEquals;
-using internal::BitmapReader;
-using internal::BitmapUInt64Reader;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
-using internal::OptionalBitmapEquals;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// TODO also handle HALF_FLOAT NaNs
-
-enum FloatingEqualityFlags : int8_t { Approximate = 1, NansEqual = 2 };
-
-template <typename T, int8_t Flags>
-struct FloatingEquality {
- bool operator()(T x, T y) { return x == y; }
-};
-
-template <typename T>
-struct FloatingEquality<T, NansEqual> {
- bool operator()(T x, T y) { return (x == y) || (std::isnan(x) && std::isnan(y)); }
-};
-
-template <typename T>
-struct FloatingEquality<T, Approximate> {
- explicit FloatingEquality(const EqualOptions& options)
- : epsilon(static_cast<T>(options.atol())) {}
-
- bool operator()(T x, T y) { return (fabs(x - y) <= epsilon) || (x == y); }
-
- const T epsilon;
-};
-
-template <typename T>
-struct FloatingEquality<T, Approximate | NansEqual> {
- explicit FloatingEquality(const EqualOptions& options)
- : epsilon(static_cast<T>(options.atol())) {}
-
- bool operator()(T x, T y) {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- }
-
- const T epsilon;
-};
-
-template <typename T, typename Visitor>
-void VisitFloatingEquality(const EqualOptions& options, bool floating_approximate,
- Visitor&& visit) {
- if (options.nans_equal()) {
- if (floating_approximate) {
- visit(FloatingEquality<T, NansEqual | Approximate>{options});
- } else {
- visit(FloatingEquality<T, NansEqual>{});
+// TODO also handle HALF_FLOAT NaNs
+
+enum FloatingEqualityFlags : int8_t { Approximate = 1, NansEqual = 2 };
+
+template <typename T, int8_t Flags>
+struct FloatingEquality {
+ bool operator()(T x, T y) { return x == y; }
+};
+
+template <typename T>
+struct FloatingEquality<T, NansEqual> {
+ bool operator()(T x, T y) { return (x == y) || (std::isnan(x) && std::isnan(y)); }
+};
+
+template <typename T>
+struct FloatingEquality<T, Approximate> {
+ explicit FloatingEquality(const EqualOptions& options)
+ : epsilon(static_cast<T>(options.atol())) {}
+
+ bool operator()(T x, T y) { return (fabs(x - y) <= epsilon) || (x == y); }
+
+ const T epsilon;
+};
+
+template <typename T>
+struct FloatingEquality<T, Approximate | NansEqual> {
+ explicit FloatingEquality(const EqualOptions& options)
+ : epsilon(static_cast<T>(options.atol())) {}
+
+ bool operator()(T x, T y) {
+ return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+ }
+
+ const T epsilon;
+};
+
+template <typename T, typename Visitor>
+void VisitFloatingEquality(const EqualOptions& options, bool floating_approximate,
+ Visitor&& visit) {
+ if (options.nans_equal()) {
+ if (floating_approximate) {
+ visit(FloatingEquality<T, NansEqual | Approximate>{options});
+ } else {
+ visit(FloatingEquality<T, NansEqual>{});
}
} else {
- if (floating_approximate) {
- visit(FloatingEquality<T, Approximate>{options});
- } else {
- visit(FloatingEquality<T, 0>{});
+ if (floating_approximate) {
+ visit(FloatingEquality<T, Approximate>{options});
+ } else {
+ visit(FloatingEquality<T, 0>{});
}
}
}
-inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) {
- if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE) {
- return false;
- }
- for (const auto& child : type.fields()) {
- if (!IdentityImpliesEqualityNansNotEqual(*child->type())) {
- return false;
- }
+inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) {
+ if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE) {
+ return false;
}
- return true;
+ for (const auto& child : type.fields()) {
+ if (!IdentityImpliesEqualityNansNotEqual(*child->type())) {
+ return false;
+ }
+ }
+ return true;
}
-inline bool IdentityImpliesEquality(const DataType& type, const EqualOptions& options) {
- if (options.nans_equal()) {
- return true;
+inline bool IdentityImpliesEquality(const DataType& type, const EqualOptions& options) {
+ if (options.nans_equal()) {
+ return true;
}
- return IdentityImpliesEqualityNansNotEqual(type);
+ return IdentityImpliesEqualityNansNotEqual(type);
}
-bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
- int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx, const EqualOptions& options,
- bool floating_approximate);
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-class RangeDataEqualsImpl {
+class RangeDataEqualsImpl {
public:
- // PRE-CONDITIONS:
- // - the types are equal
- // - the ranges are in bounds
- RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
- const ArrayData& left, const ArrayData& right,
- int64_t left_start_idx, int64_t right_start_idx,
- int64_t range_length)
- : options_(options),
- floating_approximate_(floating_approximate),
- left_(left),
- right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
right_start_idx_(right_start_idx),
- range_length_(range_length),
+ range_length_(range_length),
result_(false) {}
- bool Compare() {
- // Compare null bitmaps
- if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
- range_length_ == right_.length) {
- // If we're comparing entire arrays, we can first compare the cached null counts
- if (left_.GetNullCount() != right_.GetNullCount()) {
- return false;
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
- right_.buffers[0], right_.offset + right_start_idx_,
- range_length_)) {
- return false;
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
+ }
+ // Compare values
+ return CompareWithType(*left_.type);
+ }
+
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
}
- // Compare values
- return CompareWithType(*left_.type);
+ return result_;
}
- bool CompareWithType(const DataType& type) {
- result_ = true;
- if (range_length_ != 0) {
- ARROW_CHECK_OK(VisitTypeInline(type, this));
- }
- return result_;
- }
+ Status Visit(const NullType&) { return Status::OK(); }
- Status Visit(const NullType&) { return Status::OK(); }
-
- template <typename TypeClass>
- enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
- return ComparePrimitive(type);
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename TypeClass>
- enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
- return ComparePrimitive(type);
- }
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
+ }
- Status Visit(const BooleanType&) {
- const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
- const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- if (length <= 8) {
- // Avoid the BitmapUInt64Reader overhead for very small runs
- for (int64_t j = i; j < i + length; ++j) {
- if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
- BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
- return false;
- }
- }
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
+ }
return true;
- } else if (length <= 1024) {
- BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
- length);
- BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
- length);
- while (left_reader.position() < length) {
- if (left_reader.NextWord() != right_reader.NextWord()) {
- return false;
- }
- }
- DCHECK_EQ(right_reader.position(), length);
- } else {
- // BitmapEquals is the fastest method on large runs
- return BitmapEquals(left_bits, left_start_idx_ + left_.offset + i, right_bits,
- right_start_idx_ + right_.offset + i, length);
+ } else if (length <= 1024) {
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
+ }
+ DCHECK_EQ(right_reader.position(), length);
+ } else {
+ // BitmapEquals is the fastest method on large runs
+ return BitmapEquals(left_bits, left_start_idx_ + left_.offset + i, right_bits,
+ right_start_idx_ + right_.offset + i, length);
}
- return true;
+ return true;
};
- VisitValidRuns(compare_runs);
- return Status::OK();
- }
-
- Status Visit(const FloatType& type) { return CompareFloating(type); }
-
- Status Visit(const DoubleType& type) { return CompareFloating(type); }
-
- // Also matches StringType
- Status Visit(const BinaryType& type) { return CompareBinary(type); }
-
- // Also matches LargeStringType
- Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
-
- Status Visit(const FixedSizeBinaryType& type) {
- const auto byte_width = type.byte_width();
- const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
- const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
-
- if (left_data != nullptr && right_data != nullptr) {
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
- right_data + (right_start_idx_ + right_.offset + i) * byte_width,
- length * byte_width) == 0;
- };
- VisitValidRuns(compare_runs);
- } else {
- auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
- VisitValidRuns(compare_runs);
+ VisitValidRuns(compare_runs);
+ return Status::OK();
+ }
+
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
+
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
+
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
+
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
+
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- return Status::OK();
- }
-
- // Also matches MapType
- Status Visit(const ListType& type) { return CompareList(type); }
-
- Status Visit(const LargeListType& type) { return CompareList(type); }
-
- Status Visit(const FixedSizeListType& type) {
- const auto list_size = type.list_size();
- const ArrayData& left_data = *left_.child_data[0];
- const ArrayData& right_data = *right_.child_data[0];
-
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
- (left_start_idx_ + left_.offset + i) * list_size,
- (right_start_idx_ + right_.offset + i) * list_size,
- length * list_size);
- return impl.Compare();
- };
- VisitValidRuns(compare_runs);
- return Status::OK();
- }
-
- Status Visit(const StructType& type) {
- const int32_t num_fields = type.num_fields();
-
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- for (int32_t f = 0; f < num_fields; ++f) {
- RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
- *right_.child_data[f],
- left_start_idx_ + left_.offset + i,
- right_start_idx_ + right_.offset + i, length);
- if (!impl.Compare()) {
+ return Status::OK();
+ }
+
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
+
+ Status Visit(const LargeListType& type) { return CompareList(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
+
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
+
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
return false;
}
}
- return true;
- };
- VisitValidRuns(compare_runs);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const SparseUnionType& type) {
- const auto& child_ids = type.child_ids();
- const int8_t* left_codes = left_.GetValues<int8_t>(1);
- const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- // Unions don't have a null bitmap
- for (int64_t i = 0; i < range_length_; ++i) {
- const auto type_id = left_codes[left_start_idx_ + i];
- if (type_id != right_codes[right_start_idx_ + i]) {
+ // Unions don't have a null bitmap
+ for (int64_t i = 0; i < range_length_; ++i) {
+ const auto type_id = left_codes[left_start_idx_ + i];
+ if (type_id != right_codes[right_start_idx_ + i]) {
result_ = false;
- break;
+ break;
}
- const auto child_num = child_ids[type_id];
- // XXX can we instead detect runs of same-child union values?
- RangeDataEqualsImpl impl(
- options_, floating_approximate_, *left_.child_data[child_num],
- *right_.child_data[child_num], left_start_idx_ + left_.offset + i,
- right_start_idx_ + right_.offset + i, 1);
- if (!impl.Compare()) {
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, 1);
+ if (!impl.Compare()) {
result_ = false;
- break;
+ break;
}
}
return Status::OK();
}
- Status Visit(const DenseUnionType& type) {
- const auto& child_ids = type.child_ids();
- const int8_t* left_codes = left_.GetValues<int8_t>(1);
- const int8_t* right_codes = right_.GetValues<int8_t>(1);
- const int32_t* left_offsets = left_.GetValues<int32_t>(2);
- const int32_t* right_offsets = right_.GetValues<int32_t>(2);
-
- for (int64_t i = 0; i < range_length_; ++i) {
- const auto type_id = left_codes[left_start_idx_ + i];
- if (type_id != right_codes[right_start_idx_ + i]) {
- result_ = false;
- break;
- }
- const auto child_num = child_ids[type_id];
- RangeDataEqualsImpl impl(
- options_, floating_approximate_, *left_.child_data[child_num],
- *right_.child_data[child_num], left_offsets[left_start_idx_ + i],
- right_offsets[right_start_idx_ + i], 1);
- if (!impl.Compare()) {
- result_ = false;
- break;
- }
- }
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ for (int64_t i = 0; i < range_length_; ++i) {
+ const auto type_id = left_codes[left_start_idx_ + i];
+ if (type_id != right_codes[right_start_idx_ + i]) {
+ result_ = false;
+ break;
+ }
+ const auto child_num = child_ids[type_id];
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_offsets[left_start_idx_ + i],
+ right_offsets[right_start_idx_ + i], 1);
+ if (!impl.Compare()) {
+ result_ = false;
+ break;
+ }
+ }
return Status::OK();
}
- Status Visit(const DictionaryType& type) {
- // Compare dictionaries
- result_ &= CompareArrayRanges(
- *left_.dictionary, *right_.dictionary,
- /*left_start_idx=*/0,
- /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
- /*right_start_idx=*/0, options_, floating_approximate_);
- if (result_) {
- // Compare indices
- result_ &= CompareWithType(*type.index_type());
+ Status Visit(const DictionaryType& type) {
+ // Compare dictionaries
+ result_ &= CompareArrayRanges(
+ *left_.dictionary, *right_.dictionary,
+ /*left_start_idx=*/0,
+ /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+ /*right_start_idx=*/0, options_, floating_approximate_);
+ if (result_) {
+ // Compare indices
+ result_ &= CompareWithType(*type.index_type());
}
return Status::OK();
}
- Status Visit(const ExtensionType& type) {
- // Compare storages
- result_ &= CompareWithType(*type.storage_type());
+ Status Visit(const ExtensionType& type) {
+ // Compare storages
+ result_ &= CompareWithType(*type.storage_type());
return Status::OK();
}
protected:
- // For CompareFloating (templated local classes or lambdas not supported in C++11)
- template <typename CType>
- struct ComparatorVisitor {
- RangeDataEqualsImpl* impl;
- const CType* left_values;
- const CType* right_values;
-
- template <typename CompareFunction>
- void operator()(CompareFunction&& compare) {
- impl->VisitValues([&](int64_t i) {
- const CType x = left_values[i + impl->left_start_idx_];
- const CType y = right_values[i + impl->right_start_idx_];
- return compare(x, y);
- });
- }
- };
-
- template <typename CType>
- friend struct ComparatorVisitor;
-
- template <typename TypeClass, typename CType = typename TypeClass::c_type>
- Status ComparePrimitive(const TypeClass&) {
- const CType* left_values = left_.GetValues<CType>(1);
- const CType* right_values = right_.GetValues<CType>(1);
- VisitValidRuns([&](int64_t i, int64_t length) {
- return memcmp(left_values + left_start_idx_ + i,
- right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
- });
- return Status::OK();
- }
-
- template <typename TypeClass>
- Status CompareFloating(const TypeClass&) {
- using CType = typename TypeClass::c_type;
- const CType* left_values = left_.GetValues<CType>(1);
- const CType* right_values = right_.GetValues<CType>(1);
-
- ComparatorVisitor<CType> visitor{this, left_values, right_values};
- VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
+ // For CompareFloating (templated local classes or lambdas not supported in C++11)
+ template <typename CType>
+ struct ComparatorVisitor {
+ RangeDataEqualsImpl* impl;
+ const CType* left_values;
+ const CType* right_values;
+
+ template <typename CompareFunction>
+ void operator()(CompareFunction&& compare) {
+ impl->VisitValues([&](int64_t i) {
+ const CType x = left_values[i + impl->left_start_idx_];
+ const CType y = right_values[i + impl->right_start_idx_];
+ return compare(x, y);
+ });
+ }
+ };
+
+ template <typename CType>
+ friend struct ComparatorVisitor;
+
+ template <typename TypeClass, typename CType = typename TypeClass::c_type>
+ Status ComparePrimitive(const TypeClass&) {
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ return memcmp(left_values + left_start_idx_ + i,
+ right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+ });
+ return Status::OK();
+ }
+
+ template <typename TypeClass>
+ Status CompareFloating(const TypeClass&) {
+ using CType = typename TypeClass::c_type;
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+
+ ComparatorVisitor<CType> visitor{this, left_values, right_values};
+ VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
return Status::OK();
}
- template <typename TypeClass>
- Status CompareBinary(const TypeClass&) {
- const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
- const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
+ template <typename TypeClass>
+ Status CompareBinary(const TypeClass&) {
+ const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
- if (left_data != nullptr && right_data != nullptr) {
- const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
- int64_t length) -> bool {
- return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
- };
- CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
+ if (left_data != nullptr && right_data != nullptr) {
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+ };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
} else {
- // One of the arrays is an array of empty strings and nulls.
- // We just need to compare the offsets.
- // (note we must not call memcmp() with null data pointers)
- CompareWithOffsets<typename TypeClass::offset_type>(1, [](...) { return true; });
+ // One of the arrays is an array of empty strings and nulls.
+ // We just need to compare the offsets.
+ // (note we must not call memcmp() with null data pointers)
+ CompareWithOffsets<typename TypeClass::offset_type>(1, [](...) { return true; });
}
return Status::OK();
}
- template <typename TypeClass>
- Status CompareList(const TypeClass&) {
- const ArrayData& left_data = *left_.child_data[0];
- const ArrayData& right_data = *right_.child_data[0];
+ template <typename TypeClass>
+ Status CompareList(const TypeClass&) {
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
- int64_t length) -> bool {
- RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
- left_offset, right_offset, length);
- return impl.Compare();
- };
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ left_offset, right_offset, length);
+ return impl.Compare();
+ };
- CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
return Status::OK();
}
- template <typename offset_type, typename CompareRanges>
- void CompareWithOffsets(int offsets_buffer_index, CompareRanges&& compare_ranges) {
- const offset_type* left_offsets =
- left_.GetValues<offset_type>(offsets_buffer_index) + left_start_idx_;
- const offset_type* right_offsets =
- right_.GetValues<offset_type>(offsets_buffer_index) + right_start_idx_;
+ template <typename offset_type, typename CompareRanges>
+ void CompareWithOffsets(int offsets_buffer_index, CompareRanges&& compare_ranges) {
+ const offset_type* left_offsets =
+ left_.GetValues<offset_type>(offsets_buffer_index) + left_start_idx_;
+ const offset_type* right_offsets =
+ right_.GetValues<offset_type>(offsets_buffer_index) + right_start_idx_;
- const auto compare_runs = [&](int64_t i, int64_t length) {
- for (int64_t j = i; j < i + length; ++j) {
- if (left_offsets[j + 1] - left_offsets[j] !=
- right_offsets[j + 1] - right_offsets[j]) {
+ const auto compare_runs = [&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ if (left_offsets[j + 1] - left_offsets[j] !=
+ right_offsets[j + 1] - right_offsets[j]) {
return false;
}
}
- if (!compare_ranges(left_offsets[i], right_offsets[i],
- left_offsets[i + length] - left_offsets[i])) {
- return false;
- }
+ if (!compare_ranges(left_offsets[i], right_offsets[i],
+ left_offsets[i + length] - left_offsets[i])) {
+ return false;
+ }
return true;
- };
-
- VisitValidRuns(compare_runs);
- }
-
- template <typename CompareValues>
- void VisitValues(CompareValues&& compare_values) {
- internal::VisitSetBitRunsVoid(left_.buffers[0], left_.offset + left_start_idx_,
- range_length_, [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; ++i) {
- result_ &= compare_values(position + i);
- }
- });
- }
-
- // Visit and compare runs of non-null values
- template <typename CompareRuns>
- void VisitValidRuns(CompareRuns&& compare_runs) {
- const uint8_t* left_null_bitmap = left_.GetValues<uint8_t>(0, 0);
- if (left_null_bitmap == nullptr) {
- result_ = compare_runs(0, range_length_);
- return;
+ };
+
+ VisitValidRuns(compare_runs);
+ }
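
A worked example of the offset-window compare above, self-contained and without Arrow types: for left = right = ["ab", "c"], the offsets are {0, 2, 3}; once the per-slot widths (2, 1) are known to match, a single memcmp over bytes [0, 3) decides the whole run:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  const char left_data[] = "abc";
  const char right_data[] = "abc";
  const std::vector<int32_t> left_offsets = {0, 2, 3};
  const std::vector<int32_t> right_offsets = {0, 2, 3};
  const int64_t length = 2;  // number of slots in the run

  // Per-slot widths must match on both sides...
  bool equal = true;
  for (int64_t j = 0; j < length; ++j) {
    if (left_offsets[j + 1] - left_offsets[j] !=
        right_offsets[j + 1] - right_offsets[j]) {
      equal = false;
    }
  }
  // ...then one contiguous range compare covers the entire run.
  if (equal) {
    equal = std::memcmp(left_data + left_offsets[0], right_data + right_offsets[0],
                        left_offsets[length] - left_offsets[0]) == 0;
  }
  assert(equal);
  return 0;
}
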
+
+ template <typename CompareValues>
+ void VisitValues(CompareValues&& compare_values) {
+ internal::VisitSetBitRunsVoid(left_.buffers[0], left_.offset + left_start_idx_,
+ range_length_, [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; ++i) {
+ result_ &= compare_values(position + i);
+ }
+ });
+ }
+
+ // Visit and compare runs of non-null values
+ template <typename CompareRuns>
+ void VisitValidRuns(CompareRuns&& compare_runs) {
+ const uint8_t* left_null_bitmap = left_.GetValues<uint8_t>(0, 0);
+ if (left_null_bitmap == nullptr) {
+ result_ = compare_runs(0, range_length_);
+ return;
}
- internal::SetBitRunReader reader(left_null_bitmap, left_.offset + left_start_idx_,
- range_length_);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- return;
+ internal::SetBitRunReader reader(left_null_bitmap, left_.offset + left_start_idx_,
+ range_length_);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ return;
}
- if (!compare_runs(run.position, run.length)) {
- result_ = false;
- return;
+ if (!compare_runs(run.position, run.length)) {
+ result_ = false;
+ return;
}
}
}
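
Note that VisitValidRuns only consults the left validity bitmap; the surrounding implementation has presumably already established that validity agrees between the two ranges before values are compared. A minimal stand-in for the run reader, using one bool per slot instead of Arrow's packed bits:

#include <cassert>
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

// Walk a validity bitmap and hand each maximal run of valid slots to
// compare_runs(position, length), stopping early on a mismatch.
bool VisitValidRunsSketch(const std::vector<bool>& valid,
                          const std::function<bool(int64_t, int64_t)>& compare_runs) {
  int64_t i = 0;
  const int64_t n = static_cast<int64_t>(valid.size());
  while (i < n) {
    if (!valid[i]) { ++i; continue; }   // skip null slots
    const int64_t start = i;
    while (i < n && valid[i]) ++i;      // extend the run of valid slots
    if (!compare_runs(start, i - start)) return false;
  }
  return true;
}

int main() {
  // Runs visited: (0, 2) and (3, 1).
  std::vector<std::pair<int64_t, int64_t>> seen;
  VisitValidRunsSketch({true, true, false, true}, [&](int64_t pos, int64_t len) {
    seen.emplace_back(pos, len);
    return true;
  });
  assert((seen == std::vector<std::pair<int64_t, int64_t>>{{0, 2}, {3, 1}}));
  return 0;
}
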
- const EqualOptions& options_;
- const bool floating_approximate_;
- const ArrayData& left_;
- const ArrayData& right_;
- const int64_t left_start_idx_;
- const int64_t right_start_idx_;
- const int64_t range_length_;
+ const EqualOptions& options_;
+ const bool floating_approximate_;
+ const ArrayData& left_;
+ const ArrayData& right_;
+ const int64_t left_start_idx_;
+ const int64_t right_start_idx_;
+ const int64_t range_length_;
- bool result_;
+ bool result_;
};
-bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
- int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx, const EqualOptions& options,
- bool floating_approximate) {
- if (left.type->id() != right.type->id() ||
- !TypeEquals(*left.type, *right.type, false /* check_metadata */)) {
- return false;
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate) {
+ if (left.type->id() != right.type->id() ||
+ !TypeEquals(*left.type, *right.type, false /* check_metadata */)) {
+ return false;
}
- const int64_t range_length = left_end_idx - left_start_idx;
- DCHECK_GE(range_length, 0);
- if (left_start_idx + range_length > left.length) {
- // Left range too small
+ const int64_t range_length = left_end_idx - left_start_idx;
+ DCHECK_GE(range_length, 0);
+ if (left_start_idx + range_length > left.length) {
+ // Left range too small
return false;
}
- if (right_start_idx + range_length > right.length) {
- // Right range too small
+ if (right_start_idx + range_length > right.length) {
+ // Right range too small
return false;
}
- if (&left == &right && left_start_idx == right_start_idx &&
- IdentityImpliesEquality(*left.type, options)) {
- return true;
+ if (&left == &right && left_start_idx == right_start_idx &&
+ IdentityImpliesEquality(*left.type, options)) {
+ return true;
}
- // Compare values
- RangeDataEqualsImpl impl(options, floating_approximate, left, right, left_start_idx,
- right_start_idx, range_length);
- return impl.Compare();
+ // Compare values
+ RangeDataEqualsImpl impl(options, floating_approximate, left, right, left_start_idx,
+ right_start_idx, range_length);
+ return impl.Compare();
}
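
A hedged usage sketch of the public wrapper over this routine, assuming a linked Arrow build: the windows [1, 3) of {1, 2, 3, 4} and, starting at index 1, of {9, 2, 3} hold the same values {2, 3}:

#include <arrow/api.h>
#include <cassert>

arrow::Status RangeEqualsDemo() {
  arrow::Int32Builder b1, b2;
  ARROW_RETURN_NOT_OK(b1.AppendValues({1, 2, 3, 4}));
  ARROW_RETURN_NOT_OK(b2.AppendValues({9, 2, 3}));
  std::shared_ptr<arrow::Array> left, right;
  ARROW_RETURN_NOT_OK(b1.Finish(&left));
  ARROW_RETURN_NOT_OK(b2.Finish(&right));
  // Compare left[1..3) against right[1..3): both are {2, 3}.
  assert(arrow::ArrayRangeEquals(*left, *right, /*left_start_idx=*/1,
                                 /*left_end_idx=*/3, /*right_start_idx=*/1));
  return arrow::Status::OK();
}
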
class TypeEqualsVisitor {
@@ -611,12 +611,12 @@ class TypeEqualsVisitor {
return Status::OK();
}
- Status Visit(const Decimal256Type& left) {
- const auto& right = checked_cast<const Decimal256Type&>(right_);
- result_ = left.precision() == right.precision() && left.scale() == right.scale();
- return Status::OK();
- }
-
+ Status Visit(const Decimal256Type& left) {
+ const auto& right = checked_cast<const Decimal256Type&>(right_);
+ result_ = left.precision() == right.precision() && left.scale() == right.scale();
+ return Status::OK();
+ }
+
template <typename T>
enable_if_t<is_list_like_type<T>::value || is_struct_type<T>::value, Status> Visit(
const T& left) {
@@ -671,22 +671,22 @@ class TypeEqualsVisitor {
bool result_;
};
-bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
- bool floating_approximate);
-bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
- bool floating_approximate);
-
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
+ bool floating_approximate);
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
+ bool floating_approximate);
+
class ScalarEqualsVisitor {
public:
- // PRE-CONDITIONS:
- // - the types are equal
- // - the scalars are non-null
- explicit ScalarEqualsVisitor(const Scalar& right, const EqualOptions& opts,
- bool floating_approximate)
- : right_(right),
- options_(opts),
- floating_approximate_(floating_approximate),
- result_(false) {}
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the scalars are non-null
+ explicit ScalarEqualsVisitor(const Scalar& right, const EqualOptions& opts,
+ bool floating_approximate)
+ : right_(right),
+ options_(opts),
+ floating_approximate_(floating_approximate),
+ result_(false) {}
Status Visit(const NullScalar& left) {
result_ = true;
@@ -700,8 +700,8 @@ class ScalarEqualsVisitor {
}
template <typename T>
- typename std::enable_if<(is_primitive_ctype<typename T::TypeClass>::value ||
- is_temporal_type<typename T::TypeClass>::value),
+ typename std::enable_if<(is_primitive_ctype<typename T::TypeClass>::value ||
+ is_temporal_type<typename T::TypeClass>::value),
Status>::type
Visit(const T& left_) {
const auto& right = checked_cast<const T&>(right_);
@@ -709,10 +709,10 @@ class ScalarEqualsVisitor {
return Status::OK();
}
- Status Visit(const FloatScalar& left) { return CompareFloating(left); }
-
- Status Visit(const DoubleScalar& left) { return CompareFloating(left); }
-
+ Status Visit(const FloatScalar& left) { return CompareFloating(left); }
+
+ Status Visit(const DoubleScalar& left) { return CompareFloating(left); }
+
template <typename T>
typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value, Status>::type
Visit(const T& left) {
@@ -727,33 +727,33 @@ class ScalarEqualsVisitor {
return Status::OK();
}
- Status Visit(const Decimal256Scalar& left) {
- const auto& right = checked_cast<const Decimal256Scalar&>(right_);
- result_ = left.value == right.value;
- return Status::OK();
- }
-
+ Status Visit(const Decimal256Scalar& left) {
+ const auto& right = checked_cast<const Decimal256Scalar&>(right_);
+ result_ = left.value == right.value;
+ return Status::OK();
+ }
+
Status Visit(const ListScalar& left) {
const auto& right = checked_cast<const ListScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
Status Visit(const LargeListScalar& left) {
const auto& right = checked_cast<const LargeListScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
Status Visit(const MapScalar& left) {
const auto& right = checked_cast<const MapScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
Status Visit(const FixedSizeListScalar& left) {
const auto& right = checked_cast<const FixedSizeListScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
@@ -765,8 +765,8 @@ class ScalarEqualsVisitor {
} else {
bool all_equals = true;
for (size_t i = 0; i < left.value.size() && all_equals; i++) {
- all_equals &= ScalarEquals(*left.value[i], *right.value[i], options_,
- floating_approximate_);
+ all_equals &= ScalarEquals(*left.value[i], *right.value[i], options_,
+ floating_approximate_);
}
result_ = all_equals;
}
@@ -777,7 +777,7 @@ class ScalarEqualsVisitor {
Status Visit(const UnionScalar& left) {
const auto& right = checked_cast<const UnionScalar&>(right_);
if (left.is_valid && right.is_valid) {
- result_ = ScalarEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ScalarEquals(*left.value, *right.value, options_, floating_approximate_);
} else if (!left.is_valid && !right.is_valid) {
result_ = true;
} else {
@@ -788,10 +788,10 @@ class ScalarEqualsVisitor {
Status Visit(const DictionaryScalar& left) {
const auto& right = checked_cast<const DictionaryScalar&>(right_);
- result_ = ScalarEquals(*left.value.index, *right.value.index, options_,
- floating_approximate_) &&
- ArrayEquals(*left.value.dictionary, *right.value.dictionary, options_,
- floating_approximate_);
+ result_ = ScalarEquals(*left.value.index, *right.value.index, options_,
+ floating_approximate_) &&
+ ArrayEquals(*left.value.dictionary, *right.value.dictionary, options_,
+ floating_approximate_);
return Status::OK();
}
@@ -802,40 +802,40 @@ class ScalarEqualsVisitor {
bool result() const { return result_; }
protected:
- // For CompareFloating (templated local classes or lambdas not supported in C++11)
- template <typename ScalarType>
- struct ComparatorVisitor {
- const ScalarType& left;
- const ScalarType& right;
- bool* result;
-
- template <typename CompareFunction>
- void operator()(CompareFunction&& compare) {
- *result = compare(left.value, right.value);
- }
- };
-
- template <typename ScalarType>
- Status CompareFloating(const ScalarType& left) {
- using CType = decltype(left.value);
-
- ComparatorVisitor<ScalarType> visitor{left, checked_cast<const ScalarType&>(right_),
- &result_};
- VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
- return Status::OK();
- }
-
+ // For CompareFloating (templated local classes or lambdas not supported in C++11)
+ template <typename ScalarType>
+ struct ComparatorVisitor {
+ const ScalarType& left;
+ const ScalarType& right;
+ bool* result;
+
+ template <typename CompareFunction>
+ void operator()(CompareFunction&& compare) {
+ *result = compare(left.value, right.value);
+ }
+ };
+
+ template <typename ScalarType>
+ Status CompareFloating(const ScalarType& left) {
+ using CType = decltype(left.value);
+
+ ComparatorVisitor<ScalarType> visitor{left, checked_cast<const ScalarType&>(right_),
+ &result_};
+ VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
+ return Status::OK();
+ }
+
const Scalar& right_;
- const EqualOptions options_;
- const bool floating_approximate_;
+ const EqualOptions options_;
+ const bool floating_approximate_;
bool result_;
};
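
VisitFloatingEquality, defined earlier in this file, selects one of a small family of comparison predicates. Below is a behavioral sketch of the approximate predicate: equality within an absolute tolerance, with NaNs matching only when nans_equal is requested. This sketches the observable behavior, not the upstream dispatch code:

#include <cassert>
#include <cmath>

bool ApproxEqual(double x, double y, double atol, bool nans_equal) {
  if (std::isnan(x) || std::isnan(y)) {
    // NaN never compares equal unless explicitly requested.
    return nans_equal && std::isnan(x) && std::isnan(y);
  }
  return std::fabs(x - y) <= atol;
}

int main() {
  assert(ApproxEqual(1.0, 1.0 + 1e-7, /*atol=*/1e-5, /*nans_equal=*/false));
  assert(!ApproxEqual(NAN, NAN, 1e-5, /*nans_equal=*/false));
  assert(ApproxEqual(NAN, NAN, 1e-5, /*nans_equal=*/true));
  return 0;
}
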
-Status PrintDiff(const Array& left, const Array& right, std::ostream* os);
-
-Status PrintDiff(const Array& left, const Array& right, int64_t left_offset,
- int64_t left_length, int64_t right_offset, int64_t right_length,
- std::ostream* os) {
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os);
+
+Status PrintDiff(const Array& left, const Array& right, int64_t left_offset,
+ int64_t left_length, int64_t right_offset, int64_t right_length,
+ std::ostream* os) {
if (os == nullptr) {
return Status::OK();
}
@@ -868,100 +868,100 @@ Status PrintDiff(const Array& left, const Array& right, int64_t left_offset,
return Status::OK();
}
- const auto left_slice = left.Slice(left_offset, left_length);
- const auto right_slice = right.Slice(right_offset, right_length);
- ARROW_ASSIGN_OR_RAISE(auto edits,
- Diff(*left_slice, *right_slice, default_memory_pool()));
+ const auto left_slice = left.Slice(left_offset, left_length);
+ const auto right_slice = right.Slice(right_offset, right_length);
+ ARROW_ASSIGN_OR_RAISE(auto edits,
+ Diff(*left_slice, *right_slice, default_memory_pool()));
ARROW_ASSIGN_OR_RAISE(auto formatter, MakeUnifiedDiffFormatter(*left.type(), os));
- return formatter(*edits, *left_slice, *right_slice);
+ return formatter(*edits, *left_slice, *right_slice);
}
-Status PrintDiff(const Array& left, const Array& right, std::ostream* os) {
- return PrintDiff(left, right, 0, left.length(), 0, right.length(), os);
-}
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os) {
+ return PrintDiff(left, right, 0, left.length(), 0, right.length(), os);
+}
-bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
- int64_t left_end_idx, int64_t right_start_idx,
- const EqualOptions& options, bool floating_approximate) {
- bool are_equal =
- CompareArrayRanges(*left.data(), *right.data(), left_start_idx, left_end_idx,
- right_start_idx, options, floating_approximate);
+bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options, bool floating_approximate) {
+ bool are_equal =
+ CompareArrayRanges(*left.data(), *right.data(), left_start_idx, left_end_idx,
+ right_start_idx, options, floating_approximate);
if (!are_equal) {
- ARROW_IGNORE_EXPR(PrintDiff(
- left, right, left_start_idx, left_end_idx, right_start_idx,
- right_start_idx + (left_end_idx - left_start_idx), options.diff_sink()));
+ ARROW_IGNORE_EXPR(PrintDiff(
+ left, right, left_start_idx, left_end_idx, right_start_idx,
+ right_start_idx + (left_end_idx - left_start_idx), options.diff_sink()));
}
return are_equal;
}
-bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
- bool floating_approximate) {
- if (left.length() != right.length()) {
- ARROW_IGNORE_EXPR(PrintDiff(left, right, opts.diff_sink()));
- return false;
- }
- return ArrayRangeEquals(left, right, 0, left.length(), 0, opts, floating_approximate);
-}
-
-bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
- bool floating_approximate) {
- if (&left == &right && IdentityImpliesEquality(*left.type, options)) {
- return true;
- }
- if (!left.type->Equals(right.type)) {
- return false;
- }
- if (left.is_valid != right.is_valid) {
- return false;
- }
- if (!left.is_valid) {
- return true;
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
+ bool floating_approximate) {
+ if (left.length() != right.length()) {
+ ARROW_IGNORE_EXPR(PrintDiff(left, right, opts.diff_sink()));
+ return false;
}
- ScalarEqualsVisitor visitor(right, options, floating_approximate);
- auto error = VisitScalarInline(left, &visitor);
- DCHECK_OK(error);
- return visitor.result();
-}
-
-} // namespace
-
-bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
- int64_t left_end_idx, int64_t right_start_idx,
- const EqualOptions& options) {
- const bool floating_approximate = false;
- return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
- options, floating_approximate);
-}
-
-bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t left_start_idx,
- int64_t left_end_idx, int64_t right_start_idx,
- const EqualOptions& options) {
- const bool floating_approximate = true;
- return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
- options, floating_approximate);
-}
-
-bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) {
- const bool floating_approximate = false;
- return ArrayEquals(left, right, opts, floating_approximate);
-}
-
-bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) {
- const bool floating_approximate = true;
- return ArrayEquals(left, right, opts, floating_approximate);
-}
-
-bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options) {
- const bool floating_approximate = false;
- return ScalarEquals(left, right, options, floating_approximate);
+ return ArrayRangeEquals(left, right, 0, left.length(), 0, opts, floating_approximate);
}
-bool ScalarApproxEquals(const Scalar& left, const Scalar& right,
- const EqualOptions& options) {
- const bool floating_approximate = true;
- return ScalarEquals(left, right, options, floating_approximate);
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
+ bool floating_approximate) {
+ if (&left == &right && IdentityImpliesEquality(*left.type, options)) {
+ return true;
+ }
+ if (!left.type->Equals(right.type)) {
+ return false;
+ }
+ if (left.is_valid != right.is_valid) {
+ return false;
+ }
+ if (!left.is_valid) {
+ return true;
+ }
+ ScalarEqualsVisitor visitor(right, options, floating_approximate);
+ auto error = VisitScalarInline(left, &visitor);
+ DCHECK_OK(error);
+ return visitor.result();
}
+} // namespace
+
+bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options) {
+ const bool floating_approximate = false;
+ return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
+ options, floating_approximate);
+}
+
+bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options) {
+ const bool floating_approximate = true;
+ return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
+ options, floating_approximate);
+}
+
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) {
+ const bool floating_approximate = false;
+ return ArrayEquals(left, right, opts, floating_approximate);
+}
+
+bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) {
+ const bool floating_approximate = true;
+ return ArrayEquals(left, right, opts, floating_approximate);
+}
+
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options) {
+ const bool floating_approximate = false;
+ return ScalarEquals(left, right, options, floating_approximate);
+}
+
+bool ScalarApproxEquals(const Scalar& left, const Scalar& right,
+ const EqualOptions& options) {
+ const bool floating_approximate = true;
+ return ScalarEquals(left, right, options, floating_approximate);
+}
+
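
A hedged usage sketch of the approximate entry points, assuming a linked Arrow build; EqualOptions::Defaults and the chaining atol setter are declared in compare.h:

#include <arrow/api.h>
#include <cassert>

void ApproxDemo() {
  auto a = arrow::MakeScalar(1.0);
  auto b = arrow::MakeScalar(1.0 + 1e-6);
  auto opts = arrow::EqualOptions::Defaults().atol(1e-5);
  assert(arrow::ScalarApproxEquals(*a, *b, opts));  // within tolerance
  assert(!arrow::ScalarEquals(*a, *b, opts));       // exact compare still fails
}
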
namespace {
bool StridedIntegerTensorContentEquals(const int dim_index, int64_t left_offset,
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compare.h b/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
index 6769b23867b..3acd6b1b33e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
@@ -71,7 +71,7 @@ class EqualOptions {
return res;
}
- static EqualOptions Defaults() { return {}; }
+ static EqualOptions Defaults() { return {}; }
protected:
double atol_ = kDefaultAbsoluteTolerance;
@@ -88,25 +88,25 @@ bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right,
bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right,
const EqualOptions& = EqualOptions::Defaults());
-/// Returns true if the indicated equal-length segments of the arrays are exactly equal
+/// Returns true if the indicated equal-length segments of the arrays are exactly equal
bool ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right,
int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults());
-
-/// Returns true if the indicated equal-length segments of the arrays are approximately equal
-bool ARROW_EXPORT ArrayRangeApproxEquals(const Array& left, const Array& right,
- int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults());
-
-bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right,
- const EqualOptions& = EqualOptions::Defaults());
-
-/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal
-bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
- const EqualOptions& = EqualOptions::Defaults());
-
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if the indicated equal-length segments of the arrays are approximately equal
+bool ARROW_EXPORT ArrayRangeApproxEquals(const Array& left, const Array& right,
+ int64_t start_idx, int64_t end_idx,
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults());
+
+bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal
+bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
/// Returns true if the type metadata are exactly equal
/// \param[in] left a DataType
/// \param[in] right a DataType
@@ -122,12 +122,12 @@ bool ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right,
bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right,
const EqualOptions& options = EqualOptions::Defaults());
-/// Returns true if scalars are approximately equal
-/// \param[in] left a Scalar
-/// \param[in] right a Scalar
-/// \param[in] options comparison options
-bool ARROW_EXPORT
-ScalarApproxEquals(const Scalar& left, const Scalar& right,
- const EqualOptions& options = EqualOptions::Defaults());
-
+/// Returns true if scalars are approximately equal
+/// \param[in] left a Scalar
+/// \param[in] right a Scalar
+/// \param[in] options comparison options
+bool ARROW_EXPORT
+ScalarApproxEquals(const Scalar& left, const Scalar& right,
+ const EqualOptions& options = EqualOptions::Defaults());
+
} // namespace arrow
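
A hedged sketch of the check_metadata switch on TypeEquals, assuming a linked Arrow build and that nested field metadata participates in the metadata check (the struct case of the visitor above compares fields with check_metadata):

#include <arrow/api.h>
#include <cassert>

void TypeEqualsDemo() {
  auto md = arrow::key_value_metadata({"origin"}, {"sensor-a"});
  auto with_md = arrow::struct_({arrow::field("x", arrow::int32(), /*nullable=*/true, md)});
  auto without_md = arrow::struct_({arrow::field("x", arrow::int32())});
  // Same structure, metadata differs only on the nested field.
  assert(arrow::TypeEquals(*with_md, *without_md, /*check_metadata=*/false));
  assert(!arrow::TypeEquals(*with_md, *without_md, /*check_metadata=*/true));
}
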
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
index 1b00c366bfd..2f26520c22a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
@@ -18,157 +18,157 @@
#include "arrow/compute/api_aggregate.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/registry.h"
-#include "arrow/compute/util_internal.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
namespace arrow {
-
-namespace internal {
-template <>
-struct EnumTraits<compute::QuantileOptions::Interpolation>
- : BasicEnumTraits<compute::QuantileOptions::Interpolation,
- compute::QuantileOptions::LINEAR, compute::QuantileOptions::LOWER,
- compute::QuantileOptions::HIGHER, compute::QuantileOptions::NEAREST,
- compute::QuantileOptions::MIDPOINT> {
- static std::string name() { return "QuantileOptions::Interpolation"; }
- static std::string value_name(compute::QuantileOptions::Interpolation value) {
- switch (value) {
- case compute::QuantileOptions::LINEAR:
- return "LINEAR";
- case compute::QuantileOptions::LOWER:
- return "LOWER";
- case compute::QuantileOptions::HIGHER:
- return "HIGHER";
- case compute::QuantileOptions::NEAREST:
- return "NEAREST";
- case compute::QuantileOptions::MIDPOINT:
- return "MIDPOINT";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
+
+namespace internal {
+template <>
+struct EnumTraits<compute::QuantileOptions::Interpolation>
+ : BasicEnumTraits<compute::QuantileOptions::Interpolation,
+ compute::QuantileOptions::LINEAR, compute::QuantileOptions::LOWER,
+ compute::QuantileOptions::HIGHER, compute::QuantileOptions::NEAREST,
+ compute::QuantileOptions::MIDPOINT> {
+ static std::string name() { return "QuantileOptions::Interpolation"; }
+ static std::string value_name(compute::QuantileOptions::Interpolation value) {
+ switch (value) {
+ case compute::QuantileOptions::LINEAR:
+ return "LINEAR";
+ case compute::QuantileOptions::LOWER:
+ return "LOWER";
+ case compute::QuantileOptions::HIGHER:
+ return "HIGHER";
+ case compute::QuantileOptions::NEAREST:
+ return "NEAREST";
+ case compute::QuantileOptions::MIDPOINT:
+ return "MIDPOINT";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
namespace compute {
// ----------------------------------------------------------------------
-// Function options
-
-using ::arrow::internal::checked_cast;
-
-namespace internal {
-namespace {
-using ::arrow::internal::DataMember;
-static auto kScalarAggregateOptionsType = GetFunctionOptionsType<ScalarAggregateOptions>(
- DataMember("skip_nulls", &ScalarAggregateOptions::skip_nulls),
- DataMember("min_count", &ScalarAggregateOptions::min_count));
-static auto kModeOptionsType =
- GetFunctionOptionsType<ModeOptions>(DataMember("n", &ModeOptions::n));
-static auto kVarianceOptionsType =
- GetFunctionOptionsType<VarianceOptions>(DataMember("ddof", &VarianceOptions::ddof));
-static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
- DataMember("q", &QuantileOptions::q),
- DataMember("interpolation", &QuantileOptions::interpolation));
-static auto kTDigestOptionsType = GetFunctionOptionsType<TDigestOptions>(
- DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta),
- DataMember("buffer_size", &TDigestOptions::buffer_size));
-static auto kIndexOptionsType =
- GetFunctionOptionsType<IndexOptions>(DataMember("value", &IndexOptions::value));
-} // namespace
-} // namespace internal
-
-ScalarAggregateOptions::ScalarAggregateOptions(bool skip_nulls, uint32_t min_count)
- : FunctionOptions(internal::kScalarAggregateOptionsType),
- skip_nulls(skip_nulls),
- min_count(min_count) {}
-constexpr char ScalarAggregateOptions::kTypeName[];
-
-ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {}
-constexpr char ModeOptions::kTypeName[];
-
-VarianceOptions::VarianceOptions(int ddof)
- : FunctionOptions(internal::kVarianceOptionsType), ddof(ddof) {}
-constexpr char VarianceOptions::kTypeName[];
-
-QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation)
- : FunctionOptions(internal::kQuantileOptionsType),
- q{q},
- interpolation{interpolation} {}
-QuantileOptions::QuantileOptions(std::vector<double> q, enum Interpolation interpolation)
- : FunctionOptions(internal::kQuantileOptionsType),
- q{std::move(q)},
- interpolation{interpolation} {}
-constexpr char QuantileOptions::kTypeName[];
-
-TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size)
- : FunctionOptions(internal::kTDigestOptionsType),
- q{q},
- delta{delta},
- buffer_size{buffer_size} {}
-TDigestOptions::TDigestOptions(std::vector<double> q, uint32_t delta,
- uint32_t buffer_size)
- : FunctionOptions(internal::kTDigestOptionsType),
- q{std::move(q)},
- delta{delta},
- buffer_size{buffer_size} {}
-constexpr char TDigestOptions::kTypeName[];
-
-IndexOptions::IndexOptions(std::shared_ptr<Scalar> value)
- : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {}
-IndexOptions::IndexOptions() : IndexOptions(std::make_shared<NullScalar>()) {}
-constexpr char IndexOptions::kTypeName[];
-
-namespace internal {
-void RegisterAggregateOptions(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunctionOptionsType(kScalarAggregateOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType));
-}
-} // namespace internal
-
-// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kScalarAggregateOptionsType = GetFunctionOptionsType<ScalarAggregateOptions>(
+ DataMember("skip_nulls", &ScalarAggregateOptions::skip_nulls),
+ DataMember("min_count", &ScalarAggregateOptions::min_count));
+static auto kModeOptionsType =
+ GetFunctionOptionsType<ModeOptions>(DataMember("n", &ModeOptions::n));
+static auto kVarianceOptionsType =
+ GetFunctionOptionsType<VarianceOptions>(DataMember("ddof", &VarianceOptions::ddof));
+static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
+ DataMember("q", &QuantileOptions::q),
+ DataMember("interpolation", &QuantileOptions::interpolation));
+static auto kTDigestOptionsType = GetFunctionOptionsType<TDigestOptions>(
+ DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta),
+ DataMember("buffer_size", &TDigestOptions::buffer_size));
+static auto kIndexOptionsType =
+ GetFunctionOptionsType<IndexOptions>(DataMember("value", &IndexOptions::value));
+} // namespace
+} // namespace internal
+
+ScalarAggregateOptions::ScalarAggregateOptions(bool skip_nulls, uint32_t min_count)
+ : FunctionOptions(internal::kScalarAggregateOptionsType),
+ skip_nulls(skip_nulls),
+ min_count(min_count) {}
+constexpr char ScalarAggregateOptions::kTypeName[];
+
+ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {}
+constexpr char ModeOptions::kTypeName[];
+
+VarianceOptions::VarianceOptions(int ddof)
+ : FunctionOptions(internal::kVarianceOptionsType), ddof(ddof) {}
+constexpr char VarianceOptions::kTypeName[];
+
+QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{q},
+ interpolation{interpolation} {}
+QuantileOptions::QuantileOptions(std::vector<double> q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{std::move(q)},
+ interpolation{interpolation} {}
+constexpr char QuantileOptions::kTypeName[];
+
+TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{q},
+ delta{delta},
+ buffer_size{buffer_size} {}
+TDigestOptions::TDigestOptions(std::vector<double> q, uint32_t delta,
+ uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{std::move(q)},
+ delta{delta},
+ buffer_size{buffer_size} {}
+constexpr char TDigestOptions::kTypeName[];
+
+IndexOptions::IndexOptions(std::shared_ptr<Scalar> value)
+ : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {}
+IndexOptions::IndexOptions() : IndexOptions(std::make_shared<NullScalar>()) {}
+constexpr char IndexOptions::kTypeName[];
+
+namespace internal {
+void RegisterAggregateOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kScalarAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType));
+}
+} // namespace internal
+
+// ----------------------------------------------------------------------
// Scalar aggregates
-Result<Datum> Count(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
+Result<Datum> Count(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
return CallFunction("count", {value}, &options, ctx);
}
-Result<Datum> Mean(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("mean", {value}, &options, ctx);
+Result<Datum> Mean(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("mean", {value}, &options, ctx);
}
-Result<Datum> Sum(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("sum", {value}, &options, ctx);
+Result<Datum> Sum(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("sum", {value}, &options, ctx);
}
-Result<Datum> MinMax(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
+Result<Datum> MinMax(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
return CallFunction("min_max", {value}, &options, ctx);
}
-Result<Datum> Any(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("any", {value}, &options, ctx);
-}
-
-Result<Datum> All(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("all", {value}, &options, ctx);
-}
-
-Result<Datum> Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) {
- return CallFunction("mode", {value}, &options, ctx);
+Result<Datum> Any(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("any", {value}, &options, ctx);
}
+Result<Datum> All(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("all", {value}, &options, ctx);
+}
+
+Result<Datum> Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) {
+ return CallFunction("mode", {value}, &options, ctx);
+}
+
Result<Datum> Stddev(const Datum& value, const VarianceOptions& options,
ExecContext* ctx) {
return CallFunction("stddev", {value}, &options, ctx);
@@ -179,19 +179,19 @@ Result<Datum> Variance(const Datum& value, const VarianceOptions& options,
return CallFunction("variance", {value}, &options, ctx);
}
-Result<Datum> Quantile(const Datum& value, const QuantileOptions& options,
- ExecContext* ctx) {
- return CallFunction("quantile", {value}, &options, ctx);
-}
-
-Result<Datum> TDigest(const Datum& value, const TDigestOptions& options,
- ExecContext* ctx) {
- return CallFunction("tdigest", {value}, &options, ctx);
-}
-
-Result<Datum> Index(const Datum& value, const IndexOptions& options, ExecContext* ctx) {
- return CallFunction("index", {value}, &options, ctx);
-}
-
+Result<Datum> Quantile(const Datum& value, const QuantileOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("quantile", {value}, &options, ctx);
+}
+
+Result<Datum> TDigest(const Datum& value, const TDigestOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("tdigest", {value}, &options, ctx);
+}
+
+Result<Datum> Index(const Datum& value, const IndexOptions& options, ExecContext* ctx) {
+ return CallFunction("index", {value}, &options, ctx);
+}
+
} // namespace compute
} // namespace arrow
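
These wrappers simply forward to CallFunction with the options attached. A hedged usage sketch, assuming a linked Arrow build with the compute module:

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <iostream>

arrow::Status AggregateDemo() {
  arrow::DoubleBuilder b;
  ARROW_RETURN_NOT_OK(b.AppendValues({1.0, 2.0, 4.0}));
  std::shared_ptr<arrow::Array> values;
  ARROW_RETURN_NOT_OK(b.Finish(&values));

  // Sum -> 7 (DoubleScalar), Mean -> ~2.333 (DoubleScalar).
  ARROW_ASSIGN_OR_RAISE(arrow::Datum sum, arrow::compute::Sum(values));
  ARROW_ASSIGN_OR_RAISE(arrow::Datum mean, arrow::compute::Mean(values));
  std::cout << sum.scalar()->ToString() << " " << mean.scalar()->ToString() << "\n";
  return arrow::Status::OK();
}
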
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
index 7a6c44bd923..37296779b2f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
@@ -40,108 +40,108 @@ class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
-/// \brief Control general scalar aggregate kernel behavior
-///
-/// By default, null values are ignored
-class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
- public:
- explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
- constexpr static char const kTypeName[] = "ScalarAggregateOptions";
- static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
-
- bool skip_nulls;
- uint32_t min_count;
+/// \brief Control general scalar aggregate kernel behavior
+///
+/// By default, null values are ignored
+class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
+ public:
+ explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
+ constexpr static char const kTypeName[] = "ScalarAggregateOptions";
+ static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
+
+ bool skip_nulls;
+ uint32_t min_count;
+};
+
+/// \brief Control Mode kernel behavior
+///
+/// Returns top-n common values and counts.
+/// By default, returns the most common value and count.
+class ARROW_EXPORT ModeOptions : public FunctionOptions {
+ public:
+ explicit ModeOptions(int64_t n = 1);
+ constexpr static char const kTypeName[] = "ModeOptions";
+ static ModeOptions Defaults() { return ModeOptions{}; }
+
+ int64_t n = 1;
};
-/// \brief Control Mode kernel behavior
-///
-/// Returns top-n common values and counts.
-/// By default, returns the most common value and count.
-class ARROW_EXPORT ModeOptions : public FunctionOptions {
- public:
- explicit ModeOptions(int64_t n = 1);
- constexpr static char const kTypeName[] = "ModeOptions";
- static ModeOptions Defaults() { return ModeOptions{}; }
-
- int64_t n = 1;
-};
-
-/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
-///
-/// The divisor used in calculations is N - ddof, where N is the number of elements.
-/// By default, ddof is zero, and population variance or stddev is returned.
-class ARROW_EXPORT VarianceOptions : public FunctionOptions {
- public:
- explicit VarianceOptions(int ddof = 0);
- constexpr static char const kTypeName[] = "VarianceOptions";
- static VarianceOptions Defaults() { return VarianceOptions{}; }
-
- int ddof = 0;
-};
-
-/// \brief Control Quantile kernel behavior
-///
-/// By default, returns the median value.
-class ARROW_EXPORT QuantileOptions : public FunctionOptions {
- public:
- /// Interpolation method to use when quantile lies between two data points
- enum Interpolation {
- LINEAR = 0,
- LOWER,
- HIGHER,
- NEAREST,
- MIDPOINT,
+/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
+///
+/// The divisor used in calculations is N - ddof, where N is the number of elements.
+/// By default, ddof is zero, and population variance or stddev is returned.
+class ARROW_EXPORT VarianceOptions : public FunctionOptions {
+ public:
+ explicit VarianceOptions(int ddof = 0);
+ constexpr static char const kTypeName[] = "VarianceOptions";
+ static VarianceOptions Defaults() { return VarianceOptions{}; }
+
+ int ddof = 0;
+};
+
+/// \brief Control Quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT QuantileOptions : public FunctionOptions {
+ public:
+ /// Interpolation method to use when quantile lies between two data points
+ enum Interpolation {
+ LINEAR = 0,
+ LOWER,
+ HIGHER,
+ NEAREST,
+ MIDPOINT,
};
- explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR);
-
- explicit QuantileOptions(std::vector<double> q,
- enum Interpolation interpolation = LINEAR);
-
- constexpr static char const kTypeName[] = "QuantileOptions";
- static QuantileOptions Defaults() { return QuantileOptions{}; }
+ explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR);
- /// quantile must be between 0 and 1 inclusive
- std::vector<double> q;
- enum Interpolation interpolation;
-};
-
-/// \brief Control TDigest approximate quantile kernel behavior
-///
-/// By default, returns the median value.
-class ARROW_EXPORT TDigestOptions : public FunctionOptions {
- public:
- explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
- uint32_t buffer_size = 500);
- explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
- uint32_t buffer_size = 500);
- constexpr static char const kTypeName[] = "TDigestOptions";
- static TDigestOptions Defaults() { return TDigestOptions{}; }
+ explicit QuantileOptions(std::vector<double> q,
+ enum Interpolation interpolation = LINEAR);
- /// quantile must be between 0 and 1 inclusive
- std::vector<double> q;
- /// compression parameter, default 100
- uint32_t delta;
- /// input buffer size, default 500
- uint32_t buffer_size;
+ constexpr static char const kTypeName[] = "QuantileOptions";
+ static QuantileOptions Defaults() { return QuantileOptions{}; }
+
+ /// quantile must be between 0 and 1 inclusive
+ std::vector<double> q;
+ enum Interpolation interpolation;
};
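
A hedged usage sketch of QuantileOptions with several q values and a non-default interpolation, assuming a linked Arrow build and a numeric input array:

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Status QuantileDemo(const std::shared_ptr<arrow::Array>& values) {
  arrow::compute::QuantileOptions opts(
      /*q=*/{0.25, 0.5, 0.75}, arrow::compute::QuantileOptions::MIDPOINT);
  // The result datum holds an array with one slot per requested quantile.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum quantiles,
                        arrow::compute::Quantile(values, opts));
  (void)quantiles;
  return arrow::Status::OK();
}
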
-/// \brief Control Index kernel behavior
-class ARROW_EXPORT IndexOptions : public FunctionOptions {
- public:
- explicit IndexOptions(std::shared_ptr<Scalar> value);
- // Default constructor for serialization
- IndexOptions();
- constexpr static char const kTypeName[] = "IndexOptions";
-
- std::shared_ptr<Scalar> value;
+/// \brief Control TDigest approximate quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT TDigestOptions : public FunctionOptions {
+ public:
+ explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
+ uint32_t buffer_size = 500);
+ explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
+ uint32_t buffer_size = 500);
+ constexpr static char const kTypeName[] = "TDigestOptions";
+ static TDigestOptions Defaults() { return TDigestOptions{}; }
+
+ /// quantile must be between 0 and 1 inclusive
+ std::vector<double> q;
+ /// compression parameter, default 100
+ uint32_t delta;
+ /// input buffer size, default 500
+ uint32_t buffer_size;
+};
+
+/// \brief Control Index kernel behavior
+class ARROW_EXPORT IndexOptions : public FunctionOptions {
+ public:
+ explicit IndexOptions(std::shared_ptr<Scalar> value);
+ // Default constructor for serialization
+ IndexOptions();
+ constexpr static char const kTypeName[] = "IndexOptions";
+
+ std::shared_ptr<Scalar> value;
};
/// @}
/// \brief Count non-null (or null) values in an array.
///
-/// \param[in] options counting options, see ScalarAggregateOptions for more information
+/// \param[in] options counting options, see ScalarAggregateOptions for more information
/// \param[in] datum to count
/// \param[in] ctx the function execution context, optional
/// \return out resulting datum
@@ -149,40 +149,40 @@ class ARROW_EXPORT IndexOptions : public FunctionOptions {
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Count(
- const Datum& datum,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Count(
+ const Datum& datum,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
/// \brief Compute the mean of a numeric array.
///
/// \param[in] value datum to compute the mean, expecting Array
-/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed mean as a DoubleScalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Mean(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Mean(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
/// \brief Sum values of a numeric array.
///
/// \param[in] value datum to sum, expecting Array or ChunkedArray
-/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed sum as a Scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Sum(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Sum(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
/// \brief Calculate the min / max of a numeric array
///
@@ -190,78 +190,78 @@ Result<Datum> Sum(
/// struct<min: T, max: T>, where T is the input type
///
/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a struct<min: T, max: T> scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> MinMax(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Test whether any element in a boolean array evaluates to true.
-///
-/// This function returns true if any of the elements in the array evaluates
-/// to true, and false otherwise. Null values are ignored by default.
-/// If null values are taken into account (by setting the ScalarAggregateOptions
-/// parameter skip_nulls = false), then Kleene logic is used.
-/// See KleeneOr for more details on Kleene logic.
-///
-/// \param[in] value input datum, expecting a boolean array
-/// \param[in] options see ScalarAggregateOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as a BooleanScalar
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Any(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Test whether all elements in a boolean array evaluate to true.
-///
-/// This function returns true if all of the elements in the array evaluate
-/// to true, and false otherwise. Null values are ignored by default.
-/// If null values are taken into account (by setting the ScalarAggregateOptions
-/// parameter skip_nulls = false), then Kleene logic is used.
-/// See KleeneAnd for more details on Kleene logic.
-///
-/// \param[in] value input datum, expecting a boolean array
-/// \param[in] options see ScalarAggregateOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as a BooleanScalar
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> All(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
+Result<Datum> MinMax(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Test whether any element in a boolean array evaluates to true.
+///
+/// This function returns true if any of the elements in the array evaluates
+/// to true, and false otherwise. Null values are ignored by default.
+/// If null values are taken into account (by setting the ScalarAggregateOptions
+/// parameter skip_nulls = false), then Kleene logic is used.
+/// See KleeneOr for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Any(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Test whether all elements in a boolean array evaluate to true.
+///
+/// This function returns true if all of the elements in the array evaluate
+/// to true, and false otherwise. Null values are ignored by default.
+/// If null values are taken into account (by setting the ScalarAggregateOptions
+/// parameter skip_nulls = false), then Kleene logic is used.
+/// See KleeneAnd for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> All(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
/// \brief Calculate the modal (most common) value of a numeric array
///
-/// This function returns the top-n most common values and the number of times they
-/// occur as an array of `struct<mode: T, count: int64>`, where T is the input type.
-/// Values with larger counts are returned before smaller ones.
-/// If more than one value has the same count, the smaller value is returned first.
+/// This function returns the top-n most common values and the number of times they
+/// occur as an array of `struct<mode: T, count: int64>`, where T is the input type.
+/// Values with larger counts are returned before smaller ones.
+/// If more than one value has the same count, the smaller value is returned first.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see ModeOptions for more information
+/// \param[in] options see ModeOptions for more information
/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as an array of struct<mode: T, count: int64>
+/// \return resulting datum as an array of struct<mode: T, count: int64>
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Mode(const Datum& value,
- const ModeOptions& options = ModeOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Mode(const Datum& value,
+ const ModeOptions& options = ModeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
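
A hedged sketch of Mode with n > 1, assuming a linked Arrow build:

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Status ModeDemo(const std::shared_ptr<arrow::Array>& values) {
  // Ask for the two most common values; the result is an array of
  // struct<mode: T, count: int64>, most frequent first.
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum top2,
      arrow::compute::Mode(values, arrow::compute::ModeOptions(/*n=*/2)));
  (void)top2;
  return arrow::Status::OK();
}
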
/// \brief Calculate the standard deviation of a numeric array
///
@@ -291,143 +291,143 @@ Result<Datum> Variance(const Datum& value,
const VarianceOptions& options = VarianceOptions::Defaults(),
ExecContext* ctx = NULLPTR);
-/// \brief Calculate the quantiles of a numeric array
-///
-/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see QuantileOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as an array
-///
-/// \since 4.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Quantile(const Datum& value,
- const QuantileOptions& options = QuantileOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
-///
-/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see TDigestOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as an array
-///
-/// \since 4.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> TDigest(const Datum& value,
- const TDigestOptions& options = TDigestOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Find the first index of a value in an array.
-///
-/// \param[in] value The array to search.
-/// \param[in] options The array to search for. See IndexOoptions.
-/// \param[in] ctx the function execution context, optional
-/// \return out a Scalar containing the index (or -1 if not found).
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Index(const Datum& value, const IndexOptions& options,
- ExecContext* ctx = NULLPTR);
-
-namespace internal {
-
-/// Internal use only: streaming group identifier.
-/// Consumes batches of keys and yields batches of the group ids.
-class ARROW_EXPORT Grouper {
- public:
- virtual ~Grouper() = default;
-
- /// Construct a Grouper which receives the specified key types
- static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
- ExecContext* ctx = default_exec_context());
-
- /// Consume a batch of keys, producing the corresponding group ids as an integer array.
- /// Currently only uint32 indices will be produced, eventually the bit width will only
- /// be as wide as necessary.
- virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
-
- /// Get current unique keys. May be called multiple times.
- virtual Result<ExecBatch> GetUniques() = 0;
-
- /// Get the current number of groups.
- virtual uint32_t num_groups() const = 0;
-
- /// \brief Assemble lists of indices of identical elements.
- ///
- /// \param[in] ids An unsigned, all-valid integral array which will be
- /// used as grouping criteria.
- /// \param[in] num_groups An upper bound for the elements of ids
- /// \return A num_groups-long ListArray where the slot at i contains a
- /// list of indices where i appears in ids.
- ///
- /// MakeGroupings([
- /// 2,
- /// 2,
- /// 5,
- /// 5,
- /// 2,
- /// 3
- /// ], 8) == [
- /// [],
- /// [],
- /// [0, 1, 4],
- /// [5],
- /// [],
- /// [2, 3],
- /// [],
- /// []
- /// ]
- static Result<std::shared_ptr<ListArray>> MakeGroupings(
- const UInt32Array& ids, uint32_t num_groups,
- ExecContext* ctx = default_exec_context());
-
- /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
- /// the provided groupings.
- ///
- /// For example,
- /// ApplyGroupings([
- /// [],
- /// [],
- /// [0, 1, 4],
- /// [5],
- /// [],
- /// [2, 3],
- /// [],
- /// []
- /// ], [2, 2, 5, 5, 2, 3]) == [
- /// [],
- /// [],
- /// [2, 2, 2],
- /// [3],
- /// [],
- /// [5, 5],
- /// [],
- /// []
- /// ]
- static Result<std::shared_ptr<ListArray>> ApplyGroupings(
- const ListArray& groupings, const Array& array,
- ExecContext* ctx = default_exec_context());
-};
-
-/// \brief Configure a grouped aggregation
-struct ARROW_EXPORT Aggregate {
- /// the name of the aggregation function
- std::string function;
-
- /// options for the aggregation function
- const FunctionOptions* options;
-};
-
-/// Internal use only: helper function for testing HashAggregateKernels.
-/// This will be replaced by streaming execution operators.
-ARROW_EXPORT
-Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
- const std::vector<Aggregate>& aggregates,
- ExecContext* ctx = default_exec_context());
-
-} // namespace internal
+/// \brief Calculate the quantiles of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see QuantileOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Quantile(const Datum& value,
+ const QuantileOptions& options = QuantileOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see TDigestOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> TDigest(const Datum& value,
+ const TDigestOptions& options = TDigestOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
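
A short sketch contrasting the two quantile entry points above (hedged: QuantileOptions and TDigestOptions accepting a single probability is assumed from their Defaults() factories; Quantile is exact, TDigest approximate, per the doc comments):

    // Exact median via Quantile, approximate median via TDigest.
    arrow::Result<arrow::Datum> ExactMedian(const arrow::Datum& values) {
      return arrow::compute::Quantile(values, arrow::compute::QuantileOptions(0.5));
    }
    arrow::Result<arrow::Datum> ApproxMedian(const arrow::Datum& values) {
      return arrow::compute::TDigest(values, arrow::compute::TDigestOptions(0.5));
    }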
+
+/// \brief Find the first index of a value in an array.
+///
+/// \param[in] value The array to search.
+/// \param[in] options The value to search for. See IndexOptions.
+/// \param[in] ctx the function execution context, optional
+/// \return a Scalar containing the index (or -1 if not found).
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Index(const Datum& value, const IndexOptions& options,
+ ExecContext* ctx = NULLPTR);
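
A usage sketch of Index (hedged: IndexOptions wrapping the scalar to search for is an assumption; `FirstIndexOf42` is an illustrative name):

    // First position of 42 in an int64 array; the result is -1 when absent.
    arrow::Result<int64_t> FirstIndexOf42(const std::shared_ptr<arrow::Array>& values) {
      arrow::compute::IndexOptions options(std::make_shared<arrow::Int64Scalar>(42));
      ARROW_ASSIGN_OR_RAISE(arrow::Datum out, arrow::compute::Index(values, options));
      return out.scalar_as<arrow::Int64Scalar>().value;
    }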
+
+namespace internal {
+
+/// Internal use only: streaming group identifier.
+/// Consumes batches of keys and yields batches of the group ids.
+class ARROW_EXPORT Grouper {
+ public:
+ virtual ~Grouper() = default;
+
+ /// Construct a Grouper which receives the specified key types
+ static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
+ ExecContext* ctx = default_exec_context());
+
+ /// Consume a batch of keys, producing the corresponding group ids as an integer array.
+  /// Currently only uint32 indices will be produced; eventually the bit width will
+  /// only be as wide as necessary.
+ virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
+
+ /// Get current unique keys. May be called multiple times.
+ virtual Result<ExecBatch> GetUniques() = 0;
+
+ /// Get the current number of groups.
+ virtual uint32_t num_groups() const = 0;
+
+ /// \brief Assemble lists of indices of identical elements.
+ ///
+ /// \param[in] ids An unsigned, all-valid integral array which will be
+ /// used as grouping criteria.
+ /// \param[in] num_groups An upper bound for the elements of ids
+ /// \return A num_groups-long ListArray where the slot at i contains a
+ /// list of indices where i appears in ids.
+ ///
+ /// MakeGroupings([
+ /// 2,
+ /// 2,
+ /// 5,
+ /// 5,
+ /// 2,
+ /// 3
+ /// ], 8) == [
+ /// [],
+ /// [],
+ /// [0, 1, 4],
+ /// [5],
+ /// [],
+ /// [2, 3],
+ /// [],
+ /// []
+ /// ]
+ static Result<std::shared_ptr<ListArray>> MakeGroupings(
+ const UInt32Array& ids, uint32_t num_groups,
+ ExecContext* ctx = default_exec_context());
+
+ /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
+ /// the provided groupings.
+ ///
+ /// For example,
+ /// ApplyGroupings([
+ /// [],
+ /// [],
+ /// [0, 1, 4],
+ /// [5],
+ /// [],
+ /// [2, 3],
+ /// [],
+ /// []
+ /// ], [2, 2, 5, 5, 2, 3]) == [
+ /// [],
+ /// [],
+ /// [2, 2, 2],
+ /// [3],
+ /// [],
+ /// [5, 5],
+ /// [],
+ /// []
+ /// ]
+ static Result<std::shared_ptr<ListArray>> ApplyGroupings(
+ const ListArray& groupings, const Array& array,
+ ExecContext* ctx = default_exec_context());
+};
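
To make the Grouper lifecycle concrete, a hedged sketch of the Make/Consume/GetUniques sequence declared above (key construction is elided; ValueDescr::Array and the single-column ExecBatch are assumptions about the surrounding API):

    // Map one key column to dense uint32 group ids, then fetch the unique keys.
    arrow::Status SketchGroupIds(const std::shared_ptr<arrow::Array>& keys) {
      using arrow::compute::internal::Grouper;
      ARROW_ASSIGN_OR_RAISE(auto grouper,
                            Grouper::Make({arrow::ValueDescr::Array(keys->type())}));
      arrow::compute::ExecBatch batch({keys}, keys->length());
      ARROW_ASSIGN_OR_RAISE(arrow::Datum ids, grouper->Consume(batch));
      ARROW_ASSIGN_OR_RAISE(arrow::compute::ExecBatch uniques, grouper->GetUniques());
      // ids[i] identifies the group of batch row i; uniques holds one row per group.
      return arrow::Status::OK();
    }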
+
+/// \brief Configure a grouped aggregation
+struct ARROW_EXPORT Aggregate {
+ /// the name of the aggregation function
+ std::string function;
+
+ /// options for the aggregation function
+ const FunctionOptions* options;
+};
+
+/// Internal use only: helper function for testing HashAggregateKernels.
+/// This will be replaced by streaming execution operators.
+ARROW_EXPORT
+Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
+ const std::vector<Aggregate>& aggregates,
+ ExecContext* ctx = default_exec_context());
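
Since GroupBy is documented as a testing-only helper, the following is a hedged sketch of how it composes with Aggregate (the "hash_sum" kernel name is an assumption about the function registry, not something this header guarantees):

    // Sum `values` grouped by `keys` using the testing helper above.
    arrow::Result<arrow::Datum> SumByKey(const arrow::Datum& values,
                                         const arrow::Datum& keys) {
      std::vector<arrow::compute::internal::Aggregate> aggs = {{"hash_sum", nullptr}};
      return arrow::compute::internal::GroupBy({values}, {keys}, aggs);
    }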
+
+} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
index 1feb4e7eee0..989ca2b3937 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
@@ -21,287 +21,287 @@
#include <sstream>
#include <string>
-#include "arrow/array/array_base.h"
+#include "arrow/array/array_base.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/registry.h"
-#include "arrow/compute/util_internal.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
#include "arrow/status.h"
#include "arrow/type.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
namespace arrow {
-
-namespace internal {
-template <>
-struct EnumTraits<compute::JoinOptions::NullHandlingBehavior>
- : BasicEnumTraits<compute::JoinOptions::NullHandlingBehavior,
- compute::JoinOptions::NullHandlingBehavior::EMIT_NULL,
- compute::JoinOptions::NullHandlingBehavior::SKIP,
- compute::JoinOptions::NullHandlingBehavior::REPLACE> {
- static std::string name() { return "JoinOptions::NullHandlingBehavior"; }
- static std::string value_name(compute::JoinOptions::NullHandlingBehavior value) {
- switch (value) {
- case compute::JoinOptions::NullHandlingBehavior::EMIT_NULL:
- return "EMIT_NULL";
- case compute::JoinOptions::NullHandlingBehavior::SKIP:
- return "SKIP";
- case compute::JoinOptions::NullHandlingBehavior::REPLACE:
- return "REPLACE";
- }
- return "<INVALID>";
- }
-};
-template <>
-struct EnumTraits<TimeUnit::type>
- : BasicEnumTraits<TimeUnit::type, TimeUnit::type::SECOND, TimeUnit::type::MILLI,
- TimeUnit::type::MICRO, TimeUnit::type::NANO> {
- static std::string name() { return "TimeUnit::type"; }
- static std::string value_name(TimeUnit::type value) {
- switch (value) {
- case TimeUnit::type::SECOND:
- return "SECOND";
- case TimeUnit::type::MILLI:
- return "MILLI";
- case TimeUnit::type::MICRO:
- return "MICRO";
- case TimeUnit::type::NANO:
- return "NANO";
- }
- return "<INVALID>";
- }
-};
-template <>
-struct EnumTraits<compute::CompareOperator>
- : BasicEnumTraits<
- compute::CompareOperator, compute::CompareOperator::EQUAL,
- compute::CompareOperator::NOT_EQUAL, compute::CompareOperator::GREATER,
- compute::CompareOperator::GREATER_EQUAL, compute::CompareOperator::LESS,
- compute::CompareOperator::LESS_EQUAL> {
- static std::string name() { return "compute::CompareOperator"; }
- static std::string value_name(compute::CompareOperator value) {
- switch (value) {
- case compute::CompareOperator::EQUAL:
- return "EQUAL";
- case compute::CompareOperator::NOT_EQUAL:
- return "NOT_EQUAL";
- case compute::CompareOperator::GREATER:
- return "GREATER";
- case compute::CompareOperator::GREATER_EQUAL:
- return "GREATER_EQUAL";
- case compute::CompareOperator::LESS:
- return "LESS";
- case compute::CompareOperator::LESS_EQUAL:
- return "LESS_EQUAL";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
+
+namespace internal {
+template <>
+struct EnumTraits<compute::JoinOptions::NullHandlingBehavior>
+ : BasicEnumTraits<compute::JoinOptions::NullHandlingBehavior,
+ compute::JoinOptions::NullHandlingBehavior::EMIT_NULL,
+ compute::JoinOptions::NullHandlingBehavior::SKIP,
+ compute::JoinOptions::NullHandlingBehavior::REPLACE> {
+ static std::string name() { return "JoinOptions::NullHandlingBehavior"; }
+ static std::string value_name(compute::JoinOptions::NullHandlingBehavior value) {
+ switch (value) {
+ case compute::JoinOptions::NullHandlingBehavior::EMIT_NULL:
+ return "EMIT_NULL";
+ case compute::JoinOptions::NullHandlingBehavior::SKIP:
+ return "SKIP";
+ case compute::JoinOptions::NullHandlingBehavior::REPLACE:
+ return "REPLACE";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<TimeUnit::type>
+ : BasicEnumTraits<TimeUnit::type, TimeUnit::type::SECOND, TimeUnit::type::MILLI,
+ TimeUnit::type::MICRO, TimeUnit::type::NANO> {
+ static std::string name() { return "TimeUnit::type"; }
+ static std::string value_name(TimeUnit::type value) {
+ switch (value) {
+ case TimeUnit::type::SECOND:
+ return "SECOND";
+ case TimeUnit::type::MILLI:
+ return "MILLI";
+ case TimeUnit::type::MICRO:
+ return "MICRO";
+ case TimeUnit::type::NANO:
+ return "NANO";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<compute::CompareOperator>
+ : BasicEnumTraits<
+ compute::CompareOperator, compute::CompareOperator::EQUAL,
+ compute::CompareOperator::NOT_EQUAL, compute::CompareOperator::GREATER,
+ compute::CompareOperator::GREATER_EQUAL, compute::CompareOperator::LESS,
+ compute::CompareOperator::LESS_EQUAL> {
+ static std::string name() { return "compute::CompareOperator"; }
+ static std::string value_name(compute::CompareOperator value) {
+ switch (value) {
+ case compute::CompareOperator::EQUAL:
+ return "EQUAL";
+ case compute::CompareOperator::NOT_EQUAL:
+ return "NOT_EQUAL";
+ case compute::CompareOperator::GREATER:
+ return "GREATER";
+ case compute::CompareOperator::GREATER_EQUAL:
+ return "GREATER_EQUAL";
+ case compute::CompareOperator::LESS:
+ return "LESS";
+ case compute::CompareOperator::LESS_EQUAL:
+ return "LESS_EQUAL";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
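The EnumTraits specializations above all follow one pattern: list the enum's values as BasicEnumTraits template arguments, then supply name() and value_name(). A sketch of the same pattern for a hypothetical enum (Color is invented purely to show the shape; a real specialization lives in arrow::internal like those above):

    enum class Color { RED, GREEN };

    template <>
    struct EnumTraits<Color> : BasicEnumTraits<Color, Color::RED, Color::GREEN> {
      static std::string name() { return "Color"; }
      static std::string value_name(Color value) {
        switch (value) {
          case Color::RED:
            return "RED";
          case Color::GREEN:
            return "GREEN";
        }
        return "<INVALID>";
      }
    };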
namespace compute {
-// ----------------------------------------------------------------------
-// Function options
-
-using ::arrow::internal::checked_cast;
-
-namespace internal {
-namespace {
-using ::arrow::internal::DataMember;
-static auto kArithmeticOptionsType = GetFunctionOptionsType<ArithmeticOptions>(
- DataMember("check_overflow", &ArithmeticOptions::check_overflow));
-static auto kElementWiseAggregateOptionsType =
- GetFunctionOptionsType<ElementWiseAggregateOptions>(
- DataMember("skip_nulls", &ElementWiseAggregateOptions::skip_nulls));
-static auto kJoinOptionsType = GetFunctionOptionsType<JoinOptions>(
- DataMember("null_handling", &JoinOptions::null_handling),
- DataMember("null_replacement", &JoinOptions::null_replacement));
-static auto kMatchSubstringOptionsType = GetFunctionOptionsType<MatchSubstringOptions>(
- DataMember("pattern", &MatchSubstringOptions::pattern),
- DataMember("ignore_case", &MatchSubstringOptions::ignore_case));
-static auto kSplitOptionsType = GetFunctionOptionsType<SplitOptions>(
- DataMember("max_splits", &SplitOptions::max_splits),
- DataMember("reverse", &SplitOptions::reverse));
-static auto kSplitPatternOptionsType = GetFunctionOptionsType<SplitPatternOptions>(
- DataMember("pattern", &SplitPatternOptions::pattern),
- DataMember("max_splits", &SplitPatternOptions::max_splits),
- DataMember("reverse", &SplitPatternOptions::reverse));
-static auto kReplaceSliceOptionsType = GetFunctionOptionsType<ReplaceSliceOptions>(
- DataMember("start", &ReplaceSliceOptions::start),
- DataMember("stop", &ReplaceSliceOptions::stop),
- DataMember("replacement", &ReplaceSliceOptions::replacement));
-static auto kReplaceSubstringOptionsType =
- GetFunctionOptionsType<ReplaceSubstringOptions>(
- DataMember("pattern", &ReplaceSubstringOptions::pattern),
- DataMember("replacement", &ReplaceSubstringOptions::replacement),
- DataMember("max_replacements", &ReplaceSubstringOptions::max_replacements));
-static auto kExtractRegexOptionsType = GetFunctionOptionsType<ExtractRegexOptions>(
- DataMember("pattern", &ExtractRegexOptions::pattern));
-static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
- DataMember("value_set", &SetLookupOptions::value_set),
- DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
-static auto kStrptimeOptionsType = GetFunctionOptionsType<StrptimeOptions>(
- DataMember("format", &StrptimeOptions::format),
- DataMember("unit", &StrptimeOptions::unit));
-static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
- DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
-static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
- DataMember("characters", &TrimOptions::characters));
-static auto kSliceOptionsType = GetFunctionOptionsType<SliceOptions>(
- DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
- DataMember("step", &SliceOptions::step));
-static auto kMakeStructOptionsType = GetFunctionOptionsType<MakeStructOptions>(
- DataMember("field_names", &MakeStructOptions::field_names),
- DataMember("field_nullability", &MakeStructOptions::field_nullability),
- DataMember("field_metadata", &MakeStructOptions::field_metadata));
-static auto kDayOfWeekOptionsType = GetFunctionOptionsType<DayOfWeekOptions>(
- DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering),
- DataMember("week_start", &DayOfWeekOptions::week_start));
-} // namespace
-} // namespace internal
-
-ArithmeticOptions::ArithmeticOptions(bool check_overflow)
- : FunctionOptions(internal::kArithmeticOptionsType), check_overflow(check_overflow) {}
-constexpr char ArithmeticOptions::kTypeName[];
-
-ElementWiseAggregateOptions::ElementWiseAggregateOptions(bool skip_nulls)
- : FunctionOptions(internal::kElementWiseAggregateOptionsType),
- skip_nulls(skip_nulls) {}
-constexpr char ElementWiseAggregateOptions::kTypeName[];
-
-JoinOptions::JoinOptions(NullHandlingBehavior null_handling, std::string null_replacement)
- : FunctionOptions(internal::kJoinOptionsType),
- null_handling(null_handling),
- null_replacement(std::move(null_replacement)) {}
-constexpr char JoinOptions::kTypeName[];
-
-MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case)
- : FunctionOptions(internal::kMatchSubstringOptionsType),
- pattern(std::move(pattern)),
- ignore_case(ignore_case) {}
-MatchSubstringOptions::MatchSubstringOptions() : MatchSubstringOptions("", false) {}
-constexpr char MatchSubstringOptions::kTypeName[];
-
-SplitOptions::SplitOptions(int64_t max_splits, bool reverse)
- : FunctionOptions(internal::kSplitOptionsType),
- max_splits(max_splits),
- reverse(reverse) {}
-constexpr char SplitOptions::kTypeName[];
-
-SplitPatternOptions::SplitPatternOptions(std::string pattern, int64_t max_splits,
- bool reverse)
- : FunctionOptions(internal::kSplitPatternOptionsType),
- pattern(std::move(pattern)),
- max_splits(max_splits),
- reverse(reverse) {}
-SplitPatternOptions::SplitPatternOptions() : SplitPatternOptions("", -1, false) {}
-constexpr char SplitPatternOptions::kTypeName[];
-
-ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop,
- std::string replacement)
- : FunctionOptions(internal::kReplaceSliceOptionsType),
- start(start),
- stop(stop),
- replacement(std::move(replacement)) {}
-ReplaceSliceOptions::ReplaceSliceOptions() : ReplaceSliceOptions(0, 0, "") {}
-constexpr char ReplaceSliceOptions::kTypeName[];
-
-ReplaceSubstringOptions::ReplaceSubstringOptions(std::string pattern,
- std::string replacement,
- int64_t max_replacements)
- : FunctionOptions(internal::kReplaceSubstringOptionsType),
- pattern(std::move(pattern)),
- replacement(std::move(replacement)),
- max_replacements(max_replacements) {}
-ReplaceSubstringOptions::ReplaceSubstringOptions()
- : ReplaceSubstringOptions("", "", -1) {}
-constexpr char ReplaceSubstringOptions::kTypeName[];
-
-ExtractRegexOptions::ExtractRegexOptions(std::string pattern)
- : FunctionOptions(internal::kExtractRegexOptionsType), pattern(std::move(pattern)) {}
-ExtractRegexOptions::ExtractRegexOptions() : ExtractRegexOptions("") {}
-constexpr char ExtractRegexOptions::kTypeName[];
-
-SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
- : FunctionOptions(internal::kSetLookupOptionsType),
- value_set(std::move(value_set)),
- skip_nulls(skip_nulls) {}
-SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
-constexpr char SetLookupOptions::kTypeName[];
-
-StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
- : FunctionOptions(internal::kStrptimeOptionsType),
- format(std::move(format)),
- unit(unit) {}
-StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
-constexpr char StrptimeOptions::kTypeName[];
-
-PadOptions::PadOptions(int64_t width, std::string padding)
- : FunctionOptions(internal::kPadOptionsType),
- width(width),
- padding(std::move(padding)) {}
-PadOptions::PadOptions() : PadOptions(0, " ") {}
-constexpr char PadOptions::kTypeName[];
-
-TrimOptions::TrimOptions(std::string characters)
- : FunctionOptions(internal::kTrimOptionsType), characters(std::move(characters)) {}
-TrimOptions::TrimOptions() : TrimOptions("") {}
-constexpr char TrimOptions::kTypeName[];
-
-SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
- : FunctionOptions(internal::kSliceOptionsType),
- start(start),
- stop(stop),
- step(step) {}
-SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {}
-constexpr char SliceOptions::kTypeName[];
-
-MakeStructOptions::MakeStructOptions(
- std::vector<std::string> n, std::vector<bool> r,
- std::vector<std::shared_ptr<const KeyValueMetadata>> m)
- : FunctionOptions(internal::kMakeStructOptionsType),
- field_names(std::move(n)),
- field_nullability(std::move(r)),
- field_metadata(std::move(m)) {}
-
-MakeStructOptions::MakeStructOptions(std::vector<std::string> n)
- : FunctionOptions(internal::kMakeStructOptionsType),
- field_names(std::move(n)),
- field_nullability(field_names.size(), true),
- field_metadata(field_names.size(), NULLPTR) {}
-
-MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector<std::string>()) {}
-constexpr char MakeStructOptions::kTypeName[];
-
-DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start)
- : FunctionOptions(internal::kDayOfWeekOptionsType),
- one_based_numbering(one_based_numbering),
- week_start(week_start) {}
-constexpr char DayOfWeekOptions::kTypeName[];
-
-namespace internal {
-void RegisterScalarOptions(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSplitOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSplitPatternOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType));
-}
-} // namespace internal
-
+// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kArithmeticOptionsType = GetFunctionOptionsType<ArithmeticOptions>(
+ DataMember("check_overflow", &ArithmeticOptions::check_overflow));
+static auto kElementWiseAggregateOptionsType =
+ GetFunctionOptionsType<ElementWiseAggregateOptions>(
+ DataMember("skip_nulls", &ElementWiseAggregateOptions::skip_nulls));
+static auto kJoinOptionsType = GetFunctionOptionsType<JoinOptions>(
+ DataMember("null_handling", &JoinOptions::null_handling),
+ DataMember("null_replacement", &JoinOptions::null_replacement));
+static auto kMatchSubstringOptionsType = GetFunctionOptionsType<MatchSubstringOptions>(
+ DataMember("pattern", &MatchSubstringOptions::pattern),
+ DataMember("ignore_case", &MatchSubstringOptions::ignore_case));
+static auto kSplitOptionsType = GetFunctionOptionsType<SplitOptions>(
+ DataMember("max_splits", &SplitOptions::max_splits),
+ DataMember("reverse", &SplitOptions::reverse));
+static auto kSplitPatternOptionsType = GetFunctionOptionsType<SplitPatternOptions>(
+ DataMember("pattern", &SplitPatternOptions::pattern),
+ DataMember("max_splits", &SplitPatternOptions::max_splits),
+ DataMember("reverse", &SplitPatternOptions::reverse));
+static auto kReplaceSliceOptionsType = GetFunctionOptionsType<ReplaceSliceOptions>(
+ DataMember("start", &ReplaceSliceOptions::start),
+ DataMember("stop", &ReplaceSliceOptions::stop),
+ DataMember("replacement", &ReplaceSliceOptions::replacement));
+static auto kReplaceSubstringOptionsType =
+ GetFunctionOptionsType<ReplaceSubstringOptions>(
+ DataMember("pattern", &ReplaceSubstringOptions::pattern),
+ DataMember("replacement", &ReplaceSubstringOptions::replacement),
+ DataMember("max_replacements", &ReplaceSubstringOptions::max_replacements));
+static auto kExtractRegexOptionsType = GetFunctionOptionsType<ExtractRegexOptions>(
+ DataMember("pattern", &ExtractRegexOptions::pattern));
+static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
+ DataMember("value_set", &SetLookupOptions::value_set),
+ DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
+static auto kStrptimeOptionsType = GetFunctionOptionsType<StrptimeOptions>(
+ DataMember("format", &StrptimeOptions::format),
+ DataMember("unit", &StrptimeOptions::unit));
+static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
+ DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
+static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
+ DataMember("characters", &TrimOptions::characters));
+static auto kSliceOptionsType = GetFunctionOptionsType<SliceOptions>(
+ DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
+ DataMember("step", &SliceOptions::step));
+static auto kMakeStructOptionsType = GetFunctionOptionsType<MakeStructOptions>(
+ DataMember("field_names", &MakeStructOptions::field_names),
+ DataMember("field_nullability", &MakeStructOptions::field_nullability),
+ DataMember("field_metadata", &MakeStructOptions::field_metadata));
+static auto kDayOfWeekOptionsType = GetFunctionOptionsType<DayOfWeekOptions>(
+ DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering),
+ DataMember("week_start", &DayOfWeekOptions::week_start));
+} // namespace
+} // namespace internal
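
Each entry above pairs a field name with a pointer-to-member via DataMember; that single registration is what gives a FunctionOptions subclass its equality, printing, and serialization support. A hedged sketch of the same recipe for an invented options type (FooOptions does not exist; this only mirrors the structure of the entries above and may need adjustment to compile standalone):

    struct FooOptions : public FunctionOptions {
      FooOptions();
      constexpr static char const kTypeName[] = "FooOptions";
      int64_t limit = 0;
      bool strict = false;
    };
    static auto kFooOptionsType = GetFunctionOptionsType<FooOptions>(
        DataMember("limit", &FooOptions::limit),
        DataMember("strict", &FooOptions::strict));
    FooOptions::FooOptions() : FunctionOptions(kFooOptionsType) {}
    constexpr char FooOptions::kTypeName[];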
+
+ArithmeticOptions::ArithmeticOptions(bool check_overflow)
+ : FunctionOptions(internal::kArithmeticOptionsType), check_overflow(check_overflow) {}
+constexpr char ArithmeticOptions::kTypeName[];
+
+ElementWiseAggregateOptions::ElementWiseAggregateOptions(bool skip_nulls)
+ : FunctionOptions(internal::kElementWiseAggregateOptionsType),
+ skip_nulls(skip_nulls) {}
+constexpr char ElementWiseAggregateOptions::kTypeName[];
+
+JoinOptions::JoinOptions(NullHandlingBehavior null_handling, std::string null_replacement)
+ : FunctionOptions(internal::kJoinOptionsType),
+ null_handling(null_handling),
+ null_replacement(std::move(null_replacement)) {}
+constexpr char JoinOptions::kTypeName[];
+
+MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case)
+ : FunctionOptions(internal::kMatchSubstringOptionsType),
+ pattern(std::move(pattern)),
+ ignore_case(ignore_case) {}
+MatchSubstringOptions::MatchSubstringOptions() : MatchSubstringOptions("", false) {}
+constexpr char MatchSubstringOptions::kTypeName[];
+
+SplitOptions::SplitOptions(int64_t max_splits, bool reverse)
+ : FunctionOptions(internal::kSplitOptionsType),
+ max_splits(max_splits),
+ reverse(reverse) {}
+constexpr char SplitOptions::kTypeName[];
+
+SplitPatternOptions::SplitPatternOptions(std::string pattern, int64_t max_splits,
+ bool reverse)
+ : FunctionOptions(internal::kSplitPatternOptionsType),
+ pattern(std::move(pattern)),
+ max_splits(max_splits),
+ reverse(reverse) {}
+SplitPatternOptions::SplitPatternOptions() : SplitPatternOptions("", -1, false) {}
+constexpr char SplitPatternOptions::kTypeName[];
+
+ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop,
+ std::string replacement)
+ : FunctionOptions(internal::kReplaceSliceOptionsType),
+ start(start),
+ stop(stop),
+ replacement(std::move(replacement)) {}
+ReplaceSliceOptions::ReplaceSliceOptions() : ReplaceSliceOptions(0, 0, "") {}
+constexpr char ReplaceSliceOptions::kTypeName[];
+
+ReplaceSubstringOptions::ReplaceSubstringOptions(std::string pattern,
+ std::string replacement,
+ int64_t max_replacements)
+ : FunctionOptions(internal::kReplaceSubstringOptionsType),
+ pattern(std::move(pattern)),
+ replacement(std::move(replacement)),
+ max_replacements(max_replacements) {}
+ReplaceSubstringOptions::ReplaceSubstringOptions()
+ : ReplaceSubstringOptions("", "", -1) {}
+constexpr char ReplaceSubstringOptions::kTypeName[];
+
+ExtractRegexOptions::ExtractRegexOptions(std::string pattern)
+ : FunctionOptions(internal::kExtractRegexOptionsType), pattern(std::move(pattern)) {}
+ExtractRegexOptions::ExtractRegexOptions() : ExtractRegexOptions("") {}
+constexpr char ExtractRegexOptions::kTypeName[];
+
+SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
+ : FunctionOptions(internal::kSetLookupOptionsType),
+ value_set(std::move(value_set)),
+ skip_nulls(skip_nulls) {}
+SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
+constexpr char SetLookupOptions::kTypeName[];
+
+StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
+ : FunctionOptions(internal::kStrptimeOptionsType),
+ format(std::move(format)),
+ unit(unit) {}
+StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
+constexpr char StrptimeOptions::kTypeName[];
+
+PadOptions::PadOptions(int64_t width, std::string padding)
+ : FunctionOptions(internal::kPadOptionsType),
+ width(width),
+ padding(std::move(padding)) {}
+PadOptions::PadOptions() : PadOptions(0, " ") {}
+constexpr char PadOptions::kTypeName[];
+
+TrimOptions::TrimOptions(std::string characters)
+ : FunctionOptions(internal::kTrimOptionsType), characters(std::move(characters)) {}
+TrimOptions::TrimOptions() : TrimOptions("") {}
+constexpr char TrimOptions::kTypeName[];
+
+SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
+ : FunctionOptions(internal::kSliceOptionsType),
+ start(start),
+ stop(stop),
+ step(step) {}
+SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {}
+constexpr char SliceOptions::kTypeName[];
+
+MakeStructOptions::MakeStructOptions(
+ std::vector<std::string> n, std::vector<bool> r,
+ std::vector<std::shared_ptr<const KeyValueMetadata>> m)
+ : FunctionOptions(internal::kMakeStructOptionsType),
+ field_names(std::move(n)),
+ field_nullability(std::move(r)),
+ field_metadata(std::move(m)) {}
+
+MakeStructOptions::MakeStructOptions(std::vector<std::string> n)
+ : FunctionOptions(internal::kMakeStructOptionsType),
+ field_names(std::move(n)),
+ field_nullability(field_names.size(), true),
+ field_metadata(field_names.size(), NULLPTR) {}
+
+MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector<std::string>()) {}
+constexpr char MakeStructOptions::kTypeName[];
+
+DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start)
+ : FunctionOptions(internal::kDayOfWeekOptionsType),
+ one_based_numbering(one_based_numbering),
+ week_start(week_start) {}
+constexpr char DayOfWeekOptions::kTypeName[];
+
+namespace internal {
+void RegisterScalarOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitPatternOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType));
+}
+} // namespace internal
+
#define SCALAR_EAGER_UNARY(NAME, REGISTRY_NAME) \
Result<Datum> NAME(const Datum& value, ExecContext* ctx) { \
return CallFunction(REGISTRY_NAME, {value}, ctx); \
@@ -315,26 +315,26 @@ void RegisterScalarOptions(FunctionRegistry* registry) {
// ----------------------------------------------------------------------
// Arithmetic
-#define SCALAR_ARITHMETIC_UNARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
- Result<Datum> NAME(const Datum& arg, ArithmeticOptions options, ExecContext* ctx) { \
- auto func_name = (options.check_overflow) ? REGISTRY_CHECKED_NAME : REGISTRY_NAME; \
- return CallFunction(func_name, {arg}, ctx); \
- }
-
-SCALAR_ARITHMETIC_UNARY(AbsoluteValue, "abs", "abs_checked")
-SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked")
-SCALAR_EAGER_UNARY(Sign, "sign")
-SCALAR_ARITHMETIC_UNARY(Sin, "sin", "sin_checked")
-SCALAR_ARITHMETIC_UNARY(Cos, "cos", "cos_checked")
-SCALAR_ARITHMETIC_UNARY(Asin, "asin", "asin_checked")
-SCALAR_ARITHMETIC_UNARY(Acos, "acos", "acos_checked")
-SCALAR_ARITHMETIC_UNARY(Tan, "tan", "tan_checked")
-SCALAR_EAGER_UNARY(Atan, "atan")
-SCALAR_ARITHMETIC_UNARY(Ln, "ln", "ln_checked")
-SCALAR_ARITHMETIC_UNARY(Log10, "log10", "log10_checked")
-SCALAR_ARITHMETIC_UNARY(Log2, "log2", "log2_checked")
-SCALAR_ARITHMETIC_UNARY(Log1p, "log1p", "log1p_checked")
-
+#define SCALAR_ARITHMETIC_UNARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
+ Result<Datum> NAME(const Datum& arg, ArithmeticOptions options, ExecContext* ctx) { \
+ auto func_name = (options.check_overflow) ? REGISTRY_CHECKED_NAME : REGISTRY_NAME; \
+ return CallFunction(func_name, {arg}, ctx); \
+ }
+
+SCALAR_ARITHMETIC_UNARY(AbsoluteValue, "abs", "abs_checked")
+SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked")
+SCALAR_EAGER_UNARY(Sign, "sign")
+SCALAR_ARITHMETIC_UNARY(Sin, "sin", "sin_checked")
+SCALAR_ARITHMETIC_UNARY(Cos, "cos", "cos_checked")
+SCALAR_ARITHMETIC_UNARY(Asin, "asin", "asin_checked")
+SCALAR_ARITHMETIC_UNARY(Acos, "acos", "acos_checked")
+SCALAR_ARITHMETIC_UNARY(Tan, "tan", "tan_checked")
+SCALAR_EAGER_UNARY(Atan, "atan")
+SCALAR_ARITHMETIC_UNARY(Ln, "ln", "ln_checked")
+SCALAR_ARITHMETIC_UNARY(Log10, "log10", "log10_checked")
+SCALAR_ARITHMETIC_UNARY(Log2, "log2", "log2_checked")
+SCALAR_ARITHMETIC_UNARY(Log1p, "log1p", "log1p_checked")
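
For readers tracing the macro, SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked") expands to (approximately) the following, dispatching to the checked kernel when overflow checking is requested:

    Result<Datum> Negate(const Datum& arg, ArithmeticOptions options, ExecContext* ctx) {
      auto func_name = (options.check_overflow) ? "negate_checked" : "negate";
      return CallFunction(func_name, {arg}, ctx);
    }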
+
#define SCALAR_ARITHMETIC_BINARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
Result<Datum> NAME(const Datum& left, const Datum& right, ArithmeticOptions options, \
ExecContext* ctx) { \
@@ -346,65 +346,65 @@ SCALAR_ARITHMETIC_BINARY(Add, "add", "add_checked")
SCALAR_ARITHMETIC_BINARY(Subtract, "subtract", "subtract_checked")
SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked")
SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked")
-SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked")
-SCALAR_ARITHMETIC_BINARY(ShiftLeft, "shift_left", "shift_left_checked")
-SCALAR_ARITHMETIC_BINARY(ShiftRight, "shift_right", "shift_right_checked")
-SCALAR_EAGER_BINARY(Atan2, "atan2")
-SCALAR_EAGER_UNARY(Floor, "floor")
-SCALAR_EAGER_UNARY(Ceil, "ceil")
-SCALAR_EAGER_UNARY(Trunc, "trunc")
-
-Result<Datum> MaxElementWise(const std::vector<Datum>& args,
- ElementWiseAggregateOptions options, ExecContext* ctx) {
- return CallFunction("max_element_wise", args, &options, ctx);
-}
-
-Result<Datum> MinElementWise(const std::vector<Datum>& args,
- ElementWiseAggregateOptions options, ExecContext* ctx) {
- return CallFunction("min_element_wise", args, &options, ctx);
-}
-
+SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked")
+SCALAR_ARITHMETIC_BINARY(ShiftLeft, "shift_left", "shift_left_checked")
+SCALAR_ARITHMETIC_BINARY(ShiftRight, "shift_right", "shift_right_checked")
+SCALAR_EAGER_BINARY(Atan2, "atan2")
+SCALAR_EAGER_UNARY(Floor, "floor")
+SCALAR_EAGER_UNARY(Ceil, "ceil")
+SCALAR_EAGER_UNARY(Trunc, "trunc")
+
+Result<Datum> MaxElementWise(const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options, ExecContext* ctx) {
+ return CallFunction("max_element_wise", args, &options, ctx);
+}
+
+Result<Datum> MinElementWise(const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options, ExecContext* ctx) {
+ return CallFunction("min_element_wise", args, &options, ctx);
+}
+
// ----------------------------------------------------------------------
// Set-related operations
static Result<Datum> ExecSetLookup(const std::string& func_name, const Datum& data,
- const SetLookupOptions& options, ExecContext* ctx) {
- if (!options.value_set.is_arraylike()) {
+ const SetLookupOptions& options, ExecContext* ctx) {
+ if (!options.value_set.is_arraylike()) {
return Status::Invalid("Set lookup value set must be Array or ChunkedArray");
}
- std::shared_ptr<DataType> data_type;
- if (data.type()->id() == Type::DICTIONARY) {
- data_type =
- arrow::internal::checked_pointer_cast<DictionaryType>(data.type())->value_type();
- } else {
- data_type = data.type();
- }
-
- if (options.value_set.length() > 0 && !data_type->Equals(options.value_set.type())) {
+ std::shared_ptr<DataType> data_type;
+ if (data.type()->id() == Type::DICTIONARY) {
+ data_type =
+ arrow::internal::checked_pointer_cast<DictionaryType>(data.type())->value_type();
+ } else {
+ data_type = data.type();
+ }
+
+ if (options.value_set.length() > 0 && !data_type->Equals(options.value_set.type())) {
std::stringstream ss;
- ss << "Array type didn't match type of values set: " << data_type->ToString()
- << " vs " << options.value_set.type()->ToString();
+ ss << "Array type didn't match type of values set: " << data_type->ToString()
+ << " vs " << options.value_set.type()->ToString();
return Status::Invalid(ss.str());
}
return CallFunction(func_name, {data}, &options, ctx);
}
-Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx) {
- return ExecSetLookup("is_in", values, options, ctx);
-}
-
+Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx) {
+ return ExecSetLookup("is_in", values, options, ctx);
+}
+
Result<Datum> IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
- return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx);
-}
-
-Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx) {
- return ExecSetLookup("index_in", values, options, ctx);
+ return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx);
}
+Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx) {
+ return ExecSetLookup("index_in", values, options, ctx);
+}
+
Result<Datum> IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
- return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx);
+ return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx);
}
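
A hedged usage sketch of the set-lookup wrappers above (construction of the value-set array is elided; `InAB` and `ab_set` are illustrative names):

    // Membership test: for each element of `values`, is it in `ab_set`?
    arrow::Result<arrow::Datum> InAB(const arrow::Datum& values,
                                     const std::shared_ptr<arrow::Array>& ab_set) {
      arrow::compute::SetLookupOptions options(ab_set, /*skip_nulls=*/true);
      return arrow::compute::IsIn(values, options);
    }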
// ----------------------------------------------------------------------
@@ -416,8 +416,8 @@ SCALAR_EAGER_BINARY(KleeneAnd, "and_kleene")
SCALAR_EAGER_BINARY(Or, "or")
SCALAR_EAGER_BINARY(KleeneOr, "or_kleene")
SCALAR_EAGER_BINARY(Xor, "xor")
-SCALAR_EAGER_BINARY(AndNot, "and_not")
-SCALAR_EAGER_BINARY(KleeneAndNot, "and_not_kleene")
+SCALAR_EAGER_BINARY(AndNot, "and_not")
+SCALAR_EAGER_BINARY(KleeneAndNot, "and_not_kleene")
// ----------------------------------------------------------------------
@@ -444,7 +444,7 @@ Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions opti
func_name = "less_equal";
break;
}
- return CallFunction(func_name, {left, right}, nullptr, ctx);
+ return CallFunction(func_name, {left, right}, nullptr, ctx);
}
// ----------------------------------------------------------------------
@@ -452,47 +452,47 @@ Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions opti
SCALAR_EAGER_UNARY(IsValid, "is_valid")
SCALAR_EAGER_UNARY(IsNull, "is_null")
-SCALAR_EAGER_UNARY(IsNan, "is_nan")
+SCALAR_EAGER_UNARY(IsNan, "is_nan")
Result<Datum> FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) {
return CallFunction("fill_null", {values, fill_value}, ctx);
}
-Result<Datum> IfElse(const Datum& cond, const Datum& if_true, const Datum& if_false,
- ExecContext* ctx) {
- return CallFunction("if_else", {cond, if_true, if_false}, ctx);
-}
-
-Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
- ExecContext* ctx) {
- std::vector<Datum> args = {cond};
- args.reserve(cases.size() + 1);
- args.insert(args.end(), cases.begin(), cases.end());
- return CallFunction("case_when", args, ctx);
-}
-
-// ----------------------------------------------------------------------
-// Temporal functions
-
-SCALAR_EAGER_UNARY(Year, "year")
-SCALAR_EAGER_UNARY(Month, "month")
-SCALAR_EAGER_UNARY(Day, "day")
-SCALAR_EAGER_UNARY(DayOfYear, "day_of_year")
-SCALAR_EAGER_UNARY(ISOYear, "iso_year")
-SCALAR_EAGER_UNARY(ISOWeek, "iso_week")
-SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar")
-SCALAR_EAGER_UNARY(Quarter, "quarter")
-SCALAR_EAGER_UNARY(Hour, "hour")
-SCALAR_EAGER_UNARY(Minute, "minute")
-SCALAR_EAGER_UNARY(Second, "second")
-SCALAR_EAGER_UNARY(Millisecond, "millisecond")
-SCALAR_EAGER_UNARY(Microsecond, "microsecond")
-SCALAR_EAGER_UNARY(Nanosecond, "nanosecond")
-SCALAR_EAGER_UNARY(Subsecond, "subsecond")
-
-Result<Datum> DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* ctx) {
- return CallFunction("day_of_week", {arg}, &options, ctx);
-}
-
+Result<Datum> IfElse(const Datum& cond, const Datum& if_true, const Datum& if_false,
+ ExecContext* ctx) {
+ return CallFunction("if_else", {cond, if_true, if_false}, ctx);
+}
+
+Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
+ ExecContext* ctx) {
+ std::vector<Datum> args = {cond};
+ args.reserve(cases.size() + 1);
+ args.insert(args.end(), cases.begin(), cases.end());
+ return CallFunction("case_when", args, ctx);
+}
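
A sketch of the conditional wrappers above (IfElse is shown; for CaseWhen the cond argument is whatever struct of conditions the underlying "case_when" kernel expects, which this file does not spell out):

    // Elementwise select: out[i] = mask[i] ? a[i] : b[i].
    arrow::Result<arrow::Datum> Select(const arrow::Datum& mask, const arrow::Datum& a,
                                       const arrow::Datum& b) {
      return arrow::compute::IfElse(mask, a, b);
    }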
+
+// ----------------------------------------------------------------------
+// Temporal functions
+
+SCALAR_EAGER_UNARY(Year, "year")
+SCALAR_EAGER_UNARY(Month, "month")
+SCALAR_EAGER_UNARY(Day, "day")
+SCALAR_EAGER_UNARY(DayOfYear, "day_of_year")
+SCALAR_EAGER_UNARY(ISOYear, "iso_year")
+SCALAR_EAGER_UNARY(ISOWeek, "iso_week")
+SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar")
+SCALAR_EAGER_UNARY(Quarter, "quarter")
+SCALAR_EAGER_UNARY(Hour, "hour")
+SCALAR_EAGER_UNARY(Minute, "minute")
+SCALAR_EAGER_UNARY(Second, "second")
+SCALAR_EAGER_UNARY(Millisecond, "millisecond")
+SCALAR_EAGER_UNARY(Microsecond, "microsecond")
+SCALAR_EAGER_UNARY(Nanosecond, "nanosecond")
+SCALAR_EAGER_UNARY(Subsecond, "subsecond")
+
+Result<Datum> DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* ctx) {
+ return CallFunction("day_of_week", {arg}, &options, ctx);
+}
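
And a hedged sketch of the DayOfWeek wrapper (per DayOfWeekOptions, week_start uses Monday=1 through Sunday=7, so this configuration numbers Sunday as day 1):

    // One-based numbering with the week starting on Sunday.
    arrow::Result<arrow::Datum> DayOfWeekSundayFirst(const arrow::Datum& timestamps) {
      arrow::compute::DayOfWeekOptions options(/*one_based_numbering=*/true,
                                               /*week_start=*/7);
      return arrow::compute::DayOfWeek(timestamps, options);
    }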
+
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
index e07e41569a1..8486cb0126f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
@@ -37,125 +37,125 @@ namespace compute {
///
/// @{
-class ARROW_EXPORT ArithmeticOptions : public FunctionOptions {
- public:
- explicit ArithmeticOptions(bool check_overflow = false);
- constexpr static char const kTypeName[] = "ArithmeticOptions";
+class ARROW_EXPORT ArithmeticOptions : public FunctionOptions {
+ public:
+ explicit ArithmeticOptions(bool check_overflow = false);
+ constexpr static char const kTypeName[] = "ArithmeticOptions";
bool check_overflow;
};
-class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
- public:
- explicit ElementWiseAggregateOptions(bool skip_nulls = true);
- constexpr static char const kTypeName[] = "ElementWiseAggregateOptions";
- static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; }
-
- bool skip_nulls;
-};
-
-/// Options for var_args_join.
-class ARROW_EXPORT JoinOptions : public FunctionOptions {
- public:
- /// How to handle null values. (A null separator always results in a null output.)
- enum NullHandlingBehavior {
- /// A null in any input results in a null in the output.
- EMIT_NULL,
- /// Nulls in inputs are skipped.
- SKIP,
- /// Nulls in inputs are replaced with the replacement string.
- REPLACE,
- };
- explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL,
- std::string null_replacement = "");
- constexpr static char const kTypeName[] = "JoinOptions";
- static JoinOptions Defaults() { return JoinOptions(); }
- NullHandlingBehavior null_handling;
- std::string null_replacement;
-};
-
-class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
- public:
- explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false);
- MatchSubstringOptions();
- constexpr static char const kTypeName[] = "MatchSubstringOptions";
-
- /// The exact substring (or regex, depending on kernel) to look for inside input values.
- std::string pattern;
- /// Whether to perform a case-insensitive match.
- bool ignore_case = false;
-};
-
-class ARROW_EXPORT SplitOptions : public FunctionOptions {
- public:
- explicit SplitOptions(int64_t max_splits = -1, bool reverse = false);
- constexpr static char const kTypeName[] = "SplitOptions";
-
- /// Maximum number of splits allowed, or unlimited when -1
- int64_t max_splits;
- /// Start splitting from the end of the string (only relevant when max_splits != -1)
- bool reverse;
-};
-
-class ARROW_EXPORT SplitPatternOptions : public FunctionOptions {
- public:
- explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1,
- bool reverse = false);
- SplitPatternOptions();
- constexpr static char const kTypeName[] = "SplitPatternOptions";
-
- /// The exact substring to split on.
- std::string pattern;
- /// Maximum number of splits allowed, or unlimited when -1
- int64_t max_splits;
- /// Start splitting from the end of the string (only relevant when max_splits != -1)
- bool reverse;
-};
-
-class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
- public:
- explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement);
- ReplaceSliceOptions();
- constexpr static char const kTypeName[] = "ReplaceSliceOptions";
-
- /// Index to start slicing at
- int64_t start;
- /// Index to stop slicing at
- int64_t stop;
- /// String to replace the slice with
- std::string replacement;
-};
-
-class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
- public:
- explicit ReplaceSubstringOptions(std::string pattern, std::string replacement,
- int64_t max_replacements = -1);
- ReplaceSubstringOptions();
- constexpr static char const kTypeName[] = "ReplaceSubstringOptions";
-
- /// Pattern to match, literal, or regular expression depending on which kernel is used
- std::string pattern;
- /// String to replace the pattern with
- std::string replacement;
- /// Max number of substrings to replace (-1 means unbounded)
- int64_t max_replacements;
-};
-
-class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
- public:
- explicit ExtractRegexOptions(std::string pattern);
- ExtractRegexOptions();
- constexpr static char const kTypeName[] = "ExtractRegexOptions";
-
- /// Regular expression with named capture fields
+class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
+ public:
+ explicit ElementWiseAggregateOptions(bool skip_nulls = true);
+ constexpr static char const kTypeName[] = "ElementWiseAggregateOptions";
+ static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; }
+
+ bool skip_nulls;
+};
+
+/// Options for var_args_join.
+class ARROW_EXPORT JoinOptions : public FunctionOptions {
+ public:
+ /// How to handle null values. (A null separator always results in a null output.)
+ enum NullHandlingBehavior {
+ /// A null in any input results in a null in the output.
+ EMIT_NULL,
+ /// Nulls in inputs are skipped.
+ SKIP,
+ /// Nulls in inputs are replaced with the replacement string.
+ REPLACE,
+ };
+ explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL,
+ std::string null_replacement = "");
+ constexpr static char const kTypeName[] = "JoinOptions";
+ static JoinOptions Defaults() { return JoinOptions(); }
+ NullHandlingBehavior null_handling;
+ std::string null_replacement;
+};
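
A construction sketch for JoinOptions as declared above (only option construction is shown; the consuming kernel is not named in this excerpt):

    // Join elements, substituting "N/A" for nulls instead of nulling the result.
    arrow::compute::JoinOptions options(arrow::compute::JoinOptions::REPLACE,
                                        /*null_replacement=*/"N/A");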
+
+class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
+ public:
+ explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false);
+ MatchSubstringOptions();
+ constexpr static char const kTypeName[] = "MatchSubstringOptions";
+
+ /// The exact substring (or regex, depending on kernel) to look for inside input values.
std::string pattern;
+ /// Whether to perform a case-insensitive match.
+ bool ignore_case = false;
};
+class ARROW_EXPORT SplitOptions : public FunctionOptions {
+ public:
+ explicit SplitOptions(int64_t max_splits = -1, bool reverse = false);
+ constexpr static char const kTypeName[] = "SplitOptions";
+
+ /// Maximum number of splits allowed, or unlimited when -1
+ int64_t max_splits;
+ /// Start splitting from the end of the string (only relevant when max_splits != -1)
+ bool reverse;
+};
+
+class ARROW_EXPORT SplitPatternOptions : public FunctionOptions {
+ public:
+ explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1,
+ bool reverse = false);
+ SplitPatternOptions();
+ constexpr static char const kTypeName[] = "SplitPatternOptions";
+
+ /// The exact substring to split on.
+ std::string pattern;
+ /// Maximum number of splits allowed, or unlimited when -1
+ int64_t max_splits;
+ /// Start splitting from the end of the string (only relevant when max_splits != -1)
+ bool reverse;
+};
+
+class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
+ public:
+ explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement);
+ ReplaceSliceOptions();
+ constexpr static char const kTypeName[] = "ReplaceSliceOptions";
+
+ /// Index to start slicing at
+ int64_t start;
+ /// Index to stop slicing at
+ int64_t stop;
+ /// String to replace the slice with
+ std::string replacement;
+};
+
+class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
+ public:
+ explicit ReplaceSubstringOptions(std::string pattern, std::string replacement,
+ int64_t max_replacements = -1);
+ ReplaceSubstringOptions();
+ constexpr static char const kTypeName[] = "ReplaceSubstringOptions";
+
+ /// Pattern to match, literal, or regular expression depending on which kernel is used
+ std::string pattern;
+ /// String to replace the pattern with
+ std::string replacement;
+ /// Max number of substrings to replace (-1 means unbounded)
+ int64_t max_replacements;
+};
+
+class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
+ public:
+ explicit ExtractRegexOptions(std::string pattern);
+ ExtractRegexOptions();
+ constexpr static char const kTypeName[] = "ExtractRegexOptions";
+
+ /// Regular expression with named capture fields
+ std::string pattern;
+};
+
/// Options for IsIn and IndexIn functions
-class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
- public:
- explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
- SetLookupOptions();
- constexpr static char const kTypeName[] = "SetLookupOptions";
+class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
+ public:
+ explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
+ SetLookupOptions();
+ constexpr static char const kTypeName[] = "SetLookupOptions";
/// The set of values to look up input values into.
Datum value_set;
@@ -168,47 +168,47 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
bool skip_nulls;
};
-class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
- public:
- explicit StrptimeOptions(std::string format, TimeUnit::type unit);
- StrptimeOptions();
- constexpr static char const kTypeName[] = "StrptimeOptions";
+class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
+ public:
+ explicit StrptimeOptions(std::string format, TimeUnit::type unit);
+ StrptimeOptions();
+ constexpr static char const kTypeName[] = "StrptimeOptions";
std::string format;
TimeUnit::type unit;
};
-class ARROW_EXPORT PadOptions : public FunctionOptions {
- public:
- explicit PadOptions(int64_t width, std::string padding = " ");
- PadOptions();
- constexpr static char const kTypeName[] = "PadOptions";
-
- /// The desired string length.
- int64_t width;
- /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
- std::string padding;
-};
-
-class ARROW_EXPORT TrimOptions : public FunctionOptions {
- public:
- explicit TrimOptions(std::string characters);
- TrimOptions();
- constexpr static char const kTypeName[] = "TrimOptions";
-
- /// The individual characters that can be trimmed from the string.
- std::string characters;
-};
-
-class ARROW_EXPORT SliceOptions : public FunctionOptions {
- public:
- explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits<int64_t>::max(),
- int64_t step = 1);
- SliceOptions();
- constexpr static char const kTypeName[] = "SliceOptions";
- int64_t start, stop, step;
-};
-
+class ARROW_EXPORT PadOptions : public FunctionOptions {
+ public:
+ explicit PadOptions(int64_t width, std::string padding = " ");
+ PadOptions();
+ constexpr static char const kTypeName[] = "PadOptions";
+
+ /// The desired string length.
+ int64_t width;
+ /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
+ std::string padding;
+};
+
+class ARROW_EXPORT TrimOptions : public FunctionOptions {
+ public:
+ explicit TrimOptions(std::string characters);
+ TrimOptions();
+ constexpr static char const kTypeName[] = "TrimOptions";
+
+ /// The individual characters that can be trimmed from the string.
+ std::string characters;
+};
+
+class ARROW_EXPORT SliceOptions : public FunctionOptions {
+ public:
+ explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits<int64_t>::max(),
+ int64_t step = 1);
+ SliceOptions();
+ constexpr static char const kTypeName[] = "SliceOptions";
+ int64_t start, stop, step;
+};
+
enum CompareOperator : int8_t {
EQUAL,
NOT_EQUAL,
@@ -218,57 +218,57 @@ enum CompareOperator : int8_t {
LESS_EQUAL,
};
-struct ARROW_EXPORT CompareOptions {
+struct ARROW_EXPORT CompareOptions {
explicit CompareOptions(CompareOperator op) : op(op) {}
- CompareOptions() : CompareOptions(CompareOperator::EQUAL) {}
+ CompareOptions() : CompareOptions(CompareOperator::EQUAL) {}
enum CompareOperator op;
};
-class ARROW_EXPORT MakeStructOptions : public FunctionOptions {
- public:
- MakeStructOptions(std::vector<std::string> n, std::vector<bool> r,
- std::vector<std::shared_ptr<const KeyValueMetadata>> m);
- explicit MakeStructOptions(std::vector<std::string> n);
- MakeStructOptions();
- constexpr static char const kTypeName[] = "MakeStructOptions";
-
- /// Names for wrapped columns
- std::vector<std::string> field_names;
-
- /// Nullability bits for wrapped columns
- std::vector<bool> field_nullability;
-
- /// Metadata attached to wrapped columns
- std::vector<std::shared_ptr<const KeyValueMetadata>> field_metadata;
-};
-
-struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
- public:
- explicit DayOfWeekOptions(bool one_based_numbering = false, uint32_t week_start = 1);
- constexpr static char const kTypeName[] = "DayOfWeekOptions";
- static DayOfWeekOptions Defaults() { return DayOfWeekOptions{}; }
-
- /// Number days from 1 if true and from 0 if false
- bool one_based_numbering;
- /// What day does the week start with (Monday=1, Sunday=7)
- uint32_t week_start;
-};
-
+class ARROW_EXPORT MakeStructOptions : public FunctionOptions {
+ public:
+ MakeStructOptions(std::vector<std::string> n, std::vector<bool> r,
+ std::vector<std::shared_ptr<const KeyValueMetadata>> m);
+ explicit MakeStructOptions(std::vector<std::string> n);
+ MakeStructOptions();
+ constexpr static char const kTypeName[] = "MakeStructOptions";
+
+ /// Names for wrapped columns
+ std::vector<std::string> field_names;
+
+ /// Nullability bits for wrapped columns
+ std::vector<bool> field_nullability;
+
+ /// Metadata attached to wrapped columns
+ std::vector<std::shared_ptr<const KeyValueMetadata>> field_metadata;
+};
+
+struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
+ public:
+ explicit DayOfWeekOptions(bool one_based_numbering = false, uint32_t week_start = 1);
+ constexpr static char const kTypeName[] = "DayOfWeekOptions";
+ static DayOfWeekOptions Defaults() { return DayOfWeekOptions{}; }
+
+ /// Number days from 1 if true and from 0 if false
+ bool one_based_numbering;
+ /// What day does the week start with (Monday=1, Sunday=7)
+ uint32_t week_start;
+};
+
/// @}
-/// \brief Get the absolute value of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg the value transformed
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise absolute value
-ARROW_EXPORT
-Result<Datum> AbsoluteValue(const Datum& arg,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
+/// \brief Get the absolute value of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value transformed
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise absolute value
+ARROW_EXPORT
+Result<Datum> AbsoluteValue(const Datum& arg,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
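A small sketch of the Datum-based convenience wrappers that follow. It assumes ArithmeticOptions exposes a check_overflow flag, per its declaration earlier in this header (outside this excerpt):

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::Int64Builder builder;
  (void)builder.AppendValues({-3, 0, 7});  // error handling elided
  std::shared_ptr<arrow::Array> values;
  (void)builder.Finish(&values);

  // Request overflow checking; the unchecked variant wraps instead of failing.
  arrow::compute::ArithmeticOptions options(/*check_overflow=*/true);
  auto result = arrow::compute::AbsoluteValue(values, options);
  if (result.ok()) {
    std::cout << result->make_array()->ToString() << std::endl;  // [3, 0, 7]
  }
  return 0;
}
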
/// \brief Add two values together. Array values must be the same length. If
/// either addend is null the result will be null.
///
@@ -322,233 +322,233 @@ Result<Datum> Divide(const Datum& left, const Datum& right,
ArithmeticOptions options = ArithmeticOptions(),
ExecContext* ctx = NULLPTR);
-/// \brief Negate values.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg the value negated
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise negation
-ARROW_EXPORT
-Result<Datum> Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Raise the values of base array to the power of the exponent array values.
-/// Array values must be the same length. If either base or exponent is null the result
-/// will be null.
-///
-/// \param[in] left the base
-/// \param[in] right the exponent
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise base value raised to the power of exponent
-ARROW_EXPORT
-Result<Datum> Power(const Datum& left, const Datum& right,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Left shift the left array by the right array. Array values must be the
-/// same length. If either operand is null, the result will be null.
-///
-/// \param[in] left the value to shift
-/// \param[in] right the value to shift by
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise left value shifted left by the right value
-ARROW_EXPORT
-Result<Datum> ShiftLeft(const Datum& left, const Datum& right,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Right shift the left array by the right array. Array values must be the
-/// same length. If either operand is null, the result will be null. Performs a
-/// logical shift for unsigned values, and an arithmetic shift for signed values.
-///
-/// \param[in] left the value to shift
-/// \param[in] right the value to shift by
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise left value shifted right by the right value
-ARROW_EXPORT
-Result<Datum> ShiftRight(const Datum& left, const Datum& right,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the sine of the array values.
-/// \param[in] arg The values to compute the sine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise sine of the values
-ARROW_EXPORT
-Result<Datum> Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the cosine of the array values.
-/// \param[in] arg The values to compute the cosine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise cosine of the values
-ARROW_EXPORT
-Result<Datum> Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse sine (arcsine) of the array values.
-/// \param[in] arg The values to compute the inverse sine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse sine of the values
-ARROW_EXPORT
-Result<Datum> Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse cosine (arccosine) of the array values.
-/// \param[in] arg The values to compute the inverse cosine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse cosine of the values
-ARROW_EXPORT
-Result<Datum> Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the tangent of the array values.
-/// \param[in] arg The values to compute the tangent for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise tangent of the values
-ARROW_EXPORT
-Result<Datum> Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse tangent (arctangent) of the array values.
-/// \param[in] arg The values to compute the inverse tangent for.
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse tangent of the values
-ARROW_EXPORT
-Result<Datum> Atan(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse tangent (arctangent) of y/x, using the
-/// argument signs to determine the correct quadrant.
-/// \param[in] y The y-values to compute the inverse tangent for.
-/// \param[in] x The x-values to compute the inverse tangent for.
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse tangent of the values
-ARROW_EXPORT
-Result<Datum> Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR);
-
-/// \brief Get the natural log of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise natural log
-ARROW_EXPORT
-Result<Datum> Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the log base 10 of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise log base 10
-ARROW_EXPORT
-Result<Datum> Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the log base 2 of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise log base 2
-ARROW_EXPORT
-Result<Datum> Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the natural log of (1 + value).
-///
-/// If argument is null the result will be null.
-/// This function may be more accurate than Log(1 + value) for values close to zero.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise natural log
-ARROW_EXPORT
-Result<Datum> Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Round to the nearest integer less than or equal in magnitude to the
-/// argument. Array values can be of arbitrary length. If argument is null the
-/// result will be null.
-///
-/// \param[in] arg the value to round
-/// \param[in] ctx the function execution context, optional
-/// \return the rounded value
-ARROW_EXPORT
-Result<Datum> Floor(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Round to the nearest integer greater than or equal in magnitude to the
-/// argument. Array values can be of arbitrary length. If argument is null the
-/// result will be null.
-///
-/// \param[in] arg the value to round
-/// \param[in] ctx the function execution context, optional
-/// \return the rounded value
-ARROW_EXPORT
-Result<Datum> Ceil(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Get the integral part without fractional digits. Array values can be
-/// of arbitrary length. If argument is null the result will be null.
-///
-/// \param[in] arg the value to truncate
-/// \param[in] ctx the function execution context, optional
-/// \return the truncated value
-ARROW_EXPORT
-Result<Datum> Trunc(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Find the element-wise maximum of any number of arrays or scalars.
-/// Array values must be the same length.
-///
-/// \param[in] args arrays or scalars to operate on.
-/// \param[in] options options for handling nulls, optional
-/// \param[in] ctx the function execution context, optional
-/// \return the element-wise maximum
-ARROW_EXPORT
-Result<Datum> MaxElementWise(
- const std::vector<Datum>& args,
- ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Find the element-wise minimum of any number of arrays or scalars.
-/// Array values must be the same length.
-///
-/// \param[in] args arrays or scalars to operate on.
-/// \param[in] options options for handling nulls, optional
-/// \param[in] ctx the function execution context, optional
-/// \return the element-wise minimum
-ARROW_EXPORT
-Result<Datum> MinElementWise(
- const std::vector<Datum>& args,
- ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument
-/// is null the result will be null.
-///
-/// \param[in] arg the value to extract sign from
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise sign function
-ARROW_EXPORT
-Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
-
+/// \brief Negate values.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value negated
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise negation
+ARROW_EXPORT
+Result<Datum> Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Raise the values of base array to the power of the exponent array values.
+/// Array values must be the same length. If either base or exponent is null the result
+/// will be null.
+///
+/// \param[in] left the base
+/// \param[in] right the exponent
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise base value raised to the power of exponent
+ARROW_EXPORT
+Result<Datum> Power(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Left shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted left by the right value
+ARROW_EXPORT
+Result<Datum> ShiftLeft(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Right shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null. Performs a
+/// logical shift for unsigned values, and an arithmetic shift for signed values.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted right by the right value
+ARROW_EXPORT
+Result<Datum> ShiftRight(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
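Because Datum wraps scalars as well as arrays, the shift amount can be a single scalar that is broadcast across the array. A sketch, assuming the standard builder and scalar types from arrow/api.h:

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::Int32Builder builder;
  (void)builder.AppendValues({1, 2, 4});  // error handling elided
  std::shared_ptr<arrow::Array> values;
  (void)builder.Finish(&values);

  // The shift amount is a scalar Datum, broadcast across the array.
  arrow::Datum amount(std::make_shared<arrow::Int32Scalar>(3));
  auto result = arrow::compute::ShiftLeft(values, amount);
  if (result.ok()) {
    std::cout << result->make_array()->ToString() << std::endl;  // [8, 16, 32]
  }
  return 0;
}
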
+/// \brief Compute the sine of the array values.
+/// \param[in] arg The values to compute the sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sine of the values
+ARROW_EXPORT
+Result<Datum> Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cosine of the array values.
+/// \param[in] arg The values to compute the cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise cosine of the values
+ARROW_EXPORT
+Result<Datum> Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse sine (arcsine) of the array values.
+/// \param[in] arg The values to compute the inverse sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse sine of the values
+ARROW_EXPORT
+Result<Datum> Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse cosine (arccosine) of the array values.
+/// \param[in] arg The values to compute the inverse cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse cosine of the values
+ARROW_EXPORT
+Result<Datum> Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the tangent of the array values.
+/// \param[in] arg The values to compute the tangent for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise tangent of the values
+ARROW_EXPORT
+Result<Datum> Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of the array values.
+/// \param[in] arg The values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of y/x, using the
+/// argument signs to determine the correct quadrant.
+/// \param[in] y The y-values to compute the inverse tangent for.
+/// \param[in] x The x-values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR);
+
+/// \brief Get the natural log of a value.
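A sketch contrasting Atan2 with Atan: the two-argument form keeps the quadrant information carried by the operand signs, which Atan(y/x) loses.

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::DoubleBuilder yb, xb;
  (void)yb.AppendValues({1.0, -1.0});  // error handling elided
  (void)xb.AppendValues({-1.0, -1.0});
  std::shared_ptr<arrow::Array> y, x;
  (void)yb.Finish(&y);
  (void)xb.Finish(&x);

  // Expected roughly [3*pi/4, -3*pi/4]; Atan(y/x) would give -pi/4 for both.
  auto result = arrow::compute::Atan2(y, x);
  if (result.ok()) {
    std::cout << result->make_array()->ToString() << std::endl;
  }
  return 0;
}
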
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 10 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 10
+ARROW_EXPORT
+Result<Datum> Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 2 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 2
+ARROW_EXPORT
+Result<Datum> Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the natural log of (1 + value).
+///
+/// If argument is null the result will be null.
+/// This function may be more accurate than Log(1 + value) for values close to zero.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
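A sketch of why Log1p exists: near zero, forming 1 + value first discards the low-order bits that Log1p preserves.

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::DoubleBuilder builder;
  (void)builder.AppendValues({1e-16, 0.5});  // error handling elided
  std::shared_ptr<arrow::Array> values;
  (void)builder.Finish(&values);

  // First slot comes out as ~1e-16; Ln of (1 + 1e-16) would round to 0,
  // because 1 + 1e-16 == 1.0 in double precision.
  auto result = arrow::compute::Log1p(values);
  if (result.ok()) {
    std::cout << result->make_array()->ToString() << std::endl;
  }
  return 0;
}
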
+/// \brief Round down to the nearest integer less than or equal to the
+/// argument. Array values can be of arbitrary length. If argument is null the
+/// result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Floor(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Round up to the nearest integer greater than or equal to the
+/// argument. Array values can be of arbitrary length. If argument is null the
+/// result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Ceil(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Get the integral part without fractional digits. Array values can be
+/// of arbitrary length. If argument is null the result will be null.
+///
+/// \param[in] arg the value to truncate
+/// \param[in] ctx the function execution context, optional
+/// \return the truncated value
+ARROW_EXPORT
+Result<Datum> Trunc(const Datum& arg, ExecContext* ctx = NULLPTR);
+
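The three rounding kernels above differ only on negative inputs; a short sketch:

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::DoubleBuilder builder;
  (void)builder.AppendValues({-1.5, 1.5});  // error handling elided
  std::shared_ptr<arrow::Array> values;
  (void)builder.Finish(&values);

  auto floored = arrow::compute::Floor(values);    // [-2, 1]
  auto ceiled = arrow::compute::Ceil(values);      // [-1, 2]
  auto truncated = arrow::compute::Trunc(values);  // [-1, 1]
  if (floored.ok() && ceiled.ok() && truncated.ok()) {
    std::cout << floored->make_array()->ToString() << std::endl;
    std::cout << ceiled->make_array()->ToString() << std::endl;
    std::cout << truncated->make_array()->ToString() << std::endl;
  }
  return 0;
}
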
+/// \brief Find the element-wise maximum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise maximum
+ARROW_EXPORT
+Result<Datum> MaxElementWise(
+ const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Find the element-wise minimum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise minimum
+ARROW_EXPORT
+Result<Datum> MinElementWise(
+ const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
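A sketch of MaxElementWise over two arrays. It assumes the default ElementWiseAggregateOptions skip nulls rather than propagate them (the Defaults() declaration is outside this excerpt):

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::Int64Builder ab, bb;
  (void)ab.AppendValues({1, 5});  // error handling elided
  (void)ab.AppendNull();          // a = [1, 5, null]
  (void)bb.AppendValues({4, 2, 9});
  std::shared_ptr<arrow::Array> a, b;
  (void)ab.Finish(&a);
  (void)bb.Finish(&b);

  // With nulls skipped, the last slot is 9, not null.
  auto result = arrow::compute::MaxElementWise({arrow::Datum(a), arrow::Datum(b)});
  if (result.ok()) {
    std::cout << result->make_array()->ToString() << std::endl;  // [4, 5, 9]
  }
  return 0;
}
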
+/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument
+/// is null the result will be null.
+///
+/// \param[in] arg the value to extract sign from
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sign function
+ARROW_EXPORT
+Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
+
/// \brief Compare a numeric array with a scalar.
///
/// \param[in] left datum to compare, must be an Array
@@ -562,10 +562,10 @@ Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
///
/// \since 1.0.0
/// \note API not yet finalized
-ARROW_DEPRECATED("Deprecated in 5.0.0. Use each compare function directly")
+ARROW_DEPRECATED("Deprecated in 5.0.0. Use each compare function directly")
ARROW_EXPORT
-Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions options,
- ExecContext* ctx = NULLPTR);
+Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions options,
+ ExecContext* ctx = NULLPTR);
/// \brief Invert the values of a boolean datum
/// \param[in] value datum to invert
@@ -580,8 +580,8 @@ Result<Datum> Invert(const Datum& value, ExecContext* ctx = NULLPTR);
/// \brief Element-wise AND of two boolean datums which always propagates nulls
/// (null and false is null).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -593,8 +593,8 @@ Result<Datum> And(const Datum& left, const Datum& right, ExecContext* ctx = NULL
/// \brief Element-wise AND of two boolean datums with a Kleene truth table
/// (null and false is false).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -607,8 +607,8 @@ Result<Datum> KleeneAnd(const Datum& left, const Datum& right,
/// \brief Element-wise OR of two boolean datums which always propagates nulls
/// (null or true is null).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -620,8 +620,8 @@ Result<Datum> Or(const Datum& left, const Datum& right, ExecContext* ctx = NULLP
/// \brief Element-wise OR of two boolean datums with a Kleene truth table
/// (null or true is true).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -631,8 +631,8 @@ ARROW_EXPORT
Result<Datum> KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
/// \brief Element-wise XOR of two boolean datums
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -641,49 +641,49 @@ Result<Datum> KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx =
ARROW_EXPORT
Result<Datum> Xor(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
-/// \brief Element-wise AND NOT of two boolean datums which always propagates nulls
-/// (null and not true is null).
-///
-/// \param[in] left left operand
-/// \param[in] right right operand
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> AndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
-
-/// \brief Element-wise AND NOT of two boolean datums with a Kleene truth table
-/// (false and not null is false, null and not true is false).
-///
-/// \param[in] left left operand
-/// \param[in] right right operand
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
- ExecContext* ctx = NULLPTR);
-
+/// \brief Element-wise AND NOT of two boolean datums which always propagates nulls
+/// (null and not true is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> AndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND NOT of two boolean datums with a Kleene truth table
+/// (false and not null is false, null and not true is false).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
+ ExecContext* ctx = NULLPTR);
+
/// \brief IsIn returns true for each element of `values` that is contained in
/// `value_set`
///
-/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
///
/// \param[in] values array-like input to look up in value_set
-/// \param[in] options SetLookupOptions
+/// \param[in] options SetLookupOptions
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx = NULLPTR);
-ARROW_EXPORT
+Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
Result<Datum> IsIn(const Datum& values, const Datum& value_set,
ExecContext* ctx = NULLPTR);
@@ -695,19 +695,19 @@ Result<Datum> IsIn(const Datum& values, const Datum& value_set,
/// For example given values = [99, 42, 3, null] and
/// value_set = [3, 3, 99], the output will be [1, null, 0, null]
///
-/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
///
/// \param[in] values array-like input
-/// \param[in] options SetLookupOptions
+/// \param[in] options SetLookupOptions
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx = NULLPTR);
-ARROW_EXPORT
+Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
Result<Datum> IndexIn(const Datum& values, const Datum& value_set,
ExecContext* ctx = NULLPTR);
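A sketch of the two-argument IsIn overload declared above, which builds the lookup table from value_set directly instead of going through SetLookupOptions:

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::Int64Builder vb, sb;
  (void)vb.AppendValues({99, 42, 3});  // error handling elided
  (void)sb.AppendValues({3, 99});
  std::shared_ptr<arrow::Array> values, value_set;
  (void)vb.Finish(&values);
  (void)sb.Finish(&value_set);

  auto result = arrow::compute::IsIn(values, value_set);
  if (result.ok()) {
    std::cout << result->make_array()->ToString() << std::endl;  // [true, false, true]
  }
  return 0;
}
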
@@ -735,18 +735,18 @@ Result<Datum> IsValid(const Datum& values, ExecContext* ctx = NULLPTR);
ARROW_EXPORT
Result<Datum> IsNull(const Datum& values, ExecContext* ctx = NULLPTR);
-/// \brief IsNan returns true for each element of `values` that is NaN,
-/// false otherwise
-///
-/// \param[in] values input to look for NaN
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> IsNan(const Datum& values, ExecContext* ctx = NULLPTR);
-
+/// \brief IsNan returns true for each element of `values` that is NaN,
+/// false otherwise
+///
+/// \param[in] values input to look for NaN
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsNan(const Datum& values, ExecContext* ctx = NULLPTR);
+
/// \brief FillNull replaces each null element in `values`
/// with `fill_value`
///
@@ -762,228 +762,228 @@ ARROW_EXPORT
Result<Datum> FillNull(const Datum& values, const Datum& fill_value,
ExecContext* ctx = NULLPTR);
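A sketch combining IsNan with FillNull. The two kernels are independent; this just shows the Datum plumbing, assuming the standard DoubleScalar type:

#include <cmath>
#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::DoubleBuilder builder;
  (void)builder.AppendValues({1.0, std::nan("")});  // error handling elided
  (void)builder.AppendNull();                       // values = [1.0, NaN, null]
  std::shared_ptr<arrow::Array> values;
  (void)builder.Finish(&values);

  auto is_nan = arrow::compute::IsNan(values);  // [false, true, null]
  arrow::Datum fill(std::make_shared<arrow::DoubleScalar>(0.0));
  auto filled = arrow::compute::FillNull(values, fill);  // [1, nan, 0]
  if (is_nan.ok() && filled.ok()) {
    std::cout << is_nan->make_array()->ToString() << std::endl;
    std::cout << filled->make_array()->ToString() << std::endl;
  }
  return 0;
}
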
-/// \brief IfElse returns elements chosen from `left` or `right`
-/// depending on `cond`. `null` values in `cond` will be promoted to the result
-///
-/// \param[in] cond `Boolean` condition Scalar/ Array
-/// \param[in] left Scalar/ Array
-/// \param[in] right Scalar/ Array
-/// \param[in] ctx the function execution context, optional
-///
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right,
- ExecContext* ctx = NULLPTR);
-
-/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for
-/// each row, select the first value for which the corresponding condition is
-/// true, or (if given) select the 'else' value, else emit null. Note that a
-/// null condition is the same as false.
-///
-/// \param[in] cond Conditions (Boolean)
-/// \param[in] cases Values (any type), along with an optional 'else' value.
-/// \param[in] ctx the function execution context, optional
-///
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
- ExecContext* ctx = NULLPTR);
-
-/// \brief Year returns year for each element of `values`
-///
-/// \param[in] values input to extract year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Month returns month for each element of `values`.
-/// Month is encoded as January=1, December=12
-///
-/// \param[in] values input to extract month from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Day returns day number for each element of `values`
-///
-/// \param[in] values input to extract day from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief DayOfWeek returns number of the day of the week value for each element of
-/// `values`.
-///
-/// By default week starts on Monday denoted by 0 and ends on Sunday denoted
-/// by 6. Start day of the week (Monday=1, Sunday=7) and numbering base (0 or 1) can be
-/// set using DayOfWeekOptions
-///
-/// \param[in] values input to extract number of the day of the week from
-/// \param[in] options for setting start of the week and day numbering
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values,
- DayOfWeekOptions options = DayOfWeekOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief DayOfYear returns number of day of the year for each element of `values`.
-/// January 1st maps to day number 1, February 1st to 32, etc.
-///
-/// \param[in] values input to extract number of day of the year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief ISOYear returns ISO year number for each element of `values`.
-/// First week of an ISO year has the majority (4 or more) of its days in January.
-///
-/// \param[in] values input to extract ISO year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief ISOWeek returns ISO week of year number for each element of `values`.
-/// First ISO week has the majority (4 or more) of its days in January.
-/// Week of the year starts with 1 and can run up to 53.
-///
-/// \param[in] values input to extract ISO week of year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for
-/// each element of `values`.
-/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.
-///
-/// \param[in] values input to ISO calendar struct from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Quarter returns the quarter of year number for each element of `values`
-/// First quarter maps to 1 and fourth quarter maps to 4.
-///
-/// \param[in] values input to extract quarter of year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Hour returns hour value for each element of `values`
-///
-/// \param[in] values input to extract hour from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Minute returns minutes value for each element of `values`
-///
-/// \param[in] values input to extract minutes from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Second returns seconds value for each element of `values`
-///
-/// \param[in] values input to extract seconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Millisecond returns number of milliseconds since the last full second
-/// for each element of `values`
-///
-/// \param[in] values input to extract milliseconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Microsecond returns number of microseconds since the last full millisecond
-/// for each element of `values`
-///
-/// \param[in] values input to extract microseconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Microsecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Nanosecond returns number of nanoseconds since the last full millisecond
-/// for each element of `values`
-///
-/// \param[in] values input to extract nanoseconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Subsecond returns the fraction of second elapsed since last full second
-/// as a float for each element of `values`
-///
-/// \param[in] values input to extract subsecond from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
+/// \brief IfElse returns elements chosen from `left` or `right`
+/// depending on `cond`. `null` values in `cond` will produce null in the result
+///
+/// \param[in] cond `Boolean` condition Scalar/Array
+/// \param[in] left Scalar/Array
+/// \param[in] right Scalar/Array
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right,
+ ExecContext* ctx = NULLPTR);
+
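A sketch of IfElse with an array condition and scalar branches; null slots in the condition propagate to the output:

#include <iostream>
#include <memory>
#include <vector>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::BooleanBuilder cb;
  (void)cb.AppendValues(std::vector<bool>{true, false});  // error handling elided
  (void)cb.AppendNull();                                  // cond = [true, false, null]
  std::shared_ptr<arrow::Array> cond;
  (void)cb.Finish(&cond);

  arrow::Datum if_true(std::make_shared<arrow::Int64Scalar>(1));
  arrow::Datum if_false(std::make_shared<arrow::Int64Scalar>(0));
  auto result = arrow::compute::IfElse(cond, if_true, if_false);
  if (result.ok()) {
    std::cout << result->make_array()->ToString() << std::endl;  // [1, 0, null]
  }
  return 0;
}
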
+/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for
+/// each row, select the first value for which the corresponding condition is
+/// true, or (if given) select the 'else' value, else emit null. Note that a
+/// null condition is the same as false.
+///
+/// \param[in] cond Conditions (Boolean)
+/// \param[in] cases Values (any type), along with an optional 'else' value.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Year returns year for each element of `values`
+///
+/// \param[in] values input to extract year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Month returns month for each element of `values`.
+/// Month is encoded as January=1, December=12
+///
+/// \param[in] values input to extract month from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Day returns day number for each element of `values`
+///
+/// \param[in] values input to extract day from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief DayOfWeek returns the number of the day of the week for each element of
+/// `values`.
+///
+/// By default the week starts on Monday, denoted by 0, and ends on Sunday, denoted
+/// by 6. The start day of the week (Monday=1, Sunday=7) and the numbering base (0 or 1)
+/// can be set using DayOfWeekOptions.
+///
+/// \param[in] values input to extract number of the day of the week from
+/// \param[in] options for setting start of the week and day numbering
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values,
+ DayOfWeekOptions options = DayOfWeekOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief DayOfYear returns the number of the day of the year for each element of `values`.
+/// January 1st maps to day number 1, February 1st to 32, etc.
+///
+/// \param[in] values input to extract number of day of the year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOYear returns ISO year number for each element of `values`.
+/// First week of an ISO year has the majority (4 or more) of its days in January.
+///
+/// \param[in] values input to extract ISO year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOWeek returns ISO week of year number for each element of `values`.
+/// First ISO week has the majority (4 or more) of its days in January.
+/// Week of the year starts with 1 and can run up to 53.
+///
+/// \param[in] values input to extract ISO week of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for
+/// each element of `values`.
+/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.
+///
+/// \param[in] values input to extract ISO calendar struct from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Quarter returns the quarter of year number for each element of `values`.
+/// First quarter maps to 1 and fourth quarter maps to 4.
+///
+/// \param[in] values input to extract quarter of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Hour returns hour value for each element of `values`
+///
+/// \param[in] values input to extract hour from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Minute returns minutes value for each element of `values`
+///
+/// \param[in] values input to extract minutes from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Second returns seconds value for each element of `values`
+///
+/// \param[in] values input to extract seconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Millisecond returns number of milliseconds since the last full second
+/// for each element of `values`
+///
+/// \param[in] values input to extract milliseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Microsecond returns number of microseconds since the last full millisecond
+/// for each element of `values`
+///
+/// \param[in] values input to extract microseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Microsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Nanosecond returns number of nanoseconds since the last full microsecond
+/// for each element of `values`
+///
+/// \param[in] values input to extract nanoseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Subsecond returns the fraction of a second elapsed since the last full second
+/// as a float for each element of `values`
+///
+/// \param[in] values input to extract subsecond from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
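A sketch of the temporal extractors on a timestamp array. It assumes TimestampBuilder's (type, pool) constructor from arrow/api.h; DayOfWeekOptions switches the numbering to one-based ISO style:

#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

int main() {
  arrow::TimestampBuilder builder(arrow::timestamp(arrow::TimeUnit::SECOND),
                                  arrow::default_memory_pool());
  (void)builder.Append(1609459200);  // 2021-01-01 00:00:00 UTC, a Friday
  std::shared_ptr<arrow::Array> ts;
  (void)builder.Finish(&ts);

  auto year = arrow::compute::Year(ts);  // [2021]
  arrow::compute::DayOfWeekOptions options(/*one_based_numbering=*/true,
                                           /*week_start=*/1);
  auto dow = arrow::compute::DayOfWeek(ts, options);  // [5] for Friday
  if (year.ok() && dow.ok()) {
    std::cout << year->make_array()->ToString() << std::endl;
    std::cout << dow->make_array()->ToString() << std::endl;
  }
  return 0;
}
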
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
index a68969b2ee5..4b875ddaf04 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
@@ -18,140 +18,140 @@
#include "arrow/compute/api_vector.h"
#include <memory>
-#include <sstream>
+#include <sstream>
#include <utility>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/registry.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
#include "arrow/datum.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/logging.h"
namespace arrow {
-using internal::checked_cast;
+using internal::checked_cast;
using internal::checked_pointer_cast;
-namespace internal {
-using compute::DictionaryEncodeOptions;
-using compute::FilterOptions;
-template <>
-struct EnumTraits<FilterOptions::NullSelectionBehavior>
- : BasicEnumTraits<FilterOptions::NullSelectionBehavior, FilterOptions::DROP,
- FilterOptions::EMIT_NULL> {
- static std::string name() { return "FilterOptions::NullSelectionBehavior"; }
- static std::string value_name(FilterOptions::NullSelectionBehavior value) {
- switch (value) {
- case FilterOptions::DROP:
- return "DROP";
- case FilterOptions::EMIT_NULL:
- return "EMIT_NULL";
- }
- return "<INVALID>";
- }
-};
-template <>
-struct EnumTraits<DictionaryEncodeOptions::NullEncodingBehavior>
- : BasicEnumTraits<DictionaryEncodeOptions::NullEncodingBehavior,
- DictionaryEncodeOptions::ENCODE, DictionaryEncodeOptions::MASK> {
- static std::string name() { return "DictionaryEncodeOptions::NullEncodingBehavior"; }
- static std::string value_name(DictionaryEncodeOptions::NullEncodingBehavior value) {
- switch (value) {
- case DictionaryEncodeOptions::ENCODE:
- return "ENCODE";
- case DictionaryEncodeOptions::MASK:
- return "MASK";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
+namespace internal {
+using compute::DictionaryEncodeOptions;
+using compute::FilterOptions;
+template <>
+struct EnumTraits<FilterOptions::NullSelectionBehavior>
+ : BasicEnumTraits<FilterOptions::NullSelectionBehavior, FilterOptions::DROP,
+ FilterOptions::EMIT_NULL> {
+ static std::string name() { return "FilterOptions::NullSelectionBehavior"; }
+ static std::string value_name(FilterOptions::NullSelectionBehavior value) {
+ switch (value) {
+ case FilterOptions::DROP:
+ return "DROP";
+ case FilterOptions::EMIT_NULL:
+ return "EMIT_NULL";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<DictionaryEncodeOptions::NullEncodingBehavior>
+ : BasicEnumTraits<DictionaryEncodeOptions::NullEncodingBehavior,
+ DictionaryEncodeOptions::ENCODE, DictionaryEncodeOptions::MASK> {
+ static std::string name() { return "DictionaryEncodeOptions::NullEncodingBehavior"; }
+ static std::string value_name(DictionaryEncodeOptions::NullEncodingBehavior value) {
+ switch (value) {
+ case DictionaryEncodeOptions::ENCODE:
+ return "ENCODE";
+ case DictionaryEncodeOptions::MASK:
+ return "MASK";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
namespace compute {
// ----------------------------------------------------------------------
-// Function options
-
-bool SortKey::Equals(const SortKey& other) const {
- return name == other.name && order == other.order;
-}
-std::string SortKey::ToString() const {
- std::stringstream ss;
- ss << name << ' ';
- switch (order) {
- case SortOrder::Ascending:
- ss << "ASC";
- break;
- case SortOrder::Descending:
- ss << "DESC";
- break;
- }
- return ss.str();
-}
-
-namespace internal {
-namespace {
-using ::arrow::internal::DataMember;
-static auto kFilterOptionsType = GetFunctionOptionsType<FilterOptions>(
- DataMember("null_selection_behavior", &FilterOptions::null_selection_behavior));
-static auto kTakeOptionsType = GetFunctionOptionsType<TakeOptions>(
- DataMember("boundscheck", &TakeOptions::boundscheck));
-static auto kDictionaryEncodeOptionsType =
- GetFunctionOptionsType<DictionaryEncodeOptions>(DataMember(
- "null_encoding_behavior", &DictionaryEncodeOptions::null_encoding_behavior));
-static auto kArraySortOptionsType = GetFunctionOptionsType<ArraySortOptions>(
- DataMember("order", &ArraySortOptions::order));
-static auto kSortOptionsType =
- GetFunctionOptionsType<SortOptions>(DataMember("sort_keys", &SortOptions::sort_keys));
-static auto kPartitionNthOptionsType = GetFunctionOptionsType<PartitionNthOptions>(
- DataMember("pivot", &PartitionNthOptions::pivot));
-} // namespace
-} // namespace internal
-
-FilterOptions::FilterOptions(NullSelectionBehavior null_selection)
- : FunctionOptions(internal::kFilterOptionsType),
- null_selection_behavior(null_selection) {}
-constexpr char FilterOptions::kTypeName[];
-
-TakeOptions::TakeOptions(bool boundscheck)
- : FunctionOptions(internal::kTakeOptionsType), boundscheck(boundscheck) {}
-constexpr char TakeOptions::kTypeName[];
-
-DictionaryEncodeOptions::DictionaryEncodeOptions(NullEncodingBehavior null_encoding)
- : FunctionOptions(internal::kDictionaryEncodeOptionsType),
- null_encoding_behavior(null_encoding) {}
-constexpr char DictionaryEncodeOptions::kTypeName[];
-
-ArraySortOptions::ArraySortOptions(SortOrder order)
- : FunctionOptions(internal::kArraySortOptionsType), order(order) {}
-constexpr char ArraySortOptions::kTypeName[];
-
-SortOptions::SortOptions(std::vector<SortKey> sort_keys)
- : FunctionOptions(internal::kSortOptionsType), sort_keys(std::move(sort_keys)) {}
-constexpr char SortOptions::kTypeName[];
-
-PartitionNthOptions::PartitionNthOptions(int64_t pivot)
- : FunctionOptions(internal::kPartitionNthOptionsType), pivot(pivot) {}
-constexpr char PartitionNthOptions::kTypeName[];
-
-namespace internal {
-void RegisterVectorOptions(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kTakeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kDictionaryEncodeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kArraySortOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSortOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType));
-}
-} // namespace internal
-
-// ----------------------------------------------------------------------
+// Function options
+
+bool SortKey::Equals(const SortKey& other) const {
+ return name == other.name && order == other.order;
+}
+std::string SortKey::ToString() const {
+ std::stringstream ss;
+ ss << name << ' ';
+ switch (order) {
+ case SortOrder::Ascending:
+ ss << "ASC";
+ break;
+ case SortOrder::Descending:
+ ss << "DESC";
+ break;
+ }
+ return ss.str();
+}
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kFilterOptionsType = GetFunctionOptionsType<FilterOptions>(
+ DataMember("null_selection_behavior", &FilterOptions::null_selection_behavior));
+static auto kTakeOptionsType = GetFunctionOptionsType<TakeOptions>(
+ DataMember("boundscheck", &TakeOptions::boundscheck));
+static auto kDictionaryEncodeOptionsType =
+ GetFunctionOptionsType<DictionaryEncodeOptions>(DataMember(
+ "null_encoding_behavior", &DictionaryEncodeOptions::null_encoding_behavior));
+static auto kArraySortOptionsType = GetFunctionOptionsType<ArraySortOptions>(
+ DataMember("order", &ArraySortOptions::order));
+static auto kSortOptionsType =
+ GetFunctionOptionsType<SortOptions>(DataMember("sort_keys", &SortOptions::sort_keys));
+static auto kPartitionNthOptionsType = GetFunctionOptionsType<PartitionNthOptions>(
+ DataMember("pivot", &PartitionNthOptions::pivot));
+} // namespace
+} // namespace internal
+
+FilterOptions::FilterOptions(NullSelectionBehavior null_selection)
+ : FunctionOptions(internal::kFilterOptionsType),
+ null_selection_behavior(null_selection) {}
+constexpr char FilterOptions::kTypeName[];
+
+TakeOptions::TakeOptions(bool boundscheck)
+ : FunctionOptions(internal::kTakeOptionsType), boundscheck(boundscheck) {}
+constexpr char TakeOptions::kTypeName[];
+
+DictionaryEncodeOptions::DictionaryEncodeOptions(NullEncodingBehavior null_encoding)
+ : FunctionOptions(internal::kDictionaryEncodeOptionsType),
+ null_encoding_behavior(null_encoding) {}
+constexpr char DictionaryEncodeOptions::kTypeName[];
+
+ArraySortOptions::ArraySortOptions(SortOrder order)
+ : FunctionOptions(internal::kArraySortOptionsType), order(order) {}
+constexpr char ArraySortOptions::kTypeName[];
+
+SortOptions::SortOptions(std::vector<SortKey> sort_keys)
+ : FunctionOptions(internal::kSortOptionsType), sort_keys(std::move(sort_keys)) {}
+constexpr char SortOptions::kTypeName[];
+
+PartitionNthOptions::PartitionNthOptions(int64_t pivot)
+ : FunctionOptions(internal::kPartitionNthOptionsType), pivot(pivot) {}
+constexpr char PartitionNthOptions::kTypeName[];
+
+namespace internal {
+void RegisterVectorOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTakeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kDictionaryEncodeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kArraySortOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSortOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType));
+}
+} // namespace internal
+
+// ----------------------------------------------------------------------
// Direct exec interface to kernels
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
@@ -162,42 +162,42 @@ Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
return result.make_array();
}
-Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
- const Datum& replacements, ExecContext* ctx) {
- return CallFunction("replace_with_mask", {values, mask, replacements}, ctx);
-}
-
-Result<std::shared_ptr<Array>> SortIndices(const Array& values, SortOrder order,
- ExecContext* ctx) {
- ArraySortOptions options(order);
- ARROW_ASSIGN_OR_RAISE(
- Datum result, CallFunction("array_sort_indices", {Datum(values)}, &options, ctx));
- return result.make_array();
-}
-
-Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
- SortOrder order, ExecContext* ctx) {
- SortOptions options({SortKey("not-used", order)});
- ARROW_ASSIGN_OR_RAISE(
- Datum result, CallFunction("sort_indices", {Datum(chunked_array)}, &options, ctx));
- return result.make_array();
-}
-
-Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
- ExecContext* ctx) {
- ARROW_ASSIGN_OR_RAISE(Datum result,
- CallFunction("sort_indices", {datum}, &options, ctx));
+Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
+ const Datum& replacements, ExecContext* ctx) {
+ return CallFunction("replace_with_mask", {values, mask, replacements}, ctx);
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const Array& values, SortOrder order,
+ ExecContext* ctx) {
+ ArraySortOptions options(order);
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result, CallFunction("array_sort_indices", {Datum(values)}, &options, ctx));
return result.make_array();
}
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+ SortOrder order, ExecContext* ctx) {
+ SortOptions options({SortKey("not-used", order)});
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result, CallFunction("sort_indices", {Datum(chunked_array)}, &options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
+ ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result,
+ CallFunction("sort_indices", {datum}, &options, ctx));
+ return result.make_array();
+}
+
Result<std::shared_ptr<Array>> Unique(const Datum& value, ExecContext* ctx) {
ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("unique", {value}, ctx));
return result.make_array();
}
-Result<Datum> DictionaryEncode(const Datum& value, const DictionaryEncodeOptions& options,
- ExecContext* ctx) {
- return CallFunction("dictionary_encode", {value}, &options, ctx);
+Result<Datum> DictionaryEncode(const Datum& value, const DictionaryEncodeOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("dictionary_encode", {value}, &options, ctx);
}
const char kValuesFieldName[] = "values";
@@ -275,9 +275,9 @@ Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indi
return result.table();
}
-Result<std::shared_ptr<Array>> SortToIndices(const Array& values, ExecContext* ctx) {
- return SortIndices(values, SortOrder::Ascending, ctx);
-}
-
+Result<std::shared_ptr<Array>> SortToIndices(const Array& values, ExecContext* ctx) {
+ return SortIndices(values, SortOrder::Ascending, ctx);
+}
+
} // namespace compute
} // namespace arrow
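
For reference, a minimal sketch of the options classes wired up above (SortKey, SortOptions, and SortKey::ToString); the function name and column names are illustrative only, and the snippet assumes the usual arrow/compute headers:

    #include <arrow/compute/api_vector.h>
    #include <iostream>

    void SortOptionsSketch() {
      namespace cp = arrow::compute;
      cp::SortKey key("column1", cp::SortOrder::Descending);
      std::cout << key.ToString() << "\n";  // "column1 DESC", per SortKey::ToString above
      // A second key defaults to SortOrder::Ascending.
      cp::SortOptions options({key, cp::SortKey("column2")});
      // `options` can then be passed to CallFunction("sort_indices", ...) as in
      // the SortIndices overloads above.
    }
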
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
index 9d8d4271db8..c3a81542b76 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
@@ -32,8 +32,8 @@ class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
-class ARROW_EXPORT FilterOptions : public FunctionOptions {
- public:
+class ARROW_EXPORT FilterOptions : public FunctionOptions {
+ public:
/// Configure the action taken when a slot of the selection mask is null
enum NullSelectionBehavior {
/// the corresponding filtered value will be removed in the output
@@ -42,89 +42,89 @@ class ARROW_EXPORT FilterOptions : public FunctionOptions {
EMIT_NULL,
};
- explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
- constexpr static char const kTypeName[] = "FilterOptions";
+ explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
+ constexpr static char const kTypeName[] = "FilterOptions";
static FilterOptions Defaults() { return FilterOptions(); }
NullSelectionBehavior null_selection_behavior = DROP;
};
-class ARROW_EXPORT TakeOptions : public FunctionOptions {
- public:
- explicit TakeOptions(bool boundscheck = true);
- constexpr static char const kTypeName[] = "TakeOptions";
+class ARROW_EXPORT TakeOptions : public FunctionOptions {
+ public:
+ explicit TakeOptions(bool boundscheck = true);
+ constexpr static char const kTypeName[] = "TakeOptions";
static TakeOptions BoundsCheck() { return TakeOptions(true); }
static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
static TakeOptions Defaults() { return BoundsCheck(); }
-
- bool boundscheck = true;
-};
-
-/// \brief Options for the dictionary encode function
-class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
- public:
- /// Configure how null values will be encoded
- enum NullEncodingBehavior {
- /// the null value will be added to the dictionary with a proper index
- ENCODE,
- /// the null value will be masked in the indices array
- MASK
- };
-
- explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
- constexpr static char const kTypeName[] = "DictionaryEncodeOptions";
- static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
-
- NullEncodingBehavior null_encoding_behavior = MASK;
-};
-
-enum class SortOrder {
- Ascending,
- Descending,
-};
-
-/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
-class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
- public:
- explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
- : name(name), order(order) {}
-
- using util::EqualityComparable<SortKey>::Equals;
- using util::EqualityComparable<SortKey>::operator==;
- using util::EqualityComparable<SortKey>::operator!=;
- bool Equals(const SortKey& other) const;
- std::string ToString() const;
-
- /// The name of the sort column.
- std::string name;
- /// How to order by this sort key.
- SortOrder order;
-};
-
-class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
- public:
- explicit ArraySortOptions(SortOrder order = SortOrder::Ascending);
- constexpr static char const kTypeName[] = "ArraySortOptions";
- static ArraySortOptions Defaults() { return ArraySortOptions{}; }
-
- SortOrder order;
-};
-
-class ARROW_EXPORT SortOptions : public FunctionOptions {
- public:
- explicit SortOptions(std::vector<SortKey> sort_keys = {});
- constexpr static char const kTypeName[] = "SortOptions";
- static SortOptions Defaults() { return SortOptions{}; }
-
- std::vector<SortKey> sort_keys;
+
+ bool boundscheck = true;
};
+/// \brief Options for the dictionary encode function
+class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
+ public:
+ /// Configure how null values will be encoded
+ enum NullEncodingBehavior {
+ /// the null value will be added to the dictionary with a proper index
+ ENCODE,
+ /// the null value will be masked in the indices array
+ MASK
+ };
+
+ explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
+ constexpr static char const kTypeName[] = "DictionaryEncodeOptions";
+ static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
+
+ NullEncodingBehavior null_encoding_behavior = MASK;
+};
+
+enum class SortOrder {
+ Ascending,
+ Descending,
+};
+
+/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
+class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
+ public:
+ explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
+ : name(name), order(order) {}
+
+ using util::EqualityComparable<SortKey>::Equals;
+ using util::EqualityComparable<SortKey>::operator==;
+ using util::EqualityComparable<SortKey>::operator!=;
+ bool Equals(const SortKey& other) const;
+ std::string ToString() const;
+
+ /// The name of the sort column.
+ std::string name;
+ /// How to order by this sort key.
+ SortOrder order;
+};
+
+class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
+ public:
+ explicit ArraySortOptions(SortOrder order = SortOrder::Ascending);
+ constexpr static char const kTypeName[] = "ArraySortOptions";
+ static ArraySortOptions Defaults() { return ArraySortOptions{}; }
+
+ SortOrder order;
+};
+
+class ARROW_EXPORT SortOptions : public FunctionOptions {
+ public:
+ explicit SortOptions(std::vector<SortKey> sort_keys = {});
+ constexpr static char const kTypeName[] = "SortOptions";
+ static SortOptions Defaults() { return SortOptions{}; }
+
+ std::vector<SortKey> sort_keys;
+};
+
/// \brief Partitioning options for NthToIndices
-class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
- public:
- explicit PartitionNthOptions(int64_t pivot);
- PartitionNthOptions() : PartitionNthOptions(0) {}
- constexpr static char const kTypeName[] = "PartitionNthOptions";
+class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
+ public:
+ explicit PartitionNthOptions(int64_t pivot);
+ PartitionNthOptions() : PartitionNthOptions(0) {}
+ constexpr static char const kTypeName[] = "PartitionNthOptions";
/// The index into the equivalent sorted array of the partition pivot element.
int64_t pivot;
@@ -171,23 +171,23 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndices(
} // namespace internal
-/// \brief ReplaceWithMask replaces each value in the array corresponding
-/// to a true value in the mask with the next element from `replacements`.
-///
-/// \param[in] values Array input to replace
-/// \param[in] mask Array or Scalar of Boolean mask values
-/// \param[in] replacements The replacement values to draw from. There must
-/// be as many replacement values as true values in the mask.
-/// \param[in] ctx the function execution context, optional
-///
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
- const Datum& replacements, ExecContext* ctx = NULLPTR);
-
+/// \brief ReplaceWithMask replaces each value in the array corresponding
+/// to a true value in the mask with the next element from `replacements`.
+///
+/// \param[in] values Array input to replace
+/// \param[in] mask Array or Scalar of Boolean mask values
+/// \param[in] replacements The replacement values to draw from. There must
+/// be as many replacement values as true values in the mask.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
+ const Datum& replacements, ExecContext* ctx = NULLPTR);
+
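
A hedged usage sketch of the ReplaceWithMask contract documented above (one replacement value per true mask slot); the builders and literal values are illustrative only:

    #include <arrow/api.h>
    #include <arrow/compute/api_vector.h>

    arrow::Result<arrow::Datum> ReplaceSketch() {
      arrow::Int64Builder values_builder;
      ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3}));
      ARROW_ASSIGN_OR_RAISE(auto values, values_builder.Finish());
      arrow::BooleanBuilder mask_builder;
      ARROW_RETURN_NOT_OK(mask_builder.AppendValues(std::vector<bool>{false, true, false}));
      ARROW_ASSIGN_OR_RAISE(auto mask, mask_builder.Finish());
      arrow::Int64Builder repl_builder;  // exactly one value for the one true slot
      ARROW_RETURN_NOT_OK(repl_builder.AppendValues({42}));
      ARROW_ASSIGN_OR_RAISE(auto replacements, repl_builder.Finish());
      // Expected result: [1, 42, 3]
      return arrow::compute::ReplaceWithMask(values, mask, replacements);
    }
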
/// \brief Take from an array of values at indices in another array
///
/// The output array will be of the same type as the input values
@@ -233,73 +233,73 @@ ARROW_EXPORT
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
ExecContext* ctx = NULLPTR);
-/// \brief Returns the indices that would sort an array in the
-/// specified order.
+/// \brief Returns the indices that would sort an array in the
+/// specified order.
///
/// Perform an indirect sort of array. The output array will contain
/// indices that would sort an array, which would be the same length
-/// as input. Nulls will be stably partitioned to the end of the output
-/// regardless of order.
+/// as input. Nulls will be stably partitioned to the end of the output
+/// regardless of order.
///
-/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
-/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
-/// 3].
+/// For example, given array = [null, 1, 3.3, null, 2, 5.3] and order
+/// = SortOrder::Descending, the output will be [5, 2, 4, 1, 0,
+/// 3].
///
-/// \param[in] array array to sort
-/// \param[in] order ascending or descending
+/// \param[in] array array to sort
+/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return indices that would sort an array
ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortIndices(const Array& array,
- SortOrder order = SortOrder::Ascending,
- ExecContext* ctx = NULLPTR);
-
-/// \brief Returns the indices that would sort a chunked array in the
-/// specified order.
-///
-/// Perform an indirect sort of chunked array. The output array will
-/// contain indices that would sort a chunked array, which would be
-/// the same length as input. Nulls will be stably partitioned to the
-/// end of the output regardless of order.
-///
-/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
-/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
-/// 4, 1, 0, 3].
-///
-/// \param[in] chunked_array chunked array to sort
-/// \param[in] order ascending or descending
-/// \param[in] ctx the function execution context, optional
-/// \return offsets indices that would sort an array
-ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
- SortOrder order = SortOrder::Ascending,
- ExecContext* ctx = NULLPTR);
-
-/// \brief Returns the indices that would sort an input in the
-/// specified order. Input is one of array, chunked array record batch
-/// or table.
-///
-/// Perform an indirect sort of input. The output array will contain
-/// indices that would sort an input, which would be the same length
-/// as input. Nulls will be stably partitioned to the end of the
-/// output regardless of order.
-///
-/// For example given input (table) = {
-/// "column1": [[null, 1], [ 3, null, 2, 1]],
-/// "column2": [[ 5], [3, null, null, 5, 5]],
-/// } and options = {
-/// {"column1", SortOrder::Ascending},
-/// {"column2", SortOrder::Descending},
-/// }, the output will be [5, 1, 4, 2, 0, 3].
-///
-/// \param[in] datum array, chunked array, record batch or table to sort
-/// \param[in] options options
-/// \param[in] ctx the function execution context, optional
-/// \return offsets indices that would sort a table
-ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
- ExecContext* ctx = NULLPTR);
-
+Result<std::shared_ptr<Array>> SortIndices(const Array& array,
+ SortOrder order = SortOrder::Ascending,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Returns the indices that would sort a chunked array in the
+/// specified order.
+///
+/// Perform an indirect sort of chunked array. The output array will
+/// contain indices that would sort a chunked array, which would be
+/// the same length as input. Nulls will be stably partitioned to the
+/// end of the output regardless of order.
+///
+/// For example, given chunked_array = [[null, 1], [3.3], [null, 2,
+/// 5.3]] and order = SortOrder::Descending, the output will be [5, 2,
+/// 4, 1, 0, 3].
+///
+/// \param[in] chunked_array chunked array to sort
+/// \param[in] order ascending or descending
+/// \param[in] ctx the function execution context, optional
+/// \return indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+ SortOrder order = SortOrder::Ascending,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Returns the indices that would sort an input in the
+/// specified order. Input is one of array, chunked array, record batch,
+/// or table.
+///
+/// Perform an indirect sort of input. The output array will contain
+/// indices that would sort an input, which would be the same length
+/// as input. Nulls will be stably partitioned to the end of the
+/// output regardless of order.
+///
+/// For example, given input (table) = {
+/// "column1": [[null, 1], [ 3, null, 2, 1]],
+/// "column2": [[ 5], [3, null, null, 5, 5]],
+/// } and options = {
+/// {"column1", SortOrder::Ascending},
+/// {"column2", SortOrder::Descending},
+/// }, the output will be [5, 1, 4, 2, 0, 3].
+///
+/// \param[in] datum array, chunked array, record batch or table to sort
+/// \param[in] options options
+/// \param[in] ctx the function execution context, optional
+/// \return indices that would sort a table
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
+ ExecContext* ctx = NULLPTR);
+
/// \brief Compute unique elements from an array-like object
///
/// Note if a null occurs in the input it will NOT be included in the output.
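
The table example in the SortIndices documentation above, turned into a short call sketch; `table` is an assumed pre-built arrow::Table carrying the two columns named in that comment:

    #include <arrow/compute/api_vector.h>
    #include <arrow/table.h>

    arrow::Result<std::shared_ptr<arrow::Array>> SortTableSketch(
        const std::shared_ptr<arrow::Table>& table) {
      namespace cp = arrow::compute;
      cp::SortOptions options({cp::SortKey("column1", cp::SortOrder::Ascending),
                               cp::SortKey("column2", cp::SortOrder::Descending)});
      // For the example table documented above this yields [5, 1, 4, 2, 0, 3].
      return cp::SortIndices(arrow::Datum(table), options);
    }
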
@@ -338,29 +338,29 @@ Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
ExecContext* ctx = NULLPTR);
/// \brief Dictionary-encode values in an array-like object
-///
-/// Any nulls encountered in the dictionary will be handled according to the
-/// specified null encoding behavior.
-///
-/// For example, given values ["a", "b", null, "a", null] the output will be
-/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
-/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
-///
-/// If the input is already dictionary encoded this function is a no-op unless
-/// it needs to modify the null_encoding (TODO)
-///
+///
+/// Any nulls encountered in the dictionary will be handled according to the
+/// specified null encoding behavior.
+///
+/// For example, given values ["a", "b", null, "a", null] the output will be
+/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
+/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
+///
+/// If the input is already dictionary encoded this function is a no-op unless
+/// it needs to modify the null_encoding (TODO)
+///
/// \param[in] data array-like input
/// \param[in] ctx the function execution context, optional
-/// \param[in] options configures null encoding behavior
+/// \param[in] options configures null encoding behavior
/// \return result with same shape and type as input
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> DictionaryEncode(
- const Datum& data,
- const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> DictionaryEncode(
+ const Datum& data,
+ const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
// ----------------------------------------------------------------------
// Deprecated functions
@@ -401,10 +401,10 @@ Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indi
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* context = NULLPTR);
-ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()")
-ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
- ExecContext* ctx = NULLPTR);
-
+ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()")
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
+ ExecContext* ctx = NULLPTR);
+
} // namespace compute
} // namespace arrow
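
A hedged sketch of the two null-encoding behaviors documented for DictionaryEncode above; `strings` is an assumed pre-built array such as ["a", "b", null, "a", null]:

    #include <arrow/api.h>
    #include <arrow/compute/api_vector.h>

    arrow::Result<arrow::Datum> EncodeSketch(const std::shared_ptr<arrow::Array>& strings) {
      namespace cp = arrow::compute;
      // Default (MASK): indices [0, 1, null, 0, null], dictionary ["a", "b"].
      ARROW_ASSIGN_OR_RAISE(arrow::Datum masked, cp::DictionaryEncode(strings));
      // ENCODE: indices [0, 1, 2, 0, 2], dictionary ["a", "b", null].
      cp::DictionaryEncodeOptions options(cp::DictionaryEncodeOptions::ENCODE);
      return cp::DictionaryEncode(strings, options);
    }
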
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
index 4de68ba8d90..db3b2e05da4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
@@ -18,7 +18,7 @@
#include "arrow/compute/cast.h"
#include <mutex>
-#include <sstream>
+#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
@@ -27,12 +27,12 @@
#include "arrow/compute/cast_internal.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
+#include "arrow/compute/function_internal.h"
#include "arrow/compute/kernel.h"
#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/registry.h"
#include "arrow/util/logging.h"
-#include "arrow/util/reflection_internal.h"
+#include "arrow/util/reflection_internal.h"
namespace arrow {
@@ -41,13 +41,13 @@ using internal::ToTypeName;
namespace compute {
namespace internal {
-// ----------------------------------------------------------------------
-// Function options
-
-namespace {
-
+// ----------------------------------------------------------------------
+// Function options
+
+namespace {
+
std::unordered_map<int, std::shared_ptr<CastFunction>> g_cast_table;
-std::once_flag cast_table_initialized;
+std::once_flag cast_table_initialized;
void AddCastFunctions(const std::vector<std::shared_ptr<CastFunction>>& funcs) {
for (const auto& func : funcs) {
@@ -61,7 +61,7 @@ void InitCastTable() {
AddCastFunctions(GetNestedCasts());
AddCastFunctions(GetNumericCasts());
AddCastFunctions(GetTemporalCasts());
- AddCastFunctions(GetDictionaryCasts());
+ AddCastFunctions(GetDictionaryCasts());
}
void EnsureInitCastTable() { std::call_once(cast_table_initialized, InitCastTable); }
@@ -85,17 +85,17 @@ Result<std::shared_ptr<CastFunction>> GetCastFunctionInternal(
return it->second;
}
-const FunctionDoc cast_doc{"Cast values to another data type",
- ("Behavior when values wouldn't fit in the target type\n"
- "can be controlled through CastOptions."),
- {"input"},
- "CastOptions"};
+const FunctionDoc cast_doc{"Cast values to another data type",
+ ("Behavior when values wouldn't fit in the target type\n"
+ "can be controlled through CastOptions."),
+ {"input"},
+ "CastOptions"};
-// Metafunction for dispatching to appropriate CastFunction. This corresponds
+// Metafunction for dispatching to the appropriate CastFunction. This corresponds
// to the standard SQL CAST(expr AS target_type)
class CastMetaFunction : public MetaFunction {
public:
- CastMetaFunction() : MetaFunction("cast", Arity::Unary(), &cast_doc) {}
+ CastMetaFunction() : MetaFunction("cast", Arity::Unary(), &cast_doc) {}
Result<const CastOptions*> ValidateOptions(const FunctionOptions* options) const {
auto cast_options = static_cast<const CastOptions*>(options);
@@ -123,44 +123,44 @@ class CastMetaFunction : public MetaFunction {
}
};
-static auto kCastOptionsType = GetFunctionOptionsType<CastOptions>(
- arrow::internal::DataMember("to_type", &CastOptions::to_type),
- arrow::internal::DataMember("allow_int_overflow", &CastOptions::allow_int_overflow),
- arrow::internal::DataMember("allow_time_truncate", &CastOptions::allow_time_truncate),
- arrow::internal::DataMember("allow_time_overflow", &CastOptions::allow_time_overflow),
- arrow::internal::DataMember("allow_decimal_truncate",
- &CastOptions::allow_decimal_truncate),
- arrow::internal::DataMember("allow_float_truncate",
- &CastOptions::allow_float_truncate),
- arrow::internal::DataMember("allow_invalid_utf8", &CastOptions::allow_invalid_utf8));
-} // namespace
-
+static auto kCastOptionsType = GetFunctionOptionsType<CastOptions>(
+ arrow::internal::DataMember("to_type", &CastOptions::to_type),
+ arrow::internal::DataMember("allow_int_overflow", &CastOptions::allow_int_overflow),
+ arrow::internal::DataMember("allow_time_truncate", &CastOptions::allow_time_truncate),
+ arrow::internal::DataMember("allow_time_overflow", &CastOptions::allow_time_overflow),
+ arrow::internal::DataMember("allow_decimal_truncate",
+ &CastOptions::allow_decimal_truncate),
+ arrow::internal::DataMember("allow_float_truncate",
+ &CastOptions::allow_float_truncate),
+ arrow::internal::DataMember("allow_invalid_utf8", &CastOptions::allow_invalid_utf8));
+} // namespace
+
void RegisterScalarCast(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::make_shared<CastMetaFunction>()));
- DCHECK_OK(registry->AddFunctionOptionsType(kCastOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kCastOptionsType));
}
} // namespace internal
-CastOptions::CastOptions(bool safe)
- : FunctionOptions(internal::kCastOptionsType),
- allow_int_overflow(!safe),
- allow_time_truncate(!safe),
- allow_time_overflow(!safe),
- allow_decimal_truncate(!safe),
- allow_float_truncate(!safe),
- allow_invalid_utf8(!safe) {}
+CastOptions::CastOptions(bool safe)
+ : FunctionOptions(internal::kCastOptionsType),
+ allow_int_overflow(!safe),
+ allow_time_truncate(!safe),
+ allow_time_overflow(!safe),
+ allow_decimal_truncate(!safe),
+ allow_float_truncate(!safe),
+ allow_invalid_utf8(!safe) {}
-constexpr char CastOptions::kTypeName[];
+constexpr char CastOptions::kTypeName[];
-CastFunction::CastFunction(std::string name, Type::type out_type_id)
- : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr),
- out_type_id_(out_type_id) {}
+CastFunction::CastFunction(std::string name, Type::type out_type_id)
+ : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr),
+ out_type_id_(out_type_id) {}
Status CastFunction::AddKernel(Type::type in_type_id, ScalarKernel kernel) {
// We use the same KernelInit for every cast
kernel.init = internal::CastState::Init;
RETURN_NOT_OK(ScalarFunction::AddKernel(kernel));
- in_type_ids_.push_back(in_type_id);
+ in_type_ids_.push_back(in_type_id);
return Status::OK();
}
@@ -176,9 +176,9 @@ Status CastFunction::AddKernel(Type::type in_type_id, std::vector<InputType> in_
return AddKernel(in_type_id, std::move(kernel));
}
-Result<const Kernel*> CastFunction::DispatchExact(
+Result<const Kernel*> CastFunction::DispatchExact(
const std::vector<ValueDescr>& values) const {
- RETURN_NOT_OK(CheckArity(values));
+ RETURN_NOT_OK(CheckArity(values));
std::vector<const ScalarKernel*> candidate_kernels;
for (const auto& kernel : kernels_) {
@@ -189,28 +189,28 @@ Result<const Kernel*> CastFunction::DispatchExact(
if (candidate_kernels.size() == 0) {
return Status::NotImplemented("Unsupported cast from ", values[0].type->ToString(),
- " to ", ToTypeName(out_type_id_), " using function ",
+ " to ", ToTypeName(out_type_id_), " using function ",
this->name());
- }
-
- if (candidate_kernels.size() == 1) {
+ }
+
+ if (candidate_kernels.size() == 1) {
// One match, return it
return candidate_kernels[0];
- }
-
- // Now we are in a casting scenario where we may have both a EXACT_TYPE and
- // a SAME_TYPE_ID. So we will see if there is an exact match among the
- // candidate kernels and if not we will just return the first one
- for (auto kernel : candidate_kernels) {
- const InputType& arg0 = kernel->signature->in_types()[0];
- if (arg0.kind() == InputType::EXACT_TYPE) {
- // Bingo. Return it
- return kernel;
+ }
+
+ // Now we are in a casting scenario where we may have both an EXACT_TYPE and
+ // a SAME_TYPE_ID. So we will see if there is an exact match among the
+ // candidate kernels and if not we will just return the first one
+ for (auto kernel : candidate_kernels) {
+ const InputType& arg0 = kernel->signature->in_types()[0];
+ if (arg0.kind() == InputType::EXACT_TYPE) {
+ // Bingo. Return it
+ return kernel;
}
}
-
- // We didn't find an exact match. So just return some kernel that matches
- return candidate_kernels[0];
+
+ // We didn't find an exact match. So just return some kernel that matches
+ return candidate_kernels[0];
}
Result<Datum> Cast(const Datum& value, const CastOptions& options, ExecContext* ctx) {
@@ -237,37 +237,37 @@ Result<std::shared_ptr<CastFunction>> GetCastFunction(
bool CanCast(const DataType& from_type, const DataType& to_type) {
internal::EnsureInitCastTable();
- auto it = internal::g_cast_table.find(static_cast<int>(to_type.id()));
+ auto it = internal::g_cast_table.find(static_cast<int>(to_type.id()));
if (it == internal::g_cast_table.end()) {
return false;
}
-
- const CastFunction* function = it->second.get();
- DCHECK_EQ(function->out_type_id(), to_type.id());
-
- for (auto from_id : function->in_type_ids()) {
- // XXX should probably check the output type as well
- if (from_type.id() == from_id) return true;
- }
-
- return false;
-}
-
-Result<std::vector<Datum>> Cast(std::vector<Datum> datums, std::vector<ValueDescr> descrs,
- ExecContext* ctx) {
- for (size_t i = 0; i != datums.size(); ++i) {
- if (descrs[i] != datums[i].descr()) {
- if (descrs[i].shape != datums[i].shape()) {
- return Status::NotImplemented("casting between Datum shapes");
- }
-
- ARROW_ASSIGN_OR_RAISE(datums[i],
- Cast(datums[i], CastOptions::Safe(descrs[i].type), ctx));
- }
- }
-
- return datums;
+
+ const CastFunction* function = it->second.get();
+ DCHECK_EQ(function->out_type_id(), to_type.id());
+
+ for (auto from_id : function->in_type_ids()) {
+ // XXX should probably check the output type as well
+ if (from_type.id() == from_id) return true;
+ }
+
+ return false;
}
+Result<std::vector<Datum>> Cast(std::vector<Datum> datums, std::vector<ValueDescr> descrs,
+ ExecContext* ctx) {
+ for (size_t i = 0; i != datums.size(); ++i) {
+ if (descrs[i] != datums[i].descr()) {
+ if (descrs[i].shape != datums[i].shape()) {
+ return Status::NotImplemented("casting between Datum shapes");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(datums[i],
+ Cast(datums[i], CastOptions::Safe(descrs[i].type), ctx));
+ }
+ }
+
+ return datums;
+}
+
} // namespace compute
} // namespace arrow
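
A minimal sketch of the CanCast/Cast entry points defined above; the int64 target type is arbitrary, and the two-argument Cast call relies on the defaulted ExecContext:

    #include <arrow/api.h>
    #include <arrow/compute/cast.h>

    arrow::Result<arrow::Datum> CastSketch(const std::shared_ptr<arrow::Array>& arr) {
      namespace cp = arrow::compute;
      if (!cp::CanCast(*arr->type(), *arrow::int64())) {
        return arrow::Status::NotImplemented("no cast to int64 registered for ",
                                             arr->type()->ToString());
      }
      return cp::Cast(arr, cp::CastOptions::Safe(arrow::int64()));
    }
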
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
index 131f57f892f..5a2afd86845 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
@@ -41,22 +41,22 @@ class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
-class ARROW_EXPORT CastOptions : public FunctionOptions {
- public:
- explicit CastOptions(bool safe = true);
-
- constexpr static char const kTypeName[] = "CastOptions";
- static CastOptions Safe(std::shared_ptr<DataType> to_type = NULLPTR) {
- CastOptions safe(true);
- safe.to_type = std::move(to_type);
- return safe;
- }
-
- static CastOptions Unsafe(std::shared_ptr<DataType> to_type = NULLPTR) {
- CastOptions unsafe(false);
- unsafe.to_type = std::move(to_type);
- return unsafe;
- }
+class ARROW_EXPORT CastOptions : public FunctionOptions {
+ public:
+ explicit CastOptions(bool safe = true);
+
+ constexpr static char const kTypeName[] = "CastOptions";
+ static CastOptions Safe(std::shared_ptr<DataType> to_type = NULLPTR) {
+ CastOptions safe(true);
+ safe.to_type = std::move(to_type);
+ return safe;
+ }
+
+ static CastOptions Unsafe(std::shared_ptr<DataType> to_type = NULLPTR) {
+ CastOptions unsafe(false);
+ unsafe.to_type = std::move(to_type);
+ return unsafe;
+ }
// Type being cast to. May be passed separately to the eager function
// compute::Cast
@@ -78,10 +78,10 @@ class ARROW_EXPORT CastOptions : public FunctionOptions {
// the same execution machinery
class CastFunction : public ScalarFunction {
public:
- CastFunction(std::string name, Type::type out_type_id);
+ CastFunction(std::string name, Type::type out_type_id);
- Type::type out_type_id() const { return out_type_id_; }
- const std::vector<Type::type>& in_type_ids() const { return in_type_ids_; }
+ Type::type out_type_id() const { return out_type_id_; }
+ const std::vector<Type::type>& in_type_ids() const { return in_type_ids_; }
Status AddKernel(Type::type in_type_id, std::vector<InputType> in_types,
OutputType out_type, ArrayKernelExec exec,
@@ -92,12 +92,12 @@ class CastFunction : public ScalarFunction {
// function to CastInit
Status AddKernel(Type::type in_type_id, ScalarKernel kernel);
- Result<const Kernel*> DispatchExact(
+ Result<const Kernel*> DispatchExact(
const std::vector<ValueDescr>& values) const override;
private:
- std::vector<Type::type> in_type_ids_;
- const Type::type out_type_id_;
+ std::vector<Type::type> in_type_ids_;
+ const Type::type out_type_id_;
};
ARROW_EXPORT
@@ -151,17 +151,17 @@ Result<Datum> Cast(const Datum& value, std::shared_ptr<DataType> to_type,
const CastOptions& options = CastOptions::Safe(),
ExecContext* ctx = NULLPTR);
-/// \brief Cast several values simultaneously. Safe cast options are used.
-/// \param[in] values datums to cast
-/// \param[in] descrs ValueDescrs to cast to
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datums
-///
-/// \since 4.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<std::vector<Datum>> Cast(std::vector<Datum> values, std::vector<ValueDescr> descrs,
- ExecContext* ctx = NULLPTR);
-
+/// \brief Cast several values simultaneously. Safe cast options are used.
+/// \param[in] values datums to cast
+/// \param[in] descrs ValueDescrs to cast to
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datums
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::vector<Datum>> Cast(std::vector<Datum> values, std::vector<ValueDescr> descrs,
+ ExecContext* ctx = NULLPTR);
+
} // namespace compute
} // namespace arrow
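
The vectorized Cast declared just above is marked "API not yet finalized"; this sketch shows only the shape of a call, with the float64 target descriptors chosen arbitrarily:

    #include <arrow/api.h>
    #include <arrow/compute/cast.h>

    arrow::Result<std::vector<arrow::Datum>> CastAllSketch(std::vector<arrow::Datum> values) {
      // Safe cast options are used for each element, per the doc comment above.
      std::vector<arrow::ValueDescr> descrs(
          values.size(), arrow::ValueDescr::Array(arrow::float64()));
      return arrow::compute::Cast(std::move(values), std::move(descrs));
    }
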
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
index 0105d08a573..7e784a0b61a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
@@ -36,7 +36,7 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts();
std::vector<std::shared_ptr<CastFunction>> GetTemporalCasts();
std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts();
std::vector<std::shared_ptr<CastFunction>> GetNestedCasts();
-std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts();
+std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts();
} // namespace internal
} // namespace compute
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
index 63f8d39f551..8998df465e5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
@@ -36,8 +36,8 @@
#include "arrow/compute/registry.h"
#include "arrow/compute/util_internal.h"
#include "arrow/datum.h"
-#include "arrow/pretty_print.h"
-#include "arrow/record_batch.h"
+#include "arrow/pretty_print.h"
+#include "arrow/record_batch.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
@@ -47,8 +47,8 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/cpu_info.h"
#include "arrow/util/logging.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/util/vector.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/vector.h"
namespace arrow {
@@ -59,104 +59,104 @@ using internal::CpuInfo;
namespace compute {
-ExecContext* default_exec_context() {
- static ExecContext default_ctx;
- return &default_ctx;
-}
-
-ExecBatch::ExecBatch(const RecordBatch& batch)
- : values(batch.num_columns()), length(batch.num_rows()) {
- auto columns = batch.column_data();
- std::move(columns.begin(), columns.end(), values.begin());
-}
-
-bool ExecBatch::Equals(const ExecBatch& other) const {
- return guarantee == other.guarantee && values == other.values;
-}
-
-void PrintTo(const ExecBatch& batch, std::ostream* os) {
- *os << "ExecBatch\n";
-
- static const std::string indent = " ";
-
- *os << indent << "# Rows: " << batch.length << "\n";
- if (batch.guarantee != literal(true)) {
- *os << indent << "Guarantee: " << batch.guarantee.ToString() << "\n";
- }
-
- int i = 0;
- for (const Datum& value : batch.values) {
- *os << indent << "" << i++ << ": ";
-
- if (value.is_scalar()) {
- *os << "Scalar[" << value.scalar()->ToString() << "]\n";
- continue;
- }
-
- auto array = value.make_array();
- PrettyPrintOptions options;
- options.skip_new_lines = true;
- *os << "Array";
- ARROW_CHECK_OK(PrettyPrint(*array, options, os));
- *os << "\n";
- }
-}
-
-ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const {
- ExecBatch out = *this;
- for (auto& value : out.values) {
- if (value.is_scalar()) continue;
- value = value.array()->Slice(offset, length);
- }
- out.length = length;
- return out;
-}
-
-Result<ExecBatch> ExecBatch::Make(std::vector<Datum> values) {
- if (values.empty()) {
- return Status::Invalid("Cannot infer ExecBatch length without at least one value");
- }
-
- int64_t length = -1;
- for (const auto& value : values) {
- if (value.is_scalar()) {
- continue;
- }
-
- if (length == -1) {
- length = value.length();
- continue;
- }
-
- if (length != value.length()) {
- return Status::Invalid(
- "Arrays used to construct an ExecBatch must have equal length");
- }
- }
-
- if (length == -1) {
- length = 1;
- }
-
- return ExecBatch(std::move(values), length);
-}
-
-Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch(
- std::shared_ptr<Schema> schema, MemoryPool* pool) const {
- ArrayVector columns(schema->num_fields());
-
- for (size_t i = 0; i < columns.size(); ++i) {
- const Datum& value = values[i];
- if (value.is_array()) {
- columns[i] = value.make_array();
- continue;
- }
- ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool));
- }
-
- return RecordBatch::Make(std::move(schema), length, std::move(columns));
-}
-
+ExecContext* default_exec_context() {
+ static ExecContext default_ctx;
+ return &default_ctx;
+}
+
+ExecBatch::ExecBatch(const RecordBatch& batch)
+ : values(batch.num_columns()), length(batch.num_rows()) {
+ auto columns = batch.column_data();
+ std::move(columns.begin(), columns.end(), values.begin());
+}
+
+bool ExecBatch::Equals(const ExecBatch& other) const {
+ return guarantee == other.guarantee && values == other.values;
+}
+
+void PrintTo(const ExecBatch& batch, std::ostream* os) {
+ *os << "ExecBatch\n";
+
+ static const std::string indent = " ";
+
+ *os << indent << "# Rows: " << batch.length << "\n";
+ if (batch.guarantee != literal(true)) {
+ *os << indent << "Guarantee: " << batch.guarantee.ToString() << "\n";
+ }
+
+ int i = 0;
+ for (const Datum& value : batch.values) {
+ *os << indent << "" << i++ << ": ";
+
+ if (value.is_scalar()) {
+ *os << "Scalar[" << value.scalar()->ToString() << "]\n";
+ continue;
+ }
+
+ auto array = value.make_array();
+ PrettyPrintOptions options;
+ options.skip_new_lines = true;
+ *os << "Array";
+ ARROW_CHECK_OK(PrettyPrint(*array, options, os));
+ *os << "\n";
+ }
+}
+
+ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const {
+ ExecBatch out = *this;
+ for (auto& value : out.values) {
+ if (value.is_scalar()) continue;
+ value = value.array()->Slice(offset, length);
+ }
+ out.length = length;
+ return out;
+}
+
+Result<ExecBatch> ExecBatch::Make(std::vector<Datum> values) {
+ if (values.empty()) {
+ return Status::Invalid("Cannot infer ExecBatch length without at least one value");
+ }
+
+ int64_t length = -1;
+ for (const auto& value : values) {
+ if (value.is_scalar()) {
+ continue;
+ }
+
+ if (length == -1) {
+ length = value.length();
+ continue;
+ }
+
+ if (length != value.length()) {
+ return Status::Invalid(
+ "Arrays used to construct an ExecBatch must have equal length");
+ }
+ }
+
+ if (length == -1) {
+ length = 1;
+ }
+
+ return ExecBatch(std::move(values), length);
+}
+
+Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch(
+ std::shared_ptr<Schema> schema, MemoryPool* pool) const {
+ ArrayVector columns(schema->num_fields());
+
+ for (size_t i = 0; i < columns.size(); ++i) {
+ const Datum& value = values[i];
+ if (value.is_array()) {
+ columns[i] = value.make_array();
+ continue;
+ }
+ ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool));
+ }
+
+ return RecordBatch::Make(std::move(schema), length, std::move(columns));
+}
+
namespace {
Result<std::shared_ptr<Buffer>> AllocateDataBuffer(KernelContext* ctx, int64_t length,
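
A usage sketch of the ExecBatch::Make length rules implemented above (arrays must agree on length, scalars broadcast, an all-scalar batch gets length 1); `arr` is an assumed pre-built array and the scalar value is arbitrary:

    #include <arrow/api.h>
    #include <arrow/compute/exec.h>

    arrow::Result<arrow::compute::ExecBatch> MakeBatchSketch(
        const std::shared_ptr<arrow::Array>& arr) {
      arrow::Datum scalar(std::make_shared<arrow::Int64Scalar>(7));
      // Batch length is inferred from `arr`; the scalar is carried as-is.
      return arrow::compute::ExecBatch::Make({arrow::Datum(arr), scalar});
    }
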
@@ -164,57 +164,57 @@ Result<std::shared_ptr<Buffer>> AllocateDataBuffer(KernelContext* ctx, int64_t l
if (bit_width == 1) {
return ctx->AllocateBitmap(length);
} else {
- int64_t buffer_size = BitUtil::BytesForBits(length * bit_width);
+ int64_t buffer_size = BitUtil::BytesForBits(length * bit_width);
return ctx->Allocate(buffer_size);
}
}
-struct BufferPreallocation {
- explicit BufferPreallocation(int bit_width = -1, int added_length = 0)
- : bit_width(bit_width), added_length(added_length) {}
-
- int bit_width;
- int added_length;
-};
-
-void ComputeDataPreallocate(const DataType& type,
- std::vector<BufferPreallocation>* widths) {
- if (is_fixed_width(type.id()) && type.id() != Type::NA) {
- widths->emplace_back(checked_cast<const FixedWidthType&>(type).bit_width());
- return;
- }
- // Preallocate binary and list offsets
- switch (type.id()) {
- case Type::BINARY:
- case Type::STRING:
- case Type::LIST:
- case Type::MAP:
- widths->emplace_back(32, /*added_length=*/1);
- return;
- case Type::LARGE_BINARY:
- case Type::LARGE_STRING:
- case Type::LARGE_LIST:
- widths->emplace_back(64, /*added_length=*/1);
- return;
- default:
- break;
+struct BufferPreallocation {
+ explicit BufferPreallocation(int bit_width = -1, int added_length = 0)
+ : bit_width(bit_width), added_length(added_length) {}
+
+ int bit_width;
+ int added_length;
+};
+
+void ComputeDataPreallocate(const DataType& type,
+ std::vector<BufferPreallocation>* widths) {
+ if (is_fixed_width(type.id()) && type.id() != Type::NA) {
+ widths->emplace_back(checked_cast<const FixedWidthType&>(type).bit_width());
+ return;
}
+ // Preallocate binary and list offsets
+ switch (type.id()) {
+ case Type::BINARY:
+ case Type::STRING:
+ case Type::LIST:
+ case Type::MAP:
+ widths->emplace_back(32, /*added_length=*/1);
+ return;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ case Type::LARGE_LIST:
+ widths->emplace_back(64, /*added_length=*/1);
+ return;
+ default:
+ break;
+ }
}
} // namespace
namespace detail {
-Status CheckAllValues(const std::vector<Datum>& values) {
- for (const auto& value : values) {
- if (!value.is_value()) {
- return Status::Invalid("Tried executing function with non-value type: ",
- value.ToString());
- }
- }
- return Status::OK();
-}
-
+Status CheckAllValues(const std::vector<Datum>& values) {
+ for (const auto& value : values) {
+ if (!value.is_value()) {
+ return Status::Invalid("Tried executing function with non-value type: ",
+ value.ToString());
+ }
+ }
+ return Status::OK();
+}
+
ExecBatchIterator::ExecBatchIterator(std::vector<Datum> args, int64_t length,
int64_t max_chunksize)
: args_(std::move(args)),
@@ -311,35 +311,35 @@ bool ExecBatchIterator::Next(ExecBatch* batch) {
return true;
}
-namespace {
-
-struct NullGeneralization {
- enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
-
- static type Get(const Datum& datum) {
- if (datum.type()->id() == Type::NA) {
- return ALL_NULL;
- }
-
- if (datum.is_scalar()) {
- return datum.scalar()->is_valid ? ALL_VALID : ALL_NULL;
- }
-
- const auto& arr = *datum.array();
-
+namespace {
+
+struct NullGeneralization {
+ enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
+
+ static type Get(const Datum& datum) {
+ if (datum.type()->id() == Type::NA) {
+ return ALL_NULL;
+ }
+
+ if (datum.is_scalar()) {
+ return datum.scalar()->is_valid ? ALL_VALID : ALL_NULL;
+ }
+
+ const auto& arr = *datum.array();
+
// Do not count the bits if they haven't been counted already
- const int64_t known_null_count = arr.null_count.load();
- if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) {
- return ALL_VALID;
- }
-
- if (known_null_count == arr.length) {
- return ALL_NULL;
- }
-
- return PERHAPS_NULL;
+ const int64_t known_null_count = arr.null_count.load();
+ if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) {
+ return ALL_VALID;
+ }
+
+ if (known_null_count == arr.length) {
+ return ALL_NULL;
+ }
+
+ return PERHAPS_NULL;
}
-};
+};
// Null propagation implementation that deals both with preallocated bitmaps
// and maybe-to-be allocated bitmaps
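
NullGeneralization above lives in an anonymous namespace, so it cannot be called from outside this file; the following standalone mirror of its decision table is illustrative only:

    #include <cstdint>

    enum class NullClass { kPerhapsNull, kAllValid, kAllNull };

    // Mirrors NullGeneralization::Get: null type and invalid scalars are all-null,
    // valid scalars and arrays without a validity bitmap are all-valid, and only
    // arrays with some (but not all) counted nulls land in the "perhaps" bucket.
    NullClass Classify(bool is_null_type, bool is_scalar, bool scalar_valid,
                       int64_t known_null_count, bool has_validity_buffer,
                       int64_t length) {
      if (is_null_type) return NullClass::kAllNull;
      if (is_scalar) return scalar_valid ? NullClass::kAllValid : NullClass::kAllNull;
      if (known_null_count == 0 || !has_validity_buffer) return NullClass::kAllValid;
      if (known_null_count == length) return NullClass::kAllNull;
      return NullClass::kPerhapsNull;
    }
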
@@ -356,17 +356,17 @@ class NullPropagator {
public:
NullPropagator(KernelContext* ctx, const ExecBatch& batch, ArrayData* output)
: ctx_(ctx), batch_(batch), output_(output) {
- for (const Datum& datum : batch_.values) {
- auto null_generalization = NullGeneralization::Get(datum);
-
- if (null_generalization == NullGeneralization::ALL_NULL) {
- is_all_null_ = true;
- }
-
- if (null_generalization != NullGeneralization::ALL_VALID &&
- datum.kind() == Datum::ARRAY) {
- arrays_with_nulls_.push_back(datum.array().get());
+ for (const Datum& datum : batch_.values) {
+ auto null_generalization = NullGeneralization::Get(datum);
+
+ if (null_generalization == NullGeneralization::ALL_NULL) {
+ is_all_null_ = true;
}
+
+ if (null_generalization != NullGeneralization::ALL_VALID &&
+ datum.kind() == Datum::ARRAY) {
+ arrays_with_nulls_.push_back(datum.array().get());
+ }
}
if (output->buffers[0] != nullptr) {
@@ -386,33 +386,33 @@ class NullPropagator {
return Status::OK();
}
- Status AllNullShortCircuit() {
- // OK, the output should be all null
- output_->null_count = output_->length;
-
- if (bitmap_preallocated_) {
- BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
- return Status::OK();
- }
+ Status AllNullShortCircuit() {
+ // OK, the output should be all null
+ output_->null_count = output_->length;
+ if (bitmap_preallocated_) {
+ BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+ return Status::OK();
+ }
+
// Walk all the values with nulls instead of breaking on the first in case
// we find a bitmap that can be reused in the non-preallocated case
- for (const ArrayData* arr : arrays_with_nulls_) {
- if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) {
- // Reuse this all null bitmap
- output_->buffers[0] = arr->buffers[0];
- return Status::OK();
+ for (const ArrayData* arr : arrays_with_nulls_) {
+ if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) {
+ // Reuse this all null bitmap
+ output_->buffers[0] = arr->buffers[0];
+ return Status::OK();
}
}
- RETURN_NOT_OK(EnsureAllocated());
- BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
- return Status::OK();
+ RETURN_NOT_OK(EnsureAllocated());
+ BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+ return Status::OK();
}
Status PropagateSingle() {
// One array
- const ArrayData& arr = *arrays_with_nulls_[0];
+ const ArrayData& arr = *arrays_with_nulls_[0];
const std::shared_ptr<Buffer>& arr_bitmap = arr.buffers[0];
// Reuse the null count if it's known
@@ -420,27 +420,27 @@ class NullPropagator {
if (bitmap_preallocated_) {
CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, output_->offset);
- return Status::OK();
- }
-
- // Two cases when memory was not pre-allocated:
- //
- // * Offset is zero: we reuse the bitmap as is
- // * Offset is nonzero but a multiple of 8: we can slice the bitmap
- // * Offset is not a multiple of 8: we must allocate and use CopyBitmap
- //
- // Keep in mind that output_->offset is not permitted to be nonzero when
- // the bitmap is not preallocated, and that precondition is asserted
- // higher in the call stack.
- if (arr.offset == 0) {
- output_->buffers[0] = arr_bitmap;
- } else if (arr.offset % 8 == 0) {
- output_->buffers[0] =
- SliceBuffer(arr_bitmap, arr.offset / 8, BitUtil::BytesForBits(arr.length));
+ return Status::OK();
+ }
+
+ // Two cases when memory was not pre-allocated:
+ //
+ // * Offset is zero: we reuse the bitmap as is
+ // * Offset is nonzero but a multiple of 8: we can slice the bitmap
+ // * Offset is not a multiple of 8: we must allocate and use CopyBitmap
+ //
+ // Keep in mind that output_->offset is not permitted to be nonzero when
+ // the bitmap is not preallocated, and that precondition is asserted
+ // higher in the call stack.
+ if (arr.offset == 0) {
+ output_->buffers[0] = arr_bitmap;
+ } else if (arr.offset % 8 == 0) {
+ output_->buffers[0] =
+ SliceBuffer(arr_bitmap, arr.offset / 8, BitUtil::BytesForBits(arr.length));
} else {
- RETURN_NOT_OK(EnsureAllocated());
- CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_,
- /*dst_offset=*/0);
+ RETURN_NOT_OK(EnsureAllocated());
+ CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_,
+ /*dst_offset=*/0);
}
return Status::OK();
}
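
The offset cases enumerated in PropagateSingle's comment above, restated as a self-contained helper; this mirrors the decision for illustration rather than calling Arrow internals:

    #include <cstdint>

    enum class BitmapStrategy { kReuse, kSliceBytes, kCopyBits };

    BitmapStrategy ChooseStrategy(bool preallocated, int64_t offset) {
      if (preallocated) return BitmapStrategy::kCopyBits;       // copy into the target
      if (offset == 0) return BitmapStrategy::kReuse;           // share the buffer as is
      if (offset % 8 == 0) return BitmapStrategy::kSliceBytes;  // byte-aligned slice
      return BitmapStrategy::kCopyBits;  // must allocate and CopyBitmap
    }
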
@@ -459,27 +459,27 @@ class NullPropagator {
output_->buffers[0]->mutable_data());
};
- DCHECK_GT(arrays_with_nulls_.size(), 1);
+ DCHECK_GT(arrays_with_nulls_.size(), 1);
// Seed the output bitmap with the & of the first two bitmaps
- Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]);
+ Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]);
// Accumulate the rest
- for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) {
- Accumulate(*output_, *arrays_with_nulls_[i]);
+ for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) {
+ Accumulate(*output_, *arrays_with_nulls_[i]);
}
return Status::OK();
}
Status Execute() {
- if (is_all_null_) {
- // An all-null value (scalar null or all-null array) gives us a short
- // circuit opportunity
- return AllNullShortCircuit();
+ if (is_all_null_) {
+ // An all-null value (scalar null or all-null array) gives us a short
+ // circuit opportunity
+ return AllNullShortCircuit();
}
// At this point, by construction we know that all of the values in
- // arrays_with_nulls_ are arrays that are not all null. So there are a
+ // arrays_with_nulls_ are arrays that are not all null. So there are a
// few cases:
//
// * No arrays. This is a no-op w/o preallocation but when the bitmap is
@@ -494,27 +494,27 @@ class NullPropagator {
output_->null_count = kUnknownNullCount;
- if (arrays_with_nulls_.empty()) {
+ if (arrays_with_nulls_.empty()) {
// No arrays with nulls case
output_->null_count = 0;
if (bitmap_preallocated_) {
BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, true);
}
return Status::OK();
- }
-
- if (arrays_with_nulls_.size() == 1) {
+ }
+
+ if (arrays_with_nulls_.size() == 1) {
return PropagateSingle();
}
-
- return PropagateMultiple();
+
+ return PropagateMultiple();
}
private:
KernelContext* ctx_;
const ExecBatch& batch_;
- std::vector<const ArrayData*> arrays_with_nulls_;
- bool is_all_null_ = false;
+ std::vector<const ArrayData*> arrays_with_nulls_;
+ bool is_all_null_ = false;
ArrayData* output_;
uint8_t* bitmap_;
bool bitmap_preallocated_ = false;
@@ -523,15 +523,15 @@ class NullPropagator {
std::shared_ptr<ChunkedArray> ToChunkedArray(const std::vector<Datum>& values,
const std::shared_ptr<DataType>& type) {
std::vector<std::shared_ptr<Array>> arrays;
- arrays.reserve(values.size());
- for (const Datum& val : values) {
- if (val.length() == 0) {
+ arrays.reserve(values.size());
+ for (const Datum& val : values) {
+ if (val.length() == 0) {
// Skip empty chunks
continue;
}
- arrays.emplace_back(val.make_array());
+ arrays.emplace_back(val.make_array());
}
- return std::make_shared<ChunkedArray>(std::move(arrays), type);
+ return std::make_shared<ChunkedArray>(std::move(arrays), type);
}
bool HaveChunkedArray(const std::vector<Datum>& values) {
@@ -543,25 +543,25 @@ bool HaveChunkedArray(const std::vector<Datum>& values) {
return false;
}
-template <typename KernelType>
-class KernelExecutorImpl : public KernelExecutor {
+template <typename KernelType>
+class KernelExecutorImpl : public KernelExecutor {
public:
- Status Init(KernelContext* kernel_ctx, KernelInitArgs args) override {
- kernel_ctx_ = kernel_ctx;
- kernel_ = static_cast<const KernelType*>(args.kernel);
+ Status Init(KernelContext* kernel_ctx, KernelInitArgs args) override {
+ kernel_ctx_ = kernel_ctx;
+ kernel_ = static_cast<const KernelType*>(args.kernel);
- // Resolve the output descriptor for this kernel
- ARROW_ASSIGN_OR_RAISE(
- output_descr_, kernel_->signature->out_type().Resolve(kernel_ctx_, args.inputs));
+ // Resolve the output descriptor for this kernel
+ ARROW_ASSIGN_OR_RAISE(
+ output_descr_, kernel_->signature->out_type().Resolve(kernel_ctx_, args.inputs));
return Status::OK();
}
- protected:
+ protected:
// This is overridden by the VectorExecutor
virtual Status SetupArgIteration(const std::vector<Datum>& args) {
- ARROW_ASSIGN_OR_RAISE(
- batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize()));
+ ARROW_ASSIGN_OR_RAISE(
+ batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize()));
return Status::OK();
}
@@ -570,29 +570,29 @@ class KernelExecutorImpl : public KernelExecutor {
out->buffers.resize(output_num_buffers_);
if (validity_preallocated_) {
- ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_->AllocateBitmap(length));
+ ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_->AllocateBitmap(length));
}
- if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
- out->null_count = 0;
- }
- for (size_t i = 0; i < data_preallocated_.size(); ++i) {
- const auto& prealloc = data_preallocated_[i];
- if (prealloc.bit_width >= 0) {
- ARROW_ASSIGN_OR_RAISE(
- out->buffers[i + 1],
- AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length,
- prealloc.bit_width));
- }
+ if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
+ out->null_count = 0;
}
+ for (size_t i = 0; i < data_preallocated_.size(); ++i) {
+ const auto& prealloc = data_preallocated_[i];
+ if (prealloc.bit_width >= 0) {
+ ARROW_ASSIGN_OR_RAISE(
+ out->buffers[i + 1],
+ AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length,
+ prealloc.bit_width));
+ }
+ }
return out;
}
- ExecContext* exec_context() { return kernel_ctx_->exec_context(); }
- KernelState* state() { return kernel_ctx_->state(); }
+ ExecContext* exec_context() { return kernel_ctx_->exec_context(); }
+ KernelState* state() { return kernel_ctx_->state(); }
// Not all of these members are used for every executor type
- KernelContext* kernel_ctx_;
+ KernelContext* kernel_ctx_;
const KernelType* kernel_;
std::unique_ptr<ExecBatchIterator> batch_iterator_;
ValueDescr output_descr_;
@@ -602,13 +602,13 @@ class KernelExecutorImpl : public KernelExecutor {
// If true, then memory is preallocated for the validity bitmap with the same
// strategy as the data buffer(s).
bool validity_preallocated_ = false;
-
- // The kernel writes into data buffers preallocated for these bit widths
- // (0 indicates no preallocation);
- std::vector<BufferPreallocation> data_preallocated_;
+
+ // The kernel writes into data buffers preallocated for these bit widths
+ // (a negative bit width indicates no preallocation).
+ std::vector<BufferPreallocation> data_preallocated_;
};
-class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
+class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
public:
Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
RETURN_NOT_OK(PrepareExecute(args));
@@ -646,9 +646,9 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
} else {
// XXX: In the case where no outputs are emitted, is returning a 0-length
// array always the correct move?
- return MakeArrayOfNull(output_descr_.type, /*length=*/0,
- exec_context()->memory_pool())
- .ValueOrDie();
+ return MakeArrayOfNull(output_descr_.type, /*length=*/0,
+ exec_context()->memory_pool())
+ .ValueOrDie();
}
}
}
@@ -661,7 +661,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
if (output_descr_.shape == ValueDescr::ARRAY) {
ArrayData* out_arr = out.mutable_array();
if (kernel_->null_handling == NullHandling::INTERSECTION) {
- RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr));
+ RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr));
} else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
out_arr->null_count = 0;
}
@@ -676,7 +676,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
}
}
- RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
+ RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
if (!preallocate_contiguous_) {
// If we are producing chunked output rather than one big array, then
// emit each chunk as soon as it's available
@@ -686,7 +686,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
}
Status PrepareExecute(const std::vector<Datum>& args) {
- RETURN_NOT_OK(this->SetupArgIteration(args));
+ RETURN_NOT_OK(this->SetupArgIteration(args));
if (output_descr_.shape == ValueDescr::ARRAY) {
// If the executor is configured to produce a single large Array output for
@@ -749,26 +749,26 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
// Decide if we need to preallocate memory for this kernel
validity_preallocated_ =
(kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE &&
- kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL &&
- output_descr_.type->id() != Type::NA);
- if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
- ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
- }
-
- // Contiguous preallocation only possible on non-nested types if all
- // buffers are preallocated. Otherwise, we must go chunk-by-chunk.
+ kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL &&
+ output_descr_.type->id() != Type::NA);
+ if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
+ ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
+ }
+
+ // Contiguous preallocation only possible on non-nested types if all
+ // buffers are preallocated. Otherwise, we must go chunk-by-chunk.
//
- // Some kernels are also unable to write into sliced outputs, so we respect the
- // kernel's attributes.
+ // Some kernels are also unable to write into sliced outputs, so we respect the
+ // kernel's attributes.
preallocate_contiguous_ =
- (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices &&
- validity_preallocated_ && !is_nested(output_descr_.type->id()) &&
- !is_dictionary(output_descr_.type->id()) &&
- data_preallocated_.size() == static_cast<size_t>(output_num_buffers_ - 1) &&
- std::all_of(data_preallocated_.begin(), data_preallocated_.end(),
- [](const BufferPreallocation& prealloc) {
- return prealloc.bit_width >= 0;
- }));
+ (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices &&
+ validity_preallocated_ && !is_nested(output_descr_.type->id()) &&
+ !is_dictionary(output_descr_.type->id()) &&
+ data_preallocated_.size() == static_cast<size_t>(output_num_buffers_ - 1) &&
+ std::all_of(data_preallocated_.begin(), data_preallocated_.end(),
+ [](const BufferPreallocation& prealloc) {
+ return prealloc.bit_width >= 0;
+ }));
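+ // e.g. a fixed-width int32 output with a preallocated validity bitmap meets
+ // all of these conditions; var-length outputs such as utf8 do not (their data
+ // buffer cannot be sized up front), so they fall back to chunked output.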
if (preallocate_contiguous_) {
ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length));
}
@@ -790,7 +790,7 @@ Status PackBatchNoChunks(const std::vector<Datum>& args, ExecBatch* out) {
switch (arg.kind()) {
case Datum::SCALAR:
case Datum::ARRAY:
- case Datum::CHUNKED_ARRAY:
+ case Datum::CHUNKED_ARRAY:
length = std::max(arg.length(), length);
break;
default:
@@ -803,7 +803,7 @@ Status PackBatchNoChunks(const std::vector<Datum>& args, ExecBatch* out) {
return Status::OK();
}
-class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
+class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
public:
Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
RETURN_NOT_OK(PrepareExecute(args));
@@ -823,15 +823,15 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
const std::vector<Datum>& outputs) override {
// If execution yielded multiple chunks (because large arrays were split
// based on the ExecContext parameters), then the result is a ChunkedArray
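// e.g. a large input split into three ExecBatches yields three output
// chunks here, returned together as one ChunkedArray.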
- if (kernel_->output_chunked && (HaveChunkedArray(inputs) || outputs.size() > 1)) {
- return ToChunkedArray(outputs, output_descr_.type);
- } else if (outputs.size() == 1) {
- // Outputs have just one element
- return outputs[0];
+ if (kernel_->output_chunked && (HaveChunkedArray(inputs) || outputs.size() > 1)) {
+ return ToChunkedArray(outputs, output_descr_.type);
+ } else if (outputs.size() == 1) {
+ // Outputs have just one element
+ return outputs[0];
} else {
- // XXX: In the case where no outputs are emitted, is returning a 0-length
- // array always the correct move?
- return MakeArrayOfNull(output_descr_.type, /*length=*/0).ValueOrDie();
+ // XXX: In the case where no outputs are emitted, is returning a 0-length
+ // array always the correct move?
+ return MakeArrayOfNull(output_descr_.type, /*length=*/0).ValueOrDie();
}
}
@@ -851,9 +851,9 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
if (kernel_->null_handling == NullHandling::INTERSECTION &&
output_descr_.shape == ValueDescr::ARRAY) {
- RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array()));
+ RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array()));
}
- RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
+ RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
if (!kernel_->finalize) {
// If there is no result finalizer (e.g. for hash-based functions), we can
// emit the processed batch right away rather than waiting
@@ -868,7 +868,7 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
if (kernel_->finalize) {
// Intermediate results require post-processing after the execution is
// completed (possibly involving some accumulated state)
- RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_));
+ RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_));
for (const auto& result : results_) {
RETURN_NOT_OK(listener->OnResult(result));
}
@@ -878,39 +878,39 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
Status SetupArgIteration(const std::vector<Datum>& args) override {
if (kernel_->can_execute_chunkwise) {
- ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make(
- args, exec_context()->exec_chunksize()));
+ ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make(
+ args, exec_context()->exec_chunksize()));
}
return Status::OK();
}
Status PrepareExecute(const std::vector<Datum>& args) {
- RETURN_NOT_OK(this->SetupArgIteration(args));
+ RETURN_NOT_OK(this->SetupArgIteration(args));
output_num_buffers_ = static_cast<int>(output_descr_.type->layout().buffers.size());
// Decide if we need to preallocate memory for this kernel
validity_preallocated_ =
(kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE &&
kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL);
- if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
- ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
- }
+ if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
+ ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
+ }
return Status::OK();
}
std::vector<Datum> results_;
};
-class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
+class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
public:
- Status Init(KernelContext* ctx, KernelInitArgs args) override {
- input_descrs_ = &args.inputs;
- options_ = args.options;
- return KernelExecutorImpl<ScalarAggregateKernel>::Init(ctx, args);
- }
+ Status Init(KernelContext* ctx, KernelInitArgs args) override {
+ input_descrs_ = &args.inputs;
+ options_ = args.options;
+ return KernelExecutorImpl<ScalarAggregateKernel>::Init(ctx, args);
+ }
Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
- RETURN_NOT_OK(this->SetupArgIteration(args));
+ RETURN_NOT_OK(this->SetupArgIteration(args));
ExecBatch batch;
while (batch_iterator_->Next(&batch)) {
@@ -921,7 +921,7 @@ class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
}
Datum out;
- RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &out));
+ RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &out));
RETURN_NOT_OK(listener->OnResult(std::move(out)));
return Status::OK();
}
@@ -934,78 +934,78 @@ class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
private:
Status Consume(const ExecBatch& batch) {
- // FIXME(ARROW-11840) don't merge *any* aggregates for every batch
- ARROW_ASSIGN_OR_RAISE(
- auto batch_state,
- kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}));
+ // FIXME(ARROW-11840) don't merge *any* aggregates for every batch
+ ARROW_ASSIGN_OR_RAISE(
+ auto batch_state,
+ kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}));
if (batch_state == nullptr) {
- return Status::Invalid("ScalarAggregation requires non-null kernel state");
+ return Status::Invalid("ScalarAggregation requires non-null kernel state");
}
- KernelContext batch_ctx(exec_context());
+ KernelContext batch_ctx(exec_context());
batch_ctx.SetState(batch_state.get());
- RETURN_NOT_OK(kernel_->consume(&batch_ctx, batch));
- RETURN_NOT_OK(kernel_->merge(kernel_ctx_, std::move(*batch_state), state()));
+ RETURN_NOT_OK(kernel_->consume(&batch_ctx, batch));
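+ // e.g. for "sum", the merge below folds the batch-local partial sum into
+ // the executor-lifetime state (see the FIXME above on per-batch merging).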
+ RETURN_NOT_OK(kernel_->merge(kernel_ctx_, std::move(*batch_state), state()));
return Status::OK();
}
-
- const std::vector<ValueDescr>* input_descrs_;
- const FunctionOptions* options_;
+
+ const std::vector<ValueDescr>* input_descrs_;
+ const FunctionOptions* options_;
};
template <typename ExecutorType,
typename FunctionType = typename ExecutorType::FunctionType>
-Result<std::unique_ptr<KernelExecutor>> MakeExecutor(ExecContext* ctx,
- const Function* func,
- const FunctionOptions* options) {
+Result<std::unique_ptr<KernelExecutor>> MakeExecutor(ExecContext* ctx,
+ const Function* func,
+ const FunctionOptions* options) {
DCHECK_EQ(ExecutorType::function_kind, func->kind());
auto typed_func = checked_cast<const FunctionType*>(func);
- return std::unique_ptr<KernelExecutor>(new ExecutorType(ctx, typed_func, options));
+ return std::unique_ptr<KernelExecutor>(new ExecutorType(ctx, typed_func, options));
}
-} // namespace
-
-Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) {
- DCHECK_NE(nullptr, output);
- DCHECK_GT(output->buffers.size(), 0);
-
- if (output->type->id() == Type::NA) {
- // Null output type is a no-op (it is rare that this would happen, but we at
- // least test for it)
- return Status::OK();
- }
-
- // This function is ONLY able to write into an output with a non-zero offset
- // when the bitmap is preallocated. This could be a DCHECK, but we return an
- // error Status for now for emphasis
- if (output->offset != 0 && output->buffers[0] == nullptr) {
- return Status::Invalid(
- "Can only propagate nulls into pre-allocated memory "
- "when the output offset is non-zero");
+} // namespace
+
+Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) {
+ DCHECK_NE(nullptr, output);
+ DCHECK_GT(output->buffers.size(), 0);
+
+ if (output->type->id() == Type::NA) {
+ // Null output type is a no-op (it is rare that this would happen, but we at
+ // least test for it)
+ return Status::OK();
}
- NullPropagator propagator(ctx, batch, output);
- return propagator.Execute();
-}
-
-std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalar() {
- return ::arrow::internal::make_unique<detail::ScalarExecutor>();
-}
-
-std::unique_ptr<KernelExecutor> KernelExecutor::MakeVector() {
- return ::arrow::internal::make_unique<detail::VectorExecutor>();
-}
-
-std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalarAggregate() {
- return ::arrow::internal::make_unique<detail::ScalarAggExecutor>();
+
+ // This function is ONLY able to write into an output with a non-zero offset
+ // when the bitmap is preallocated. This could be a DCHECK, but we return an
+ // error Status for now for emphasis
+ if (output->offset != 0 && output->buffers[0] == nullptr) {
+ return Status::Invalid(
+ "Can only propagate nulls into pre-allocated memory "
+ "when the output offset is non-zero");
+ }
+ NullPropagator propagator(ctx, batch, output);
+ return propagator.Execute();
}
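+
+// e.g. with NullHandling::INTERSECTION over two array inputs, the output
+// validity bitmap becomes the AND of the input bitmaps (a sketch; NullPropagator
+// also fast-paths all-valid and null-scalar inputs).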
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalar() {
+ return ::arrow::internal::make_unique<detail::ScalarExecutor>();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeVector() {
+ return ::arrow::internal::make_unique<detail::VectorExecutor>();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalarAggregate() {
+ return ::arrow::internal::make_unique<detail::ScalarAggExecutor>();
+}
+
} // namespace detail
-ExecContext::ExecContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
- FunctionRegistry* func_registry)
- : pool_(pool), executor_(executor) {
+ExecContext::ExecContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
+ FunctionRegistry* func_registry)
+ : pool_(pool), executor_(executor) {
this->func_registry_ = func_registry == nullptr ? GetFunctionRegistry() : func_registry;
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
index de1b695de48..90fb291dbb8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
@@ -28,13 +28,13 @@
#include <vector>
#include "arrow/array/data.h"
-#include "arrow/compute/exec/expression.h"
+#include "arrow/compute/exec/expression.h"
#include "arrow/datum.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -46,7 +46,7 @@ class CpuInfo;
namespace compute {
-class FunctionOptions;
+class FunctionOptions;
class FunctionRegistry;
// It seems like 64K might be a good default chunksize to use for execution
@@ -61,7 +61,7 @@ class ARROW_EXPORT ExecContext {
public:
// If no function registry passed, the default is used.
explicit ExecContext(MemoryPool* pool = default_memory_pool(),
- ::arrow::internal::Executor* executor = NULLPTR,
+ ::arrow::internal::Executor* executor = NULLPTR,
FunctionRegistry* func_registry = NULLPTR);
/// \brief The MemoryPool used for allocations, default is
@@ -70,9 +70,9 @@ class ARROW_EXPORT ExecContext {
::arrow::internal::CpuInfo* cpu_info() const;
- /// \brief An Executor which may be used to parallelize execution.
- ::arrow::internal::Executor* executor() const { return executor_; }
-
+ /// \brief An Executor which may be used to parallelize execution.
+ ::arrow::internal::Executor* executor() const { return executor_; }
+
/// \brief The FunctionRegistry for looking up functions by name and
/// selecting kernels for execution. Defaults to the library-global function
/// registry provided by GetFunctionRegistry.
@@ -119,15 +119,15 @@ class ARROW_EXPORT ExecContext {
private:
MemoryPool* pool_;
- ::arrow::internal::Executor* executor_;
+ ::arrow::internal::Executor* executor_;
FunctionRegistry* func_registry_;
int64_t exec_chunksize_ = std::numeric_limits<int64_t>::max();
bool preallocate_contiguous_ = true;
bool use_threads_ = true;
};
-ARROW_EXPORT ExecContext* default_exec_context();
-
+ARROW_EXPORT ExecContext* default_exec_context();
+
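+// Example (sketch): one-shot execution with a custom context, assuming
+// `my_pool` is a MemoryPool* and `col` holds an array Datum:
+//   ExecContext ctx(my_pool);
+//   ARROW_ASSIGN_OR_RAISE(Datum out, CallFunction("sum", {col}, &ctx));
+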
// TODO: Consider standardizing on uint16 selection vectors and only using them
// when we can ensure that each value is 64K in length or smaller
@@ -173,18 +173,18 @@ class ARROW_EXPORT SelectionVector {
/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight
/// than is desirable for this class. Microbenchmarks would help determine for
/// sure. See ARROW-8928.
-struct ARROW_EXPORT ExecBatch {
- ExecBatch() = default;
+struct ARROW_EXPORT ExecBatch {
+ ExecBatch() = default;
ExecBatch(std::vector<Datum> values, int64_t length)
: values(std::move(values)), length(length) {}
- explicit ExecBatch(const RecordBatch& batch);
-
- static Result<ExecBatch> Make(std::vector<Datum> values);
-
- Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
- std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
-
+ explicit ExecBatch(const RecordBatch& batch);
+
+ static Result<ExecBatch> Make(std::vector<Datum> values);
+
+ Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
+ std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
+
/// The values representing positional arguments to be passed to a kernel's
/// exec function for processing.
std::vector<Datum> values;
@@ -196,9 +196,9 @@ struct ARROW_EXPORT ExecBatch {
/// ExecBatch::length is equal to the length of this array.
std::shared_ptr<SelectionVector> selection_vector;
- /// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
- Expression guarantee = literal(true);
-
+ /// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
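+ /// e.g. a scan may attach equal(field_ref("a"), literal(7)) here, letting
+ /// SimplifyWithGuarantee prune predicates over "a" (illustrative sketch).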
+ Expression guarantee = literal(true);
+
/// The semantic length of the ExecBatch. When the values are all scalars,
/// the length should be set to 1, otherwise the length is taken from the
/// array values, except when there is a selection vector. When there is a
@@ -216,13 +216,13 @@ struct ARROW_EXPORT ExecBatch {
return values[i];
}
- bool Equals(const ExecBatch& other) const;
-
+ bool Equals(const ExecBatch& other) const;
+
/// \brief A convenience for the number of values / arguments.
int num_values() const { return static_cast<int>(values.size()); }
- ExecBatch Slice(int64_t offset, int64_t length) const;
-
+ ExecBatch Slice(int64_t offset, int64_t length) const;
+
/// \brief A convenience for returning the ValueDescr objects (types and
/// shapes) from the batch.
std::vector<ValueDescr> GetDescriptors() const {
@@ -232,13 +232,13 @@ struct ARROW_EXPORT ExecBatch {
}
return result;
}
-
- ARROW_EXPORT friend void PrintTo(const ExecBatch&, std::ostream*);
+
+ ARROW_EXPORT friend void PrintTo(const ExecBatch&, std::ostream*);
};
-inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
-inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
-
+inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
+inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
+
/// \defgroup compute-call-function One-shot calls to compute functions
///
/// @{
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
index 433e895c243..aec7805ceea 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
@@ -1,823 +1,823 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/exec_plan.h"
-
-#include <mutex>
-#include <thread>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "arrow/array/util.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/exec.h"
-#include "arrow/compute/exec/expression.h"
-#include "arrow/compute/registry.h"
-#include "arrow/datum.h"
-#include "arrow/record_batch.h"
-#include "arrow/result.h"
-#include "arrow/util/async_generator.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-namespace compute {
-
-namespace {
-
-struct ExecPlanImpl : public ExecPlan {
- explicit ExecPlanImpl(ExecContext* exec_context) : ExecPlan(exec_context) {}
-
- ~ExecPlanImpl() override {
- if (started_ && !finished_.is_finished()) {
- ARROW_LOG(WARNING) << "Plan was destroyed before finishing";
- StopProducing();
- finished().Wait();
- }
- }
-
- ExecNode* AddNode(std::unique_ptr<ExecNode> node) {
- if (node->num_inputs() == 0) {
- sources_.push_back(node.get());
- }
- if (node->num_outputs() == 0) {
- sinks_.push_back(node.get());
- }
- nodes_.push_back(std::move(node));
- return nodes_.back().get();
- }
-
- Status Validate() const {
- if (nodes_.empty()) {
- return Status::Invalid("ExecPlan has no node");
- }
- for (const auto& node : nodes_) {
- RETURN_NOT_OK(node->Validate());
- }
- return Status::OK();
- }
-
- Status StartProducing() {
- if (started_) {
- return Status::Invalid("restarted ExecPlan");
- }
- started_ = true;
-
- // producers precede consumers
- sorted_nodes_ = TopoSort();
-
- std::vector<Future<>> futures;
-
- Status st = Status::OK();
-
- using rev_it = std::reverse_iterator<NodeVector::iterator>;
- for (rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; ++it) {
- auto node = *it;
-
- st = node->StartProducing();
- if (!st.ok()) {
- // Stop nodes that successfully started, in reverse order
- stopped_ = true;
- StopProducingImpl(it.base(), sorted_nodes_.end());
- break;
- }
-
- futures.push_back(node->finished());
- }
-
- finished_ = AllComplete(std::move(futures));
- return st;
- }
-
- void StopProducing() {
- DCHECK(started_) << "stopped an ExecPlan which never started";
- stopped_ = true;
-
- StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end());
- }
-
- template <typename It>
- void StopProducingImpl(It begin, It end) {
- for (auto it = begin; it != end; ++it) {
- auto node = *it;
- node->StopProducing();
- }
- }
-
- NodeVector TopoSort() {
- struct Impl {
- const std::vector<std::unique_ptr<ExecNode>>& nodes;
- std::unordered_set<ExecNode*> visited;
- NodeVector sorted;
-
- explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
- visited.reserve(nodes.size());
- sorted.resize(nodes.size());
-
- for (const auto& node : nodes) {
- Visit(node.get());
- }
-
- DCHECK_EQ(visited.size(), nodes.size());
- }
-
- void Visit(ExecNode* node) {
- if (visited.count(node) != 0) return;
-
- for (auto input : node->inputs()) {
- // Ensure that producers are inserted before this consumer
- Visit(input);
- }
-
- sorted[visited.size()] = node;
- visited.insert(node);
- }
- };
-
- return std::move(Impl{nodes_}.sorted);
- }
-
- Future<> finished_ = Future<>::MakeFinished();
- bool started_ = false, stopped_ = false;
- std::vector<std::unique_ptr<ExecNode>> nodes_;
- NodeVector sources_, sinks_;
- NodeVector sorted_nodes_;
-};
-
-ExecPlanImpl* ToDerived(ExecPlan* ptr) { return checked_cast<ExecPlanImpl*>(ptr); }
-
-const ExecPlanImpl* ToDerived(const ExecPlan* ptr) {
- return checked_cast<const ExecPlanImpl*>(ptr);
-}
-
-util::optional<int> GetNodeIndex(const std::vector<ExecNode*>& nodes,
- const ExecNode* node) {
- for (int i = 0; i < static_cast<int>(nodes.size()); ++i) {
- if (nodes[i] == node) return i;
- }
- return util::nullopt;
-}
-
-} // namespace
-
-Result<std::shared_ptr<ExecPlan>> ExecPlan::Make(ExecContext* ctx) {
- return std::shared_ptr<ExecPlan>(new ExecPlanImpl{ctx});
-}
-
-ExecNode* ExecPlan::AddNode(std::unique_ptr<ExecNode> node) {
- return ToDerived(this)->AddNode(std::move(node));
-}
-
-const ExecPlan::NodeVector& ExecPlan::sources() const {
- return ToDerived(this)->sources_;
-}
-
-const ExecPlan::NodeVector& ExecPlan::sinks() const { return ToDerived(this)->sinks_; }
-
-Status ExecPlan::Validate() { return ToDerived(this)->Validate(); }
-
-Status ExecPlan::StartProducing() { return ToDerived(this)->StartProducing(); }
-
-void ExecPlan::StopProducing() { ToDerived(this)->StopProducing(); }
-
-Future<> ExecPlan::finished() { return ToDerived(this)->finished_; }
-
-ExecNode::ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
- std::vector<std::string> input_labels,
- std::shared_ptr<Schema> output_schema, int num_outputs)
- : plan_(plan),
- label_(std::move(label)),
- inputs_(std::move(inputs)),
- input_labels_(std::move(input_labels)),
- output_schema_(std::move(output_schema)),
- num_outputs_(num_outputs) {
- for (auto input : inputs_) {
- input->outputs_.push_back(this);
- }
-}
-
-Status ExecNode::Validate() const {
- if (inputs_.size() != input_labels_.size()) {
- return Status::Invalid("Invalid number of inputs for '", label(), "' (expected ",
- num_inputs(), ", actual ", input_labels_.size(), ")");
- }
-
- if (static_cast<int>(outputs_.size()) != num_outputs_) {
- return Status::Invalid("Invalid number of outputs for '", label(), "' (expected ",
- num_outputs(), ", actual ", outputs_.size(), ")");
- }
-
- for (auto out : outputs_) {
- auto input_index = GetNodeIndex(out->inputs(), this);
- if (!input_index) {
- return Status::Invalid("Node '", label(), "' outputs to node '", out->label(),
- "' but is not listed as an input.");
- }
- }
-
- return Status::OK();
-}
-
-struct SourceNode : ExecNode {
- SourceNode(ExecPlan* plan, std::string label, std::shared_ptr<Schema> output_schema,
- AsyncGenerator<util::optional<ExecBatch>> generator)
- : ExecNode(plan, std::move(label), {}, {}, std::move(output_schema),
- /*num_outputs=*/1),
- generator_(std::move(generator)) {}
-
- const char* kind_name() override { return "SourceNode"; }
-
- [[noreturn]] static void NoInputs() {
- DCHECK(false) << "no inputs; this should never be called";
- std::abort();
- }
- [[noreturn]] void InputReceived(ExecNode*, int, ExecBatch) override { NoInputs(); }
- [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); }
- [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); }
-
- Status StartProducing() override {
- DCHECK(!stop_requested_) << "Restarted SourceNode";
-
- CallbackOptions options;
- if (auto executor = plan()->exec_context()->executor()) {
- // These options will transfer execution to the desired Executor if necessary.
- // This can happen for in-memory scans where batches didn't require
- // any CPU work to decode. Otherwise, parsing etc. should already have placed
- // us on the desired Executor and no queues will be pushed to.
- options.executor = executor;
- options.should_schedule = ShouldSchedule::IfDifferentExecutor;
- }
-
- finished_ = Loop([this, options] {
- std::unique_lock<std::mutex> lock(mutex_);
- int seq = batch_count_++;
- if (stop_requested_) {
- return Future<ControlFlow<int>>::MakeFinished(Break(seq));
- }
- lock.unlock();
-
- return generator_().Then(
- [=](const util::optional<ExecBatch>& batch) -> ControlFlow<int> {
- std::unique_lock<std::mutex> lock(mutex_);
- if (IsIterationEnd(batch) || stop_requested_) {
- stop_requested_ = true;
- return Break(seq);
- }
- lock.unlock();
-
- outputs_[0]->InputReceived(this, seq, *batch);
- return Continue();
- },
- [=](const Status& error) -> ControlFlow<int> {
- // NB: ErrorReceived is independent of InputFinished, but
- // ErrorReceived will usually prompt StopProducing which will
- // prompt InputFinished. ErrorReceived may still be called from a
- // node which was requested to stop (indeed, the request to stop
- // may prompt an error).
- std::unique_lock<std::mutex> lock(mutex_);
- stop_requested_ = true;
- lock.unlock();
- outputs_[0]->ErrorReceived(this, error);
- return Break(seq);
- },
- options);
- }).Then([&](int seq) { outputs_[0]->InputFinished(this, seq); });
-
- return Status::OK();
- }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override {
- std::unique_lock<std::mutex> lock(mutex_);
- stop_requested_ = true;
- }
-
- Future<> finished() override { return finished_; }
-
- private:
- std::mutex mutex_;
- bool stop_requested_{false};
- int batch_count_{0};
- Future<> finished_ = Future<>::MakeFinished();
- AsyncGenerator<util::optional<ExecBatch>> generator_;
-};
-
-ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
- std::shared_ptr<Schema> output_schema,
- AsyncGenerator<util::optional<ExecBatch>> generator) {
- return plan->EmplaceNode<SourceNode>(plan, std::move(label), std::move(output_schema),
- std::move(generator));
-}
-
-struct FilterNode : ExecNode {
- FilterNode(ExecNode* input, std::string label, Expression filter)
- : ExecNode(input->plan(), std::move(label), {input}, {"target"},
- /*output_schema=*/input->output_schema(),
- /*num_outputs=*/1),
- filter_(std::move(filter)) {}
-
- const char* kind_name() override { return "FilterNode"; }
-
- Result<ExecBatch> DoFilter(const ExecBatch& target) {
- ARROW_ASSIGN_OR_RAISE(Expression simplified_filter,
- SimplifyWithGuarantee(filter_, target.guarantee));
-
- ARROW_ASSIGN_OR_RAISE(Datum mask, ExecuteScalarExpression(simplified_filter, target,
- plan()->exec_context()));
-
- if (mask.is_scalar()) {
- const auto& mask_scalar = mask.scalar_as<BooleanScalar>();
- if (mask_scalar.is_valid && mask_scalar.value) {
- return target;
- }
-
- return target.Slice(0, 0);
- }
-
- // if the values are all scalar then the mask must also be
- DCHECK(!std::all_of(target.values.begin(), target.values.end(),
- [](const Datum& value) { return value.is_scalar(); }));
-
- auto values = target.values;
- for (auto& value : values) {
- if (value.is_scalar()) continue;
- ARROW_ASSIGN_OR_RAISE(value, Filter(value, mask, FilterOptions::Defaults()));
- }
- return ExecBatch::Make(std::move(values));
- }
-
- void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- auto maybe_filtered = DoFilter(std::move(batch));
- if (!maybe_filtered.ok()) {
- outputs_[0]->ErrorReceived(this, maybe_filtered.status());
- return;
- }
-
- maybe_filtered->guarantee = batch.guarantee;
- outputs_[0]->InputReceived(this, seq, maybe_filtered.MoveValueUnsafe());
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->ErrorReceived(this, std::move(error));
- }
-
- void InputFinished(ExecNode* input, int seq) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->InputFinished(this, seq);
- }
-
- Status StartProducing() override { return Status::OK(); }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override { inputs_[0]->StopProducing(this); }
-
- Future<> finished() override { return inputs_[0]->finished(); }
-
- private:
- Expression filter_;
-};
-
-Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter) {
- if (!filter.IsBound()) {
- ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(*input->output_schema()));
- }
-
- if (filter.type()->id() != Type::BOOL) {
- return Status::TypeError("Filter expression must evaluate to bool, but ",
- filter.ToString(), " evaluates to ",
- filter.type()->ToString());
- }
-
- return input->plan()->EmplaceNode<FilterNode>(input, std::move(label),
- std::move(filter));
-}
-
-struct ProjectNode : ExecNode {
- ProjectNode(ExecNode* input, std::string label, std::shared_ptr<Schema> output_schema,
- std::vector<Expression> exprs)
- : ExecNode(input->plan(), std::move(label), {input}, {"target"},
- /*output_schema=*/std::move(output_schema),
- /*num_outputs=*/1),
- exprs_(std::move(exprs)) {}
-
- const char* kind_name() override { return "ProjectNode"; }
-
- Result<ExecBatch> DoProject(const ExecBatch& target) {
- std::vector<Datum> values{exprs_.size()};
- for (size_t i = 0; i < exprs_.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(Expression simplified_expr,
- SimplifyWithGuarantee(exprs_[i], target.guarantee));
-
- ARROW_ASSIGN_OR_RAISE(values[i], ExecuteScalarExpression(simplified_expr, target,
- plan()->exec_context()));
- }
- return ExecBatch{std::move(values), target.length};
- }
-
- void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- auto maybe_projected = DoProject(std::move(batch));
- if (!maybe_projected.ok()) {
- outputs_[0]->ErrorReceived(this, maybe_projected.status());
- return;
- }
-
- maybe_projected->guarantee = batch.guarantee;
- outputs_[0]->InputReceived(this, seq, maybe_projected.MoveValueUnsafe());
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->ErrorReceived(this, std::move(error));
- }
-
- void InputFinished(ExecNode* input, int seq) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->InputFinished(this, seq);
- }
-
- Status StartProducing() override { return Status::OK(); }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override { inputs_[0]->StopProducing(this); }
-
- Future<> finished() override { return inputs_[0]->finished(); }
-
- private:
- std::vector<Expression> exprs_;
-};
-
-Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
- std::vector<Expression> exprs,
- std::vector<std::string> names) {
- FieldVector fields(exprs.size());
-
- if (names.size() == 0) {
- names.resize(exprs.size());
- for (size_t i = 0; i < exprs.size(); ++i) {
- names[i] = exprs[i].ToString();
- }
- }
-
- int i = 0;
- for (auto& expr : exprs) {
- if (!expr.IsBound()) {
- ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*input->output_schema()));
- }
- fields[i] = field(std::move(names[i]), expr.type());
- ++i;
- }
-
- return input->plan()->EmplaceNode<ProjectNode>(
- input, std::move(label), schema(std::move(fields)), std::move(exprs));
-}
-
-struct SinkNode : ExecNode {
- SinkNode(ExecNode* input, std::string label,
- AsyncGenerator<util::optional<ExecBatch>>* generator)
- : ExecNode(input->plan(), std::move(label), {input}, {"collected"}, {},
- /*num_outputs=*/0),
- producer_(MakeProducer(generator)) {}
-
- static PushGenerator<util::optional<ExecBatch>>::Producer MakeProducer(
- AsyncGenerator<util::optional<ExecBatch>>* out_gen) {
- PushGenerator<util::optional<ExecBatch>> gen;
- auto out = gen.producer();
- *out_gen = std::move(gen);
- return out;
- }
-
- const char* kind_name() override { return "SinkNode"; }
-
- Status StartProducing() override {
- finished_ = Future<>::Make();
- return Status::OK();
- }
-
- // sink nodes have no outputs from which to feel backpressure
- [[noreturn]] static void NoOutputs() {
- DCHECK(false) << "no outputs; this should never be called";
- std::abort();
- }
- [[noreturn]] void ResumeProducing(ExecNode* output) override { NoOutputs(); }
- [[noreturn]] void PauseProducing(ExecNode* output) override { NoOutputs(); }
- [[noreturn]] void StopProducing(ExecNode* output) override { NoOutputs(); }
-
- void StopProducing() override {
- Finish();
- inputs_[0]->StopProducing(this);
- }
-
- Future<> finished() override { return finished_; }
-
- void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- std::unique_lock<std::mutex> lock(mutex_);
- if (finished_.is_finished()) return;
-
- ++num_received_;
- if (num_received_ == emit_stop_) {
- lock.unlock();
- producer_.Push(std::move(batch));
- Finish();
- return;
- }
-
- if (emit_stop_ != -1) {
- DCHECK_LE(seq_num, emit_stop_);
- }
-
- lock.unlock();
- producer_.Push(std::move(batch));
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- producer_.Push(std::move(error));
- Finish();
- inputs_[0]->StopProducing(this);
- }
-
- void InputFinished(ExecNode* input, int seq_stop) override {
- std::unique_lock<std::mutex> lock(mutex_);
- emit_stop_ = seq_stop;
- if (num_received_ == emit_stop_) {
- lock.unlock();
- Finish();
- }
- }
-
- private:
- void Finish() {
- if (producer_.Close()) {
- finished_.MarkFinished();
- }
- }
-
- std::mutex mutex_;
-
- int num_received_ = 0;
- int emit_stop_ = -1;
- Future<> finished_ = Future<>::MakeFinished();
-
- PushGenerator<util::optional<ExecBatch>>::Producer producer_;
-};
-
-AsyncGenerator<util::optional<ExecBatch>> MakeSinkNode(ExecNode* input,
- std::string label) {
- AsyncGenerator<util::optional<ExecBatch>> out;
- (void)input->plan()->EmplaceNode<SinkNode>(input, std::move(label), &out);
- return out;
-}
-
-std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
- std::shared_ptr<Schema> schema,
- std::function<Future<util::optional<ExecBatch>>()> gen, MemoryPool* pool) {
- struct Impl : RecordBatchReader {
- std::shared_ptr<Schema> schema() const override { return schema_; }
-
- Status ReadNext(std::shared_ptr<RecordBatch>* record_batch) override {
- ARROW_ASSIGN_OR_RAISE(auto batch, iterator_.Next());
- if (batch) {
- ARROW_ASSIGN_OR_RAISE(*record_batch, batch->ToRecordBatch(schema_, pool_));
- } else {
- *record_batch = IterationEnd<std::shared_ptr<RecordBatch>>();
- }
- return Status::OK();
- }
-
- MemoryPool* pool_;
- std::shared_ptr<Schema> schema_;
- Iterator<util::optional<ExecBatch>> iterator_;
- };
-
- auto out = std::make_shared<Impl>();
- out->pool_ = pool;
- out->schema_ = std::move(schema);
- out->iterator_ = MakeGeneratorIterator(std::move(gen));
- return out;
-}
-
-struct ScalarAggregateNode : ExecNode {
- ScalarAggregateNode(ExecNode* input, std::string label,
- std::shared_ptr<Schema> output_schema,
- std::vector<const ScalarAggregateKernel*> kernels,
- std::vector<std::vector<std::unique_ptr<KernelState>>> states)
- : ExecNode(input->plan(), std::move(label), {input}, {"target"},
- /*output_schema=*/std::move(output_schema),
- /*num_outputs=*/1),
- kernels_(std::move(kernels)),
- states_(std::move(states)) {}
-
- const char* kind_name() override { return "ScalarAggregateNode"; }
-
- Status DoConsume(const ExecBatch& batch, size_t thread_index) {
- for (size_t i = 0; i < kernels_.size(); ++i) {
- KernelContext batch_ctx{plan()->exec_context()};
- batch_ctx.SetState(states_[i][thread_index].get());
- ExecBatch single_column_batch{{batch.values[i]}, batch.length};
- RETURN_NOT_OK(kernels_[i]->consume(&batch_ctx, single_column_batch));
- }
- return Status::OK();
- }
-
- void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- std::unique_lock<std::mutex> lock(mutex_);
- auto it =
- thread_indices_.emplace(std::this_thread::get_id(), thread_indices_.size()).first;
- auto thread_index = it->second;
-
- lock.unlock();
-
- Status st = DoConsume(std::move(batch), thread_index);
- if (!st.ok()) {
- outputs_[0]->ErrorReceived(this, std::move(st));
- return;
- }
-
- lock.lock();
- ++num_received_;
- st = MaybeFinish(&lock);
- if (!st.ok()) {
- outputs_[0]->ErrorReceived(this, std::move(st));
- }
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->ErrorReceived(this, std::move(error));
- }
-
- void InputFinished(ExecNode* input, int seq) override {
- DCHECK_EQ(input, inputs_[0]);
- std::unique_lock<std::mutex> lock(mutex_);
- num_total_ = seq;
- Status st = MaybeFinish(&lock);
-
- if (!st.ok()) {
- outputs_[0]->ErrorReceived(this, std::move(st));
- }
- }
-
- Status StartProducing() override {
- finished_ = Future<>::Make();
- // Scalar aggregates will only output a single batch
- outputs_[0]->InputFinished(this, 1);
- return Status::OK();
- }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override {
- inputs_[0]->StopProducing(this);
- finished_.MarkFinished();
- }
-
- Future<> finished() override { return finished_; }
-
- private:
- Status MaybeFinish(std::unique_lock<std::mutex>* lock) {
- if (num_received_ != num_total_) return Status::OK();
-
- if (states_.empty()) return Status::OK();
-
- ExecBatch batch{{}, 1};
- batch.values.resize(kernels_.size());
-
- for (size_t i = 0; i < kernels_.size(); ++i) {
- KernelContext ctx{plan()->exec_context()};
- ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll(
- kernels_[i], &ctx, std::move(states_[i])));
- RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i]));
- }
- states_.clear();
- lock->unlock();
-
- outputs_[0]->InputReceived(this, 0, batch);
-
- finished_.MarkFinished();
- return Status::OK();
- }
-
- Future<> finished_ = Future<>::MakeFinished();
- std::vector<const ScalarAggregateKernel*> kernels_;
- std::vector<std::vector<std::unique_ptr<KernelState>>> states_;
- std::unordered_map<std::thread::id, size_t> thread_indices_;
- std::mutex mutex_;
- int num_received_ = 0, num_total_ = -1;
-};
-
-Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
- std::vector<internal::Aggregate> aggregates) {
- if (input->output_schema()->num_fields() != static_cast<int>(aggregates.size())) {
- return Status::Invalid("Provided ", aggregates.size(),
- " aggregates, expected one for each field of ",
- input->output_schema()->ToString());
- }
-
- auto exec_ctx = input->plan()->exec_context();
-
- std::vector<const ScalarAggregateKernel*> kernels(aggregates.size());
- std::vector<std::vector<std::unique_ptr<KernelState>>> states(kernels.size());
- FieldVector fields(kernels.size());
-
- for (size_t i = 0; i < kernels.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(auto function,
- exec_ctx->func_registry()->GetFunction(aggregates[i].function));
-
- if (function->kind() != Function::SCALAR_AGGREGATE) {
- return Status::Invalid("Provided non ScalarAggregateFunction ",
- aggregates[i].function);
- }
-
- auto in_type = ValueDescr::Array(input->output_schema()->fields()[i]->type());
-
- ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, function->DispatchExact({in_type}));
- kernels[i] = static_cast<const ScalarAggregateKernel*>(kernel);
-
- if (aggregates[i].options == nullptr) {
- aggregates[i].options = function->default_options();
- }
-
- KernelContext kernel_ctx{exec_ctx};
- states[i].resize(exec_ctx->executor() ? exec_ctx->executor()->GetCapacity() : 1);
- RETURN_NOT_OK(Kernel::InitAll(&kernel_ctx,
- KernelInitArgs{kernels[i],
- {
- in_type,
- },
- aggregates[i].options},
- &states[i]));
-
- // pick one to resolve the kernel signature
- kernel_ctx.SetState(states[i][0].get());
- ARROW_ASSIGN_OR_RAISE(
- auto descr, kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type}));
-
- fields[i] = field(aggregates[i].function, std::move(descr.type));
- }
-
- return input->plan()->EmplaceNode<ScalarAggregateNode>(
- input, std::move(label), schema(std::move(fields)), std::move(kernels),
- std::move(states));
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/exec_plan.h"
+
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "arrow/array/util.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/compute/registry.h"
+#include "arrow/datum.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+
+namespace {
+
+struct ExecPlanImpl : public ExecPlan {
+ explicit ExecPlanImpl(ExecContext* exec_context) : ExecPlan(exec_context) {}
+
+ ~ExecPlanImpl() override {
+ if (started_ && !finished_.is_finished()) {
+ ARROW_LOG(WARNING) << "Plan was destroyed before finishing";
+ StopProducing();
+ finished().Wait();
+ }
+ }
+
+ ExecNode* AddNode(std::unique_ptr<ExecNode> node) {
+ if (node->num_inputs() == 0) {
+ sources_.push_back(node.get());
+ }
+ if (node->num_outputs() == 0) {
+ sinks_.push_back(node.get());
+ }
+ nodes_.push_back(std::move(node));
+ return nodes_.back().get();
+ }
+
+ Status Validate() const {
+ if (nodes_.empty()) {
+ return Status::Invalid("ExecPlan has no node");
+ }
+ for (const auto& node : nodes_) {
+ RETURN_NOT_OK(node->Validate());
+ }
+ return Status::OK();
+ }
+
+ Status StartProducing() {
+ if (started_) {
+ return Status::Invalid("restarted ExecPlan");
+ }
+ started_ = true;
+
+ // producers precede consumers
+ sorted_nodes_ = TopoSort();
+
+ std::vector<Future<>> futures;
+
+ Status st = Status::OK();
+
+ using rev_it = std::reverse_iterator<NodeVector::iterator>;
+ for (rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; ++it) {
+ auto node = *it;
+
+ st = node->StartProducing();
+ if (!st.ok()) {
+ // Stop nodes that successfully started, in reverse order
+ stopped_ = true;
+ StopProducingImpl(it.base(), sorted_nodes_.end());
+ break;
+ }
+
+ futures.push_back(node->finished());
+ }
+
+ finished_ = AllComplete(std::move(futures));
+ return st;
+ }
+
+ void StopProducing() {
+ DCHECK(started_) << "stopped an ExecPlan which never started";
+ stopped_ = true;
+
+ StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end());
+ }
+
+ template <typename It>
+ void StopProducingImpl(It begin, It end) {
+ for (auto it = begin; it != end; ++it) {
+ auto node = *it;
+ node->StopProducing();
+ }
+ }
+
+ NodeVector TopoSort() {
+ struct Impl {
+ const std::vector<std::unique_ptr<ExecNode>>& nodes;
+ std::unordered_set<ExecNode*> visited;
+ NodeVector sorted;
+
+ explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
+ visited.reserve(nodes.size());
+ sorted.resize(nodes.size());
+
+ for (const auto& node : nodes) {
+ Visit(node.get());
+ }
+
+ DCHECK_EQ(visited.size(), nodes.size());
+ }
+
+ void Visit(ExecNode* node) {
+ if (visited.count(node) != 0) return;
+
+ for (auto input : node->inputs()) {
+ // Ensure that producers are inserted before this consumer
+ Visit(input);
+ }
+
+ sorted[visited.size()] = node;
+ visited.insert(node);
+ }
+ };
+
+ return std::move(Impl{nodes_}.sorted);
+ }
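+  // e.g. for source -> filter -> sink, TopoSort yields [source, filter, sink];
+  // StartProducing then iterates it in reverse so each consumer is running
+  // before its producer starts emitting batches.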
+
+ Future<> finished_ = Future<>::MakeFinished();
+ bool started_ = false, stopped_ = false;
+ std::vector<std::unique_ptr<ExecNode>> nodes_;
+ NodeVector sources_, sinks_;
+ NodeVector sorted_nodes_;
+};
+
+ExecPlanImpl* ToDerived(ExecPlan* ptr) { return checked_cast<ExecPlanImpl*>(ptr); }
+
+const ExecPlanImpl* ToDerived(const ExecPlan* ptr) {
+ return checked_cast<const ExecPlanImpl*>(ptr);
+}
+
+util::optional<int> GetNodeIndex(const std::vector<ExecNode*>& nodes,
+ const ExecNode* node) {
+ for (int i = 0; i < static_cast<int>(nodes.size()); ++i) {
+ if (nodes[i] == node) return i;
+ }
+ return util::nullopt;
+}
+
+} // namespace
+
+Result<std::shared_ptr<ExecPlan>> ExecPlan::Make(ExecContext* ctx) {
+ return std::shared_ptr<ExecPlan>(new ExecPlanImpl{ctx});
+}
+
+ExecNode* ExecPlan::AddNode(std::unique_ptr<ExecNode> node) {
+ return ToDerived(this)->AddNode(std::move(node));
+}
+
+const ExecPlan::NodeVector& ExecPlan::sources() const {
+ return ToDerived(this)->sources_;
+}
+
+const ExecPlan::NodeVector& ExecPlan::sinks() const { return ToDerived(this)->sinks_; }
+
+Status ExecPlan::Validate() { return ToDerived(this)->Validate(); }
+
+Status ExecPlan::StartProducing() { return ToDerived(this)->StartProducing(); }
+
+void ExecPlan::StopProducing() { ToDerived(this)->StopProducing(); }
+
+Future<> ExecPlan::finished() { return ToDerived(this)->finished_; }
+
+ExecNode::ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
+ std::vector<std::string> input_labels,
+ std::shared_ptr<Schema> output_schema, int num_outputs)
+ : plan_(plan),
+ label_(std::move(label)),
+ inputs_(std::move(inputs)),
+ input_labels_(std::move(input_labels)),
+ output_schema_(std::move(output_schema)),
+ num_outputs_(num_outputs) {
+ for (auto input : inputs_) {
+ input->outputs_.push_back(this);
+ }
+}
+
+Status ExecNode::Validate() const {
+ if (inputs_.size() != input_labels_.size()) {
+ return Status::Invalid("Invalid number of inputs for '", label(), "' (expected ",
+ num_inputs(), ", actual ", input_labels_.size(), ")");
+ }
+
+ if (static_cast<int>(outputs_.size()) != num_outputs_) {
+ return Status::Invalid("Invalid number of outputs for '", label(), "' (expected ",
+ num_outputs(), ", actual ", outputs_.size(), ")");
+ }
+
+ for (auto out : outputs_) {
+ auto input_index = GetNodeIndex(out->inputs(), this);
+ if (!input_index) {
+ return Status::Invalid("Node '", label(), "' outputs to node '", out->label(),
+ "' but is not listed as an input.");
+ }
+ }
+
+ return Status::OK();
+}
+
+struct SourceNode : ExecNode {
+ SourceNode(ExecPlan* plan, std::string label, std::shared_ptr<Schema> output_schema,
+ AsyncGenerator<util::optional<ExecBatch>> generator)
+ : ExecNode(plan, std::move(label), {}, {}, std::move(output_schema),
+ /*num_outputs=*/1),
+ generator_(std::move(generator)) {}
+
+ const char* kind_name() override { return "SourceNode"; }
+
+ [[noreturn]] static void NoInputs() {
+ DCHECK(false) << "no inputs; this should never be called";
+ std::abort();
+ }
+ [[noreturn]] void InputReceived(ExecNode*, int, ExecBatch) override { NoInputs(); }
+ [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); }
+ [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); }
+
+ Status StartProducing() override {
+ DCHECK(!stop_requested_) << "Restarted SourceNode";
+
+ CallbackOptions options;
+ if (auto executor = plan()->exec_context()->executor()) {
+ // These options will transfer execution to the desired Executor if necessary.
+ // This can happen for in-memory scans where batches didn't require
+ // any CPU work to decode. Otherwise, parsing etc. should already have placed
+ // us on the desired Executor and no queues will be pushed to.
+ options.executor = executor;
+ options.should_schedule = ShouldSchedule::IfDifferentExecutor;
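+ // i.e. callbacks hop to `executor` only when they would otherwise run
+ // elsewhere (a sketch of ShouldSchedule::IfDifferentExecutor semantics).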
+ }
+
+ finished_ = Loop([this, options] {
+ std::unique_lock<std::mutex> lock(mutex_);
+ int seq = batch_count_++;
+ if (stop_requested_) {
+ return Future<ControlFlow<int>>::MakeFinished(Break(seq));
+ }
+ lock.unlock();
+
+ return generator_().Then(
+ [=](const util::optional<ExecBatch>& batch) -> ControlFlow<int> {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (IsIterationEnd(batch) || stop_requested_) {
+ stop_requested_ = true;
+ return Break(seq);
+ }
+ lock.unlock();
+
+ outputs_[0]->InputReceived(this, seq, *batch);
+ return Continue();
+ },
+ [=](const Status& error) -> ControlFlow<int> {
+ // NB: ErrorReceived is independent of InputFinished, but
+ // ErrorReceived will usually prompt StopProducing which will
+ // prompt InputFinished. ErrorReceived may still be called from a
+ // node which was requested to stop (indeed, the request to stop
+ // may prompt an error).
+ std::unique_lock<std::mutex> lock(mutex_);
+ stop_requested_ = true;
+ lock.unlock();
+ outputs_[0]->ErrorReceived(this, error);
+ return Break(seq);
+ },
+ options);
+ }).Then([&](int seq) { outputs_[0]->InputFinished(this, seq); });
+
+ return Status::OK();
+ }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override {
+ std::unique_lock<std::mutex> lock(mutex_);
+ stop_requested_ = true;
+ }
+
+ Future<> finished() override { return finished_; }
+
+ private:
+ std::mutex mutex_;
+ bool stop_requested_{false};
+ int batch_count_{0};
+ Future<> finished_ = Future<>::MakeFinished();
+ AsyncGenerator<util::optional<ExecBatch>> generator_;
+};
+
+ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ AsyncGenerator<util::optional<ExecBatch>> generator) {
+ return plan->EmplaceNode<SourceNode>(plan, std::move(label), std::move(output_schema),
+ std::move(generator));
+}
+
+struct FilterNode : ExecNode {
+ FilterNode(ExecNode* input, std::string label, Expression filter)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/input->output_schema(),
+ /*num_outputs=*/1),
+ filter_(std::move(filter)) {}
+
+ const char* kind_name() override { return "FilterNode"; }
+
+ Result<ExecBatch> DoFilter(const ExecBatch& target) {
+ ARROW_ASSIGN_OR_RAISE(Expression simplified_filter,
+ SimplifyWithGuarantee(filter_, target.guarantee));
+
+ ARROW_ASSIGN_OR_RAISE(Datum mask, ExecuteScalarExpression(simplified_filter, target,
+ plan()->exec_context()));
+
+ if (mask.is_scalar()) {
+ const auto& mask_scalar = mask.scalar_as<BooleanScalar>();
+ if (mask_scalar.is_valid && mask_scalar.value) {
+ return target;
+ }
+
+ return target.Slice(0, 0);
+ }
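+    // e.g. with guarantee x == 7, a filter x > 0 simplifies to literal(true),
+    // so the scalar-mask branch above returns the batch unchanged (sketch).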
+
+ // if the values are all scalar then the mask must also be
+ DCHECK(!std::all_of(target.values.begin(), target.values.end(),
+ [](const Datum& value) { return value.is_scalar(); }));
+
+ auto values = target.values;
+ for (auto& value : values) {
+ if (value.is_scalar()) continue;
+ ARROW_ASSIGN_OR_RAISE(value, Filter(value, mask, FilterOptions::Defaults()));
+ }
+ return ExecBatch::Make(std::move(values));
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ auto maybe_filtered = DoFilter(std::move(batch));
+ if (!maybe_filtered.ok()) {
+ outputs_[0]->ErrorReceived(this, maybe_filtered.status());
+ return;
+ }
+
+ maybe_filtered->guarantee = batch.guarantee;
+ outputs_[0]->InputReceived(this, seq, maybe_filtered.MoveValueUnsafe());
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->InputFinished(this, seq);
+ }
+
+ Status StartProducing() override { return Status::OK(); }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override { inputs_[0]->StopProducing(this); }
+
+ Future<> finished() override { return inputs_[0]->finished(); }
+
+ private:
+ Expression filter_;
+};
+
+Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter) {
+ if (!filter.IsBound()) {
+ ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(*input->output_schema()));
+ }
+
+ if (filter.type()->id() != Type::BOOL) {
+ return Status::TypeError("Filter expression must evaluate to bool, but ",
+ filter.ToString(), " evaluates to ",
+ filter.type()->ToString());
+ }
+
+ return input->plan()->EmplaceNode<FilterNode>(input, std::move(label),
+ std::move(filter));
+}
+
+struct ProjectNode : ExecNode {
+ ProjectNode(ExecNode* input, std::string label, std::shared_ptr<Schema> output_schema,
+ std::vector<Expression> exprs)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/std::move(output_schema),
+ /*num_outputs=*/1),
+ exprs_(std::move(exprs)) {}
+
+ const char* kind_name() override { return "ProjectNode"; }
+
+ Result<ExecBatch> DoProject(const ExecBatch& target) {
+ std::vector<Datum> values{exprs_.size()};
+ for (size_t i = 0; i < exprs_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(Expression simplified_expr,
+ SimplifyWithGuarantee(exprs_[i], target.guarantee));
+
+ ARROW_ASSIGN_OR_RAISE(values[i], ExecuteScalarExpression(simplified_expr, target,
+ plan()->exec_context()));
+ }
+ return ExecBatch{std::move(values), target.length};
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ auto maybe_projected = DoProject(std::move(batch));
+ if (!maybe_projected.ok()) {
+ outputs_[0]->ErrorReceived(this, maybe_projected.status());
+ return;
+ }
+
+ maybe_projected->guarantee = batch.guarantee;
+ outputs_[0]->InputReceived(this, seq, maybe_projected.MoveValueUnsafe());
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->InputFinished(this, seq);
+ }
+
+ Status StartProducing() override { return Status::OK(); }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override { inputs_[0]->StopProducing(this); }
+
+ Future<> finished() override { return inputs_[0]->finished(); }
+
+ private:
+ std::vector<Expression> exprs_;
+};
+
+Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
+ std::vector<Expression> exprs,
+ std::vector<std::string> names) {
+ FieldVector fields(exprs.size());
+
+ if (names.size() == 0) {
+ names.resize(exprs.size());
+ for (size_t i = 0; i < exprs.size(); ++i) {
+ names[i] = exprs[i].ToString();
+ }
+ }
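+  // e.g. an unnamed projection of add(field_ref("a"), literal(1)) gets a
+  // column name like "add(a, 1)" (illustrative; exact formatting may vary).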
+
+ int i = 0;
+ for (auto& expr : exprs) {
+ if (!expr.IsBound()) {
+ ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*input->output_schema()));
+ }
+ fields[i] = field(std::move(names[i]), expr.type());
+ ++i;
+ }
+
+ return input->plan()->EmplaceNode<ProjectNode>(
+ input, std::move(label), schema(std::move(fields)), std::move(exprs));
+}
+
+struct SinkNode : ExecNode {
+ SinkNode(ExecNode* input, std::string label,
+ AsyncGenerator<util::optional<ExecBatch>>* generator)
+ : ExecNode(input->plan(), std::move(label), {input}, {"collected"}, {},
+ /*num_outputs=*/0),
+ producer_(MakeProducer(generator)) {}
+
+ static PushGenerator<util::optional<ExecBatch>>::Producer MakeProducer(
+ AsyncGenerator<util::optional<ExecBatch>>* out_gen) {
+ PushGenerator<util::optional<ExecBatch>> gen;
+ auto out = gen.producer();
+ *out_gen = std::move(gen);
+ return out;
+ }
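+  // The PushGenerator splits into a pull-based generator handed back to the
+  // caller and a Producer kept by this node; InputReceived pushes batches
+  // into it until Finish() closes it.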
+
+ const char* kind_name() override { return "SinkNode"; }
+
+ Status StartProducing() override {
+ finished_ = Future<>::Make();
+ return Status::OK();
+ }
+
+ // sink nodes have no outputs from which to feel backpressure
+ [[noreturn]] static void NoOutputs() {
+ DCHECK(false) << "no outputs; this should never be called";
+ std::abort();
+ }
+ [[noreturn]] void ResumeProducing(ExecNode* output) override { NoOutputs(); }
+ [[noreturn]] void PauseProducing(ExecNode* output) override { NoOutputs(); }
+ [[noreturn]] void StopProducing(ExecNode* output) override { NoOutputs(); }
+
+ void StopProducing() override {
+ Finish();
+ inputs_[0]->StopProducing(this);
+ }
+
+ Future<> finished() override { return finished_; }
+
+ void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (finished_.is_finished()) return;
+
+ ++num_received_;
+ if (num_received_ == emit_stop_) {
+ lock.unlock();
+ producer_.Push(std::move(batch));
+ Finish();
+ return;
+ }
+
+ if (emit_stop_ != -1) {
+ DCHECK_LE(seq_num, emit_stop_);
+ }
+
+ lock.unlock();
+ producer_.Push(std::move(batch));
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ producer_.Push(std::move(error));
+ Finish();
+ inputs_[0]->StopProducing(this);
+ }
+
+ void InputFinished(ExecNode* input, int seq_stop) override {
+ std::unique_lock<std::mutex> lock(mutex_);
+ emit_stop_ = seq_stop;
+ if (num_received_ == emit_stop_) {
+ lock.unlock();
+ Finish();
+ }
+ }
+
+ private:
+ void Finish() {
+ if (producer_.Close()) {
+ finished_.MarkFinished();
+ }
+ }
+
+ std::mutex mutex_;
+
+ int num_received_ = 0;
+ int emit_stop_ = -1;
+ Future<> finished_ = Future<>::MakeFinished();
+
+ PushGenerator<util::optional<ExecBatch>>::Producer producer_;
+};
+
+AsyncGenerator<util::optional<ExecBatch>> MakeSinkNode(ExecNode* input,
+ std::string label) {
+ AsyncGenerator<util::optional<ExecBatch>> out;
+ (void)input->plan()->EmplaceNode<SinkNode>(input, std::move(label), &out);
+ return out;
+}
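A sketch of draining the sink's generator from blocking code (`Consume` is a hypothetical callback):

    auto sink_gen = MakeSinkNode(projected, "sink");
    RETURN_NOT_OK(plan->StartProducing());
    for (;;) {
      // Each call yields a Future; result() blocks until it completes.
      ARROW_ASSIGN_OR_RAISE(auto maybe_batch, sink_gen().result());
      if (!maybe_batch) break;  // end of stream
      Consume(*maybe_batch);
    }
    RETURN_NOT_OK(plan->finished().status());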
+
+std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
+ std::shared_ptr<Schema> schema,
+ std::function<Future<util::optional<ExecBatch>>()> gen, MemoryPool* pool) {
+ struct Impl : RecordBatchReader {
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* record_batch) override {
+ ARROW_ASSIGN_OR_RAISE(auto batch, iterator_.Next());
+ if (batch) {
+ ARROW_ASSIGN_OR_RAISE(*record_batch, batch->ToRecordBatch(schema_, pool_));
+ } else {
+ *record_batch = IterationEnd<std::shared_ptr<RecordBatch>>();
+ }
+ return Status::OK();
+ }
+
+ MemoryPool* pool_;
+ std::shared_ptr<Schema> schema_;
+ Iterator<util::optional<ExecBatch>> iterator_;
+ };
+
+ auto out = std::make_shared<Impl>();
+ out->pool_ = pool;
+ out->schema_ = std::move(schema);
+ out->iterator_ = MakeGeneratorIterator(std::move(gen));
+ return out;
+}
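Alternatively, the same drain through the blocking reader interface (a sketch; pool choice assumed):

    std::shared_ptr<RecordBatchReader> reader = MakeGeneratorReader(
        projected->output_schema(), MakeSinkNode(projected, "sink"),
        default_memory_pool());
    std::shared_ptr<RecordBatch> batch;
    do {
      RETURN_NOT_OK(reader->ReadNext(&batch));
    } while (batch != nullptr);  // nullptr signals the end of iteration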
+
+struct ScalarAggregateNode : ExecNode {
+ ScalarAggregateNode(ExecNode* input, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ std::vector<const ScalarAggregateKernel*> kernels,
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/std::move(output_schema),
+ /*num_outputs=*/1),
+ kernels_(std::move(kernels)),
+ states_(std::move(states)) {}
+
+ const char* kind_name() override { return "ScalarAggregateNode"; }
+
+ Status DoConsume(const ExecBatch& batch, size_t thread_index) {
+ for (size_t i = 0; i < kernels_.size(); ++i) {
+ KernelContext batch_ctx{plan()->exec_context()};
+ batch_ctx.SetState(states_[i][thread_index].get());
+ ExecBatch single_column_batch{{batch.values[i]}, batch.length};
+ RETURN_NOT_OK(kernels_[i]->consume(&batch_ctx, single_column_batch));
+ }
+ return Status::OK();
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ std::unique_lock<std::mutex> lock(mutex_);
+ auto it =
+ thread_indices_.emplace(std::this_thread::get_id(), thread_indices_.size()).first;
+ auto thread_index = it->second;
+
+ lock.unlock();
+
+ Status st = DoConsume(std::move(batch), thread_index);
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ return;
+ }
+
+ lock.lock();
+ ++num_received_;
+ st = MaybeFinish(&lock);
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ }
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ std::unique_lock<std::mutex> lock(mutex_);
+ num_total_ = seq;
+ Status st = MaybeFinish(&lock);
+
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ }
+ }
+
+ Status StartProducing() override {
+ finished_ = Future<>::Make();
+ // Scalar aggregates will only output a single batch
+ outputs_[0]->InputFinished(this, 1);
+ return Status::OK();
+ }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override {
+ inputs_[0]->StopProducing(this);
+ finished_.MarkFinished();
+ }
+
+ Future<> finished() override { return finished_; }
+
+ private:
+ Status MaybeFinish(std::unique_lock<std::mutex>* lock) {
+ if (num_received_ != num_total_) return Status::OK();
+
+ if (states_.empty()) return Status::OK();
+
+ ExecBatch batch{{}, 1};
+ batch.values.resize(kernels_.size());
+
+ for (size_t i = 0; i < kernels_.size(); ++i) {
+ KernelContext ctx{plan()->exec_context()};
+ ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll(
+ kernels_[i], &ctx, std::move(states_[i])));
+ RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i]));
+ }
+ states_.clear();
+ lock->unlock();
+
+ outputs_[0]->InputReceived(this, 0, batch);
+
+ finished_.MarkFinished();
+ return Status::OK();
+ }
+
+ Future<> finished_ = Future<>::MakeFinished();
+ std::vector<const ScalarAggregateKernel*> kernels_;
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states_;
+ std::unordered_map<std::thread::id, size_t> thread_indices_;
+ std::mutex mutex_;
+ int num_received_ = 0, num_total_ = -1;
+};
+
+Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
+ std::vector<internal::Aggregate> aggregates) {
+ if (input->output_schema()->num_fields() != static_cast<int>(aggregates.size())) {
+ return Status::Invalid("Provided ", aggregates.size(),
+ " aggregates, expected one for each field of ",
+ input->output_schema()->ToString());
+ }
+
+ auto exec_ctx = input->plan()->exec_context();
+
+ std::vector<const ScalarAggregateKernel*> kernels(aggregates.size());
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states(kernels.size());
+ FieldVector fields(kernels.size());
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto function,
+ exec_ctx->func_registry()->GetFunction(aggregates[i].function));
+
+ if (function->kind() != Function::SCALAR_AGGREGATE) {
+ return Status::Invalid("Provided non ScalarAggregateFunction ",
+ aggregates[i].function);
+ }
+
+ auto in_type = ValueDescr::Array(input->output_schema()->fields()[i]->type());
+
+ ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, function->DispatchExact({in_type}));
+ kernels[i] = static_cast<const ScalarAggregateKernel*>(kernel);
+
+ if (aggregates[i].options == nullptr) {
+ aggregates[i].options = function->default_options();
+ }
+
+ KernelContext kernel_ctx{exec_ctx};
+ states[i].resize(exec_ctx->executor() ? exec_ctx->executor()->GetCapacity() : 1);
+ RETURN_NOT_OK(Kernel::InitAll(&kernel_ctx,
+ KernelInitArgs{kernels[i],
+ {
+ in_type,
+ },
+ aggregates[i].options},
+ &states[i]));
+
+ // pick one to resolve the kernel signature
+ kernel_ctx.SetState(states[i][0].get());
+ ARROW_ASSIGN_OR_RAISE(
+ auto descr, kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type}));
+
+ fields[i] = field(aggregates[i].function, std::move(descr.type));
+ }
+
+ return input->plan()->EmplaceNode<ScalarAggregateNode>(
+ input, std::move(label), schema(std::move(fields)), std::move(kernels),
+ std::move(states));
+}
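A hedged sketch (assuming internal::Aggregate is a {function name, options pointer} pair); note the check above requires exactly one aggregate per input field:

    // Aggregate the two projected columns; the node emits a single batch.
    ARROW_ASSIGN_OR_RAISE(
        ExecNode* agg,
        MakeScalarAggregateNode(projected, "aggregate",
                                {{"sum", nullptr}, {"min_max", nullptr}}));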
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
index c36c174af05..0df78fecd7c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
@@ -1,287 +1,287 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/exec.h"
-#include "arrow/compute/type_fwd.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-namespace compute {
-
-class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this<ExecPlan> {
- public:
- using NodeVector = std::vector<ExecNode*>;
-
- virtual ~ExecPlan() = default;
-
- ExecContext* exec_context() const { return exec_context_; }
-
- /// Make an empty exec plan
- static Result<std::shared_ptr<ExecPlan>> Make(ExecContext* = default_exec_context());
-
- ExecNode* AddNode(std::unique_ptr<ExecNode> node);
-
- template <typename Node, typename... Args>
- Node* EmplaceNode(Args&&... args) {
- std::unique_ptr<Node> node{new Node{std::forward<Args>(args)...}};
- auto out = node.get();
- AddNode(std::move(node));
- return out;
- }
-
- /// The initial inputs
- const NodeVector& sources() const;
-
- /// The final outputs
- const NodeVector& sinks() const;
-
- Status Validate();
-
- /// \brief Start producing on all nodes
- ///
- /// Nodes are started in reverse topological order, such that any node
- /// is started before all of its inputs.
- Status StartProducing();
-
- /// \brief Stop producing on all nodes
- ///
- /// Nodes are stopped in topological order, such that any node
- /// is stopped before all of its outputs.
- void StopProducing();
-
- /// \brief A future which will be marked finished when all nodes have stopped producing.
- Future<> finished();
-
- protected:
- ExecContext* exec_context_;
- explicit ExecPlan(ExecContext* exec_context) : exec_context_(exec_context) {}
-};
-
-class ARROW_EXPORT ExecNode {
- public:
- using NodeVector = std::vector<ExecNode*>;
-
- virtual ~ExecNode() = default;
-
- virtual const char* kind_name() = 0;
-
- // The number of inputs/outputs expected by this node
- int num_inputs() const { return static_cast<int>(inputs_.size()); }
- int num_outputs() const { return num_outputs_; }
-
- /// This node's predecessors in the exec plan
- const NodeVector& inputs() const { return inputs_; }
-
- /// \brief Labels identifying the function of each input.
- const std::vector<std::string>& input_labels() const { return input_labels_; }
-
- /// This node's successors in the exec plan
- const NodeVector& outputs() const { return outputs_; }
-
- /// The datatypes for batches produced by this node
- const std::shared_ptr<Schema>& output_schema() const { return output_schema_; }
-
- /// This node's exec plan
- ExecPlan* plan() { return plan_; }
-
- /// \brief An optional label, for display and debugging
- ///
- /// There is no guarantee that this value is non-empty or unique.
- const std::string& label() const { return label_; }
-
- Status Validate() const;
-
- /// Upstream API:
- /// These functions are called by input nodes that want to inform this node
-/// about an updated condition (a new input batch, an error, an impending
- /// end of stream).
- ///
- /// Implementation rules:
- /// - these may be called anytime after StartProducing() has succeeded
- /// (and even during or after StopProducing())
- /// - these may be called concurrently
- /// - these are allowed to call back into PauseProducing(), ResumeProducing()
- /// and StopProducing()
-
- /// Transfer input batch to ExecNode
- virtual void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) = 0;
-
- /// Signal error to ExecNode
- virtual void ErrorReceived(ExecNode* input, Status error) = 0;
-
- /// Mark the inputs finished after the given number of batches.
- ///
- /// This may be called before all inputs are received. This simply fixes
- /// the total number of incoming batches for an input, so that the ExecNode
- /// knows when it has received all input, regardless of order.
- virtual void InputFinished(ExecNode* input, int seq_stop) = 0;
-
- /// Lifecycle API:
- /// - start / stop to initiate and terminate production
- /// - pause / resume to apply backpressure
- ///
- /// Implementation rules:
- /// - StartProducing() should not recurse into the inputs, as it is
- /// handled by ExecPlan::StartProducing()
- /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
- /// concurrently (but only after StartProducing() has returned successfully)
- /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
- /// by the downstream nodes' InputReceived(), ErrorReceived(), InputFinished()
- /// methods
- /// - StopProducing() should recurse into the inputs
- /// - StopProducing() must be idempotent
-
- // XXX What happens if StartProducing() calls an output's InputReceived()
- // synchronously, and InputReceived() decides to call back into StopProducing()
- // (or PauseProducing()) because it received enough data?
- //
- // Right now, since synchronous calls happen in both directions (input to
- // output and then output to input), a node must be careful to be reentrant
- // against synchronous calls from its output, *and* also concurrent calls from
- // other threads. The most reliable solution is to update the internal state
- // first, and notify outputs only at the end.
- //
- // Alternate rules:
- // - StartProducing(), ResumeProducing() can call synchronously into
-  //   its outputs' consuming methods (InputReceived() etc.)
- // - InputReceived(), ErrorReceived(), InputFinished() can call asynchronously
- // into its inputs' PauseProducing(), StopProducing()
- //
- // Alternate API:
- // - InputReceived(), ErrorReceived(), InputFinished() return a ProductionHint
- // enum: either None (default), PauseProducing, ResumeProducing, StopProducing
- // - A method allows passing a ProductionHint asynchronously from an output node
- // (replacing PauseProducing(), ResumeProducing(), StopProducing())
-
- /// \brief Start producing
- ///
- /// This must only be called once. If this fails, then other lifecycle
- /// methods must not be called.
- ///
- /// This is typically called automatically by ExecPlan::StartProducing().
- virtual Status StartProducing() = 0;
-
- /// \brief Pause producing temporarily
- ///
- /// This call is a hint that an output node is currently not willing
- /// to receive data.
- ///
- /// This may be called any number of times after StartProducing() succeeds.
- /// However, the node is still free to produce data (which may be difficult
- /// to prevent anyway if data is produced using multiple threads).
- virtual void PauseProducing(ExecNode* output) = 0;
-
- /// \brief Resume producing after a temporary pause
- ///
- /// This call is a hint that an output node is willing to receive data again.
- ///
- /// This may be called any number of times after StartProducing() succeeds.
- /// This may also be called concurrently with PauseProducing(), which suggests
- /// the implementation may use an atomic counter.
- virtual void ResumeProducing(ExecNode* output) = 0;
-
- /// \brief Stop producing definitively to a single output
- ///
- /// This call is a hint that an output node has completed and is not willing
- /// to receive any further data.
- virtual void StopProducing(ExecNode* output) = 0;
-
- /// \brief Stop producing definitively to all outputs
- virtual void StopProducing() = 0;
-
- /// \brief A future which will be marked finished when this node has stopped producing.
- virtual Future<> finished() = 0;
-
- protected:
- ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
- std::vector<std::string> input_labels, std::shared_ptr<Schema> output_schema,
- int num_outputs);
-
- ExecPlan* plan_;
- std::string label_;
-
- NodeVector inputs_;
- std::vector<std::string> input_labels_;
-
- std::shared_ptr<Schema> output_schema_;
- int num_outputs_;
- NodeVector outputs_;
-};
-
-/// \brief Adapt an AsyncGenerator<ExecBatch> as a source node
-///
-/// plan->exec_context()->executor() is used to parallelize pushing to
-/// outputs, if provided.
-ARROW_EXPORT
-ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
- std::shared_ptr<Schema> output_schema,
- std::function<Future<util::optional<ExecBatch>>()>);
-
-/// \brief Add a sink node which forwards to an AsyncGenerator<ExecBatch>
-///
-/// Emitted batches will not be ordered.
-ARROW_EXPORT
-std::function<Future<util::optional<ExecBatch>>()> MakeSinkNode(ExecNode* input,
- std::string label);
-
-/// \brief Wrap an ExecBatch generator in a RecordBatchReader.
-///
-/// The RecordBatchReader does not impose any ordering on emitted batches.
-ARROW_EXPORT
-std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
- std::shared_ptr<Schema>, std::function<Future<util::optional<ExecBatch>>()>,
- MemoryPool*);
-
-/// \brief Make a node which excludes some rows from batches passed through it
-///
-/// The filter Expression will be evaluated against each batch which is pushed to
-/// this node. Any rows for which the filter does not evaluate to `true` will be excluded
-/// from the batch emitted by this node.
-///
-/// If the filter is not already bound, it will be bound against the input's schema.
-ARROW_EXPORT
-Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter);
-
-/// \brief Make a node which executes expressions on input batches, producing new batches.
-///
-/// Each expression will be evaluated against each batch which is pushed to
-/// this node to produce a corresponding output column.
-///
-/// If exprs are not already bound, they will be bound against the input's schema.
-/// If names are not provided, the string representations of exprs will be used.
-ARROW_EXPORT
-Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
- std::vector<Expression> exprs,
- std::vector<std::string> names = {});
-
-ARROW_EXPORT
-Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
- std::vector<internal::Aggregate> aggregates);
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this<ExecPlan> {
+ public:
+ using NodeVector = std::vector<ExecNode*>;
+
+ virtual ~ExecPlan() = default;
+
+ ExecContext* exec_context() const { return exec_context_; }
+
+ /// Make an empty exec plan
+ static Result<std::shared_ptr<ExecPlan>> Make(ExecContext* = default_exec_context());
+
+ ExecNode* AddNode(std::unique_ptr<ExecNode> node);
+
+ template <typename Node, typename... Args>
+ Node* EmplaceNode(Args&&... args) {
+ std::unique_ptr<Node> node{new Node{std::forward<Args>(args)...}};
+ auto out = node.get();
+ AddNode(std::move(node));
+ return out;
+ }
+
+ /// The initial inputs
+ const NodeVector& sources() const;
+
+ /// The final outputs
+ const NodeVector& sinks() const;
+
+ Status Validate();
+
+ /// \brief Start producing on all nodes
+ ///
+ /// Nodes are started in reverse topological order, such that any node
+ /// is started before all of its inputs.
+ Status StartProducing();
+
+ /// \brief Stop producing on all nodes
+ ///
+ /// Nodes are stopped in topological order, such that any node
+ /// is stopped before all of its outputs.
+ void StopProducing();
+
+ /// \brief A future which will be marked finished when all nodes have stopped producing.
+ Future<> finished();
+
+ protected:
+ ExecContext* exec_context_;
+ explicit ExecPlan(ExecContext* exec_context) : exec_context_(exec_context) {}
+};
+
+class ARROW_EXPORT ExecNode {
+ public:
+ using NodeVector = std::vector<ExecNode*>;
+
+ virtual ~ExecNode() = default;
+
+ virtual const char* kind_name() = 0;
+
+ // The number of inputs/outputs expected by this node
+ int num_inputs() const { return static_cast<int>(inputs_.size()); }
+ int num_outputs() const { return num_outputs_; }
+
+ /// This node's predecessors in the exec plan
+ const NodeVector& inputs() const { return inputs_; }
+
+ /// \brief Labels identifying the function of each input.
+ const std::vector<std::string>& input_labels() const { return input_labels_; }
+
+ /// This node's successors in the exec plan
+ const NodeVector& outputs() const { return outputs_; }
+
+ /// The datatypes for batches produced by this node
+ const std::shared_ptr<Schema>& output_schema() const { return output_schema_; }
+
+ /// This node's exec plan
+ ExecPlan* plan() { return plan_; }
+
+ /// \brief An optional label, for display and debugging
+ ///
+ /// There is no guarantee that this value is non-empty or unique.
+ const std::string& label() const { return label_; }
+
+ Status Validate() const;
+
+ /// Upstream API:
+ /// These functions are called by input nodes that want to inform this node
+/// about an updated condition (a new input batch, an error, an impending
+ /// end of stream).
+ ///
+ /// Implementation rules:
+ /// - these may be called anytime after StartProducing() has succeeded
+ /// (and even during or after StopProducing())
+ /// - these may be called concurrently
+ /// - these are allowed to call back into PauseProducing(), ResumeProducing()
+ /// and StopProducing()
+
+ /// Transfer input batch to ExecNode
+ virtual void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) = 0;
+
+ /// Signal error to ExecNode
+ virtual void ErrorReceived(ExecNode* input, Status error) = 0;
+
+ /// Mark the inputs finished after the given number of batches.
+ ///
+ /// This may be called before all inputs are received. This simply fixes
+ /// the total number of incoming batches for an input, so that the ExecNode
+ /// knows when it has received all input, regardless of order.
+ virtual void InputFinished(ExecNode* input, int seq_stop) = 0;
+
+ /// Lifecycle API:
+ /// - start / stop to initiate and terminate production
+ /// - pause / resume to apply backpressure
+ ///
+ /// Implementation rules:
+ /// - StartProducing() should not recurse into the inputs, as it is
+ /// handled by ExecPlan::StartProducing()
+ /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+ /// concurrently (but only after StartProducing() has returned successfully)
+ /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+ /// by the downstream nodes' InputReceived(), ErrorReceived(), InputFinished()
+ /// methods
+ /// - StopProducing() should recurse into the inputs
+ /// - StopProducing() must be idempotent
+
+ // XXX What happens if StartProducing() calls an output's InputReceived()
+ // synchronously, and InputReceived() decides to call back into StopProducing()
+ // (or PauseProducing()) because it received enough data?
+ //
+ // Right now, since synchronous calls happen in both directions (input to
+ // output and then output to input), a node must be careful to be reentrant
+ // against synchronous calls from its output, *and* also concurrent calls from
+ // other threads. The most reliable solution is to update the internal state
+ // first, and notify outputs only at the end.
+ //
+ // Alternate rules:
+ // - StartProducing(), ResumeProducing() can call synchronously into
+  //   its outputs' consuming methods (InputReceived() etc.)
+ // - InputReceived(), ErrorReceived(), InputFinished() can call asynchronously
+ // into its inputs' PauseProducing(), StopProducing()
+ //
+ // Alternate API:
+ // - InputReceived(), ErrorReceived(), InputFinished() return a ProductionHint
+ // enum: either None (default), PauseProducing, ResumeProducing, StopProducing
+ // - A method allows passing a ProductionHint asynchronously from an output node
+ // (replacing PauseProducing(), ResumeProducing(), StopProducing())
+
+ /// \brief Start producing
+ ///
+ /// This must only be called once. If this fails, then other lifecycle
+ /// methods must not be called.
+ ///
+ /// This is typically called automatically by ExecPlan::StartProducing().
+ virtual Status StartProducing() = 0;
+
+ /// \brief Pause producing temporarily
+ ///
+ /// This call is a hint that an output node is currently not willing
+ /// to receive data.
+ ///
+ /// This may be called any number of times after StartProducing() succeeds.
+ /// However, the node is still free to produce data (which may be difficult
+ /// to prevent anyway if data is produced using multiple threads).
+ virtual void PauseProducing(ExecNode* output) = 0;
+
+ /// \brief Resume producing after a temporary pause
+ ///
+ /// This call is a hint that an output node is willing to receive data again.
+ ///
+ /// This may be called any number of times after StartProducing() succeeds.
+ /// This may also be called concurrently with PauseProducing(), which suggests
+ /// the implementation may use an atomic counter.
+ virtual void ResumeProducing(ExecNode* output) = 0;
+
+ /// \brief Stop producing definitively to a single output
+ ///
+ /// This call is a hint that an output node has completed and is not willing
+ /// to receive any further data.
+ virtual void StopProducing(ExecNode* output) = 0;
+
+ /// \brief Stop producing definitively to all outputs
+ virtual void StopProducing() = 0;
+
+ /// \brief A future which will be marked finished when this node has stopped producing.
+ virtual Future<> finished() = 0;
+
+ protected:
+ ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
+ std::vector<std::string> input_labels, std::shared_ptr<Schema> output_schema,
+ int num_outputs);
+
+ ExecPlan* plan_;
+ std::string label_;
+
+ NodeVector inputs_;
+ std::vector<std::string> input_labels_;
+
+ std::shared_ptr<Schema> output_schema_;
+ int num_outputs_;
+ NodeVector outputs_;
+};
+
+/// \brief Adapt an AsyncGenerator<ExecBatch> as a source node
+///
+/// plan->exec_context()->executor() is used to parallelize pushing to
+/// outputs, if provided.
+ARROW_EXPORT
+ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ std::function<Future<util::optional<ExecBatch>>()>);
+
+/// \brief Add a sink node which forwards to an AsyncGenerator<ExecBatch>
+///
+/// Emitted batches will not be ordered.
+ARROW_EXPORT
+std::function<Future<util::optional<ExecBatch>>()> MakeSinkNode(ExecNode* input,
+ std::string label);
+
+/// \brief Wrap an ExecBatch generator in a RecordBatchReader.
+///
+/// The RecordBatchReader does not impose any ordering on emitted batches.
+ARROW_EXPORT
+std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
+ std::shared_ptr<Schema>, std::function<Future<util::optional<ExecBatch>>()>,
+ MemoryPool*);
+
+/// \brief Make a node which excludes some rows from batches passed through it
+///
+/// The filter Expression will be evaluated against each batch which is pushed to
+/// this node. Any rows for which the filter does not evaluate to `true` will be excluded
+/// from the batch emitted by this node.
+///
+/// If the filter is not already bound, it will be bound against the input's schema.
+ARROW_EXPORT
+Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter);
+
+/// \brief Make a node which executes expressions on input batches, producing new batches.
+///
+/// Each expression will be evaluated against each batch which is pushed to
+/// this node to produce a corresponding output column.
+///
+/// If exprs are not already bound, they will be bound against the input's schema.
+/// If names are not provided, the string representations of exprs will be used.
+ARROW_EXPORT
+Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
+ std::vector<Expression> exprs,
+ std::vector<std::string> names = {});
+
+ARROW_EXPORT
+Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
+ std::vector<internal::Aggregate> aggregates);
+
+} // namespace compute
+} // namespace arrow
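Taken together, an end-to-end sketch of the lifecycle this header specifies (`out_schema` and `gen` are assumed):

    ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make());
    ExecNode* source = MakeSourceNode(plan.get(), "source", out_schema, gen);
    auto sink_gen = MakeSinkNode(source, "sink");
    RETURN_NOT_OK(plan->Validate());
    RETURN_NOT_OK(plan->StartProducing());  // started sink-to-source
    // ... drain sink_gen ...
    plan->StopProducing();                  // stopped source-to-sink
    RETURN_NOT_OK(plan->finished().status());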
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
index 4aab64a46a4..44fb7cf1104 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
@@ -1,1186 +1,1186 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/expression.h"
-
-#include <unordered_map>
-#include <unordered_set>
-
-#include "arrow/chunked_array.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/exec/expression_internal.h"
-#include "arrow/compute/exec_internal.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/reader.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/util/hash_util.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/string.h"
-#include "arrow/util/value_parsing.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-namespace compute {
-
-void Expression::Call::ComputeHash() {
- hash = std::hash<std::string>{}(function_name);
- for (const auto& arg : arguments) {
- arrow::internal::hash_combine(hash, arg.hash());
- }
-}
-
-Expression::Expression(Call call) {
- call.ComputeHash();
- impl_ = std::make_shared<Impl>(std::move(call));
-}
-
-Expression::Expression(Datum literal)
- : impl_(std::make_shared<Impl>(std::move(literal))) {}
-
-Expression::Expression(Parameter parameter)
- : impl_(std::make_shared<Impl>(std::move(parameter))) {}
-
-Expression literal(Datum lit) { return Expression(std::move(lit)); }
-
-Expression field_ref(FieldRef ref) {
- return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, -1});
-}
-
-Expression call(std::string function, std::vector<Expression> arguments,
- std::shared_ptr<compute::FunctionOptions> options) {
- Expression::Call call;
- call.function_name = std::move(function);
- call.arguments = std::move(arguments);
- call.options = std::move(options);
- return Expression(std::move(call));
-}
-
-const Datum* Expression::literal() const { return util::get_if<Datum>(impl_.get()); }
-
-const Expression::Parameter* Expression::parameter() const {
- return util::get_if<Parameter>(impl_.get());
-}
-
-const FieldRef* Expression::field_ref() const {
- if (auto parameter = this->parameter()) {
- return &parameter->ref;
- }
- return nullptr;
-}
-
-const Expression::Call* Expression::call() const {
- return util::get_if<Call>(impl_.get());
-}
-
-ValueDescr Expression::descr() const {
- if (impl_ == nullptr) return {};
-
- if (auto lit = literal()) {
- return lit->descr();
- }
-
- if (auto parameter = this->parameter()) {
- return parameter->descr;
- }
-
- return CallNotNull(*this)->descr;
-}
-
-namespace {
-
-std::string PrintDatum(const Datum& datum) {
- if (datum.is_scalar()) {
- if (!datum.scalar()->is_valid) return "null";
-
- switch (datum.type()->id()) {
- case Type::STRING:
- case Type::LARGE_STRING:
- return '"' +
- Escape(util::string_view(*datum.scalar_as<BaseBinaryScalar>().value)) +
- '"';
-
- case Type::BINARY:
- case Type::FIXED_SIZE_BINARY:
- case Type::LARGE_BINARY:
- return '"' + datum.scalar_as<BaseBinaryScalar>().value->ToHexString() + '"';
-
- default:
- break;
- }
-
- return datum.scalar()->ToString();
- }
- return datum.ToString();
-}
-
-} // namespace
-
-std::string Expression::ToString() const {
- if (auto lit = literal()) {
- return PrintDatum(*lit);
- }
-
- if (auto ref = field_ref()) {
- if (auto name = ref->name()) {
- return *name;
- }
- if (auto path = ref->field_path()) {
- return path->ToString();
- }
- return ref->ToString();
- }
-
- auto call = CallNotNull(*this);
- auto binary = [&](std::string op) {
- return "(" + call->arguments[0].ToString() + " " + op + " " +
- call->arguments[1].ToString() + ")";
- };
-
- if (auto cmp = Comparison::Get(call->function_name)) {
- return binary(Comparison::GetOp(*cmp));
- }
-
- constexpr util::string_view kleene = "_kleene";
- if (util::string_view{call->function_name}.ends_with(kleene)) {
- auto op = call->function_name.substr(0, call->function_name.size() - kleene.size());
- return binary(std::move(op));
- }
-
- if (auto options = GetMakeStructOptions(*call)) {
- std::string out = "{";
- auto argument = call->arguments.begin();
- for (const auto& field_name : options->field_names) {
- out += field_name + "=" + argument++->ToString() + ", ";
- }
- out.resize(out.size() - 1);
- out.back() = '}';
- return out;
- }
-
- std::string out = call->function_name + "(";
- for (const auto& arg : call->arguments) {
- out += arg.ToString() + ", ";
- }
-
- if (call->options) {
- out += call->options->ToString();
- out.resize(out.size() + 1);
- } else {
- out.resize(out.size() - 1);
- }
- out.back() = ')';
- return out;
-}
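Following the branches above, a few illustrative renderings (a sketch, not captured output):

    call("greater", {field_ref("x"), literal(3)}).ToString();        // "(x > 3)"
    call("and_kleene", {field_ref("a"), field_ref("b")}).ToString(); // "(a and b)"
    call("add", {field_ref("x"), literal(3)}).ToString();            // "add(x, 3)"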
-
-void PrintTo(const Expression& expr, std::ostream* os) {
- *os << expr.ToString();
- if (expr.IsBound()) {
- *os << "[bound]";
- }
-}
-
-bool Expression::Equals(const Expression& other) const {
- if (Identical(*this, other)) return true;
-
- if (impl_->index() != other.impl_->index()) {
- return false;
- }
-
- if (auto lit = literal()) {
- return lit->Equals(*other.literal());
- }
-
- if (auto ref = field_ref()) {
- return ref->Equals(*other.field_ref());
- }
-
- auto call = CallNotNull(*this);
- auto other_call = CallNotNull(other);
-
- if (call->function_name != other_call->function_name ||
- call->kernel != other_call->kernel) {
- return false;
- }
-
- for (size_t i = 0; i < call->arguments.size(); ++i) {
- if (!call->arguments[i].Equals(other_call->arguments[i])) {
- return false;
- }
- }
-
- if (call->options == other_call->options) return true;
- if (call->options && other_call->options) {
- return call->options->Equals(other_call->options);
- }
- return false;
-}
-
-bool Identical(const Expression& l, const Expression& r) { return l.impl_ == r.impl_; }
-
-size_t Expression::hash() const {
- if (auto lit = literal()) {
- if (lit->is_scalar()) {
- return lit->scalar()->hash();
- }
- return 0;
- }
-
- if (auto ref = field_ref()) {
- return ref->hash();
- }
-
- return CallNotNull(*this)->hash;
-}
-
-bool Expression::IsBound() const {
- if (type() == nullptr) return false;
-
- if (auto call = this->call()) {
- if (call->kernel == nullptr) return false;
-
- for (const Expression& arg : call->arguments) {
- if (!arg.IsBound()) return false;
- }
- }
-
- return true;
-}
-
-bool Expression::IsScalarExpression() const {
- if (auto lit = literal()) {
- return lit->is_scalar();
- }
-
- if (field_ref()) return true;
-
- auto call = CallNotNull(*this);
-
- for (const Expression& arg : call->arguments) {
- if (!arg.IsScalarExpression()) return false;
- }
-
- if (call->function) {
- return call->function->kind() == compute::Function::SCALAR;
- }
-
- // this expression is not bound; make a best guess based on
- // the default function registry
- if (auto function = compute::GetFunctionRegistry()
- ->GetFunction(call->function_name)
- .ValueOr(nullptr)) {
- return function->kind() == compute::Function::SCALAR;
- }
-
- // unknown function or other error; conservatively return false
- return false;
-}
-
-bool Expression::IsNullLiteral() const {
- if (auto lit = literal()) {
- if (lit->null_count() == lit->length()) {
- return true;
- }
- }
-
- return false;
-}
-
-bool Expression::IsSatisfiable() const {
- if (type() && type()->id() == Type::NA) {
- return false;
- }
-
- if (auto lit = literal()) {
- if (lit->null_count() == lit->length()) {
- return false;
- }
-
- if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
- return lit->scalar_as<BooleanScalar>().value;
- }
- }
-
- return true;
-}
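Concretely, per the checks above (a sketch):

    literal(true).IsSatisfiable();                       // true
    literal(false).IsSatisfiable();                      // false: known-false filter
    literal(MakeNullScalar(boolean())).IsSatisfiable();  // false: null literal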
-
-namespace {
-
-// Produce a bound Expression from unbound Call and bound arguments.
-Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_casts,
- compute::ExecContext* exec_context) {
- DCHECK(std::all_of(call.arguments.begin(), call.arguments.end(),
- [](const Expression& argument) { return argument.IsBound(); }));
-
- auto descrs = GetDescriptors(call.arguments);
- ARROW_ASSIGN_OR_RAISE(call.function, GetFunction(call, exec_context));
-
- if (!insert_implicit_casts) {
- ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchExact(descrs));
- } else {
- ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&descrs));
-
- for (size_t i = 0; i < descrs.size(); ++i) {
- if (descrs[i] == call.arguments[i].descr()) continue;
-
- if (descrs[i].shape != call.arguments[i].descr().shape) {
- return Status::NotImplemented(
- "Automatic broadcasting of scalars arguments to arrays in ",
- Expression(std::move(call)).ToString());
- }
-
- if (auto lit = call.arguments[i].literal()) {
- ARROW_ASSIGN_OR_RAISE(Datum new_lit, compute::Cast(*lit, descrs[i].type));
- call.arguments[i] = literal(std::move(new_lit));
- continue;
- }
-
- // construct an implicit cast Expression with which to replace this argument
- Expression::Call implicit_cast;
- implicit_cast.function_name = "cast";
- implicit_cast.arguments = {std::move(call.arguments[i])};
- implicit_cast.options = std::make_shared<compute::CastOptions>(
- compute::CastOptions::Safe(descrs[i].type));
-
- ARROW_ASSIGN_OR_RAISE(
- call.arguments[i],
- BindNonRecursive(std::move(implicit_cast),
- /*insert_implicit_casts=*/false, exec_context));
- }
- }
-
- compute::KernelContext kernel_context(exec_context);
- if (call.kernel->init) {
- ARROW_ASSIGN_OR_RAISE(
- call.kernel_state,
- call.kernel->init(&kernel_context, {call.kernel, descrs, call.options.get()}));
-
- kernel_context.SetState(call.kernel_state.get());
- }
-
- ARROW_ASSIGN_OR_RAISE(
- call.descr, call.kernel->signature->out_type().Resolve(&kernel_context, descrs));
-
- return Expression(std::move(call));
-}
-
-template <typename TypeOrSchema>
-Result<Expression> BindImpl(Expression expr, const TypeOrSchema& in,
- ValueDescr::Shape shape, compute::ExecContext* exec_context) {
- if (exec_context == nullptr) {
- compute::ExecContext exec_context;
- return BindImpl(std::move(expr), in, shape, &exec_context);
- }
-
- if (expr.literal()) return expr;
-
- if (auto ref = expr.field_ref()) {
- if (ref->IsNested()) {
- return Status::NotImplemented("nested field references");
- }
-
- ARROW_ASSIGN_OR_RAISE(auto path, ref->FindOne(in));
-
- auto bound = *expr.parameter();
- bound.index = path[0];
- ARROW_ASSIGN_OR_RAISE(auto field, path.Get(in));
- bound.descr.type = field->type();
- bound.descr.shape = shape;
- return Expression{std::move(bound)};
- }
-
- auto call = *CallNotNull(expr);
- for (auto& argument : call.arguments) {
- ARROW_ASSIGN_OR_RAISE(argument,
- BindImpl(std::move(argument), in, shape, exec_context));
- }
- return BindNonRecursive(std::move(call),
- /*insert_implicit_casts=*/true, exec_context);
-}
-
-} // namespace
-
-Result<Expression> Expression::Bind(const ValueDescr& in,
- compute::ExecContext* exec_context) const {
- return BindImpl(*this, *in.type, in.shape, exec_context);
-}
-
-Result<Expression> Expression::Bind(const Schema& in_schema,
- compute::ExecContext* exec_context) const {
- return BindImpl(*this, in_schema, ValueDescr::ARRAY, exec_context);
-}
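A hedged sketch of binding; per BindImpl and BindNonRecursive above, a mismatched literal argument is cast directly rather than wrapped in a cast call:

    auto pred = call("greater", {field_ref("x"), literal(3)});
    // pred.IsBound() == false: no types are known yet.
    ARROW_ASSIGN_OR_RAISE(pred, pred.Bind(*schema({field("x", float64())})));
    // pred.IsBound() == true; the int32 literal 3 was cast to float64.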
-
-Result<ExecBatch> MakeExecBatch(const Schema& full_schema, const Datum& partial) {
- ExecBatch out;
-
- if (partial.kind() == Datum::RECORD_BATCH) {
- const auto& partial_batch = *partial.record_batch();
- out.length = partial_batch.num_rows();
-
- for (const auto& field : full_schema.fields()) {
- ARROW_ASSIGN_OR_RAISE(auto column,
- FieldRef(field->name()).GetOneOrNone(partial_batch));
-
- if (column) {
- if (!column->type()->Equals(field->type())) {
- // Referenced field was present but didn't have the expected type.
- // This *should* be handled by readers, and will just be an error in the future.
- ARROW_ASSIGN_OR_RAISE(
- auto converted,
- compute::Cast(column, field->type(), compute::CastOptions::Safe()));
- column = converted.make_array();
- }
- out.values.emplace_back(std::move(column));
- } else {
- out.values.emplace_back(MakeNullScalar(field->type()));
- }
- }
- return out;
- }
-
- // wasteful but useful for testing:
- if (partial.type()->id() == Type::STRUCT) {
- if (partial.is_array()) {
- ARROW_ASSIGN_OR_RAISE(auto partial_batch,
- RecordBatch::FromStructArray(partial.make_array()));
-
- return MakeExecBatch(full_schema, partial_batch);
- }
-
- if (partial.is_scalar()) {
- ARROW_ASSIGN_OR_RAISE(auto partial_array,
- MakeArrayFromScalar(*partial.scalar(), 1));
- ARROW_ASSIGN_OR_RAISE(auto out, MakeExecBatch(full_schema, partial_array));
-
- for (Datum& value : out.values) {
- if (value.is_scalar()) continue;
- ARROW_ASSIGN_OR_RAISE(value, value.make_array()->GetScalar(0));
- }
- return out;
- }
- }
-
- return Status::NotImplemented("MakeExecBatch from ", PrintDatum(partial));
-}
-
-Result<Datum> ExecuteScalarExpression(const Expression& expr, const Schema& full_schema,
- const Datum& partial_input,
- compute::ExecContext* exec_context) {
- ARROW_ASSIGN_OR_RAISE(auto input, MakeExecBatch(full_schema, partial_input));
- return ExecuteScalarExpression(expr, input, exec_context);
-}
-
-Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& input,
- compute::ExecContext* exec_context) {
- if (exec_context == nullptr) {
- compute::ExecContext exec_context;
- return ExecuteScalarExpression(expr, input, &exec_context);
- }
-
- if (!expr.IsBound()) {
- return Status::Invalid("Cannot Execute unbound expression.");
- }
-
- if (!expr.IsScalarExpression()) {
- return Status::Invalid(
- "ExecuteScalarExpression cannot Execute non-scalar expression ", expr.ToString());
- }
-
- if (auto lit = expr.literal()) return *lit;
-
- if (auto param = expr.parameter()) {
- if (param->descr.type->id() == Type::NA) {
- return MakeNullScalar(null());
- }
-
- const Datum& field = input[param->index];
- if (!field.type()->Equals(param->descr.type)) {
- return Status::Invalid("Referenced field ", expr.ToString(), " was ",
- field.type()->ToString(), " but should have been ",
- param->descr.type->ToString());
- }
-
- return field;
- }
-
- auto call = CallNotNull(expr);
-
- std::vector<Datum> arguments(call->arguments.size());
- for (size_t i = 0; i < arguments.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context));
- }
-
- auto executor = compute::detail::KernelExecutor::MakeScalar();
-
- compute::KernelContext kernel_context(exec_context);
- kernel_context.SetState(call->kernel_state.get());
-
- auto kernel = call->kernel;
- auto descrs = GetDescriptors(arguments);
- auto options = call->options.get();
- RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, descrs, options}));
-
- auto listener = std::make_shared<compute::detail::DatumAccumulator>();
- RETURN_NOT_OK(executor->Execute(arguments, listener.get()));
- return executor->WrapResults(arguments, listener->values());
-}
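A sketch of evaluating a bound scalar expression against a record batch (`pred`, `full_schema`, and `record_batch` are assumed; MakeExecBatch above null-pads fields the batch lacks):

    ARROW_ASSIGN_OR_RAISE(
        Datum mask,
        ExecuteScalarExpression(pred, *full_schema, Datum(record_batch)));
    // mask is a boolean Datum with one value per input row.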
-
-namespace {
-
-std::array<std::pair<const Expression&, const Expression&>, 2>
-ArgumentsAndFlippedArguments(const Expression::Call& call) {
- DCHECK_EQ(call.arguments.size(), 2);
- return {std::pair<const Expression&, const Expression&>{call.arguments[0],
- call.arguments[1]},
- std::pair<const Expression&, const Expression&>{call.arguments[1],
- call.arguments[0]}};
-}
-
-template <typename BinOp, typename It,
- typename Out = typename std::iterator_traits<It>::value_type>
-util::optional<Out> FoldLeft(It begin, It end, const BinOp& bin_op) {
- if (begin == end) return util::nullopt;
-
- Out folded = std::move(*begin++);
- while (begin != end) {
- folded = bin_op(std::move(folded), std::move(*begin++));
- }
- return folded;
-}
-
-util::optional<compute::NullHandling::type> GetNullHandling(
- const Expression::Call& call) {
- if (call.function && call.function->kind() == compute::Function::SCALAR) {
- return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
- }
- return util::nullopt;
-}
-
-} // namespace
-
-std::vector<FieldRef> FieldsInExpression(const Expression& expr) {
- if (expr.literal()) return {};
-
- if (auto ref = expr.field_ref()) {
- return {*ref};
- }
-
- std::vector<FieldRef> fields;
- for (const Expression& arg : CallNotNull(expr)->arguments) {
- auto argument_fields = FieldsInExpression(arg);
- std::move(argument_fields.begin(), argument_fields.end(), std::back_inserter(fields));
- }
- return fields;
-}
-
-bool ExpressionHasFieldRefs(const Expression& expr) {
- if (expr.literal()) return false;
-
- if (expr.field_ref()) return true;
-
- for (const Expression& arg : CallNotNull(expr)->arguments) {
- if (ExpressionHasFieldRefs(arg)) return true;
- }
- return false;
-}
-
-Result<Expression> FoldConstants(Expression expr) {
- return Modify(
- std::move(expr), [](Expression expr) { return expr; },
- [](Expression expr, ...) -> Result<Expression> {
- auto call = CallNotNull(expr);
- if (std::all_of(call->arguments.begin(), call->arguments.end(),
- [](const Expression& argument) { return argument.literal(); })) {
- // all arguments are literal; we can evaluate this subexpression *now*
- static const ExecBatch ignored_input = ExecBatch{};
- ARROW_ASSIGN_OR_RAISE(Datum constant,
- ExecuteScalarExpression(expr, ignored_input));
-
- return literal(std::move(constant));
- }
-
- // XXX the following should probably be in a registry of passes instead
- // of inline
-
- if (GetNullHandling(*call) == compute::NullHandling::INTERSECTION) {
- // kernels which always produce intersected validity can be resolved
- // to null *now* if any of their inputs is a null literal
- for (const auto& argument : call->arguments) {
- if (argument.IsNullLiteral()) {
- return argument;
- }
- }
- }
-
- if (call->function_name == "and_kleene") {
- for (auto args : ArgumentsAndFlippedArguments(*call)) {
- // true and x == x
- if (args.first == literal(true)) return args.second;
-
- // false and x == false
- if (args.first == literal(false)) return args.first;
-
- // x and x == x
- if (args.first == args.second) return args.first;
- }
- return expr;
- }
-
- if (call->function_name == "or_kleene") {
- for (auto args : ArgumentsAndFlippedArguments(*call)) {
- // false or x == x
- if (args.first == literal(false)) return args.second;
-
- // true or x == true
- if (args.first == literal(true)) return args.first;
-
- // x or x == x
- if (args.first == args.second) return args.first;
- }
- return expr;
- }
-
- return expr;
- });
-}
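For instance (a sketch; the expression must be bound first, since folding evaluates literal-only subexpressions with ExecuteScalarExpression):

    auto sum = call("add", {literal(1), literal(2)});
    ARROW_ASSIGN_OR_RAISE(sum, sum.Bind(*schema({})));
    ARROW_ASSIGN_OR_RAISE(sum, FoldConstants(std::move(sum)));
    // sum is now the literal 3; and_kleene(true, x) would similarly fold to x.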
-
-namespace {
-
-std::vector<Expression> GuaranteeConjunctionMembers(
- const Expression& guaranteed_true_predicate) {
- auto guarantee = guaranteed_true_predicate.call();
- if (!guarantee || guarantee->function_name != "and_kleene") {
- return {guaranteed_true_predicate};
- }
- return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
-}
-
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
- std::vector<Expression>* conjunction_members,
- std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
- auto unconsumed_end =
- std::partition(conjunction_members->begin(), conjunction_members->end(),
- [](const Expression& expr) {
-                       // search for an equality condition between a field and a literal
- auto call = expr.call();
- if (!call) return true;
-
- if (call->function_name == "equal") {
- auto ref = call->arguments[0].field_ref();
- auto lit = call->arguments[1].literal();
- return !(ref && lit);
- }
-
- if (call->function_name == "is_null") {
- auto ref = call->arguments[0].field_ref();
- return !ref;
- }
-
- return true;
- });
-
- for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
- auto call = CallNotNull(*it);
-
- if (call->function_name == "equal") {
- auto ref = call->arguments[0].field_ref();
- auto lit = call->arguments[1].literal();
- known_values->emplace(*ref, *lit);
- } else if (call->function_name == "is_null") {
- auto ref = call->arguments[0].field_ref();
- known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
- }
- }
-
- conjunction_members->erase(unconsumed_end, conjunction_members->end());
-
- return Status::OK();
-}
-
-} // namespace
-
-Result<KnownFieldValues> ExtractKnownFieldValues(
- const Expression& guaranteed_true_predicate) {
- auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
- KnownFieldValues known_values;
- RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
- return known_values;
-}
-
-Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
- Expression expr) {
- if (!expr.IsBound()) {
- return Status::Invalid(
- "ReplaceFieldsWithKnownValues called on an unbound Expression");
- }
-
- return Modify(
- std::move(expr),
- [&known_values](Expression expr) -> Result<Expression> {
- if (auto ref = expr.field_ref()) {
- auto it = known_values.map.find(*ref);
- if (it != known_values.map.end()) {
- Datum lit = it->second;
- if (lit.descr() == expr.descr()) return literal(std::move(lit));
- // type mismatch, try casting the known value to the correct type
-
- if (expr.type()->id() == Type::DICTIONARY &&
- lit.type()->id() != Type::DICTIONARY) {
- // the known value must be dictionary encoded
-
- const auto& dict_type = checked_cast<const DictionaryType&>(*expr.type());
- if (!lit.type()->Equals(dict_type.value_type())) {
- ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, dict_type.value_type()));
- }
-
- if (lit.is_scalar()) {
- ARROW_ASSIGN_OR_RAISE(auto dictionary,
- MakeArrayFromScalar(*lit.scalar(), 1));
-
- lit = Datum{DictionaryScalar::Make(MakeScalar<int32_t>(0),
- std::move(dictionary))};
- }
- }
-
- ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type()));
- return literal(std::move(lit));
- }
- }
- return expr;
- },
- [](Expression expr, ...) { return expr; });
-}
-
-namespace {
-
-bool IsBinaryAssociativeCommutative(const Expression::Call& call) {
- static std::unordered_set<std::string> binary_associative_commutative{
- "and", "or", "and_kleene", "or_kleene", "xor",
- "multiply", "add", "multiply_checked", "add_checked"};
-
- auto it = binary_associative_commutative.find(call.function_name);
- return it != binary_associative_commutative.end();
-}
-
-} // namespace
-
-Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_context) {
- if (exec_context == nullptr) {
- compute::ExecContext exec_context;
- return Canonicalize(std::move(expr), &exec_context);
- }
-
- // If potentially reconstructing more deeply than a call's immediate arguments
- // (for example, when reorganizing an associative chain), add expressions to this set to
- // avoid unnecessary work
- struct {
- std::unordered_set<Expression, Expression::Hash> set_;
-
- bool operator()(const Expression& expr) const {
- return set_.find(expr) != set_.end();
- }
-
- void Add(std::vector<Expression> exprs) {
- std::move(exprs.begin(), exprs.end(), std::inserter(set_, set_.end()));
- }
- } AlreadyCanonicalized;
-
- return Modify(
- std::move(expr),
- [&AlreadyCanonicalized, exec_context](Expression expr) -> Result<Expression> {
- auto call = expr.call();
- if (!call) return expr;
-
- if (AlreadyCanonicalized(expr)) return expr;
-
- if (IsBinaryAssociativeCommutative(*call)) {
- struct {
- int Priority(const Expression& operand) const {
- // order literals first, starting with nulls
- if (operand.IsNullLiteral()) return 0;
- if (operand.literal()) return 1;
- return 2;
- }
- bool operator()(const Expression& l, const Expression& r) const {
- return Priority(l) < Priority(r);
- }
- } CanonicalOrdering;
-
- FlattenedAssociativeChain chain(expr);
- if (chain.was_left_folded &&
- std::is_sorted(chain.fringe.begin(), chain.fringe.end(),
- CanonicalOrdering)) {
- AlreadyCanonicalized.Add(std::move(chain.exprs));
- return expr;
- }
-
- std::stable_sort(chain.fringe.begin(), chain.fringe.end(), CanonicalOrdering);
-
- // fold the chain back up
- auto folded =
- FoldLeft(chain.fringe.begin(), chain.fringe.end(),
- [call, &AlreadyCanonicalized](Expression l, Expression r) {
- auto canonicalized_call = *call;
- canonicalized_call.arguments = {std::move(l), std::move(r)};
- Expression expr(std::move(canonicalized_call));
- AlreadyCanonicalized.Add({expr});
- return expr;
- });
- return std::move(*folded);
- }
-
- if (auto cmp = Comparison::Get(call->function_name)) {
- if (call->arguments[0].literal() && !call->arguments[1].literal()) {
- // ensure that literals are on comparisons' RHS
- auto flipped_call = *call;
-
- std::swap(flipped_call.arguments[0], flipped_call.arguments[1]);
- flipped_call.function_name =
- Comparison::GetName(Comparison::GetFlipped(*cmp));
-
- return BindNonRecursive(flipped_call,
- /*insert_implicit_casts=*/false, exec_context);
- }
- }
-
- return expr;
- },
- [](Expression expr, ...) { return expr; });
-}
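Per the comparison branch above, a sketch of one canonicalization (the literal moves to the RHS and the comparison is flipped):

    auto expr = call("less", {literal(3), field_ref("x")});
    ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*schema({field("x", int32())})));
    ARROW_ASSIGN_OR_RAISE(expr, Canonicalize(std::move(expr)));
    // expr is now the bound equivalent of greater(field_ref("x"), literal(3)).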
-
-namespace {
-
-Result<Expression> DirectComparisonSimplification(Expression expr,
- const Expression::Call& guarantee) {
- return Modify(
- std::move(expr), [](Expression expr) { return expr; },
- [&guarantee](Expression expr, ...) -> Result<Expression> {
- auto call = expr.call();
- if (!call) return expr;
-
- // Ensure both calls are comparisons with equal LHS and scalar RHS
- auto cmp = Comparison::Get(expr);
- auto cmp_guarantee = Comparison::Get(guarantee.function_name);
-
- if (!cmp) return expr;
- if (!cmp_guarantee) return expr;
-
- const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
- const auto& guarantee_lhs = guarantee.arguments[0];
- if (lhs != guarantee_lhs) return expr;
-
- auto rhs = call->arguments[1].literal();
- auto guarantee_rhs = guarantee.arguments[1].literal();
-
- if (!rhs) return expr;
- if (!rhs->is_scalar()) return expr;
-
- if (!guarantee_rhs) return expr;
- if (!guarantee_rhs->is_scalar()) return expr;
-
- ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
- Comparison::Execute(*rhs, *guarantee_rhs));
- DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
-
- if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
- // RHS of filter is equal to RHS of guarantee
-
- if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
- // guarantee is a subset of filter, so all data will be included
- // x > 1, x >= 1, x != 1 guaranteed by x > 1
- return literal(true);
- }
-
- if ((*cmp & *cmp_guarantee) == 0) {
- // guarantee disjoint with filter, so all data will be excluded
- // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
- return literal(false);
- }
-
- return expr;
- }
-
- if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
- // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
- return expr;
- }
-
- if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
- // x > 1, x >= 1, x != 1 guaranteed by x >= 3
- return literal(true);
- } else {
- // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
- return literal(false);
- }
- });
-}
-
-} // namespace
-
-Result<Expression> SimplifyWithGuarantee(Expression expr,
- const Expression& guaranteed_true_predicate) {
- auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
-
- KnownFieldValues known_values;
- RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
-
- ARROW_ASSIGN_OR_RAISE(expr,
- ReplaceFieldsWithKnownValues(known_values, std::move(expr)));
-
- auto CanonicalizeAndFoldConstants = [&expr] {
- ARROW_ASSIGN_OR_RAISE(expr, Canonicalize(std::move(expr)));
- ARROW_ASSIGN_OR_RAISE(expr, FoldConstants(std::move(expr)));
- return Status::OK();
- };
- RETURN_NOT_OK(CanonicalizeAndFoldConstants());
-
- for (const auto& guarantee : conjunction_members) {
- if (Comparison::Get(guarantee) && guarantee.call()->arguments[1].literal()) {
- ARROW_ASSIGN_OR_RAISE(
- auto simplified, DirectComparisonSimplification(expr, *CallNotNull(guarantee)));
-
- if (Identical(simplified, expr)) continue;
-
- expr = std::move(simplified);
- RETURN_NOT_OK(CanonicalizeAndFoldConstants());
- }
- }
-
- return expr;
-}
-
-// Serialization is accomplished by converting expressions to KeyValueMetadata and storing
-// this in the schema of a RecordBatch. Embedded arrays and scalars are stored in its
-// columns. Finally, the RecordBatch is written to an IPC file.
-Result<std::shared_ptr<Buffer>> Serialize(const Expression& expr) {
- struct {
- std::shared_ptr<KeyValueMetadata> metadata_ = std::make_shared<KeyValueMetadata>();
- ArrayVector columns_;
-
- Result<std::string> AddScalar(const Scalar& scalar) {
- auto ret = columns_.size();
- ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(scalar, 1));
- columns_.push_back(std::move(array));
- return std::to_string(ret);
- }
-
- Status Visit(const Expression& expr) {
- if (auto lit = expr.literal()) {
- if (!lit->is_scalar()) {
- return Status::NotImplemented("Serialization of non-scalar literals");
- }
- ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*lit->scalar()));
- metadata_->Append("literal", std::move(value));
- return Status::OK();
- }
-
- if (auto ref = expr.field_ref()) {
- if (!ref->name()) {
- return Status::NotImplemented("Serialization of non-name field_refs");
- }
- metadata_->Append("field_ref", *ref->name());
- return Status::OK();
- }
-
- auto call = CallNotNull(expr);
- metadata_->Append("call", call->function_name);
-
- for (const auto& argument : call->arguments) {
- RETURN_NOT_OK(Visit(argument));
- }
-
- if (call->options) {
- ARROW_ASSIGN_OR_RAISE(auto options_scalar,
- internal::FunctionOptionsToStructScalar(*call->options));
- ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*options_scalar));
- metadata_->Append("options", std::move(value));
- }
-
- metadata_->Append("end", call->function_name);
- return Status::OK();
- }
-
- Result<std::shared_ptr<RecordBatch>> operator()(const Expression& expr) {
- RETURN_NOT_OK(Visit(expr));
- FieldVector fields(columns_.size());
- for (size_t i = 0; i < fields.size(); ++i) {
- fields[i] = field("", columns_[i]->type());
- }
- return RecordBatch::Make(schema(std::move(fields), std::move(metadata_)), 1,
- std::move(columns_));
- }
- } ToRecordBatch;
-
- ARROW_ASSIGN_OR_RAISE(auto batch, ToRecordBatch(expr));
- ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
- ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
- RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
- RETURN_NOT_OK(writer->Close());
- return stream->Finish();
-}
-
-Result<Expression> Deserialize(std::shared_ptr<Buffer> buffer) {
- io::BufferReader stream(std::move(buffer));
- ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
- ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
- if (batch->schema()->metadata() == nullptr) {
- return Status::Invalid("serialized Expression's batch repr had null metadata");
- }
- if (batch->num_rows() != 1) {
- return Status::Invalid(
- "serialized Expression's batch repr was not a single row - had ",
- batch->num_rows());
- }
-
- struct FromRecordBatch {
- const RecordBatch& batch_;
- int index_;
-
- const KeyValueMetadata& metadata() { return *batch_.schema()->metadata(); }
-
- Result<std::shared_ptr<Scalar>> GetScalar(const std::string& i) {
- int32_t column_index;
- if (!::arrow::internal::ParseValue<Int32Type>(i.data(), i.length(),
- &column_index)) {
- return Status::Invalid("Couldn't parse column_index");
- }
- if (column_index >= batch_.num_columns()) {
- return Status::Invalid("column_index out of bounds");
- }
- return batch_.column(column_index)->GetScalar(0);
- }
-
- Result<Expression> GetOne() {
- if (index_ >= metadata().size()) {
- return Status::Invalid("unterminated serialized Expression");
- }
-
- const std::string& key = metadata().key(index_);
- const std::string& value = metadata().value(index_);
- ++index_;
-
- if (key == "literal") {
- ARROW_ASSIGN_OR_RAISE(auto scalar, GetScalar(value));
- return literal(std::move(scalar));
- }
-
- if (key == "field_ref") {
- return field_ref(value);
- }
-
- if (key != "call") {
- return Status::Invalid("Unrecognized serialized Expression key ", key);
- }
-
- std::vector<Expression> arguments;
- while (metadata().key(index_) != "end") {
- if (metadata().key(index_) == "options") {
- ARROW_ASSIGN_OR_RAISE(auto options_scalar, GetScalar(metadata().value(index_)));
- std::shared_ptr<compute::FunctionOptions> options;
- if (options_scalar) {
- ARROW_ASSIGN_OR_RAISE(
- options, internal::FunctionOptionsFromStructScalar(
- checked_cast<const StructScalar&>(*options_scalar)));
- }
- auto expr = call(value, std::move(arguments), std::move(options));
- index_ += 2;
- return expr;
- }
-
- ARROW_ASSIGN_OR_RAISE(auto argument, GetOne());
- arguments.push_back(std::move(argument));
- }
-
- ++index_;
- return call(value, std::move(arguments));
- }
- };
-
- return FromRecordBatch{*batch, 0}.GetOne();
-}
-
-Expression project(std::vector<Expression> values, std::vector<std::string> names) {
- return call("make_struct", std::move(values),
- compute::MakeStructOptions{std::move(names)});
-}
-
-Expression equal(Expression lhs, Expression rhs) {
- return call("equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression not_equal(Expression lhs, Expression rhs) {
- return call("not_equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression less(Expression lhs, Expression rhs) {
- return call("less", {std::move(lhs), std::move(rhs)});
-}
-
-Expression less_equal(Expression lhs, Expression rhs) {
- return call("less_equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression greater(Expression lhs, Expression rhs) {
- return call("greater", {std::move(lhs), std::move(rhs)});
-}
-
-Expression greater_equal(Expression lhs, Expression rhs) {
- return call("greater_equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); }
-
-Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); }
-
-Expression and_(Expression lhs, Expression rhs) {
- return call("and_kleene", {std::move(lhs), std::move(rhs)});
-}
-
-Expression and_(const std::vector<Expression>& operands) {
- auto folded = FoldLeft<Expression(Expression, Expression)>(operands.begin(),
- operands.end(), and_);
- if (folded) {
- return std::move(*folded);
- }
- return literal(true);
-}
-
-Expression or_(Expression lhs, Expression rhs) {
- return call("or_kleene", {std::move(lhs), std::move(rhs)});
-}
-
-Expression or_(const std::vector<Expression>& operands) {
- auto folded =
- FoldLeft<Expression(Expression, Expression)>(operands.begin(), operands.end(), or_);
- if (folded) {
- return std::move(*folded);
- }
- return literal(false);
-}
-
-Expression not_(Expression operand) { return call("invert", {std::move(operand)}); }
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/expression.h"
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "arrow/chunked_array.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec/expression_internal.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/util/hash_util.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+
+void Expression::Call::ComputeHash() {
+ hash = std::hash<std::string>{}(function_name);
+ for (const auto& arg : arguments) {
+ arrow::internal::hash_combine(hash, arg.hash());
+ }
+}
+
+Expression::Expression(Call call) {
+ call.ComputeHash();
+ impl_ = std::make_shared<Impl>(std::move(call));
+}
+
+Expression::Expression(Datum literal)
+ : impl_(std::make_shared<Impl>(std::move(literal))) {}
+
+Expression::Expression(Parameter parameter)
+ : impl_(std::make_shared<Impl>(std::move(parameter))) {}
+
+Expression literal(Datum lit) { return Expression(std::move(lit)); }
+
+Expression field_ref(FieldRef ref) {
+ return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, -1});
+}
+
+Expression call(std::string function, std::vector<Expression> arguments,
+ std::shared_ptr<compute::FunctionOptions> options) {
+ Expression::Call call;
+ call.function_name = std::move(function);
+ call.arguments = std::move(arguments);
+ call.options = std::move(options);
+ return Expression(std::move(call));
+}
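+
+// A minimal usage sketch for the factories above (illustrative only; the
+// resulting expression is unbound until Expression::Bind is called):
+//
+//   Expression expr = call("add", {field_ref("a"), literal(3)});
+//   // expr.ToString() yields "add(a, 3)"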
+
+const Datum* Expression::literal() const { return util::get_if<Datum>(impl_.get()); }
+
+const Expression::Parameter* Expression::parameter() const {
+ return util::get_if<Parameter>(impl_.get());
+}
+
+const FieldRef* Expression::field_ref() const {
+ if (auto parameter = this->parameter()) {
+ return &parameter->ref;
+ }
+ return nullptr;
+}
+
+const Expression::Call* Expression::call() const {
+ return util::get_if<Call>(impl_.get());
+}
+
+ValueDescr Expression::descr() const {
+ if (impl_ == nullptr) return {};
+
+ if (auto lit = literal()) {
+ return lit->descr();
+ }
+
+ if (auto parameter = this->parameter()) {
+ return parameter->descr;
+ }
+
+ return CallNotNull(*this)->descr;
+}
+
+namespace {
+
+std::string PrintDatum(const Datum& datum) {
+ if (datum.is_scalar()) {
+ if (!datum.scalar()->is_valid) return "null";
+
+ switch (datum.type()->id()) {
+ case Type::STRING:
+ case Type::LARGE_STRING:
+ return '"' +
+ Escape(util::string_view(*datum.scalar_as<BaseBinaryScalar>().value)) +
+ '"';
+
+ case Type::BINARY:
+ case Type::FIXED_SIZE_BINARY:
+ case Type::LARGE_BINARY:
+ return '"' + datum.scalar_as<BaseBinaryScalar>().value->ToHexString() + '"';
+
+ default:
+ break;
+ }
+
+ return datum.scalar()->ToString();
+ }
+ return datum.ToString();
+}
+
+} // namespace
+
+std::string Expression::ToString() const {
+ if (auto lit = literal()) {
+ return PrintDatum(*lit);
+ }
+
+ if (auto ref = field_ref()) {
+ if (auto name = ref->name()) {
+ return *name;
+ }
+ if (auto path = ref->field_path()) {
+ return path->ToString();
+ }
+ return ref->ToString();
+ }
+
+ auto call = CallNotNull(*this);
+ auto binary = [&](std::string op) {
+ return "(" + call->arguments[0].ToString() + " " + op + " " +
+ call->arguments[1].ToString() + ")";
+ };
+
+ if (auto cmp = Comparison::Get(call->function_name)) {
+ return binary(Comparison::GetOp(*cmp));
+ }
+
+ constexpr util::string_view kleene = "_kleene";
+ if (util::string_view{call->function_name}.ends_with(kleene)) {
+ auto op = call->function_name.substr(0, call->function_name.size() - kleene.size());
+ return binary(std::move(op));
+ }
+
+ if (auto options = GetMakeStructOptions(*call)) {
+ std::string out = "{";
+ auto argument = call->arguments.begin();
+ for (const auto& field_name : options->field_names) {
+ out += field_name + "=" + argument++->ToString() + ", ";
+ }
+ out.resize(out.size() - 1);
+ out.back() = '}';
+ return out;
+ }
+
+ std::string out = call->function_name + "(";
+ for (const auto& arg : call->arguments) {
+ out += arg.ToString() + ", ";
+ }
+
+ if (call->options) {
+ out += call->options->ToString();
+ out.resize(out.size() + 1);
+ } else {
+ out.resize(out.size() - 1);
+ }
+ out.back() = ')';
+ return out;
+}
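+
+// Illustrative renderings produced by ToString() (assuming the factory helpers
+// defined at the end of this file):
+//
+//   greater(field_ref("x"), literal(3))   -> "(x > 3)"
+//   and_(field_ref("a"), field_ref("b"))  -> "(a and b)"
+//   project({field_ref("a")}, {"a"})      -> "{a=a}"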
+
+void PrintTo(const Expression& expr, std::ostream* os) {
+ *os << expr.ToString();
+ if (expr.IsBound()) {
+ *os << "[bound]";
+ }
+}
+
+bool Expression::Equals(const Expression& other) const {
+ if (Identical(*this, other)) return true;
+
+ if (impl_->index() != other.impl_->index()) {
+ return false;
+ }
+
+ if (auto lit = literal()) {
+ return lit->Equals(*other.literal());
+ }
+
+ if (auto ref = field_ref()) {
+ return ref->Equals(*other.field_ref());
+ }
+
+ auto call = CallNotNull(*this);
+ auto other_call = CallNotNull(other);
+
+ if (call->function_name != other_call->function_name ||
+ call->kernel != other_call->kernel) {
+ return false;
+ }
+
+ for (size_t i = 0; i < call->arguments.size(); ++i) {
+ if (!call->arguments[i].Equals(other_call->arguments[i])) {
+ return false;
+ }
+ }
+
+ if (call->options == other_call->options) return true;
+ if (call->options && other_call->options) {
+ return call->options->Equals(other_call->options);
+ }
+ return false;
+}
+
+bool Identical(const Expression& l, const Expression& r) { return l.impl_ == r.impl_; }
+
+size_t Expression::hash() const {
+ if (auto lit = literal()) {
+ if (lit->is_scalar()) {
+ return lit->scalar()->hash();
+ }
+ return 0;
+ }
+
+ if (auto ref = field_ref()) {
+ return ref->hash();
+ }
+
+ return CallNotNull(*this)->hash;
+}
+
+bool Expression::IsBound() const {
+ if (type() == nullptr) return false;
+
+ if (auto call = this->call()) {
+ if (call->kernel == nullptr) return false;
+
+ for (const Expression& arg : call->arguments) {
+ if (!arg.IsBound()) return false;
+ }
+ }
+
+ return true;
+}
+
+bool Expression::IsScalarExpression() const {
+ if (auto lit = literal()) {
+ return lit->is_scalar();
+ }
+
+ if (field_ref()) return true;
+
+ auto call = CallNotNull(*this);
+
+ for (const Expression& arg : call->arguments) {
+ if (!arg.IsScalarExpression()) return false;
+ }
+
+ if (call->function) {
+ return call->function->kind() == compute::Function::SCALAR;
+ }
+
+ // this expression is not bound; make a best guess based on
+ // the default function registry
+ if (auto function = compute::GetFunctionRegistry()
+ ->GetFunction(call->function_name)
+ .ValueOr(nullptr)) {
+ return function->kind() == compute::Function::SCALAR;
+ }
+
+ // unknown function or other error; conservatively return false
+ return false;
+}
+
+bool Expression::IsNullLiteral() const {
+ if (auto lit = literal()) {
+ if (lit->null_count() == lit->length()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool Expression::IsSatisfiable() const {
+ if (type() && type()->id() == Type::NA) {
+ return false;
+ }
+
+ if (auto lit = literal()) {
+ if (lit->null_count() == lit->length()) {
+ return false;
+ }
+
+ if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
+ return lit->scalar_as<BooleanScalar>().value;
+ }
+ }
+
+ return true;
+}
+
+namespace {
+
+// Produce a bound Expression from unbound Call and bound arguments.
+Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_casts,
+ compute::ExecContext* exec_context) {
+ DCHECK(std::all_of(call.arguments.begin(), call.arguments.end(),
+ [](const Expression& argument) { return argument.IsBound(); }));
+
+ auto descrs = GetDescriptors(call.arguments);
+ ARROW_ASSIGN_OR_RAISE(call.function, GetFunction(call, exec_context));
+
+ if (!insert_implicit_casts) {
+ ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchExact(descrs));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&descrs));
+
+ for (size_t i = 0; i < descrs.size(); ++i) {
+ if (descrs[i] == call.arguments[i].descr()) continue;
+
+ if (descrs[i].shape != call.arguments[i].descr().shape) {
+ return Status::NotImplemented(
+            "Automatic broadcasting of scalar arguments to arrays in ",
+ Expression(std::move(call)).ToString());
+ }
+
+ if (auto lit = call.arguments[i].literal()) {
+ ARROW_ASSIGN_OR_RAISE(Datum new_lit, compute::Cast(*lit, descrs[i].type));
+ call.arguments[i] = literal(std::move(new_lit));
+ continue;
+ }
+
+ // construct an implicit cast Expression with which to replace this argument
+ Expression::Call implicit_cast;
+ implicit_cast.function_name = "cast";
+ implicit_cast.arguments = {std::move(call.arguments[i])};
+ implicit_cast.options = std::make_shared<compute::CastOptions>(
+ compute::CastOptions::Safe(descrs[i].type));
+
+ ARROW_ASSIGN_OR_RAISE(
+ call.arguments[i],
+ BindNonRecursive(std::move(implicit_cast),
+ /*insert_implicit_casts=*/false, exec_context));
+ }
+ }
+
+ compute::KernelContext kernel_context(exec_context);
+ if (call.kernel->init) {
+ ARROW_ASSIGN_OR_RAISE(
+ call.kernel_state,
+ call.kernel->init(&kernel_context, {call.kernel, descrs, call.options.get()}));
+
+ kernel_context.SetState(call.kernel_state.get());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ call.descr, call.kernel->signature->out_type().Resolve(&kernel_context, descrs));
+
+ return Expression(std::move(call));
+}
+
+template <typename TypeOrSchema>
+Result<Expression> BindImpl(Expression expr, const TypeOrSchema& in,
+ ValueDescr::Shape shape, compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return BindImpl(std::move(expr), in, shape, &exec_context);
+ }
+
+ if (expr.literal()) return expr;
+
+ if (auto ref = expr.field_ref()) {
+ if (ref->IsNested()) {
+ return Status::NotImplemented("nested field references");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto path, ref->FindOne(in));
+
+ auto bound = *expr.parameter();
+ bound.index = path[0];
+ ARROW_ASSIGN_OR_RAISE(auto field, path.Get(in));
+ bound.descr.type = field->type();
+ bound.descr.shape = shape;
+ return Expression{std::move(bound)};
+ }
+
+ auto call = *CallNotNull(expr);
+ for (auto& argument : call.arguments) {
+ ARROW_ASSIGN_OR_RAISE(argument,
+ BindImpl(std::move(argument), in, shape, exec_context));
+ }
+ return BindNonRecursive(std::move(call),
+ /*insert_implicit_casts=*/true, exec_context);
+}
+
+} // namespace
+
+Result<Expression> Expression::Bind(const ValueDescr& in,
+ compute::ExecContext* exec_context) const {
+ return BindImpl(*this, *in.type, in.shape, exec_context);
+}
+
+Result<Expression> Expression::Bind(const Schema& in_schema,
+ compute::ExecContext* exec_context) const {
+ return BindImpl(*this, in_schema, ValueDescr::ARRAY, exec_context);
+}
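+
+// Sketch of binding against a schema (illustrative; `schm` is a hypothetical
+// name, and dispatch may insert implicit casts around the arguments):
+//
+//   auto schm = schema({field("a", int32())});
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto bound, call("add", {field_ref("a"), literal(3)}).Bind(*schm));
+//   DCHECK(bound.IsBound());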
+
+Result<ExecBatch> MakeExecBatch(const Schema& full_schema, const Datum& partial) {
+ ExecBatch out;
+
+ if (partial.kind() == Datum::RECORD_BATCH) {
+ const auto& partial_batch = *partial.record_batch();
+ out.length = partial_batch.num_rows();
+
+ for (const auto& field : full_schema.fields()) {
+ ARROW_ASSIGN_OR_RAISE(auto column,
+ FieldRef(field->name()).GetOneOrNone(partial_batch));
+
+ if (column) {
+ if (!column->type()->Equals(field->type())) {
+ // Referenced field was present but didn't have the expected type.
+ // This *should* be handled by readers, and will just be an error in the future.
+ ARROW_ASSIGN_OR_RAISE(
+ auto converted,
+ compute::Cast(column, field->type(), compute::CastOptions::Safe()));
+ column = converted.make_array();
+ }
+ out.values.emplace_back(std::move(column));
+ } else {
+ out.values.emplace_back(MakeNullScalar(field->type()));
+ }
+ }
+ return out;
+ }
+
+ // wasteful but useful for testing:
+ if (partial.type()->id() == Type::STRUCT) {
+ if (partial.is_array()) {
+ ARROW_ASSIGN_OR_RAISE(auto partial_batch,
+ RecordBatch::FromStructArray(partial.make_array()));
+
+ return MakeExecBatch(full_schema, partial_batch);
+ }
+
+ if (partial.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto partial_array,
+ MakeArrayFromScalar(*partial.scalar(), 1));
+ ARROW_ASSIGN_OR_RAISE(auto out, MakeExecBatch(full_schema, partial_array));
+
+ for (Datum& value : out.values) {
+ if (value.is_scalar()) continue;
+ ARROW_ASSIGN_OR_RAISE(value, value.make_array()->GetScalar(0));
+ }
+ return out;
+ }
+ }
+
+ return Status::NotImplemented("MakeExecBatch from ", PrintDatum(partial));
+}
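+
+// For example (illustrative): with full_schema = {a: int32, b: utf8} and a
+// partial RecordBatch carrying only column "a", the resulting ExecBatch holds
+// the "a" array followed by MakeNullScalar(utf8()) standing in for "b".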
+
+Result<Datum> ExecuteScalarExpression(const Expression& expr, const Schema& full_schema,
+ const Datum& partial_input,
+ compute::ExecContext* exec_context) {
+ ARROW_ASSIGN_OR_RAISE(auto input, MakeExecBatch(full_schema, partial_input));
+ return ExecuteScalarExpression(expr, input, exec_context);
+}
+
+Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& input,
+ compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return ExecuteScalarExpression(expr, input, &exec_context);
+ }
+
+ if (!expr.IsBound()) {
+ return Status::Invalid("Cannot Execute unbound expression.");
+ }
+
+ if (!expr.IsScalarExpression()) {
+ return Status::Invalid(
+ "ExecuteScalarExpression cannot Execute non-scalar expression ", expr.ToString());
+ }
+
+ if (auto lit = expr.literal()) return *lit;
+
+ if (auto param = expr.parameter()) {
+ if (param->descr.type->id() == Type::NA) {
+ return MakeNullScalar(null());
+ }
+
+ const Datum& field = input[param->index];
+ if (!field.type()->Equals(param->descr.type)) {
+ return Status::Invalid("Referenced field ", expr.ToString(), " was ",
+ field.type()->ToString(), " but should have been ",
+ param->descr.type->ToString());
+ }
+
+ return field;
+ }
+
+ auto call = CallNotNull(expr);
+
+ std::vector<Datum> arguments(call->arguments.size());
+ for (size_t i = 0; i < arguments.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context));
+ }
+
+ auto executor = compute::detail::KernelExecutor::MakeScalar();
+
+ compute::KernelContext kernel_context(exec_context);
+ kernel_context.SetState(call->kernel_state.get());
+
+ auto kernel = call->kernel;
+ auto descrs = GetDescriptors(arguments);
+ auto options = call->options.get();
+ RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, descrs, options}));
+
+ auto listener = std::make_shared<compute::detail::DatumAccumulator>();
+ RETURN_NOT_OK(executor->Execute(arguments, listener.get()));
+ return executor->WrapResults(arguments, listener->values());
+}
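+
+// End-to-end sketch (illustrative; `schm` and `batch` are hypothetical names):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto bound, expr.Bind(*schm));
+//   ARROW_ASSIGN_OR_RAISE(auto input, MakeExecBatch(*schm, batch));
+//   ARROW_ASSIGN_OR_RAISE(Datum result, ExecuteScalarExpression(bound, input));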
+
+namespace {
+
+std::array<std::pair<const Expression&, const Expression&>, 2>
+ArgumentsAndFlippedArguments(const Expression::Call& call) {
+ DCHECK_EQ(call.arguments.size(), 2);
+ return {std::pair<const Expression&, const Expression&>{call.arguments[0],
+ call.arguments[1]},
+ std::pair<const Expression&, const Expression&>{call.arguments[1],
+ call.arguments[0]}};
+}
+
+template <typename BinOp, typename It,
+ typename Out = typename std::iterator_traits<It>::value_type>
+util::optional<Out> FoldLeft(It begin, It end, const BinOp& bin_op) {
+ if (begin == end) return util::nullopt;
+
+ Out folded = std::move(*begin++);
+ while (begin != end) {
+ folded = bin_op(std::move(folded), std::move(*begin++));
+ }
+ return folded;
+}
+
+util::optional<compute::NullHandling::type> GetNullHandling(
+ const Expression::Call& call) {
+ if (call.function && call.function->kind() == compute::Function::SCALAR) {
+ return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
+ }
+ return util::nullopt;
+}
+
+} // namespace
+
+std::vector<FieldRef> FieldsInExpression(const Expression& expr) {
+ if (expr.literal()) return {};
+
+ if (auto ref = expr.field_ref()) {
+ return {*ref};
+ }
+
+ std::vector<FieldRef> fields;
+ for (const Expression& arg : CallNotNull(expr)->arguments) {
+ auto argument_fields = FieldsInExpression(arg);
+ std::move(argument_fields.begin(), argument_fields.end(), std::back_inserter(fields));
+ }
+ return fields;
+}
+
+bool ExpressionHasFieldRefs(const Expression& expr) {
+ if (expr.literal()) return false;
+
+ if (expr.field_ref()) return true;
+
+ for (const Expression& arg : CallNotNull(expr)->arguments) {
+ if (ExpressionHasFieldRefs(arg)) return true;
+ }
+ return false;
+}
+
+Result<Expression> FoldConstants(Expression expr) {
+ return Modify(
+ std::move(expr), [](Expression expr) { return expr; },
+ [](Expression expr, ...) -> Result<Expression> {
+ auto call = CallNotNull(expr);
+ if (std::all_of(call->arguments.begin(), call->arguments.end(),
+ [](const Expression& argument) { return argument.literal(); })) {
+ // all arguments are literal; we can evaluate this subexpression *now*
+ static const ExecBatch ignored_input = ExecBatch{};
+ ARROW_ASSIGN_OR_RAISE(Datum constant,
+ ExecuteScalarExpression(expr, ignored_input));
+
+ return literal(std::move(constant));
+ }
+
+ // XXX the following should probably be in a registry of passes instead
+ // of inline
+
+ if (GetNullHandling(*call) == compute::NullHandling::INTERSECTION) {
+ // kernels which always produce intersected validity can be resolved
+ // to null *now* if any of their inputs is a null literal
+ for (const auto& argument : call->arguments) {
+ if (argument.IsNullLiteral()) {
+ return argument;
+ }
+ }
+ }
+
+ if (call->function_name == "and_kleene") {
+ for (auto args : ArgumentsAndFlippedArguments(*call)) {
+ // true and x == x
+ if (args.first == literal(true)) return args.second;
+
+ // false and x == false
+ if (args.first == literal(false)) return args.first;
+
+ // x and x == x
+ if (args.first == args.second) return args.first;
+ }
+ return expr;
+ }
+
+ if (call->function_name == "or_kleene") {
+ for (auto args : ArgumentsAndFlippedArguments(*call)) {
+ // false or x == x
+ if (args.first == literal(false)) return args.second;
+
+ // true or x == true
+ if (args.first == literal(true)) return args.first;
+
+ // x or x == x
+ if (args.first == args.second) return args.first;
+ }
+ return expr;
+ }
+
+ return expr;
+ });
+}
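+
+// Examples of folds performed above (for bound expressions, illustrative):
+//
+//   add(literal(1), literal(2))   -> literal(3)   (all-literal call, evaluated now)
+//   add(null, x)                  -> null         (INTERSECTION null handling)
+//   and_kleene(literal(true), x)  -> x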
+
+namespace {
+
+std::vector<Expression> GuaranteeConjunctionMembers(
+ const Expression& guaranteed_true_predicate) {
+ auto guarantee = guaranteed_true_predicate.call();
+ if (!guarantee || guarantee->function_name != "and_kleene") {
+ return {guaranteed_true_predicate};
+ }
+ return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
+}
+
+// Conjunction members which are represented in known_values are erased from
+// conjunction_members
+Status ExtractKnownFieldValuesImpl(
+ std::vector<Expression>* conjunction_members,
+ std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
+ auto unconsumed_end =
+ std::partition(conjunction_members->begin(), conjunction_members->end(),
+ [](const Expression& expr) {
+                       // search for equality conditions between a field and a literal
+ auto call = expr.call();
+ if (!call) return true;
+
+ if (call->function_name == "equal") {
+ auto ref = call->arguments[0].field_ref();
+ auto lit = call->arguments[1].literal();
+ return !(ref && lit);
+ }
+
+ if (call->function_name == "is_null") {
+ auto ref = call->arguments[0].field_ref();
+ return !ref;
+ }
+
+ return true;
+ });
+
+ for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
+ auto call = CallNotNull(*it);
+
+ if (call->function_name == "equal") {
+ auto ref = call->arguments[0].field_ref();
+ auto lit = call->arguments[1].literal();
+ known_values->emplace(*ref, *lit);
+ } else if (call->function_name == "is_null") {
+ auto ref = call->arguments[0].field_ref();
+ known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
+ }
+ }
+
+ conjunction_members->erase(unconsumed_end, conjunction_members->end());
+
+ return Status::OK();
+}
+
+} // namespace
+
+Result<KnownFieldValues> ExtractKnownFieldValues(
+ const Expression& guaranteed_true_predicate) {
+ auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
+ KnownFieldValues known_values;
+ RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
+ return known_values;
+}
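+
+// For example (illustrative), the guarantee
+//   and_(equal(field_ref("x"), literal(3)), greater(field_ref("y"), literal(0)))
+// yields a map holding {"x": 3}; the "y" comparison is not an equality or
+// is_null condition, so it is left to other simplification passes.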
+
+Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
+ Expression expr) {
+ if (!expr.IsBound()) {
+ return Status::Invalid(
+ "ReplaceFieldsWithKnownValues called on an unbound Expression");
+ }
+
+ return Modify(
+ std::move(expr),
+ [&known_values](Expression expr) -> Result<Expression> {
+ if (auto ref = expr.field_ref()) {
+ auto it = known_values.map.find(*ref);
+ if (it != known_values.map.end()) {
+ Datum lit = it->second;
+ if (lit.descr() == expr.descr()) return literal(std::move(lit));
+ // type mismatch, try casting the known value to the correct type
+
+ if (expr.type()->id() == Type::DICTIONARY &&
+ lit.type()->id() != Type::DICTIONARY) {
+ // the known value must be dictionary encoded
+
+ const auto& dict_type = checked_cast<const DictionaryType&>(*expr.type());
+ if (!lit.type()->Equals(dict_type.value_type())) {
+ ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, dict_type.value_type()));
+ }
+
+ if (lit.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto dictionary,
+ MakeArrayFromScalar(*lit.scalar(), 1));
+
+ lit = Datum{DictionaryScalar::Make(MakeScalar<int32_t>(0),
+ std::move(dictionary))};
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type()));
+ return literal(std::move(lit));
+ }
+ }
+ return expr;
+ },
+ [](Expression expr, ...) { return expr; });
+}
+
+namespace {
+
+bool IsBinaryAssociativeCommutative(const Expression::Call& call) {
+ static std::unordered_set<std::string> binary_associative_commutative{
+ "and", "or", "and_kleene", "or_kleene", "xor",
+ "multiply", "add", "multiply_checked", "add_checked"};
+
+ auto it = binary_associative_commutative.find(call.function_name);
+ return it != binary_associative_commutative.end();
+}
+
+} // namespace
+
+Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return Canonicalize(std::move(expr), &exec_context);
+ }
+
+  // When a pass may reconstruct expressions more deeply than a call's immediate
+  // arguments (for example, when reorganizing an associative chain), add the
+  // already-canonicalized expressions to this set to avoid redundant work
+ struct {
+ std::unordered_set<Expression, Expression::Hash> set_;
+
+ bool operator()(const Expression& expr) const {
+ return set_.find(expr) != set_.end();
+ }
+
+ void Add(std::vector<Expression> exprs) {
+ std::move(exprs.begin(), exprs.end(), std::inserter(set_, set_.end()));
+ }
+ } AlreadyCanonicalized;
+
+ return Modify(
+ std::move(expr),
+ [&AlreadyCanonicalized, exec_context](Expression expr) -> Result<Expression> {
+ auto call = expr.call();
+ if (!call) return expr;
+
+ if (AlreadyCanonicalized(expr)) return expr;
+
+ if (IsBinaryAssociativeCommutative(*call)) {
+ struct {
+ int Priority(const Expression& operand) const {
+ // order literals first, starting with nulls
+ if (operand.IsNullLiteral()) return 0;
+ if (operand.literal()) return 1;
+ return 2;
+ }
+ bool operator()(const Expression& l, const Expression& r) const {
+ return Priority(l) < Priority(r);
+ }
+ } CanonicalOrdering;
+
+ FlattenedAssociativeChain chain(expr);
+ if (chain.was_left_folded &&
+ std::is_sorted(chain.fringe.begin(), chain.fringe.end(),
+ CanonicalOrdering)) {
+ AlreadyCanonicalized.Add(std::move(chain.exprs));
+ return expr;
+ }
+
+ std::stable_sort(chain.fringe.begin(), chain.fringe.end(), CanonicalOrdering);
+
+ // fold the chain back up
+ auto folded =
+ FoldLeft(chain.fringe.begin(), chain.fringe.end(),
+ [call, &AlreadyCanonicalized](Expression l, Expression r) {
+ auto canonicalized_call = *call;
+ canonicalized_call.arguments = {std::move(l), std::move(r)};
+ Expression expr(std::move(canonicalized_call));
+ AlreadyCanonicalized.Add({expr});
+ return expr;
+ });
+ return std::move(*folded);
+ }
+
+ if (auto cmp = Comparison::Get(call->function_name)) {
+ if (call->arguments[0].literal() && !call->arguments[1].literal()) {
+ // ensure that literals are on comparisons' RHS
+ auto flipped_call = *call;
+
+ std::swap(flipped_call.arguments[0], flipped_call.arguments[1]);
+ flipped_call.function_name =
+ Comparison::GetName(Comparison::GetFlipped(*cmp));
+
+ return BindNonRecursive(flipped_call,
+ /*insert_implicit_casts=*/false, exec_context);
+ }
+ }
+
+ return expr;
+ },
+ [](Expression expr, ...) { return expr; });
+}
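+
+// Examples of the canonicalizations above (for bound expressions, illustrative):
+//
+//   less(literal(3), field_ref("x"))  -> greater(field_ref("x"), literal(3))
+//   add(x, literal(1))                -> add(literal(1), x)   (literals sort first)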
+
+namespace {
+
+Result<Expression> DirectComparisonSimplification(Expression expr,
+ const Expression::Call& guarantee) {
+ return Modify(
+ std::move(expr), [](Expression expr) { return expr; },
+ [&guarantee](Expression expr, ...) -> Result<Expression> {
+ auto call = expr.call();
+ if (!call) return expr;
+
+ // Ensure both calls are comparisons with equal LHS and scalar RHS
+ auto cmp = Comparison::Get(expr);
+ auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+
+ if (!cmp) return expr;
+ if (!cmp_guarantee) return expr;
+
+ const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+ const auto& guarantee_lhs = guarantee.arguments[0];
+ if (lhs != guarantee_lhs) return expr;
+
+ auto rhs = call->arguments[1].literal();
+ auto guarantee_rhs = guarantee.arguments[1].literal();
+
+ if (!rhs) return expr;
+ if (!rhs->is_scalar()) return expr;
+
+ if (!guarantee_rhs) return expr;
+ if (!guarantee_rhs->is_scalar()) return expr;
+
+ ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
+ Comparison::Execute(*rhs, *guarantee_rhs));
+ DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+
+ if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
+ // RHS of filter is equal to RHS of guarantee
+
+ if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
+ // guarantee is a subset of filter, so all data will be included
+ // x > 1, x >= 1, x != 1 guaranteed by x > 1
+ return literal(true);
+ }
+
+ if ((*cmp & *cmp_guarantee) == 0) {
+ // guarantee disjoint with filter, so all data will be excluded
+ // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+ return literal(false);
+ }
+
+ return expr;
+ }
+
+ if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
+ // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
+ return expr;
+ }
+
+ if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
+ // x > 1, x >= 1, x != 1 guaranteed by x >= 3
+ return literal(true);
+ } else {
+ // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
+ return literal(false);
+ }
+ });
+}
+
+} // namespace
+
+Result<Expression> SimplifyWithGuarantee(Expression expr,
+ const Expression& guaranteed_true_predicate) {
+ auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
+
+ KnownFieldValues known_values;
+ RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
+
+ ARROW_ASSIGN_OR_RAISE(expr,
+ ReplaceFieldsWithKnownValues(known_values, std::move(expr)));
+
+ auto CanonicalizeAndFoldConstants = [&expr] {
+ ARROW_ASSIGN_OR_RAISE(expr, Canonicalize(std::move(expr)));
+ ARROW_ASSIGN_OR_RAISE(expr, FoldConstants(std::move(expr)));
+ return Status::OK();
+ };
+ RETURN_NOT_OK(CanonicalizeAndFoldConstants());
+
+ for (const auto& guarantee : conjunction_members) {
+ if (Comparison::Get(guarantee) && guarantee.call()->arguments[1].literal()) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto simplified, DirectComparisonSimplification(expr, *CallNotNull(guarantee)));
+
+ if (Identical(simplified, expr)) continue;
+
+ expr = std::move(simplified);
+ RETURN_NOT_OK(CanonicalizeAndFoldConstants());
+ }
+ }
+
+ return expr;
+}
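+
+// Examples of simplification against a guarantee (illustrative):
+//
+//   filter x > 0 with guarantee x > 5   -> literal(true)   (filter implied)
+//   filter x == 0 with guarantee x > 5  -> literal(false)  (filter unsatisfiable)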
+
+// Serialization is accomplished by converting expressions to KeyValueMetadata and storing
+// this in the schema of a RecordBatch. Embedded arrays and scalars are stored in its
+// columns. Finally, the RecordBatch is written to an IPC file.
+Result<std::shared_ptr<Buffer>> Serialize(const Expression& expr) {
+ struct {
+ std::shared_ptr<KeyValueMetadata> metadata_ = std::make_shared<KeyValueMetadata>();
+ ArrayVector columns_;
+
+ Result<std::string> AddScalar(const Scalar& scalar) {
+ auto ret = columns_.size();
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(scalar, 1));
+ columns_.push_back(std::move(array));
+ return std::to_string(ret);
+ }
+
+ Status Visit(const Expression& expr) {
+ if (auto lit = expr.literal()) {
+ if (!lit->is_scalar()) {
+ return Status::NotImplemented("Serialization of non-scalar literals");
+ }
+ ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*lit->scalar()));
+ metadata_->Append("literal", std::move(value));
+ return Status::OK();
+ }
+
+ if (auto ref = expr.field_ref()) {
+ if (!ref->name()) {
+ return Status::NotImplemented("Serialization of non-name field_refs");
+ }
+ metadata_->Append("field_ref", *ref->name());
+ return Status::OK();
+ }
+
+ auto call = CallNotNull(expr);
+ metadata_->Append("call", call->function_name);
+
+ for (const auto& argument : call->arguments) {
+ RETURN_NOT_OK(Visit(argument));
+ }
+
+ if (call->options) {
+ ARROW_ASSIGN_OR_RAISE(auto options_scalar,
+ internal::FunctionOptionsToStructScalar(*call->options));
+ ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*options_scalar));
+ metadata_->Append("options", std::move(value));
+ }
+
+ metadata_->Append("end", call->function_name);
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<RecordBatch>> operator()(const Expression& expr) {
+ RETURN_NOT_OK(Visit(expr));
+ FieldVector fields(columns_.size());
+ for (size_t i = 0; i < fields.size(); ++i) {
+ fields[i] = field("", columns_[i]->type());
+ }
+ return RecordBatch::Make(schema(std::move(fields), std::move(metadata_)), 1,
+ std::move(columns_));
+ }
+ } ToRecordBatch;
+
+ ARROW_ASSIGN_OR_RAISE(auto batch, ToRecordBatch(expr));
+ ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+ ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
+ RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+ RETURN_NOT_OK(writer->Close());
+ return stream->Finish();
+}
+
+Result<Expression> Deserialize(std::shared_ptr<Buffer> buffer) {
+ io::BufferReader stream(std::move(buffer));
+ ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
+ ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
+ if (batch->schema()->metadata() == nullptr) {
+ return Status::Invalid("serialized Expression's batch repr had null metadata");
+ }
+ if (batch->num_rows() != 1) {
+ return Status::Invalid(
+ "serialized Expression's batch repr was not a single row - had ",
+ batch->num_rows());
+ }
+
+ struct FromRecordBatch {
+ const RecordBatch& batch_;
+ int index_;
+
+ const KeyValueMetadata& metadata() { return *batch_.schema()->metadata(); }
+
+ Result<std::shared_ptr<Scalar>> GetScalar(const std::string& i) {
+ int32_t column_index;
+ if (!::arrow::internal::ParseValue<Int32Type>(i.data(), i.length(),
+ &column_index)) {
+ return Status::Invalid("Couldn't parse column_index");
+ }
+ if (column_index >= batch_.num_columns()) {
+ return Status::Invalid("column_index out of bounds");
+ }
+ return batch_.column(column_index)->GetScalar(0);
+ }
+
+ Result<Expression> GetOne() {
+ if (index_ >= metadata().size()) {
+ return Status::Invalid("unterminated serialized Expression");
+ }
+
+ const std::string& key = metadata().key(index_);
+ const std::string& value = metadata().value(index_);
+ ++index_;
+
+ if (key == "literal") {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, GetScalar(value));
+ return literal(std::move(scalar));
+ }
+
+ if (key == "field_ref") {
+ return field_ref(value);
+ }
+
+ if (key != "call") {
+ return Status::Invalid("Unrecognized serialized Expression key ", key);
+ }
+
+ std::vector<Expression> arguments;
+ while (metadata().key(index_) != "end") {
+ if (metadata().key(index_) == "options") {
+ ARROW_ASSIGN_OR_RAISE(auto options_scalar, GetScalar(metadata().value(index_)));
+ std::shared_ptr<compute::FunctionOptions> options;
+ if (options_scalar) {
+ ARROW_ASSIGN_OR_RAISE(
+ options, internal::FunctionOptionsFromStructScalar(
+ checked_cast<const StructScalar&>(*options_scalar)));
+ }
+ auto expr = call(value, std::move(arguments), std::move(options));
+ index_ += 2;
+ return expr;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto argument, GetOne());
+ arguments.push_back(std::move(argument));
+ }
+
+ ++index_;
+ return call(value, std::move(arguments));
+ }
+ };
+
+ return FromRecordBatch{*batch, 0}.GetOne();
+}
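+
+// Round-trip sketch (illustrative):
+//
+//   auto expr = less(field_ref("x"), literal(3));
+//   ARROW_ASSIGN_OR_RAISE(auto buf, Serialize(expr));
+//   ARROW_ASSIGN_OR_RAISE(auto readback, Deserialize(buf));
+//   DCHECK(expr.Equals(readback));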
+
+Expression project(std::vector<Expression> values, std::vector<std::string> names) {
+ return call("make_struct", std::move(values),
+ compute::MakeStructOptions{std::move(names)});
+}
+
+Expression equal(Expression lhs, Expression rhs) {
+ return call("equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression not_equal(Expression lhs, Expression rhs) {
+ return call("not_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression less(Expression lhs, Expression rhs) {
+ return call("less", {std::move(lhs), std::move(rhs)});
+}
+
+Expression less_equal(Expression lhs, Expression rhs) {
+ return call("less_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression greater(Expression lhs, Expression rhs) {
+ return call("greater", {std::move(lhs), std::move(rhs)});
+}
+
+Expression greater_equal(Expression lhs, Expression rhs) {
+ return call("greater_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); }
+
+Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); }
+
+Expression and_(Expression lhs, Expression rhs) {
+ return call("and_kleene", {std::move(lhs), std::move(rhs)});
+}
+
+Expression and_(const std::vector<Expression>& operands) {
+ auto folded = FoldLeft<Expression(Expression, Expression)>(operands.begin(),
+ operands.end(), and_);
+ if (folded) {
+ return std::move(*folded);
+ }
+ return literal(true);
+}
+
+Expression or_(Expression lhs, Expression rhs) {
+ return call("or_kleene", {std::move(lhs), std::move(rhs)});
+}
+
+Expression or_(const std::vector<Expression>& operands) {
+ auto folded =
+ FoldLeft<Expression(Expression, Expression)>(operands.begin(), operands.end(), or_);
+ if (folded) {
+ return std::move(*folded);
+ }
+ return literal(false);
+}
+
+Expression not_(Expression operand) { return call("invert", {std::move(operand)}); }
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
index 3810accf70a..5ae95532c2b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
@@ -1,269 +1,269 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// This API is EXPERIMENTAL.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/compute/type_fwd.h"
-#include "arrow/datum.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/variant.h"
-
-namespace arrow {
-namespace compute {
-
-/// An unbound expression which maps a single Datum to another Datum.
-/// An expression is one of
-/// - A literal Datum.
-/// - A reference to a single (potentially nested) field of the input Datum.
-/// - A call to a compute function, with arguments specified by other Expressions.
-class ARROW_EXPORT Expression {
- public:
- struct Call {
- std::string function_name;
- std::vector<Expression> arguments;
- std::shared_ptr<FunctionOptions> options;
- // Cached hash value
- size_t hash;
-
- // post-Bind properties:
- std::shared_ptr<Function> function;
- const Kernel* kernel = NULLPTR;
- std::shared_ptr<KernelState> kernel_state;
- ValueDescr descr;
-
- void ComputeHash();
- };
-
- std::string ToString() const;
- bool Equals(const Expression& other) const;
- size_t hash() const;
- struct Hash {
- size_t operator()(const Expression& expr) const { return expr.hash(); }
- };
-
- /// Bind this expression to the given input type, looking up Kernels and field types.
- /// Some expression simplification may be performed and implicit casts will be inserted.
- /// Any state necessary for execution will be initialized and returned.
- Result<Expression> Bind(const ValueDescr& in, ExecContext* = NULLPTR) const;
- Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
-
- // XXX someday
- // Clone all KernelState in this bound expression. If any function referenced by this
- // expression has mutable KernelState, it is not safe to execute or apply simplification
- // passes to it (or copies of it!) from multiple threads. Cloning state produces new
- // KernelStates where necessary to ensure that Expressions may be manipulated safely
- // on multiple threads.
- // Result<ExpressionState> CloneState() const;
- // Status SetState(ExpressionState);
-
-  /// Return true if all of an expression's field references have an explicit
-  /// ValueDescr and all of its functions' kernels have been looked up.
- bool IsBound() const;
-
- /// Return true if this expression is composed only of Scalar literals, field
- /// references, and calls to ScalarFunctions.
- bool IsScalarExpression() const;
-
- /// Return true if this expression is literal and entirely null.
- bool IsNullLiteral() const;
-
- /// Return true if this expression could evaluate to true.
- bool IsSatisfiable() const;
-
- // XXX someday
- // Result<PipelineGraph> GetPipelines();
-
- /// Access a Call or return nullptr if this expression is not a call
- const Call* call() const;
- /// Access a Datum or return nullptr if this expression is not a literal
- const Datum* literal() const;
- /// Access a FieldRef or return nullptr if this expression is not a field_ref
- const FieldRef* field_ref() const;
-
- /// The type and shape to which this expression will evaluate
- ValueDescr descr() const;
- std::shared_ptr<DataType> type() const { return descr().type; }
- // XXX someday
- // NullGeneralization::type nullable() const;
-
- struct Parameter {
- FieldRef ref;
-
- // post-bind properties
- ValueDescr descr;
- int index;
- };
- const Parameter* parameter() const;
-
- Expression() = default;
- explicit Expression(Call call);
- explicit Expression(Datum literal);
- explicit Expression(Parameter parameter);
-
- private:
- using Impl = util::Variant<Datum, Parameter, Call>;
- std::shared_ptr<Impl> impl_;
-
- ARROW_EXPORT friend bool Identical(const Expression& l, const Expression& r);
-
- ARROW_EXPORT friend void PrintTo(const Expression&, std::ostream*);
-};
-
-inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
-inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
-
-// Factories
-
-ARROW_EXPORT
-Expression literal(Datum lit);
-
-template <typename Arg>
-Expression literal(Arg&& arg) {
- return literal(Datum(std::forward<Arg>(arg)));
-}
-
-ARROW_EXPORT
-Expression field_ref(FieldRef ref);
-
-ARROW_EXPORT
-Expression call(std::string function, std::vector<Expression> arguments,
- std::shared_ptr<FunctionOptions> options = NULLPTR);
-
-template <typename Options, typename = typename std::enable_if<
- std::is_base_of<FunctionOptions, Options>::value>::type>
-Expression call(std::string function, std::vector<Expression> arguments,
- Options options) {
- return call(std::move(function), std::move(arguments),
- std::make_shared<Options>(std::move(options)));
-}
-
-/// Assemble a list of all fields referenced by an Expression at any depth.
-ARROW_EXPORT
-std::vector<FieldRef> FieldsInExpression(const Expression&);
-
-/// Check if the expression references any fields.
-ARROW_EXPORT
-bool ExpressionHasFieldRefs(const Expression&);
-
-/// Assemble a mapping from field references to known values.
-struct ARROW_EXPORT KnownFieldValues;
-ARROW_EXPORT
-Result<KnownFieldValues> ExtractKnownFieldValues(
- const Expression& guaranteed_true_predicate);
-
-/// \defgroup expression-passes Functions for modification of Expressions
-///
-/// @{
-///
-/// These transform bound expressions. Some transforms utilize a guarantee, which is
-/// provided as an Expression which is guaranteed to evaluate to true. The
-/// guaranteed_true_predicate need not be bound, but canonicalization is currently
-/// deferred to producers of guarantees. For example in order to be recognized as a
-/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS
-/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or
-/// other semantically identical Expressions will not be recognized.
-
-/// Weak canonicalization which establishes guarantees for subsequent passes. Even
-/// equivalent Expressions may result in different canonicalized expressions.
-/// TODO this could be a strong canonicalization
-ARROW_EXPORT
-Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
-
-/// Simplify Expressions based on literal arguments (for example, add(null, x) will always
-/// be null so replace the call with a null literal). Includes early evaluation of all
-/// calls whose arguments are entirely literal.
-ARROW_EXPORT
-Result<Expression> FoldConstants(Expression);
-
-/// Simplify Expressions by replacing with known values of the fields which it references.
-ARROW_EXPORT
-Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
- Expression);
-
-/// Simplify an expression by replacing subexpressions based on a guarantee:
-/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
-/// used to remove redundant function calls from a filter expression or to replace a
-/// reference to a constant-value field with a literal.
-ARROW_EXPORT
-Result<Expression> SimplifyWithGuarantee(Expression,
- const Expression& guaranteed_true_predicate);
-
-/// @}
-
-// Execution
-
-/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
-/// RecordBatch which may have missing or incorrectly ordered columns.
-/// Missing fields will be replaced with null scalars.
-ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
- const Datum& partial);
-
-/// Execute a scalar expression against the provided state and input ExecBatch. This
-/// expression must be bound.
-ARROW_EXPORT
-Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
- ExecContext* = NULLPTR);
-
-/// Convenience function for invoking against a RecordBatch
-ARROW_EXPORT
-Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
- const Datum& partial_input, ExecContext* = NULLPTR);
-
-// Serialization
-
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
-
-ARROW_EXPORT
-Result<Expression> Deserialize(std::shared_ptr<Buffer>);
-
-// Convenience aliases for factories
-
-ARROW_EXPORT Expression project(std::vector<Expression> values,
- std::vector<std::string> names);
-
-ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression is_null(Expression lhs);
-
-ARROW_EXPORT Expression is_valid(Expression lhs);
-
-ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
-ARROW_EXPORT Expression and_(const std::vector<Expression>&);
-ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
-ARROW_EXPORT Expression or_(const std::vector<Expression>&);
-ARROW_EXPORT Expression not_(Expression operand);
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/variant.h"
+
+namespace arrow {
+namespace compute {
+
+/// An unbound expression which maps a single Datum to another Datum.
+/// An expression is one of
+/// - A literal Datum.
+/// - A reference to a single (potentially nested) field of the input Datum.
+/// - A call to a compute function, with arguments specified by other Expressions.
+class ARROW_EXPORT Expression {
+ public:
+ struct Call {
+ std::string function_name;
+ std::vector<Expression> arguments;
+ std::shared_ptr<FunctionOptions> options;
+ // Cached hash value
+ size_t hash;
+
+ // post-Bind properties:
+ std::shared_ptr<Function> function;
+ const Kernel* kernel = NULLPTR;
+ std::shared_ptr<KernelState> kernel_state;
+ ValueDescr descr;
+
+ void ComputeHash();
+ };
+
+ std::string ToString() const;
+ bool Equals(const Expression& other) const;
+ size_t hash() const;
+ struct Hash {
+ size_t operator()(const Expression& expr) const { return expr.hash(); }
+ };
+
+ /// Bind this expression to the given input type, looking up Kernels and field types.
+ /// Some expression simplification may be performed and implicit casts will be inserted.
+ /// Any state necessary for execution will be initialized and returned.
+ Result<Expression> Bind(const ValueDescr& in, ExecContext* = NULLPTR) const;
+ Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
+
+ // XXX someday
+ // Clone all KernelState in this bound expression. If any function referenced by this
+ // expression has mutable KernelState, it is not safe to execute or apply simplification
+ // passes to it (or copies of it!) from multiple threads. Cloning state produces new
+ // KernelStates where necessary to ensure that Expressions may be manipulated safely
+ // on multiple threads.
+ // Result<ExpressionState> CloneState() const;
+ // Status SetState(ExpressionState);
+
+  /// Return true if all of this expression's field references have an explicit
+  /// ValueDescr and all of its functions' kernels are looked up.
+ bool IsBound() const;
+
+ /// Return true if this expression is composed only of Scalar literals, field
+ /// references, and calls to ScalarFunctions.
+ bool IsScalarExpression() const;
+
+ /// Return true if this expression is literal and entirely null.
+ bool IsNullLiteral() const;
+
+ /// Return true if this expression could evaluate to true.
+ bool IsSatisfiable() const;
+
+ // XXX someday
+ // Result<PipelineGraph> GetPipelines();
+
+ /// Access a Call or return nullptr if this expression is not a call
+ const Call* call() const;
+ /// Access a Datum or return nullptr if this expression is not a literal
+ const Datum* literal() const;
+ /// Access a FieldRef or return nullptr if this expression is not a field_ref
+ const FieldRef* field_ref() const;
+
+ /// The type and shape to which this expression will evaluate
+ ValueDescr descr() const;
+ std::shared_ptr<DataType> type() const { return descr().type; }
+ // XXX someday
+ // NullGeneralization::type nullable() const;
+
+ struct Parameter {
+ FieldRef ref;
+
+ // post-bind properties
+ ValueDescr descr;
+ int index;
+ };
+ const Parameter* parameter() const;
+
+ Expression() = default;
+ explicit Expression(Call call);
+ explicit Expression(Datum literal);
+ explicit Expression(Parameter parameter);
+
+ private:
+ using Impl = util::Variant<Datum, Parameter, Call>;
+ std::shared_ptr<Impl> impl_;
+
+ ARROW_EXPORT friend bool Identical(const Expression& l, const Expression& r);
+
+ ARROW_EXPORT friend void PrintTo(const Expression&, std::ostream*);
+};
+
+inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
+inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
+
+// Factories
+
+ARROW_EXPORT
+Expression literal(Datum lit);
+
+template <typename Arg>
+Expression literal(Arg&& arg) {
+ return literal(Datum(std::forward<Arg>(arg)));
+}
+
+ARROW_EXPORT
+Expression field_ref(FieldRef ref);
+
+ARROW_EXPORT
+Expression call(std::string function, std::vector<Expression> arguments,
+ std::shared_ptr<FunctionOptions> options = NULLPTR);
+
+template <typename Options, typename = typename std::enable_if<
+ std::is_base_of<FunctionOptions, Options>::value>::type>
+Expression call(std::string function, std::vector<Expression> arguments,
+ Options options) {
+ return call(std::move(function), std::move(arguments),
+ std::make_shared<Options>(std::move(options)));
+}
+
+/// Assemble a list of all fields referenced by an Expression at any depth.
+ARROW_EXPORT
+std::vector<FieldRef> FieldsInExpression(const Expression&);
+
+/// Check if the expression references any fields.
+ARROW_EXPORT
+bool ExpressionHasFieldRefs(const Expression&);
+
+/// Assemble a mapping from field references to known values.
+struct ARROW_EXPORT KnownFieldValues;
+ARROW_EXPORT
+Result<KnownFieldValues> ExtractKnownFieldValues(
+ const Expression& guaranteed_true_predicate);
+
+/// \defgroup expression-passes Functions for modification of Expressions
+///
+/// @{
+///
+/// These transform bound expressions. Some transforms utilize a guarantee, which is
+/// provided as an Expression which is guaranteed to evaluate to true. The
+/// guaranteed_true_predicate need not be bound, but canonicalization is currently
+/// deferred to producers of guarantees. For example, in order to be recognized as a
+/// guarantee on a field value, an Expression must be a call to "equal" with a field_ref
+/// LHS and a literal RHS. Flipping the arguments, an "is_in" call with a single-element
+/// value_set, or other semantically identical Expressions will not be recognized.
+
+/// Weak canonicalization which establishes guarantees for subsequent passes. Even
+/// equivalent Expressions may result in different canonicalized expressions.
+/// TODO this could be a strong canonicalization
+ARROW_EXPORT
+Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
+
+/// Simplify Expressions based on literal arguments (for example, add(null, x) is always
+/// null, so the call is replaced with a null literal). Includes early evaluation of all
+/// calls whose arguments are entirely literal.
+ARROW_EXPORT
+Result<Expression> FoldConstants(Expression);
+
+/// Simplify an Expression by replacing the fields it references with their known values.
+ARROW_EXPORT
+Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
+ Expression);
+
+/// Simplify an expression by replacing subexpressions based on a guarantee:
+/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
+/// used to remove redundant function calls from a filter expression or to replace a
+/// reference to a constant-value field with a literal.
+ARROW_EXPORT
+Result<Expression> SimplifyWithGuarantee(Expression,
+ const Expression& guaranteed_true_predicate);
+
+/// @}
+
+// Execution
+
+/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
+/// RecordBatch which may have missing or incorrectly ordered columns.
+/// Missing fields will be replaced with null scalars.
+ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
+ const Datum& partial);
+
+/// Execute a scalar expression against the provided state and input ExecBatch. This
+/// expression must be bound.
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
+ ExecContext* = NULLPTR);
+
+/// Convenience function for invoking against a RecordBatch
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
+ const Datum& partial_input, ExecContext* = NULLPTR);
+
+// Serialization
+
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
+
+ARROW_EXPORT
+Result<Expression> Deserialize(std::shared_ptr<Buffer>);
+
+// Convenience aliases for factories
+
+ARROW_EXPORT Expression project(std::vector<Expression> values,
+ std::vector<std::string> names);
+
+ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression is_null(Expression lhs);
+
+ARROW_EXPORT Expression is_valid(Expression lhs);
+
+ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression and_(const std::vector<Expression>&);
+ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression or_(const std::vector<Expression>&);
+ARROW_EXPORT Expression not_(Expression operand);
+
+} // namespace compute
+} // namespace arrow
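
A minimal usage sketch of the header above (illustrative only; MakeBoundPredicate is a
hypothetical helper, and the referenced kernels are assumed to be present in the
default function registry):

    #include "arrow/compute/exec/expression.h"
    #include "arrow/result.h"

    namespace cp = arrow::compute;

    // Build (a > 1) and is_valid(b) from the factories declared above, then
    // resolve it against a schema. Bind() looks up kernels, inserts implicit
    // casts and fills in field types; afterwards IsBound() returns true.
    arrow::Result<cp::Expression> MakeBoundPredicate(const arrow::Schema& schema) {
      cp::Expression filter =
          cp::and_(cp::greater(cp::field_ref("a"), cp::literal(1)),
                   cp::is_valid(cp::field_ref("b")));
      return filter.Bind(schema);
    }

The bound expression can then be evaluated with ExecuteScalarExpression(), either
against an ExecBatch prepared by MakeExecBatch() or through the RecordBatch
convenience overload declared above.
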
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
index dc38924d932..abcb99bc576 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
@@ -1,336 +1,336 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/expression.h"
-
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/cast.h"
-#include "arrow/compute/registry.h"
-#include "arrow/record_batch.h"
-#include "arrow/table.h"
-#include "arrow/util/logging.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-
-namespace compute {
-
-struct KnownFieldValues {
- std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
-};
-
-inline const Expression::Call* CallNotNull(const Expression& expr) {
- auto call = expr.call();
- DCHECK_NE(call, nullptr);
- return call;
-}
-
-inline std::vector<ValueDescr> GetDescriptors(const std::vector<Expression>& exprs) {
- std::vector<ValueDescr> descrs(exprs.size());
- for (size_t i = 0; i < exprs.size(); ++i) {
- DCHECK(exprs[i].IsBound());
- descrs[i] = exprs[i].descr();
- }
- return descrs;
-}
-
-inline std::vector<ValueDescr> GetDescriptors(const std::vector<Datum>& values) {
- std::vector<ValueDescr> descrs(values.size());
- for (size_t i = 0; i < values.size(); ++i) {
- descrs[i] = values[i].descr();
- }
- return descrs;
-}
-
-struct Comparison {
- enum type {
- NA = 0,
- EQUAL = 1,
- LESS = 2,
- GREATER = 4,
- NOT_EQUAL = LESS | GREATER,
- LESS_EQUAL = LESS | EQUAL,
- GREATER_EQUAL = GREATER | EQUAL,
- };
-
- static const type* Get(const std::string& function) {
- static std::unordered_map<std::string, type> map{
- {"equal", EQUAL}, {"not_equal", NOT_EQUAL},
- {"less", LESS}, {"less_equal", LESS_EQUAL},
- {"greater", GREATER}, {"greater_equal", GREATER_EQUAL},
- };
-
- auto it = map.find(function);
- return it != map.end() ? &it->second : nullptr;
- }
-
- static const type* Get(const Expression& expr) {
- if (auto call = expr.call()) {
- return Comparison::Get(call->function_name);
- }
- return nullptr;
- }
-
- // Execute a simple Comparison between scalars
- static Result<type> Execute(Datum l, Datum r) {
- if (!l.is_scalar() || !r.is_scalar()) {
- return Status::Invalid("Cannot Execute Comparison on non-scalars");
- }
-
- std::vector<Datum> arguments{std::move(l), std::move(r)};
-
- ARROW_ASSIGN_OR_RAISE(auto equal, compute::CallFunction("equal", arguments));
-
- if (!equal.scalar()->is_valid) return NA;
- if (equal.scalar_as<BooleanScalar>().value) return EQUAL;
-
- ARROW_ASSIGN_OR_RAISE(auto less, compute::CallFunction("less", arguments));
-
- if (!less.scalar()->is_valid) return NA;
- return less.scalar_as<BooleanScalar>().value ? LESS : GREATER;
- }
-
- // Given an Expression wrapped in casts which preserve ordering
- // (for example, cast(field_ref("i16"), to_type=int32())), unwrap the inner Expression.
- // This is used to destructure implicitly cast field_refs during Expression
- // simplification.
- static const Expression& StripOrderPreservingCasts(const Expression& expr) {
- auto call = expr.call();
- if (!call) return expr;
- if (call->function_name != "cast") return expr;
-
- const Expression& from = call->arguments[0];
-
- auto from_id = from.type()->id();
- auto to_id = expr.type()->id();
-
- if (is_floating(to_id)) {
- if (is_integer(from_id) || is_floating(from_id)) {
- return StripOrderPreservingCasts(from);
- }
- return expr;
- }
-
- if (is_unsigned_integer(to_id)) {
- if (is_unsigned_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
- return StripOrderPreservingCasts(from);
- }
- return expr;
- }
-
- if (is_signed_integer(to_id)) {
- if (is_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
- return StripOrderPreservingCasts(from);
- }
- return expr;
- }
-
- return expr;
- }
-
- static type GetFlipped(type op) {
- switch (op) {
- case NA:
- return NA;
- case EQUAL:
- return EQUAL;
- case LESS:
- return GREATER;
- case GREATER:
- return LESS;
- case NOT_EQUAL:
- return NOT_EQUAL;
- case LESS_EQUAL:
- return GREATER_EQUAL;
- case GREATER_EQUAL:
- return LESS_EQUAL;
- }
- DCHECK(false);
- return NA;
- }
-
- static std::string GetName(type op) {
- switch (op) {
- case NA:
- break;
- case EQUAL:
- return "equal";
- case LESS:
- return "less";
- case GREATER:
- return "greater";
- case NOT_EQUAL:
- return "not_equal";
- case LESS_EQUAL:
- return "less_equal";
- case GREATER_EQUAL:
- return "greater_equal";
- }
- return "na";
- }
-
- static std::string GetOp(type op) {
- switch (op) {
- case NA:
- DCHECK(false) << "unreachable";
- break;
- case EQUAL:
- return "==";
- case LESS:
- return "<";
- case GREATER:
- return ">";
- case NOT_EQUAL:
- return "!=";
- case LESS_EQUAL:
- return "<=";
- case GREATER_EQUAL:
- return ">=";
- }
- DCHECK(false);
- return "";
- }
-};
-
-inline const compute::CastOptions* GetCastOptions(const Expression::Call& call) {
- if (call.function_name != "cast") return nullptr;
- return checked_cast<const compute::CastOptions*>(call.options.get());
-}
-
-inline bool IsSetLookup(const std::string& function) {
- return function == "is_in" || function == "index_in";
-}
-
-inline const compute::MakeStructOptions* GetMakeStructOptions(
- const Expression::Call& call) {
- if (call.function_name != "make_struct") return nullptr;
- return checked_cast<const compute::MakeStructOptions*>(call.options.get());
-}
-
-/// A helper for unboxing an Expression composed of associative function calls.
-/// Such expressions can frequently be rearranged to a semantically equivalent
-/// expression for more optimal execution or more straightforward manipulation.
-/// For example, (a + ((b + 3) + 4)) is equivalent to (((4 + 3) + a) + b) and the latter
-/// can be trivially constant-folded to ((7 + a) + b).
-struct FlattenedAssociativeChain {
- /// True if a chain was already a left fold.
- bool was_left_folded = true;
-
- /// All "branch" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
- /// exprs would be [(a + ((b + 3) + 4)), ((b + 3) + 4), (b + 3)]
- std::vector<Expression> exprs;
-
- /// All "leaf" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
- /// the fringe would be [a, b, 3, 4]
- std::vector<Expression> fringe;
-
- explicit FlattenedAssociativeChain(Expression expr) : exprs{std::move(expr)} {
- auto call = CallNotNull(exprs.back());
- fringe = call->arguments;
-
- auto it = fringe.begin();
-
- while (it != fringe.end()) {
- auto sub_call = it->call();
- if (!sub_call || sub_call->function_name != call->function_name) {
- ++it;
- continue;
- }
-
- if (it != fringe.begin()) {
- was_left_folded = false;
- }
-
- exprs.push_back(std::move(*it));
- it = fringe.erase(it);
-
- auto index = it - fringe.begin();
- fringe.insert(it, sub_call->arguments.begin(), sub_call->arguments.end());
- it = fringe.begin() + index;
- // NB: no increment so we hit sub_call's first argument next iteration
- }
-
- DCHECK(std::all_of(exprs.begin(), exprs.end(), [](const Expression& expr) {
- return CallNotNull(expr)->options == nullptr;
- }));
- }
-};
-
-inline Result<std::shared_ptr<compute::Function>> GetFunction(
- const Expression::Call& call, compute::ExecContext* exec_context) {
- if (call.function_name != "cast") {
- return exec_context->func_registry()->GetFunction(call.function_name);
- }
- // XXX this special case is strange; why not make "cast" a ScalarFunction?
- const auto& to_type = checked_cast<const compute::CastOptions&>(*call.options).to_type;
- return compute::GetCastFunction(to_type);
-}
-
-/// Modify an Expression with pre-order and post-order visitation.
-/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
-/// arguments, `post_call` will visit Calls (and no other Expressions) after their
-/// arguments. Visitors should return the Identical expression to indicate no change; this
-/// will prevent unnecessary construction in the common case where a modification is not
-/// possible/necessary/...
-///
-/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
-/// arguments but also receives a pointer to the unmodified Expression as a second
-/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
-template <typename PreVisit, typename PostVisitCall>
-Result<Expression> Modify(Expression expr, const PreVisit& pre,
- const PostVisitCall& post_call) {
- ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
-
- auto call = expr.call();
- if (!call) return expr;
-
- bool at_least_one_modified = false;
- std::vector<Expression> modified_arguments;
-
- for (size_t i = 0; i < call->arguments.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(auto modified_argument,
- Modify(call->arguments[i], pre, post_call));
-
- if (Identical(modified_argument, call->arguments[i])) {
- continue;
- }
-
- if (!at_least_one_modified) {
- modified_arguments = call->arguments;
- at_least_one_modified = true;
- }
-
- modified_arguments[i] = std::move(modified_argument);
- }
-
- if (at_least_one_modified) {
- // reconstruct the call expression with the modified arguments
- auto modified_call = *call;
- modified_call.arguments = std::move(modified_arguments);
- return post_call(Expression(std::move(modified_call)), &expr);
- }
-
- return post_call(std::move(expr), nullptr);
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/expression.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/registry.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+
+struct KnownFieldValues {
+ std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
+};
+
+inline const Expression::Call* CallNotNull(const Expression& expr) {
+ auto call = expr.call();
+ DCHECK_NE(call, nullptr);
+ return call;
+}
+
+inline std::vector<ValueDescr> GetDescriptors(const std::vector<Expression>& exprs) {
+ std::vector<ValueDescr> descrs(exprs.size());
+ for (size_t i = 0; i < exprs.size(); ++i) {
+ DCHECK(exprs[i].IsBound());
+ descrs[i] = exprs[i].descr();
+ }
+ return descrs;
+}
+
+inline std::vector<ValueDescr> GetDescriptors(const std::vector<Datum>& values) {
+ std::vector<ValueDescr> descrs(values.size());
+ for (size_t i = 0; i < values.size(); ++i) {
+ descrs[i] = values[i].descr();
+ }
+ return descrs;
+}
+
+struct Comparison {
+ enum type {
+ NA = 0,
+ EQUAL = 1,
+ LESS = 2,
+ GREATER = 4,
+ NOT_EQUAL = LESS | GREATER,
+ LESS_EQUAL = LESS | EQUAL,
+ GREATER_EQUAL = GREATER | EQUAL,
+ };
+
+ static const type* Get(const std::string& function) {
+ static std::unordered_map<std::string, type> map{
+ {"equal", EQUAL}, {"not_equal", NOT_EQUAL},
+ {"less", LESS}, {"less_equal", LESS_EQUAL},
+ {"greater", GREATER}, {"greater_equal", GREATER_EQUAL},
+ };
+
+ auto it = map.find(function);
+ return it != map.end() ? &it->second : nullptr;
+ }
+
+ static const type* Get(const Expression& expr) {
+ if (auto call = expr.call()) {
+ return Comparison::Get(call->function_name);
+ }
+ return nullptr;
+ }
+
+ // Execute a simple Comparison between scalars
+ static Result<type> Execute(Datum l, Datum r) {
+ if (!l.is_scalar() || !r.is_scalar()) {
+ return Status::Invalid("Cannot Execute Comparison on non-scalars");
+ }
+
+ std::vector<Datum> arguments{std::move(l), std::move(r)};
+
+ ARROW_ASSIGN_OR_RAISE(auto equal, compute::CallFunction("equal", arguments));
+
+ if (!equal.scalar()->is_valid) return NA;
+ if (equal.scalar_as<BooleanScalar>().value) return EQUAL;
+
+ ARROW_ASSIGN_OR_RAISE(auto less, compute::CallFunction("less", arguments));
+
+ if (!less.scalar()->is_valid) return NA;
+ return less.scalar_as<BooleanScalar>().value ? LESS : GREATER;
+ }
+
+ // Given an Expression wrapped in casts which preserve ordering
+ // (for example, cast(field_ref("i16"), to_type=int32())), unwrap the inner Expression.
+ // This is used to destructure implicitly cast field_refs during Expression
+ // simplification.
+ static const Expression& StripOrderPreservingCasts(const Expression& expr) {
+ auto call = expr.call();
+ if (!call) return expr;
+ if (call->function_name != "cast") return expr;
+
+ const Expression& from = call->arguments[0];
+
+ auto from_id = from.type()->id();
+ auto to_id = expr.type()->id();
+
+ if (is_floating(to_id)) {
+ if (is_integer(from_id) || is_floating(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ if (is_unsigned_integer(to_id)) {
+ if (is_unsigned_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ if (is_signed_integer(to_id)) {
+ if (is_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ return expr;
+ }
+
+ static type GetFlipped(type op) {
+ switch (op) {
+ case NA:
+ return NA;
+ case EQUAL:
+ return EQUAL;
+ case LESS:
+ return GREATER;
+ case GREATER:
+ return LESS;
+ case NOT_EQUAL:
+ return NOT_EQUAL;
+ case LESS_EQUAL:
+ return GREATER_EQUAL;
+ case GREATER_EQUAL:
+ return LESS_EQUAL;
+ }
+ DCHECK(false);
+ return NA;
+ }
+
+ static std::string GetName(type op) {
+ switch (op) {
+ case NA:
+ break;
+ case EQUAL:
+ return "equal";
+ case LESS:
+ return "less";
+ case GREATER:
+ return "greater";
+ case NOT_EQUAL:
+ return "not_equal";
+ case LESS_EQUAL:
+ return "less_equal";
+ case GREATER_EQUAL:
+ return "greater_equal";
+ }
+ return "na";
+ }
+
+ static std::string GetOp(type op) {
+ switch (op) {
+ case NA:
+ DCHECK(false) << "unreachable";
+ break;
+ case EQUAL:
+ return "==";
+ case LESS:
+ return "<";
+ case GREATER:
+ return ">";
+ case NOT_EQUAL:
+ return "!=";
+ case LESS_EQUAL:
+ return "<=";
+ case GREATER_EQUAL:
+ return ">=";
+ }
+ DCHECK(false);
+ return "";
+ }
+};
+
+inline const compute::CastOptions* GetCastOptions(const Expression::Call& call) {
+ if (call.function_name != "cast") return nullptr;
+ return checked_cast<const compute::CastOptions*>(call.options.get());
+}
+
+inline bool IsSetLookup(const std::string& function) {
+ return function == "is_in" || function == "index_in";
+}
+
+inline const compute::MakeStructOptions* GetMakeStructOptions(
+ const Expression::Call& call) {
+ if (call.function_name != "make_struct") return nullptr;
+ return checked_cast<const compute::MakeStructOptions*>(call.options.get());
+}
+
+/// A helper for unboxing an Expression composed of associative function calls.
+/// Such expressions can frequently be rearranged to a semantically equivalent
+/// expression for more optimal execution or more straightforward manipulation.
+/// For example, (a + ((b + 3) + 4)) is equivalent to (((4 + 3) + a) + b) and the latter
+/// can be trivially constant-folded to ((7 + a) + b).
+struct FlattenedAssociativeChain {
+ /// True if a chain was already a left fold.
+ bool was_left_folded = true;
+
+ /// All "branch" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
+ /// exprs would be [(a + ((b + 3) + 4)), ((b + 3) + 4), (b + 3)]
+ std::vector<Expression> exprs;
+
+ /// All "leaf" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
+ /// the fringe would be [a, b, 3, 4]
+ std::vector<Expression> fringe;
+
+ explicit FlattenedAssociativeChain(Expression expr) : exprs{std::move(expr)} {
+ auto call = CallNotNull(exprs.back());
+ fringe = call->arguments;
+
+ auto it = fringe.begin();
+
+ while (it != fringe.end()) {
+ auto sub_call = it->call();
+ if (!sub_call || sub_call->function_name != call->function_name) {
+ ++it;
+ continue;
+ }
+
+ if (it != fringe.begin()) {
+ was_left_folded = false;
+ }
+
+ exprs.push_back(std::move(*it));
+ it = fringe.erase(it);
+
+ auto index = it - fringe.begin();
+ fringe.insert(it, sub_call->arguments.begin(), sub_call->arguments.end());
+ it = fringe.begin() + index;
+ // NB: no increment so we hit sub_call's first argument next iteration
+ }
+
+ DCHECK(std::all_of(exprs.begin(), exprs.end(), [](const Expression& expr) {
+ return CallNotNull(expr)->options == nullptr;
+ }));
+ }
+};
+
+inline Result<std::shared_ptr<compute::Function>> GetFunction(
+ const Expression::Call& call, compute::ExecContext* exec_context) {
+ if (call.function_name != "cast") {
+ return exec_context->func_registry()->GetFunction(call.function_name);
+ }
+ // XXX this special case is strange; why not make "cast" a ScalarFunction?
+ const auto& to_type = checked_cast<const compute::CastOptions&>(*call.options).to_type;
+ return compute::GetCastFunction(to_type);
+}
+
+/// Modify an Expression with pre-order and post-order visitation.
+/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
+/// arguments, `post_call` will visit Calls (and no other Expressions) after their
+/// arguments. Visitors should return the Identical expression to indicate no change; this
+/// will prevent unnecessary construction in the common case where a modification is not
+/// possible/necessary/...
+///
+/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
+/// arguments but also receives a pointer to the unmodified Expression as a second
+/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
+template <typename PreVisit, typename PostVisitCall>
+Result<Expression> Modify(Expression expr, const PreVisit& pre,
+ const PostVisitCall& post_call) {
+ ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
+
+ auto call = expr.call();
+ if (!call) return expr;
+
+ bool at_least_one_modified = false;
+ std::vector<Expression> modified_arguments;
+
+ for (size_t i = 0; i < call->arguments.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto modified_argument,
+ Modify(call->arguments[i], pre, post_call));
+
+ if (Identical(modified_argument, call->arguments[i])) {
+ continue;
+ }
+
+ if (!at_least_one_modified) {
+ modified_arguments = call->arguments;
+ at_least_one_modified = true;
+ }
+
+ modified_arguments[i] = std::move(modified_argument);
+ }
+
+ if (at_least_one_modified) {
+ // reconstruct the call expression with the modified arguments
+ auto modified_call = *call;
+ modified_call.arguments = std::move(modified_arguments);
+ return post_call(Expression(std::move(modified_call)), &expr);
+ }
+
+ return post_call(std::move(expr), nullptr);
+}
+
+} // namespace compute
+} // namespace arrow
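
As an illustration of the Modify() visitor above, a hypothetical rewrite pass (not
part of this header) that lowers each not_equal(l, r) call to not_(equal(l, r)),
assuming it is defined inside namespace arrow::compute:

    // The pre-order visitor rewrites matching Calls; the post-call visitor
    // returns its argument unchanged. Returning the input Expression as-is
    // signals "no change", which Modify() detects via Identical() so that
    // parent Calls are not needlessly reconstructed.
    Result<Expression> LowerNotEqual(Expression expr) {
      return Modify(
          std::move(expr),
          [](Expression e) -> Result<Expression> {
            const Expression::Call* call = e.call();
            if (!call || call->function_name != "not_equal") return e;
            return not_(equal(call->arguments[0], call->arguments[1]));
          },
          [](Expression e, const Expression*) { return e; });
    }

The rewritten subtrees come out unbound, so the result would need another Bind()
before execution.
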
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
index 7a5b0be9990..01de727978f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
@@ -1,268 +1,268 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_compare.h"
-
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace compute {
-
-void KeyCompare::CompareRows(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
- uint16_t* out_sel_left_maybe_same,
- const KeyEncoder::KeyRowArray& rows_left,
- const KeyEncoder::KeyRowArray& rows_right) {
- ARROW_DCHECK(rows_left.metadata().is_compatible(rows_right.metadata()));
-
- if (num_rows_to_compare == 0) {
- *out_num_rows = 0;
- return;
- }
-
- // Allocate temporary byte and bit vectors
- auto bytevector_holder =
- util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
- auto bitvector_holder =
- util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
-
- uint8_t* match_bytevector = bytevector_holder.mutable_data();
- uint8_t* match_bitvector = bitvector_holder.mutable_data();
-
- // All comparison functions called here will update match byte vector
- // (AND it with comparison result) instead of overwriting it.
- memset(match_bytevector, 0xff, num_rows_to_compare);
-
- if (rows_left.metadata().is_fixed_length) {
- CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
- match_bytevector, ctx, rows_left.metadata().fixed_length,
- rows_left.data(1), rows_right.data(1));
- } else {
- CompareVaryingLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
- match_bytevector, ctx, rows_left.data(2), rows_right.data(2),
- rows_left.offsets(), rows_right.offsets());
- }
-
- // CompareFixedLength can be used to compare nulls as well
- bool nulls_present = rows_left.has_any_nulls(ctx) || rows_right.has_any_nulls(ctx);
- if (nulls_present) {
- CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
- match_bytevector, ctx,
- rows_left.metadata().null_masks_bytes_per_row,
- rows_left.null_masks(), rows_right.null_masks());
- }
-
- util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, match_bytevector,
- match_bitvector);
- if (sel_left_maybe_null) {
- int out_num_rows_int;
- util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare,
- match_bitvector, sel_left_maybe_null,
- &out_num_rows_int, out_sel_left_maybe_same);
- *out_num_rows = out_num_rows_int;
- } else {
- int out_num_rows_int;
- util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare,
- match_bitvector, &out_num_rows_int,
- out_sel_left_maybe_same);
- *out_num_rows = out_num_rows_int;
- }
-}
-
-void KeyCompare::CompareFixedLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- uint32_t fixed_length, const uint8_t* rows_left,
- const uint8_t* rows_right) {
- bool use_selection = (sel_left_maybe_null != nullptr);
-
- uint32_t num_rows_already_processed = 0;
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && !use_selection) {
- // Choose between up-to-8B length, up-to-16B length and any size versions
- if (fixed_length <= 8) {
- num_rows_already_processed = CompareFixedLength_UpTo8B_avx2(
- num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
- rows_left, rows_right);
- } else if (fixed_length <= 16) {
- num_rows_already_processed = CompareFixedLength_UpTo16B_avx2(
- num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
- rows_left, rows_right);
- } else {
- num_rows_already_processed =
- CompareFixedLength_avx2(num_rows_to_compare, left_to_right_map,
- match_bytevector, fixed_length, rows_left, rows_right);
- }
- }
-#endif
-
- typedef void (*CompareFixedLengthImp_t)(uint32_t, uint32_t, const uint16_t*,
- const uint32_t*, uint8_t*, uint32_t,
- const uint8_t*, const uint8_t*);
- static const CompareFixedLengthImp_t CompareFixedLengthImp_fn[] = {
- CompareFixedLengthImp<false, 1>, CompareFixedLengthImp<false, 2>,
- CompareFixedLengthImp<false, 0>, CompareFixedLengthImp<true, 1>,
- CompareFixedLengthImp<true, 2>, CompareFixedLengthImp<true, 0>};
- int dispatch_const = (use_selection ? 3 : 0) +
- ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 1 : 2));
- CompareFixedLengthImp_fn[dispatch_const](
- num_rows_already_processed, num_rows_to_compare, sel_left_maybe_null,
- left_to_right_map, match_bytevector, fixed_length, rows_left, rows_right);
-}
-
-template <bool use_selection, int num_64bit_words>
-void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed,
- uint32_t num_rows,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, uint32_t length,
- const uint8_t* rows_left,
- const uint8_t* rows_right) {
- // Key length (for encoded key) has to be non-zero
- ARROW_DCHECK(length > 0);
-
- // Non-zero length guarantees no underflow
- int32_t num_loops_less_one = (static_cast<int32_t>(length) + 7) / 8 - 1;
-
- // Length remaining in last loop can only be zero for input length equal to zero
- uint32_t length_remaining_last_loop = length - num_loops_less_one * 8;
- uint64_t tail_mask = (~0ULL) >> (8 * (8 - length_remaining_last_loop));
-
- for (uint32_t id_input = num_rows_already_processed; id_input < num_rows; ++id_input) {
- uint32_t irow_left = use_selection ? sel_left_maybe_null[id_input] : id_input;
- uint32_t irow_right = left_to_right_map[irow_left];
- uint32_t begin_left = length * irow_left;
- uint32_t begin_right = length * irow_right;
- const uint64_t* key_left_ptr =
- reinterpret_cast<const uint64_t*>(rows_left + begin_left);
- const uint64_t* key_right_ptr =
- reinterpret_cast<const uint64_t*>(rows_right + begin_right);
- uint64_t result_or = 0ULL;
- int32_t istripe = 0;
-
- // Specializations for keys up to 8 bytes and between 9 and 16 bytes to
- // avoid internal loop over words in the value for short ones.
- //
- // Template argument 0 means arbitrarily many 64-bit words,
- // 1 means up to 1 and 2 means up to 2.
- //
- if (num_64bit_words == 0) {
- for (; istripe < num_loops_less_one; ++istripe) {
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (key_left ^ key_right);
- }
- } else if (num_64bit_words == 2) {
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (key_left ^ key_right);
- ++istripe;
- }
-
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (tail_mask & (key_left ^ key_right));
-
- int result = (result_or == 0 ? 0xff : 0);
- match_bytevector[id_input] &= result;
- }
-}
-
-void KeyCompare::CompareVaryingLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- const uint8_t* rows_left, const uint8_t* rows_right,
- const uint32_t* offsets_left,
- const uint32_t* offsets_right) {
- bool use_selection = (sel_left_maybe_null != nullptr);
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && !use_selection) {
- CompareVaryingLength_avx2(num_rows_to_compare, left_to_right_map, match_bytevector,
- rows_left, rows_right, offsets_left, offsets_right);
- } else {
-#endif
- if (use_selection) {
- CompareVaryingLengthImp<true>(num_rows_to_compare, sel_left_maybe_null,
- left_to_right_map, match_bytevector, rows_left,
- rows_right, offsets_left, offsets_right);
- } else {
- CompareVaryingLengthImp<false>(num_rows_to_compare, sel_left_maybe_null,
- left_to_right_map, match_bytevector, rows_left,
- rows_right, offsets_left, offsets_right);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-template <bool use_selection>
-void KeyCompare::CompareVaryingLengthImp(
- uint32_t num_rows, const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
- const uint32_t* offsets_right) {
- static const uint64_t tail_masks[] = {
- 0x0000000000000000ULL, 0x00000000000000ffULL, 0x000000000000ffffULL,
- 0x0000000000ffffffULL, 0x00000000ffffffffULL, 0x000000ffffffffffULL,
- 0x0000ffffffffffffULL, 0x00ffffffffffffffULL, 0xffffffffffffffffULL};
- for (uint32_t i = 0; i < num_rows; ++i) {
- uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i;
- uint32_t irow_right = left_to_right_map[irow_left];
- uint32_t begin_left = offsets_left[irow_left];
- uint32_t begin_right = offsets_right[irow_right];
- uint32_t length_left = offsets_left[irow_left + 1] - begin_left;
- uint32_t length_right = offsets_right[irow_right + 1] - begin_right;
- uint32_t length = std::min(length_left, length_right);
- const uint64_t* key_left_ptr =
- reinterpret_cast<const uint64_t*>(rows_left + begin_left);
- const uint64_t* key_right_ptr =
- reinterpret_cast<const uint64_t*>(rows_right + begin_right);
- uint64_t result_or = 0;
- int32_t istripe;
- // length can be zero
- for (istripe = 0; istripe < (static_cast<int32_t>(length) + 7) / 8 - 1; ++istripe) {
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (key_left ^ key_right);
- }
-
- uint32_t length_remaining = length - static_cast<uint32_t>(istripe) * 8;
- uint64_t tail_mask = tail_masks[length_remaining];
-
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (tail_mask & (key_left ^ key_right));
-
- int result = (result_or == 0 ? 0xff : 0);
- match_bytevector[i] &= result;
- }
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_compare.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace compute {
+
+void KeyCompare::CompareRows(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
+ uint16_t* out_sel_left_maybe_same,
+ const KeyEncoder::KeyRowArray& rows_left,
+ const KeyEncoder::KeyRowArray& rows_right) {
+ ARROW_DCHECK(rows_left.metadata().is_compatible(rows_right.metadata()));
+
+ if (num_rows_to_compare == 0) {
+ *out_num_rows = 0;
+ return;
+ }
+
+ // Allocate temporary byte and bit vectors
+ auto bytevector_holder =
+ util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
+ auto bitvector_holder =
+ util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
+
+ uint8_t* match_bytevector = bytevector_holder.mutable_data();
+ uint8_t* match_bitvector = bitvector_holder.mutable_data();
+
+ // All comparison functions called here will update match byte vector
+ // (AND it with comparison result) instead of overwriting it.
+ memset(match_bytevector, 0xff, num_rows_to_compare);
+
+ if (rows_left.metadata().is_fixed_length) {
+ CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx, rows_left.metadata().fixed_length,
+ rows_left.data(1), rows_right.data(1));
+ } else {
+ CompareVaryingLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx, rows_left.data(2), rows_right.data(2),
+ rows_left.offsets(), rows_right.offsets());
+ }
+
+ // CompareFixedLength can be used to compare nulls as well
+ bool nulls_present = rows_left.has_any_nulls(ctx) || rows_right.has_any_nulls(ctx);
+ if (nulls_present) {
+ CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx,
+ rows_left.metadata().null_masks_bytes_per_row,
+ rows_left.null_masks(), rows_right.null_masks());
+ }
+
+ util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, match_bytevector,
+ match_bitvector);
+ if (sel_left_maybe_null) {
+ int out_num_rows_int;
+ util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare,
+ match_bitvector, sel_left_maybe_null,
+ &out_num_rows_int, out_sel_left_maybe_same);
+ *out_num_rows = out_num_rows_int;
+ } else {
+ int out_num_rows_int;
+ util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare,
+ match_bitvector, &out_num_rows_int,
+ out_sel_left_maybe_same);
+ *out_num_rows = out_num_rows_int;
+ }
+}
+
+void KeyCompare::CompareFixedLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ uint32_t fixed_length, const uint8_t* rows_left,
+ const uint8_t* rows_right) {
+ bool use_selection = (sel_left_maybe_null != nullptr);
+
+ uint32_t num_rows_already_processed = 0;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && !use_selection) {
+ // Choose between up-to-8B length, up-to-16B length and any size versions
+ if (fixed_length <= 8) {
+ num_rows_already_processed = CompareFixedLength_UpTo8B_avx2(
+ num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
+ rows_left, rows_right);
+ } else if (fixed_length <= 16) {
+ num_rows_already_processed = CompareFixedLength_UpTo16B_avx2(
+ num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
+ rows_left, rows_right);
+ } else {
+ num_rows_already_processed =
+ CompareFixedLength_avx2(num_rows_to_compare, left_to_right_map,
+ match_bytevector, fixed_length, rows_left, rows_right);
+ }
+ }
+#endif
+
+ typedef void (*CompareFixedLengthImp_t)(uint32_t, uint32_t, const uint16_t*,
+ const uint32_t*, uint8_t*, uint32_t,
+ const uint8_t*, const uint8_t*);
+ static const CompareFixedLengthImp_t CompareFixedLengthImp_fn[] = {
+ CompareFixedLengthImp<false, 1>, CompareFixedLengthImp<false, 2>,
+ CompareFixedLengthImp<false, 0>, CompareFixedLengthImp<true, 1>,
+ CompareFixedLengthImp<true, 2>, CompareFixedLengthImp<true, 0>};
+ int dispatch_const = (use_selection ? 3 : 0) +
+ ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 1 : 2));
+ CompareFixedLengthImp_fn[dispatch_const](
+ num_rows_already_processed, num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, fixed_length, rows_left, rows_right);
+}
+
+template <bool use_selection, int num_64bit_words>
+void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed,
+ uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left,
+ const uint8_t* rows_right) {
+ // Key length (for encoded key) has to be non-zero
+ ARROW_DCHECK(length > 0);
+
+ // Non-zero length guarantees no underflow
+ int32_t num_loops_less_one = (static_cast<int32_t>(length) + 7) / 8 - 1;
+
+ // Length remaining in last loop can only be zero for input length equal to zero
+ uint32_t length_remaining_last_loop = length - num_loops_less_one * 8;
+ uint64_t tail_mask = (~0ULL) >> (8 * (8 - length_remaining_last_loop));
+
+ for (uint32_t id_input = num_rows_already_processed; id_input < num_rows; ++id_input) {
+ uint32_t irow_left = use_selection ? sel_left_maybe_null[id_input] : id_input;
+ uint32_t irow_right = left_to_right_map[irow_left];
+ uint32_t begin_left = length * irow_left;
+ uint32_t begin_right = length * irow_right;
+ const uint64_t* key_left_ptr =
+ reinterpret_cast<const uint64_t*>(rows_left + begin_left);
+ const uint64_t* key_right_ptr =
+ reinterpret_cast<const uint64_t*>(rows_right + begin_right);
+ uint64_t result_or = 0ULL;
+ int32_t istripe = 0;
+
+ // Specializations for keys up to 8 bytes and between 9 and 16 bytes to
+ // avoid internal loop over words in the value for short ones.
+ //
+ // Template argument 0 means arbitrarily many 64-bit words,
+ // 1 means up to 1 and 2 means up to 2.
+ //
+ if (num_64bit_words == 0) {
+ for (; istripe < num_loops_less_one; ++istripe) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ }
+ } else if (num_64bit_words == 2) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ ++istripe;
+ }
+
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (tail_mask & (key_left ^ key_right));
+
+ int result = (result_or == 0 ? 0xff : 0);
+ match_bytevector[id_input] &= result;
+ }
+}
+
+void KeyCompare::CompareVaryingLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ const uint8_t* rows_left, const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right) {
+ bool use_selection = (sel_left_maybe_null != nullptr);
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && !use_selection) {
+ CompareVaryingLength_avx2(num_rows_to_compare, left_to_right_map, match_bytevector,
+ rows_left, rows_right, offsets_left, offsets_right);
+ } else {
+#endif
+ if (use_selection) {
+ CompareVaryingLengthImp<true>(num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, rows_left,
+ rows_right, offsets_left, offsets_right);
+ } else {
+ CompareVaryingLengthImp<false>(num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, rows_left,
+ rows_right, offsets_left, offsets_right);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+template <bool use_selection>
+void KeyCompare::CompareVaryingLengthImp(
+ uint32_t num_rows, const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
+ const uint32_t* offsets_right) {
+ static const uint64_t tail_masks[] = {
+ 0x0000000000000000ULL, 0x00000000000000ffULL, 0x000000000000ffffULL,
+ 0x0000000000ffffffULL, 0x00000000ffffffffULL, 0x000000ffffffffffULL,
+ 0x0000ffffffffffffULL, 0x00ffffffffffffffULL, 0xffffffffffffffffULL};
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i;
+ uint32_t irow_right = left_to_right_map[irow_left];
+ uint32_t begin_left = offsets_left[irow_left];
+ uint32_t begin_right = offsets_right[irow_right];
+ uint32_t length_left = offsets_left[irow_left + 1] - begin_left;
+ uint32_t length_right = offsets_right[irow_right + 1] - begin_right;
+ uint32_t length = std::min(length_left, length_right);
+ const uint64_t* key_left_ptr =
+ reinterpret_cast<const uint64_t*>(rows_left + begin_left);
+ const uint64_t* key_right_ptr =
+ reinterpret_cast<const uint64_t*>(rows_right + begin_right);
+ uint64_t result_or = 0;
+ int32_t istripe;
+ // length can be zero
+ for (istripe = 0; istripe < (static_cast<int32_t>(length) + 7) / 8 - 1; ++istripe) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ }
+
+ uint32_t length_remaining = length - static_cast<uint32_t>(istripe) * 8;
+ uint64_t tail_mask = tail_masks[length_remaining];
+
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (tail_mask & (key_left ^ key_right));
+
+ int result = (result_or == 0 ? 0xff : 0);
+ match_bytevector[i] &= result;
+ }
+}
+
+} // namespace compute
+} // namespace arrow
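
The stripe and tail-mask arithmetic in CompareFixedLengthImp is easiest to see with
a concrete key length. A standalone sketch of the same computation (illustrative
only; TailMask is a hypothetical name):

    #include <cstdint>

    // For a key of `length` bytes, every full 8-byte stripe is XOR-compared in
    // the loop, and the final stripe is masked so that bytes past the end of
    // the key never contribute to result_or.
    uint64_t TailMask(uint32_t length) {
      int32_t num_loops_less_one = (static_cast<int32_t>(length) + 7) / 8 - 1;
      uint32_t remaining = length - num_loops_less_one * 8;  // 1..8 for length > 0
      return (~0ULL) >> (8 * (8 - remaining));  // low `remaining` bytes set
    }

For length == 11 this gives one full stripe, remaining == 3, and
TailMask(11) == 0x0000000000ffffffULL, so only the three meaningful tail bytes of
the last 8-byte load take part in the comparison.
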
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
index 1dffabb884b..397a729dac6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
@@ -1,101 +1,101 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/compute/exec/key_encode.h"
-#include "arrow/compute/exec/util.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-
-namespace arrow {
-namespace compute {
-
-class KeyCompare {
- public:
- // Returns a single 16-bit selection vector of rows that failed comparison.
- // If there is input selection on the left, the resulting selection is a filtered image
- // of input selection.
- static void CompareRows(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
- uint16_t* out_sel_left_maybe_same,
- const KeyEncoder::KeyRowArray& rows_left,
- const KeyEncoder::KeyRowArray& rows_right);
-
- private:
- static void CompareFixedLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- uint32_t fixed_length, const uint8_t* rows_left,
- const uint8_t* rows_right);
- static void CompareVaryingLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- const uint8_t* rows_left, const uint8_t* rows_right,
- const uint32_t* offsets_left,
- const uint32_t* offsets_right);
-
- // Second template argument is 0, 1 or 2.
- // 0 means arbitrarily many 64-bit words, 1 means up to 1 and 2 means up to 2.
- template <bool use_selection, int num_64bit_words>
- static void CompareFixedLengthImp(uint32_t num_rows_already_processed,
- uint32_t num_rows,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, uint32_t length,
- const uint8_t* rows_left, const uint8_t* rows_right);
- template <bool use_selection>
- static void CompareVaryingLengthImp(uint32_t num_rows,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, const uint8_t* rows_left,
- const uint8_t* rows_right,
- const uint32_t* offsets_left,
- const uint32_t* offsets_right);
-
-#if defined(ARROW_HAVE_AVX2)
-
- static uint32_t CompareFixedLength_UpTo8B_avx2(
- uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
- static uint32_t CompareFixedLength_UpTo16B_avx2(
- uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
- static uint32_t CompareFixedLength_avx2(uint32_t num_rows,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, uint32_t length,
- const uint8_t* rows_left,
- const uint8_t* rows_right);
- static void CompareVaryingLength_avx2(
- uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
- const uint32_t* offsets_right);
-
-#endif
-};
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/compute/exec/key_encode.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace compute {
+
+class KeyCompare {
+ public:
+ // Returns a single 16-bit selection vector of rows that failed comparison.
+ // If there is input selection on the left, the resulting selection is a filtered image
+ // of input selection.
+ static void CompareRows(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
+ uint16_t* out_sel_left_maybe_same,
+ const KeyEncoder::KeyRowArray& rows_left,
+ const KeyEncoder::KeyRowArray& rows_right);
+
+ private:
+ static void CompareFixedLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ uint32_t fixed_length, const uint8_t* rows_left,
+ const uint8_t* rows_right);
+ static void CompareVaryingLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ const uint8_t* rows_left, const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+ // Second template argument is 0, 1 or 2.
+ // 0 means arbitrarily many 64-bit words, 1 means up to 1 and 2 means up to 2.
+ template <bool use_selection, int num_64bit_words>
+ static void CompareFixedLengthImp(uint32_t num_rows_already_processed,
+ uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left, const uint8_t* rows_right);
+ template <bool use_selection>
+ static void CompareVaryingLengthImp(uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, const uint8_t* rows_left,
+ const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+#if defined(ARROW_HAVE_AVX2)
+
+ static uint32_t CompareFixedLength_UpTo8B_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
+ static uint32_t CompareFixedLength_UpTo16B_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
+ static uint32_t CompareFixedLength_avx2(uint32_t num_rows,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left,
+ const uint8_t* rows_right);
+ static void CompareVaryingLength_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+#endif
+};
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
index de79558f2c2..f0498b509a1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
@@ -1,1649 +1,1649 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_encode.h"
-
-#include <cstring>
-
-#include <algorithm>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace compute {
-
-KeyEncoder::KeyRowArray::KeyRowArray()
- : pool_(nullptr), rows_capacity_(0), bytes_capacity_(0) {}
-
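-// Allocates the initial, small buffers (capacity for 8 rows and 1024 bytes of
-// row data) and zeroes them; subsequent appends grow these geometrically.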
-Status KeyEncoder::KeyRowArray::Init(MemoryPool* pool, const KeyRowMetadata& metadata) {
- pool_ = pool;
- metadata_ = metadata;
-
- DCHECK(!null_masks_ && !offsets_ && !rows_);
-
- constexpr int64_t rows_capacity = 8;
- constexpr int64_t bytes_capacity = 1024;
-
- // Null masks
- ARROW_ASSIGN_OR_RAISE(auto null_masks,
- AllocateResizableBuffer(size_null_masks(rows_capacity), pool_));
- null_masks_ = std::move(null_masks);
- memset(null_masks_->mutable_data(), 0, size_null_masks(rows_capacity));
-
- // Offsets and rows
- if (!metadata.is_fixed_length) {
- ARROW_ASSIGN_OR_RAISE(auto offsets,
- AllocateResizableBuffer(size_offsets(rows_capacity), pool_));
- offsets_ = std::move(offsets);
- memset(offsets_->mutable_data(), 0, size_offsets(rows_capacity));
- reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
-
- ARROW_ASSIGN_OR_RAISE(
- auto rows,
- AllocateResizableBuffer(size_rows_varying_length(bytes_capacity), pool_));
- rows_ = std::move(rows);
- memset(rows_->mutable_data(), 0, size_rows_varying_length(bytes_capacity));
- bytes_capacity_ = size_rows_varying_length(bytes_capacity) - padding_for_vectors;
- } else {
- ARROW_ASSIGN_OR_RAISE(
- auto rows, AllocateResizableBuffer(size_rows_fixed_length(rows_capacity), pool_));
- rows_ = std::move(rows);
- memset(rows_->mutable_data(), 0, size_rows_fixed_length(rows_capacity));
- bytes_capacity_ = size_rows_fixed_length(rows_capacity) - padding_for_vectors;
- }
-
- update_buffer_pointers();
-
- rows_capacity_ = rows_capacity;
-
- num_rows_ = 0;
- num_rows_for_has_any_nulls_ = 0;
- has_any_nulls_ = false;
-
- return Status::OK();
-}
-
-void KeyEncoder::KeyRowArray::Clean() {
- num_rows_ = 0;
- num_rows_for_has_any_nulls_ = 0;
- has_any_nulls_ = false;
-
- if (!metadata_.is_fixed_length) {
- reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
- }
-}
-
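-// Each size_* helper reserves padding_for_vectors extra tail bytes, so that
-// vectorized kernels (e.g. the AVX2 paths) can safely load a full SIMD word
-// past the last row without reading unallocated memory.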
-int64_t KeyEncoder::KeyRowArray::size_null_masks(int64_t num_rows) {
- return num_rows * metadata_.null_masks_bytes_per_row + padding_for_vectors;
-}
-
-int64_t KeyEncoder::KeyRowArray::size_offsets(int64_t num_rows) {
- return (num_rows + 1) * sizeof(uint32_t) + padding_for_vectors;
-}
-
-int64_t KeyEncoder::KeyRowArray::size_rows_fixed_length(int64_t num_rows) {
- return num_rows * metadata_.fixed_length + padding_for_vectors;
-}
-
-int64_t KeyEncoder::KeyRowArray::size_rows_varying_length(int64_t num_bytes) {
- return num_bytes + padding_for_vectors;
-}
-
-void KeyEncoder::KeyRowArray::update_buffer_pointers() {
- buffers_[0] = mutable_buffers_[0] = null_masks_->mutable_data();
- if (metadata_.is_fixed_length) {
- buffers_[1] = mutable_buffers_[1] = rows_->mutable_data();
- buffers_[2] = mutable_buffers_[2] = nullptr;
- } else {
- buffers_[1] = mutable_buffers_[1] = offsets_->mutable_data();
- buffers_[2] = mutable_buffers_[2] = rows_->mutable_data();
- }
-}
-
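-// Grows the row capacity by doubling until num_rows_ + num_extra_rows fits,
-// then zero-fills only the newly added tail of each resized buffer.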
-Status KeyEncoder::KeyRowArray::ResizeFixedLengthBuffers(int64_t num_extra_rows) {
- if (rows_capacity_ >= num_rows_ + num_extra_rows) {
- return Status::OK();
- }
-
- int64_t rows_capacity_new = std::max(static_cast<int64_t>(1), 2 * rows_capacity_);
- while (rows_capacity_new < num_rows_ + num_extra_rows) {
- rows_capacity_new *= 2;
- }
-
- // Null masks
- RETURN_NOT_OK(null_masks_->Resize(size_null_masks(rows_capacity_new), false));
- memset(null_masks_->mutable_data() + size_null_masks(rows_capacity_), 0,
- size_null_masks(rows_capacity_new) - size_null_masks(rows_capacity_));
-
- // Either offsets or rows
- if (!metadata_.is_fixed_length) {
- RETURN_NOT_OK(offsets_->Resize(size_offsets(rows_capacity_new), false));
- memset(offsets_->mutable_data() + size_offsets(rows_capacity_), 0,
- size_offsets(rows_capacity_new) - size_offsets(rows_capacity_));
- } else {
- RETURN_NOT_OK(rows_->Resize(size_rows_fixed_length(rows_capacity_new), false));
- memset(rows_->mutable_data() + size_rows_fixed_length(rows_capacity_), 0,
- size_rows_fixed_length(rows_capacity_new) -
- size_rows_fixed_length(rows_capacity_));
- bytes_capacity_ = size_rows_fixed_length(rows_capacity_new) - padding_for_vectors;
- }
-
- update_buffer_pointers();
-
- rows_capacity_ = rows_capacity_new;
-
- return Status::OK();
-}
-
-Status KeyEncoder::KeyRowArray::ResizeOptionalVaryingLengthBuffer(
- int64_t num_extra_bytes) {
- int64_t num_bytes = offsets()[num_rows_];
- if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) {
- return Status::OK();
- }
-
- int64_t bytes_capacity_new = std::max(static_cast<int64_t>(1), 2 * bytes_capacity_);
- while (bytes_capacity_new < num_bytes + num_extra_bytes) {
- bytes_capacity_new *= 2;
- }
-
- RETURN_NOT_OK(rows_->Resize(size_rows_varying_length(bytes_capacity_new), false));
- memset(rows_->mutable_data() + size_rows_varying_length(bytes_capacity_), 0,
- size_rows_varying_length(bytes_capacity_new) -
- size_rows_varying_length(bytes_capacity_));
-
- update_buffer_pointers();
-
- bytes_capacity_ = bytes_capacity_new;
-
- return Status::OK();
-}
-
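-// Gathers rows from a layout-compatible array by 16-bit row id. Row bytes are
-// copied in 64-bit stripes, rounding the length up to a multiple of 8; this
-// relies on the tail padding reserved by the size_* helpers above.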
-Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from,
- uint32_t num_rows_to_append,
- const uint16_t* source_row_ids) {
- DCHECK(metadata_.is_compatible(from.metadata()));
-
- RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
-
- if (!metadata_.is_fixed_length) {
- // Varying-length rows
- auto from_offsets = reinterpret_cast<const uint32_t*>(from.offsets_->data());
- auto to_offsets = reinterpret_cast<uint32_t*>(offsets_->mutable_data());
- uint32_t total_length = to_offsets[num_rows_];
- uint32_t total_length_to_append = 0;
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint16_t row_id = source_row_ids[i];
- uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
- total_length_to_append += length;
- to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append;
- }
-
- RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append));
-
- const uint8_t* src = from.rows_->data();
- uint8_t* dst = rows_->mutable_data() + total_length;
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint16_t row_id = source_row_ids[i];
- uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
- auto src64 = reinterpret_cast<const uint64_t*>(src + from_offsets[row_id]);
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
- dst64[j] = src64[j];
- }
- dst += length;
- }
- } else {
- // Fixed-length rows
- const uint8_t* src = from.rows_->data();
- uint8_t* dst = rows_->mutable_data() + num_rows_ * metadata_.fixed_length;
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint16_t row_id = source_row_ids[i];
- uint32_t length = metadata_.fixed_length;
- auto src64 = reinterpret_cast<const uint64_t*>(src + length * row_id);
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
- dst64[j] = src64[j];
- }
- dst += length;
- }
- }
-
- // Null masks
- uint32_t byte_length = metadata_.null_masks_bytes_per_row;
- uint64_t dst_byte_offset = num_rows_ * byte_length;
- const uint8_t* src_base = from.null_masks_->data();
- uint8_t* dst_base = null_masks_->mutable_data();
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint32_t row_id = source_row_ids[i];
- int64_t src_byte_offset = row_id * byte_length;
- const uint8_t* src = src_base + src_byte_offset;
- uint8_t* dst = dst_base + dst_byte_offset;
- for (uint32_t ibyte = 0; ibyte < byte_length; ++ibyte) {
- dst[ibyte] = src[ibyte];
- }
- dst_byte_offset += byte_length;
- }
-
- num_rows_ += num_rows_to_append;
-
- return Status::OK();
-}
-
-Status KeyEncoder::KeyRowArray::AppendEmpty(uint32_t num_rows_to_append,
- uint32_t num_extra_bytes_to_append) {
- RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
- RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append));
- num_rows_ += num_rows_to_append;
- if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) {
- memset(rows_->mutable_data(), 0, bytes_capacity_);
- }
- return Status::OK();
-}
-
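-// Lazily maintained: only the rows appended since the previous call are
-// scanned for set bits in their null masks.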
-bool KeyEncoder::KeyRowArray::has_any_nulls(const KeyEncoderContext* ctx) const {
- if (has_any_nulls_) {
- return true;
- }
- if (num_rows_for_has_any_nulls_ < num_rows_) {
- auto size_per_row = metadata().null_masks_bytes_per_row;
- has_any_nulls_ = !util::BitUtil::are_all_bytes_zero(
- ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_,
- static_cast<uint32_t>(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_)));
- num_rows_for_has_any_nulls_ = num_rows_;
- }
- return has_any_nulls_;
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
- const KeyColumnArray& left,
- const KeyColumnArray& right,
- int buffer_id_to_replace) {
- metadata_ = metadata;
- length_ = left.length();
- for (int i = 0; i < max_buffers_; ++i) {
- buffers_[i] = left.buffers_[i];
- mutable_buffers_[i] = left.mutable_buffers_[i];
- }
- buffers_[buffer_id_to_replace] = right.buffers_[buffer_id_to_replace];
- mutable_buffers_[buffer_id_to_replace] = right.mutable_buffers_[buffer_id_to_replace];
- bit_offset_[0] = left.bit_offset_[0];
- bit_offset_[1] = left.bit_offset_[1];
- if (buffer_id_to_replace < max_buffers_ - 1) {
- bit_offset_[buffer_id_to_replace] = right.bit_offset_[buffer_id_to_replace];
- }
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
- int64_t length, const uint8_t* buffer0,
- const uint8_t* buffer1, const uint8_t* buffer2,
- int bit_offset0, int bit_offset1) {
- metadata_ = metadata;
- length_ = length;
- buffers_[0] = buffer0;
- buffers_[1] = buffer1;
- buffers_[2] = buffer2;
- mutable_buffers_[0] = mutable_buffers_[1] = mutable_buffers_[2] = nullptr;
- bit_offset_[0] = bit_offset0;
- bit_offset_[1] = bit_offset1;
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
- int64_t length, uint8_t* buffer0,
- uint8_t* buffer1, uint8_t* buffer2,
- int bit_offset0, int bit_offset1) {
- metadata_ = metadata;
- length_ = length;
- buffers_[0] = mutable_buffers_[0] = buffer0;
- buffers_[1] = mutable_buffers_[1] = buffer1;
- buffers_[2] = mutable_buffers_[2] = buffer2;
- bit_offset_[0] = bit_offset0;
- bit_offset_[1] = bit_offset1;
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnArray& from, int64_t start,
- int64_t length) {
- metadata_ = from.metadata_;
- length_ = length;
- uint32_t fixed_size =
- !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
-
- buffers_[0] =
- from.buffers_[0] ? from.buffers_[0] + (from.bit_offset_[0] + start) / 8 : nullptr;
- mutable_buffers_[0] = from.mutable_buffers_[0]
- ? from.mutable_buffers_[0] + (from.bit_offset_[0] + start) / 8
- : nullptr;
- bit_offset_[0] = (from.bit_offset_[0] + start) % 8;
-
- if (fixed_size == 0) {
- buffers_[1] =
- from.buffers_[1] ? from.buffers_[1] + (from.bit_offset_[1] + start) / 8 : nullptr;
- mutable_buffers_[1] = from.mutable_buffers_[1] ? from.mutable_buffers_[1] +
- (from.bit_offset_[1] + start) / 8
- : nullptr;
- bit_offset_[1] = (from.bit_offset_[1] + start) % 8;
- } else {
- buffers_[1] = from.buffers_[1] ? from.buffers_[1] + start * fixed_size : nullptr;
- mutable_buffers_[1] = from.mutable_buffers_[1]
- ? from.mutable_buffers_[1] + start * fixed_size
- : nullptr;
- bit_offset_[1] = 0;
- }
-
- buffers_[2] = from.buffers_[2];
- mutable_buffers_[2] = from.mutable_buffers_[2];
-}
-
-KeyEncoder::KeyColumnArray KeyEncoder::TransformBoolean::ArrayReplace(
- const KeyColumnArray& column, const KeyColumnArray& temp) {
- // Make sure that the temp buffer is large enough
- DCHECK(temp.length() >= column.length() && temp.metadata().is_fixed_length &&
- temp.metadata().fixed_length >= sizeof(uint8_t));
- KeyColumnMetadata metadata;
- metadata.is_fixed_length = true;
- metadata.fixed_length = sizeof(uint8_t);
- constexpr int buffer_index = 1;
- KeyColumnArray result = KeyColumnArray(metadata, column, temp, buffer_index);
- return result;
-}
-
-void KeyEncoder::TransformBoolean::PreEncode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- // Make sure that metadata and lengths are compatible.
- DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
- DCHECK(output->metadata().fixed_length == 1 && input.metadata().fixed_length == 0);
- DCHECK(output->length() == input.length());
- constexpr int buffer_index = 1;
- DCHECK(input.data(buffer_index) != nullptr);
- DCHECK(output->mutable_data(buffer_index) != nullptr);
- util::BitUtil::bits_to_bytes(
- ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
- output->mutable_data(buffer_index), input.bit_offset(buffer_index));
-}
-
-void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- // Make sure that metadata and lengths are compatible.
- DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
- DCHECK(output->metadata().fixed_length == 0 && input.metadata().fixed_length == 1);
- DCHECK(output->length() == input.length());
- constexpr int buffer_index = 1;
- DCHECK(input.data(buffer_index) != nullptr);
- DCHECK(output->mutable_data(buffer_index) != nullptr);
-
- util::BitUtil::bytes_to_bits(
- ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
- output->mutable_data(buffer_index), output->bit_offset(buffer_index));
-}
-
-bool KeyEncoder::EncoderInteger::IsBoolean(const KeyColumnMetadata& metadata) {
- return metadata.is_fixed_length && metadata.fixed_length == 0;
-}
-
-bool KeyEncoder::EncoderInteger::UsesTransform(const KeyColumnArray& column) {
- return IsBoolean(column.metadata());
-}
-
-KeyEncoder::KeyColumnArray KeyEncoder::EncoderInteger::ArrayReplace(
- const KeyColumnArray& column, const KeyColumnArray& temp) {
- if (IsBoolean(column.metadata())) {
- return TransformBoolean::ArrayReplace(column, temp);
- }
- return column;
-}
-
-void KeyEncoder::EncoderInteger::PreEncode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- if (IsBoolean(input.metadata())) {
- TransformBoolean::PreEncode(input, output, ctx);
- }
-}
-
-void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- if (IsBoolean(output->metadata())) {
- TransformBoolean::PostDecode(input, output, ctx);
- }
-}
-
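-// Three store paths: a single memcpy when the row consists of exactly one
-// fixed-length column, strided stores for fixed-length rows, and stores at
-// per-row offsets for varying-length rows.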
-void KeyEncoder::EncoderInteger::Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp) {
- KeyColumnArray col_prep;
- if (UsesTransform(col)) {
- col_prep = ArrayReplace(col, *temp);
- PreEncode(col, &col_prep, ctx);
- } else {
- col_prep = col;
- }
-
- const auto num_rows = static_cast<uint32_t>(col.length());
-
- // When the row consists of a single fixed-length column we can just memcpy
- if (rows->metadata().is_fixed_length &&
- rows->metadata().fixed_length == col.metadata().fixed_length) {
- DCHECK_EQ(offset_within_row, 0);
- uint32_t row_size = col.metadata().fixed_length;
- memcpy(rows->mutable_data(1), col.data(1), num_rows * row_size);
- } else if (rows->metadata().is_fixed_length) {
- uint32_t row_size = rows->metadata().fixed_length;
- uint8_t* row_base = rows->mutable_data(1) + offset_within_row;
- const uint8_t* col_base = col_prep.data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- row_base[i * row_size] = col_base[i];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint16_t*>(row_base + i * row_size) =
- reinterpret_cast<const uint16_t*>(col_base)[i];
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint32_t*>(row_base + i * row_size) =
- reinterpret_cast<const uint32_t*>(col_base)[i];
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint64_t*>(row_base + i * row_size) =
- reinterpret_cast<const uint64_t*>(col_base)[i];
- }
- break;
- default:
- DCHECK(false);
- }
- } else {
- const uint32_t* row_offsets = rows->offsets();
- uint8_t* row_base = rows->mutable_data(2) + offset_within_row;
- const uint8_t* col_base = col_prep.data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- row_base[row_offsets[i]] = col_base[i];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint16_t*>(row_base + row_offsets[i]) =
- reinterpret_cast<const uint16_t*>(col_base)[i];
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint32_t*>(row_base + row_offsets[i]) =
- reinterpret_cast<const uint32_t*>(col_base)[i];
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint64_t*>(row_base + row_offsets[i]) =
- reinterpret_cast<const uint64_t*>(col_base)[i];
- }
- break;
- default:
- DCHECK(false);
- }
- }
-}
-
-void KeyEncoder::EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp) {
- KeyColumnArray col_prep;
- if (UsesTransform(*col)) {
- col_prep = ArrayReplace(*col, *temp);
- } else {
- col_prep = *col;
- }
-
- // When the row consists of a single fixed-length column we can just memcpy
- if (rows.metadata().is_fixed_length &&
- col_prep.metadata().fixed_length == rows.metadata().fixed_length) {
- DCHECK_EQ(offset_within_row, 0);
- uint32_t row_size = rows.metadata().fixed_length;
- memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size,
- num_rows * row_size);
- } else if (rows.metadata().is_fixed_length) {
- uint32_t row_size = rows.metadata().fixed_length;
- const uint8_t* row_base = rows.data(1) + start_row * row_size;
- row_base += offset_within_row;
- uint8_t* col_base = col_prep.mutable_data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- col_base[i] = row_base[i * row_size];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint16_t*>(col_base)[i] =
- *reinterpret_cast<const uint16_t*>(row_base + i * row_size);
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint32_t*>(col_base)[i] =
- *reinterpret_cast<const uint32_t*>(row_base + i * row_size);
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint64_t*>(col_base)[i] =
- *reinterpret_cast<const uint64_t*>(row_base + i * row_size);
- }
- break;
- default:
- DCHECK(false);
- }
- } else {
- const uint32_t* row_offsets = rows.offsets() + start_row;
- const uint8_t* row_base = rows.data(2);
- row_base += offset_within_row;
- uint8_t* col_base = col_prep.mutable_data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- col_base[i] = row_base[row_offsets[i]];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint16_t*>(col_base)[i] =
- *reinterpret_cast<const uint16_t*>(row_base + row_offsets[i]);
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint32_t*>(col_base)[i] =
- *reinterpret_cast<const uint32_t*>(row_base + row_offsets[i]);
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint64_t*>(col_base)[i] =
- *reinterpret_cast<const uint64_t*>(row_base + row_offsets[i]);
- }
- break;
- default:
- DCHECK(false);
- }
- }
-
- if (UsesTransform(*col)) {
- PostDecode(col_prep, col, ctx);
- }
-}
-
-bool KeyEncoder::EncoderBinary::IsInteger(const KeyColumnMetadata& metadata) {
- bool is_fixed_length = metadata.is_fixed_length;
- auto size = metadata.fixed_length;
- return is_fixed_length &&
- (size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
-}
-
-void KeyEncoder::EncoderBinary::Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp) {
- if (IsInteger(col.metadata())) {
- EncoderInteger::Encode(offset_within_row, rows, col, ctx, temp);
- } else {
- KeyColumnArray col_prep;
- if (EncoderInteger::UsesTransform(col)) {
- col_prep = EncoderInteger::ArrayReplace(col, *temp);
- EncoderInteger::PreEncode(col, &col_prep, ctx);
- } else {
- col_prep = col;
- }
-
- bool is_row_fixed_length = rows->metadata().is_fixed_length;
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- EncodeHelper_avx2(is_row_fixed_length, offset_within_row, rows, col);
- } else {
-#endif
- if (is_row_fixed_length) {
- EncodeImp<true>(offset_within_row, rows, col);
- } else {
- EncodeImp<false>(offset_within_row, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
- }
-
- DCHECK(temp->metadata().is_fixed_length);
- DCHECK(temp->length() * temp->metadata().fixed_length >=
- col.length() * static_cast<int64_t>(sizeof(uint16_t)));
-
- KeyColumnArray temp16bit(KeyColumnMetadata(true, sizeof(uint16_t)), col.length(),
- nullptr, temp->mutable_data(1), nullptr);
- ColumnMemsetNulls(offset_within_row, rows, col, ctx, &temp16bit, 0xae);
-}
-
-void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp) {
- if (IsInteger(col->metadata())) {
- EncoderInteger::Decode(start_row, num_rows, offset_within_row, rows, col, ctx, temp);
- } else {
- KeyColumnArray col_prep;
- if (EncoderInteger::UsesTransform(*col)) {
- col_prep = EncoderInteger::ArrayReplace(*col, *temp);
- } else {
- col_prep = *col;
- }
-
- bool is_row_fixed_length = rows.metadata().is_fixed_length;
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows,
- col);
- } else {
-#endif
- if (is_row_fixed_length) {
- DecodeImp<true>(start_row, num_rows, offset_within_row, rows, col);
- } else {
- DecodeImp<false>(start_row, num_rows, offset_within_row, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-
- if (EncoderInteger::UsesTransform(*col)) {
- EncoderInteger::PostDecode(col_prep, col, ctx);
- }
- }
-}
-
-template <bool is_row_fixed_length>
-void KeyEncoder::EncoderBinary::EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col) {
- EncodeDecodeHelper<is_row_fixed_length, true>(
- 0, static_cast<uint32_t>(col.length()), offset_within_row, rows, rows, &col,
- nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- uint32_t istripe;
- for (istripe = 0; istripe < length / 8; ++istripe) {
- dst64[istripe] = util::SafeLoad(src64 + istripe);
- }
- if ((length % 8) > 0) {
- uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
- dst64[istripe] = (dst64[istripe] & ~mask_last) |
- (util::SafeLoad(src64 + istripe) & mask_last);
- }
- });
-}
-
-template <bool is_row_fixed_length>
-void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col) {
- EncodeDecodeHelper<is_row_fixed_length, false>(
- start_row, num_rows, offset_within_row, &rows, nullptr, col, col,
- [](uint8_t* dst, const uint8_t* src, int64_t length) {
- for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- util::SafeStore(dst64 + istripe, src64[istripe]);
- }
- });
-}
-
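-// Overwrites the field bytes of null entries with a fixed byte pattern (0xae
-// at the call site above), presumably so that comparisons of encoded rows see
-// deterministic bytes for null fields. Dispatch is over row layout and column
-// width via a table of template instantiations.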
-void KeyEncoder::EncoderBinary::ColumnMemsetNulls(
- uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
- KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
- using ColumnMemsetNullsImp_t = void (*)(uint32_t, KeyRowArray*, const KeyColumnArray&,
- KeyEncoderContext*, KeyColumnArray*, uint8_t);
- static const ColumnMemsetNullsImp_t ColumnMemsetNullsImp_fn[] = {
- ColumnMemsetNullsImp<false, 1>, ColumnMemsetNullsImp<false, 2>,
- ColumnMemsetNullsImp<false, 4>, ColumnMemsetNullsImp<false, 8>,
- ColumnMemsetNullsImp<false, 16>, ColumnMemsetNullsImp<true, 1>,
- ColumnMemsetNullsImp<true, 2>, ColumnMemsetNullsImp<true, 4>,
- ColumnMemsetNullsImp<true, 8>, ColumnMemsetNullsImp<true, 16>};
- uint32_t col_width = col.metadata().fixed_length;
- int dispatch_const =
-     (rows->metadata().is_fixed_length ? 5 : 0) + (col_width == 1   ? 0
-                                                   : col_width == 2 ? 1
-                                                   : col_width == 4 ? 2
-                                                   : col_width == 8 ? 3
-                                                                    : 4);
- ColumnMemsetNullsImp_fn[dispatch_const](offset_within_row, rows, col, ctx,
- temp_vector_16bit, byte_value);
-}
-
-template <bool is_row_fixed_length, uint32_t col_width>
-void KeyEncoder::EncoderBinary::ColumnMemsetNullsImp(
- uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
- KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
- // Nothing to do when there are no nulls
- if (!col.data(0)) {
- return;
- }
-
- const auto num_rows = static_cast<uint32_t>(col.length());
-
- // Temp vector needs space for the required number of rows
- DCHECK(temp_vector_16bit->length() >= num_rows);
- DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
- temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
- auto temp_vector = reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1));
-
- // Bit vector to index vector of null positions
- int num_selected;
- util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, static_cast<int>(col.length()),
- col.data(0), &num_selected, temp_vector,
- col.bit_offset(0));
-
- for (int i = 0; i < num_selected; ++i) {
- uint32_t row_id = temp_vector[i];
-
- // Target binary field pointer
- uint8_t* dst;
- if (is_row_fixed_length) {
- dst = rows->mutable_data(1) + rows->metadata().fixed_length * row_id;
- } else {
- dst = rows->mutable_data(2) + rows->offsets()[row_id];
- }
- dst += offset_within_row;
-
- if (col_width == 1) {
- *dst = byte_value;
- } else if (col_width == 2) {
- *reinterpret_cast<uint16_t*>(dst) =
- (static_cast<uint16_t>(byte_value) * static_cast<uint16_t>(0x0101));
- } else if (col_width == 4) {
- *reinterpret_cast<uint32_t*>(dst) =
- (static_cast<uint32_t>(byte_value) * static_cast<uint32_t>(0x01010101));
- } else if (col_width == 8) {
- *reinterpret_cast<uint64_t*>(dst) =
- (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
- } else {
- uint64_t value = (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
- uint32_t col_width_actual = col.metadata().fixed_length;
- uint32_t j;
- for (j = 0; j < col_width_actual / 8; ++j) {
- reinterpret_cast<uint64_t*>(dst)[j] = value;
- }
- int tail = col_width_actual % 8;
- if (tail) {
- uint64_t mask = ~0ULL >> (8 * (8 - tail));
- reinterpret_cast<uint64_t*>(dst)[j] =
- (reinterpret_cast<const uint64_t*>(dst)[j] & ~mask) | (value & mask);
- }
- }
- }
-}
-
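-// Encodes two adjacent fixed-length columns in a single pass over the rows.
-// The dispatch table below covers all 4 x 4 combinations of 1/2/4/8-byte
-// column widths, for both fixed- and varying-length row layouts (32 entries).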
-void KeyEncoder::EncoderBinaryPair::Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2,
- KeyEncoderContext* ctx, KeyColumnArray* temp1,
- KeyColumnArray* temp2) {
- DCHECK(CanProcessPair(col1.metadata(), col2.metadata()));
-
- KeyColumnArray col_prep[2];
- if (EncoderInteger::UsesTransform(col1)) {
- col_prep[0] = EncoderInteger::ArrayReplace(col1, *temp1);
- EncoderInteger::PreEncode(col1, &(col_prep[0]), ctx);
- } else {
- col_prep[0] = col1;
- }
- if (EncoderInteger::UsesTransform(col2)) {
- col_prep[1] = EncoderInteger::ArrayReplace(col2, *temp2);
- EncoderInteger::PreEncode(col2, &(col_prep[1]), ctx);
- } else {
- col_prep[1] = col2;
- }
-
- uint32_t col_width1 = col_prep[0].metadata().fixed_length;
- uint32_t col_width2 = col_prep[1].metadata().fixed_length;
- int log_col_width1 =
- col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
- int log_col_width2 =
- col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
-
- bool is_row_fixed_length = rows->metadata().is_fixed_length;
-
- const auto num_rows = static_cast<uint32_t>(col1.length());
- uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && col_width1 == col_width2) {
- num_processed = EncodeHelper_avx2(is_row_fixed_length, col_width1, offset_within_row,
- rows, col_prep[0], col_prep[1]);
- }
-#endif
- if (num_processed < num_rows) {
- using EncodeImp_t = void (*)(uint32_t, uint32_t, KeyRowArray*, const KeyColumnArray&,
- const KeyColumnArray&);
- static const EncodeImp_t EncodeImp_fn[] = {
- EncodeImp<false, uint8_t, uint8_t>, EncodeImp<false, uint16_t, uint8_t>,
- EncodeImp<false, uint32_t, uint8_t>, EncodeImp<false, uint64_t, uint8_t>,
- EncodeImp<false, uint8_t, uint16_t>, EncodeImp<false, uint16_t, uint16_t>,
- EncodeImp<false, uint32_t, uint16_t>, EncodeImp<false, uint64_t, uint16_t>,
- EncodeImp<false, uint8_t, uint32_t>, EncodeImp<false, uint16_t, uint32_t>,
- EncodeImp<false, uint32_t, uint32_t>, EncodeImp<false, uint64_t, uint32_t>,
- EncodeImp<false, uint8_t, uint64_t>, EncodeImp<false, uint16_t, uint64_t>,
- EncodeImp<false, uint32_t, uint64_t>, EncodeImp<false, uint64_t, uint64_t>,
- EncodeImp<true, uint8_t, uint8_t>, EncodeImp<true, uint16_t, uint8_t>,
- EncodeImp<true, uint32_t, uint8_t>, EncodeImp<true, uint64_t, uint8_t>,
- EncodeImp<true, uint8_t, uint16_t>, EncodeImp<true, uint16_t, uint16_t>,
- EncodeImp<true, uint32_t, uint16_t>, EncodeImp<true, uint64_t, uint16_t>,
- EncodeImp<true, uint8_t, uint32_t>, EncodeImp<true, uint16_t, uint32_t>,
- EncodeImp<true, uint32_t, uint32_t>, EncodeImp<true, uint64_t, uint32_t>,
- EncodeImp<true, uint8_t, uint64_t>, EncodeImp<true, uint16_t, uint64_t>,
- EncodeImp<true, uint32_t, uint64_t>, EncodeImp<true, uint64_t, uint64_t>};
- int dispatch_const = (log_col_width2 << 2) | log_col_width1;
- dispatch_const += (is_row_fixed_length ? 16 : 0);
- EncodeImp_fn[dispatch_const](num_processed, offset_within_row, rows, col_prep[0],
- col_prep[1]);
- }
-}
-
-template <bool is_row_fixed_length, typename col1_type, typename col2_type>
-void KeyEncoder::EncoderBinaryPair::EncodeImp(uint32_t num_rows_to_skip,
- uint32_t offset_within_row,
- KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2) {
- const uint8_t* src_A = col1.data(1);
- const uint8_t* src_B = col2.data(1);
-
- const auto num_rows = static_cast<uint32_t>(col1.length());
-
- uint32_t fixed_length = rows->metadata().fixed_length;
- const uint32_t* offsets;
- uint8_t* dst_base;
- if (is_row_fixed_length) {
- dst_base = rows->mutable_data(1) + offset_within_row;
- offsets = nullptr;
- } else {
- dst_base = rows->mutable_data(2) + offset_within_row;
- offsets = rows->offsets();
- }
-
- using col1_type_const = typename std::add_const<col1_type>::type;
- using col2_type_const = typename std::add_const<col2_type>::type;
-
- if (is_row_fixed_length) {
- uint8_t* dst = dst_base + num_rows_to_skip * fixed_length;
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
- *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
- reinterpret_cast<col2_type_const*>(src_B)[i];
- dst += fixed_length;
- }
- } else {
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- uint8_t* dst = dst_base + offsets[i];
- *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
- *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
- reinterpret_cast<col2_type_const*>(src_B)[i];
- }
- }
-}
-
-void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col1,
- KeyColumnArray* col2, KeyEncoderContext* ctx,
- KeyColumnArray* temp1, KeyColumnArray* temp2) {
- DCHECK(CanProcessPair(col1->metadata(), col2->metadata()));
-
- KeyColumnArray col_prep[2];
- if (EncoderInteger::UsesTransform(*col1)) {
- col_prep[0] = EncoderInteger::ArrayReplace(*col1, *temp1);
- } else {
- col_prep[0] = *col1;
- }
- if (EncoderInteger::UsesTransform(*col2)) {
- col_prep[1] = EncoderInteger::ArrayReplace(*col2, *temp2);
- } else {
- col_prep[1] = *col2;
- }
-
- uint32_t col_width1 = col_prep[0].metadata().fixed_length;
- uint32_t col_width2 = col_prep[1].metadata().fixed_length;
- int log_col_width1 =
- col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
- int log_col_width2 =
- col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
-
- bool is_row_fixed_length = rows.metadata().is_fixed_length;
-
- uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && col_width1 == col_width2) {
- num_processed =
- DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows,
- offset_within_row, rows, &col_prep[0], &col_prep[1]);
- }
-#endif
- if (num_processed < num_rows) {
- using DecodeImp_t = void (*)(uint32_t, uint32_t, uint32_t, uint32_t,
- const KeyRowArray&, KeyColumnArray*, KeyColumnArray*);
- static const DecodeImp_t DecodeImp_fn[] = {
- DecodeImp<false, uint8_t, uint8_t>, DecodeImp<false, uint16_t, uint8_t>,
- DecodeImp<false, uint32_t, uint8_t>, DecodeImp<false, uint64_t, uint8_t>,
- DecodeImp<false, uint8_t, uint16_t>, DecodeImp<false, uint16_t, uint16_t>,
- DecodeImp<false, uint32_t, uint16_t>, DecodeImp<false, uint64_t, uint16_t>,
- DecodeImp<false, uint8_t, uint32_t>, DecodeImp<false, uint16_t, uint32_t>,
- DecodeImp<false, uint32_t, uint32_t>, DecodeImp<false, uint64_t, uint32_t>,
- DecodeImp<false, uint8_t, uint64_t>, DecodeImp<false, uint16_t, uint64_t>,
- DecodeImp<false, uint32_t, uint64_t>, DecodeImp<false, uint64_t, uint64_t>,
- DecodeImp<true, uint8_t, uint8_t>, DecodeImp<true, uint16_t, uint8_t>,
- DecodeImp<true, uint32_t, uint8_t>, DecodeImp<true, uint64_t, uint8_t>,
- DecodeImp<true, uint8_t, uint16_t>, DecodeImp<true, uint16_t, uint16_t>,
- DecodeImp<true, uint32_t, uint16_t>, DecodeImp<true, uint64_t, uint16_t>,
- DecodeImp<true, uint8_t, uint32_t>, DecodeImp<true, uint16_t, uint32_t>,
- DecodeImp<true, uint32_t, uint32_t>, DecodeImp<true, uint64_t, uint32_t>,
- DecodeImp<true, uint8_t, uint64_t>, DecodeImp<true, uint16_t, uint64_t>,
- DecodeImp<true, uint32_t, uint64_t>, DecodeImp<true, uint64_t, uint64_t>};
- int dispatch_const =
- (log_col_width2 << 2) | log_col_width1 | (is_row_fixed_length ? 16 : 0);
- DecodeImp_fn[dispatch_const](num_processed, start_row, num_rows, offset_within_row,
- rows, &(col_prep[0]), &(col_prep[1]));
- }
-
- if (EncoderInteger::UsesTransform(*col1)) {
- EncoderInteger::PostDecode(col_prep[0], col1, ctx);
- }
- if (EncoderInteger::UsesTransform(*col2)) {
- EncoderInteger::PostDecode(col_prep[1], col2, ctx);
- }
-}
-
-template <bool is_row_fixed_length, typename col1_type, typename col2_type>
-void KeyEncoder::EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip,
- uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows,
- KeyColumnArray* col1,
- KeyColumnArray* col2) {
- DCHECK(rows.length() >= start_row + num_rows);
- DCHECK(col1->length() == num_rows && col2->length() == num_rows);
-
- uint8_t* dst_A = col1->mutable_data(1);
- uint8_t* dst_B = col2->mutable_data(1);
-
- uint32_t fixed_length = rows.metadata().fixed_length;
- const uint32_t* offsets;
- const uint8_t* src_base;
- if (is_row_fixed_length) {
- src_base = rows.data(1) + fixed_length * start_row + offset_within_row;
- offsets = nullptr;
- } else {
- src_base = rows.data(2) + offset_within_row;
- offsets = rows.offsets() + start_row;
- }
-
- using col1_type_const = typename std::add_const<col1_type>::type;
- using col2_type_const = typename std::add_const<col2_type>::type;
-
- if (is_row_fixed_length) {
- const uint8_t* src = src_base + num_rows_to_skip * fixed_length;
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
- reinterpret_cast<col2_type*>(dst_B)[i] =
- *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
- src += fixed_length;
- }
- } else {
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- const uint8_t* src = src_base + offsets[i];
- reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
- reinterpret_cast<col2_type*>(dst_B)[i] =
- *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
- }
- }
-}
-
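-// Computes, in one pass, both the per-row offsets into the encoded row buffer
-// and the per-row array of cumulative varbinary field ends, honoring the
-// configured string and row alignment.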
-void KeyEncoder::EncoderOffsets::Encode(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols,
- KeyEncoderContext* ctx) {
- DCHECK(!varbinary_cols.empty());
-
- // Rows and columns must all be varying-length
- DCHECK(!rows->metadata().is_fixed_length);
- for (const auto& col : varbinary_cols) {
- DCHECK(!col.metadata().is_fixed_length);
- }
-
- const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
-
- uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- // Whether any column has a non-zero starting bit offset for its non-nulls bit vector
- bool has_bit_offset = false;
-
- // The space in the columns must exactly match the space for offsets in the rows
- DCHECK(rows->length() == num_rows);
- for (const auto& col : varbinary_cols) {
- DCHECK(col.length() == num_rows);
- if (col.bit_offset(0) != 0) {
- has_bit_offset = true;
- }
- }
-
- if (ctx->has_avx2() && !has_bit_offset) {
- // Create a temp vector sized based on the number of columns
- auto temp_buffer_holder = util::TempVectorHolder<uint32_t>(
- ctx->stack, static_cast<uint32_t>(varbinary_cols.size()) * 8);
- auto temp_buffer_32B_per_col = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint32_t)), varbinary_cols.size() * 8, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder.mutable_data()), nullptr);
-
- num_processed = EncodeImp_avx2(rows, varbinary_cols, &temp_buffer_32B_per_col);
- }
-#endif
- if (num_processed < num_rows) {
- EncodeImp(num_processed, rows, varbinary_cols);
- }
-}
-
-void KeyEncoder::EncoderOffsets::EncodeImp(
- uint32_t num_rows_already_processed, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols) {
- DCHECK_GT(varbinary_cols.size(), 0);
-
- int row_alignment = rows->metadata().row_alignment;
- int string_alignment = rows->metadata().string_alignment;
-
- uint32_t* row_offsets = rows->mutable_offsets();
- uint8_t* row_values = rows->mutable_data(2);
- const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
-
- if (num_rows_already_processed == 0) {
- row_offsets[0] = 0;
- }
-
- uint32_t row_offset = row_offsets[num_rows_already_processed];
- for (uint32_t i = num_rows_already_processed; i < num_rows; ++i) {
- uint32_t* varbinary_end =
- rows->metadata().varbinary_end_array(row_values + row_offset);
-
- // Zero out lengths for nulls.
- // Add lengths of all columns to get row size.
- // Store varbinary field ends while summing their lengths.
-
- uint32_t offset_within_row = rows->metadata().fixed_length;
-
- for (size_t col = 0; col < varbinary_cols.size(); ++col) {
- const uint32_t* col_offsets = varbinary_cols[col].offsets();
- uint32_t col_length = col_offsets[i + 1] - col_offsets[i];
-
- const int bit_offset = varbinary_cols[col].bit_offset(0);
-
- const uint8_t* non_nulls = varbinary_cols[col].data(0);
- if (non_nulls && BitUtil::GetBit(non_nulls, bit_offset + i) == 0) {
- col_length = 0;
- }
-
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
- offset_within_row += col_length;
-
- varbinary_end[col] = offset_within_row;
- }
-
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, row_alignment);
- row_offset += offset_within_row;
- row_offsets[i + 1] = row_offset;
- }
-}
-
-void KeyEncoder::EncoderOffsets::Decode(
- uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* varbinary_cols,
- const std::vector<uint32_t>& varbinary_cols_base_offset, KeyEncoderContext* ctx) {
- DCHECK(!varbinary_cols->empty());
- DCHECK(varbinary_cols->size() == varbinary_cols_base_offset.size());
-
- DCHECK(!rows.metadata().is_fixed_length);
- DCHECK(rows.length() >= start_row + num_rows);
- for (const auto& col : *varbinary_cols) {
- // Rows and columns must all be varying-length
- DCHECK(!col.metadata().is_fixed_length);
- // The space in columns must be exactly equal to a subset of rows selected
- DCHECK(col.length() == num_rows);
- }
-
- // Offsets of varbinary column data within each encoded row are stored in the
- // row itself as an array of 32-bit integers, immediately following the data
- // of the fixed-length columns. There is one element per varying-length
- // column: the Nth element is the sum of the lengths of the varbinary values
- // in that row, up to and including the Nth varbinary column.
-
- const uint32_t* row_offsets = rows.offsets() + start_row;
-
- // Set the base offset for each column
- for (size_t col = 0; col < varbinary_cols->size(); ++col) {
- uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
- col_offsets[0] = varbinary_cols_base_offset[col];
- }
-
- int string_alignment = rows.metadata().string_alignment;
-
- for (uint32_t i = 0; i < num_rows; ++i) {
- // Find the beginning of the cumulative lengths array for the next row
- const uint8_t* row = rows.data(2) + row_offsets[i];
- const uint32_t* varbinary_ends = rows.metadata().varbinary_end_array(row);
-
- // Update the offset of each column
- uint32_t offset_within_row = rows.metadata().fixed_length;
- for (size_t col = 0; col < varbinary_cols->size(); ++col) {
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
- uint32_t length = varbinary_ends[col] - offset_within_row;
- offset_within_row = varbinary_ends[col];
- uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
- col_offsets[i + 1] = col_offsets[i] + length;
- }
- }
-}
-
-void KeyEncoder::EncoderVarBinary::Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col,
- KeyEncoderContext* ctx) {
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- EncodeHelper_avx2(varbinary_col_id, rows, col);
- } else {
-#endif
- if (varbinary_col_id == 0) {
- EncodeImp<true>(varbinary_col_id, rows, col);
- } else {
- EncodeImp<false>(varbinary_col_id, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx) {
- // The output column's varbinary buffer needs an extra 32B at the end
- // in the AVX2 version and an extra 8B otherwise.
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col);
- } else {
-#endif
- if (varbinary_col_id == 0) {
- DecodeImp<true>(start_row, num_rows, varbinary_col_id, rows, col);
- } else {
- DecodeImp<false>(start_row, num_rows, varbinary_col_id, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-template <bool first_varbinary_col>
-void KeyEncoder::EncoderVarBinary::EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col) {
- EncodeDecodeHelper<first_varbinary_col, true>(
- 0, static_cast<uint32_t>(col.length()), varbinary_col_id, rows, rows, &col, nullptr,
- [](uint8_t* dst, const uint8_t* src, int64_t length) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- uint32_t istripe;
- for (istripe = 0; istripe < length / 8; ++istripe) {
- dst64[istripe] = util::SafeLoad(src64 + istripe);
- }
- if ((length % 8) > 0) {
- uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
- dst64[istripe] = (dst64[istripe] & ~mask_last) |
- (util::SafeLoad(src64 + istripe) & mask_last);
- }
- });
-}
-
-template <bool first_varbinary_col>
-void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id,
- const KeyRowArray& rows,
- KeyColumnArray* col) {
- EncodeDecodeHelper<first_varbinary_col, false>(
- start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col,
- [](uint8_t* dst, const uint8_t* src, int64_t length) {
- for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- util::SafeStore(dst64 + istripe, src64[istripe]);
- }
- });
-}
-
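-// Builds the per-row null masks: one bit per column, packed into
-// null_masks_bytes_per_row bytes per row. bits_to_indexes is used so that
-// only the null positions of each column are visited.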
-void KeyEncoder::EncoderNulls::Encode(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& cols,
- KeyEncoderContext* ctx,
- KeyColumnArray* temp_vector_16bit) {
- DCHECK_GT(cols.size(), 0);
- const auto num_rows = static_cast<uint32_t>(rows->length());
-
- // All input columns should have the same number of rows.
- // They may or may not have non-nulls bit-vectors allocated.
- for (const auto& col : cols) {
- DCHECK(col.length() == num_rows);
- }
-
- // Temp vector needs space for the required number of rows
- DCHECK(temp_vector_16bit->length() >= num_rows);
- DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
- temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
-
- uint8_t* null_masks = rows->null_masks();
- uint32_t null_masks_bytes_per_row = rows->metadata().null_masks_bytes_per_row;
- memset(null_masks, 0, null_masks_bytes_per_row * num_rows);
- for (size_t col = 0; col < cols.size(); ++col) {
- const uint8_t* non_nulls = cols[col].data(0);
- if (!non_nulls) {
- continue;
- }
- int bit_offset = cols[col].bit_offset(0);
- DCHECK_LT(bit_offset, 8);
- int num_selected;
- util::BitUtil::bits_to_indexes(
- 0, ctx->hardware_flags, num_rows, non_nulls, &num_selected,
- reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1)), bit_offset);
- for (int i = 0; i < num_selected; ++i) {
- uint16_t row_id = reinterpret_cast<const uint16_t*>(temp_vector_16bit->data(1))[i];
- int64_t null_masks_bit_id = row_id * null_masks_bytes_per_row * 8 + col;
- BitUtil::SetBit(null_masks, null_masks_bit_id);
- }
- }
-}
-
-void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows,
- const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols) {
- // Every output column needs space for exactly the required number of rows.
- // It also needs its non-nulls bit-vector allocated and mutable.
- DCHECK_GT(cols->size(), 0);
- for (auto& col : *cols) {
- DCHECK(col.length() == num_rows);
- DCHECK(col.mutable_data(0));
- }
-
- const uint8_t* null_masks = rows.null_masks();
- uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row;
- for (size_t col = 0; col < cols->size(); ++col) {
- uint8_t* non_nulls = (*cols)[col].mutable_data(0);
- const int bit_offset = (*cols)[col].bit_offset(0);
- DCHECK_LT(bit_offset, 8);
- non_nulls[0] |= 0xff << (bit_offset);
- if (bit_offset + num_rows > 8) {
- int bits_in_first_byte = 8 - bit_offset;
- memset(non_nulls + 1, 0xff, BitUtil::BytesForBits(num_rows - bits_in_first_byte));
- }
- for (uint32_t row = 0; row < num_rows; ++row) {
- uint32_t null_masks_bit_id =
- (start_row + row) * null_masks_bytes_per_row * 8 + static_cast<uint32_t>(col);
- bool is_set = BitUtil::GetBit(null_masks, null_masks_bit_id);
- if (is_set) {
- BitUtil::ClearBit(non_nulls, bit_offset + row);
- }
- }
- }
-}
-
-uint32_t KeyEncoder::KeyRowMetadata::num_varbinary_cols() const {
- uint32_t result = 0;
- for (auto column_metadata : column_metadatas) {
- if (!column_metadata.is_fixed_length) {
- ++result;
- }
- }
- return result;
-}
-
-bool KeyEncoder::KeyRowMetadata::is_compatible(const KeyRowMetadata& other) const {
- if (other.num_cols() != num_cols()) {
- return false;
- }
- if (row_alignment != other.row_alignment ||
- string_alignment != other.string_alignment) {
- return false;
- }
- for (size_t i = 0; i < column_metadatas.size(); ++i) {
- if (column_metadatas[i].is_fixed_length !=
- other.column_metadatas[i].is_fixed_length) {
- return false;
- }
- if (column_metadatas[i].fixed_length != other.column_metadatas[i].fixed_length) {
- return false;
- }
- }
- return true;
-}
-
-void KeyEncoder::KeyRowMetadata::FromColumnMetadataVector(
- const std::vector<KeyColumnMetadata>& cols, int in_row_alignment,
- int in_string_alignment) {
- column_metadatas.resize(cols.size());
- for (size_t i = 0; i < cols.size(); ++i) {
- column_metadatas[i] = cols[i];
- }
-
- const auto num_cols = static_cast<uint32_t>(cols.size());
-
- // Sort columns.
- // Columns are sorted based on the size in bytes of their fixed-length part.
- // For a varying-length column, the fixed-length part is the 32-bit field
- // storing the cumulative length of its varying-length fields.
- // The rules are:
- // a) A boolean column, marked with fixed-length 0, is treated as having a
- //    fixed-length part of 1 byte.
- // b) Columns whose fixed-length part is a power of 2, or a multiple of row
- //    alignment, precede other columns; among themselves they are ordered by
- //    the size of the fixed-length part.
- // c) Fixed-length columns precede varying-length columns when both have
- //    fixed-length parts of the same size.
- column_order.resize(num_cols);
- for (uint32_t i = 0; i < num_cols; ++i) {
- column_order[i] = i;
- }
- std::sort(
- column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) {
- bool is_left_pow2 =
- !cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1;
- bool is_right_pow2 = !cols[right].is_fixed_length ||
- ARROW_POPCOUNT64(cols[right].fixed_length) <= 1;
- bool is_left_fixedlen = cols[left].is_fixed_length;
- bool is_right_fixedlen = cols[right].is_fixed_length;
- uint32_t width_left =
- cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t);
- uint32_t width_right =
- cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t);
- if (is_left_pow2 != is_right_pow2) {
- return is_left_pow2;
- }
- if (!is_left_pow2) {
- return left < right;
- }
- if (width_left != width_right) {
- return width_left > width_right;
- }
- if (is_left_fixedlen != is_right_fixedlen) {
- return is_left_fixedlen;
- }
- return left < right;
- });
-
- row_alignment = in_row_alignment;
- string_alignment = in_string_alignment;
- varbinary_end_array_offset = 0;
-
- column_offsets.resize(num_cols);
- uint32_t num_varbinary_cols = 0;
- uint32_t offset_within_row = 0;
- for (uint32_t i = 0; i < num_cols; ++i) {
- const KeyColumnMetadata& col = cols[column_order[i]];
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col);
- column_offsets[i] = offset_within_row;
- if (!col.is_fixed_length) {
- if (num_varbinary_cols == 0) {
- varbinary_end_array_offset = offset_within_row;
- }
- DCHECK(column_offsets[i] - varbinary_end_array_offset ==
- num_varbinary_cols * sizeof(uint32_t));
- ++num_varbinary_cols;
- offset_within_row += sizeof(uint32_t);
- } else {
- // A boolean column is a bit-vector, which is indicated by a fixed length
- // of zero in the column metadata.
- // It is stored as one byte per value in the output row.
- if (col.fixed_length == 0) {
- offset_within_row += 1;
- } else {
- offset_within_row += col.fixed_length;
- }
- }
- }
-
- is_fixed_length = (num_varbinary_cols == 0);
- fixed_length =
- offset_within_row +
- KeyRowMetadata::padding_for_alignment(
- offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment);
-
- // We set the number of bytes per row storing the null masks of individual
- // key columns to be a power of two. This is not required; it could also be
- // set to the minimal number of bytes needed for the given number of bits
- // (one bit per column).
- null_masks_bytes_per_row = 1;
- while (static_cast<uint32_t>(null_masks_bytes_per_row * 8) < num_cols) {
- null_masks_bytes_per_row *= 2;
- }
-}
-
-void KeyEncoder::Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
- int row_alignment, int string_alignment) {
- ctx_ = ctx;
- row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment);
- uint32_t num_cols = row_metadata_.num_cols();
- uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols();
- batch_all_cols_.resize(num_cols);
- batch_varbinary_cols_.resize(num_varbinary_cols);
- batch_varbinary_cols_base_offsets_.resize(num_varbinary_cols);
-}
-
-void KeyEncoder::PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
- const std::vector<KeyColumnArray>& cols_in) {
- const auto num_cols = static_cast<uint32_t>(cols_in.size());
- DCHECK(batch_all_cols_.size() == num_cols);
-
- uint32_t num_varbinary_visited = 0;
- for (uint32_t i = 0; i < num_cols; ++i) {
- const KeyColumnArray& col = cols_in[row_metadata_.column_order[i]];
- KeyColumnArray col_window(col, start_row, num_rows);
- batch_all_cols_[i] = col_window;
- if (!col.metadata().is_fixed_length) {
- DCHECK(num_varbinary_visited < batch_varbinary_cols_.size());
- // If the start row is zero, then the base offset of the varbinary column is also zero.
- if (start_row == 0) {
- batch_varbinary_cols_base_offsets_[num_varbinary_visited] = 0;
- } else {
- batch_varbinary_cols_base_offsets_[num_varbinary_visited] =
- col.offsets()[start_row];
- }
- batch_varbinary_cols_[num_varbinary_visited++] = col_window;
- }
- }
-}
-
-Status KeyEncoder::PrepareOutputForEncode(int64_t start_row, int64_t num_rows,
- KeyRowArray* rows,
- const std::vector<KeyColumnArray>& all_cols) {
- int64_t num_bytes_required = 0;
-
- int64_t fixed_part = row_metadata_.fixed_length * num_rows;
- int64_t var_part = 0;
- for (const auto& col : all_cols) {
- if (!col.metadata().is_fixed_length) {
- DCHECK(col.length() >= start_row + num_rows);
- const uint32_t* offsets = col.offsets();
- var_part += offsets[start_row + num_rows] - offsets[start_row];
- // Include maximum padding that can be added to align the start of varbinary fields.
- var_part += num_rows * row_metadata_.string_alignment;
- }
- }
- // Include maximum padding that can be added to align the start of the rows.
- if (!row_metadata_.is_fixed_length) {
- fixed_part += row_metadata_.row_alignment * num_rows;
- }
- num_bytes_required = fixed_part + var_part;
-
- rows->Clean();
- RETURN_NOT_OK(rows->AppendEmpty(static_cast<uint32_t>(num_rows),
- static_cast<uint32_t>(num_bytes_required)));
-
- return Status::OK();
-}
-
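-// Encode pipeline: first the row offsets and varbinary field ends, then the
-// varbinary values themselves, then the fixed-length columns (two at a time
-// when EncoderBinaryPair can handle the pair), and finally the null masks.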
-void KeyEncoder::Encode(int64_t start_row, int64_t num_rows, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& cols) {
- // Prepare column array vectors
- PrepareKeyColumnArrays(start_row, num_rows, cols);
-
- // Create two temp vectors with 16-bit elements
- auto temp_buffer_holder_A =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_A = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
- auto temp_buffer_holder_B =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_B = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
-
- bool is_row_fixed_length = row_metadata_.is_fixed_length;
- if (!is_row_fixed_length) {
- // This call will generate and fill in data for both:
- // - offsets to the entire encoded arrays
- // - offsets for individual varbinary fields within each row
- EncoderOffsets::Encode(rows, batch_varbinary_cols_, ctx_);
-
- for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
-      // Memcpy varbinary fields into the positions in the output row buffer
-      // precomputed in the previous step.
- EncoderVarBinary::Encode(static_cast<uint32_t>(i), rows, batch_varbinary_cols_[i],
- ctx_);
- }
- }
-
- // Process fixed length columns
- const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
- for (uint32_t i = 0; i < num_cols;) {
- if (!batch_all_cols_[i].metadata().is_fixed_length) {
- i += 1;
- continue;
- }
- bool can_process_pair =
- (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
- EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
- batch_all_cols_[i + 1].metadata());
- if (!can_process_pair) {
- EncoderBinary::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
- ctx_, &temp_buffer_A);
- i += 1;
- } else {
- EncoderBinaryPair::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
- batch_all_cols_[i + 1], ctx_, &temp_buffer_A,
- &temp_buffer_B);
- i += 2;
- }
- }
-
- // Process nulls
- EncoderNulls::Encode(rows, batch_all_cols_, ctx_, &temp_buffer_A);
-}
-
-void KeyEncoder::DecodeFixedLengthBuffers(int64_t start_row_input,
- int64_t start_row_output, int64_t num_rows,
- const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols) {
- // Prepare column array vectors
- PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
-
- // Create two temp vectors with 16-bit elements
- auto temp_buffer_holder_A =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_A = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
- auto temp_buffer_holder_B =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_B = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
-
- bool is_row_fixed_length = row_metadata_.is_fixed_length;
- if (!is_row_fixed_length) {
- EncoderOffsets::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows), rows, &batch_varbinary_cols_,
- batch_varbinary_cols_base_offsets_, ctx_);
- }
-
- // Process fixed length columns
- const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
- for (uint32_t i = 0; i < num_cols;) {
- if (!batch_all_cols_[i].metadata().is_fixed_length) {
- i += 1;
- continue;
- }
- bool can_process_pair =
- (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
- EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
- batch_all_cols_[i + 1].metadata());
- if (!can_process_pair) {
- EncoderBinary::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows),
- row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
- ctx_, &temp_buffer_A);
- i += 1;
- } else {
- EncoderBinaryPair::Decode(
- static_cast<uint32_t>(start_row_input), static_cast<uint32_t>(num_rows),
- row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
- &batch_all_cols_[i + 1], ctx_, &temp_buffer_A, &temp_buffer_B);
- i += 2;
- }
- }
-
- // Process nulls
- EncoderNulls::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows), rows, &batch_all_cols_);
-}
-
-void KeyEncoder::DecodeVaryingLengthBuffers(int64_t start_row_input,
- int64_t start_row_output, int64_t num_rows,
- const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols) {
- // Prepare column array vectors
- PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
-
- bool is_row_fixed_length = row_metadata_.is_fixed_length;
- if (!is_row_fixed_length) {
- for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
-      // Memcpy varbinary fields into the positions in the output row buffer
-      // precomputed in the previous step.
- EncoderVarBinary::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows), static_cast<uint32_t>(i),
- rows, &batch_varbinary_cols_[i], ctx_);
- }
- }
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_encode.h"
+
+#include <memory.h>
+
+#include <algorithm>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace compute {
+
+KeyEncoder::KeyRowArray::KeyRowArray()
+ : pool_(nullptr), rows_capacity_(0), bytes_capacity_(0) {}
+
+Status KeyEncoder::KeyRowArray::Init(MemoryPool* pool, const KeyRowMetadata& metadata) {
+ pool_ = pool;
+ metadata_ = metadata;
+
+ DCHECK(!null_masks_ && !offsets_ && !rows_);
+
+ constexpr int64_t rows_capacity = 8;
+ constexpr int64_t bytes_capacity = 1024;
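+  // These initial capacities are grown geometrically (doubling) later by
+  // ResizeFixedLengthBuffers and ResizeOptionalVaryingLengthBuffer.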
+
+ // Null masks
+ ARROW_ASSIGN_OR_RAISE(auto null_masks,
+ AllocateResizableBuffer(size_null_masks(rows_capacity), pool_));
+ null_masks_ = std::move(null_masks);
+ memset(null_masks_->mutable_data(), 0, size_null_masks(rows_capacity));
+
+ // Offsets and rows
+ if (!metadata.is_fixed_length) {
+ ARROW_ASSIGN_OR_RAISE(auto offsets,
+ AllocateResizableBuffer(size_offsets(rows_capacity), pool_));
+ offsets_ = std::move(offsets);
+ memset(offsets_->mutable_data(), 0, size_offsets(rows_capacity));
+ reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
+
+ ARROW_ASSIGN_OR_RAISE(
+ auto rows,
+ AllocateResizableBuffer(size_rows_varying_length(bytes_capacity), pool_));
+ rows_ = std::move(rows);
+ memset(rows_->mutable_data(), 0, size_rows_varying_length(bytes_capacity));
+ bytes_capacity_ = size_rows_varying_length(bytes_capacity) - padding_for_vectors;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ auto rows, AllocateResizableBuffer(size_rows_fixed_length(rows_capacity), pool_));
+ rows_ = std::move(rows);
+ memset(rows_->mutable_data(), 0, size_rows_fixed_length(rows_capacity));
+ bytes_capacity_ = size_rows_fixed_length(rows_capacity) - padding_for_vectors;
+ }
+
+ update_buffer_pointers();
+
+ rows_capacity_ = rows_capacity;
+
+ num_rows_ = 0;
+ num_rows_for_has_any_nulls_ = 0;
+ has_any_nulls_ = false;
+
+ return Status::OK();
+}
+
+void KeyEncoder::KeyRowArray::Clean() {
+ num_rows_ = 0;
+ num_rows_for_has_any_nulls_ = 0;
+ has_any_nulls_ = false;
+
+ if (!metadata_.is_fixed_length) {
+ reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
+ }
+}
+
+int64_t KeyEncoder::KeyRowArray::size_null_masks(int64_t num_rows) {
+ return num_rows * metadata_.null_masks_bytes_per_row + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_offsets(int64_t num_rows) {
+ return (num_rows + 1) * sizeof(uint32_t) + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_rows_fixed_length(int64_t num_rows) {
+ return num_rows * metadata_.fixed_length + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_rows_varying_length(int64_t num_bytes) {
+ return num_bytes + padding_for_vectors;
+}
+
+void KeyEncoder::KeyRowArray::update_buffer_pointers() {
+ buffers_[0] = mutable_buffers_[0] = null_masks_->mutable_data();
+ if (metadata_.is_fixed_length) {
+ buffers_[1] = mutable_buffers_[1] = rows_->mutable_data();
+ buffers_[2] = mutable_buffers_[2] = nullptr;
+ } else {
+ buffers_[1] = mutable_buffers_[1] = offsets_->mutable_data();
+ buffers_[2] = mutable_buffers_[2] = rows_->mutable_data();
+ }
+}
+
+Status KeyEncoder::KeyRowArray::ResizeFixedLengthBuffers(int64_t num_extra_rows) {
+ if (rows_capacity_ >= num_rows_ + num_extra_rows) {
+ return Status::OK();
+ }
+
+ int64_t rows_capacity_new = std::max(static_cast<int64_t>(1), 2 * rows_capacity_);
+ while (rows_capacity_new < num_rows_ + num_extra_rows) {
+ rows_capacity_new *= 2;
+ }
+
+ // Null masks
+ RETURN_NOT_OK(null_masks_->Resize(size_null_masks(rows_capacity_new), false));
+ memset(null_masks_->mutable_data() + size_null_masks(rows_capacity_), 0,
+ size_null_masks(rows_capacity_new) - size_null_masks(rows_capacity_));
+
+ // Either offsets or rows
+ if (!metadata_.is_fixed_length) {
+ RETURN_NOT_OK(offsets_->Resize(size_offsets(rows_capacity_new), false));
+ memset(offsets_->mutable_data() + size_offsets(rows_capacity_), 0,
+ size_offsets(rows_capacity_new) - size_offsets(rows_capacity_));
+ } else {
+ RETURN_NOT_OK(rows_->Resize(size_rows_fixed_length(rows_capacity_new), false));
+ memset(rows_->mutable_data() + size_rows_fixed_length(rows_capacity_), 0,
+ size_rows_fixed_length(rows_capacity_new) -
+ size_rows_fixed_length(rows_capacity_));
+ bytes_capacity_ = size_rows_fixed_length(rows_capacity_new) - padding_for_vectors;
+ }
+
+ update_buffer_pointers();
+
+ rows_capacity_ = rows_capacity_new;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::ResizeOptionalVaryingLengthBuffer(
+ int64_t num_extra_bytes) {
+ int64_t num_bytes = offsets()[num_rows_];
+ if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) {
+ return Status::OK();
+ }
+
+ int64_t bytes_capacity_new = std::max(static_cast<int64_t>(1), 2 * bytes_capacity_);
+ while (bytes_capacity_new < num_bytes + num_extra_bytes) {
+ bytes_capacity_new *= 2;
+ }
+
+ RETURN_NOT_OK(rows_->Resize(size_rows_varying_length(bytes_capacity_new), false));
+ memset(rows_->mutable_data() + size_rows_varying_length(bytes_capacity_), 0,
+ size_rows_varying_length(bytes_capacity_new) -
+ size_rows_varying_length(bytes_capacity_));
+
+ update_buffer_pointers();
+
+ bytes_capacity_ = bytes_capacity_new;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from,
+ uint32_t num_rows_to_append,
+ const uint16_t* source_row_ids) {
+ DCHECK(metadata_.is_compatible(from.metadata()));
+
+ RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
+
+ if (!metadata_.is_fixed_length) {
+ // Varying-length rows
+ auto from_offsets = reinterpret_cast<const uint32_t*>(from.offsets_->data());
+ auto to_offsets = reinterpret_cast<uint32_t*>(offsets_->mutable_data());
+ uint32_t total_length = to_offsets[num_rows_];
+ uint32_t total_length_to_append = 0;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
+ total_length_to_append += length;
+ to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append;
+ }
+
+ RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append));
+
+ const uint8_t* src = from.rows_->data();
+ uint8_t* dst = rows_->mutable_data() + total_length;
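+    // The stripe copy below may read and write up to 7 bytes past the end of a
+    // row; this is safe because the row buffers are allocated with
+    // padding_for_vectors extra bytes at the end.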
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
+ auto src64 = reinterpret_cast<const uint64_t*>(src + from_offsets[row_id]);
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
+ dst64[j] = src64[j];
+ }
+ dst += length;
+ }
+ } else {
+ // Fixed-length rows
+ const uint8_t* src = from.rows_->data();
+ uint8_t* dst = rows_->mutable_data() + num_rows_ * metadata_.fixed_length;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = metadata_.fixed_length;
+ auto src64 = reinterpret_cast<const uint64_t*>(src + length * row_id);
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
+ dst64[j] = src64[j];
+ }
+ dst += length;
+ }
+ }
+
+ // Null masks
+ uint32_t byte_length = metadata_.null_masks_bytes_per_row;
+ uint64_t dst_byte_offset = num_rows_ * byte_length;
+ const uint8_t* src_base = from.null_masks_->data();
+ uint8_t* dst_base = null_masks_->mutable_data();
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint32_t row_id = source_row_ids[i];
+ int64_t src_byte_offset = row_id * byte_length;
+ const uint8_t* src = src_base + src_byte_offset;
+ uint8_t* dst = dst_base + dst_byte_offset;
+ for (uint32_t ibyte = 0; ibyte < byte_length; ++ibyte) {
+ dst[ibyte] = src[ibyte];
+ }
+ dst_byte_offset += byte_length;
+ }
+
+ num_rows_ += num_rows_to_append;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::AppendEmpty(uint32_t num_rows_to_append,
+ uint32_t num_extra_bytes_to_append) {
+ RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
+ RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append));
+ num_rows_ += num_rows_to_append;
+ if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) {
+ memset(rows_->mutable_data(), 0, bytes_capacity_);
+ }
+ return Status::OK();
+}
+
+bool KeyEncoder::KeyRowArray::has_any_nulls(const KeyEncoderContext* ctx) const {
+ if (has_any_nulls_) {
+ return true;
+ }
+ if (num_rows_for_has_any_nulls_ < num_rows_) {
+ auto size_per_row = metadata().null_masks_bytes_per_row;
+ has_any_nulls_ = !util::BitUtil::are_all_bytes_zero(
+ ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_,
+ static_cast<uint32_t>(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_)));
+ num_rows_for_has_any_nulls_ = num_rows_;
+ }
+ return has_any_nulls_;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ const KeyColumnArray& left,
+ const KeyColumnArray& right,
+ int buffer_id_to_replace) {
+ metadata_ = metadata;
+ length_ = left.length();
+ for (int i = 0; i < max_buffers_; ++i) {
+ buffers_[i] = left.buffers_[i];
+ mutable_buffers_[i] = left.mutable_buffers_[i];
+ }
+ buffers_[buffer_id_to_replace] = right.buffers_[buffer_id_to_replace];
+ mutable_buffers_[buffer_id_to_replace] = right.mutable_buffers_[buffer_id_to_replace];
+ bit_offset_[0] = left.bit_offset_[0];
+ bit_offset_[1] = left.bit_offset_[1];
+ if (buffer_id_to_replace < max_buffers_ - 1) {
+ bit_offset_[buffer_id_to_replace] = right.bit_offset_[buffer_id_to_replace];
+ }
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ int64_t length, const uint8_t* buffer0,
+ const uint8_t* buffer1, const uint8_t* buffer2,
+ int bit_offset0, int bit_offset1) {
+ metadata_ = metadata;
+ length_ = length;
+ buffers_[0] = buffer0;
+ buffers_[1] = buffer1;
+ buffers_[2] = buffer2;
+ mutable_buffers_[0] = mutable_buffers_[1] = mutable_buffers_[2] = nullptr;
+ bit_offset_[0] = bit_offset0;
+ bit_offset_[1] = bit_offset1;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ int64_t length, uint8_t* buffer0,
+ uint8_t* buffer1, uint8_t* buffer2,
+ int bit_offset0, int bit_offset1) {
+ metadata_ = metadata;
+ length_ = length;
+ buffers_[0] = mutable_buffers_[0] = buffer0;
+ buffers_[1] = mutable_buffers_[1] = buffer1;
+ buffers_[2] = mutable_buffers_[2] = buffer2;
+ bit_offset_[0] = bit_offset0;
+ bit_offset_[1] = bit_offset1;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnArray& from, int64_t start,
+ int64_t length) {
+ metadata_ = from.metadata_;
+ length_ = length;
+ uint32_t fixed_size =
+ !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
+
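+  // Buffer 0 (the non-nulls bit vector) is sliced at bit granularity. Buffer 1
+  // is also a bit vector for boolean columns (fixed_size == 0) and is sliced
+  // the same way; otherwise it is sliced at element granularity. Buffer 2
+  // (varbinary data) is shared as-is, since the 32-bit offsets stay absolute.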
+ buffers_[0] =
+ from.buffers_[0] ? from.buffers_[0] + (from.bit_offset_[0] + start) / 8 : nullptr;
+ mutable_buffers_[0] = from.mutable_buffers_[0]
+ ? from.mutable_buffers_[0] + (from.bit_offset_[0] + start) / 8
+ : nullptr;
+ bit_offset_[0] = (from.bit_offset_[0] + start) % 8;
+
+ if (fixed_size == 0) {
+ buffers_[1] =
+ from.buffers_[1] ? from.buffers_[1] + (from.bit_offset_[1] + start) / 8 : nullptr;
+ mutable_buffers_[1] = from.mutable_buffers_[1] ? from.mutable_buffers_[1] +
+ (from.bit_offset_[1] + start) / 8
+ : nullptr;
+ bit_offset_[1] = (from.bit_offset_[1] + start) % 8;
+ } else {
+ buffers_[1] = from.buffers_[1] ? from.buffers_[1] + start * fixed_size : nullptr;
+ mutable_buffers_[1] = from.mutable_buffers_[1]
+ ? from.mutable_buffers_[1] + start * fixed_size
+ : nullptr;
+ bit_offset_[1] = 0;
+ }
+
+ buffers_[2] = from.buffers_[2];
+ mutable_buffers_[2] = from.mutable_buffers_[2];
+}
+
+KeyEncoder::KeyColumnArray KeyEncoder::TransformBoolean::ArrayReplace(
+ const KeyColumnArray& column, const KeyColumnArray& temp) {
+ // Make sure that the temp buffer is large enough
+ DCHECK(temp.length() >= column.length() && temp.metadata().is_fixed_length &&
+ temp.metadata().fixed_length >= sizeof(uint8_t));
+ KeyColumnMetadata metadata;
+ metadata.is_fixed_length = true;
+ metadata.fixed_length = sizeof(uint8_t);
+ constexpr int buffer_index = 1;
+ KeyColumnArray result = KeyColumnArray(metadata, column, temp, buffer_index);
+ return result;
+}
+
+void KeyEncoder::TransformBoolean::PreEncode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ // Make sure that metadata and lengths are compatible.
+ DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
+ DCHECK(output->metadata().fixed_length == 1 && input.metadata().fixed_length == 0);
+ DCHECK(output->length() == input.length());
+ constexpr int buffer_index = 1;
+ DCHECK(input.data(buffer_index) != nullptr);
+ DCHECK(output->mutable_data(buffer_index) != nullptr);
+ util::BitUtil::bits_to_bytes(
+ ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
+ output->mutable_data(buffer_index), input.bit_offset(buffer_index));
+}
+
+void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ // Make sure that metadata and lengths are compatible.
+ DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
+ DCHECK(output->metadata().fixed_length == 0 && input.metadata().fixed_length == 1);
+ DCHECK(output->length() == input.length());
+ constexpr int buffer_index = 1;
+ DCHECK(input.data(buffer_index) != nullptr);
+ DCHECK(output->mutable_data(buffer_index) != nullptr);
+
+ util::BitUtil::bytes_to_bits(
+ ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
+ output->mutable_data(buffer_index), output->bit_offset(buffer_index));
+}
+
+bool KeyEncoder::EncoderInteger::IsBoolean(const KeyColumnMetadata& metadata) {
+ return metadata.is_fixed_length && metadata.fixed_length == 0;
+}
+
+bool KeyEncoder::EncoderInteger::UsesTransform(const KeyColumnArray& column) {
+ return IsBoolean(column.metadata());
+}
+
+KeyEncoder::KeyColumnArray KeyEncoder::EncoderInteger::ArrayReplace(
+ const KeyColumnArray& column, const KeyColumnArray& temp) {
+ if (IsBoolean(column.metadata())) {
+ return TransformBoolean::ArrayReplace(column, temp);
+ }
+ return column;
+}
+
+void KeyEncoder::EncoderInteger::PreEncode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ if (IsBoolean(input.metadata())) {
+ TransformBoolean::PreEncode(input, output, ctx);
+ }
+}
+
+void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ if (IsBoolean(output->metadata())) {
+ TransformBoolean::PostDecode(input, output, ctx);
+ }
+}
+
+void KeyEncoder::EncoderInteger::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp) {
+ KeyColumnArray col_prep;
+ if (UsesTransform(col)) {
+ col_prep = ArrayReplace(col, *temp);
+ PreEncode(col, &col_prep, ctx);
+ } else {
+ col_prep = col;
+ }
+
+ const auto num_rows = static_cast<uint32_t>(col.length());
+
+  // When there is a single fixed-length column we can just memcpy
+ if (rows->metadata().is_fixed_length &&
+ rows->metadata().fixed_length == col.metadata().fixed_length) {
+ DCHECK_EQ(offset_within_row, 0);
+ uint32_t row_size = col.metadata().fixed_length;
+ memcpy(rows->mutable_data(1), col.data(1), num_rows * row_size);
+ } else if (rows->metadata().is_fixed_length) {
+ uint32_t row_size = rows->metadata().fixed_length;
+ uint8_t* row_base = rows->mutable_data(1) + offset_within_row;
+ const uint8_t* col_base = col_prep.data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ row_base[i * row_size] = col_base[i];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint16_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint16_t*>(col_base)[i];
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint32_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint32_t*>(col_base)[i];
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint64_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint64_t*>(col_base)[i];
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ } else {
+ const uint32_t* row_offsets = rows->offsets();
+ uint8_t* row_base = rows->mutable_data(2) + offset_within_row;
+ const uint8_t* col_base = col_prep.data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ row_base[row_offsets[i]] = col_base[i];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint16_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint16_t*>(col_base)[i];
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint32_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint32_t*>(col_base)[i];
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint64_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint64_t*>(col_base)[i];
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ }
+}
+
+void KeyEncoder::EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp) {
+ KeyColumnArray col_prep;
+ if (UsesTransform(*col)) {
+ col_prep = ArrayReplace(*col, *temp);
+ } else {
+ col_prep = *col;
+ }
+
+  // When there is a single fixed-length column we can just memcpy
+ if (rows.metadata().is_fixed_length &&
+ col_prep.metadata().fixed_length == rows.metadata().fixed_length) {
+ DCHECK_EQ(offset_within_row, 0);
+ uint32_t row_size = rows.metadata().fixed_length;
+ memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size,
+ num_rows * row_size);
+ } else if (rows.metadata().is_fixed_length) {
+ uint32_t row_size = rows.metadata().fixed_length;
+ const uint8_t* row_base = rows.data(1) + start_row * row_size;
+ row_base += offset_within_row;
+ uint8_t* col_base = col_prep.mutable_data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ col_base[i] = row_base[i * row_size];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint16_t*>(col_base)[i] =
+ *reinterpret_cast<const uint16_t*>(row_base + i * row_size);
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint32_t*>(col_base)[i] =
+ *reinterpret_cast<const uint32_t*>(row_base + i * row_size);
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint64_t*>(col_base)[i] =
+ *reinterpret_cast<const uint64_t*>(row_base + i * row_size);
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ } else {
+ const uint32_t* row_offsets = rows.offsets() + start_row;
+ const uint8_t* row_base = rows.data(2);
+ row_base += offset_within_row;
+ uint8_t* col_base = col_prep.mutable_data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ col_base[i] = row_base[row_offsets[i]];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint16_t*>(col_base)[i] =
+ *reinterpret_cast<const uint16_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint32_t*>(col_base)[i] =
+ *reinterpret_cast<const uint32_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint64_t*>(col_base)[i] =
+ *reinterpret_cast<const uint64_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ }
+
+ if (UsesTransform(*col)) {
+ PostDecode(col_prep, col, ctx);
+ }
+}
+
+bool KeyEncoder::EncoderBinary::IsInteger(const KeyColumnMetadata& metadata) {
+ bool is_fixed_length = metadata.is_fixed_length;
+ auto size = metadata.fixed_length;
+ return is_fixed_length &&
+ (size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
+}
+
+void KeyEncoder::EncoderBinary::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp) {
+ if (IsInteger(col.metadata())) {
+ EncoderInteger::Encode(offset_within_row, rows, col, ctx, temp);
+ } else {
+ KeyColumnArray col_prep;
+ if (EncoderInteger::UsesTransform(col)) {
+ col_prep = EncoderInteger::ArrayReplace(col, *temp);
+ EncoderInteger::PreEncode(col, &col_prep, ctx);
+ } else {
+ col_prep = col;
+ }
+
+ bool is_row_fixed_length = rows->metadata().is_fixed_length;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ EncodeHelper_avx2(is_row_fixed_length, offset_within_row, rows, col);
+ } else {
+#endif
+ if (is_row_fixed_length) {
+ EncodeImp<true>(offset_within_row, rows, col);
+ } else {
+ EncodeImp<false>(offset_within_row, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+ }
+
+ DCHECK(temp->metadata().is_fixed_length);
+ DCHECK(temp->length() * temp->metadata().fixed_length >=
+ col.length() * static_cast<int64_t>(sizeof(uint16_t)));
+
+ KeyColumnArray temp16bit(KeyColumnMetadata(true, sizeof(uint16_t)), col.length(),
+ nullptr, temp->mutable_data(1), nullptr);
+ ColumnMemsetNulls(offset_within_row, rows, col, ctx, &temp16bit, 0xae);
+}
+
+void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp) {
+ if (IsInteger(col->metadata())) {
+ EncoderInteger::Decode(start_row, num_rows, offset_within_row, rows, col, ctx, temp);
+ } else {
+ KeyColumnArray col_prep;
+ if (EncoderInteger::UsesTransform(*col)) {
+ col_prep = EncoderInteger::ArrayReplace(*col, *temp);
+ } else {
+ col_prep = *col;
+ }
+
+ bool is_row_fixed_length = rows.metadata().is_fixed_length;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows,
+ col);
+ } else {
+#endif
+ if (is_row_fixed_length) {
+ DecodeImp<true>(start_row, num_rows, offset_within_row, rows, col);
+ } else {
+ DecodeImp<false>(start_row, num_rows, offset_within_row, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+
+ if (EncoderInteger::UsesTransform(*col)) {
+ EncoderInteger::PostDecode(col_prep, col, ctx);
+ }
+ }
+}
+
+template <bool is_row_fixed_length>
+void KeyEncoder::EncoderBinary::EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col) {
+ EncodeDecodeHelper<is_row_fixed_length, true>(
+ 0, static_cast<uint32_t>(col.length()), offset_within_row, rows, rows, &col,
+ nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ uint32_t istripe;
+ for (istripe = 0; istripe < length / 8; ++istripe) {
+ dst64[istripe] = util::SafeLoad(src64 + istripe);
+ }
+ if ((length % 8) > 0) {
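+          // mask_last selects the low (length % 8) bytes of the final 64-bit
+          // stripe (little-endian), so destination bytes beyond the copied
+          // field keep their previous contents.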
+ uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
+ dst64[istripe] = (dst64[istripe] & ~mask_last) |
+ (util::SafeLoad(src64 + istripe) & mask_last);
+ }
+ });
+}
+
+template <bool is_row_fixed_length>
+void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col) {
+ EncodeDecodeHelper<is_row_fixed_length, false>(
+ start_row, num_rows, offset_within_row, &rows, nullptr, col, col,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ util::SafeStore(dst64 + istripe, src64[istripe]);
+ }
+ });
+}
+
+void KeyEncoder::EncoderBinary::ColumnMemsetNulls(
+ uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
+ using ColumnMemsetNullsImp_t = void (*)(uint32_t, KeyRowArray*, const KeyColumnArray&,
+ KeyEncoderContext*, KeyColumnArray*, uint8_t);
+ static const ColumnMemsetNullsImp_t ColumnMemsetNullsImp_fn[] = {
+ ColumnMemsetNullsImp<false, 1>, ColumnMemsetNullsImp<false, 2>,
+ ColumnMemsetNullsImp<false, 4>, ColumnMemsetNullsImp<false, 8>,
+ ColumnMemsetNullsImp<false, 16>, ColumnMemsetNullsImp<true, 1>,
+ ColumnMemsetNullsImp<true, 2>, ColumnMemsetNullsImp<true, 4>,
+ ColumnMemsetNullsImp<true, 8>, ColumnMemsetNullsImp<true, 16>};
+ uint32_t col_width = col.metadata().fixed_length;
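+  // Index into the table above: entries 0-4 handle varying-length rows and
+  // entries 5-9 fixed-length rows; within each half the entry is chosen by
+  // the column width (1, 2, 4, 8 or wider).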
+ int dispatch_const =
+ (rows->metadata().is_fixed_length ? 5 : 0) +
+ (col_width == 1 ? 0
+ : col_width == 2 ? 1 : col_width == 4 ? 2 : col_width == 8 ? 3 : 4);
+ ColumnMemsetNullsImp_fn[dispatch_const](offset_within_row, rows, col, ctx,
+ temp_vector_16bit, byte_value);
+}
+
+template <bool is_row_fixed_length, uint32_t col_width>
+void KeyEncoder::EncoderBinary::ColumnMemsetNullsImp(
+ uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
+ // Nothing to do when there are no nulls
+ if (!col.data(0)) {
+ return;
+ }
+
+ const auto num_rows = static_cast<uint32_t>(col.length());
+
+ // Temp vector needs space for the required number of rows
+ DCHECK(temp_vector_16bit->length() >= num_rows);
+ DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
+ temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
+ auto temp_vector = reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1));
+
+  // Convert the non-nulls bit vector into a vector of indexes of null positions
+ int num_selected;
+ util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, static_cast<int>(col.length()),
+ col.data(0), &num_selected, temp_vector,
+ col.bit_offset(0));
+
+ for (int i = 0; i < num_selected; ++i) {
+ uint32_t row_id = temp_vector[i];
+
+ // Target binary field pointer
+ uint8_t* dst;
+ if (is_row_fixed_length) {
+ dst = rows->mutable_data(1) + rows->metadata().fixed_length * row_id;
+ } else {
+ dst = rows->mutable_data(2) + rows->offsets()[row_id];
+ }
+ dst += offset_within_row;
+
+ if (col_width == 1) {
+ *dst = byte_value;
+ } else if (col_width == 2) {
+ *reinterpret_cast<uint16_t*>(dst) =
+ (static_cast<uint16_t>(byte_value) * static_cast<uint16_t>(0x0101));
+ } else if (col_width == 4) {
+ *reinterpret_cast<uint32_t*>(dst) =
+ (static_cast<uint32_t>(byte_value) * static_cast<uint32_t>(0x01010101));
+ } else if (col_width == 8) {
+ *reinterpret_cast<uint64_t*>(dst) =
+ (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
+ } else {
+ uint64_t value = (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
+ uint32_t col_width_actual = col.metadata().fixed_length;
+ uint32_t j;
+ for (j = 0; j < col_width_actual / 8; ++j) {
+ reinterpret_cast<uint64_t*>(dst)[j] = value;
+ }
+ int tail = col_width_actual % 8;
+ if (tail) {
+ uint64_t mask = ~0ULL >> (8 * (8 - tail));
+ reinterpret_cast<uint64_t*>(dst)[j] =
+ (reinterpret_cast<const uint64_t*>(dst)[j] & ~mask) | (value & mask);
+ }
+ }
+ }
+}
+
+void KeyEncoder::EncoderBinaryPair::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2,
+ KeyEncoderContext* ctx, KeyColumnArray* temp1,
+ KeyColumnArray* temp2) {
+ DCHECK(CanProcessPair(col1.metadata(), col2.metadata()));
+
+ KeyColumnArray col_prep[2];
+ if (EncoderInteger::UsesTransform(col1)) {
+ col_prep[0] = EncoderInteger::ArrayReplace(col1, *temp1);
+ EncoderInteger::PreEncode(col1, &(col_prep[0]), ctx);
+ } else {
+ col_prep[0] = col1;
+ }
+ if (EncoderInteger::UsesTransform(col2)) {
+ col_prep[1] = EncoderInteger::ArrayReplace(col2, *temp2);
+ EncoderInteger::PreEncode(col2, &(col_prep[1]), ctx);
+ } else {
+ col_prep[1] = col2;
+ }
+
+ uint32_t col_width1 = col_prep[0].metadata().fixed_length;
+ uint32_t col_width2 = col_prep[1].metadata().fixed_length;
+ int log_col_width1 =
+ col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
+ int log_col_width2 =
+ col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
+
+ bool is_row_fixed_length = rows->metadata().is_fixed_length;
+
+ const auto num_rows = static_cast<uint32_t>(col1.length());
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && col_width1 == col_width2) {
+ num_processed = EncodeHelper_avx2(is_row_fixed_length, col_width1, offset_within_row,
+ rows, col_prep[0], col_prep[1]);
+ }
+#endif
+ if (num_processed < num_rows) {
+ using EncodeImp_t = void (*)(uint32_t, uint32_t, KeyRowArray*, const KeyColumnArray&,
+ const KeyColumnArray&);
+ static const EncodeImp_t EncodeImp_fn[] = {
+ EncodeImp<false, uint8_t, uint8_t>, EncodeImp<false, uint16_t, uint8_t>,
+ EncodeImp<false, uint32_t, uint8_t>, EncodeImp<false, uint64_t, uint8_t>,
+ EncodeImp<false, uint8_t, uint16_t>, EncodeImp<false, uint16_t, uint16_t>,
+ EncodeImp<false, uint32_t, uint16_t>, EncodeImp<false, uint64_t, uint16_t>,
+ EncodeImp<false, uint8_t, uint32_t>, EncodeImp<false, uint16_t, uint32_t>,
+ EncodeImp<false, uint32_t, uint32_t>, EncodeImp<false, uint64_t, uint32_t>,
+ EncodeImp<false, uint8_t, uint64_t>, EncodeImp<false, uint16_t, uint64_t>,
+ EncodeImp<false, uint32_t, uint64_t>, EncodeImp<false, uint64_t, uint64_t>,
+ EncodeImp<true, uint8_t, uint8_t>, EncodeImp<true, uint16_t, uint8_t>,
+ EncodeImp<true, uint32_t, uint8_t>, EncodeImp<true, uint64_t, uint8_t>,
+ EncodeImp<true, uint8_t, uint16_t>, EncodeImp<true, uint16_t, uint16_t>,
+ EncodeImp<true, uint32_t, uint16_t>, EncodeImp<true, uint64_t, uint16_t>,
+ EncodeImp<true, uint8_t, uint32_t>, EncodeImp<true, uint16_t, uint32_t>,
+ EncodeImp<true, uint32_t, uint32_t>, EncodeImp<true, uint64_t, uint32_t>,
+ EncodeImp<true, uint8_t, uint64_t>, EncodeImp<true, uint16_t, uint64_t>,
+ EncodeImp<true, uint32_t, uint64_t>, EncodeImp<true, uint64_t, uint64_t>};
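+    // Index into the 32-entry table above: bits 0-1 hold log2 of col1 width,
+    // bits 2-3 log2 of col2 width, and bit 4 is set for fixed-length rows.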
+ int dispatch_const = (log_col_width2 << 2) | log_col_width1;
+ dispatch_const += (is_row_fixed_length ? 16 : 0);
+ EncodeImp_fn[dispatch_const](num_processed, offset_within_row, rows, col_prep[0],
+ col_prep[1]);
+ }
+}
+
+template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+void KeyEncoder::EncoderBinaryPair::EncodeImp(uint32_t num_rows_to_skip,
+ uint32_t offset_within_row,
+ KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2) {
+ const uint8_t* src_A = col1.data(1);
+ const uint8_t* src_B = col2.data(1);
+
+ const auto num_rows = static_cast<uint32_t>(col1.length());
+
+ uint32_t fixed_length = rows->metadata().fixed_length;
+ const uint32_t* offsets;
+ uint8_t* dst_base;
+ if (is_row_fixed_length) {
+ dst_base = rows->mutable_data(1) + offset_within_row;
+ offsets = nullptr;
+ } else {
+ dst_base = rows->mutable_data(2) + offset_within_row;
+ offsets = rows->offsets();
+ }
+
+ using col1_type_const = typename std::add_const<col1_type>::type;
+ using col2_type_const = typename std::add_const<col2_type>::type;
+
+ if (is_row_fixed_length) {
+ uint8_t* dst = dst_base + num_rows_to_skip * fixed_length;
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
+ *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
+ reinterpret_cast<col2_type_const*>(src_B)[i];
+ dst += fixed_length;
+ }
+ } else {
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ uint8_t* dst = dst_base + offsets[i];
+ *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
+ *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
+ reinterpret_cast<col2_type_const*>(src_B)[i];
+ }
+ }
+}
+
+void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2, KeyEncoderContext* ctx,
+ KeyColumnArray* temp1, KeyColumnArray* temp2) {
+ DCHECK(CanProcessPair(col1->metadata(), col2->metadata()));
+
+ KeyColumnArray col_prep[2];
+ if (EncoderInteger::UsesTransform(*col1)) {
+ col_prep[0] = EncoderInteger::ArrayReplace(*col1, *temp1);
+ } else {
+ col_prep[0] = *col1;
+ }
+ if (EncoderInteger::UsesTransform(*col2)) {
+ col_prep[1] = EncoderInteger::ArrayReplace(*col2, *temp2);
+ } else {
+ col_prep[1] = *col2;
+ }
+
+ uint32_t col_width1 = col_prep[0].metadata().fixed_length;
+ uint32_t col_width2 = col_prep[1].metadata().fixed_length;
+ int log_col_width1 =
+ col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
+ int log_col_width2 =
+ col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
+
+ bool is_row_fixed_length = rows.metadata().is_fixed_length;
+
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && col_width1 == col_width2) {
+ num_processed =
+ DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows,
+ offset_within_row, rows, &col_prep[0], &col_prep[1]);
+ }
+#endif
+ if (num_processed < num_rows) {
+ using DecodeImp_t = void (*)(uint32_t, uint32_t, uint32_t, uint32_t,
+ const KeyRowArray&, KeyColumnArray*, KeyColumnArray*);
+ static const DecodeImp_t DecodeImp_fn[] = {
+ DecodeImp<false, uint8_t, uint8_t>, DecodeImp<false, uint16_t, uint8_t>,
+ DecodeImp<false, uint32_t, uint8_t>, DecodeImp<false, uint64_t, uint8_t>,
+ DecodeImp<false, uint8_t, uint16_t>, DecodeImp<false, uint16_t, uint16_t>,
+ DecodeImp<false, uint32_t, uint16_t>, DecodeImp<false, uint64_t, uint16_t>,
+ DecodeImp<false, uint8_t, uint32_t>, DecodeImp<false, uint16_t, uint32_t>,
+ DecodeImp<false, uint32_t, uint32_t>, DecodeImp<false, uint64_t, uint32_t>,
+ DecodeImp<false, uint8_t, uint64_t>, DecodeImp<false, uint16_t, uint64_t>,
+ DecodeImp<false, uint32_t, uint64_t>, DecodeImp<false, uint64_t, uint64_t>,
+ DecodeImp<true, uint8_t, uint8_t>, DecodeImp<true, uint16_t, uint8_t>,
+ DecodeImp<true, uint32_t, uint8_t>, DecodeImp<true, uint64_t, uint8_t>,
+ DecodeImp<true, uint8_t, uint16_t>, DecodeImp<true, uint16_t, uint16_t>,
+ DecodeImp<true, uint32_t, uint16_t>, DecodeImp<true, uint64_t, uint16_t>,
+ DecodeImp<true, uint8_t, uint32_t>, DecodeImp<true, uint16_t, uint32_t>,
+ DecodeImp<true, uint32_t, uint32_t>, DecodeImp<true, uint64_t, uint32_t>,
+ DecodeImp<true, uint8_t, uint64_t>, DecodeImp<true, uint16_t, uint64_t>,
+ DecodeImp<true, uint32_t, uint64_t>, DecodeImp<true, uint64_t, uint64_t>};
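+    // Same 32-entry dispatch as in Encode: bits 0-1 hold log2 of col1 width,
+    // bits 2-3 log2 of col2 width, and bit 4 is set for fixed-length rows.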
+ int dispatch_const =
+ (log_col_width2 << 2) | log_col_width1 | (is_row_fixed_length ? 16 : 0);
+ DecodeImp_fn[dispatch_const](num_processed, start_row, num_rows, offset_within_row,
+ rows, &(col_prep[0]), &(col_prep[1]));
+ }
+
+ if (EncoderInteger::UsesTransform(*col1)) {
+ EncoderInteger::PostDecode(col_prep[0], col1, ctx);
+ }
+ if (EncoderInteger::UsesTransform(*col2)) {
+ EncoderInteger::PostDecode(col_prep[1], col2, ctx);
+ }
+}
+
+template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+void KeyEncoder::EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip,
+ uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows,
+ KeyColumnArray* col1,
+ KeyColumnArray* col2) {
+ DCHECK(rows.length() >= start_row + num_rows);
+ DCHECK(col1->length() == num_rows && col2->length() == num_rows);
+
+ uint8_t* dst_A = col1->mutable_data(1);
+ uint8_t* dst_B = col2->mutable_data(1);
+
+ uint32_t fixed_length = rows.metadata().fixed_length;
+ const uint32_t* offsets;
+ const uint8_t* src_base;
+ if (is_row_fixed_length) {
+ src_base = rows.data(1) + fixed_length * start_row + offset_within_row;
+ offsets = nullptr;
+ } else {
+ src_base = rows.data(2) + offset_within_row;
+ offsets = rows.offsets() + start_row;
+ }
+
+ using col1_type_const = typename std::add_const<col1_type>::type;
+ using col2_type_const = typename std::add_const<col2_type>::type;
+
+ if (is_row_fixed_length) {
+ const uint8_t* src = src_base + num_rows_to_skip * fixed_length;
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
+ reinterpret_cast<col2_type*>(dst_B)[i] =
+ *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
+ src += fixed_length;
+ }
+ } else {
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ const uint8_t* src = src_base + offsets[i];
+ reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
+ reinterpret_cast<col2_type*>(dst_B)[i] =
+ *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
+ }
+ }
+}
+
+void KeyEncoder::EncoderOffsets::Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyEncoderContext* ctx) {
+ DCHECK(!varbinary_cols.empty());
+
+ // Rows and columns must all be varying-length
+ DCHECK(!rows->metadata().is_fixed_length);
+ for (const auto& col : varbinary_cols) {
+ DCHECK(!col.metadata().is_fixed_length);
+ }
+
+ const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
+
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+  // Whether any of the columns has a non-zero starting bit offset for its
+  // non-nulls bit vector
+ bool has_bit_offset = false;
+
+  // The space in the columns must exactly match the space for offsets in the rows
+ DCHECK(rows->length() == num_rows);
+ for (const auto& col : varbinary_cols) {
+ DCHECK(col.length() == num_rows);
+ if (col.bit_offset(0) != 0) {
+ has_bit_offset = true;
+ }
+ }
+
+ if (ctx->has_avx2() && !has_bit_offset) {
+ // Create a temp vector sized based on the number of columns
+ auto temp_buffer_holder = util::TempVectorHolder<uint32_t>(
+ ctx->stack, static_cast<uint32_t>(varbinary_cols.size()) * 8);
+ auto temp_buffer_32B_per_col = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint32_t)), varbinary_cols.size() * 8, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder.mutable_data()), nullptr);
+
+ num_processed = EncodeImp_avx2(rows, varbinary_cols, &temp_buffer_32B_per_col);
+ }
+#endif
+ if (num_processed < num_rows) {
+ EncodeImp(num_processed, rows, varbinary_cols);
+ }
+}
+
+void KeyEncoder::EncoderOffsets::EncodeImp(
+ uint32_t num_rows_already_processed, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols) {
+ DCHECK_GT(varbinary_cols.size(), 0);
+
+ int row_alignment = rows->metadata().row_alignment;
+ int string_alignment = rows->metadata().string_alignment;
+
+ uint32_t* row_offsets = rows->mutable_offsets();
+ uint8_t* row_values = rows->mutable_data(2);
+ const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
+
+ if (num_rows_already_processed == 0) {
+ row_offsets[0] = 0;
+ }
+
+ uint32_t row_offset = row_offsets[num_rows_already_processed];
+ for (uint32_t i = num_rows_already_processed; i < num_rows; ++i) {
+ uint32_t* varbinary_end =
+ rows->metadata().varbinary_end_array(row_values + row_offset);
+
+ // Zero out lengths for nulls.
+ // Add lengths of all columns to get row size.
+ // Store varbinary field ends while summing their lengths.
+
+ uint32_t offset_within_row = rows->metadata().fixed_length;
+
+ for (size_t col = 0; col < varbinary_cols.size(); ++col) {
+ const uint32_t* col_offsets = varbinary_cols[col].offsets();
+ uint32_t col_length = col_offsets[i + 1] - col_offsets[i];
+
+ const int bit_offset = varbinary_cols[col].bit_offset(0);
+
+ const uint8_t* non_nulls = varbinary_cols[col].data(0);
+ if (non_nulls && BitUtil::GetBit(non_nulls, bit_offset + i) == 0) {
+ col_length = 0;
+ }
+
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
+ offset_within_row += col_length;
+
+ varbinary_end[col] = offset_within_row;
+ }
+
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, row_alignment);
+ row_offset += offset_within_row;
+ row_offsets[i + 1] = row_offset;
+ }
+}
+
+void KeyEncoder::EncoderOffsets::Decode(
+ uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* varbinary_cols,
+ const std::vector<uint32_t>& varbinary_cols_base_offset, KeyEncoderContext* ctx) {
+ DCHECK(!varbinary_cols->empty());
+ DCHECK(varbinary_cols->size() == varbinary_cols_base_offset.size());
+
+ DCHECK(!rows.metadata().is_fixed_length);
+ DCHECK(rows.length() >= start_row + num_rows);
+ for (const auto& col : *varbinary_cols) {
+ // Rows and columns must all be varying-length
+ DCHECK(!col.metadata().is_fixed_length);
+    // The space in the columns must exactly match the selected subset of rows
+ DCHECK(col.length() == num_rows);
+ }
+
+  // Offsets of varbinary column data within each encoded row are stored in
+  // that same row as an array of 32-bit integers.
+  // This array immediately follows the data of the fixed-length columns.
+  // There is one element for each varying-length column.
+  // The Nth element is the sum of the lengths of the varbinary column data in
+  // that row, up to and including the Nth varbinary column.
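+  // For example, with 8 bytes of fixed-length column values and two varbinary
+  // columns, a row is laid out as: [8B fixed-length values][uint32 end of
+  // varbinary 0][uint32 end of varbinary 1][varbinary 0][varbinary 1], with
+  // alignment padding possibly inserted before each varbinary field.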
+
+ const uint32_t* row_offsets = rows.offsets() + start_row;
+
+ // Set the base offset for each column
+ for (size_t col = 0; col < varbinary_cols->size(); ++col) {
+ uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
+ col_offsets[0] = varbinary_cols_base_offset[col];
+ }
+
+ int string_alignment = rows.metadata().string_alignment;
+
+ for (uint32_t i = 0; i < num_rows; ++i) {
+    // Find the beginning of the cumulative lengths array for the next row
+ const uint8_t* row = rows.data(2) + row_offsets[i];
+ const uint32_t* varbinary_ends = rows.metadata().varbinary_end_array(row);
+
+ // Update the offset of each column
+ uint32_t offset_within_row = rows.metadata().fixed_length;
+ for (size_t col = 0; col < varbinary_cols->size(); ++col) {
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
+ uint32_t length = varbinary_ends[col] - offset_within_row;
+ offset_within_row = varbinary_ends[col];
+ uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
+ col_offsets[i + 1] = col_offsets[i] + length;
+ }
+ }
+}
+
+void KeyEncoder::EncoderVarBinary::Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col,
+ KeyEncoderContext* ctx) {
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ EncodeHelper_avx2(varbinary_col_id, rows, col);
+ } else {
+#endif
+ if (varbinary_col_id == 0) {
+ EncodeImp<true>(varbinary_col_id, rows, col);
+ } else {
+ EncodeImp<false>(varbinary_col_id, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx) {
+  // The output column's varbinary buffer needs an extra 32B at the end in the
+  // AVX2 version and 8B otherwise.
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col);
+ } else {
+#endif
+ if (varbinary_col_id == 0) {
+ DecodeImp<true>(start_row, num_rows, varbinary_col_id, rows, col);
+ } else {
+ DecodeImp<false>(start_row, num_rows, varbinary_col_id, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+template <bool first_varbinary_col>
+void KeyEncoder::EncoderVarBinary::EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col) {
+ EncodeDecodeHelper<first_varbinary_col, true>(
+ 0, static_cast<uint32_t>(col.length()), varbinary_col_id, rows, rows, &col, nullptr,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ uint32_t istripe;
+ for (istripe = 0; istripe < length / 8; ++istripe) {
+ dst64[istripe] = util::SafeLoad(src64 + istripe);
+ }
+ if ((length % 8) > 0) {
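+          // As in EncoderBinary::EncodeImp, keep only the low (length % 8)
+          // bytes of the final stripe so bytes past the field are preserved.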
+ uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
+ dst64[istripe] = (dst64[istripe] & ~mask_last) |
+ (util::SafeLoad(src64 + istripe) & mask_last);
+ }
+ });
+}
+
+template <bool first_varbinary_col>
+void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray& rows,
+ KeyColumnArray* col) {
+ EncodeDecodeHelper<first_varbinary_col, false>(
+ start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ util::SafeStore(dst64 + istripe, src64[istripe]);
+ }
+ });
+}
+
+void KeyEncoder::EncoderNulls::Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols,
+ KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit) {
+ DCHECK_GT(cols.size(), 0);
+ const auto num_rows = static_cast<uint32_t>(rows->length());
+
+ // All input columns should have the same number of rows.
+ // They may or may not have non-nulls bit-vectors allocated.
+ for (const auto& col : cols) {
+ DCHECK(col.length() == num_rows);
+ }
+
+ // Temp vector needs space for the required number of rows
+ DCHECK(temp_vector_16bit->length() >= num_rows);
+ DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
+ temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
+
+ uint8_t* null_masks = rows->null_masks();
+ uint32_t null_masks_bytes_per_row = rows->metadata().null_masks_bytes_per_row;
+ memset(null_masks, 0, null_masks_bytes_per_row * num_rows);
+ for (size_t col = 0; col < cols.size(); ++col) {
+ const uint8_t* non_nulls = cols[col].data(0);
+ if (!non_nulls) {
+ continue;
+ }
+ int bit_offset = cols[col].bit_offset(0);
+ DCHECK_LT(bit_offset, 8);
+ int num_selected;
+ util::BitUtil::bits_to_indexes(
+ 0, ctx->hardware_flags, num_rows, non_nulls, &num_selected,
+ reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1)), bit_offset);
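+    // Null-mask layout: one bit per column, row-major; row row_id starts at
+    // bit row_id * null_masks_bytes_per_row * 8, and bit `col` within it
+    // marks column `col` as null.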
+ for (int i = 0; i < num_selected; ++i) {
+ uint16_t row_id = reinterpret_cast<const uint16_t*>(temp_vector_16bit->data(1))[i];
+ int64_t null_masks_bit_id = row_id * null_masks_bytes_per_row * 8 + col;
+ BitUtil::SetBit(null_masks, null_masks_bit_id);
+ }
+ }
+}
+
+void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Every output column needs to have a space for exactly the required number
+ // of rows. It also needs to have non-nulls bit-vector allocated and mutable.
+ DCHECK_GT(cols->size(), 0);
+ for (auto& col : *cols) {
+ DCHECK(col.length() == num_rows);
+ DCHECK(col.mutable_data(0));
+ }
+
+ const uint8_t* null_masks = rows.null_masks();
+ uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row;
+ for (size_t col = 0; col < cols->size(); ++col) {
+ uint8_t* non_nulls = (*cols)[col].mutable_data(0);
+ const int bit_offset = (*cols)[col].bit_offset(0);
+ DCHECK_LT(bit_offset, 8);
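+    // First mark every bit of this column as non-null, then clear the bit for
+    // each row whose null-mask bit is set in the encoded rows.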
+ non_nulls[0] |= 0xff << (bit_offset);
+ if (bit_offset + num_rows > 8) {
+ int bits_in_first_byte = 8 - bit_offset;
+ memset(non_nulls + 1, 0xff, BitUtil::BytesForBits(num_rows - bits_in_first_byte));
+ }
+ for (uint32_t row = 0; row < num_rows; ++row) {
+ uint32_t null_masks_bit_id =
+ (start_row + row) * null_masks_bytes_per_row * 8 + static_cast<uint32_t>(col);
+ bool is_set = BitUtil::GetBit(null_masks, null_masks_bit_id);
+ if (is_set) {
+ BitUtil::ClearBit(non_nulls, bit_offset + row);
+ }
+ }
+ }
+}
+
+uint32_t KeyEncoder::KeyRowMetadata::num_varbinary_cols() const {
+ uint32_t result = 0;
+ for (auto column_metadata : column_metadatas) {
+ if (!column_metadata.is_fixed_length) {
+ ++result;
+ }
+ }
+ return result;
+}
+
+bool KeyEncoder::KeyRowMetadata::is_compatible(const KeyRowMetadata& other) const {
+ if (other.num_cols() != num_cols()) {
+ return false;
+ }
+ if (row_alignment != other.row_alignment ||
+ string_alignment != other.string_alignment) {
+ return false;
+ }
+ for (size_t i = 0; i < column_metadatas.size(); ++i) {
+ if (column_metadatas[i].is_fixed_length !=
+ other.column_metadatas[i].is_fixed_length) {
+ return false;
+ }
+ if (column_metadatas[i].fixed_length != other.column_metadatas[i].fixed_length) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void KeyEncoder::KeyRowMetadata::FromColumnMetadataVector(
+ const std::vector<KeyColumnMetadata>& cols, int in_row_alignment,
+ int in_string_alignment) {
+ column_metadatas.resize(cols.size());
+ for (size_t i = 0; i < cols.size(); ++i) {
+ column_metadatas[i] = cols[i];
+ }
+
+ const auto num_cols = static_cast<uint32_t>(cols.size());
+
+  // Sort columns.
+  // Columns are sorted based on the size in bytes of their fixed-length part.
+  // For a varying-length column, the fixed-length part is the 32-bit field
+  // storing the cumulative length of its varying-length values.
+  // The rules are:
+  // a) A boolean column, marked with fixed-length 0, is considered to have a
+  //    fixed-length part of 1 byte.
+  // b) Columns whose fixed-length part is a power of 2 or a multiple of the
+  //    row alignment precede other columns, sorted among themselves by the
+  //    size of their fixed-length part.
+  // c) Fixed-length columns precede varying-length columns when both have the
+  //    same size of fixed-length part.
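+  // For example, columns (boolean, int32, string, int64) are reordered as
+  // (int64, int32, string, boolean): int64 has the widest fixed-length part,
+  // int32 ties with the string's 4-byte offset field but fixed-length columns
+  // win ties, and the boolean bit vector sorts last.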
+ column_order.resize(num_cols);
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ column_order[i] = i;
+ }
+ std::sort(
+ column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) {
+ bool is_left_pow2 =
+ !cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1;
+ bool is_right_pow2 = !cols[right].is_fixed_length ||
+ ARROW_POPCOUNT64(cols[right].fixed_length) <= 1;
+ bool is_left_fixedlen = cols[left].is_fixed_length;
+ bool is_right_fixedlen = cols[right].is_fixed_length;
+ uint32_t width_left =
+ cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t);
+ uint32_t width_right =
+ cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t);
+ if (is_left_pow2 != is_right_pow2) {
+ return is_left_pow2;
+ }
+ if (!is_left_pow2) {
+ return left < right;
+ }
+ if (width_left != width_right) {
+ return width_left > width_right;
+ }
+ if (is_left_fixedlen != is_right_fixedlen) {
+ return is_left_fixedlen;
+ }
+ return left < right;
+ });
+
+ row_alignment = in_row_alignment;
+ string_alignment = in_string_alignment;
+ varbinary_end_array_offset = 0;
+
+ column_offsets.resize(num_cols);
+ uint32_t num_varbinary_cols = 0;
+ uint32_t offset_within_row = 0;
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ const KeyColumnMetadata& col = cols[column_order[i]];
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col);
+ column_offsets[i] = offset_within_row;
+ if (!col.is_fixed_length) {
+ if (num_varbinary_cols == 0) {
+ varbinary_end_array_offset = offset_within_row;
+ }
+ DCHECK(column_offsets[i] - varbinary_end_array_offset ==
+ num_varbinary_cols * sizeof(uint32_t));
+ ++num_varbinary_cols;
+ offset_within_row += sizeof(uint32_t);
+ } else {
+      // A boolean column is a bit vector, which is indicated by setting the
+      // fixed length in the column metadata to zero. It is stored as a single
+      // byte in the output row.
+ if (col.fixed_length == 0) {
+ offset_within_row += 1;
+ } else {
+ offset_within_row += col.fixed_length;
+ }
+ }
+ }
+
+ is_fixed_length = (num_varbinary_cols == 0);
+ fixed_length =
+ offset_within_row +
+ KeyRowMetadata::padding_for_alignment(
+ offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment);
+
+  // We set the number of bytes per row storing the null masks of individual
+  // key columns to be a power of two. This is not required; it could also be
+  // set to the minimal number of bytes needed for a given number of bits (one
+  // bit per column).
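+  // For example, 9 key columns need at least ceil(9 / 8) = 2 bytes per row and
+  // the power-of-two size chosen below is also 2; 33 columns would minimally
+  // need 5 bytes but get 8.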
+ null_masks_bytes_per_row = 1;
+ while (static_cast<uint32_t>(null_masks_bytes_per_row * 8) < num_cols) {
+ null_masks_bytes_per_row *= 2;
+ }
+}
+
+void KeyEncoder::Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
+ int row_alignment, int string_alignment) {
+ ctx_ = ctx;
+ row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment);
+ uint32_t num_cols = row_metadata_.num_cols();
+ uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols();
+ batch_all_cols_.resize(num_cols);
+ batch_varbinary_cols_.resize(num_varbinary_cols);
+ batch_varbinary_cols_base_offsets_.resize(num_varbinary_cols);
+}
+
+void KeyEncoder::PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols_in) {
+ const auto num_cols = static_cast<uint32_t>(cols_in.size());
+ DCHECK(batch_all_cols_.size() == num_cols);
+
+ uint32_t num_varbinary_visited = 0;
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ const KeyColumnArray& col = cols_in[row_metadata_.column_order[i]];
+ KeyColumnArray col_window(col, start_row, num_rows);
+ batch_all_cols_[i] = col_window;
+ if (!col.metadata().is_fixed_length) {
+ DCHECK(num_varbinary_visited < batch_varbinary_cols_.size());
+ // If start row is zero, then base offset of varbinary column is also zero.
+ if (start_row == 0) {
+ batch_varbinary_cols_base_offsets_[num_varbinary_visited] = 0;
+ } else {
+ batch_varbinary_cols_base_offsets_[num_varbinary_visited] =
+ col.offsets()[start_row];
+ }
+ batch_varbinary_cols_[num_varbinary_visited++] = col_window;
+ }
+ }
+}
+
+Status KeyEncoder::PrepareOutputForEncode(int64_t start_row, int64_t num_rows,
+ KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& all_cols) {
+ int64_t num_bytes_required = 0;
+
+ int64_t fixed_part = row_metadata_.fixed_length * num_rows;
+ int64_t var_part = 0;
+ for (const auto& col : all_cols) {
+ if (!col.metadata().is_fixed_length) {
+ DCHECK(col.length() >= start_row + num_rows);
+ const uint32_t* offsets = col.offsets();
+ var_part += offsets[start_row + num_rows] - offsets[start_row];
+ // Include maximum padding that can be added to align the start of varbinary fields.
+ var_part += num_rows * row_metadata_.string_alignment;
+ }
+ }
+ // Include maximum padding that can be added to align the start of the rows.
+ if (!row_metadata_.is_fixed_length) {
+ fixed_part += row_metadata_.row_alignment * num_rows;
+ }
+ num_bytes_required = fixed_part + var_part;
+
+ rows->Clean();
+ RETURN_NOT_OK(rows->AppendEmpty(static_cast<uint32_t>(num_rows),
+ static_cast<uint32_t>(num_bytes_required)));
+
+ return Status::OK();
+}
+
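+// A minimal usage sketch of the encode path (a non-authoritative example;
+// `pool`, `ctx`, `metadatas`, `num_rows` and `cols` are illustrative names
+// assumed to be prepared by the caller):
+//
+//   KeyEncoder encoder;
+//   encoder.Init(metadatas, &ctx, /*row_alignment=*/8, /*string_alignment=*/8);
+//   KeyRowArray rows;
+//   RETURN_NOT_OK(rows.Init(pool, encoder.row_metadata()));
+//   RETURN_NOT_OK(encoder.PrepareOutputForEncode(0, num_rows, &rows, cols));
+//   encoder.Encode(0, num_rows, &rows, cols);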
+void KeyEncoder::Encode(int64_t start_row, int64_t num_rows, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row, num_rows, cols);
+
+ // Create two temp vectors with 16-bit elements
+ auto temp_buffer_holder_A =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_A = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
+ auto temp_buffer_holder_B =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_B = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ // This call will generate and fill in data for both:
+ // - offsets to the entire encoded arrays
+ // - offsets for individual varbinary fields within each row
+ EncoderOffsets::Encode(rows, batch_varbinary_cols_, ctx_);
+
+ for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
+      // Memcpy varbinary fields into the positions in the output row buffer
+      // that were precomputed in the previous step.
+ EncoderVarBinary::Encode(static_cast<uint32_t>(i), rows, batch_varbinary_cols_[i],
+ ctx_);
+ }
+ }
+
+  // Process fixed-length columns, encoding adjacent pairs of integer columns
+  // together whenever EncoderBinaryPair can handle both.
+ const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
+ for (uint32_t i = 0; i < num_cols;) {
+ if (!batch_all_cols_[i].metadata().is_fixed_length) {
+ i += 1;
+ continue;
+ }
+ bool can_process_pair =
+ (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
+ EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
+ batch_all_cols_[i + 1].metadata());
+ if (!can_process_pair) {
+ EncoderBinary::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
+ ctx_, &temp_buffer_A);
+ i += 1;
+ } else {
+ EncoderBinaryPair::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
+ batch_all_cols_[i + 1], ctx_, &temp_buffer_A,
+ &temp_buffer_B);
+ i += 2;
+ }
+ }
+
+ // Process nulls
+ EncoderNulls::Encode(rows, batch_all_cols_, ctx_, &temp_buffer_A);
+}
+
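+// Decoding mirrors encoding but runs in two steps: DecodeFixedLengthBuffers()
+// first, so that the decoded offsets can be used to size the varying-length
+// output buffers, then DecodeVaryingLengthBuffers(). A sketch with
+// illustrative variable names:
+//
+//   encoder.DecodeFixedLengthBuffers(0, 0, num_rows, rows, &cols);
+//   // ... allocate varbinary data buffers based on the decoded offsets ...
+//   encoder.DecodeVaryingLengthBuffers(0, 0, num_rows, rows, &cols);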
+void KeyEncoder::DecodeFixedLengthBuffers(int64_t start_row_input,
+ int64_t start_row_output, int64_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
+
+ // Create two temp vectors with 16-bit elements
+ auto temp_buffer_holder_A =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_A = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
+ auto temp_buffer_holder_B =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_B = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ EncoderOffsets::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), rows, &batch_varbinary_cols_,
+ batch_varbinary_cols_base_offsets_, ctx_);
+ }
+
+  // Process fixed-length columns, decoding adjacent pairs of integer columns
+  // together whenever EncoderBinaryPair can handle both.
+ const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
+ for (uint32_t i = 0; i < num_cols;) {
+ if (!batch_all_cols_[i].metadata().is_fixed_length) {
+ i += 1;
+ continue;
+ }
+ bool can_process_pair =
+ (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
+ EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
+ batch_all_cols_[i + 1].metadata());
+ if (!can_process_pair) {
+ EncoderBinary::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows),
+ row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
+ ctx_, &temp_buffer_A);
+ i += 1;
+ } else {
+ EncoderBinaryPair::Decode(
+ static_cast<uint32_t>(start_row_input), static_cast<uint32_t>(num_rows),
+ row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
+ &batch_all_cols_[i + 1], ctx_, &temp_buffer_A, &temp_buffer_B);
+ i += 2;
+ }
+ }
+
+ // Process nulls
+ EncoderNulls::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), rows, &batch_all_cols_);
+}
+
+void KeyEncoder::DecodeVaryingLengthBuffers(int64_t start_row_input,
+ int64_t start_row_output, int64_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
+      // Memcpy varbinary fields from the rows into the output column buffers,
+      // at positions given by the offsets decoded in the previous step.
+ EncoderVarBinary::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), static_cast<uint32_t>(i),
+ rows, &batch_varbinary_cols_[i], ctx_);
+ }
+ }
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
index e5397b9dfd4..f59690e0e6c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
@@ -1,635 +1,635 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/util/bit_util.h"
-
-namespace arrow {
-namespace compute {
-
-class KeyColumnMetadata;
-
-/// Converts between key representation as a collection of arrays for
-/// individual columns and another representation as a single array of rows
-/// combining data from all columns into one value.
-/// This conversion is reversible.
-/// Row-oriented storage is beneficial when there is a need for random access
-/// to individual rows and, at the same time, all included columns are likely
-/// to be accessed together, as in the case of a hash table key.
-class KeyEncoder {
- public:
- struct KeyEncoderContext {
- bool has_avx2() const {
- return (hardware_flags & arrow::internal::CpuInfo::AVX2) > 0;
- }
- int64_t hardware_flags;
- util::TempVectorStack* stack;
- };
-
- /// Description of a storage format of a single key column as needed
- /// for the purpose of row encoding.
- struct KeyColumnMetadata {
- KeyColumnMetadata() = default;
- KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in)
- : is_fixed_length(is_fixed_length_in), fixed_length(fixed_length_in) {}
-    /// True if the column stores fixed-length binary values; false if it
-    /// stores varying-length binary values that use an offsets array to
-    /// locate the beginning of each value.
- bool is_fixed_length;
- /// For a fixed-length binary column: number of bytes per value.
- /// Zero has a special meaning, indicating a bit vector with one bit per value.
- /// For a varying-length binary column: number of bytes per offset.
- uint32_t fixed_length;
- };
-
- /// Description of a storage format for rows produced by encoder.
- struct KeyRowMetadata {
-    /// True if each row is a fixed-length binary; false if rows are
-    /// varying-length binaries that use an offsets array to locate the
-    /// beginning of each row.
- bool is_fixed_length;
-
- /// For a fixed-length binary row, common size of rows in bytes,
- /// rounded up to the multiple of alignment.
- ///
- /// For a varying-length binary, size of all encoded fixed-length key columns,
- /// including lengths of varying-length columns, rounded up to the multiple of string
- /// alignment.
- uint32_t fixed_length;
-
- /// Offset within a row to the array of 32-bit offsets within a row of
- /// ends of varbinary fields.
- /// Used only when the row is not fixed-length, zero for fixed-length row.
- /// There are N elements for N varbinary fields.
- /// Each element is the offset within a row of the first byte after
- /// the corresponding varbinary field bytes in that row.
-    /// If varbinary fields begin at aligned addresses, then the end of the previous
- /// varbinary field needs to be rounded up according to the specified alignment
- /// to obtain the beginning of the next varbinary field.
- /// The first varbinary field starts at offset specified by fixed_length,
- /// which should already be aligned.
- uint32_t varbinary_end_array_offset;
-
- /// Fixed number of bytes per row that are used to encode null masks.
- /// Null masks indicate for a single row which of its key columns are null.
- /// Nth bit in the sequence of bytes assigned to a row represents null
- /// information for Nth field according to the order in which they are encoded.
- int null_masks_bytes_per_row;
-
- /// Power of 2. Every row will start at the offset aligned to that number of bytes.
- int row_alignment;
-
- /// Power of 2. Must be no greater than row alignment.
-    /// The bytes of every non-power-of-2 binary field and of every varbinary
-    /// field will start aligned to that number of bytes.
- int string_alignment;
-
- /// Metadata of encoded columns in their original order.
- std::vector<KeyColumnMetadata> column_metadatas;
-
- /// Order in which fields are encoded.
- std::vector<uint32_t> column_order;
-
- /// Offsets within a row to fields in their encoding order.
- std::vector<uint32_t> column_offsets;
-
-    /// Returns the number of padding bytes needed to round the offset up to
-    /// the nearest multiple of the alignment value.
- /// Alignment must be a power of 2.
- static inline uint32_t padding_for_alignment(uint32_t offset,
- int required_alignment) {
- ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
- return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
- (required_alignment - 1));
- }
-
-    /// Returns the padding needed to advance the offset to the beginning of
-    /// the next column, choosing the required alignment based on the data
-    /// type of that column.
- static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment,
- const KeyColumnMetadata& col_metadata) {
- if (!col_metadata.is_fixed_length ||
- ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
- return 0;
- } else {
- return padding_for_alignment(offset, string_alignment);
- }
- }
-
- /// Returns an array of offsets within a row of ends of varbinary fields.
- inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
- ARROW_DCHECK(!is_fixed_length);
- return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
- }
- inline uint32_t* varbinary_end_array(uint8_t* row) const {
- ARROW_DCHECK(!is_fixed_length);
- return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
- }
-
- /// Returns the offset within the row and length of the first varbinary field.
- inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t* offset,
- uint32_t* length) const {
- ARROW_DCHECK(!is_fixed_length);
- *offset = fixed_length;
- *length = varbinary_end_array(row)[0] - fixed_length;
- }
-
- /// Returns the offset within the row and length of the second and further varbinary
- /// fields.
- inline void nth_varbinary_offset_and_length(const uint8_t* row, int varbinary_id,
- uint32_t* out_offset,
- uint32_t* out_length) const {
- ARROW_DCHECK(!is_fixed_length);
- ARROW_DCHECK(varbinary_id > 0);
- const uint32_t* varbinary_end = varbinary_end_array(row);
- uint32_t offset = varbinary_end[varbinary_id - 1];
- offset += padding_for_alignment(offset, string_alignment);
- *out_offset = offset;
- *out_length = varbinary_end[varbinary_id] - offset;
- }
-
- uint32_t encoded_field_order(uint32_t icol) const { return column_order[icol]; }
-
- uint32_t encoded_field_offset(uint32_t icol) const { return column_offsets[icol]; }
-
- uint32_t num_cols() const { return static_cast<uint32_t>(column_metadatas.size()); }
-
- uint32_t num_varbinary_cols() const;
-
- void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
- int in_row_alignment, int in_string_alignment);
-
- bool is_compatible(const KeyRowMetadata& other) const;
- };
-
- class KeyRowArray {
- public:
- KeyRowArray();
- Status Init(MemoryPool* pool, const KeyRowMetadata& metadata);
- void Clean();
- Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append);
- Status AppendSelectionFrom(const KeyRowArray& from, uint32_t num_rows_to_append,
- const uint16_t* source_row_ids);
- const KeyRowMetadata& metadata() const { return metadata_; }
- int64_t length() const { return num_rows_; }
- const uint8_t* data(int i) const {
-      ARROW_DCHECK(i >= 0 && i < max_buffers_);
- return buffers_[i];
- }
- uint8_t* mutable_data(int i) {
-      ARROW_DCHECK(i >= 0 && i < max_buffers_);
- return mutable_buffers_[i];
- }
- const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
- uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
- const uint8_t* null_masks() const { return null_masks_->data(); }
- uint8_t* null_masks() { return null_masks_->mutable_data(); }
-
- bool has_any_nulls(const KeyEncoderContext* ctx) const;
-
- private:
- Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
- Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
-
- int64_t size_null_masks(int64_t num_rows);
- int64_t size_offsets(int64_t num_rows);
- int64_t size_rows_fixed_length(int64_t num_rows);
- int64_t size_rows_varying_length(int64_t num_bytes);
- void update_buffer_pointers();
-
- static constexpr int64_t padding_for_vectors = 64;
- MemoryPool* pool_;
- KeyRowMetadata metadata_;
- /// Buffers can only expand during lifetime and never shrink.
- std::unique_ptr<ResizableBuffer> null_masks_;
- std::unique_ptr<ResizableBuffer> offsets_;
- std::unique_ptr<ResizableBuffer> rows_;
- static constexpr int max_buffers_ = 3;
- const uint8_t* buffers_[max_buffers_];
- uint8_t* mutable_buffers_[max_buffers_];
- int64_t num_rows_;
- int64_t rows_capacity_;
- int64_t bytes_capacity_;
-
- // Mutable to allow lazy evaluation
- mutable int64_t num_rows_for_has_any_nulls_;
- mutable bool has_any_nulls_;
- };
-
-  /// A lightweight description of an array representing one of the key columns.
- class KeyColumnArray {
- public:
- KeyColumnArray() = default;
- /// Create as a mix of buffers according to the mask from two descriptions
- /// (Nth bit is set to 0 if Nth buffer from the first input
- /// should be used and is set to 1 otherwise).
- /// Metadata is inherited from the first input.
- KeyColumnArray(const KeyColumnMetadata& metadata, const KeyColumnArray& left,
- const KeyColumnArray& right, int buffer_id_to_replace);
- /// Create for reading
- KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
- const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2,
- int bit_offset0 = 0, int bit_offset1 = 0);
- /// Create for writing
- KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0,
- uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0,
- int bit_offset1 = 0);
-    /// Create as a window view of the original description, offset by a given
-    /// number of rows.
-    /// The row offset must be divisible by 8 so that bit vectors are not
-    /// split within a single byte.
- KeyColumnArray(const KeyColumnArray& from, int64_t start, int64_t length);
- uint8_t* mutable_data(int i) {
-      ARROW_DCHECK(i >= 0 && i < max_buffers_);
- return mutable_buffers_[i];
- }
- const uint8_t* data(int i) const {
-      ARROW_DCHECK(i >= 0 && i < max_buffers_);
- return buffers_[i];
- }
- uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
- const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
- const KeyColumnMetadata& metadata() const { return metadata_; }
- int64_t length() const { return length_; }
- int bit_offset(int i) const {
- ARROW_DCHECK(i >= 0 && i < max_buffers_);
- return bit_offset_[i];
- }
-
- private:
- static constexpr int max_buffers_ = 3;
- const uint8_t* buffers_[max_buffers_];
- uint8_t* mutable_buffers_[max_buffers_];
- KeyColumnMetadata metadata_;
- int64_t length_;
- // Starting bit offset within the first byte (between 0 and 7)
- // to be used when accessing buffers that store bit vectors.
- int bit_offset_[max_buffers_ - 1];
- };
-
- void Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
- int row_alignment, int string_alignment);
-
- const KeyRowMetadata& row_metadata() { return row_metadata_; }
-
-  /// Find out the required sizes of all output buffers for encoding
-  /// (including varying-length buffers).
-  /// Use that information to resize the provided row array so that it can fit
-  /// the encoded data.
- Status PrepareOutputForEncode(int64_t start_input_row, int64_t num_input_rows,
- KeyRowArray* rows,
- const std::vector<KeyColumnArray>& all_cols);
-
- /// Encode a window of column oriented data into the entire output
- /// row oriented storage.
- /// The output buffers for encoding need to be correctly sized before
- /// starting encoding.
- void Encode(int64_t start_input_row, int64_t num_input_rows, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& cols);
-
- /// Decode a window of row oriented data into a corresponding
- /// window of column oriented storage.
- /// The output buffers need to be correctly allocated and sized before
- /// calling each method.
- /// For that reason decoding is split into two functions.
-  /// The output of the first one, which processes everything except
-  /// varying-length buffers, can be used to determine the required
-  /// varying-length buffer sizes.
- void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t start_row_output,
- int64_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols);
-
- void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t start_row_output,
- int64_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols);
-
- private:
- /// Prepare column array vectors.
- /// Output column arrays represent a range of input column arrays
- /// specified by starting row and number of rows.
- /// Three vectors are generated:
- /// - all columns
- /// - fixed-length columns only
- /// - varying-length columns only
- void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
- const std::vector<KeyColumnArray>& cols_in);
-
- class TransformBoolean {
- public:
- static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
- const KeyColumnArray& temp);
- static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
- static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
- };
-
- class EncoderInteger {
- public:
- static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp);
- static bool UsesTransform(const KeyColumnArray& column);
- static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
- const KeyColumnArray& temp);
- static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
- static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
-
- private:
- static bool IsBoolean(const KeyColumnMetadata& metadata);
- };
-
- class EncoderBinary {
- public:
- static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp);
- static bool IsInteger(const KeyColumnMetadata& metadata);
-
- private:
- template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
- static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray* rows_const,
- KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const,
- KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn);
- template <bool is_row_fixed_length>
- static void EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool is_row_fixed_length>
- static void DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
- static void EncodeHelper_avx2(bool is_row_fixed_length, uint32_t offset_within_row,
- KeyRowArray* rows, const KeyColumnArray& col);
- static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row,
- uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col);
- template <bool is_row_fixed_length>
- static void EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool is_row_fixed_length>
- static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col);
-#endif
- static void ColumnMemsetNulls(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp_vector_16bit, uint8_t byte_value);
- template <bool is_row_fixed_length, uint32_t col_width>
- static void ColumnMemsetNullsImp(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp_vector_16bit,
- uint8_t byte_value);
- };
-
- class EncoderBinaryPair {
- public:
- static bool CanProcessPair(const KeyColumnMetadata& col1,
- const KeyColumnMetadata& col2) {
- return EncoderBinary::IsInteger(col1) && EncoderBinary::IsInteger(col2);
- }
- static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1, const KeyColumnArray& col2,
- KeyEncoderContext* ctx, KeyColumnArray* temp1,
- KeyColumnArray* temp2);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col1,
- KeyColumnArray* col2, KeyEncoderContext* ctx,
- KeyColumnArray* temp1, KeyColumnArray* temp2);
-
- private:
- template <bool is_row_fixed_length, typename col1_type, typename col2_type>
- static void EncodeImp(uint32_t num_rows_to_skip, uint32_t offset_within_row,
- KeyRowArray* rows, const KeyColumnArray& col1,
- const KeyColumnArray& col2);
- template <bool is_row_fixed_length, typename col1_type, typename col2_type>
- static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row,
- uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col1,
- KeyColumnArray* col2);
-#if defined(ARROW_HAVE_AVX2)
- static uint32_t EncodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
- uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2);
- static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
- uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col1, KeyColumnArray* col2);
- template <bool is_row_fixed_length, uint32_t col_width>
- static uint32_t EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2);
- template <bool is_row_fixed_length, uint32_t col_width>
- static uint32_t DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col1, KeyColumnArray* col2);
-#endif
- };
-
- class EncoderOffsets {
- public:
-    // To avoid repeating work, encoding computes in a single pass both:
-    // a) row offsets for varying-length rows,
-    // b) within each new row, the cumulative length array of varying-length
-    //    values within that row.
- static void Encode(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols,
- KeyEncoderContext* ctx);
- static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* varbinary_cols,
- const std::vector<uint32_t>& varbinary_cols_base_offset,
- KeyEncoderContext* ctx);
-
- private:
- static void EncodeImp(uint32_t num_rows_already_processed, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols);
-#if defined(ARROW_HAVE_AVX2)
- static uint32_t EncodeImp_avx2(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols,
- KeyColumnArray* temp_buffer_32B_per_col);
-#endif
- };
-
- class EncoderVarBinary {
- public:
- static void Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx);
-
- private:
- template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
- static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id,
- const KeyRowArray* rows_const,
- KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const,
- KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn);
- template <bool first_varbinary_col>
- static void EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool first_varbinary_col>
- static void DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id, const KeyRowArray& rows,
- KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
- static void EncodeHelper_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col);
- static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id, const KeyRowArray& rows,
- KeyColumnArray* col);
- template <bool first_varbinary_col>
- static void EncodeImp_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool first_varbinary_col>
- static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id, const KeyRowArray& rows,
- KeyColumnArray* col);
-#endif
- };
-
- class EncoderNulls {
- public:
- static void Encode(KeyRowArray* rows, const std::vector<KeyColumnArray>& cols,
- KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit);
- static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols);
- };
-
- KeyEncoderContext* ctx_;
-
- // Data initialized once, based on data types of key columns
- KeyRowMetadata row_metadata_;
-
- // Data initialized for each input batch.
- // All elements are ordered according to the order of encoded fields in a row.
- std::vector<KeyColumnArray> batch_all_cols_;
- std::vector<KeyColumnArray> batch_varbinary_cols_;
- std::vector<uint32_t> batch_varbinary_cols_base_offsets_;
-};
-
-template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
-inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper(
- uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn) {
- ARROW_DCHECK(col_const && col_const->metadata().is_fixed_length);
- uint32_t col_width = col_const->metadata().fixed_length;
-
- if (is_row_fixed_length) {
- uint32_t row_width = rows_const->metadata().fixed_length;
- for (uint32_t i = 0; i < num_rows; ++i) {
- const uint8_t* src;
- uint8_t* dst;
- if (is_encoding) {
- src = col_const->data(1) + col_width * i;
- dst = rows_mutable_maybe_null->mutable_data(1) + row_width * (start_row + i) +
- offset_within_row;
- } else {
- src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row;
- dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
- }
- copy_fn(dst, src, col_width);
- }
- } else {
- const uint32_t* row_offsets = rows_const->offsets();
- for (uint32_t i = 0; i < num_rows; ++i) {
- const uint8_t* src;
- uint8_t* dst;
- if (is_encoding) {
- src = col_const->data(1) + col_width * i;
- dst = rows_mutable_maybe_null->mutable_data(2) + row_offsets[start_row + i] +
- offset_within_row;
- } else {
- src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row;
- dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
- }
- copy_fn(dst, src, col_width);
- }
- }
-}
-
-template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
-inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper(
- uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
- const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn) {
- // Column and rows need to be varying length
- ARROW_DCHECK(!rows_const->metadata().is_fixed_length &&
- !col_const->metadata().is_fixed_length);
-
- const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row;
- const uint32_t* col_offsets = col_const->offsets();
-
- uint32_t col_offset_next = col_offsets[0];
- for (uint32_t i = 0; i < num_rows; ++i) {
- uint32_t col_offset = col_offset_next;
- col_offset_next = col_offsets[i + 1];
-
- uint32_t row_offset = row_offsets_for_batch[i];
- const uint8_t* row = rows_const->data(2) + row_offset;
-
- uint32_t offset_within_row;
- uint32_t length;
- if (first_varbinary_col) {
- rows_const->metadata().first_varbinary_offset_and_length(row, &offset_within_row,
- &length);
- } else {
- rows_const->metadata().nth_varbinary_offset_and_length(row, varbinary_col_id,
- &offset_within_row, &length);
- }
-
- row_offset += offset_within_row;
-
- const uint8_t* src;
- uint8_t* dst;
- if (is_encoding) {
- src = col_const->data(2) + col_offset;
- dst = rows_mutable_maybe_null->mutable_data(2) + row_offset;
- } else {
- src = rows_const->data(2) + row_offset;
- dst = col_mutable_maybe_null->mutable_data(2) + col_offset;
- }
- copy_fn(dst, src, length);
- }
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace compute {
+
+class KeyColumnMetadata;
+
+/// Converts between key representation as a collection of arrays for
+/// individual columns and another representation as a single array of rows
+/// combining data from all columns into one value.
+/// This conversion is reversible.
+/// Row-oriented storage is beneficial when there is a need for random access
+/// to individual rows and, at the same time, all included columns are likely
+/// to be accessed together, as in the case of a hash table key.
+class KeyEncoder {
+ public:
+ struct KeyEncoderContext {
+ bool has_avx2() const {
+ return (hardware_flags & arrow::internal::CpuInfo::AVX2) > 0;
+ }
+ int64_t hardware_flags;
+ util::TempVectorStack* stack;
+ };
+
+ /// Description of a storage format of a single key column as needed
+ /// for the purpose of row encoding.
+ struct KeyColumnMetadata {
+ KeyColumnMetadata() = default;
+ KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in)
+ : is_fixed_length(is_fixed_length_in), fixed_length(fixed_length_in) {}
+    /// True if the column stores fixed-length binary values; false if it
+    /// stores varying-length binary values that use an offsets array to
+    /// locate the beginning of each value.
+ bool is_fixed_length;
+ /// For a fixed-length binary column: number of bytes per value.
+ /// Zero has a special meaning, indicating a bit vector with one bit per value.
+ /// For a varying-length binary column: number of bytes per offset.
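+    /// Illustrative examples (not from the source): an int32 column would be
+    /// {is_fixed_length=true, fixed_length=4}, a boolean bit vector {true, 0},
+    /// and a utf8 column {false, sizeof(uint32_t)}.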
+ uint32_t fixed_length;
+ };
+
+ /// Description of a storage format for rows produced by encoder.
+ struct KeyRowMetadata {
+    /// True if each row is a fixed-length binary; false if rows are
+    /// varying-length binaries that use an offsets array to locate the
+    /// beginning of each row.
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key columns,
+ /// including lengths of varying-length columns, rounded up to the multiple of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+    /// If varbinary fields begin at aligned addresses, then the end of the previous
+ /// varbinary field needs to be rounded up according to the specified alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
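+    /// Illustrative example (assumed values): with fixed_length == 16,
+    /// string_alignment == 4 and two varbinary fields of 5 and 7 bytes, the
+    /// end array holds {21, 31}: field 0 occupies bytes [16, 21), its end is
+    /// padded up to 24, and field 1 occupies bytes [24, 31).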
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its key columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at the offset aligned to that number of bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+    /// The bytes of every non-power-of-2 binary field and of every varbinary
+    /// field will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+    /// Returns the number of padding bytes needed to round the offset up to
+    /// the nearest multiple of the alignment value.
+ /// Alignment must be a power of 2.
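+    /// For example, padding_for_alignment(13, 8) == 3, since
+    /// (-13) & (8 - 1) == 3 and 13 + 3 == 16 is the next multiple of 8.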
+ static inline uint32_t padding_for_alignment(uint32_t offset,
+ int required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+    /// Returns the padding needed to advance the offset to the beginning of
+    /// the next column, choosing the required alignment based on the data
+    /// type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment,
+ const KeyColumnMetadata& col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t* offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return column_offsets[icol]; }
+
+ uint32_t num_cols() const { return static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ bool is_compatible(const KeyRowMetadata& other) const;
+ };
+
+ class KeyRowArray {
+ public:
+ KeyRowArray();
+ Status Init(MemoryPool* pool, const KeyRowMetadata& metadata);
+ void Clean();
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append);
+ Status AppendSelectionFrom(const KeyRowArray& from, uint32_t num_rows_to_append,
+ const uint16_t* source_row_ids);
+ const KeyRowMetadata& metadata() const { return metadata_; }
+ int64_t length() const { return num_rows_; }
+ const uint8_t* data(int i) const {
+      ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+      ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ bool has_any_nulls(const KeyEncoderContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
+ MemoryPool* pool_;
+ KeyRowMetadata metadata_;
+ /// Buffers can only expand during lifetime and never shrink.
+ std::unique_ptr<ResizableBuffer> null_masks_;
+ std::unique_ptr<ResizableBuffer> offsets_;
+ std::unique_ptr<ResizableBuffer> rows_;
+ static constexpr int max_buffers_ = 3;
+ const uint8_t* buffers_[max_buffers_];
+ uint8_t* mutable_buffers_[max_buffers_];
+ int64_t num_rows_;
+ int64_t rows_capacity_;
+ int64_t bytes_capacity_;
+
+ // Mutable to allow lazy evaluation
+ mutable int64_t num_rows_for_has_any_nulls_;
+ mutable bool has_any_nulls_;
+ };
+
+  /// A lightweight description of an array representing one of the key columns.
+ class KeyColumnArray {
+ public:
+ KeyColumnArray() = default;
+ /// Create as a mix of buffers according to the mask from two descriptions
+ /// (Nth bit is set to 0 if Nth buffer from the first input
+ /// should be used and is set to 1 otherwise).
+ /// Metadata is inherited from the first input.
+ KeyColumnArray(const KeyColumnMetadata& metadata, const KeyColumnArray& left,
+ const KeyColumnArray& right, int buffer_id_to_replace);
+ /// Create for reading
+ KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
+ const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2,
+ int bit_offset0 = 0, int bit_offset1 = 0);
+ /// Create for writing
+ KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0,
+ uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0,
+ int bit_offset1 = 0);
+    /// Create as a window view of the original description, offset by a given
+    /// number of rows.
+    /// The row offset must be divisible by 8 so that bit vectors are not
+    /// split within a single byte.
+ KeyColumnArray(const KeyColumnArray& from, int64_t start, int64_t length);
+ uint8_t* mutable_data(int i) {
+      ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint8_t* data(int i) const {
+      ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return buffers_[i];
+ }
+ uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
+ const KeyColumnMetadata& metadata() const { return metadata_; }
+ int64_t length() const { return length_; }
+ int bit_offset(int i) const {
+ ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return bit_offset_[i];
+ }
+
+ private:
+ static constexpr int max_buffers_ = 3;
+ const uint8_t* buffers_[max_buffers_];
+ uint8_t* mutable_buffers_[max_buffers_];
+ KeyColumnMetadata metadata_;
+ int64_t length_;
+ // Starting bit offset within the first byte (between 0 and 7)
+ // to be used when accessing buffers that store bit vectors.
+ int bit_offset_[max_buffers_ - 1];
+ };
+
+ void Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
+ int row_alignment, int string_alignment);
+
+ const KeyRowMetadata& row_metadata() { return row_metadata_; }
+
+  /// Find out the required sizes of all output buffers for encoding
+  /// (including varying-length buffers).
+  /// Use that information to resize the provided row array so that it can fit
+  /// the encoded data.
+ Status PrepareOutputForEncode(int64_t start_input_row, int64_t num_input_rows,
+ KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& all_cols);
+
+ /// Encode a window of column oriented data into the entire output
+ /// row oriented storage.
+ /// The output buffers for encoding need to be correctly sized before
+ /// starting encoding.
+ void Encode(int64_t start_input_row, int64_t num_input_rows, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols);
+
+ /// Decode a window of row oriented data into a corresponding
+ /// window of column oriented storage.
+ /// The output buffers need to be correctly allocated and sized before
+ /// calling each method.
+ /// For that reason decoding is split into two functions.
+  /// The output of the first one, which processes everything except
+  /// varying-length buffers, can be used to determine the required
+  /// varying-length buffer sizes.
+ void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t start_row_output,
+ int64_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+
+ void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t start_row_output,
+ int64_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+
+ private:
+ /// Prepare column array vectors.
+ /// Output column arrays represent a range of input column arrays
+ /// specified by starting row and number of rows.
+ /// Three vectors are generated:
+ /// - all columns
+ /// - fixed-length columns only
+ /// - varying-length columns only
+ void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols_in);
+
+ class TransformBoolean {
+ public:
+ static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
+ const KeyColumnArray& temp);
+ static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ };
+
+ class EncoderInteger {
+ public:
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp);
+ static bool UsesTransform(const KeyColumnArray& column);
+ static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
+ const KeyColumnArray& temp);
+ static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+
+ private:
+ static bool IsBoolean(const KeyColumnMetadata& metadata);
+ };
+
+ class EncoderBinary {
+ public:
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp);
+ static bool IsInteger(const KeyColumnMetadata& metadata);
+
+ private:
+ template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
+ static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray* rows_const,
+ KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const,
+ KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn);
+ template <bool is_row_fixed_length>
+ static void EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool is_row_fixed_length>
+ static void DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#if defined(ARROW_HAVE_AVX2)
+ static void EncodeHelper_avx2(bool is_row_fixed_length, uint32_t offset_within_row,
+ KeyRowArray* rows, const KeyColumnArray& col);
+ static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row,
+ uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col);
+ template <bool is_row_fixed_length>
+ static void EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool is_row_fixed_length>
+ static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#endif
+ static void ColumnMemsetNulls(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit, uint8_t byte_value);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static void ColumnMemsetNullsImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit,
+ uint8_t byte_value);
+ };
+
+ class EncoderBinaryPair {
+ public:
+ static bool CanProcessPair(const KeyColumnMetadata& col1,
+ const KeyColumnMetadata& col2) {
+ return EncoderBinary::IsInteger(col1) && EncoderBinary::IsInteger(col2);
+ }
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1, const KeyColumnArray& col2,
+ KeyEncoderContext* ctx, KeyColumnArray* temp1,
+ KeyColumnArray* temp2);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2, KeyEncoderContext* ctx,
+ KeyColumnArray* temp1, KeyColumnArray* temp2);
+
+ private:
+ template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+ static void EncodeImp(uint32_t num_rows_to_skip, uint32_t offset_within_row,
+ KeyRowArray* rows, const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+ static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row,
+ uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2);
+#if defined(ARROW_HAVE_AVX2)
+ static uint32_t EncodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
+ uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
+ uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col1, KeyColumnArray* col2);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static uint32_t EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static uint32_t DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col1, KeyColumnArray* col2);
+#endif
+ };
+
+ class EncoderOffsets {
+ public:
+    // To avoid repeating work, encoding computes in a single pass both:
+    // a) row offsets for varying-length rows,
+    // b) within each new row, the cumulative length array of varying-length
+    //    values within that row.
+ static void Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyEncoderContext* ctx);
+ static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* varbinary_cols,
+ const std::vector<uint32_t>& varbinary_cols_base_offset,
+ KeyEncoderContext* ctx);
+
+ private:
+ static void EncodeImp(uint32_t num_rows_already_processed, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols);
+#if defined(ARROW_HAVE_AVX2)
+ static uint32_t EncodeImp_avx2(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyColumnArray* temp_buffer_32B_per_col);
+#endif
+ };
+
+ class EncoderVarBinary {
+ public:
+ static void Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx);
+
+ private:
+ template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
+ static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray* rows_const,
+ KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const,
+ KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn);
+ template <bool first_varbinary_col>
+ static void EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool first_varbinary_col>
+ static void DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#if defined(ARROW_HAVE_AVX2)
+ static void EncodeHelper_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+ template <bool first_varbinary_col>
+ static void EncodeImp_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool first_varbinary_col>
+ static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#endif
+ };
+
+ class EncoderNulls {
+ public:
+ static void Encode(KeyRowArray* rows, const std::vector<KeyColumnArray>& cols,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit);
+ static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+ };
+
+ KeyEncoderContext* ctx_;
+
+ // Data initialized once, based on data types of key columns
+ KeyRowMetadata row_metadata_;
+
+ // Data initialized for each input batch.
+ // All elements are ordered according to the order of encoded fields in a row.
+ std::vector<KeyColumnArray> batch_all_cols_;
+ std::vector<KeyColumnArray> batch_varbinary_cols_;
+ std::vector<uint32_t> batch_varbinary_cols_base_offsets_;
+};
+
+template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
+inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper(
+ uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn) {
+ ARROW_DCHECK(col_const && col_const->metadata().is_fixed_length);
+ uint32_t col_width = col_const->metadata().fixed_length;
+
+ if (is_row_fixed_length) {
+ uint32_t row_width = rows_const->metadata().fixed_length;
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(1) + col_width * i;
+ dst = rows_mutable_maybe_null->mutable_data(1) + row_width * (start_row + i) +
+ offset_within_row;
+ } else {
+ src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row;
+ dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
+ }
+ copy_fn(dst, src, col_width);
+ }
+ } else {
+ const uint32_t* row_offsets = rows_const->offsets();
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(1) + col_width * i;
+ dst = rows_mutable_maybe_null->mutable_data(2) + row_offsets[start_row + i] +
+ offset_within_row;
+ } else {
+ src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row;
+ dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
+ }
+ copy_fn(dst, src, col_width);
+ }
+ }
+}
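
For illustration only (this sketch is not part of the diff): the COPY_FN template parameter above is any callable invocable as copy_fn(dst, src, num_bytes), matching the copy_fn(dst, src, col_width) call in the loop. A minimal compatible callable, assuming <cstring> for memcpy, could look like:

    // Simplest possible COPY_FN: a plain byte copy.
    // Real callers may pass width-specialized or SIMD copies instead.
    auto copy_fn = [](uint8_t* dst, const uint8_t* src, uint32_t num_bytes) {
      memcpy(dst, src, num_bytes);
    };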
+
+template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
+inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper(
+ uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
+ const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn) {
+ // Both the column and the rows need to be varying-length.
+ ARROW_DCHECK(!rows_const->metadata().is_fixed_length &&
+ !col_const->metadata().is_fixed_length);
+
+ const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row;
+ const uint32_t* col_offsets = col_const->offsets();
+
+ uint32_t col_offset_next = col_offsets[0];
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t col_offset = col_offset_next;
+ col_offset_next = col_offsets[i + 1];
+
+ uint32_t row_offset = row_offsets_for_batch[i];
+ const uint8_t* row = rows_const->data(2) + row_offset;
+
+ uint32_t offset_within_row;
+ uint32_t length;
+ if (first_varbinary_col) {
+ rows_const->metadata().first_varbinary_offset_and_length(row, &offset_within_row,
+ &length);
+ } else {
+ rows_const->metadata().nth_varbinary_offset_and_length(row, varbinary_col_id,
+ &offset_within_row, &length);
+ }
+
+ row_offset += offset_within_row;
+
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(2) + col_offset;
+ dst = rows_mutable_maybe_null->mutable_data(2) + row_offset;
+ } else {
+ src = rows_const->data(2) + row_offset;
+ dst = col_mutable_maybe_null->mutable_data(2) + col_offset;
+ }
+ copy_fn(dst, src, length);
+ }
+}
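
A note on the offset convention the loop above relies on (the standard Arrow varbinary layout): col_offsets has num_rows + 1 entries, so value i spans [col_offsets[i], col_offsets[i + 1]). For example:

    // Illustration only: offsets {0, 5, 11} describe two values,
    // bytes [0, 5) ("apple") and bytes [5, 11) ("banana").
    const uint32_t col_offsets[] = {0, 5, 11};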
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
index 081411e708e..db69ac37d1d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
@@ -1,238 +1,238 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_hash.h"
-
-#include <memory.h>
-
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/compute/exec/util.h"
-
-namespace arrow {
-namespace compute {
-
-inline uint32_t Hashing::avalanche_helper(uint32_t acc) {
- acc ^= (acc >> 15);
- acc *= PRIME32_2;
- acc ^= (acc >> 13);
- acc *= PRIME32_3;
- acc ^= (acc >> 16);
- return acc;
-}
-
-void Hashing::avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes) {
- uint32_t processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- int tail = num_keys % 8;
- avalanche_avx2(num_keys - tail, hashes);
- processed = num_keys - tail;
- }
-#endif
- for (uint32_t i = processed; i < num_keys; ++i) {
- hashes[i] = avalanche_helper(hashes[i]);
- }
-}
-
-inline uint32_t Hashing::combine_accumulators(const uint32_t acc1, const uint32_t acc2,
- const uint32_t acc3, const uint32_t acc4) {
- return ROTL(acc1, 1) + ROTL(acc2, 7) + ROTL(acc3, 12) + ROTL(acc4, 18);
-}
-
-inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys,
- const uint8_t* keys, uint32_t* hashes) {
- ARROW_DCHECK(key_length <= 8);
- uint64_t mask = ~0ULL >> (8 * (8 - key_length));
- constexpr uint64_t multiplier = 14029467366897019727ULL;
- uint32_t offset = 0;
- for (uint32_t ikey = 0; ikey < num_keys; ++ikey) {
- uint64_t x = *reinterpret_cast<const uint64_t*>(keys + offset);
- x &= mask;
- hashes[ikey] = static_cast<uint32_t>(BYTESWAP(x * multiplier));
- offset += key_length;
- }
-}
-
-inline void Hashing::helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
- uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
- uint32_t& acc4) {
- uint64_t v1 = reinterpret_cast<const uint64_t*>(keys + offset)[0];
- // We do not need to mask v1, because we will not process a stripe
- // unless at least 9 bytes of it are part of the key.
- uint64_t v2 = reinterpret_cast<const uint64_t*>(keys + offset)[1];
- v2 &= mask_hi;
- uint32_t x1 = static_cast<uint32_t>(v1);
- uint32_t x2 = static_cast<uint32_t>(v1 >> 32);
- uint32_t x3 = static_cast<uint32_t>(v2);
- uint32_t x4 = static_cast<uint32_t>(v2 >> 32);
- acc1 += x1 * PRIME32_2;
- acc1 = ROTL(acc1, 13) * PRIME32_1;
- acc2 += x2 * PRIME32_2;
- acc2 = ROTL(acc2, 13) * PRIME32_1;
- acc3 += x3 * PRIME32_2;
- acc3 = ROTL(acc3, 13) * PRIME32_1;
- acc4 += x4 * PRIME32_2;
- acc4 = ROTL(acc4, 13) * PRIME32_1;
-}
-
-void Hashing::helper_stripes(int64_t hardware_flags, uint32_t num_keys,
- uint32_t key_length, const uint8_t* keys, uint32_t* hash) {
- uint32_t processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- int tail = num_keys % 2;
- helper_stripes_avx2(num_keys - tail, key_length, keys, hash);
- processed = num_keys - tail;
- }
-#endif
-
- // If length modulo stripe length is less than or equal to 8, round down to the nearest
- // 16B boundary (the 8B ending is processed in a separate function); otherwise round up.
- const uint32_t num_stripes = (key_length + 7) / 16;
- uint64_t mask_hi =
- ~0ULL >>
- (8 * ((num_stripes * 16 > key_length) ? num_stripes * 16 - key_length : 0));
-
- for (uint32_t i = processed; i < num_keys; ++i) {
- uint32_t acc1, acc2, acc3, acc4;
- acc1 = static_cast<uint32_t>(
- (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
- 0xffffffff);
- acc2 = PRIME32_2;
- acc3 = 0;
- acc4 = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
- uint32_t offset = i * key_length;
- for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) {
- helper_stripe(offset, ~0ULL, keys, acc1, acc2, acc3, acc4);
- offset += 16;
- }
- helper_stripe(offset, mask_hi, keys, acc1, acc2, acc3, acc4);
- hash[i] = combine_accumulators(acc1, acc2, acc3, acc4);
- }
-}
-
-inline uint32_t Hashing::helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
- uint32_t acc) {
- uint64_t v = reinterpret_cast<const uint64_t*>(keys + offset)[0];
- v &= mask;
- uint32_t x1 = static_cast<uint32_t>(v);
- uint32_t x2 = static_cast<uint32_t>(v >> 32);
- acc += x1 * PRIME32_3;
- acc = ROTL(acc, 17) * PRIME32_4;
- acc += x2 * PRIME32_3;
- acc = ROTL(acc, 17) * PRIME32_4;
- return acc;
-}
-
-void Hashing::helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash) {
- uint32_t processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- int tail = num_keys % 8;
- helper_tails_avx2(num_keys - tail, key_length, keys, hash);
- processed = num_keys - tail;
- }
-#endif
- uint64_t mask = ~0ULL >> (8 * (((key_length % 8) == 0) ? 0 : 8 - (key_length % 8)));
- uint32_t offset = key_length / 16 * 16;
- offset += processed * key_length;
- for (uint32_t i = processed; i < num_keys; ++i) {
- hash[i] = helper_tail(offset, mask, keys, hash[i]);
- offset += key_length;
- }
-}
-
-void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
- const uint8_t* keys, uint32_t* hashes) {
- ARROW_DCHECK(length_key > 0);
-
- if (length_key <= 8) {
- helper_8B(length_key, num_keys, keys, hashes);
- return;
- }
- helper_stripes(hardware_flags, num_keys, length_key, keys, hashes);
- if ((length_key % 16) > 0 && (length_key % 16) <= 8) {
- helper_tails(hardware_flags, num_keys, length_key, keys, hashes);
- }
- avalanche(hardware_flags, num_keys, hashes);
-}
-
-void Hashing::hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc) {
- for (uint32_t i = 0; i < length / 16; ++i) {
- for (int j = 0; j < 4; ++j) {
- uint32_t lane = reinterpret_cast<const uint32_t*>(key)[i * 4 + j];
- acc[j] += (lane * PRIME32_2);
- acc[j] = ROTL(acc[j], 13);
- acc[j] *= PRIME32_1;
- }
- }
-
- int tail = length % 16;
- if (tail) {
- uint64_t last_stripe[2];
- const uint64_t* last_stripe_base =
- reinterpret_cast<const uint64_t*>(key + length - (length % 16));
- last_stripe[0] = last_stripe_base[0];
- uint64_t mask = ~0ULL >> (8 * ((length + 7) / 8 * 8 - length));
- if (tail <= 8) {
- last_stripe[1] = 0;
- last_stripe[0] &= mask;
- } else {
- last_stripe[1] = last_stripe_base[1];
- last_stripe[1] &= mask;
- }
- for (int j = 0; j < 4; ++j) {
- uint32_t lane = reinterpret_cast<const uint32_t*>(last_stripe)[j];
- acc[j] += (lane * PRIME32_2);
- acc[j] = ROTL(acc[j], 13);
- acc[j] *= PRIME32_1;
- }
- }
-}
-
-void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows,
- const uint32_t* offsets, const uint8_t* concatenated_keys,
- uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
- uint32_t* hashes) {
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- hash_varlen_avx2(num_rows, offsets, concatenated_keys, temp_buffer, hashes);
- } else {
-#endif
- for (uint32_t i = 0; i < num_rows; ++i) {
- uint32_t acc[4];
- acc[0] = static_cast<uint32_t>(
- (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
- 0xffffffff);
- acc[1] = PRIME32_2;
- acc[2] = 0;
- acc[3] = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
- uint32_t length = offsets[i + 1] - offsets[i];
- hash_varlen_helper(length, concatenated_keys + offsets[i], acc);
- hashes[i] = combine_accumulators(acc[0], acc[1], acc[2], acc[3]);
- }
- avalanche(hardware_flags, num_rows, hashes);
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_hash.h"
+
+#include <memory.h>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+
+namespace arrow {
+namespace compute {
+
+inline uint32_t Hashing::avalanche_helper(uint32_t acc) {
+ acc ^= (acc >> 15);
+ acc *= PRIME32_2;
+ acc ^= (acc >> 13);
+ acc *= PRIME32_3;
+ acc ^= (acc >> 16);
+ return acc;
+}
+
+void Hashing::avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 8;
+ avalanche_avx2(num_keys - tail, hashes);
+ processed = num_keys - tail;
+ }
+#endif
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ hashes[i] = avalanche_helper(hashes[i]);
+ }
+}
+
+inline uint32_t Hashing::combine_accumulators(const uint32_t acc1, const uint32_t acc2,
+ const uint32_t acc3, const uint32_t acc4) {
+ return ROTL(acc1, 1) + ROTL(acc2, 7) + ROTL(acc3, 12) + ROTL(acc4, 18);
+}
+
+inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys,
+ const uint8_t* keys, uint32_t* hashes) {
+ ARROW_DCHECK(key_length <= 8);
+ uint64_t mask = ~0ULL >> (8 * (8 - key_length));
+ constexpr uint64_t multiplier = 14029467366897019727ULL;
+ uint32_t offset = 0;
+ for (uint32_t ikey = 0; ikey < num_keys; ++ikey) {
+ uint64_t x = *reinterpret_cast<const uint64_t*>(keys + offset);
+ x &= mask;
+ hashes[ikey] = static_cast<uint32_t>(BYTESWAP(x * multiplier));
+ offset += key_length;
+ }
+}
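
To make the masking above concrete, a compile-time check (illustration only, not part of the diff):

    // For key_length = 3 the mask keeps only the low 3 bytes of the
    // little-endian 8-byte load: ~0ULL >> (8 * (8 - 3)) == 0xFFFFFF.
    static_assert((~0ULL >> (8 * (8 - 3))) == 0xFFFFFFULL,
                  "helper_8B masks away every byte past the key length");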
+
+inline void Hashing::helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
+ uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
+ uint32_t& acc4) {
+ uint64_t v1 = reinterpret_cast<const uint64_t*>(keys + offset)[0];
+ // We do not need to mask v1, because we will not process a stripe
+ // unless at least 9 bytes of it are part of the key.
+ uint64_t v2 = reinterpret_cast<const uint64_t*>(keys + offset)[1];
+ v2 &= mask_hi;
+ uint32_t x1 = static_cast<uint32_t>(v1);
+ uint32_t x2 = static_cast<uint32_t>(v1 >> 32);
+ uint32_t x3 = static_cast<uint32_t>(v2);
+ uint32_t x4 = static_cast<uint32_t>(v2 >> 32);
+ acc1 += x1 * PRIME32_2;
+ acc1 = ROTL(acc1, 13) * PRIME32_1;
+ acc2 += x2 * PRIME32_2;
+ acc2 = ROTL(acc2, 13) * PRIME32_1;
+ acc3 += x3 * PRIME32_2;
+ acc3 = ROTL(acc3, 13) * PRIME32_1;
+ acc4 += x4 * PRIME32_2;
+ acc4 = ROTL(acc4, 13) * PRIME32_1;
+}
+
+void Hashing::helper_stripes(int64_t hardware_flags, uint32_t num_keys,
+ uint32_t key_length, const uint8_t* keys, uint32_t* hash) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 2;
+ helper_stripes_avx2(num_keys - tail, key_length, keys, hash);
+ processed = num_keys - tail;
+ }
+#endif
+
+ // If length modulo stripe length is less than or equal to 8, round down to the nearest
+ // 16B boundary (the 8B ending is processed in a separate function); otherwise round up.
+ const uint32_t num_stripes = (key_length + 7) / 16;
+ uint64_t mask_hi =
+ ~0ULL >>
+ (8 * ((num_stripes * 16 > key_length) ? num_stripes * 16 - key_length : 0));
+
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ uint32_t acc1, acc2, acc3, acc4;
+ acc1 = static_cast<uint32_t>(
+ (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
+ 0xffffffff);
+ acc2 = PRIME32_2;
+ acc3 = 0;
+ acc4 = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
+ uint32_t offset = i * key_length;
+ for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) {
+ helper_stripe(offset, ~0ULL, keys, acc1, acc2, acc3, acc4);
+ offset += 16;
+ }
+ helper_stripe(offset, mask_hi, keys, acc1, acc2, acc3, acc4);
+ hash[i] = combine_accumulators(acc1, acc2, acc3, acc4);
+ }
+}
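
A worked example of the stripe rounding rule above (illustration only, not part of the diff):

    // key_length = 24: 24 % 16 == 8 -> round down; (24 + 7) / 16 == 1 full
    //                  16B stripe, and the 8B tail goes to helper_tails.
    // key_length = 25: 25 % 16 == 9 -> round up; (25 + 7) / 16 == 2 stripes,
    //                  with mask_hi zeroing the 7 bytes past the end of the key.
    static_assert((24 + 7) / 16 == 1 && (25 + 7) / 16 == 2,
                  "stripe count rounds down only for tails of at most 8 bytes");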
+
+inline uint32_t Hashing::helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
+ uint32_t acc) {
+ uint64_t v = reinterpret_cast<const uint64_t*>(keys + offset)[0];
+ v &= mask;
+ uint32_t x1 = static_cast<uint32_t>(v);
+ uint32_t x2 = static_cast<uint32_t>(v >> 32);
+ acc += x1 * PRIME32_3;
+ acc = ROTL(acc, 17) * PRIME32_4;
+ acc += x2 * PRIME32_3;
+ acc = ROTL(acc, 17) * PRIME32_4;
+ return acc;
+}
+
+void Hashing::helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 8;
+ helper_tails_avx2(num_keys - tail, key_length, keys, hash);
+ processed = num_keys - tail;
+ }
+#endif
+ uint64_t mask = ~0ULL >> (8 * (((key_length % 8) == 0) ? 0 : 8 - (key_length % 8)));
+ uint32_t offset = key_length / 16 * 16;
+ offset += processed * key_length;
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ hash[i] = helper_tail(offset, mask, keys, hash[i]);
+ offset += key_length;
+ }
+}
+
+void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
+ const uint8_t* keys, uint32_t* hashes) {
+ ARROW_DCHECK(length_key > 0);
+
+ if (length_key <= 8) {
+ helper_8B(length_key, num_keys, keys, hashes);
+ return;
+ }
+ helper_stripes(hardware_flags, num_keys, length_key, keys, hashes);
+ if ((length_key % 16) > 0 && (length_key % 16) <= 8) {
+ helper_tails(hardware_flags, num_keys, length_key, keys, hashes);
+ }
+ avalanche(hardware_flags, num_keys, hashes);
+}
+
+void Hashing::hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc) {
+ for (uint32_t i = 0; i < length / 16; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ uint32_t lane = reinterpret_cast<const uint32_t*>(key)[i * 4 + j];
+ acc[j] += (lane * PRIME32_2);
+ acc[j] = ROTL(acc[j], 13);
+ acc[j] *= PRIME32_1;
+ }
+ }
+
+ int tail = length % 16;
+ if (tail) {
+ uint64_t last_stripe[2];
+ const uint64_t* last_stripe_base =
+ reinterpret_cast<const uint64_t*>(key + length - (length % 16));
+ last_stripe[0] = last_stripe_base[0];
+ uint64_t mask = ~0ULL >> (8 * ((length + 7) / 8 * 8 - length));
+ if (tail <= 8) {
+ last_stripe[1] = 0;
+ last_stripe[0] &= mask;
+ } else {
+ last_stripe[1] = last_stripe_base[1];
+ last_stripe[1] &= mask;
+ }
+ for (int j = 0; j < 4; ++j) {
+ uint32_t lane = reinterpret_cast<const uint32_t*>(last_stripe)[j];
+ acc[j] += (lane * PRIME32_2);
+ acc[j] = ROTL(acc[j], 13);
+ acc[j] *= PRIME32_1;
+ }
+ }
+}
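
A worked example of the tail masking above (illustration only, not part of the diff):

    // length = 21: tail = 21 % 16 = 5 and the last stripe starts at byte 16.
    // The mask keeps the low 5 bytes, so only key bytes [16, 21) survive and
    // the upper half of the stripe is zeroed.
    static_assert((~0ULL >> (8 * ((21 + 7) / 8 * 8 - 21))) == 0xFFFFFFFFFFULL,
                  "tail mask keeps exactly the 5 trailing key bytes here");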
+
+void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows,
+ const uint32_t* offsets, const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes) {
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ hash_varlen_avx2(num_rows, offsets, concatenated_keys, temp_buffer, hashes);
+ } else {
+#endif
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t acc[4];
+ acc[0] = static_cast<uint32_t>(
+ (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
+ 0xffffffff);
+ acc[1] = PRIME32_2;
+ acc[2] = 0;
+ acc[3] = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
+ uint32_t length = offsets[i + 1] - offsets[i];
+ hash_varlen_helper(length, concatenated_keys + offsets[i], acc);
+ hashes[i] = combine_accumulators(acc[0], acc[1], acc[2], acc[3]);
+ }
+ avalanche(hardware_flags, num_rows, hashes);
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
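
A minimal usage sketch of hash_varlen (illustration only, not part of the diff; it assumes hardware_flags = 0 so the scalar path runs, and it pads the keys buffer because the last-stripe load may read past the final key):

    #include "arrow/compute/exec/key_hash.h"  // include path as used above

    void HashTwoStrings() {
      uint8_t concatenated_keys[32] = "applebanana";  // zero-padded past byte 11
      const uint32_t offsets[3] = {0, 5, 11};  // row i spans [offsets[i], offsets[i + 1])
      uint32_t temp_buffer[2 * 4];  // 4 x 32-bit per row; used only by the AVX2 path
      uint32_t hashes[2];
      arrow::compute::Hashing::hash_varlen(/*hardware_flags=*/0, /*num_rows=*/2,
                                           offsets, concatenated_keys, temp_buffer,
                                           hashes);
    }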
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
index 7f8ab5185cc..4d36c9aa585 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
@@ -1,94 +1,94 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#if defined(ARROW_HAVE_AVX2)
-#include <immintrin.h>
-#endif
-
-#include <cstdint>
-
-#include "arrow/compute/exec/util.h"
-
-namespace arrow {
-namespace compute {
-
-// Implementations are based on the xxh3 32-bit algorithm description from:
-// https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md
-//
-class Hashing {
- public:
- static void hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
- const uint8_t* keys, uint32_t* hashes);
-
- static void hash_varlen(int64_t hardware_flags, uint32_t num_rows,
- const uint32_t* offsets, const uint8_t* concatenated_keys,
- uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
- uint32_t* hashes);
-
- private:
- static const uint32_t PRIME32_1 = 0x9E3779B1; // 0b10011110001101110111100110110001
- static const uint32_t PRIME32_2 = 0x85EBCA77; // 0b10000101111010111100101001110111
- static const uint32_t PRIME32_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101
- static const uint32_t PRIME32_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
- static const uint32_t PRIME32_5 = 0x165667B1; // 0b00010110010101100110011110110001
-
- // Avalanche
- static inline uint32_t avalanche_helper(uint32_t acc);
-#if defined(ARROW_HAVE_AVX2)
- static void avalanche_avx2(uint32_t num_keys, uint32_t* hashes);
-#endif
- static void avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes);
-
- // Accumulator combine
- static inline uint32_t combine_accumulators(const uint32_t acc1, const uint32_t acc2,
- const uint32_t acc3, const uint32_t acc4);
-#if defined(ARROW_HAVE_AVX2)
- static inline uint64_t combine_accumulators_avx2(__m256i acc);
-#endif
-
- // Helpers
- static inline void helper_8B(uint32_t key_length, uint32_t num_keys,
- const uint8_t* keys, uint32_t* hashes);
- static inline void helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
- uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
- uint32_t& acc4);
- static inline uint32_t helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
- uint32_t acc);
-#if defined(ARROW_HAVE_AVX2)
- static void helper_stripes_avx2(uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash);
- static void helper_tails_avx2(uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash);
-#endif
- static void helper_stripes(int64_t hardware_flags, uint32_t num_keys,
- uint32_t key_length, const uint8_t* keys, uint32_t* hash);
- static void helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash);
-
- static void hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc);
-#if defined(ARROW_HAVE_AVX2)
- static void hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets,
- const uint8_t* concatenated_keys,
- uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
- uint32_t* hashes);
-#endif
-};
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(ARROW_HAVE_AVX2)
+#include <immintrin.h>
+#endif
+
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+
+namespace arrow {
+namespace compute {
+
+// Implementations are based on the xxh3 32-bit algorithm description from:
+// https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md
+//
+class Hashing {
+ public:
+ static void hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
+ const uint8_t* keys, uint32_t* hashes);
+
+ static void hash_varlen(int64_t hardware_flags, uint32_t num_rows,
+ const uint32_t* offsets, const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes);
+
+ private:
+ static const uint32_t PRIME32_1 = 0x9E3779B1; // 0b10011110001101110111100110110001
+ static const uint32_t PRIME32_2 = 0x85EBCA77; // 0b10000101111010111100101001110111
+ static const uint32_t PRIME32_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101
+ static const uint32_t PRIME32_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
+ static const uint32_t PRIME32_5 = 0x165667B1; // 0b00010110010101100110011110110001
+
+ // Avalanche
+ static inline uint32_t avalanche_helper(uint32_t acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void avalanche_avx2(uint32_t num_keys, uint32_t* hashes);
+#endif
+ static void avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes);
+
+ // Accumulator combine
+ static inline uint32_t combine_accumulators(const uint32_t acc1, const uint32_t acc2,
+ const uint32_t acc3, const uint32_t acc4);
+#if defined(ARROW_HAVE_AVX2)
+ static inline uint64_t combine_accumulators_avx2(__m256i acc);
+#endif
+
+ // Helpers
+ static inline void helper_8B(uint32_t key_length, uint32_t num_keys,
+ const uint8_t* keys, uint32_t* hashes);
+ static inline void helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
+ uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
+ uint32_t& acc4);
+ static inline uint32_t helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
+ uint32_t acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void helper_stripes_avx2(uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+ static void helper_tails_avx2(uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+#endif
+ static void helper_stripes(int64_t hardware_flags, uint32_t num_keys,
+ uint32_t key_length, const uint8_t* keys, uint32_t* hash);
+ static void helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+
+ static void hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets,
+ const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes);
+#endif
+};
+
+} // namespace compute
+} // namespace arrow
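
For orientation, a minimal usage sketch of the fixed-length entry point declared above (illustration only, not part of the diff; hardware_flags = 0 selects the scalar path):

    #include <cstdint>
    #include <vector>
    #include "arrow/compute/exec/key_hash.h"  // include path as used above

    void HashFourFixedKeys() {
      std::vector<uint8_t> keys(4 * 16);  // four 16-byte keys, densely packed
      std::vector<uint32_t> hashes(4);
      arrow::compute::Hashing::hash_fixed(/*hardware_flags=*/0, /*num_keys=*/4,
                                          /*length_key=*/16, keys.data(),
                                          hashes.data());
    }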
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
index ac47c04403c..5cc4105f45c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
@@ -1,610 +1,610 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_map.h"
-
-#include <memory.h>
-
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-
-using BitUtil::CountLeadingZeros;
-
-namespace compute {
-
-constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL;
-
-// Search status bytes inside a block of 8 slots (64-bit word).
-// Try to find a slot that contains a 7-bit stamp matching the one provided.
-// There are three possible outcomes:
-// 1. A matching slot is found.
-// -> Return its index between 0 and 7 and set match found flag.
-// 2. A matching slot is not found and there is an empty slot in the block.
-// -> Return the index of the first empty slot and clear match found flag.
-// 3. A matching slot is not found and there are no empty slots in the block.
-// -> Return 8 as the output slot index and clear match found flag.
-//
-// Optionally, an index of the first slot to start the search from can be specified;
-// in that case, slots before it are ignored.
-//
-template <bool use_start_slot>
-inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot,
- int* out_slot, int* out_match_found) {
- // Filled slot bytes have the highest bit set to 0 and empty slots are equal to 0x80.
- uint64_t block_high_bits = block & kHighBitOfEachByte;
-
- // Replicate 7-bit stamp to all non-empty slots, leaving zeroes for empty slots.
- uint64_t stamp_pattern = stamp * ((block_high_bits ^ kHighBitOfEachByte) >> 7);
-
- // If we xor this pattern with block status bytes we get in individual bytes:
- // a) 0x00, for filled slots matching the stamp,
- // b) 0x00 < x < 0x80, for filled slots not matching the stamp,
- // c) 0x80, for empty slots.
- uint64_t block_xor_pattern = block ^ stamp_pattern;
-
- // If we then add 0x7f to every byte, we get:
- // a) 0x7F
- // b) 0x80 <= x < 0xFF
- // c) 0xFF
- uint64_t match_base = block_xor_pattern + ~kHighBitOfEachByte;
-
- // The highest bit now tells us if we have a match (0) or not (1).
- // We will negate the bits so that match is represented by a set bit.
- uint64_t matches = ~match_base;
-
- // Clear 7 non-relevant bits in each byte.
- // Also clear bytes that correspond to slots that we were supposed to
- // skip due to provided start slot index.
- // Note: the highest byte corresponds to the first slot.
- if (use_start_slot) {
- matches &= kHighBitOfEachByte >> (8 * start_slot);
- } else {
- matches &= kHighBitOfEachByte;
- }
-
- // We get 0 if there are no matches
- *out_match_found = (matches == 0 ? 0 : 1);
-
- // Now, if we OR this with the highest bits of the block and count leading zero bits,
- // we get 8x the slot index that we were looking for (hence the shift right by 3).
- // This formula works in all three cases a), b) and c).
- *out_slot = static_cast<int>(CountLeadingZeros(matches | block_high_bits) >> 3);
-}
-
-// This call follows the call to search_block.
-// The input slot index is the output returned by it: a value from 0 to 8,
-// with 8 indicating both that no match was found and that there were no empty slots.
-//
-// If the index corresponds to a non-empty slot, return the group id associated with it.
-// Otherwise return any group id from any of the slots, or
-// zero, which is the default value stored in empty slots.
-//
-inline uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
- uint64_t group_id_mask) {
- // Input slot can be equal to 8, in which case we need to output any valid group id
- // value, so we take the one from slot 0 in the block.
- int clamped_slot = slot & 7;
-
- // Group id values for all 8 slots in the block are bit-packed and follow the status
- // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
- // that case we can extract group id using aligned 64-bit word access.
- int num_groupid_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
- ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
- num_groupid_bits == 32 || num_groupid_bits == 64);
-
- int bit_offset = clamped_slot * num_groupid_bits;
- const uint64_t* group_id_bytes =
- reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
- uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
-
- return group_id;
-}
-
-// Return global slot id (the index including the information about the block)
-// where the search should continue if the first comparison fails.
-// This function always follows search_block and receives the slot id returned by it.
-//
-inline uint64_t SwissTable::next_slot_to_visit(uint64_t block_index, int slot,
- int match_found) {
- // The result should be taken modulo the number of all slots in all blocks,
- // but here we allow it to take a value one above the last slot index.
- // The modulo operation is postponed until later.
- return block_index * 8 + slot + match_found;
-}
-
-// Implements the first (fast-path, optimistic) lookup.
-// Searches for a match only within the start block,
-// trying only the first slot with a matching stamp.
-//
-// The comparison callback needed for match verification is invoked outside of this function.
-// The match bit vector filled here only indicates that a matching stamp was found in a slot.
-//
-template <bool use_selection>
-void SwissTable::lookup_1(const uint16_t* selection, const int num_keys,
- const uint32_t* hashes, uint8_t* out_match_bitvector,
- uint32_t* out_groupids, uint32_t* out_slot_ids) {
- // Clear the output bit vector
- memset(out_match_bitvector, 0, (num_keys + 7) / 8);
-
- // Based on the size of the table, prepare bit number constants.
- uint32_t stamp_mask = (1 << bits_stamp_) - 1;
- int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- uint32_t groupid_mask = (1 << num_groupid_bits) - 1;
-
- for (int i = 0; i < num_keys; ++i) {
- int id;
- if (use_selection) {
- id = util::SafeLoad(&selection[i]);
- } else {
- id = i;
- }
-
- // Extract from hash: block index and stamp
- //
- uint32_t hash = hashes[id];
- uint32_t iblock = hash >> (bits_hash_ - bits_stamp_ - log_blocks_);
- uint32_t stamp = iblock & stamp_mask;
- iblock >>= bits_stamp_;
-
- uint32_t num_block_bytes = num_groupid_bits + 8;
- const uint8_t* blockbase = reinterpret_cast<const uint8_t*>(blocks_) +
- static_cast<uint64_t>(iblock) * num_block_bytes;
- uint64_t block = util::SafeLoadAs<uint64_t>(blockbase);
-
- // Call helper functions to obtain the output triplet:
- // - match (of a stamp) found flag
- // - group id for key comparison
- // - slot to resume search from in case of no match or false positive
- int match_found;
- int islot_in_block;
- search_block<false>(block, stamp, 0, &islot_in_block, &match_found);
- uint64_t groupid = extract_group_id(blockbase, islot_in_block, groupid_mask);
- ARROW_DCHECK(groupid < num_inserted_ || num_inserted_ == 0);
- uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found);
-
- out_match_bitvector[id / 8] |= match_found << (id & 7);
- util::SafeStore(&out_groupids[id], static_cast<uint32_t>(groupid));
- util::SafeStore(&out_slot_ids[id], static_cast<uint32_t>(islot));
- }
-}
-
-// How many groups we can keep in the hash table without the need for resizing.
-// When we reach this limit, we need to break processing of any further rows and resize.
-//
-uint64_t SwissTable::num_groups_for_resize() const {
- // Resize small hash tables when 50% full (up to 12KB).
- // Resize large hash tables when 75% full.
- constexpr int log_blocks_small_ = 9;
- uint64_t num_slots = 1ULL << (log_blocks_ + 3);
- if (log_blocks_ <= log_blocks_small_) {
- return num_slots / 2;
- } else {
- return num_slots * 3 / 4;
- }
-}
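
Concretely (illustration only, not part of the diff): with log_blocks_ = 9 the table has 1 << 12 = 4096 slots and resizes once 2048 groups are present; with log_blocks_ = 10 it has 8192 slots and resizes at 6144.

    static_assert((1ULL << (9 + 3)) / 2 == 2048 &&
                  (1ULL << (10 + 3)) * 3 / 4 == 6144,
                  "resize thresholds for log_blocks_ = 9 and log_blocks_ = 10");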
-
-uint64_t SwissTable::wrap_global_slot_id(uint64_t global_slot_id) {
- uint64_t global_slot_id_mask = (1 << (log_blocks_ + 3)) - 1;
- return global_slot_id & global_slot_id_mask;
-}
-
-// Run a single round of slot search - comparison / insert - filter unprocessed.
-// Update selection vector to reflect which items have been processed.
-// Ids in selection vector do not have to be sorted.
-//
-Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
- uint16_t* inout_selection, bool* out_need_resize,
- uint32_t* out_group_ids, uint32_t* inout_next_slot_ids) {
- auto num_groups_limit = num_groups_for_resize();
- ARROW_DCHECK(num_inserted_ < num_groups_limit);
-
- // Temporary arrays are of limited size.
- // The input needs to be split into smaller portions if it exceeds that limit.
- //
- ARROW_DCHECK(*inout_num_selected <= static_cast<uint32_t>(1 << log_minibatch_));
-
- // We will split input row ids into three categories:
- // - needing to visit next block [0]
- // - needing comparison [1]
- // - inserted [2]
- //
- auto ids_inserted_buf =
- util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
- auto ids_for_comparison_buf =
- util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
- constexpr int category_nomatch = 0;
- constexpr int category_cmp = 1;
- constexpr int category_inserted = 2;
- int num_ids[3];
- num_ids[0] = num_ids[1] = num_ids[2] = 0;
- uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(),
- ids_inserted_buf.mutable_data()};
- auto push_id = [&num_ids, &ids](int category, int id) {
- util::SafeStore(&ids[category][num_ids[category]++], static_cast<uint16_t>(id));
- };
-
- uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- uint64_t groupid_mask = (1ULL << num_groupid_bits) - 1;
- constexpr uint64_t stamp_mask = 0x7f;
- uint64_t num_block_bytes = (8 + num_groupid_bits);
-
- uint32_t num_processed;
- for (num_processed = 0;
- // Second condition in for loop:
- // We need to break processing and have the caller of this function
- // resize hash table if we reach the limit of the number of groups present.
- num_processed < *inout_num_selected &&
- num_inserted_ + num_ids[category_inserted] < num_groups_limit;
- ++num_processed) {
- // row id in original batch
- int id = util::SafeLoad(&inout_selection[num_processed]);
-
- uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id]));
- uint64_t block_id = slot_id >> 3;
- uint32_t hash = hashes[id];
- uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
- uint64_t block = *reinterpret_cast<uint64_t*>(blockbase);
- uint64_t stamp = (hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask;
- int start_slot = (slot_id & 7);
-
- bool isempty = (blockbase[7 - start_slot] == 0x80);
- if (isempty) {
- // If we reach an empty slot, we insert the key as a new group
-
- blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
- uint32_t group_id = num_inserted_ + num_ids[category_inserted];
- int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
-
- // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
- // In that case we can insert group id value using aligned 64-bit word access.
- ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
- num_groupid_bits == 32 || num_groupid_bits == 64);
- uint64_t* ptr =
- &reinterpret_cast<uint64_t*>(blockbase + 8)[groupid_bit_offset >> 6];
- util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast<uint64_t>(group_id)
- << (groupid_bit_offset & 63)));
-
- hashes_[slot_id] = hash;
- util::SafeStore(&out_group_ids[id], group_id);
- push_id(category_inserted, id);
- } else {
- // We search for a slot with a matching stamp within a single block.
- // We append row id to the appropriate sequence of ids based on
- // whether the match has been found or not.
-
- int new_match_found;
- int new_slot;
- search_block<true>(block, static_cast<int>(stamp), start_slot, &new_slot,
- &new_match_found);
- auto new_groupid =
- static_cast<uint32_t>(extract_group_id(blockbase, new_slot, groupid_mask));
- ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]);
- new_slot =
- static_cast<int>(next_slot_to_visit(block_id, new_slot, new_match_found));
- util::SafeStore(&inout_next_slot_ids[id], new_slot);
- util::SafeStore(&out_group_ids[id], new_groupid);
- push_id(new_match_found, id);
- }
- }
-
- // Copy keys for newly inserted rows using callback
- RETURN_NOT_OK(append_impl_(num_ids[category_inserted], ids[category_inserted]));
- num_inserted_ += num_ids[category_inserted];
-
- // Evaluate comparisons and append ids of rows that failed it to the non-match set.
- uint32_t num_not_equal;
- equal_impl_(num_ids[category_cmp], ids[category_cmp], out_group_ids, &num_not_equal,
- ids[category_nomatch] + num_ids[category_nomatch]);
- num_ids[category_nomatch] += num_not_equal;
-
- // Append ids of any unprocessed entries if we aborted processing due to the need
- // to resize.
- if (num_processed < *inout_num_selected) {
- memmove(ids[category_nomatch] + num_ids[category_nomatch],
- inout_selection + num_processed,
- sizeof(uint16_t) * (*inout_num_selected - num_processed));
- num_ids[category_nomatch] += (*inout_num_selected - num_processed);
- }
-
- *out_need_resize = (num_inserted_ == num_groups_limit);
- *inout_num_selected = num_ids[category_nomatch];
- return Status::OK();
-}
-
-// Use hashes and callbacks to find group ids for already existing keys and
-// to insert and report newly assigned group ids for new keys.
-//
-Status SwissTable::map(const int num_keys, const uint32_t* hashes,
- uint32_t* out_groupids) {
- // Temporary buffers have limited size.
- // Caller is responsible for splitting larger input arrays into smaller chunks.
- ARROW_DCHECK(num_keys <= (1 << log_minibatch_));
-
- // Allocate temporary buffers with a lifetime of this function
- auto match_bitvector_buf = util::TempVectorHolder<uint8_t>(temp_stack_, num_keys);
- uint8_t* match_bitvector = match_bitvector_buf.mutable_data();
- auto slot_ids_buf = util::TempVectorHolder<uint32_t>(temp_stack_, num_keys);
- uint32_t* slot_ids = slot_ids_buf.mutable_data();
- auto ids_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
- uint16_t* ids = ids_buf.mutable_data();
- uint32_t num_ids;
-
- // First-pass processing.
- // Optimistically use simplified lookup involving only a start block to find
- // a single group id candidate for every input.
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
- if (log_blocks_ <= 4) {
- int tail = num_keys % 32;
- int delta = num_keys - tail;
- lookup_1_avx2_x32(num_keys - tail, hashes, match_bitvector, out_groupids, slot_ids);
- lookup_1_avx2_x8(tail, hashes + delta, match_bitvector + delta / 8,
- out_groupids + delta, slot_ids + delta);
- } else {
- lookup_1_avx2_x8(num_keys, hashes, match_bitvector, out_groupids, slot_ids);
- }
- } else {
-#endif
- lookup_1<false>(nullptr, num_keys, hashes, match_bitvector, out_groupids, slot_ids);
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-
- int64_t num_matches =
- arrow::internal::CountSetBits(match_bitvector, /*offset=*/0, num_keys);
-
- // After the first-pass processing, count the rows with matches (based on stamp
- // comparison) and decide, based on their percentage, whether to call the dense or the
- // sparse comparison function. Dense comparison means evaluating it for all inputs, even
- // if the matching stamp was not found. It may be cheaper to evaluate the comparison for all inputs if the
- // extra cost of filtering is higher than the wasted processing of rows with no match.
- //
- // Dense comparison can only be used if there is at least one inserted key,
- // because otherwise there is no key to compare to.
- //
- if (num_inserted_ > 0 && num_matches > 0 && num_matches > 3 * num_keys / 4) {
- // Dense comparisons
- equal_impl_(num_keys, nullptr, out_groupids, &num_ids, ids);
- } else {
- // Sparse comparisons that involve filtering the input set of keys
- auto ids_cmp_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
- uint16_t* ids_cmp = ids_cmp_buf.mutable_data();
- int num_ids_result;
- util::BitUtil::bits_split_indexes(hardware_flags_, num_keys, match_bitvector,
- &num_ids_result, ids, ids_cmp);
- num_ids = num_ids_result;
- uint32_t num_not_equal;
- equal_impl_(num_keys - num_ids, ids_cmp, out_groupids, &num_not_equal, ids + num_ids);
- num_ids += num_not_equal;
- }
-
- do {
- // A single round of slow-pass (robust) lookup or insert.
- // A single round ends with either a single comparison verifying the match candidate
- // or inserting a new key. A single round of slow-pass may return early if we reach
- // the limit of the number of groups due to inserts of new keys. In that case we need
- // to resize and recalculate the starting global slot ids for the new, bigger hash table.
- bool out_of_capacity;
- RETURN_NOT_OK(
- lookup_2(hashes, &num_ids, ids, &out_of_capacity, out_groupids, slot_ids));
- if (out_of_capacity) {
- RETURN_NOT_OK(grow_double());
- // Reset start slot ids for still unprocessed input keys.
- //
- for (uint32_t i = 0; i < num_ids; ++i) {
- // First slot in the new starting block
- const uint16_t id = util::SafeLoad(&ids[i]);  // ids hold uint16_t row ids; avoid narrowing
- util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8);
- }
- }
- } while (num_ids > 0);
-
- return Status::OK();
-}
-
-Status SwissTable::grow_double() {
- // Before and after metadata
- int num_group_id_bits_before = num_groupid_bits_from_log_blocks(log_blocks_);
- int num_group_id_bits_after = num_groupid_bits_from_log_blocks(log_blocks_ + 1);
- uint64_t group_id_mask_before = ~0ULL >> (64 - num_group_id_bits_before);
- int log_blocks_before = log_blocks_;
- int log_blocks_after = log_blocks_ + 1;
- uint64_t block_size_before = (8 + num_group_id_bits_before);
- uint64_t block_size_after = (8 + num_group_id_bits_after);
- uint64_t block_size_total_before = (block_size_before << log_blocks_before) + padding_;
- uint64_t block_size_total_after = (block_size_after << log_blocks_after) + padding_;
- uint64_t hashes_size_total_before =
- (bits_hash_ / 8 * (1 << (log_blocks_before + 3))) + padding_;
- uint64_t hashes_size_total_after =
- (bits_hash_ / 8 * (1 << (log_blocks_after + 3))) + padding_;
- constexpr uint32_t stamp_mask = (1 << bits_stamp_) - 1;
-
- // Allocate new buffers
- uint8_t* blocks_new;
- RETURN_NOT_OK(pool_->Allocate(block_size_total_after, &blocks_new));
- memset(blocks_new, 0, block_size_total_after);
- uint8_t* hashes_new_8B;
- uint32_t* hashes_new;
- RETURN_NOT_OK(pool_->Allocate(hashes_size_total_after, &hashes_new_8B));
- hashes_new = reinterpret_cast<uint32_t*>(hashes_new_8B);
-
- // First pass over all old blocks.
- // Reinsert entries that were not in the overflow block
- // (block other than selected by hash bits corresponding to the entry).
- for (int i = 0; i < (1 << log_blocks_); ++i) {
- // How many full slots in this block
- uint8_t* block_base = blocks_ + i * block_size_before;
- uint8_t* double_block_base_new = blocks_new + 2 * i * block_size_after;
- uint64_t block = *reinterpret_cast<const uint64_t*>(block_base);
-
- auto full_slots =
- static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
- int full_slots_new[2];
- full_slots_new[0] = full_slots_new[1] = 0;
- util::SafeStore(double_block_base_new, kHighBitOfEachByte);
- util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte);
-
- for (int j = 0; j < full_slots; ++j) {
- uint64_t slot_id = i * 8 + j;
- uint32_t hash = hashes_[slot_id];
- uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
- bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
- if (is_overflow_entry) {
- continue;
- }
-
- int ihalf = block_id_new & 1;
- uint8_t stamp_new =
- hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
- uint64_t group_id_bit_offs = j * num_group_id_bits_before;
- uint64_t group_id =
- (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
- (group_id_bit_offs & 7)) &
- group_id_mask_before;
-
- uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf];
- hashes_new[slot_id_new] = hash;
- uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after;
- block_base_new[7 - full_slots_new[ihalf]] = stamp_new;
- int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after;
- uint64_t* ptr =
- reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
- util::SafeStore(ptr,
- util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
- full_slots_new[ihalf]++;
- }
- }
-
- // Second pass over all old blocks.
- // Reinsert entries that were in an overflow block.
- for (int i = 0; i < (1 << log_blocks_); ++i) {
- // How many full slots in this block
- uint8_t* block_base = blocks_ + i * block_size_before;
- uint64_t block = util::SafeLoadAs<uint64_t>(block_base);
- int full_slots = static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
-
- for (int j = 0; j < full_slots; ++j) {
- uint64_t slot_id = i * 8 + j;
- uint32_t hash = hashes_[slot_id];
- uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
- bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
- if (!is_overflow_entry) {
- continue;
- }
-
- uint64_t group_id_bit_offs = j * num_group_id_bits_before;
- uint64_t group_id =
- (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
- (group_id_bit_offs & 7)) &
- group_id_mask_before;
- uint8_t stamp_new =
- hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
-
- uint8_t* block_base_new = blocks_new + block_id_new * block_size_after;
- uint64_t block_new = util::SafeLoadAs<uint64_t>(block_base_new);
- int full_slots_new =
- static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
- while (full_slots_new == 8) {
- block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1);
- block_base_new = blocks_new + block_id_new * block_size_after;
- block_new = util::SafeLoadAs<uint64_t>(block_base_new);
- full_slots_new =
- static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
- }
-
- hashes_new[block_id_new * 8 + full_slots_new] = hash;
- block_base_new[7 - full_slots_new] = stamp_new;
- int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after;
- uint64_t* ptr =
- reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
- util::SafeStore(ptr,
- util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
- }
- }
-
- pool_->Free(blocks_, block_size_total_before);
- pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hashes_size_total_before);
- log_blocks_ = log_blocks_after;
- blocks_ = blocks_new;
- hashes_ = hashes_new;
-
- return Status::OK();
-}
-
-Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool,
- util::TempVectorStack* temp_stack, int log_minibatch,
- EqualImpl equal_impl, AppendImpl append_impl) {
- hardware_flags_ = hardware_flags;
- pool_ = pool;
- temp_stack_ = temp_stack;
- log_minibatch_ = log_minibatch;
- equal_impl_ = equal_impl;
- append_impl_ = append_impl;
-
- log_blocks_ = 0;
- int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- num_inserted_ = 0;
-
- const uint64_t block_bytes = 8 + num_groupid_bits;
- const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
- RETURN_NOT_OK(pool_->Allocate(slot_bytes, &blocks_));
-
- // Make sure group ids are initially set to zero for all slots.
- memset(blocks_, 0, slot_bytes);
-
- // Initialize all status bytes to represent an empty slot.
- for (uint64_t i = 0; i < (static_cast<uint64_t>(1) << log_blocks_); ++i) {
- util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte);
- }
-
- uint64_t num_slots = 1ULL << (log_blocks_ + 3);
- const uint64_t hash_size = sizeof(uint32_t);
- const uint64_t hash_bytes = hash_size * num_slots + padding_;
- uint8_t* hashes8;
- RETURN_NOT_OK(pool_->Allocate(hash_bytes, &hashes8));
- hashes_ = reinterpret_cast<uint32_t*>(hashes8);
-
- return Status::OK();
-}
-
-void SwissTable::cleanup() {
- if (blocks_) {
- int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- const uint64_t block_bytes = 8 + num_groupid_bits;
- const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
- pool_->Free(blocks_, slot_bytes);
- blocks_ = nullptr;
- }
- if (hashes_) {
- uint64_t num_slots = 1ULL << (log_blocks_ + 3);
- const uint64_t hash_size = sizeof(uint32_t);
- const uint64_t hash_bytes = hash_size * num_slots + padding_;
- pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hash_bytes);
- hashes_ = nullptr;
- }
- log_blocks_ = 0;
- num_inserted_ = 0;
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_map.h"
+
+#include <memory.h>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+
+using BitUtil::CountLeadingZeros;
+
+namespace compute {
+
+constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL;
+
+// Search status bytes inside a block of 8 slots (64-bit word).
+// Try to find a slot that contains a 7-bit stamp matching the one provided.
+// There are three possible outcomes:
+// 1. A matching slot is found.
+// -> Return its index between 0 and 7 and set match found flag.
+// 2. A matching slot is not found and there is an empty slot in the block.
+// -> Return the index of the first empty slot and clear match found flag.
+// 3. A matching slot is not found and there are no empty slots in the block.
+// -> Return 8 as the output slot index and clear match found flag.
+//
+// Optionally, an index of the first slot to start the search from can be specified;
+// in that case, slots before it are ignored.
+//
+template <bool use_start_slot>
+inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot,
+ int* out_slot, int* out_match_found) {
+ // Filled slot bytes have the highest bit set to 0 and empty slots are equal to 0x80.
+ uint64_t block_high_bits = block & kHighBitOfEachByte;
+
+ // Replicate 7-bit stamp to all non-empty slots, leaving zeroes for empty slots.
+ uint64_t stamp_pattern = stamp * ((block_high_bits ^ kHighBitOfEachByte) >> 7);
+
+ // If we xor this pattern with block status bytes we get in individual bytes:
+ // a) 0x00, for filled slots matching the stamp,
+ // b) 0x00 < x < 0x80, for filled slots not matching the stamp,
+ // c) 0x80, for empty slots.
+ uint64_t block_xor_pattern = block ^ stamp_pattern;
+
+ // If we then add 0x7f to every byte, we get:
+ // a) 0x7F
+ // b) 0x80 <= x < 0xFF
+ // c) 0xFF
+ uint64_t match_base = block_xor_pattern + ~kHighBitOfEachByte;
+
+ // The highest bit now tells us if we have a match (0) or not (1).
+ // We will negate the bits so that match is represented by a set bit.
+ uint64_t matches = ~match_base;
+
+ // Clear 7 non-relevant bits in each byte.
+ // Also clear bytes that correspond to slots that we were supposed to
+ // skip due to provided start slot index.
+ // Note: the highest byte corresponds to the first slot.
+ if (use_start_slot) {
+ matches &= kHighBitOfEachByte >> (8 * start_slot);
+ } else {
+ matches &= kHighBitOfEachByte;
+ }
+
+ // We get 0 if there are no matches
+ *out_match_found = (matches == 0 ? 0 : 1);
+
+ // Now, if we OR this with the highest bits of the block and count leading zero bits,
+ // we get 8x the slot index that we were looking for (hence the shift right by 3).
+ // This formula works in all three cases a), b) and c).
+ *out_slot = static_cast<int>(CountLeadingZeros(matches | block_high_bits) >> 3);
+}
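
A worked example of the SWAR search above (illustration only, not part of the diff; values computed by hand):

    // block = 0x5A33808080808080: slot 0 holds stamp 0x5A, slot 1 holds 0x33,
    // slots 2..7 are empty (0x80). Searching for stamp 0x5A:
    //   stamp_pattern         = 0x5A5A000000000000  (stamp in the filled slots)
    //   block ^ stamp_pattern = 0x0069808080808080  (0x00 marks the match)
    //   + ~kHighBitOfEachByte = 0x7FE8FFFFFFFFFFFF  (high bit clear only on match)
    //   ~x & kHighBitOfEachByte = 0x8000000000000000 -> match_found = 1
    //   CountLeadingZeros(0x8000808080808080) >> 3 == 0 -> match in slot 0.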
+
+// This call follows the call to search_block.
+// The input slot index is the output returned by it: a value from 0 to 8,
+// with 8 indicating both that no match was found and that there were no empty slots.
+//
+// If the index corresponds to a non-empty slot, return the group id associated with it.
+// Otherwise return any group id from any of the slots, or
+// zero, which is the default value stored in empty slots.
+//
+inline uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
+ uint64_t group_id_mask) {
+ // Input slot can be equal to 8, in which case we need to output any valid group id
+ // value, so we take the one from slot 0 in the block.
+ int clamped_slot = slot & 7;
+
+ // Group id values for all 8 slots in the block are bit-packed and follow the status
+ // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
+ // that case we can extract group id using aligned 64-bit word access.
+ int num_groupid_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
+ ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+ num_groupid_bits == 32 || num_groupid_bits == 64);
+
+ int bit_offset = clamped_slot * num_groupid_bits;
+ const uint64_t* group_id_bytes =
+ reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
+ uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
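+
+  // Example (hypothetical values): with num_groupid_bits = 16 and slot 3,
+  // bit_offset = 48, so the group id occupies bits 48..63 of the first
+  // 64-bit word after the status bytes.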
+
+ return group_id;
+}
+
+// Return global slot id (the index including the information about the block)
+// where the search should continue if the first comparison fails.
+// This function always follows search_block and receives the slot id returned by it.
+//
+inline uint64_t SwissTable::next_slot_to_visit(uint64_t block_index, int slot,
+ int match_found) {
+ // The result should be taken modulo the number of all slots in all blocks,
+ // but here we allow it to take a value one above the last slot index.
+ // Modulo operation is postponed to later.
+ return block_index * 8 + slot + match_found;
+}
+
+// Implements the first (fast-path, optimistic) lookup.
+// Searches for a match only within the start block,
+// trying only the first slot with a matching stamp.
+//
+// The comparison callback needed for match verification is invoked outside of
+// this function. The match bit vector filled here only indicates that a
+// matching stamp was found in a slot.
+//
+template <bool use_selection>
+void SwissTable::lookup_1(const uint16_t* selection, const int num_keys,
+ const uint32_t* hashes, uint8_t* out_match_bitvector,
+ uint32_t* out_groupids, uint32_t* out_slot_ids) {
+ // Clear the output bit vector
+ memset(out_match_bitvector, 0, (num_keys + 7) / 8);
+
+ // Based on the size of the table, prepare bit number constants.
+ uint32_t stamp_mask = (1 << bits_stamp_) - 1;
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ uint32_t groupid_mask = (1 << num_groupid_bits) - 1;
+
+ for (int i = 0; i < num_keys; ++i) {
+ int id;
+ if (use_selection) {
+ id = util::SafeLoad(&selection[i]);
+ } else {
+ id = i;
+ }
+
+ // Extract from hash: block index and stamp
+ //
+ uint32_t hash = hashes[id];
+ uint32_t iblock = hash >> (bits_hash_ - bits_stamp_ - log_blocks_);
+ uint32_t stamp = iblock & stamp_mask;
+ iblock >>= bits_stamp_;
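+
+    // Illustrative example: with bits_hash_ = 32, bits_stamp_ = 7 and
+    // log_blocks_ = 4, the shift keeps the top 11 bits of the hash; the low 7
+    // of those form the stamp and the top 4 select one of the 16 blocks.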
+
+ uint32_t num_block_bytes = num_groupid_bits + 8;
+ const uint8_t* blockbase = reinterpret_cast<const uint8_t*>(blocks_) +
+ static_cast<uint64_t>(iblock) * num_block_bytes;
+ uint64_t block = util::SafeLoadAs<uint64_t>(blockbase);
+
+ // Call helper functions to obtain the output triplet:
+ // - match (of a stamp) found flag
+ // - group id for key comparison
+ // - slot to resume search from in case of no match or false positive
+ int match_found;
+ int islot_in_block;
+ search_block<false>(block, stamp, 0, &islot_in_block, &match_found);
+ uint64_t groupid = extract_group_id(blockbase, islot_in_block, groupid_mask);
+ ARROW_DCHECK(groupid < num_inserted_ || num_inserted_ == 0);
+ uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found);
+
+ out_match_bitvector[id / 8] |= match_found << (id & 7);
+ util::SafeStore(&out_groupids[id], static_cast<uint32_t>(groupid));
+ util::SafeStore(&out_slot_ids[id], static_cast<uint32_t>(islot));
+ }
+}
+
+// How many groups we can keep in the hash table without needing to resize.
+// When we reach this limit, we stop processing further rows and resize.
+//
+uint64_t SwissTable::num_groups_for_resize() const {
+ // Resize small hash tables when 50% full (up to 12KB).
+ // Resize large hash tables when 75% full.
+ constexpr int log_blocks_small_ = 9;
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
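+  // For instance, at log_blocks_ = 9 there are 4096 slots and the block data
+  // occupies roughly 12KB (512 blocks of 24 bytes), so resizing is triggered
+  // at 2048 groups.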
+ if (log_blocks_ <= log_blocks_small_) {
+ return num_slots / 2;
+ } else {
+ return num_slots * 3 / 4;
+ }
+}
+
+uint64_t SwissTable::wrap_global_slot_id(uint64_t global_slot_id) {
+ uint64_t global_slot_id_mask = (1 << (log_blocks_ + 3)) - 1;
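+  // E.g. log_blocks_ = 4: 16 blocks * 8 slots = 128 slots, mask = 0x7F.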
+ return global_slot_id & global_slot_id_mask;
+}
+
+// Run a single round of slot search - comparison / insert - filter unprocessed.
+// Update selection vector to reflect which items have been processed.
+// Ids in selection vector do not have to be sorted.
+//
+Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
+ uint16_t* inout_selection, bool* out_need_resize,
+ uint32_t* out_group_ids, uint32_t* inout_next_slot_ids) {
+ auto num_groups_limit = num_groups_for_resize();
+ ARROW_DCHECK(num_inserted_ < num_groups_limit);
+
+ // Temporary arrays are of limited size.
+ // The input needs to be split into smaller portions if it exceeds that limit.
+ //
+ ARROW_DCHECK(*inout_num_selected <= static_cast<uint32_t>(1 << log_minibatch_));
+
+ // We will split input row ids into three categories:
+ // - needing to visit next block [0]
+ // - needing comparison [1]
+ // - inserted [2]
+ //
+ auto ids_inserted_buf =
+ util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
+ auto ids_for_comparison_buf =
+ util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
+ constexpr int category_nomatch = 0;
+ constexpr int category_cmp = 1;
+ constexpr int category_inserted = 2;
+ int num_ids[3];
+ num_ids[0] = num_ids[1] = num_ids[2] = 0;
+ uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(),
+ ids_inserted_buf.mutable_data()};
+ auto push_id = [&num_ids, &ids](int category, int id) {
+ util::SafeStore(&ids[category][num_ids[category]++], static_cast<uint16_t>(id));
+ };
+
+ uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ uint64_t groupid_mask = (1ULL << num_groupid_bits) - 1;
+ constexpr uint64_t stamp_mask = 0x7f;
+ uint64_t num_block_bytes = (8 + num_groupid_bits);
+
+ uint32_t num_processed;
+ for (num_processed = 0;
+       // Second condition in the for loop:
+       // We need to stop processing and have the caller of this function
+       // resize the hash table if we reach the limit on the number of groups.
+ num_processed < *inout_num_selected &&
+ num_inserted_ + num_ids[category_inserted] < num_groups_limit;
+ ++num_processed) {
+ // row id in original batch
+ int id = util::SafeLoad(&inout_selection[num_processed]);
+
+ uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id]));
+ uint64_t block_id = slot_id >> 3;
+ uint32_t hash = hashes[id];
+ uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
+ uint64_t block = *reinterpret_cast<uint64_t*>(blockbase);
+ uint64_t stamp = (hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask;
+ int start_slot = (slot_id & 7);
+
+ bool isempty = (blockbase[7 - start_slot] == 0x80);
+ if (isempty) {
+      // If we reach an empty slot, we insert the key for a new group.
+
+ blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
+ uint32_t group_id = num_inserted_ + num_ids[category_inserted];
+ int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
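+
+      // Worked example (hypothetical values): with num_groupid_bits = 8 and
+      // start_slot = 3, groupid_bit_offset = 24, so the new group id is OR-ed
+      // into bits 24..31 of the first 64-bit word after the status bytes.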
+
+ // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
+ // In that case we can insert group id value using aligned 64-bit word access.
+ ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+ num_groupid_bits == 32 || num_groupid_bits == 64);
+ uint64_t* ptr =
+ &reinterpret_cast<uint64_t*>(blockbase + 8)[groupid_bit_offset >> 6];
+ util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast<uint64_t>(group_id)
+ << (groupid_bit_offset & 63)));
+
+ hashes_[slot_id] = hash;
+ util::SafeStore(&out_group_ids[id], group_id);
+ push_id(category_inserted, id);
+ } else {
+      // We search for a slot with a matching stamp within a single block.
+      // We append the row id to the appropriate sequence of ids based on
+      // whether a match has been found or not.
+
+ int new_match_found;
+ int new_slot;
+ search_block<true>(block, static_cast<int>(stamp), start_slot, &new_slot,
+ &new_match_found);
+ auto new_groupid =
+ static_cast<uint32_t>(extract_group_id(blockbase, new_slot, groupid_mask));
+ ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]);
+ new_slot =
+ static_cast<int>(next_slot_to_visit(block_id, new_slot, new_match_found));
+ util::SafeStore(&inout_next_slot_ids[id], new_slot);
+ util::SafeStore(&out_group_ids[id], new_groupid);
+ push_id(new_match_found, id);
+ }
+ }
+
+  // Copy keys for newly inserted rows using the callback.
+ RETURN_NOT_OK(append_impl_(num_ids[category_inserted], ids[category_inserted]));
+ num_inserted_ += num_ids[category_inserted];
+
+  // Evaluate comparisons and append the ids of rows that fail them to the non-match set.
+ uint32_t num_not_equal;
+ equal_impl_(num_ids[category_cmp], ids[category_cmp], out_group_ids, &num_not_equal,
+ ids[category_nomatch] + num_ids[category_nomatch]);
+ num_ids[category_nomatch] += num_not_equal;
+
+ // Append ids of any unprocessed entries if we aborted processing due to the need
+ // to resize.
+ if (num_processed < *inout_num_selected) {
+ memmove(ids[category_nomatch] + num_ids[category_nomatch],
+ inout_selection + num_processed,
+ sizeof(uint16_t) * (*inout_num_selected - num_processed));
+ num_ids[category_nomatch] += (*inout_num_selected - num_processed);
+ }
+
+ *out_need_resize = (num_inserted_ == num_groups_limit);
+ *inout_num_selected = num_ids[category_nomatch];
+ return Status::OK();
+}
+
+// Use hashes and callbacks to find group ids for already existing keys and
+// to insert and report newly assigned group ids for new keys.
+//
+Status SwissTable::map(const int num_keys, const uint32_t* hashes,
+ uint32_t* out_groupids) {
+ // Temporary buffers have limited size.
+ // Caller is responsible for splitting larger input arrays into smaller chunks.
+ ARROW_DCHECK(num_keys <= (1 << log_minibatch_));
+
+ // Allocate temporary buffers with a lifetime of this function
+ auto match_bitvector_buf = util::TempVectorHolder<uint8_t>(temp_stack_, num_keys);
+ uint8_t* match_bitvector = match_bitvector_buf.mutable_data();
+ auto slot_ids_buf = util::TempVectorHolder<uint32_t>(temp_stack_, num_keys);
+ uint32_t* slot_ids = slot_ids_buf.mutable_data();
+ auto ids_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
+ uint16_t* ids = ids_buf.mutable_data();
+ uint32_t num_ids;
+
+ // First-pass processing.
+ // Optimistically use simplified lookup involving only a start block to find
+ // a single group id candidate for every input.
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
+ if (log_blocks_ <= 4) {
+ int tail = num_keys % 32;
+ int delta = num_keys - tail;
+ lookup_1_avx2_x32(num_keys - tail, hashes, match_bitvector, out_groupids, slot_ids);
+ lookup_1_avx2_x8(tail, hashes + delta, match_bitvector + delta / 8,
+ out_groupids + delta, slot_ids + delta);
+ } else {
+ lookup_1_avx2_x8(num_keys, hashes, match_bitvector, out_groupids, slot_ids);
+ }
+ } else {
+#endif
+ lookup_1<false>(nullptr, num_keys, hashes, match_bitvector, out_groupids, slot_ids);
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+
+ int64_t num_matches =
+ arrow::internal::CountSetBits(match_bitvector, /*offset=*/0, num_keys);
+
+  // After the first pass, count the rows with matches (based on stamp
+  // comparison) and use their percentage to decide whether to call the dense
+  // or the sparse comparison function. Dense comparison means evaluating it
+  // for all inputs, even those for which no matching stamp was found. That can
+  // be cheaper when the extra cost of filtering exceeds the wasted processing
+  // of rows with no match.
+ //
+ // Dense comparison can only be used if there is at least one inserted key,
+ // because otherwise there is no key to compare to.
+ //
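+  // As an illustration: with num_keys = 1024 the dense path is taken only
+  // when more than 768 of the keys saw a stamp match in the first pass.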
+ if (num_inserted_ > 0 && num_matches > 0 && num_matches > 3 * num_keys / 4) {
+ // Dense comparisons
+ equal_impl_(num_keys, nullptr, out_groupids, &num_ids, ids);
+ } else {
+ // Sparse comparisons that involve filtering the input set of keys
+ auto ids_cmp_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
+ uint16_t* ids_cmp = ids_cmp_buf.mutable_data();
+ int num_ids_result;
+ util::BitUtil::bits_split_indexes(hardware_flags_, num_keys, match_bitvector,
+ &num_ids_result, ids, ids_cmp);
+ num_ids = num_ids_result;
+ uint32_t num_not_equal;
+ equal_impl_(num_keys - num_ids, ids_cmp, out_groupids, &num_not_equal, ids + num_ids);
+ num_ids += num_not_equal;
+ }
+
+ do {
+    // A single round of slow-pass (robust) lookup or insert.
+    // A round ends with either a single comparison verifying the match
+    // candidate or the insertion of a new key. A round may also return early
+    // if inserts of new keys reach the limit on the number of groups. In that
+    // case we need to resize and recalculate the starting global slot ids for
+    // the new, bigger hash table.
+ bool out_of_capacity;
+ RETURN_NOT_OK(
+ lookup_2(hashes, &num_ids, ids, &out_of_capacity, out_groupids, slot_ids));
+ if (out_of_capacity) {
+ RETURN_NOT_OK(grow_double());
+ // Reset start slot ids for still unprocessed input keys.
+ //
+ for (uint32_t i = 0; i < num_ids; ++i) {
+ // First slot in the new starting block
+        const uint16_t id = util::SafeLoad(&ids[i]);
+ util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8);
+ }
+ }
+ } while (num_ids > 0);
+
+ return Status::OK();
+}
+
+Status SwissTable::grow_double() {
+ // Before and after metadata
+ int num_group_id_bits_before = num_groupid_bits_from_log_blocks(log_blocks_);
+ int num_group_id_bits_after = num_groupid_bits_from_log_blocks(log_blocks_ + 1);
+ uint64_t group_id_mask_before = ~0ULL >> (64 - num_group_id_bits_before);
+ int log_blocks_before = log_blocks_;
+ int log_blocks_after = log_blocks_ + 1;
+ uint64_t block_size_before = (8 + num_group_id_bits_before);
+ uint64_t block_size_after = (8 + num_group_id_bits_after);
+ uint64_t block_size_total_before = (block_size_before << log_blocks_before) + padding_;
+ uint64_t block_size_total_after = (block_size_after << log_blocks_after) + padding_;
+ uint64_t hashes_size_total_before =
+ (bits_hash_ / 8 * (1 << (log_blocks_before + 3))) + padding_;
+ uint64_t hashes_size_total_after =
+ (bits_hash_ / 8 * (1 << (log_blocks_after + 3))) + padding_;
+ constexpr uint32_t stamp_mask = (1 << bits_stamp_) - 1;
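+
+  // Sizing example (illustrative): growing from log_blocks_ = 5 to 6 widens
+  // group ids from 8 to 16 bits, so a block grows from 16 to 24 bytes while
+  // the number of blocks doubles from 32 to 64.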
+
+ // Allocate new buffers
+ uint8_t* blocks_new;
+ RETURN_NOT_OK(pool_->Allocate(block_size_total_after, &blocks_new));
+ memset(blocks_new, 0, block_size_total_after);
+ uint8_t* hashes_new_8B;
+ uint32_t* hashes_new;
+ RETURN_NOT_OK(pool_->Allocate(hashes_size_total_after, &hashes_new_8B));
+ hashes_new = reinterpret_cast<uint32_t*>(hashes_new_8B);
+
+  // First pass over all old blocks.
+  // Reinsert entries that were not overflow entries (an overflow entry is one
+  // stored in a block other than the one selected by its hash bits).
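+  // For example, non-overflow entries of old block 5 land in new block 10 or
+  // 11; the extra hash bit that becomes visible after doubling (ihalf below)
+  // selects between the two halves.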
+ for (int i = 0; i < (1 << log_blocks_); ++i) {
+ // How many full slots in this block
+ uint8_t* block_base = blocks_ + i * block_size_before;
+ uint8_t* double_block_base_new = blocks_new + 2 * i * block_size_after;
+ uint64_t block = *reinterpret_cast<const uint64_t*>(block_base);
+
+ auto full_slots =
+ static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
+ int full_slots_new[2];
+ full_slots_new[0] = full_slots_new[1] = 0;
+ util::SafeStore(double_block_base_new, kHighBitOfEachByte);
+ util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte);
+
+ for (int j = 0; j < full_slots; ++j) {
+ uint64_t slot_id = i * 8 + j;
+ uint32_t hash = hashes_[slot_id];
+ uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
+ bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
+ if (is_overflow_entry) {
+ continue;
+ }
+
+ int ihalf = block_id_new & 1;
+ uint8_t stamp_new =
+ hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
+ uint64_t group_id_bit_offs = j * num_group_id_bits_before;
+ uint64_t group_id =
+ (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
+ (group_id_bit_offs & 7)) &
+ group_id_mask_before;
+
+ uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf];
+ hashes_new[slot_id_new] = hash;
+ uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after;
+ block_base_new[7 - full_slots_new[ihalf]] = stamp_new;
+ int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after;
+ uint64_t* ptr =
+ reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
+ util::SafeStore(ptr,
+ util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
+ full_slots_new[ihalf]++;
+ }
+ }
+
+  // Second pass over all old blocks.
+  // Reinsert the overflow entries that the first pass skipped.
+ for (int i = 0; i < (1 << log_blocks_); ++i) {
+ // How many full slots in this block
+ uint8_t* block_base = blocks_ + i * block_size_before;
+ uint64_t block = util::SafeLoadAs<uint64_t>(block_base);
+ int full_slots = static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
+
+ for (int j = 0; j < full_slots; ++j) {
+ uint64_t slot_id = i * 8 + j;
+ uint32_t hash = hashes_[slot_id];
+ uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
+ bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
+ if (!is_overflow_entry) {
+ continue;
+ }
+
+ uint64_t group_id_bit_offs = j * num_group_id_bits_before;
+ uint64_t group_id =
+ (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
+ (group_id_bit_offs & 7)) &
+ group_id_mask_before;
+ uint8_t stamp_new =
+ hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
+
+ uint8_t* block_base_new = blocks_new + block_id_new * block_size_after;
+ uint64_t block_new = util::SafeLoadAs<uint64_t>(block_base_new);
+ int full_slots_new =
+ static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
+ while (full_slots_new == 8) {
+ block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1);
+ block_base_new = blocks_new + block_id_new * block_size_after;
+ block_new = util::SafeLoadAs<uint64_t>(block_base_new);
+ full_slots_new =
+ static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
+ }
+
+ hashes_new[block_id_new * 8 + full_slots_new] = hash;
+ block_base_new[7 - full_slots_new] = stamp_new;
+ int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after;
+ uint64_t* ptr =
+ reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
+ util::SafeStore(ptr,
+ util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
+ }
+ }
+
+ pool_->Free(blocks_, block_size_total_before);
+ pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hashes_size_total_before);
+ log_blocks_ = log_blocks_after;
+ blocks_ = blocks_new;
+ hashes_ = hashes_new;
+
+ return Status::OK();
+}
+
+Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool,
+ util::TempVectorStack* temp_stack, int log_minibatch,
+ EqualImpl equal_impl, AppendImpl append_impl) {
+ hardware_flags_ = hardware_flags;
+ pool_ = pool;
+ temp_stack_ = temp_stack;
+ log_minibatch_ = log_minibatch;
+ equal_impl_ = equal_impl;
+ append_impl_ = append_impl;
+
+ log_blocks_ = 0;
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ num_inserted_ = 0;
+
+ const uint64_t block_bytes = 8 + num_groupid_bits;
+ const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
+ RETURN_NOT_OK(pool_->Allocate(slot_bytes, &blocks_));
+
+ // Make sure group ids are initially set to zero for all slots.
+ memset(blocks_, 0, slot_bytes);
+
+ // Initialize all status bytes to represent an empty slot.
+ for (uint64_t i = 0; i < (static_cast<uint64_t>(1) << log_blocks_); ++i) {
+ util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte);
+ }
+
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ const uint64_t hash_size = sizeof(uint32_t);
+ const uint64_t hash_bytes = hash_size * num_slots + padding_;
+ uint8_t* hashes8;
+ RETURN_NOT_OK(pool_->Allocate(hash_bytes, &hashes8));
+ hashes_ = reinterpret_cast<uint32_t*>(hashes8);
+
+ return Status::OK();
+}
+
+void SwissTable::cleanup() {
+ if (blocks_) {
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ const uint64_t block_bytes = 8 + num_groupid_bits;
+ const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
+ pool_->Free(blocks_, slot_bytes);
+ blocks_ = nullptr;
+ }
+ if (hashes_) {
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ const uint64_t hash_size = sizeof(uint32_t);
+ const uint64_t hash_bytes = hash_size * num_slots + padding_;
+ pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hash_bytes);
+ hashes_ = nullptr;
+ }
+ log_blocks_ = 0;
+ num_inserted_ = 0;
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
index 8c472736ec4..da50db91040 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
@@ -1,172 +1,172 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <functional>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-
-namespace arrow {
-namespace compute {
-
-class SwissTable {
- public:
- SwissTable() = default;
- ~SwissTable() { cleanup(); }
-
- using EqualImpl =
- std::function<void(int num_keys, const uint16_t* selection /* may be null */,
- const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
- uint16_t* out_selection_mismatch)>;
- using AppendImpl = std::function<Status(int num_keys, const uint16_t* selection)>;
-
- Status init(int64_t hardware_flags, MemoryPool* pool, util::TempVectorStack* temp_stack,
- int log_minibatch, EqualImpl equal_impl, AppendImpl append_impl);
- void cleanup();
-
-  Status map(const int num_keys, const uint32_t* hashes, uint32_t* out_groupids);
-
- private:
- // Lookup helpers
-
- /// \brief Scan bytes in block in reverse and stop as soon
- /// as a position of interest is found.
- ///
- /// Positions of interest:
- /// a) slot with a matching stamp is encountered,
- /// b) first empty slot is encountered,
- /// c) we reach the end of the block.
- ///
- /// \param[in] block 8 byte block of hash table
- /// \param[in] stamp 7 bits of hash used as a stamp
- /// \param[in] start_slot Index of the first slot in the block to start search from. We
- /// assume that this index always points to a non-empty slot, equivalently
- /// that it comes before any empty slots. (Used only by one template
- /// variant.)
- /// \param[out] out_slot index corresponding to the discovered position of interest (8
- /// represents end of block).
- /// \param[out] out_match_found an integer flag (0 or 1) indicating if we found a
- /// matching stamp.
- template <bool use_start_slot>
- inline void search_block(uint64_t block, int stamp, int start_slot, int* out_slot,
- int* out_match_found);
-
- /// \brief Extract group id for a given slot in a given block.
- ///
- /// Group ids follow in memory after 64-bit block data.
- /// Maximum number of groups inserted is equal to the number
- /// of all slots in all blocks, which is 8 * the number of blocks.
- /// Group ids are bit packed using that maximum to determine the necessary number of
- /// bits.
- inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
- uint64_t group_id_mask);
-
- inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found);
-
- inline void insert(uint8_t* block_base, uint64_t slot_id, uint32_t hash, uint8_t stamp,
- uint32_t group_id);
-
- inline uint64_t num_groups_for_resize() const;
-
- inline uint64_t wrap_global_slot_id(uint64_t global_slot_id);
-
- // First hash table access
-  // Find the first match in the start block, if one exists.
- // Possible cases:
- // 1. Stamp match in a block
- // 2. No stamp match in a block, no empty buckets in a block
- // 3. No stamp match in a block, empty buckets in a block
- //
- template <bool use_selection>
- void lookup_1(const uint16_t* selection, const int num_keys, const uint32_t* hashes,
- uint8_t* out_match_bitvector, uint32_t* out_group_ids,
- uint32_t* out_slot_ids);
-#if defined(ARROW_HAVE_AVX2)
- void lookup_1_avx2_x8(const int num_hashes, const uint32_t* hashes,
- uint8_t* out_match_bitvector, uint32_t* out_group_ids,
- uint32_t* out_next_slot_ids);
- void lookup_1_avx2_x32(const int num_hashes, const uint32_t* hashes,
- uint8_t* out_match_bitvector, uint32_t* out_group_ids,
- uint32_t* out_next_slot_ids);
-#endif
-
- // Completing hash table lookup post first access
-  Status lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
-                  uint16_t* inout_selection, bool* out_need_resize,
-                  uint32_t* out_group_ids, uint32_t* inout_next_slot_ids);
-
-  // Resize small hash tables when 50% full (up to 12KB).
- // Resize large hash tables when 75% full.
- Status grow_double();
-
- static int num_groupid_bits_from_log_blocks(int log_blocks) {
- int required_bits = log_blocks + 3;
- return required_bits <= 8 ? 8
- : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64;
- }
-
- // Use 32-bit hash for now
- static constexpr int bits_hash_ = 32;
-
- // Number of hash bits stored in slots in a block.
- // The highest bits of hash determine block id.
- // The next set of highest bits is a "stamp" stored in a slot in a block.
- static constexpr int bits_stamp_ = 7;
-
- // Padding bytes added at the end of buffers for ease of SIMD access
- static constexpr int padding_ = 64;
-
- int log_minibatch_;
- // Base 2 log of the number of blocks
- int log_blocks_ = 0;
- // Number of keys inserted into hash table
- uint32_t num_inserted_ = 0;
-
- // Data for blocks.
- // Each block has 8 status bytes for 8 slots, followed by 8 bit packed group ids for
- // these slots. In 8B status word, the order of bytes is reversed. Group ids are in
- // normal order. There is 64B padding at the end.
- //
-  // byte 0 -> bucket 7 | byte 1 -> bucket 6 | ...
- // ---------------------------------------------------
- // | Empty bit* | Empty bit |
- // ---------------------------------------------------
- // | 7-bit hash | 7-bit hash |
- // ---------------------------------------------------
- // * Empty bucket has value 0x80. Non-empty bucket has highest bit set to 0.
- //
- uint8_t* blocks_;
-
- // Array of hashes of values inserted into slots.
- // Undefined if the corresponding slot is empty.
- // There is 64B padding at the end.
- uint32_t* hashes_;
-
- int64_t hardware_flags_;
- MemoryPool* pool_;
- util::TempVectorStack* temp_stack_;
-
- EqualImpl equal_impl_;
- AppendImpl append_impl_;
-};
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace compute {
+
+class SwissTable {
+ public:
+ SwissTable() = default;
+ ~SwissTable() { cleanup(); }
+
+ using EqualImpl =
+ std::function<void(int num_keys, const uint16_t* selection /* may be null */,
+ const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
+ uint16_t* out_selection_mismatch)>;
+ using AppendImpl = std::function<Status(int num_keys, const uint16_t* selection)>;
+
+ Status init(int64_t hardware_flags, MemoryPool* pool, util::TempVectorStack* temp_stack,
+ int log_minibatch, EqualImpl equal_impl, AppendImpl append_impl);
+ void cleanup();
+
+  Status map(const int num_keys, const uint32_t* hashes, uint32_t* out_groupids);
+
+ private:
+ // Lookup helpers
+
+ /// \brief Scan bytes in block in reverse and stop as soon
+ /// as a position of interest is found.
+ ///
+ /// Positions of interest:
+ /// a) slot with a matching stamp is encountered,
+ /// b) first empty slot is encountered,
+ /// c) we reach the end of the block.
+ ///
+ /// \param[in] block 8 byte block of hash table
+ /// \param[in] stamp 7 bits of hash used as a stamp
+ /// \param[in] start_slot Index of the first slot in the block to start search from. We
+ /// assume that this index always points to a non-empty slot, equivalently
+ /// that it comes before any empty slots. (Used only by one template
+ /// variant.)
+ /// \param[out] out_slot index corresponding to the discovered position of interest (8
+ /// represents end of block).
+ /// \param[out] out_match_found an integer flag (0 or 1) indicating if we found a
+ /// matching stamp.
+ template <bool use_start_slot>
+ inline void search_block(uint64_t block, int stamp, int start_slot, int* out_slot,
+ int* out_match_found);
+
+ /// \brief Extract group id for a given slot in a given block.
+ ///
+ /// Group ids follow in memory after 64-bit block data.
+ /// Maximum number of groups inserted is equal to the number
+ /// of all slots in all blocks, which is 8 * the number of blocks.
+ /// Group ids are bit packed using that maximum to determine the necessary number of
+ /// bits.
+ inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
+ uint64_t group_id_mask);
+
+ inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found);
+
+ inline void insert(uint8_t* block_base, uint64_t slot_id, uint32_t hash, uint8_t stamp,
+ uint32_t group_id);
+
+ inline uint64_t num_groups_for_resize() const;
+
+ inline uint64_t wrap_global_slot_id(uint64_t global_slot_id);
+
+ // First hash table access
+  // Find the first match in the start block, if one exists.
+ // Possible cases:
+ // 1. Stamp match in a block
+ // 2. No stamp match in a block, no empty buckets in a block
+ // 3. No stamp match in a block, empty buckets in a block
+ //
+ template <bool use_selection>
+ void lookup_1(const uint16_t* selection, const int num_keys, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_slot_ids);
+#if defined(ARROW_HAVE_AVX2)
+ void lookup_1_avx2_x8(const int num_hashes, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_next_slot_ids);
+ void lookup_1_avx2_x32(const int num_hashes, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_next_slot_ids);
+#endif
+
+ // Completing hash table lookup post first access
+  Status lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
+                  uint16_t* inout_selection, bool* out_need_resize,
+                  uint32_t* out_group_ids, uint32_t* inout_next_slot_ids);
+
+  // Resize small hash tables when 50% full (up to 12KB).
+ // Resize large hash tables when 75% full.
+ Status grow_double();
+
+ static int num_groupid_bits_from_log_blocks(int log_blocks) {
+ int required_bits = log_blocks + 3;
+ return required_bits <= 8 ? 8
+ : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64;
+ }
+
+ // Use 32-bit hash for now
+ static constexpr int bits_hash_ = 32;
+
+ // Number of hash bits stored in slots in a block.
+ // The highest bits of hash determine block id.
+ // The next set of highest bits is a "stamp" stored in a slot in a block.
+ static constexpr int bits_stamp_ = 7;
+
+ // Padding bytes added at the end of buffers for ease of SIMD access
+ static constexpr int padding_ = 64;
+
+ int log_minibatch_;
+ // Base 2 log of the number of blocks
+ int log_blocks_ = 0;
+ // Number of keys inserted into hash table
+ uint32_t num_inserted_ = 0;
+
+ // Data for blocks.
+ // Each block has 8 status bytes for 8 slots, followed by 8 bit packed group ids for
+ // these slots. In 8B status word, the order of bytes is reversed. Group ids are in
+ // normal order. There is 64B padding at the end.
+ //
+  // byte 0 -> bucket 7 | byte 1 -> bucket 6 | ...
+ // ---------------------------------------------------
+ // | Empty bit* | Empty bit |
+ // ---------------------------------------------------
+ // | 7-bit hash | 7-bit hash |
+ // ---------------------------------------------------
+ // * Empty bucket has value 0x80. Non-empty bucket has highest bit set to 0.
+ //
+ uint8_t* blocks_;
+
+ // Array of hashes of values inserted into slots.
+ // Undefined if the corresponding slot is empty.
+ // There is 64B padding at the end.
+ uint32_t* hashes_;
+
+ int64_t hardware_flags_;
+ MemoryPool* pool_;
+ util::TempVectorStack* temp_stack_;
+
+ EqualImpl equal_impl_;
+ AppendImpl append_impl_;
+};
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
index a44676c2f0d..b667afc65bb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
@@ -1,278 +1,278 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/util.h"
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-
-using BitUtil::CountTrailingZeros;
-
-namespace util {
-
-inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
- int* num_indexes, uint16_t* indexes) {
- int n = *num_indexes;
- while (word) {
- indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
- word &= word - 1;
- }
- *num_indexes = n;
-}
-
-inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
- const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes) {
- int n = *num_indexes;
- while (word) {
- indexes[n++] = input_indexes[CountTrailingZeros(word)];
- word &= word - 1;
- }
- *num_indexes = n;
-}
-
-template <int bit_to_search, bool filter_input_indexes>
-void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes) {
- // 64 bits at a time
- constexpr int unroll = 64;
- int tail = num_bits % unroll;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- if (filter_input_indexes) {
- bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
- num_indexes, indexes);
- } else {
- bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes);
- }
- } else {
-#endif
- *num_indexes = 0;
- for (int i = 0; i < num_bits / unroll; ++i) {
- uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
- if (bit_to_search == 0) {
- word = ~word;
- }
- if (filter_input_indexes) {
- bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
- } else {
- bits_to_indexes_helper(word, i * 64, num_indexes, indexes);
- }
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-  // Optionally process the last partial word, masking out bits outside the range
- if (tail) {
- uint64_t word =
- util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[num_bits / unroll]);
- if (bit_to_search == 0) {
- word = ~word;
- }
- word &= ~0ULL >> (64 - tail);
- if (filter_input_indexes) {
- bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
- indexes);
- } else {
- bits_to_indexes_helper(word, num_bits - tail, num_indexes, indexes);
- }
- }
-}
-
-void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits, int* num_indexes,
- uint16_t* indexes, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- int num_indexes_head = 0;
- uint64_t bits_head =
- util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
- reinterpret_cast<const uint8_t*>(&bits_head), &num_indexes_head,
- indexes);
- int num_indexes_tail = 0;
- if (num_bits > bits_in_first_byte) {
- bits_to_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
- bits + 1, &num_indexes_tail, indexes + num_indexes_head);
- }
- *num_indexes = num_indexes_head + num_indexes_tail;
- return;
- }
-
- if (bit_to_search == 0) {
- bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
- num_indexes, indexes);
- } else {
- ARROW_DCHECK(bit_to_search == 1);
- bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
- num_indexes, indexes);
- }
-}
-
-void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits,
- const uint16_t* input_indexes, int* num_indexes,
- uint16_t* indexes, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- int num_indexes_head = 0;
- uint64_t bits_head =
- util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
- reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
- &num_indexes_head, indexes);
- int num_indexes_tail = 0;
- if (num_bits > bits_in_first_byte) {
- bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
- bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
- indexes + num_indexes_head);
- }
- *num_indexes = num_indexes_head + num_indexes_tail;
- return;
- }
-
- if (bit_to_search == 0) {
- bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
- num_indexes, indexes);
- } else {
- ARROW_DCHECK(bit_to_search == 1);
- bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
- num_indexes, indexes);
- }
-}
-
-void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, int* num_indexes_bit0,
- uint16_t* indexes_bit0, uint16_t* indexes_bit1,
- int bit_offset) {
- bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
- bit_offset);
- int num_indexes_bit1;
- bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
- bit_offset);
-}
-
-void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, uint8_t* bytes, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- uint64_t bits_head =
- util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bits_to_bytes(hardware_flags, bits_in_first_byte,
- reinterpret_cast<const uint8_t*>(&bits_head), bytes);
- if (num_bits > bits_in_first_byte) {
- bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
- bytes + bits_in_first_byte);
- }
- return;
- }
-
- int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- // The function call below processes whole 32 bit chunks together.
- num_processed = num_bits - (num_bits % 32);
- bits_to_bytes_avx2(num_processed, bits, bytes);
- }
-#endif
- // Processing 8 bits at a time
- constexpr int unroll = 8;
- for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
- uint8_t bits_next = bits[i];
- // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
- // from the previous.
- uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
- ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
- (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
- unpacked |= (bits_next & 1);
- unpacked &= 0x0101010101010101ULL;
- unpacked *= 255;
- util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
- }
-}
-
-void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits,
- const uint8_t* bytes, uint8_t* bits, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- uint64_t bits_head;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
- reinterpret_cast<uint8_t*>(&bits_head));
- uint8_t mask = (1 << bit_offset) - 1;
- *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
-
- if (num_bits > bits_in_first_byte) {
- bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
- bytes + bits_in_first_byte, bits + 1);
- }
- return;
- }
-
- int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- // The function call below processes whole 32 bit chunks together.
- num_processed = num_bits - (num_bits % 32);
- bytes_to_bits_avx2(num_processed, bytes, bits);
- }
-#endif
- // Process 8 bits at a time
- constexpr int unroll = 8;
- for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
- uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
- bytes_next &= 0x0101010101010101ULL;
- bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes
- bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes
- bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte
- bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
- }
-}
-
-bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
- uint32_t num_bytes) {
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- return are_all_bytes_zero_avx2(bytes, num_bytes);
- }
-#endif
- uint64_t result_or = 0;
- uint32_t i;
- for (i = 0; i < num_bytes / 8; ++i) {
- uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
- result_or |= x;
- }
- if (num_bytes % 8 > 0) {
- uint64_t tail = 0;
- result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
- }
- return result_or == 0;
-}
-
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/util.h"
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+
+using BitUtil::CountTrailingZeros;
+
+namespace util {
+
+inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+ int* num_indexes, uint16_t* indexes) {
+ int n = *num_indexes;
+ while (word) {
+ indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
+ word &= word - 1;
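+    // (The &= clears the lowest set bit; e.g. word = 0b10100 emits
+    // base_index + 2, then base_index + 4.)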
+ }
+ *num_indexes = n;
+}
+
+inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes) {
+ int n = *num_indexes;
+ while (word) {
+ indexes[n++] = input_indexes[CountTrailingZeros(word)];
+ word &= word - 1;
+ }
+ *num_indexes = n;
+}
+
+template <int bit_to_search, bool filter_input_indexes>
+void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes) {
+ // 64 bits at a time
+ constexpr int unroll = 64;
+ int tail = num_bits % unroll;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ if (filter_input_indexes) {
+ bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
+ num_indexes, indexes);
+ } else {
+ bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes);
+ }
+ } else {
+#endif
+ *num_indexes = 0;
+ for (int i = 0; i < num_bits / unroll; ++i) {
+ uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
+ if (bit_to_search == 0) {
+ word = ~word;
+ }
+ if (filter_input_indexes) {
+ bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
+ } else {
+ bits_to_indexes_helper(word, i * 64, num_indexes, indexes);
+ }
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+  // Optionally process the last partial word, masking out bits outside the range
+ if (tail) {
+ uint64_t word =
+ util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[num_bits / unroll]);
+ if (bit_to_search == 0) {
+ word = ~word;
+ }
+ word &= ~0ULL >> (64 - tail);
+ if (filter_input_indexes) {
+ bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
+ indexes);
+ } else {
+ bits_to_indexes_helper(word, num_bits - tail, num_indexes, indexes);
+ }
+ }
+}
+
+void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ int num_indexes_head = 0;
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), &num_indexes_head,
+ indexes);
+ int num_indexes_tail = 0;
+ if (num_bits > bits_in_first_byte) {
+ bits_to_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+ bits + 1, &num_indexes_tail, indexes + num_indexes_head);
+ }
+ *num_indexes = num_indexes_head + num_indexes_tail;
+ return;
+ }
+
+ if (bit_to_search == 0) {
+ bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
+ num_indexes, indexes);
+ } else {
+ ARROW_DCHECK(bit_to_search == 1);
+ bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
+ num_indexes, indexes);
+ }
+}
+
+void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes, int* num_indexes,
+ uint16_t* indexes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ int num_indexes_head = 0;
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
+ &num_indexes_head, indexes);
+ int num_indexes_tail = 0;
+ if (num_bits > bits_in_first_byte) {
+ bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+ bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
+ indexes + num_indexes_head);
+ }
+ *num_indexes = num_indexes_head + num_indexes_tail;
+ return;
+ }
+
+ if (bit_to_search == 0) {
+ bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
+ num_indexes, indexes);
+ } else {
+ ARROW_DCHECK(bit_to_search == 1);
+ bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
+ num_indexes, indexes);
+ }
+}
+
+void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, int* num_indexes_bit0,
+ uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+ int bit_offset) {
+ bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
+ bit_offset);
+ int num_indexes_bit1;
+ bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
+ bit_offset);
+}
+
+void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, uint8_t* bytes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_to_bytes(hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), bytes);
+ if (num_bits > bits_in_first_byte) {
+ bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
+ bytes + bits_in_first_byte);
+ }
+ return;
+ }
+
+ int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ // The function call below processes whole 32 bit chunks together.
+ num_processed = num_bits - (num_bits % 32);
+ bits_to_bytes_avx2(num_processed, bits, bytes);
+ }
+#endif
+ // Processing 8 bits at a time
+ constexpr int unroll = 8;
+ for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
+ uint8_t bits_next = bits[i];
+ // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
+ // from the previous.
+ uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
+ ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
+ (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
+ unpacked |= (bits_next & 1);
+ unpacked &= 0x0101010101010101ULL;
+ unpacked *= 255;
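+    // E.g. bits_next = 0xB5 (0b10110101) expands to bytes
+    // FF 00 FF 00 FF FF 00 FF, least significant byte first.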
+ util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
+ }
+}
+
+void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bytes, uint8_t* bits, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ uint64_t bits_head;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
+ reinterpret_cast<uint8_t*>(&bits_head));
+ uint8_t mask = (1 << bit_offset) - 1;
+ *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
+
+ if (num_bits > bits_in_first_byte) {
+ bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
+ bytes + bits_in_first_byte, bits + 1);
+ }
+ return;
+ }
+
+ int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ // The function call below processes whole 32 bit chunks together.
+ num_processed = num_bits - (num_bits % 32);
+ bytes_to_bits_avx2(num_processed, bytes, bits);
+ }
+#endif
+ // Process 8 bits at a time
+ constexpr int unroll = 8;
+ for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
+ uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+ bytes_next &= 0x0101010101010101ULL;
+ bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes
+ bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes
+ bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte
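+    // E.g. bytes 01 00 01 00 01 01 00 01 (LSB first) collapse back to
+    // 0b10110101 = 0xB5, inverting the bits_to_bytes expansion above.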
+ bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
+ }
+}
+
+bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+ uint32_t num_bytes) {
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ return are_all_bytes_zero_avx2(bytes, num_bytes);
+ }
+#endif
+ uint64_t result_or = 0;
+ uint32_t i;
+ for (i = 0; i < num_bytes / 8; ++i) {
+ uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+ result_or |= x;
+ }
+ if (num_bytes % 8 > 0) {
+ uint64_t tail = 0;
+ result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
+ }
+ return result_or == 0;
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
index 471cc332220..1025476ac63 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
@@ -1,171 +1,171 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <vector>
-
-#include "arrow/buffer.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/util/cpu_info.h"
-#include "arrow/util/logging.h"
-
-#if defined(__clang__) || defined(__GNUC__)
-#define BYTESWAP(x) __builtin_bswap64(x)
-#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
-#elif defined(_MSC_VER)
-#include <intrin.h>
-#define BYTESWAP(x) _byteswap_uint64(x)
-#define ROTL(x, n) _rotl((x), (n))
-#endif
-
-namespace arrow {
-namespace util {
-
-// Some platforms typedef int64_t as long int instead of long long int,
-// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
-// which need long long.
-// We use the cast to the type below in these intrinsics to make the code
-// compile in all cases.
-//
-using int64_for_gather_t = const long long int; // NOLINT runtime-int
-
-/// Storage used to allocate temporary vectors of a batch size.
-/// Temporary vectors should resemble allocating temporary variables on the stack
-/// but in the context of vectorized processing where we need to store a vector of
-/// temporaries instead of a single value.
-class TempVectorStack {
- template <typename>
- friend class TempVectorHolder;
-
- public:
- Status Init(MemoryPool* pool, int64_t size) {
- num_vectors_ = 0;
- top_ = 0;
- buffer_size_ = size;
- ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
- buffer_ = std::move(buffer);
- return Status::OK();
- }
-
- private:
- void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
- int64_t old_top = top_;
- top_ += num_bytes + padding;
- // Stack overflow check
- ARROW_DCHECK(top_ <= buffer_size_);
- *data = buffer_->mutable_data() + old_top;
- *id = num_vectors_++;
- }
- void release(int id, uint32_t num_bytes) {
- ARROW_DCHECK(num_vectors_ == id + 1);
- int64_t size = num_bytes + padding;
- ARROW_DCHECK(top_ >= size);
- top_ -= size;
- --num_vectors_;
- }
- static constexpr int64_t padding = 64;
- int num_vectors_;
- int64_t top_;
- std::unique_ptr<Buffer> buffer_;
- int64_t buffer_size_;
-};
-
-template <typename T>
-class TempVectorHolder {
- friend class TempVectorStack;
-
- public:
- ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
- T* mutable_data() { return reinterpret_cast<T*>(data_); }
- TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
- stack_ = stack;
- num_elements_ = num_elements;
- stack_->alloc(num_elements * sizeof(T), &data_, &id_);
- }
-
- private:
- TempVectorStack* stack_;
- uint8_t* data_;
- int id_;
- uint32_t num_elements_;
-};
-
-class BitUtil {
- public:
- static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits, int* num_indexes,
- uint16_t* indexes, int bit_offset = 0);
-
- static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits,
- const uint16_t* input_indexes, int* num_indexes,
- uint16_t* indexes, int bit_offset = 0);
-
- // Input and output indexes may point to the same data (in-place filtering).
- static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, int* num_indexes_bit0,
- uint16_t* indexes_bit0, uint16_t* indexes_bit1,
- int bit_offset = 0);
-
- // Each input bit equal to 1 becomes byte 0xFF in the output (0 becomes 0x00).
- static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
-
- // Pack the highest bit of each input byte into the output bitmap.
- static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
- const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
-
- static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
- uint32_t num_bytes);
-
- private:
- inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
- int* num_indexes, uint16_t* indexes);
- inline static void bits_filter_indexes_helper(uint64_t word,
- const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
- template <int bit_to_search, bool filter_input_indexes>
- static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
-
-#if defined(ARROW_HAVE_AVX2)
- static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
- const uint8_t* bits, int* num_indexes,
- uint16_t* indexes);
- static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
- const uint8_t* bits, const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
- template <int bit_to_search>
- static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
- int* num_indexes, uint16_t* indexes);
- template <int bit_to_search>
- static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
- const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
- static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
- static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
- static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
-#endif
-};
-
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+
+#if defined(__clang__) || defined(__GNUC__)
+#define BYTESWAP(x) __builtin_bswap64(x)
+#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#define BYTESWAP(x) _byteswap_uint64(x)
+#define ROTL(x, n) _rotl((x), (n))
+#endif
+
+namespace arrow {
+namespace util {
+
+// Some platforms typedef int64_t as long int instead of long long int,
+// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
+// which need long long.
+// We use the cast to the type below in these intrinsics to make the code
+// compile in all cases.
+//
+using int64_for_gather_t = const long long int; // NOLINT runtime-int
+
+/// Storage used to allocate temporary vectors of a batch size.
+/// Temporary vectors resemble stack-allocated local variables, except that in
+/// vectorized processing a whole vector of temporaries is needed instead of a
+/// single value.
+class TempVectorStack {
+ template <typename>
+ friend class TempVectorHolder;
+
+ public:
+ Status Init(MemoryPool* pool, int64_t size) {
+ num_vectors_ = 0;
+ top_ = 0;
+ buffer_size_ = size;
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
+ buffer_ = std::move(buffer);
+ return Status::OK();
+ }
+
+ private:
+ void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
+ int64_t old_top = top_;
+ top_ += num_bytes + padding;
+ // Stack overflow check
+ ARROW_DCHECK(top_ <= buffer_size_);
+ *data = buffer_->mutable_data() + old_top;
+ *id = num_vectors_++;
+ }
+ void release(int id, uint32_t num_bytes) {
+ ARROW_DCHECK(num_vectors_ == id + 1);
+ int64_t size = num_bytes + padding;
+ ARROW_DCHECK(top_ >= size);
+ top_ -= size;
+ --num_vectors_;
+ }
+ static constexpr int64_t padding = 64;
+ int num_vectors_;
+ int64_t top_;
+ std::unique_ptr<Buffer> buffer_;
+ int64_t buffer_size_;
+};
+
+template <typename T>
+class TempVectorHolder {
+ friend class TempVectorStack;
+
+ public:
+ ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
+ T* mutable_data() { return reinterpret_cast<T*>(data_); }
+ TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
+ stack_ = stack;
+ num_elements_ = num_elements;
+ stack_->alloc(num_elements * sizeof(T), &data_, &id_);
+ }
+
+ private:
+ TempVectorStack* stack_;
+ uint8_t* data_;
+ int id_;
+ uint32_t num_elements_;
+};
+
+class BitUtil {
+ public:
+ static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes, int bit_offset = 0);
+
+ static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes, int* num_indexes,
+ uint16_t* indexes, int bit_offset = 0);
+
+ // Input and output indexes may point to the same data (in-place filtering).
+ static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, int* num_indexes_bit0,
+ uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+ int bit_offset = 0);
+
+ // Each input bit equal to 1 becomes byte 0xFF in the output (0 becomes 0x00).
+ static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
+
+ // Pack the highest bit of each input byte into the output bitmap.
+ static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
+
+ static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+ uint32_t num_bytes);
+
+ private:
+ inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+ int* num_indexes, uint16_t* indexes);
+ inline static void bits_filter_indexes_helper(uint64_t word,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search, bool filter_input_indexes>
+ static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+
+#if defined(ARROW_HAVE_AVX2)
+ static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+ const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes);
+ static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search>
+ static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search>
+ static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
+ static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
+ static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
+#endif
+};
+
+} // namespace util
+} // namespace arrow
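
TempVectorStack and TempVectorHolder above give kernels stack-like scratch space
for batch-sized temporaries. A hedged usage sketch; the stack size and element
count are illustrative assumptions, not values taken from this diff:

    #include "arrow/compute/exec/util.h"
    #include "arrow/memory_pool.h"
    #include "arrow/status.h"

    arrow::Status UseScratch() {
      arrow::util::TempVectorStack stack;
      ARROW_RETURN_NOT_OK(stack.Init(arrow::default_memory_pool(), /*size=*/64 * 1024));
      {
        // The holder allocates scratch for 1024 elements; its destructor
        // releases the allocation in LIFO order, matching the DCHECKs in
        // TempVectorStack::release().
        arrow::util::TempVectorHolder<uint16_t> indexes(&stack, /*num_elements=*/1024);
        uint16_t* data = indexes.mutable_data();
        data[0] = 0;  // ...fill and consume the temporary vector here...
      }
      return arrow::Status::OK();
    }
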
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
index 55daa243cd3..abc9861537f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
@@ -84,14 +84,14 @@ class ARROW_EXPORT ExecListener {
class DatumAccumulator : public ExecListener {
public:
- DatumAccumulator() = default;
+ DatumAccumulator() = default;
Status OnResult(Datum value) override {
values_.emplace_back(value);
return Status::OK();
}
- std::vector<Datum> values() { return std::move(values_); }
+ std::vector<Datum> values() { return std::move(values_); }
private:
std::vector<Datum> values_;
@@ -102,17 +102,17 @@ class DatumAccumulator : public ExecListener {
/// inputs will be split into non-chunked ExecBatch values for execution
Status CheckAllValues(const std::vector<Datum>& values);
-class ARROW_EXPORT KernelExecutor {
+class ARROW_EXPORT KernelExecutor {
public:
- virtual ~KernelExecutor() = default;
-
- /// The Kernel's `init` method must be called and any KernelState set in the
- /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate
- /// the case where init may be expensive and does not need to be called again for
- /// each execution of the kernel, for example the same lookup table can be re-used
- /// for all scanned batches in a dataset filter.
- virtual Status Init(KernelContext*, KernelInitArgs) = 0;
-
+ virtual ~KernelExecutor() = default;
+
+ /// The Kernel's `init` method must be called and any KernelState set in the
+ /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate
+ /// the case where init may be expensive and does not need to be called again for
+ /// each execution of the kernel, for example the same lookup table can be re-used
+ /// for all scanned batches in a dataset filter.
+ virtual Status Init(KernelContext*, KernelInitArgs) = 0;
+
/// XXX: Better configurability for listener
/// Not thread-safe
virtual Status Execute(const std::vector<Datum>& args, ExecListener* listener) = 0;
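
The ordering contract documented above (the kernel's own init runs and its state
is installed before KernelExecutor::Init) is exactly what Function::Execute
follows in the function.cc hunks later in this diff. A condensed sketch of that
sequence; dispatch and implicit casting are omitted, and the code is assumed to
sit inside namespace arrow::compute so Result, Datum, and detail:: resolve:

    // Condensed from the Function::Execute body quoted later in this diff;
    // `kernel`, `inputs`, and `options` are assumed to be already resolved.
    Result<Datum> RunResolvedKernel(ExecContext* ctx, const Kernel* kernel,
                                    const std::vector<ValueDescr>& inputs,
                                    const FunctionOptions* options,
                                    const std::vector<Datum>& args) {
      KernelContext kernel_ctx{ctx};
      std::unique_ptr<KernelState> state;
      if (kernel->init) {
        // Step 1: kernel init and state installation, *before* executor Init.
        ARROW_ASSIGN_OR_RAISE(state,
                              kernel->init(&kernel_ctx, {kernel, inputs, options}));
        kernel_ctx.SetState(state.get());
      }
      auto executor = detail::KernelExecutor::MakeScalar();  // scalar case only
      RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options}));
      auto listener = std::make_shared<detail::DatumAccumulator>();
      RETURN_NOT_OK(executor->Execute(args, listener.get()));
      return executor->WrapResults(args, listener->values());
    }
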
@@ -120,9 +120,9 @@ class ARROW_EXPORT KernelExecutor {
virtual Datum WrapResults(const std::vector<Datum>& args,
const std::vector<Datum>& outputs) = 0;
- static std::unique_ptr<KernelExecutor> MakeScalar();
- static std::unique_ptr<KernelExecutor> MakeVector();
- static std::unique_ptr<KernelExecutor> MakeScalarAggregate();
+ static std::unique_ptr<KernelExecutor> MakeScalar();
+ static std::unique_ptr<KernelExecutor> MakeVector();
+ static std::unique_ptr<KernelExecutor> MakeScalarAggregate();
};
/// \brief Populate validity bitmap with the intersection of the nullity of the
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
index 05d14d03b16..1958f442849 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
@@ -21,108 +21,108 @@
#include <memory>
#include <sstream>
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/cast.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
#include "arrow/compute/exec.h"
#include "arrow/compute/exec_internal.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/registry.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/registry.h"
#include "arrow/datum.h"
#include "arrow/util/cpu_info.h"
namespace arrow {
-
-using internal::checked_cast;
-
+
+using internal::checked_cast;
+
namespace compute {
-Result<std::shared_ptr<Buffer>> FunctionOptionsType::Serialize(
- const FunctionOptions&) const {
- return Status::NotImplemented("Serialize for ", type_name());
-}
-
-Result<std::unique_ptr<FunctionOptions>> FunctionOptionsType::Deserialize(
- const Buffer& buffer) const {
- return Status::NotImplemented("Deserialize for ", type_name());
-}
-
-std::string FunctionOptions::ToString() const { return options_type()->Stringify(*this); }
-
-bool FunctionOptions::Equals(const FunctionOptions& other) const {
- if (this == &other) return true;
- if (options_type() != other.options_type()) return false;
- return options_type()->Compare(*this, other);
-}
-
-Result<std::shared_ptr<Buffer>> FunctionOptions::Serialize() const {
- return options_type()->Serialize(*this);
-}
-
-Result<std::unique_ptr<FunctionOptions>> FunctionOptions::Deserialize(
- const std::string& type_name, const Buffer& buffer) {
- ARROW_ASSIGN_OR_RAISE(auto options,
- GetFunctionRegistry()->GetFunctionOptionsType(type_name));
- return options->Deserialize(buffer);
-}
-
-void PrintTo(const FunctionOptions& options, std::ostream* os) {
- *os << options.ToString();
-}
-
-static const FunctionDoc kEmptyFunctionDoc{};
-
-const FunctionDoc& FunctionDoc::Empty() { return kEmptyFunctionDoc; }
-
-static Status CheckArityImpl(const Function* function, int passed_num_args,
- const char* passed_num_args_label) {
- if (function->arity().is_varargs && passed_num_args < function->arity().num_args) {
- return Status::Invalid("VarArgs function ", function->name(), " needs at least ",
- function->arity().num_args, " arguments but ",
- passed_num_args_label, " only ", passed_num_args);
+Result<std::shared_ptr<Buffer>> FunctionOptionsType::Serialize(
+ const FunctionOptions&) const {
+ return Status::NotImplemented("Serialize for ", type_name());
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsType::Deserialize(
+ const Buffer& buffer) const {
+ return Status::NotImplemented("Deserialize for ", type_name());
+}
+
+std::string FunctionOptions::ToString() const { return options_type()->Stringify(*this); }
+
+bool FunctionOptions::Equals(const FunctionOptions& other) const {
+ if (this == &other) return true;
+ if (options_type() != other.options_type()) return false;
+ return options_type()->Compare(*this, other);
+}
+
+Result<std::shared_ptr<Buffer>> FunctionOptions::Serialize() const {
+ return options_type()->Serialize(*this);
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptions::Deserialize(
+ const std::string& type_name, const Buffer& buffer) {
+ ARROW_ASSIGN_OR_RAISE(auto options,
+ GetFunctionRegistry()->GetFunctionOptionsType(type_name));
+ return options->Deserialize(buffer);
+}
+
+void PrintTo(const FunctionOptions& options, std::ostream* os) {
+ *os << options.ToString();
+}
+
+static const FunctionDoc kEmptyFunctionDoc{};
+
+const FunctionDoc& FunctionDoc::Empty() { return kEmptyFunctionDoc; }
+
+static Status CheckArityImpl(const Function* function, int passed_num_args,
+ const char* passed_num_args_label) {
+ if (function->arity().is_varargs && passed_num_args < function->arity().num_args) {
+ return Status::Invalid("VarArgs function ", function->name(), " needs at least ",
+ function->arity().num_args, " arguments but ",
+ passed_num_args_label, " only ", passed_num_args);
}
-
- if (!function->arity().is_varargs && passed_num_args != function->arity().num_args) {
- return Status::Invalid("Function ", function->name(), " accepts ",
- function->arity().num_args, " arguments but ",
- passed_num_args_label, " ", passed_num_args);
- }
-
+
+ if (!function->arity().is_varargs && passed_num_args != function->arity().num_args) {
+ return Status::Invalid("Function ", function->name(), " accepts ",
+ function->arity().num_args, " arguments but ",
+ passed_num_args_label, " ", passed_num_args);
+ }
+
return Status::OK();
}
-Status Function::CheckArity(const std::vector<InputType>& in_types) const {
- return CheckArityImpl(this, static_cast<int>(in_types.size()), "kernel accepts");
-}
-
-Status Function::CheckArity(const std::vector<ValueDescr>& descrs) const {
- return CheckArityImpl(this, static_cast<int>(descrs.size()),
- "attempted to look up kernel(s) with");
-}
-
-namespace detail {
-
-Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>& descrs) {
- return Status::NotImplemented("Function ", func->name(),
- " has no kernel matching input types ",
- ValueDescr::ToString(descrs));
+Status Function::CheckArity(const std::vector<InputType>& in_types) const {
+ return CheckArityImpl(this, static_cast<int>(in_types.size()), "kernel accepts");
}
-template <typename KernelType>
-const KernelType* DispatchExactImpl(const std::vector<KernelType*>& kernels,
- const std::vector<ValueDescr>& values) {
- const KernelType* kernel_matches[SimdLevel::MAX] = {nullptr};
-
+Status Function::CheckArity(const std::vector<ValueDescr>& descrs) const {
+ return CheckArityImpl(this, static_cast<int>(descrs.size()),
+ "attempted to look up kernel(s) with");
+}
+
+namespace detail {
+
+Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>& descrs) {
+ return Status::NotImplemented("Function ", func->name(),
+ " has no kernel matching input types ",
+ ValueDescr::ToString(descrs));
+}
+
+template <typename KernelType>
+const KernelType* DispatchExactImpl(const std::vector<KernelType*>& kernels,
+ const std::vector<ValueDescr>& values) {
+ const KernelType* kernel_matches[SimdLevel::MAX] = {nullptr};
+
  // Collect the matching kernel, if any, at each SIMD level
for (const auto& kernel : kernels) {
- if (kernel->signature->MatchesInputs(values)) {
- kernel_matches[kernel->simd_level] = kernel;
+ if (kernel->signature->MatchesInputs(values)) {
+ kernel_matches[kernel->simd_level] = kernel;
}
}
  // Dispatch according to the best CPU feature available at runtime
-#if defined(ARROW_HAVE_RUNTIME_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX2)
auto cpu_info = arrow::internal::CpuInfo::GetInstance();
-#endif
+#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX512)) {
if (kernel_matches[SimdLevel::AVX512]) {
@@ -141,54 +141,54 @@ const KernelType* DispatchExactImpl(const std::vector<KernelType*>& kernels,
return kernel_matches[SimdLevel::NONE];
}
- return nullptr;
-}
-
-const Kernel* DispatchExactImpl(const Function* func,
- const std::vector<ValueDescr>& values) {
- if (func->kind() == Function::SCALAR) {
- return DispatchExactImpl(checked_cast<const ScalarFunction*>(func)->kernels(),
- values);
- }
-
- if (func->kind() == Function::VECTOR) {
- return DispatchExactImpl(checked_cast<const VectorFunction*>(func)->kernels(),
- values);
- }
-
- if (func->kind() == Function::SCALAR_AGGREGATE) {
- return DispatchExactImpl(
- checked_cast<const ScalarAggregateFunction*>(func)->kernels(), values);
- }
-
- if (func->kind() == Function::HASH_AGGREGATE) {
- return DispatchExactImpl(checked_cast<const HashAggregateFunction*>(func)->kernels(),
- values);
- }
-
- return nullptr;
-}
-
-} // namespace detail
-
-Result<const Kernel*> Function::DispatchExact(
- const std::vector<ValueDescr>& values) const {
- if (kind_ == Function::META) {
- return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
- }
- RETURN_NOT_OK(CheckArity(values));
-
- if (auto kernel = detail::DispatchExactImpl(this, values)) {
- return kernel;
- }
- return detail::NoMatchingKernel(this, values);
-}
-
-Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const {
- // TODO(ARROW-11508) permit generic conversions here
- return DispatchExact(*values);
+ return nullptr;
}
+const Kernel* DispatchExactImpl(const Function* func,
+ const std::vector<ValueDescr>& values) {
+ if (func->kind() == Function::SCALAR) {
+ return DispatchExactImpl(checked_cast<const ScalarFunction*>(func)->kernels(),
+ values);
+ }
+
+ if (func->kind() == Function::VECTOR) {
+ return DispatchExactImpl(checked_cast<const VectorFunction*>(func)->kernels(),
+ values);
+ }
+
+ if (func->kind() == Function::SCALAR_AGGREGATE) {
+ return DispatchExactImpl(
+ checked_cast<const ScalarAggregateFunction*>(func)->kernels(), values);
+ }
+
+ if (func->kind() == Function::HASH_AGGREGATE) {
+ return DispatchExactImpl(checked_cast<const HashAggregateFunction*>(func)->kernels(),
+ values);
+ }
+
+ return nullptr;
+}
+
+} // namespace detail
+
+Result<const Kernel*> Function::DispatchExact(
+ const std::vector<ValueDescr>& values) const {
+ if (kind_ == Function::META) {
+ return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
+ }
+ RETURN_NOT_OK(CheckArity(values));
+
+ if (auto kernel = detail::DispatchExactImpl(this, values)) {
+ return kernel;
+ }
+ return detail::NoMatchingKernel(this, values);
+}
+
+Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const {
+ // TODO(ARROW-11508) permit generic conversions here
+ return DispatchExact(*values);
+}
+
Result<Datum> Function::Execute(const std::vector<Datum>& args,
const FunctionOptions* options, ExecContext* ctx) const {
if (options == nullptr) {
@@ -198,63 +198,63 @@ Result<Datum> Function::Execute(const std::vector<Datum>& args,
ExecContext default_ctx;
return Execute(args, options, &default_ctx);
}
-
+
  // Type-check the Datum arguments here. Ideally this step would be
  // avoided as much as possible.
RETURN_NOT_OK(detail::CheckAllValues(args));
- std::vector<ValueDescr> inputs(args.size());
- for (size_t i = 0; i != args.size(); ++i) {
- inputs[i] = args[i].descr();
- }
-
- ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs));
- ARROW_ASSIGN_OR_RAISE(auto implicitly_cast_args, Cast(args, inputs, ctx));
-
- std::unique_ptr<KernelState> state;
-
- KernelContext kernel_ctx{ctx};
- if (kernel->init) {
- ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options}));
- kernel_ctx.SetState(state.get());
- }
-
- std::unique_ptr<detail::KernelExecutor> executor;
- if (kind() == Function::SCALAR) {
- executor = detail::KernelExecutor::MakeScalar();
- } else if (kind() == Function::VECTOR) {
- executor = detail::KernelExecutor::MakeVector();
- } else if (kind() == Function::SCALAR_AGGREGATE) {
- executor = detail::KernelExecutor::MakeScalarAggregate();
- } else {
- return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions");
- }
- RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options}));
-
+ std::vector<ValueDescr> inputs(args.size());
+ for (size_t i = 0; i != args.size(); ++i) {
+ inputs[i] = args[i].descr();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs));
+ ARROW_ASSIGN_OR_RAISE(auto implicitly_cast_args, Cast(args, inputs, ctx));
+
+ std::unique_ptr<KernelState> state;
+
+ KernelContext kernel_ctx{ctx};
+ if (kernel->init) {
+ ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options}));
+ kernel_ctx.SetState(state.get());
+ }
+
+ std::unique_ptr<detail::KernelExecutor> executor;
+ if (kind() == Function::SCALAR) {
+ executor = detail::KernelExecutor::MakeScalar();
+ } else if (kind() == Function::VECTOR) {
+ executor = detail::KernelExecutor::MakeVector();
+ } else if (kind() == Function::SCALAR_AGGREGATE) {
+ executor = detail::KernelExecutor::MakeScalarAggregate();
+ } else {
+ return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions");
+ }
+ RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options}));
+
auto listener = std::make_shared<detail::DatumAccumulator>();
- RETURN_NOT_OK(executor->Execute(implicitly_cast_args, listener.get()));
- return executor->WrapResults(implicitly_cast_args, listener->values());
-}
-
-Status Function::Validate() const {
- if (!doc_->summary.empty()) {
- // Documentation given, check its contents
- int arg_count = static_cast<int>(doc_->arg_names.size());
- if (arg_count == arity_.num_args) {
- return Status::OK();
- }
- if (arity_.is_varargs && arg_count == arity_.num_args + 1) {
- return Status::OK();
- }
- return Status::Invalid(
- "In function '", name_,
- "': ", "number of argument names for function documentation != function arity");
- }
- return Status::OK();
+ RETURN_NOT_OK(executor->Execute(implicitly_cast_args, listener.get()));
+ return executor->WrapResults(implicitly_cast_args, listener->values());
}
+Status Function::Validate() const {
+ if (!doc_->summary.empty()) {
+ // Documentation given, check its contents
+ int arg_count = static_cast<int>(doc_->arg_names.size());
+ if (arg_count == arity_.num_args) {
+ return Status::OK();
+ }
+ if (arity_.is_varargs && arg_count == arity_.num_args + 1) {
+ return Status::OK();
+ }
+ return Status::Invalid(
+ "In function '", name_,
+ "': ", "number of argument names for function documentation != function arity");
+ }
+ return Status::OK();
+}
+
Status ScalarFunction::AddKernel(std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, KernelInit init) {
- RETURN_NOT_OK(CheckArity(in_types));
+ RETURN_NOT_OK(CheckArity(in_types));
if (arity_.is_varargs && in_types.size() != 1) {
return Status::Invalid("VarArgs signatures must have exactly one input type");
@@ -266,7 +266,7 @@ Status ScalarFunction::AddKernel(std::vector<InputType> in_types, OutputType out
}
Status ScalarFunction::AddKernel(ScalarKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
if (arity_.is_varargs && !kernel.signature->is_varargs()) {
return Status::Invalid("Function accepts varargs but kernel signature does not");
}
@@ -276,7 +276,7 @@ Status ScalarFunction::AddKernel(ScalarKernel kernel) {
Status VectorFunction::AddKernel(std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, KernelInit init) {
- RETURN_NOT_OK(CheckArity(in_types));
+ RETURN_NOT_OK(CheckArity(in_types));
if (arity_.is_varargs && in_types.size() != 1) {
return Status::Invalid("VarArgs signatures must have exactly one input type");
@@ -288,7 +288,7 @@ Status VectorFunction::AddKernel(std::vector<InputType> in_types, OutputType out
}
Status VectorFunction::AddKernel(VectorKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
if (arity_.is_varargs && !kernel.signature->is_varargs()) {
return Status::Invalid("Function accepts varargs but kernel signature does not");
}
@@ -297,7 +297,7 @@ Status VectorFunction::AddKernel(VectorKernel kernel) {
}
Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
if (arity_.is_varargs && !kernel.signature->is_varargs()) {
return Status::Invalid("Function accepts varargs but kernel signature does not");
}
@@ -305,21 +305,21 @@ Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) {
return Status::OK();
}
-Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
- if (arity_.is_varargs && !kernel.signature->is_varargs()) {
- return Status::Invalid("Function accepts varargs but kernel signature does not");
- }
- kernels_.emplace_back(std::move(kernel));
- return Status::OK();
+Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) {
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ if (arity_.is_varargs && !kernel.signature->is_varargs()) {
+ return Status::Invalid("Function accepts varargs but kernel signature does not");
+ }
+ kernels_.emplace_back(std::move(kernel));
+ return Status::OK();
}
Result<Datum> MetaFunction::Execute(const std::vector<Datum>& args,
const FunctionOptions* options,
ExecContext* ctx) const {
- RETURN_NOT_OK(
- CheckArityImpl(this, static_cast<int>(args.size()), "attempted to Execute with"));
-
+ RETURN_NOT_OK(
+ CheckArityImpl(this, static_cast<int>(args.size()), "attempted to Execute with"));
+
if (options == nullptr) {
options = default_options();
}
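
End to end, the Execute path above is what callers reach through the function
registry. A hedged usage sketch; the "add" function name and the argument arrays
are illustrative, not taken from this diff:

    #include "arrow/array.h"
    #include "arrow/compute/api.h"
    #include "arrow/datum.h"

    // Invokes a registered compute function; registry lookup, DispatchBest,
    // and the KernelExecutor machinery shown above all run behind CallFunction.
    arrow::Result<arrow::Datum> AddArrays(const std::shared_ptr<arrow::Array>& lhs,
                                          const std::shared_ptr<arrow::Array>& rhs) {
      return arrow::compute::CallFunction("add", {arrow::Datum(lhs), arrow::Datum(rhs)});
    }
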
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
index bd854bbb28e..e50ba155244 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
@@ -29,7 +29,7 @@
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/compare.h"
+#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -40,50 +40,50 @@ namespace compute {
///
/// @{
-/// \brief Extension point for defining options outside libarrow (but
-/// still within this project).
-class ARROW_EXPORT FunctionOptionsType {
- public:
- virtual ~FunctionOptionsType() = default;
-
- virtual const char* type_name() const = 0;
- virtual std::string Stringify(const FunctionOptions&) const = 0;
- virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
- virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
- virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
- const Buffer& buffer) const;
-};
-
+/// \brief Extension point for defining options outside libarrow (but
+/// still within this project).
+class ARROW_EXPORT FunctionOptionsType {
+ public:
+ virtual ~FunctionOptionsType() = default;
+
+ virtual const char* type_name() const = 0;
+ virtual std::string Stringify(const FunctionOptions&) const = 0;
+ virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
+ virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
+ virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const Buffer& buffer) const;
+};
+
/// \brief Base class for specifying options configuring a function's behavior,
/// such as error handling.
-class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
- public:
- virtual ~FunctionOptions() = default;
-
- const FunctionOptionsType* options_type() const { return options_type_; }
- const char* type_name() const { return options_type()->type_name(); }
-
- bool Equals(const FunctionOptions& other) const;
- using util::EqualityComparable<FunctionOptions>::Equals;
- using util::EqualityComparable<FunctionOptions>::operator==;
- using util::EqualityComparable<FunctionOptions>::operator!=;
- std::string ToString() const;
- /// \brief Serialize an options struct to a buffer.
- Result<std::shared_ptr<Buffer>> Serialize() const;
- /// \brief Deserialize an options struct from a buffer.
- /// Note: this will only look for `type_name` in the default FunctionRegistry;
- /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
- /// call FunctionOptionsType::Deserialize().
- static Result<std::unique_ptr<FunctionOptions>> Deserialize(
- const std::string& type_name, const Buffer& buffer);
-
- protected:
- explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
- const FunctionOptionsType* options_type_;
-};
-
-ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
-
+class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
+ public:
+ virtual ~FunctionOptions() = default;
+
+ const FunctionOptionsType* options_type() const { return options_type_; }
+ const char* type_name() const { return options_type()->type_name(); }
+
+ bool Equals(const FunctionOptions& other) const;
+ using util::EqualityComparable<FunctionOptions>::Equals;
+ using util::EqualityComparable<FunctionOptions>::operator==;
+ using util::EqualityComparable<FunctionOptions>::operator!=;
+ std::string ToString() const;
+ /// \brief Serialize an options struct to a buffer.
+ Result<std::shared_ptr<Buffer>> Serialize() const;
+ /// \brief Deserialize an options struct from a buffer.
+ /// Note: this will only look for `type_name` in the default FunctionRegistry;
+ /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
+ /// call FunctionOptionsType::Deserialize().
+ static Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const std::string& type_name, const Buffer& buffer);
+
+ protected:
+ explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
+ const FunctionOptionsType* options_type_;
+};
+
+ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
+
/// \brief Contains the number of required arguments for the function.
///
/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity.
@@ -118,37 +118,37 @@ struct ARROW_EXPORT Arity {
bool is_varargs = false;
};
-struct ARROW_EXPORT FunctionDoc {
- /// \brief A one-line summary of the function, using a verb.
- ///
- /// For example, "Add two numeric arrays or scalars".
- std::string summary;
-
- /// \brief A detailed description of the function, meant to follow the summary.
- std::string description;
-
- /// \brief Symbolic names (identifiers) for the function arguments.
- ///
- /// Some bindings may use this to generate nicer function signatures.
- std::vector<std::string> arg_names;
-
- // TODO add argument descriptions?
-
- /// \brief Name of the options class, if any.
- std::string options_class;
-
- FunctionDoc() = default;
-
- FunctionDoc(std::string summary, std::string description,
- std::vector<std::string> arg_names, std::string options_class = "")
- : summary(std::move(summary)),
- description(std::move(description)),
- arg_names(std::move(arg_names)),
- options_class(std::move(options_class)) {}
-
- static const FunctionDoc& Empty();
-};
-
+struct ARROW_EXPORT FunctionDoc {
+ /// \brief A one-line summary of the function, using a verb.
+ ///
+ /// For example, "Add two numeric arrays or scalars".
+ std::string summary;
+
+ /// \brief A detailed description of the function, meant to follow the summary.
+ std::string description;
+
+ /// \brief Symbolic names (identifiers) for the function arguments.
+ ///
+ /// Some bindings may use this to generate nicer function signatures.
+ std::vector<std::string> arg_names;
+
+ // TODO add argument descriptions?
+
+ /// \brief Name of the options class, if any.
+ std::string options_class;
+
+ FunctionDoc() = default;
+
+ FunctionDoc(std::string summary, std::string description,
+ std::vector<std::string> arg_names, std::string options_class = "")
+ : summary(std::move(summary)),
+ description(std::move(description)),
+ arg_names(std::move(arg_names)),
+ options_class(std::move(options_class)) {}
+
+ static const FunctionDoc& Empty();
+};
+
/// \brief Base class for compute functions. Function implementations contain a
/// collection of "kernels" which are implementations of the function for
/// specific argument types. Selecting a viable kernel for executing a function
@@ -172,10 +172,10 @@ class ARROW_EXPORT Function {
/// A function that computes scalar summary statistics from array input.
SCALAR_AGGREGATE,
- /// A function that computes grouped summary statistics from array input
- /// and an array of group identifiers.
- HASH_AGGREGATE,
-
+ /// A function that computes grouped summary statistics from array input
+ /// and an array of group identifiers.
+ HASH_AGGREGATE,
+
/// A function that dispatches to other functions and does not contain its
/// own kernels.
META
@@ -194,27 +194,27 @@ class ARROW_EXPORT Function {
/// function accepts variable numbers of arguments.
const Arity& arity() const { return arity_; }
- /// \brief Return the function documentation
- const FunctionDoc& doc() const { return *doc_; }
-
+ /// \brief Return the function documentation
+ const FunctionDoc& doc() const { return *doc_; }
+
/// \brief Returns the number of registered kernels for this function.
virtual int num_kernels() const = 0;
- /// \brief Return a kernel that can execute the function given the exact
- /// argument types (without implicit type casts or scalar->array promotions).
- ///
- /// NB: This function is overridden in CastFunction.
- virtual Result<const Kernel*> DispatchExact(
- const std::vector<ValueDescr>& values) const;
-
- /// \brief Return a best-match kernel that can execute the function given the argument
- /// types, after implicit casts are applied.
- ///
- /// \param[in,out] values Argument types. An element may be modified to indicate that
- /// the returned kernel only approximately matches the input value descriptors; callers
- /// are responsible for casting inputs to the type and shape required by the kernel.
- virtual Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const;
-
+ /// \brief Return a kernel that can execute the function given the exact
+ /// argument types (without implicit type casts or scalar->array promotions).
+ ///
+ /// NB: This function is overridden in CastFunction.
+ virtual Result<const Kernel*> DispatchExact(
+ const std::vector<ValueDescr>& values) const;
+
+ /// \brief Return a best-match kernel that can execute the function given the argument
+ /// types, after implicit casts are applied.
+ ///
+ /// \param[in,out] values Argument types. An element may be modified to indicate that
+ /// the returned kernel only approximately matches the input value descriptors; callers
+ /// are responsible for casting inputs to the type and shape required by the kernel.
+ virtual Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const;
+
/// \brief Execute the function eagerly with the passed input arguments with
/// kernel dispatch, batch iteration, and memory allocation details taken
/// care of.
@@ -231,24 +231,24 @@ class ARROW_EXPORT Function {
/// that default_options() is valid to pass to Execute as options.
const FunctionOptions* default_options() const { return default_options_; }
- virtual Status Validate() const;
-
+ virtual Status Validate() const;
+
protected:
Function(std::string name, Function::Kind kind, const Arity& arity,
- const FunctionDoc* doc, const FunctionOptions* default_options)
+ const FunctionDoc* doc, const FunctionOptions* default_options)
: name_(std::move(name)),
kind_(kind),
arity_(arity),
- doc_(doc ? doc : &FunctionDoc::Empty()),
+ doc_(doc ? doc : &FunctionDoc::Empty()),
default_options_(default_options) {}
- Status CheckArity(const std::vector<InputType>&) const;
- Status CheckArity(const std::vector<ValueDescr>&) const;
+ Status CheckArity(const std::vector<InputType>&) const;
+ Status CheckArity(const std::vector<ValueDescr>&) const;
std::string name_;
Function::Kind kind_;
Arity arity_;
- const FunctionDoc* doc_;
+ const FunctionDoc* doc_;
const FunctionOptions* default_options_ = NULLPTR;
};
@@ -270,20 +270,20 @@ class FunctionImpl : public Function {
protected:
FunctionImpl(std::string name, Function::Kind kind, const Arity& arity,
- const FunctionDoc* doc, const FunctionOptions* default_options)
- : Function(std::move(name), kind, arity, doc, default_options) {}
+ const FunctionDoc* doc, const FunctionOptions* default_options)
+ : Function(std::move(name), kind, arity, doc, default_options) {}
std::vector<KernelType> kernels_;
};
-/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
-ARROW_EXPORT
-const Kernel* DispatchExactImpl(const Function* func, const std::vector<ValueDescr>&);
-
-/// \brief Return an error message if no Kernel is found.
-ARROW_EXPORT
-Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>&);
-
+/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
+ARROW_EXPORT
+const Kernel* DispatchExactImpl(const Function* func, const std::vector<ValueDescr>&);
+
+/// \brief Return an error message if no Kernel is found.
+ARROW_EXPORT
+Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>&);
+
} // namespace detail
/// \brief A function that executes elementwise operations on arrays or
@@ -295,9 +295,9 @@ class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
public:
using KernelType = ScalarKernel;
- ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
- : detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity, doc,
+ : detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity, doc,
default_options) {}
/// \brief Add a kernel with given input/output types, no required state
@@ -319,9 +319,9 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
public:
using KernelType = VectorKernel;
- VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
- : detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity, doc,
+ : detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity, doc,
default_options) {}
/// \brief Add a simple kernel with given input/output types, no required
@@ -340,29 +340,29 @@ class ARROW_EXPORT ScalarAggregateFunction
public:
using KernelType = ScalarAggregateKernel;
- ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
: detail::FunctionImpl<ScalarAggregateKernel>(
- std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {}
+ std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {}
/// \brief Add a kernel (function implementation). Returns error if the
/// kernel's signature does not match the function's arity.
Status AddKernel(ScalarAggregateKernel kernel);
-};
-
-class ARROW_EXPORT HashAggregateFunction
- : public detail::FunctionImpl<HashAggregateKernel> {
- public:
- using KernelType = HashAggregateKernel;
-
- HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
- const FunctionOptions* default_options = NULLPTR)
- : detail::FunctionImpl<HashAggregateKernel>(
- std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {}
-
- /// \brief Add a kernel (function implementation). Returns error if the
- /// kernel's signature does not match the function's arity.
- Status AddKernel(HashAggregateKernel kernel);
+};
+
+class ARROW_EXPORT HashAggregateFunction
+ : public detail::FunctionImpl<HashAggregateKernel> {
+ public:
+ using KernelType = HashAggregateKernel;
+
+ HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ const FunctionOptions* default_options = NULLPTR)
+ : detail::FunctionImpl<HashAggregateKernel>(
+ std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {}
+
+ /// \brief Add a kernel (function implementation). Returns error if the
+ /// kernel's signature does not match the function's arity.
+ Status AddKernel(HashAggregateKernel kernel);
};
/// \brief A function that dispatches to other functions. Must implement
@@ -382,9 +382,9 @@ class ARROW_EXPORT MetaFunction : public Function {
const FunctionOptions* options,
ExecContext* ctx) const = 0;
- MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
- : Function(std::move(name), Function::META, arity, doc, default_options) {}
+ : Function(std::move(name), Function::META, arity, doc, default_options) {}
};
/// @}
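
The Serialize/Deserialize pair declared on FunctionOptions above round-trips an
options struct through a Buffer, keyed by type_name. A hedged round-trip sketch;
it assumes the concrete options class is registered in the default
FunctionRegistry, per the note in the declaration:

    #include "arrow/buffer.h"
    #include "arrow/compute/function.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status RoundTrip(const arrow::compute::FunctionOptions& options) {
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> buf, options.Serialize());
      // Deserialize resolves the options type by name in the default registry;
      // for a custom registry, look up the FunctionOptionsType and call its
      // Deserialize() directly, as the doc comment above advises.
      ARROW_ASSIGN_OR_RAISE(auto restored, arrow::compute::FunctionOptions::Deserialize(
                                               options.type_name(), *buf));
      return restored->Equals(options)
                 ? arrow::Status::OK()
                 : arrow::Status::Invalid("options changed in round trip");
    }
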
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
index 0a926e0a39c..8515d957cbd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
@@ -1,113 +1,113 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/function_internal.h"
-
-#include "arrow/array/util.h"
-#include "arrow/compute/function.h"
-#include "arrow/compute/registry.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/reader.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/record_batch.h"
-#include "arrow/scalar.h"
-#include "arrow/util/checked_cast.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-using ::arrow::internal::checked_cast;
-
-constexpr char kTypeNameField[] = "_type_name";
-
-Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
- const FunctionOptions& options) {
- std::vector<std::string> field_names;
- std::vector<std::shared_ptr<Scalar>> values;
- const auto* options_type =
- dynamic_cast<const GenericOptionsType*>(options.options_type());
- if (!options_type) {
- return Status::NotImplemented("serializing ", options.type_name(),
- " to StructScalar");
- }
- RETURN_NOT_OK(options_type->ToStructScalar(options, &field_names, &values));
- field_names.push_back(kTypeNameField);
- const char* options_name = options.type_name();
- values.emplace_back(
- new BinaryScalar(Buffer::Wrap(options_name, std::strlen(options_name))));
- return StructScalar::Make(std::move(values), std::move(field_names));
-}
-
-Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
- const StructScalar& scalar) {
- ARROW_ASSIGN_OR_RAISE(auto type_name_holder, scalar.field(kTypeNameField));
- const std::string type_name =
- checked_cast<const BinaryScalar&>(*type_name_holder).value->ToString();
- ARROW_ASSIGN_OR_RAISE(auto raw_options_type,
- GetFunctionRegistry()->GetFunctionOptionsType(type_name));
- const auto* options_type = checked_cast<const GenericOptionsType*>(raw_options_type);
- return options_type->FromStructScalar(scalar);
-}
-
-Result<std::shared_ptr<Buffer>> GenericOptionsType::Serialize(
- const FunctionOptions& options) const {
- ARROW_ASSIGN_OR_RAISE(auto scalar, FunctionOptionsToStructScalar(options));
- ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*scalar, 1));
- auto batch =
- RecordBatch::Make(schema({field("", array->type())}), /*num_rows=*/1, {array});
- ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
- ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
- RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
- RETURN_NOT_OK(writer->Close());
- return stream->Finish();
-}
-
-Result<std::unique_ptr<FunctionOptions>> GenericOptionsType::Deserialize(
- const Buffer& buffer) const {
- return DeserializeFunctionOptions(buffer);
-}
-
-Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(
- const Buffer& buffer) {
- io::BufferReader stream(buffer);
- ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
- ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
- if (batch->num_rows() != 1) {
- return Status::Invalid(
- "serialized FunctionOptions's batch repr was not a single row - had ",
- batch->num_rows());
- }
- if (batch->num_columns() != 1) {
- return Status::Invalid(
- "serialized FunctionOptions's batch repr was not a single column - had ",
- batch->num_columns());
- }
- auto column = batch->column(0);
- if (column->type()->id() != Type::STRUCT) {
- return Status::Invalid(
- "serialized FunctionOptions's batch repr was not a struct column - was ",
- column->type()->ToString());
- }
- ARROW_ASSIGN_OR_RAISE(auto raw_scalar,
- checked_cast<const StructArray&>(*column).GetScalar(0));
- auto scalar = checked_cast<const StructScalar&>(*raw_scalar);
- return FunctionOptionsFromStructScalar(scalar);
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/function_internal.h"
+
+#include "arrow/array/util.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/registry.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/record_batch.h"
+#include "arrow/scalar.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+using ::arrow::internal::checked_cast;
+
+constexpr char kTypeNameField[] = "_type_name";
+
+Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
+ const FunctionOptions& options) {
+ std::vector<std::string> field_names;
+ std::vector<std::shared_ptr<Scalar>> values;
+ const auto* options_type =
+ dynamic_cast<const GenericOptionsType*>(options.options_type());
+ if (!options_type) {
+ return Status::NotImplemented("serializing ", options.type_name(),
+ " to StructScalar");
+ }
+ RETURN_NOT_OK(options_type->ToStructScalar(options, &field_names, &values));
+ field_names.push_back(kTypeNameField);
+ const char* options_name = options.type_name();
+ values.emplace_back(
+ new BinaryScalar(Buffer::Wrap(options_name, std::strlen(options_name))));
+ return StructScalar::Make(std::move(values), std::move(field_names));
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
+ const StructScalar& scalar) {
+ ARROW_ASSIGN_OR_RAISE(auto type_name_holder, scalar.field(kTypeNameField));
+ const std::string type_name =
+ checked_cast<const BinaryScalar&>(*type_name_holder).value->ToString();
+ ARROW_ASSIGN_OR_RAISE(auto raw_options_type,
+ GetFunctionRegistry()->GetFunctionOptionsType(type_name));
+ const auto* options_type = checked_cast<const GenericOptionsType*>(raw_options_type);
+ return options_type->FromStructScalar(scalar);
+}
+
+Result<std::shared_ptr<Buffer>> GenericOptionsType::Serialize(
+ const FunctionOptions& options) const {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, FunctionOptionsToStructScalar(options));
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*scalar, 1));
+ auto batch =
+ RecordBatch::Make(schema({field("", array->type())}), /*num_rows=*/1, {array});
+ ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+ ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
+ RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+ RETURN_NOT_OK(writer->Close());
+ return stream->Finish();
+}
+
+Result<std::unique_ptr<FunctionOptions>> GenericOptionsType::Deserialize(
+ const Buffer& buffer) const {
+ return DeserializeFunctionOptions(buffer);
+}
+
+Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(
+ const Buffer& buffer) {
+ io::BufferReader stream(buffer);
+ ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
+ ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
+ if (batch->num_rows() != 1) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a single row - had ",
+ batch->num_rows());
+ }
+ if (batch->num_columns() != 1) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a single column - had ",
+ batch->num_columns());
+ }
+ auto column = batch->column(0);
+ if (column->type()->id() != Type::STRUCT) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a struct column - was ",
+ column->type()->ToString());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto raw_scalar,
+ checked_cast<const StructArray&>(*column).GetScalar(0));
+ auto scalar = checked_cast<const StructScalar&>(*raw_scalar);
+ return FunctionOptionsFromStructScalar(scalar);
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
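
The implementation above fixes the wire format: a one-row IPC file whose single
column is a struct holding the option fields plus a "_type_name" entry that
Deserialize uses to locate the registered options type. A hedged sketch that
inspects such a buffer, mirroring the invariants DeserializeFunctionOptions
checks:

    #include <iostream>

    #include "arrow/buffer.h"
    #include "arrow/io/memory.h"
    #include "arrow/ipc/reader.h"
    #include "arrow/record_batch.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status DescribeSerializedOptions(const arrow::Buffer& buffer) {
      arrow::io::BufferReader stream(buffer);
      ARROW_ASSIGN_OR_RAISE(auto reader,
                            arrow::ipc::RecordBatchFileReader::Open(&stream));
      ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
      // Expected per the checks above: one row, one column, struct-typed.
      std::cout << "rows=" << batch->num_rows()
                << " cols=" << batch->num_columns()
                << " type=" << batch->column(0)->type()->ToString() << std::endl;
      return arrow::Status::OK();
    }
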
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
index fdd7f09ba1f..9ce0c3cc84e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
@@ -1,626 +1,626 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "arrow/array/builder_base.h"
-#include "arrow/array/builder_binary.h"
-#include "arrow/array/builder_nested.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/function.h"
-#include "arrow/compute/type_fwd.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/reflection_internal.h"
-#include "arrow/util/string.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-struct Scalar;
-struct StructScalar;
-using ::arrow::internal::checked_cast;
-
-namespace internal {
-template <>
-struct EnumTraits<compute::SortOrder>
- : BasicEnumTraits<compute::SortOrder, compute::SortOrder::Ascending,
- compute::SortOrder::Descending> {
- static std::string name() { return "SortOrder"; }
- static std::string value_name(compute::SortOrder value) {
- switch (value) {
- case compute::SortOrder::Ascending:
- return "Ascending";
- case compute::SortOrder::Descending:
- return "Descending";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
-namespace compute {
-namespace internal {
-
-using arrow::internal::EnumTraits;
-using arrow::internal::has_enum_traits;
-
-template <typename Enum, typename CType = typename std::underlying_type<Enum>::type>
-Result<Enum> ValidateEnumValue(CType raw) {
- for (auto valid : EnumTraits<Enum>::values()) {
- if (raw == static_cast<CType>(valid)) {
- return static_cast<Enum>(raw);
- }
- }
- return Status::Invalid("Invalid value for ", EnumTraits<Enum>::name(), ": ", raw);
-}
-
-class GenericOptionsType : public FunctionOptionsType {
- public:
- Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const override;
- Result<std::unique_ptr<FunctionOptions>> Deserialize(
- const Buffer& buffer) const override;
- virtual Status ToStructScalar(const FunctionOptions& options,
- std::vector<std::string>* field_names,
- std::vector<std::shared_ptr<Scalar>>* values) const = 0;
- virtual Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
- const StructScalar& scalar) const = 0;
-};
-
-ARROW_EXPORT
-Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
- const FunctionOptions&);
-ARROW_EXPORT
-Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
- const StructScalar&);
-ARROW_EXPORT
-Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(const Buffer& buffer);
-
-template <typename T>
-static inline enable_if_t<!has_enum_traits<T>::value, std::string> GenericToString(
- const T& value) {
- std::stringstream ss;
- ss << value;
- return ss.str();
-}
-
-static inline std::string GenericToString(bool value) { return value ? "true" : "false"; }
-
-static inline std::string GenericToString(const std::string& value) {
- std::stringstream ss;
- ss << '"' << value << '"';
- return ss.str();
-}
-
-template <typename T>
-static inline enable_if_t<has_enum_traits<T>::value, std::string> GenericToString(
- const T value) {
- return EnumTraits<T>::value_name(value);
-}
-
-template <typename T>
-static inline std::string GenericToString(const std::shared_ptr<T>& value) {
- std::stringstream ss;
- return value ? value->ToString() : "<NULLPTR>";
-}
-
-static inline std::string GenericToString(const std::shared_ptr<Scalar>& value) {
- std::stringstream ss;
- ss << value->type->ToString() << ":" << value->ToString();
- return ss.str();
-}
-
-static inline std::string GenericToString(
- const std::shared_ptr<const KeyValueMetadata>& value) {
- std::stringstream ss;
- ss << "KeyValueMetadata{";
- if (value) {
- bool first = true;
- for (const auto& pair : value->sorted_pairs()) {
- if (!first) ss << ", ";
- first = false;
- ss << pair.first << ':' << pair.second;
- }
- }
- ss << '}';
- return ss.str();
-}
-
-static inline std::string GenericToString(const Datum& value) {
- switch (value.kind()) {
- case Datum::NONE:
- return "<NULL DATUM>";
- case Datum::SCALAR:
- return GenericToString(value.scalar());
- case Datum::ARRAY: {
- std::stringstream ss;
- ss << value.type()->ToString() << ':' << value.make_array()->ToString();
- return ss.str();
- }
- case Datum::CHUNKED_ARRAY:
- case Datum::RECORD_BATCH:
- case Datum::TABLE:
- case Datum::COLLECTION:
- return value.ToString();
- }
- return value.ToString();
-}
-
-template <typename T>
-static inline std::string GenericToString(const std::vector<T>& value) {
- std::stringstream ss;
- ss << "[";
- bool first = true;
- // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
- for (auto it = value.begin(); it != value.end(); it++) {
- if (!first) ss << ", ";
- first = false;
- ss << GenericToString(*it);
- }
- ss << ']';
- return ss.str();
-}
-
-static inline std::string GenericToString(SortOrder value) {
- switch (value) {
- case SortOrder::Ascending:
- return "Ascending";
- case SortOrder::Descending:
- return "Descending";
- }
- return "<INVALID SORT ORDER>";
-}
-
-static inline std::string GenericToString(const std::vector<SortKey>& value) {
- std::stringstream ss;
- ss << '[';
- bool first = true;
- for (const auto& key : value) {
- if (!first) {
- ss << ", ";
- }
- first = false;
- ss << key.ToString();
- }
- ss << ']';
- return ss.str();
-}
-
-template <typename T>
-static inline bool GenericEquals(const T& left, const T& right) {
- return left == right;
-}
-
-template <typename T>
-static inline bool GenericEquals(const std::shared_ptr<T>& left,
- const std::shared_ptr<T>& right) {
- if (left && right) {
- return left->Equals(*right);
- }
- return left == right;
-}
-
-static inline bool IsEmpty(const std::shared_ptr<const KeyValueMetadata>& meta) {
- return !meta || meta->size() == 0;
-}
-
-static inline bool GenericEquals(const std::shared_ptr<const KeyValueMetadata>& left,
- const std::shared_ptr<const KeyValueMetadata>& right) {
- // Special case since null metadata is considered equivalent to empty
- if (IsEmpty(left) || IsEmpty(right)) {
- return IsEmpty(left) && IsEmpty(right);
- }
- return left->Equals(*right);
-}
-
-template <typename T>
-static inline bool GenericEquals(const std::vector<T>& left,
- const std::vector<T>& right) {
- if (left.size() != right.size()) return false;
- for (size_t i = 0; i < left.size(); i++) {
- if (!GenericEquals(left[i], right[i])) return false;
- }
- return true;
-}
-
-template <typename T>
-static inline decltype(TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton())
-GenericTypeSingleton() {
- return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton();
-}
-
-template <typename T>
-static inline enable_if_same<T, std::shared_ptr<const KeyValueMetadata>,
- std::shared_ptr<DataType>>
-GenericTypeSingleton() {
- return map(binary(), binary());
-}
-
-template <typename T>
-static inline enable_if_t<has_enum_traits<T>::value, std::shared_ptr<DataType>>
-GenericTypeSingleton() {
- return TypeTraits<typename EnumTraits<T>::Type>::type_singleton();
-}
-
-template <typename T>
-static inline enable_if_same<T, SortKey, std::shared_ptr<DataType>>
-GenericTypeSingleton() {
- std::vector<std::shared_ptr<Field>> fields;
- fields.emplace_back(new Field("name", GenericTypeSingleton<std::string>()));
- fields.emplace_back(new Field("order", GenericTypeSingleton<SortOrder>()));
- return std::make_shared<StructType>(std::move(fields));
-}
-
-// N.B. ordering of overloads is relatively fragile
-template <typename T>
-static inline Result<decltype(MakeScalar(std::declval<T>()))> GenericToScalar(
- const T& value) {
- return MakeScalar(value);
-}
-
-// For Clang/libc++: when iterating through vector<bool>, we can't
-// pass it by reference so the overload above doesn't apply
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(bool value) {
- return MakeScalar(value);
-}
-
-template <typename T, typename Enable = enable_if_t<has_enum_traits<T>::value>>
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const T value) {
- using CType = typename EnumTraits<T>::CType;
- return GenericToScalar(static_cast<CType>(value));
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const SortKey& value) {
- ARROW_ASSIGN_OR_RAISE(auto name, GenericToScalar(value.name));
- ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(value.order));
- return StructScalar::Make({name, order}, {"name", "order"});
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<const KeyValueMetadata>& value) {
- auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
- std::unique_ptr<ArrayBuilder> builder;
- RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder));
- auto* map_builder = checked_cast<MapBuilder*>(builder.get());
- auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder());
- auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder());
- RETURN_NOT_OK(map_builder->Append());
- if (value) {
- RETURN_NOT_OK(key_builder->AppendValues(value->keys()));
- RETURN_NOT_OK(item_builder->AppendValues(value->values()));
- }
- std::shared_ptr<Array> arr;
- RETURN_NOT_OK(map_builder->Finish(&arr));
- return arr->GetScalar(0);
-}
-
-template <typename T>
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::vector<T>& value) {
- std::shared_ptr<DataType> type = GenericTypeSingleton<T>();
- std::vector<std::shared_ptr<Scalar>> scalars;
- scalars.reserve(value.size());
- // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
- for (auto it = value.begin(); it != value.end(); it++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(*it));
- scalars.push_back(std::move(scalar));
- }
- std::unique_ptr<ArrayBuilder> builder;
- RETURN_NOT_OK(
- MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder));
- RETURN_NOT_OK(builder->AppendScalars(scalars));
- std::shared_ptr<Array> out;
- RETURN_NOT_OK(builder->Finish(&out));
- return std::make_shared<ListScalar>(std::move(out));
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<DataType>& value) {
- if (!value) {
- return Status::Invalid("shared_ptr<DataType> is nullptr");
- }
- return MakeNullScalar(value);
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<Scalar>& value) {
- return value;
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<Array>& value) {
- return std::make_shared<ListScalar>(value);
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const Datum& value) {
- // TODO(ARROW-9434): store in a union instead.
- switch (value.kind()) {
- case Datum::ARRAY:
- return GenericToScalar(value.make_array());
- break;
- default:
- return Status::NotImplemented("Cannot serialize Datum kind ", value.kind());
- }
-}
-
-template <typename T>
-static inline enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType, Result<T>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- using ArrowType = typename CTypeTraits<T>::ArrowType;
- using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
- if (value->type->id() != ArrowType::type_id) {
- return Status::Invalid("Expected type ", ArrowType::type_id, " but got ",
- value->type->ToString());
- }
- const auto& holder = checked_cast<const ScalarType&>(*value);
- if (!holder.is_valid) return Status::Invalid("Got null scalar");
- return holder.value;
-}
-
-template <typename T>
-static inline enable_if_primitive_ctype<typename EnumTraits<T>::Type, Result<T>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- ARROW_ASSIGN_OR_RAISE(auto raw_val,
- GenericFromScalar<typename EnumTraits<T>::CType>(value));
- return ValidateEnumValue<T>(raw_val);
-}
-
-template <typename T, typename U>
-using enable_if_same_result = enable_if_same<T, U, Result<T>>;
-
-template <typename T>
-static inline enable_if_same_result<T, std::string> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- if (!is_base_binary_like(value->type->id())) {
- return Status::Invalid("Expected binary-like type but got ", value->type->ToString());
- }
- const auto& holder = checked_cast<const BaseBinaryScalar&>(*value);
- if (!holder.is_valid) return Status::Invalid("Got null scalar");
- return holder.value->ToString();
-}
-
-template <typename T>
-static inline enable_if_same_result<T, SortKey> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- if (value->type->id() != Type::STRUCT) {
- return Status::Invalid("Expected type STRUCT but got ", value->type->id());
- }
- if (!value->is_valid) return Status::Invalid("Got null scalar");
- const auto& holder = checked_cast<const StructScalar&>(*value);
- ARROW_ASSIGN_OR_RAISE(auto name_holder, holder.field("name"));
- ARROW_ASSIGN_OR_RAISE(auto order_holder, holder.field("order"));
- ARROW_ASSIGN_OR_RAISE(auto name, GenericFromScalar<std::string>(name_holder));
- ARROW_ASSIGN_OR_RAISE(auto order, GenericFromScalar<SortOrder>(order_holder));
- return SortKey{std::move(name), order};
-}
-
-template <typename T>
-static inline enable_if_same_result<T, std::shared_ptr<DataType>> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- return value->type;
-}
-
-template <typename T>
-static inline enable_if_same_result<T, std::shared_ptr<Scalar>> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- return value;
-}
-
-template <typename T>
-static inline enable_if_same_result<T, std::shared_ptr<const KeyValueMetadata>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
- if (!value->type->Equals(ty)) {
- return Status::Invalid("Expected ", ty->ToString(), " but got ",
- value->type->ToString());
- }
- const auto& holder = checked_cast<const MapScalar&>(*value);
- std::vector<std::string> keys;
- std::vector<std::string> values;
- const auto& list = checked_cast<const StructArray&>(*holder.value);
- const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0));
- const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1));
- for (int64_t i = 0; i < list.length(); i++) {
- keys.push_back(key_arr.GetString(i));
- values.push_back(value_arr.GetString(i));
- }
- return key_value_metadata(std::move(keys), std::move(values));
-}
-
-template <typename T>
-static inline enable_if_same_result<T, Datum> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- if (value->type->id() == Type::LIST) {
- const auto& holder = checked_cast<const BaseListScalar&>(*value);
- return holder.value;
- }
- // TODO(ARROW-9434): handle other possible datum kinds by looking for a union
- return Status::Invalid("Cannot deserialize Datum from ", value->ToString());
-}
-
-template <typename T>
-static enable_if_same<typename CTypeTraits<T>::ArrowType, ListType, Result<T>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- using ValueType = typename T::value_type;
- if (value->type->id() != Type::LIST) {
- return Status::Invalid("Expected type LIST but got ", value->type->ToString());
- }
- const auto& holder = checked_cast<const BaseListScalar&>(*value);
- if (!holder.is_valid) return Status::Invalid("Got null scalar");
- std::vector<ValueType> result;
- for (int i = 0; i < holder.value->length(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i));
- ARROW_ASSIGN_OR_RAISE(auto v, GenericFromScalar<ValueType>(scalar));
- result.push_back(std::move(v));
- }
- return result;
-}
-
-template <typename Options>
-struct StringifyImpl {
- template <typename Tuple>
- StringifyImpl(const Options& obj, const Tuple& props)
- : obj_(obj), members_(props.size()) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t i) {
- std::stringstream ss;
- ss << prop.name() << '=' << GenericToString(prop.get(obj_));
- members_[i] = ss.str();
- }
-
- std::string Finish() {
- return "{" + arrow::internal::JoinStrings(members_, ", ") + "}";
- }
-
- const Options& obj_;
- std::vector<std::string> members_;
-};
-
-template <typename Options>
-struct CompareImpl {
- template <typename Tuple>
- CompareImpl(const Options& l, const Options& r, const Tuple& props)
- : left_(l), right_(r) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t) {
- equal_ &= GenericEquals(prop.get(left_), prop.get(right_));
- }
-
- const Options& left_;
- const Options& right_;
- bool equal_ = true;
-};
-
-template <typename Options>
-struct ToStructScalarImpl {
- template <typename Tuple>
- ToStructScalarImpl(const Options& obj, const Tuple& props,
- std::vector<std::string>* field_names,
- std::vector<std::shared_ptr<Scalar>>* values)
- : obj_(obj), field_names_(field_names), values_(values) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t) {
- if (!status_.ok()) return;
- auto result = GenericToScalar(prop.get(obj_));
- if (!result.ok()) {
- status_ = result.status().WithMessage("Could not serialize field ", prop.name(),
- " of options type ", Options::kTypeName, ": ",
- result.status().message());
- return;
- }
- field_names_->emplace_back(prop.name());
- values_->push_back(result.MoveValueUnsafe());
- }
-
- const Options& obj_;
- Status status_;
- std::vector<std::string>* field_names_;
- std::vector<std::shared_ptr<Scalar>>* values_;
-};
-
-template <typename Options>
-struct FromStructScalarImpl {
- template <typename Tuple>
- FromStructScalarImpl(Options* obj, const StructScalar& scalar, const Tuple& props)
- : obj_(obj), scalar_(scalar) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t) {
- if (!status_.ok()) return;
- auto maybe_holder = scalar_.field(std::string(prop.name()));
- if (!maybe_holder.ok()) {
- status_ = maybe_holder.status().WithMessage(
- "Cannot deserialize field ", prop.name(), " of options type ",
- Options::kTypeName, ": ", maybe_holder.status().message());
- return;
- }
- auto holder = maybe_holder.MoveValueUnsafe();
- auto result = GenericFromScalar<typename Property::Type>(holder);
- if (!result.ok()) {
- status_ = result.status().WithMessage("Cannot deserialize field ", prop.name(),
- " of options type ", Options::kTypeName, ": ",
- result.status().message());
- return;
- }
- prop.set(obj_, result.MoveValueUnsafe());
- }
-
- Options* obj_;
- Status status_;
- const StructScalar& scalar_;
-};
-
-template <typename Options, typename... Properties>
-const FunctionOptionsType* GetFunctionOptionsType(const Properties&... properties) {
- static const class OptionsType : public GenericOptionsType {
- public:
- explicit OptionsType(const arrow::internal::PropertyTuple<Properties...> properties)
- : properties_(properties) {}
-
- const char* type_name() const override { return Options::kTypeName; }
-
- std::string Stringify(const FunctionOptions& options) const override {
- const auto& self = checked_cast<const Options&>(options);
- return StringifyImpl<Options>(self, properties_).Finish();
- }
- bool Compare(const FunctionOptions& options,
- const FunctionOptions& other) const override {
- const auto& lhs = checked_cast<const Options&>(options);
- const auto& rhs = checked_cast<const Options&>(other);
- return CompareImpl<Options>(lhs, rhs, properties_).equal_;
- }
- Status ToStructScalar(const FunctionOptions& options,
- std::vector<std::string>* field_names,
- std::vector<std::shared_ptr<Scalar>>* values) const override {
- const auto& self = checked_cast<const Options&>(options);
- RETURN_NOT_OK(
- ToStructScalarImpl<Options>(self, properties_, field_names, values).status_);
- return Status::OK();
- }
- Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
- const StructScalar& scalar) const override {
- auto options = std::unique_ptr<Options>(new Options());
- RETURN_NOT_OK(
- FromStructScalarImpl<Options>(options.get(), scalar, properties_).status_);
- return std::move(options);
- }
-
- private:
- const arrow::internal::PropertyTuple<Properties...> properties_;
- } instance(arrow::internal::MakeProperties(properties...));
- return &instance;
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/reflection_internal.h"
+#include "arrow/util/string.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+struct Scalar;
+struct StructScalar;
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+template <>
+struct EnumTraits<compute::SortOrder>
+ : BasicEnumTraits<compute::SortOrder, compute::SortOrder::Ascending,
+ compute::SortOrder::Descending> {
+ static std::string name() { return "SortOrder"; }
+ static std::string value_name(compute::SortOrder value) {
+ switch (value) {
+ case compute::SortOrder::Ascending:
+ return "Ascending";
+ case compute::SortOrder::Descending:
+ return "Descending";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
+namespace compute {
+namespace internal {
+
+using arrow::internal::EnumTraits;
+using arrow::internal::has_enum_traits;
+
+template <typename Enum, typename CType = typename std::underlying_type<Enum>::type>
+Result<Enum> ValidateEnumValue(CType raw) {
+ for (auto valid : EnumTraits<Enum>::values()) {
+ if (raw == static_cast<CType>(valid)) {
+ return static_cast<Enum>(raw);
+ }
+ }
+ return Status::Invalid("Invalid value for ", EnumTraits<Enum>::name(), ": ", raw);
+}
+
+class GenericOptionsType : public FunctionOptionsType {
+ public:
+ Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const override;
+ Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const Buffer& buffer) const override;
+ virtual Status ToStructScalar(const FunctionOptions& options,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values) const = 0;
+ virtual Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
+ const StructScalar& scalar) const = 0;
+};
+
+ARROW_EXPORT
+Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
+ const FunctionOptions&);
+ARROW_EXPORT
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
+ const StructScalar&);
+ARROW_EXPORT
+Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(const Buffer& buffer);
+
+template <typename T>
+static inline enable_if_t<!has_enum_traits<T>::value, std::string> GenericToString(
+ const T& value) {
+ std::stringstream ss;
+ ss << value;
+ return ss.str();
+}
+
+static inline std::string GenericToString(bool value) { return value ? "true" : "false"; }
+
+static inline std::string GenericToString(const std::string& value) {
+ std::stringstream ss;
+ ss << '"' << value << '"';
+ return ss.str();
+}
+
+template <typename T>
+static inline enable_if_t<has_enum_traits<T>::value, std::string> GenericToString(
+ const T value) {
+ return EnumTraits<T>::value_name(value);
+}
+
+template <typename T>
+static inline std::string GenericToString(const std::shared_ptr<T>& value) {
+ std::stringstream ss;
+ return value ? value->ToString() : "<NULLPTR>";
+}
+
+static inline std::string GenericToString(const std::shared_ptr<Scalar>& value) {
+ std::stringstream ss;
+ ss << value->type->ToString() << ":" << value->ToString();
+ return ss.str();
+}
+
+static inline std::string GenericToString(
+ const std::shared_ptr<const KeyValueMetadata>& value) {
+ std::stringstream ss;
+ ss << "KeyValueMetadata{";
+ if (value) {
+ bool first = true;
+ for (const auto& pair : value->sorted_pairs()) {
+ if (!first) ss << ", ";
+ first = false;
+ ss << pair.first << ':' << pair.second;
+ }
+ }
+ ss << '}';
+ return ss.str();
+}
+
+static inline std::string GenericToString(const Datum& value) {
+ switch (value.kind()) {
+ case Datum::NONE:
+ return "<NULL DATUM>";
+ case Datum::SCALAR:
+ return GenericToString(value.scalar());
+ case Datum::ARRAY: {
+ std::stringstream ss;
+ ss << value.type()->ToString() << ':' << value.make_array()->ToString();
+ return ss.str();
+ }
+ case Datum::CHUNKED_ARRAY:
+ case Datum::RECORD_BATCH:
+ case Datum::TABLE:
+ case Datum::COLLECTION:
+ return value.ToString();
+ }
+ return value.ToString();
+}
+
+template <typename T>
+static inline std::string GenericToString(const std::vector<T>& value) {
+ std::stringstream ss;
+ ss << "[";
+ bool first = true;
+ // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
+ for (auto it = value.begin(); it != value.end(); it++) {
+ if (!first) ss << ", ";
+ first = false;
+ ss << GenericToString(*it);
+ }
+ ss << ']';
+ return ss.str();
+}
+
+static inline std::string GenericToString(SortOrder value) {
+ switch (value) {
+ case SortOrder::Ascending:
+ return "Ascending";
+ case SortOrder::Descending:
+ return "Descending";
+ }
+ return "<INVALID SORT ORDER>";
+}
+
+static inline std::string GenericToString(const std::vector<SortKey>& value) {
+ std::stringstream ss;
+ ss << '[';
+ bool first = true;
+ for (const auto& key : value) {
+ if (!first) {
+ ss << ", ";
+ }
+ first = false;
+ ss << key.ToString();
+ }
+ ss << ']';
+ return ss.str();
+}
+
+template <typename T>
+static inline bool GenericEquals(const T& left, const T& right) {
+ return left == right;
+}
+
+template <typename T>
+static inline bool GenericEquals(const std::shared_ptr<T>& left,
+ const std::shared_ptr<T>& right) {
+ if (left && right) {
+ return left->Equals(*right);
+ }
+ return left == right;
+}
+
+static inline bool IsEmpty(const std::shared_ptr<const KeyValueMetadata>& meta) {
+ return !meta || meta->size() == 0;
+}
+
+static inline bool GenericEquals(const std::shared_ptr<const KeyValueMetadata>& left,
+ const std::shared_ptr<const KeyValueMetadata>& right) {
+ // Special case since null metadata is considered equivalent to empty
+ if (IsEmpty(left) || IsEmpty(right)) {
+ return IsEmpty(left) && IsEmpty(right);
+ }
+ return left->Equals(*right);
+}
+
+template <typename T>
+static inline bool GenericEquals(const std::vector<T>& left,
+ const std::vector<T>& right) {
+ if (left.size() != right.size()) return false;
+ for (size_t i = 0; i < left.size(); i++) {
+ if (!GenericEquals(left[i], right[i])) return false;
+ }
+ return true;
+}
+
+template <typename T>
+static inline decltype(TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton())
+GenericTypeSingleton() {
+ return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton();
+}
+
+template <typename T>
+static inline enable_if_same<T, std::shared_ptr<const KeyValueMetadata>,
+ std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ return map(binary(), binary());
+}
+
+template <typename T>
+static inline enable_if_t<has_enum_traits<T>::value, std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ return TypeTraits<typename EnumTraits<T>::Type>::type_singleton();
+}
+
+template <typename T>
+static inline enable_if_same<T, SortKey, std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ std::vector<std::shared_ptr<Field>> fields;
+ fields.emplace_back(new Field("name", GenericTypeSingleton<std::string>()));
+ fields.emplace_back(new Field("order", GenericTypeSingleton<SortOrder>()));
+ return std::make_shared<StructType>(std::move(fields));
+}
+
+// N.B. ordering of overloads is relatively fragile
+template <typename T>
+static inline Result<decltype(MakeScalar(std::declval<T>()))> GenericToScalar(
+ const T& value) {
+ return MakeScalar(value);
+}
+
+// For Clang/libc++: when iterating through vector<bool>, we can't
+// pass it by reference so the overload above doesn't apply
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(bool value) {
+ return MakeScalar(value);
+}
+
+template <typename T, typename Enable = enable_if_t<has_enum_traits<T>::value>>
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const T value) {
+ using CType = typename EnumTraits<T>::CType;
+ return GenericToScalar(static_cast<CType>(value));
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const SortKey& value) {
+ ARROW_ASSIGN_OR_RAISE(auto name, GenericToScalar(value.name));
+ ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(value.order));
+ return StructScalar::Make({name, order}, {"name", "order"});
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<const KeyValueMetadata>& value) {
+ auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder));
+ auto* map_builder = checked_cast<MapBuilder*>(builder.get());
+ auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder());
+ auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder());
+ RETURN_NOT_OK(map_builder->Append());
+ if (value) {
+ RETURN_NOT_OK(key_builder->AppendValues(value->keys()));
+ RETURN_NOT_OK(item_builder->AppendValues(value->values()));
+ }
+ std::shared_ptr<Array> arr;
+ RETURN_NOT_OK(map_builder->Finish(&arr));
+ return arr->GetScalar(0);
+}
+
+template <typename T>
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::vector<T>& value) {
+ std::shared_ptr<DataType> type = GenericTypeSingleton<T>();
+ std::vector<std::shared_ptr<Scalar>> scalars;
+ scalars.reserve(value.size());
+ // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
+ for (auto it = value.begin(); it != value.end(); it++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(*it));
+ scalars.push_back(std::move(scalar));
+ }
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(
+ MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder));
+ RETURN_NOT_OK(builder->AppendScalars(scalars));
+ std::shared_ptr<Array> out;
+ RETURN_NOT_OK(builder->Finish(&out));
+ return std::make_shared<ListScalar>(std::move(out));
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<DataType>& value) {
+ if (!value) {
+ return Status::Invalid("shared_ptr<DataType> is nullptr");
+ }
+ return MakeNullScalar(value);
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value;
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<Array>& value) {
+ return std::make_shared<ListScalar>(value);
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const Datum& value) {
+ // TODO(ARROW-9434): store in a union instead.
+ switch (value.kind()) {
+ case Datum::ARRAY:
+ return GenericToScalar(value.make_array());
+ break;
+ default:
+ return Status::NotImplemented("Cannot serialize Datum kind ", value.kind());
+ }
+}
+
+template <typename T>
+static inline enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ using ArrowType = typename CTypeTraits<T>::ArrowType;
+ using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+ if (value->type->id() != ArrowType::type_id) {
+ return Status::Invalid("Expected type ", ArrowType::type_id, " but got ",
+ value->type->ToString());
+ }
+ const auto& holder = checked_cast<const ScalarType&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ return holder.value;
+}
+
+template <typename T>
+static inline enable_if_primitive_ctype<typename EnumTraits<T>::Type, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ ARROW_ASSIGN_OR_RAISE(auto raw_val,
+ GenericFromScalar<typename EnumTraits<T>::CType>(value));
+ return ValidateEnumValue<T>(raw_val);
+}
+
+template <typename T, typename U>
+using enable_if_same_result = enable_if_same<T, U, Result<T>>;
+
+template <typename T>
+static inline enable_if_same_result<T, std::string> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (!is_base_binary_like(value->type->id())) {
+ return Status::Invalid("Expected binary-like type but got ", value->type->ToString());
+ }
+ const auto& holder = checked_cast<const BaseBinaryScalar&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ return holder.value->ToString();
+}
+
+template <typename T>
+static inline enable_if_same_result<T, SortKey> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (value->type->id() != Type::STRUCT) {
+ return Status::Invalid("Expected type STRUCT but got ", value->type->id());
+ }
+ if (!value->is_valid) return Status::Invalid("Got null scalar");
+ const auto& holder = checked_cast<const StructScalar&>(*value);
+ ARROW_ASSIGN_OR_RAISE(auto name_holder, holder.field("name"));
+ ARROW_ASSIGN_OR_RAISE(auto order_holder, holder.field("order"));
+ ARROW_ASSIGN_OR_RAISE(auto name, GenericFromScalar<std::string>(name_holder));
+ ARROW_ASSIGN_OR_RAISE(auto order, GenericFromScalar<SortOrder>(order_holder));
+ return SortKey{std::move(name), order};
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<DataType>> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value->type;
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<Scalar>> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value;
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<const KeyValueMetadata>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
+ if (!value->type->Equals(ty)) {
+ return Status::Invalid("Expected ", ty->ToString(), " but got ",
+ value->type->ToString());
+ }
+ const auto& holder = checked_cast<const MapScalar&>(*value);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ const auto& list = checked_cast<const StructArray&>(*holder.value);
+ const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0));
+ const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1));
+ for (int64_t i = 0; i < list.length(); i++) {
+ keys.push_back(key_arr.GetString(i));
+ values.push_back(value_arr.GetString(i));
+ }
+ return key_value_metadata(std::move(keys), std::move(values));
+}
+
+template <typename T>
+static inline enable_if_same_result<T, Datum> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (value->type->id() == Type::LIST) {
+ const auto& holder = checked_cast<const BaseListScalar&>(*value);
+ return holder.value;
+ }
+ // TODO(ARROW-9434): handle other possible datum kinds by looking for a union
+ return Status::Invalid("Cannot deserialize Datum from ", value->ToString());
+}
+
+template <typename T>
+static enable_if_same<typename CTypeTraits<T>::ArrowType, ListType, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ using ValueType = typename T::value_type;
+ if (value->type->id() != Type::LIST) {
+ return Status::Invalid("Expected type LIST but got ", value->type->ToString());
+ }
+ const auto& holder = checked_cast<const BaseListScalar&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ std::vector<ValueType> result;
+ for (int i = 0; i < holder.value->length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i));
+ ARROW_ASSIGN_OR_RAISE(auto v, GenericFromScalar<ValueType>(scalar));
+ result.push_back(std::move(v));
+ }
+ return result;
+}
+
+template <typename Options>
+struct StringifyImpl {
+ template <typename Tuple>
+ StringifyImpl(const Options& obj, const Tuple& props)
+ : obj_(obj), members_(props.size()) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t i) {
+ std::stringstream ss;
+ ss << prop.name() << '=' << GenericToString(prop.get(obj_));
+ members_[i] = ss.str();
+ }
+
+ std::string Finish() {
+ return "{" + arrow::internal::JoinStrings(members_, ", ") + "}";
+ }
+
+ const Options& obj_;
+ std::vector<std::string> members_;
+};
+
+template <typename Options>
+struct CompareImpl {
+ template <typename Tuple>
+ CompareImpl(const Options& l, const Options& r, const Tuple& props)
+ : left_(l), right_(r) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ equal_ &= GenericEquals(prop.get(left_), prop.get(right_));
+ }
+
+ const Options& left_;
+ const Options& right_;
+ bool equal_ = true;
+};
+
+template <typename Options>
+struct ToStructScalarImpl {
+ template <typename Tuple>
+ ToStructScalarImpl(const Options& obj, const Tuple& props,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values)
+ : obj_(obj), field_names_(field_names), values_(values) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ if (!status_.ok()) return;
+ auto result = GenericToScalar(prop.get(obj_));
+ if (!result.ok()) {
+ status_ = result.status().WithMessage("Could not serialize field ", prop.name(),
+ " of options type ", Options::kTypeName, ": ",
+ result.status().message());
+ return;
+ }
+ field_names_->emplace_back(prop.name());
+ values_->push_back(result.MoveValueUnsafe());
+ }
+
+ const Options& obj_;
+ Status status_;
+ std::vector<std::string>* field_names_;
+ std::vector<std::shared_ptr<Scalar>>* values_;
+};
+
+template <typename Options>
+struct FromStructScalarImpl {
+ template <typename Tuple>
+ FromStructScalarImpl(Options* obj, const StructScalar& scalar, const Tuple& props)
+ : obj_(obj), scalar_(scalar) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ if (!status_.ok()) return;
+ auto maybe_holder = scalar_.field(std::string(prop.name()));
+ if (!maybe_holder.ok()) {
+ status_ = maybe_holder.status().WithMessage(
+ "Cannot deserialize field ", prop.name(), " of options type ",
+ Options::kTypeName, ": ", maybe_holder.status().message());
+ return;
+ }
+ auto holder = maybe_holder.MoveValueUnsafe();
+ auto result = GenericFromScalar<typename Property::Type>(holder);
+ if (!result.ok()) {
+ status_ = result.status().WithMessage("Cannot deserialize field ", prop.name(),
+ " of options type ", Options::kTypeName, ": ",
+ result.status().message());
+ return;
+ }
+ prop.set(obj_, result.MoveValueUnsafe());
+ }
+
+ Options* obj_;
+ Status status_;
+ const StructScalar& scalar_;
+};
+
+template <typename Options, typename... Properties>
+const FunctionOptionsType* GetFunctionOptionsType(const Properties&... properties) {
+ static const class OptionsType : public GenericOptionsType {
+ public:
+ explicit OptionsType(const arrow::internal::PropertyTuple<Properties...> properties)
+ : properties_(properties) {}
+
+ const char* type_name() const override { return Options::kTypeName; }
+
+ std::string Stringify(const FunctionOptions& options) const override {
+ const auto& self = checked_cast<const Options&>(options);
+ return StringifyImpl<Options>(self, properties_).Finish();
+ }
+ bool Compare(const FunctionOptions& options,
+ const FunctionOptions& other) const override {
+ const auto& lhs = checked_cast<const Options&>(options);
+ const auto& rhs = checked_cast<const Options&>(other);
+ return CompareImpl<Options>(lhs, rhs, properties_).equal_;
+ }
+ Status ToStructScalar(const FunctionOptions& options,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values) const override {
+ const auto& self = checked_cast<const Options&>(options);
+ RETURN_NOT_OK(
+ ToStructScalarImpl<Options>(self, properties_, field_names, values).status_);
+ return Status::OK();
+ }
+ Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
+ const StructScalar& scalar) const override {
+ auto options = std::unique_ptr<Options>(new Options());
+ RETURN_NOT_OK(
+ FromStructScalarImpl<Options>(options.get(), scalar, properties_).status_);
+ return std::move(options);
+ }
+
+ private:
+ const arrow::internal::PropertyTuple<Properties...> properties_;
+ } instance(arrow::internal::MakeProperties(properties...));
+ return &instance;
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
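
The header restored above is the reflection core: GenericTypeSingleton maps a C++ member type to an Arrow type, GenericToScalar/GenericFromScalar convert member values to and from scalars, and GetFunctionOptionsType stamps out a FunctionOptionsType whose Stringify, Compare, ToStructScalar and FromStructScalar all walk the same property tuple. A hedged sketch of how an options class plugs in, assuming the FunctionOptions(const FunctionOptionsType*) constructor from this tree's function.h; MyRoundOptions and its members are illustrative, and DataMember comes from reflection_internal.h:

```cpp
#include "arrow/compute/function.h"
#include "arrow/compute/function_internal.h"
#include "arrow/util/reflection_internal.h"

namespace arrow {
namespace compute {

struct MyRoundOptions : public FunctionOptions {
  MyRoundOptions();  // wires in the FunctionOptionsType built below
  static constexpr char const kTypeName[] = "MyRoundOptions";
  int64_t ndigits = 0;
  bool truncate = false;
};

namespace {
using ::arrow::internal::DataMember;
// One call derives Stringify, Compare, ToStructScalar and FromStructScalar
// for both reflected members; FromStructScalar default-constructs the
// options, then sets each property found in the struct scalar.
static auto kMyRoundOptionsType = internal::GetFunctionOptionsType<MyRoundOptions>(
    DataMember("ndigits", &MyRoundOptions::ndigits),
    DataMember("truncate", &MyRoundOptions::truncate));
}  // namespace

MyRoundOptions::MyRoundOptions() : FunctionOptions(kMyRoundOptionsType) {}

}  // namespace compute
}  // namespace arrow
```
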
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
index f131f524d2e..c730cbd131a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
@@ -59,25 +59,25 @@ Result<std::shared_ptr<ResizableBuffer>> KernelContext::AllocateBitmap(int64_t n
return result;
}
-Status Kernel::InitAll(KernelContext* ctx, const KernelInitArgs& args,
- std::vector<std::unique_ptr<KernelState>>* states) {
- for (auto& state : *states) {
- ARROW_ASSIGN_OR_RAISE(state, args.kernel->init(ctx, args));
+Status Kernel::InitAll(KernelContext* ctx, const KernelInitArgs& args,
+ std::vector<std::unique_ptr<KernelState>>* states) {
+ for (auto& state : *states) {
+ ARROW_ASSIGN_OR_RAISE(state, args.kernel->init(ctx, args));
}
- return Status::OK();
+ return Status::OK();
}
-Result<std::unique_ptr<KernelState>> ScalarAggregateKernel::MergeAll(
- const ScalarAggregateKernel* kernel, KernelContext* ctx,
- std::vector<std::unique_ptr<KernelState>> states) {
- auto out = std::move(states.back());
- states.pop_back();
- ctx->SetState(out.get());
- for (auto& state : states) {
- RETURN_NOT_OK(kernel->merge(ctx, std::move(*state), out.get()));
- }
- return std::move(out);
-}
+Result<std::unique_ptr<KernelState>> ScalarAggregateKernel::MergeAll(
+ const ScalarAggregateKernel* kernel, KernelContext* ctx,
+ std::vector<std::unique_ptr<KernelState>> states) {
+ auto out = std::move(states.back());
+ states.pop_back();
+ ctx->SetState(out.get());
+ for (auto& state : states) {
+ RETURN_NOT_OK(kernel->merge(ctx, std::move(*state), out.get()));
+ }
+ return std::move(out);
+}
// ----------------------------------------------------------------------
// Some basic TypeMatcher implementations
@@ -402,7 +402,7 @@ KernelSignature::KernelSignature(std::vector<InputType> in_types, OutputType out
out_type_(std::move(out_type)),
is_varargs_(is_varargs),
hash_code_(0) {
- DCHECK(!is_varargs || (is_varargs && (in_types_.size() >= 1)));
+ DCHECK(!is_varargs || (is_varargs && (in_types_.size() >= 1)));
}
std::shared_ptr<KernelSignature> KernelSignature::Make(std::vector<InputType> in_types,
@@ -429,8 +429,8 @@ bool KernelSignature::Equals(const KernelSignature& other) const {
bool KernelSignature::MatchesInputs(const std::vector<ValueDescr>& args) const {
if (is_varargs_) {
- for (size_t i = 0; i < args.size(); ++i) {
- if (!in_types_[std::min(i, in_types_.size() - 1)].Matches(args[i])) {
+ for (size_t i = 0; i < args.size(); ++i) {
+ if (!in_types_[std::min(i, in_types_.size() - 1)].Matches(args[i])) {
return false;
}
}
@@ -463,19 +463,19 @@ std::string KernelSignature::ToString() const {
std::stringstream ss;
if (is_varargs_) {
- ss << "varargs[";
+ ss << "varargs[";
} else {
ss << "(";
- }
- for (size_t i = 0; i < in_types_.size(); ++i) {
- if (i > 0) {
- ss << ", ";
+ }
+ for (size_t i = 0; i < in_types_.size(); ++i) {
+ if (i > 0) {
+ ss << ", ";
}
- ss << in_types_[i].ToString();
- }
- if (is_varargs_) {
- ss << "]";
- } else {
+ ss << in_types_[i].ToString();
+ }
+ if (is_varargs_) {
+ ss << "]";
+ } else {
ss << ")";
}
ss << " -> " << out_type_.ToString();
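
The MatchesInputs hunk above preserves the varargs rule: for a varargs signature the i-th argument is checked against in_types_[min(i, N-1)], so the last declared InputType is reused for every trailing argument. A small sketch of that behavior, assuming this vendored tree (KernelSignature::Make and MatchesInputs appear in the diff; ValueDescr::Array comes from datum.h, and the signature itself is illustrative):

```cpp
#include <vector>

#include "arrow/compute/kernel.h"
#include "arrow/type.h"

namespace cp = arrow::compute;

bool MatchesVarargsCall() {
  // varargs[utf8, int64]: one string argument, then any number of int64s.
  auto sig = cp::KernelSignature::Make(
      {cp::InputType(arrow::utf8()), cp::InputType(arrow::int64())},
      cp::OutputType(arrow::int64()), /*is_varargs=*/true);

  std::vector<cp::ValueDescr> args = {
      cp::ValueDescr::Array(arrow::utf8()),
      cp::ValueDescr::Array(arrow::int64()),
      cp::ValueDescr::Array(arrow::int64()),  // checked against the last type
  };
  return sig->MatchesInputs(args);  // true under the rule above
}
```
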
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
index 36d20c7289e..c90c764f5ec 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
@@ -41,7 +41,7 @@
namespace arrow {
namespace compute {
-class FunctionOptions;
+class FunctionOptions;
/// \brief Base class for opaque kernel-specific state. For example, if there
/// is some kind of initialization required.
@@ -52,7 +52,7 @@ struct ARROW_EXPORT KernelState {
/// \brief Context/state for the execution of a particular kernel.
class ARROW_EXPORT KernelContext {
public:
- explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx), state_() {}
+ explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx), state_() {}
/// \brief Allocate buffer from the context's memory pool. The contents are
/// not initialized.
@@ -91,7 +91,7 @@ class ARROW_EXPORT KernelContext {
/// into pre-allocated memory if they are able, though for some kernels
/// (e.g. in cases where a builder like StringBuilder must be employed) this may
/// not be possible.
-using ArrayKernelExec = std::function<Status(KernelContext*, const ExecBatch&, Datum*)>;
+using ArrayKernelExec = std::function<Status(KernelContext*, const ExecBatch&, Datum*)>;
/// \brief A type-checking interface to permit customizable validation rules
/// for use with InputType and KernelSignature. This is for scenarios where the
@@ -321,9 +321,9 @@ class ARROW_EXPORT OutputType {
this->resolver_ = other.resolver_;
}
- OutputType& operator=(const OutputType&) = default;
- OutputType& operator=(OutputType&&) = default;
-
+ OutputType& operator=(const OutputType&) = default;
+ OutputType& operator=(OutputType&&) = default;
+
/// \brief Return the shape and type of the expected output value of the
/// kernel given the value descriptors (shapes and types) of the input
/// arguments. The resolver may make use of state information kept in the
@@ -366,10 +366,10 @@ class ARROW_EXPORT OutputType {
/// \brief Holds the input types and output type of the kernel.
///
-/// VarArgs functions with minimum N arguments should pass up to N input types to be
-/// used to validate the input types of a function invocation. The first N-1 types
-/// will be matched against the first N-1 arguments, and the last type will be
-/// matched against the remaining arguments.
+/// VarArgs functions with minimum N arguments should pass up to N input types to be
+/// used to validate the input types of a function invocation. The first N-1 types
+/// will be matched against the first N-1 arguments, and the last type will be
+/// matched against the remaining arguments.
class ARROW_EXPORT KernelSignature {
public:
KernelSignature(std::vector<InputType> in_types, OutputType out_type,
@@ -500,8 +500,8 @@ struct KernelInitArgs {
};
/// \brief Common initializer function for all kernel types.
-using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
- KernelContext*, const KernelInitArgs&)>;
+using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
+ KernelContext*, const KernelInitArgs&)>;
/// \brief Base type for kernels. Contains the function signature and
/// optionally the state initialization function, along with some common
@@ -513,8 +513,8 @@ struct Kernel {
: signature(std::move(sig)), init(std::move(init)) {}
Kernel(std::vector<InputType> in_types, OutputType out_type, KernelInit init)
- : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
- std::move(init)) {}
+ : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init)) {}
/// \brief The "signature" of the kernel containing the InputType input
/// argument validators and OutputType output type and shape resolver.
@@ -524,10 +524,10 @@ struct Kernel {
/// set up any options or state relevant for execution.
KernelInit init;
- /// \brief Create a vector of new KernelState for invocations of this kernel.
- static Status InitAll(KernelContext*, const KernelInitArgs&,
- std::vector<std::unique_ptr<KernelState>>*);
-
+ /// \brief Create a vector of new KernelState for invocations of this kernel.
+ static Status InitAll(KernelContext*, const KernelInitArgs&,
+ std::vector<std::unique_ptr<KernelState>>*);
+
/// \brief Indicates whether execution can benefit from parallelization
/// (splitting large chunks into smaller chunks and using multiple
/// threads). Some kernels may not support parallel execution at
@@ -547,7 +547,7 @@ struct Kernel {
/// output array values (as opposed to scalar values in the case of aggregate
/// functions).
struct ArrayKernel : public Kernel {
- ArrayKernel() = default;
+ ArrayKernel() = default;
ArrayKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
KernelInit init = NULLPTR)
@@ -555,8 +555,8 @@ struct ArrayKernel : public Kernel {
ArrayKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
KernelInit init = NULLPTR)
- : Kernel(std::move(in_types), std::move(out_type), std::move(init)),
- exec(std::move(exec)) {}
+ : Kernel(std::move(in_types), std::move(out_type), std::move(init)),
+ exec(std::move(exec)) {}
/// \brief Perform a single invocation of this kernel. Depending on the
/// implementation, it may only write into preallocated memory, while in some
@@ -588,7 +588,7 @@ struct ScalarKernel : public ArrayKernel {
// VectorKernel (for VectorFunction)
/// \brief See VectorKernel::finalize member for usage
-using VectorFinalize = std::function<Status(KernelContext*, std::vector<Datum>*)>;
+using VectorFinalize = std::function<Status(KernelContext*, std::vector<Datum>*)>;
/// \brief Kernel data structure for implementations of VectorFunction. In
/// addition to the members found in ArrayKernel, contains an optional
@@ -596,10 +596,10 @@ using VectorFinalize = std::function<Status(KernelContext*, std::vector<Datum>*)
/// (which have different defaults from ScalarKernel), and some other
/// execution-related options.
struct VectorKernel : public ArrayKernel {
- VectorKernel() = default;
+ VectorKernel() = default;
VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec)
- : ArrayKernel(std::move(sig), std::move(exec)) {}
+ : ArrayKernel(std::move(sig), std::move(exec)) {}
VectorKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR)
@@ -643,13 +643,13 @@ struct VectorKernel : public ArrayKernel {
// ----------------------------------------------------------------------
// ScalarAggregateKernel (for ScalarAggregateFunction)
-using ScalarAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
+using ScalarAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
using ScalarAggregateMerge =
- std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
+ std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
// Finalize returns Datum to permit multiple return values
-using ScalarAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
+using ScalarAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
/// \brief Kernel data structure for implementations of
/// ScalarAggregateFunction. The four necessary components of an aggregation
@@ -662,12 +662,12 @@ using ScalarAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
/// * finalize: produces the end result of the aggregation using the
/// KernelState in the KernelContext.
struct ScalarAggregateKernel : public Kernel {
- ScalarAggregateKernel() = default;
+ ScalarAggregateKernel() = default;
ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
ScalarAggregateConsume consume, ScalarAggregateMerge merge,
ScalarAggregateFinalize finalize)
- : Kernel(std::move(sig), std::move(init)),
+ : Kernel(std::move(sig), std::move(init)),
consume(std::move(consume)),
merge(std::move(merge)),
finalize(std::move(finalize)) {}
@@ -675,65 +675,65 @@ struct ScalarAggregateKernel : public Kernel {
ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
KernelInit init, ScalarAggregateConsume consume,
ScalarAggregateMerge merge, ScalarAggregateFinalize finalize)
- : ScalarAggregateKernel(
- KernelSignature::Make(std::move(in_types), std::move(out_type)),
- std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
-
- /// \brief Merge a vector of KernelStates into a single KernelState.
- /// The merged state will be returned and will be set on the KernelContext.
- static Result<std::unique_ptr<KernelState>> MergeAll(
- const ScalarAggregateKernel* kernel, KernelContext* ctx,
- std::vector<std::unique_ptr<KernelState>> states);
-
+ : ScalarAggregateKernel(
+ KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
+
+ /// \brief Merge a vector of KernelStates into a single KernelState.
+ /// The merged state will be returned and will be set on the KernelContext.
+ static Result<std::unique_ptr<KernelState>> MergeAll(
+ const ScalarAggregateKernel* kernel, KernelContext* ctx,
+ std::vector<std::unique_ptr<KernelState>> states);
+
ScalarAggregateConsume consume;
ScalarAggregateMerge merge;
ScalarAggregateFinalize finalize;
};
-// ----------------------------------------------------------------------
-// HashAggregateKernel (for HashAggregateFunction)
-
-using HashAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
-
-using HashAggregateMerge =
- std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
-
-// Finalize returns Datum to permit multiple return values
-using HashAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
-
-/// \brief Kernel data structure for implementations of
-/// HashAggregateFunction. The four necessary components of an aggregation
-/// kernel are the init, consume, merge, and finalize functions.
-///
-/// * init: creates a new KernelState for a kernel.
-/// * consume: processes an ExecBatch (which includes the argument as well
-/// as an array of group identifiers) and updates the KernelState found in the
-/// KernelContext.
-/// * merge: combines one KernelState with another.
-/// * finalize: produces the end result of the aggregation using the
-/// KernelState in the KernelContext.
-struct HashAggregateKernel : public Kernel {
- HashAggregateKernel() = default;
-
- HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
- HashAggregateConsume consume, HashAggregateMerge merge,
- HashAggregateFinalize finalize)
- : Kernel(std::move(sig), std::move(init)),
- consume(std::move(consume)),
- merge(std::move(merge)),
- finalize(std::move(finalize)) {}
-
- HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
- KernelInit init, HashAggregateMerge merge,
- HashAggregateConsume consume, HashAggregateFinalize finalize)
- : HashAggregateKernel(
- KernelSignature::Make(std::move(in_types), std::move(out_type)),
- std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
-
- HashAggregateConsume consume;
- HashAggregateMerge merge;
- HashAggregateFinalize finalize;
-};
-
+// ----------------------------------------------------------------------
+// HashAggregateKernel (for HashAggregateFunction)
+
+using HashAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
+
+using HashAggregateMerge =
+ std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
+
+// Finalize returns Datum to permit multiple return values
+using HashAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
+
+/// \brief Kernel data structure for implementations of
+/// HashAggregateFunction. The four necessary components of an aggregation
+/// kernel are the init, consume, merge, and finalize functions.
+///
+/// * init: creates a new KernelState for a kernel.
+/// * consume: processes an ExecBatch (which includes the argument as well
+/// as an array of group identifiers) and updates the KernelState found in the
+/// KernelContext.
+/// * merge: combines one KernelState with another.
+/// * finalize: produces the end result of the aggregation using the
+/// KernelState in the KernelContext.
+struct HashAggregateKernel : public Kernel {
+ HashAggregateKernel() = default;
+
+ HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ HashAggregateConsume consume, HashAggregateMerge merge,
+ HashAggregateFinalize finalize)
+ : Kernel(std::move(sig), std::move(init)),
+ consume(std::move(consume)),
+ merge(std::move(merge)),
+ finalize(std::move(finalize)) {}
+
+ HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
+ KernelInit init, HashAggregateMerge merge,
+ HashAggregateConsume consume, HashAggregateFinalize finalize)
+ : HashAggregateKernel(
+ KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
+
+ HashAggregateConsume consume;
+ HashAggregateMerge merge;
+ HashAggregateFinalize finalize;
+};
+
} // namespace compute
} // namespace arrow
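
The HashAggregateKernel restored above mirrors ScalarAggregateKernel: both are defined by init, consume, merge and finalize, the hash variant's consume additionally receiving group identifiers in its ExecBatch. A sketch of assembling a scalar aggregate kernel from those four pieces, using the constructor and std::function aliases shown in the diff; CountState and the free functions are illustrative stand-ins:

```cpp
#include <memory>

#include "arrow/compute/kernel.h"
#include "arrow/type.h"

namespace cp = arrow::compute;

struct CountState : cp::KernelState {
  int64_t count = 0;
};

arrow::Result<std::unique_ptr<cp::KernelState>> Init(cp::KernelContext*,
                                                     const cp::KernelInitArgs&) {
  return std::unique_ptr<cp::KernelState>(new CountState());
}

arrow::Status Consume(cp::KernelContext* ctx, const cp::ExecBatch& batch) {
  // Fold this batch into the state held by the KernelContext.
  static_cast<CountState*>(ctx->state())->count += batch.length;
  return arrow::Status::OK();
}

arrow::Status Merge(cp::KernelContext*, cp::KernelState&& src, cp::KernelState* dst) {
  static_cast<CountState*>(dst)->count += static_cast<CountState&>(src).count;
  return arrow::Status::OK();
}

arrow::Status Finalize(cp::KernelContext* ctx, arrow::Datum* out) {
  *out = arrow::Datum(static_cast<CountState*>(ctx->state())->count);
  return arrow::Status::OK();
}

cp::ScalarAggregateKernel MakeKernel() {
  return cp::ScalarAggregateKernel({cp::InputType(arrow::int64())}, arrow::int64(),
                                   Init, Consume, Merge, Finalize);
}
```
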
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
index a7df66695b2..0c9636eae09 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -25,375 +25,375 @@
namespace arrow {
namespace compute {
-namespace {
-
-Status AggregateConsume(KernelContext* ctx, const ExecBatch& batch) {
- return checked_cast<ScalarAggregator*>(ctx->state())->Consume(ctx, batch);
-}
-
-Status AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) {
- return checked_cast<ScalarAggregator*>(dst)->MergeFrom(ctx, std::move(src));
+namespace {
+
+Status AggregateConsume(KernelContext* ctx, const ExecBatch& batch) {
+ return checked_cast<ScalarAggregator*>(ctx->state())->Consume(ctx, batch);
}
-Status AggregateFinalize(KernelContext* ctx, Datum* out) {
- return checked_cast<ScalarAggregator*>(ctx->state())->Finalize(ctx, out);
+Status AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) {
+ return checked_cast<ScalarAggregator*>(dst)->MergeFrom(ctx, std::move(src));
}
-} // namespace
-
-void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
- ScalarAggregateFunction* func, SimdLevel::type simd_level) {
- ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge,
- AggregateFinalize);
- // Set the simd level
- kernel.simd_level = simd_level;
- DCHECK_OK(func->AddKernel(kernel));
+Status AggregateFinalize(KernelContext* ctx, Datum* out) {
+ return checked_cast<ScalarAggregator*>(ctx->state())->Finalize(ctx, out);
}
-namespace aggregate {
-
+} // namespace
+
+void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ ScalarAggregateFunction* func, SimdLevel::type simd_level) {
+ ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge,
+ AggregateFinalize);
+ // Set the SIMD level
+ kernel.simd_level = simd_level;
+ DCHECK_OK(func->AddKernel(kernel));
+}
+
+namespace aggregate {
+
// ----------------------------------------------------------------------
// Count implementation
struct CountImpl : public ScalarAggregator {
- explicit CountImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- const ArrayData& input = *batch[0].array();
- const int64_t nulls = input.GetNullCount();
- this->nulls += nulls;
- this->non_nulls += input.length - nulls;
- } else {
- const Scalar& input = *batch[0].scalar();
- this->nulls += !input.is_valid * batch.length;
- this->non_nulls += input.is_valid * batch.length;
- }
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
+ explicit CountImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const ArrayData& input = *batch[0].array();
+ const int64_t nulls = input.GetNullCount();
+ this->nulls += nulls;
+ this->non_nulls += input.length - nulls;
+ } else {
+ const Scalar& input = *batch[0].scalar();
+ this->nulls += !input.is_valid * batch.length;
+ this->non_nulls += input.is_valid * batch.length;
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
const auto& other_state = checked_cast<const CountImpl&>(src);
this->non_nulls += other_state.non_nulls;
this->nulls += other_state.nulls;
- return Status::OK();
+ return Status::OK();
}
- Status Finalize(KernelContext* ctx, Datum* out) override {
+ Status Finalize(KernelContext* ctx, Datum* out) override {
const auto& state = checked_cast<const CountImpl&>(*ctx->state());
- if (state.options.skip_nulls) {
- *out = Datum(state.non_nulls);
- } else {
- *out = Datum(state.nulls);
+ if (state.options.skip_nulls) {
+ *out = Datum(state.non_nulls);
+ } else {
+ *out = Datum(state.nulls);
}
- return Status::OK();
+ return Status::OK();
}
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
int64_t non_nulls = 0;
int64_t nulls = 0;
};
-Result<std::unique_ptr<KernelState>> CountInit(KernelContext*,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> CountInit(KernelContext*,
+ const KernelInitArgs& args) {
return ::arrow::internal::make_unique<CountImpl>(
- static_cast<const ScalarAggregateOptions&>(*args.options));
+ static_cast<const ScalarAggregateOptions&>(*args.options));
}
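The two branches of CountImpl::Finalize are observable through the public API. A usage sketch, assuming the test helper arrow::ArrayFromJSON (from arrow/testing) and compute::CallFunction; neither is part of this file:

auto arr = ArrayFromJSON(int32(), "[1, null, 3]");
ScalarAggregateOptions count_nulls(/*skip_nulls=*/false);
Datum out = CallFunction("count", {arr}, &count_nulls).ValueOrDie();
// out is an Int64Scalar holding 1, the number of nulls; with the default
// options (skip_nulls=true) the same call yields 2, the non-null count.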
// ----------------------------------------------------------------------
// Sum implementation
-template <typename ArrowType>
-struct SumImplDefault : public SumImpl<ArrowType, SimdLevel::NONE> {
- explicit SumImplDefault(const ScalarAggregateOptions& options_) {
- this->options = options_;
- }
+template <typename ArrowType>
+struct SumImplDefault : public SumImpl<ArrowType, SimdLevel::NONE> {
+ explicit SumImplDefault(const ScalarAggregateOptions& options_) {
+ this->options = options_;
+ }
};
-template <typename ArrowType>
-struct MeanImplDefault : public MeanImpl<ArrowType, SimdLevel::NONE> {
- explicit MeanImplDefault(const ScalarAggregateOptions& options_) {
- this->options = options_;
- }
+template <typename ArrowType>
+struct MeanImplDefault : public MeanImpl<ArrowType, SimdLevel::NONE> {
+ explicit MeanImplDefault(const ScalarAggregateOptions& options_) {
+ this->options = options_;
+ }
};
-Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- SumLikeInit<SumImplDefault> visitor(
- ctx, *args.inputs[0].type,
- static_cast<const ScalarAggregateOptions&>(*args.options));
+Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ SumLikeInit<SumImplDefault> visitor(
+ ctx, *args.inputs[0].type,
+ static_cast<const ScalarAggregateOptions&>(*args.options));
return visitor.Create();
}
-Result<std::unique_ptr<KernelState>> MeanInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- SumLikeInit<MeanImplDefault> visitor(
- ctx, *args.inputs[0].type,
- static_cast<const ScalarAggregateOptions&>(*args.options));
+Result<std::unique_ptr<KernelState>> MeanInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ SumLikeInit<MeanImplDefault> visitor(
+ ctx, *args.inputs[0].type,
+ static_cast<const ScalarAggregateOptions&>(*args.options));
return visitor.Create();
}
// ----------------------------------------------------------------------
// MinMax implementation
-Result<std::unique_ptr<KernelState>> MinMaxInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> MinMaxInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
MinMaxInitState<SimdLevel::NONE> visitor(
ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
- static_cast<const ScalarAggregateOptions&>(*args.options));
+ static_cast<const ScalarAggregateOptions&>(*args.options));
return visitor.Create();
}
-// ----------------------------------------------------------------------
-// Any implementation
-
-struct BooleanAnyImpl : public ScalarAggregator {
- explicit BooleanAnyImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- // short-circuit if seen a True already
- if (this->any == true) {
- return Status::OK();
- }
- if (batch[0].is_scalar()) {
- const auto& scalar = *batch[0].scalar();
- this->has_nulls = !scalar.is_valid;
- this->any = scalar.is_valid && checked_cast<const BooleanScalar&>(scalar).value;
- return Status::OK();
- }
- const auto& data = *batch[0].array();
- this->has_nulls = data.GetNullCount() > 0;
- arrow::internal::OptionalBinaryBitBlockCounter counter(
- data.buffers[0], data.offset, data.buffers[1], data.offset, data.length);
- int64_t position = 0;
- while (position < data.length) {
- const auto block = counter.NextAndBlock();
- if (block.popcount > 0) {
- this->any = true;
- break;
- }
- position += block.length;
- }
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const BooleanAnyImpl&>(src);
- this->any |= other.any;
- this->has_nulls |= other.has_nulls;
- return Status::OK();
- }
-
- Status Finalize(KernelContext* ctx, Datum* out) override {
- if (!options.skip_nulls && !this->any && this->has_nulls) {
- out->value = std::make_shared<BooleanScalar>();
- } else {
- out->value = std::make_shared<BooleanScalar>(this->any);
- }
- return Status::OK();
- }
-
- bool any = false;
- bool has_nulls = false;
- ScalarAggregateOptions options;
-};
-
-Result<std::unique_ptr<KernelState>> AnyInit(KernelContext*, const KernelInitArgs& args) {
- const ScalarAggregateOptions options =
- static_cast<const ScalarAggregateOptions&>(*args.options);
- return ::arrow::internal::make_unique<BooleanAnyImpl>(
- static_cast<const ScalarAggregateOptions&>(*args.options));
-}
-
-// ----------------------------------------------------------------------
-// All implementation
-
-struct BooleanAllImpl : public ScalarAggregator {
- explicit BooleanAllImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- // short-circuit if seen a false already
- if (this->all == false) {
- return Status::OK();
- }
- // short-circuit if seen a null already
- if (!options.skip_nulls && this->has_nulls) {
- return Status::OK();
- }
- if (batch[0].is_scalar()) {
- const auto& scalar = *batch[0].scalar();
- this->has_nulls = !scalar.is_valid;
- this->all = !scalar.is_valid || checked_cast<const BooleanScalar&>(scalar).value;
- return Status::OK();
- }
- const auto& data = *batch[0].array();
- this->has_nulls = data.GetNullCount() > 0;
- arrow::internal::OptionalBinaryBitBlockCounter counter(
- data.buffers[1], data.offset, data.buffers[0], data.offset, data.length);
- int64_t position = 0;
- while (position < data.length) {
- const auto block = counter.NextOrNotBlock();
- if (!block.AllSet()) {
- this->all = false;
- break;
- }
- position += block.length;
- }
-
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const BooleanAllImpl&>(src);
- this->all &= other.all;
- this->has_nulls |= other.has_nulls;
- return Status::OK();
- }
-
- Status Finalize(KernelContext*, Datum* out) override {
- if (!options.skip_nulls && this->all && this->has_nulls) {
- out->value = std::make_shared<BooleanScalar>();
- } else {
- out->value = std::make_shared<BooleanScalar>(this->all);
- }
- return Status::OK();
- }
-
- bool all = true;
- bool has_nulls = false;
- ScalarAggregateOptions options;
-};
-
-Result<std::unique_ptr<KernelState>> AllInit(KernelContext*, const KernelInitArgs& args) {
- return ::arrow::internal::make_unique<BooleanAllImpl>(
- static_cast<const ScalarAggregateOptions&>(*args.options));
+// ----------------------------------------------------------------------
+// Any implementation
+
+struct BooleanAnyImpl : public ScalarAggregator {
+ explicit BooleanAnyImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ // Short-circuit if a true value has already been seen
+ if (this->any == true) {
+ return Status::OK();
+ }
+ if (batch[0].is_scalar()) {
+ const auto& scalar = *batch[0].scalar();
+ this->has_nulls = !scalar.is_valid;
+ this->any = scalar.is_valid && checked_cast<const BooleanScalar&>(scalar).value;
+ return Status::OK();
+ }
+ const auto& data = *batch[0].array();
+ this->has_nulls = data.GetNullCount() > 0;
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ data.buffers[0], data.offset, data.buffers[1], data.offset, data.length);
+ int64_t position = 0;
+ while (position < data.length) {
+ const auto block = counter.NextAndBlock();
+ if (block.popcount > 0) {
+ this->any = true;
+ break;
+ }
+ position += block.length;
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const BooleanAnyImpl&>(src);
+ this->any |= other.any;
+ this->has_nulls |= other.has_nulls;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext* ctx, Datum* out) override {
+ if (!options.skip_nulls && !this->any && this->has_nulls) {
+ out->value = std::make_shared<BooleanScalar>();
+ } else {
+ out->value = std::make_shared<BooleanScalar>(this->any);
+ }
+ return Status::OK();
+ }
+
+ bool any = false;
+ bool has_nulls = false;
+ ScalarAggregateOptions options;
+};
+
+Result<std::unique_ptr<KernelState>> AnyInit(KernelContext*, const KernelInitArgs& args) {
+ return ::arrow::internal::make_unique<BooleanAnyImpl>(
+ static_cast<const ScalarAggregateOptions&>(*args.options));
}
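A sketch of the Kleene behaviour encoded in BooleanAnyImpl::Finalize, under the same assumptions as the count sketch above (ArrayFromJSON and CallFunction are helpers outside this file):

auto arr = ArrayFromJSON(boolean(), "[false, null]");
ScalarAggregateOptions opts(/*skip_nulls=*/false);
Datum out = CallFunction("any", {arr}, &opts).ValueOrDie();
// No true was seen and a null was present, so out is a null BooleanScalar;
// "[true, null]" would yield true, since a single true decides the result.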
-// ----------------------------------------------------------------------
-// Index implementation
-
-template <typename ArgType>
-struct IndexImpl : public ScalarAggregator {
- using ArgValue = typename internal::GetViewType<ArgType>::T;
-
- explicit IndexImpl(IndexOptions options, KernelState* raw_state)
- : options(std::move(options)), seen(0), index(-1) {
- if (auto state = static_cast<IndexImpl<ArgType>*>(raw_state)) {
- seen = state->seen;
- index = state->index;
- }
- }
-
- Status Consume(KernelContext* ctx, const ExecBatch& batch) override {
- // short-circuit
- if (index >= 0 || !options.value->is_valid) {
- return Status::OK();
- }
-
- auto input = batch[0].array();
- seen = input->length;
- const ArgValue desired = internal::UnboxScalar<ArgType>::Unbox(*options.value);
- int64_t i = 0;
-
- ARROW_UNUSED(internal::VisitArrayValuesInline<ArgType>(
- *input,
- [&](ArgValue v) -> Status {
- if (v == desired) {
- index = i;
- return Status::Cancelled("Found");
- } else {
- ++i;
- return Status::OK();
- }
- },
- [&]() -> Status {
- ++i;
- return Status::OK();
- }));
-
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const IndexImpl&>(src);
- if (index < 0 && other.index >= 0) {
- index = seen + other.index;
- }
- seen += other.seen;
- return Status::OK();
- }
-
- Status Finalize(KernelContext*, Datum* out) override {
- out->value = std::make_shared<Int64Scalar>(index >= 0 ? index : -1);
- return Status::OK();
- }
-
- const IndexOptions options;
- int64_t seen = 0;
- int64_t index = -1;
-};
-
-struct IndexInit {
- std::unique_ptr<KernelState> state;
- KernelContext* ctx;
- const IndexOptions& options;
- const DataType& type;
-
- IndexInit(KernelContext* ctx, const IndexOptions& options, const DataType& type)
- : ctx(ctx), options(options), type(type) {}
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("Index kernel not implemented for ", type.ToString());
- }
-
- Status Visit(const BooleanType&) {
- state.reset(new IndexImpl<BooleanType>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_number<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_base_binary<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_date<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_time<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_timestamp<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(type, this));
- return std::move(state);
- }
-
- static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
- const KernelInitArgs& args) {
- IndexInit visitor(ctx, static_cast<const IndexOptions&>(*args.options),
- *args.inputs[0].type);
- return visitor.Create();
- }
-};
-
+// ----------------------------------------------------------------------
+// All implementation
+
+struct BooleanAllImpl : public ScalarAggregator {
+ explicit BooleanAllImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ // Short-circuit if a false value has already been seen
+ if (this->all == false) {
+ return Status::OK();
+ }
+ // Short-circuit if a null has already been seen
+ if (!options.skip_nulls && this->has_nulls) {
+ return Status::OK();
+ }
+ if (batch[0].is_scalar()) {
+ const auto& scalar = *batch[0].scalar();
+ this->has_nulls = !scalar.is_valid;
+ this->all = !scalar.is_valid || checked_cast<const BooleanScalar&>(scalar).value;
+ return Status::OK();
+ }
+ const auto& data = *batch[0].array();
+ this->has_nulls = data.GetNullCount() > 0;
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ data.buffers[1], data.offset, data.buffers[0], data.offset, data.length);
+ int64_t position = 0;
+ while (position < data.length) {
+ const auto block = counter.NextOrNotBlock();
+ if (!block.AllSet()) {
+ this->all = false;
+ break;
+ }
+ position += block.length;
+ }
+
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const BooleanAllImpl&>(src);
+ this->all &= other.all;
+ this->has_nulls |= other.has_nulls;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (!options.skip_nulls && this->all && this->has_nulls) {
+ out->value = std::make_shared<BooleanScalar>();
+ } else {
+ out->value = std::make_shared<BooleanScalar>(this->all);
+ }
+ return Status::OK();
+ }
+
+ bool all = true;
+ bool has_nulls = false;
+ ScalarAggregateOptions options;
+};
+
+Result<std::unique_ptr<KernelState>> AllInit(KernelContext*, const KernelInitArgs& args) {
+ return ::arrow::internal::make_unique<BooleanAllImpl>(
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+}
+
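The dual case for "all", under the same assumptions as the "any" sketch:

auto arr = ArrayFromJSON(boolean(), "[true, null]");
ScalarAggregateOptions opts(/*skip_nulls=*/false);
Datum out = CallFunction("all", {arr}, &opts).ValueOrDie();
// No false was seen but a null was present, so out is a null BooleanScalar;
// "[false, null]" would yield false, since a single false decides the result.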
+// ----------------------------------------------------------------------
+// Index implementation
+
+template <typename ArgType>
+struct IndexImpl : public ScalarAggregator {
+ using ArgValue = typename internal::GetViewType<ArgType>::T;
+
+ explicit IndexImpl(IndexOptions options, KernelState* raw_state)
+ : options(std::move(options)), seen(0), index(-1) {
+ if (auto state = static_cast<IndexImpl<ArgType>*>(raw_state)) {
+ seen = state->seen;
+ index = state->index;
+ }
+ }
+
+ Status Consume(KernelContext* ctx, const ExecBatch& batch) override {
+ // Short-circuit if a match was already found or the search value is null
+ if (index >= 0 || !options.value->is_valid) {
+ return Status::OK();
+ }
+
+ auto input = batch[0].array();
+ seen = input->length;
+ const ArgValue desired = internal::UnboxScalar<ArgType>::Unbox(*options.value);
+ int64_t i = 0;
+
+ ARROW_UNUSED(internal::VisitArrayValuesInline<ArgType>(
+ *input,
+ [&](ArgValue v) -> Status {
+ if (v == desired) {
+ index = i;
+ return Status::Cancelled("Found");
+ } else {
+ ++i;
+ return Status::OK();
+ }
+ },
+ [&]() -> Status {
+ ++i;
+ return Status::OK();
+ }));
+
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const IndexImpl&>(src);
+ if (index < 0 && other.index >= 0) {
+ index = seen + other.index;
+ }
+ seen += other.seen;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ out->value = std::make_shared<Int64Scalar>(index >= 0 ? index : -1);
+ return Status::OK();
+ }
+
+ const IndexOptions options;
+ int64_t seen = 0;
+ int64_t index = -1;
+};
+
+struct IndexInit {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const IndexOptions& options;
+ const DataType& type;
+
+ IndexInit(KernelContext* ctx, const IndexOptions& options, const DataType& type)
+ : ctx(ctx), options(options), type(type) {}
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Index kernel not implemented for ", type.ToString());
+ }
+
+ Status Visit(const BooleanType&) {
+ state.reset(new IndexImpl<BooleanType>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_number<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_base_binary<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_date<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_time<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_timestamp<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ return std::move(state);
+ }
+
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ IndexInit visitor(ctx, static_cast<const IndexOptions&>(*args.options),
+ *args.inputs[0].type);
+ return visitor.Create();
+ }
+};
+
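Note that IndexImpl::Consume stops early by returning Status::Cancelled("Found") from the visitor and discarding the status via ARROW_UNUSED: a sentinel, not a real error. A usage sketch under the same assumptions as above (ArrayFromJSON and CallFunction; MakeScalar is the public scalar factory):

auto arr = ArrayFromJSON(int32(), "[4, 5, 6, 5]");
IndexOptions opts(MakeScalar(int32_t{5}));
Datum out = CallFunction("index", {arr}, &opts).ValueOrDie();
// out is an Int64Scalar holding 1, the first position where 5 occurs;
// the result is -1 when the value is absent or the search value is null.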
void AddBasicAggKernels(KernelInit init,
const std::vector<std::shared_ptr<DataType>>& types,
std::shared_ptr<DataType> out_ty, ScalarAggregateFunction* func,
@@ -405,33 +405,33 @@ void AddBasicAggKernels(KernelInit init,
}
}
-void AddScalarAggKernels(KernelInit init,
- const std::vector<std::shared_ptr<DataType>>& types,
- std::shared_ptr<DataType> out_ty,
- ScalarAggregateFunction* func) {
- for (const auto& ty : types) {
- // scalar[InT] -> scalar[OutT]
- auto sig = KernelSignature::Make({InputType::Scalar(ty)}, ValueDescr::Scalar(out_ty));
- AddAggKernel(std::move(sig), init, func, SimdLevel::NONE);
- }
-}
-
-void AddArrayScalarAggKernels(KernelInit init,
- const std::vector<std::shared_ptr<DataType>>& types,
- std::shared_ptr<DataType> out_ty,
- ScalarAggregateFunction* func,
- SimdLevel::type simd_level = SimdLevel::NONE) {
- AddBasicAggKernels(init, types, out_ty, func, simd_level);
- AddScalarAggKernels(init, types, out_ty, func);
-}
-
+void AddScalarAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty,
+ ScalarAggregateFunction* func) {
+ for (const auto& ty : types) {
+ // scalar[InT] -> scalar[OutT]
+ auto sig = KernelSignature::Make({InputType::Scalar(ty)}, ValueDescr::Scalar(out_ty));
+ AddAggKernel(std::move(sig), init, func, SimdLevel::NONE);
+ }
+}
+
+void AddArrayScalarAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty,
+ ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE) {
+ AddBasicAggKernels(init, types, out_ty, func, simd_level);
+ AddScalarAggKernels(init, types, out_ty, func);
+}
+
void AddMinMaxKernels(KernelInit init,
const std::vector<std::shared_ptr<DataType>>& types,
ScalarAggregateFunction* func, SimdLevel::type simd_level) {
for (const auto& ty : types) {
- // any[T] -> scalar[struct<min: T, max: T>]
+ // any[T] -> scalar[struct<min: T, max: T>]
auto out_ty = struct_({field("min", ty), field("max", ty)});
- auto sig = KernelSignature::Make({InputType(ty)}, ValueDescr::Scalar(out_ty));
+ auto sig = KernelSignature::Make({InputType(ty)}, ValueDescr::Scalar(out_ty));
AddAggKernel(std::move(sig), init, func, simd_level);
}
}
@@ -439,92 +439,92 @@ void AddMinMaxKernels(KernelInit init,
} // namespace aggregate
namespace internal {
-namespace {
-
-const FunctionDoc count_doc{"Count the number of null / non-null values",
- ("By default, only non-null values are counted.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc sum_doc{
- "Compute the sum of a numeric array",
- ("Null values are ignored by default. Minimum count of non-null\n"
- "values can be set and null is returned if too few are present.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc mean_doc{
- "Compute the mean of a numeric array",
- ("Null values are ignored by default. Minimum count of non-null\n"
- "values can be set and null is returned if too few are "
- "present.\nThis can be changed through ScalarAggregateOptions.\n"
- "The result is always computed as a double, regardless of the input types."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array",
- ("Null values are ignored by default.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true",
- ("Null values are ignored by default.\n"
- "If null values are taken into account by setting "
- "ScalarAggregateOptions parameter skip_nulls = false then "
- "Kleene logic is used.\n"
- "See KleeneOr for more details on Kleene logic."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true",
- ("Null values are ignored by default.\n"
- "If null values are taken into account by setting "
- "ScalarAggregateOptions parameter skip_nulls = false then "
- "Kleene logic is used.\n"
- "See KleeneAnd for more details on Kleene logic."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc index_doc{"Find the index of the first occurrence of a given value",
- ("The result is always computed as an int64_t, regardless\n"
- "of the offset type of the input array."),
- {"array"},
- "IndexOptions"};
-
-} // namespace
-
+namespace {
+
+const FunctionDoc count_doc{"Count the number of null / non-null values",
+ ("By default, only non-null values are counted.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc sum_doc{
+ "Compute the sum of a numeric array",
+ ("Null values are ignored by default. Minimum count of non-null\n"
+ "values can be set and null is returned if too few are present.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc mean_doc{
+ "Compute the mean of a numeric array",
+ ("Null values are ignored by default. Minimum count of non-null\n"
+ "values can be set and null is returned if too few are "
+ "present.\nThis can be changed through ScalarAggregateOptions.\n"
+ "The result is always computed as a double, regardless of the input types."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array",
+ ("Null values are ignored by default.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true",
+ ("Null values are ignored by default.\n"
+ "If null values are taken into account by setting "
+ "ScalarAggregateOptions parameter skip_nulls = false then "
+ "Kleene logic is used.\n"
+ "See KleeneOr for more details on Kleene logic."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true",
+ ("Null values are ignored by default.\n"
+ "If null values are taken into account by setting "
+ "ScalarAggregateOptions parameter skip_nulls = false then "
+ "Kleene logic is used.\n"
+ "See KleeneAnd for more details on Kleene logic."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc index_doc{"Find the index of the first occurrence of a given value",
+ ("The result is always computed as an int64_t, regardless\n"
+ "of the offset type of the input array."),
+ {"array"},
+ "IndexOptions"};
+
+} // namespace
+
void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
- static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
-
- auto func = std::make_shared<ScalarAggregateFunction>(
- "count", Arity::Unary(), &count_doc, &default_scalar_aggregate_options);
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "count", Arity::Unary(), &count_doc, &default_scalar_aggregate_options);
+
// Takes any array input, outputs int64 scalar
InputType any_array(ValueDescr::ARRAY);
- AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())),
- aggregate::CountInit, func.get());
- AddAggKernel(
- KernelSignature::Make({InputType(ValueDescr::SCALAR)}, ValueDescr::Scalar(int64())),
- aggregate::CountInit, func.get());
+ AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())),
+ aggregate::CountInit, func.get());
+ AddAggKernel(
+ KernelSignature::Make({InputType(ValueDescr::SCALAR)}, ValueDescr::Scalar(int64())),
+ aggregate::CountInit, func.get());
DCHECK_OK(registry->AddFunction(std::move(func)));
- func = std::make_shared<ScalarAggregateFunction>("sum", Arity::Unary(), &sum_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, {boolean()}, int64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, SignedIntTypes(), int64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, UnsignedIntTypes(), uint64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, FloatingPointTypes(), float64(),
- func.get());
+ func = std::make_shared<ScalarAggregateFunction>("sum", Arity::Unary(), &sum_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, {boolean()}, int64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, SignedIntTypes(), int64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, UnsignedIntTypes(), uint64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, FloatingPointTypes(), float64(),
+ func.get());
// Add the SIMD variants for sum
-#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
auto cpu_info = arrow::internal::CpuInfo::GetInstance();
-#endif
+#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
aggregate::AddSumAvx2AggKernels(func.get());
@@ -537,12 +537,12 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
#endif
DCHECK_OK(registry->AddFunction(std::move(func)));
- func = std::make_shared<ScalarAggregateFunction>("mean", Arity::Unary(), &mean_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, {boolean()}, float64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, NumericTypes(), float64(),
- func.get());
+ func = std::make_shared<ScalarAggregateFunction>("mean", Arity::Unary(), &mean_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, {boolean()}, float64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, NumericTypes(), float64(),
+ func.get());
// Add the SIMD variants for mean
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
@@ -556,8 +556,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
#endif
DCHECK_OK(registry->AddFunction(std::move(func)));
- func = std::make_shared<ScalarAggregateFunction>(
- "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options);
+ func = std::make_shared<ScalarAggregateFunction>(
+ "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options);
aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get());
aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get());
// Add the SIMD variants for min max
@@ -574,29 +574,29 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
- // any
- func = std::make_shared<ScalarAggregateFunction>("any", Arity::Unary(), &any_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::AnyInit, {boolean()}, boolean(),
- func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
-
- // all
- func = std::make_shared<ScalarAggregateFunction>("all", Arity::Unary(), &all_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::AllInit, {boolean()}, boolean(),
- func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
-
- // index
- func = std::make_shared<ScalarAggregateFunction>("index", Arity::Unary(), &index_doc);
- aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, BaseBinaryTypes(), int64(),
- func.get());
- aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, PrimitiveTypes(), int64(),
- func.get());
- aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, TemporalTypes(), int64(),
- func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
+ // any
+ func = std::make_shared<ScalarAggregateFunction>("any", Arity::Unary(), &any_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::AnyInit, {boolean()}, boolean(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // all
+ func = std::make_shared<ScalarAggregateFunction>("all", Arity::Unary(), &all_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::AllInit, {boolean()}, boolean(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // index
+ func = std::make_shared<ScalarAggregateFunction>("index", Arity::Unary(), &index_doc);
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, BaseBinaryTypes(), int64(),
+ func.get());
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, PrimitiveTypes(), int64(),
+ func.get());
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, TemporalTypes(), int64(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
}
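Once this registration has run (the default registry invokes it at startup), the kernels are reachable by name. A sketch, again assuming ArrayFromJSON and CallFunction:

auto arr = ArrayFromJSON(int8(), "[1, 2, 3, null]");
Datum sum = CallFunction("sum", {arr}).ValueOrDie();
// sum is an Int64Scalar holding 6: signed integer inputs accumulate as int64.
Datum mean = CallFunction("mean", {arr}).ValueOrDie();
// mean is a DoubleScalar holding 2.0: means are always computed as double.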
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
index 5163d3fd03d..4b1ae8d3d6c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
@@ -51,68 +51,68 @@ void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func);
// ----------------------------------------------------------------------
// Sum implementation
-template <typename ArrowType, SimdLevel::type SimdLevel>
-struct SumImpl : public ScalarAggregator {
- using ThisType = SumImpl<ArrowType, SimdLevel>;
- using CType = typename ArrowType::c_type;
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct SumImpl : public ScalarAggregator {
+ using ThisType = SumImpl<ArrowType, SimdLevel>;
+ using CType = typename ArrowType::c_type;
using SumType = typename FindAccumulatorType<ArrowType>::Type;
- using OutputType = typename TypeTraits<SumType>::ScalarType;
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- const auto& data = batch[0].array();
- this->count += data->length - data->GetNullCount();
- if (is_boolean_type<ArrowType>::value) {
- this->sum +=
- static_cast<typename SumType::c_type>(BooleanArray(data).true_count());
- } else {
- this->sum +=
- arrow::compute::detail::SumArray<CType, typename SumType::c_type, SimdLevel>(
- *data);
- }
+ using OutputType = typename TypeTraits<SumType>::ScalarType;
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const auto& data = batch[0].array();
+ this->count += data->length - data->GetNullCount();
+ if (is_boolean_type<ArrowType>::value) {
+ this->sum +=
+ static_cast<typename SumType::c_type>(BooleanArray(data).true_count());
+ } else {
+ this->sum +=
+ arrow::compute::detail::SumArray<CType, typename SumType::c_type, SimdLevel>(
+ *data);
+ }
} else {
- const auto& data = *batch[0].scalar();
- this->count += data.is_valid * batch.length;
- if (data.is_valid) {
- this->sum += internal::UnboxScalar<ArrowType>::Unbox(data) * batch.length;
+ const auto& data = *batch[0].scalar();
+ this->count += data.is_valid * batch.length;
+ if (data.is_valid) {
+ this->sum += internal::UnboxScalar<ArrowType>::Unbox(data) * batch.length;
}
}
- return Status::OK();
+ return Status::OK();
}
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const ThisType&>(src);
- this->count += other.count;
- this->sum += other.sum;
- return Status::OK();
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ThisType&>(src);
+ this->count += other.count;
+ this->sum += other.sum;
+ return Status::OK();
}
- Status Finalize(KernelContext*, Datum* out) override {
- if (this->count < options.min_count) {
- out->value = std::make_shared<OutputType>();
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (this->count < options.min_count) {
+ out->value = std::make_shared<OutputType>();
} else {
- out->value = MakeScalar(this->sum);
+ out->value = MakeScalar(this->sum);
}
- return Status::OK();
+ return Status::OK();
}
size_t count = 0;
typename SumType::c_type sum = 0;
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
};
-template <typename ArrowType, SimdLevel::type SimdLevel>
-struct MeanImpl : public SumImpl<ArrowType, SimdLevel> {
- Status Finalize(KernelContext*, Datum* out) override {
- if (this->count < options.min_count) {
- out->value = std::make_shared<DoubleScalar>();
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MeanImpl : public SumImpl<ArrowType, SimdLevel> {
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (this->count < options.min_count) {
+ out->value = std::make_shared<DoubleScalar>();
} else {
- const double mean = static_cast<double>(this->sum) / this->count;
- out->value = std::make_shared<DoubleScalar>(mean);
+ const double mean = static_cast<double>(this->sum) / this->count;
+ out->value = std::make_shared<DoubleScalar>(mean);
}
- return Status::OK();
+ return Status::OK();
}
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
};
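A sketch of the min_count gate in MeanImpl::Finalize, under the same ArrayFromJSON / CallFunction assumptions as before:

auto arr = ArrayFromJSON(int32(), "[1, 2, null]");
ScalarAggregateOptions opts(/*skip_nulls=*/true, /*min_count=*/3);
Datum out = CallFunction("mean", {arr}, &opts).ValueOrDie();
// Only two non-null values were seen, below min_count, so out is a null
// DoubleScalar; with the default min_count=1 the result would be 1.5.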
template <template <typename> class KernelClass>
@@ -120,11 +120,11 @@ struct SumLikeInit {
std::unique_ptr<KernelState> state;
KernelContext* ctx;
const DataType& type;
- const ScalarAggregateOptions& options;
+ const ScalarAggregateOptions& options;
- SumLikeInit(KernelContext* ctx, const DataType& type,
- const ScalarAggregateOptions& options)
- : ctx(ctx), type(type), options(options) {}
+ SumLikeInit(KernelContext* ctx, const DataType& type,
+ const ScalarAggregateOptions& options)
+ : ctx(ctx), type(type), options(options) {}
Status Visit(const DataType&) { return Status::NotImplemented("No sum implemented"); }
@@ -133,18 +133,18 @@ struct SumLikeInit {
}
Status Visit(const BooleanType&) {
- state.reset(new KernelClass<BooleanType>(options));
+ state.reset(new KernelClass<BooleanType>(options));
return Status::OK();
}
template <typename Type>
enable_if_number<Type, Status> Visit(const Type&) {
- state.reset(new KernelClass<Type>(options));
+ state.reset(new KernelClass<Type>(options));
return Status::OK();
}
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(type, this));
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
return std::move(state);
}
};
@@ -233,42 +233,42 @@ struct MinMaxImpl : public ScalarAggregator {
using ThisType = MinMaxImpl<ArrowType, SimdLevel>;
using StateType = MinMaxState<ArrowType, SimdLevel>;
- MinMaxImpl(const std::shared_ptr<DataType>& out_type,
- const ScalarAggregateOptions& options)
+ MinMaxImpl(const std::shared_ptr<DataType>& out_type,
+ const ScalarAggregateOptions& options)
: out_type(out_type), options(options) {}
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- return ConsumeArray(ArrayType(batch[0].array()));
- }
- return ConsumeScalar(*batch[0].scalar());
- }
-
- Status ConsumeScalar(const Scalar& scalar) {
- StateType local;
- local.has_nulls = !scalar.is_valid;
- local.has_values = scalar.is_valid;
-
- if (local.has_nulls && !options.skip_nulls) {
- this->state = local;
- return Status::OK();
- }
-
- local.MergeOne(internal::UnboxScalar<ArrowType>::Unbox(scalar));
- this->state = local;
- return Status::OK();
- }
-
- Status ConsumeArray(const ArrayType& arr) {
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ return ConsumeArray(ArrayType(batch[0].array()));
+ }
+ return ConsumeScalar(*batch[0].scalar());
+ }
+
+ Status ConsumeScalar(const Scalar& scalar) {
StateType local;
-
+ local.has_nulls = !scalar.is_valid;
+ local.has_values = scalar.is_valid;
+
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ local.MergeOne(internal::UnboxScalar<ArrowType>::Unbox(scalar));
+ this->state = local;
+ return Status::OK();
+ }
+
+ Status ConsumeArray(const ArrayType& arr) {
+ StateType local;
+
const auto null_count = arr.null_count();
local.has_nulls = null_count > 0;
local.has_values = (arr.length() - null_count) > 0;
- if (local.has_nulls && !options.skip_nulls) {
+ if (local.has_nulls && !options.skip_nulls) {
this->state = local;
- return Status::OK();
+ return Status::OK();
}
if (local.has_nulls) {
@@ -279,32 +279,32 @@ struct MinMaxImpl : public ScalarAggregator {
}
}
this->state = local;
- return Status::OK();
+ return Status::OK();
}
- Status MergeFrom(KernelContext*, KernelState&& src) override {
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
const auto& other = checked_cast<const ThisType&>(src);
this->state += other.state;
- return Status::OK();
+ return Status::OK();
}
- Status Finalize(KernelContext*, Datum* out) override {
+ Status Finalize(KernelContext*, Datum* out) override {
using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
std::vector<std::shared_ptr<Scalar>> values;
- if (!state.has_values || (state.has_nulls && !options.skip_nulls)) {
+ if (!state.has_values || (state.has_nulls && !options.skip_nulls)) {
// (null, null)
values = {std::make_shared<ScalarType>(), std::make_shared<ScalarType>()};
} else {
values = {std::make_shared<ScalarType>(state.min),
std::make_shared<ScalarType>(state.max)};
}
- out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
- return Status::OK();
+ out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
+ return Status::OK();
}
std::shared_ptr<DataType> out_type;
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
MinMaxState<ArrowType, SimdLevel> state;
private:
@@ -373,10 +373,10 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
using MinMaxImpl<BooleanType, SimdLevel>::options;
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (ARROW_PREDICT_FALSE(batch[0].is_scalar())) {
- return ConsumeScalar(checked_cast<const BooleanScalar&>(*batch[0].scalar()));
- }
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (ARROW_PREDICT_FALSE(batch[0].is_scalar())) {
+ return ConsumeScalar(checked_cast<const BooleanScalar&>(*batch[0].scalar()));
+ }
StateType local;
ArrayType arr(batch[0].array());
@@ -386,9 +386,9 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
local.has_nulls = null_count > 0;
local.has_values = valid_count > 0;
- if (local.has_nulls && !options.skip_nulls) {
+ if (local.has_nulls && !options.skip_nulls) {
this->state = local;
- return Status::OK();
+ return Status::OK();
}
const auto true_count = arr.true_count();
@@ -397,27 +397,27 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
local.min = false_count == 0;
this->state = local;
- return Status::OK();
- }
-
- Status ConsumeScalar(const BooleanScalar& scalar) {
- StateType local;
-
- local.has_nulls = !scalar.is_valid;
- local.has_values = scalar.is_valid;
- if (local.has_nulls && !options.skip_nulls) {
- this->state = local;
- return Status::OK();
- }
-
- const int true_count = scalar.is_valid && scalar.value;
- const int false_count = scalar.is_valid && !scalar.value;
- local.max = true_count > 0;
- local.min = false_count == 0;
-
- this->state = local;
- return Status::OK();
+ return Status::OK();
}
+
+ Status ConsumeScalar(const BooleanScalar& scalar) {
+ StateType local;
+
+ local.has_nulls = !scalar.is_valid;
+ local.has_values = scalar.is_valid;
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ const int true_count = scalar.is_valid && scalar.value;
+ const int false_count = scalar.is_valid && !scalar.value;
+ local.max = true_count > 0;
+ local.min = false_count == 0;
+
+ this->state = local;
+ return Status::OK();
+ }
};
template <SimdLevel::type SimdLevel>
@@ -426,11 +426,11 @@ struct MinMaxInitState {
KernelContext* ctx;
const DataType& in_type;
const std::shared_ptr<DataType>& out_type;
- const ScalarAggregateOptions& options;
+ const ScalarAggregateOptions& options;
MinMaxInitState(KernelContext* ctx, const DataType& in_type,
- const std::shared_ptr<DataType>& out_type,
- const ScalarAggregateOptions& options)
+ const std::shared_ptr<DataType>& out_type,
+ const ScalarAggregateOptions& options)
: ctx(ctx), in_type(in_type), out_type(out_type), options(options) {}
Status Visit(const DataType&) {
@@ -452,8 +452,8 @@ struct MinMaxInitState {
return Status::OK();
}
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
return std::move(state);
}
};
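A sketch of the struct result assembled in MinMaxImpl::Finalize (ArrayFromJSON and CallFunction assumed, as before):

auto arr = ArrayFromJSON(int32(), "[5, 1, null, 9]");
Datum out = CallFunction("min_max", {arr}).ValueOrDie();
// out is a StructScalar {min: 1, max: 9}; with skip_nulls=false the null
// would force both fields to null, per the has_nulls check above.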
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
index ed29f26f2c3..d72cdb14941 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
@@ -19,8 +19,8 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/logging.h"
namespace arrow {
namespace compute {
@@ -49,124 +49,124 @@ struct FindAccumulatorType<I, enable_if_floating_point<I>> {
using Type = DoubleType;
};
-struct ScalarAggregator : public KernelState {
- virtual Status Consume(KernelContext* ctx, const ExecBatch& batch) = 0;
- virtual Status MergeFrom(KernelContext* ctx, KernelState&& src) = 0;
- virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
-};
-
-void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
- ScalarAggregateFunction* func,
- SimdLevel::type simd_level = SimdLevel::NONE);
-
-namespace detail {
-
-using arrow::internal::VisitSetBitRunsVoid;
-
-// SumArray must be parameterized with the SIMD level since it's called both from
-// translation units with and without vectorization. Normally it gets inlined but
-// if not, without the parameter, we'll have multiple definitions of the same
-// symbol and we'll get unexpected results.
-
-// non-recursive pairwise summation for floating points
-// https://en.wikipedia.org/wiki/Pairwise_summation
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
- typename ValueFunc>
-enable_if_t<std::is_floating_point<SumType>::value, SumType> SumArray(
- const ArrayData& data, ValueFunc&& func) {
- const int64_t data_size = data.length - data.GetNullCount();
- if (data_size == 0) {
- return 0;
- }
-
- // number of inputs to accumulate before merging with another block
- constexpr int kBlockSize = 16; // same as numpy
- // levels (tree depth) = ceil(log2(len)) + 1, a bit larger than necessary
- const int levels = BitUtil::Log2(static_cast<uint64_t>(data_size)) + 1;
- // temporary summation per level
- std::vector<SumType> sum(levels);
- // whether two summations are ready and should be reduced to upper level
- // one bit for each level, bit0 -> level0, ...
- uint64_t mask = 0;
- // level of root node holding the final summation
- int root_level = 0;
-
- // reduce summation of one block (may be smaller than kBlockSize) from leaf node
- // continue reducing to upper level if two summations are ready for non-leaf node
- auto reduce = [&](SumType block_sum) {
- int cur_level = 0;
- uint64_t cur_level_mask = 1ULL;
- sum[cur_level] += block_sum;
- mask ^= cur_level_mask;
- while ((mask & cur_level_mask) == 0) {
- block_sum = sum[cur_level];
- sum[cur_level] = 0;
- ++cur_level;
- DCHECK_LT(cur_level, levels);
- cur_level_mask <<= 1;
- sum[cur_level] += block_sum;
- mask ^= cur_level_mask;
- }
- root_level = std::max(root_level, cur_level);
- };
-
- const ValueType* values = data.GetValues<ValueType>(1);
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- const ValueType* v = &values[pos];
- // unsigned division by constant is cheaper than signed one
- const uint64_t blocks = static_cast<uint64_t>(len) / kBlockSize;
- const uint64_t remains = static_cast<uint64_t>(len) % kBlockSize;
-
- for (uint64_t i = 0; i < blocks; ++i) {
- SumType block_sum = 0;
- for (int j = 0; j < kBlockSize; ++j) {
- block_sum += func(v[j]);
- }
- reduce(block_sum);
- v += kBlockSize;
- }
-
- if (remains > 0) {
- SumType block_sum = 0;
- for (uint64_t i = 0; i < remains; ++i) {
- block_sum += func(v[i]);
- }
- reduce(block_sum);
- }
- });
-
- // reduce intermediate summations from all non-leaf nodes
- for (int i = 1; i <= root_level; ++i) {
- sum[i] += sum[i - 1];
- }
-
- return sum[root_level];
-}
-
-// naive summation for integers
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
- typename ValueFunc>
-enable_if_t<!std::is_floating_point<SumType>::value, SumType> SumArray(
- const ArrayData& data, ValueFunc&& func) {
- SumType sum = 0;
- const ValueType* values = data.GetValues<ValueType>(1);
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- sum += func(values[pos + i]);
- }
- });
- return sum;
-}
-
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel>
-SumType SumArray(const ArrayData& data) {
- return SumArray<ValueType, SumType, SimdLevel>(
- data, [](ValueType v) { return static_cast<SumType>(v); });
-}
-
-} // namespace detail
-
+struct ScalarAggregator : public KernelState {
+ virtual Status Consume(KernelContext* ctx, const ExecBatch& batch) = 0;
+ virtual Status MergeFrom(KernelContext* ctx, KernelState&& src) = 0;
+ virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
+};
+
+void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE);
+
+namespace detail {
+
+using arrow::internal::VisitSetBitRunsVoid;
+
+// SumArray must be parameterized with the SIMD level since it's called both from
+// translation units with and without vectorization. Normally it gets inlined but
+// if not, without the parameter, we'll have multiple definitions of the same
+// symbol and we'll get unexpected results.
+
+// non-recursive pairwise summation for floating-point types
+// https://en.wikipedia.org/wiki/Pairwise_summation
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+ typename ValueFunc>
+enable_if_t<std::is_floating_point<SumType>::value, SumType> SumArray(
+ const ArrayData& data, ValueFunc&& func) {
+ const int64_t data_size = data.length - data.GetNullCount();
+ if (data_size == 0) {
+ return 0;
+ }
+
+ // number of inputs to accumulate before merging with another block
+ constexpr int kBlockSize = 16; // same as numpy
+ // levels (tree depth) = ceil(log2(len)) + 1, a bit larger than necessary
+ const int levels = BitUtil::Log2(static_cast<uint64_t>(data_size)) + 1;
+ // temporary summation per level
+ std::vector<SumType> sum(levels);
+ // whether two summations are ready and should be reduced to upper level
+ // one bit for each level, bit0 -> level0, ...
+ uint64_t mask = 0;
+ // level of root node holding the final summation
+ int root_level = 0;
+
+ // reduce summation of one block (may be smaller than kBlockSize) from leaf node
+ // continue reducing to upper level if two summations are ready for non-leaf node
+ auto reduce = [&](SumType block_sum) {
+ int cur_level = 0;
+ uint64_t cur_level_mask = 1ULL;
+ sum[cur_level] += block_sum;
+ mask ^= cur_level_mask;
+ while ((mask & cur_level_mask) == 0) {
+ block_sum = sum[cur_level];
+ sum[cur_level] = 0;
+ ++cur_level;
+ DCHECK_LT(cur_level, levels);
+ cur_level_mask <<= 1;
+ sum[cur_level] += block_sum;
+ mask ^= cur_level_mask;
+ }
+ root_level = std::max(root_level, cur_level);
+ };
+
+ const ValueType* values = data.GetValues<ValueType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ const ValueType* v = &values[pos];
+ // unsigned division by a constant is cheaper than signed division
+ const uint64_t blocks = static_cast<uint64_t>(len) / kBlockSize;
+ const uint64_t remains = static_cast<uint64_t>(len) % kBlockSize;
+
+ for (uint64_t i = 0; i < blocks; ++i) {
+ SumType block_sum = 0;
+ for (int j = 0; j < kBlockSize; ++j) {
+ block_sum += func(v[j]);
+ }
+ reduce(block_sum);
+ v += kBlockSize;
+ }
+
+ if (remains > 0) {
+ SumType block_sum = 0;
+ for (uint64_t i = 0; i < remains; ++i) {
+ block_sum += func(v[i]);
+ }
+ reduce(block_sum);
+ }
+ });
+
+ // reduce intermediate summations from all non-leaf nodes
+ for (int i = 1; i <= root_level; ++i) {
+ sum[i] += sum[i - 1];
+ }
+
+ return sum[root_level];
+}
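For intuition, here is the textbook recursive formulation that the iterative, mask-based loop above emulates. This standalone sketch is not part of the file:

#include <cstddef>

double PairwiseSum(const double* v, std::size_t n) {
  if (n <= 16) {  // small base case summed naively, mirroring kBlockSize above
    double s = 0.0;
    for (std::size_t i = 0; i < n; ++i) s += v[i];
    return s;
  }
  // Split in half and add the two partial sums, bounding error growth to
  // O(log n) rounding steps instead of O(n) for naive left-to-right addition.
  const std::size_t half = n / 2;
  return PairwiseSum(v, half) + PairwiseSum(v + half, n - half);
}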
+
+// naive summation for integers
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+ typename ValueFunc>
+enable_if_t<!std::is_floating_point<SumType>::value, SumType> SumArray(
+ const ArrayData& data, ValueFunc&& func) {
+ SumType sum = 0;
+ const ValueType* values = data.GetValues<ValueType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ sum += func(values[pos + i]);
+ }
+ });
+ return sum;
+}
+
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel>
+SumType SumArray(const ArrayData& data) {
+ return SumArray<ValueType, SumType, SimdLevel>(
+ data, [](ValueType v) { return static_cast<SumType>(v); });
+}
+
+} // namespace detail
+
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
index 6ad0eeb6456..b2659355ba9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
@@ -16,377 +16,377 @@
// under the License.
#include <cmath>
-#include <queue>
-#include <utility>
+#include <queue>
+#include <utility>
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/result.h"
-#include "arrow/stl_allocator.h"
-#include "arrow/type_traits.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
namespace arrow {
namespace compute {
-namespace internal {
+namespace internal {
namespace {
-using ModeState = OptionsWrapper<ModeOptions>;
+using ModeState = OptionsWrapper<ModeOptions>;
-constexpr char kModeFieldName[] = "mode";
-constexpr char kCountFieldName[] = "count";
+constexpr char kModeFieldName[] = "mode";
+constexpr char kCountFieldName[] = "count";
-constexpr uint64_t kCountEOF = ~0ULL;
+constexpr uint64_t kCountEOF = ~0ULL;
-template <typename InType, typename CType = typename InType::c_type>
-Result<std::pair<CType*, int64_t*>> PrepareOutput(int64_t n, KernelContext* ctx,
- Datum* out) {
- const auto& mode_type = TypeTraits<InType>::type_singleton();
- const auto& count_type = int64();
+template <typename InType, typename CType = typename InType::c_type>
+Result<std::pair<CType*, int64_t*>> PrepareOutput(int64_t n, KernelContext* ctx,
+ Datum* out) {
+ const auto& mode_type = TypeTraits<InType>::type_singleton();
+ const auto& count_type = int64();
- auto mode_data = ArrayData::Make(mode_type, /*length=*/n, /*null_count=*/0);
- mode_data->buffers.resize(2, nullptr);
- auto count_data = ArrayData::Make(count_type, n, 0);
- count_data->buffers.resize(2, nullptr);
+ auto mode_data = ArrayData::Make(mode_type, /*length=*/n, /*null_count=*/0);
+ mode_data->buffers.resize(2, nullptr);
+ auto count_data = ArrayData::Make(count_type, n, 0);
+ count_data->buffers.resize(2, nullptr);
- CType* mode_buffer = nullptr;
- int64_t* count_buffer = nullptr;
-
- if (n > 0) {
- ARROW_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx->Allocate(n * sizeof(CType)));
- ARROW_ASSIGN_OR_RAISE(count_data->buffers[1], ctx->Allocate(n * sizeof(int64_t)));
- mode_buffer = mode_data->template GetMutableValues<CType>(1);
- count_buffer = count_data->template GetMutableValues<int64_t>(1);
+ CType* mode_buffer = nullptr;
+ int64_t* count_buffer = nullptr;
+
+ if (n > 0) {
+ ARROW_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx->Allocate(n * sizeof(CType)));
+ ARROW_ASSIGN_OR_RAISE(count_data->buffers[1], ctx->Allocate(n * sizeof(int64_t)));
+ mode_buffer = mode_data->template GetMutableValues<CType>(1);
+ count_buffer = count_data->template GetMutableValues<int64_t>(1);
}
- const auto& out_type =
- struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
- *out = Datum(ArrayData::Make(out_type, n, {nullptr}, {mode_data, count_data}, 0));
-
- return std::make_pair(mode_buffer, count_buffer);
+ const auto& out_type =
+ struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+ *out = Datum(ArrayData::Make(out_type, n, {nullptr}, {mode_data, count_data}, 0));
+
+ return std::make_pair(mode_buffer, count_buffer);
}
-// find top-n value:count pairs with minimal heap
-// suboptimal for tiny or large n, possibly okay as we're not in hot path
-template <typename InType, typename Generator>
-Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) {
- using CType = typename InType::c_type;
-
- using ValueCountPair = std::pair<CType, uint64_t>;
- auto gt = [](const ValueCountPair& lhs, const ValueCountPair& rhs) {
- const bool rhs_is_nan = rhs.first != rhs.first; // nan as largest value
- return lhs.second > rhs.second ||
- (lhs.second == rhs.second && (lhs.first < rhs.first || rhs_is_nan));
- };
-
- std::priority_queue<ValueCountPair, std::vector<ValueCountPair>, decltype(gt)> min_heap(
- std::move(gt));
-
- const ModeOptions& options = ModeState::Get(ctx);
- while (true) {
- const ValueCountPair& value_count = gen();
- DCHECK_NE(value_count.second, 0);
- if (value_count.second == kCountEOF) break;
- if (static_cast<int64_t>(min_heap.size()) < options.n) {
- min_heap.push(value_count);
- } else if (gt(value_count, min_heap.top())) {
- min_heap.pop();
- min_heap.push(value_count);
+// find the top-n value:count pairs with a min-heap
+// suboptimal for tiny or large n, but acceptable since this is not a hot path
+template <typename InType, typename Generator>
+Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) {
+ using CType = typename InType::c_type;
+
+ using ValueCountPair = std::pair<CType, uint64_t>;
+ auto gt = [](const ValueCountPair& lhs, const ValueCountPair& rhs) {
+ const bool rhs_is_nan = rhs.first != rhs.first; // nan as largest value
+ return lhs.second > rhs.second ||
+ (lhs.second == rhs.second && (lhs.first < rhs.first || rhs_is_nan));
+ };
+
+ std::priority_queue<ValueCountPair, std::vector<ValueCountPair>, decltype(gt)> min_heap(
+ std::move(gt));
+
+ const ModeOptions& options = ModeState::Get(ctx);
+ while (true) {
+ const ValueCountPair& value_count = gen();
+ DCHECK_NE(value_count.second, 0);
+ if (value_count.second == kCountEOF) break;
+ if (static_cast<int64_t>(min_heap.size()) < options.n) {
+ min_heap.push(value_count);
+ } else if (gt(value_count, min_heap.top())) {
+ min_heap.pop();
+ min_heap.push(value_count);
}
}
- const int64_t n = min_heap.size();
-
- CType* mode_buffer;
- int64_t* count_buffer;
- ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
- PrepareOutput<InType>(n, ctx, out));
-
- for (int64_t i = n - 1; i >= 0; --i) {
- std::tie(mode_buffer[i], count_buffer[i]) = min_heap.top();
- min_heap.pop();
- }
-
- return Status::OK();
+ const int64_t n = min_heap.size();
+
+ CType* mode_buffer;
+ int64_t* count_buffer;
+ ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
+ PrepareOutput<InType>(n, ctx, out));
+
+ for (int64_t i = n - 1; i >= 0; --i) {
+ std::tie(mode_buffer[i], count_buffer[i]) = min_heap.top();
+ min_heap.pop();
+ }
+
+ return Status::OK();
}
-// count value occurances for integers with narrow value range
-// O(1) space, O(n) time
-template <typename T>
-struct CountModer {
- using CType = typename T::c_type;
+// count value occurrences for integers with a narrow value range
+// O(1) space, O(n) time
+template <typename T>
+struct CountModer {
+ using CType = typename T::c_type;
- CType min;
- std::vector<uint64_t> counts;
+ CType min;
+ std::vector<uint64_t> counts;
- CountModer(CType min, CType max) {
- uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
- DCHECK_LT(value_range, 1 << 20);
- this->min = min;
- this->counts.resize(value_range, 0);
+ CountModer(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 20);
+ this->min = min;
+ this->counts.resize(value_range, 0);
}
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // count values in all chunks, ignore nulls
- const Datum& datum = batch[0];
- CountValues<CType>(this->counts.data(), datum, this->min);
-
- // generator to emit next value:count pair
- int index = 0;
- auto gen = [&]() {
- for (; index < static_cast<int>(counts.size()); ++index) {
- if (counts[index] != 0) {
- auto value_count =
- std::make_pair(static_cast<CType>(index + this->min), counts[index]);
- ++index;
- return value_count;
- }
- }
- return std::pair<CType, uint64_t>(0, kCountEOF);
- };
-
- return Finalize<T>(ctx, out, std::move(gen));
- }
-};
-
-// booleans can be handled more straightforward
-template <>
-struct CountModer<BooleanType> {
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- int64_t counts[2]{};
-
- const Datum& datum = batch[0];
- for (const auto& array : datum.chunks()) {
- if (array->length() > array->null_count()) {
- const int64_t true_count =
- arrow::internal::checked_pointer_cast<BooleanArray>(array)->true_count();
- const int64_t false_count = array->length() - array->null_count() - true_count;
- counts[true] += true_count;
- counts[false] += false_count;
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ CountValues<CType>(this->counts.data(), datum, this->min);
+
+ // generator to emit next value:count pair
+ int index = 0;
+ auto gen = [&]() {
+ for (; index < static_cast<int>(counts.size()); ++index) {
+ if (counts[index] != 0) {
+ auto value_count =
+ std::make_pair(static_cast<CType>(index + this->min), counts[index]);
+ ++index;
+ return value_count;
+ }
+ }
+ return std::pair<CType, uint64_t>(0, kCountEOF);
+ };
+
+ return Finalize<T>(ctx, out, std::move(gen));
+ }
+};
+
+// booleans can be handled more straightforwardly
+template <>
+struct CountModer<BooleanType> {
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ int64_t counts[2]{};
+
+ const Datum& datum = batch[0];
+ for (const auto& array : datum.chunks()) {
+ if (array->length() > array->null_count()) {
+ const int64_t true_count =
+ arrow::internal::checked_pointer_cast<BooleanArray>(array)->true_count();
+ const int64_t false_count = array->length() - array->null_count() - true_count;
+ counts[true] += true_count;
+ counts[false] += false_count;
}
}
- const ModeOptions& options = ModeState::Get(ctx);
- const int64_t distinct_values = (counts[0] != 0) + (counts[1] != 0);
- const int64_t n = std::min(options.n, distinct_values);
-
- bool* mode_buffer;
- int64_t* count_buffer;
- ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
- PrepareOutput<BooleanType>(n, ctx, out));
-
- if (n >= 1) {
- const bool index = counts[1] > counts[0];
- mode_buffer[0] = index;
- count_buffer[0] = counts[index];
- if (n == 2) {
- mode_buffer[1] = !index;
- count_buffer[1] = counts[!index];
+ const ModeOptions& options = ModeState::Get(ctx);
+ const int64_t distinct_values = (counts[0] != 0) + (counts[1] != 0);
+ const int64_t n = std::min(options.n, distinct_values);
+
+ bool* mode_buffer;
+ int64_t* count_buffer;
+ ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
+ PrepareOutput<BooleanType>(n, ctx, out));
+
+ if (n >= 1) {
+ const bool index = counts[1] > counts[0];
+ mode_buffer[0] = index;
+ count_buffer[0] = counts[index];
+ if (n == 2) {
+ mode_buffer[1] = !index;
+ count_buffer[1] = counts[!index];
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
-// copy and sort approach for floating points or integers with wide value range
-// O(n) space, O(nlogn) time
-template <typename T>
-struct SortModer {
- using CType = typename T::c_type;
- using Allocator = arrow::stl::allocator<CType>;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // copy all chunks to a buffer, ignore nulls and nans
- std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
-
- uint64_t nan_count = 0;
- const Datum& datum = batch[0];
- const int64_t in_length = datum.length() - datum.null_count();
- if (in_length > 0) {
- in_buffer.resize(in_length);
- CopyNonNullValues(datum, in_buffer.data());
-
- // drop nan
- if (is_floating_type<T>::value) {
- const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
- [](CType v) { return v != v; });
- nan_count = in_buffer.end() - it;
- in_buffer.resize(it - in_buffer.begin());
- }
- }
-
- // sort the input data to count same values
- std::sort(in_buffer.begin(), in_buffer.end());
-
- // generator to emit next value:count pair
- auto it = in_buffer.cbegin();
- auto gen = [&]() {
- if (ARROW_PREDICT_FALSE(it == in_buffer.cend())) {
- // handle NAN at last
- if (nan_count > 0) {
- auto value_count = std::make_pair(static_cast<CType>(NAN), nan_count);
- nan_count = 0;
- return value_count;
- }
- return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
- }
- // count same values
- const CType value = *it;
- uint64_t count = 0;
- do {
- ++it;
- ++count;
- } while (it != in_buffer.cend() && *it == value);
- return std::make_pair(value, count);
- };
-
- return Finalize<T>(ctx, out, std::move(gen));
+// copy and sort approach for floating points or integers with wide value range
+// O(n) space, O(nlogn) time
+template <typename T>
+struct SortModer {
+ using CType = typename T::c_type;
+ using Allocator = arrow::stl::allocator<CType>;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // copy all chunks to a buffer, ignore nulls and nans
+ std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
+
+ uint64_t nan_count = 0;
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ in_buffer.resize(in_length);
+ CopyNonNullValues(datum, in_buffer.data());
+
+ // drop nan
+ if (is_floating_type<T>::value) {
+ const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
+ [](CType v) { return v != v; });
+ nan_count = in_buffer.end() - it;
+ in_buffer.resize(it - in_buffer.begin());
+ }
+ }
+
+ // sort the input data to count same values
+ std::sort(in_buffer.begin(), in_buffer.end());
+
+ // generator to emit next value:count pair
+ auto it = in_buffer.cbegin();
+ auto gen = [&]() {
+ if (ARROW_PREDICT_FALSE(it == in_buffer.cend())) {
+        // handle NaN values last
+ if (nan_count > 0) {
+ auto value_count = std::make_pair(static_cast<CType>(NAN), nan_count);
+ nan_count = 0;
+ return value_count;
+ }
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ }
+ // count same values
+ const CType value = *it;
+ uint64_t count = 0;
+ do {
+ ++it;
+ ++count;
+ } while (it != in_buffer.cend() && *it == value);
+ return std::make_pair(value, count);
+ };
+
+ return Finalize<T>(ctx, out, std::move(gen));
}
-};
-
-// pick counting or sorting approach per integers value range
-template <typename T>
-struct CountOrSortModer {
- using CType = typename T::c_type;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // cross point to benefit from counting approach
- // about 2x improvement for int32/64 from micro-benchmarking
- static constexpr int kMinArraySize = 8192;
- static constexpr int kMaxValueRange = 32768;
-
- const Datum& datum = batch[0];
- if (datum.length() - datum.null_count() >= kMinArraySize) {
- CType min, max;
- std::tie(min, max) = GetMinMax<CType>(datum);
-
- if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
- return CountModer<T>(min, max).Exec(ctx, batch, out);
- }
+};
+
+// pick the counting or sorting approach based on the integer value range
+template <typename T>
+struct CountOrSortModer {
+ using CType = typename T::c_type;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cross point to benefit from counting approach
+ // about 2x improvement for int32/64 from micro-benchmarking
+ static constexpr int kMinArraySize = 8192;
+ static constexpr int kMaxValueRange = 32768;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
+ CType min, max;
+ std::tie(min, max) = GetMinMax<CType>(datum);
+
+ if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
+ return CountModer<T>(min, max).Exec(ctx, batch, out);
+ }
}
-
- return SortModer<T>().Exec(ctx, batch, out);
+
+ return SortModer<T>().Exec(ctx, batch, out);
}
+};
+
+template <typename InType, typename Enable = void>
+struct Moder;
+
+template <>
+struct Moder<Int8Type> {
+ CountModer<Int8Type> impl;
+ Moder() : impl(-128, 127) {}
};
-template <typename InType, typename Enable = void>
-struct Moder;
-
-template <>
-struct Moder<Int8Type> {
- CountModer<Int8Type> impl;
- Moder() : impl(-128, 127) {}
-};
-
-template <>
-struct Moder<UInt8Type> {
- CountModer<UInt8Type> impl;
- Moder() : impl(0, 255) {}
-};
-
-template <>
-struct Moder<BooleanType> {
- CountModer<BooleanType> impl;
-};
-
-template <typename InType>
-struct Moder<InType, enable_if_t<(is_integer_type<InType>::value &&
- (sizeof(typename InType::c_type) > 1))>> {
- CountOrSortModer<InType> impl;
-};
-
-template <typename InType>
-struct Moder<InType, enable_if_t<is_floating_type<InType>::value>> {
- SortModer<InType> impl;
-};
-
-template <typename T>
-Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) {
- using CType = typename T::c_type;
- if (scalar.is_valid) {
- bool called = false;
- return Finalize<T>(ctx, out, [&]() {
- if (!called) {
- called = true;
- return std::pair<CType, uint64_t>(UnboxScalar<T>::Unbox(scalar), 1);
- }
- return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
- });
+template <>
+struct Moder<UInt8Type> {
+ CountModer<UInt8Type> impl;
+ Moder() : impl(0, 255) {}
+};
+
+template <>
+struct Moder<BooleanType> {
+ CountModer<BooleanType> impl;
+};
+
+template <typename InType>
+struct Moder<InType, enable_if_t<(is_integer_type<InType>::value &&
+ (sizeof(typename InType::c_type) > 1))>> {
+ CountOrSortModer<InType> impl;
+};
+
+template <typename InType>
+struct Moder<InType, enable_if_t<is_floating_type<InType>::value>> {
+ SortModer<InType> impl;
+};
+
+template <typename T>
+Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) {
+ using CType = typename T::c_type;
+ if (scalar.is_valid) {
+ bool called = false;
+ return Finalize<T>(ctx, out, [&]() {
+ if (!called) {
+ called = true;
+ return std::pair<CType, uint64_t>(UnboxScalar<T>::Unbox(scalar), 1);
+ }
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ });
}
- return Finalize<T>(ctx, out, []() {
- return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
- });
-}
-
-template <typename _, typename InType>
-struct ModeExecutor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (ctx->state() == nullptr) {
- return Status::Invalid("Mode requires ModeOptions");
- }
- const ModeOptions& options = ModeState::Get(ctx);
- if (options.n <= 0) {
- return Status::Invalid("ModeOption::n must be strictly positive");
- }
-
- if (batch[0].is_scalar()) {
- return ScalarMode<InType>(ctx, *batch[0].scalar(), out);
- }
-
- return Moder<InType>().impl.Exec(ctx, batch, out);
+ return Finalize<T>(ctx, out, []() {
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ });
+}
+
+template <typename _, typename InType>
+struct ModeExecutor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (ctx->state() == nullptr) {
+ return Status::Invalid("Mode requires ModeOptions");
+ }
+ const ModeOptions& options = ModeState::Get(ctx);
+ if (options.n <= 0) {
+      return Status::Invalid("ModeOptions::n must be strictly positive");
+ }
+
+ if (batch[0].is_scalar()) {
+ return ScalarMode<InType>(ctx, *batch[0].scalar(), out);
+ }
+
+ return Moder<InType>().impl.Exec(ctx, batch, out);
}
};
-VectorKernel NewModeKernel(const std::shared_ptr<DataType>& in_type) {
- VectorKernel kernel;
- kernel.init = ModeState::Init;
- kernel.can_execute_chunkwise = false;
- kernel.output_chunked = false;
- auto out_type =
- struct_({field(kModeFieldName, in_type), field(kCountFieldName, int64())});
- kernel.signature =
- KernelSignature::Make({InputType(in_type)}, ValueDescr::Array(out_type));
- return kernel;
+VectorKernel NewModeKernel(const std::shared_ptr<DataType>& in_type) {
+ VectorKernel kernel;
+ kernel.init = ModeState::Init;
+ kernel.can_execute_chunkwise = false;
+ kernel.output_chunked = false;
+ auto out_type =
+ struct_({field(kModeFieldName, in_type), field(kCountFieldName, int64())});
+ kernel.signature =
+ KernelSignature::Make({InputType(in_type)}, ValueDescr::Array(out_type));
+ return kernel;
}
-void AddBooleanModeKernel(VectorFunction* func) {
- VectorKernel kernel = NewModeKernel(boolean());
- kernel.exec = ModeExecutor<StructType, BooleanType>::Exec;
- DCHECK_OK(func->AddKernel(kernel));
-}
-
-void AddNumericModeKernels(VectorFunction* func) {
- for (const auto& type : NumericTypes()) {
- VectorKernel kernel = NewModeKernel(type);
- kernel.exec = GenerateNumeric<ModeExecutor, StructType>(*type);
- DCHECK_OK(func->AddKernel(kernel));
+void AddBooleanModeKernel(VectorFunction* func) {
+ VectorKernel kernel = NewModeKernel(boolean());
+ kernel.exec = ModeExecutor<StructType, BooleanType>::Exec;
+ DCHECK_OK(func->AddKernel(kernel));
+}
+
+void AddNumericModeKernels(VectorFunction* func) {
+ for (const auto& type : NumericTypes()) {
+ VectorKernel kernel = NewModeKernel(type);
+ kernel.exec = GenerateNumeric<ModeExecutor, StructType>(*type);
+ DCHECK_OK(func->AddKernel(kernel));
}
}
-const FunctionDoc mode_doc{
- "Calculate the modal (most common) values of a numeric array",
- ("Returns top-n most common values and number of times they occur in an array.\n"
- "Result is an array of `struct<mode: T, count: int64>`, where T is the input type.\n"
- "Values with larger counts are returned before smaller counts.\n"
- "If there are more than one values with same count, smaller one is returned first.\n"
- "Nulls are ignored. If there are no non-null values in the array,\n"
- "empty array is returned."),
- {"array"},
- "ModeOptions"};
-
+const FunctionDoc mode_doc{
+ "Calculate the modal (most common) values of a numeric array",
+ ("Returns top-n most common values and number of times they occur in an array.\n"
+ "Result is an array of `struct<mode: T, count: int64>`, where T is the input type.\n"
+ "Values with larger counts are returned before smaller counts.\n"
+ "If there are more than one values with same count, smaller one is returned first.\n"
+ "Nulls are ignored. If there are no non-null values in the array,\n"
+ "empty array is returned."),
+ {"array"},
+ "ModeOptions"};
+
} // namespace
-void RegisterScalarAggregateMode(FunctionRegistry* registry) {
- static auto default_options = ModeOptions::Defaults();
- auto func = std::make_shared<VectorFunction>("mode", Arity::Unary(), &mode_doc,
- &default_options);
- AddBooleanModeKernel(func.get());
- AddNumericModeKernels(func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
+void RegisterScalarAggregateMode(FunctionRegistry* registry) {
+ static auto default_options = ModeOptions::Defaults();
+ auto func = std::make_shared<VectorFunction>("mode", Arity::Unary(), &mode_doc,
+ &default_options);
+ AddBooleanModeKernel(func.get());
+ AddNumericModeKernels(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
}
-} // namespace internal
+} // namespace internal
} // namespace compute
} // namespace arrow
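
The mode kernel above keeps only the top-n (value, count) pairs: candidates
stream through a min-heap whose root is the current weakest entry (smallest
count, then largest value), so a better candidate simply evicts the root. A
self-contained sketch of that selection step over a plain hash map of counts
(TopNModes is an illustrative name; the kernel itself streams pairs from a
generator instead of materializing a map):

#include <cstddef>
#include <cstdint>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>

// Returns up to n (value, count) pairs, most frequent first; ties are broken
// by the smaller value, matching the ordering documented in mode_doc.
std::vector<std::pair<int64_t, uint64_t>> TopNModes(
    const std::vector<int64_t>& values, std::size_t n) {
  std::unordered_map<int64_t, uint64_t> counts;
  for (int64_t v : values) ++counts[v];

  using ValueCount = std::pair<int64_t, uint64_t>;
  // "lhs ranks higher than rhs": larger count wins, then smaller value.
  // Used as the heap comparator, it makes the heap top the weakest entry.
  auto gt = [](const ValueCount& lhs, const ValueCount& rhs) {
    return lhs.second > rhs.second ||
           (lhs.second == rhs.second && lhs.first < rhs.first);
  };
  std::priority_queue<ValueCount, std::vector<ValueCount>, decltype(gt)> heap(gt);
  for (const auto& vc : counts) {
    if (heap.size() < n) {
      heap.push(vc);
    } else if (gt(vc, heap.top())) {
      heap.pop();  // evict the current weakest entry
      heap.push(vc);
    }
  }
  // Pop weakest-first and fill the output back to front.
  std::vector<ValueCount> out(heap.size());
  for (auto it = out.rbegin(); it != out.rend(); ++it) {
    *it = heap.top();
    heap.pop();
  }
  return out;
}
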
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
index 7d2ffe0770c..feacedbb96e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
@@ -1,493 +1,493 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <cmath>
-#include <vector>
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/stl_allocator.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-
-namespace {
-
-using QuantileState = internal::OptionsWrapper<QuantileOptions>;
-
-// output is at some input data point, not interpolated
-bool IsDataPoint(const QuantileOptions& options) {
- // some interpolation methods return exact data point
- return options.interpolation == QuantileOptions::LOWER ||
- options.interpolation == QuantileOptions::HIGHER ||
- options.interpolation == QuantileOptions::NEAREST;
-}
-
-// quantile to exact datapoint index (IsDataPoint == true)
-uint64_t QuantileToDataPoint(size_t length, double q,
- enum QuantileOptions::Interpolation interpolation) {
- const double index = (length - 1) * q;
- uint64_t datapoint_index = static_cast<uint64_t>(index);
- const double fraction = index - datapoint_index;
-
- if (interpolation == QuantileOptions::LINEAR ||
- interpolation == QuantileOptions::MIDPOINT) {
- DCHECK_EQ(fraction, 0);
- }
-
- // convert NEAREST interpolation method to LOWER or HIGHER
- if (interpolation == QuantileOptions::NEAREST) {
- if (fraction < 0.5) {
- interpolation = QuantileOptions::LOWER;
- } else if (fraction > 0.5) {
- interpolation = QuantileOptions::HIGHER;
- } else {
- // round 0.5 to nearest even number, similar to numpy.around
- interpolation =
- (datapoint_index & 1) ? QuantileOptions::HIGHER : QuantileOptions::LOWER;
- }
- }
-
- if (interpolation == QuantileOptions::HIGHER && fraction != 0) {
- ++datapoint_index;
- }
-
- return datapoint_index;
-}
-
-// copy and nth_element approach, large memory footprint
-template <typename InType>
-struct SortQuantiler {
- using CType = typename InType::c_type;
- using Allocator = arrow::stl::allocator<CType>;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const QuantileOptions& options = QuantileState::Get(ctx);
-
- // copy all chunks to a buffer, ignore nulls and nans
- std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
-
- const Datum& datum = batch[0];
- const int64_t in_length = datum.length() - datum.null_count();
- if (in_length > 0) {
- in_buffer.resize(in_length);
- CopyNonNullValues(datum, in_buffer.data());
-
- // drop nan
- if (is_floating_type<InType>::value) {
- const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
- [](CType v) { return v != v; });
- in_buffer.resize(it - in_buffer.begin());
- }
- }
-
- // prepare out array
- int64_t out_length = options.q.size();
- if (in_buffer.empty()) {
- out_length = 0; // input is empty or only contains null and nan, return empty array
- }
- // out type depends on options
- const bool is_datapoint = IsDataPoint(options);
- const std::shared_ptr<DataType> out_type =
- is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
- auto out_data = ArrayData::Make(out_type, out_length, 0);
- out_data->buffers.resize(2, nullptr);
-
- // calculate quantiles
- if (out_length > 0) {
- ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
- ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
-
- // find quantiles in descending order
- std::vector<int64_t> q_indices(out_length);
- std::iota(q_indices.begin(), q_indices.end(), 0);
- std::sort(q_indices.begin(), q_indices.end(),
- [&options](int64_t left_index, int64_t right_index) {
- return options.q[right_index] < options.q[left_index];
- });
-
- // input array is partitioned around data point at `last_index` (pivot)
- // for next quatile which is smaller, we only consider inputs left of the pivot
- uint64_t last_index = in_buffer.size();
- if (is_datapoint) {
- CType* out_buffer = out_data->template GetMutableValues<CType>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileAtDataPoint(
- in_buffer, &last_index, options.q[q_index], options.interpolation);
- }
- } else {
- double* out_buffer = out_data->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileByInterp(
- in_buffer, &last_index, options.q[q_index], options.interpolation);
- }
- }
- }
-
- *out = Datum(std::move(out_data));
- return Status::OK();
- }
-
- // return quantile located exactly at some input data point
- CType GetQuantileAtDataPoint(std::vector<CType, Allocator>& in, uint64_t* last_index,
- double q,
- enum QuantileOptions::Interpolation interpolation) {
- const uint64_t datapoint_index = QuantileToDataPoint(in.size(), q, interpolation);
-
- if (datapoint_index != *last_index) {
- DCHECK_LT(datapoint_index, *last_index);
- std::nth_element(in.begin(), in.begin() + datapoint_index,
- in.begin() + *last_index);
- *last_index = datapoint_index;
- }
-
- return in[datapoint_index];
- }
-
- // return quantile interpolated from adjacent input data points
- double GetQuantileByInterp(std::vector<CType, Allocator>& in, uint64_t* last_index,
- double q,
- enum QuantileOptions::Interpolation interpolation) {
- const double index = (in.size() - 1) * q;
- const uint64_t lower_index = static_cast<uint64_t>(index);
- const double fraction = index - lower_index;
-
- if (lower_index != *last_index) {
- DCHECK_LT(lower_index, *last_index);
- std::nth_element(in.begin(), in.begin() + lower_index, in.begin() + *last_index);
- }
-
- const double lower_value = static_cast<double>(in[lower_index]);
- if (fraction == 0) {
- *last_index = lower_index;
- return lower_value;
- }
-
- const uint64_t higher_index = lower_index + 1;
- DCHECK_LT(higher_index, in.size());
- if (lower_index != *last_index && higher_index != *last_index) {
- DCHECK_LT(higher_index, *last_index);
- // higher value must be the minimal value after lower_index
- auto min = std::min_element(in.begin() + higher_index, in.begin() + *last_index);
- std::iter_swap(in.begin() + higher_index, min);
- }
- *last_index = lower_index;
-
- const double higher_value = static_cast<double>(in[higher_index]);
-
- if (interpolation == QuantileOptions::LINEAR) {
- // more stable than naive linear interpolation
- return fraction * higher_value + (1 - fraction) * lower_value;
- } else if (interpolation == QuantileOptions::MIDPOINT) {
- return lower_value / 2 + higher_value / 2;
- } else {
- DCHECK(false);
- return NAN;
- }
- }
-};
-
-// histogram approach with constant memory, only for integers within limited value range
-template <typename InType>
-struct CountQuantiler {
- using CType = typename InType::c_type;
-
- CType min;
- std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
-
- // indices to adjacent non-empty bins covering current quantile
- struct AdjacentBins {
- int left_index;
- int right_index;
- uint64_t total_count; // accumulated counts till left_index (inclusive)
- };
-
- CountQuantiler(CType min, CType max) {
- uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
- DCHECK_LT(value_range, 1 << 30);
- this->min = min;
- this->counts.resize(value_range, 0);
- }
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const QuantileOptions& options = QuantileState::Get(ctx);
-
- // count values in all chunks, ignore nulls
- const Datum& datum = batch[0];
- int64_t in_length = CountValues<CType>(this->counts.data(), datum, this->min);
-
- // prepare out array
- int64_t out_length = options.q.size();
- if (in_length == 0) {
- out_length = 0; // input is empty or only contains null, return empty array
- }
- // out type depends on options
- const bool is_datapoint = IsDataPoint(options);
- const std::shared_ptr<DataType> out_type =
- is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
- auto out_data = ArrayData::Make(out_type, out_length, 0);
- out_data->buffers.resize(2, nullptr);
-
- // calculate quantiles
- if (out_length > 0) {
- ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
- ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
-
- // find quantiles in ascending order
- std::vector<int64_t> q_indices(out_length);
- std::iota(q_indices.begin(), q_indices.end(), 0);
- std::sort(q_indices.begin(), q_indices.end(),
- [&options](int64_t left_index, int64_t right_index) {
- return options.q[left_index] < options.q[right_index];
- });
-
- AdjacentBins bins{0, 0, this->counts[0]};
- if (is_datapoint) {
- CType* out_buffer = out_data->template GetMutableValues<CType>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileAtDataPoint(
- in_length, &bins, options.q[q_index], options.interpolation);
- }
- } else {
- double* out_buffer = out_data->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
- options.interpolation);
- }
- }
- }
-
- *out = Datum(std::move(out_data));
- return Status::OK();
- }
-
- // return quantile located exactly at some input data point
- CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
- enum QuantileOptions::Interpolation interpolation) {
- const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
- while (datapoint_index >= bins->total_count &&
- static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
- ++bins->left_index;
- bins->total_count += this->counts[bins->left_index];
- }
- DCHECK_LT(datapoint_index, bins->total_count);
- return static_cast<CType>(bins->left_index + this->min);
- }
-
- // return quantile interpolated from adjacent input data points
- double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
- enum QuantileOptions::Interpolation interpolation) {
- const double index = (in_length - 1) * q;
- const uint64_t index_floor = static_cast<uint64_t>(index);
- const double fraction = index - index_floor;
-
- while (index_floor >= bins->total_count &&
- static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
- ++bins->left_index;
- bins->total_count += this->counts[bins->left_index];
- }
- DCHECK_LT(index_floor, bins->total_count);
- const double lower_value = static_cast<double>(bins->left_index + this->min);
-
- // quantile lies in this bin, no interpolation needed
- if (index <= bins->total_count - 1) {
- return lower_value;
- }
-
- // quantile lies across two bins, locate next bin if not already done
- DCHECK_EQ(index_floor, bins->total_count - 1);
- if (bins->right_index <= bins->left_index) {
- bins->right_index = bins->left_index + 1;
- while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
- this->counts[bins->right_index] == 0) {
- ++bins->right_index;
- }
- }
- DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
- DCHECK_GT(this->counts[bins->right_index], 0);
- const double higher_value = static_cast<double>(bins->right_index + this->min);
-
- if (interpolation == QuantileOptions::LINEAR) {
- return fraction * higher_value + (1 - fraction) * lower_value;
- } else if (interpolation == QuantileOptions::MIDPOINT) {
- return lower_value / 2 + higher_value / 2;
- } else {
- DCHECK(false);
- return NAN;
- }
- }
-};
-
-// histogram or 'copy & nth_element' approach per value range and size, only for integers
-template <typename InType>
-struct CountOrSortQuantiler {
- using CType = typename InType::c_type;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // cross point to benefit from histogram approach
- // parameters estimated from ad-hoc benchmarks manually
- static constexpr int kMinArraySize = 65536;
- static constexpr int kMaxValueRange = 65536;
-
- const Datum& datum = batch[0];
- if (datum.length() - datum.null_count() >= kMinArraySize) {
- CType min, max;
- std::tie(min, max) = GetMinMax<CType>(datum);
-
- if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
- return CountQuantiler<InType>(min, max).Exec(ctx, batch, out);
- }
- }
-
- return SortQuantiler<InType>().Exec(ctx, batch, out);
- }
-};
-
-template <typename InType, typename Enable = void>
-struct ExactQuantiler;
-
-template <>
-struct ExactQuantiler<UInt8Type> {
- CountQuantiler<UInt8Type> impl;
- ExactQuantiler() : impl(0, 255) {}
-};
-
-template <>
-struct ExactQuantiler<Int8Type> {
- CountQuantiler<Int8Type> impl;
- ExactQuantiler() : impl(-128, 127) {}
-};
-
-template <typename InType>
-struct ExactQuantiler<InType, enable_if_t<(is_integer_type<InType>::value &&
- (sizeof(typename InType::c_type) > 1))>> {
- CountOrSortQuantiler<InType> impl;
-};
-
-template <typename InType>
-struct ExactQuantiler<InType, enable_if_t<is_floating_type<InType>::value>> {
- SortQuantiler<InType> impl;
-};
-
-template <typename T>
-Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options,
- const Scalar& scalar, Datum* out) {
- using CType = typename T::c_type;
- ArrayData* output = out->mutable_array();
- if (!scalar.is_valid) {
- output->length = 0;
- output->null_count = 0;
- return Status::OK();
- }
- auto out_type = IsDataPoint(options) ? scalar.type : float64();
- output->length = options.q.size();
- output->null_count = 0;
- ARROW_ASSIGN_OR_RAISE(
- output->buffers[1],
- ctx->Allocate(output->length * BitUtil::BytesForBits(GetBitWidth(*out_type))));
- if (IsDataPoint(options)) {
- CType* out_buffer = output->template GetMutableValues<CType>(1);
- for (int64_t i = 0; i < output->length; i++) {
- out_buffer[i] = UnboxScalar<T>::Unbox(scalar);
- }
- } else {
- double* out_buffer = output->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < output->length; i++) {
- out_buffer[i] = static_cast<double>(UnboxScalar<T>::Unbox(scalar));
- }
- }
- return Status::OK();
-}
-
-template <typename _, typename InType>
-struct QuantileExecutor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (ctx->state() == nullptr) {
- return Status::Invalid("Quantile requires QuantileOptions");
- }
-
- const QuantileOptions& options = QuantileState::Get(ctx);
- if (options.q.empty()) {
- return Status::Invalid("Requires quantile argument");
- }
- for (double q : options.q) {
- if (q < 0 || q > 1) {
- return Status::Invalid("Quantile must be between 0 and 1");
- }
- }
-
- if (batch[0].is_scalar()) {
- return ScalarQuantile<InType>(ctx, options, *batch[0].scalar(), out);
- }
-
- return ExactQuantiler<InType>().impl.Exec(ctx, batch, out);
- }
-};
-
-Result<ValueDescr> ResolveOutput(KernelContext* ctx,
- const std::vector<ValueDescr>& args) {
- const QuantileOptions& options = QuantileState::Get(ctx);
- if (IsDataPoint(options)) {
- return ValueDescr::Array(args[0].type);
- } else {
- return ValueDescr::Array(float64());
- }
-}
-
-void AddQuantileKernels(VectorFunction* func) {
- VectorKernel base;
- base.init = QuantileState::Init;
- base.can_execute_chunkwise = false;
- base.output_chunked = false;
-
- for (const auto& ty : NumericTypes()) {
- base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput));
- // output type is determined at runtime, set template argument to nulltype
- base.exec = GenerateNumeric<QuantileExecutor, NullType>(*ty);
- DCHECK_OK(func->AddKernel(base));
- }
-}
-
-const FunctionDoc quantile_doc{
- "Compute an array of quantiles of a numeric array or chunked array",
- ("By default, 0.5 quantile (median) is returned.\n"
- "If quantile lies between two data points, an interpolated value is\n"
- "returned based on selected interpolation method.\n"
- "Nulls and NaNs are ignored.\n"
- "An empty array is returned if there is no valid data point."),
- {"array"},
- "QuantileOptions"};
-
-} // namespace
-
-void RegisterScalarAggregateQuantile(FunctionRegistry* registry) {
- static QuantileOptions default_options;
- auto func = std::make_shared<VectorFunction>("quantile", Arity::Unary(), &quantile_doc,
- &default_options);
- AddQuantileKernels(func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <vector>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/stl_allocator.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using QuantileState = internal::OptionsWrapper<QuantileOptions>;
+
+// output is at some input data point, not interpolated
+bool IsDataPoint(const QuantileOptions& options) {
+ // some interpolation methods return exact data point
+ return options.interpolation == QuantileOptions::LOWER ||
+ options.interpolation == QuantileOptions::HIGHER ||
+ options.interpolation == QuantileOptions::NEAREST;
+}
+
+// quantile to exact datapoint index (IsDataPoint == true)
+uint64_t QuantileToDataPoint(size_t length, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (length - 1) * q;
+ uint64_t datapoint_index = static_cast<uint64_t>(index);
+ const double fraction = index - datapoint_index;
+
+ if (interpolation == QuantileOptions::LINEAR ||
+ interpolation == QuantileOptions::MIDPOINT) {
+ DCHECK_EQ(fraction, 0);
+ }
+
+ // convert NEAREST interpolation method to LOWER or HIGHER
+ if (interpolation == QuantileOptions::NEAREST) {
+ if (fraction < 0.5) {
+ interpolation = QuantileOptions::LOWER;
+ } else if (fraction > 0.5) {
+ interpolation = QuantileOptions::HIGHER;
+ } else {
+ // round 0.5 to nearest even number, similar to numpy.around
+ interpolation =
+ (datapoint_index & 1) ? QuantileOptions::HIGHER : QuantileOptions::LOWER;
+ }
+ }
+
+ if (interpolation == QuantileOptions::HIGHER && fraction != 0) {
+ ++datapoint_index;
+ }
+
+ return datapoint_index;
+}
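+// Worked example (illustration, not Arrow source): with length == 5 and
+// q == 0.375, index == (5 - 1) * 0.375 == 1.5, so datapoint_index == 1 and
+// fraction == 0.5; NEAREST rounds half to even, and since index 1 is odd it
+// resolves to HIGHER, then fraction != 0 bumps the result to index 2.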
+
+// copy and nth_element approach, large memory footprint
+template <typename InType>
+struct SortQuantiler {
+ using CType = typename InType::c_type;
+ using Allocator = arrow::stl::allocator<CType>;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // copy all chunks to a buffer, ignore nulls and nans
+ std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
+
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ in_buffer.resize(in_length);
+ CopyNonNullValues(datum, in_buffer.data());
+
+ // drop nan
+ if (is_floating_type<InType>::value) {
+ const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
+ [](CType v) { return v != v; });
+ in_buffer.resize(it - in_buffer.begin());
+ }
+ }
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_buffer.empty()) {
+      out_length = 0;  // input is empty or contains only nulls and NaNs; return an empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
+
+ // find quantiles in descending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[right_index] < options.q[left_index];
+ });
+
+ // input array is partitioned around data point at `last_index` (pivot)
+      // for the next quantile, which is smaller, we only consider inputs left of the pivot
+ uint64_t last_index = in_buffer.size();
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_buffer, &last_index, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(
+ in_buffer, &last_index, options.q[q_index], options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(std::vector<CType, Allocator>& in, uint64_t* last_index,
+ double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in.size(), q, interpolation);
+
+ if (datapoint_index != *last_index) {
+ DCHECK_LT(datapoint_index, *last_index);
+ std::nth_element(in.begin(), in.begin() + datapoint_index,
+ in.begin() + *last_index);
+ *last_index = datapoint_index;
+ }
+
+ return in[datapoint_index];
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(std::vector<CType, Allocator>& in, uint64_t* last_index,
+ double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in.size() - 1) * q;
+ const uint64_t lower_index = static_cast<uint64_t>(index);
+ const double fraction = index - lower_index;
+
+ if (lower_index != *last_index) {
+ DCHECK_LT(lower_index, *last_index);
+ std::nth_element(in.begin(), in.begin() + lower_index, in.begin() + *last_index);
+ }
+
+ const double lower_value = static_cast<double>(in[lower_index]);
+ if (fraction == 0) {
+ *last_index = lower_index;
+ return lower_value;
+ }
+
+ const uint64_t higher_index = lower_index + 1;
+ DCHECK_LT(higher_index, in.size());
+ if (lower_index != *last_index && higher_index != *last_index) {
+ DCHECK_LT(higher_index, *last_index);
+ // higher value must be the minimal value after lower_index
+ auto min = std::min_element(in.begin() + higher_index, in.begin() + *last_index);
+ std::iter_swap(in.begin() + higher_index, min);
+ }
+ *last_index = lower_index;
+
+ const double higher_value = static_cast<double>(in[higher_index]);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ // more stable than naive linear interpolation
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
+
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+ using CType = typename InType::c_type;
+
+ CType min;
+ std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
+
+ // indices to adjacent non-empty bins covering current quantile
+ struct AdjacentBins {
+ int left_index;
+ int right_index;
+    uint64_t total_count;  // accumulated counts up to left_index (inclusive)
+ };
+
+ CountQuantiler(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 30);
+ this->min = min;
+ this->counts.resize(value_range, 0);
+ }
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ int64_t in_length = CountValues<CType>(this->counts.data(), datum, this->min);
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_length == 0) {
+      out_length = 0;  // input is empty or contains only nulls; return an empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
+
+ // find quantiles in ascending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[left_index] < options.q[right_index];
+ });
+
+ AdjacentBins bins{0, 0, this->counts[0]};
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_length, &bins, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+ options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+ while (datapoint_index >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(datapoint_index, bins->total_count);
+ return static_cast<CType>(bins->left_index + this->min);
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in_length - 1) * q;
+ const uint64_t index_floor = static_cast<uint64_t>(index);
+ const double fraction = index - index_floor;
+
+ while (index_floor >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(index_floor, bins->total_count);
+ const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+ // quantile lies in this bin, no interpolation needed
+ if (index <= bins->total_count - 1) {
+ return lower_value;
+ }
+
+ // quantile lies across two bins, locate next bin if not already done
+ DCHECK_EQ(index_floor, bins->total_count - 1);
+ if (bins->right_index <= bins->left_index) {
+ bins->right_index = bins->left_index + 1;
+ while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+ this->counts[bins->right_index] == 0) {
+ ++bins->right_index;
+ }
+ }
+ DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+ DCHECK_GT(this->counts[bins->right_index], 0);
+ const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
+
+// histogram or 'copy & nth_element' approach, chosen by value range and size; integers only
+template <typename InType>
+struct CountOrSortQuantiler {
+ using CType = typename InType::c_type;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cross point to benefit from histogram approach
+ // parameters estimated from ad-hoc benchmarks manually
+ static constexpr int kMinArraySize = 65536;
+ static constexpr int kMaxValueRange = 65536;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
+ CType min, max;
+ std::tie(min, max) = GetMinMax<CType>(datum);
+
+ if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
+ return CountQuantiler<InType>(min, max).Exec(ctx, batch, out);
+ }
+ }
+
+ return SortQuantiler<InType>().Exec(ctx, batch, out);
+ }
+};
+
+template <typename InType, typename Enable = void>
+struct ExactQuantiler;
+
+template <>
+struct ExactQuantiler<UInt8Type> {
+ CountQuantiler<UInt8Type> impl;
+ ExactQuantiler() : impl(0, 255) {}
+};
+
+template <>
+struct ExactQuantiler<Int8Type> {
+ CountQuantiler<Int8Type> impl;
+ ExactQuantiler() : impl(-128, 127) {}
+};
+
+template <typename InType>
+struct ExactQuantiler<InType, enable_if_t<(is_integer_type<InType>::value &&
+ (sizeof(typename InType::c_type) > 1))>> {
+ CountOrSortQuantiler<InType> impl;
+};
+
+template <typename InType>
+struct ExactQuantiler<InType, enable_if_t<is_floating_type<InType>::value>> {
+ SortQuantiler<InType> impl;
+};
+
+template <typename T>
+Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options,
+ const Scalar& scalar, Datum* out) {
+ using CType = typename T::c_type;
+ ArrayData* output = out->mutable_array();
+ if (!scalar.is_valid) {
+ output->length = 0;
+ output->null_count = 0;
+ return Status::OK();
+ }
+ auto out_type = IsDataPoint(options) ? scalar.type : float64();
+ output->length = options.q.size();
+ output->null_count = 0;
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[1],
+ ctx->Allocate(output->length * BitUtil::BytesForBits(GetBitWidth(*out_type))));
+ if (IsDataPoint(options)) {
+ CType* out_buffer = output->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < output->length; i++) {
+ out_buffer[i] = UnboxScalar<T>::Unbox(scalar);
+ }
+ } else {
+ double* out_buffer = output->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < output->length; i++) {
+ out_buffer[i] = static_cast<double>(UnboxScalar<T>::Unbox(scalar));
+ }
+ }
+ return Status::OK();
+}
+
+template <typename _, typename InType>
+struct QuantileExecutor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (ctx->state() == nullptr) {
+ return Status::Invalid("Quantile requires QuantileOptions");
+ }
+
+ const QuantileOptions& options = QuantileState::Get(ctx);
+ if (options.q.empty()) {
+ return Status::Invalid("Requires quantile argument");
+ }
+ for (double q : options.q) {
+ if (q < 0 || q > 1) {
+ return Status::Invalid("Quantile must be between 0 and 1");
+ }
+ }
+
+ if (batch[0].is_scalar()) {
+ return ScalarQuantile<InType>(ctx, options, *batch[0].scalar(), out);
+ }
+
+ return ExactQuantiler<InType>().impl.Exec(ctx, batch, out);
+ }
+};
+
+Result<ValueDescr> ResolveOutput(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+ if (IsDataPoint(options)) {
+ return ValueDescr::Array(args[0].type);
+ } else {
+ return ValueDescr::Array(float64());
+ }
+}
+
+void AddQuantileKernels(VectorFunction* func) {
+ VectorKernel base;
+ base.init = QuantileState::Init;
+ base.can_execute_chunkwise = false;
+ base.output_chunked = false;
+
+ for (const auto& ty : NumericTypes()) {
+ base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput));
+ // output type is determined at runtime, set template argument to nulltype
+ base.exec = GenerateNumeric<QuantileExecutor, NullType>(*ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
+}
+
+const FunctionDoc quantile_doc{
+ "Compute an array of quantiles of a numeric array or chunked array",
+ ("By default, 0.5 quantile (median) is returned.\n"
+ "If quantile lies between two data points, an interpolated value is\n"
+ "returned based on selected interpolation method.\n"
+ "Nulls and NaNs are ignored.\n"
+ "An empty array is returned if there is no valid data point."),
+ {"array"},
+ "QuantileOptions"};
+
+} // namespace
+
+void RegisterScalarAggregateQuantile(FunctionRegistry* registry) {
+ static QuantileOptions default_options;
+ auto func = std::make_shared<VectorFunction>("quantile", Arity::Unary(), &quantile_doc,
+ &default_options);
+ AddQuantileKernels(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
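
The quantile kernel above avoids a full sort: each requested quantile needs
only the order statistic at its index, so std::nth_element partitions the
buffer around that index, and for interpolated methods the next order
statistic is the minimum of the right-hand partition. A minimal sketch of the
LINEAR path for a single quantile (LinearQuantile is an illustrative name; it
assumes a non-empty input with NaNs already removed, and omits the
descending-order pivot reuse the kernel applies across multiple quantiles):

#include <algorithm>
#include <cstddef>
#include <vector>

// Computes one linearly interpolated quantile q in [0, 1] over a copy of the
// data, mirroring SortQuantiler's QuantileOptions::LINEAR branch.
double LinearQuantile(std::vector<double> in, double q) {
  const double index = (in.size() - 1) * q;
  const std::size_t lower = static_cast<std::size_t>(index);
  const double fraction = index - lower;

  // Place the lower order statistic at position `lower`.
  std::nth_element(in.begin(), in.begin() + lower, in.end());
  const double lower_value = in[lower];
  if (fraction == 0) return lower_value;

  // After partitioning, the next order statistic is the smallest element of
  // the right-hand partition.
  const double higher_value = *std::min_element(in.begin() + lower + 1, in.end());

  // The fraction-weighted form is numerically more stable than
  // lower_value + fraction * (higher_value - lower_value).
  return fraction * higher_value + (1 - fraction) * lower_value;
}
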
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
index 4c261604c85..54f36ab9159 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
@@ -1,164 +1,164 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/tdigest.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-
-namespace {
-
-using arrow::internal::TDigest;
-using arrow::internal::VisitSetBitRunsVoid;
-
-template <typename ArrowType>
-struct TDigestImpl : public ScalarAggregator {
- using ThisType = TDigestImpl<ArrowType>;
- using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
- using CType = typename ArrowType::c_type;
-
- explicit TDigestImpl(const TDigestOptions& options)
- : q{options.q}, tdigest{options.delta, options.buffer_size} {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- const ArrayData& data = *batch[0].array();
- const CType* values = data.GetValues<CType>(1);
-
- if (data.length > data.GetNullCount()) {
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- this->tdigest.NanAdd(values[pos + i]);
- }
- });
- }
- } else {
- const CType value = UnboxScalar<ArrowType>::Unbox(*batch[0].scalar());
- if (batch[0].scalar()->is_valid) {
- this->tdigest.NanAdd(value);
- }
- }
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- auto& other = checked_cast<ThisType&>(src);
- std::vector<TDigest> other_tdigest;
- other_tdigest.push_back(std::move(other.tdigest));
- this->tdigest.Merge(&other_tdigest);
- return Status::OK();
- }
-
- Status Finalize(KernelContext* ctx, Datum* out) override {
- const int64_t out_length = this->tdigest.is_empty() ? 0 : this->q.size();
- auto out_data = ArrayData::Make(float64(), out_length, 0);
- out_data->buffers.resize(2, nullptr);
-
- if (out_length > 0) {
- ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
- ctx->Allocate(out_length * sizeof(double)));
- double* out_buffer = out_data->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- out_buffer[i] = this->tdigest.Quantile(this->q[i]);
- }
- }
-
- *out = Datum(std::move(out_data));
- return Status::OK();
- }
-
- const std::vector<double>& q;
- TDigest tdigest;
-};
-
-struct TDigestInitState {
- std::unique_ptr<KernelState> state;
- KernelContext* ctx;
- const DataType& in_type;
- const TDigestOptions& options;
-
- TDigestInitState(KernelContext* ctx, const DataType& in_type,
- const TDigestOptions& options)
- : ctx(ctx), in_type(in_type), options(options) {}
-
- Status Visit(const DataType&) {
- return Status::NotImplemented("No tdigest implemented");
- }
-
- Status Visit(const HalfFloatType&) {
- return Status::NotImplemented("No tdigest implemented");
- }
-
- template <typename Type>
- enable_if_t<is_number_type<Type>::value, Status> Visit(const Type&) {
- state.reset(new TDigestImpl<Type>(options));
- return Status::OK();
- }
-
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(in_type, this));
- return std::move(state);
- }
-};
-
-Result<std::unique_ptr<KernelState>> TDigestInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- TDigestInitState visitor(ctx, *args.inputs[0].type,
- static_cast<const TDigestOptions&>(*args.options));
- return visitor.Create();
-}
-
-void AddTDigestKernels(KernelInit init,
- const std::vector<std::shared_ptr<DataType>>& types,
- ScalarAggregateFunction* func) {
- for (const auto& ty : types) {
- auto sig = KernelSignature::Make({InputType(ty)}, float64());
- AddAggKernel(std::move(sig), init, func);
- }
-}
-
-const FunctionDoc tdigest_doc{
-    "Approximate quantiles of a numeric array with the T-Digest algorithm",
-    ("By default, the 0.5 quantile (median) is returned.\n"
- "Nulls and NaNs are ignored.\n"
- "An empty array is returned if there is no valid data point."),
- {"array"},
- "TDigestOptions"};
-
-std::shared_ptr<ScalarAggregateFunction> AddTDigestAggKernels() {
- static auto default_tdigest_options = TDigestOptions::Defaults();
- auto func = std::make_shared<ScalarAggregateFunction>(
- "tdigest", Arity::Unary(), &tdigest_doc, &default_tdigest_options);
- AddTDigestKernels(TDigestInit, NumericTypes(), func.get());
- return func;
-}
-
-} // namespace
-
-void RegisterScalarAggregateTDigest(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunction(AddTDigestAggKernels()));
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/tdigest.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using arrow::internal::TDigest;
+using arrow::internal::VisitSetBitRunsVoid;
+
+template <typename ArrowType>
+struct TDigestImpl : public ScalarAggregator {
+ using ThisType = TDigestImpl<ArrowType>;
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using CType = typename ArrowType::c_type;
+
+ explicit TDigestImpl(const TDigestOptions& options)
+ : q{options.q}, tdigest{options.delta, options.buffer_size} {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const ArrayData& data = *batch[0].array();
+ const CType* values = data.GetValues<CType>(1);
+
+ if (data.length > data.GetNullCount()) {
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ this->tdigest.NanAdd(values[pos + i]);
+ }
+ });
+ }
+ } else {
+ const CType value = UnboxScalar<ArrowType>::Unbox(*batch[0].scalar());
+ if (batch[0].scalar()->is_valid) {
+ this->tdigest.NanAdd(value);
+ }
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ auto& other = checked_cast<ThisType&>(src);
+ std::vector<TDigest> other_tdigest;
+ other_tdigest.push_back(std::move(other.tdigest));
+ this->tdigest.Merge(&other_tdigest);
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext* ctx, Datum* out) override {
+ const int64_t out_length = this->tdigest.is_empty() ? 0 : this->q.size();
+ auto out_data = ArrayData::Make(float64(), out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * sizeof(double)));
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ out_buffer[i] = this->tdigest.Quantile(this->q[i]);
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ const std::vector<double>& q;
+ TDigest tdigest;
+};
+
+struct TDigestInitState {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const DataType& in_type;
+ const TDigestOptions& options;
+
+ TDigestInitState(KernelContext* ctx, const DataType& in_type,
+ const TDigestOptions& options)
+ : ctx(ctx), in_type(in_type), options(options) {}
+
+ Status Visit(const DataType&) {
+ return Status::NotImplemented("No tdigest implemented");
+ }
+
+ Status Visit(const HalfFloatType&) {
+ return Status::NotImplemented("No tdigest implemented");
+ }
+
+ template <typename Type>
+ enable_if_t<is_number_type<Type>::value, Status> Visit(const Type&) {
+ state.reset(new TDigestImpl<Type>(options));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ return std::move(state);
+ }
+};
+
+Result<std::unique_ptr<KernelState>> TDigestInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ TDigestInitState visitor(ctx, *args.inputs[0].type,
+ static_cast<const TDigestOptions&>(*args.options));
+ return visitor.Create();
+}
+
+void AddTDigestKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ ScalarAggregateFunction* func) {
+ for (const auto& ty : types) {
+ auto sig = KernelSignature::Make({InputType(ty)}, float64());
+ AddAggKernel(std::move(sig), init, func);
+ }
+}
+
+const FunctionDoc tdigest_doc{
+    "Approximate quantiles of a numeric array with the T-Digest algorithm",
+    ("By default, the 0.5 quantile (median) is returned.\n"
+ "Nulls and NaNs are ignored.\n"
+ "An empty array is returned if there is no valid data point."),
+ {"array"},
+ "TDigestOptions"};
+
+std::shared_ptr<ScalarAggregateFunction> AddTDigestAggKernels() {
+ static auto default_tdigest_options = TDigestOptions::Defaults();
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "tdigest", Arity::Unary(), &tdigest_doc, &default_tdigest_options);
+ AddTDigestKernels(TDigestInit, NumericTypes(), func.get());
+ return func;
+}
+
+} // namespace
+
+void RegisterScalarAggregateTDigest(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(AddTDigestAggKernels()));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
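
The t-digest kernel above follows the standard three-phase aggregate contract: Consume() folds each batch into a local digest, MergeFrom() combines per-thread digests, and Finalize() materializes one float64 per requested q. A usage sketch of the TDigest calls the kernel relies on (leaving delta and buffer_size at their defaults is an assumption here):

    #include <utility>
    #include <vector>
    #include "arrow/util/tdigest.h"

    void TDigestRoundTrip() {
      arrow::internal::TDigest digest;  // assumes defaulted delta / buffer_size
      for (double v : {1.0, 2.0, 3.0, 4.0}) digest.NanAdd(v);  // NaNs are skipped

      arrow::internal::TDigest partial;
      partial.NanAdd(5.0);
      std::vector<arrow::internal::TDigest> others;
      others.push_back(std::move(partial));
      digest.Merge(&others);  // same pattern as MergeFrom() above

      const double median = digest.Quantile(0.5);  // approximate by construction
      (void)median;
    }
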
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
index d6965fed4a3..d879630e697 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
@@ -15,130 +15,130 @@
// specific language governing permissions and limitations
// under the License.
-#include <cmath>
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/int128_internal.h"
-
+#include <cmath>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/int128_internal.h"
+
namespace arrow {
namespace compute {
-namespace internal {
+namespace internal {
namespace {
-using arrow::internal::int128_t;
-using arrow::internal::VisitSetBitRunsVoid;
-
+using arrow::internal::int128_t;
+using arrow::internal::VisitSetBitRunsVoid;
+
template <typename ArrowType>
struct VarStdState {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
- using CType = typename ArrowType::c_type;
+ using CType = typename ArrowType::c_type;
using ThisType = VarStdState<ArrowType>;
-  // float/double/int64: calculate `m2` (sum((X-mean)^2)) with the two-pass algorithm
+  // float/double/int64: calculate `m2` (sum((X-mean)^2)) with the two-pass algorithm
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
- template <typename T = ArrowType>
- enable_if_t<is_floating_type<T>::value || (sizeof(CType) > 4)> Consume(
- const ArrayType& array) {
+ template <typename T = ArrowType>
+ enable_if_t<is_floating_type<T>::value || (sizeof(CType) > 4)> Consume(
+ const ArrayType& array) {
int64_t count = array.length() - array.null_count();
if (count == 0) {
return;
}
- using SumType =
- typename std::conditional<is_floating_type<T>::value, double, int128_t>::type;
- SumType sum =
- arrow::compute::detail::SumArray<CType, SumType, SimdLevel::NONE>(*array.data());
+ using SumType =
+ typename std::conditional<is_floating_type<T>::value, double, int128_t>::type;
+ SumType sum =
+ arrow::compute::detail::SumArray<CType, SumType, SimdLevel::NONE>(*array.data());
- const double mean = static_cast<double>(sum) / count;
- const double m2 = arrow::compute::detail::SumArray<CType, double, SimdLevel::NONE>(
- *array.data(), [mean](CType value) {
- const double v = static_cast<double>(value);
- return (v - mean) * (v - mean);
- });
+ const double mean = static_cast<double>(sum) / count;
+ const double m2 = arrow::compute::detail::SumArray<CType, double, SimdLevel::NONE>(
+ *array.data(), [mean](CType value) {
+ const double v = static_cast<double>(value);
+ return (v - mean) * (v - mean);
+ });
this->count = count;
- this->mean = mean;
+ this->mean = mean;
this->m2 = m2;
}
-  // int32/16/8: textbook one-pass algorithm with integer arithmetic
- template <typename T = ArrowType>
- enable_if_t<is_integer_type<T>::value && (sizeof(CType) <= 4)> Consume(
- const ArrayType& array) {
-    // max number of elements such that the sum will not overflow int64 (2Gi int32 elements)
- // for uint32: 0 <= sum < 2^63 (int64 >= 0)
- // for int32: -2^62 <= sum < 2^62
- constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8);
-
- int64_t start_index = 0;
- int64_t valid_count = array.length() - array.null_count();
-
- while (valid_count > 0) {
-      // process in chunks so that overflow can never happen
- const auto slice = array.Slice(start_index, max_length);
- const int64_t count = slice->length() - slice->null_count();
- start_index += max_length;
- valid_count -= count;
-
- if (count > 0) {
- int64_t sum = 0;
- int128_t square_sum = 0;
- const ArrayData& data = *slice->data();
- const CType* values = data.GetValues<CType>(1);
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- const auto value = values[pos + i];
- sum += value;
- square_sum += static_cast<uint64_t>(value) * value;
- }
- });
-
- const double mean = static_cast<double>(sum) / count;
- // calculate m2 = square_sum - sum * sum / count
-        // decompose `sum * sum / count` into integer and fractional parts
- const int128_t sum_square = static_cast<int128_t>(sum) * sum;
- const int128_t integers = sum_square / count;
- const double fractions = static_cast<double>(sum_square % count) / count;
- const double m2 = static_cast<double>(square_sum - integers) - fractions;
-
- // merge variance
- ThisType state;
- state.count = count;
- state.mean = mean;
- state.m2 = m2;
- this->MergeFrom(state);
- }
- }
- }
-
- // Combine `m2` from two chunks (m2 = n*s2)
- // https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
+  // int32/16/8: textbook one-pass algorithm with integer arithmetic
+ template <typename T = ArrowType>
+ enable_if_t<is_integer_type<T>::value && (sizeof(CType) <= 4)> Consume(
+ const ArrayType& array) {
+    // max number of elements such that the sum will not overflow int64 (2Gi int32 elements)
+ // for uint32: 0 <= sum < 2^63 (int64 >= 0)
+ // for int32: -2^62 <= sum < 2^62
+ constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8);
+
+ int64_t start_index = 0;
+ int64_t valid_count = array.length() - array.null_count();
+
+ while (valid_count > 0) {
+      // process in chunks so that overflow can never happen
+ const auto slice = array.Slice(start_index, max_length);
+ const int64_t count = slice->length() - slice->null_count();
+ start_index += max_length;
+ valid_count -= count;
+
+ if (count > 0) {
+ int64_t sum = 0;
+ int128_t square_sum = 0;
+ const ArrayData& data = *slice->data();
+ const CType* values = data.GetValues<CType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ const auto value = values[pos + i];
+ sum += value;
+ square_sum += static_cast<uint64_t>(value) * value;
+ }
+ });
+
+ const double mean = static_cast<double>(sum) / count;
+ // calculate m2 = square_sum - sum * sum / count
+        // decompose `sum * sum / count` into integer and fractional parts
+ const int128_t sum_square = static_cast<int128_t>(sum) * sum;
+ const int128_t integers = sum_square / count;
+ const double fractions = static_cast<double>(sum_square % count) / count;
+ const double m2 = static_cast<double>(square_sum - integers) - fractions;
+
+ // merge variance
+ ThisType state;
+ state.count = count;
+ state.mean = mean;
+ state.m2 = m2;
+ this->MergeFrom(state);
+ }
+ }
+ }
+
+ // Combine `m2` from two chunks (m2 = n*s2)
+ // https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
void MergeFrom(const ThisType& state) {
if (state.count == 0) {
return;
}
if (this->count == 0) {
this->count = state.count;
- this->mean = state.mean;
+ this->mean = state.mean;
this->m2 = state.m2;
return;
}
- double mean = (this->mean * this->count + state.mean * state.count) /
- (this->count + state.count);
- this->m2 += state.m2 + this->count * (this->mean - mean) * (this->mean - mean) +
- state.count * (state.mean - mean) * (state.mean - mean);
+ double mean = (this->mean * this->count + state.mean * state.count) /
+ (this->count + state.count);
+ this->m2 += state.m2 + this->count * (this->mean - mean) * (this->mean - mean) +
+ state.count * (state.mean - mean) * (state.mean - mean);
this->count += state.count;
- this->mean = mean;
+ this->mean = mean;
}
int64_t count = 0;
- double mean = 0;
- double m2 = 0; // m2 = count*s2 = sum((X-mean)^2)
+ double mean = 0;
+ double m2 = 0; // m2 = count*s2 = sum((X-mean)^2)
};
enum class VarOrStd : bool { Var, Std };
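
The integer path above deserves a gloss: chunking by max_length = 1 << (63 - 8 * sizeof(CType)) guarantees the int64 running sum cannot overflow, and m2 is then recovered exactly by splitting sum * sum / count into an exact integer quotient plus a small fractional remainder before anything is converted to double. A self-contained check of that decomposition (__int128 stands in for arrow::internal::int128_t, an assumption):

    #include <cstdint>

    // m2 = square_sum - sum*sum/count, computed without double rounding error.
    double ExactM2(int64_t sum, __int128 square_sum, int64_t count) {
      const __int128 sum_square = static_cast<__int128>(sum) * sum;
      const __int128 integers = sum_square / count;         // exact integer part
      const double fractions =
          static_cast<double>(sum_square % count) / count;  // small remainder
      return static_cast<double>(square_sum - integers) - fractions;
    }

    // values {1, 2, 3, 4}: sum = 10, square_sum = 30, count = 4
    // -> m2 = 30 - 100/4 = 5.0, which equals sum((x - 2.5)^2)
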
@@ -152,27 +152,27 @@ struct VarStdImpl : public ScalarAggregator {
const VarianceOptions& options, VarOrStd return_type)
: out_type(out_type), options(options), return_type(return_type) {}
- Status Consume(KernelContext*, const ExecBatch& batch) override {
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
ArrayType array(batch[0].array());
this->state.Consume(array);
- return Status::OK();
+ return Status::OK();
}
- Status MergeFrom(KernelContext*, KernelState&& src) override {
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
const auto& other = checked_cast<const ThisType&>(src);
this->state.MergeFrom(other.state);
- return Status::OK();
+ return Status::OK();
}
- Status Finalize(KernelContext*, Datum* out) override {
+ Status Finalize(KernelContext*, Datum* out) override {
if (this->state.count <= options.ddof) {
- out->value = std::make_shared<DoubleScalar>();
+ out->value = std::make_shared<DoubleScalar>();
} else {
double var = this->state.m2 / (this->state.count - options.ddof);
out->value =
- std::make_shared<DoubleScalar>(return_type == VarOrStd::Var ? var : sqrt(var));
+ std::make_shared<DoubleScalar>(return_type == VarOrStd::Var ? var : sqrt(var));
}
- return Status::OK();
+ return Status::OK();
}
std::shared_ptr<DataType> out_type;
@@ -181,34 +181,34 @@ struct VarStdImpl : public ScalarAggregator {
VarOrStd return_type;
};
-struct ScalarVarStdImpl : public ScalarAggregator {
- explicit ScalarVarStdImpl(const VarianceOptions& options)
- : options(options), seen(false) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- seen = batch[0].scalar()->is_valid;
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const ScalarVarStdImpl&>(src);
- seen = seen || other.seen;
- return Status::OK();
- }
-
- Status Finalize(KernelContext*, Datum* out) override {
- if (!seen || options.ddof > 0) {
- out->value = std::make_shared<DoubleScalar>();
- } else {
- out->value = std::make_shared<DoubleScalar>(0.0);
- }
- return Status::OK();
- }
-
- const VarianceOptions options;
- bool seen;
-};
-
+struct ScalarVarStdImpl : public ScalarAggregator {
+ explicit ScalarVarStdImpl(const VarianceOptions& options)
+ : options(options), seen(false) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ seen = batch[0].scalar()->is_valid;
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ScalarVarStdImpl&>(src);
+ seen = seen || other.seen;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (!seen || options.ddof > 0) {
+ out->value = std::make_shared<DoubleScalar>();
+ } else {
+ out->value = std::make_shared<DoubleScalar>(0.0);
+ }
+ return Status::OK();
+ }
+
+ const VarianceOptions options;
+ bool seen;
+};
+
struct VarStdInitState {
std::unique_ptr<KernelState> state;
KernelContext* ctx;
@@ -240,87 +240,87 @@ struct VarStdInitState {
return Status::OK();
}
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
return std::move(state);
}
};
-Result<std::unique_ptr<KernelState>> StddevInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> StddevInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
VarStdInitState visitor(
ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
static_cast<const VarianceOptions&>(*args.options), VarOrStd::Std);
return visitor.Create();
}
-Result<std::unique_ptr<KernelState>> VarianceInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> VarianceInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
VarStdInitState visitor(
ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
static_cast<const VarianceOptions&>(*args.options), VarOrStd::Var);
return visitor.Create();
}
-Result<std::unique_ptr<KernelState>> ScalarVarStdInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- return arrow::internal::make_unique<ScalarVarStdImpl>(
- static_cast<const VarianceOptions&>(*args.options));
-}
-
+Result<std::unique_ptr<KernelState>> ScalarVarStdInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ return arrow::internal::make_unique<ScalarVarStdImpl>(
+ static_cast<const VarianceOptions&>(*args.options));
+}
+
void AddVarStdKernels(KernelInit init,
const std::vector<std::shared_ptr<DataType>>& types,
ScalarAggregateFunction* func) {
for (const auto& ty : types) {
auto sig = KernelSignature::Make({InputType::Array(ty)}, float64());
AddAggKernel(std::move(sig), init, func);
-
- sig = KernelSignature::Make({InputType::Scalar(ty)}, float64());
- AddAggKernel(std::move(sig), ScalarVarStdInit, func);
+
+ sig = KernelSignature::Make({InputType::Scalar(ty)}, float64());
+ AddAggKernel(std::move(sig), ScalarVarStdInit, func);
}
}
-const FunctionDoc stddev_doc{
- "Calculate the standard deviation of a numeric array",
- ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
- "By default (`ddof` = 0), the population standard deviation is calculated.\n"
- "Nulls are ignored. If there are not enough non-null values in the array\n"
- "to satisfy `ddof`, null is returned."),
- {"array"},
- "VarianceOptions"};
-
-const FunctionDoc variance_doc{
- "Calculate the variance of a numeric array",
- ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
- "By default (`ddof` = 0), the population variance is calculated.\n"
- "Nulls are ignored. If there are not enough non-null values in the array\n"
- "to satisfy `ddof`, null is returned."),
- {"array"},
- "VarianceOptions"};
-
+const FunctionDoc stddev_doc{
+ "Calculate the standard deviation of a numeric array",
+ ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
+ "By default (`ddof` = 0), the population standard deviation is calculated.\n"
+ "Nulls are ignored. If there are not enough non-null values in the array\n"
+ "to satisfy `ddof`, null is returned."),
+ {"array"},
+ "VarianceOptions"};
+
+const FunctionDoc variance_doc{
+ "Calculate the variance of a numeric array",
+ ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
+ "By default (`ddof` = 0), the population variance is calculated.\n"
+ "Nulls are ignored. If there are not enough non-null values in the array\n"
+ "to satisfy `ddof`, null is returned."),
+ {"array"},
+ "VarianceOptions"};
+
std::shared_ptr<ScalarAggregateFunction> AddStddevAggKernels() {
static auto default_std_options = VarianceOptions::Defaults();
- auto func = std::make_shared<ScalarAggregateFunction>(
- "stddev", Arity::Unary(), &stddev_doc, &default_std_options);
- AddVarStdKernels(StddevInit, NumericTypes(), func.get());
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "stddev", Arity::Unary(), &stddev_doc, &default_std_options);
+ AddVarStdKernels(StddevInit, NumericTypes(), func.get());
return func;
}
std::shared_ptr<ScalarAggregateFunction> AddVarianceAggKernels() {
static auto default_var_options = VarianceOptions::Defaults();
- auto func = std::make_shared<ScalarAggregateFunction>(
- "variance", Arity::Unary(), &variance_doc, &default_var_options);
- AddVarStdKernels(VarianceInit, NumericTypes(), func.get());
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "variance", Arity::Unary(), &variance_doc, &default_var_options);
+ AddVarStdKernels(VarianceInit, NumericTypes(), func.get());
return func;
}
-} // namespace
-
-void RegisterScalarAggregateVariance(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunction(AddVarianceAggKernels()));
- DCHECK_OK(registry->AddFunction(AddStddevAggKernels()));
-}
-
-} // namespace internal
+} // namespace
+
+void RegisterScalarAggregateVariance(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(AddVarianceAggKernels()));
+ DCHECK_OK(registry->AddFunction(AddStddevAggKernels()));
+}
+
+} // namespace internal
} // namespace compute
} // namespace arrow
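
The MergeFrom() rule in this file is the textbook combined-variance identity: the merged mean is the count-weighted mean, and each chunk's m2 gains a correction term for the shift of its chunk mean toward the combined mean. Finalize() then divides m2 by (count - ddof), yielding a null scalar when count <= ddof, as the doc strings state. A standalone check of the merge rule (the names here are illustrative):

    #include <cstdint>

    struct Chunk {
      int64_t count;
      double mean;
      double m2;  // m2 = count * s2 = sum((X - mean)^2)
    };

    Chunk Merge(const Chunk& a, const Chunk& b) {
      const int64_t n = a.count + b.count;
      const double mean = (a.mean * a.count + b.mean * b.count) / n;
      const double m2 = a.m2 + b.m2 +
                        a.count * (a.mean - mean) * (a.mean - mean) +
                        b.count * (b.mean - mean) * (b.mean - mean);
      return Chunk{n, mean, m2};
    }

    // {1, 2} and {3, 4}: chunk means 1.5 and 3.5, each chunk m2 = 0.5;
    // merged: mean = 2.5, m2 = 0.5 + 0.5 + 2*1.0 + 2*1.0 = 5.0,
    // so the population variance (ddof = 0) is 5.0 / 4 = 1.25.
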
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
index bab8e7000cd..7133b175472 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -28,15 +28,15 @@ namespace arrow {
namespace compute {
namespace internal {
-Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return Status::NotImplemented("This kernel is malformed");
+Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::NotImplemented("This kernel is malformed");
}
ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) {
return [exec](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ExecBatch flipped_batch = batch;
std::swap(flipped_batch.values[0], flipped_batch.values[1]);
- return exec(ctx, flipped_batch, out);
+ return exec(ctx, flipped_batch, out);
};
}
@@ -48,7 +48,7 @@ std::vector<std::shared_ptr<DataType>> g_numeric_types;
std::vector<std::shared_ptr<DataType>> g_base_binary_types;
std::vector<std::shared_ptr<DataType>> g_temporal_types;
std::vector<std::shared_ptr<DataType>> g_primitive_types;
-std::vector<Type::type> g_decimal_type_ids;
+std::vector<Type::type> g_decimal_type_ids;
static std::once_flag codegen_static_initialized;
template <typename T>
@@ -72,9 +72,9 @@ static void InitStaticData() {
// Floating point types
g_floating_types = {float32(), float64()};
- // Decimal types
- g_decimal_type_ids = {Type::DECIMAL128, Type::DECIMAL256};
-
+ // Decimal types
+ g_decimal_type_ids = {Type::DECIMAL128, Type::DECIMAL256};
+
// Numeric types
Extend(g_int_types, &g_numeric_types);
Extend(g_floating_types, &g_numeric_types);
@@ -136,11 +136,11 @@ const std::vector<std::shared_ptr<DataType>>& FloatingPointTypes() {
return g_floating_types;
}
-const std::vector<Type::type>& DecimalTypeIds() {
- std::call_once(codegen_static_initialized, InitStaticData);
- return g_decimal_type_ids;
-}
-
+const std::vector<Type::type>& DecimalTypeIds() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_decimal_type_ids;
+}
+
const std::vector<TimeUnit::type>& AllTimeUnits() {
static std::vector<TimeUnit::type> units = {TimeUnit::SECOND, TimeUnit::MILLI,
TimeUnit::MICRO, TimeUnit::NANO};
@@ -164,7 +164,7 @@ const std::vector<std::shared_ptr<DataType>>& PrimitiveTypes() {
const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes() {
static DataTypeVector example_parametric_types = {
- decimal128(12, 2),
+ decimal128(12, 2),
duration(TimeUnit::SECOND),
timestamp(TimeUnit::SECOND),
time32(TimeUnit::SECOND),
@@ -185,153 +185,153 @@ const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes() {
// work above
Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& descrs) {
- ValueDescr result = descrs.front();
- result.shape = GetBroadcastShape(descrs);
- return result;
-}
-
-void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs) {
- for (ValueDescr& descr : *descrs) {
- if (descr.type->id() == Type::DICTIONARY) {
- descr.type = checked_cast<const DictionaryType&>(*descr.type).value_type();
- }
- }
-}
-
-void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs) {
- DCHECK_EQ(descrs->size(), 2);
-
- if (descrs->at(0).type->id() == Type::NA) {
- descrs->at(0).type = descrs->at(1).type;
- return;
- }
-
- if (descrs->at(1).type->id() == Type::NA) {
- descrs->at(1).type = descrs->at(0).type;
- return;
- }
-}
-
-void ReplaceTypes(const std::shared_ptr<DataType>& type,
- std::vector<ValueDescr>* descrs) {
- for (auto& descr : *descrs) {
- descr.type = type;
- }
-}
-
-std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs) {
- return CommonNumeric(descrs.data(), descrs.size());
-}
-
-std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count) {
- DCHECK_GT(count, 0) << "tried to find CommonNumeric type of an empty set";
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- auto id = descr.type->id();
- if (!is_floating(id) && !is_integer(id)) {
- // a common numeric type is only possible if all types are numeric
- return nullptr;
- }
- if (id == Type::HALF_FLOAT) {
- // float16 arithmetic is not currently supported
- return nullptr;
- }
- }
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- if (descr.type->id() == Type::DOUBLE) return float64();
- }
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- if (descr.type->id() == Type::FLOAT) return float32();
- }
-
- int max_width_signed = 0, max_width_unsigned = 0;
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- auto id = descr.type->id();
- auto max_width = &(is_signed_integer(id) ? max_width_signed : max_width_unsigned);
- *max_width = std::max(bit_width(id), *max_width);
- }
-
- if (max_width_signed == 0) {
- if (max_width_unsigned >= 64) return uint64();
- if (max_width_unsigned == 32) return uint32();
- if (max_width_unsigned == 16) return uint16();
- DCHECK_EQ(max_width_unsigned, 8);
- return uint8();
- }
-
- if (max_width_signed <= max_width_unsigned) {
- max_width_signed = static_cast<int>(BitUtil::NextPower2(max_width_unsigned + 1));
- }
-
- if (max_width_signed >= 64) return int64();
- if (max_width_signed == 32) return int32();
- if (max_width_signed == 16) return int16();
- DCHECK_EQ(max_width_signed, 8);
- return int8();
-}
-
-std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs) {
- TimeUnit::type finest_unit = TimeUnit::SECOND;
-
- for (const auto& descr : descrs) {
- auto id = descr.type->id();
-    // a common timestamp is only possible if all types are timestamp-like
- switch (id) {
- case Type::DATE32:
- case Type::DATE64:
- continue;
- case Type::TIMESTAMP:
- finest_unit =
- std::max(finest_unit, checked_cast<const TimestampType&>(*descr.type).unit());
- continue;
- default:
- return nullptr;
- }
- }
-
- return timestamp(finest_unit);
-}
-
-std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs) {
- bool all_utf8 = true, all_offset32 = true;
-
- for (const auto& descr : descrs) {
- auto id = descr.type->id();
-    // a common varbinary type is only possible if all types are binary-like
- switch (id) {
- case Type::STRING:
- continue;
- case Type::BINARY:
- all_utf8 = false;
- continue;
- case Type::LARGE_STRING:
- all_offset32 = false;
- continue;
- case Type::LARGE_BINARY:
- all_offset32 = false;
- all_utf8 = false;
- continue;
- default:
- return nullptr;
- }
- }
-
- if (all_utf8) {
- if (all_offset32) return utf8();
- return large_utf8();
- }
-
- if (all_offset32) return binary();
- return large_binary();
+ ValueDescr result = descrs.front();
+ result.shape = GetBroadcastShape(descrs);
+ return result;
}
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs) {
+ for (ValueDescr& descr : *descrs) {
+ if (descr.type->id() == Type::DICTIONARY) {
+ descr.type = checked_cast<const DictionaryType&>(*descr.type).value_type();
+ }
+ }
+}
+
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs) {
+ DCHECK_EQ(descrs->size(), 2);
+
+ if (descrs->at(0).type->id() == Type::NA) {
+ descrs->at(0).type = descrs->at(1).type;
+ return;
+ }
+
+ if (descrs->at(1).type->id() == Type::NA) {
+ descrs->at(1).type = descrs->at(0).type;
+ return;
+ }
+}
+
+void ReplaceTypes(const std::shared_ptr<DataType>& type,
+ std::vector<ValueDescr>* descrs) {
+ for (auto& descr : *descrs) {
+ descr.type = type;
+ }
+}
+
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs) {
+ return CommonNumeric(descrs.data(), descrs.size());
+}
+
+std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count) {
+ DCHECK_GT(count, 0) << "tried to find CommonNumeric type of an empty set";
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ auto id = descr.type->id();
+ if (!is_floating(id) && !is_integer(id)) {
+ // a common numeric type is only possible if all types are numeric
+ return nullptr;
+ }
+ if (id == Type::HALF_FLOAT) {
+ // float16 arithmetic is not currently supported
+ return nullptr;
+ }
+ }
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ if (descr.type->id() == Type::DOUBLE) return float64();
+ }
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ if (descr.type->id() == Type::FLOAT) return float32();
+ }
+
+ int max_width_signed = 0, max_width_unsigned = 0;
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ auto id = descr.type->id();
+ auto max_width = &(is_signed_integer(id) ? max_width_signed : max_width_unsigned);
+ *max_width = std::max(bit_width(id), *max_width);
+ }
+
+ if (max_width_signed == 0) {
+ if (max_width_unsigned >= 64) return uint64();
+ if (max_width_unsigned == 32) return uint32();
+ if (max_width_unsigned == 16) return uint16();
+ DCHECK_EQ(max_width_unsigned, 8);
+ return uint8();
+ }
+
+ if (max_width_signed <= max_width_unsigned) {
+ max_width_signed = static_cast<int>(BitUtil::NextPower2(max_width_unsigned + 1));
+ }
+
+ if (max_width_signed >= 64) return int64();
+ if (max_width_signed == 32) return int32();
+ if (max_width_signed == 16) return int16();
+ DCHECK_EQ(max_width_signed, 8);
+ return int8();
+}
+
+std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs) {
+ TimeUnit::type finest_unit = TimeUnit::SECOND;
+
+ for (const auto& descr : descrs) {
+ auto id = descr.type->id();
+    // a common timestamp is only possible if all types are timestamp-like
+ switch (id) {
+ case Type::DATE32:
+ case Type::DATE64:
+ continue;
+ case Type::TIMESTAMP:
+ finest_unit =
+ std::max(finest_unit, checked_cast<const TimestampType&>(*descr.type).unit());
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ return timestamp(finest_unit);
+}
+
+std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs) {
+ bool all_utf8 = true, all_offset32 = true;
+
+ for (const auto& descr : descrs) {
+ auto id = descr.type->id();
+    // a common varbinary type is only possible if all types are binary-like
+ switch (id) {
+ case Type::STRING:
+ continue;
+ case Type::BINARY:
+ all_utf8 = false;
+ continue;
+ case Type::LARGE_STRING:
+ all_offset32 = false;
+ continue;
+ case Type::LARGE_BINARY:
+ all_offset32 = false;
+ all_utf8 = false;
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ if (all_utf8) {
+ if (all_offset32) return utf8();
+ return large_utf8();
+ }
+
+ if (all_offset32) return binary();
+ return large_binary();
+}
+
} // namespace internal
} // namespace compute
} // namespace arrow
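
CommonNumeric above encodes the implicit promotion rules for arithmetic kernels: any double input makes the result float64, any float makes it float32, all-unsigned inputs keep the widest unsigned width, and a signed/unsigned mix widens to the next signed type that can represent the widest unsigned input, capped at 64 bits. A sketch of the expected results (assuming this internal header is visible to the caller):

    #include <cassert>
    #include <memory>
    #include "arrow/compute/kernels/codegen_internal.h"

    void CheckCommonNumeric() {
      using arrow::compute::ValueDescr;
      using arrow::compute::internal::CommonNumeric;
      auto common = [](std::shared_ptr<arrow::DataType> a,
                       std::shared_ptr<arrow::DataType> b) {
        return CommonNumeric({ValueDescr::Array(a), ValueDescr::Array(b)});
      };
      assert(common(arrow::int32(), arrow::float64())->Equals(*arrow::float64()));
      assert(common(arrow::uint8(), arrow::uint16())->Equals(*arrow::uint16()));
      assert(common(arrow::int8(), arrow::uint8())->Equals(*arrow::int16()));
      assert(common(arrow::int32(), arrow::uint32())->Equals(*arrow::int64()));
      assert(common(arrow::int64(), arrow::uint64())->Equals(*arrow::int64()));
      assert(common(arrow::utf8(), arrow::int32()) == nullptr);  // not numeric
    }
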
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
index cb9b13bb3d7..c1950a2b11a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -18,7 +18,7 @@
#pragma once
#include <cstdint>
-#include <cstring>
+#include <cstring>
#include <memory>
#include <string>
#include <utility>
@@ -71,14 +71,14 @@ template <typename OptionsType>
struct OptionsWrapper : public KernelState {
explicit OptionsWrapper(OptionsType options) : options(std::move(options)) {}
- static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
- const KernelInitArgs& args) {
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
if (auto options = static_cast<const OptionsType*>(args.options)) {
return ::arrow::internal::make_unique<OptionsWrapper>(*options);
}
- return Status::Invalid(
- "Attempted to initialize KernelState from null FunctionOptions");
+ return Status::Invalid(
+ "Attempted to initialize KernelState from null FunctionOptions");
}
static const OptionsType& Get(const KernelState& state) {
@@ -90,34 +90,34 @@ struct OptionsWrapper : public KernelState {
OptionsType options;
};
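
OptionsWrapper above is the adapter for kernels whose only state is a copy of their FunctionOptions: Init() snapshots the options into the KernelState, and Get() recovers them inside the exec function. A hedged usage sketch (RoundOptions and the registration lines are invented for illustration, and a default-constructible FunctionOptions base is assumed):

    // Hypothetical options type; not part of Arrow.
    struct RoundOptions : public arrow::compute::FunctionOptions {
      int ndigits = 0;
    };

    using RoundState = arrow::compute::internal::OptionsWrapper<RoundOptions>;

    // At kernel registration:      kernel.init = RoundState::Init;
    // Inside the kernel exec body: const RoundOptions& opts = RoundState::Get(ctx);
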
-/// KernelState adapter for when the state is an instance constructed with the
-/// KernelContext and the FunctionOptions as argument
-template <typename StateType, typename OptionsType>
-struct KernelStateFromFunctionOptions : public KernelState {
- explicit KernelStateFromFunctionOptions(KernelContext* ctx, OptionsType state)
- : state(StateType(ctx, std::move(state))) {}
-
- static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
- const KernelInitArgs& args) {
- if (auto options = static_cast<const OptionsType*>(args.options)) {
- return ::arrow::internal::make_unique<KernelStateFromFunctionOptions>(ctx,
- *options);
- }
-
- return Status::Invalid(
- "Attempted to initialize KernelState from null FunctionOptions");
- }
-
- static const StateType& Get(const KernelState& state) {
- return ::arrow::internal::checked_cast<const KernelStateFromFunctionOptions&>(state)
- .state;
- }
-
- static const StateType& Get(KernelContext* ctx) { return Get(*ctx->state()); }
-
- StateType state;
-};
-
+/// KernelState adapter for when the state is an instance constructed with the
+/// KernelContext and the FunctionOptions as argument
+template <typename StateType, typename OptionsType>
+struct KernelStateFromFunctionOptions : public KernelState {
+ explicit KernelStateFromFunctionOptions(KernelContext* ctx, OptionsType state)
+ : state(StateType(ctx, std::move(state))) {}
+
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ if (auto options = static_cast<const OptionsType*>(args.options)) {
+ return ::arrow::internal::make_unique<KernelStateFromFunctionOptions>(ctx,
+ *options);
+ }
+
+ return Status::Invalid(
+ "Attempted to initialize KernelState from null FunctionOptions");
+ }
+
+ static const StateType& Get(const KernelState& state) {
+ return ::arrow::internal::checked_cast<const KernelStateFromFunctionOptions&>(state)
+ .state;
+ }
+
+ static const StateType& Get(KernelContext* ctx) { return Get(*ctx->state()); }
+
+ StateType state;
+};
+
// ----------------------------------------------------------------------
// Input and output value type definitions
@@ -149,22 +149,22 @@ struct GetViewType<Decimal128Type> {
static T LogicalValue(PhysicalType value) {
return Decimal128(reinterpret_cast<const uint8_t*>(value.data()));
}
-
- static T LogicalValue(T value) { return value; }
-};
-
-template <>
-struct GetViewType<Decimal256Type> {
- using T = Decimal256;
- using PhysicalType = util::string_view;
-
- static T LogicalValue(PhysicalType value) {
- return Decimal256(reinterpret_cast<const uint8_t*>(value.data()));
- }
-
- static T LogicalValue(T value) { return value; }
+
+ static T LogicalValue(T value) { return value; }
};
+template <>
+struct GetViewType<Decimal256Type> {
+ using T = Decimal256;
+ using PhysicalType = util::string_view;
+
+ static T LogicalValue(PhysicalType value) {
+ return Decimal256(reinterpret_cast<const uint8_t*>(value.data()));
+ }
+
+ static T LogicalValue(T value) { return value; }
+};
+
template <typename Type, typename Enable = void>
struct GetOutputType;
@@ -183,11 +183,11 @@ struct GetOutputType<Decimal128Type> {
using T = Decimal128;
};
-template <>
-struct GetOutputType<Decimal256Type> {
- using T = Decimal256;
-};
-
+template <>
+struct GetOutputType<Decimal256Type> {
+ using T = Decimal256;
+};
+
// ----------------------------------------------------------------------
// Iteration / value access utilities
@@ -247,18 +247,18 @@ struct ArrayIterator<Type, enable_if_base_binary<Type>> {
}
};
-template <typename Type>
-struct ArrayIterator<Type, enable_if_decimal<Type>> {
- using T = typename TypeTraits<Type>::ScalarType::ValueType;
- using endian_agnostic = std::array<uint8_t, sizeof(T)>;
- const endian_agnostic* values;
-
- explicit ArrayIterator(const ArrayData& data)
- : values(data.GetValues<endian_agnostic>(1)) {}
-
- T operator()() { return T{values++->data()}; }
-};
-
+template <typename Type>
+struct ArrayIterator<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+ const endian_agnostic* values;
+
+ explicit ArrayIterator(const ArrayData& data)
+ : values(data.GetValues<endian_agnostic>(1)) {}
+
+ T operator()() { return T{values++->data()}; }
+};
+
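
The endian_agnostic alias above lets the iterator walk the values buffer as raw sizeof(T)-byte blocks, constructing a Decimal128/Decimal256 from its byte pointer only when an element is actually consumed. A reduced sketch of the same pointer-walking pattern, with Value16 standing in for a 16-byte decimal type (an assumption):

    #include <algorithm>
    #include <array>
    #include <cstdint>

    struct Value16 {  // stand-in for a value type constructible from raw bytes
      std::array<uint8_t, 16> bytes;
      explicit Value16(const uint8_t* data) {
        std::copy(data, data + 16, bytes.begin());
      }
    };

    struct Iter16 {
      using endian_agnostic = std::array<uint8_t, 16>;
      const endian_agnostic* values;
      explicit Iter16(const endian_agnostic* buffer) : values(buffer) {}
      Value16 operator()() { return Value16{values++->data()}; }
    };
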
// Iterator over various output array types, taking a GetOutputType<Type>
template <typename Type, typename Enable = void>
@@ -276,26 +276,26 @@ struct OutputArrayWriter<Type, enable_if_has_c_type_not_boolean<Type>> {
// Note that this doesn't write the null bitmap, which should be consistent
// with Write / WriteNull calls
void WriteNull() { *values++ = T{}; }
-
- void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
-};
-
-template <typename Type>
-struct OutputArrayWriter<Type, enable_if_decimal<Type>> {
- using T = typename TypeTraits<Type>::ScalarType::ValueType;
- using endian_agnostic = std::array<uint8_t, sizeof(T)>;
- endian_agnostic* values;
-
- explicit OutputArrayWriter(ArrayData* data)
- : values(data->GetMutableValues<endian_agnostic>(1)) {}
-
- void Write(T value) { value.ToBytes(values++->data()); }
-
- void WriteNull() { T{}.ToBytes(values++->data()); }
-
- void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
+
+ void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
};
+template <typename Type>
+struct OutputArrayWriter<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+ endian_agnostic* values;
+
+ explicit OutputArrayWriter(ArrayData* data)
+ : values(data->GetMutableValues<endian_agnostic>(1)) {}
+
+ void Write(T value) { value.ToBytes(values++->data()); }
+
+ void WriteNull() { T{}.ToBytes(values++->data()); }
+
+ void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
+};
+
// (Un)box Scalar to / from C++ value
template <typename Type, typename Enable = void>
@@ -311,9 +311,9 @@ struct UnboxScalar<Type, enable_if_has_c_type<Type>> {
};
template <typename Type>
-struct UnboxScalar<Type, enable_if_has_string_view<Type>> {
+struct UnboxScalar<Type, enable_if_has_string_view<Type>> {
static util::string_view Unbox(const Scalar& val) {
- if (!val.is_valid) return util::string_view();
+ if (!val.is_valid) return util::string_view();
return util::string_view(*checked_cast<const BaseBinaryScalar&>(val).value);
}
};
@@ -325,25 +325,25 @@ struct UnboxScalar<Decimal128Type> {
}
};
-template <>
-struct UnboxScalar<Decimal256Type> {
- static Decimal256 Unbox(const Scalar& val) {
- return checked_cast<const Decimal256Scalar&>(val).value;
- }
-};
-
+template <>
+struct UnboxScalar<Decimal256Type> {
+ static Decimal256 Unbox(const Scalar& val) {
+ return checked_cast<const Decimal256Scalar&>(val).value;
+ }
+};
+
template <typename Type, typename Enable = void>
struct BoxScalar;
template <typename Type>
struct BoxScalar<Type, enable_if_has_c_type<Type>> {
using T = typename GetOutputType<Type>::T;
- static void Box(T val, Scalar* out) {
-    // Enables BoxScalar<Int64Type> to work on, for example, a Time64Scalar
- T* mutable_data = reinterpret_cast<T*>(
- checked_cast<::arrow::internal::PrimitiveScalarBase*>(out)->mutable_data());
- *mutable_data = val;
- }
+ static void Box(T val, Scalar* out) {
+    // Enables BoxScalar<Int64Type> to work on, for example, a Time64Scalar
+ T* mutable_data = reinterpret_cast<T*>(
+ checked_cast<::arrow::internal::PrimitiveScalarBase*>(out)->mutable_data());
+ *mutable_data = val;
+ }
};
template <typename Type>
@@ -362,20 +362,20 @@ struct BoxScalar<Decimal128Type> {
static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
};
-template <>
-struct BoxScalar<Decimal256Type> {
- using T = Decimal256;
- using ScalarType = Decimal256Scalar;
- static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
-};
-
+template <>
+struct BoxScalar<Decimal256Type> {
+ using T = Decimal256;
+ using ScalarType = Decimal256Scalar;
+ static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
+};
+
// A VisitArrayDataInline variant that calls its visitor function with logical
// values, such as Decimal128 rather than util::string_view.
template <typename T, typename VisitFunc, typename NullFunc>
-static typename arrow::internal::call_traits::enable_if_return<VisitFunc, void>::type
-VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
- NullFunc&& null_func) {
+static typename arrow::internal::call_traits::enable_if_return<VisitFunc, void>::type
+VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
+ NullFunc&& null_func) {
VisitArrayDataInline<T>(
arr,
[&](typename GetViewType<T>::PhysicalType v) {
@@ -384,18 +384,18 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
std::forward<NullFunc>(null_func));
}
-template <typename T, typename VisitFunc, typename NullFunc>
-static typename arrow::internal::call_traits::enable_if_return<VisitFunc, Status>::type
-VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
- NullFunc&& null_func) {
- return VisitArrayDataInline<T>(
- arr,
- [&](typename GetViewType<T>::PhysicalType v) {
- return valid_func(GetViewType<T>::LogicalValue(std::move(v)));
- },
- std::forward<NullFunc>(null_func));
-}
-
+template <typename T, typename VisitFunc, typename NullFunc>
+static typename arrow::internal::call_traits::enable_if_return<VisitFunc, Status>::type
+VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
+ NullFunc&& null_func) {
+ return VisitArrayDataInline<T>(
+ arr,
+ [&](typename GetViewType<T>::PhysicalType v) {
+ return valid_func(GetViewType<T>::LogicalValue(std::move(v)));
+ },
+ std::forward<NullFunc>(null_func));
+}
+
// Like VisitArrayValuesInline, but for binary functions.
template <typename Arg0Type, typename Arg1Type, typename VisitFunc, typename NullFunc>
@@ -425,7 +425,7 @@ Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& desc
// ----------------------------------------------------------------------
// Generate an array kernel given template classes
-Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out);
ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec);
@@ -439,7 +439,7 @@ const std::vector<std::shared_ptr<DataType>>& SignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& IntTypes();
const std::vector<std::shared_ptr<DataType>>& FloatingPointTypes();
-const std::vector<Type::type>& DecimalTypeIds();
+const std::vector<Type::type>& DecimalTypeIds();
ARROW_EXPORT
const std::vector<TimeUnit::type>& AllTimeUnits();
@@ -483,16 +483,16 @@ namespace applicator {
//
// Operator must implement
//
-// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out)
-// static Status Call(KernelContext*, const Scalar& in, Scalar* out)
+// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& in, Scalar* out)
template <typename Operator>
-static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::SCALAR) {
- return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get());
+ return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get());
} else if (batch.length > 0) {
- return Operator::Call(ctx, *batch[0].array(), out->mutable_array());
+ return Operator::Call(ctx, *batch[0].array(), out->mutable_array());
}
- return Status::OK();
+ return Status::OK();
}
// Generate an ArrayKernelExec given a functor that handles all of its own
@@ -500,34 +500,34 @@ static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out
//
// Operator must implement
//
-// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1,
-// ArrayData* out)
-// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1,
-// ArrayData* out)
-// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1,
-// ArrayData* out)
-// static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1,
-// Scalar* out)
+// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1,
+// Scalar* out)
template <typename Operator>
-static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (batch.length == 0) return Status::OK();
-
- if (batch[0].kind() == Datum::ARRAY) {
- if (batch[1].kind() == Datum::ARRAY) {
- return Operator::Call(ctx, *batch[0].array(), *batch[1].array(),
- out->mutable_array());
- } else {
- return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(),
- out->mutable_array());
- }
- } else {
- if (batch[1].kind() == Datum::ARRAY) {
- return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(),
- out->mutable_array());
- } else {
- return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(),
- out->scalar().get());
- }
+static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch.length == 0) return Status::OK();
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return Operator::Call(ctx, *batch[0].array(), *batch[1].array(),
+ out->mutable_array());
+ } else {
+ return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ out->mutable_array());
+ }
+ } else {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(),
+ out->mutable_array());
+ } else {
+ return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(),
+ out->scalar().get());
+ }
}
}
@@ -541,53 +541,53 @@ struct OutputAdapter;
template <typename Type>
struct OutputAdapter<Type, enable_if_boolean<Type>> {
template <typename Generator>
- static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
ArrayData* out_arr = out->mutable_array();
auto out_bitmap = out_arr->buffers[1]->mutable_data();
GenerateBitsUnrolled(out_bitmap, out_arr->offset, out_arr->length,
std::forward<Generator>(generator));
- return Status::OK();
+ return Status::OK();
}
};
template <typename Type>
struct OutputAdapter<Type, enable_if_has_c_type_not_boolean<Type>> {
template <typename Generator>
- static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
ArrayData* out_arr = out->mutable_array();
auto out_data = out_arr->GetMutableValues<typename Type::c_type>(1);
// TODO: Is this as fast as a more explicitly inlined function?
for (int64_t i = 0; i < out_arr->length; ++i) {
*out_data++ = generator();
}
- return Status::OK();
+ return Status::OK();
}
};
template <typename Type>
struct OutputAdapter<Type, enable_if_base_binary<Type>> {
template <typename Generator>
- static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) {
- return Status::NotImplemented("NYI");
- }
-};
-
-template <typename Type>
-struct OutputAdapter<Type, enable_if_decimal<Type>> {
- using T = typename TypeTraits<Type>::ScalarType::ValueType;
- using endian_agnostic = std::array<uint8_t, sizeof(T)>;
-
- template <typename Generator>
- static Status Write(KernelContext*, Datum* out, Generator&& generator) {
- ArrayData* out_arr = out->mutable_array();
- auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
- for (int64_t i = 0; i < out_arr->length; ++i) {
- generator().ToBytes(out_data++->data());
- }
- return Status::OK();
+ static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) {
+ return Status::NotImplemented("NYI");
}
};
+template <typename Type>
+struct OutputAdapter<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+
+ template <typename Generator>
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ ArrayData* out_arr = out->mutable_array();
+ auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
+ for (int64_t i = 0; i < out_arr->length; ++i) {
+ generator().ToBytes(out_data++->data());
+ }
+ return Status::OK();
+ }
+};
+
// A kernel exec generator for unary functions that addresses both array and
// scalar inputs and dispatches input iteration and output writing to other
// templates
@@ -600,10 +600,10 @@ struct OutputAdapter<Type, enable_if_decimal<Type>> {
//
// struct Op {
// template <typename OutValue, typename Arg0Value>
-// static OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) {
+// static OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) {
// // implementation
-//     // NOTE: "status" should only be populated with errors,
-//     // leave it unmodified to indicate Status::OK()
+//     // NOTE: "status" should only be populated with errors,
+//     // leave it unmodified to indicate Status::OK()
// }
// };
template <typename OutType, typename Arg0Type, typename Op>
@@ -611,34 +611,34 @@ struct ScalarUnary {
using OutValue = typename GetOutputType<OutType>::T;
using Arg0Value = typename GetViewType<Arg0Type>::T;
- static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) {
- Status st = Status::OK();
+ static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) {
+ Status st = Status::OK();
ArrayIterator<Arg0Type> arg0_it(arg0);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value>(ctx, arg0_it(), &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value>(ctx, arg0_it(), &st);
+ }));
+ return st;
}
- static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
- Status st = Status::OK();
- Scalar* out_scalar = out->scalar().get();
+ static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
+ Status st = Status::OK();
+ Scalar* out_scalar = out->scalar().get();
if (arg0.is_valid) {
Arg0Value arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
- out_scalar->is_valid = true;
- BoxScalar<OutType>::Box(Op::template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
- out_scalar);
+ out_scalar->is_valid = true;
+ BoxScalar<OutType>::Box(Op::template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
+ out_scalar);
} else {
- out_scalar->is_valid = false;
+ out_scalar->is_valid = false;
}
- return st;
+ return st;
}
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
- return ExecArray(ctx, *batch[0].array(), out);
+ return ExecArray(ctx, *batch[0].array(), out);
} else {
- return ExecScalar(ctx, *batch[0].scalar(), out);
+ return ExecScalar(ctx, *batch[0].scalar(), out);
}
}
};
@@ -658,69 +658,69 @@ struct ScalarUnaryNotNullStateful {
template <typename Type, typename Enable = void>
struct ArrayExec {
- static Status Exec(const ThisType& functor, KernelContext* ctx,
- const ExecBatch& batch, Datum* out) {
+ static Status Exec(const ThisType& functor, KernelContext* ctx,
+ const ExecBatch& batch, Datum* out) {
ARROW_LOG(FATAL) << "Missing ArrayExec specialization for output type "
<< out->type();
- return Status::NotImplemented("NYI");
+ return Status::NotImplemented("NYI");
}
};
template <typename Type>
struct ArrayExec<
Type, enable_if_t<has_c_type<Type>::value && !is_boolean_type<Type>::value>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
- Status st = Status::OK();
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
ArrayData* out_arr = out->mutable_array();
auto out_data = out_arr->GetMutableValues<OutValue>(1);
VisitArrayValuesInline<Arg0Type>(
arg0,
[&](Arg0Value v) {
- *out_data++ = functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st);
+ *out_data++ = functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st);
},
[&]() {
// null
- *out_data++ = OutValue{};
+ *out_data++ = OutValue{};
});
- return st;
+ return st;
}
};
template <typename Type>
struct ArrayExec<Type, enable_if_base_binary<Type>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
// NOTE: This code is not currently used by any kernels and has
// suboptimal performance because it's recomputing the validity bitmap
// that is already computed by the kernel execution layer. Consider
// writing a lower-level "output adapter" for base binary types.
typename TypeTraits<Type>::BuilderType builder;
- Status st = Status::OK();
- RETURN_NOT_OK(VisitArrayValuesInline<Arg0Type>(
- arg0, [&](Arg0Value v) { return builder.Append(functor.op.Call(ctx, v, &st)); },
- [&]() { return builder.AppendNull(); }));
- if (st.ok()) {
+ Status st = Status::OK();
+ RETURN_NOT_OK(VisitArrayValuesInline<Arg0Type>(
+ arg0, [&](Arg0Value v) { return builder.Append(functor.op.Call(ctx, v, &st)); },
+ [&]() { return builder.AppendNull(); }));
+ if (st.ok()) {
std::shared_ptr<ArrayData> result;
- RETURN_NOT_OK(builder.FinishInternal(&result));
+ RETURN_NOT_OK(builder.FinishInternal(&result));
out->value = std::move(result);
}
- return st;
+ return st;
}
};
template <typename Type>
struct ArrayExec<Type, enable_if_t<is_boolean_type<Type>::value>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
- Status st = Status::OK();
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
ArrayData* out_arr = out->mutable_array();
FirstTimeBitmapWriter out_writer(out_arr->buffers[1]->mutable_data(),
out_arr->offset, out_arr->length);
VisitArrayValuesInline<Arg0Type>(
arg0,
[&](Arg0Value v) {
- if (functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)) {
+ if (functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)) {
out_writer.Set();
}
out_writer.Next();
@@ -731,49 +731,49 @@ struct ScalarUnaryNotNullStateful {
out_writer.Next();
});
out_writer.Finish();
- return st;
+ return st;
}
};
template <typename Type>
- struct ArrayExec<Type, enable_if_decimal<Type>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
- Status st = Status::OK();
+ struct ArrayExec<Type, enable_if_decimal<Type>> {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
ArrayData* out_arr = out->mutable_array();
- // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian
- using endian_agnostic =
- std::array<uint8_t, sizeof(typename TypeTraits<Type>::ScalarType::ValueType)>;
- auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
+ // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian
+ using endian_agnostic =
+ std::array<uint8_t, sizeof(typename TypeTraits<Type>::ScalarType::ValueType)>;
+ auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
VisitArrayValuesInline<Arg0Type>(
arg0,
[&](Arg0Value v) {
- functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)
- .ToBytes(out_data++->data());
+ functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)
+ .ToBytes(out_data++->data());
},
- [&]() {
- // null
- std::memset(out_data, 0, sizeof(*out_data));
- ++out_data;
- });
- return st;
+ [&]() {
+ // null
+ std::memset(out_data, 0, sizeof(*out_data));
+ ++out_data;
+ });
+ return st;
}
};
- Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
- Status st = Status::OK();
+ Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
+ Status st = Status::OK();
if (arg0.is_valid) {
Arg0Value arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
- BoxScalar<OutType>::Box(
- this->op.template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
- out->scalar().get());
+ BoxScalar<OutType>::Box(
+ this->op.template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
+ out->scalar().get());
}
- return st;
+ return st;
}
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
- return ArrayExec<OutType>::Exec(*this, ctx, *batch[0].array(), out);
+ return ArrayExec<OutType>::Exec(*this, ctx, *batch[0].array(), out);
} else {
return Scalar(ctx, *batch[0].scalar(), out);
}
@@ -788,7 +788,7 @@ struct ScalarUnaryNotNull {
using OutValue = typename GetOutputType<OutType>::T;
using Arg0Value = typename GetViewType<Arg0Type>::T;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Seed kernel with dummy state
ScalarUnaryNotNullStateful<OutType, Arg0Type, Op> kernel({});
return kernel.Exec(ctx, batch, out);
@@ -807,11 +807,11 @@ struct ScalarUnaryNotNull {
//
// struct Op {
// template <typename OutValue, typename Arg0Value, typename Arg1Value>
-// static OutValue Call(KernelContext* ctx, Arg0Value arg0, Arg1Value arg1, Status* st)
-// {
+// static OutValue Call(KernelContext* ctx, Arg0Value arg0, Arg1Value arg1, Status* st)
+// {
// // implementation
-// // NOTE: "st" should only be populated with errors,
-// // leave it unmodified to indicate Status::OK()
+// // NOTE: "st" should only be populated with errors,
+// // leave it unmodified to indicate Status::OK()
// }
// };
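//
// A hedged sketch of a binary Op under this contract (the "Add" name is
// hypothetical, not part of this header):
//
//   struct Add {
//     template <typename OutValue, typename Arg0Value, typename Arg1Value>
//     static OutValue Call(KernelContext*, Arg0Value left, Arg1Value right, Status*) {
//       return static_cast<OutValue>(left + right);
//     }
//   };
//
// ScalarBinary<Int64Type, Int64Type, Int64Type, Add>::Exec is then usable as
// an ArrayKernelExec.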
template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op>
@@ -820,56 +820,56 @@ struct ScalarBinary {
using Arg0Value = typename GetViewType<Arg0Type>::T;
using Arg1Value = typename GetViewType<Arg1Type>::T;
- static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0,
- const ArrayData& arg1, Datum* out) {
- Status st = Status::OK();
+ static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0,
+ const ArrayData& arg1, Datum* out) {
+ Status st = Status::OK();
ArrayIterator<Arg0Type> arg0_it(arg0);
ArrayIterator<Arg1Type> arg1_it(arg1);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_it(),
- &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_it(),
+ &st);
+ }));
+ return st;
}
- static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
ArrayIterator<Arg0Type> arg0_it(arg0);
auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_val,
- &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_val,
+ &st);
+ }));
+ return st;
}
- static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
- Datum* out) {
- Status st = Status::OK();
+ static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
ArrayIterator<Arg1Type> arg1_it(arg1);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_it(),
- &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_it(),
+ &st);
+ }));
+ return st;
}
- static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
if (out->scalar()->is_valid) {
auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
- BoxScalar<OutType>::Box(
- Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
- out->scalar().get());
+ BoxScalar<OutType>::Box(
+ Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
+ out->scalar().get());
}
- return st;
+ return st;
}
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
if (batch[1].kind() == Datum::ARRAY) {
return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out);
@@ -900,22 +900,22 @@ struct ScalarBinaryNotNullStateful {
// NOTE: In ArrayExec<Type>, Type is really OutputType
- Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
OutputArrayWriter<OutType> writer(out->mutable_array());
VisitTwoArrayValuesInline<Arg0Type, Arg1Type>(
arg0, arg1,
[&](Arg0Value u, Arg1Value v) {
- writer.Write(op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, v, &st));
+ writer.Write(op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, v, &st));
},
[&]() { writer.WriteNull(); });
- return st;
+ return st;
}
- Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
OutputArrayWriter<OutType> writer(out->mutable_array());
if (arg1.is_valid) {
const auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
@@ -923,18 +923,18 @@ struct ScalarBinaryNotNullStateful {
arg0,
[&](Arg0Value u) {
writer.Write(
- op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, arg1_val, &st));
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, arg1_val, &st));
},
[&]() { writer.WriteNull(); });
- } else {
- writer.WriteAllNull(out->mutable_array()->length);
+ } else {
+ writer.WriteAllNull(out->mutable_array()->length);
}
- return st;
+ return st;
}
- Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
OutputArrayWriter<OutType> writer(out->mutable_array());
if (arg0.is_valid) {
const auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
@@ -942,29 +942,29 @@ struct ScalarBinaryNotNullStateful {
arg1,
[&](Arg1Value v) {
writer.Write(
- op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, v, &st));
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, v, &st));
},
[&]() { writer.WriteNull(); });
- } else {
- writer.WriteAllNull(out->mutable_array()->length);
+ } else {
+ writer.WriteAllNull(out->mutable_array()->length);
}
- return st;
+ return st;
}
- Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
if (arg0.is_valid && arg1.is_valid) {
const auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
const auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
BoxScalar<OutType>::Box(
- op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
out->scalar().get());
}
- return st;
+ return st;
}
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
if (batch[1].kind() == Datum::ARRAY) {
return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out);
@@ -991,7 +991,7 @@ struct ScalarBinaryNotNull {
using Arg0Value = typename GetViewType<Arg0Type>::T;
using Arg1Value = typename GetViewType<Arg1Type>::T;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Seed kernel with dummy state
ScalarBinaryNotNullStateful<OutType, Arg0Type, Arg1Type, Op> kernel({});
return kernel.Exec(ctx, batch, out);
@@ -1160,41 +1160,41 @@ ArrayKernelExec GeneratePhysicalInteger(detail::GetTypeId get_id) {
}
}
-template <template <typename... Args> class Generator, typename... Args>
-ArrayKernelExec GeneratePhysicalNumeric(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- return Generator<Int8Type, Args...>::Exec;
- case Type::INT16:
- return Generator<Int16Type, Args...>::Exec;
- case Type::INT32:
- case Type::DATE32:
- case Type::TIME32:
- return Generator<Int32Type, Args...>::Exec;
- case Type::INT64:
- case Type::DATE64:
- case Type::TIMESTAMP:
- case Type::TIME64:
- case Type::DURATION:
- return Generator<Int64Type, Args...>::Exec;
- case Type::UINT8:
- return Generator<UInt8Type, Args...>::Exec;
- case Type::UINT16:
- return Generator<UInt16Type, Args...>::Exec;
- case Type::UINT32:
- return Generator<UInt32Type, Args...>::Exec;
- case Type::UINT64:
- return Generator<UInt64Type, Args...>::Exec;
- case Type::FLOAT:
- return Generator<FloatType, Args...>::Exec;
- case Type::DOUBLE:
- return Generator<DoubleType, Args...>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
+template <template <typename... Args> class Generator, typename... Args>
+ArrayKernelExec GeneratePhysicalNumeric(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return Generator<Int8Type, Args...>::Exec;
+ case Type::INT16:
+ return Generator<Int16Type, Args...>::Exec;
+ case Type::INT32:
+ case Type::DATE32:
+ case Type::TIME32:
+ return Generator<Int32Type, Args...>::Exec;
+ case Type::INT64:
+ case Type::DATE64:
+ case Type::TIMESTAMP:
+ case Type::TIME64:
+ case Type::DURATION:
+ return Generator<Int64Type, Args...>::Exec;
+ case Type::UINT8:
+ return Generator<UInt8Type, Args...>::Exec;
+ case Type::UINT16:
+ return Generator<UInt16Type, Args...>::Exec;
+ case Type::UINT32:
+ return Generator<UInt32Type, Args...>::Exec;
+ case Type::UINT64:
+ return Generator<UInt64Type, Args...>::Exec;
+ case Type::FLOAT:
+ return Generator<FloatType, Args...>::Exec;
+ case Type::DOUBLE:
+ return Generator<DoubleType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
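+// A hedged usage sketch for the dispatcher above (the Negate/NegateExec names
+// are hypothetical, as in the Op example earlier): bind the logical Op once,
+// then resolve the exec function from a runtime type id:
+//
+//   template <typename ArrowType>
+//   using NegateExec = ScalarUnary<ArrowType, ArrowType, Negate>;
+//
+//   ArrayKernelExec exec = GeneratePhysicalNumeric<NegateExec>(ty->id());
+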
// Generate a kernel given a templated functor for integer types
//
// See "Numeric" above for description of the generator functor
@@ -1222,26 +1222,26 @@ ArrayKernelExec GenerateSignedInteger(detail::GetTypeId get_id) {
// bits).
//
// See "Numeric" above for description of the generator functor
-template <template <typename...> class Generator, typename... Args>
+template <template <typename...> class Generator, typename... Args>
ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
switch (get_id.id) {
case Type::NA:
- return Generator<NullType, Args...>::Exec;
+ return Generator<NullType, Args...>::Exec;
case Type::BOOL:
- return Generator<BooleanType, Args...>::Exec;
+ return Generator<BooleanType, Args...>::Exec;
case Type::UINT8:
case Type::INT8:
- return Generator<UInt8Type, Args...>::Exec;
+ return Generator<UInt8Type, Args...>::Exec;
case Type::UINT16:
case Type::INT16:
- return Generator<UInt16Type, Args...>::Exec;
+ return Generator<UInt16Type, Args...>::Exec;
case Type::UINT32:
case Type::INT32:
case Type::FLOAT:
case Type::DATE32:
case Type::TIME32:
- case Type::INTERVAL_MONTHS:
- return Generator<UInt32Type, Args...>::Exec;
+ case Type::INTERVAL_MONTHS:
+ return Generator<UInt32Type, Args...>::Exec;
case Type::UINT64:
case Type::INT64:
case Type::DOUBLE:
@@ -1249,30 +1249,30 @@ ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
case Type::TIMESTAMP:
case Type::TIME64:
case Type::DURATION:
- case Type::INTERVAL_DAY_TIME:
- return Generator<UInt64Type, Args...>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-// Similar to GenerateTypeAgnosticPrimitive, but for variable-length binary types
-template <template <typename...> class Generator, typename... Args>
-ArrayKernelExec GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::BINARY:
- case Type::STRING:
- return Generator<BinaryType, Args...>::Exec;
- case Type::LARGE_BINARY:
- case Type::LARGE_STRING:
- return Generator<LargeBinaryType, Args...>::Exec;
+ case Type::INTERVAL_DAY_TIME:
+ return Generator<UInt64Type, Args...>::Exec;
default:
DCHECK(false);
return ExecFail;
}
}
+// Similar to GenerateTypeAgnosticPrimitive, but for variable-length binary types
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::BINARY:
+ case Type::STRING:
+ return Generator<BinaryType, Args...>::Exec;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ return Generator<LargeBinaryType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
// Generate a kernel given a templated functor for base binary types. Generates
// a single kernel for binary/string and large binary / large string. If your
// kernel implementation needs access to the specific type at compile time,
@@ -1336,46 +1336,46 @@ ArrayKernelExec GenerateTemporal(detail::GetTypeId get_id) {
}
}
-// Generate a kernel given a templated functor for decimal types
-//
-// See "Numeric" above for description of the generator functor
-template <template <typename...> class Generator, typename Type0, typename... Args>
-ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::DECIMAL128:
- return Generator<Type0, Decimal128Type, Args...>::Exec;
- case Type::DECIMAL256:
- return Generator<Type0, Decimal256Type, Args...>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
+// Generate a kernel given a templated functor for decimal types
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::DECIMAL128:
+ return Generator<Type0, Decimal128Type, Args...>::Exec;
+ case Type::DECIMAL256:
+ return Generator<Type0, Decimal256Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
// END of kernel generator-dispatchers
// ----------------------------------------------------------------------
-ARROW_EXPORT
-void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs);
-
-ARROW_EXPORT
-void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs);
-
-ARROW_EXPORT
-void ReplaceTypes(const std::shared_ptr<DataType>&, std::vector<ValueDescr>* descrs);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs);
-
+ARROW_EXPORT
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+void ReplaceTypes(const std::shared_ptr<DataType>&, std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs);
+
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index ed40a6b1b8c..63d41392203 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -1,1379 +1,1379 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "arrow/buffer_builder.h"
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/exec/key_compare.h"
-#include "arrow/compute/exec/key_encode.h"
-#include "arrow/compute/exec/key_hash.h"
-#include "arrow/compute/exec/key_map.h"
-#include "arrow/compute/exec/util.h"
-#include "arrow/compute/exec_internal.h"
-#include "arrow/compute/kernel.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_writer.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/cpu_info.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/visitor_inline.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::FirstTimeBitmapWriter;
-
-namespace compute {
-namespace internal {
-namespace {
-
-struct KeyEncoder {
- // the first byte of an encoded key is used to indicate nullity
- static constexpr bool kExtraByteForNull = true;
-
- static constexpr uint8_t kNullByte = 1;
- static constexpr uint8_t kValidByte = 0;
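-
- // Encoded layout, one key value per row (widths are those used by the
- // concrete encoders below): [nullity byte][payload]. E.g. a valid int32 key
- // encodes as [kValidByte][4 value bytes], a null one as [kNullByte][4 zero bytes].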
-
- virtual ~KeyEncoder() = default;
-
- virtual void AddLength(const ArrayData&, int32_t* lengths) = 0;
-
- virtual Status Encode(const ArrayData&, uint8_t** encoded_bytes) = 0;
-
- virtual Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes,
- int32_t length, MemoryPool*) = 0;
-
- // extract the null bitmap from the leading nullity bytes of encoded keys
- static Status DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encoded_bytes,
- std::shared_ptr<Buffer>* null_bitmap, int32_t* null_count) {
- // first count nulls to determine if a null bitmap is necessary
- *null_count = 0;
- for (int32_t i = 0; i < length; ++i) {
- *null_count += (encoded_bytes[i][0] == kNullByte);
- }
-
- if (*null_count > 0) {
- ARROW_ASSIGN_OR_RAISE(*null_bitmap, AllocateBitmap(length, pool));
- uint8_t* validity = (*null_bitmap)->mutable_data();
-
- FirstTimeBitmapWriter writer(validity, 0, length);
- for (int32_t i = 0; i < length; ++i) {
- if (encoded_bytes[i][0] == kValidByte) {
- writer.Set();
- } else {
- writer.Clear();
- }
- writer.Next();
- encoded_bytes[i] += 1;
- }
- writer.Finish();
- } else {
- for (int32_t i = 0; i < length; ++i) {
- encoded_bytes[i] += 1;
- }
- }
- return Status::OK();
- }
-};
-
-struct BooleanKeyEncoder : KeyEncoder {
- static constexpr int kByteWidth = 1;
-
- void AddLength(const ArrayData& data, int32_t* lengths) override {
- for (int64_t i = 0; i < data.length; ++i) {
- lengths[i] += kByteWidth + kExtraByteForNull;
- }
- }
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- VisitArrayDataInline<BooleanType>(
- data,
- [&](bool value) {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kValidByte;
- *encoded_ptr++ = value;
- },
- [&] {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kNullByte;
- *encoded_ptr++ = 0;
- });
- return Status::OK();
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- std::shared_ptr<Buffer> null_buf;
- int32_t null_count;
- RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
-
- ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBitmap(length, pool));
-
- uint8_t* raw_output = key_buf->mutable_data();
- for (int32_t i = 0; i < length; ++i) {
- auto& encoded_ptr = encoded_bytes[i];
- BitUtil::SetBitTo(raw_output, i, encoded_ptr[0] != 0);
- encoded_ptr += 1;
- }
-
- return ArrayData::Make(boolean(), length, {std::move(null_buf), std::move(key_buf)},
- null_count);
- }
-};
-
-struct FixedWidthKeyEncoder : KeyEncoder {
- explicit FixedWidthKeyEncoder(std::shared_ptr<DataType> type)
- : type_(std::move(type)),
- byte_width_(checked_cast<const FixedWidthType&>(*type_).bit_width() / 8) {}
-
- void AddLength(const ArrayData& data, int32_t* lengths) override {
- for (int64_t i = 0; i < data.length; ++i) {
- lengths[i] += byte_width_ + kExtraByteForNull;
- }
- }
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- ArrayData viewed(fixed_size_binary(byte_width_), data.length, data.buffers,
- data.null_count, data.offset);
-
- VisitArrayDataInline<FixedSizeBinaryType>(
- viewed,
- [&](util::string_view bytes) {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kValidByte;
- memcpy(encoded_ptr, bytes.data(), byte_width_);
- encoded_ptr += byte_width_;
- },
- [&] {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kNullByte;
- memset(encoded_ptr, 0, byte_width_);
- encoded_ptr += byte_width_;
- });
- return Status::OK();
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- std::shared_ptr<Buffer> null_buf;
- int32_t null_count;
- RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
-
- ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length * byte_width_, pool));
-
- uint8_t* raw_output = key_buf->mutable_data();
- for (int32_t i = 0; i < length; ++i) {
- auto& encoded_ptr = encoded_bytes[i];
- std::memcpy(raw_output, encoded_ptr, byte_width_);
- encoded_ptr += byte_width_;
- raw_output += byte_width_;
- }
-
- return ArrayData::Make(type_, length, {std::move(null_buf), std::move(key_buf)},
- null_count);
- }
-
- std::shared_ptr<DataType> type_;
- int byte_width_;
-};
-
-struct DictionaryKeyEncoder : FixedWidthKeyEncoder {
- DictionaryKeyEncoder(std::shared_ptr<DataType> type, MemoryPool* pool)
- : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {}
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- auto dict = MakeArray(data.dictionary);
- if (dictionary_) {
- if (!dictionary_->Equals(dict)) {
- // TODO(bkietz) unify if necessary. For now, just error if any batch's dictionary
- // differs from the first we saw for this key
- return Status::NotImplemented("Unifying differing dictionaries");
- }
- } else {
- dictionary_ = std::move(dict);
- }
- return FixedWidthKeyEncoder::Encode(data, encoded_bytes);
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- ARROW_ASSIGN_OR_RAISE(auto data,
- FixedWidthKeyEncoder::Decode(encoded_bytes, length, pool));
-
- if (dictionary_) {
- data->dictionary = dictionary_->data();
- } else {
- ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(type_, 0));
- data->dictionary = dict->data();
- }
-
- data->type = type_;
- return data;
- }
-
- MemoryPool* pool_;
- std::shared_ptr<Array> dictionary_;
-};
-
-template <typename T>
-struct VarLengthKeyEncoder : KeyEncoder {
- using Offset = typename T::offset_type;
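-
- // Encoded layout per key value: [nullity byte][Offset byte count][value bytes];
- // null keys store a zero length and no payload (see Encode below).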
-
- void AddLength(const ArrayData& data, int32_t* lengths) override {
- int64_t i = 0;
- VisitArrayDataInline<T>(
- data,
- [&](util::string_view bytes) {
- lengths[i++] +=
- kExtraByteForNull + sizeof(Offset) + static_cast<int32_t>(bytes.size());
- },
- [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); });
- }
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- VisitArrayDataInline<T>(
- data,
- [&](util::string_view bytes) {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kValidByte;
- util::SafeStore(encoded_ptr, static_cast<Offset>(bytes.size()));
- encoded_ptr += sizeof(Offset);
- memcpy(encoded_ptr, bytes.data(), bytes.size());
- encoded_ptr += bytes.size();
- },
- [&] {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kNullByte;
- util::SafeStore(encoded_ptr, static_cast<Offset>(0));
- encoded_ptr += sizeof(Offset);
- });
- return Status::OK();
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- std::shared_ptr<Buffer> null_buf;
- int32_t null_count;
- RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
-
- Offset length_sum = 0;
- for (int32_t i = 0; i < length; ++i) {
- length_sum += util::SafeLoadAs<Offset>(encoded_bytes[i]);
- }
-
- ARROW_ASSIGN_OR_RAISE(auto offset_buf,
- AllocateBuffer(sizeof(Offset) * (1 + length), pool));
- ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length_sum));
-
- auto raw_offsets = reinterpret_cast<Offset*>(offset_buf->mutable_data());
- auto raw_keys = key_buf->mutable_data();
-
- Offset current_offset = 0;
- for (int32_t i = 0; i < length; ++i) {
- raw_offsets[i] = current_offset;
-
- auto key_length = util::SafeLoadAs<Offset>(encoded_bytes[i]);
- encoded_bytes[i] += sizeof(Offset);
-
- memcpy(raw_keys + current_offset, encoded_bytes[i], key_length);
- encoded_bytes[i] += key_length;
-
- current_offset += key_length;
- }
- raw_offsets[length] = current_offset;
-
- return ArrayData::Make(
- type_, length, {std::move(null_buf), std::move(offset_buf), std::move(key_buf)},
- null_count);
- }
-
- explicit VarLengthKeyEncoder(std::shared_ptr<DataType> type) : type_(std::move(type)) {}
-
- std::shared_ptr<DataType> type_;
-};
-
-struct GrouperImpl : Grouper {
- static Result<std::unique_ptr<GrouperImpl>> Make(const std::vector<ValueDescr>& keys,
- ExecContext* ctx) {
- auto impl = ::arrow::internal::make_unique<GrouperImpl>();
-
- impl->encoders_.resize(keys.size());
- impl->ctx_ = ctx;
-
- for (size_t i = 0; i < keys.size(); ++i) {
- const auto& key = keys[i].type;
-
- if (key->id() == Type::BOOL) {
- impl->encoders_[i] = ::arrow::internal::make_unique<BooleanKeyEncoder>();
- continue;
- }
-
- if (key->id() == Type::DICTIONARY) {
- impl->encoders_[i] =
- ::arrow::internal::make_unique<DictionaryKeyEncoder>(key, ctx->memory_pool());
- continue;
- }
-
- if (is_fixed_width(key->id())) {
- impl->encoders_[i] = ::arrow::internal::make_unique<FixedWidthKeyEncoder>(key);
- continue;
- }
-
- if (is_binary_like(key->id())) {
- impl->encoders_[i] =
- ::arrow::internal::make_unique<VarLengthKeyEncoder<BinaryType>>(key);
- continue;
- }
-
- if (is_large_binary_like(key->id())) {
- impl->encoders_[i] =
- ::arrow::internal::make_unique<VarLengthKeyEncoder<LargeBinaryType>>(key);
- continue;
- }
-
- return Status::NotImplemented("Keys of type ", *key);
- }
-
- return std::move(impl);
- }
-
- Result<Datum> Consume(const ExecBatch& batch) override {
- std::vector<int32_t> offsets_batch(batch.length + 1);
- for (int i = 0; i < batch.num_values(); ++i) {
- encoders_[i]->AddLength(*batch[i].array(), offsets_batch.data());
- }
-
- int32_t total_length = 0;
- for (int64_t i = 0; i < batch.length; ++i) {
- auto total_length_before = total_length;
- total_length += offsets_batch[i];
- offsets_batch[i] = total_length_before;
- }
- offsets_batch[batch.length] = total_length;
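-
- // For example, per-row encoded lengths {3, 2, 4} become offsets {0, 3, 5}
- // with a final sentinel of 9, so row i's key bytes occupy
- // [offsets_batch[i], offsets_batch[i + 1]).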
-
- std::vector<uint8_t> key_bytes_batch(total_length);
- std::vector<uint8_t*> key_buf_ptrs(batch.length);
- for (int64_t i = 0; i < batch.length; ++i) {
- key_buf_ptrs[i] = key_bytes_batch.data() + offsets_batch[i];
- }
-
- for (int i = 0; i < batch.num_values(); ++i) {
- RETURN_NOT_OK(encoders_[i]->Encode(*batch[i].array(), key_buf_ptrs.data()));
- }
-
- TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
- RETURN_NOT_OK(group_ids_batch.Resize(batch.length));
-
- for (int64_t i = 0; i < batch.length; ++i) {
- int32_t key_length = offsets_batch[i + 1] - offsets_batch[i];
- std::string key(
- reinterpret_cast<const char*>(key_bytes_batch.data() + offsets_batch[i]),
- key_length);
-
- auto it_success = map_.emplace(key, num_groups_);
- auto group_id = it_success.first->second;
-
- if (it_success.second) {
- // new key; update offsets and key_bytes
- ++num_groups_;
- auto next_key_offset = static_cast<int32_t>(key_bytes_.size());
- key_bytes_.resize(next_key_offset + key_length);
- offsets_.push_back(next_key_offset + key_length);
- memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length);
- }
-
- group_ids_batch.UnsafeAppend(group_id);
- }
-
- ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish());
- return Datum(UInt32Array(batch.length, std::move(group_ids)));
- }
-
- uint32_t num_groups() const override { return num_groups_; }
-
- Result<ExecBatch> GetUniques() override {
- ExecBatch out({}, num_groups_);
-
- std::vector<uint8_t*> key_buf_ptrs(num_groups_);
- for (int64_t i = 0; i < num_groups_; ++i) {
- key_buf_ptrs[i] = key_bytes_.data() + offsets_[i];
- }
-
- out.values.resize(encoders_.size());
- for (size_t i = 0; i < encoders_.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- out.values[i],
- encoders_[i]->Decode(key_buf_ptrs.data(), static_cast<int32_t>(num_groups_),
- ctx_->memory_pool()));
- }
-
- return out;
- }
-
- ExecContext* ctx_;
- std::unordered_map<std::string, uint32_t> map_;
- std::vector<int32_t> offsets_ = {0};
- std::vector<uint8_t> key_bytes_;
- uint32_t num_groups_ = 0;
- std::vector<std::unique_ptr<KeyEncoder>> encoders_;
-};
-
-struct GrouperFastImpl : Grouper {
- static constexpr int kBitmapPaddingForSIMD = 64; // bits
- static constexpr int kPaddingForSIMD = 32; // bytes
-
- static bool CanUse(const std::vector<ValueDescr>& keys) {
-#if ARROW_LITTLE_ENDIAN
- for (size_t i = 0; i < keys.size(); ++i) {
- const auto& key = keys[i].type;
- if (is_large_binary_like(key->id())) {
- return false;
- }
- }
- return true;
-#else
- return false;
-#endif
- }
-
- static Result<std::unique_ptr<GrouperFastImpl>> Make(
- const std::vector<ValueDescr>& keys, ExecContext* ctx) {
- auto impl = ::arrow::internal::make_unique<GrouperFastImpl>();
- impl->ctx_ = ctx;
-
- RETURN_NOT_OK(impl->temp_stack_.Init(ctx->memory_pool(), 64 * minibatch_size_max_));
- impl->encode_ctx_.hardware_flags =
- arrow::internal::CpuInfo::GetInstance()->hardware_flags();
- impl->encode_ctx_.stack = &impl->temp_stack_;
-
- auto num_columns = keys.size();
- impl->col_metadata_.resize(num_columns);
- impl->key_types_.resize(num_columns);
- impl->dictionaries_.resize(num_columns);
- for (size_t icol = 0; icol < num_columns; ++icol) {
- const auto& key = keys[icol].type;
- if (key->id() == Type::DICTIONARY) {
- auto bit_width = checked_cast<const FixedWidthType&>(*key).bit_width();
- ARROW_DCHECK(bit_width % 8 == 0);
- impl->col_metadata_[icol] =
- arrow::compute::KeyEncoder::KeyColumnMetadata(true, bit_width / 8);
- } else if (key->id() == Type::BOOL) {
- impl->col_metadata_[icol] =
- arrow::compute::KeyEncoder::KeyColumnMetadata(true, 0);
- } else if (is_fixed_width(key->id())) {
- impl->col_metadata_[icol] = arrow::compute::KeyEncoder::KeyColumnMetadata(
- true, checked_cast<const FixedWidthType&>(*key).bit_width() / 8);
- } else if (is_binary_like(key->id())) {
- impl->col_metadata_[icol] =
- arrow::compute::KeyEncoder::KeyColumnMetadata(false, sizeof(uint32_t));
- } else {
- return Status::NotImplemented("Keys of type ", *key);
- }
- impl->key_types_[icol] = key;
- }
-
- impl->encoder_.Init(impl->col_metadata_, &impl->encode_ctx_,
- /* row_alignment = */ sizeof(uint64_t),
- /* string_alignment = */ sizeof(uint64_t));
- RETURN_NOT_OK(impl->rows_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
- RETURN_NOT_OK(
- impl->rows_minibatch_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
- impl->minibatch_size_ = impl->minibatch_size_min_;
- GrouperFastImpl* impl_ptr = impl.get();
- auto equal_func = [impl_ptr](
- int num_keys_to_compare, const uint16_t* selection_may_be_null,
- const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
- uint16_t* out_selection_mismatch) {
- arrow::compute::KeyCompare::CompareRows(
- num_keys_to_compare, selection_may_be_null, group_ids, &impl_ptr->encode_ctx_,
- out_num_keys_mismatch, out_selection_mismatch, impl_ptr->rows_minibatch_,
- impl_ptr->rows_);
- };
- auto append_func = [impl_ptr](int num_keys, const uint16_t* selection) {
- return impl_ptr->rows_.AppendSelectionFrom(impl_ptr->rows_minibatch_, num_keys,
- selection);
- };
- RETURN_NOT_OK(impl->map_.init(impl->encode_ctx_.hardware_flags, ctx->memory_pool(),
- impl->encode_ctx_.stack, impl->log_minibatch_max_,
- equal_func, append_func));
- impl->cols_.resize(num_columns);
- impl->minibatch_hashes_.resize(impl->minibatch_size_max_ +
- kPaddingForSIMD / sizeof(uint32_t));
-
- return std::move(impl);
- }
-
- ~GrouperFastImpl() { map_.cleanup(); }
-
- Result<Datum> Consume(const ExecBatch& batch) override {
- int64_t num_rows = batch.length;
- int num_columns = batch.num_values();
-
- // Process dictionaries
- for (int icol = 0; icol < num_columns; ++icol) {
- if (key_types_[icol]->id() == Type::DICTIONARY) {
- auto data = batch[icol].array();
- auto dict = MakeArray(data->dictionary);
- if (dictionaries_[icol]) {
- if (!dictionaries_[icol]->Equals(dict)) {
- // TODO(bkietz) unify if necessary. For now, just error if any batch's
- // dictionary differs from the first we saw for this key
- return Status::NotImplemented("Unifying differing dictionaries");
- }
- } else {
- dictionaries_[icol] = std::move(dict);
- }
- }
- }
-
- std::shared_ptr<arrow::Buffer> group_ids;
- ARROW_ASSIGN_OR_RAISE(
- group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
-
- for (int icol = 0; icol < num_columns; ++icol) {
- const uint8_t* non_nulls = nullptr;
- if (batch[icol].array()->buffers[0] != NULLPTR) {
- non_nulls = batch[icol].array()->buffers[0]->data();
- }
- const uint8_t* fixedlen = batch[icol].array()->buffers[1]->data();
- const uint8_t* varlen = nullptr;
- if (!col_metadata_[icol].is_fixed_length) {
- varlen = batch[icol].array()->buffers[2]->data();
- }
-
- int64_t offset = batch[icol].array()->offset;
-
- auto col_base = arrow::compute::KeyEncoder::KeyColumnArray(
- col_metadata_[icol], offset + num_rows, non_nulls, fixedlen, varlen);
-
- cols_[icol] =
- arrow::compute::KeyEncoder::KeyColumnArray(col_base, offset, num_rows);
- }
-
- // Split into smaller mini-batches
- //
- for (uint32_t start_row = 0; start_row < num_rows;) {
- uint32_t batch_size_next = std::min(static_cast<uint32_t>(minibatch_size_),
- static_cast<uint32_t>(num_rows) - start_row);
-
- // Encode
- rows_minibatch_.Clean();
- RETURN_NOT_OK(encoder_.PrepareOutputForEncode(start_row, batch_size_next,
- &rows_minibatch_, cols_));
- encoder_.Encode(start_row, batch_size_next, &rows_minibatch_, cols_);
-
- // Compute hash
- if (encoder_.row_metadata().is_fixed_length) {
- Hashing::hash_fixed(encode_ctx_.hardware_flags, batch_size_next,
- encoder_.row_metadata().fixed_length, rows_minibatch_.data(1),
- minibatch_hashes_.data());
- } else {
- auto hash_temp_buf =
- util::TempVectorHolder<uint32_t>(&temp_stack_, 4 * batch_size_next);
- Hashing::hash_varlen(encode_ctx_.hardware_flags, batch_size_next,
- rows_minibatch_.offsets(), rows_minibatch_.data(2),
- hash_temp_buf.mutable_data(), minibatch_hashes_.data());
- }
-
- // Map
- RETURN_NOT_OK(
- map_.map(batch_size_next, minibatch_hashes_.data(),
- reinterpret_cast<uint32_t*>(group_ids->mutable_data()) + start_row));
-
- start_row += batch_size_next;
-
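- // Grow the mini-batch geometrically, from minibatch_size_min_ (128) up to
- // minibatch_size_max_ (1 << 10), so small inputs stay cheap while large
- // ones amortize per-minibatch overhead.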
- if (minibatch_size_ * 2 <= minibatch_size_max_) {
- minibatch_size_ *= 2;
- }
- }
-
- return Datum(UInt32Array(batch.length, std::move(group_ids)));
- }
-
- uint32_t num_groups() const override { return static_cast<uint32_t>(rows_.length()); }
-
- // Make sure padded buffers end up with the right logical size
-
- Result<std::shared_ptr<Buffer>> AllocatePaddedBitmap(int64_t length) {
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<Buffer> buf,
- AllocateBitmap(length + kBitmapPaddingForSIMD, ctx_->memory_pool()));
- return SliceMutableBuffer(buf, 0, BitUtil::BytesForBits(length));
- }
-
- Result<std::shared_ptr<Buffer>> AllocatePaddedBuffer(int64_t size) {
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<Buffer> buf,
- AllocateBuffer(size + kBitmapPaddingForSIMD, ctx_->memory_pool()));
- return SliceMutableBuffer(buf, 0, size);
- }
-
- Result<ExecBatch> GetUniques() override {
- auto num_columns = static_cast<uint32_t>(col_metadata_.size());
- int64_t num_groups = rows_.length();
-
- std::vector<std::shared_ptr<Buffer>> non_null_bufs(num_columns);
- std::vector<std::shared_ptr<Buffer>> fixedlen_bufs(num_columns);
- std::vector<std::shared_ptr<Buffer>> varlen_bufs(num_columns);
-
- for (size_t i = 0; i < num_columns; ++i) {
- ARROW_ASSIGN_OR_RAISE(non_null_bufs[i], AllocatePaddedBitmap(num_groups));
- if (col_metadata_[i].is_fixed_length) {
- if (col_metadata_[i].fixed_length == 0) {
- ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocatePaddedBitmap(num_groups));
- } else {
- ARROW_ASSIGN_OR_RAISE(
- fixedlen_bufs[i],
- AllocatePaddedBuffer(num_groups * col_metadata_[i].fixed_length));
- }
- } else {
- ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
- AllocatePaddedBuffer((num_groups + 1) * sizeof(uint32_t)));
- }
- cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
- col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
- fixedlen_bufs[i]->mutable_data(), nullptr);
- }
-
- for (int64_t start_row = 0; start_row < num_groups;) {
- int64_t batch_size_next =
- std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
- encoder_.DecodeFixedLengthBuffers(start_row, start_row, batch_size_next, rows_,
- &cols_);
- start_row += batch_size_next;
- }
-
- if (!rows_.metadata().is_fixed_length) {
- for (size_t i = 0; i < num_columns; ++i) {
- if (!col_metadata_[i].is_fixed_length) {
- auto varlen_size =
- reinterpret_cast<const uint32_t*>(fixedlen_bufs[i]->data())[num_groups];
- ARROW_ASSIGN_OR_RAISE(varlen_bufs[i], AllocatePaddedBuffer(varlen_size));
- cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
- col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
- fixedlen_bufs[i]->mutable_data(), varlen_bufs[i]->mutable_data());
- }
- }
-
- for (int64_t start_row = 0; start_row < num_groups;) {
- int64_t batch_size_next =
- std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
- encoder_.DecodeVaryingLengthBuffers(start_row, start_row, batch_size_next, rows_,
- &cols_);
- start_row += batch_size_next;
- }
- }
-
- ExecBatch out({}, num_groups);
- out.values.resize(num_columns);
- for (size_t i = 0; i < num_columns; ++i) {
- auto valid_count = arrow::internal::CountSetBits(
- non_null_bufs[i]->data(), /*offset=*/0, static_cast<int64_t>(num_groups));
- int null_count = static_cast<int>(num_groups) - static_cast<int>(valid_count);
-
- if (col_metadata_[i].is_fixed_length) {
- out.values[i] = ArrayData::Make(
- key_types_[i], num_groups,
- {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i])}, null_count);
- } else {
- out.values[i] =
- ArrayData::Make(key_types_[i], num_groups,
- {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i]),
- std::move(varlen_bufs[i])},
- null_count);
- }
- }
-
- // Process dictionaries
- for (size_t icol = 0; icol < num_columns; ++icol) {
- if (key_types_[icol]->id() == Type::DICTIONARY) {
- if (dictionaries_[icol]) {
- out.values[icol].array()->dictionary = dictionaries_[icol]->data();
- } else {
- ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(key_types_[icol], 0));
- out.values[icol].array()->dictionary = dict->data();
- }
- }
- }
-
- return out;
- }
-
- static constexpr int log_minibatch_max_ = 10;
- static constexpr int minibatch_size_max_ = 1 << log_minibatch_max_;
- static constexpr int minibatch_size_min_ = 128;
- int minibatch_size_;
-
- ExecContext* ctx_;
- arrow::util::TempVectorStack temp_stack_;
- arrow::compute::KeyEncoder::KeyEncoderContext encode_ctx_;
-
- std::vector<std::shared_ptr<arrow::DataType>> key_types_;
- std::vector<arrow::compute::KeyEncoder::KeyColumnMetadata> col_metadata_;
- std::vector<arrow::compute::KeyEncoder::KeyColumnArray> cols_;
- std::vector<uint32_t> minibatch_hashes_;
-
- std::vector<std::shared_ptr<Array>> dictionaries_;
-
- arrow::compute::KeyEncoder::KeyRowArray rows_;
- arrow::compute::KeyEncoder::KeyRowArray rows_minibatch_;
- arrow::compute::KeyEncoder encoder_;
- arrow::compute::SwissTable map_;
-};
-
-/// C++ abstract base class for the HashAggregateKernel interface.
-/// Implementations should be default constructible and perform initialization in
-/// Init().
-struct GroupedAggregator : KernelState {
- virtual Status Init(ExecContext*, const FunctionOptions*,
- const std::shared_ptr<DataType>&) = 0;
-
- virtual Status Consume(const ExecBatch& batch) = 0;
-
- virtual Result<Datum> Finalize() = 0;
-
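- // Grow per-group buffers to hold every group seen so far. batch[2] carries
- // the up-to-date group count as a uint32 scalar (see the kernel signature
- // built in MakeKernel below), so only the delta needs to be allocated.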
- template <typename Reserve>
- Status MaybeReserve(int64_t old_num_groups, const ExecBatch& batch,
- const Reserve& reserve) {
- int64_t new_num_groups = batch[2].scalar_as<UInt32Scalar>().value;
- if (new_num_groups <= old_num_groups) {
- return Status::OK();
- }
- return reserve(new_num_groups - old_num_groups);
- }
-
- virtual std::shared_ptr<DataType> out_type() const = 0;
-};
-
-// ----------------------------------------------------------------------
-// Count implementation
-
-struct GroupedCountImpl : public GroupedAggregator {
- Status Init(ExecContext* ctx, const FunctionOptions* options,
- const std::shared_ptr<DataType>&) override {
- options_ = checked_cast<const ScalarAggregateOptions&>(*options);
- counts_ = BufferBuilder(ctx->memory_pool());
- return Status::OK();
- }
-
- Status Consume(const ExecBatch& batch) override {
- RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
- num_groups_ += added_groups;
- return counts_.Append(added_groups * sizeof(int64_t), 0);
- }));
-
- auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
- auto raw_counts = reinterpret_cast<int64_t*>(counts_.mutable_data());
-
- const auto& input = batch[0].array();
-
- if (!options_.skip_nulls) {
- if (input->GetNullCount() != 0) {
- for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) {
- auto g = group_ids[i];
- raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i);
- }
- }
- return Status::OK();
- }
-
- arrow::internal::VisitSetBitRunsVoid(
- input->buffers[0], input->offset, input->length,
- [&](int64_t begin, int64_t length) {
- for (int64_t input_i = begin, i = begin - input->offset;
- input_i < begin + length; ++input_i, ++i) {
- auto g = group_ids[i];
- raw_counts[g] += 1;
- }
- });
- return Status::OK();
- }
-
- Result<Datum> Finalize() override {
- ARROW_ASSIGN_OR_RAISE(auto counts, counts_.Finish());
- return std::make_shared<Int64Array>(num_groups_, std::move(counts));
- }
-
- std::shared_ptr<DataType> out_type() const override { return int64(); }
-
- int64_t num_groups_ = 0;
- ScalarAggregateOptions options_;
- BufferBuilder counts_;
-};
-
-// ----------------------------------------------------------------------
-// Sum implementation
-
-struct GroupedSumImpl : public GroupedAggregator {
- // NB: whether we are accumulating into double, int64_t, or uint64_t
- // we always have 64 bits per group in the sums buffer.
- static constexpr size_t kSumSize = sizeof(int64_t);
-
- using ConsumeImpl = std::function<void(const std::shared_ptr<ArrayData>&,
- const uint32_t*, void*, int64_t*)>;
-
- struct GetConsumeImpl {
- template <typename T, typename AccType = typename FindAccumulatorType<T>::Type>
- Status Visit(const T&) {
- consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
- void* boxed_sums, int64_t* counts) {
- auto sums = reinterpret_cast<typename TypeTraits<AccType>::CType*>(boxed_sums);
-
- VisitArrayDataInline<T>(
- *input,
- [&](typename TypeTraits<T>::CType value) {
- sums[*group] += value;
- counts[*group] += 1;
- ++group;
- },
- [&] { ++group; });
- };
- out_type = TypeTraits<AccType>::type_singleton();
- return Status::OK();
- }
-
- Status Visit(const HalfFloatType& type) {
- return Status::NotImplemented("Summing data of type ", type);
- }
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("Summing data of type ", type);
- }
-
- ConsumeImpl consume_impl;
- std::shared_ptr<DataType> out_type;
- };
-
- Status Init(ExecContext* ctx, const FunctionOptions*,
- const std::shared_ptr<DataType>& input_type) override {
- pool_ = ctx->memory_pool();
- sums_ = BufferBuilder(pool_);
- counts_ = BufferBuilder(pool_);
-
- GetConsumeImpl get_consume_impl;
- RETURN_NOT_OK(VisitTypeInline(*input_type, &get_consume_impl));
-
- consume_impl_ = std::move(get_consume_impl.consume_impl);
- out_type_ = std::move(get_consume_impl.out_type);
-
- return Status::OK();
- }
-
- Status Consume(const ExecBatch& batch) override {
- RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
- num_groups_ += added_groups;
- RETURN_NOT_OK(sums_.Append(added_groups * kSumSize, 0));
- RETURN_NOT_OK(counts_.Append(added_groups * sizeof(int64_t), 0));
- return Status::OK();
- }));
-
- auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
- consume_impl_(batch[0].array(), group_ids, sums_.mutable_data(),
- reinterpret_cast<int64_t*>(counts_.mutable_data()));
- return Status::OK();
- }
-
- Result<Datum> Finalize() override {
- std::shared_ptr<Buffer> null_bitmap;
- int64_t null_count = 0;
-
- for (int64_t i = 0; i < num_groups_; ++i) {
- if (reinterpret_cast<const int64_t*>(counts_.data())[i] > 0) continue;
-
- if (null_bitmap == nullptr) {
- ARROW_ASSIGN_OR_RAISE(null_bitmap, AllocateBitmap(num_groups_, pool_));
- BitUtil::SetBitsTo(null_bitmap->mutable_data(), 0, num_groups_, true);
- }
-
- null_count += 1;
- BitUtil::SetBitTo(null_bitmap->mutable_data(), i, false);
- }
-
- ARROW_ASSIGN_OR_RAISE(auto sums, sums_.Finish());
-
- return ArrayData::Make(std::move(out_type_), num_groups_,
- {std::move(null_bitmap), std::move(sums)}, null_count);
- }
-
- std::shared_ptr<DataType> out_type() const override { return out_type_; }
-
- // NB: counts are used here instead of a simple "has_values_" bitmap since
- // we expect to reuse this kernel to handle Mean
- int64_t num_groups_ = 0;
- BufferBuilder sums_, counts_;
- std::shared_ptr<DataType> out_type_;
- ConsumeImpl consume_impl_;
- MemoryPool* pool_;
-};
-
-// ----------------------------------------------------------------------
-// MinMax implementation
-
-template <typename CType>
-struct Extrema : std::numeric_limits<CType> {};
-
-template <>
-struct Extrema<float> {
- static constexpr float min() { return -std::numeric_limits<float>::infinity(); }
- static constexpr float max() { return std::numeric_limits<float>::infinity(); }
-};
-
-template <>
-struct Extrema<double> {
- static constexpr double min() { return -std::numeric_limits<double>::infinity(); }
- static constexpr double max() { return std::numeric_limits<double>::infinity(); }
-};
-
-struct GroupedMinMaxImpl : public GroupedAggregator {
- using ConsumeImpl =
- std::function<void(const std::shared_ptr<ArrayData>&, const uint32_t*, void*, void*,
- uint8_t*, uint8_t*)>;
-
- using ResizeImpl = std::function<Status(BufferBuilder*, int64_t)>;
-
- template <typename CType>
- static ResizeImpl MakeResizeImpl(CType anti_extreme) {
- // resize a min or max buffer, storing the correct anti extreme
- return [anti_extreme](BufferBuilder* builder, int64_t added_groups) {
- TypedBufferBuilder<CType> typed_builder(std::move(*builder));
- RETURN_NOT_OK(typed_builder.Append(added_groups, anti_extreme));
- *builder = std::move(*typed_builder.bytes_builder());
- return Status::OK();
- };
- }
-
- struct GetImpl {
- template <typename T, typename CType = typename TypeTraits<T>::CType>
- enable_if_number<T, Status> Visit(const T&) {
- consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
- void* mins, void* maxes, uint8_t* has_values,
- uint8_t* has_nulls) {
- auto raw_mins = reinterpret_cast<CType*>(mins);
- auto raw_maxes = reinterpret_cast<CType*>(maxes);
-
- VisitArrayDataInline<T>(
- *input,
- [&](CType val) {
- raw_maxes[*group] = std::max(raw_maxes[*group], val);
- raw_mins[*group] = std::min(raw_mins[*group], val);
- BitUtil::SetBit(has_values, *group++);
- },
- [&] { BitUtil::SetBit(has_nulls, *group++); });
- };
-
- resize_min_impl = MakeResizeImpl(Extrema<CType>::max());
- resize_max_impl = MakeResizeImpl(Extrema<CType>::min());
- return Status::OK();
- }
-
- Status Visit(const BooleanType& type) {
- return Status::NotImplemented("Grouped MinMax data of type ", type);
- }
-
- Status Visit(const HalfFloatType& type) {
- return Status::NotImplemented("Grouped MinMax data of type ", type);
- }
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("Grouped MinMax data of type ", type);
- }
-
- ConsumeImpl consume_impl;
- ResizeImpl resize_min_impl, resize_max_impl;
- };
-
- Status Init(ExecContext* ctx, const FunctionOptions* options,
- const std::shared_ptr<DataType>& input_type) override {
- options_ = *checked_cast<const ScalarAggregateOptions*>(options);
- type_ = input_type;
-
- mins_ = BufferBuilder(ctx->memory_pool());
- maxes_ = BufferBuilder(ctx->memory_pool());
- has_values_ = TypedBufferBuilder<bool>(ctx->memory_pool());
- has_nulls_ = TypedBufferBuilder<bool>(ctx->memory_pool());
-
- GetImpl get_impl;
- RETURN_NOT_OK(VisitTypeInline(*input_type, &get_impl));
-
- consume_impl_ = std::move(get_impl.consume_impl);
- resize_min_impl_ = std::move(get_impl.resize_min_impl);
- resize_max_impl_ = std::move(get_impl.resize_max_impl);
-
- return Status::OK();
- }
-
- Status Consume(const ExecBatch& batch) override {
- RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
- num_groups_ += added_groups;
- RETURN_NOT_OK(resize_min_impl_(&mins_, added_groups));
- RETURN_NOT_OK(resize_max_impl_(&maxes_, added_groups));
- RETURN_NOT_OK(has_values_.Append(added_groups, false));
- RETURN_NOT_OK(has_nulls_.Append(added_groups, false));
- return Status::OK();
- }));
-
- auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
- consume_impl_(batch[0].array(), group_ids, mins_.mutable_data(),
- maxes_.mutable_data(), has_values_.mutable_data(),
- has_nulls_.mutable_data());
- return Status::OK();
- }
-
- Result<Datum> Finalize() override {
- // aggregation for a group is valid if there was at least one value in that group
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap, has_values_.Finish());
-
- if (!options_.skip_nulls) {
- // ... and there were no nulls in that group
- ARROW_ASSIGN_OR_RAISE(auto has_nulls, has_nulls_.Finish());
- arrow::internal::BitmapAndNot(null_bitmap->data(), 0, has_nulls->data(), 0,
- num_groups_, 0, null_bitmap->mutable_data());
- }
-
- auto mins = ArrayData::Make(type_, num_groups_, {null_bitmap, nullptr});
- auto maxes = ArrayData::Make(type_, num_groups_, {std::move(null_bitmap), nullptr});
- ARROW_ASSIGN_OR_RAISE(mins->buffers[1], mins_.Finish());
- ARROW_ASSIGN_OR_RAISE(maxes->buffers[1], maxes_.Finish());
-
- return ArrayData::Make(out_type(), num_groups_, {nullptr},
- {std::move(mins), std::move(maxes)});
- }
-
- std::shared_ptr<DataType> out_type() const override {
- return struct_({field("min", type_), field("max", type_)});
- }
-
-  int64_t num_groups_ = 0;
- BufferBuilder mins_, maxes_;
- TypedBufferBuilder<bool> has_values_, has_nulls_;
- std::shared_ptr<DataType> type_;
- ConsumeImpl consume_impl_;
- ResizeImpl resize_min_impl_, resize_max_impl_;
- ScalarAggregateOptions options_;
-};
-
-template <typename Impl>
-HashAggregateKernel MakeKernel(InputType argument_type) {
- HashAggregateKernel kernel;
-
- kernel.init = [](KernelContext* ctx,
- const KernelInitArgs& args) -> Result<std::unique_ptr<KernelState>> {
- auto impl = ::arrow::internal::make_unique<Impl>();
- // FIXME(bkietz) Init should not take a type. That should be an unboxed template arg
- // for the Impl. Otherwise we're not exposing dispatch as well as we should.
- RETURN_NOT_OK(impl->Init(ctx->exec_context(), args.options, args.inputs[0].type));
- return std::move(impl);
- };
-
- kernel.signature = KernelSignature::Make(
- {std::move(argument_type), InputType::Array(Type::UINT32),
- InputType::Scalar(Type::UINT32)},
- OutputType(
- [](KernelContext* ctx, const std::vector<ValueDescr>&) -> Result<ValueDescr> {
- return checked_cast<GroupedAggregator*>(ctx->state())->out_type();
- }));
-
- kernel.consume = [](KernelContext* ctx, const ExecBatch& batch) {
- return checked_cast<GroupedAggregator*>(ctx->state())->Consume(batch);
- };
-
- kernel.merge = [](KernelContext* ctx, KernelState&&, KernelState*) {
- // TODO(ARROW-11840) merge two hash tables
- return Status::NotImplemented("Merge hashed aggregations");
- };
-
- kernel.finalize = [](KernelContext* ctx, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(*out,
- checked_cast<GroupedAggregator*>(ctx->state())->Finalize());
- return Status::OK();
- };
-
- return kernel;
-}
-
-Result<std::vector<const HashAggregateKernel*>> GetKernels(
- ExecContext* ctx, const std::vector<Aggregate>& aggregates,
- const std::vector<ValueDescr>& in_descrs) {
- if (aggregates.size() != in_descrs.size()) {
- return Status::Invalid(aggregates.size(), " aggregate functions were specified but ",
- in_descrs.size(), " arguments were provided.");
- }
-
- std::vector<const HashAggregateKernel*> kernels(in_descrs.size());
-
- for (size_t i = 0; i < aggregates.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(auto function,
- ctx->func_registry()->GetFunction(aggregates[i].function));
- ARROW_ASSIGN_OR_RAISE(
- const Kernel* kernel,
- function->DispatchExact(
- {in_descrs[i], ValueDescr::Array(uint32()), ValueDescr::Scalar(uint32())}));
- kernels[i] = static_cast<const HashAggregateKernel*>(kernel);
- }
- return kernels;
-}
-
-Result<std::vector<std::unique_ptr<KernelState>>> InitKernels(
- const std::vector<const HashAggregateKernel*>& kernels, ExecContext* ctx,
- const std::vector<Aggregate>& aggregates, const std::vector<ValueDescr>& in_descrs) {
- std::vector<std::unique_ptr<KernelState>> states(kernels.size());
-
- for (size_t i = 0; i < aggregates.size(); ++i) {
- auto options = aggregates[i].options;
-
- if (options == nullptr) {
- // use known default options for the named function if possible
- auto maybe_function = ctx->func_registry()->GetFunction(aggregates[i].function);
- if (maybe_function.ok()) {
- options = maybe_function.ValueOrDie()->default_options();
- }
- }
-
- KernelContext kernel_ctx{ctx};
- ARROW_ASSIGN_OR_RAISE(
- states[i], kernels[i]->init(&kernel_ctx, KernelInitArgs{kernels[i],
- {
- in_descrs[i].type,
- uint32(),
- uint32(),
- },
- options}));
- }
-
- return std::move(states);
-}
-
-Result<FieldVector> ResolveKernels(
- const std::vector<Aggregate>& aggregates,
- const std::vector<const HashAggregateKernel*>& kernels,
- const std::vector<std::unique_ptr<KernelState>>& states, ExecContext* ctx,
- const std::vector<ValueDescr>& descrs) {
- FieldVector fields(descrs.size());
-
- for (size_t i = 0; i < kernels.size(); ++i) {
- KernelContext kernel_ctx{ctx};
- kernel_ctx.SetState(states[i].get());
-
- ARROW_ASSIGN_OR_RAISE(auto descr, kernels[i]->signature->out_type().Resolve(
- &kernel_ctx, {
- descrs[i].type,
- uint32(),
- uint32(),
- }));
- fields[i] = field(aggregates[i].function, std::move(descr.type));
- }
- return fields;
-}
-
-} // namespace
-
-Result<std::unique_ptr<Grouper>> Grouper::Make(const std::vector<ValueDescr>& descrs,
- ExecContext* ctx) {
- if (GrouperFastImpl::CanUse(descrs)) {
- return GrouperFastImpl::Make(descrs, ctx);
- }
- return GrouperImpl::Make(descrs, ctx);
-}
-
-Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
- const std::vector<Aggregate>& aggregates, ExecContext* ctx) {
- // Construct and initialize HashAggregateKernels
- ARROW_ASSIGN_OR_RAISE(auto argument_descrs,
- ExecBatch::Make(arguments).Map(
- [](ExecBatch batch) { return batch.GetDescriptors(); }));
-
- ARROW_ASSIGN_OR_RAISE(auto kernels, GetKernels(ctx, aggregates, argument_descrs));
-
- ARROW_ASSIGN_OR_RAISE(auto states,
- InitKernels(kernels, ctx, aggregates, argument_descrs));
-
- ARROW_ASSIGN_OR_RAISE(
- FieldVector out_fields,
- ResolveKernels(aggregates, kernels, states, ctx, argument_descrs));
-
- using arrow::compute::detail::ExecBatchIterator;
-
- ARROW_ASSIGN_OR_RAISE(auto argument_batch_iterator,
- ExecBatchIterator::Make(arguments, ctx->exec_chunksize()));
-
- // Construct Grouper
- ARROW_ASSIGN_OR_RAISE(auto key_descrs, ExecBatch::Make(keys).Map([](ExecBatch batch) {
- return batch.GetDescriptors();
- }));
-
- ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx));
-
- int i = 0;
- for (ValueDescr& key_descr : key_descrs) {
- out_fields.push_back(field("key_" + std::to_string(i++), std::move(key_descr.type)));
- }
-
- ARROW_ASSIGN_OR_RAISE(auto key_batch_iterator,
- ExecBatchIterator::Make(keys, ctx->exec_chunksize()));
-
- // start "streaming" execution
- ExecBatch key_batch, argument_batch;
- while (argument_batch_iterator->Next(&argument_batch) &&
- key_batch_iterator->Next(&key_batch)) {
- if (key_batch.length == 0) continue;
-
- // compute a batch of group ids
- ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch));
-
- // consume group ids with HashAggregateKernels
- for (size_t i = 0; i < kernels.size(); ++i) {
- KernelContext batch_ctx{ctx};
- batch_ctx.SetState(states[i].get());
- ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make({argument_batch[i], id_batch,
- Datum(grouper->num_groups())}));
- RETURN_NOT_OK(kernels[i]->consume(&batch_ctx, batch));
- }
- }
-
- // Finalize output
- ArrayDataVector out_data(arguments.size() + keys.size());
- auto it = out_data.begin();
-
- for (size_t i = 0; i < kernels.size(); ++i) {
- KernelContext batch_ctx{ctx};
- batch_ctx.SetState(states[i].get());
- Datum out;
- RETURN_NOT_OK(kernels[i]->finalize(&batch_ctx, &out));
- *it++ = out.array();
- }
-
- ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, grouper->GetUniques());
- for (const auto& key : out_keys.values) {
- *it++ = key.array();
- }
-
- int64_t length = out_data[0]->length;
- return ArrayData::Make(struct_(std::move(out_fields)), length,
- {/*null_bitmap=*/nullptr}, std::move(out_data),
- /*null_count=*/0);
-}
-
-Result<std::shared_ptr<ListArray>> Grouper::ApplyGroupings(const ListArray& groupings,
- const Array& array,
- ExecContext* ctx) {
- ARROW_ASSIGN_OR_RAISE(Datum sorted,
- compute::Take(array, groupings.data()->child_data[0],
- TakeOptions::NoBoundsCheck(), ctx));
-
- return std::make_shared<ListArray>(list(array.type()), groupings.length(),
- groupings.value_offsets(), sorted.make_array());
-}
-
-Result<std::shared_ptr<ListArray>> Grouper::MakeGroupings(const UInt32Array& ids,
- uint32_t num_groups,
- ExecContext* ctx) {
- if (ids.null_count() != 0) {
- return Status::Invalid("MakeGroupings with null ids");
- }
-
- ARROW_ASSIGN_OR_RAISE(auto offsets, AllocateBuffer(sizeof(int32_t) * (num_groups + 1),
- ctx->memory_pool()));
- auto raw_offsets = reinterpret_cast<int32_t*>(offsets->mutable_data());
-
- std::memset(raw_offsets, 0, offsets->size());
- for (int i = 0; i < ids.length(); ++i) {
- DCHECK_LT(ids.Value(i), num_groups);
- raw_offsets[ids.Value(i)] += 1;
- }
- int32_t length = 0;
- for (uint32_t id = 0; id < num_groups; ++id) {
- auto offset = raw_offsets[id];
- raw_offsets[id] = length;
- length += offset;
- }
- raw_offsets[num_groups] = length;
- DCHECK_EQ(ids.length(), length);
-
- ARROW_ASSIGN_OR_RAISE(auto offsets_copy,
- offsets->CopySlice(0, offsets->size(), ctx->memory_pool()));
- raw_offsets = reinterpret_cast<int32_t*>(offsets_copy->mutable_data());
-
- ARROW_ASSIGN_OR_RAISE(auto sort_indices, AllocateBuffer(sizeof(int32_t) * ids.length(),
- ctx->memory_pool()));
- auto raw_sort_indices = reinterpret_cast<int32_t*>(sort_indices->mutable_data());
- for (int i = 0; i < ids.length(); ++i) {
- raw_sort_indices[raw_offsets[ids.Value(i)]++] = i;
- }
-
- return std::make_shared<ListArray>(
- list(int32()), num_groups, std::move(offsets),
- std::make_shared<Int32Array>(ids.length(), std::move(sort_indices)));
-}
-
-namespace {
-const FunctionDoc hash_count_doc{"Count the number of null / non-null values",
- ("By default, non-null values are counted.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array", "group_id_array", "group_count"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc hash_sum_doc{"Sum values of a numeric array",
- ("Null values are ignored."),
- {"array", "group_id_array", "group_count"}};
-
-const FunctionDoc hash_min_max_doc{
- "Compute the minimum and maximum values of a numeric array",
- ("Null values are ignored by default.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array", "group_id_array", "group_count"},
- "ScalarAggregateOptions"};
-} // namespace
-
-void RegisterHashAggregateBasic(FunctionRegistry* registry) {
- {
- static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
- auto func = std::make_shared<HashAggregateFunction>(
- "hash_count", Arity::Ternary(), &hash_count_doc,
- &default_scalar_aggregate_options);
- DCHECK_OK(func->AddKernel(MakeKernel<GroupedCountImpl>(ValueDescr::ARRAY)));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-
- {
- auto func = std::make_shared<HashAggregateFunction>("hash_sum", Arity::Ternary(),
- &hash_sum_doc);
- DCHECK_OK(func->AddKernel(MakeKernel<GroupedSumImpl>(ValueDescr::ARRAY)));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-
- {
- static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
- auto func = std::make_shared<HashAggregateFunction>(
- "hash_min_max", Arity::Ternary(), &hash_min_max_doc,
- &default_scalar_aggregate_options);
- DCHECK_OK(func->AddKernel(MakeKernel<GroupedMinMaxImpl>(ValueDescr::ARRAY)));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec/key_compare.h"
+#include "arrow/compute/exec/key_encode.h"
+#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/exec/key_map.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::FirstTimeBitmapWriter;
+
+namespace compute {
+namespace internal {
+namespace {
+
+struct KeyEncoder {
+ // the first byte of an encoded key is used to indicate nullity
+ static constexpr bool kExtraByteForNull = true;
+
+ static constexpr uint8_t kNullByte = 1;
+ static constexpr uint8_t kValidByte = 0;
+
+ virtual ~KeyEncoder() = default;
+
+ virtual void AddLength(const ArrayData&, int32_t* lengths) = 0;
+
+ virtual Status Encode(const ArrayData&, uint8_t** encoded_bytes) = 0;
+
+ virtual Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes,
+ int32_t length, MemoryPool*) = 0;
+
+ // extract the null bitmap from the leading nullity bytes of encoded keys
+ static Status DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encoded_bytes,
+ std::shared_ptr<Buffer>* null_bitmap, int32_t* null_count) {
+ // first count nulls to determine if a null bitmap is necessary
+ *null_count = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ *null_count += (encoded_bytes[i][0] == kNullByte);
+ }
+
+ if (*null_count > 0) {
+ ARROW_ASSIGN_OR_RAISE(*null_bitmap, AllocateBitmap(length, pool));
+ uint8_t* validity = (*null_bitmap)->mutable_data();
+
+ FirstTimeBitmapWriter writer(validity, 0, length);
+ for (int32_t i = 0; i < length; ++i) {
+ if (encoded_bytes[i][0] == kValidByte) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ encoded_bytes[i] += 1;
+ }
+ writer.Finish();
+ } else {
+ for (int32_t i = 0; i < length; ++i) {
+ encoded_bytes[i] += 1;
+ }
+ }
+    return Status::OK();
+ }
+};
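+
+// For illustration: every encoded key cell begins with one nullity byte, so a
+// valid little-endian int32 value of 7 occupies five bytes:
+//
+//   [kValidByte, 0x07, 0x00, 0x00, 0x00]
+//
+// while a null cell stores kNullByte followed by zeroed payload bytes, keeping
+// all cells of a fixed-width column the same size. DecodeNulls() above strips
+// that leading byte and turns it back into a validity bitmap.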
+
+struct BooleanKeyEncoder : KeyEncoder {
+ static constexpr int kByteWidth = 1;
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ for (int64_t i = 0; i < data.length; ++i) {
+ lengths[i] += kByteWidth + kExtraByteForNull;
+ }
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ VisitArrayDataInline<BooleanType>(
+ data,
+ [&](bool value) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ *encoded_ptr++ = value;
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ *encoded_ptr++ = 0;
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBitmap(length, pool));
+
+ uint8_t* raw_output = key_buf->mutable_data();
+ for (int32_t i = 0; i < length; ++i) {
+ auto& encoded_ptr = encoded_bytes[i];
+ BitUtil::SetBitTo(raw_output, i, encoded_ptr[0] != 0);
+ encoded_ptr += 1;
+ }
+
+ return ArrayData::Make(boolean(), length, {std::move(null_buf), std::move(key_buf)},
+ null_count);
+ }
+};
+
+struct FixedWidthKeyEncoder : KeyEncoder {
+ explicit FixedWidthKeyEncoder(std::shared_ptr<DataType> type)
+ : type_(std::move(type)),
+ byte_width_(checked_cast<const FixedWidthType&>(*type_).bit_width() / 8) {}
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ for (int64_t i = 0; i < data.length; ++i) {
+ lengths[i] += byte_width_ + kExtraByteForNull;
+ }
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ ArrayData viewed(fixed_size_binary(byte_width_), data.length, data.buffers,
+ data.null_count, data.offset);
+
+ VisitArrayDataInline<FixedSizeBinaryType>(
+ viewed,
+ [&](util::string_view bytes) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ memcpy(encoded_ptr, bytes.data(), byte_width_);
+ encoded_ptr += byte_width_;
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ memset(encoded_ptr, 0, byte_width_);
+ encoded_ptr += byte_width_;
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length * byte_width_, pool));
+
+ uint8_t* raw_output = key_buf->mutable_data();
+ for (int32_t i = 0; i < length; ++i) {
+ auto& encoded_ptr = encoded_bytes[i];
+ std::memcpy(raw_output, encoded_ptr, byte_width_);
+ encoded_ptr += byte_width_;
+ raw_output += byte_width_;
+ }
+
+ return ArrayData::Make(type_, length, {std::move(null_buf), std::move(key_buf)},
+ null_count);
+ }
+
+ std::shared_ptr<DataType> type_;
+ int byte_width_;
+};
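+
+// Note that byte_width_ assumes a byte-aligned bit width (bit_width() / 8), so
+// BooleanType with its 1-bit width must be handled by BooleanKeyEncoder
+// instead. Viewing the column as fixed_size_binary(byte_width_) in Encode()
+// lets a single string_view visitor serve every fixed-width type, e.g. an
+// int32 column is visited as fixed_size_binary(4).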
+
+struct DictionaryKeyEncoder : FixedWidthKeyEncoder {
+ DictionaryKeyEncoder(std::shared_ptr<DataType> type, MemoryPool* pool)
+ : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {}
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ auto dict = MakeArray(data.dictionary);
+ if (dictionary_) {
+ if (!dictionary_->Equals(dict)) {
+ // TODO(bkietz) unify if necessary. For now, just error if any batch's dictionary
+ // differs from the first we saw for this key
+ return Status::NotImplemented("Unifying differing dictionaries");
+ }
+ } else {
+ dictionary_ = std::move(dict);
+ }
+ return FixedWidthKeyEncoder::Encode(data, encoded_bytes);
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ FixedWidthKeyEncoder::Decode(encoded_bytes, length, pool));
+
+ if (dictionary_) {
+ data->dictionary = dictionary_->data();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(type_, 0));
+ data->dictionary = dict->data();
+ }
+
+ data->type = type_;
+ return data;
+ }
+
+ MemoryPool* pool_;
+ std::shared_ptr<Array> dictionary_;
+};
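+
+// Dictionary keys are grouped by index only, so every consumed batch must
+// carry the same dictionary. A sketch of the failure mode, with dict_a and
+// dict_b standing for two dictionary arrays whose dictionaries differ:
+//
+//   RETURN_NOT_OK(encoder.Encode(*dict_a.data(), encoded_bytes));  // kept
+//   encoder.Encode(*dict_b.data(), encoded_bytes);
+//   // -> NotImplemented("Unifying differing dictionaries")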
+
+template <typename T>
+struct VarLengthKeyEncoder : KeyEncoder {
+ using Offset = typename T::offset_type;
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ int64_t i = 0;
+ VisitArrayDataInline<T>(
+ data,
+ [&](util::string_view bytes) {
+ lengths[i++] +=
+ kExtraByteForNull + sizeof(Offset) + static_cast<int32_t>(bytes.size());
+ },
+ [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); });
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ VisitArrayDataInline<T>(
+ data,
+ [&](util::string_view bytes) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ util::SafeStore(encoded_ptr, static_cast<Offset>(bytes.size()));
+ encoded_ptr += sizeof(Offset);
+ memcpy(encoded_ptr, bytes.data(), bytes.size());
+ encoded_ptr += bytes.size();
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ util::SafeStore(encoded_ptr, static_cast<Offset>(0));
+ encoded_ptr += sizeof(Offset);
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ Offset length_sum = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ length_sum += util::SafeLoadAs<Offset>(encoded_bytes[i]);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto offset_buf,
+ AllocateBuffer(sizeof(Offset) * (1 + length), pool));
+    ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length_sum, pool));
+
+ auto raw_offsets = reinterpret_cast<Offset*>(offset_buf->mutable_data());
+ auto raw_keys = key_buf->mutable_data();
+
+ Offset current_offset = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ raw_offsets[i] = current_offset;
+
+ auto key_length = util::SafeLoadAs<Offset>(encoded_bytes[i]);
+ encoded_bytes[i] += sizeof(Offset);
+
+ memcpy(raw_keys + current_offset, encoded_bytes[i], key_length);
+ encoded_bytes[i] += key_length;
+
+ current_offset += key_length;
+ }
+ raw_offsets[length] = current_offset;
+
+ return ArrayData::Make(
+ type_, length, {std::move(null_buf), std::move(offset_buf), std::move(key_buf)},
+ null_count);
+ }
+
+ explicit VarLengthKeyEncoder(std::shared_ptr<DataType> type) : type_(std::move(type)) {}
+
+ std::shared_ptr<DataType> type_;
+};
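+
+// For illustration: a variable-length cell is encoded as
+// [nullity byte][Offset length][raw bytes]. With BinaryType (32-bit offsets),
+// the value "abc" becomes
+//
+//   [kValidByte, 0x03, 0x00, 0x00, 0x00, 'a', 'b', 'c']
+//
+// and a null cell stores kNullByte plus a zero length with no payload, which
+// is why Decode() can rebuild the offsets buffer by summing stored lengths.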
+
+struct GrouperImpl : Grouper {
+ static Result<std::unique_ptr<GrouperImpl>> Make(const std::vector<ValueDescr>& keys,
+ ExecContext* ctx) {
+ auto impl = ::arrow::internal::make_unique<GrouperImpl>();
+
+ impl->encoders_.resize(keys.size());
+ impl->ctx_ = ctx;
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ const auto& key = keys[i].type;
+
+ if (key->id() == Type::BOOL) {
+ impl->encoders_[i] = ::arrow::internal::make_unique<BooleanKeyEncoder>();
+ continue;
+ }
+
+ if (key->id() == Type::DICTIONARY) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<DictionaryKeyEncoder>(key, ctx->memory_pool());
+ continue;
+ }
+
+ if (is_fixed_width(key->id())) {
+ impl->encoders_[i] = ::arrow::internal::make_unique<FixedWidthKeyEncoder>(key);
+ continue;
+ }
+
+ if (is_binary_like(key->id())) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<VarLengthKeyEncoder<BinaryType>>(key);
+ continue;
+ }
+
+ if (is_large_binary_like(key->id())) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<VarLengthKeyEncoder<LargeBinaryType>>(key);
+ continue;
+ }
+
+ return Status::NotImplemented("Keys of type ", *key);
+ }
+
+ return std::move(impl);
+ }
+
+ Result<Datum> Consume(const ExecBatch& batch) override {
+ std::vector<int32_t> offsets_batch(batch.length + 1);
+ for (int i = 0; i < batch.num_values(); ++i) {
+ encoders_[i]->AddLength(*batch[i].array(), offsets_batch.data());
+ }
+
+ int32_t total_length = 0;
+ for (int64_t i = 0; i < batch.length; ++i) {
+ auto total_length_before = total_length;
+ total_length += offsets_batch[i];
+ offsets_batch[i] = total_length_before;
+ }
+ offsets_batch[batch.length] = total_length;
+
+ std::vector<uint8_t> key_bytes_batch(total_length);
+ std::vector<uint8_t*> key_buf_ptrs(batch.length);
+ for (int64_t i = 0; i < batch.length; ++i) {
+ key_buf_ptrs[i] = key_bytes_batch.data() + offsets_batch[i];
+ }
+
+ for (int i = 0; i < batch.num_values(); ++i) {
+ RETURN_NOT_OK(encoders_[i]->Encode(*batch[i].array(), key_buf_ptrs.data()));
+ }
+
+ TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
+ RETURN_NOT_OK(group_ids_batch.Resize(batch.length));
+
+ for (int64_t i = 0; i < batch.length; ++i) {
+ int32_t key_length = offsets_batch[i + 1] - offsets_batch[i];
+ std::string key(
+ reinterpret_cast<const char*>(key_bytes_batch.data() + offsets_batch[i]),
+ key_length);
+
+ auto it_success = map_.emplace(key, num_groups_);
+ auto group_id = it_success.first->second;
+
+ if (it_success.second) {
+ // new key; update offsets and key_bytes
+ ++num_groups_;
+ auto next_key_offset = static_cast<int32_t>(key_bytes_.size());
+ key_bytes_.resize(next_key_offset + key_length);
+ offsets_.push_back(next_key_offset + key_length);
+ memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length);
+ }
+
+ group_ids_batch.UnsafeAppend(group_id);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish());
+ return Datum(UInt32Array(batch.length, std::move(group_ids)));
+ }
+
+ uint32_t num_groups() const override { return num_groups_; }
+
+ Result<ExecBatch> GetUniques() override {
+ ExecBatch out({}, num_groups_);
+
+ std::vector<uint8_t*> key_buf_ptrs(num_groups_);
+ for (int64_t i = 0; i < num_groups_; ++i) {
+ key_buf_ptrs[i] = key_bytes_.data() + offsets_[i];
+ }
+
+ out.values.resize(encoders_.size());
+ for (size_t i = 0; i < encoders_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ out.values[i],
+ encoders_[i]->Decode(key_buf_ptrs.data(), static_cast<int32_t>(num_groups_),
+ ctx_->memory_pool()));
+ }
+
+ return out;
+ }
+
+ ExecContext* ctx_;
+ std::unordered_map<std::string, uint32_t> map_;
+ std::vector<int32_t> offsets_ = {0};
+ std::vector<uint8_t> key_bytes_;
+ uint32_t num_groups_ = 0;
+ std::vector<std::unique_ptr<KeyEncoder>> encoders_;
+};
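+
+// A minimal usage sketch of the Grouper interface (assuming an ExecContext*
+// ctx and an ExecBatch key_batch holding one int64 key column):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto grouper,
+//                         Grouper::Make({ValueDescr::Array(int64())}, ctx));
+//   ARROW_ASSIGN_OR_RAISE(Datum group_ids, grouper->Consume(key_batch));
+//   ARROW_ASSIGN_OR_RAISE(ExecBatch uniques, grouper->GetUniques());
+//
+// group_ids holds one uint32 id per input row; uniques holds one row per
+// distinct key, in first-appearance order for GrouperImpl.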
+
+struct GrouperFastImpl : Grouper {
+ static constexpr int kBitmapPaddingForSIMD = 64; // bits
+ static constexpr int kPaddingForSIMD = 32; // bytes
+
+ static bool CanUse(const std::vector<ValueDescr>& keys) {
+#if ARROW_LITTLE_ENDIAN
+ for (size_t i = 0; i < keys.size(); ++i) {
+ const auto& key = keys[i].type;
+ if (is_large_binary_like(key->id())) {
+ return false;
+ }
+ }
+ return true;
+#else
+ return false;
+#endif
+ }
+
+ static Result<std::unique_ptr<GrouperFastImpl>> Make(
+ const std::vector<ValueDescr>& keys, ExecContext* ctx) {
+ auto impl = ::arrow::internal::make_unique<GrouperFastImpl>();
+ impl->ctx_ = ctx;
+
+ RETURN_NOT_OK(impl->temp_stack_.Init(ctx->memory_pool(), 64 * minibatch_size_max_));
+ impl->encode_ctx_.hardware_flags =
+ arrow::internal::CpuInfo::GetInstance()->hardware_flags();
+ impl->encode_ctx_.stack = &impl->temp_stack_;
+
+ auto num_columns = keys.size();
+ impl->col_metadata_.resize(num_columns);
+ impl->key_types_.resize(num_columns);
+ impl->dictionaries_.resize(num_columns);
+ for (size_t icol = 0; icol < num_columns; ++icol) {
+ const auto& key = keys[icol].type;
+ if (key->id() == Type::DICTIONARY) {
+ auto bit_width = checked_cast<const FixedWidthType&>(*key).bit_width();
+ ARROW_DCHECK(bit_width % 8 == 0);
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(true, bit_width / 8);
+ } else if (key->id() == Type::BOOL) {
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(true, 0);
+ } else if (is_fixed_width(key->id())) {
+ impl->col_metadata_[icol] = arrow::compute::KeyEncoder::KeyColumnMetadata(
+ true, checked_cast<const FixedWidthType&>(*key).bit_width() / 8);
+ } else if (is_binary_like(key->id())) {
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(false, sizeof(uint32_t));
+ } else {
+ return Status::NotImplemented("Keys of type ", *key);
+ }
+ impl->key_types_[icol] = key;
+ }
+
+ impl->encoder_.Init(impl->col_metadata_, &impl->encode_ctx_,
+ /* row_alignment = */ sizeof(uint64_t),
+ /* string_alignment = */ sizeof(uint64_t));
+ RETURN_NOT_OK(impl->rows_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
+ RETURN_NOT_OK(
+ impl->rows_minibatch_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
+ impl->minibatch_size_ = impl->minibatch_size_min_;
+ GrouperFastImpl* impl_ptr = impl.get();
+ auto equal_func = [impl_ptr](
+ int num_keys_to_compare, const uint16_t* selection_may_be_null,
+ const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
+ uint16_t* out_selection_mismatch) {
+ arrow::compute::KeyCompare::CompareRows(
+ num_keys_to_compare, selection_may_be_null, group_ids, &impl_ptr->encode_ctx_,
+ out_num_keys_mismatch, out_selection_mismatch, impl_ptr->rows_minibatch_,
+ impl_ptr->rows_);
+ };
+ auto append_func = [impl_ptr](int num_keys, const uint16_t* selection) {
+ return impl_ptr->rows_.AppendSelectionFrom(impl_ptr->rows_minibatch_, num_keys,
+ selection);
+ };
+ RETURN_NOT_OK(impl->map_.init(impl->encode_ctx_.hardware_flags, ctx->memory_pool(),
+ impl->encode_ctx_.stack, impl->log_minibatch_max_,
+ equal_func, append_func));
+ impl->cols_.resize(num_columns);
+ impl->minibatch_hashes_.resize(impl->minibatch_size_max_ +
+ kPaddingForSIMD / sizeof(uint32_t));
+
+ return std::move(impl);
+ }
+
+ ~GrouperFastImpl() { map_.cleanup(); }
+
+ Result<Datum> Consume(const ExecBatch& batch) override {
+ int64_t num_rows = batch.length;
+ int num_columns = batch.num_values();
+
+ // Process dictionaries
+ for (int icol = 0; icol < num_columns; ++icol) {
+ if (key_types_[icol]->id() == Type::DICTIONARY) {
+ auto data = batch[icol].array();
+ auto dict = MakeArray(data->dictionary);
+ if (dictionaries_[icol]) {
+ if (!dictionaries_[icol]->Equals(dict)) {
+ // TODO(bkietz) unify if necessary. For now, just error if any batch's
+ // dictionary differs from the first we saw for this key
+ return Status::NotImplemented("Unifying differing dictionaries");
+ }
+ } else {
+ dictionaries_[icol] = std::move(dict);
+ }
+ }
+ }
+
+ std::shared_ptr<arrow::Buffer> group_ids;
+ ARROW_ASSIGN_OR_RAISE(
+ group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
+
+ for (int icol = 0; icol < num_columns; ++icol) {
+ const uint8_t* non_nulls = nullptr;
+ if (batch[icol].array()->buffers[0] != NULLPTR) {
+ non_nulls = batch[icol].array()->buffers[0]->data();
+ }
+ const uint8_t* fixedlen = batch[icol].array()->buffers[1]->data();
+ const uint8_t* varlen = nullptr;
+ if (!col_metadata_[icol].is_fixed_length) {
+ varlen = batch[icol].array()->buffers[2]->data();
+ }
+
+ int64_t offset = batch[icol].array()->offset;
+
+ auto col_base = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[icol], offset + num_rows, non_nulls, fixedlen, varlen);
+
+ cols_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnArray(col_base, offset, num_rows);
+ }
+
+ // Split into smaller mini-batches
+ //
+ for (uint32_t start_row = 0; start_row < num_rows;) {
+ uint32_t batch_size_next = std::min(static_cast<uint32_t>(minibatch_size_),
+ static_cast<uint32_t>(num_rows) - start_row);
+
+ // Encode
+ rows_minibatch_.Clean();
+ RETURN_NOT_OK(encoder_.PrepareOutputForEncode(start_row, batch_size_next,
+ &rows_minibatch_, cols_));
+ encoder_.Encode(start_row, batch_size_next, &rows_minibatch_, cols_);
+
+ // Compute hash
+ if (encoder_.row_metadata().is_fixed_length) {
+ Hashing::hash_fixed(encode_ctx_.hardware_flags, batch_size_next,
+ encoder_.row_metadata().fixed_length, rows_minibatch_.data(1),
+ minibatch_hashes_.data());
+ } else {
+ auto hash_temp_buf =
+ util::TempVectorHolder<uint32_t>(&temp_stack_, 4 * batch_size_next);
+ Hashing::hash_varlen(encode_ctx_.hardware_flags, batch_size_next,
+ rows_minibatch_.offsets(), rows_minibatch_.data(2),
+ hash_temp_buf.mutable_data(), minibatch_hashes_.data());
+ }
+
+ // Map
+ RETURN_NOT_OK(
+ map_.map(batch_size_next, minibatch_hashes_.data(),
+ reinterpret_cast<uint32_t*>(group_ids->mutable_data()) + start_row));
+
+ start_row += batch_size_next;
+
+ if (minibatch_size_ * 2 <= minibatch_size_max_) {
+ minibatch_size_ *= 2;
+ }
+ }
+
+ return Datum(UInt32Array(batch.length, std::move(group_ids)));
+ }
+
+ uint32_t num_groups() const override { return static_cast<uint32_t>(rows_.length()); }
+
+ // Make sure padded buffers end up with the right logical size
+
+ Result<std::shared_ptr<Buffer>> AllocatePaddedBitmap(int64_t length) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> buf,
+ AllocateBitmap(length + kBitmapPaddingForSIMD, ctx_->memory_pool()));
+ return SliceMutableBuffer(buf, 0, BitUtil::BytesForBits(length));
+ }
+
+ Result<std::shared_ptr<Buffer>> AllocatePaddedBuffer(int64_t size) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> buf,
+ AllocateBuffer(size + kBitmapPaddingForSIMD, ctx_->memory_pool()));
+ return SliceMutableBuffer(buf, 0, size);
+ }
+
+ Result<ExecBatch> GetUniques() override {
+ auto num_columns = static_cast<uint32_t>(col_metadata_.size());
+ int64_t num_groups = rows_.length();
+
+ std::vector<std::shared_ptr<Buffer>> non_null_bufs(num_columns);
+ std::vector<std::shared_ptr<Buffer>> fixedlen_bufs(num_columns);
+ std::vector<std::shared_ptr<Buffer>> varlen_bufs(num_columns);
+
+ for (size_t i = 0; i < num_columns; ++i) {
+ ARROW_ASSIGN_OR_RAISE(non_null_bufs[i], AllocatePaddedBitmap(num_groups));
+ if (col_metadata_[i].is_fixed_length) {
+ if (col_metadata_[i].fixed_length == 0) {
+ ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocatePaddedBitmap(num_groups));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ fixedlen_bufs[i],
+ AllocatePaddedBuffer(num_groups * col_metadata_[i].fixed_length));
+ }
+ } else {
+ ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
+ AllocatePaddedBuffer((num_groups + 1) * sizeof(uint32_t)));
+ }
+ cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
+ fixedlen_bufs[i]->mutable_data(), nullptr);
+ }
+
+ for (int64_t start_row = 0; start_row < num_groups;) {
+ int64_t batch_size_next =
+ std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
+ encoder_.DecodeFixedLengthBuffers(start_row, start_row, batch_size_next, rows_,
+ &cols_);
+ start_row += batch_size_next;
+ }
+
+ if (!rows_.metadata().is_fixed_length) {
+ for (size_t i = 0; i < num_columns; ++i) {
+ if (!col_metadata_[i].is_fixed_length) {
+ auto varlen_size =
+ reinterpret_cast<const uint32_t*>(fixedlen_bufs[i]->data())[num_groups];
+ ARROW_ASSIGN_OR_RAISE(varlen_bufs[i], AllocatePaddedBuffer(varlen_size));
+ cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
+ fixedlen_bufs[i]->mutable_data(), varlen_bufs[i]->mutable_data());
+ }
+ }
+
+ for (int64_t start_row = 0; start_row < num_groups;) {
+ int64_t batch_size_next =
+ std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
+ encoder_.DecodeVaryingLengthBuffers(start_row, start_row, batch_size_next, rows_,
+ &cols_);
+ start_row += batch_size_next;
+ }
+ }
+
+ ExecBatch out({}, num_groups);
+ out.values.resize(num_columns);
+ for (size_t i = 0; i < num_columns; ++i) {
+ auto valid_count = arrow::internal::CountSetBits(
+ non_null_bufs[i]->data(), /*offset=*/0, static_cast<int64_t>(num_groups));
+ int null_count = static_cast<int>(num_groups) - static_cast<int>(valid_count);
+
+ if (col_metadata_[i].is_fixed_length) {
+ out.values[i] = ArrayData::Make(
+ key_types_[i], num_groups,
+ {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i])}, null_count);
+ } else {
+ out.values[i] =
+ ArrayData::Make(key_types_[i], num_groups,
+ {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i]),
+ std::move(varlen_bufs[i])},
+ null_count);
+ }
+ }
+
+ // Process dictionaries
+ for (size_t icol = 0; icol < num_columns; ++icol) {
+ if (key_types_[icol]->id() == Type::DICTIONARY) {
+ if (dictionaries_[icol]) {
+ out.values[icol].array()->dictionary = dictionaries_[icol]->data();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(key_types_[icol], 0));
+ out.values[icol].array()->dictionary = dict->data();
+ }
+ }
+ }
+
+ return out;
+ }
+
+ static constexpr int log_minibatch_max_ = 10;
+ static constexpr int minibatch_size_max_ = 1 << log_minibatch_max_;
+ static constexpr int minibatch_size_min_ = 128;
+ int minibatch_size_;
+
+ ExecContext* ctx_;
+ arrow::util::TempVectorStack temp_stack_;
+ arrow::compute::KeyEncoder::KeyEncoderContext encode_ctx_;
+
+ std::vector<std::shared_ptr<arrow::DataType>> key_types_;
+ std::vector<arrow::compute::KeyEncoder::KeyColumnMetadata> col_metadata_;
+ std::vector<arrow::compute::KeyEncoder::KeyColumnArray> cols_;
+ std::vector<uint32_t> minibatch_hashes_;
+
+ std::vector<std::shared_ptr<Array>> dictionaries_;
+
+ arrow::compute::KeyEncoder::KeyRowArray rows_;
+ arrow::compute::KeyEncoder::KeyRowArray rows_minibatch_;
+ arrow::compute::KeyEncoder encoder_;
+ arrow::compute::SwissTable map_;
+};
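+
+// Note: GrouperFastImpl consumes input in mini-batches whose size starts at
+// minibatch_size_min_ (128 rows) and doubles after each mini-batch until it
+// reaches minibatch_size_max_ (1024), presumably so the per-batch temporaries
+// stay small while the SwissTable is still nearly empty.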
+
+/// C++ abstract base class for the HashAggregateKernel interface.
+/// Implementations should be default constructible and perform initialization in
+/// Init().
+struct GroupedAggregator : KernelState {
+ virtual Status Init(ExecContext*, const FunctionOptions*,
+ const std::shared_ptr<DataType>&) = 0;
+
+ virtual Status Consume(const ExecBatch& batch) = 0;
+
+ virtual Result<Datum> Finalize() = 0;
+
+ template <typename Reserve>
+ Status MaybeReserve(int64_t old_num_groups, const ExecBatch& batch,
+ const Reserve& reserve) {
+ int64_t new_num_groups = batch[2].scalar_as<UInt32Scalar>().value;
+ if (new_num_groups <= old_num_groups) {
+ return Status::OK();
+ }
+ return reserve(new_num_groups - old_num_groups);
+ }
+
+ virtual std::shared_ptr<DataType> out_type() const = 0;
+};
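+
+// A rough sketch of the lifecycle (inside a function returning Result<Datum>;
+// options, ctx and batch are assumed to be in scope):
+//
+//   GroupedCountImpl agg;
+//   RETURN_NOT_OK(agg.Init(ctx, &options, int64()));
+//   RETURN_NOT_OK(agg.Consume(batch));  // batch = [values, group_ids, group_count]
+//   ARROW_ASSIGN_OR_RAISE(Datum out, agg.Finalize());
+//
+// MaybeReserve() reads the uint32 group_count scalar from batch[2] and asks
+// the implementation to grow its buffers by however many new groups appeared.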
+
+// ----------------------------------------------------------------------
+// Count implementation
+
+struct GroupedCountImpl : public GroupedAggregator {
+ Status Init(ExecContext* ctx, const FunctionOptions* options,
+ const std::shared_ptr<DataType>&) override {
+ options_ = checked_cast<const ScalarAggregateOptions&>(*options);
+ counts_ = BufferBuilder(ctx->memory_pool());
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ return counts_.Append(added_groups * sizeof(int64_t), 0);
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ auto raw_counts = reinterpret_cast<int64_t*>(counts_.mutable_data());
+
+ const auto& input = batch[0].array();
+
+ if (!options_.skip_nulls) {
+ if (input->GetNullCount() != 0) {
+ for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) {
+ auto g = group_ids[i];
+ raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i);
+ }
+ }
+ return Status::OK();
+ }
+
+ arrow::internal::VisitSetBitRunsVoid(
+ input->buffers[0], input->offset, input->length,
+ [&](int64_t begin, int64_t length) {
+ for (int64_t input_i = begin, i = begin - input->offset;
+ input_i < begin + length; ++input_i, ++i) {
+ auto g = group_ids[i];
+ raw_counts[g] += 1;
+ }
+ });
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ ARROW_ASSIGN_OR_RAISE(auto counts, counts_.Finish());
+ return std::make_shared<Int64Array>(num_groups_, std::move(counts));
+ }
+
+ std::shared_ptr<DataType> out_type() const override { return int64(); }
+
+ int64_t num_groups_ = 0;
+ ScalarAggregateOptions options_;
+ BufferBuilder counts_;
+};
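+
+// Worked example: with the default skip_nulls=true, values [1, null, 3, null]
+// and group ids [0, 1, 0, 1] finalize to counts [2, 0]; with skip_nulls=false
+// the same input counts nulls instead and finalizes to [0, 2].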
+
+// ----------------------------------------------------------------------
+// Sum implementation
+
+struct GroupedSumImpl : public GroupedAggregator {
+ // NB: whether we are accumulating into double, int64_t, or uint64_t
+ // we always have 64 bits per group in the sums buffer.
+ static constexpr size_t kSumSize = sizeof(int64_t);
+
+ using ConsumeImpl = std::function<void(const std::shared_ptr<ArrayData>&,
+ const uint32_t*, void*, int64_t*)>;
+
+ struct GetConsumeImpl {
+ template <typename T, typename AccType = typename FindAccumulatorType<T>::Type>
+ Status Visit(const T&) {
+ consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
+ void* boxed_sums, int64_t* counts) {
+ auto sums = reinterpret_cast<typename TypeTraits<AccType>::CType*>(boxed_sums);
+
+ VisitArrayDataInline<T>(
+ *input,
+ [&](typename TypeTraits<T>::CType value) {
+ sums[*group] += value;
+ counts[*group] += 1;
+ ++group;
+ },
+ [&] { ++group; });
+ };
+ out_type = TypeTraits<AccType>::type_singleton();
+ return Status::OK();
+ }
+
+ Status Visit(const HalfFloatType& type) {
+ return Status::NotImplemented("Summing data of type ", type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Summing data of type ", type);
+ }
+
+ ConsumeImpl consume_impl;
+ std::shared_ptr<DataType> out_type;
+ };
+
+ Status Init(ExecContext* ctx, const FunctionOptions*,
+ const std::shared_ptr<DataType>& input_type) override {
+ pool_ = ctx->memory_pool();
+ sums_ = BufferBuilder(pool_);
+ counts_ = BufferBuilder(pool_);
+
+ GetConsumeImpl get_consume_impl;
+ RETURN_NOT_OK(VisitTypeInline(*input_type, &get_consume_impl));
+
+ consume_impl_ = std::move(get_consume_impl.consume_impl);
+ out_type_ = std::move(get_consume_impl.out_type);
+
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ RETURN_NOT_OK(sums_.Append(added_groups * kSumSize, 0));
+ RETURN_NOT_OK(counts_.Append(added_groups * sizeof(int64_t), 0));
+ return Status::OK();
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ consume_impl_(batch[0].array(), group_ids, sums_.mutable_data(),
+ reinterpret_cast<int64_t*>(counts_.mutable_data()));
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ std::shared_ptr<Buffer> null_bitmap;
+ int64_t null_count = 0;
+
+ for (int64_t i = 0; i < num_groups_; ++i) {
+ if (reinterpret_cast<const int64_t*>(counts_.data())[i] > 0) continue;
+
+ if (null_bitmap == nullptr) {
+ ARROW_ASSIGN_OR_RAISE(null_bitmap, AllocateBitmap(num_groups_, pool_));
+ BitUtil::SetBitsTo(null_bitmap->mutable_data(), 0, num_groups_, true);
+ }
+
+ null_count += 1;
+ BitUtil::SetBitTo(null_bitmap->mutable_data(), i, false);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto sums, sums_.Finish());
+
+ return ArrayData::Make(std::move(out_type_), num_groups_,
+ {std::move(null_bitmap), std::move(sums)}, null_count);
+ }
+
+ std::shared_ptr<DataType> out_type() const override { return out_type_; }
+
+ // NB: counts are used here instead of a simple "has_values_" bitmap since
+ // we expect to reuse this kernel to handle Mean
+ int64_t num_groups_ = 0;
+ BufferBuilder sums_, counts_;
+ std::shared_ptr<DataType> out_type_;
+ ConsumeImpl consume_impl_;
+ MemoryPool* pool_;
+};
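+
+// Note: FindAccumulatorType (from aggregate_internal.h) widens the input type
+// before summing, so an int8 column accumulates into an int64 sums_ buffer and
+// two int8 values of 100 in one group sum to 200 instead of overflowing.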
+
+// ----------------------------------------------------------------------
+// MinMax implementation
+
+template <typename CType>
+struct Extrema : std::numeric_limits<CType> {};
+
+template <>
+struct Extrema<float> {
+ static constexpr float min() { return -std::numeric_limits<float>::infinity(); }
+ static constexpr float max() { return std::numeric_limits<float>::infinity(); }
+};
+
+template <>
+struct Extrema<double> {
+ static constexpr double min() { return -std::numeric_limits<double>::infinity(); }
+ static constexpr double max() { return std::numeric_limits<double>::infinity(); }
+};
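+
+// These specializations are needed because std::numeric_limits<float>::min()
+// is the smallest *positive* normal value, not the most negative one: seeding
+// the maxes buffer with it would make an all-negative column report a maximum
+// of roughly 1.18e-38. Seeding with -/+ infinity guarantees any finite input
+// replaces the initial value.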
+
+struct GroupedMinMaxImpl : public GroupedAggregator {
+ using ConsumeImpl =
+ std::function<void(const std::shared_ptr<ArrayData>&, const uint32_t*, void*, void*,
+ uint8_t*, uint8_t*)>;
+
+ using ResizeImpl = std::function<Status(BufferBuilder*, int64_t)>;
+
+ template <typename CType>
+ static ResizeImpl MakeResizeImpl(CType anti_extreme) {
+    // resize a min or max buffer, storing the correct anti-extreme
+ return [anti_extreme](BufferBuilder* builder, int64_t added_groups) {
+ TypedBufferBuilder<CType> typed_builder(std::move(*builder));
+ RETURN_NOT_OK(typed_builder.Append(added_groups, anti_extreme));
+ *builder = std::move(*typed_builder.bytes_builder());
+ return Status::OK();
+ };
+ }
+
+ struct GetImpl {
+ template <typename T, typename CType = typename TypeTraits<T>::CType>
+ enable_if_number<T, Status> Visit(const T&) {
+ consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
+ void* mins, void* maxes, uint8_t* has_values,
+ uint8_t* has_nulls) {
+ auto raw_mins = reinterpret_cast<CType*>(mins);
+ auto raw_maxes = reinterpret_cast<CType*>(maxes);
+
+ VisitArrayDataInline<T>(
+ *input,
+ [&](CType val) {
+ raw_maxes[*group] = std::max(raw_maxes[*group], val);
+ raw_mins[*group] = std::min(raw_mins[*group], val);
+ BitUtil::SetBit(has_values, *group++);
+ },
+ [&] { BitUtil::SetBit(has_nulls, *group++); });
+ };
+
+ resize_min_impl = MakeResizeImpl(Extrema<CType>::max());
+ resize_max_impl = MakeResizeImpl(Extrema<CType>::min());
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ Status Visit(const HalfFloatType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ ConsumeImpl consume_impl;
+ ResizeImpl resize_min_impl, resize_max_impl;
+ };
+
+ Status Init(ExecContext* ctx, const FunctionOptions* options,
+ const std::shared_ptr<DataType>& input_type) override {
+ options_ = *checked_cast<const ScalarAggregateOptions*>(options);
+ type_ = input_type;
+
+ mins_ = BufferBuilder(ctx->memory_pool());
+ maxes_ = BufferBuilder(ctx->memory_pool());
+ has_values_ = TypedBufferBuilder<bool>(ctx->memory_pool());
+ has_nulls_ = TypedBufferBuilder<bool>(ctx->memory_pool());
+
+ GetImpl get_impl;
+ RETURN_NOT_OK(VisitTypeInline(*input_type, &get_impl));
+
+ consume_impl_ = std::move(get_impl.consume_impl);
+ resize_min_impl_ = std::move(get_impl.resize_min_impl);
+ resize_max_impl_ = std::move(get_impl.resize_max_impl);
+
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ RETURN_NOT_OK(resize_min_impl_(&mins_, added_groups));
+ RETURN_NOT_OK(resize_max_impl_(&maxes_, added_groups));
+ RETURN_NOT_OK(has_values_.Append(added_groups, false));
+ RETURN_NOT_OK(has_nulls_.Append(added_groups, false));
+ return Status::OK();
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ consume_impl_(batch[0].array(), group_ids, mins_.mutable_data(),
+ maxes_.mutable_data(), has_values_.mutable_data(),
+ has_nulls_.mutable_data());
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+    // aggregation for a group is valid if there was at least one value in that group
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap, has_values_.Finish());
+
+ if (!options_.skip_nulls) {
+ // ... and there were no nulls in that group
+ ARROW_ASSIGN_OR_RAISE(auto has_nulls, has_nulls_.Finish());
+ arrow::internal::BitmapAndNot(null_bitmap->data(), 0, has_nulls->data(), 0,
+ num_groups_, 0, null_bitmap->mutable_data());
+ }
+
+ auto mins = ArrayData::Make(type_, num_groups_, {null_bitmap, nullptr});
+ auto maxes = ArrayData::Make(type_, num_groups_, {std::move(null_bitmap), nullptr});
+ ARROW_ASSIGN_OR_RAISE(mins->buffers[1], mins_.Finish());
+ ARROW_ASSIGN_OR_RAISE(maxes->buffers[1], maxes_.Finish());
+
+ return ArrayData::Make(out_type(), num_groups_, {nullptr},
+ {std::move(mins), std::move(maxes)});
+ }
+
+ std::shared_ptr<DataType> out_type() const override {
+ return struct_({field("min", type_), field("max", type_)});
+ }
+
+  int64_t num_groups_ = 0;
+ BufferBuilder mins_, maxes_;
+ TypedBufferBuilder<bool> has_values_, has_nulls_;
+ std::shared_ptr<DataType> type_;
+ ConsumeImpl consume_impl_;
+ ResizeImpl resize_min_impl_, resize_max_impl_;
+ ScalarAggregateOptions options_;
+};
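+
+// Worked example: for float values [5, 2, 7] with group ids [0, 0, 1], the
+// buffers are seeded as mins = [+inf, +inf] and maxes = [-inf, -inf] and
+// finalize to min = [2, 7], max = [5, 7], with has_values_ marking both
+// groups valid.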
+
+template <typename Impl>
+HashAggregateKernel MakeKernel(InputType argument_type) {
+ HashAggregateKernel kernel;
+
+ kernel.init = [](KernelContext* ctx,
+ const KernelInitArgs& args) -> Result<std::unique_ptr<KernelState>> {
+ auto impl = ::arrow::internal::make_unique<Impl>();
+ // FIXME(bkietz) Init should not take a type. That should be an unboxed template arg
+ // for the Impl. Otherwise we're not exposing dispatch as well as we should.
+ RETURN_NOT_OK(impl->Init(ctx->exec_context(), args.options, args.inputs[0].type));
+ return std::move(impl);
+ };
+
+ kernel.signature = KernelSignature::Make(
+ {std::move(argument_type), InputType::Array(Type::UINT32),
+ InputType::Scalar(Type::UINT32)},
+ OutputType(
+ [](KernelContext* ctx, const std::vector<ValueDescr>&) -> Result<ValueDescr> {
+ return checked_cast<GroupedAggregator*>(ctx->state())->out_type();
+ }));
+
+ kernel.consume = [](KernelContext* ctx, const ExecBatch& batch) {
+ return checked_cast<GroupedAggregator*>(ctx->state())->Consume(batch);
+ };
+
+ kernel.merge = [](KernelContext* ctx, KernelState&&, KernelState*) {
+ // TODO(ARROW-11840) merge two hash tables
+ return Status::NotImplemented("Merge hashed aggregations");
+ };
+
+ kernel.finalize = [](KernelContext* ctx, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(*out,
+ checked_cast<GroupedAggregator*>(ctx->state())->Finalize());
+ return Status::OK();
+ };
+
+ return kernel;
+}
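+
+// Note: the kernel wired here is deliberately partial. merge() is stubbed out
+// pending ARROW-11840, so per-thread partial results cannot be combined yet;
+// GroupBy() below therefore drives one KernelState over every batch serially.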
+
+Result<std::vector<const HashAggregateKernel*>> GetKernels(
+ ExecContext* ctx, const std::vector<Aggregate>& aggregates,
+ const std::vector<ValueDescr>& in_descrs) {
+ if (aggregates.size() != in_descrs.size()) {
+ return Status::Invalid(aggregates.size(), " aggregate functions were specified but ",
+ in_descrs.size(), " arguments were provided.");
+ }
+
+ std::vector<const HashAggregateKernel*> kernels(in_descrs.size());
+
+ for (size_t i = 0; i < aggregates.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto function,
+ ctx->func_registry()->GetFunction(aggregates[i].function));
+ ARROW_ASSIGN_OR_RAISE(
+ const Kernel* kernel,
+ function->DispatchExact(
+ {in_descrs[i], ValueDescr::Array(uint32()), ValueDescr::Scalar(uint32())}));
+ kernels[i] = static_cast<const HashAggregateKernel*>(kernel);
+ }
+ return kernels;
+}
+
+Result<std::vector<std::unique_ptr<KernelState>>> InitKernels(
+ const std::vector<const HashAggregateKernel*>& kernels, ExecContext* ctx,
+ const std::vector<Aggregate>& aggregates, const std::vector<ValueDescr>& in_descrs) {
+ std::vector<std::unique_ptr<KernelState>> states(kernels.size());
+
+ for (size_t i = 0; i < aggregates.size(); ++i) {
+ auto options = aggregates[i].options;
+
+ if (options == nullptr) {
+ // use known default options for the named function if possible
+ auto maybe_function = ctx->func_registry()->GetFunction(aggregates[i].function);
+ if (maybe_function.ok()) {
+ options = maybe_function.ValueOrDie()->default_options();
+ }
+ }
+
+ KernelContext kernel_ctx{ctx};
+ ARROW_ASSIGN_OR_RAISE(
+ states[i], kernels[i]->init(&kernel_ctx, KernelInitArgs{kernels[i],
+ {
+ in_descrs[i].type,
+ uint32(),
+ uint32(),
+ },
+ options}));
+ }
+
+ return std::move(states);
+}
+
+Result<FieldVector> ResolveKernels(
+ const std::vector<Aggregate>& aggregates,
+ const std::vector<const HashAggregateKernel*>& kernels,
+ const std::vector<std::unique_ptr<KernelState>>& states, ExecContext* ctx,
+ const std::vector<ValueDescr>& descrs) {
+ FieldVector fields(descrs.size());
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext kernel_ctx{ctx};
+ kernel_ctx.SetState(states[i].get());
+
+ ARROW_ASSIGN_OR_RAISE(auto descr, kernels[i]->signature->out_type().Resolve(
+ &kernel_ctx, {
+ descrs[i].type,
+ uint32(),
+ uint32(),
+ }));
+ fields[i] = field(aggregates[i].function, std::move(descr.type));
+ }
+ return fields;
+}
+
+} // namespace
+
+Result<std::unique_ptr<Grouper>> Grouper::Make(const std::vector<ValueDescr>& descrs,
+ ExecContext* ctx) {
+ if (GrouperFastImpl::CanUse(descrs)) {
+ return GrouperFastImpl::Make(descrs, ctx);
+ }
+ return GrouperImpl::Make(descrs, ctx);
+}
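+
+// Dispatch prefers the SwissTable-backed GrouperFastImpl, which currently
+// requires a little-endian platform and supports no large-binary keys; any
+// other case falls back to the encode-to-string GrouperImpl.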
+
+Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
+ const std::vector<Aggregate>& aggregates, ExecContext* ctx) {
+ // Construct and initialize HashAggregateKernels
+ ARROW_ASSIGN_OR_RAISE(auto argument_descrs,
+ ExecBatch::Make(arguments).Map(
+ [](ExecBatch batch) { return batch.GetDescriptors(); }));
+
+ ARROW_ASSIGN_OR_RAISE(auto kernels, GetKernels(ctx, aggregates, argument_descrs));
+
+ ARROW_ASSIGN_OR_RAISE(auto states,
+ InitKernels(kernels, ctx, aggregates, argument_descrs));
+
+ ARROW_ASSIGN_OR_RAISE(
+ FieldVector out_fields,
+ ResolveKernels(aggregates, kernels, states, ctx, argument_descrs));
+
+ using arrow::compute::detail::ExecBatchIterator;
+
+ ARROW_ASSIGN_OR_RAISE(auto argument_batch_iterator,
+ ExecBatchIterator::Make(arguments, ctx->exec_chunksize()));
+
+ // Construct Grouper
+ ARROW_ASSIGN_OR_RAISE(auto key_descrs, ExecBatch::Make(keys).Map([](ExecBatch batch) {
+ return batch.GetDescriptors();
+ }));
+
+ ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx));
+
+ int i = 0;
+ for (ValueDescr& key_descr : key_descrs) {
+ out_fields.push_back(field("key_" + std::to_string(i++), std::move(key_descr.type)));
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto key_batch_iterator,
+ ExecBatchIterator::Make(keys, ctx->exec_chunksize()));
+
+ // start "streaming" execution
+ ExecBatch key_batch, argument_batch;
+ while (argument_batch_iterator->Next(&argument_batch) &&
+ key_batch_iterator->Next(&key_batch)) {
+ if (key_batch.length == 0) continue;
+
+ // compute a batch of group ids
+ ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch));
+
+ // consume group ids with HashAggregateKernels
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext batch_ctx{ctx};
+ batch_ctx.SetState(states[i].get());
+ ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make({argument_batch[i], id_batch,
+ Datum(grouper->num_groups())}));
+ RETURN_NOT_OK(kernels[i]->consume(&batch_ctx, batch));
+ }
+ }
+
+ // Finalize output
+ ArrayDataVector out_data(arguments.size() + keys.size());
+ auto it = out_data.begin();
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext batch_ctx{ctx};
+ batch_ctx.SetState(states[i].get());
+ Datum out;
+ RETURN_NOT_OK(kernels[i]->finalize(&batch_ctx, &out));
+ *it++ = out.array();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, grouper->GetUniques());
+ for (const auto& key : out_keys.values) {
+ *it++ = key.array();
+ }
+
+ int64_t length = out_data[0]->length;
+ return ArrayData::Make(struct_(std::move(out_fields)), length,
+ {/*null_bitmap=*/nullptr}, std::move(out_data),
+ /*null_count=*/0);
+}
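+
+// A minimal call sketch (assuming equal-length Datum vectors values and keys,
+// and an ExecContext* ctx):
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       Datum out, GroupBy({values}, {keys}, {{"hash_sum", nullptr}}, ctx));
+//
+// The result is a struct array with one field per aggregate ("hash_sum")
+// followed by one field per key ("key_0"), and one row per distinct key.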
+
+Result<std::shared_ptr<ListArray>> Grouper::ApplyGroupings(const ListArray& groupings,
+ const Array& array,
+ ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum sorted,
+ compute::Take(array, groupings.data()->child_data[0],
+ TakeOptions::NoBoundsCheck(), ctx));
+
+ return std::make_shared<ListArray>(list(array.type()), groupings.length(),
+ groupings.value_offsets(), sorted.make_array());
+}
+
+Result<std::shared_ptr<ListArray>> Grouper::MakeGroupings(const UInt32Array& ids,
+ uint32_t num_groups,
+ ExecContext* ctx) {
+ if (ids.null_count() != 0) {
+ return Status::Invalid("MakeGroupings with null ids");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto offsets, AllocateBuffer(sizeof(int32_t) * (num_groups + 1),
+ ctx->memory_pool()));
+ auto raw_offsets = reinterpret_cast<int32_t*>(offsets->mutable_data());
+
+ std::memset(raw_offsets, 0, offsets->size());
+ for (int i = 0; i < ids.length(); ++i) {
+ DCHECK_LT(ids.Value(i), num_groups);
+ raw_offsets[ids.Value(i)] += 1;
+ }
+ int32_t length = 0;
+ for (uint32_t id = 0; id < num_groups; ++id) {
+ auto offset = raw_offsets[id];
+ raw_offsets[id] = length;
+ length += offset;
+ }
+ raw_offsets[num_groups] = length;
+ DCHECK_EQ(ids.length(), length);
+
+ ARROW_ASSIGN_OR_RAISE(auto offsets_copy,
+ offsets->CopySlice(0, offsets->size(), ctx->memory_pool()));
+ raw_offsets = reinterpret_cast<int32_t*>(offsets_copy->mutable_data());
+
+ ARROW_ASSIGN_OR_RAISE(auto sort_indices, AllocateBuffer(sizeof(int32_t) * ids.length(),
+ ctx->memory_pool()));
+ auto raw_sort_indices = reinterpret_cast<int32_t*>(sort_indices->mutable_data());
+ for (int i = 0; i < ids.length(); ++i) {
+ raw_sort_indices[raw_offsets[ids.Value(i)]++] = i;
+ }
+
+ return std::make_shared<ListArray>(
+ list(int32()), num_groups, std::move(offsets),
+ std::make_shared<Int32Array>(ids.length(), std::move(sort_indices)));
+}
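+
+// Worked example: MakeGroupings is a counting sort over group ids. For
+// ids = [1, 0, 1, 2] and num_groups = 3, the counting pass yields [1, 2, 1],
+// the prefix-sum pass turns that into offsets [0, 1, 3, 4], and the scatter
+// pass produces sort_indices [1, 0, 2, 3], i.e. the list array
+// [[1], [0, 2], [3]] of row indices per group.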
+
+namespace {
+const FunctionDoc hash_count_doc{"Count the number of null / non-null values",
+ ("By default, non-null values are counted.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array", "group_id_array", "group_count"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc hash_sum_doc{"Sum values of a numeric array",
+ ("Null values are ignored."),
+ {"array", "group_id_array", "group_count"}};
+
+const FunctionDoc hash_min_max_doc{
+ "Compute the minimum and maximum values of a numeric array",
+ ("Null values are ignored by default.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array", "group_id_array", "group_count"},
+ "ScalarAggregateOptions"};
+} // namespace
+
+void RegisterHashAggregateBasic(FunctionRegistry* registry) {
+ {
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ auto func = std::make_shared<HashAggregateFunction>(
+ "hash_count", Arity::Ternary(), &hash_count_doc,
+ &default_scalar_aggregate_options);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedCountImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ auto func = std::make_shared<HashAggregateFunction>("hash_sum", Arity::Ternary(),
+ &hash_sum_doc);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedSumImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ auto func = std::make_shared<HashAggregateFunction>(
+ "hash_min_max", Arity::Ternary(), &hash_min_max_doc,
+ &default_scalar_aggregate_options);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedMinMaxImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
index a5d4a557740..a8f1f82771b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
@@ -15,17 +15,17 @@
// specific language governing permissions and limitations
// under the License.
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <utility>
-
-#include "arrow/compute/kernels/codegen_internal.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <utility>
+
+#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/decimal.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/decimal.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/macros.h"
@@ -34,7 +34,7 @@ namespace arrow {
using internal::AddWithOverflow;
using internal::DivideWithOverflow;
using internal::MultiplyWithOverflow;
-using internal::NegateWithOverflow;
+using internal::NegateWithOverflow;
using internal::SubtractWithOverflow;
namespace compute {
@@ -42,8 +42,8 @@ namespace internal {
using applicator::ScalarBinaryEqualTypes;
using applicator::ScalarBinaryNotNullEqualTypes;
-using applicator::ScalarUnary;
-using applicator::ScalarUnaryNotNull;
+using applicator::ScalarUnary;
+using applicator::ScalarUnaryNotNull;
namespace {
@@ -55,169 +55,169 @@ template <typename T>
using is_signed_integer =
std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
-template <typename T, typename R = T>
-using enable_if_signed_integer = enable_if_t<is_signed_integer<T>::value, R>;
+template <typename T, typename R = T>
+using enable_if_signed_integer = enable_if_t<is_signed_integer<T>::value, R>;
-template <typename T, typename R = T>
-using enable_if_unsigned_integer = enable_if_t<is_unsigned_integer<T>::value, R>;
+template <typename T, typename R = T>
+using enable_if_unsigned_integer = enable_if_t<is_unsigned_integer<T>::value, R>;
-template <typename T, typename R = T>
+template <typename T, typename R = T>
using enable_if_integer =
- enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, R>;
-
-template <typename T, typename R = T>
-using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, R>;
+ enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, R>;
+template <typename T, typename R = T>
+using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, R>;
+
template <typename T>
-using enable_if_decimal =
- enable_if_t<std::is_same<Decimal128, T>::value || std::is_same<Decimal256, T>::value,
- T>;
+using enable_if_decimal =
+ enable_if_t<std::is_same<Decimal128, T>::value || std::is_same<Decimal256, T>::value,
+ T>;
template <typename T, typename Unsigned = typename std::make_unsigned<T>::type>
constexpr Unsigned to_unsigned(T signed_) {
return static_cast<Unsigned>(signed_);
}
-struct AbsoluteValue {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, T arg, Status*) {
- return std::fabs(arg);
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, T arg, Status*) {
- return arg;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, T arg, Status* st) {
- return (arg < 0) ? arrow::internal::SafeSignedNegate(arg) : arg;
- }
-};
-
-struct AbsoluteValueChecked {
- template <typename T, typename Arg>
- static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == std::numeric_limits<Arg>::min()) {
- *st = Status::Invalid("overflow");
- return arg;
- }
- return std::abs(arg);
- }
-
- template <typename T, typename Arg>
- static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- return arg;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- return std::fabs(arg);
- }
-};
-
+struct AbsoluteValue {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, T arg, Status*) {
+ return std::fabs(arg);
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, T arg, Status*) {
+ return arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, T arg, Status* st) {
+ return (arg < 0) ? arrow::internal::SafeSignedNegate(arg) : arg;
+ }
+};
+
+struct AbsoluteValueChecked {
+ template <typename T, typename Arg>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == std::numeric_limits<Arg>::min()) {
+ *st = Status::Invalid("overflow");
+ return arg;
+ }
+ return std::abs(arg);
+ }
+
+ template <typename T, typename Arg>
+ static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return std::fabs(arg);
+ }
+};
+
struct Add {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
return left + right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
- Arg1 right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
+ Arg1 right, Status*) {
return left + right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
return arrow::internal::SafeSignedAdd(left, right);
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + right;
+ }
};
struct AddChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result = 0;
if (ARROW_PREDICT_FALSE(AddWithOverflow(left, right, &result))) {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left + right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + right;
+ }
};
struct Subtract {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left - right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
- Arg1 right, Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
+ Arg1 right, Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left - right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return arrow::internal::SafeSignedSubtract(left, right);
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + (-right);
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + (-right);
+ }
};
struct SubtractChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result = 0;
if (ARROW_PREDICT_FALSE(SubtractWithOverflow(left, right, &result))) {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left - right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + (-right);
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + (-right);
+ }
};
struct Multiply {
@@ -230,23 +230,23 @@ struct Multiply {
static_assert(std::is_same<decltype(int64_t() * int64_t()), int64_t>::value, "");
static_assert(std::is_same<decltype(uint64_t() * uint64_t()), uint64_t>::value, "");
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, T left, T right,
- Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, T left, T right,
+ Status*) {
return left * right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_t<
- is_unsigned_integer<T>::value && !std::is_same<T, uint16_t>::value, T>
- Call(KernelContext*, T left, T right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_t<
+ is_unsigned_integer<T>::value && !std::is_same<T, uint16_t>::value, T>
+ Call(KernelContext*, T left, T right, Status*) {
return left * right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_t<
- is_signed_integer<T>::value && !std::is_same<T, int16_t>::value, T>
- Call(KernelContext*, T left, T right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_t<
+ is_signed_integer<T>::value && !std::is_same<T, int16_t>::value, T>
+ Call(KernelContext*, T left, T right, Status*) {
return to_unsigned(left) * to_unsigned(right);
}
@@ -254,593 +254,593 @@ struct Multiply {
// integer. However, some inputs may nevertheless overflow (which triggers undefined
// behaviour). Therefore we first cast to 32 bit unsigned integers where overflow is
// well defined.
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_same<T, int16_t, T> Call(KernelContext*, int16_t left,
- int16_t right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_same<T, int16_t, T> Call(KernelContext*, int16_t left,
+ int16_t right, Status*) {
return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_same<T, uint16_t, T> Call(KernelContext*, uint16_t left,
- uint16_t right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_same<T, uint16_t, T> Call(KernelContext*, uint16_t left,
+ uint16_t right, Status*) {
return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left * right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left * right;
+ }
};
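
The dedicated int16_t/uint16_t overloads above exist because C++ integer promotion runs 16-bit multiplies through signed int, and the uint16_t case can overflow it, which is undefined behaviour even though only the truncated 16-bit result is wanted. Casting through uint32_t keeps every intermediate value representable. A minimal illustration; the values are mine, not from the diff:

    uint16_t a = 0xFFFF, b = 0xFFFF;
    // a * b promotes both operands to signed int: 65535 * 65535
    // = 4294836225 exceeds INT32_MAX -> undefined behaviour.
    // static_cast<uint32_t>(a) * static_cast<uint32_t>(b) is well defined
    // (4294836225u), and truncating back to 16 bits gives the expected 1.
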
struct MultiplyChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result = 0;
if (ARROW_PREDICT_FALSE(MultiplyWithOverflow(left, right, &result))) {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left * right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left * right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left * right;
+ }
};
struct Divide {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
return left / right;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
T result;
if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) {
if (right == 0) {
- *st = Status::Invalid("divide by zero");
+ *st = Status::Invalid("divide by zero");
} else {
result = 0;
}
}
return result;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
- if (right == Arg1()) {
- *st = Status::Invalid("Divide by zero");
- return T();
- } else {
- return left / right;
- }
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ if (right == Arg1()) {
+ *st = Status::Invalid("Divide by zero");
+ return T();
+ } else {
+ return left / right;
+ }
+ }
};
struct DivideChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result;
if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) {
if (right == 0) {
- *st = Status::Invalid("divide by zero");
+ *st = Status::Invalid("divide by zero");
} else {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status* st) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
if (ARROW_PREDICT_FALSE(right == 0)) {
- *st = Status::Invalid("divide by zero");
+ *st = Status::Invalid("divide by zero");
return 0;
}
return left / right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext* ctx, Arg0 left, Arg1 right,
- Status* st) {
- return Divide::Call<T>(ctx, left, right, st);
- }
-};
-
-struct Negate {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return -arg;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return ~arg + 1;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return arrow::internal::SafeSignedNegate(arg);
- }
-};
-
-struct NegateChecked {
- template <typename T, typename Arg>
- static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- T result = 0;
- if (ARROW_PREDICT_FALSE(NegateWithOverflow(arg, &result))) {
- *st = Status::Invalid("overflow");
- }
- return result;
- }
-
- template <typename T, typename Arg>
- static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- DCHECK(false) << "This is included only for the purposes of instantiability from the "
- "arithmetic kernel generator";
- return 0;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- return -arg;
- }
-};
-
-struct Power {
- ARROW_NOINLINE
- static uint64_t IntegerPower(uint64_t base, uint64_t exp) {
- // right to left O(logn) power
- uint64_t pow = 1;
- while (exp) {
- pow *= (exp & 1) ? base : 1;
- base *= base;
- exp >>= 1;
- }
- return pow;
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, T base, T exp, Status* st) {
- if (exp < 0) {
- *st = Status::Invalid("integers to negative integer powers are not allowed");
- return 0;
- }
- return static_cast<T>(IntegerPower(base, exp));
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, T base, T exp, Status*) {
- return std::pow(base, exp);
- }
-};
-
-struct PowerChecked {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status* st) {
- if (exp < 0) {
- *st = Status::Invalid("integers to negative integer powers are not allowed");
- return 0;
- } else if (exp == 0) {
- return 1;
- }
- // left to right O(logn) power with overflow checks
- bool overflow = false;
- uint64_t bitmask =
- 1ULL << (63 - BitUtil::CountLeadingZeros(static_cast<uint64_t>(exp)));
- T pow = 1;
- while (bitmask) {
- overflow |= MultiplyWithOverflow(pow, pow, &pow);
- if (exp & bitmask) {
- overflow |= MultiplyWithOverflow(pow, base, &pow);
- }
- bitmask >>= 1;
- }
- if (overflow) {
- *st = Status::Invalid("overflow");
- }
- return pow;
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
- return std::pow(base, exp);
- }
-};
-
-struct Sign {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::isnan(arg) ? arg : ((arg == 0) ? 0 : (std::signbit(arg) ? -1 : 1));
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return arg > 0;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return (arg > 0) ? 1 : ((arg == 0) ? 0 : -1);
- }
-};
-
-// Bitwise operations
-
-struct BitWiseNot {
- template <typename T, typename Arg>
- static T Call(KernelContext*, Arg arg, Status*) {
- return ~arg;
- }
-};
-
-struct BitWiseAnd {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- return lhs & rhs;
- }
-};
-
-struct BitWiseOr {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- return lhs | rhs;
- }
-};
-
-struct BitWiseXor {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- return lhs ^ rhs;
- }
-};
-
-struct ShiftLeft {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- using Unsigned = typename std::make_unsigned<Arg0>::type;
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- return lhs;
- }
- return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
- }
-};
-
-// See SEI CERT C Coding Standard rule INT34-C
-struct ShiftLeftChecked {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
- Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
- return lhs;
- }
- return lhs << rhs;
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_signed_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
- Status* st) {
- using Unsigned = typename std::make_unsigned<Arg0>::type;
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
- return lhs;
- }
- // In C/C++ left shift of a negative number is undefined (C++11 standard 5.8.2)
- // Mimic Java/etc. and treat left shift as based on two's complement representation
- // Assumes two's complement machine
- return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
- }
-};
-
-struct ShiftRight {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- // Logical right shift when Arg0 is unsigned
- // Arithmetic otherwise (this is implementation-defined but GCC and MSVC document this
- // as arithmetic right shift)
- // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
- // https://docs.microsoft.com/en-us/cpp/cpp/left-shift-and-right-shift-operators-input-and-output?view=msvc-160
- // Clang doesn't document their behavior.
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- return lhs;
- }
- return lhs >> rhs;
- }
-};
-
-struct ShiftRightChecked {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
- return lhs;
- }
- return lhs >> rhs;
- }
-};
-
-struct Sin {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::sin(val);
- }
-};
-
-struct SinChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(std::isinf(val))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::sin(val);
- }
-};
-
-struct Cos {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::cos(val);
- }
-};
-
-struct CosChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(std::isinf(val))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::cos(val);
- }
-};
-
-struct Tan {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::tan(val);
- }
-};
-
-struct TanChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(std::isinf(val))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- // Cannot raise range errors (overflow) since PI/2 is not exactly representable
- return std::tan(val);
- }
-};
-
-struct Asin {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::asin(val);
- }
-};
-
-struct AsinChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::asin(val);
- }
-};
-
-struct Acos {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::acos(val);
- }
-};
-
-struct AcosChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::acos(val);
- }
-};
-
-struct Atan {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::atan(val);
- }
-};
-
-struct Atan2 {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 y, Arg1 x, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- static_assert(std::is_same<Arg0, Arg1>::value, "");
- return std::atan2(y, x);
- }
-};
-
-struct LogNatural {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < 0.0) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log(arg);
- }
-};
-
-struct LogNaturalChecked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < 0.0) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log(arg);
- }
-};
-
-struct Log10 {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < 0.0) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log10(arg);
- }
-};
-
-struct Log10Checked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < 0) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log10(arg);
- }
-};
-
-struct Log2 {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < 0.0) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log2(arg);
- }
-};
-
-struct Log2Checked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < 0.0) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log2(arg);
- }
-};
-
-struct Log1p {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == -1) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < -1) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log1p(arg);
- }
-};
-
-struct Log1pChecked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == -1) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < -1) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log1p(arg);
- }
-};
-
-struct Floor {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::floor(arg);
- }
-};
-
-struct Ceil {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::ceil(arg);
- }
-};
-
-struct Trunc {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::trunc(arg);
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext* ctx, Arg0 left, Arg1 right,
+ Status* st) {
+ return Divide::Call<T>(ctx, left, right, st);
+ }
};
+struct Negate {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return -arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return ~arg + 1;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return arrow::internal::SafeSignedNegate(arg);
+ }
+};
+
+struct NegateChecked {
+ template <typename T, typename Arg>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ T result = 0;
+ if (ARROW_PREDICT_FALSE(NegateWithOverflow(arg, &result))) {
+ *st = Status::Invalid("overflow");
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg>
+ static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ DCHECK(false) << "This is included only for the purposes of instantiability from the "
+ "arithmetic kernel generator";
+ return 0;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return -arg;
+ }
+};
+
+struct Power {
+ ARROW_NOINLINE
+ static uint64_t IntegerPower(uint64_t base, uint64_t exp) {
+ // right to left O(logn) power
+ uint64_t pow = 1;
+ while (exp) {
+ pow *= (exp & 1) ? base : 1;
+ base *= base;
+ exp >>= 1;
+ }
+ return pow;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, T base, T exp, Status* st) {
+ if (exp < 0) {
+ *st = Status::Invalid("integers to negative integer powers are not allowed");
+ return 0;
+ }
+ return static_cast<T>(IntegerPower(base, exp));
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, T base, T exp, Status*) {
+ return std::pow(base, exp);
+ }
+};
+
+struct PowerChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status* st) {
+ if (exp < 0) {
+ *st = Status::Invalid("integers to negative integer powers are not allowed");
+ return 0;
+ } else if (exp == 0) {
+ return 1;
+ }
+ // left to right O(logn) power with overflow checks
+ bool overflow = false;
+ uint64_t bitmask =
+ 1ULL << (63 - BitUtil::CountLeadingZeros(static_cast<uint64_t>(exp)));
+ T pow = 1;
+ while (bitmask) {
+ overflow |= MultiplyWithOverflow(pow, pow, &pow);
+ if (exp & bitmask) {
+ overflow |= MultiplyWithOverflow(pow, base, &pow);
+ }
+ bitmask >>= 1;
+ }
+ if (overflow) {
+ *st = Status::Invalid("overflow");
+ }
+ return pow;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return std::pow(base, exp);
+ }
+};
+
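PowerChecked above walks the exponent's bits from the most significant one down, squaring the accumulator at each step and multiplying in the base when the bit is set; every MultiplyWithOverflow ORs its flag into `overflow`, so a single Status is raised at the end. A worked trace with illustrative values:

    // base = 3, exp = 5 = 0b101; bitmask starts at 0b100
    //   mask 0b100: pow = 1*1 = 1;  bit set   -> pow = 1*3  = 3
    //   mask 0b010: pow = 3*3 = 9;  bit clear
    //   mask 0b001: pow = 9*9 = 81; bit set   -> pow = 81*3 = 243   (= 3^5)
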
+struct Sign {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::isnan(arg) ? arg : ((arg == 0) ? 0 : (std::signbit(arg) ? -1 : 1));
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return arg > 0;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return (arg > 0) ? 1 : ((arg == 0) ? 0 : -1);
+ }
+};
+
+// Bitwise operations
+
+struct BitWiseNot {
+ template <typename T, typename Arg>
+ static T Call(KernelContext*, Arg arg, Status*) {
+ return ~arg;
+ }
+};
+
+struct BitWiseAnd {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs & rhs;
+ }
+};
+
+struct BitWiseOr {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs | rhs;
+ }
+};
+
+struct BitWiseXor {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs ^ rhs;
+ }
+};
+
+struct ShiftLeft {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ using Unsigned = typename std::make_unsigned<Arg0>::type;
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ return lhs;
+ }
+ return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
+ }
+};
+
+// See SEI CERT C Coding Standard rule INT34-C
+struct ShiftLeftChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
+ Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ return lhs << rhs;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
+ Status* st) {
+ using Unsigned = typename std::make_unsigned<Arg0>::type;
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ // In C/C++ left shift of a negative number is undefined (C++11 standard 5.8.2)
+ // Mimic Java/etc. and treat left shift as based on two's complement representation
+ // Assumes two's complement machine
+ return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
+ }
+};
+
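Per CERT INT34-C (cited above), shifting by a negative amount or by at least the type's bit count is undefined, as is left-shifting a negative value. The checked kernels therefore validate `rhs` against std::numeric_limits<Arg0>::digits (which is width minus one for signed types, so int8 already rejects shifts of 7 and up) and run signed shifts in the unsigned domain to pin down two's-complement results. A minimal illustration, values mine:

    int8_t v = -1;  // bit pattern 0b11111111
    // v << 1 is undefined for negative v in C++11;
    // static_cast<uint8_t>(v) << 1 yields 0x1FE, and the truncating cast
    // back to int8_t gives -2 on two's-complement hardware, matching the
    // Java-style behaviour the comment above describes.
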
+struct ShiftRight {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ // Logical right shift when Arg0 is unsigned
+ // Arithmetic otherwise (this is implementation-defined but GCC and MSVC document this
+ // as arithmetic right shift)
+ // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
+ // https://docs.microsoft.com/en-us/cpp/cpp/left-shift-and-right-shift-operators-input-and-output?view=msvc-160
+ // Clang doesn't document their behavior.
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ return lhs;
+ }
+ return lhs >> rhs;
+ }
+};
+
+struct ShiftRightChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ return lhs >> rhs;
+ }
+};
+
+struct Sin {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::sin(val);
+ }
+};
+
+struct SinChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::sin(val);
+ }
+};
+
+struct Cos {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::cos(val);
+ }
+};
+
+struct CosChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::cos(val);
+ }
+};
+
+struct Tan {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::tan(val);
+ }
+};
+
+struct TanChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ // Cannot raise range errors (overflow) since PI/2 is not exactly representable
+ return std::tan(val);
+ }
+};
+
+struct Asin {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::asin(val);
+ }
+};
+
+struct AsinChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::asin(val);
+ }
+};
+
+struct Acos {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::acos(val);
+ }
+};
+
+struct AcosChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::acos(val);
+ }
+};
+
+struct Atan {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::atan(val);
+ }
+};
+
+struct Atan2 {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 y, Arg1 x, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ static_assert(std::is_same<Arg0, Arg1>::value, "");
+ return std::atan2(y, x);
+ }
+};
+
+struct LogNatural {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log(arg);
+ }
+};
+
+struct LogNaturalChecked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0.0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log(arg);
+ }
+};
+
+struct Log10 {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log10(arg);
+ }
+};
+
+struct Log10Checked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log10(arg);
+ }
+};
+
+struct Log2 {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log2(arg);
+ }
+};
+
+struct Log2Checked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0.0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log2(arg);
+ }
+};
+
+struct Log1p {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == -1) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < -1) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log1p(arg);
+ }
+};
+
+struct Log1pChecked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == -1) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < -1) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log1p(arg);
+ }
+};
+
+struct Floor {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::floor(arg);
+ }
+};
+
+struct Ceil {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::ceil(arg);
+ }
+};
+
+struct Trunc {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::trunc(arg);
+ }
+};
+
// Generate a kernel given an arithmetic functor
template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec ArithmeticExecFromOp(detail::GetTypeId get_id) {
+ArrayKernelExec ArithmeticExecFromOp(detail::GetTypeId get_id) {
switch (get_id.id) {
case Type::INT8:
return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
@@ -869,321 +869,321 @@ ArrayKernelExec ArithmeticExecFromOp(detail::GetTypeId get_id) {
}
}
-// Generate a kernel given a bitwise arithmetic functor. Assumes the
-// functor treats all integer types of equal width identically
-template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec TypeAgnosticBitWiseExecFromOp(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- case Type::UINT8:
- return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
- case Type::INT16:
- case Type::UINT16:
- return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
- case Type::INT32:
- case Type::UINT32:
- return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
- case Type::INT64:
- case Type::UINT64:
- return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec ShiftExecFromOp(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
- case Type::UINT8:
- return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
- case Type::INT16:
- return KernelGenerator<Int16Type, Int16Type, Op>::Exec;
- case Type::UINT16:
- return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
- case Type::INT32:
- return KernelGenerator<Int32Type, Int32Type, Op>::Exec;
- case Type::UINT32:
- return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
- case Type::INT64:
- return KernelGenerator<Int64Type, Int64Type, Op>::Exec;
- case Type::UINT64:
- return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec GenerateArithmeticFloatingPoint(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::FLOAT:
- return KernelGenerator<FloatType, FloatType, Op>::Exec;
- case Type::DOUBLE:
- return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-Status CastBinaryDecimalArgs(const std::string& func_name,
- std::vector<ValueDescr>* values) {
- auto& left_type = (*values)[0].type;
- auto& right_type = (*values)[1].type;
- DCHECK(is_decimal(left_type->id()) || is_decimal(right_type->id()));
-
- // decimal + float = float
- if (is_floating(left_type->id())) {
- right_type = left_type;
- return Status::OK();
- } else if (is_floating(right_type->id())) {
- left_type = right_type;
- return Status::OK();
- }
-
- // precision, scale of left and right args
- int32_t p1, s1, p2, s2;
-
- // decimal + integer = decimal
- if (is_decimal(left_type->id())) {
- auto decimal = checked_cast<const DecimalType*>(left_type.get());
- p1 = decimal->precision();
- s1 = decimal->scale();
- } else {
- DCHECK(is_integer(left_type->id()));
- p1 = static_cast<int32_t>(std::ceil(std::log10(bit_width(left_type->id()))));
- s1 = 0;
- }
- if (is_decimal(right_type->id())) {
- auto decimal = checked_cast<const DecimalType*>(right_type.get());
- p2 = decimal->precision();
- s2 = decimal->scale();
- } else {
- DCHECK(is_integer(right_type->id()));
- p2 = static_cast<int32_t>(std::ceil(std::log10(bit_width(right_type->id()))));
- s2 = 0;
- }
- if (s1 < 0 || s2 < 0) {
- return Status::NotImplemented("Decimals with negative scales not supported");
- }
-
- // decimal128 + decimal256 = decimal256
- Type::type casted_type_id = Type::DECIMAL128;
- if (left_type->id() == Type::DECIMAL256 || right_type->id() == Type::DECIMAL256) {
- casted_type_id = Type::DECIMAL256;
- }
-
- // decimal promotion rules compatible with amazon redshift
- // https://docs.aws.amazon.com/redshift/latest/dg/r_numeric_computations201.html
- int32_t left_scaleup, right_scaleup;
-
- // "add_checked" -> "add"
- const std::string op = func_name.substr(0, func_name.find("_"));
- if (op == "add" || op == "subtract") {
- left_scaleup = std::max(s1, s2) - s1;
- right_scaleup = std::max(s1, s2) - s2;
- } else if (op == "multiply") {
- left_scaleup = right_scaleup = 0;
- } else if (op == "divide") {
- left_scaleup = std::max(4, s1 + p2 - s2 + 1) + s2 - s1;
- right_scaleup = 0;
- } else {
- return Status::Invalid("Invalid decimal function: ", func_name);
- }
-
- ARROW_ASSIGN_OR_RAISE(
- left_type, DecimalType::Make(casted_type_id, p1 + left_scaleup, s1 + left_scaleup));
- ARROW_ASSIGN_OR_RAISE(right_type, DecimalType::Make(casted_type_id, p2 + right_scaleup,
- s2 + right_scaleup));
- return Status::OK();
-}
-
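CastBinaryDecimalArgs above aligns the two inputs before dispatch: a float operand absorbs the decimal side entirely, an integer operand gets a synthetic precision from its bit width, and for add/subtract both sides are rescaled to the larger scale (multiply leaves scales alone; divide scales the numerator up). A worked example with illustrative types:

    // add(decimal128(5, 2), decimal128(7, 4)):
    //   s1 = 2, s2 = 4  ->  left_scaleup = 2, right_scaleup = 0
    //   left  : decimal128(5 + 2, 2 + 2) = decimal128(7, 4)
    //   right : decimal128(7, 4), unchanged
    // divide(decimal128(5, 2), decimal128(7, 4)):
    //   left_scaleup = max(4, s1 + p2 - s2 + 1) + s2 - s1
    //                = max(4, 6) + 2 = 8  ->  left becomes decimal128(13, 10)
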
-// resolve decimal binary operation output type per *casted* args
-template <typename OutputGetter>
-Result<ValueDescr> ResolveDecimalBinaryOperationOutput(
- const std::vector<ValueDescr>& args, OutputGetter&& getter) {
- // casted args should be same size decimals
- auto left_type = checked_cast<const DecimalType*>(args[0].type.get());
- auto right_type = checked_cast<const DecimalType*>(args[1].type.get());
- DCHECK_EQ(left_type->id(), right_type->id());
-
- int32_t precision, scale;
- std::tie(precision, scale) = getter(left_type->precision(), left_type->scale(),
- right_type->precision(), right_type->scale());
- ARROW_ASSIGN_OR_RAISE(auto type, DecimalType::Make(left_type->id(), precision, scale));
- return ValueDescr(std::move(type), GetBroadcastShape(args));
-}
-
-Result<ValueDescr> ResolveDecimalAdditionOrSubtractionOutput(
- KernelContext*, const std::vector<ValueDescr>& args) {
- return ResolveDecimalBinaryOperationOutput(
- args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
- DCHECK_EQ(s1, s2);
- const int32_t scale = s1;
- const int32_t precision = std::max(p1 - s1, p2 - s2) + scale + 1;
- return std::make_pair(precision, scale);
- });
-}
-
-Result<ValueDescr> ResolveDecimalMultiplicationOutput(
- KernelContext*, const std::vector<ValueDescr>& args) {
- return ResolveDecimalBinaryOperationOutput(
- args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
- const int32_t scale = s1 + s2;
- const int32_t precision = p1 + p2 + 1;
- return std::make_pair(precision, scale);
- });
-}
-
-Result<ValueDescr> ResolveDecimalDivisionOutput(KernelContext*,
- const std::vector<ValueDescr>& args) {
- return ResolveDecimalBinaryOperationOutput(
- args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
- DCHECK_GE(s1, s2);
- const int32_t scale = s1 - s2;
- const int32_t precision = p1;
- return std::make_pair(precision, scale);
- });
-}
-
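With the inputs aligned, the Resolve* callbacks above derive the output type at kernel-dispatch time. Continuing the same illustrative example:

    // add(decimal128(7, 4), decimal128(7, 4)):
    //   scale = 4; precision = max(7-4, 7-4) + 4 + 1 = 8  -> decimal128(8, 4)
    // multiply(decimal128(5, 2), decimal128(7, 4)):
    //   scale = 2 + 4 = 6; precision = 5 + 7 + 1 = 13     -> decimal128(13, 6)
    // divide(decimal128(13, 10), decimal128(7, 4)):
    //   scale = 10 - 4 = 6; precision = 13                -> decimal128(13, 6)
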
-template <typename Op>
-void AddDecimalBinaryKernels(const std::string& name,
- std::shared_ptr<ScalarFunction>* func) {
- OutputType out_type(null());
- const std::string op = name.substr(0, name.find("_"));
- if (op == "add" || op == "subtract") {
- out_type = OutputType(ResolveDecimalAdditionOrSubtractionOutput);
- } else if (op == "multiply") {
- out_type = OutputType(ResolveDecimalMultiplicationOutput);
- } else if (op == "divide") {
- out_type = OutputType(ResolveDecimalDivisionOutput);
- } else {
- DCHECK(false);
- }
-
- auto in_type128 = InputType(Type::DECIMAL128);
- auto in_type256 = InputType(Type::DECIMAL256);
- auto exec128 = ScalarBinaryNotNullEqualTypes<Decimal128Type, Decimal128Type, Op>::Exec;
- auto exec256 = ScalarBinaryNotNullEqualTypes<Decimal256Type, Decimal256Type, Op>::Exec;
- DCHECK_OK((*func)->AddKernel({in_type128, in_type128}, out_type, exec128));
- DCHECK_OK((*func)->AddKernel({in_type256, in_type256}, out_type, exec256));
-}
-
-// Generate a kernel given an arithmetic functor
-template <template <typename...> class KernelGenerator, typename OutType, typename Op>
-ArrayKernelExec GenerateArithmeticWithFixedIntOutType(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- return KernelGenerator<OutType, Int8Type, Op>::Exec;
- case Type::UINT8:
- return KernelGenerator<OutType, UInt8Type, Op>::Exec;
- case Type::INT16:
- return KernelGenerator<OutType, Int16Type, Op>::Exec;
- case Type::UINT16:
- return KernelGenerator<OutType, UInt16Type, Op>::Exec;
- case Type::INT32:
- return KernelGenerator<OutType, Int32Type, Op>::Exec;
- case Type::UINT32:
- return KernelGenerator<OutType, UInt32Type, Op>::Exec;
- case Type::INT64:
- case Type::TIMESTAMP:
- return KernelGenerator<OutType, Int64Type, Op>::Exec;
- case Type::UINT64:
- return KernelGenerator<OutType, UInt64Type, Op>::Exec;
- case Type::FLOAT:
- return KernelGenerator<FloatType, FloatType, Op>::Exec;
- case Type::DOUBLE:
- return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-struct ArithmeticFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- RETURN_NOT_OK(CheckDecimals(values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
-
- // Only promote types for binary functions
- if (values->size() == 2) {
- ReplaceNullWithOtherType(values);
-
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- }
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-
- Status CheckDecimals(std::vector<ValueDescr>* values) const {
- bool has_decimal = false;
- for (const auto& value : *values) {
- if (is_decimal(value.type->id())) {
- has_decimal = true;
- break;
- }
- }
- if (!has_decimal) return Status::OK();
-
- if (values->size() == 2) {
- return CastBinaryDecimalArgs(name(), values);
- }
- return Status::OK();
- }
-};
-
-/// An ArithmeticFunction that promotes integer arguments to double.
-struct ArithmeticFloatingPointFunction : public ArithmeticFunction {
- using ArithmeticFunction::ArithmeticFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
- RETURN_NOT_OK(CheckDecimals(values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
-
- if (values->size() == 2) {
- ReplaceNullWithOtherType(values);
- }
-
- for (auto& descr : *values) {
- if (is_integer(descr.type->id())) {
- descr.type = float64();
- }
- }
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
+// Generate a kernel given a bitwise arithmetic functor. Assumes the
+// functor treats all integer types of equal width identically
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec TypeAgnosticBitWiseExecFromOp(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ case Type::UINT8:
+ return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ case Type::UINT16:
+ return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ case Type::UINT32:
+ return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ case Type::UINT64:
+ return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
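+
+// For illustration, a minimal sketch of how this dispatcher is used (see
+// MakeBitWiseFunctionNotNull below): selecting a kernel for int8() yields
+// the same UInt8Type-based exec that uint8() selects, since only the bit
+// width matters here:
+//   auto exec =
+//       TypeAgnosticBitWiseExecFromOp<ScalarBinaryNotNullEqualTypes,
+//                                     BitWiseAnd>(int8());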
+
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec ShiftExecFromOp(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
+ case Type::UINT8:
+ return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ return KernelGenerator<Int16Type, Int16Type, Op>::Exec;
+ case Type::UINT16:
+ return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ return KernelGenerator<Int32Type, Int32Type, Op>::Exec;
+ case Type::UINT32:
+ return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ return KernelGenerator<Int64Type, Int64Type, Op>::Exec;
+ case Type::UINT64:
+ return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
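+
+// Unlike the bitwise dispatcher above, this one preserves the argument's
+// signedness: per the shift_right docs below, unsigned types shift
+// logically while signed types shift arithmetically, so Int8Type and
+// UInt8Type need distinct kernels.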
+
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec GenerateArithmeticFloatingPoint(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::FLOAT:
+ return KernelGenerator<FloatType, FloatType, Op>::Exec;
+ case Type::DOUBLE:
+ return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
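+
+// Note that this dispatcher only handles FLOAT and DOUBLE; integer inputs
+// are expected to have been promoted to float64 beforehand (see
+// ArithmeticFloatingPointFunction::DispatchBest below).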
+
+Status CastBinaryDecimalArgs(const std::string& func_name,
+ std::vector<ValueDescr>* values) {
+ auto& left_type = (*values)[0].type;
+ auto& right_type = (*values)[1].type;
+ DCHECK(is_decimal(left_type->id()) || is_decimal(right_type->id()));
+
+ // decimal + float = float
+ if (is_floating(left_type->id())) {
+ right_type = left_type;
+ return Status::OK();
+ } else if (is_floating(right_type->id())) {
+ left_type = right_type;
+ return Status::OK();
+ }
+
+ // precision, scale of left and right args
+ int32_t p1, s1, p2, s2;
+
+ // decimal + integer = decimal
+ if (is_decimal(left_type->id())) {
+ auto decimal = checked_cast<const DecimalType*>(left_type.get());
+ p1 = decimal->precision();
+ s1 = decimal->scale();
+ } else {
+ DCHECK(is_integer(left_type->id()));
+ // An N-bit integer needs at most ceil(N * log10(2)) decimal digits.
+ p1 = static_cast<int32_t>(std::ceil(bit_width(left_type->id()) * std::log10(2)));
+ s1 = 0;
+ }
+ if (is_decimal(right_type->id())) {
+ auto decimal = checked_cast<const DecimalType*>(right_type.get());
+ p2 = decimal->precision();
+ s2 = decimal->scale();
+ } else {
+ DCHECK(is_integer(right_type->id()));
+ p2 = static_cast<int32_t>(std::ceil(bit_width(right_type->id()) * std::log10(2)));
+ s2 = 0;
+ }
+ if (s1 < 0 || s2 < 0) {
+ return Status::NotImplemented("Decimals with negative scales not supported");
+ }
+
+ // decimal128 + decimal256 = decimal256
+ Type::type casted_type_id = Type::DECIMAL128;
+ if (left_type->id() == Type::DECIMAL256 || right_type->id() == Type::DECIMAL256) {
+ casted_type_id = Type::DECIMAL256;
+ }
+
+ // Decimal promotion rules compatible with Amazon Redshift:
+ // https://docs.aws.amazon.com/redshift/latest/dg/r_numeric_computations201.html
+ int32_t left_scaleup, right_scaleup;
+
+ // "add_checked" -> "add"
+ const std::string op = func_name.substr(0, func_name.find("_"));
+ if (op == "add" || op == "subtract") {
+ left_scaleup = std::max(s1, s2) - s1;
+ right_scaleup = std::max(s1, s2) - s2;
+ } else if (op == "multiply") {
+ left_scaleup = right_scaleup = 0;
+ } else if (op == "divide") {
+ left_scaleup = std::max(4, s1 + p2 - s2 + 1) + s2 - s1;
+ right_scaleup = 0;
+ } else {
+ return Status::Invalid("Invalid decimal function: ", func_name);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ left_type, DecimalType::Make(casted_type_id, p1 + left_scaleup, s1 + left_scaleup));
+ ARROW_ASSIGN_OR_RAISE(right_type, DecimalType::Make(casted_type_id, p2 + right_scaleup,
+ s2 + right_scaleup));
+ return Status::OK();
+}
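+
+// Worked example of the rules above (a sketch, not executed here): casting
+// for "add" with decimal128(5, 2) and decimal128(7, 4) gives
+// max(s1, s2) = 4, so left_scaleup = 2 and right_scaleup = 0, and both
+// arguments become decimal128(7, 4). The output type is then derived by
+// the resolvers below.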
+
+// Resolve the output type of a decimal binary operation from the *casted* args
+template <typename OutputGetter>
+Result<ValueDescr> ResolveDecimalBinaryOperationOutput(
+ const std::vector<ValueDescr>& args, OutputGetter&& getter) {
+ // The casted args must be decimals of the same width
+ auto left_type = checked_cast<const DecimalType*>(args[0].type.get());
+ auto right_type = checked_cast<const DecimalType*>(args[1].type.get());
+ DCHECK_EQ(left_type->id(), right_type->id());
+
+ int32_t precision, scale;
+ std::tie(precision, scale) = getter(left_type->precision(), left_type->scale(),
+ right_type->precision(), right_type->scale());
+ ARROW_ASSIGN_OR_RAISE(auto type, DecimalType::Make(left_type->id(), precision, scale));
+ return ValueDescr(std::move(type), GetBroadcastShape(args));
+}
+
+Result<ValueDescr> ResolveDecimalAdditionOrSubtractionOutput(
+ KernelContext*, const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ DCHECK_EQ(s1, s2);
+ const int32_t scale = s1;
+ const int32_t precision = std::max(p1 - s1, p2 - s2) + scale + 1;
+ return std::make_pair(precision, scale);
+ });
+}
+
+Result<ValueDescr> ResolveDecimalMultiplicationOutput(
+ KernelContext*, const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ const int32_t scale = s1 + s2;
+ const int32_t precision = p1 + p2 + 1;
+ return std::make_pair(precision, scale);
+ });
+}
+
+Result<ValueDescr> ResolveDecimalDivisionOutput(KernelContext*,
+ const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ DCHECK_GE(s1, s2);
+ const int32_t scale = s1 - s2;
+ const int32_t precision = p1;
+ return std::make_pair(precision, scale);
+ });
+}
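+
+// Continuing the example: adding two decimal128(7, 4) values keeps scale 4
+// and needs one extra integer digit, so the addition resolver returns
+// precision max(7 - 4, 7 - 4) + 4 + 1 = 8, i.e. decimal128(8, 4).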
+
template <typename Op>
-std::shared_ptr<ScalarFunction> MakeArithmeticFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+void AddDecimalBinaryKernels(const std::string& name,
+ std::shared_ptr<ScalarFunction>* func) {
+ OutputType out_type(null());
+ const std::string op = name.substr(0, name.find("_"));
+ if (op == "add" || op == "subtract") {
+ out_type = OutputType(ResolveDecimalAdditionOrSubtractionOutput);
+ } else if (op == "multiply") {
+ out_type = OutputType(ResolveDecimalMultiplicationOutput);
+ } else if (op == "divide") {
+ out_type = OutputType(ResolveDecimalDivisionOutput);
+ } else {
+ DCHECK(false);
+ }
+
+ auto in_type128 = InputType(Type::DECIMAL128);
+ auto in_type256 = InputType(Type::DECIMAL256);
+ auto exec128 = ScalarBinaryNotNullEqualTypes<Decimal128Type, Decimal128Type, Op>::Exec;
+ auto exec256 = ScalarBinaryNotNullEqualTypes<Decimal256Type, Decimal256Type, Op>::Exec;
+ DCHECK_OK((*func)->AddKernel({in_type128, in_type128}, out_type, exec128));
+ DCHECK_OK((*func)->AddKernel({in_type256, in_type256}, out_type, exec256));
+}
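+
+// A minimal registration sketch (mirroring RegisterScalarArithmetic below):
+//   auto add = MakeArithmeticFunction<Add>("add", &add_doc);
+//   AddDecimalBinaryKernels<Add>("add", &add);
+//   DCHECK_OK(registry->AddFunction(std::move(add)));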
+
+// Generate a kernel given an arithmetic functor
+template <template <typename...> class KernelGenerator, typename OutType, typename Op>
+ArrayKernelExec GenerateArithmeticWithFixedIntOutType(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return KernelGenerator<OutType, Int8Type, Op>::Exec;
+ case Type::UINT8:
+ return KernelGenerator<OutType, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ return KernelGenerator<OutType, Int16Type, Op>::Exec;
+ case Type::UINT16:
+ return KernelGenerator<OutType, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ return KernelGenerator<OutType, Int32Type, Op>::Exec;
+ case Type::UINT32:
+ return KernelGenerator<OutType, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ case Type::TIMESTAMP:
+ return KernelGenerator<OutType, Int64Type, Op>::Exec;
+ case Type::UINT64:
+ return KernelGenerator<OutType, UInt64Type, Op>::Exec;
+ case Type::FLOAT:
+ return KernelGenerator<FloatType, FloatType, Op>::Exec;
+ case Type::DOUBLE:
+ return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
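+
+// Note: TIMESTAMP shares the Int64Type kernel above since timestamps are
+// stored as 64-bit integers, while FLOAT and DOUBLE ignore OutType and
+// keep their own output type.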
+
+struct ArithmeticFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ RETURN_NOT_OK(CheckDecimals(values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ // Only promote types for binary functions
+ if (values->size() == 2) {
+ ReplaceNullWithOtherType(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+
+ Status CheckDecimals(std::vector<ValueDescr>* values) const {
+ bool has_decimal = false;
+ for (const auto& value : *values) {
+ if (is_decimal(value.type->id())) {
+ has_decimal = true;
+ break;
+ }
+ }
+ if (!has_decimal) return Status::OK();
+
+ if (values->size() == 2) {
+ return CastBinaryDecimalArgs(name(), values);
+ }
+ return Status::OK();
+ }
+};
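+
+// Dispatch sketch for the promotion above: given {int32, float64},
+// DispatchExactImpl first finds no exact kernel, CommonNumeric promotes
+// both descriptors to float64, and the second lookup then succeeds.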
+
+/// An ArithmeticFunction that promotes integer arguments to double.
+struct ArithmeticFloatingPointFunction : public ArithmeticFunction {
+ using ArithmeticFunction::ArithmeticFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+ RETURN_NOT_OK(CheckDecimals(values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ if (values->size() == 2) {
+ ReplaceNullWithOtherType(values);
+ }
+
+ for (auto& descr : *values) {
+ if (is_integer(descr.type->id())) {
+ descr.type = float64();
+ }
+ }
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
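+
+// Dispatch sketch: for {int32, int32}, the loop above rewrites both
+// descriptors to float64 before the second lookup, which is how integer
+// arguments to functions like "atan2" end up returning double values.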
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeArithmeticFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Op>(ty);
+ auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Op>(ty);
DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
}
return func;
@@ -1192,630 +1192,630 @@ std::shared_ptr<ScalarFunction> MakeArithmeticFunction(std::string name,
// Like MakeArithmeticFunction, but for arithmetic ops that should only
// run on non-null input.
template <typename Op>
-std::shared_ptr<ScalarFunction> MakeArithmeticFunctionNotNull(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
- for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
- DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarUnary, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, ty, exec));
- }
- return func;
-}
-
-// Like MakeUnaryArithmeticFunction, but for unary arithmetic ops with a fixed
-// output type for integral inputs.
-template <typename Op, typename IntOutType>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionWithFixedIntOutType(
- std::string name, const FunctionDoc* doc) {
- auto int_out_ty = TypeTraits<IntOutType>::type_singleton();
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : NumericTypes()) {
- auto out_ty = arrow::is_floating(ty->id()) ? ty : int_out_ty;
- auto exec = GenerateArithmeticWithFixedIntOutType<ScalarUnary, IntOutType, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, out_ty, exec));
- }
- return func;
-}
-
-// Like MakeUnaryArithmeticFunction, but for arithmetic ops that should only
-// run on non-null input.
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionNotNull(
- std::string name, const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+std::shared_ptr<ScalarFunction> MakeArithmeticFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, ty, exec));
- }
- return func;
-}
-
-// Like MakeUnaryArithmeticFunction, but for signed arithmetic ops that should
-// only run on non-null input.
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnarySignedArithmeticFunctionNotNull(
- std::string name, const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : NumericTypes()) {
- if (!arrow::is_unsigned_integer(ty->id())) {
- auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, ty, exec));
- }
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeBitWiseFunctionNotNull(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
- for (const auto& ty : IntTypes()) {
- auto exec = TypeAgnosticBitWiseExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ auto exec = ArithmeticExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
}
return func;
}
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeShiftFunctionNotNull(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
- for (const auto& ty : IntTypes()) {
- auto exec = ShiftExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
- DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPoint(
- std::string name, const FunctionDoc* doc) {
- auto func =
- std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : FloatingPointTypes()) {
- auto output = is_integer(ty->id()) ? float64() : ty;
- auto exec = GenerateArithmeticFloatingPoint<ScalarUnary, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, output, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPointNotNull(
- std::string name, const FunctionDoc* doc) {
- auto func =
- std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : FloatingPointTypes()) {
- auto output = is_integer(ty->id()) ? float64() : ty;
- auto exec = GenerateArithmeticFloatingPoint<ScalarUnaryNotNull, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, output, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeArithmeticFunctionFloatingPoint(
- std::string name, const FunctionDoc* doc) {
- auto func =
- std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Binary(), doc);
- for (const auto& ty : FloatingPointTypes()) {
- auto output = is_integer(ty->id()) ? float64() : ty;
- auto exec = GenerateArithmeticFloatingPoint<ScalarBinaryEqualTypes, Op>(ty);
- DCHECK_OK(func->AddKernel({ty, ty}, output, exec));
- }
- return func;
-}
-
-const FunctionDoc absolute_value_doc{
- "Calculate the absolute value of the argument element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"abs_checked\" if you want overflow\n"
- "to return an error."),
- {"x"}};
-
-const FunctionDoc absolute_value_checked_doc{
- "Calculate the absolute value of the argument element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"abs\"."),
- {"x"}};
-
-const FunctionDoc add_doc{"Add the arguments element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"add_checked\" if you want overflow\n"
- "to return an error."),
- {"x", "y"}};
-
-const FunctionDoc add_checked_doc{
- "Add the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"add\"."),
- {"x", "y"}};
-
-const FunctionDoc sub_doc{"Subtract the arguments element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"subtract_checked\" if you want overflow\n"
- "to return an error."),
- {"x", "y"}};
-
-const FunctionDoc sub_checked_doc{
- "Subtract the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"subtract\"."),
- {"x", "y"}};
-
-const FunctionDoc mul_doc{"Multiply the arguments element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"multiply_checked\" if you want overflow\n"
- "to return an error."),
- {"x", "y"}};
-
-const FunctionDoc mul_checked_doc{
- "Multiply the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"multiply\"."),
- {"x", "y"}};
-
-const FunctionDoc div_doc{
- "Divide the arguments element-wise",
- ("Integer division by zero returns an error. However, integer overflow\n"
- "wraps around, and floating-point division by zero returns an infinite.\n"
- "Use function \"divide_checked\" if you want to get an error\n"
- "in all the aforementioned cases."),
- {"dividend", "divisor"}};
-
-const FunctionDoc div_checked_doc{
- "Divide the arguments element-wise",
- ("An error is returned when trying to divide by zero, or when\n"
- "integer overflow is encountered."),
- {"dividend", "divisor"}};
-
-const FunctionDoc negate_doc{"Negate the argument element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"negate_checked\" if you want overflow\n"
- "to return an error."),
- {"x"}};
-
-const FunctionDoc negate_checked_doc{
- "Negate the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"negate\"."),
- {"x"}};
-
-const FunctionDoc pow_doc{
- "Raise arguments to power element-wise",
- ("Integer to negative integer power returns an error. However, integer overflow\n"
- "wraps around. If either base or exponent is null the result will be null."),
- {"base", "exponent"}};
-
-const FunctionDoc pow_checked_doc{
- "Raise arguments to power element-wise",
- ("An error is returned when integer to negative integer power is encountered,\n"
- "or integer overflow is encountered."),
- {"base", "exponent"}};
-
-const FunctionDoc sign_doc{
- "Get the signedness of the arguments element-wise",
- ("Output is any of (-1,1) for nonzero inputs and 0 for zero input.\n"
- "NaN values return NaN. Integral values return signedness as Int8 and\n"
- "floating-point values return it with the same type as the input values."),
- {"x"}};
-
-const FunctionDoc bit_wise_not_doc{
- "Bit-wise negate the arguments element-wise", "Null values return null.", {"x"}};
-
-const FunctionDoc bit_wise_and_doc{
- "Bit-wise AND the arguments element-wise", "Null values return null.", {"x", "y"}};
-
-const FunctionDoc bit_wise_or_doc{
- "Bit-wise OR the arguments element-wise", "Null values return null.", {"x", "y"}};
-
-const FunctionDoc bit_wise_xor_doc{
- "Bit-wise XOR the arguments element-wise", "Null values return null.", {"x", "y"}};
-
-const FunctionDoc shift_left_doc{
- "Left shift `x` by `y`",
- ("This function will return `x` if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`.\n"
- "The shift operates as if on the two's complement representation of the number. "
- "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
- "even if overflow occurs.\n"
- "Use function \"shift_left_checked\" if you want an invalid shift amount to "
- "return an error."),
- {"x", "y"}};
-
-const FunctionDoc shift_left_checked_doc{
- "Left shift `x` by `y` with invalid shift check",
- ("This function will raise an error if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`. "
- "The shift operates as if on the two's complement representation of the number. "
- "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
- "even if overflow occurs.\n"
- "See \"shift_left\" for a variant that doesn't fail for an invalid shift amount."),
- {"x", "y"}};
-
-const FunctionDoc shift_right_doc{
- "Right shift `x` by `y`",
- ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
- "This function will return `x` if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`.\n"
- "Use function \"shift_right_checked\" if you want an invalid shift amount to return "
- "an error."),
- {"x", "y"}};
-
-const FunctionDoc shift_right_checked_doc{
- "Right shift `x` by `y` with invalid shift check",
- ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
- "This function will raise an error if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`.\n"
- "See \"shift_right\" for a variant that doesn't fail for an invalid shift amount"),
- {"x", "y"}};
-
-const FunctionDoc sin_doc{"Compute the sine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"sin_checked\"."),
- {"x"}};
-
-const FunctionDoc sin_checked_doc{
- "Compute the sine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"sin\"."),
- {"x"}};
-
-const FunctionDoc cos_doc{"Compute the cosine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"cos_checked\"."),
- {"x"}};
-
-const FunctionDoc cos_checked_doc{
- "Compute the cosine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"cos\"."),
- {"x"}};
-
-const FunctionDoc tan_doc{"Compute the tangent of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"tan_checked\"."),
- {"x"}};
-
-const FunctionDoc tan_checked_doc{
- "Compute the tangent of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"tan\"."),
- {"x"}};
-
-const FunctionDoc asin_doc{"Compute the inverse sine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"asin_checked\"."),
- {"x"}};
-
-const FunctionDoc asin_checked_doc{
- "Compute the inverse sine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"asin\"."),
- {"x"}};
-
-const FunctionDoc acos_doc{"Compute the inverse cosine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"acos_checked\"."),
- {"x"}};
-
-const FunctionDoc acos_checked_doc{
- "Compute the inverse cosine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"acos\"."),
- {"x"}};
-
-const FunctionDoc atan_doc{"Compute the principal value of the inverse tangent",
- "Integer arguments return double values.",
- {"x"}};
-
-const FunctionDoc atan2_doc{
- "Compute the inverse tangent using argument signs to determine the quadrant",
- "Integer arguments return double values.",
- {"y", "x"}};
-
-const FunctionDoc ln_doc{
- "Compute natural log of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"ln_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc ln_checked_doc{
- "Compute natural log of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"ln\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc log10_doc{
- "Compute log base 10 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log10_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc log10_checked_doc{
- "Compute log base 10 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log10\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc log2_doc{
- "Compute log base 2 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log2_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc log2_checked_doc{
- "Compute log base 2 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log2\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc log1p_doc{
- "Compute natural log of (1+x) element-wise",
- ("Values <= -1 return -inf or NaN. Null values return null.\n"
- "This function may be more precise than log(1 + x) for x close to zero."
- "Use function \"log1p_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc log1p_checked_doc{
- "Compute natural log of (1+x) element-wise",
- ("Values <= -1 return -inf or NaN. Null values return null.\n"
- "This function may be more precise than log(1 + x) for x close to zero."
- "Use function \"log1p\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc floor_doc{
- "Round down to the nearest integer",
- ("Calculate the nearest integer less than or equal in magnitude to the "
- "argument element-wise"),
- {"x"}};
-
-const FunctionDoc ceil_doc{
- "Round up to the nearest integer",
- ("Calculate the nearest integer greater than or equal in magnitude to the "
- "argument element-wise"),
- {"x"}};
-
-const FunctionDoc trunc_doc{
- "Get the integral part without fractional digits",
- ("Calculate the nearest integer not greater in magnitude than to the "
- "argument element-wise."),
- {"x"}};
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarUnary, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for unary arithmetic ops with a fixed
+// output type for integral inputs.
+template <typename Op, typename IntOutType>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionWithFixedIntOutType(
+ std::string name, const FunctionDoc* doc) {
+ auto int_out_ty = TypeTraits<IntOutType>::type_singleton();
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto out_ty = arrow::is_floating(ty->id()) ? ty : int_out_ty;
+ auto exec = GenerateArithmeticWithFixedIntOutType<ScalarUnary, IntOutType, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, out_ty, exec));
+ }
+ return func;
+}
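+
+// Usage sketch (mirroring the "sign" registration below): integral inputs
+// map to the fixed integer output type while floating-point inputs keep
+// their own type:
+//   auto sign = MakeUnaryArithmeticFunctionWithFixedIntOutType<Sign, Int8Type>(
+//       "sign", &sign_doc);  // int32 -> int8, float64 -> float64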
+
+// Like MakeUnaryArithmeticFunction, but for arithmetic ops that should only
+// run on non-null input.
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for signed arithmetic ops that should
+// only run on non-null input.
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnarySignedArithmeticFunctionNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ if (!arrow::is_unsigned_integer(ty->id())) {
+ auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeBitWiseFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = TypeAgnosticBitWiseExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeShiftFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = ShiftExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPoint(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarUnary, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, output, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPointNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, output, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeArithmeticFunctionFloatingPoint(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarBinaryEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, output, exec));
+ }
+ return func;
+}
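+
+// Note: in the three floating-point factories above, the is_integer()
+// branch never fires (FloatingPointTypes() yields no integer types); the
+// integer-to-double promotion instead happens in
+// ArithmeticFloatingPointFunction::DispatchBest.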
+
+const FunctionDoc absolute_value_doc{
+ "Calculate the absolute value of the argument element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"abs_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x"}};
+
+const FunctionDoc absolute_value_checked_doc{
+ "Calculate the absolute value of the argument element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"abs\"."),
+ {"x"}};
+
+const FunctionDoc add_doc{"Add the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"add_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc add_checked_doc{
+ "Add the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"add\"."),
+ {"x", "y"}};
+
+const FunctionDoc sub_doc{"Subtract the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"subtract_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc sub_checked_doc{
+ "Subtract the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"subtract\"."),
+ {"x", "y"}};
+
+const FunctionDoc mul_doc{"Multiply the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"multiply_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc mul_checked_doc{
+ "Multiply the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"multiply\"."),
+ {"x", "y"}};
+
+const FunctionDoc div_doc{
+ "Divide the arguments element-wise",
+ ("Integer division by zero returns an error. However, integer overflow\n"
+ "wraps around, and floating-point division by zero returns an infinite.\n"
+ "Use function \"divide_checked\" if you want to get an error\n"
+ "in all the aforementioned cases."),
+ {"dividend", "divisor"}};
+
+const FunctionDoc div_checked_doc{
+ "Divide the arguments element-wise",
+ ("An error is returned when trying to divide by zero, or when\n"
+ "integer overflow is encountered."),
+ {"dividend", "divisor"}};
+
+const FunctionDoc negate_doc{"Negate the argument element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"negate_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x"}};
+
+const FunctionDoc negate_checked_doc{
+ "Negate the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"negate\"."),
+ {"x"}};
+
+const FunctionDoc pow_doc{
+ "Raise arguments to power element-wise",
+ ("Integer to negative integer power returns an error. However, integer overflow\n"
+ "wraps around. If either base or exponent is null the result will be null."),
+ {"base", "exponent"}};
+
+const FunctionDoc pow_checked_doc{
+ "Raise arguments to power element-wise",
+ ("An error is returned when integer to negative integer power is encountered,\n"
+ "or integer overflow is encountered."),
+ {"base", "exponent"}};
+
+const FunctionDoc sign_doc{
+ "Get the signedness of the arguments element-wise",
+ ("Output is any of (-1,1) for nonzero inputs and 0 for zero input.\n"
+ "NaN values return NaN. Integral values return signedness as Int8 and\n"
+ "floating-point values return it with the same type as the input values."),
+ {"x"}};
+
+const FunctionDoc bit_wise_not_doc{
+ "Bit-wise negate the arguments element-wise", "Null values return null.", {"x"}};
+
+const FunctionDoc bit_wise_and_doc{
+ "Bit-wise AND the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc bit_wise_or_doc{
+ "Bit-wise OR the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc bit_wise_xor_doc{
+ "Bit-wise XOR the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc shift_left_doc{
+ "Left shift `x` by `y`",
+ ("This function will return `x` if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "The shift operates as if on the two's complement representation of the number. "
+ "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
+ "even if overflow occurs.\n"
+ "Use function \"shift_left_checked\" if you want an invalid shift amount to "
+ "return an error."),
+ {"x", "y"}};
+
+const FunctionDoc shift_left_checked_doc{
+ "Left shift `x` by `y` with invalid shift check",
+ ("This function will raise an error if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`. "
+ "The shift operates as if on the two's complement representation of the number. "
+ "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
+ "even if overflow occurs.\n"
+ "See \"shift_left\" for a variant that doesn't fail for an invalid shift amount."),
+ {"x", "y"}};
+
+const FunctionDoc shift_right_doc{
+ "Right shift `x` by `y`",
+ ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
+ "This function will return `x` if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "Use function \"shift_right_checked\" if you want an invalid shift amount to return "
+ "an error."),
+ {"x", "y"}};
+
+const FunctionDoc shift_right_checked_doc{
+ "Right shift `x` by `y` with invalid shift check",
+ ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
+ "This function will raise an error if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "See \"shift_right\" for a variant that doesn't fail for an invalid shift amount"),
+ {"x", "y"}};
+
+const FunctionDoc sin_doc{"Compute the sine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"sin_checked\"."),
+ {"x"}};
+
+const FunctionDoc sin_checked_doc{
+ "Compute the sine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"sin\"."),
+ {"x"}};
+
+const FunctionDoc cos_doc{"Compute the cosine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"cos_checked\"."),
+ {"x"}};
+
+const FunctionDoc cos_checked_doc{
+ "Compute the cosine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"cos\"."),
+ {"x"}};
+
+const FunctionDoc tan_doc{"Compute the tangent of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"tan_checked\"."),
+ {"x"}};
+
+const FunctionDoc tan_checked_doc{
+ "Compute the tangent of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"tan\"."),
+ {"x"}};
+
+const FunctionDoc asin_doc{"Compute the inverse sine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"asin_checked\"."),
+ {"x"}};
+
+const FunctionDoc asin_checked_doc{
+ "Compute the inverse sine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"asin\"."),
+ {"x"}};
+
+const FunctionDoc acos_doc{"Compute the inverse cosine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"acos_checked\"."),
+ {"x"}};
+
+const FunctionDoc acos_checked_doc{
+ "Compute the inverse cosine of the elements argument-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"acos\"."),
+ {"x"}};
+
+const FunctionDoc atan_doc{"Compute the principal value of the inverse tangent",
+ "Integer arguments return double values.",
+ {"x"}};
+
+const FunctionDoc atan2_doc{
+ "Compute the inverse tangent using argument signs to determine the quadrant",
+ "Integer arguments return double values.",
+ {"y", "x"}};
+
+const FunctionDoc ln_doc{
+ "Compute natural log of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"ln_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc ln_checked_doc{
+ "Compute natural log of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"ln\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log10_doc{
+ "Compute log base 10 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log10_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log10_checked_doc{
+ "Compute log base 10 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log10\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log2_doc{
+ "Compute log base 2 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log2_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log2_checked_doc{
+ "Compute log base 2 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log2\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log1p_doc{
+ "Compute natural log of (1+x) element-wise",
+ ("Values <= -1 return -inf or NaN. Null values return null.\n"
+ "This function may be more precise than log(1 + x) for x close to zero."
+ "Use function \"log1p_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log1p_checked_doc{
+ "Compute natural log of (1+x) element-wise",
+ ("Values <= -1 return -inf or NaN. Null values return null.\n"
+ "This function may be more precise than log(1 + x) for x close to zero."
+ "Use function \"log1p\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc floor_doc{
+ "Round down to the nearest integer",
+ ("Calculate the nearest integer less than or equal in magnitude to the "
+ "argument element-wise"),
+ {"x"}};
+
+const FunctionDoc ceil_doc{
+ "Round up to the nearest integer",
+ ("Calculate the nearest integer greater than or equal in magnitude to the "
+ "argument element-wise"),
+ {"x"}};
+
+const FunctionDoc trunc_doc{
+ "Get the integral part without fractional digits",
+ ("Calculate the nearest integer not greater in magnitude than to the "
+ "argument element-wise."),
+ {"x"}};
} // namespace
void RegisterScalarArithmetic(FunctionRegistry* registry) {
// ----------------------------------------------------------------------
- auto absolute_value =
- MakeUnaryArithmeticFunction<AbsoluteValue>("abs", &absolute_value_doc);
- DCHECK_OK(registry->AddFunction(std::move(absolute_value)));
-
- // ----------------------------------------------------------------------
- auto absolute_value_checked = MakeUnaryArithmeticFunctionNotNull<AbsoluteValueChecked>(
- "abs_checked", &absolute_value_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(absolute_value_checked)));
-
- // ----------------------------------------------------------------------
- auto add = MakeArithmeticFunction<Add>("add", &add_doc);
- AddDecimalBinaryKernels<Add>("add", &add);
+ auto absolute_value =
+ MakeUnaryArithmeticFunction<AbsoluteValue>("abs", &absolute_value_doc);
+ DCHECK_OK(registry->AddFunction(std::move(absolute_value)));
+
+ // ----------------------------------------------------------------------
+ auto absolute_value_checked = MakeUnaryArithmeticFunctionNotNull<AbsoluteValueChecked>(
+ "abs_checked", &absolute_value_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(absolute_value_checked)));
+
+ // ----------------------------------------------------------------------
+ auto add = MakeArithmeticFunction<Add>("add", &add_doc);
+ AddDecimalBinaryKernels<Add>("add", &add);
DCHECK_OK(registry->AddFunction(std::move(add)));
// ----------------------------------------------------------------------
- auto add_checked =
- MakeArithmeticFunctionNotNull<AddChecked>("add_checked", &add_checked_doc);
- AddDecimalBinaryKernels<AddChecked>("add_checked", &add_checked);
+ auto add_checked =
+ MakeArithmeticFunctionNotNull<AddChecked>("add_checked", &add_checked_doc);
+ AddDecimalBinaryKernels<AddChecked>("add_checked", &add_checked);
DCHECK_OK(registry->AddFunction(std::move(add_checked)));
// ----------------------------------------------------------------------
- auto subtract = MakeArithmeticFunction<Subtract>("subtract", &sub_doc);
- AddDecimalBinaryKernels<Subtract>("subtract", &subtract);
+ auto subtract = MakeArithmeticFunction<Subtract>("subtract", &sub_doc);
+ AddDecimalBinaryKernels<Subtract>("subtract", &subtract);
// Add subtract(timestamp, timestamp) -> duration
for (auto unit : AllTimeUnits()) {
InputType in_type(match::TimestampTypeUnit(unit));
- auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Subtract>(Type::TIMESTAMP);
+ auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Subtract>(Type::TIMESTAMP);
DCHECK_OK(subtract->AddKernel({in_type, in_type}, duration(unit), std::move(exec)));
}
DCHECK_OK(registry->AddFunction(std::move(subtract)));
// ----------------------------------------------------------------------
- auto subtract_checked = MakeArithmeticFunctionNotNull<SubtractChecked>(
- "subtract_checked", &sub_checked_doc);
- AddDecimalBinaryKernels<SubtractChecked>("subtract_checked", &subtract_checked);
+ auto subtract_checked = MakeArithmeticFunctionNotNull<SubtractChecked>(
+ "subtract_checked", &sub_checked_doc);
+ AddDecimalBinaryKernels<SubtractChecked>("subtract_checked", &subtract_checked);
DCHECK_OK(registry->AddFunction(std::move(subtract_checked)));
// ----------------------------------------------------------------------
- auto multiply = MakeArithmeticFunction<Multiply>("multiply", &mul_doc);
- AddDecimalBinaryKernels<Multiply>("multiply", &multiply);
+ auto multiply = MakeArithmeticFunction<Multiply>("multiply", &mul_doc);
+ AddDecimalBinaryKernels<Multiply>("multiply", &multiply);
DCHECK_OK(registry->AddFunction(std::move(multiply)));
// ----------------------------------------------------------------------
- auto multiply_checked = MakeArithmeticFunctionNotNull<MultiplyChecked>(
- "multiply_checked", &mul_checked_doc);
- AddDecimalBinaryKernels<MultiplyChecked>("multiply_checked", &multiply_checked);
+ auto multiply_checked = MakeArithmeticFunctionNotNull<MultiplyChecked>(
+ "multiply_checked", &mul_checked_doc);
+ AddDecimalBinaryKernels<MultiplyChecked>("multiply_checked", &multiply_checked);
DCHECK_OK(registry->AddFunction(std::move(multiply_checked)));
// ----------------------------------------------------------------------
- auto divide = MakeArithmeticFunctionNotNull<Divide>("divide", &div_doc);
- AddDecimalBinaryKernels<Divide>("divide", &divide);
+ auto divide = MakeArithmeticFunctionNotNull<Divide>("divide", &div_doc);
+ AddDecimalBinaryKernels<Divide>("divide", &divide);
DCHECK_OK(registry->AddFunction(std::move(divide)));
// ----------------------------------------------------------------------
- auto divide_checked =
- MakeArithmeticFunctionNotNull<DivideChecked>("divide_checked", &div_checked_doc);
- AddDecimalBinaryKernels<DivideChecked>("divide_checked", &divide_checked);
+ auto divide_checked =
+ MakeArithmeticFunctionNotNull<DivideChecked>("divide_checked", &div_checked_doc);
+ AddDecimalBinaryKernels<DivideChecked>("divide_checked", &divide_checked);
DCHECK_OK(registry->AddFunction(std::move(divide_checked)));
-
- // ----------------------------------------------------------------------
- auto negate = MakeUnaryArithmeticFunction<Negate>("negate", &negate_doc);
- DCHECK_OK(registry->AddFunction(std::move(negate)));
-
- // ----------------------------------------------------------------------
- auto negate_checked = MakeUnarySignedArithmeticFunctionNotNull<NegateChecked>(
- "negate_checked", &negate_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(negate_checked)));
-
- // ----------------------------------------------------------------------
- auto power = MakeArithmeticFunction<Power>("power", &pow_doc);
- DCHECK_OK(registry->AddFunction(std::move(power)));
-
- // ----------------------------------------------------------------------
- auto power_checked =
- MakeArithmeticFunctionNotNull<PowerChecked>("power_checked", &pow_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(power_checked)));
-
- // ----------------------------------------------------------------------
- auto sign =
- MakeUnaryArithmeticFunctionWithFixedIntOutType<Sign, Int8Type>("sign", &sign_doc);
- DCHECK_OK(registry->AddFunction(std::move(sign)));
-
- // ----------------------------------------------------------------------
- // Bitwise functions
- {
- auto bit_wise_not = std::make_shared<ArithmeticFunction>(
- "bit_wise_not", Arity::Unary(), &bit_wise_not_doc);
- for (const auto& ty : IntTypes()) {
- auto exec = TypeAgnosticBitWiseExecFromOp<ScalarUnaryNotNull, BitWiseNot>(ty);
- DCHECK_OK(bit_wise_not->AddKernel({ty}, ty, exec));
- }
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_not)));
- }
-
- auto bit_wise_and =
- MakeBitWiseFunctionNotNull<BitWiseAnd>("bit_wise_and", &bit_wise_and_doc);
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_and)));
-
- auto bit_wise_or =
- MakeBitWiseFunctionNotNull<BitWiseOr>("bit_wise_or", &bit_wise_or_doc);
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_or)));
-
- auto bit_wise_xor =
- MakeBitWiseFunctionNotNull<BitWiseXor>("bit_wise_xor", &bit_wise_xor_doc);
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_xor)));
-
- auto shift_left = MakeShiftFunctionNotNull<ShiftLeft>("shift_left", &shift_left_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_left)));
-
- auto shift_left_checked = MakeShiftFunctionNotNull<ShiftLeftChecked>(
- "shift_left_checked", &shift_left_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_left_checked)));
-
- auto shift_right =
- MakeShiftFunctionNotNull<ShiftRight>("shift_right", &shift_right_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_right)));
-
- auto shift_right_checked = MakeShiftFunctionNotNull<ShiftRightChecked>(
- "shift_right_checked", &shift_right_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_right_checked)));
-
- // ----------------------------------------------------------------------
- // Trig functions
- auto sin = MakeUnaryArithmeticFunctionFloatingPoint<Sin>("sin", &sin_doc);
- DCHECK_OK(registry->AddFunction(std::move(sin)));
-
- auto sin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<SinChecked>(
- "sin_checked", &sin_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(sin_checked)));
-
- auto cos = MakeUnaryArithmeticFunctionFloatingPoint<Cos>("cos", &cos_doc);
- DCHECK_OK(registry->AddFunction(std::move(cos)));
-
- auto cos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<CosChecked>(
- "cos_checked", &cos_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(cos_checked)));
-
- auto tan = MakeUnaryArithmeticFunctionFloatingPoint<Tan>("tan", &tan_doc);
- DCHECK_OK(registry->AddFunction(std::move(tan)));
-
- auto tan_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<TanChecked>(
- "tan_checked", &tan_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(tan_checked)));
-
- auto asin = MakeUnaryArithmeticFunctionFloatingPoint<Asin>("asin", &asin_doc);
- DCHECK_OK(registry->AddFunction(std::move(asin)));
-
- auto asin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AsinChecked>(
- "asin_checked", &asin_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(asin_checked)));
-
- auto acos = MakeUnaryArithmeticFunctionFloatingPoint<Acos>("acos", &acos_doc);
- DCHECK_OK(registry->AddFunction(std::move(acos)));
-
- auto acos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AcosChecked>(
- "acos_checked", &acos_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(acos_checked)));
-
- auto atan = MakeUnaryArithmeticFunctionFloatingPoint<Atan>("atan", &atan_doc);
- DCHECK_OK(registry->AddFunction(std::move(atan)));
-
- auto atan2 = MakeArithmeticFunctionFloatingPoint<Atan2>("atan2", &atan2_doc);
- DCHECK_OK(registry->AddFunction(std::move(atan2)));
-
- // ----------------------------------------------------------------------
- // Logarithms
- auto ln = MakeUnaryArithmeticFunctionFloatingPoint<LogNatural>("ln", &ln_doc);
- DCHECK_OK(registry->AddFunction(std::move(ln)));
-
- auto ln_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<LogNaturalChecked>(
- "ln_checked", &ln_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(ln_checked)));
-
- auto log10 = MakeUnaryArithmeticFunctionFloatingPoint<Log10>("log10", &log10_doc);
- DCHECK_OK(registry->AddFunction(std::move(log10)));
-
- auto log10_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log10Checked>(
- "log10_checked", &log10_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(log10_checked)));
-
- auto log2 = MakeUnaryArithmeticFunctionFloatingPoint<Log2>("log2", &log2_doc);
- DCHECK_OK(registry->AddFunction(std::move(log2)));
-
- auto log2_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log2Checked>(
- "log2_checked", &log2_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(log2_checked)));
-
- auto log1p = MakeUnaryArithmeticFunctionFloatingPoint<Log1p>("log1p", &log1p_doc);
- DCHECK_OK(registry->AddFunction(std::move(log1p)));
-
- auto log1p_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log1pChecked>(
- "log1p_checked", &log1p_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(log1p_checked)));
-
- // ----------------------------------------------------------------------
- // Rounding functions
- auto floor = MakeUnaryArithmeticFunctionFloatingPoint<Floor>("floor", &floor_doc);
- DCHECK_OK(registry->AddFunction(std::move(floor)));
-
- auto ceil = MakeUnaryArithmeticFunctionFloatingPoint<Ceil>("ceil", &ceil_doc);
- DCHECK_OK(registry->AddFunction(std::move(ceil)));
-
- auto trunc = MakeUnaryArithmeticFunctionFloatingPoint<Trunc>("trunc", &trunc_doc);
- DCHECK_OK(registry->AddFunction(std::move(trunc)));
+
+ // ----------------------------------------------------------------------
+ auto negate = MakeUnaryArithmeticFunction<Negate>("negate", &negate_doc);
+ DCHECK_OK(registry->AddFunction(std::move(negate)));
+
+ // ----------------------------------------------------------------------
+ auto negate_checked = MakeUnarySignedArithmeticFunctionNotNull<NegateChecked>(
+ "negate_checked", &negate_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(negate_checked)));
+
+ // ----------------------------------------------------------------------
+ auto power = MakeArithmeticFunction<Power>("power", &pow_doc);
+ DCHECK_OK(registry->AddFunction(std::move(power)));
+
+ // ----------------------------------------------------------------------
+ auto power_checked =
+ MakeArithmeticFunctionNotNull<PowerChecked>("power_checked", &pow_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(power_checked)));
+
+ // ----------------------------------------------------------------------
+ auto sign =
+ MakeUnaryArithmeticFunctionWithFixedIntOutType<Sign, Int8Type>("sign", &sign_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sign)));
+
+ // ----------------------------------------------------------------------
+ // Bitwise functions
+ {
+ auto bit_wise_not = std::make_shared<ArithmeticFunction>(
+ "bit_wise_not", Arity::Unary(), &bit_wise_not_doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = TypeAgnosticBitWiseExecFromOp<ScalarUnaryNotNull, BitWiseNot>(ty);
+ DCHECK_OK(bit_wise_not->AddKernel({ty}, ty, exec));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_not)));
+ }
+
+ auto bit_wise_and =
+ MakeBitWiseFunctionNotNull<BitWiseAnd>("bit_wise_and", &bit_wise_and_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_and)));
+
+ auto bit_wise_or =
+ MakeBitWiseFunctionNotNull<BitWiseOr>("bit_wise_or", &bit_wise_or_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_or)));
+
+ auto bit_wise_xor =
+ MakeBitWiseFunctionNotNull<BitWiseXor>("bit_wise_xor", &bit_wise_xor_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_xor)));
+
+ auto shift_left = MakeShiftFunctionNotNull<ShiftLeft>("shift_left", &shift_left_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_left)));
+
+ auto shift_left_checked = MakeShiftFunctionNotNull<ShiftLeftChecked>(
+ "shift_left_checked", &shift_left_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_left_checked)));
+
+ auto shift_right =
+ MakeShiftFunctionNotNull<ShiftRight>("shift_right", &shift_right_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_right)));
+
+ auto shift_right_checked = MakeShiftFunctionNotNull<ShiftRightChecked>(
+ "shift_right_checked", &shift_right_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_right_checked)));
+
+ // ----------------------------------------------------------------------
+ // Trig functions
+ auto sin = MakeUnaryArithmeticFunctionFloatingPoint<Sin>("sin", &sin_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sin)));
+
+ auto sin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<SinChecked>(
+ "sin_checked", &sin_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sin_checked)));
+
+ auto cos = MakeUnaryArithmeticFunctionFloatingPoint<Cos>("cos", &cos_doc);
+ DCHECK_OK(registry->AddFunction(std::move(cos)));
+
+ auto cos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<CosChecked>(
+ "cos_checked", &cos_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(cos_checked)));
+
+ auto tan = MakeUnaryArithmeticFunctionFloatingPoint<Tan>("tan", &tan_doc);
+ DCHECK_OK(registry->AddFunction(std::move(tan)));
+
+ auto tan_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<TanChecked>(
+ "tan_checked", &tan_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(tan_checked)));
+
+ auto asin = MakeUnaryArithmeticFunctionFloatingPoint<Asin>("asin", &asin_doc);
+ DCHECK_OK(registry->AddFunction(std::move(asin)));
+
+ auto asin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AsinChecked>(
+ "asin_checked", &asin_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(asin_checked)));
+
+ auto acos = MakeUnaryArithmeticFunctionFloatingPoint<Acos>("acos", &acos_doc);
+ DCHECK_OK(registry->AddFunction(std::move(acos)));
+
+ auto acos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AcosChecked>(
+ "acos_checked", &acos_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(acos_checked)));
+
+ auto atan = MakeUnaryArithmeticFunctionFloatingPoint<Atan>("atan", &atan_doc);
+ DCHECK_OK(registry->AddFunction(std::move(atan)));
+
+ auto atan2 = MakeArithmeticFunctionFloatingPoint<Atan2>("atan2", &atan2_doc);
+ DCHECK_OK(registry->AddFunction(std::move(atan2)));
+
+ // ----------------------------------------------------------------------
+ // Logarithms
+ auto ln = MakeUnaryArithmeticFunctionFloatingPoint<LogNatural>("ln", &ln_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ln)));
+
+ auto ln_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<LogNaturalChecked>(
+ "ln_checked", &ln_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ln_checked)));
+
+ auto log10 = MakeUnaryArithmeticFunctionFloatingPoint<Log10>("log10", &log10_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log10)));
+
+ auto log10_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log10Checked>(
+ "log10_checked", &log10_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log10_checked)));
+
+ auto log2 = MakeUnaryArithmeticFunctionFloatingPoint<Log2>("log2", &log2_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log2)));
+
+ auto log2_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log2Checked>(
+ "log2_checked", &log2_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log2_checked)));
+
+ auto log1p = MakeUnaryArithmeticFunctionFloatingPoint<Log1p>("log1p", &log1p_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log1p)));
+
+ auto log1p_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log1pChecked>(
+ "log1p_checked", &log1p_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log1p_checked)));
+
+ // ----------------------------------------------------------------------
+ // Rounding functions
+ auto floor = MakeUnaryArithmeticFunctionFloatingPoint<Floor>("floor", &floor_doc);
+ DCHECK_OK(registry->AddFunction(std::move(floor)));
+
+ auto ceil = MakeUnaryArithmeticFunctionFloatingPoint<Ceil>("ceil", &ceil_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ceil)));
+
+ auto trunc = MakeUnaryArithmeticFunctionFloatingPoint<Trunc>("trunc", &trunc_doc);
+ DCHECK_OK(registry->AddFunction(std::move(trunc)));
}
} // namespace internal
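
The registrations above only wire the arithmetic, bitwise, trig, logarithm and rounding kernels into the registry by name; callers reach them through the generic compute API. A minimal usage sketch follows (an assumption for illustration: it relies only on Arrow's public headers, which are not part of this diff, and the Example() wrapper is hypothetical):

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <iostream>

// Hypothetical standalone sketch; relies only on Arrow's public compute API.
arrow::Status Example() {
  // Build a small int32 array: [1, -2, null].
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, -2}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  std::shared_ptr<arrow::Array> values;
  ARROW_RETURN_NOT_OK(builder.Finish(&values));

  // "negate_checked" (registered above) errors on overflow instead of wrapping.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum negated,
                        arrow::compute::CallFunction("negate_checked", {values}));
  std::cout << negated.make_array()->ToString() << std::endl;  // [-1, 2, null]

  // "shift_left" is binary; both arguments must be integers of a common type.
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum shifted,
      arrow::compute::CallFunction("shift_left", {values, arrow::Datum(int32_t(1))}));
  std::cout << shifted.make_array()->ToString() << std::endl;  // [2, -4, null]
  return arrow::Status::OK();
}
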
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index 7a0e3654edb..3a99c87dd99 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -33,180 +33,180 @@ namespace {
template <typename ComputeWord>
void ComputeKleene(ComputeWord&& compute_word, KernelContext* ctx, const ArrayData& left,
const ArrayData& right, ArrayData* out) {
- DCHECK(left.null_count != 0 || right.null_count != 0)
- << "ComputeKleene is unnecessarily expensive for the non-null case";
+ DCHECK(left.null_count != 0 || right.null_count != 0)
+ << "ComputeKleene is unnecessarily expensive for the non-null case";
- Bitmap left_valid_bm{left.buffers[0], left.offset, left.length};
- Bitmap left_data_bm{left.buffers[1], left.offset, left.length};
+ Bitmap left_valid_bm{left.buffers[0], left.offset, left.length};
+ Bitmap left_data_bm{left.buffers[1], left.offset, left.length};
- Bitmap right_valid_bm{right.buffers[0], right.offset, right.length};
- Bitmap right_data_bm{right.buffers[1], right.offset, right.length};
+ Bitmap right_valid_bm{right.buffers[0], right.offset, right.length};
+ Bitmap right_data_bm{right.buffers[1], right.offset, right.length};
- std::array<Bitmap, 2> out_bms{Bitmap(out->buffers[0], out->offset, out->length),
- Bitmap(out->buffers[1], out->offset, out->length)};
+ std::array<Bitmap, 2> out_bms{Bitmap(out->buffers[0], out->offset, out->length),
+ Bitmap(out->buffers[1], out->offset, out->length)};
auto apply = [&](uint64_t left_valid, uint64_t left_data, uint64_t right_valid,
- uint64_t right_data, uint64_t* out_validity, uint64_t* out_data) {
+ uint64_t right_data, uint64_t* out_validity, uint64_t* out_data) {
auto left_true = left_valid & left_data;
auto left_false = left_valid & ~left_data;
auto right_true = right_valid & right_data;
auto right_false = right_valid & ~right_data;
- compute_word(left_true, left_false, right_true, right_false, out_validity, out_data);
+ compute_word(left_true, left_false, right_true, right_false, out_validity, out_data);
};
- if (right.null_count == 0) {
- std::array<Bitmap, 3> in_bms{left_valid_bm, left_data_bm, right_data_bm};
- Bitmap::VisitWordsAndWrite(
- in_bms, &out_bms,
- [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
- apply(in[0], in[1], ~uint64_t(0), in[2], &(out->at(0)), &(out->at(1)));
- });
- return;
+ if (right.null_count == 0) {
+ std::array<Bitmap, 3> in_bms{left_valid_bm, left_data_bm, right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+ apply(in[0], in[1], ~uint64_t(0), in[2], &(out->at(0)), &(out->at(1)));
+ });
+ return;
+ }
+
+ if (left.null_count == 0) {
+ std::array<Bitmap, 3> in_bms{left_data_bm, right_valid_bm, right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+ apply(~uint64_t(0), in[0], in[1], in[2], &(out->at(0)), &(out->at(1)));
+ });
+ return;
}
-
- if (left.null_count == 0) {
- std::array<Bitmap, 3> in_bms{left_data_bm, right_valid_bm, right_data_bm};
- Bitmap::VisitWordsAndWrite(
- in_bms, &out_bms,
- [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
- apply(~uint64_t(0), in[0], in[1], in[2], &(out->at(0)), &(out->at(1)));
- });
- return;
- }
-
- DCHECK(left.null_count != 0 && right.null_count != 0);
- std::array<Bitmap, 4> in_bms{left_valid_bm, left_data_bm, right_valid_bm,
- right_data_bm};
- Bitmap::VisitWordsAndWrite(
- in_bms, &out_bms,
- [&](const std::array<uint64_t, 4>& in, std::array<uint64_t, 2>* out) {
- apply(in[0], in[1], in[2], in[3], &(out->at(0)), &(out->at(1)));
- });
-}
-
-inline BooleanScalar InvertScalar(const Scalar& in) {
- return in.is_valid ? BooleanScalar(!checked_cast<const BooleanScalar&>(in).value)
- : BooleanScalar();
-}
-
-inline Bitmap GetBitmap(const ArrayData& arr, int index) {
- return Bitmap{arr.buffers[index], arr.offset, arr.length};
+
+ DCHECK(left.null_count != 0 && right.null_count != 0);
+ std::array<Bitmap, 4> in_bms{left_valid_bm, left_data_bm, right_valid_bm,
+ right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 4>& in, std::array<uint64_t, 2>* out) {
+ apply(in[0], in[1], in[2], in[3], &(out->at(0)), &(out->at(1)));
+ });
}
-struct InvertOp {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
- *checked_cast<BooleanScalar*>(out) = InvertScalar(in);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
- GetBitmap(*out, 1).CopyFromInverted(GetBitmap(in, 1));
- return Status::OK();
- }
-};
-
-template <typename Op>
-struct Commutative {
- static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
- ArrayData* out) {
- return Op::Call(ctx, right, left, out);
- }
-};
-
-struct AndOp : Commutative<AndOp> {
- using Commutative<AndOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- if (left.is_valid && right.is_valid) {
+inline BooleanScalar InvertScalar(const Scalar& in) {
+ return in.is_valid ? BooleanScalar(!checked_cast<const BooleanScalar&>(in).value)
+ : BooleanScalar();
+}
+
+inline Bitmap GetBitmap(const ArrayData& arr, int index) {
+ return Bitmap{arr.buffers[index], arr.offset, arr.length};
+}
+
+struct InvertOp {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ *checked_cast<BooleanScalar*>(out) = InvertScalar(in);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+ GetBitmap(*out, 1).CopyFromInverted(GetBitmap(in, 1));
+ return Status::OK();
+ }
+};
+
+template <typename Op>
+struct Commutative {
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ return Op::Call(ctx, right, left, out);
+ }
+};
+
+struct AndOp : Commutative<AndOp> {
+ using Commutative<AndOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
checked_cast<BooleanScalar*>(out)->value =
- checked_cast<const BooleanScalar&>(left).value &&
- checked_cast<const BooleanScalar&>(right).value;
+ checked_cast<const BooleanScalar&>(left).value &&
+ checked_cast<const BooleanScalar&>(right).value;
}
- return Status::OK();
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- if (right.is_valid) {
- checked_cast<const BooleanScalar&>(right).value
- ? GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1))
- : GetBitmap(*out, 1).SetBitsTo(false);
- }
- return Status::OK();
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1))
+ : GetBitmap(*out, 1).SetBitsTo(false);
+ }
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
::arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
right.buffers[1]->data(), right.offset, right.length,
out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
+ return Status::OK();
}
};
-struct KleeneAndOp : Commutative<KleeneAndOp> {
- using Commutative<KleeneAndOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
- bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
-
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- checked_cast<BooleanScalar*>(out)->value = left_true && right_true;
- out->is_valid = left_false || right_false || (left_true && right_true);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- if (right_false) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- GetBitmap(*out, 1).SetBitsTo(false); // all false case
- return Status::OK();
- }
-
- if (right_true) {
- if (left.GetNullCount() == 0) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- } else {
- GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
- }
- GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- return Status::OK();
- }
-
- // scalar was null: out[i] is valid iff left[i] was false
- if (left.GetNullCount() == 0) {
- ::arrow::internal::InvertBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[0]->mutable_data(), out->offset);
- } else {
- ::arrow::internal::BitmapAndNot(left.buffers[0]->data(), left.offset,
- left.buffers[1]->data(), left.offset, left.length,
- out->offset, out->buffers[0]->mutable_data());
- }
- ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[1]->mutable_data(), out->offset);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct KleeneAndOp : Commutative<KleeneAndOp> {
+ using Commutative<KleeneAndOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ checked_cast<BooleanScalar*>(out)->value = left_true && right_true;
+ out->is_valid = left_false || right_false || (left_true && right_true);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ if (right_false) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(false); // all false case
+ return Status::OK();
+ }
+
+ if (right_true) {
+ if (left.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
+ }
+ GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff left[i] was false
+ if (left.GetNullCount() == 0) {
+ ::arrow::internal::InvertBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAndNot(left.buffers[0]->data(), left.offset,
+ left.buffers[1]->data(), left.offset, left.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
- out->null_count = 0;
- // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
- BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
- return AndOp::Call(ctx, left, right, out);
+ out->null_count = 0;
+ // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return AndOp::Call(ctx, left, right, out);
}
auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true,
uint64_t right_false, uint64_t* out_valid,
@@ -215,104 +215,104 @@ struct KleeneAndOp : Commutative<KleeneAndOp> {
*out_valid = left_false | right_false | (left_true & right_true);
};
ComputeKleene(compute_word, ctx, left, right, out);
- return Status::OK();
+ return Status::OK();
}
};
-struct OrOp : Commutative<OrOp> {
- using Commutative<OrOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- if (left.is_valid && right.is_valid) {
- checked_cast<BooleanScalar*>(out)->value =
- checked_cast<const BooleanScalar&>(left).value ||
- checked_cast<const BooleanScalar&>(right).value;
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- if (right.is_valid) {
- checked_cast<const BooleanScalar&>(right).value
- ? GetBitmap(*out, 1).SetBitsTo(true)
- : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct OrOp : Commutative<OrOp> {
+ using Commutative<OrOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
+ checked_cast<BooleanScalar*>(out)->value =
+ checked_cast<const BooleanScalar&>(left).value ||
+ checked_cast<const BooleanScalar&>(right).value;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).SetBitsTo(true)
+ : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
::arrow::internal::BitmapOr(left.buffers[1]->data(), left.offset,
right.buffers[1]->data(), right.offset, right.length,
out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
+ return Status::OK();
}
};
-struct KleeneOrOp : Commutative<KleeneOrOp> {
- using Commutative<KleeneOrOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
- bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
-
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- checked_cast<BooleanScalar*>(out)->value = left_true || right_true;
- out->is_valid = left_true || right_true || (left_false && right_false);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- if (right_true) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- GetBitmap(*out, 1).SetBitsTo(true); // all true case
- return Status::OK();
- }
-
- if (right_false) {
- if (left.GetNullCount() == 0) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- } else {
- GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
- }
- GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- return Status::OK();
- }
-
- // scalar was null: out[i] is valid iff left[i] was true
- if (left.GetNullCount() == 0) {
- ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[0]->mutable_data(), out->offset);
- } else {
- ::arrow::internal::BitmapAnd(left.buffers[0]->data(), left.offset,
- left.buffers[1]->data(), left.offset, left.length,
- out->offset, out->buffers[0]->mutable_data());
- }
- ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[1]->mutable_data(), out->offset);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct KleeneOrOp : Commutative<KleeneOrOp> {
+ using Commutative<KleeneOrOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ checked_cast<BooleanScalar*>(out)->value = left_true || right_true;
+ out->is_valid = left_true || right_true || (left_false && right_false);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ if (right_true) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(true); // all true case
+ return Status::OK();
+ }
+
+ if (right_false) {
+ if (left.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
+ }
+ GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff left[i] was true
+ if (left.GetNullCount() == 0) {
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAnd(left.buffers[0]->data(), left.offset,
+ left.buffers[1]->data(), left.offset, left.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
- out->null_count = 0;
- // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
- BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
- return OrOp::Call(ctx, left, right, out);
+ out->null_count = 0;
+ // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return OrOp::Call(ctx, left, right, out);
}
-
+
static auto compute_word = [](uint64_t left_true, uint64_t left_false,
uint64_t right_true, uint64_t right_false,
uint64_t* out_valid, uint64_t* out_data) {
@@ -320,149 +320,149 @@ struct KleeneOrOp : Commutative<KleeneOrOp> {
*out_valid = left_true | right_true | (left_false & right_false);
};
- ComputeKleene(compute_word, ctx, left, right, out);
- return Status::OK();
+ ComputeKleene(compute_word, ctx, left, right, out);
+ return Status::OK();
}
};
-struct XorOp : Commutative<XorOp> {
- using Commutative<XorOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- if (left.is_valid && right.is_valid) {
- checked_cast<BooleanScalar*>(out)->value =
- checked_cast<const BooleanScalar&>(left).value ^
- checked_cast<const BooleanScalar&>(right).value;
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- if (right.is_valid) {
- checked_cast<const BooleanScalar&>(right).value
- ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(left, 1))
- : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct XorOp : Commutative<XorOp> {
+ using Commutative<XorOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
+ checked_cast<BooleanScalar*>(out)->value =
+ checked_cast<const BooleanScalar&>(left).value ^
+ checked_cast<const BooleanScalar&>(right).value;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(left, 1))
+ : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
::arrow::internal::BitmapXor(left.buffers[1]->data(), left.offset,
right.buffers[1]->data(), right.offset, right.length,
out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
- }
-};
-
-struct AndNotOp {
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- return AndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
- ArrayData* out) {
- if (left.is_valid) {
- checked_cast<const BooleanScalar&>(left).value
- ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1))
- : GetBitmap(*out, 1).SetBitsTo(false);
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- return AndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
- ::arrow::internal::BitmapAndNot(left.buffers[1]->data(), left.offset,
- right.buffers[1]->data(), right.offset, right.length,
- out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
- }
-};
-
-struct KleeneAndNotOp {
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
- ArrayData* out) {
- bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
- bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
-
- if (left_false) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- GetBitmap(*out, 1).SetBitsTo(false); // all false case
- return Status::OK();
- }
-
- if (left_true) {
- if (right.GetNullCount() == 0) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- } else {
- GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0));
- }
- GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1));
- return Status::OK();
- }
-
- // scalar was null: out[i] is valid iff right[i] was true
- if (right.GetNullCount() == 0) {
- ::arrow::internal::CopyBitmap(right.buffers[1]->data(), right.offset, right.length,
- out->buffers[0]->mutable_data(), out->offset);
- } else {
- ::arrow::internal::BitmapAnd(right.buffers[0]->data(), right.offset,
- right.buffers[1]->data(), right.offset, right.length,
- out->offset, out->buffers[0]->mutable_data());
- }
- ::arrow::internal::InvertBitmap(right.buffers[1]->data(), right.offset, right.length,
- out->buffers[1]->mutable_data(), out->offset);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
- if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
- out->null_count = 0;
- // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
- BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
- return AndNotOp::Call(ctx, left, right, out);
- }
-
- static auto compute_word = [](uint64_t left_true, uint64_t left_false,
- uint64_t right_true, uint64_t right_false,
- uint64_t* out_valid, uint64_t* out_data) {
- *out_data = left_true & right_false;
- *out_valid = left_false | right_true | (left_true & right_false);
- };
-
- ComputeKleene(compute_word, ctx, left, right, out);
- return Status::OK();
+ return Status::OK();
}
};
-void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
- const FunctionDoc* doc, FunctionRegistry* registry,
+struct AndNotOp {
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ return AndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.is_valid) {
+ checked_cast<const BooleanScalar&>(left).value
+ ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1))
+ : GetBitmap(*out, 1).SetBitsTo(false);
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ return AndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ ::arrow::internal::BitmapAndNot(left.buffers[1]->data(), left.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[1]->mutable_data());
+ return Status::OK();
+ }
+};
+
+struct KleeneAndNotOp {
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ if (left_false) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(false); // all false case
+ return Status::OK();
+ }
+
+ if (left_true) {
+ if (right.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0));
+ }
+ GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff right[i] was true
+ if (right.GetNullCount() == 0) {
+ ::arrow::internal::CopyBitmap(right.buffers[1]->data(), right.offset, right.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAnd(right.buffers[0]->data(), right.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::InvertBitmap(right.buffers[1]->data(), right.offset, right.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
+ out->null_count = 0;
+ // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return AndNotOp::Call(ctx, left, right, out);
+ }
+
+ static auto compute_word = [](uint64_t left_true, uint64_t left_false,
+ uint64_t right_true, uint64_t right_false,
+ uint64_t* out_valid, uint64_t* out_data) {
+ *out_data = left_true & right_false;
+ *out_valid = left_false | right_true | (left_true & right_false);
+ };
+
+ ComputeKleene(compute_word, ctx, left, right, out);
+ return Status::OK();
+ }
+};
+
+void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
+ const FunctionDoc* doc, FunctionRegistry* registry,
NullHandling::type null_handling = NullHandling::INTERSECTION) {
- auto func = std::make_shared<ScalarFunction>(name, Arity(arity), doc);
+ auto func = std::make_shared<ScalarFunction>(name, Arity(arity), doc);
// Scalar arguments not yet supported
- std::vector<InputType> in_types(arity, InputType(boolean()));
+ std::vector<InputType> in_types(arity, InputType(boolean()));
ScalarKernel kernel(std::move(in_types), boolean(), exec);
kernel.null_handling = null_handling;
@@ -470,92 +470,92 @@ void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-const FunctionDoc invert_doc{"Invert boolean values", "", {"values"}};
-
-const FunctionDoc and_doc{
- "Logical 'and' boolean values",
- ("When a null is encountered in either input, a null is output.\n"
- "For a different null behavior, see function \"and_kleene\"."),
- {"x", "y"}};
-
-const FunctionDoc and_not_doc{
- "Logical 'and not' boolean values",
- ("When a null is encountered in either input, a null is output.\n"
- "For a different null behavior, see function \"and_not_kleene\"."),
- {"x", "y"}};
-
-const FunctionDoc or_doc{
- "Logical 'or' boolean values",
- ("When a null is encountered in either input, a null is output.\n"
- "For a different null behavior, see function \"or_kleene\"."),
- {"x", "y"}};
-
-const FunctionDoc xor_doc{
- "Logical 'xor' boolean values",
- ("When a null is encountered in either input, a null is output."),
- {"x", "y"}};
-
-const FunctionDoc and_kleene_doc{
- "Logical 'and' boolean values (Kleene logic)",
- ("This function behaves as follows with nulls:\n\n"
- "- true and null = null\n"
- "- null and true = null\n"
- "- false and null = false\n"
- "- null and false = false\n"
- "- null and null = null\n"
- "\n"
- "In other words, in this context a null value really means \"unknown\",\n"
- "and an unknown value 'and' false is always false.\n"
- "For a different null behavior, see function \"and\"."),
- {"x", "y"}};
-
-const FunctionDoc and_not_kleene_doc{
- "Logical 'and not' boolean values (Kleene logic)",
- ("This function behaves as follows with nulls:\n\n"
- "- true and null = null\n"
- "- null and false = null\n"
- "- false and null = false\n"
- "- null and true = false\n"
- "- null and null = null\n"
- "\n"
- "In other words, in this context a null value really means \"unknown\",\n"
- "and an unknown value 'and not' true is always false, as is false\n"
- "'and not' an unknown value.\n"
- "For a different null behavior, see function \"and_not\"."),
- {"x", "y"}};
-
-const FunctionDoc or_kleene_doc{
- "Logical 'or' boolean values (Kleene logic)",
- ("This function behaves as follows with nulls:\n\n"
- "- true or null = true\n"
- "- null and true = true\n"
- "- false and null = null\n"
- "- null and false = null\n"
- "- null and null = null\n"
- "\n"
- "In other words, in this context a null value really means \"unknown\",\n"
- "and an unknown value 'or' true is always true.\n"
- "For a different null behavior, see function \"and\"."),
- {"x", "y"}};
-
+const FunctionDoc invert_doc{"Invert boolean values", "", {"values"}};
+
+const FunctionDoc and_doc{
+ "Logical 'and' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"and_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc and_not_doc{
+ "Logical 'and not' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"and_not_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc or_doc{
+ "Logical 'or' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"or_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc xor_doc{
+ "Logical 'xor' boolean values",
+ ("When a null is encountered in either input, a null is output."),
+ {"x", "y"}};
+
+const FunctionDoc and_kleene_doc{
+ "Logical 'and' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true and null = null\n"
+ "- null and true = null\n"
+ "- false and null = false\n"
+ "- null and false = false\n"
+ "- null and null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'and' false is always false.\n"
+ "For a different null behavior, see function \"and\"."),
+ {"x", "y"}};
+
+const FunctionDoc and_not_kleene_doc{
+ "Logical 'and not' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true and null = null\n"
+ "- null and false = null\n"
+ "- false and null = false\n"
+ "- null and true = false\n"
+ "- null and null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'and not' true is always false, as is false\n"
+ "'and not' an unknown value.\n"
+ "For a different null behavior, see function \"and_not\"."),
+ {"x", "y"}};
+
+const FunctionDoc or_kleene_doc{
+ "Logical 'or' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true or null = true\n"
+ "- null and true = true\n"
+ "- false and null = null\n"
+ "- null and false = null\n"
+ "- null and null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'or' true is always true.\n"
+ "For a different null behavior, see function \"and\"."),
+ {"x", "y"}};
+
} // namespace
namespace internal {
void RegisterScalarBoolean(FunctionRegistry* registry) {
// These functions can write into sliced output bitmaps
- MakeFunction("invert", 1, applicator::SimpleUnary<InvertOp>, &invert_doc, registry);
- MakeFunction("and", 2, applicator::SimpleBinary<AndOp>, &and_doc, registry);
- MakeFunction("and_not", 2, applicator::SimpleBinary<AndNotOp>, &and_not_doc, registry);
- MakeFunction("or", 2, applicator::SimpleBinary<OrOp>, &or_doc, registry);
- MakeFunction("xor", 2, applicator::SimpleBinary<XorOp>, &xor_doc, registry);
-
- MakeFunction("and_kleene", 2, applicator::SimpleBinary<KleeneAndOp>, &and_kleene_doc,
- registry, NullHandling::COMPUTED_PREALLOCATE);
- MakeFunction("and_not_kleene", 2, applicator::SimpleBinary<KleeneAndNotOp>,
- &and_not_kleene_doc, registry, NullHandling::COMPUTED_PREALLOCATE);
- MakeFunction("or_kleene", 2, applicator::SimpleBinary<KleeneOrOp>, &or_kleene_doc,
- registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("invert", 1, applicator::SimpleUnary<InvertOp>, &invert_doc, registry);
+ MakeFunction("and", 2, applicator::SimpleBinary<AndOp>, &and_doc, registry);
+ MakeFunction("and_not", 2, applicator::SimpleBinary<AndNotOp>, &and_not_doc, registry);
+ MakeFunction("or", 2, applicator::SimpleBinary<OrOp>, &or_doc, registry);
+ MakeFunction("xor", 2, applicator::SimpleBinary<XorOp>, &xor_doc, registry);
+
+ MakeFunction("and_kleene", 2, applicator::SimpleBinary<KleeneAndOp>, &and_kleene_doc,
+ registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("and_not_kleene", 2, applicator::SimpleBinary<KleeneAndNotOp>,
+ &and_not_kleene_doc, registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("or_kleene", 2, applicator::SimpleBinary<KleeneOrOp>, &or_kleene_doc,
+ registry, NullHandling::COMPUTED_PREALLOCATE);
}
} // namespace internal
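
The practical difference between the "and"/"or" kernels and their *_kleene variants registered above shows only on null inputs: the plain kernels null out any row with a null operand, while the Kleene kernels keep a definite answer whenever one is forced (false dominates 'and', true dominates 'or'; in bitmap terms, valid = left_false | right_false | (left_true & right_true) for Kleene 'and', as in the compute_word lambda above). A minimal sketch (assumes only Arrow's public compute API; Example() is a hypothetical wrapper):

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <iostream>

// Hypothetical standalone sketch; relies only on Arrow's public compute API.
arrow::Status Example() {
  arrow::BooleanBuilder lb, rb;
  // left = [true, false, null], right = [null, null, null]
  ARROW_RETURN_NOT_OK(lb.Append(true));
  ARROW_RETURN_NOT_OK(lb.Append(false));
  ARROW_RETURN_NOT_OK(lb.AppendNull());
  ARROW_RETURN_NOT_OK(rb.AppendNulls(3));
  std::shared_ptr<arrow::Array> left, right;
  ARROW_RETURN_NOT_OK(lb.Finish(&left));
  ARROW_RETURN_NOT_OK(rb.Finish(&right));

  // "and": a null in either input nulls the output row.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum plain,
                        arrow::compute::CallFunction("and", {left, right}));
  // -> [null, null, null]

  // "and_kleene": false and null = false, so only genuinely unknown rows stay null.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum kleene,
                        arrow::compute::CallFunction("and_kleene", {left, right}));
  // -> [null, false, null]
  std::cout << plain.make_array()->ToString() << "\n"
            << kleene.make_array()->ToString() << std::endl;
  return arrow::Status::OK();
}
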
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
index dad94c1ace7..daee9cff79a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
@@ -17,7 +17,7 @@
// Cast types to boolean
-#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_primitive.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/util/value_parsing.h"
@@ -31,17 +31,17 @@ namespace internal {
struct IsNonZero {
template <typename OutValue, typename Arg0Value>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
return val != 0;
}
};
struct ParseBooleanString {
template <typename OutValue, typename Arg0Value>
- static OutValue Call(KernelContext*, Arg0Value val, Status* st) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status* st) {
bool result = false;
if (ARROW_PREDICT_FALSE(!ParseValue<BooleanType>(val.data(), val.size(), &result))) {
- *st = Status::Invalid("Failed to parse value: ", val);
+ *st = Status::Invalid("Failed to parse value: ", val);
}
return result;
}
@@ -50,7 +50,7 @@ struct ParseBooleanString {
std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts() {
auto func = std::make_shared<CastFunction>("cast_boolean", Type::BOOL);
AddCommonCasts(Type::BOOL, boolean(), func.get());
- AddZeroCopyCast(Type::BOOL, boolean(), boolean(), func.get());
+ AddZeroCopyCast(Type::BOOL, boolean(), boolean(), func.get());
for (const auto& ty : NumericTypes()) {
ArrayKernelExec exec =
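
The boolean cast wired up here maps numeric inputs through IsNonZero and string inputs through ParseBooleanString (returning Status::Invalid on anything unparsable). A minimal sketch of the numeric path (assumes only the public compute::Cast API; Example() is a hypothetical wrapper):

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <iostream>

// Hypothetical standalone sketch; relies only on Arrow's public compute API.
arrow::Status Example() {
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({0, 7, -1}));
  std::shared_ptr<arrow::Array> ints;
  ARROW_RETURN_NOT_OK(builder.Finish(&ints));

  // Non-zero values become true, zero becomes false.
  ARROW_ASSIGN_OR_RAISE(auto as_bool,
                        arrow::compute::Cast(*ints, arrow::boolean()));
  std::cout << as_bool->ToString() << std::endl;  // [false, true, true]
  return arrow::Status::OK();
}
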
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
index b1e1164fd34..b8be4d78696 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
@@ -1,126 +1,126 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Implementation of casting to dictionary type
-
-#include <arrow/util/bitmap_ops.h>
-#include <arrow/util/checked_cast.h>
-
-#include "arrow/array/builder_primitive.h"
-#include "arrow/compute/cast_internal.h"
-#include "arrow/compute/kernels/scalar_cast_internal.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/util/int_util.h"
-
-namespace arrow {
-using internal::CopyBitmap;
-
-namespace compute {
-namespace internal {
-
-Status CastDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const CastOptions& options = CastState::Get(ctx);
- auto out_type = std::static_pointer_cast<DictionaryType>(out->type());
-
- // if out type is same as in type, return input
- if (out_type->Equals(batch[0].type())) {
- *out = batch[0];
- return Status::OK();
- }
-
- if (batch[0].is_scalar()) { // if input is scalar
- auto in_scalar = checked_cast<const DictionaryScalar&>(*batch[0].scalar());
-
- // if invalid scalar, return null scalar
- if (!in_scalar.is_valid) {
- *out = MakeNullScalar(out_type);
- return Status::OK();
- }
-
- Datum casted_index, casted_dict;
- if (in_scalar.value.index->type->Equals(out_type->index_type())) {
- casted_index = in_scalar.value.index;
- } else {
- ARROW_ASSIGN_OR_RAISE(casted_index,
- Cast(in_scalar.value.index, out_type->index_type(), options,
- ctx->exec_context()));
- }
-
- if (in_scalar.value.dictionary->type()->Equals(out_type->value_type())) {
- casted_dict = in_scalar.value.dictionary;
- } else {
- ARROW_ASSIGN_OR_RAISE(
- casted_dict, Cast(in_scalar.value.dictionary, out_type->value_type(), options,
- ctx->exec_context()));
- }
-
- *out = std::static_pointer_cast<Scalar>(
- DictionaryScalar::Make(casted_index.scalar(), casted_dict.make_array()));
-
- return Status::OK();
- }
-
- // if input is array
- const std::shared_ptr<ArrayData>& in_array = batch[0].array();
- const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
-
- ArrayData* out_array = out->mutable_array();
-
- if (in_type.index_type()->Equals(out_type->index_type())) {
- out_array->buffers[0] = in_array->buffers[0];
- out_array->buffers[1] = in_array->buffers[1];
- out_array->null_count = in_array->GetNullCount();
- out_array->offset = in_array->offset;
- } else {
- // for indices, create a dummy ArrayData with index_type()
- const std::shared_ptr<ArrayData>& indices_arr =
- ArrayData::Make(in_type.index_type(), in_array->length, in_array->buffers,
- in_array->GetNullCount(), in_array->offset);
- ARROW_ASSIGN_OR_RAISE(auto casted_indices, Cast(indices_arr, out_type->index_type(),
- options, ctx->exec_context()));
- out_array->buffers[0] = std::move(casted_indices.array()->buffers[0]);
- out_array->buffers[1] = std::move(casted_indices.array()->buffers[1]);
- }
-
- // data (dict)
- if (in_type.value_type()->Equals(out_type->value_type())) {
- out_array->dictionary = in_array->dictionary;
- } else {
- const std::shared_ptr<Array>& dict_arr = MakeArray(in_array->dictionary);
- ARROW_ASSIGN_OR_RAISE(auto casted_data, Cast(dict_arr, out_type->value_type(),
- options, ctx->exec_context()));
- out_array->dictionary = casted_data.array();
- }
- return Status::OK();
-}
-
-std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts() {
- auto func = std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
-
- AddCommonCasts(Type::DICTIONARY, kOutputTargetType, func.get());
- ScalarKernel kernel({InputType(Type::DICTIONARY)}, kOutputTargetType, CastDictionary);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
-
- DCHECK_OK(func->AddKernel(Type::DICTIONARY, std::move(kernel)));
-
- return {func};
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implementation of casting to dictionary type
+
+#include <arrow/util/bitmap_ops.h>
+#include <arrow/util/checked_cast.h>
+
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/cast_internal.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/util/int_util.h"
+
+namespace arrow {
+using internal::CopyBitmap;
+
+namespace compute {
+namespace internal {
+
+Status CastDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const CastOptions& options = CastState::Get(ctx);
+ auto out_type = std::static_pointer_cast<DictionaryType>(out->type());
+
+ // if out type is same as in type, return input
+ if (out_type->Equals(batch[0].type())) {
+ *out = batch[0];
+ return Status::OK();
+ }
+
+ if (batch[0].is_scalar()) { // if input is scalar
+ auto in_scalar = checked_cast<const DictionaryScalar&>(*batch[0].scalar());
+
+ // if invalid scalar, return null scalar
+ if (!in_scalar.is_valid) {
+ *out = MakeNullScalar(out_type);
+ return Status::OK();
+ }
+
+ Datum casted_index, casted_dict;
+ if (in_scalar.value.index->type->Equals(out_type->index_type())) {
+ casted_index = in_scalar.value.index;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(casted_index,
+ Cast(in_scalar.value.index, out_type->index_type(), options,
+ ctx->exec_context()));
+ }
+
+ if (in_scalar.value.dictionary->type()->Equals(out_type->value_type())) {
+ casted_dict = in_scalar.value.dictionary;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ casted_dict, Cast(in_scalar.value.dictionary, out_type->value_type(), options,
+ ctx->exec_context()));
+ }
+
+ *out = std::static_pointer_cast<Scalar>(
+ DictionaryScalar::Make(casted_index.scalar(), casted_dict.make_array()));
+
+ return Status::OK();
+ }
+
+ // if input is array
+ const std::shared_ptr<ArrayData>& in_array = batch[0].array();
+ const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
+
+ ArrayData* out_array = out->mutable_array();
+
+ if (in_type.index_type()->Equals(out_type->index_type())) {
+ out_array->buffers[0] = in_array->buffers[0];
+ out_array->buffers[1] = in_array->buffers[1];
+ out_array->null_count = in_array->GetNullCount();
+ out_array->offset = in_array->offset;
+ } else {
+ // for indices, create a dummy ArrayData with index_type()
+ const std::shared_ptr<ArrayData>& indices_arr =
+ ArrayData::Make(in_type.index_type(), in_array->length, in_array->buffers,
+ in_array->GetNullCount(), in_array->offset);
+ ARROW_ASSIGN_OR_RAISE(auto casted_indices, Cast(indices_arr, out_type->index_type(),
+ options, ctx->exec_context()));
+ out_array->buffers[0] = std::move(casted_indices.array()->buffers[0]);
+ out_array->buffers[1] = std::move(casted_indices.array()->buffers[1]);
+ }
+
+ // data (dict)
+ if (in_type.value_type()->Equals(out_type->value_type())) {
+ out_array->dictionary = in_array->dictionary;
+ } else {
+ const std::shared_ptr<Array>& dict_arr = MakeArray(in_array->dictionary);
+ ARROW_ASSIGN_OR_RAISE(auto casted_data, Cast(dict_arr, out_type->value_type(),
+ options, ctx->exec_context()));
+ out_array->dictionary = casted_data.array();
+ }
+ return Status::OK();
+}
+
+std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts() {
+ auto func = std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
+
+ AddCommonCasts(Type::DICTIONARY, kOutputTargetType, func.get());
+ ScalarKernel kernel({InputType(Type::DICTIONARY)}, kOutputTargetType, CastDictionary);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+
+ DCHECK_OK(func->AddKernel(Type::DICTIONARY, std::move(kernel)));
+
+ return {func};
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
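
CastDictionary above casts the index buffers and the dictionary values independently, so a dictionary-to-dictionary cast can change either side (or both) in one call. A minimal sketch (assumes only the public compute::Cast API; Example() is a hypothetical wrapper):

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <iostream>

// Hypothetical standalone sketch; relies only on Arrow's public compute API.
arrow::Status Example() {
  // Dictionary-encoded array ["a", "b", "a"] with string values.
  arrow::DictionaryBuilder<arrow::StringType> builder;
  ARROW_RETURN_NOT_OK(builder.Append("a"));
  ARROW_RETURN_NOT_OK(builder.Append("b"));
  ARROW_RETURN_NOT_OK(builder.Append("a"));
  std::shared_ptr<arrow::Array> dict_arr;
  ARROW_RETURN_NOT_OK(builder.Finish(&dict_arr));

  // Re-encode with int8 indices and large_utf8 values; indices and
  // dictionary are cast separately, as in CastDictionary above.
  auto to_type = arrow::dictionary(arrow::int8(), arrow::large_utf8());
  ARROW_ASSIGN_OR_RAISE(auto casted, arrow::compute::Cast(*dict_arr, to_type));
  std::cout << casted->ToString() << std::endl;
  return arrow::Status::OK();
}
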
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
index 198c82bd97e..660250359c4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
@@ -148,40 +148,40 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Dat
// ----------------------------------------------------------------------
-Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK(out->is_array());
-
+Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
+
DictionaryArray dict_arr(batch[0].array());
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
const auto& dict_type = *dict_arr.dictionary()->type();
- if (!dict_type.Equals(options.to_type) && !CanCast(dict_type, *options.to_type)) {
- return Status::Invalid("Cast type ", options.to_type->ToString(),
- " incompatible with dictionary type ", dict_type.ToString());
+ if (!dict_type.Equals(options.to_type) && !CanCast(dict_type, *options.to_type)) {
+ return Status::Invalid("Cast type ", options.to_type->ToString(),
+ " incompatible with dictionary type ", dict_type.ToString());
}
- ARROW_ASSIGN_OR_RAISE(*out,
- Take(Datum(dict_arr.dictionary()), Datum(dict_arr.indices()),
- TakeOptions::Defaults(), ctx->exec_context()));
-
- if (!dict_type.Equals(options.to_type)) {
- ARROW_ASSIGN_OR_RAISE(*out, Cast(*out, options));
+ ARROW_ASSIGN_OR_RAISE(*out,
+ Take(Datum(dict_arr.dictionary()), Datum(dict_arr.indices()),
+ TakeOptions::Defaults(), ctx->exec_context()));
+
+ if (!dict_type.Equals(options.to_type)) {
+ ARROW_ASSIGN_OR_RAISE(*out, Cast(*out, options));
}
- return Status::OK();
+ return Status::OK();
}
-Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (out->is_scalar()) {
- out->scalar()->is_valid = false;
- } else {
- ArrayData* output = out->mutable_array();
- output->buffers = {nullptr};
- output->null_count = batch.length;
- }
- return Status::OK();
+Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (out->is_scalar()) {
+ out->scalar()->is_valid = false;
+ } else {
+ ArrayData* output = out->mutable_array();
+ output->buffers = {nullptr};
+ output->null_count = batch.length;
+ }
+ return Status::OK();
}
-Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const CastOptions& options = checked_cast<const CastState*>(ctx->state())->options;
const DataType& in_type = *batch[0].type();
@@ -190,20 +190,20 @@ Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out)
ExtensionArray extension(batch[0].array());
Datum casted_storage;
- RETURN_NOT_OK(Cast(*extension.storage(), out->type(), options, ctx->exec_context())
- .Value(&casted_storage));
+ RETURN_NOT_OK(Cast(*extension.storage(), out->type(), options, ctx->exec_context())
+ .Value(&casted_storage));
out->value = casted_storage.array();
- return Status::OK();
+ return Status::OK();
}
-Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (!batch[0].is_scalar()) {
- ArrayData* output = out->mutable_array();
- std::shared_ptr<Array> nulls;
- RETURN_NOT_OK(MakeArrayOfNull(output->type, batch.length).Value(&nulls));
- out->value = nulls->data();
- }
- return Status::OK();
+Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (!batch[0].is_scalar()) {
+ ArrayData* output = out->mutable_array();
+ std::shared_ptr<Array> nulls;
+ RETURN_NOT_OK(MakeArrayOfNull(output->type, batch.length).Value(&nulls));
+ out->value = nulls->data();
+ }
+ return Status::OK();
}
Result<ValueDescr> ResolveOutputFromOptions(KernelContext* ctx,
@@ -223,25 +223,25 @@ Result<ValueDescr> ResolveOutputFromOptions(KernelContext* ctx,
OutputType kOutputTargetType(ResolveOutputFromOptions);
-Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- // Make a copy of the buffers into a destination array without carrying
- // the type
- const ArrayData& input = *batch[0].array();
- ArrayData* output = out->mutable_array();
- output->length = input.length;
- output->SetNullCount(input.null_count);
- output->buffers = input.buffers;
- output->offset = input.offset;
- output->child_data = input.child_data;
- return Status::OK();
+Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+ // Make a copy of the buffers into a destination array without carrying
+ // the type
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+ output->buffers = input.buffers;
+ output->offset = input.offset;
+ output->child_data = input.child_data;
+ return Status::OK();
}
void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_type,
CastFunction* func) {
auto sig = KernelSignature::Make({in_type}, out_type);
ScalarKernel kernel;
- kernel.exec = TrivialScalarUnaryAsArraysExec(ZeroCopyCastExec);
+ kernel.exec = TrivialScalarUnaryAsArraysExec(ZeroCopyCastExec);
kernel.signature = sig;
kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
@@ -255,12 +255,12 @@ static bool CanCastFromDictionary(Type::type type_id) {
void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* func) {
// From null to this type
- ScalarKernel kernel;
- kernel.exec = CastFromNull;
- kernel.signature = KernelSignature::Make({null()}, out_ty);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- DCHECK_OK(func->AddKernel(Type::NA, std::move(kernel)));
+ ScalarKernel kernel;
+ kernel.exec = CastFromNull;
+ kernel.signature = KernelSignature::Make({null()}, out_ty);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(Type::NA, std::move(kernel)));
// From dictionary to this type
if (CanCastFromDictionary(out_type_id)) {
@@ -268,10 +268,10 @@ void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* fun
//
// XXX: Uses Take and does its own memory allocation for the moment. We can
// fix this later.
- DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, out_ty,
- TrivialScalarUnaryAsArraysExec(UnpackDictionary),
- NullHandling::COMPUTED_NO_PREALLOCATE,
- MemAllocation::NO_PREALLOCATE));
+ DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, out_ty,
+ TrivialScalarUnaryAsArraysExec(UnpackDictionary),
+ NullHandling::COMPUTED_NO_PREALLOCATE,
+ MemAllocation::NO_PREALLOCATE));
}
// From extension type to this type
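
Observable from the outside, UnpackDictionary above is Take followed by an optional Cast: the indices gather the dictionary values, and a second cast runs only when the dictionary value type differs from the requested output type. A minimal sketch of that equivalence against the public API; DecodeThenCast is an illustrative name:

// Sketch only: what UnpackDictionary does, expressed with public APIs.
#include "arrow/api.h"
#include "arrow/compute/api.h"
#include "arrow/result.h"

arrow::Result<arrow::Datum> DecodeThenCast(
    const arrow::DictionaryArray& dict_arr,
    const std::shared_ptr<arrow::DataType>& to_type) {
  // Take(values, indices) materializes the dictionary-encoded values.
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum decoded,
      arrow::compute::Take(arrow::Datum(dict_arr.dictionary()),
                           arrow::Datum(dict_arr.indices())));
  // A second cast is needed only when the dictionary value type differs
  // from the requested output type.
  if (!dict_arr.dictionary()->type()->Equals(*to_type)) {
    return arrow::compute::Cast(decoded, to_type);
  }
  return decoded;
}
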
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
index 2419d898a68..bffa64988a6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
@@ -21,7 +21,7 @@
#include "arrow/compute/cast.h" // IWYU pragma: export
#include "arrow/compute/cast_internal.h" // IWYU pragma: export
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
namespace arrow {
@@ -37,10 +37,10 @@ struct CastFunctor {};
template <typename O, typename I>
struct CastFunctor<
O, I, enable_if_t<std::is_same<O, I>::value && is_parameter_free_type<I>::value>> {
- static Status Exec(KernelContext*, const ExecBatch&, Datum*) { return Status::OK(); }
+ static Status Exec(KernelContext*, const ExecBatch&, Datum*) { return Status::OK(); }
};
-Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out);
// Utility for numeric casts
void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Datum& input,
@@ -49,23 +49,23 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Dat
// ----------------------------------------------------------------------
// Dictionary to other things
-Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out);
-Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
-Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
-// Adds a cast function where CastFunctor is specialized and the input and output
-// types are parameter free (have a type_singleton). Scalar inputs are handled by
-// wrapping with TrivialScalarUnaryAsArraysExec.
+// Adds a cast function where CastFunctor is specialized and the input and output
+// types are parameter free (have a type_singleton). Scalar inputs are handled by
+// wrapping with TrivialScalarUnaryAsArraysExec.
template <typename InType, typename OutType>
void AddSimpleCast(InputType in_ty, OutputType out_ty, CastFunction* func) {
- DCHECK_OK(func->AddKernel(
- InType::type_id, {in_ty}, out_ty,
- TrivialScalarUnaryAsArraysExec(CastFunctor<OutType, InType>::Exec)));
+ DCHECK_OK(func->AddKernel(
+ InType::type_id, {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(CastFunctor<OutType, InType>::Exec)));
}
-Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out);
void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_type,
CastFunction* func);
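
As the comment above notes, AddSimpleCast applies only when CastFunctor<OutType, InType> is specialized and both types have a type_singleton. A hypothetical registration, assuming the internal CastFunction machinery shown in this patch and the Date32 -> Date64 functor specialized in scalar_cast_temporal.cc below:

// Sketch only: wiring a parameter-free kernel through AddSimpleCast.
auto func = std::make_shared<CastFunction>("cast_date64", Type::DATE64);
AddSimpleCast<Date32Type, Date64Type>(date32(), date64(), func.get());
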
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
index ec92dbb5d60..c5fccf30311 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
@@ -20,74 +20,74 @@
#include <utility>
#include <vector>
-#include "arrow/array/builder_nested.h"
-#include "arrow/compute/api_scalar.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/compute/api_scalar.h"
#include "arrow/compute/cast.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
-#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_ops.h"
namespace arrow {
-
-using internal::CopyBitmap;
-
+
+using internal::CopyBitmap;
+
namespace compute {
namespace internal {
template <typename Type>
-Status CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- using offset_type = typename Type::offset_type;
- using ScalarType = typename TypeTraits<Type>::ScalarType;
-
- const CastOptions& options = CastState::Get(ctx);
-
- auto child_type = checked_cast<const Type&>(*out->type()).value_type();
-
- if (out->kind() == Datum::SCALAR) {
- const auto& in_scalar = checked_cast<const ScalarType&>(*batch[0].scalar());
- auto out_scalar = checked_cast<ScalarType*>(out->scalar().get());
-
- DCHECK(!out_scalar->is_valid);
- if (in_scalar.is_valid) {
- ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, options,
- ctx->exec_context()));
-
- out_scalar->is_valid = true;
- }
- return Status::OK();
- }
-
- const ArrayData& in_array = *batch[0].array();
- ArrayData* out_array = out->mutable_array();
-
- // Copy from parent
- out_array->buffers = in_array.buffers;
- Datum values = in_array.child_data[0];
-
- if (in_array.offset != 0) {
- if (in_array.buffers[0]) {
- ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
- CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(),
- in_array.offset, in_array.length));
- }
- ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
- ctx->Allocate(sizeof(offset_type) * (in_array.length + 1)));
-
- auto offsets = in_array.GetValues<offset_type>(1);
- auto shifted_offsets = out_array->GetMutableValues<offset_type>(1);
-
- for (int64_t i = 0; i < in_array.length + 1; ++i) {
- shifted_offsets[i] = offsets[i] - offsets[0];
- }
- values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]);
+Status CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using offset_type = typename Type::offset_type;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+
+ const CastOptions& options = CastState::Get(ctx);
+
+ auto child_type = checked_cast<const Type&>(*out->type()).value_type();
+
+ if (out->kind() == Datum::SCALAR) {
+ const auto& in_scalar = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto out_scalar = checked_cast<ScalarType*>(out->scalar().get());
+
+ DCHECK(!out_scalar->is_valid);
+ if (in_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, options,
+ ctx->exec_context()));
+
+ out_scalar->is_valid = true;
+ }
+ return Status::OK();
}
- ARROW_ASSIGN_OR_RAISE(Datum cast_values,
- Cast(values, child_type, options, ctx->exec_context()));
-
- DCHECK_EQ(Datum::ARRAY, cast_values.kind());
- out_array->child_data.push_back(cast_values.array());
- return Status::OK();
+ const ArrayData& in_array = *batch[0].array();
+ ArrayData* out_array = out->mutable_array();
+
+ // Copy from parent
+ out_array->buffers = in_array.buffers;
+ Datum values = in_array.child_data[0];
+
+ if (in_array.offset != 0) {
+ if (in_array.buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
+ CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(),
+ in_array.offset, in_array.length));
+ }
+ ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
+ ctx->Allocate(sizeof(offset_type) * (in_array.length + 1)));
+
+ auto offsets = in_array.GetValues<offset_type>(1);
+ auto shifted_offsets = out_array->GetMutableValues<offset_type>(1);
+
+ for (int64_t i = 0; i < in_array.length + 1; ++i) {
+ shifted_offsets[i] = offsets[i] - offsets[0];
+ }
+ values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+ Cast(values, child_type, options, ctx->exec_context()));
+
+ DCHECK_EQ(Datum::ARRAY, cast_values.kind());
+ out_array->child_data.push_back(cast_values.array());
+ return Status::OK();
}
template <typename Type>
@@ -120,12 +120,12 @@ std::vector<std::shared_ptr<CastFunction>> GetNestedCasts() {
auto cast_struct = std::make_shared<CastFunction>("cast_struct", Type::STRUCT);
AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get());
- // So is dictionary
- auto cast_dictionary =
- std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
- AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get());
-
- return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary};
+ // So is dictionary
+ auto cast_dictionary =
+ std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
+ AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get());
+
+ return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary};
}
} // namespace internal
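
The offset-rebasing step in CastListExec above is the subtle part: a sliced list array's offsets need not start at zero, so the kernel writes them shifted down by offsets[0] and slices the child values to the covered range before casting. A standalone sketch of that invariant, using plain C++ with hypothetical data:

// Sketch only: the offset rebase CastListExec performs for a sliced list.
#include <cstdint>
#include <vector>

std::vector<int32_t> RebaseOffsets(const std::vector<int32_t>& offsets) {
  // offsets has length + 1 entries; after rebasing, the first entry is 0 and
  // the last entry is the number of child values covered by the slice.
  std::vector<int32_t> shifted(offsets.size());
  for (size_t i = 0; i < offsets.size(); ++i) {
    shifted[i] = offsets[i] - offsets[0];
  }
  return shifted;
}
// e.g. {5, 7, 10} (a slice whose values start at child index 5) becomes
// {0, 2, 5}, and the child array is sliced to the covered range [5, 10).
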
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
index cc7b533f262..4ada0b08afe 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -17,10 +17,10 @@
// Implementation of casting to integer, floating point, or decimal types
-#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_primitive.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
-#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/int_util.h"
#include "arrow/util/value_parsing.h"
@@ -36,18 +36,18 @@ using internal::ParseValue;
namespace compute {
namespace internal {
-Status CastIntegerToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastIntegerToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
if (!options.allow_int_overflow) {
- RETURN_NOT_OK(IntegersCanFit(batch[0], *out->type()));
+ RETURN_NOT_OK(IntegersCanFit(batch[0], *out->type()));
}
CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
- return Status::OK();
+ return Status::OK();
}
-Status CastFloatingToFloating(KernelContext*, const ExecBatch& batch, Datum* out) {
+Status CastFloatingToFloating(KernelContext*, const ExecBatch& batch, Datum* out) {
CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -65,7 +65,7 @@ Status CheckFloatTruncation(const Datum& input, const Datum& output) {
return is_valid && static_cast<InT>(out_val) != in_val;
};
auto GetErrorMessage = [&](InT val) {
- return Status::Invalid("Float value ", val, " was truncated converting to ",
+ return Status::Invalid("Float value ", val, " was truncated converting to ",
*output.type());
};
@@ -170,13 +170,13 @@ Status CheckFloatToIntTruncation(const Datum& input, const Datum& output) {
return Status::OK();
}
-Status CastFloatingToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastFloatingToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
if (!options.allow_float_truncate) {
- RETURN_NOT_OK(CheckFloatToIntTruncation(batch[0], *out));
+ RETURN_NOT_OK(CheckFloatToIntTruncation(batch[0], *out));
}
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -249,14 +249,14 @@ Status CheckForIntegerToFloatingTruncation(const Datum& input, Type::type out_ty
return Status::OK();
}
-Status CastIntegerToFloating(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastIntegerToFloating(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
Type::type out_type = out->type()->id();
if (!options.allow_float_truncate) {
- RETURN_NOT_OK(CheckForIntegerToFloatingTruncation(batch[0], out_type));
+ RETURN_NOT_OK(CheckForIntegerToFloatingTruncation(batch[0], out_type));
}
CastNumberToNumberUnsafe(batch[0].type()->id(), out_type, batch[0], out);
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -264,7 +264,7 @@ Status CastIntegerToFloating(KernelContext* ctx, const ExecBatch& batch, Datum*
struct BooleanToNumber {
template <typename OutValue, typename Arg0Value>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
constexpr auto kOne = static_cast<OutValue>(1);
constexpr auto kZero = static_cast<OutValue>(0);
return val ? kOne : kZero;
@@ -273,9 +273,9 @@ struct BooleanToNumber {
template <typename O>
struct CastFunctor<O, BooleanType, enable_if_number<O>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return applicator::ScalarUnary<O, BooleanType, BooleanToNumber>::Exec(ctx, batch,
- out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return applicator::ScalarUnary<O, BooleanType, BooleanToNumber>::Exec(ctx, batch,
+ out);
}
};
@@ -285,11 +285,11 @@ struct CastFunctor<O, BooleanType, enable_if_number<O>> {
template <typename OutType>
struct ParseString {
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
OutValue result = OutValue(0);
if (ARROW_PREDICT_FALSE(!ParseValue<OutType>(val.data(), val.size(), &result))) {
- *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
- TypeTraits<OutType>::type_singleton()->ToString());
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ TypeTraits<OutType>::type_singleton()->ToString());
}
return result;
}
@@ -297,8 +297,8 @@ struct ParseString {
template <typename O, typename I>
struct CastFunctor<O, I, enable_if_base_binary<I>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
}
};
@@ -306,13 +306,13 @@ struct CastFunctor<O, I, enable_if_base_binary<I>> {
// Decimal to integer
struct DecimalToIntegerMixin {
- template <typename OutValue, typename Arg0Value>
- OutValue ToInteger(KernelContext* ctx, const Arg0Value& val, Status* st) const {
+ template <typename OutValue, typename Arg0Value>
+ OutValue ToInteger(KernelContext* ctx, const Arg0Value& val, Status* st) const {
constexpr auto min_value = std::numeric_limits<OutValue>::min();
constexpr auto max_value = std::numeric_limits<OutValue>::max();
if (!allow_int_overflow_ && ARROW_PREDICT_FALSE(val < min_value || val > max_value)) {
- *st = Status::Invalid("Integer value out of bounds");
+ *st = Status::Invalid("Integer value out of bounds");
return OutValue{}; // Zero
} else {
return static_cast<OutValue>(val.low_bits());
@@ -330,8 +330,8 @@ struct UnsafeUpscaleDecimalToInteger : public DecimalToIntegerMixin {
using DecimalToIntegerMixin::DecimalToIntegerMixin;
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
- return ToInteger<OutValue>(ctx, val.IncreaseScaleBy(-in_scale_), st);
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ return ToInteger<OutValue>(ctx, val.IncreaseScaleBy(-in_scale_), st);
}
};
@@ -339,8 +339,8 @@ struct UnsafeDownscaleDecimalToInteger : public DecimalToIntegerMixin {
using DecimalToIntegerMixin::DecimalToIntegerMixin;
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
- return ToInteger<OutValue>(ctx, val.ReduceScaleBy(in_scale_, false), st);
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ return ToInteger<OutValue>(ctx, val.ReduceScaleBy(in_scale_, false), st);
}
};
@@ -348,44 +348,44 @@ struct SafeRescaleDecimalToInteger : public DecimalToIntegerMixin {
using DecimalToIntegerMixin::DecimalToIntegerMixin;
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
auto result = val.Rescale(in_scale_, 0);
if (ARROW_PREDICT_FALSE(!result.ok())) {
- *st = result.status();
+ *st = result.status();
return OutValue{}; // Zero
} else {
- return ToInteger<OutValue>(ctx, *result, st);
+ return ToInteger<OutValue>(ctx, *result, st);
}
}
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_integer_type<O>::value && is_decimal_type<I>::value>> {
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_integer_type<O>::value && is_decimal_type<I>::value>> {
using out_type = typename O::c_type;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
- const auto& in_type_inst = checked_cast<const I&>(*batch[0].type());
+ const auto& in_type_inst = checked_cast<const I&>(*batch[0].type());
const auto in_scale = in_type_inst.scale();
if (options.allow_decimal_truncate) {
if (in_scale < 0) {
// Unsafe upscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimalToInteger>
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimalToInteger>
kernel(UnsafeUpscaleDecimalToInteger{in_scale, options.allow_int_overflow});
return kernel.Exec(ctx, batch, out);
} else {
// Unsafe downscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimalToInteger>
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimalToInteger>
kernel(UnsafeDownscaleDecimalToInteger{in_scale, options.allow_int_overflow});
return kernel.Exec(ctx, batch, out);
}
} else {
// Safe rescale
- applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimalToInteger> kernel(
- SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow});
+ applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimalToInteger> kernel(
+ SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow});
return kernel.Exec(ctx, batch, out);
}
}
@@ -394,104 +394,104 @@ struct CastFunctor<O, I,
// ----------------------------------------------------------------------
// Decimal to decimal
-// Helper that converts the input and output decimals
-// For instance, Decimal128 -> Decimal256 requires converting, then scaling
-// Decimal256 -> Decimal128 requires scaling, then truncating
-template <typename OutDecimal, typename InDecimal>
-struct DecimalConversions {};
-
-template <typename InDecimal>
-struct DecimalConversions<Decimal256, InDecimal> {
- // Convert then scale
- static Decimal256 ConvertInput(InDecimal&& val) { return Decimal256(val); }
- static Decimal256 ConvertOutput(Decimal256&& val) { return val; }
-};
-
-template <>
-struct DecimalConversions<Decimal128, Decimal256> {
- // Scale then truncate
- static Decimal256 ConvertInput(Decimal256&& val) { return val; }
- static Decimal128 ConvertOutput(Decimal256&& val) {
- return Decimal128(val.little_endian_array()[1], val.little_endian_array()[0]);
- }
-};
-
-template <>
-struct DecimalConversions<Decimal128, Decimal128> {
- static Decimal128 ConvertInput(Decimal128&& val) { return val; }
- static Decimal128 ConvertOutput(Decimal128&& val) { return val; }
-};
-
-struct UnsafeUpscaleDecimal {
- template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status*) const {
- using Conv = DecimalConversions<OutValue, Arg0Value>;
- return Conv::ConvertOutput(Conv::ConvertInput(std::move(val)).IncreaseScaleBy(by_));
- }
- int32_t by_;
+// Helper that converts the input and output decimals
+// For instance, Decimal128 -> Decimal256 requires converting, then scaling
+// Decimal256 -> Decimal128 requires scaling, then truncating
+template <typename OutDecimal, typename InDecimal>
+struct DecimalConversions {};
+
+template <typename InDecimal>
+struct DecimalConversions<Decimal256, InDecimal> {
+ // Convert then scale
+ static Decimal256 ConvertInput(InDecimal&& val) { return Decimal256(val); }
+ static Decimal256 ConvertOutput(Decimal256&& val) { return val; }
+};
+
+template <>
+struct DecimalConversions<Decimal128, Decimal256> {
+ // Scale then truncate
+ static Decimal256 ConvertInput(Decimal256&& val) { return val; }
+ static Decimal128 ConvertOutput(Decimal256&& val) {
+ return Decimal128(val.little_endian_array()[1], val.little_endian_array()[0]);
+ }
+};
+
+template <>
+struct DecimalConversions<Decimal128, Decimal128> {
+ static Decimal128 ConvertInput(Decimal128&& val) { return val; }
+ static Decimal128 ConvertOutput(Decimal128&& val) { return val; }
};
+struct UnsafeUpscaleDecimal {
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status*) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ return Conv::ConvertOutput(Conv::ConvertInput(std::move(val)).IncreaseScaleBy(by_));
+ }
+ int32_t by_;
+};
+
struct UnsafeDownscaleDecimal {
- template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status*) const {
- using Conv = DecimalConversions<OutValue, Arg0Value>;
- return Conv::ConvertOutput(
- Conv::ConvertInput(std::move(val)).ReduceScaleBy(by_, false));
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status*) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ return Conv::ConvertOutput(
+ Conv::ConvertInput(std::move(val)).ReduceScaleBy(by_, false));
}
- int32_t by_;
+ int32_t by_;
};
struct SafeRescaleDecimal {
- template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
- using Conv = DecimalConversions<OutValue, Arg0Value>;
- auto maybe_rescaled =
- Conv::ConvertInput(std::move(val)).Rescale(in_scale_, out_scale_);
- if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) {
- *st = maybe_rescaled.status();
- return {}; // Zero
- }
-
- if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) {
- return Conv::ConvertOutput(maybe_rescaled.MoveValueUnsafe());
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ auto maybe_rescaled =
+ Conv::ConvertInput(std::move(val)).Rescale(in_scale_, out_scale_);
+ if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) {
+ *st = maybe_rescaled.status();
+ return {}; // Zero
}
-
- *st = Status::Invalid("Decimal value does not fit in precision ", out_precision_);
- return {}; // Zero
+
+ if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) {
+ return Conv::ConvertOutput(maybe_rescaled.MoveValueUnsafe());
+ }
+
+ *st = Status::Invalid("Decimal value does not fit in precision ", out_precision_);
+ return {}; // Zero
}
int32_t out_scale_, out_precision_, in_scale_;
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_decimal_type<O>::value && is_decimal_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_decimal_type<O>::value && is_decimal_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
- const auto& in_type = checked_cast<const I&>(*batch[0].type());
- const auto& out_type = checked_cast<const O&>(*out->type());
- const auto in_scale = in_type.scale();
- const auto out_scale = out_type.scale();
+ const auto& in_type = checked_cast<const I&>(*batch[0].type());
+ const auto& out_type = checked_cast<const O&>(*out->type());
+ const auto in_scale = in_type.scale();
+ const auto out_scale = out_type.scale();
if (options.allow_decimal_truncate) {
if (in_scale < out_scale) {
// Unsafe upscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimal> kernel(
- UnsafeUpscaleDecimal{out_scale - in_scale});
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimal> kernel(
+ UnsafeUpscaleDecimal{out_scale - in_scale});
return kernel.Exec(ctx, batch, out);
} else {
// Unsafe downscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimal> kernel(
- UnsafeDownscaleDecimal{in_scale - out_scale});
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimal> kernel(
+ UnsafeDownscaleDecimal{in_scale - out_scale});
return kernel.Exec(ctx, batch, out);
}
}
-
- // Safe rescale
- applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimal> kernel(
- SafeRescaleDecimal{out_scale, out_type.precision(), in_scale});
- return kernel.Exec(ctx, batch, out);
+
+ // Safe rescale
+ applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimal> kernel(
+ SafeRescaleDecimal{out_scale, out_type.precision(), in_scale});
+ return kernel.Exec(ctx, batch, out);
}
};
@@ -500,33 +500,33 @@ struct CastFunctor<O, I,
struct RealToDecimal {
template <typename OutValue, typename RealType>
- OutValue Call(KernelContext*, RealType val, Status* st) const {
- auto maybe_decimal = OutValue::FromReal(val, out_precision_, out_scale_);
-
- if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) {
- return maybe_decimal.MoveValueUnsafe();
+ OutValue Call(KernelContext*, RealType val, Status* st) const {
+ auto maybe_decimal = OutValue::FromReal(val, out_precision_, out_scale_);
+
+ if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) {
+ return maybe_decimal.MoveValueUnsafe();
}
-
- if (!allow_truncate_) {
- *st = maybe_decimal.status();
- }
- return {}; // Zero
+
+ if (!allow_truncate_) {
+ *st = maybe_decimal.status();
+ }
+ return {}; // Zero
}
int32_t out_scale_, out_precision_;
bool allow_truncate_;
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_decimal_type<O>::value && is_floating_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_decimal_type<O>::value && is_floating_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
- const auto& out_type = checked_cast<const O&>(*out->type());
- const auto out_scale = out_type.scale();
- const auto out_precision = out_type.precision();
+ const auto& out_type = checked_cast<const O&>(*out->type());
+ const auto out_scale = out_type.scale();
+ const auto out_precision = out_type.precision();
- applicator::ScalarUnaryNotNullStateful<O, I, RealToDecimal> kernel(
+ applicator::ScalarUnaryNotNullStateful<O, I, RealToDecimal> kernel(
RealToDecimal{out_scale, out_precision, options.allow_decimal_truncate});
return kernel.Exec(ctx, batch, out);
}
@@ -537,21 +537,21 @@ struct CastFunctor<O, I,
struct DecimalToReal {
template <typename RealType, typename Arg0Value>
- RealType Call(KernelContext*, const Arg0Value& val, Status*) const {
- return val.template ToReal<RealType>(in_scale_);
+ RealType Call(KernelContext*, const Arg0Value& val, Status*) const {
+ return val.template ToReal<RealType>(in_scale_);
}
int32_t in_scale_;
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_floating_type<O>::value && is_decimal_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& in_type = checked_cast<const I&>(*batch[0].type());
- const auto in_scale = in_type.scale();
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_floating_type<O>::value && is_decimal_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& in_type = checked_cast<const I&>(*batch[0].type());
+ const auto in_scale = in_type.scale();
- applicator::ScalarUnaryNotNullStateful<O, I, DecimalToReal> kernel(
+ applicator::ScalarUnaryNotNullStateful<O, I, DecimalToReal> kernel(
DecimalToReal{in_scale});
return kernel.Exec(ctx, batch, out);
}
@@ -595,10 +595,10 @@ std::shared_ptr<CastFunction> GetCastToInteger(std::string name) {
AddCommonNumberCasts<OutType>(out_ty, func.get());
// From decimal to integer
- DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
+ DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
CastFunctor<OutType, Decimal128Type>::Exec));
- DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
- CastFunctor<OutType, Decimal256Type>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
+ CastFunctor<OutType, Decimal256Type>::Exec));
return func;
}
@@ -621,18 +621,18 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
AddCommonNumberCasts<OutType>(out_ty, func.get());
// From decimal to floating point
- DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
+ DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
CastFunctor<OutType, Decimal128Type>::Exec));
- DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
- CastFunctor<OutType, Decimal256Type>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
+ CastFunctor<OutType, Decimal256Type>::Exec));
return func;
}
-std::shared_ptr<CastFunction> GetCastToDecimal128() {
+std::shared_ptr<CastFunction> GetCastToDecimal128() {
OutputType sig_out_ty(ResolveOutputFromOptions);
- auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128);
- AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get());
+ auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128);
+ AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get());
// Cast from floating point
DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
@@ -643,36 +643,36 @@ std::shared_ptr<CastFunction> GetCastToDecimal128() {
// Cast from other decimal
auto exec = CastFunctor<Decimal128Type, Decimal128Type>::Exec;
// We resolve the output type of this kernel from the CastOptions
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
- exec = CastFunctor<Decimal128Type, Decimal256Type>::Exec;
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
- return func;
-}
-
-std::shared_ptr<CastFunction> GetCastToDecimal256() {
- OutputType sig_out_ty(ResolveOutputFromOptions);
-
- auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256);
- AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get());
-
- // Cast from floating point
- DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
- CastFunctor<Decimal256Type, FloatType>::Exec));
- DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty,
- CastFunctor<Decimal256Type, DoubleType>::Exec));
-
- // Cast from other decimal
- auto exec = CastFunctor<Decimal256Type, Decimal128Type>::Exec;
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
- exec = CastFunctor<Decimal256Type, Decimal256Type>::Exec;
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
+ exec = CastFunctor<Decimal128Type, Decimal256Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
return func;
}
+std::shared_ptr<CastFunction> GetCastToDecimal256() {
+ OutputType sig_out_ty(ResolveOutputFromOptions);
+
+ auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256);
+ AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get());
+
+ // Cast from floating point
+ DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
+ CastFunctor<Decimal256Type, FloatType>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty,
+ CastFunctor<Decimal256Type, DoubleType>::Exec));
+
+ // Cast from other decimal
+ auto exec = CastFunctor<Decimal256Type, Decimal128Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
+ exec = CastFunctor<Decimal256Type, Decimal256Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
+ return func;
+}
+
} // namespace
std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
@@ -681,8 +681,8 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
// Make a cast to null that does not do much. Not sure why we need to be able
// to cast from dict<null> -> null but there are unit tests for it
auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
- DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
- OutputAllNull));
+ DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
+ OutputAllNull));
functions.push_back(cast_null);
functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));
@@ -716,8 +716,8 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
functions.push_back(GetCastToFloating<FloatType>("cast_float"));
functions.push_back(GetCastToFloating<DoubleType>("cast_double"));
- functions.push_back(GetCastToDecimal128());
- functions.push_back(GetCastToDecimal256());
+ functions.push_back(GetCastToDecimal128());
+ functions.push_back(GetCastToDecimal256());
return functions;
}
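
The safe decimal path above rests on two independent checks: Rescale, which fails if digits would be lost, and FitsInPrecision, which catches overflow of the target precision even when the rescale itself is lossless. A minimal sketch of those checks in isolation, assuming the public arrow::Decimal128 API; SafeRescale is an illustrative name:

// Sketch only: the two checks behind SafeRescaleDecimal.
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/decimal.h"

arrow::Result<arrow::Decimal128> SafeRescale(
    const arrow::Decimal128& val, int32_t in_scale, int32_t out_scale,
    int32_t out_precision) {
  // Rescale fails outright if digits would be lost.
  ARROW_ASSIGN_OR_RAISE(arrow::Decimal128 rescaled, val.Rescale(in_scale, out_scale));
  // Even a lossless rescale can overflow the target precision.
  if (!rescaled.FitsInPrecision(out_precision)) {
    return arrow::Status::Invalid("Decimal value does not fit in precision ",
                                  out_precision);
  }
  return rescaled;
}
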
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index 3ce537b7223..e24d7fabf37 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -15,15 +15,15 @@
// specific language governing permissions and limitations
// under the License.
-#include <limits>
+#include <limits>
#include "arrow/array/array_base.h"
-#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_binary.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/result.h"
#include "arrow/util/formatting.h"
-#include "arrow/util/int_util.h"
+#include "arrow/util/int_util.h"
#include "arrow/util/optional.h"
#include "arrow/util/utf8.h"
#include "arrow/visitor_inline.h"
@@ -37,22 +37,22 @@ using util::ValidateUTF8;
namespace compute {
namespace internal {
-namespace {
-
+namespace {
+
// ----------------------------------------------------------------------
// Number / Boolean to String
-template <typename O, typename I>
-struct NumericToStringCastFunctor {
+template <typename O, typename I>
+struct NumericToStringCastFunctor {
using value_type = typename TypeTraits<I>::CType;
using BuilderType = typename TypeTraits<O>::BuilderType;
using FormatterType = StringFormatter<I>;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK(out->is_array());
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
const ArrayData& input = *batch[0].array();
ArrayData* output = out->mutable_array();
- return Convert(ctx, input, output);
+ return Convert(ctx, input, output);
}
static Status Convert(KernelContext* ctx, const ArrayData& input, ArrayData* output) {
@@ -73,7 +73,7 @@ struct NumericToStringCastFunctor {
};
// ----------------------------------------------------------------------
-// Binary-like to binary-like
+// Binary-like to binary-like
//
#if defined(_MSC_VER)
@@ -94,152 +94,152 @@ struct Utf8Validator {
};
template <typename I, typename O>
-Status CastBinaryToBinaryOffsets(KernelContext* ctx, const ArrayData& input,
- ArrayData* output) {
- static_assert(std::is_same<I, O>::value, "Cast same-width offsets (no-op)");
- return Status::OK();
-}
+Status CastBinaryToBinaryOffsets(KernelContext* ctx, const ArrayData& input,
+ ArrayData* output) {
+ static_assert(std::is_same<I, O>::value, "Cast same-width offsets (no-op)");
+ return Status::OK();
+}
-// Upcast offsets
+// Upcast offsets
template <>
-Status CastBinaryToBinaryOffsets<int32_t, int64_t>(KernelContext* ctx,
- const ArrayData& input,
- ArrayData* output) {
- using input_offset_type = int32_t;
- using output_offset_type = int64_t;
- ARROW_ASSIGN_OR_RAISE(
- output->buffers[1],
- ctx->Allocate((output->length + output->offset + 1) * sizeof(output_offset_type)));
- memset(output->buffers[1]->mutable_data(), 0,
- output->offset * sizeof(output_offset_type));
- ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
- output->GetMutableValues<output_offset_type>(1),
- output->length + 1);
- return Status::OK();
-}
-
-// Downcast offsets
+Status CastBinaryToBinaryOffsets<int32_t, int64_t>(KernelContext* ctx,
+ const ArrayData& input,
+ ArrayData* output) {
+ using input_offset_type = int32_t;
+ using output_offset_type = int64_t;
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[1],
+ ctx->Allocate((output->length + output->offset + 1) * sizeof(output_offset_type)));
+ memset(output->buffers[1]->mutable_data(), 0,
+ output->offset * sizeof(output_offset_type));
+ ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
+ output->GetMutableValues<output_offset_type>(1),
+ output->length + 1);
+ return Status::OK();
+}
+
+// Downcast offsets
template <>
-Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx,
- const ArrayData& input,
- ArrayData* output) {
- using input_offset_type = int64_t;
- using output_offset_type = int32_t;
-
- constexpr input_offset_type kMaxOffset = std::numeric_limits<output_offset_type>::max();
-
- auto input_offsets = input.GetValues<input_offset_type>(1);
-
- // Binary offsets are ascending, so it's enough to check the last one for overflow.
- if (input_offsets[input.length] > kMaxOffset) {
- return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
- output->type->ToString(), ": input array too large");
- } else {
- ARROW_ASSIGN_OR_RAISE(output->buffers[1],
- ctx->Allocate((output->length + output->offset + 1) *
- sizeof(output_offset_type)));
- memset(output->buffers[1]->mutable_data(), 0,
- output->offset * sizeof(output_offset_type));
- ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
- output->GetMutableValues<output_offset_type>(1),
- output->length + 1);
- return Status::OK();
- }
-}
-
-template <typename O, typename I>
-Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK(out->is_array());
- const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
- const ArrayData& input = *batch[0].array();
-
- if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
- InitializeUTF8();
-
- ArrayDataVisitor<I> visitor;
- Utf8Validator validator;
- RETURN_NOT_OK(visitor.Visit(input, &validator));
- }
-
- // Start with a zero-copy cast, but change indices to expected size
- RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
- return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
- ctx, input, out->mutable_array());
-}
-
+Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx,
+ const ArrayData& input,
+ ArrayData* output) {
+ using input_offset_type = int64_t;
+ using output_offset_type = int32_t;
+
+ constexpr input_offset_type kMaxOffset = std::numeric_limits<output_offset_type>::max();
+
+ auto input_offsets = input.GetValues<input_offset_type>(1);
+
+ // Binary offsets are ascending, so it's enough to check the last one for overflow.
+ if (input_offsets[input.length] > kMaxOffset) {
+ return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
+ output->type->ToString(), ": input array too large");
+ } else {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate((output->length + output->offset + 1) *
+ sizeof(output_offset_type)));
+ memset(output->buffers[1]->mutable_data(), 0,
+ output->offset * sizeof(output_offset_type));
+ ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
+ output->GetMutableValues<output_offset_type>(1),
+ output->length + 1);
+ return Status::OK();
+ }
+}
+
+template <typename O, typename I>
+Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
+ const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+ const ArrayData& input = *batch[0].array();
+
+ if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
+ InitializeUTF8();
+
+ ArrayDataVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+
+ // Start with a zero-copy cast, but change indices to expected size
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
+ ctx, input, out->mutable_array());
+}
+
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
-// ----------------------------------------------------------------------
-// Cast functions registration
+// ----------------------------------------------------------------------
+// Cast functions registration
template <typename OutType>
-void AddNumberToStringCasts(CastFunction* func) {
- auto out_ty = TypeTraits<OutType>::type_singleton();
-
+void AddNumberToStringCasts(CastFunction* func) {
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
DCHECK_OK(func->AddKernel(Type::BOOL, {boolean()}, out_ty,
- TrivialScalarUnaryAsArraysExec(
- NumericToStringCastFunctor<OutType, BooleanType>::Exec),
+ TrivialScalarUnaryAsArraysExec(
+ NumericToStringCastFunctor<OutType, BooleanType>::Exec),
NullHandling::COMPUTED_NO_PREALLOCATE));
for (const std::shared_ptr<DataType>& in_ty : NumericTypes()) {
- DCHECK_OK(
- func->AddKernel(in_ty->id(), {in_ty}, out_ty,
- TrivialScalarUnaryAsArraysExec(
- GenerateNumeric<NumericToStringCastFunctor, OutType>(*in_ty)),
- NullHandling::COMPUTED_NO_PREALLOCATE));
+ DCHECK_OK(
+ func->AddKernel(in_ty->id(), {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(
+ GenerateNumeric<NumericToStringCastFunctor, OutType>(*in_ty)),
+ NullHandling::COMPUTED_NO_PREALLOCATE));
}
}
-template <typename OutType, typename InType>
-void AddBinaryToBinaryCast(CastFunction* func) {
- auto in_ty = TypeTraits<InType>::type_singleton();
- auto out_ty = TypeTraits<OutType>::type_singleton();
-
- DCHECK_OK(func->AddKernel(
- InType::type_id, {in_ty}, out_ty,
- TrivialScalarUnaryAsArraysExec(BinaryToBinaryCastExec<OutType, InType>),
- NullHandling::COMPUTED_NO_PREALLOCATE));
-}
-
-template <typename OutType>
-void AddBinaryToBinaryCast(CastFunction* func) {
- AddBinaryToBinaryCast<OutType, StringType>(func);
- AddBinaryToBinaryCast<OutType, BinaryType>(func);
- AddBinaryToBinaryCast<OutType, LargeStringType>(func);
- AddBinaryToBinaryCast<OutType, LargeBinaryType>(func);
-}
-
-} // namespace
-
+template <typename OutType, typename InType>
+void AddBinaryToBinaryCast(CastFunction* func) {
+ auto in_ty = TypeTraits<InType>::type_singleton();
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
+ DCHECK_OK(func->AddKernel(
+ InType::type_id, {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(BinaryToBinaryCastExec<OutType, InType>),
+ NullHandling::COMPUTED_NO_PREALLOCATE));
+}
+
+template <typename OutType>
+void AddBinaryToBinaryCast(CastFunction* func) {
+ AddBinaryToBinaryCast<OutType, StringType>(func);
+ AddBinaryToBinaryCast<OutType, BinaryType>(func);
+ AddBinaryToBinaryCast<OutType, LargeStringType>(func);
+ AddBinaryToBinaryCast<OutType, LargeBinaryType>(func);
+}
+
+} // namespace
+
std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
auto cast_binary = std::make_shared<CastFunction>("cast_binary", Type::BINARY);
AddCommonCasts(Type::BINARY, binary(), cast_binary.get());
- AddBinaryToBinaryCast<BinaryType>(cast_binary.get());
+ AddBinaryToBinaryCast<BinaryType>(cast_binary.get());
auto cast_large_binary =
std::make_shared<CastFunction>("cast_large_binary", Type::LARGE_BINARY);
AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get());
- AddBinaryToBinaryCast<LargeBinaryType>(cast_large_binary.get());
+ AddBinaryToBinaryCast<LargeBinaryType>(cast_large_binary.get());
auto cast_string = std::make_shared<CastFunction>("cast_string", Type::STRING);
AddCommonCasts(Type::STRING, utf8(), cast_string.get());
- AddNumberToStringCasts<StringType>(cast_string.get());
- AddBinaryToBinaryCast<StringType>(cast_string.get());
+ AddNumberToStringCasts<StringType>(cast_string.get());
+ AddBinaryToBinaryCast<StringType>(cast_string.get());
auto cast_large_string =
std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING);
AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get());
- AddNumberToStringCasts<LargeStringType>(cast_large_string.get());
- AddBinaryToBinaryCast<LargeStringType>(cast_large_string.get());
-
- auto cast_fsb =
- std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
- AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
- cast_fsb.get());
-
- return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
+ AddNumberToStringCasts<LargeStringType>(cast_large_string.get());
+ AddBinaryToBinaryCast<LargeStringType>(cast_large_string.get());
+
+ auto cast_fsb =
+ std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
+ AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
+ cast_fsb.get());
+
+ return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
}
} // namespace internal
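
Downcasting large_binary offsets (int64) to binary offsets (int32) can only fail on overall size: since binary offsets are nondecreasing, checking the final offset suffices, which is exactly what CastBinaryToBinaryOffsets<int64_t, int32_t> above does before copying. A sketch of just that check, with hypothetical inputs:

// Sketch only: the overflow check for downcasting int64 offsets to int32.
#include <cstdint>
#include <limits>

bool OffsetsFitInInt32(const int64_t* offsets, int64_t length) {
  // Offsets are nondecreasing, so only the last one can exceed the target max.
  return offsets[length] <= std::numeric_limits<int32_t>::max();
}
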
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
index 1a58fce7c74..b5271e02413 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
@@ -19,7 +19,7 @@
#include <limits>
-#include "arrow/array/builder_time.h"
+#include "arrow/array/builder_time.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/util/bitmap_reader.h"
@@ -39,10 +39,10 @@ constexpr int64_t kMillisecondsInDay = 86400000;
// From one timestamp to another
template <typename in_type, typename out_type>
-Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
- const int64_t factor, const ArrayData& input, ArrayData* output) {
+Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
+ const int64_t factor, const ArrayData& input, ArrayData* output) {
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
- auto in_data = input.GetValues<in_type>(1);
+ auto in_data = input.GetValues<in_type>(1);
auto out_data = output->GetMutableValues<out_type>(1);
if (factor == 1) {
@@ -55,10 +55,10 @@ Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
out_data[i] = static_cast<out_type>(in_data[i] * factor);
}
} else {
-#define RAISE_OVERFLOW_CAST(VAL) \
- return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
- output->type->ToString(), " would result in ", \
- "out of bounds timestamp: ", VAL);
+#define RAISE_OVERFLOW_CAST(VAL) \
+ return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
+ output->type->ToString(), " would result in ", \
+ "out of bounds timestamp: ", VAL);
int64_t max_val = std::numeric_limits<int64_t>::max() / factor;
int64_t min_val = std::numeric_limits<int64_t>::min() / factor;
@@ -88,9 +88,9 @@ Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
out_data[i] = static_cast<out_type>(in_data[i] / factor);
}
} else {
-#define RAISE_INVALID_CAST(VAL) \
- return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
- output->type->ToString(), " would lose data: ", VAL);
+#define RAISE_INVALID_CAST(VAL) \
+ return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
+ output->type->ToString(), " would lose data: ", VAL);
if (input.null_count != 0) {
BitmapReader bit_reader(input.buffers[0]->data(), input.offset, input.length);
@@ -113,8 +113,8 @@ Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
#undef RAISE_INVALID_CAST
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
// <TimestampType, TimestampType> and <DurationType, DurationType>
@@ -123,7 +123,7 @@ struct CastFunctor<
O, I,
enable_if_t<(is_timestamp_type<O>::value && is_timestamp_type<I>::value) ||
(is_duration_type<O>::value && is_duration_type<I>::value)>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const ArrayData& input = *batch[0].array();
@@ -137,14 +137,14 @@ struct CastFunctor<
// lengths to make this zero copy in the future but we leave it for now
auto conversion = util::GetTimestampConversion(in_type.unit(), out_type.unit());
- return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second, input,
- output);
+ return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second, input,
+ output);
}
};
template <>
struct CastFunctor<Date32Type, TimestampType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const ArrayData& input = *batch[0].array();
@@ -160,13 +160,13 @@ struct CastFunctor<Date32Type, TimestampType> {
};
const int64_t factor = kTimestampToDateFactors[static_cast<int>(in_type.unit())];
- return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, factor, input, output);
+ return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, factor, input, output);
}
};
template <>
struct CastFunctor<Date64Type, TimestampType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
@@ -175,8 +175,8 @@ struct CastFunctor<Date64Type, TimestampType> {
const auto& in_type = checked_cast<const TimestampType&>(*input.type);
auto conversion = util::GetTimestampConversion(in_type.unit(), TimeUnit::MILLI);
- RETURN_NOT_OK((ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
- input, output)));
+ RETURN_NOT_OK((ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
+ input, output)));
// Ensure that intraday milliseconds have been zeroed out
auto out_data = output->GetMutableValues<int64_t>(1);
@@ -188,7 +188,7 @@ struct CastFunctor<Date64Type, TimestampType> {
const int64_t remainder = out_data[i] % kMillisecondsInDay;
if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && bit_reader.IsSet() &&
remainder > 0)) {
- return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
+ return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
}
out_data[i] -= remainder;
bit_reader.Next();
@@ -197,13 +197,13 @@ struct CastFunctor<Date64Type, TimestampType> {
for (int64_t i = 0; i < input.length; ++i) {
const int64_t remainder = out_data[i] % kMillisecondsInDay;
if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && remainder > 0)) {
- return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
+ return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
}
out_data[i] -= remainder;
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
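
For reference, the truncation rule enforced above can be sketched standalone (minimal C++, not part of the diff; the kernel's Status-based error signaling is simplified to an exception here):

    #include <cstdint>
    #include <stdexcept>

    constexpr int64_t kMillisecondsInDay = 86400000;

    // Zero out intraday milliseconds, mirroring the per-value step above:
    // a non-zero remainder is an error unless truncation was allowed.
    int64_t TruncateToDateMillis(int64_t millis, bool allow_time_truncate) {
      const int64_t remainder = millis % kMillisecondsInDay;
      if (!allow_time_truncate && remainder > 0) {
        throw std::invalid_argument(
            "Timestamp value had non-zero intraday milliseconds");
      }
      return millis - remainder;
    }
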
@@ -215,7 +215,7 @@ struct CastFunctor<O, I, enable_if_t<is_time_type<I>::value && is_time_type<O>::
using in_t = typename I::c_type;
using out_t = typename O::c_type;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const ArrayData& input = *batch[0].array();
@@ -226,8 +226,8 @@ struct CastFunctor<O, I, enable_if_t<is_time_type<I>::value && is_time_type<O>::
const auto& out_type = checked_cast<const O&>(*output->type);
DCHECK_NE(in_type.unit(), out_type.unit()) << "Do not cast equal types";
auto conversion = util::GetTimestampConversion(in_type.unit(), out_type.unit());
- return ShiftTime<in_t, out_t>(ctx, conversion.first, conversion.second, input,
- output);
+ return ShiftTime<in_t, out_t>(ctx, conversion.first, conversion.second, input,
+ output);
}
};
@@ -236,68 +236,68 @@ struct CastFunctor<O, I, enable_if_t<is_time_type<I>::value && is_time_type<O>::
template <>
struct CastFunctor<Date64Type, Date32Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, kMillisecondsInDay,
- *batch[0].array(), out->mutable_array());
+ return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, kMillisecondsInDay,
+ *batch[0].array(), out->mutable_array());
}
};
template <>
struct CastFunctor<Date32Type, Date64Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, kMillisecondsInDay,
- *batch[0].array(), out->mutable_array());
- }
-};
-
-// ----------------------------------------------------------------------
-// date32, date64 to timestamp
-
-template <>
-struct CastFunctor<TimestampType, Date32Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
-
- const auto& out_type = checked_cast<const TimestampType&>(*out->type());
- // get conversion SECOND -> unit
- auto conversion = util::GetTimestampConversion(TimeUnit::SECOND, out_type.unit());
- DCHECK_EQ(conversion.first, util::MULTIPLY);
-
- // multiply to achieve days -> unit
- conversion.second *= kMillisecondsInDay / 1000;
- return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, conversion.second,
- *batch[0].array(), out->mutable_array());
- }
-};
-
-template <>
-struct CastFunctor<TimestampType, Date64Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
-
- const auto& out_type = checked_cast<const TimestampType&>(*out->type());
-
- // date64 is ms since epoch
- auto conversion = util::GetTimestampConversion(TimeUnit::MILLI, out_type.unit());
- return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
- *batch[0].array(), out->mutable_array());
+ return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, kMillisecondsInDay,
+ *batch[0].array(), out->mutable_array());
}
};
// ----------------------------------------------------------------------
+// date32, date64 to timestamp
+
+template <>
+struct CastFunctor<TimestampType, Date32Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const auto& out_type = checked_cast<const TimestampType&>(*out->type());
+ // get conversion SECOND -> unit
+ auto conversion = util::GetTimestampConversion(TimeUnit::SECOND, out_type.unit());
+ DCHECK_EQ(conversion.first, util::MULTIPLY);
+
+ // multiply to achieve days -> unit
+ conversion.second *= kMillisecondsInDay / 1000;
+ return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, conversion.second,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+template <>
+struct CastFunctor<TimestampType, Date64Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const auto& out_type = checked_cast<const TimestampType&>(*out->type());
+
+ // date64 is ms since epoch
+ auto conversion = util::GetTimestampConversion(TimeUnit::MILLI, out_type.unit());
+ return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+// ----------------------------------------------------------------------
// String to Timestamp
struct ParseTimestamp {
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
+ OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
OutValue result = 0;
if (ARROW_PREDICT_FALSE(!ParseValue(type, val.data(), val.size(), &result))) {
- *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
- type.ToString());
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ type.ToString());
}
return result;
}
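
The ParseTimestamp functor above reports failure through a Status out-parameter instead of throwing. A minimal sketch of that pattern, with a hypothetical ParseResult standing in for arrow::Status and std::stoll standing in for ParseValue:

    #include <cstdint>
    #include <string>

    struct ParseResult {  // hypothetical stand-in for arrow::Status
      bool ok = true;
      std::string message;
    };

    int64_t ParseOrFlag(const std::string& val, ParseResult* st) {
      try {
        return std::stoll(val);  // stand-in for ParseValue(type, ...)
      } catch (const std::exception&) {
        st->ok = false;
        st->message = "Failed to parse string: '" + val + "'";
        return 0;  // the result is ignored once *st is marked failed
      }
    }
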
@@ -307,7 +307,7 @@ struct ParseTimestamp {
template <typename I>
struct CastFunctor<TimestampType, I, enable_if_t<is_base_binary_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& out_type = checked_cast<const TimestampType&>(*out->type());
applicator::ScalarUnaryNotNullStateful<TimestampType, I, ParseTimestamp> kernel(
ParseTimestamp{out_type});
@@ -318,7 +318,7 @@ struct CastFunctor<TimestampType, I, enable_if_t<is_base_binary_type<I>::value>>
template <typename Type>
void AddCrossUnitCast(CastFunction* func) {
ScalarKernel kernel;
- kernel.exec = TrivialScalarUnaryAsArraysExec(CastFunctor<Type, Type>::Exec);
+ kernel.exec = TrivialScalarUnaryAsArraysExec(CastFunctor<Type, Type>::Exec);
kernel.signature = KernelSignature::Make({InputType(Type::type_id)}, kOutputTargetType);
DCHECK_OK(func->AddKernel(Type::type_id, std::move(kernel)));
}
@@ -417,11 +417,11 @@ std::shared_ptr<CastFunction> GetTimestampCast() {
AddZeroCopyCast(Type::INT64, /*in_type=*/int64(), kOutputTargetType, func.get());
// From date types
- // TODO: ARROW-8876, these casts are not directly tested
- AddSimpleCast<Date32Type, TimestampType>(InputType(Type::DATE32), kOutputTargetType,
- func.get());
- AddSimpleCast<Date64Type, TimestampType>(InputType(Type::DATE64), kOutputTargetType,
- func.get());
+ // TODO: ARROW-8876, these casts are not directly tested
+ AddSimpleCast<Date32Type, TimestampType>(InputType(Type::DATE32), kOutputTargetType,
+ func.get());
+ AddSimpleCast<Date64Type, TimestampType>(InputType(Type::DATE64), kOutputTargetType,
+ func.get());
// string -> timestamp
AddSimpleCast<StringType, TimestampType>(utf8(), kOutputTargetType, func.get());
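
All of the unit casts above funnel into ShiftTime with an (operation, factor) pair obtained from util::GetTimestampConversion. A minimal sketch of that contract (assumed semantics, standalone C++):

    #include <cstdint>

    enum class TimeOp { kMultiply, kDivide };

    // Coarser-to-finer units multiply; finer-to-coarser units divide.
    int64_t ShiftOne(TimeOp op, int64_t factor, int64_t value) {
      return op == TimeOp::kMultiply ? value * factor : value / factor;
    }

    // e.g. seconds -> milliseconds: ShiftOne(TimeOp::kMultiply, 1000, t)
    //      microseconds -> seconds: ShiftOne(TimeOp::kDivide, 1000000, t)
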
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
index 4342d776c38..777a7c9d5ee 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
@@ -15,12 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-#include <cmath>
-#include <limits>
-
-#include "arrow/compute/api_scalar.h"
+#include <cmath>
+#include <limits>
+
+#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_ops.h"
namespace arrow {
@@ -34,110 +34,110 @@ namespace internal {
namespace {
struct Equal {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left == right;
}
};
struct NotEqual {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left != right;
}
};
struct Greater {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left > right;
}
};
struct GreaterEqual {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left >= right;
}
};
-template <typename T>
-using is_unsigned_integer = std::integral_constant<bool, std::is_integral<T>::value &&
- std::is_unsigned<T>::value>;
-
-template <typename T>
-using is_signed_integer =
- std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
-
-template <typename T>
-using enable_if_integer =
- enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, T>;
-
-template <typename T>
-using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, T>;
-
-struct Minimum {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::fmin(left, right);
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::min(left, right);
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
- return std::nanf("");
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
- return std::nan("");
- }
-
- template <typename T>
- static constexpr enable_if_integer<T> antiextreme() {
- return std::numeric_limits<T>::max();
- }
-};
-
-struct Maximum {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::fmax(left, right);
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::max(left, right);
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
- return std::nanf("");
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
- return std::nan("");
- }
-
- template <typename T>
- static constexpr enable_if_integer<T> antiextreme() {
- return std::numeric_limits<T>::min();
- }
-};
-
+template <typename T>
+using is_unsigned_integer = std::integral_constant<bool, std::is_integral<T>::value &&
+ std::is_unsigned<T>::value>;
+
+template <typename T>
+using is_signed_integer =
+ std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
+
+template <typename T>
+using enable_if_integer =
+ enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, T>;
+
+template <typename T>
+using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, T>;
+
+struct Minimum {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::fmin(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::min(left, right);
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
+ return std::nanf("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
+ return std::nan("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_integer<T> antiextreme() {
+ return std::numeric_limits<T>::max();
+ }
+};
+
+struct Maximum {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::fmax(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::max(left, right);
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
+ return std::nanf("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
+ return std::nan("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_integer<T> antiextreme() {
+ return std::numeric_limits<T>::min();
+ }
+};
+
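
The antiextreme() values above are the initialization constants that any real input wins against: NaN for floats, because std::fmin/std::fmax return the non-NaN operand, and the numeric limits for integers. A small standalone check of that property (sketch):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <limits>

    int main() {
      double acc = std::nan("");           // Minimum's float antiextreme
      acc = std::fmin(acc, 3.5);           // NaN loses: acc == 3.5
      acc = std::fmin(acc, 1.25);          // acc == 1.25
      assert(acc == 1.25);

      int iacc = std::numeric_limits<int>::max();  // integer antiextreme
      iacc = std::min(iacc, 42);
      assert(iacc == 42);
      return 0;
    }
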
// Implement Less, LessEqual by flipping arguments to Greater, GreaterEqual
template <typename Op>
@@ -154,57 +154,57 @@ void AddGenericCompare(const std::shared_ptr<DataType>& ty, ScalarFunction* func
applicator::ScalarBinaryEqualTypes<BooleanType, InType, Op>::Exec));
}
-struct CompareFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
- ReplaceNullWithOtherType(values);
-
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- } else if (auto type = CommonTimestamp(*values)) {
- ReplaceTypes(type, values);
- } else if (auto type = CommonBinary(*values)) {
- ReplaceTypes(type, values);
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
-struct VarArgsCompareFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
-
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- } else if (auto type = CommonTimestamp(*values)) {
- ReplaceTypes(type, values);
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
+struct CompareFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+ ReplaceNullWithOtherType(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonTimestamp(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonBinary(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+struct VarArgsCompareFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonTimestamp(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
template <typename Op>
-std::shared_ptr<ScalarFunction> MakeCompareFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
+std::shared_ptr<ScalarFunction> MakeCompareFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
DCHECK_OK(func->AddKernel(
{boolean(), boolean()}, boolean(),
@@ -263,9 +263,9 @@ std::shared_ptr<ScalarFunction> MakeCompareFunction(std::string name,
}
std::shared_ptr<ScalarFunction> MakeFlippedFunction(std::string name,
- const ScalarFunction& func,
- const FunctionDoc* doc) {
- auto flipped_func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
+ const ScalarFunction& func,
+ const FunctionDoc* doc) {
+ auto flipped_func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
for (const ScalarKernel* kernel : func.kernels()) {
ScalarKernel flipped_kernel = *kernel;
flipped_kernel.exec = MakeFlippedBinaryExec(kernel->exec);
@@ -274,249 +274,249 @@ std::shared_ptr<ScalarFunction> MakeFlippedFunction(std::string name,
return flipped_func;
}
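
MakeFlippedFunction above derives "less"/"less_equal" by wrapping each "greater" kernel so its operands are swapped. The trick in plain C++ (generic sketch, not the actual kernel-exec wrapper):

    #include <functional>

    template <typename T>
    std::function<bool(T, T)> Flip(std::function<bool(T, T)> op) {
      // less(x, y) is exactly greater(y, x)
      return [op](T left, T right) { return op(right, left); };
    }

    // auto greater = std::function<bool(int, int)>(
    //     [](int a, int b) { return a > b; });
    // auto less = Flip<int>(greater);  // less(1, 2) == true
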
-using MinMaxState = OptionsWrapper<ElementWiseAggregateOptions>;
-
-// Implement a variadic scalar min/max kernel.
-template <typename OutType, typename Op>
-struct ScalarMinMax {
- using OutValue = typename GetOutputType<OutType>::T;
-
- static void ExecScalar(const ExecBatch& batch,
- const ElementWiseAggregateOptions& options, Scalar* out) {
- // All arguments are scalar
- OutValue value{};
- bool valid = false;
- for (const auto& arg : batch.values) {
-      // Ignore non-scalar arguments so this also covers the mixed-scalar-and-array case
- if (!arg.is_scalar()) continue;
- const auto& scalar = *arg.scalar();
- if (!scalar.is_valid) {
- if (options.skip_nulls) continue;
- out->is_valid = false;
- return;
- }
- if (!valid) {
- value = UnboxScalar<OutType>::Unbox(scalar);
- valid = true;
- } else {
- value = Op::template Call<OutValue, OutValue, OutValue>(
- value, UnboxScalar<OutType>::Unbox(scalar));
- }
- }
- out->is_valid = valid;
- if (valid) {
- BoxScalar<OutType>::Box(value, out);
- }
- }
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
- const auto descrs = batch.GetDescriptors();
- const size_t scalar_count =
- static_cast<size_t>(std::count_if(batch.values.begin(), batch.values.end(),
- [](const Datum& d) { return d.is_scalar(); }));
- if (scalar_count == batch.values.size()) {
- ExecScalar(batch, options, out->scalar().get());
- return Status::OK();
- }
-
- ArrayData* output = out->mutable_array();
-
- // At least one array, two or more arguments
- ArrayDataVector arrays;
- for (const auto& arg : batch.values) {
- if (!arg.is_array()) continue;
- arrays.push_back(arg.array());
- }
-
- bool initialize_output = true;
- if (scalar_count > 0) {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> temp_scalar,
- MakeScalar(out->type(), 0));
- ExecScalar(batch, options, temp_scalar.get());
- if (temp_scalar->is_valid) {
- const auto value = UnboxScalar<OutType>::Unbox(*temp_scalar);
- initialize_output = false;
- OutValue* out = output->GetMutableValues<OutValue>(1);
- std::fill(out, out + batch.length, value);
- } else if (!options.skip_nulls) {
- // Abort early
- ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*temp_scalar, batch.length,
- ctx->memory_pool()));
- *output = *array->data();
- return Status::OK();
- }
- }
-
- if (initialize_output) {
- OutValue* out = output->GetMutableValues<OutValue>(1);
- std::fill(out, out + batch.length, Op::template antiextreme<OutValue>());
- }
-
- // Precompute the validity buffer
- if (options.skip_nulls && initialize_output) {
- // OR together the validity buffers of all arrays
- if (std::all_of(arrays.begin(), arrays.end(),
- [](const std::shared_ptr<ArrayData>& arr) {
- return arr->MayHaveNulls();
- })) {
- for (const auto& arr : arrays) {
- if (!arr->MayHaveNulls()) continue;
- if (!output->buffers[0]) {
- ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
-            ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
-                                          batch.length,
-                                          output->buffers[0]->mutable_data(),
-                                          /*dest_offset=*/0);
- } else {
- ::arrow::internal::BitmapOr(
- output->buffers[0]->data(), /*left_offset=*/0, arr->buffers[0]->data(),
- arr->offset, batch.length,
- /*out_offset=*/0, output->buffers[0]->mutable_data());
- }
- }
- }
- } else if (!options.skip_nulls) {
- // AND together the validity buffers of all arrays
- for (const auto& arr : arrays) {
- if (!arr->MayHaveNulls()) continue;
- if (!output->buffers[0]) {
- ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
- ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
- batch.length, output->buffers[0]->mutable_data(),
- /*dest_offset=*/0);
- } else {
- ::arrow::internal::BitmapAnd(output->buffers[0]->data(), /*left_offset=*/0,
- arr->buffers[0]->data(), arr->offset, batch.length,
- /*out_offset=*/0,
- output->buffers[0]->mutable_data());
- }
- }
- }
-
- for (const auto& array : arrays) {
- OutputArrayWriter<OutType> writer(out->mutable_array());
- ArrayIterator<OutType> out_it(*output);
- int64_t index = 0;
- VisitArrayValuesInline<OutType>(
- *array,
- [&](OutValue value) {
- auto u = out_it();
- if (!output->buffers[0] ||
- BitUtil::GetBit(output->buffers[0]->data(), index)) {
- writer.Write(Op::template Call<OutValue, OutValue, OutValue>(u, value));
- } else {
- writer.Write(value);
- }
- index++;
- },
- [&]() {
- // RHS is null, preserve the LHS
- writer.values++;
- index++;
- out_it();
- });
- }
- output->null_count = output->buffers[0] ? -1 : 0;
- return Status::OK();
- }
-};
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeScalarMinMax(std::string name,
- const FunctionDoc* doc) {
- static auto default_element_wise_aggregate_options =
- ElementWiseAggregateOptions::Defaults();
-
- auto func = std::make_shared<VarArgsCompareFunction>(
- name, Arity::VarArgs(), doc, &default_element_wise_aggregate_options);
- for (const auto& ty : NumericTypes()) {
- auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
- ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
- MinMaxState::Init};
- kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- for (const auto& ty : TemporalTypes()) {
- auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
- ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
- MinMaxState::Init};
- kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- return func;
-}
-
-const FunctionDoc equal_doc{"Compare values for equality (x == y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc not_equal_doc{"Compare values for inequality (x != y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc greater_doc{"Compare values for ordered inequality (x > y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc greater_equal_doc{
- "Compare values for ordered inequality (x >= y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc less_doc{"Compare values for ordered inequality (x < y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc less_equal_doc{
- "Compare values for ordered inequality (x <= y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc min_element_wise_doc{
- "Find the element-wise minimum value",
- ("Nulls will be ignored (default) or propagated. "
- "NaN will be taken over null, but not over any valid float."),
- {"*args"},
- "ElementWiseAggregateOptions"};
-
-const FunctionDoc max_element_wise_doc{
- "Find the element-wise maximum value",
- ("Nulls will be ignored (default) or propagated. "
- "NaN will be taken over null, but not over any valid float."),
- {"*args"},
- "ElementWiseAggregateOptions"};
+using MinMaxState = OptionsWrapper<ElementWiseAggregateOptions>;
+
+// Implement a variadic scalar min/max kernel.
+template <typename OutType, typename Op>
+struct ScalarMinMax {
+ using OutValue = typename GetOutputType<OutType>::T;
+
+ static void ExecScalar(const ExecBatch& batch,
+ const ElementWiseAggregateOptions& options, Scalar* out) {
+ // All arguments are scalar
+ OutValue value{};
+ bool valid = false;
+ for (const auto& arg : batch.values) {
+      // Ignore non-scalar arguments so this also covers the mixed-scalar-and-array case
+ if (!arg.is_scalar()) continue;
+ const auto& scalar = *arg.scalar();
+ if (!scalar.is_valid) {
+ if (options.skip_nulls) continue;
+ out->is_valid = false;
+ return;
+ }
+ if (!valid) {
+ value = UnboxScalar<OutType>::Unbox(scalar);
+ valid = true;
+ } else {
+ value = Op::template Call<OutValue, OutValue, OutValue>(
+ value, UnboxScalar<OutType>::Unbox(scalar));
+ }
+ }
+ out->is_valid = valid;
+ if (valid) {
+ BoxScalar<OutType>::Box(value, out);
+ }
+ }
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ const auto descrs = batch.GetDescriptors();
+ const size_t scalar_count =
+ static_cast<size_t>(std::count_if(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); }));
+ if (scalar_count == batch.values.size()) {
+ ExecScalar(batch, options, out->scalar().get());
+ return Status::OK();
+ }
+
+ ArrayData* output = out->mutable_array();
+
+ // At least one array, two or more arguments
+ ArrayDataVector arrays;
+ for (const auto& arg : batch.values) {
+ if (!arg.is_array()) continue;
+ arrays.push_back(arg.array());
+ }
+
+ bool initialize_output = true;
+ if (scalar_count > 0) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> temp_scalar,
+ MakeScalar(out->type(), 0));
+ ExecScalar(batch, options, temp_scalar.get());
+ if (temp_scalar->is_valid) {
+ const auto value = UnboxScalar<OutType>::Unbox(*temp_scalar);
+ initialize_output = false;
+ OutValue* out = output->GetMutableValues<OutValue>(1);
+ std::fill(out, out + batch.length, value);
+ } else if (!options.skip_nulls) {
+ // Abort early
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*temp_scalar, batch.length,
+ ctx->memory_pool()));
+ *output = *array->data();
+ return Status::OK();
+ }
+ }
+
+ if (initialize_output) {
+ OutValue* out = output->GetMutableValues<OutValue>(1);
+ std::fill(out, out + batch.length, Op::template antiextreme<OutValue>());
+ }
+
+ // Precompute the validity buffer
+ if (options.skip_nulls && initialize_output) {
+ // OR together the validity buffers of all arrays
+ if (std::all_of(arrays.begin(), arrays.end(),
+ [](const std::shared_ptr<ArrayData>& arr) {
+ return arr->MayHaveNulls();
+ })) {
+ for (const auto& arr : arrays) {
+ if (!arr->MayHaveNulls()) continue;
+ if (!output->buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
+            ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
+                                          batch.length,
+                                          output->buffers[0]->mutable_data(),
+                                          /*dest_offset=*/0);
+ } else {
+ ::arrow::internal::BitmapOr(
+ output->buffers[0]->data(), /*left_offset=*/0, arr->buffers[0]->data(),
+ arr->offset, batch.length,
+ /*out_offset=*/0, output->buffers[0]->mutable_data());
+ }
+ }
+ }
+ } else if (!options.skip_nulls) {
+ // AND together the validity buffers of all arrays
+ for (const auto& arr : arrays) {
+ if (!arr->MayHaveNulls()) continue;
+ if (!output->buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
+ ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
+ batch.length, output->buffers[0]->mutable_data(),
+ /*dest_offset=*/0);
+ } else {
+ ::arrow::internal::BitmapAnd(output->buffers[0]->data(), /*left_offset=*/0,
+ arr->buffers[0]->data(), arr->offset, batch.length,
+ /*out_offset=*/0,
+ output->buffers[0]->mutable_data());
+ }
+ }
+ }
+
+ for (const auto& array : arrays) {
+ OutputArrayWriter<OutType> writer(out->mutable_array());
+ ArrayIterator<OutType> out_it(*output);
+ int64_t index = 0;
+ VisitArrayValuesInline<OutType>(
+ *array,
+ [&](OutValue value) {
+ auto u = out_it();
+ if (!output->buffers[0] ||
+ BitUtil::GetBit(output->buffers[0]->data(), index)) {
+ writer.Write(Op::template Call<OutValue, OutValue, OutValue>(u, value));
+ } else {
+ writer.Write(value);
+ }
+ index++;
+ },
+ [&]() {
+ // RHS is null, preserve the LHS
+ writer.values++;
+ index++;
+ out_it();
+ });
+ }
+ output->null_count = output->buffers[0] ? -1 : 0;
+ return Status::OK();
+ }
+};
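
The null handling that ScalarMinMax implements on validity bitmaps can be restated on std::optional values (simplified sketch; the kernel above makes the same per-slot decision via bitmap OR/AND precomputation):

    #include <algorithm>
    #include <optional>
    #include <vector>

    std::vector<std::optional<int>> MinElementWise(
        const std::vector<std::optional<int>>& a,
        const std::vector<std::optional<int>>& b, bool skip_nulls) {
      std::vector<std::optional<int>> out(a.size());
      for (size_t i = 0; i < a.size(); ++i) {
        if (a[i] && b[i]) {
          out[i] = std::min(*a[i], *b[i]);
        } else if (skip_nulls) {
          out[i] = a[i] ? a[i] : b[i];  // keep whichever side is valid
        } else {
          out[i] = std::nullopt;        // propagate the null
        }
      }
      return out;
    }
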
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeScalarMinMax(std::string name,
+ const FunctionDoc* doc) {
+ static auto default_element_wise_aggregate_options =
+ ElementWiseAggregateOptions::Defaults();
+
+ auto func = std::make_shared<VarArgsCompareFunction>(
+ name, Arity::VarArgs(), doc, &default_element_wise_aggregate_options);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
+ ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
+ MinMaxState::Init};
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ for (const auto& ty : TemporalTypes()) {
+ auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
+ ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
+ MinMaxState::Init};
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ return func;
+}
+
+const FunctionDoc equal_doc{"Compare values for equality (x == y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc not_equal_doc{"Compare values for inequality (x != y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc greater_doc{"Compare values for ordered inequality (x > y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc greater_equal_doc{
+ "Compare values for ordered inequality (x >= y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc less_doc{"Compare values for ordered inequality (x < y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc less_equal_doc{
+ "Compare values for ordered inequality (x <= y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc min_element_wise_doc{
+ "Find the element-wise minimum value",
+ ("Nulls will be ignored (default) or propagated. "
+ "NaN will be taken over null, but not over any valid float."),
+ {"*args"},
+ "ElementWiseAggregateOptions"};
+
+const FunctionDoc max_element_wise_doc{
+ "Find the element-wise maximum value",
+ ("Nulls will be ignored (default) or propagated. "
+ "NaN will be taken over null, but not over any valid float."),
+ {"*args"},
+ "ElementWiseAggregateOptions"};
} // namespace
void RegisterScalarComparison(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunction(MakeCompareFunction<Equal>("equal", &equal_doc)));
- DCHECK_OK(
- registry->AddFunction(MakeCompareFunction<NotEqual>("not_equal", &not_equal_doc)));
+ DCHECK_OK(registry->AddFunction(MakeCompareFunction<Equal>("equal", &equal_doc)));
+ DCHECK_OK(
+ registry->AddFunction(MakeCompareFunction<NotEqual>("not_equal", &not_equal_doc)));
- auto greater = MakeCompareFunction<Greater>("greater", &greater_doc);
- auto greater_equal =
- MakeCompareFunction<GreaterEqual>("greater_equal", &greater_equal_doc);
+ auto greater = MakeCompareFunction<Greater>("greater", &greater_doc);
+ auto greater_equal =
+ MakeCompareFunction<GreaterEqual>("greater_equal", &greater_equal_doc);
- auto less = MakeFlippedFunction("less", *greater, &less_doc);
- auto less_equal = MakeFlippedFunction("less_equal", *greater_equal, &less_equal_doc);
+ auto less = MakeFlippedFunction("less", *greater, &less_doc);
+ auto less_equal = MakeFlippedFunction("less_equal", *greater_equal, &less_equal_doc);
DCHECK_OK(registry->AddFunction(std::move(less)));
DCHECK_OK(registry->AddFunction(std::move(less_equal)));
DCHECK_OK(registry->AddFunction(std::move(greater)));
DCHECK_OK(registry->AddFunction(std::move(greater_equal)));
-
- // ----------------------------------------------------------------------
- // Variadic element-wise functions
-
- auto min_element_wise =
- MakeScalarMinMax<Minimum>("min_element_wise", &min_element_wise_doc);
- DCHECK_OK(registry->AddFunction(std::move(min_element_wise)));
-
- auto max_element_wise =
- MakeScalarMinMax<Maximum>("max_element_wise", &max_element_wise_doc);
- DCHECK_OK(registry->AddFunction(std::move(max_element_wise)));
+
+ // ----------------------------------------------------------------------
+ // Variadic element-wise functions
+
+ auto min_element_wise =
+ MakeScalarMinMax<Minimum>("min_element_wise", &min_element_wise_doc);
+ DCHECK_OK(registry->AddFunction(std::move(min_element_wise)));
+
+ auto max_element_wise =
+ MakeScalarMinMax<Maximum>("max_element_wise", &max_element_wise_doc);
+ DCHECK_OK(registry->AddFunction(std::move(max_element_wise)));
}
} // namespace internal
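
Assuming this Arrow snapshot exposes the functions registered above through the generic dispatcher, they can be invoked by name. A hedged usage sketch ("less" and "min_element_wise" are the names registered in this file):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Status Demo(const std::shared_ptr<arrow::Array>& x,
                       const std::shared_ptr<arrow::Array>& y) {
      ARROW_ASSIGN_OR_RAISE(arrow::Datum lt,
                            arrow::compute::CallFunction("less", {x, y}));
      ARROW_ASSIGN_OR_RAISE(
          arrow::Datum lo,
          arrow::compute::CallFunction("min_element_wise", {x, y}));
      (void)lt;
      (void)lo;
      return arrow::Status::OK();
    }
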
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
index cf22b0de3dc..d29c3984b7a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
@@ -37,13 +37,13 @@ namespace {
template <typename Type, typename Enable = void>
struct FillNullFunctor {};
-// Numeric inputs
-
+// Numeric inputs
+
template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
using T = typename TypeTraits<Type>::CType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const ArrayData& data = *batch[0].array();
const Scalar& fill_value = *batch[1].scalar();
ArrayData* output = out->mutable_array();
@@ -54,8 +54,8 @@ struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
T value = UnboxScalar<Type>::Unbox(fill_value);
if (data.MayHaveNulls() != 0 && fill_value.is_valid) {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
- ctx->Allocate(data.length * sizeof(T)));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
+ ctx->Allocate(data.length * sizeof(T)));
const uint8_t* is_valid = data.buffers[0]->data();
const T* in_values = data.GetValues<T>(1);
@@ -80,28 +80,28 @@ struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
in_values += block.length;
}
output->buffers[1] = out_buf;
- output->null_count = 0;
+ output->null_count = 0;
} else {
*output = data;
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
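
Behaviorally, the numeric functor above reduces to the following loop; the real kernel batches it with BitBlockCounter so all-valid blocks are bulk-copied (simplified sketch on a plain validity vector):

    #include <cstdint>
    #include <vector>

    std::vector<int32_t> FillNull(const std::vector<int32_t>& values,
                                  const std::vector<bool>& is_valid,
                                  int32_t fill_value) {
      std::vector<int32_t> out(values.size());
      for (size_t i = 0; i < values.size(); ++i) {
        out[i] = is_valid[i] ? values[i] : fill_value;
      }
      return out;  // result has no nulls, matching null_count = 0 above
    }
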
-// Boolean input
-
+// Boolean input
+
template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_boolean_type<Type>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const ArrayData& data = *batch[0].array();
const Scalar& fill_value = *batch[1].scalar();
ArrayData* output = out->mutable_array();
bool value = UnboxScalar<BooleanType>::Unbox(fill_value);
if (data.MayHaveNulls() != 0 && fill_value.is_valid) {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
- ctx->AllocateBitmap(data.length));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
+ ctx->AllocateBitmap(data.length));
const uint8_t* is_valid = data.buffers[0]->data();
const uint8_t* data_bitmap = data.buffers[1]->data();
@@ -132,68 +132,68 @@ struct FillNullFunctor<Type, enable_if_t<is_boolean_type<Type>::value>> {
out_offset += block.length;
}
output->buffers[1] = out_buf;
- output->null_count = 0;
+ output->null_count = 0;
} else {
*output = data;
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
-// Null input
-
+// Null input
+
template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_null_type<Type>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Nothing preallocated, so we assign into the output
*out->mutable_array() = *batch[0].array();
- return Status::OK();
- }
-};
-
-// Binary-like input
-
-template <typename Type>
-struct FillNullFunctor<Type, enable_if_t<is_base_binary_type<Type>::value>> {
- using BuilderType = typename TypeTraits<Type>::BuilderType;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const ArrayData& input = *batch[0].array();
- const auto& fill_value_scalar =
- checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
- ArrayData* output = out->mutable_array();
-
- // Ensure the kernel is configured properly to have no validity bitmap /
- // null count 0 unless we explicitly propagate it below.
- DCHECK(output->buffers[0] == nullptr);
-
- const int64_t null_count = input.GetNullCount();
-
- if (null_count > 0 && fill_value_scalar.is_valid) {
- util::string_view fill_value(*fill_value_scalar.value);
- BuilderType builder(input.type, ctx->memory_pool());
- RETURN_NOT_OK(builder.ReserveData(input.buffers[2]->size() +
- fill_value.length() * null_count));
- RETURN_NOT_OK(builder.Resize(input.length));
-
- VisitArrayDataInline<Type>(
- input, [&](util::string_view s) { builder.UnsafeAppend(s); },
- [&]() { builder.UnsafeAppend(fill_value); });
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder.Finish(&string_array));
- *output = *string_array->data();
- // The builder does not match the logical type, due to
- // GenerateTypeAgnosticVarBinaryBase
- output->type = input.type;
- } else {
- *output = input;
- }
-
- return Status::OK();
+ return Status::OK();
}
};
+// Binary-like input
+
+template <typename Type>
+struct FillNullFunctor<Type, enable_if_t<is_base_binary_type<Type>::value>> {
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& input = *batch[0].array();
+ const auto& fill_value_scalar =
+ checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+ ArrayData* output = out->mutable_array();
+
+ // Ensure the kernel is configured properly to have no validity bitmap /
+ // null count 0 unless we explicitly propagate it below.
+ DCHECK(output->buffers[0] == nullptr);
+
+ const int64_t null_count = input.GetNullCount();
+
+ if (null_count > 0 && fill_value_scalar.is_valid) {
+ util::string_view fill_value(*fill_value_scalar.value);
+ BuilderType builder(input.type, ctx->memory_pool());
+ RETURN_NOT_OK(builder.ReserveData(input.buffers[2]->size() +
+ fill_value.length() * null_count));
+ RETURN_NOT_OK(builder.Resize(input.length));
+
+ VisitArrayDataInline<Type>(
+ input, [&](util::string_view s) { builder.UnsafeAppend(s); },
+ [&]() { builder.UnsafeAppend(fill_value); });
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *output = *string_array->data();
+ // The builder does not match the logical type, due to
+ // GenerateTypeAgnosticVarBinaryBase
+ output->type = input.type;
+ } else {
+ *output = input;
+ }
+
+ return Status::OK();
+ }
+};
+
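
The ReserveData call above sizes the new data buffer exactly: the existing bytes plus one copy of the fill value per null slot. As a one-line helper (sketch):

    #include <cstdint>
    #include <string>

    int64_t ReservedDataSize(int64_t existing_data_bytes, int64_t null_count,
                             const std::string& fill_value) {
      // mirrors input.buffers[2]->size() + fill_value.length() * null_count
      return existing_data_bytes +
             static_cast<int64_t>(fill_value.size()) * null_count;
    }
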
void AddBasicFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
auto AddKernels = [&](const std::vector<std::shared_ptr<DataType>>& types) {
for (const std::shared_ptr<DataType>& ty : types) {
@@ -208,22 +208,22 @@ void AddBasicFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
AddKernels({boolean(), null()});
}
-void AddBinaryFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
- for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
- kernel.signature =
- KernelSignature::Make({InputType::Array(ty), InputType::Scalar(ty)}, ty);
- kernel.exec = GenerateTypeAgnosticVarBinaryBase<FillNullFunctor>(*ty);
- DCHECK_OK(func->AddKernel(kernel));
- }
-}
-
-const FunctionDoc fill_null_doc{
- "Replace null elements",
- ("`fill_value` must be a scalar of the same type as `values`.\n"
- "Each non-null value in `values` is emitted as-is.\n"
- "Each null value in `values` is replaced with `fill_value`."),
- {"values", "fill_value"}};
-
+void AddBinaryFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ kernel.signature =
+ KernelSignature::Make({InputType::Array(ty), InputType::Scalar(ty)}, ty);
+ kernel.exec = GenerateTypeAgnosticVarBinaryBase<FillNullFunctor>(*ty);
+ DCHECK_OK(func->AddKernel(kernel));
+ }
+}
+
+const FunctionDoc fill_null_doc{
+ "Replace null elements",
+ ("`fill_value` must be a scalar of the same type as `values`.\n"
+ "Each non-null value in `values` is emitted as-is.\n"
+ "Each null value in `values` is replaced with `fill_value`."),
+ {"values", "fill_value"}};
+
} // namespace
void RegisterScalarFillNull(FunctionRegistry* registry) {
@@ -231,10 +231,10 @@ void RegisterScalarFillNull(FunctionRegistry* registry) {
ScalarKernel fill_null_base;
fill_null_base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
fill_null_base.mem_allocation = MemAllocation::NO_PREALLOCATE;
- auto fill_null =
- std::make_shared<ScalarFunction>("fill_null", Arity::Binary(), &fill_null_doc);
+ auto fill_null =
+ std::make_shared<ScalarFunction>("fill_null", Arity::Binary(), &fill_null_doc);
AddBasicFillNullKernels(fill_null_base, fill_null.get());
- AddBinaryFillNullKernels(fill_null_base, fill_null.get());
+ AddBinaryFillNullKernels(fill_null_base, fill_null.get());
DCHECK_OK(registry->AddFunction(fill_null));
}
}
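
A hedged usage sketch for the "fill_null" function registered above (assumes an int64 input array so the fill scalar's type matches, per the doc string's same-type requirement):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> FillWithZero(
        const std::shared_ptr<arrow::Array>& values) {
      auto zero = arrow::MakeScalar(int64_t{0});
      return arrow::compute::CallFunction("fill_null", {values, zero});
    }
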
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index ff308a673a3..74fdc062930 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -1,1730 +1,1730 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <arrow/compute/api.h>
-#include <arrow/compute/kernels/codegen_internal.h>
-#include <arrow/compute/util_internal.h>
-#include <arrow/util/bit_block_counter.h>
-#include <arrow/util/bitmap.h>
-#include <arrow/util/bitmap_ops.h>
-#include <arrow/util/bitmap_reader.h>
-
-namespace arrow {
-using internal::BitBlockCount;
-using internal::BitBlockCounter;
-using internal::Bitmap;
-using internal::BitmapWordReader;
-
-namespace compute {
-namespace internal {
-
-namespace {
-
-constexpr uint64_t kAllNull = 0;
-constexpr uint64_t kAllValid = ~kAllNull;
-
-util::optional<uint64_t> GetConstantValidityWord(const Datum& data) {
- if (data.is_scalar()) {
- return data.scalar()->is_valid ? kAllValid : kAllNull;
- }
-
- if (data.array()->null_count == data.array()->length) return kAllNull;
-
- if (!data.array()->MayHaveNulls()) return kAllValid;
-
- // no constant validity word available
- return {};
-}
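
GetConstantValidityWord above collapses an input whose validity is uniform into a single 64-bit constant, so the word-wise visitors below can skip reading a bitmap for it. The decision table, restated standalone (sketch; the real code uses MayHaveNulls(), which also treats an absent bitmap as all-valid):

    #include <cstdint>
    #include <optional>

    constexpr uint64_t kAllNullWord = 0;
    constexpr uint64_t kAllValidWord = ~kAllNullWord;

    std::optional<uint64_t> ConstantValidity(bool is_scalar, bool scalar_valid,
                                             int64_t null_count, int64_t length) {
      if (is_scalar) return scalar_valid ? kAllValidWord : kAllNullWord;
      if (null_count == length) return kAllNullWord;
      if (null_count == 0) return kAllValidWord;
      return std::nullopt;  // mixed validity: no constant word available
    }
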
-
-inline Bitmap GetBitmap(const Datum& datum, int i) {
- if (datum.is_scalar()) return {};
- const ArrayData& a = *datum.array();
- return Bitmap{a.buffers[i], a.offset, a.length};
-}
-
-// if the condition is null then the output is null; otherwise we take validity from
-// the selected argument,
-// i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
-template <typename AllocateNullBitmap>
-Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum& left_d,
- const Datum& right_d, ArrayData* output) {
- auto cond_const = GetConstantValidityWord(cond_d);
- auto left_const = GetConstantValidityWord(left_d);
- auto right_const = GetConstantValidityWord(right_d);
-
- enum { COND_CONST = 1, LEFT_CONST = 2, RIGHT_CONST = 4 };
- auto flag = COND_CONST * cond_const.has_value() | LEFT_CONST * left_const.has_value() |
- RIGHT_CONST * right_const.has_value();
-
- const ArrayData& cond = *cond_d.array();
- // cond.data will always be available
- Bitmap cond_data{cond.buffers[1], cond.offset, cond.length};
- Bitmap cond_valid{cond.buffers[0], cond.offset, cond.length};
- Bitmap left_valid = GetBitmap(left_d, 0);
- Bitmap right_valid = GetBitmap(right_d, 0);
-
- // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
- // In the following cases, we dont need to allocate out_valid bitmap
-  // In the following cases, we don't need to allocate the out_valid bitmap
- // if cond & left & right all ones, then output is all valid.
- // if output validity buffer is already allocated (NullHandling::
- // COMPUTED_PREALLOCATE) -> set all bits
- // else, return nullptr
- if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
- if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
- output->buffers[0] = nullptr;
- } else { // NullHandling::COMPUTED_PREALLOCATE
- BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
- output->length);
- }
- return Status::OK();
- }
-
- if (left_const == kAllValid && right_const == kAllValid) {
- // if both left and right are valid, no need to calculate out_valid bitmap. Copy
- // cond validity buffer
- if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
- // if there's an offset, copy bitmap (cannot slice a bitmap)
- if (cond.offset) {
- ARROW_ASSIGN_OR_RAISE(
- output->buffers[0],
- arrow::internal::CopyBitmap(ctx->memory_pool(), cond.buffers[0]->data(),
- cond.offset, cond.length));
- } else { // just copy assign cond validity buffer
- output->buffers[0] = cond.buffers[0];
- }
- } else { // NullHandling::COMPUTED_PREALLOCATE
- arrow::internal::CopyBitmap(cond.buffers[0]->data(), cond.offset, cond.length,
- output->buffers[0]->mutable_data(), output->offset);
- }
- return Status::OK();
- }
-
- // lambda function that will be used inside the visitor
- auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
- uint64_t r_valid) {
- return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
- };
-
- if (AllocateNullBitmap::value) {
-    // the following cases require a separate out_valid buffer; COMPUTED_NO_PREALLOCATE
- // would not have allocated buffers for it.
- ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length));
- }
-
- std::array<Bitmap, 1> out_bitmaps{
- Bitmap{output->buffers[0], output->offset, output->length}};
-
- switch (flag) {
- case COND_CONST | LEFT_CONST | RIGHT_CONST: {
- std::array<Bitmap, 1> bitmaps{cond_data};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 1>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- *left_const, *right_const);
- });
- break;
- }
- case LEFT_CONST | RIGHT_CONST: {
- std::array<Bitmap, 2> bitmaps{cond_valid, cond_data};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 2>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- *left_const, *right_const);
- });
- break;
- }
- case COND_CONST | RIGHT_CONST: {
- // bitmaps[C_VALID], bitmaps[R_VALID] might be null; override to make it safe for
- // Visit()
- std::array<Bitmap, 2> bitmaps{cond_data, left_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 2>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- words_in[1], *right_const);
- });
- break;
- }
- case RIGHT_CONST: {
- // bitmaps[R_VALID] might be null; override to make it safe for Visit()
- std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, left_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 3>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- words_in[2], *right_const);
- });
- break;
- }
- case COND_CONST | LEFT_CONST: {
- // bitmaps[C_VALID], bitmaps[L_VALID] might be null; override to make it safe for
- // Visit()
- std::array<Bitmap, 2> bitmaps{cond_data, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 2>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- *left_const, words_in[1]);
- });
- break;
- }
- case LEFT_CONST: {
- // bitmaps[L_VALID] might be null; override to make it safe for Visit()
- std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 3>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- *left_const, words_in[2]);
- });
- break;
- }
- case COND_CONST: {
- // bitmaps[C_VALID] might be null; override to make it safe for Visit()
- std::array<Bitmap, 3> bitmaps{cond_data, left_valid, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 3>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- words_in[1], words_in[2]);
- });
- break;
- }
- case 0: {
- std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 4>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- words_in[2], words_in[3]);
- });
- break;
- }
- }
- return Status::OK();
-}
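
For every case in the switch above, the visitor evaluates the same formula once per 64-bit word, via the apply lambda. Isolated (sketch):

    #include <cstdint>

    // output is valid where the condition is valid and the selected side is valid
    uint64_t IfElseValidWord(uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
                             uint64_t r_valid) {
      return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
    }
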
-
-using Word = uint64_t;
-static constexpr int64_t word_len = sizeof(Word) * 8;
-
-/// Runs the main if_else loop. Here, it is expected that the right data has already
-/// been copied to the output.
-/// `invert` controls whether cond.data is inverted: if it is set to `true`, the
-/// buffer is inverted before calling the handle_block or handle_each functions.
-/// This is useful when left is an array and right is a scalar. Then, rather than
-/// copying data from the right to the output, we can copy left data to the output and
-/// invert the cond data to fill in the right values. Filling the output with a scalar
-/// is presumed to be more efficient than filling it with an array.
-///
-/// `HandleBlock` has the signature:
-/// [](int64_t offset, int64_t length){...}
-/// It should copy `length` number of elements from source array to output array with
-/// `offset` offset in both arrays
-template <typename HandleBlock, bool invert = false>
-void RunIfElseLoop(const ArrayData& cond, const HandleBlock& handle_block) {
- int64_t data_offset = 0;
- int64_t bit_offset = cond.offset;
- const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray
-
- BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
-
- constexpr Word pickAll = invert ? 0 : UINT64_MAX;
- constexpr Word pickNone = ~pickAll;
-
- int64_t cnt = cond_reader.words();
- while (cnt--) {
- Word word = cond_reader.NextWord();
-
- if (word == pickAll) {
- handle_block(data_offset, word_len);
- } else if (word != pickNone) {
- for (int64_t i = 0; i < word_len; ++i) {
- if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
- handle_block(data_offset + i, 1);
- }
- }
- }
- data_offset += word_len;
- bit_offset += word_len;
- }
-
- constexpr uint8_t pickAllByte = invert ? 0 : UINT8_MAX;
- // byte bit-wise inversion is int-wide. Hence XOR with 0xff
- constexpr uint8_t pickNoneByte = pickAllByte ^ 0xff;
-
- cnt = cond_reader.trailing_bytes();
- while (cnt--) {
- int valid_bits;
- uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-
- if (byte == pickAllByte && valid_bits == 8) {
- handle_block(data_offset, 8);
- } else if (byte != pickNoneByte) {
- for (int i = 0; i < valid_bits; ++i) {
- if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
- handle_block(data_offset + i, 1);
- }
- }
- }
- data_offset += 8;
- bit_offset += 8;
- }
-}
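
The loop above classifies each 64-bit condition word into three cases. A per-word sketch of that dispatch, reading selection bits from the word itself rather than via BitUtil::GetBit (equivalent for an LSB-first bitmap word; offset handling omitted):

    #include <cstdint>

    template <typename HandleBlock>
    void ProcessWord(uint64_t word, int64_t data_offset,
                     const HandleBlock& handle_block) {
      if (word == UINT64_MAX) {
        handle_block(data_offset, 64);        // all selected: one bulk copy
      } else if (word != 0) {
        for (int64_t i = 0; i < 64; ++i) {    // mixed word: per-bit dispatch
          if ((word >> i) & 1) handle_block(data_offset + i, 1);
        }
      }                                       // word == 0: skip entirely
    }
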
-
-template <typename HandleBlock>
-void RunIfElseLoopInverted(const ArrayData& cond, const HandleBlock& handle_block) {
- RunIfElseLoop<HandleBlock, true>(cond, handle_block);
-}
-
-/// Runs if-else when cond is a scalar. Two special functions are required:
-/// 1. CopyArrayData, 2. BroadcastScalar
-template <typename CopyArrayData, typename BroadcastScalar>
-Status RunIfElseScalar(const BooleanScalar& cond, const Datum& left, const Datum& right,
- Datum* out, const CopyArrayData& copy_array_data,
- const BroadcastScalar& broadcast_scalar) {
- if (left.is_scalar() && right.is_scalar()) { // output will be a scalar
- if (cond.is_valid) {
- *out = cond.value ? left.scalar() : right.scalar();
- } else {
- *out = MakeNullScalar(left.type());
- }
- return Status::OK();
- }
-
-  // either left or right is an array. Output is always an array.
- const std::shared_ptr<ArrayData>& out_array = out->array();
- if (!cond.is_valid) {
- // cond is null; output is all null --> clear validity buffer
- BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- return Status::OK();
- }
-
- // cond is a non-null scalar
- const auto& valid_data = cond.value ? left : right;
- if (valid_data.is_array()) {
- // valid_data is an array. Hence copy data to the output buffers
- const auto& valid_array = valid_data.array();
- if (valid_array->MayHaveNulls()) {
- arrow::internal::CopyBitmap(
- valid_array->buffers[0]->data(), valid_array->offset, valid_array->length,
- out_array->buffers[0]->mutable_data(), out_array->offset);
- } else { // validity buffer is nullptr --> set all bits
- BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- }
- copy_array_data(*valid_array, out_array.get());
- return Status::OK();
-
- } else { // valid data is scalar
- // valid data is a scalar that needs to be broadcasted
- const auto& valid_scalar = *valid_data.scalar();
- if (valid_scalar.is_valid) { // if the scalar is non-null, broadcast
- BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- broadcast_scalar(*valid_data.scalar(), out_array.get());
- } else { // scalar is null, clear the output validity buffer
- BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- }
- return Status::OK();
- }
-}
-
-template <typename Type, typename Enable = void>
-struct IfElseFunctor {};
-
-// only number types need to be handled for fixed-size primitive data types because
-// internal::GenerateTypeAgnosticPrimitive forwards types to the corresponding
-// unsigned integer type
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_number<Type>> {
- using T = typename TypeTraits<Type>::CType;
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- return RunIfElseScalar(
- cond, left, right, out,
- /*CopyArrayData*/
- [&](const ArrayData& valid_array, ArrayData* out_array) {
- std::memcpy(out_array->GetMutableValues<T>(1), valid_array.GetValues<T>(1),
- valid_array.length * sizeof(T));
- },
- /*BroadcastScalar*/
- [&](const Scalar& scalar, ArrayData* out_array) {
- T scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
- std::fill(out_array->GetMutableValues<T>(1),
- out_array->GetMutableValues<T>(1) + out_array->length, scalar_data);
- });
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy right data to out_buff
- const T* right_data = right.GetValues<T>(1);
- std::memcpy(out_values, right_data, right.length * sizeof(T));
-
- // selectively copy values from left data
- const T* left_data = left.GetValues<T>(1);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::memcpy(out_values + data_offset, left_data + data_offset,
- num_elems * sizeof(T));
- });
-
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy right data to out_buff
- const T* right_data = right.GetValues<T>(1);
- std::memcpy(out_values, right_data, right.length * sizeof(T));
-
- // selectively copy values from left data
- T left_data = internal::UnboxScalar<Type>::Unbox(left);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::fill(out_values + data_offset, out_values + data_offset + num_elems,
- left_data);
- });
-
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy left data to out_buff
- const T* left_data = left.GetValues<T>(1);
- std::memcpy(out_values, left_data, left.length * sizeof(T));
-
- T right_data = internal::UnboxScalar<Type>::Unbox(right);
-
- RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::fill(out_values + data_offset, out_values + data_offset + num_elems,
- right_data);
- });
-
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy right data to out_buff
- T right_data = internal::UnboxScalar<Type>::Unbox(right);
- std::fill(out_values, out_values + cond.length, right_data);
-
- // selectively copy values from left data
- T left_data = internal::UnboxScalar<Type>::Unbox(left);
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::fill(out_values + data_offset, out_values + data_offset + num_elems,
- left_data);
- });
-
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_boolean<Type>> {
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- return RunIfElseScalar(
- cond, left, right, out,
- /*CopyArrayData*/
- [&](const ArrayData& valid_array, ArrayData* out_array) {
- arrow::internal::CopyBitmap(
- valid_array.buffers[1]->data(), valid_array.offset, valid_array.length,
- out_array->buffers[1]->mutable_data(), out_array->offset);
- },
- /*BroadcastScalar*/
- [&](const Scalar& scalar, ArrayData* out_array) {
- bool scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
- BitUtil::SetBitsTo(out_array->buffers[1]->mutable_data(), out_array->offset,
- out_array->length, scalar_data);
- });
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- // out_buff = right & ~cond
- const auto& out_buf = out->buffers[1];
- arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
- cond.buffers[1]->data(), cond.offset, cond.length,
- out->offset, out_buf->mutable_data());
-
- // out_buff = left & cond
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> temp_buf,
- arrow::internal::BitmapAnd(
- ctx->memory_pool(), left.buffers[1]->data(), left.offset,
- cond.buffers[1]->data(), cond.offset, cond.length, 0));
-
- arrow::internal::BitmapOr(out_buf->data(), out->offset, temp_buf->data(), 0,
- cond.length, out->offset, out_buf->mutable_data());
-
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- // out_buff = right & ~cond
- const auto& out_buf = out->buffers[1];
- arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
- cond.buffers[1]->data(), cond.offset, cond.length,
- out->offset, out_buf->mutable_data());
-
- // out_buff = left & cond
- bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
- if (left_data) {
- arrow::internal::BitmapOr(out_buf->data(), out->offset, cond.buffers[1]->data(),
- cond.offset, cond.length, out->offset,
- out_buf->mutable_data());
- }
-
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- // out_buff = left & cond
- const auto& out_buf = out->buffers[1];
- arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
- cond.buffers[1]->data(), cond.offset, cond.length,
- out->offset, out_buf->mutable_data());
-
- bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
-
- // out_buff = left & cond | right & ~cond
- if (right_data) {
- arrow::internal::BitmapOrNot(out_buf->data(), out->offset, cond.buffers[1]->data(),
- cond.offset, cond.length, out->offset,
- out_buf->mutable_data());
- }
-
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
- bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
-
- const auto& out_buf = out->buffers[1];
-
- // out_buf = left & cond | right & ~cond
- if (left_data) {
- if (right_data) {
- // out_buf = ones
- BitUtil::SetBitmap(out_buf->mutable_data(), out->offset, cond.length);
- } else {
- // out_buf = cond
- arrow::internal::CopyBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
- out_buf->mutable_data(), out->offset);
- }
- } else {
- if (right_data) {
- // out_buf = ~cond
- arrow::internal::InvertBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
- out_buf->mutable_data(), out->offset);
- } else {
- // out_buf = zeros
- BitUtil::ClearBitmap(out_buf->mutable_data(), out->offset, cond.length);
- }
- }
-
- return Status::OK();
- }
-};
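-
-// Worked example for the boolean kernels above (illustrative 4-bit words):
-// cond = 1010, left = 1100, right = 0011
-// out = (left & cond) | (right & ~cond) = 1000 | 0001 = 1001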
-
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
- using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
-
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- if (left.is_scalar() && right.is_scalar()) {
- if (cond.is_valid) {
- *out = cond.value ? left.scalar() : right.scalar();
- } else {
- *out = MakeNullScalar(left.type());
- }
- return Status::OK();
- }
- // either left or right is an array. Output is always an array
- int64_t out_arr_len = std::max(left.length(), right.length());
- if (!cond.is_valid) {
- // cond is null; just create a null array
- ARROW_ASSIGN_OR_RAISE(*out,
- MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()))
- return Status::OK();
- }
-
- const auto& valid_data = cond.value ? left : right;
- if (valid_data.is_array()) {
- *out = valid_data;
- } else {
- // valid data is a scalar that needs to be broadcasted
- ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
- ctx->memory_pool()));
- }
- return Status::OK();
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- const auto* left_offsets = left.GetValues<OffsetType>(1);
- const uint8_t* left_data = left.buffers[2]->data();
- const auto* right_offsets = right.GetValues<OffsetType>(1);
- const uint8_t* right_data = right.buffers[2]->data();
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
- right_offsets[right.length] - right_offsets[0];
-
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out,
- [&](int64_t i) {
- builder.UnsafeAppend(left_data + left_offsets[i],
- left_offsets[i + 1] - left_offsets[i]);
- },
- [&](int64_t i) {
- builder.UnsafeAppend(right_data + right_offsets[i],
- right_offsets[i + 1] - right_offsets[i]);
- },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
- auto left_size = static_cast<OffsetType>(left_data.size());
-
- const auto* right_offsets = right.GetValues<OffsetType>(1);
- const uint8_t* right_data = right.buffers[2]->data();
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc =
- left_size * cond.length + right_offsets[right.length] - right_offsets[0];
-
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
- [&](int64_t i) {
- builder.UnsafeAppend(right_data + right_offsets[i],
- right_offsets[i + 1] - right_offsets[i]);
- },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- const auto* left_offsets = left.GetValues<OffsetType>(1);
- const uint8_t* left_data = left.buffers[2]->data();
-
- util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
- auto right_size = static_cast<OffsetType>(right_data.size());
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc =
- right_size * cond.length + left_offsets[left.length] - left_offsets[0];
-
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out,
- [&](int64_t i) {
- builder.UnsafeAppend(left_data + left_offsets[i],
- left_offsets[i + 1] - left_offsets[i]);
- },
- [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
- auto left_size = static_cast<OffsetType>(left_data.size());
-
- util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
- auto right_size = static_cast<OffsetType>(right_data.size());
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc = std::max(right_size, left_size) * cond.length;
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
- [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- template <typename HandleLeft, typename HandleRight, typename HandleNull>
- static void RunLoop(const ArrayData& cond, const ArrayData& output,
- HandleLeft&& handle_left, HandleRight&& handle_right,
- HandleNull&& handle_null) {
- const auto* cond_data = cond.buffers[1]->data();
-
- if (output.buffers[0]) { // output may have nulls
- // The output validity buffer is allocated internally by the IfElseFunctor, so it
- // has length cond.length and offset 0.
- const auto* out_valid = output.buffers[0]->data();
-
- for (int64_t i = 0; i < cond.length; i++) {
- if (BitUtil::GetBit(out_valid, i)) {
- BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
- } else {
- handle_null();
- }
- }
- } else { // output is all valid (no nulls)
- for (int64_t i = 0; i < cond.length; i++) {
- BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
- }
- }
- }
-};
-
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_fixed_size_binary<Type>> {
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type(), *right.type()));
- return RunIfElseScalar(
- cond, left, right, out,
- /*CopyArrayData*/
- [&](const ArrayData& valid_array, ArrayData* out_array) {
- std::memcpy(
- out_array->buffers[1]->mutable_data() + out_array->offset * byte_width,
- valid_array.buffers[1]->data() + valid_array.offset * byte_width,
- valid_array.length * byte_width);
- },
- /*BroadcastScalar*/
- [&](const Scalar& scalar, ArrayData* out_array) {
- const util::string_view& scalar_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
- uint8_t* start =
- out_array->buffers[1]->mutable_data() + out_array->offset * byte_width;
- for (int64_t i = 0; i < out_array->length; i++) {
- std::memcpy(start + i * byte_width, scalar_data.data(), scalar_data.size());
- }
- });
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy right data to out_buff
- const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
- std::memcpy(out_values, right_data, right.length * byte_width);
-
- // selectively copy values from left data
- const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::memcpy(out_values + data_offset * byte_width,
- left_data + data_offset * byte_width, num_elems * byte_width);
- });
-
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy right data to out_buff
- const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
- std::memcpy(out_values, right_data, right.length * byte_width);
-
- // selectively copy values from left data
- const util::string_view& left_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- if (left_data.data()) {
- for (int64_t i = 0; i < num_elems; i++) {
- std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
- left_data.size());
- }
- }
- });
-
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy left data to out_buff
- const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
- std::memcpy(out_values, left_data, left.length * byte_width);
-
- const util::string_view& right_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
-
- RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
- if (right_data.data()) {
- for (int64_t i = 0; i < num_elems; i++) {
- std::memcpy(out_values + (data_offset + i) * byte_width, right_data.data(),
- right_data.size());
- }
- }
- });
-
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy right data to out_buff
- const util::string_view& right_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
- if (right_data.data()) {
- for (int64_t i = 0; i < cond.length; i++) {
- std::memcpy(out_values + i * byte_width, right_data.data(), right_data.size());
- }
- }
-
- // selectively copy values from left data
- const util::string_view& left_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- if (left_data.data()) {
- for (int64_t i = 0; i < num_elems; i++) {
- std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
- left_data.size());
- }
- }
- });
-
- return Status::OK();
- }
-
- static Result<int32_t> GetByteWidth(const DataType& left_type,
- const DataType& right_type) {
- int width = checked_cast<const FixedSizeBinaryType&>(left_type).byte_width();
- if (width == checked_cast<const FixedSizeBinaryType&>(right_type).byte_width()) {
- return width;
- } else {
- return Status::Invalid("FixedSizeBinaryType byte_widths should be equal");
- }
- }
-};
-
-template <typename Type, typename AllocateMem>
-struct ResolveIfElseExec {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // cond is scalar
- if (batch[0].is_scalar()) {
- const auto& cond = batch[0].scalar_as<BooleanScalar>();
- return IfElseFunctor<Type>::Call(ctx, cond, batch[1], batch[2], out);
- }
-
- // cond is array. Use functors to sort things out
- ARROW_RETURN_NOT_OK(PromoteNullsVisitor<AllocateMem>(ctx, batch[0], batch[1],
- batch[2], out->mutable_array()));
-
- if (batch[1].kind() == Datum::ARRAY) {
- if (batch[2].kind() == Datum::ARRAY) { // AAA
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
- *batch[2].array(), out->mutable_array());
- } else { // AAS
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
- *batch[2].scalar(), out->mutable_array());
- }
- } else {
- if (batch[2].kind() == Datum::ARRAY) { // ASA
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
- *batch[2].array(), out->mutable_array());
- } else { // ASS
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
- *batch[2].scalar(), out->mutable_array());
- }
- }
- }
-};
-
-template <typename AllocateMem>
-struct ResolveIfElseExec<NullType, AllocateMem> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // if all are scalars, return a null scalar
- if (batch[0].is_scalar() && batch[1].is_scalar() && batch[2].is_scalar()) {
- *out = MakeNullScalar(null());
- } else {
- ARROW_ASSIGN_OR_RAISE(*out,
- MakeArrayOfNull(null(), batch.length, ctx->memory_pool()));
- }
- return Status::OK();
- }
-};
-
-struct IfElseFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- // if 0th descriptor is null, replace with bool
- if (values->at(0).type->id() == Type::NA) {
- values->at(0).type = boolean();
- }
-
- // the 0th descriptor of if_else is always bool, so skip it
- std::vector<ValueDescr> values_copy(values->begin() + 1, values->end());
- internal::EnsureDictionaryDecoded(&values_copy);
- internal::ReplaceNullWithOtherType(&values_copy);
-
- if (auto type = internal::CommonNumeric(values_copy)) {
- internal::ReplaceTypes(type, &values_copy);
- }
-
- std::move(values_copy.begin(), values_copy.end(), values->begin() + 1);
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
-void AddNullIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
- ScalarKernel kernel({boolean(), null(), null()}, null(),
- ResolveIfElseExec<NullType,
- /*AllocateMem=*/std::true_type>::Exec);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- kernel.can_write_into_slices = false;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-void AddPrimitiveIfElseKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec =
- internal::GenerateTypeAgnosticPrimitive<ResolveIfElseExec,
- /*AllocateMem=*/std::false_type>(*type);
- // cond array needs to be boolean always
- ScalarKernel kernel({boolean(), type, type}, type, exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = true;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
- }
-}
-
-void AddBinaryIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec =
- internal::GenerateTypeAgnosticVarBinaryBase<ResolveIfElseExec,
- /*AllocateMem=*/std::true_type>(
- *type);
- // cond array needs to be boolean always
- ScalarKernel kernel({boolean(), type, type}, type, exec);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- kernel.can_write_into_slices = false;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
- }
-}
-
-void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
- // cond array needs to be boolean always
- ScalarKernel kernel(
- {boolean(), InputType(Type::FIXED_SIZE_BINARY), InputType(Type::FIXED_SIZE_BINARY)},
- OutputType([](KernelContext*, const std::vector<ValueDescr>& descrs) {
- return ValueDescr(descrs[1].type, ValueDescr::ANY);
- }),
- ResolveIfElseExec<FixedSizeBinaryType, /*AllocateMem=*/std::false_type>::Exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = true;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-// Helper to copy or broadcast fixed-width values between buffers.
-template <typename Type, typename Enable = void>
-struct CopyFixedWidth {};
-template <>
-struct CopyFixedWidth<BooleanType> {
- static void CopyScalar(const Scalar& scalar, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const bool value = UnboxScalar<BooleanType>::Unbox(scalar);
- BitUtil::SetBitsTo(raw_out_values, out_offset, length, value);
- }
- static void CopyArray(const DataType&, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- arrow::internal::CopyBitmap(in_values, in_offset, length, raw_out_values, out_offset);
- }
-};
-template <typename Type>
-struct CopyFixedWidth<Type, enable_if_number<Type>> {
- using CType = typename TypeTraits<Type>::CType;
- static void CopyScalar(const Scalar& scalar, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- CType* out_values = reinterpret_cast<CType*>(raw_out_values);
- const CType value = UnboxScalar<Type>::Unbox(scalar);
- std::fill(out_values + out_offset, out_values + out_offset + length, value);
- }
- static void CopyArray(const DataType&, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- std::memcpy(raw_out_values + out_offset * sizeof(CType),
- in_values + in_offset * sizeof(CType), length * sizeof(CType));
- }
-};
-template <typename Type>
-struct CopyFixedWidth<Type, enable_if_same<Type, FixedSizeBinaryType>> {
- static void CopyScalar(const Scalar& values, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width =
- checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(values);
- // Scalar may have null value buffer
- if (!scalar.value) {
- std::memset(next, 0x00, width * length);
- } else {
- DCHECK_EQ(scalar.value->size(), width);
- for (int i = 0; i < length; i++) {
- std::memcpy(next, scalar.value->data(), width);
- next += width;
- }
- }
- }
- static void CopyArray(const DataType& type, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- std::memcpy(next, in_values + in_offset * width, length * width);
- }
-};
-template <typename Type>
-struct CopyFixedWidth<Type, enable_if_decimal<Type>> {
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- static void CopyScalar(const Scalar& values, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width =
- checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- const auto& scalar = checked_cast<const ScalarType&>(values);
- const auto value = scalar.value.ToBytes();
- for (int i = 0; i < length; i++) {
- std::memcpy(next, value.data(), width);
- next += width;
- }
- }
- static void CopyArray(const DataType& type, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- std::memcpy(next, in_values + in_offset * width, length * width);
- }
-};
-// Copy fixed-width values from a scalar/array datum into an output values buffer
-template <typename Type>
-void CopyValues(const Datum& in_values, const int64_t in_offset, const int64_t length,
- uint8_t* out_valid, uint8_t* out_values, const int64_t out_offset) {
- if (in_values.is_scalar()) {
- const auto& scalar = *in_values.scalar();
- if (out_valid) {
- BitUtil::SetBitsTo(out_valid, out_offset, length, scalar.is_valid);
- }
- CopyFixedWidth<Type>::CopyScalar(scalar, length, out_values, out_offset);
- } else {
- const ArrayData& array = *in_values.array();
- if (out_valid) {
- if (array.MayHaveNulls()) {
- if (length == 1) {
- // CopyBitmap is slow for short runs
- BitUtil::SetBitTo(
- out_valid, out_offset,
- BitUtil::GetBit(array.buffers[0]->data(), array.offset + in_offset));
- } else {
- arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset + in_offset,
- length, out_valid, out_offset);
- }
- } else {
- BitUtil::SetBitsTo(out_valid, out_offset, length, true);
- }
- }
- CopyFixedWidth<Type>::CopyArray(*array.type, array.buffers[1]->data(),
- array.offset + in_offset, length, out_values,
- out_offset);
- }
-}
-
-// Specialized helper to copy a single value from a source array. Allows avoiding
-// repeatedly calling MayHaveNulls and Buffer::data() which have internal checks that
-// add up when called in a loop.
-template <typename Type>
-void CopyOneArrayValue(const DataType& type, const uint8_t* in_valid,
- const uint8_t* in_values, const int64_t in_offset,
- uint8_t* out_valid, uint8_t* out_values,
- const int64_t out_offset) {
- if (out_valid) {
- BitUtil::SetBitTo(out_valid, out_offset,
- !in_valid || BitUtil::GetBit(in_valid, in_offset));
- }
- CopyFixedWidth<Type>::CopyArray(type, in_values, in_offset, /*length=*/1, out_values,
- out_offset);
-}
-
-struct CaseWhenFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- // The first function is a struct of booleans, where the number of fields in the
- // struct is either equal to the number of other arguments or is one less.
- RETURN_NOT_OK(CheckArity(*values));
- EnsureDictionaryDecoded(values);
- auto first_type = (*values)[0].type;
- if (first_type->id() != Type::STRUCT) {
- return Status::TypeError("case_when: first argument must be STRUCT, not ",
- *first_type);
- }
- auto num_fields = static_cast<size_t>(first_type->num_fields());
- if (num_fields < values->size() - 2 || num_fields >= values->size()) {
- return Status::Invalid(
- "case_when: number of struct fields must be equal to or one less than count of "
- "remaining arguments (",
- values->size() - 1, "), got: ", first_type->num_fields());
- }
- for (const auto& field : first_type->fields()) {
- if (field->type()->id() != Type::BOOL) {
- return Status::TypeError(
- "case_when: all fields of first argument must be BOOL, but ", field->name(),
- " was of type: ", *field->type());
- }
- }
-
- if (auto type = CommonNumeric(values->data() + 1, values->size() - 1)) {
- for (auto it = values->begin() + 1; it != values->end(); it++) {
- it->type = type;
- }
- }
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
-// Implement a 'case when' (SQL)/'select' (NumPy) function for any scalar conditions
-template <typename Type>
-Status ExecScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& conds = checked_cast<const StructScalar&>(*batch.values[0].scalar());
- if (!conds.is_valid) {
- return Status::Invalid("cond struct must not be null");
- }
- Datum result;
- for (size_t i = 0; i < batch.values.size() - 1; i++) {
- if (i < conds.value.size()) {
- const Scalar& cond = *conds.value[i];
- if (cond.is_valid && internal::UnboxScalar<BooleanType>::Unbox(cond)) {
- result = batch[i + 1];
- break;
- }
- } else {
- // ELSE clause
- result = batch[i + 1];
- break;
- }
- }
- if (out->is_scalar()) {
- *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type());
- return Status::OK();
- }
- ArrayData* output = out->mutable_array();
- if (!result.is_value()) {
- // All conditions false, no 'else' argument
- result = MakeNullScalar(out->type());
- }
- CopyValues<Type>(result, /*in_offset=*/0, batch.length,
- output->GetMutableValues<uint8_t>(0, 0),
- output->GetMutableValues<uint8_t>(1, 0), output->offset);
- return Status::OK();
-}
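-
-// For example (illustrative): with conds = {false, true} and values {a, b, c}, the
-// third value acts as the "else" clause and `b` is selected; with conds =
-// {false, false} and values {a, b}, there is no "else", so the result is null.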
-
-// Implement 'case when' for any mix of scalar/array arguments for any fixed-width type,
-// given helper functions to copy data from a source array to a target array
-template <typename Type>
-Status ExecArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& conds_array = *batch.values[0].array();
- if (conds_array.GetNullCount() > 0) {
- return Status::Invalid("cond struct must not have top-level nulls");
- }
- ArrayData* output = out->mutable_array();
- const int64_t out_offset = output->offset;
- const auto num_value_args = batch.values.size() - 1;
- const bool have_else_arg =
- static_cast<size_t>(conds_array.type->num_fields()) < num_value_args;
- uint8_t* out_valid = output->buffers[0]->mutable_data();
- uint8_t* out_values = output->buffers[1]->mutable_data();
- if (have_else_arg) {
- // Copy 'else' value into output
- CopyValues<Type>(batch.values.back(), /*in_offset=*/0, batch.length, out_valid,
- out_values, out_offset);
- } else {
- // There's no 'else' argument, so we should have an all-null validity bitmap
- BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
- }
-
- // Allocate a temporary bitmap to determine which elements still need setting.
- ARROW_ASSIGN_OR_RAISE(auto mask_buffer, ctx->AllocateBitmap(batch.length));
- uint8_t* mask = mask_buffer->mutable_data();
- std::memset(mask, 0xFF, mask_buffer->size());
-
- // Then iterate through each argument in turn and set elements.
- for (size_t i = 0; i < batch.values.size() - (have_else_arg ? 2 : 1); i++) {
- const ArrayData& cond_array = *conds_array.child_data[i];
- const int64_t cond_offset = conds_array.offset + cond_array.offset;
- const uint8_t* cond_values = cond_array.buffers[1]->data();
- const Datum& values_datum = batch[i + 1];
- int64_t offset = 0;
-
- if (cond_array.GetNullCount() == 0) {
- // cond has no validity buffer: visit the mask & cond bitmaps simultaneously
- BinaryBitBlockCounter counter(mask, /*start_offset=*/0, cond_values, cond_offset,
- batch.length);
- while (offset < batch.length) {
- const auto block = counter.NextAndWord();
- if (block.AllSet()) {
- CopyValues<Type>(values_datum, offset, block.length, out_valid, out_values,
- out_offset + offset);
- BitUtil::SetBitsTo(mask, offset, block.length, false);
- } else if (block.popcount) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (BitUtil::GetBit(mask, offset + j) &&
- BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
- CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
- out_values, out_offset + offset + j);
- BitUtil::SetBitTo(mask, offset + j, false);
- }
- }
- }
- offset += block.length;
- }
- } else {
- // Visit mask & cond bitmap & cond validity
- const uint8_t* cond_valid = cond_array.buffers[0]->data();
- Bitmap bitmaps[3] = {{mask, /*offset=*/0, batch.length},
- {cond_values, cond_offset, batch.length},
- {cond_valid, cond_offset, batch.length}};
- Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 3> words) {
- const uint64_t word = words[0] & words[1] & words[2];
- const int64_t block_length = std::min<int64_t>(64, batch.length - offset);
- if (word == std::numeric_limits<uint64_t>::max()) {
- CopyValues<Type>(values_datum, offset, block_length, out_valid, out_values,
- out_offset + offset);
- BitUtil::SetBitsTo(mask, offset, block_length, false);
- } else if (word) {
- for (int64_t j = 0; j < block_length; ++j) {
- if (BitUtil::GetBit(mask, offset + j) &&
- BitUtil::GetBit(cond_valid, cond_offset + offset + j) &&
- BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
- CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
- out_values, out_offset + offset + j);
- BitUtil::SetBitTo(mask, offset + j, false);
- }
- }
- }
- });
- }
- }
- if (!have_else_arg) {
- // Need to initialize any remaining null slots (uninitialized memory)
- BitBlockCounter counter(mask, /*offset=*/0, batch.length);
- int64_t offset = 0;
- auto bit_width = checked_cast<const FixedWidthType&>(*out->type()).bit_width();
- auto byte_width = BitUtil::BytesForBits(bit_width);
- while (offset < batch.length) {
- const auto block = counter.NextWord();
- if (block.AllSet()) {
- if (bit_width == 1) {
- BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
- } else {
- std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
- byte_width * block.length);
- }
- } else if (!block.NoneSet()) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
- if (bit_width == 1) {
- BitUtil::ClearBit(out_values, out_offset + offset + j);
- } else {
- std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
- byte_width);
- }
- }
- }
- offset += block.length;
- }
- }
- return Status::OK();
-}
-
-template <typename Type, typename Enable = void>
-struct CaseWhenFunctor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (batch.values[0].is_array()) {
- return ExecArrayCaseWhen<Type>(ctx, batch, out);
- }
- return ExecScalarCaseWhen<Type>(ctx, batch, out);
- }
-};
-
-template <>
-struct CaseWhenFunctor<NullType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return Status::OK();
- }
-};
-
-struct CoalesceFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- EnsureDictionaryDecoded(values);
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- }
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
-// Implement a 'coalesce' (SQL) operator for any number of scalar inputs
-Status ExecScalarCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- for (const auto& datum : batch.values) {
- if (datum.scalar()->is_valid) {
- *out = datum;
- break;
- }
- }
- return Status::OK();
-}
-
-// Helper: copy from a source datum into all null slots of the output
-template <typename Type>
-void CopyValuesAllValid(Datum source, uint8_t* out_valid, uint8_t* out_values,
- const int64_t out_offset, const int64_t length) {
- BitBlockCounter counter(out_valid, out_offset, length);
- int64_t offset = 0;
- while (offset < length) {
- const auto block = counter.NextWord();
- if (block.NoneSet()) {
- CopyValues<Type>(source, offset, block.length, out_valid, out_values,
- out_offset + offset);
- } else if (!block.AllSet()) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (!BitUtil::GetBit(out_valid, out_offset + offset + j)) {
- CopyValues<Type>(source, offset + j, 1, out_valid, out_values,
- out_offset + offset + j);
- }
- }
- }
- offset += block.length;
- }
-}
-
-// Helper: zero the values buffer of the output wherever the slot is null
-void InitializeNullSlots(const DataType& type, uint8_t* out_valid, uint8_t* out_values,
- const int64_t out_offset, const int64_t length) {
- BitBlockCounter counter(out_valid, out_offset, length);
- int64_t offset = 0;
- auto bit_width = checked_cast<const FixedWidthType&>(type).bit_width();
- auto byte_width = BitUtil::BytesForBits(bit_width);
- while (offset < length) {
- const auto block = counter.NextWord();
- if (block.NoneSet()) {
- if (bit_width == 1) {
- BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
- } else {
- std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
- byte_width * block.length);
- }
- } else if (!block.AllSet()) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
- if (bit_width == 1) {
- BitUtil::ClearBit(out_values, out_offset + offset + j);
- } else {
- std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
- byte_width);
- }
- }
- }
- offset += block.length;
- }
-}
-
-// Implement 'coalesce' for any mix of scalar/array arguments for any fixed-width type
-template <typename Type>
-Status ExecArrayCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ArrayData* output = out->mutable_array();
- const int64_t out_offset = output->offset;
- // Use output validity buffer as mask to decide what values to copy
- uint8_t* out_valid = output->buffers[0]->mutable_data();
- // Clear output buffer - no values are set initially
- BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
- uint8_t* out_values = output->buffers[1]->mutable_data();
-
- for (const auto& datum : batch.values) {
- if ((datum.is_scalar() && datum.scalar()->is_valid) ||
- (datum.is_array() && !datum.array()->MayHaveNulls())) {
- // Valid scalar, or all-valid array
- CopyValuesAllValid<Type>(datum, out_valid, out_values, out_offset, batch.length);
- break;
- } else if (datum.is_array()) {
- // Array with nulls
- const ArrayData& arr = *datum.array();
- const DataType& type = *datum.type();
- const uint8_t* in_valid = arr.buffers[0]->data();
- const uint8_t* in_values = arr.buffers[1]->data();
- BinaryBitBlockCounter counter(in_valid, arr.offset, out_valid, out_offset,
- batch.length);
- int64_t offset = 0;
- while (offset < batch.length) {
- const auto block = counter.NextAndNotWord();
- if (block.AllSet()) {
- CopyValues<Type>(datum, offset, block.length, out_valid, out_values,
- out_offset + offset);
- } else if (block.popcount) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (!BitUtil::GetBit(out_valid, out_offset + offset + j) &&
- BitUtil::GetBit(in_valid, arr.offset + offset + j)) {
- // This version lets us avoid calling MayHaveNulls() on every iteration
- // (which does an atomic load and can add up)
- CopyOneArrayValue<Type>(type, in_valid, in_values, arr.offset + offset + j,
- out_valid, out_values, out_offset + offset + j);
- }
- }
- }
- offset += block.length;
- }
- }
- }
-
- // Initialize any remaining null slots (uninitialized memory)
- InitializeNullSlots(*out->type(), out_valid, out_values, out_offset, batch.length);
- return Status::OK();
-}
-
-template <typename Type, typename Enable = void>
-struct CoalesceFunctor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- for (const auto& datum : batch.values) {
- if (datum.is_array()) {
- return ExecArrayCoalesce<Type>(ctx, batch, out);
- }
- }
- return ExecScalarCoalesce(ctx, batch, out);
- }
-};
-
-template <>
-struct CoalesceFunctor<NullType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct CoalesceFunctor<Type, enable_if_base_binary<Type>> {
- using offset_type = typename Type::offset_type;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- for (const auto& datum : batch.values) {
- if (datum.is_array()) {
- return ExecArray(ctx, batch, out);
- }
- }
- return ExecScalarCoalesce(ctx, batch, out);
- }
-
- static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // Special case: grab any leading non-null scalar or array arguments
- for (const auto& datum : batch.values) {
- if (datum.is_scalar()) {
- if (!datum.scalar()->is_valid) continue;
- ARROW_ASSIGN_OR_RAISE(
- *out, MakeArrayFromScalar(*datum.scalar(), batch.length, ctx->memory_pool()));
- return Status::OK();
- } else if (datum.is_array() && !datum.array()->MayHaveNulls()) {
- *out = datum;
- return Status::OK();
- }
- break;
- }
- ArrayData* output = out->mutable_array();
- BuilderType builder(batch[0].type(), ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(batch.length));
- for (int64_t i = 0; i < batch.length; i++) {
- bool set = false;
- for (const auto& datum : batch.values) {
- if (datum.is_scalar()) {
- if (datum.scalar()->is_valid) {
- RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(*datum.scalar())));
- set = true;
- break;
- }
- } else {
- const ArrayData& source = *datum.array();
- if (!source.MayHaveNulls() ||
- BitUtil::GetBit(source.buffers[0]->data(), source.offset + i)) {
- const uint8_t* data = source.buffers[2]->data();
- const offset_type* offsets = source.GetValues<offset_type>(1);
- const offset_type offset0 = offsets[i];
- const offset_type offset1 = offsets[i + 1];
- RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
- set = true;
- break;
- }
- }
- }
- if (!set) RETURN_NOT_OK(builder.AppendNull());
- }
- ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish());
- *output = *temp_output->data();
- // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
- output->type = batch[0].type();
- return Status::OK();
- }
-};
-
-Result<ValueDescr> LastType(KernelContext*, const std::vector<ValueDescr>& descrs) {
- ValueDescr result = descrs.back();
- result.shape = GetBroadcastShape(descrs);
- return result;
-}
-
-void AddCaseWhenKernel(const std::shared_ptr<CaseWhenFunction>& scalar_function,
- detail::GetTypeId get_id, ArrayKernelExec exec) {
- ScalarKernel kernel(
- KernelSignature::Make({InputType(Type::STRUCT), InputType(get_id.id)},
- OutputType(LastType),
- /*is_varargs=*/true),
- exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = is_fixed_width(get_id.id);
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-void AddPrimitiveCaseWhenKernels(const std::shared_ptr<CaseWhenFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec = GenerateTypeAgnosticPrimitive<CaseWhenFunctor>(*type);
- AddCaseWhenKernel(scalar_function, type, std::move(exec));
- }
-}
-
-void AddCoalesceKernel(const std::shared_ptr<ScalarFunction>& scalar_function,
- detail::GetTypeId get_id, ArrayKernelExec exec) {
- ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, OutputType(FirstType),
- /*is_varargs=*/true),
- exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = is_fixed_width(get_id.id);
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-void AddPrimitiveCoalesceKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec = GenerateTypeAgnosticPrimitive<CoalesceFunctor>(*type);
- AddCoalesceKernel(scalar_function, type, std::move(exec));
- }
-}
-
-const FunctionDoc if_else_doc{"Choose values based on a condition",
- ("`cond` must be a Boolean scalar or array.\n`left` and "
- "`right` must be scalars or arrays of the same type.\n"
- "`null` values in `cond` will be promoted to the"
- " output."),
- {"cond", "left", "right"}};
-
-const FunctionDoc case_when_doc{
- "Choose values based on multiple conditions",
- ("`cond` must be a struct of Boolean values. `cases` can be a mix "
- "of scalar and array arguments (of any type, but all must be the "
- "same type or castable to a common type), with either exactly one "
- "datum per child of `cond`, or one more `cases` than children of "
- "`cond` (in which case we have an \"else\" value).\n"
- "Each row of the output will be the corresponding value of the "
- "first datum in `cases` for which the corresponding child of `cond` "
- "is true, or otherwise the \"else\" value (if given), or null. "
- "Essentially, this implements a switch-case or if-else, if-else... "
- "statement."),
- {"cond", "*cases"}};
-
-const FunctionDoc coalesce_doc{
- "Select the first non-null value in each slot",
- ("Each row of the output will be the value from the first corresponding input "
- "for which the value is not null. If all inputs are null in a row, the output "
- "will be null."),
- {"*values"}};
-} // namespace
-
-void RegisterScalarIfElse(FunctionRegistry* registry) {
- {
- auto func =
- std::make_shared<IfElseFunction>("if_else", Arity::Ternary(), &if_else_doc);
-
- AddPrimitiveIfElseKernels(func, NumericTypes());
- AddPrimitiveIfElseKernels(func, TemporalTypes());
- AddPrimitiveIfElseKernels(func, {boolean(), day_time_interval(), month_interval()});
- AddNullIfElseKernel(func);
- AddBinaryIfElseKernels(func, BaseBinaryTypes());
- AddFSBinaryIfElseKernel(func);
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<CaseWhenFunction>(
- "case_when", Arity::VarArgs(/*min_args=*/1), &case_when_doc);
- AddPrimitiveCaseWhenKernels(func, NumericTypes());
- AddPrimitiveCaseWhenKernels(func, TemporalTypes());
- AddPrimitiveCaseWhenKernels(
- func, {boolean(), null(), day_time_interval(), month_interval()});
- AddCaseWhenKernel(func, Type::FIXED_SIZE_BINARY,
- CaseWhenFunctor<FixedSizeBinaryType>::Exec);
- AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor<Decimal128Type>::Exec);
- AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor<Decimal256Type>::Exec);
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<CoalesceFunction>(
- "coalesce", Arity::VarArgs(/*min_args=*/1), &coalesce_doc);
- AddPrimitiveCoalesceKernels(func, NumericTypes());
- AddPrimitiveCoalesceKernels(func, TemporalTypes());
- AddPrimitiveCoalesceKernels(
- func, {boolean(), null(), day_time_interval(), month_interval()});
- AddCoalesceKernel(func, Type::FIXED_SIZE_BINARY,
- CoalesceFunctor<FixedSizeBinaryType>::Exec);
- AddCoalesceKernel(func, Type::DECIMAL128, CoalesceFunctor<Decimal128Type>::Exec);
- AddCoalesceKernel(func, Type::DECIMAL256, CoalesceFunctor<Decimal256Type>::Exec);
- for (const auto& ty : BaseBinaryTypes()) {
- AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase<CoalesceFunctor>(ty));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/compute/api.h>
+#include <arrow/compute/kernels/codegen_internal.h>
+#include <arrow/compute/util_internal.h>
+#include <arrow/util/bit_block_counter.h>
+#include <arrow/util/bitmap.h>
+#include <arrow/util/bitmap_ops.h>
+#include <arrow/util/bitmap_reader.h>
+
+namespace arrow {
+using internal::BitBlockCount;
+using internal::BitBlockCounter;
+using internal::Bitmap;
+using internal::BitmapWordReader;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+constexpr uint64_t kAllNull = 0;
+constexpr uint64_t kAllValid = ~kAllNull;
+
+util::optional<uint64_t> GetConstantValidityWord(const Datum& data) {
+ if (data.is_scalar()) {
+ return data.scalar()->is_valid ? kAllValid : kAllNull;
+ }
+
+ if (data.array()->null_count == data.array()->length) return kAllNull;
+
+ if (!data.array()->MayHaveNulls()) return kAllValid;
+
+ // no constant validity word available
+ return {};
+}
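+// For example: a valid scalar yields kAllValid, an all-null array yields kAllNull,
+// and an array that mixes nulls and values yields an empty optional.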
+
+inline Bitmap GetBitmap(const Datum& datum, int i) {
+ if (datum.is_scalar()) return {};
+ const ArrayData& a = *datum.array();
+ return Bitmap{a.buffers[i], a.offset, a.length};
+}
+
+// If the condition is null then the output is null; otherwise validity is taken
+// from the selected argument,
+// i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
+template <typename AllocateNullBitmap>
+Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum& left_d,
+ const Datum& right_d, ArrayData* output) {
+ auto cond_const = GetConstantValidityWord(cond_d);
+ auto left_const = GetConstantValidityWord(left_d);
+ auto right_const = GetConstantValidityWord(right_d);
+
+ enum { COND_CONST = 1, LEFT_CONST = 2, RIGHT_CONST = 4 };
+ auto flag = COND_CONST * cond_const.has_value() | LEFT_CONST * left_const.has_value() |
+ RIGHT_CONST * right_const.has_value();
+
+ const ArrayData& cond = *cond_d.array();
+ // cond.data will always be available
+ Bitmap cond_data{cond.buffers[1], cond.offset, cond.length};
+ Bitmap cond_valid{cond.buffers[0], cond.offset, cond.length};
+ Bitmap left_valid = GetBitmap(left_d, 0);
+ Bitmap right_valid = GetBitmap(right_d, 0);
+
+ // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
+ // In the following cases, we don't need to allocate the out_valid bitmap:
+
+ // If cond & left & right are all ones, then the output is all valid.
+ // If the output validity buffer is already allocated (NullHandling::
+ // COMPUTED_PREALLOCATE) -> set all bits;
+ // else, return nullptr
+ if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
+ if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
+ output->buffers[0] = nullptr;
+ } else { // NullHandling::COMPUTED_PREALLOCATE
+ BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
+ output->length);
+ }
+ return Status::OK();
+ }
+
+ if (left_const == kAllValid && right_const == kAllValid) {
+ // if both left and right are valid, no need to calculate out_valid bitmap. Copy
+ // cond validity buffer
+ if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
+ // if there's an offset, copy bitmap (cannot slice a bitmap)
+ if (cond.offset) {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[0],
+ arrow::internal::CopyBitmap(ctx->memory_pool(), cond.buffers[0]->data(),
+ cond.offset, cond.length));
+ } else { // just copy assign cond validity buffer
+ output->buffers[0] = cond.buffers[0];
+ }
+ } else { // NullHandling::COMPUTED_PREALLOCATE
+ arrow::internal::CopyBitmap(cond.buffers[0]->data(), cond.offset, cond.length,
+ output->buffers[0]->mutable_data(), output->offset);
+ }
+ return Status::OK();
+ }
+
+ // lambda function that will be used inside the visitor
+ auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
+ uint64_t r_valid) {
+ return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
+ };
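+ // e.g. with single illustrative bits c_valid=1, c_data=0, l_valid=1, r_valid=0:
+ // 1 & ((0 & 1) | (~0 & 0)) = 0, i.e. the right value is selected but it is null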
+
+ if (AllocateNullBitmap::value) {
+ // The following cases require a separate out_valid buffer; COMPUTED_NO_PREALLOCATE
+ // would not have allocated one for it.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length));
+ }
+
+ std::array<Bitmap, 1> out_bitmaps{
+ Bitmap{output->buffers[0], output->offset, output->length}};
+
+ switch (flag) {
+ case COND_CONST | LEFT_CONST | RIGHT_CONST: {
+ std::array<Bitmap, 1> bitmaps{cond_data};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 1>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ *left_const, *right_const);
+ });
+ break;
+ }
+ case LEFT_CONST | RIGHT_CONST: {
+ std::array<Bitmap, 2> bitmaps{cond_valid, cond_data};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ *left_const, *right_const);
+ });
+ break;
+ }
+ case COND_CONST | RIGHT_CONST: {
+ // cond.valid and right.valid are constant words; substitute them directly instead
+ // of visiting their (possibly absent) bitmaps
+ std::array<Bitmap, 2> bitmaps{cond_data, left_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ words_in[1], *right_const);
+ });
+ break;
+ }
+ case RIGHT_CONST: {
+ // right.valid is a constant word; substitute it directly instead of visiting its
+ // (possibly absent) bitmap
+ std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, left_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ words_in[2], *right_const);
+ });
+ break;
+ }
+ case COND_CONST | LEFT_CONST: {
+ // cond.valid and left.valid are constant words; substitute them directly instead
+ // of visiting their (possibly absent) bitmaps
+ std::array<Bitmap, 2> bitmaps{cond_data, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ *left_const, words_in[1]);
+ });
+ break;
+ }
+ case LEFT_CONST: {
+ // left.valid is a constant word; substitute it directly instead of visiting its
+ // (possibly absent) bitmap
+ std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ *left_const, words_in[2]);
+ });
+ break;
+ }
+ case COND_CONST: {
+ // cond.valid is a constant word; substitute it directly instead of visiting its
+ // (possibly absent) bitmap
+ std::array<Bitmap, 3> bitmaps{cond_data, left_valid, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ words_in[1], words_in[2]);
+ });
+ break;
+ }
+ case 0: {
+ std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 4>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ words_in[2], words_in[3]);
+ });
+ break;
+ }
+ }
+ return Status::OK();
+}
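+
+// A worked example of the validity formula above, on a single 8-bit word with
+// assumed inputs (bit order is illustrative only):
+//
+//   cond.valid  = 0b11111100   (two cond slots are null)
+//   cond.data   = 0b10101010   (1 -> take left, 0 -> take right)
+//   left.valid  = 0b11111111
+//   right.valid = 0b00001111
+//
+//   cond.data & left.valid    = 0b10101010
+//   ~cond.data & right.valid  = 0b00000101
+//   OR of the two             = 0b10101111
+//   & cond.valid              = 0b10101100   <- output validity word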
+
+using Word = uint64_t;
+static constexpr int64_t word_len = sizeof(Word) * 8;
+
+/// Runs the main if_else loop. Here, it is expected that the right data has already
+/// been copied to the output.
+/// `invert` inverts the interpretation of cond.data: if it is set to `true`, the
+/// condition bits are negated before `handle_block` is called. This is useful when
+/// left is an array and right is a scalar: rather than copying data from the right to
+/// the output, we can copy the left data to the output and invert the cond data to
+/// fill in the right values. Filling the output with a scalar is presumed to be
+/// cheaper than filling it with an array.
+///
+/// `HandleBlock` has the signature:
+///   [](int64_t offset, int64_t length){...}
+/// It should copy `length` elements from the source array to the output array,
+/// starting at offset `offset` in both.
+template <typename HandleBlock, bool invert = false>
+void RunIfElseLoop(const ArrayData& cond, const HandleBlock& handle_block) {
+ int64_t data_offset = 0;
+ int64_t bit_offset = cond.offset;
+ const auto* cond_data = cond.buffers[1]->data();  // cond is a BoolArray
+
+ BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
+
+ constexpr Word pickAll = invert ? 0 : UINT64_MAX;
+ constexpr Word pickNone = ~pickAll;
+
+ int64_t cnt = cond_reader.words();
+ while (cnt--) {
+ Word word = cond_reader.NextWord();
+
+ if (word == pickAll) {
+ handle_block(data_offset, word_len);
+ } else if (word != pickNone) {
+ for (int64_t i = 0; i < word_len; ++i) {
+ if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
+ handle_block(data_offset + i, 1);
+ }
+ }
+ }
+ data_offset += word_len;
+ bit_offset += word_len;
+ }
+
+ constexpr uint8_t pickAllByte = invert ? 0 : UINT8_MAX;
+ // bitwise inversion of a byte promotes to int; hence XOR with 0xff instead
+ constexpr uint8_t pickNoneByte = pickAllByte ^ 0xff;
+
+ cnt = cond_reader.trailing_bytes();
+ while (cnt--) {
+ int valid_bits;
+ uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+
+ if (byte == pickAllByte && valid_bits == 8) {
+ handle_block(data_offset, 8);
+ } else if (byte != pickNoneByte) {
+ for (int i = 0; i < valid_bits; ++i) {
+ if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
+ handle_block(data_offset + i, 1);
+ }
+ }
+ }
+ data_offset += 8;
+ bit_offset += 8;
+ }
+}
+
+template <typename HandleBlock>
+void RunIfElseLoopInverted(const ArrayData& cond, const HandleBlock& handle_block) {
+ RunIfElseLoop<HandleBlock, true>(cond, handle_block);
+}
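+
+// A minimal sketch (assumed callback, not part of the kernel) of how RunIfElseLoop is
+// driven: an all-ones cond word yields one bulk handle_block call per 64 elements,
+// while a mixed word falls back to per-bit calls of length 1.
+//
+//   RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+//     std::memcpy(out_values + data_offset, left_data + data_offset,
+//                 num_elems * sizeof(T));
+//   });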
+
+/// Runs if_else when cond is a scalar. Two specialized functions are required:
+/// 1. CopyArrayData, 2. BroadcastScalar
+template <typename CopyArrayData, typename BroadcastScalar>
+Status RunIfElseScalar(const BooleanScalar& cond, const Datum& left, const Datum& right,
+ Datum* out, const CopyArrayData& copy_array_data,
+ const BroadcastScalar& broadcast_scalar) {
+ if (left.is_scalar() && right.is_scalar()) { // output will be a scalar
+ if (cond.is_valid) {
+ *out = cond.value ? left.scalar() : right.scalar();
+ } else {
+ *out = MakeNullScalar(left.type());
+ }
+ return Status::OK();
+ }
+
+ // either left or right is an array. Output is always an array
+ const std::shared_ptr<ArrayData>& out_array = out->array();
+ if (!cond.is_valid) {
+ // cond is null; output is all null --> clear validity buffer
+ BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ return Status::OK();
+ }
+
+ // cond is a non-null scalar
+ const auto& valid_data = cond.value ? left : right;
+ if (valid_data.is_array()) {
+ // valid_data is an array. Hence copy data to the output buffers
+ const auto& valid_array = valid_data.array();
+ if (valid_array->MayHaveNulls()) {
+ arrow::internal::CopyBitmap(
+ valid_array->buffers[0]->data(), valid_array->offset, valid_array->length,
+ out_array->buffers[0]->mutable_data(), out_array->offset);
+ } else { // validity buffer is nullptr --> set all bits
+ BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ }
+ copy_array_data(*valid_array, out_array.get());
+ return Status::OK();
+
+ } else { // valid data is scalar
+ // valid data is a scalar that needs to be broadcasted
+ const auto& valid_scalar = *valid_data.scalar();
+ if (valid_scalar.is_valid) { // if the scalar is non-null, broadcast
+ BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ broadcast_scalar(*valid_data.scalar(), out_array.get());
+ } else { // scalar is null, clear the output validity buffer
+ BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ }
+ return Status::OK();
+ }
+}
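+
+// A short sketch (assumed inputs) of the scalar-cond fast path above: with
+// cond = BooleanScalar(true), an array `left` and any `right`, the entire output is
+// taken from `left`; `right` is never read.
+//
+//   // if_else(true, [1, null, 3], [9, 9, 9]) -> [1, null, 3]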
+
+template <typename Type, typename Enable = void>
+struct IfElseFunctor {};
+
+// Only number types need to be handled for fixed-size primitive data types, because
+// internal::GenerateTypeAgnosticPrimitive forwards other types to the corresponding
+// unsigned int type
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_number<Type>> {
+ using T = typename TypeTraits<Type>::CType;
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ std::memcpy(out_array->GetMutableValues<T>(1), valid_array.GetValues<T>(1),
+ valid_array.length * sizeof(T));
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ T scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+ std::fill(out_array->GetMutableValues<T>(1),
+ out_array->GetMutableValues<T>(1) + out_array->length, scalar_data);
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ const T* right_data = right.GetValues<T>(1);
+ std::memcpy(out_values, right_data, right.length * sizeof(T));
+
+ // selectively copy values from left data
+ const T* left_data = left.GetValues<T>(1);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::memcpy(out_values + data_offset, left_data + data_offset,
+ num_elems * sizeof(T));
+ });
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ const T* right_data = right.GetValues<T>(1);
+ std::memcpy(out_values, right_data, right.length * sizeof(T));
+
+ // selectively copy values from left data
+ T left_data = internal::UnboxScalar<Type>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ left_data);
+ });
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy left data to out_buff
+ const T* left_data = left.GetValues<T>(1);
+ std::memcpy(out_values, left_data, left.length * sizeof(T));
+
+ T right_data = internal::UnboxScalar<Type>::Unbox(right);
+
+ RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ right_data);
+ });
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ T right_data = internal::UnboxScalar<Type>::Unbox(right);
+ std::fill(out_values, out_values + cond.length, right_data);
+
+ // selectively copy values from left data
+ T left_data = internal::UnboxScalar<Type>::Unbox(left);
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ left_data);
+ });
+
+ return Status::OK();
+ }
+};
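+
+// A minimal usage sketch (assumed inputs, not part of the kernel implementation)
+// invoking this functor through the compute API:
+//
+//   #include "arrow/compute/api.h"
+//   // cond = [true, false, null], left = [1, 2, 3], right = [10, 20, 30]
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("if_else", {cond, left, right}));
+//   // out = [1, 20, null]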
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_boolean<Type>> {
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ arrow::internal::CopyBitmap(
+ valid_array.buffers[1]->data(), valid_array.offset, valid_array.length,
+ out_array->buffers[1]->mutable_data(), out_array->offset);
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ bool scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+ BitUtil::SetBitsTo(out_array->buffers[1]->mutable_data(), out_array->offset,
+ out_array->length, scalar_data);
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ // out_buff = right & ~cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ // out_buff = left & cond
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> temp_buf,
+ arrow::internal::BitmapAnd(
+ ctx->memory_pool(), left.buffers[1]->data(), left.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length, 0));
+
+ arrow::internal::BitmapOr(out_buf->data(), out->offset, temp_buf->data(), 0,
+ cond.length, out->offset, out_buf->mutable_data());
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ // out_buff = right & ~cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ // out_buff = left & cond
+ bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
+ if (left_data) {
+ arrow::internal::BitmapOr(out_buf->data(), out->offset, cond.buffers[1]->data(),
+ cond.offset, cond.length, out->offset,
+ out_buf->mutable_data());
+ }
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ // out_buff = left & cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
+
+ // out_buff = left & cond | right & ~cond
+ if (right_data) {
+ arrow::internal::BitmapOrNot(out_buf->data(), out->offset, cond.buffers[1]->data(),
+ cond.offset, cond.length, out->offset,
+ out_buf->mutable_data());
+ }
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
+ bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
+
+ const auto& out_buf = out->buffers[1];
+
+ // out_buf = left & cond | right & ~cond
+ if (left_data) {
+ if (right_data) {
+ // out_buf = ones
+ BitUtil::SetBitmap(out_buf->mutable_data(), out->offset, cond.length);
+ } else {
+ // out_buf = cond
+ arrow::internal::CopyBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+ out_buf->mutable_data(), out->offset);
+ }
+ } else {
+ if (right_data) {
+ // out_buf = ~cond
+ arrow::internal::InvertBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+ out_buf->mutable_data(), out->offset);
+ } else {
+ // out_buf = zeros
+ BitUtil::ClearBitmap(out_buf->mutable_data(), out->offset, cond.length);
+ }
+ }
+
+ return Status::OK();
+ }
+};
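+
+// The boolean kernel is pure bitmap algebra: out = (left & cond) | (right & ~cond).
+// A worked example on assumed 4-bit words:
+//
+//   cond = 0b1100, left = 0b1010, right = 0b0101
+//   left & cond    = 0b1000
+//   right & ~cond  = 0b0001
+//   out            = 0b1001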
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+ using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ if (left.is_scalar() && right.is_scalar()) {
+ if (cond.is_valid) {
+ *out = cond.value ? left.scalar() : right.scalar();
+ } else {
+ *out = MakeNullScalar(left.type());
+ }
+ return Status::OK();
+ }
+ // either left or right is an array. Output is always an array
+ int64_t out_arr_len = std::max(left.length(), right.length());
+ if (!cond.is_valid) {
+ // cond is null; just create a null array
+ ARROW_ASSIGN_OR_RAISE(*out,
+ MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()));
+ return Status::OK();
+ }
+
+ const auto& valid_data = cond.value ? left : right;
+ if (valid_data.is_array()) {
+ *out = valid_data;
+ } else {
+ // valid data is a scalar that needs to be broadcasted
+ ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+ ctx->memory_pool()));
+ }
+ return Status::OK();
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ const auto* left_offsets = left.GetValues<OffsetType>(1);
+ const uint8_t* left_data = left.buffers[2]->data();
+ const auto* right_offsets = right.GetValues<OffsetType>(1);
+ const uint8_t* right_data = right.buffers[2]->data();
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
+ right_offsets[right.length] - right_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out,
+ [&](int64_t i) {
+ builder.UnsafeAppend(left_data + left_offsets[i],
+ left_offsets[i + 1] - left_offsets[i]);
+ },
+ [&](int64_t i) {
+ builder.UnsafeAppend(right_data + right_offsets[i],
+ right_offsets[i + 1] - right_offsets[i]);
+ },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
+ auto left_size = static_cast<OffsetType>(left_data.size());
+
+ const auto* right_offsets = right.GetValues<OffsetType>(1);
+ const uint8_t* right_data = right.buffers[2]->data();
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc =
+ left_size * cond.length + right_offsets[right.length] - right_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
+ [&](int64_t i) {
+ builder.UnsafeAppend(right_data + right_offsets[i],
+ right_offsets[i + 1] - right_offsets[i]);
+ },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ const auto* left_offsets = left.GetValues<OffsetType>(1);
+ const uint8_t* left_data = left.buffers[2]->data();
+
+ util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
+ auto right_size = static_cast<OffsetType>(right_data.size());
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc =
+ right_size * cond.length + left_offsets[left.length] - left_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out,
+ [&](int64_t i) {
+ builder.UnsafeAppend(left_data + left_offsets[i],
+ left_offsets[i + 1] - left_offsets[i]);
+ },
+ [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
+ auto left_size = static_cast<OffsetType>(left_data.size());
+
+ util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
+ auto right_size = static_cast<OffsetType>(right_data.size());
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc = std::max(right_size, left_size) * cond.length;
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
+ [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ template <typename HandleLeft, typename HandleRight, typename HandleNull>
+ static void RunLoop(const ArrayData& cond, const ArrayData& output,
+ HandleLeft&& handle_left, HandleRight&& handle_right,
+ HandleNull&& handle_null) {
+ const auto* cond_data = cond.buffers[1]->data();
+
+ if (output.buffers[0]) { // output may have nulls
+ // the output validity buffer is allocated internally by the IfElseFunctor, so it
+ // has length cond.length and zero offset.
+ const auto* out_valid = output.buffers[0]->data();
+
+ for (int64_t i = 0; i < cond.length; i++) {
+ if (BitUtil::GetBit(out_valid, i)) {
+ BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
+ } else {
+ handle_null();
+ }
+ }
+ } else { // output is all valid (no nulls)
+ for (int64_t i = 0; i < cond.length; i++) {
+ BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
+ }
+ }
+ }
+};
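+
+// The "conservative" data allocation above sizes the builder for the sum of both
+// inputs' byte ranges, so UnsafeAppend never overflows. A worked example with
+// assumed offsets:
+//
+//   left_offsets  = {0, 3, 5}  -> left bytes  = 5 - 0 = 5
+//   right_offsets = {2, 4, 9}  -> right bytes = 9 - 2 = 7
+//   data_buff_alloc = 5 + 7 = 12 bytes, an upper bound on what RunLoop appends.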
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_fixed_size_binary<Type>> {
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type(), *right.type()));
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ std::memcpy(
+ out_array->buffers[1]->mutable_data() + out_array->offset * byte_width,
+ valid_array.buffers[1]->data() + valid_array.offset * byte_width,
+ valid_array.length * byte_width);
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ const util::string_view& scalar_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
+ uint8_t* start =
+ out_array->buffers[1]->mutable_data() + out_array->offset * byte_width;
+ for (int64_t i = 0; i < out_array->length; i++) {
+ std::memcpy(start + i * byte_width, scalar_data.data(), scalar_data.size());
+ }
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
+ std::memcpy(out_values, right_data, right.length * byte_width);
+
+ // selectively copy values from left data
+ const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::memcpy(out_values + data_offset * byte_width,
+ left_data + data_offset * byte_width, num_elems * byte_width);
+ });
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
+ std::memcpy(out_values, right_data, right.length * byte_width);
+
+ // selectively copy values from left data
+ const util::string_view& left_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (left_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
+ left_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy left data to out_buff
+ const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
+ std::memcpy(out_values, left_data, left.length * byte_width);
+
+ const util::string_view& right_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
+
+ RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (right_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, right_data.data(),
+ right_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const util::string_view& right_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
+ if (right_data.data()) {
+ for (int64_t i = 0; i < cond.length; i++) {
+ std::memcpy(out_values + i * byte_width, right_data.data(), right_data.size());
+ }
+ }
+
+ // selectively copy values from left data
+ const util::string_view& left_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (left_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
+ left_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ static Result<int32_t> GetByteWidth(const DataType& left_type,
+ const DataType& right_type) {
+ int width = checked_cast<const FixedSizeBinaryType&>(left_type).byte_width();
+ if (width == checked_cast<const FixedSizeBinaryType&>(right_type).byte_width()) {
+ return width;
+ } else {
+ return Status::Invalid("FixedSizeBinaryType byte_widths should be equal");
+ }
+ }
+};
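+
+// A minimal usage sketch (assumed inputs): both sides must share the same byte
+// width, otherwise GetByteWidth returns Status::Invalid.
+//
+//   auto ty = arrow::fixed_size_binary(4);
+//   // cond: boolean array; left, right: arrays of type ty
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("if_else", {cond, left, right}));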
+
+template <typename Type, typename AllocateMem>
+struct ResolveIfElseExec {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cond is scalar
+ if (batch[0].is_scalar()) {
+ const auto& cond = batch[0].scalar_as<BooleanScalar>();
+ return IfElseFunctor<Type>::Call(ctx, cond, batch[1], batch[2], out);
+ }
+
+ // cond is array. Use functors to sort things out
+ ARROW_RETURN_NOT_OK(PromoteNullsVisitor<AllocateMem>(ctx, batch[0], batch[1],
+ batch[2], out->mutable_array()));
+
+ if (batch[1].kind() == Datum::ARRAY) {
+ if (batch[2].kind() == Datum::ARRAY) { // AAA
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
+ *batch[2].array(), out->mutable_array());
+ } else { // AAS
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
+ *batch[2].scalar(), out->mutable_array());
+ }
+ } else {
+ if (batch[2].kind() == Datum::ARRAY) { // ASA
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ *batch[2].array(), out->mutable_array());
+ } else { // ASS
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ *batch[2].scalar(), out->mutable_array());
+ }
+ }
+ }
+};
+
+template <typename AllocateMem>
+struct ResolveIfElseExec<NullType, AllocateMem> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // if all are scalars, return a null scalar
+ if (batch[0].is_scalar() && batch[1].is_scalar() && batch[2].is_scalar()) {
+ *out = MakeNullScalar(null());
+ } else {
+ ARROW_ASSIGN_OR_RAISE(*out,
+ MakeArrayOfNull(null(), batch.length, ctx->memory_pool()));
+ }
+ return Status::OK();
+ }
+};
+
+struct IfElseFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ // if 0th descriptor is null, replace with bool
+ if (values->at(0).type->id() == Type::NA) {
+ values->at(0).type = boolean();
+ }
+
+ // if_else's 0th descriptor is always bool, so skip it
+ std::vector<ValueDescr> values_copy(values->begin() + 1, values->end());
+ internal::EnsureDictionaryDecoded(&values_copy);
+ internal::ReplaceNullWithOtherType(&values_copy);
+
+ if (auto type = internal::CommonNumeric(values_copy)) {
+ internal::ReplaceTypes(type, &values_copy);
+ }
+
+ std::move(values_copy.begin(), values_copy.end(), values->begin() + 1);
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+void AddNullIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+ ScalarKernel kernel({boolean(), null(), null()}, null(),
+ ResolveIfElseExec<NullType,
+ /*AllocateMem=*/std::true_type>::Exec);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.can_write_into_slices = false;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveIfElseKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec =
+ internal::GenerateTypeAgnosticPrimitive<ResolveIfElseExec,
+ /*AllocateMem=*/std::false_type>(*type);
+ // cond array needs to be boolean always
+ ScalarKernel kernel({boolean(), type, type}, type, exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = true;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+ }
+}
+
+void AddBinaryIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec =
+ internal::GenerateTypeAgnosticVarBinaryBase<ResolveIfElseExec,
+ /*AllocateMem=*/std::true_type>(
+ *type);
+ // cond array needs to be boolean always
+ ScalarKernel kernel({boolean(), type, type}, type, exec);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.can_write_into_slices = false;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+ }
+}
+
+void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+ // cond array needs to be boolean always
+ ScalarKernel kernel(
+ {boolean(), InputType(Type::FIXED_SIZE_BINARY), InputType(Type::FIXED_SIZE_BINARY)},
+ OutputType([](KernelContext*, const std::vector<ValueDescr>& descrs) {
+ return ValueDescr(descrs[1].type, ValueDescr::ANY);
+ }),
+ ResolveIfElseExec<FixedSizeBinaryType, /*AllocateMem=*/std::false_type>::Exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = true;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+// Helper to copy or broadcast fixed-width values between buffers.
+template <typename Type, typename Enable = void>
+struct CopyFixedWidth {};
+template <>
+struct CopyFixedWidth<BooleanType> {
+ static void CopyScalar(const Scalar& scalar, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const bool value = UnboxScalar<BooleanType>::Unbox(scalar);
+ BitUtil::SetBitsTo(raw_out_values, out_offset, length, value);
+ }
+ static void CopyArray(const DataType&, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ arrow::internal::CopyBitmap(in_values, in_offset, length, raw_out_values, out_offset);
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_number<Type>> {
+ using CType = typename TypeTraits<Type>::CType;
+ static void CopyScalar(const Scalar& scalar, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ CType* out_values = reinterpret_cast<CType*>(raw_out_values);
+ const CType value = UnboxScalar<Type>::Unbox(scalar);
+ std::fill(out_values + out_offset, out_values + out_offset + length, value);
+ }
+ static void CopyArray(const DataType&, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ std::memcpy(raw_out_values + out_offset * sizeof(CType),
+ in_values + in_offset * sizeof(CType), length * sizeof(CType));
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_same<Type, FixedSizeBinaryType>> {
+ static void CopyScalar(const Scalar& values, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width =
+ checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(values);
+ // Scalar may have null value buffer
+ if (!scalar.value) {
+ std::memset(next, 0x00, width * length);
+ } else {
+ DCHECK_EQ(scalar.value->size(), width);
+ for (int i = 0; i < length; i++) {
+ std::memcpy(next, scalar.value->data(), width);
+ next += width;
+ }
+ }
+ }
+ static void CopyArray(const DataType& type, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ std::memcpy(next, in_values + in_offset * width, length * width);
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_decimal<Type>> {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static void CopyScalar(const Scalar& values, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width =
+ checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ const auto& scalar = checked_cast<const ScalarType&>(values);
+ const auto value = scalar.value.ToBytes();
+ for (int i = 0; i < length; i++) {
+ std::memcpy(next, value.data(), width);
+ next += width;
+ }
+ }
+ static void CopyArray(const DataType& type, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ std::memcpy(next, in_values + in_offset * width, length * width);
+ }
+};
+// Copy fixed-width values from a scalar/array datum into an output values buffer
+template <typename Type>
+void CopyValues(const Datum& in_values, const int64_t in_offset, const int64_t length,
+ uint8_t* out_valid, uint8_t* out_values, const int64_t out_offset) {
+ if (in_values.is_scalar()) {
+ const auto& scalar = *in_values.scalar();
+ if (out_valid) {
+ BitUtil::SetBitsTo(out_valid, out_offset, length, scalar.is_valid);
+ }
+ CopyFixedWidth<Type>::CopyScalar(scalar, length, out_values, out_offset);
+ } else {
+ const ArrayData& array = *in_values.array();
+ if (out_valid) {
+ if (array.MayHaveNulls()) {
+ if (length == 1) {
+ // CopyBitmap is slow for short runs
+ BitUtil::SetBitTo(
+ out_valid, out_offset,
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + in_offset));
+ } else {
+ arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset + in_offset,
+ length, out_valid, out_offset);
+ }
+ } else {
+ BitUtil::SetBitsTo(out_valid, out_offset, length, true);
+ }
+ }
+ CopyFixedWidth<Type>::CopyArray(*array.type, array.buffers[1]->data(),
+ array.offset + in_offset, length, out_values,
+ out_offset);
+ }
+}
+
+// Specialized helper to copy a single value from a source array. Allows avoiding
+// repeatedly calling MayHaveNulls and Buffer::data() which have internal checks that
+// add up when called in a loop.
+template <typename Type>
+void CopyOneArrayValue(const DataType& type, const uint8_t* in_valid,
+ const uint8_t* in_values, const int64_t in_offset,
+ uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset) {
+ if (out_valid) {
+ BitUtil::SetBitTo(out_valid, out_offset,
+ !in_valid || BitUtil::GetBit(in_valid, in_offset));
+ }
+ CopyFixedWidth<Type>::CopyArray(type, in_values, in_offset, /*length=*/1, out_values,
+ out_offset);
+}
+
+struct CaseWhenFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ // The first argument is a struct of booleans, where the number of fields in the
+ // struct is either equal to the number of remaining arguments or one less.
+ RETURN_NOT_OK(CheckArity(*values));
+ EnsureDictionaryDecoded(values);
+ auto first_type = (*values)[0].type;
+ if (first_type->id() != Type::STRUCT) {
+ return Status::TypeError("case_when: first argument must be STRUCT, not ",
+ *first_type);
+ }
+ auto num_fields = static_cast<size_t>(first_type->num_fields());
+ if (num_fields < values->size() - 2 || num_fields >= values->size()) {
+ return Status::Invalid(
+ "case_when: number of struct fields must be equal to or one less than count of "
+ "remaining arguments (",
+ values->size() - 1, "), got: ", first_type->num_fields());
+ }
+ for (const auto& field : first_type->fields()) {
+ if (field->type()->id() != Type::BOOL) {
+ return Status::TypeError(
+ "case_when: all fields of first argument must be BOOL, but ", field->name(),
+ " was of type: ", *field->type());
+ }
+ }
+
+ if (auto type = CommonNumeric(values->data() + 1, values->size() - 1)) {
+ for (auto it = values->begin() + 1; it != values->end(); it++) {
+ it->type = type;
+ }
+ }
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
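+
+// A usage sketch (assumed inputs): the `cond` struct for "case_when" can be built
+// with the "make_struct" function; with two conditions and three cases, the last
+// case acts as the "else" value.
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum conds,
+//       arrow::compute::CallFunction("make_struct", {cond1, cond2}));
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("case_when", {conds, when1, when2, else_}));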
+
+// Implement a 'case when' (SQL)/'select' (NumPy) function for any scalar conditions
+template <typename Type>
+Status ExecScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& conds = checked_cast<const StructScalar&>(*batch.values[0].scalar());
+ if (!conds.is_valid) {
+ return Status::Invalid("cond struct must not be null");
+ }
+ Datum result;
+ for (size_t i = 0; i < batch.values.size() - 1; i++) {
+ if (i < conds.value.size()) {
+ const Scalar& cond = *conds.value[i];
+ if (cond.is_valid && internal::UnboxScalar<BooleanType>::Unbox(cond)) {
+ result = batch[i + 1];
+ break;
+ }
+ } else {
+ // ELSE clause
+ result = batch[i + 1];
+ break;
+ }
+ }
+ if (out->is_scalar()) {
+ *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type());
+ return Status::OK();
+ }
+ ArrayData* output = out->mutable_array();
+ if (!result.is_value()) {
+ // All conditions false, no 'else' argument
+ result = MakeNullScalar(out->type());
+ }
+ CopyValues<Type>(result, /*in_offset=*/0, batch.length,
+ output->GetMutableValues<uint8_t>(0, 0),
+ output->GetMutableValues<uint8_t>(1, 0), output->offset);
+ return Status::OK();
+}
+
+// Implement 'case when' for any mix of scalar/array arguments for any fixed-width type,
+// given helper functions to copy data from a source array to a target array
+template <typename Type>
+Status ExecArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& conds_array = *batch.values[0].array();
+ if (conds_array.GetNullCount() > 0) {
+ return Status::Invalid("cond struct must not have top-level nulls");
+ }
+ ArrayData* output = out->mutable_array();
+ const int64_t out_offset = output->offset;
+ const auto num_value_args = batch.values.size() - 1;
+ const bool have_else_arg =
+ static_cast<size_t>(conds_array.type->num_fields()) < num_value_args;
+ uint8_t* out_valid = output->buffers[0]->mutable_data();
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ if (have_else_arg) {
+ // Copy 'else' value into output
+ CopyValues<Type>(batch.values.back(), /*in_offset=*/0, batch.length, out_valid,
+ out_values, out_offset);
+ } else {
+ // There's no 'else' argument, so we should have an all-null validity bitmap
+ BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
+ }
+
+ // Allocate a temporary bitmap to determine which elements still need setting.
+ ARROW_ASSIGN_OR_RAISE(auto mask_buffer, ctx->AllocateBitmap(batch.length));
+ uint8_t* mask = mask_buffer->mutable_data();
+ std::memset(mask, 0xFF, mask_buffer->size());
+
+ // Then iterate through each argument in turn and set elements.
+ for (size_t i = 0; i < batch.values.size() - (have_else_arg ? 2 : 1); i++) {
+ const ArrayData& cond_array = *conds_array.child_data[i];
+ const int64_t cond_offset = conds_array.offset + cond_array.offset;
+ const uint8_t* cond_values = cond_array.buffers[1]->data();
+ const Datum& values_datum = batch[i + 1];
+ int64_t offset = 0;
+
+ if (cond_array.GetNullCount() == 0) {
+ // cond has no nulls: visit the mask & cond value bitmap simultaneously
+ BinaryBitBlockCounter counter(mask, /*start_offset=*/0, cond_values, cond_offset,
+ batch.length);
+ while (offset < batch.length) {
+ const auto block = counter.NextAndWord();
+ if (block.AllSet()) {
+ CopyValues<Type>(values_datum, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ BitUtil::SetBitsTo(mask, offset, block.length, false);
+ } else if (block.popcount) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(mask, offset + j) &&
+ BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
+ CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
+ out_values, out_offset + offset + j);
+ BitUtil::SetBitTo(mask, offset + j, false);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ } else {
+ // Visit mask & cond bitmap & cond validity
+ const uint8_t* cond_valid = cond_array.buffers[0]->data();
+ Bitmap bitmaps[3] = {{mask, /*offset=*/0, batch.length},
+ {cond_values, cond_offset, batch.length},
+ {cond_valid, cond_offset, batch.length}};
+ Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 3> words) {
+ const uint64_t word = words[0] & words[1] & words[2];
+ const int64_t block_length = std::min<int64_t>(64, batch.length - offset);
+ if (word == std::numeric_limits<uint64_t>::max()) {
+ CopyValues<Type>(values_datum, offset, block_length, out_valid, out_values,
+ out_offset + offset);
+ BitUtil::SetBitsTo(mask, offset, block_length, false);
+ } else if (word) {
+ for (int64_t j = 0; j < block_length; ++j) {
+ if (BitUtil::GetBit(mask, offset + j) &&
+ BitUtil::GetBit(cond_valid, cond_offset + offset + j) &&
+ BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
+ CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
+ out_values, out_offset + offset + j);
+ BitUtil::SetBitTo(mask, offset + j, false);
+ }
+ }
+ }
+ });
+ }
+ }
+ if (!have_else_arg) {
+ // Need to initialize any remaining null slots (uninitialized memory)
+ BitBlockCounter counter(mask, /*offset=*/0, batch.length);
+ int64_t offset = 0;
+ auto bit_width = checked_cast<const FixedWidthType&>(*out->type()).bit_width();
+ auto byte_width = BitUtil::BytesForBits(bit_width);
+ while (offset < batch.length) {
+ const auto block = counter.NextWord();
+ if (block.AllSet()) {
+ if (bit_width == 1) {
+ BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
+ } else {
+ std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
+ byte_width * block.length);
+ }
+ } else if (!block.NoneSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
+ if (bit_width == 1) {
+ BitUtil::ClearBit(out_values, out_offset + offset + j);
+ } else {
+ std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
+ byte_width);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ }
+ return Status::OK();
+}
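+
+// A worked example (assumed 4-row inputs) of the mask bookkeeping above:
+//
+//   mask (init) = 1111
+//   cond1 = 0100 -> copy case1 into row 1,        mask = 1011
+//   cond2 = 1001 -> copy case2 into rows 0 and 3, mask = 0010
+//   row 2 is left to the "else" value, or null if none was given.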
+
+template <typename Type, typename Enable = void>
+struct CaseWhenFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch.values[0].is_array()) {
+ return ExecArrayCaseWhen<Type>(ctx, batch, out);
+ }
+ return ExecScalarCaseWhen<Type>(ctx, batch, out);
+ }
+};
+
+template <>
+struct CaseWhenFunctor<NullType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+};
+
+struct CoalesceFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ EnsureDictionaryDecoded(values);
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+// Implement a 'coalesce' (SQL) operator for any number of scalar inputs
+Status ExecScalarCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.scalar()->is_valid) {
+ *out = datum;
+ break;
+ }
+ }
+ return Status::OK();
+}
+
+// Helper: copy from a source datum into all null slots of the output
+template <typename Type>
+void CopyValuesAllValid(Datum source, uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset, const int64_t length) {
+ BitBlockCounter counter(out_valid, out_offset, length);
+ int64_t offset = 0;
+ while (offset < length) {
+ const auto block = counter.NextWord();
+ if (block.NoneSet()) {
+ CopyValues<Type>(source, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ } else if (!block.AllSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (!BitUtil::GetBit(out_valid, out_offset + offset + j)) {
+ CopyValues<Type>(source, offset + j, 1, out_valid, out_values,
+ out_offset + offset + j);
+ }
+ }
+ }
+ offset += block.length;
+ }
+}
+
+// Helper: zero the values buffer of the output wherever the slot is null
+void InitializeNullSlots(const DataType& type, uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset, const int64_t length) {
+ BitBlockCounter counter(out_valid, out_offset, length);
+ int64_t offset = 0;
+ auto bit_width = checked_cast<const FixedWidthType&>(type).bit_width();
+ auto byte_width = BitUtil::BytesForBits(bit_width);
+ while (offset < length) {
+ const auto block = counter.NextWord();
+ if (block.NoneSet()) {
+ if (bit_width == 1) {
+ BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
+ } else {
+ std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
+ byte_width * block.length);
+ }
+ } else if (!block.AllSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
+ if (bit_width == 1) {
+ BitUtil::ClearBit(out_values, out_offset + offset + j);
+ } else {
+ std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
+ byte_width);
+ }
+ }
+ }
+ offset += block.length;
+ }
+}
+
+// Implement 'coalesce' for any mix of scalar/array arguments for any fixed-width type
+template <typename Type>
+Status ExecArrayCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ArrayData* output = out->mutable_array();
+ const int64_t out_offset = output->offset;
+ // Use output validity buffer as mask to decide what values to copy
+ uint8_t* out_valid = output->buffers[0]->mutable_data();
+ // Clear output buffer - no values are set initially
+ BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+
+ for (const auto& datum : batch.values) {
+ if ((datum.is_scalar() && datum.scalar()->is_valid) ||
+ (datum.is_array() && !datum.array()->MayHaveNulls())) {
+ // Valid scalar, or all-valid array
+ CopyValuesAllValid<Type>(datum, out_valid, out_values, out_offset, batch.length);
+ break;
+ } else if (datum.is_array()) {
+ // Array with nulls
+ const ArrayData& arr = *datum.array();
+ const DataType& type = *datum.type();
+ const uint8_t* in_valid = arr.buffers[0]->data();
+ const uint8_t* in_values = arr.buffers[1]->data();
+ BinaryBitBlockCounter counter(in_valid, arr.offset, out_valid, out_offset,
+ batch.length);
+ int64_t offset = 0;
+ while (offset < batch.length) {
+ const auto block = counter.NextAndNotWord();
+ if (block.AllSet()) {
+ CopyValues<Type>(datum, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ } else if (block.popcount) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (!BitUtil::GetBit(out_valid, out_offset + offset + j) &&
+ BitUtil::GetBit(in_valid, arr.offset + offset + j)) {
+ // This version lets us avoid calling MayHaveNulls() on every iteration
+ // (which does an atomic load and can add up)
+ CopyOneArrayValue<Type>(type, in_valid, in_values, arr.offset + offset + j,
+ out_valid, out_values, out_offset + offset + j);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ }
+ }
+
+ // Initialize any remaining null slots (uninitialized memory)
+ InitializeNullSlots(*out->type(), out_valid, out_values, out_offset, batch.length);
+ return Status::OK();
+}
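+
+// A minimal usage sketch (assumed inputs): "coalesce" picks the first non-null
+// value in each slot across its arguments.
+//
+//   // a = [null, 2, null], b = [1, null, null], c = scalar 99
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("coalesce", {a, b, c}));
+//   // out = [1, 2, 99]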
+
+template <typename Type, typename Enable = void>
+struct CoalesceFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.is_array()) {
+ return ExecArrayCoalesce<Type>(ctx, batch, out);
+ }
+ }
+ return ExecScalarCoalesce(ctx, batch, out);
+ }
+};
+
+template <>
+struct CoalesceFunctor<NullType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct CoalesceFunctor<Type, enable_if_base_binary<Type>> {
+ using offset_type = typename Type::offset_type;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.is_array()) {
+ return ExecArray(ctx, batch, out);
+ }
+ }
+ return ExecScalarCoalesce(ctx, batch, out);
+ }
+
+ static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Special case: grab any leading non-null scalar or array arguments
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (!datum.scalar()->is_valid) continue;
+ ARROW_ASSIGN_OR_RAISE(
+ *out, MakeArrayFromScalar(*datum.scalar(), batch.length, ctx->memory_pool()));
+ return Status::OK();
+ } else if (datum.is_array() && !datum.array()->MayHaveNulls()) {
+ *out = datum;
+ return Status::OK();
+ }
+ break;
+ }
+ ArrayData* output = out->mutable_array();
+ BuilderType builder(batch[0].type(), ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ for (int64_t i = 0; i < batch.length; i++) {
+ bool set = false;
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (datum.scalar()->is_valid) {
+ RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(*datum.scalar())));
+ set = true;
+ break;
+ }
+ } else {
+ const ArrayData& source = *datum.array();
+ if (!source.MayHaveNulls() ||
+ BitUtil::GetBit(source.buffers[0]->data(), source.offset + i)) {
+ const uint8_t* data = source.buffers[2]->data();
+ const offset_type* offsets = source.GetValues<offset_type>(1);
+ const offset_type offset0 = offsets[i];
+ const offset_type offset1 = offsets[i + 1];
+ RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
+ set = true;
+ break;
+ }
+ }
+ }
+ if (!set) RETURN_NOT_OK(builder.AppendNull());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish());
+ *output = *temp_output->data();
+ // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
+ output->type = batch[0].type();
+ return Status::OK();
+ }
+};
+
+Result<ValueDescr> LastType(KernelContext*, const std::vector<ValueDescr>& descrs) {
+ ValueDescr result = descrs.back();
+ result.shape = GetBroadcastShape(descrs);
+ return result;
+}
+
+void AddCaseWhenKernel(const std::shared_ptr<CaseWhenFunction>& scalar_function,
+ detail::GetTypeId get_id, ArrayKernelExec exec) {
+ ScalarKernel kernel(
+ KernelSignature::Make({InputType(Type::STRUCT), InputType(get_id.id)},
+ OutputType(LastType),
+ /*is_varargs=*/true),
+ exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = is_fixed_width(get_id.id);
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveCaseWhenKernels(const std::shared_ptr<CaseWhenFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec = GenerateTypeAgnosticPrimitive<CaseWhenFunctor>(*type);
+ AddCaseWhenKernel(scalar_function, type, std::move(exec));
+ }
+}
+
+void AddCoalesceKernel(const std::shared_ptr<ScalarFunction>& scalar_function,
+ detail::GetTypeId get_id, ArrayKernelExec exec) {
+ ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, OutputType(FirstType),
+ /*is_varargs=*/true),
+ exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = is_fixed_width(get_id.id);
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveCoalesceKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec = GenerateTypeAgnosticPrimitive<CoalesceFunctor>(*type);
+ AddCoalesceKernel(scalar_function, type, std::move(exec));
+ }
+}
+
+const FunctionDoc if_else_doc{"Choose values based on a condition",
+ ("`cond` must be a Boolean scalar or array.\n"
+ "`left` and `right` must be scalars or arrays of the same type.\n"
+ "Null values in `cond` will be promoted to the output."),
+ {"cond", "left", "right"}};
+
+const FunctionDoc case_when_doc{
+ "Choose values based on multiple conditions",
+ ("`cond` must be a struct of Boolean values. `cases` can be a mix "
+ "of scalar and array arguments (of any type, but all must be the "
+ "same type or castable to a common type), with either exactly one "
+ "datum per child of `cond`, or one more `cases` than children of "
+ "`cond` (in which case we have an \"else\" value).\n"
+ "Each row of the output will be the corresponding value of the "
+ "first datum in `cases` for which the corresponding child of `cond` "
+ "is true, or otherwise the \"else\" value (if given), or null. "
+ "Essentially, this implements a switch-case or if-else, if-else... "
+ "statement."),
+ {"cond", "*cases"}};
+
+const FunctionDoc coalesce_doc{
+ "Select the first non-null value in each slot",
+ ("Each row of the output will be the value from the first corresponding input "
+ "for which the value is not null. If all inputs are null in a row, the output "
+ "will be null."),
+ {"*values"}};
+} // namespace
+
+void RegisterScalarIfElse(FunctionRegistry* registry) {
+ {
+ auto func =
+ std::make_shared<IfElseFunction>("if_else", Arity::Ternary(), &if_else_doc);
+
+ AddPrimitiveIfElseKernels(func, NumericTypes());
+ AddPrimitiveIfElseKernels(func, TemporalTypes());
+ AddPrimitiveIfElseKernels(func, {boolean(), day_time_interval(), month_interval()});
+ AddNullIfElseKernel(func);
+ AddBinaryIfElseKernels(func, BaseBinaryTypes());
+ AddFSBinaryIfElseKernel(func);
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<CaseWhenFunction>(
+ "case_when", Arity::VarArgs(/*min_args=*/1), &case_when_doc);
+ AddPrimitiveCaseWhenKernels(func, NumericTypes());
+ AddPrimitiveCaseWhenKernels(func, TemporalTypes());
+ AddPrimitiveCaseWhenKernels(
+ func, {boolean(), null(), day_time_interval(), month_interval()});
+ AddCaseWhenKernel(func, Type::FIXED_SIZE_BINARY,
+ CaseWhenFunctor<FixedSizeBinaryType>::Exec);
+ AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor<Decimal128Type>::Exec);
+ AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor<Decimal256Type>::Exec);
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<CoalesceFunction>(
+ "coalesce", Arity::VarArgs(/*min_args=*/1), &coalesce_doc);
+ AddPrimitiveCoalesceKernels(func, NumericTypes());
+ AddPrimitiveCoalesceKernels(func, TemporalTypes());
+ AddPrimitiveCoalesceKernels(
+ func, {boolean(), null(), day_time_interval(), month_interval()});
+ AddCoalesceKernel(func, Type::FIXED_SIZE_BINARY,
+ CoalesceFunctor<FixedSizeBinaryType>::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL128, CoalesceFunctor<Decimal128Type>::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL256, CoalesceFunctor<Decimal256Type>::Exec);
+ for (const auto& ty : BaseBinaryTypes()) {
+ AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase<CoalesceFunctor>(ty));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
index e9f0696c8fd..7810577b1fe 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
@@ -18,7 +18,7 @@
// Vector kernels involving nested types
#include "arrow/array/array_base.h"
-#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/result.h"
#include "arrow/util/bit_block_counter.h"
@@ -29,7 +29,7 @@ namespace internal {
namespace {
template <typename Type, typename offset_type = typename Type::offset_type>
-Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
using ScalarType = typename TypeTraits<Type>::ScalarType;
using OffsetScalarType = typename TypeTraits<Type>::OffsetScalarType;
@@ -51,131 +51,131 @@ Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
static_cast<offset_type>(arg0.value->length());
}
}
-
- return Status::OK();
-}
-
-const FunctionDoc list_value_length_doc{
- "Compute list lengths",
- ("`lists` must have a list-like type.\n"
- "For each non-null value in `lists`, its length is emitted.\n"
- "Null values emit a null in the output."),
- {"lists"}};
-
-Result<ValueDescr> MakeStructResolve(KernelContext* ctx,
- const std::vector<ValueDescr>& descrs) {
- auto names = OptionsWrapper<MakeStructOptions>::Get(ctx).field_names;
- auto nullable = OptionsWrapper<MakeStructOptions>::Get(ctx).field_nullability;
- auto metadata = OptionsWrapper<MakeStructOptions>::Get(ctx).field_metadata;
-
- if (names.size() == 0) {
- names.resize(descrs.size());
- nullable.resize(descrs.size(), true);
- metadata.resize(descrs.size(), nullptr);
- int i = 0;
- for (auto& name : names) {
- name = std::to_string(i++);
- }
- } else if (names.size() != descrs.size() || nullable.size() != descrs.size() ||
- metadata.size() != descrs.size()) {
- return Status::Invalid("make_struct() was passed ", descrs.size(), " arguments but ",
- names.size(), " field names, ", nullable.size(),
- " nullability bits, and ", metadata.size(),
- " metadata dictionaries.");
- }
-
- size_t i = 0;
- FieldVector fields(descrs.size());
-
- ValueDescr::Shape shape = ValueDescr::SCALAR;
- for (const ValueDescr& descr : descrs) {
- if (descr.shape != ValueDescr::SCALAR) {
- shape = ValueDescr::ARRAY;
- } else {
- switch (descr.type->id()) {
- case Type::EXTENSION:
- case Type::DENSE_UNION:
- case Type::SPARSE_UNION:
- return Status::NotImplemented("Broadcasting scalars of type ", *descr.type);
- default:
- break;
- }
- }
-
- fields[i] =
- field(std::move(names[i]), descr.type, nullable[i], std::move(metadata[i]));
- ++i;
- }
-
- return ValueDescr{struct_(std::move(fields)), shape};
-}
-
-Status MakeStructExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(auto descr, MakeStructResolve(ctx, batch.GetDescriptors()));
-
- for (int i = 0; i < batch.num_values(); ++i) {
- const auto& field = checked_cast<const StructType&>(*descr.type).field(i);
- if (batch[i].null_count() > 0 && !field->nullable()) {
- return Status::Invalid("Output field ", field, " (#", i,
- ") does not allow nulls but the corresponding "
- "argument was not entirely valid.");
- }
- }
-
- if (descr.shape == ValueDescr::SCALAR) {
- ScalarVector scalars(batch.num_values());
- for (int i = 0; i < batch.num_values(); ++i) {
- scalars[i] = batch[i].scalar();
- }
-
- *out =
- Datum(std::make_shared<StructScalar>(std::move(scalars), std::move(descr.type)));
- return Status::OK();
- }
-
- ArrayVector arrays(batch.num_values());
- for (int i = 0; i < batch.num_values(); ++i) {
- if (batch[i].is_array()) {
- arrays[i] = batch[i].make_array();
- continue;
- }
-
- ARROW_ASSIGN_OR_RAISE(arrays[i], MakeArrayFromScalar(*batch[i].scalar(), batch.length,
- ctx->memory_pool()));
- }
-
- *out = std::make_shared<StructArray>(descr.type, batch.length, std::move(arrays));
- return Status::OK();
+
+ return Status::OK();
}
-const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray",
- ("Names of the StructArray's fields are\n"
- "specified through MakeStructOptions."),
- {"*args"},
- "MakeStructOptions"};
-
+const FunctionDoc list_value_length_doc{
+ "Compute list lengths",
+ ("`lists` must have a list-like type.\n"
+ "For each non-null value in `lists`, its length is emitted.\n"
+ "Null values emit a null in the output."),
+ {"lists"}};
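+
+// An illustrative sketch (assumes the public CallFunction API):
+//
+//   // lists = [[1, 2], [], null, [7]]
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//                         CallFunction("list_value_length", {lists}));
+//   // out = [2, 0, null, 1]  (int32 for list, int64 for large_list)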
+
+Result<ValueDescr> MakeStructResolve(KernelContext* ctx,
+ const std::vector<ValueDescr>& descrs) {
+ auto names = OptionsWrapper<MakeStructOptions>::Get(ctx).field_names;
+ auto nullable = OptionsWrapper<MakeStructOptions>::Get(ctx).field_nullability;
+ auto metadata = OptionsWrapper<MakeStructOptions>::Get(ctx).field_metadata;
+
+ if (names.size() == 0) {
+ names.resize(descrs.size());
+ nullable.resize(descrs.size(), true);
+ metadata.resize(descrs.size(), nullptr);
+ int i = 0;
+ for (auto& name : names) {
+ name = std::to_string(i++);
+ }
+ } else if (names.size() != descrs.size() || nullable.size() != descrs.size() ||
+ metadata.size() != descrs.size()) {
+ return Status::Invalid("make_struct() was passed ", descrs.size(), " arguments but ",
+ names.size(), " field names, ", nullable.size(),
+ " nullability bits, and ", metadata.size(),
+ " metadata dictionaries.");
+ }
+
+ size_t i = 0;
+ FieldVector fields(descrs.size());
+
+ ValueDescr::Shape shape = ValueDescr::SCALAR;
+ for (const ValueDescr& descr : descrs) {
+ if (descr.shape != ValueDescr::SCALAR) {
+ shape = ValueDescr::ARRAY;
+ } else {
+ switch (descr.type->id()) {
+ case Type::EXTENSION:
+ case Type::DENSE_UNION:
+ case Type::SPARSE_UNION:
+ return Status::NotImplemented("Broadcasting scalars of type ", *descr.type);
+ default:
+ break;
+ }
+ }
+
+ fields[i] =
+ field(std::move(names[i]), descr.type, nullable[i], std::move(metadata[i]));
+ ++i;
+ }
+
+ return ValueDescr{struct_(std::move(fields)), shape};
+}
+
+Status MakeStructExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto descr, MakeStructResolve(ctx, batch.GetDescriptors()));
+
+ for (int i = 0; i < batch.num_values(); ++i) {
+ const auto& field = checked_cast<const StructType&>(*descr.type).field(i);
+ if (batch[i].null_count() > 0 && !field->nullable()) {
+ return Status::Invalid("Output field ", field, " (#", i,
+ ") does not allow nulls but the corresponding "
+ "argument was not entirely valid.");
+ }
+ }
+
+ if (descr.shape == ValueDescr::SCALAR) {
+ ScalarVector scalars(batch.num_values());
+ for (int i = 0; i < batch.num_values(); ++i) {
+ scalars[i] = batch[i].scalar();
+ }
+
+ *out =
+ Datum(std::make_shared<StructScalar>(std::move(scalars), std::move(descr.type)));
+ return Status::OK();
+ }
+
+ ArrayVector arrays(batch.num_values());
+ for (int i = 0; i < batch.num_values(); ++i) {
+ if (batch[i].is_array()) {
+ arrays[i] = batch[i].make_array();
+ continue;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(arrays[i], MakeArrayFromScalar(*batch[i].scalar(), batch.length,
+ ctx->memory_pool()));
+ }
+
+ *out = std::make_shared<StructArray>(descr.type, batch.length, std::move(arrays));
+ return Status::OK();
+}
+
+const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray",
+ ("Names of the StructArray's fields are\n"
+ "specified through MakeStructOptions."),
+ {"*args"},
+ "MakeStructOptions"};
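+
+// An illustrative sketch (assumes the public CallFunction API and the
+// MakeStructOptions field-name constructor):
+//
+//   // xs = [1, 2], ys = ["a", "b"]
+//   MakeStructOptions opts({"x", "y"});
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//                         CallFunction("make_struct", {xs, ys}, &opts));
+//   // out = [{x: 1, y: "a"}, {x: 2, y: "b"}]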
+
} // namespace
void RegisterScalarNested(FunctionRegistry* registry) {
- auto list_value_length = std::make_shared<ScalarFunction>(
- "list_value_length", Arity::Unary(), &list_value_length_doc);
+ auto list_value_length = std::make_shared<ScalarFunction>(
+ "list_value_length", Arity::Unary(), &list_value_length_doc);
DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(),
ListValueLength<ListType>));
DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(),
ListValueLength<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(list_value_length)));
-
- static MakeStructOptions kDefaultMakeStructOptions;
- auto make_struct_function = std::make_shared<ScalarFunction>(
- "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions);
-
- ScalarKernel kernel{KernelSignature::Make({InputType{}}, OutputType{MakeStructResolve},
- /*is_varargs=*/true),
- MakeStructExec, OptionsWrapper<MakeStructOptions>::Init};
- kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- DCHECK_OK(make_struct_function->AddKernel(std::move(kernel)));
- DCHECK_OK(registry->AddFunction(std::move(make_struct_function)));
+
+ static MakeStructOptions kDefaultMakeStructOptions;
+ auto make_struct_function = std::make_shared<ScalarFunction>(
+ "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions);
+
+ ScalarKernel kernel{KernelSignature::Make({InputType{}}, OutputType{MakeStructResolve},
+ /*is_varargs=*/true),
+ MakeStructExec, OptionsWrapper<MakeStructOptions>::Init};
+ kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(make_struct_function->AddKernel(std::move(kernel)));
+ DCHECK_OK(registry->AddFunction(std::move(make_struct_function)));
}
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
index 3e2e95e5401..8fe28aae920 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
@@ -18,9 +18,9 @@
#include "arrow/array/array_base.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/cast.h"
+#include "arrow/compute/cast.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_writer.h"
#include "arrow/util/hashing.h"
@@ -37,68 +37,68 @@ namespace {
template <typename Type>
struct SetLookupState : public KernelState {
- explicit SetLookupState(MemoryPool* pool) : lookup_table(pool, 0) {}
+ explicit SetLookupState(MemoryPool* pool) : lookup_table(pool, 0) {}
Status Init(const SetLookupOptions& options) {
- if (options.value_set.kind() == Datum::ARRAY) {
- const ArrayData& value_set = *options.value_set.array();
- memo_index_to_value_index.reserve(value_set.length);
- RETURN_NOT_OK(AddArrayValueSet(options, *options.value_set.array()));
- } else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) {
- const ChunkedArray& value_set = *options.value_set.chunked_array();
- memo_index_to_value_index.reserve(value_set.length());
- int64_t offset = 0;
- for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
- RETURN_NOT_OK(AddArrayValueSet(options, *chunk->data(), offset));
- offset += chunk->length();
- }
- } else {
- return Status::Invalid("value_set should be an array or chunked array");
- }
- if (!options.skip_nulls && lookup_table.GetNull() >= 0) {
- null_index = memo_index_to_value_index[lookup_table.GetNull()];
- }
- return Status::OK();
- }
-
- Status AddArrayValueSet(const SetLookupOptions& options, const ArrayData& data,
- int64_t start_index = 0) {
+ if (options.value_set.kind() == Datum::ARRAY) {
+ const ArrayData& value_set = *options.value_set.array();
+ memo_index_to_value_index.reserve(value_set.length);
+ RETURN_NOT_OK(AddArrayValueSet(options, *options.value_set.array()));
+ } else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) {
+ const ChunkedArray& value_set = *options.value_set.chunked_array();
+ memo_index_to_value_index.reserve(value_set.length());
+ int64_t offset = 0;
+ for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
+ RETURN_NOT_OK(AddArrayValueSet(options, *chunk->data(), offset));
+ offset += chunk->length();
+ }
+ } else {
+ return Status::Invalid("value_set should be an array or chunked array");
+ }
+ if (!options.skip_nulls && lookup_table.GetNull() >= 0) {
+ null_index = memo_index_to_value_index[lookup_table.GetNull()];
+ }
+ return Status::OK();
+ }
+
+ Status AddArrayValueSet(const SetLookupOptions& options, const ArrayData& data,
+ int64_t start_index = 0) {
using T = typename GetViewType<Type>::T;
- int32_t index = static_cast<int32_t>(start_index);
+ int32_t index = static_cast<int32_t>(start_index);
auto visit_valid = [&](T v) {
- const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
+ const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
int32_t unused_memo_index;
- auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
- auto on_not_found = [&](int32_t memo_index) {
- DCHECK_EQ(memo_index, memo_size);
- memo_index_to_value_index.push_back(index);
- };
- RETURN_NOT_OK(lookup_table.GetOrInsert(
- v, std::move(on_found), std::move(on_not_found), &unused_memo_index));
- ++index;
- return Status::OK();
+ auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
+ auto on_not_found = [&](int32_t memo_index) {
+ DCHECK_EQ(memo_index, memo_size);
+ memo_index_to_value_index.push_back(index);
+ };
+ RETURN_NOT_OK(lookup_table.GetOrInsert(
+ v, std::move(on_found), std::move(on_not_found), &unused_memo_index));
+ ++index;
+ return Status::OK();
};
auto visit_null = [&]() {
- const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
- auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
- auto on_not_found = [&](int32_t memo_index) {
- DCHECK_EQ(memo_index, memo_size);
- memo_index_to_value_index.push_back(index);
- };
- lookup_table.GetOrInsertNull(std::move(on_found), std::move(on_not_found));
- ++index;
+ const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
+ auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
+ auto on_not_found = [&](int32_t memo_index) {
+ DCHECK_EQ(memo_index, memo_size);
+ memo_index_to_value_index.push_back(index);
+ };
+ lookup_table.GetOrInsertNull(std::move(on_found), std::move(on_not_found));
+ ++index;
return Status::OK();
};
-
- return VisitArrayDataInline<Type>(data, visit_valid, visit_null);
+
+ return VisitArrayDataInline<Type>(data, visit_valid, visit_null);
}
using MemoTable = typename HashTraits<Type>::MemoTableType;
MemoTable lookup_table;
- // When there are duplicates in value_set, the MemoTable indices must
- // be mapped back to indices in the value_set.
- std::vector<int32_t> memo_index_to_value_index;
- int32_t null_index = -1;
+ // When there are duplicates in value_set, the MemoTable indices must
+ // be mapped back to indices in the value_set.
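+  // For example (illustrative): with value_set [5, 7, 5], the memo table maps
+  // 5 -> memo index 0 and 7 -> memo index 1, so memo_index_to_value_index is
+  // {0, 1} and a lookup of 5 reports value_set index 0 (its first occurrence).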
+ std::vector<int32_t> memo_index_to_value_index;
+ int32_t null_index = -1;
};
template <>
@@ -106,11 +106,11 @@ struct SetLookupState<NullType> : public KernelState {
explicit SetLookupState(MemoryPool*) {}
Status Init(const SetLookupOptions& options) {
- value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls;
+ value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls;
return Status::OK();
}
- bool value_set_has_null;
+ bool value_set_has_null;
};
// TODO: Put this concept somewhere reusable
@@ -140,20 +140,20 @@ struct UnsignedIntType<8> {
// Constructing the type requires a type parameter
struct InitStateVisitor {
KernelContext* ctx;
- SetLookupOptions options;
- const std::shared_ptr<DataType>& arg_type;
+ SetLookupOptions options;
+ const std::shared_ptr<DataType>& arg_type;
std::unique_ptr<KernelState> result;
- InitStateVisitor(KernelContext* ctx, const KernelInitArgs& args)
- : ctx(ctx),
- options(*checked_cast<const SetLookupOptions*>(args.options)),
- arg_type(args.inputs[0].type) {}
+ InitStateVisitor(KernelContext* ctx, const KernelInitArgs& args)
+ : ctx(ctx),
+ options(*checked_cast<const SetLookupOptions*>(args.options)),
+ arg_type(args.inputs[0].type) {}
template <typename Type>
Status Init() {
using StateType = SetLookupState<Type>;
result.reset(new StateType(ctx->exec_context()->memory_pool()));
- return static_cast<StateType*>(result.get())->Init(options);
+ return static_cast<StateType*>(result.get())->Init(options);
}
Status Visit(const DataType&) { return Init<NullType>(); }
@@ -177,26 +177,26 @@ struct InitStateVisitor {
// Handle Decimal128Type, FixedSizeBinaryType
Status Visit(const FixedSizeBinaryType& type) { return Init<FixedSizeBinaryType>(); }
- Result<std::unique_ptr<KernelState>> GetResult() {
- if (!options.value_set.type()->Equals(arg_type)) {
- ARROW_ASSIGN_OR_RAISE(
- options.value_set,
- Cast(options.value_set, CastOptions::Safe(arg_type), ctx->exec_context()));
- }
-
- RETURN_NOT_OK(VisitTypeInline(*arg_type, this));
- return std::move(result);
+ Result<std::unique_ptr<KernelState>> GetResult() {
+ if (!options.value_set.type()->Equals(arg_type)) {
+ ARROW_ASSIGN_OR_RAISE(
+ options.value_set,
+ Cast(options.value_set, CastOptions::Safe(arg_type), ctx->exec_context()));
+ }
+
+ RETURN_NOT_OK(VisitTypeInline(*arg_type, this));
+ return std::move(result);
}
};
-Result<std::unique_ptr<KernelState>> InitSetLookup(KernelContext* ctx,
- const KernelInitArgs& args) {
- if (args.options == nullptr) {
- return Status::Invalid(
- "Attempted to call a set lookup function without SetLookupOptions");
- }
-
- return InitStateVisitor{ctx, args}.GetResult();
+Result<std::unique_ptr<KernelState>> InitSetLookup(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ if (args.options == nullptr) {
+ return Status::Invalid(
+ "Attempted to call a set lookup function without SetLookupOptions");
+ }
+
+ return InitStateVisitor{ctx, args}.GetResult();
}
struct IndexInVisitor {
@@ -208,18 +208,18 @@ struct IndexInVisitor {
IndexInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
: ctx(ctx), data(data), out(out), builder(ctx->exec_context()->memory_pool()) {}
- Status Visit(const DataType& type) {
- DCHECK_EQ(type.id(), Type::NA);
+ Status Visit(const DataType& type) {
+ DCHECK_EQ(type.id(), Type::NA);
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
if (data.length != 0) {
- // skip_nulls is honored for consistency with other types
- if (state.value_set_has_null) {
+ // skip_nulls is honored for consistency with other types
+ if (state.value_set_has_null) {
RETURN_NOT_OK(this->builder.Reserve(data.length));
for (int64_t i = 0; i < data.length; ++i) {
this->builder.UnsafeAppend(0);
}
- } else {
- RETURN_NOT_OK(this->builder.AppendNulls(data.length));
+ } else {
+ RETURN_NOT_OK(this->builder.AppendNulls(data.length));
}
}
return Status::OK();
@@ -238,16 +238,16 @@ struct IndexInVisitor {
int32_t index = state.lookup_table.Get(v);
if (index != -1) {
// matching needle; output index from value_set
- this->builder.UnsafeAppend(state.memo_index_to_value_index[index]);
+ this->builder.UnsafeAppend(state.memo_index_to_value_index[index]);
} else {
// no matching needle; output null
this->builder.UnsafeAppendNull();
}
},
[&]() {
- if (state.null_index != -1) {
+ if (state.null_index != -1) {
// value_set included null
- this->builder.UnsafeAppend(state.null_index);
+ this->builder.UnsafeAppend(state.null_index);
} else {
// value_set does not include null; output null
this->builder.UnsafeAppendNull();
@@ -290,13 +290,13 @@ struct IndexInVisitor {
}
};
-Status ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return IndexInVisitor(ctx, *batch[0].array(), out).Execute();
+Status ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return IndexInVisitor(ctx, *batch[0].array(), out).Execute();
}
// ----------------------------------------------------------------------
-// IsIn writes the results into a preallocated boolean data bitmap
+// IsIn writes the results into a preallocated boolean data bitmap
struct IsInVisitor {
KernelContext* ctx;
const ArrayData& data;
@@ -305,13 +305,13 @@ struct IsInVisitor {
IsInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
: ctx(ctx), data(data), out(out) {}
- Status Visit(const DataType& type) {
- DCHECK_EQ(type.id(), Type::NA);
+ Status Visit(const DataType& type) {
+ DCHECK_EQ(type.id(), Type::NA);
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
ArrayData* output = out->mutable_array();
- // skip_nulls is honored for consistency with other types
- BitUtil::SetBitsTo(output->buffers[1]->mutable_data(), output->offset, output->length,
- state.value_set_has_null);
+ // skip_nulls is honored for consistency with other types
+ BitUtil::SetBitsTo(output->buffers[1]->mutable_data(), output->offset, output->length,
+ state.value_set_has_null);
return Status::OK();
}
@@ -323,7 +323,7 @@ struct IsInVisitor {
FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), output->offset,
output->length);
-
+
VisitArrayDataInline<Type>(
this->data,
[&](T v) {
@@ -335,11 +335,11 @@ struct IsInVisitor {
writer.Next();
},
[&]() {
- if (state.null_index != -1) {
- writer.Set();
- } else {
- writer.Clear();
- }
+ if (state.null_index != -1) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
writer.Next();
});
writer.Finish();
@@ -370,8 +370,8 @@ struct IsInVisitor {
Status Execute() { return VisitTypeInline(*data.type, this); }
};
-Status ExecIsIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return IsInVisitor(ctx, *batch[0].array(), out).Execute();
+Status ExecIsIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return IsInVisitor(ctx, *batch[0].array(), out).Execute();
}
// Unary set lookup kernels available for the following input types
@@ -408,8 +408,8 @@ void AddBasicSetLookupKernels(ScalarKernel kernel,
// Enables calling is_in with CallFunction as though it were binary.
class IsInMetaBinary : public MetaFunction {
public:
- IsInMetaBinary()
- : MetaFunction("is_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
+ IsInMetaBinary()
+ : MetaFunction("is_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -424,8 +424,8 @@ class IsInMetaBinary : public MetaFunction {
// Enables calling index_in with CallFunction as though it were binary.
class IndexInMetaBinary : public MetaFunction {
public:
- IndexInMetaBinary()
- : MetaFunction("index_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
+ IndexInMetaBinary()
+ : MetaFunction("index_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -437,46 +437,46 @@ class IndexInMetaBinary : public MetaFunction {
}
};
-struct SetLookupFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- EnsureDictionaryDecoded(values);
- return DispatchExact(*values);
- }
-};
-
-const FunctionDoc is_in_doc{
- "Find each element in a set of values",
- ("For each element in `values`, return true if it is found in a given\n"
- "set of values, false otherwise.\n"
- "The set of values to look for must be given in SetLookupOptions.\n"
-     "By default, nulls are matched against the value set; this can be\n"
- "changed in SetLookupOptions."),
- {"values"},
- "SetLookupOptions"};
-
-const FunctionDoc index_in_doc{
- "Return index of each element in a set of values",
- ("For each element in `values`, return its index in a given set of\n"
- "values, or null if it is not found there.\n"
- "The set of values to look for must be given in SetLookupOptions.\n"
-     "By default, nulls are matched against the value set; this can be\n"
- "changed in SetLookupOptions."),
- {"values"},
- "SetLookupOptions"};
-
+struct SetLookupFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ EnsureDictionaryDecoded(values);
+ return DispatchExact(*values);
+ }
+};
+
+const FunctionDoc is_in_doc{
+ "Find each element in a set of values",
+ ("For each element in `values`, return true if it is found in a given\n"
+ "set of values, false otherwise.\n"
+ "The set of values to look for must be given in SetLookupOptions.\n"
+     "By default, nulls are matched against the value set; this can be\n"
+ "changed in SetLookupOptions."),
+ {"values"},
+ "SetLookupOptions"};
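+
+// An illustrative sketch (assumes the public CallFunction API):
+//
+//   // values = [1, 3, null], value_set = [1, null]
+//   SetLookupOptions opts(value_set, /*skip_nulls=*/false);
+//   ARROW_ASSIGN_OR_RAISE(Datum out, CallFunction("is_in", {values}, &opts));
+//   // out = [true, false, true]  (null matches because value_set has a null)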
+
+const FunctionDoc index_in_doc{
+ "Return index of each element in a set of values",
+ ("For each element in `values`, return its index in a given set of\n"
+ "values, or null if it is not found there.\n"
+     "By default, nulls are matched against the value set; this can be\n"
+ "By default, nulls are matched against the value set, this can be\n"
+ "changed in SetLookupOptions."),
+ {"values"},
+ "SetLookupOptions"};
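+
+// An illustrative sketch (assumes the public CallFunction API):
+//
+//   // values = ["b", "a", "c"], value_set = ["a", "b"]
+//   SetLookupOptions opts(value_set);
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//                         CallFunction("index_in", {values}, &opts));
+//   // out = [1, 0, null]  (indices into value_set; no match -> null)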
+
} // namespace
void RegisterScalarSetLookup(FunctionRegistry* registry) {
- // IsIn writes its boolean output into preallocated memory
+ // IsIn writes its boolean output into preallocated memory
{
ScalarKernel isin_base;
isin_base.init = InitSetLookup;
- isin_base.exec =
- TrivialScalarUnaryAsArraysExec(ExecIsIn, NullHandling::OUTPUT_NOT_NULL);
- isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
- auto is_in = std::make_shared<SetLookupFunction>("is_in", Arity::Unary(), &is_in_doc);
+ isin_base.exec =
+ TrivialScalarUnaryAsArraysExec(ExecIsIn, NullHandling::OUTPUT_NOT_NULL);
+ isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
+ auto is_in = std::make_shared<SetLookupFunction>("is_in", Arity::Unary(), &is_in_doc);
AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get());
@@ -491,12 +491,12 @@ void RegisterScalarSetLookup(FunctionRegistry* registry) {
{
ScalarKernel index_in_base;
index_in_base.init = InitSetLookup;
- index_in_base.exec = TrivialScalarUnaryAsArraysExec(
- ExecIndexIn, NullHandling::COMPUTED_NO_PREALLOCATE);
+ index_in_base.exec = TrivialScalarUnaryAsArraysExec(
+ ExecIndexIn, NullHandling::COMPUTED_NO_PREALLOCATE);
index_in_base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
index_in_base.mem_allocation = MemAllocation::NO_PREALLOCATE;
- auto index_in =
- std::make_shared<SetLookupFunction>("index_in", Arity::Unary(), &index_in_doc);
+ auto index_in =
+ std::make_shared<SetLookupFunction>("index_in", Arity::Unary(), &index_in_doc);
AddBasicSetLookupKernels(index_in_base, /*output_type=*/int32(), index_in.get());
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
index ab0a490eeb3..ce37b089b6f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -17,55 +17,55 @@
#include <algorithm>
#include <cctype>
-#include <iterator>
+#include <iterator>
#include <string>
#ifdef ARROW_WITH_UTF8PROC
#include <utf8proc.h>
#endif
-#ifdef ARROW_WITH_RE2
-#include <re2/re2.h>
-#endif
-
-#include "arrow/array/builder_binary.h"
-#include "arrow/array/builder_nested.h"
-#include "arrow/buffer_builder.h"
-
-#include "arrow/builder.h"
+#ifdef ARROW_WITH_RE2
+#include <re2/re2.h>
+#endif
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/buffer_builder.h"
+
+#include "arrow/builder.h"
#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/util/checked_cast.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/utf8.h"
#include "arrow/util/value_parsing.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
-
-using internal::checked_cast;
-
+
+using internal::checked_cast;
+
namespace compute {
namespace internal {
namespace {
-#ifdef ARROW_WITH_RE2
-util::string_view ToStringView(re2::StringPiece piece) {
- return {piece.data(), piece.length()};
-}
-
-re2::StringPiece ToStringPiece(util::string_view view) {
- return {view.data(), view.length()};
-}
-
-Status RegexStatus(const RE2& regex) {
- if (!regex.ok()) {
- return Status::Invalid("Invalid regular expression: ", regex.error());
- }
- return Status::OK();
-}
-#endif
-
+#ifdef ARROW_WITH_RE2
+util::string_view ToStringView(re2::StringPiece piece) {
+ return {piece.data(), piece.length()};
+}
+
+re2::StringPiece ToStringPiece(util::string_view view) {
+ return {view.data(), view.length()};
+}
+
+Status RegexStatus(const RE2& regex) {
+ if (!regex.ok()) {
+ return Status::Invalid("Invalid regular expression: ", regex.error());
+ }
+ return Status::OK();
+}
+#endif
+
// Code units in the range [a-z] can only be an encoding of an ASCII
// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different
// codepoint. This is guaranteed by the non-overlap design of the Unicode standard. (see
@@ -88,20 +88,20 @@ static inline bool IsAsciiCharacter(T character) {
struct BinaryLength {
template <typename OutValue, typename Arg0Value = util::string_view>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
return static_cast<OutValue>(val.size());
}
};
-struct Utf8Length {
- template <typename OutValue, typename Arg0Value = util::string_view>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
- auto str = reinterpret_cast<const uint8_t*>(val.data());
- auto strlen = val.size();
- return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
- }
-};
-
+struct Utf8Length {
+ template <typename OutValue, typename Arg0Value = util::string_view>
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ auto str = reinterpret_cast<const uint8_t*>(val.data());
+ auto strlen = val.size();
+ return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
+ }
+};
+
#ifdef ARROW_WITH_UTF8PROC
// Direct lookup tables for unicode properties
@@ -124,239 +124,239 @@ void EnsureLookupTablesFilled() {
});
}
-#else
-
-void EnsureLookupTablesFilled() {}
-
-#endif // ARROW_WITH_UTF8PROC
-
-constexpr int64_t kTransformError = -1;
-
-struct StringTransformBase {
- virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return Status::OK();
- }
-
- // Return the maximum total size of the output in codeunits (i.e. bytes)
- // given input characteristics.
- virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
- return input_ncodeunits;
- }
-
- virtual Status InvalidStatus() {
- return Status::Invalid("Invalid UTF8 sequence in input");
- }
-
- // Derived classes should also define this method:
- // int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- // uint8_t* output);
-};
-
-template <typename Type, typename StringTransform>
-struct StringTransformExecBase {
+#else
+
+void EnsureLookupTablesFilled() {}
+
+#endif // ARROW_WITH_UTF8PROC
+
+constexpr int64_t kTransformError = -1;
+
+struct StringTransformBase {
+ virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+
+ // Return the maximum total size of the output in codeunits (i.e. bytes)
+ // given input characteristics.
+ virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
+ return input_ncodeunits;
+ }
+
+ virtual Status InvalidStatus() {
+ return Status::Invalid("Invalid UTF8 sequence in input");
+ }
+
+ // Derived classes should also define this method:
+ // int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ // uint8_t* output);
+};
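+
+// A minimal derived-transform sketch (illustrative): an identity transform
+// copies each input string unchanged and reports the number of bytes written,
+// matching the Transform contract documented above.
+//
+//   struct IdentityTransform : public StringTransformBase {
+//     int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+//                       uint8_t* output) {
+//       std::copy(input, input + input_string_ncodeunits, output);
+//       return input_string_ncodeunits;
+//     }
+//   };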
+
+template <typename Type, typename StringTransform>
+struct StringTransformExecBase {
using offset_type = typename Type::offset_type;
using ArrayType = typename TypeTraits<Type>::ArrayType;
- static Status Execute(KernelContext* ctx, StringTransform* transform,
- const ExecBatch& batch, Datum* out) {
- if (batch[0].kind() == Datum::ARRAY) {
- return ExecArray(ctx, transform, batch[0].array(), out);
+ static Status Execute(KernelContext* ctx, StringTransform* transform,
+ const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::ARRAY) {
+ return ExecArray(ctx, transform, batch[0].array(), out);
}
- DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
- return ExecScalar(ctx, transform, batch[0].scalar(), out);
+ DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
+ return ExecScalar(ctx, transform, batch[0].scalar(), out);
}
- static Status ExecArray(KernelContext* ctx, StringTransform* transform,
- const std::shared_ptr<ArrayData>& data, Datum* out) {
- ArrayType input(data);
- ArrayData* output = out->mutable_array();
+ static Status ExecArray(KernelContext* ctx, StringTransform* transform,
+ const std::shared_ptr<ArrayData>& data, Datum* out) {
+ ArrayType input(data);
+ ArrayData* output = out->mutable_array();
- const int64_t input_ncodeunits = input.total_values_length();
- const int64_t input_nstrings = input.length();
+ const int64_t input_ncodeunits = input.total_values_length();
+ const int64_t input_nstrings = input.length();
- const int64_t output_ncodeunits_max =
- transform->MaxCodeunits(input_nstrings, input_ncodeunits);
- if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
- return Status::CapacityError(
-          "Result might not fit in a 32bit utf8 array; convert to large_utf8");
- }
+ const int64_t output_ncodeunits_max =
+ transform->MaxCodeunits(input_nstrings, input_ncodeunits);
+ if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array; convert to large_utf8");
+ }
- ARROW_ASSIGN_OR_RAISE(auto values_buffer, ctx->Allocate(output_ncodeunits_max));
- output->buffers[2] = values_buffer;
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer, ctx->Allocate(output_ncodeunits_max));
+ output->buffers[2] = values_buffer;
- // String offsets are preallocated
- offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
- uint8_t* output_str = output->buffers[2]->mutable_data();
- offset_type output_ncodeunits = 0;
+ // String offsets are preallocated
+ offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
+ uint8_t* output_str = output->buffers[2]->mutable_data();
+ offset_type output_ncodeunits = 0;
- output_string_offsets[0] = 0;
- for (int64_t i = 0; i < input_nstrings; i++) {
- if (!input.IsNull(i)) {
+ output_string_offsets[0] = 0;
+ for (int64_t i = 0; i < input_nstrings; i++) {
+ if (!input.IsNull(i)) {
offset_type input_string_ncodeunits;
- const uint8_t* input_string = input.GetValue(i, &input_string_ncodeunits);
- auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
- input_string, input_string_ncodeunits, output_str + output_ncodeunits));
- if (encoded_nbytes < 0) {
- return transform->InvalidStatus();
+ const uint8_t* input_string = input.GetValue(i, &input_string_ncodeunits);
+ auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
+ input_string, input_string_ncodeunits, output_str + output_ncodeunits));
+ if (encoded_nbytes < 0) {
+ return transform->InvalidStatus();
}
output_ncodeunits += encoded_nbytes;
}
- output_string_offsets[i + 1] = output_ncodeunits;
- }
- DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
-
- // Trim the codepoint buffer, since we allocated too much
- return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
- }
-
- static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
- const std::shared_ptr<Scalar>& scalar, Datum* out) {
- const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
- if (!input.is_valid) {
- return Status::OK();
- }
- auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
- result->is_valid = true;
- const int64_t data_nbytes = static_cast<int64_t>(input.value->size());
-
- const int64_t output_ncodeunits_max = transform->MaxCodeunits(1, data_nbytes);
- if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
- return Status::CapacityError(
-          "Result might not fit in a 32bit utf8 array; convert to large_utf8");
+ output_string_offsets[i + 1] = output_ncodeunits;
+ }
+ DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
+
+ // Trim the codepoint buffer, since we allocated too much
+ return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
+ }
+
+ static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
+ const std::shared_ptr<Scalar>& scalar, Datum* out) {
+ const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
+ if (!input.is_valid) {
+ return Status::OK();
}
- ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max));
- result->value = value_buffer;
- auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
- input.value->data(), data_nbytes, value_buffer->mutable_data()));
- if (encoded_nbytes < 0) {
- return transform->InvalidStatus();
- }
- DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
- return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
+ auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ result->is_valid = true;
+ const int64_t data_nbytes = static_cast<int64_t>(input.value->size());
+
+ const int64_t output_ncodeunits_max = transform->MaxCodeunits(1, data_nbytes);
+ if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array; convert to large_utf8");
+ }
+ ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max));
+ result->value = value_buffer;
+ auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
+ input.value->data(), data_nbytes, value_buffer->mutable_data()));
+ if (encoded_nbytes < 0) {
+ return transform->InvalidStatus();
+ }
+ DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
+ return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
}
};
-template <typename Type, typename StringTransform>
-struct StringTransformExec : public StringTransformExecBase<Type, StringTransform> {
- using StringTransformExecBase<Type, StringTransform>::Execute;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- StringTransform transform;
- RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
- return Execute(ctx, &transform, batch, out);
- }
-};
-
-template <typename Type, typename StringTransform>
-struct StringTransformExecWithState
- : public StringTransformExecBase<Type, StringTransform> {
- using State = typename StringTransform::State;
- using StringTransformExecBase<Type, StringTransform>::Execute;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- StringTransform transform(State::Get(ctx));
- RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
- return Execute(ctx, &transform, batch, out);
- }
-};
-
-#ifdef ARROW_WITH_UTF8PROC
-
-template <typename CodepointTransform>
-struct StringTransformCodepoint : public StringTransformBase {
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- EnsureLookupTablesFilled();
- return Status::OK();
- }
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits);
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- uint8_t* output_start = output;
- if (ARROW_PREDICT_FALSE(
- !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
- CodepointTransform::TransformCodepoint))) {
- return kTransformError;
- }
- return output - output_start;
- }
-};
-
-// struct CaseMappingMixin {
-struct CaseMappingTransform {
- static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
-    // Section 5.18 of the Unicode spec claims that the number of codepoints for case
-    // mapping can grow by a factor of 3, which means growth by a factor of 3 in bytes.
-    // However, since we don't support all casings (SpecialCasing.txt), the growth
-    // in bytes is actually at most 3/2 (as covered by the unittest).
-    // Note that rounding down the 3/2 is ok, since only codepoints encoded by
-    // two code units (an even number) can grow to 3 code units.
- return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
- }
-};
-
-struct UTF8UpperTransform : public CaseMappingTransform {
- static uint32_t TransformCodepoint(uint32_t codepoint) {
+template <typename Type, typename StringTransform>
+struct StringTransformExec : public StringTransformExecBase<Type, StringTransform> {
+ using StringTransformExecBase<Type, StringTransform>::Execute;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ StringTransform transform;
+ RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
+ return Execute(ctx, &transform, batch, out);
+ }
+};
+
+template <typename Type, typename StringTransform>
+struct StringTransformExecWithState
+ : public StringTransformExecBase<Type, StringTransform> {
+ using State = typename StringTransform::State;
+ using StringTransformExecBase<Type, StringTransform>::Execute;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ StringTransform transform(State::Get(ctx));
+ RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
+ return Execute(ctx, &transform, batch, out);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+
+template <typename CodepointTransform>
+struct StringTransformCodepoint : public StringTransformBase {
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits);
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ uint8_t* output_start = output;
+ if (ARROW_PREDICT_FALSE(
+ !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
+ CodepointTransform::TransformCodepoint))) {
+ return kTransformError;
+ }
+ return output - output_start;
+ }
+};
+
+// struct CaseMappingMixin {
+struct CaseMappingTransform {
+ static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
+ // Section 5.18 of the Unicode spec claim that the number of codepoints for case
+    // Section 5.18 of the Unicode spec claims that the number of codepoints for case
+    // mapping can grow by a factor of 3, which means growth by a factor of 3 in bytes.
+    // However, since we don't support all casings (SpecialCasing.txt), the growth
+    // in bytes is actually at most 3/2 (as covered by the unittest).
+    // Note that rounding down the 3/2 is ok, since only codepoints encoded by
+    // two code units (an even number) can grow to 3 code units.
+ }
+};
+
+struct UTF8UpperTransform : public CaseMappingTransform {
+ static uint32_t TransformCodepoint(uint32_t codepoint) {
return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
: utf8proc_toupper(codepoint);
}
};
template <typename Type>
-using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
-
-struct UTF8LowerTransform : public CaseMappingTransform {
+using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
+
+struct UTF8LowerTransform : public CaseMappingTransform {
static uint32_t TransformCodepoint(uint32_t codepoint) {
return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
: utf8proc_tolower(codepoint);
}
};
-template <typename Type>
-using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
+template <typename Type>
+using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
#endif // ARROW_WITH_UTF8PROC
-struct AsciiReverseTransform : public StringTransformBase {
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- uint8_t utf8_char_found = 0;
- for (int64_t i = 0; i < input_string_ncodeunits; i++) {
-      // if a non-ASCII (UTF-8) byte is found, record it in utf8_char_found
- utf8_char_found |= input[i] & 0x80;
- output[input_string_ncodeunits - i - 1] = input[i];
- }
- return utf8_char_found ? kTransformError : input_string_ncodeunits;
- }
-
- Status InvalidStatus() override {
- return Status::Invalid("Non-ASCII sequence in input");
- }
-};
-
-template <typename Type>
-using AsciiReverse = StringTransformExec<Type, AsciiReverseTransform>;
-
-struct Utf8ReverseTransform : public StringTransformBase {
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- int64_t i = 0;
- while (i < input_string_ncodeunits) {
- int64_t char_end = std::min(i + util::ValidUtf8CodepointByteSize(input + i),
- input_string_ncodeunits);
- std::copy(input + i, input + char_end, output + input_string_ncodeunits - char_end);
- i = char_end;
- }
- return input_string_ncodeunits;
- }
-};
-
-template <typename Type>
-using Utf8Reverse = StringTransformExec<Type, Utf8ReverseTransform>;
-
+struct AsciiReverseTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ uint8_t utf8_char_found = 0;
+ for (int64_t i = 0; i < input_string_ncodeunits; i++) {
+      // if a non-ASCII (UTF-8) byte is found, record it in utf8_char_found
+ utf8_char_found |= input[i] & 0x80;
+ output[input_string_ncodeunits - i - 1] = input[i];
+ }
+ return utf8_char_found ? kTransformError : input_string_ncodeunits;
+ }
+
+ Status InvalidStatus() override {
+ return Status::Invalid("Non-ASCII sequence in input");
+ }
+};
+
+template <typename Type>
+using AsciiReverse = StringTransformExec<Type, AsciiReverseTransform>;
+
+struct Utf8ReverseTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ int64_t i = 0;
+ while (i < input_string_ncodeunits) {
+ int64_t char_end = std::min(i + util::ValidUtf8CodepointByteSize(input + i),
+ input_string_ncodeunits);
+ std::copy(input + i, input + char_end, output + input_string_ncodeunits - char_end);
+ i = char_end;
+ }
+ return input_string_ncodeunits;
+ }
+};
+
+template <typename Type>
+using Utf8Reverse = StringTransformExec<Type, Utf8ReverseTransform>;
+
using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
// Transform a buffer of offsets to one which begins with 0 and has same
@@ -378,8 +378,8 @@ Status GetShiftedOffsets(KernelContext* ctx, const Buffer& input_buffer, int64_t
// Apply `transform` to input character data; this function cannot change the
// length
template <typename Type>
-Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
- TransformFunc transform, Datum* out) {
+Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
+ TransformFunc transform, Datum* out) {
using ArrayType = typename TypeTraits<Type>::ArrayType;
using offset_type = typename Type::offset_type;
@@ -395,13 +395,13 @@ Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
} else {
DCHECK(input.buffers[1]);
// We must allocate new space for the offsets and shift the existing offsets
- RETURN_NOT_OK(GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
- input.length, &out_arr->buffers[1]));
+ RETURN_NOT_OK(GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
+ input.length, &out_arr->buffers[1]));
}
// Allocate space for output data
int64_t data_nbytes = input_boxed.total_values_length();
- RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
+ RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
if (input.length > 0) {
transform(input.buffers[2]->data() + input_boxed.value_offset(0), data_nbytes,
out_arr->buffers[2]->mutable_data());
@@ -412,13 +412,13 @@ Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
if (input.is_valid) {
result->is_valid = true;
int64_t data_nbytes = input.value->size();
- RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&result->value));
+ RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&result->value));
transform(input.value->data(), data_nbytes, result->value->mutable_data());
}
- out->value = result;
+ out->value = result;
}
-
- return Status::OK();
+
+ return Status::OK();
}
void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
@@ -427,8 +427,8 @@ void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output)
template <typename Type>
struct AsciiUpper {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
}
};
@@ -438,8 +438,8 @@ void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output)
template <typename Type>
struct AsciiLower {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
}
};
@@ -473,881 +473,881 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
static_cast<offset_type>(input.value->size())};
transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
&result_value);
- out->value = std::make_shared<BooleanScalar>(result_value > 0);
+ out->value = std::make_shared<BooleanScalar>(result_value > 0);
}
}
}
-using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
-
-// This is an implementation of the Knuth-Morris-Pratt algorithm
-struct PlainSubstringMatcher {
- const MatchSubstringOptions& options_;
- std::vector<int64_t> prefix_table;
-
- static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
- const MatchSubstringOptions& options) {
- // Should be handled by partial template specialization below
- DCHECK(!options.ignore_case);
- return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
- }
-
- explicit PlainSubstringMatcher(const MatchSubstringOptions& options)
- : options_(options) {
- // Phase 1: Build the prefix table
- const auto pattern_length = options_.pattern.size();
- prefix_table.resize(pattern_length + 1, /*value=*/0);
- int64_t prefix_length = -1;
- prefix_table[0] = -1;
- for (size_t pos = 0; pos < pattern_length; ++pos) {
- // The prefix cannot be expanded, reset.
- while (prefix_length >= 0 &&
- options_.pattern[pos] != options_.pattern[prefix_length]) {
- prefix_length = prefix_table[prefix_length];
- }
- prefix_length++;
- prefix_table[pos + 1] = prefix_length;
+using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
+
+// This is an implementation of the Knuth-Morris-Pratt algorithm
+struct PlainSubstringMatcher {
+ const MatchSubstringOptions& options_;
+ std::vector<int64_t> prefix_table;
+
+ static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
+ }
+
+ explicit PlainSubstringMatcher(const MatchSubstringOptions& options)
+ : options_(options) {
+ // Phase 1: Build the prefix table
+ const auto pattern_length = options_.pattern.size();
+ prefix_table.resize(pattern_length + 1, /*value=*/0);
+ int64_t prefix_length = -1;
+ prefix_table[0] = -1;
+ for (size_t pos = 0; pos < pattern_length; ++pos) {
+ // The prefix cannot be expanded, reset.
+ while (prefix_length >= 0 &&
+ options_.pattern[pos] != options_.pattern[prefix_length]) {
+ prefix_length = prefix_table[prefix_length];
+ }
+ prefix_length++;
+ prefix_table[pos + 1] = prefix_length;
}
}
- int64_t Find(util::string_view current) const {
- // Phase 2: Find the prefix in the data
- const auto pattern_length = options_.pattern.size();
+ int64_t Find(util::string_view current) const {
+ // Phase 2: Find the prefix in the data
+ const auto pattern_length = options_.pattern.size();
int64_t pattern_pos = 0;
- int64_t pos = 0;
- if (pattern_length == 0) return 0;
- for (const auto c : current) {
- while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
+ int64_t pos = 0;
+ if (pattern_length == 0) return 0;
+ for (const auto c : current) {
+ while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
pattern_pos = prefix_table[pattern_pos];
}
pattern_pos++;
- if (static_cast<size_t>(pattern_pos) == pattern_length) {
- return pos + 1 - pattern_length;
+ if (static_cast<size_t>(pattern_pos) == pattern_length) {
+ return pos + 1 - pattern_length;
}
- pos++;
+ pos++;
}
- return -1;
- }
-
- bool Match(util::string_view current) const { return Find(current) >= 0; }
-};
-
-struct PlainStartsWithMatcher {
- const MatchSubstringOptions& options_;
-
- explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
- : options_(options) {}
-
- static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
- const MatchSubstringOptions& options) {
- // Should be handled by partial template specialization below
- DCHECK(!options.ignore_case);
- return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
- }
-
- bool Match(util::string_view current) const {
- // string_view::starts_with is C++20
- return current.substr(0, options_.pattern.size()) == options_.pattern;
- }
-};
-
-struct PlainEndsWithMatcher {
- const MatchSubstringOptions& options_;
-
- explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
- : options_(options) {}
-
- static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
- const MatchSubstringOptions& options) {
- // Should be handled by partial template specialization below
- DCHECK(!options.ignore_case);
- return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
- }
-
- bool Match(util::string_view current) const {
- // string_view::ends_with is C++20
- return current.size() >= options_.pattern.size() &&
- current.substr(current.size() - options_.pattern.size(),
- options_.pattern.size()) == options_.pattern;
- }
-};
-
-#ifdef ARROW_WITH_RE2
-struct RegexSubstringMatcher {
- const MatchSubstringOptions& options_;
- const RE2 regex_match_;
-
- static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
- const MatchSubstringOptions& options, bool literal = false) {
- auto matcher =
- ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
- RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
- return std::move(matcher);
- }
-
- explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
- bool literal = false)
- : options_(options),
- regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
-
- bool Match(util::string_view current) const {
- auto piece = re2::StringPiece(current.data(), current.length());
- return re2::RE2::PartialMatch(piece, regex_match_);
- }
-
- static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
- bool literal) {
- RE2::RE2::Options re2_options(RE2::Quiet);
- re2_options.set_case_sensitive(!options.ignore_case);
- re2_options.set_literal(literal);
- return re2_options;
- }
-};
-#endif
-
-template <typename Type, typename Matcher>
-struct MatchSubstringImpl {
+ return -1;
+ }
+
+ bool Match(util::string_view current) const { return Find(current) >= 0; }
+};
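+
+// Illustrative note (ours, not from the upstream sources): the constructor
+// above builds the classic Knuth-Morris-Pratt failure function. For the
+// pattern "abab" the table is [-1, 0, 0, 1, 2], so after matching "aba" a
+// mismatch resumes at pattern position 1 instead of restarting from scratch,
+// keeping Find() linear in the input length; e.g. Find("xxababxx") == 2 and
+// Find("xxx") == -1.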
+
+struct PlainStartsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ // string_view::starts_with is C++20
+ return current.substr(0, options_.pattern.size()) == options_.pattern;
+ }
+};
+
+struct PlainEndsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ // string_view::ends_with is C++20
+ return current.size() >= options_.pattern.size() &&
+ current.substr(current.size() - options_.pattern.size(),
+ options_.pattern.size()) == options_.pattern;
+ }
+};
+
+#ifdef ARROW_WITH_RE2
+struct RegexSubstringMatcher {
+ const MatchSubstringOptions& options_;
+ const RE2 regex_match_;
+
+ static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
+ const MatchSubstringOptions& options, bool literal = false) {
+ auto matcher =
+ ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
+ RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
+ return std::move(matcher);
+ }
+
+ explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
+ bool literal = false)
+ : options_(options),
+ regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
+
+ bool Match(util::string_view current) const {
+ auto piece = re2::StringPiece(current.data(), current.length());
+ return re2::RE2::PartialMatch(piece, regex_match_);
+ }
+
+ static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
+ bool literal) {
+ RE2::RE2::Options re2_options(RE2::Quiet);
+ re2_options.set_case_sensitive(!options.ignore_case);
+ re2_options.set_literal(literal);
+ return re2_options;
+ }
+};
+#endif
+
+template <typename Type, typename Matcher>
+struct MatchSubstringImpl {
using offset_type = typename Type::offset_type;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
- const Matcher* matcher) {
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
+ const Matcher* matcher) {
StringBoolTransform<Type>(
ctx, batch,
- [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
- int64_t output_offset, uint8_t* output) {
- const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
- FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
- for (int64_t i = 0; i < length; ++i) {
- const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
- int64_t current_length = offsets[i + 1] - offsets[i];
- if (matcher->Match(util::string_view(current_data, current_length))) {
- bitmap_writer.Set();
- }
- bitmap_writer.Next();
- }
- bitmap_writer.Finish();
+ [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
+ int64_t output_offset, uint8_t* output) {
+ const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
+ FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
+ for (int64_t i = 0; i < length; ++i) {
+ const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
+ int64_t current_length = offsets[i + 1] - offsets[i];
+ if (matcher->Match(util::string_view(current_data, current_length))) {
+ bitmap_writer.Set();
+ }
+ bitmap_writer.Next();
+ }
+ bitmap_writer.Finish();
},
out);
- return Status::OK();
- }
-};
-
-template <typename Type, typename Matcher>
-struct MatchSubstring {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // TODO Cache matcher across invocations (for regex compilation)
- ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
- return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
- }
-};
-
-template <typename Type>
-struct MatchSubstring<Type, PlainSubstringMatcher> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- auto options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- ARROW_ASSIGN_OR_RAISE(auto matcher,
- RegexSubstringMatcher::Make(options, /*literal=*/true));
- return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
- return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
- }
-};
-
-template <typename Type>
-struct MatchSubstring<Type, PlainStartsWithMatcher> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- auto options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- MatchSubstringOptions converted_options = options;
- converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
- ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
- return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
- return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
- matcher.get());
- }
-};
-
-template <typename Type>
-struct MatchSubstring<Type, PlainEndsWithMatcher> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- auto options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- MatchSubstringOptions converted_options = options;
- converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
- ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
- return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
- return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
- matcher.get());
+ return Status::OK();
}
};
-const FunctionDoc match_substring_doc(
- "Match strings against literal pattern",
- ("For each string in `strings`, emit true iff it contains a given pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-const FunctionDoc starts_with_doc(
- "Check if strings start with a literal pattern",
- ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-const FunctionDoc ends_with_doc(
- "Check if strings end with a literal pattern",
- ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-#ifdef ARROW_WITH_RE2
-const FunctionDoc match_substring_regex_doc(
- "Match strings against regex pattern",
- ("For each string in `strings`, emit true iff it matches a given pattern at any "
- "position.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-// SQL LIKE match
-
-/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
-std::string MakeLikeRegex(const MatchSubstringOptions& options) {
- // Allow . to match \n
- std::string like_pattern = "(?s:^";
- like_pattern.reserve(options.pattern.size() + 7);
- bool escaped = false;
- for (const char c : options.pattern) {
- if (!escaped && c == '%') {
- like_pattern.append(".*");
- } else if (!escaped && c == '_') {
- like_pattern.append(".");
- } else if (!escaped && c == '\\') {
- escaped = true;
- } else {
- switch (c) {
- case '.':
- case '?':
- case '+':
- case '*':
- case '^':
- case '$':
- case '\\':
- case '[':
- case '{':
- case '(':
- case ')':
- case '|': {
- like_pattern.push_back('\\');
- like_pattern.push_back(c);
- escaped = false;
- break;
- }
- default: {
- like_pattern.push_back(c);
- escaped = false;
- break;
- }
- }
- }
- }
- like_pattern.append("$)");
- return like_pattern;
-}
-
-// Evaluate a SQL-style LIKE pattern by translating it to a regexp or
-// substring search as appropriate. See what Apache Impala does:
-// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
-template <typename StringType>
-struct MatchLike {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    // NOTE: kept local so that these regexes are not compiled at library startup
- // A LIKE pattern matching this regex can be translated into a substring search.
- static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
- // A LIKE pattern matching this regex can be translated into a prefix search.
- static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
- // A LIKE pattern matching this regex can be translated into a suffix search.
- static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
-
- auto original_options = MatchSubstringState::Get(ctx);
- auto original_state = ctx->state();
-
- Status status;
- std::string pattern;
- if (!original_options.ignore_case &&
- re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
- &pattern)) {
- MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
- } else if (!original_options.ignore_case &&
- re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
- &pattern)) {
- MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
- } else if (!original_options.ignore_case &&
- re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
- &pattern)) {
- MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
- } else {
- MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
- original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
- }
- ctx->SetState(original_state);
- return status;
- }
-};
-
-const FunctionDoc match_like_doc(
- "Match strings against SQL-style LIKE pattern",
- ("For each string in `strings`, emit true iff it fully matches a given pattern "
- "at any position. That is, '%' will match any number of characters, '_' will "
- "match exactly one character, and any other character matches itself. To "
- "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-
-#endif
-
+template <typename Type, typename Matcher>
+struct MatchSubstring {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO Cache matcher across invocations (for regex compilation)
+ ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
+ return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainSubstringMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto matcher,
+ RegexSubstringMatcher::Make(options, /*literal=*/true));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
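+
+// Illustrative note (ours, not from the upstream sources): when ignore_case
+// is requested, the literal pattern is handed to RE2 with set_literal(true),
+// so regex metacharacters need no escaping; e.g. the pattern "a.b" matches
+// the three characters 'a', '.', 'b' case-insensitively rather than being
+// parsed as a regex with a wildcard.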
+
+template <typename Type>
+struct MatchSubstring<Type, PlainStartsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainEndsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+const FunctionDoc match_substring_doc(
+ "Match strings against literal pattern",
+ ("For each string in `strings`, emit true iff it contains a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+const FunctionDoc starts_with_doc(
+ "Check if strings start with a literal pattern",
+ ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+const FunctionDoc ends_with_doc(
+ "Check if strings end with a literal pattern",
+ ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+const FunctionDoc match_substring_regex_doc(
+ "Match strings against regex pattern",
+ ("For each string in `strings`, emit true iff it matches a given pattern at any "
+ "position.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+// SQL LIKE match
+
+/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
+std::string MakeLikeRegex(const MatchSubstringOptions& options) {
+ // Allow . to match \n
+ std::string like_pattern = "(?s:^";
+ like_pattern.reserve(options.pattern.size() + 7);
+ bool escaped = false;
+ for (const char c : options.pattern) {
+ if (!escaped && c == '%') {
+ like_pattern.append(".*");
+ } else if (!escaped && c == '_') {
+ like_pattern.append(".");
+ } else if (!escaped && c == '\\') {
+ escaped = true;
+ } else {
+ switch (c) {
+ case '.':
+ case '?':
+ case '+':
+ case '*':
+ case '^':
+ case '$':
+ case '\\':
+ case '[':
+ case '{':
+ case '(':
+ case ')':
+ case '|': {
+ like_pattern.push_back('\\');
+ like_pattern.push_back(c);
+ escaped = false;
+ break;
+ }
+ default: {
+ like_pattern.push_back(c);
+ escaped = false;
+ break;
+ }
+ }
+ }
+ }
+ like_pattern.append("$)");
+ return like_pattern;
+}
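+
+// Illustrative examples (ours, not from the upstream sources) of the
+// translation above: the LIKE pattern "foo%bar_baz" becomes
+// "(?s:^foo.*bar.baz$)", and the escaped pattern "100\%" becomes
+// "(?s:^100%$)", i.e. a literal percent sign.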
+
+// Evaluate a SQL-style LIKE pattern by translating it to a regexp or
+// substring search as appropriate. See what Apache Impala does:
+// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
+template <typename StringType>
+struct MatchLike {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // NOTE: kept local so that these regexes are not compiled at library startup
+ // A LIKE pattern matching this regex can be translated into a substring search.
+ static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a prefix search.
+ static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a suffix search.
+ static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
+
+ auto original_options = MatchSubstringState::Get(ctx);
+ auto original_state = ctx->state();
+
+ Status status;
+ std::string pattern;
+ if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
+ } else {
+ MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
+ original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
+ }
+ ctx->SetState(original_state);
+ return status;
+ }
+};
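+
+// Illustrative classification (ours): with ignore_case=false, the pattern
+// "%needle%" is rewritten above to a plain substring search for "needle",
+// "abc%" to a starts_with check for "abc", and "%xyz" to an ends_with check
+// for "xyz"; a pattern such as "a_c%" matches none of the fast-path regexes
+// and falls back to the regex produced by MakeLikeRegex.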
+
+const FunctionDoc match_like_doc(
+ "Match strings against SQL-style LIKE pattern",
+ ("For each string in `strings`, emit true iff it fully matches a given pattern "
+ "at any position. That is, '%' will match any number of characters, '_' will "
+ "match exactly one character, and any other character matches itself. To "
+ "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#endif
+
void AddMatchSubstring(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
- &match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
- &match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
- &match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#ifdef ARROW_WITH_RE2
- {
- auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
- &match_substring_regex_doc);
- auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func =
- std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
- auto exec_32 = MatchLike<StringType>::Exec;
- auto exec_64 = MatchLike<LargeStringType>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#endif
-}
-
-// Substring find - lfind/index/etc.
-
-struct FindSubstring {
- const PlainSubstringMatcher matcher_;
-
- explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- return static_cast<OutValue>(matcher_.Find(val));
- }
-};
-
-#ifdef ARROW_WITH_RE2
-struct FindSubstringRegex {
- std::unique_ptr<RE2> regex_match_;
-
- explicit FindSubstringRegex(const MatchSubstringOptions& options,
- bool literal = false) {
- std::string regex = "(";
- regex.reserve(options.pattern.length() + 2);
- regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
- regex += ")";
- regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
- options, /*literal=*/false)));
- }
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- re2::StringPiece piece(val.data(), val.length());
- re2::StringPiece match;
- if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
- return static_cast<OutValue>(match.data() - piece.data());
- }
- return -1;
- }
-};
-#endif
-
-template <typename InputType>
-struct FindSubstringExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
- kernel{FindSubstringRegex(options, /*literal=*/true)};
- return kernel.Exec(ctx, batch, out);
-#else
-      return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
- FindSubstring(PlainSubstringMatcher(options))};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-const FunctionDoc find_substring_doc(
- "Find first occurrence of substring",
- ("For each string in `strings`, emit the index of the first occurrence of the given "
- "pattern, or -1 if not found.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-
-#ifdef ARROW_WITH_RE2
-template <typename InputType>
-struct FindSubstringRegexExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
- kernel{FindSubstringRegex(options, /*literal=*/false)};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-const FunctionDoc find_substring_regex_doc(
- "Find location of first match of regex pattern",
- ("For each string in `strings`, emit the index of the first match of the given "
- "pattern, or -1 if not found.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-#endif
-
-void AddFindSubstring(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
- &find_substring_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#ifdef ARROW_WITH_RE2
- {
- auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
- &find_substring_regex_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(
- func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#endif
-}
-
-// Substring count
-
-struct CountSubstring {
- const PlainSubstringMatcher matcher_;
-
- explicit CountSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- OutValue count = 0;
- uint64_t start = 0;
- const auto pattern_size = std::max<uint64_t>(1, matcher_.options_.pattern.size());
- while (start <= val.size()) {
- const int64_t index = matcher_.Find(val.substr(start));
- if (index >= 0) {
- count++;
- start += index + pattern_size;
- } else {
- break;
- }
- }
- return count;
- }
-};
-
-#ifdef ARROW_WITH_RE2
-struct CountSubstringRegex {
- std::unique_ptr<RE2> regex_match_;
-
- explicit CountSubstringRegex(const MatchSubstringOptions& options, bool literal = false)
- : regex_match_(new RE2(options.pattern,
- RegexSubstringMatcher::MakeRE2Options(options, literal))) {}
-
- static Result<CountSubstringRegex> Make(const MatchSubstringOptions& options,
- bool literal = false) {
- CountSubstringRegex counter(options, literal);
- RETURN_NOT_OK(RegexStatus(*counter.regex_match_));
- return std::move(counter);
- }
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- OutValue count = 0;
- re2::StringPiece input(val.data(), val.size());
- auto last_size = input.size();
- while (re2::RE2::FindAndConsume(&input, *regex_match_)) {
- count++;
- if (last_size == input.size()) {
- // 0-length match
- if (input.size() > 0) {
- input.remove_prefix(1);
- } else {
- break;
- }
- }
- last_size = input.size();
- }
- return count;
- }
-};
-
-template <typename InputType>
-struct CountSubstringRegexExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- ARROW_ASSIGN_OR_RAISE(auto counter, CountSubstringRegex::Make(options));
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
- kernel{std::move(counter)};
- return kernel.Exec(ctx, batch, out);
- }
-};
-#endif
-
-template <typename InputType>
-struct CountSubstringExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- ARROW_ASSIGN_OR_RAISE(auto counter,
- CountSubstringRegex::Make(options, /*literal=*/true));
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
- kernel{std::move(counter)};
- return kernel.Exec(ctx, batch, out);
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstring> kernel{
- CountSubstring(PlainSubstringMatcher(options))};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-const FunctionDoc count_substring_doc(
- "Count occurrences of substring",
- ("For each string in `strings`, emit the number of occurrences of the given "
- "pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-
-#ifdef ARROW_WITH_RE2
-const FunctionDoc count_substring_regex_doc(
- "Count occurrences of substring",
- ("For each string in `strings`, emit the number of occurrences of the given "
- "regex pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-#endif
-
-void AddCountSubstring(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("count_substring", Arity::Unary(),
- &count_substring_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<CountSubstringExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#ifdef ARROW_WITH_RE2
- {
- auto func = std::make_shared<ScalarFunction>("count_substring_regex", Arity::Unary(),
- &count_substring_regex_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(
- func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<CountSubstringRegexExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#endif
-}
-
-// Slicing
-
-struct SliceTransformBase : public StringTransformBase {
- using State = OptionsWrapper<SliceOptions>;
-
- const SliceOptions* options;
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- options = &State::Get(ctx);
- if (options->step == 0) {
- return Status::Invalid("Slice step cannot be zero");
- }
- return Status::OK();
- }
-};
-
-struct SliceCodeunitsTransform : SliceTransformBase {
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- const SliceOptions& opt = *this->options;
- if ((opt.start >= 0) != (opt.stop >= 0)) {
- // If start and stop don't have the same sign, we can't guess an upper bound
- // on the resulting slice lengths, so return a worst case estimate.
- return input_ncodeunits;
- }
- int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
- // The maximum UTF8 byte size of a codepoint is 4
- return std::min(input_ncodeunits,
- 4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- if (options->step >= 1) {
- return SliceForward(input, input_string_ncodeunits, output);
- }
- return SliceBackward(input, input_string_ncodeunits, output);
- }
-
-#define RETURN_IF_UTF8_ERROR(expr) \
- do { \
-    if (ARROW_PREDICT_FALSE(!(expr))) { \
- return kTransformError; \
- } \
- } while (0)
-
- int64_t SliceForward(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- // Slice in forward order (step > 0)
- const SliceOptions& opt = *this->options;
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* begin_sliced = begin;
- const uint8_t* end_sliced = end;
-
- // First, compute begin_sliced and end_sliced
- if (opt.start >= 0) {
- // start counting from the left
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start));
- if (opt.stop > opt.start) {
- // continue counting from begin_sliced
- const int64_t length = opt.stop - opt.start;
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length));
- } else if (opt.stop < 0) {
-        // or from the end (but we will never need to go before begin_sliced)
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin_sliced, end, &end_sliced, -opt.stop));
- } else {
- // zero length slice
- return 0;
- }
- } else {
- // start counting from the right
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin, end, &begin_sliced, -opt.start));
- if (opt.stop > 0) {
- // continue counting from the left, we cannot start from begin_sliced because we
- // don't know how many codepoints are between begin and begin_sliced
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop));
-        // and therefore we also need this check
- if (end_sliced <= begin_sliced) {
- // zero length slice
- return 0;
- }
- } else if ((opt.stop < 0) && (opt.stop > opt.start)) {
- // stop is negative, but larger than start, so we count again from the right
- // in some cases we can optimize this, depending on the shortest path (from end
- // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
-        // for instance when start=-100 while the string length is only 10.
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin_sliced, end, &end_sliced, -opt.stop));
- } else {
- // zero length slice
- return 0;
- }
- }
-
- // Second, copy computed slice to output
- DCHECK(begin_sliced <= end_sliced);
- if (opt.step == 1) {
- // fast case, where we simply can finish with a memcpy
- std::copy(begin_sliced, end_sliced, output);
- return end_sliced - begin_sliced;
- }
- uint8_t* dest = output;
- const uint8_t* i = begin_sliced;
-
- while (i < end_sliced) {
- uint32_t codepoint = 0;
- // write a single codepoint
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
- dest = arrow::util::UTF8Encode(dest, codepoint);
- // and skip the remainder
- int64_t skips = opt.step - 1;
- while ((skips--) && (i < end_sliced)) {
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
- }
- }
- return dest - output;
- }
-
- int64_t SliceBackward(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- // Slice in reverse order (step < 0)
- const SliceOptions& opt = *this->options;
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* begin_sliced = begin;
- const uint8_t* end_sliced = end;
-
- // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
- // reverse iterators.
- if (opt.start >= 0) {
-      // +1 because begin_sliced acts as the end of a reverse iterator
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start + 1));
- } else {
-      // -1 because start=-1 means the last codepoint, i.e. zero reverse advances
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin, end, &begin_sliced, -opt.start - 1));
- }
-    // make it point at the last codeunit of the previous codepoint
- begin_sliced--;
-
- // similar to opt.start
- if (opt.stop >= 0) {
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1));
- } else {
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin, end, &end_sliced, -opt.stop - 1));
- }
- end_sliced--;
-
- // Copy computed slice to output
- uint8_t* dest = output;
- const uint8_t* i = begin_sliced;
- while (i > end_sliced) {
- uint32_t codepoint = 0;
- // write a single codepoint
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
- dest = arrow::util::UTF8Encode(dest, codepoint);
- // and skip the remainder
- int64_t skips = -opt.step - 1;
- while ((skips--) && (i > end_sliced)) {
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
- }
- }
- return dest - output;
- }
-
-#undef RETURN_IF_UTF8_ERROR
-};
-
-template <typename Type>
-using SliceCodeunits = StringTransformExec<Type, SliceCodeunitsTransform>;
-
-const FunctionDoc utf8_slice_codeunits_doc(
- "Slice string ",
- ("For each string in `strings`, slice into a substring defined by\n"
- "`start`, `stop`, `step`) as given by `SliceOptions` where `start` is inclusive\n"
- "and `stop` is exclusive and are measured in codeunits. If step is negative, the\n"
- "string will be advanced in reversed order. A `step` of zero is considered an\n"
- "error.\n"
- "Null inputs emit null."),
- {"strings"}, "SliceOptions");
-
-void AddSlice(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
- &utf8_slice_codeunits_doc);
- using t32 = SliceCodeunits<StringType>;
- using t64 = SliceCodeunits<LargeStringType>;
+ {
+ auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
+ &match_substring_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
+ &match_substring_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
+ &match_substring_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
+ &match_substring_regex_doc);
+ auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func =
+ std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
+ auto exec_32 = MatchLike<StringType>::Exec;
+ auto exec_64 = MatchLike<LargeStringType>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
+// Substring find - lfind/index/etc.
+
+struct FindSubstring {
+ const PlainSubstringMatcher matcher_;
+
+ explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ return static_cast<OutValue>(matcher_.Find(val));
+ }
+};
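+
+// Illustrative usage (ours): FindSubstring adapts the KMP matcher to emit
+// offsets instead of booleans, e.g. for pattern "lo" it yields 3 on "hello"
+// and -1 on "help".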
+
+#ifdef ARROW_WITH_RE2
+struct FindSubstringRegex {
+ std::unique_ptr<RE2> regex_match_;
+
+ explicit FindSubstringRegex(const MatchSubstringOptions& options,
+ bool literal = false) {
+ std::string regex = "(";
+ regex.reserve(options.pattern.length() + 2);
+ regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
+ regex += ")";
+ regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
+ options, /*literal=*/false)));
+ }
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ re2::StringPiece piece(val.data(), val.length());
+ re2::StringPiece match;
+ if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
+ return static_cast<OutValue>(match.data() - piece.data());
+ }
+ return -1;
+ }
+};
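+
+// Illustrative note (ours): the pattern is wrapped in a capturing group so
+// that PartialMatch reports where the match begins; the offset is recovered
+// by pointer arithmetic (match.data() - piece.data()), e.g. pattern "l+o"
+// on "hello" yields 2. Literal patterns are pre-escaped via QuoteMeta, which
+// is why the RE2 options are built with literal=false: the wrapping group
+// itself must still be parsed as regex syntax.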
+#endif
+
+template <typename InputType>
+struct FindSubstringExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
+ kernel{FindSubstringRegex(options, /*literal=*/true)};
+ return kernel.Exec(ctx, batch, out);
+#else
+      return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
+ FindSubstring(PlainSubstringMatcher(options))};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc find_substring_doc(
+ "Find first occurrence of substring",
+ ("For each string in `strings`, emit the index of the first occurrence of the given "
+ "pattern, or -1 if not found.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+template <typename InputType>
+struct FindSubstringRegexExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
+ kernel{FindSubstringRegex(options, /*literal=*/false)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc find_substring_regex_doc(
+ "Find location of first match of regex pattern",
+ ("For each string in `strings`, emit the index of the first match of the given "
+ "pattern, or -1 if not found.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+#endif
+
+void AddFindSubstring(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
+ &find_substring_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
+ &find_substring_regex_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(
+ func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
+// Substring count
+
+struct CountSubstring {
+ const PlainSubstringMatcher matcher_;
+
+ explicit CountSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ OutValue count = 0;
+ uint64_t start = 0;
+ const auto pattern_size = std::max<uint64_t>(1, matcher_.options_.pattern.size());
+ while (start <= val.size()) {
+ const int64_t index = matcher_.Find(val.substr(start));
+ if (index >= 0) {
+ count++;
+ start += index + pattern_size;
+ } else {
+ break;
+ }
+ }
+ return count;
+ }
+};
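+
+// Illustrative note (ours): occurrences are counted without overlap, since
+// start advances past each hit by the full pattern length; e.g. pattern "aa"
+// counts 2 in "aaaa", not 3. An empty pattern matches at every position and
+// counts length + 1 occurrences.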
+
+#ifdef ARROW_WITH_RE2
+struct CountSubstringRegex {
+ std::unique_ptr<RE2> regex_match_;
+
+ explicit CountSubstringRegex(const MatchSubstringOptions& options, bool literal = false)
+ : regex_match_(new RE2(options.pattern,
+ RegexSubstringMatcher::MakeRE2Options(options, literal))) {}
+
+ static Result<CountSubstringRegex> Make(const MatchSubstringOptions& options,
+ bool literal = false) {
+ CountSubstringRegex counter(options, literal);
+ RETURN_NOT_OK(RegexStatus(*counter.regex_match_));
+ return std::move(counter);
+ }
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ OutValue count = 0;
+ re2::StringPiece input(val.data(), val.size());
+ auto last_size = input.size();
+ while (re2::RE2::FindAndConsume(&input, *regex_match_)) {
+ count++;
+ if (last_size == input.size()) {
+ // 0-length match
+ if (input.size() > 0) {
+ input.remove_prefix(1);
+ } else {
+ break;
+ }
+ }
+ last_size = input.size();
+ }
+ return count;
+ }
+};
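+
+// Illustrative note (ours): FindAndConsume does not advance the input past a
+// zero-length match, so the loop above nudges the input forward one byte to
+// avoid spinning; e.g. pattern "a*" counts 3 matches in "bb" (an empty match
+// at each of the three positions), mirroring Python's re.findall.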
+
+template <typename InputType>
+struct CountSubstringRegexExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto counter, CountSubstringRegex::Make(options));
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
+ kernel{std::move(counter)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+#endif
+
+template <typename InputType>
+struct CountSubstringExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto counter,
+ CountSubstringRegex::Make(options, /*literal=*/true));
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
+ kernel{std::move(counter)};
+ return kernel.Exec(ctx, batch, out);
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstring> kernel{
+ CountSubstring(PlainSubstringMatcher(options))};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc count_substring_doc(
+ "Count occurrences of substring",
+ ("For each string in `strings`, emit the number of occurrences of the given "
+ "pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+const FunctionDoc count_substring_regex_doc(
+ "Count occurrences of substring",
+ ("For each string in `strings`, emit the number of occurrences of the given "
+ "regex pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+#endif
+
+void AddCountSubstring(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("count_substring", Arity::Unary(),
+ &count_substring_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<CountSubstringExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("count_substring_regex", Arity::Unary(),
+ &count_substring_regex_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(
+ func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<CountSubstringRegexExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
+// Slicing
+
+struct SliceTransformBase : public StringTransformBase {
+ using State = OptionsWrapper<SliceOptions>;
+
+ const SliceOptions* options;
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ options = &State::Get(ctx);
+ if (options->step == 0) {
+ return Status::Invalid("Slice step cannot be zero");
+ }
+ return Status::OK();
+ }
+};
+
+struct SliceCodeunitsTransform : SliceTransformBase {
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ const SliceOptions& opt = *this->options;
+ if ((opt.start >= 0) != (opt.stop >= 0)) {
+ // If start and stop don't have the same sign, we can't guess an upper bound
+ // on the resulting slice lengths, so return a worst case estimate.
+ return input_ncodeunits;
+ }
+ int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
+ // The maximum UTF8 byte size of a codepoint is 4
+ return std::min(input_ncodeunits,
+ 4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
+ }
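+
+  // Illustrative arithmetic (ours): with start=2, stop=11, step=3 the
+  // ceiling division above gives (11 - 2 + 3 - 1) / 3 = 3 slice codepoints
+  // (offsets 2, 5 and 8), hence at most 3 * 4 = 12 output bytes per input
+  // value.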
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ if (options->step >= 1) {
+ return SliceForward(input, input_string_ncodeunits, output);
+ }
+ return SliceBackward(input, input_string_ncodeunits, output);
+ }
+
+#define RETURN_IF_UTF8_ERROR(expr) \
+ do { \
+    if (ARROW_PREDICT_FALSE(!(expr))) { \
+ return kTransformError; \
+ } \
+ } while (0)
+
+ int64_t SliceForward(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ // Slice in forward order (step > 0)
+ const SliceOptions& opt = *this->options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* begin_sliced = begin;
+ const uint8_t* end_sliced = end;
+
+ // First, compute begin_sliced and end_sliced
+ if (opt.start >= 0) {
+ // start counting from the left
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start));
+ if (opt.stop > opt.start) {
+ // continue counting from begin_sliced
+ const int64_t length = opt.stop - opt.start;
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length));
+ } else if (opt.stop < 0) {
+        // or from the end (but we will never need to go before begin_sliced)
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin_sliced, end, &end_sliced, -opt.stop));
+ } else {
+ // zero length slice
+ return 0;
+ }
+ } else {
+ // start counting from the right
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &begin_sliced, -opt.start));
+ if (opt.stop > 0) {
+ // continue counting from the left, we cannot start from begin_sliced because we
+ // don't know how many codepoints are between begin and begin_sliced
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop));
+        // and therefore we also need this check
+ if (end_sliced <= begin_sliced) {
+ // zero length slice
+ return 0;
+ }
+ } else if ((opt.stop < 0) && (opt.stop > opt.start)) {
+ // stop is negative, but larger than start, so we count again from the right
+ // in some cases we can optimize this, depending on the shortest path (from end
+ // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
+        // for instance when start=-100 while the string length is only 10.
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin_sliced, end, &end_sliced, -opt.stop));
+ } else {
+ // zero length slice
+ return 0;
+ }
+ }
+
+ // Second, copy computed slice to output
+ DCHECK(begin_sliced <= end_sliced);
+ if (opt.step == 1) {
+ // fast case, where we simply can finish with a memcpy
+ std::copy(begin_sliced, end_sliced, output);
+ return end_sliced - begin_sliced;
+ }
+ uint8_t* dest = output;
+ const uint8_t* i = begin_sliced;
+
+ while (i < end_sliced) {
+ uint32_t codepoint = 0;
+ // write a single codepoint
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
+ dest = arrow::util::UTF8Encode(dest, codepoint);
+ // and skip the remainder
+ int64_t skips = opt.step - 1;
+ while ((skips--) && (i < end_sliced)) {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
+ }
+ }
+ return dest - output;
+ }
+
+ int64_t SliceBackward(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ // Slice in reverse order (step < 0)
+ const SliceOptions& opt = *this->options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* begin_sliced = begin;
+ const uint8_t* end_sliced = end;
+
+ // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
+ // reverse iterators.
+ if (opt.start >= 0) {
+      // +1 because begin_sliced acts as the end of a reverse iterator
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start + 1));
+ } else {
+      // -1 because start=-1 means the last codepoint, i.e. zero reverse advances
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &begin_sliced, -opt.start - 1));
+ }
+    // make it point at the last codeunit of the previous codepoint
+ begin_sliced--;
+
+ // similar to opt.start
+ if (opt.stop >= 0) {
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1));
+ } else {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &end_sliced, -opt.stop - 1));
+ }
+ end_sliced--;
+
+ // Copy computed slice to output
+ uint8_t* dest = output;
+ const uint8_t* i = begin_sliced;
+ while (i > end_sliced) {
+ uint32_t codepoint = 0;
+ // write a single codepoint
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+ dest = arrow::util::UTF8Encode(dest, codepoint);
+ // and skip the remainder
+ int64_t skips = -opt.step - 1;
+ while ((skips--) && (i > end_sliced)) {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+ }
+ }
+ return dest - output;
+ }
+
+#undef RETURN_IF_UTF8_ERROR
+};
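+
+// Illustrative examples (ours): with SliceOptions{start=1, stop=4, step=2}
+// the input "hello" yields "el" (codepoints at offsets 1 and 3); with
+// {start=-1, stop=-6, step=-1} it yields the reversal "olleh". A step of
+// zero is rejected in PreExec above.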
+
+template <typename Type>
+using SliceCodeunits = StringTransformExec<Type, SliceCodeunitsTransform>;
+
+const FunctionDoc utf8_slice_codeunits_doc(
+ "Slice string ",
+ ("For each string in `strings`, slice into a substring defined by\n"
+ "`start`, `stop`, `step`) as given by `SliceOptions` where `start` is inclusive\n"
+ "and `stop` is exclusive and are measured in codeunits. If step is negative, the\n"
+ "string will be advanced in reversed order. A `step` of zero is considered an\n"
+ "error.\n"
+ "Null inputs emit null."),
+ {"strings"}, "SliceOptions");
+
+void AddSlice(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
+ &utf8_slice_codeunits_doc);
+ using t32 = SliceCodeunits<StringType>;
+ using t64 = SliceCodeunits<LargeStringType>;
DCHECK_OK(
- func->AddKernel({utf8()}, utf8(), t32::Exec, SliceCodeunitsTransform::State::Init));
- DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
- SliceCodeunitsTransform::State::Init));
+ func->AddKernel({utf8()}, utf8(), t32::Exec, SliceCodeunitsTransform::State::Init));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
+ SliceCodeunitsTransform::State::Init));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
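
For orientation, a minimal usage sketch of the kernel registered above, invoked through
the generic compute entry point. `arrow::compute::CallFunction` and `SliceOptions` are
the standard Arrow C++ API; the input values are illustrative, not taken from this diff.

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> SliceExample(
        const std::shared_ptr<arrow::Array>& strings) {
      // start is inclusive, stop is exclusive, both counted in codeunits (not bytes)
      arrow::compute::SliceOptions options(/*start=*/1, /*stop=*/4, /*step=*/1);
      // e.g. "foobar" -> "oob"
      return arrow::compute::CallFunction("utf8_slice_codeunits", {strings}, &options);
    }
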
@@ -1496,8 +1496,8 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
template <typename Derived, bool allow_empty = false>
struct CharacterPredicateUnicode {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status* st) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status* st) {
if (allow_empty && input_string_ncodeunits == 0) {
return true;
}
@@ -1508,7 +1508,7 @@ struct CharacterPredicateUnicode {
any |= Derived::PredicateCharacterAny(codepoint);
return Derived::PredicateCharacterAll(codepoint);
}))) {
- *st = Status::Invalid("Invalid UTF8 sequence in input");
+ *st = Status::Invalid("Invalid UTF8 sequence in input");
return false;
}
return all & any;
@@ -1521,8 +1521,8 @@ struct CharacterPredicateUnicode {
template <typename Derived, bool allow_empty = false>
struct CharacterPredicateAscii {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status*) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status*) {
if (allow_empty && input_string_ncodeunits == 0) {
return true;
}
@@ -1599,8 +1599,8 @@ struct IsNumericUnicode : CharacterPredicateUnicode<IsNumericUnicode> {
#endif
struct IsAscii {
- static bool Call(KernelContext*, const uint8_t* input,
- size_t input_string_nascii_characters, Status*) {
+ static bool Call(KernelContext*, const uint8_t* input,
+ size_t input_string_nascii_characters, Status*) {
return std::all_of(input, input + input_string_nascii_characters,
IsAsciiCharacter<uint8_t>);
}
@@ -1661,8 +1661,8 @@ struct IsSpaceAscii : CharacterPredicateAscii<IsSpaceAscii> {
#ifdef ARROW_WITH_UTF8PROC
struct IsTitleUnicode {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status* st) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status* st) {
// rules:
// * 1: lower case follows cased
// * 2: upper case follows uncased
@@ -1689,7 +1689,7 @@ struct IsTitleUnicode {
return true;
});
if (!ARROW_PREDICT_TRUE(status)) {
- *st = Status::Invalid("Invalid UTF8 sequence in input");
+ *st = Status::Invalid("Invalid UTF8 sequence in input");
return false;
}
return rules_1_and_2 & rule_3;
@@ -1698,8 +1698,8 @@ struct IsTitleUnicode {
#endif
struct IsTitleAscii {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status*) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status*) {
// rules:
// * 1: lower case follows cased
// * 2: upper case follows uncased
@@ -1758,1021 +1758,1021 @@ struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
}
};
-// splitting
-
-template <typename Options>
-struct SplitFinderBase {
- virtual Status PreExec(const Options& options) { return Status::OK(); }
-
- // Derived classes should also define these methods:
- // static bool Find(const uint8_t* begin, const uint8_t* end,
- // const uint8_t** separator_begin,
- // const uint8_t** separator_end,
- // const SplitPatternOptions& options);
- //
- // static bool FindReverse(const uint8_t* begin, const uint8_t* end,
- // const uint8_t** separator_begin,
- // const uint8_t** separator_end,
- // const SplitPatternOptions& options);
-};
-
-template <typename Type, typename ListType, typename SplitFinder,
- typename Options = typename SplitFinder::Options>
-struct SplitExec {
- using string_offset_type = typename Type::offset_type;
- using list_offset_type = typename ListType::offset_type;
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using ArrayListType = typename TypeTraits<ListType>::ArrayType;
- using ListScalarType = typename TypeTraits<ListType>::ScalarType;
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
- using State = OptionsWrapper<Options>;
-
-  // Keep the temporary storage across individual values, to minimize reallocations
- std::vector<util::string_view> parts;
- Options options;
-
- explicit SplitExec(const Options& options) : options(options) {}
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return SplitExec{State::Get(ctx)}.Execute(ctx, batch, out);
- }
-
- Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- SplitFinder finder;
- RETURN_NOT_OK(finder.PreExec(options));
- if (batch[0].kind() == Datum::ARRAY) {
- return Execute(ctx, &finder, batch[0].array(), out);
- }
- DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
- return Execute(ctx, &finder, batch[0].scalar(), out);
- }
-
- Status Execute(KernelContext* ctx, SplitFinder* finder,
- const std::shared_ptr<ArrayData>& data, Datum* out) {
- const ArrayType input(data);
-
- BuilderType builder(input.type(), ctx->memory_pool());
- // A slight overestimate of the data needed
- RETURN_NOT_OK(builder.ReserveData(input.total_values_length()));
-    // The minimum number of strings needed
- RETURN_NOT_OK(builder.Resize(input.length() - input.null_count()));
-
- ArrayData* output_list = out->mutable_array();
- // List offsets were preallocated
- auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
- DCHECK_NE(list_offsets, nullptr);
- // Initial value
- *list_offsets++ = 0;
- for (int64_t i = 0; i < input.length(); ++i) {
- if (!input.IsNull(i)) {
- RETURN_NOT_OK(SplitString(input.GetView(i), finder, &builder));
- if (ARROW_PREDICT_FALSE(builder.length() >
- std::numeric_limits<list_offset_type>::max())) {
-          return Status::CapacityError("List offset does not fit into 32 bits");
- }
- }
- *list_offsets++ = static_cast<list_offset_type>(builder.length());
- }
- // Assign string array to list child data
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder.Finish(&string_array));
- output_list->child_data.push_back(string_array->data());
- return Status::OK();
- }
-
- Status Execute(KernelContext* ctx, SplitFinder* finder,
- const std::shared_ptr<Scalar>& scalar, Datum* out) {
- const auto& input = checked_cast<const ScalarType&>(*scalar);
- auto result = checked_cast<ListScalarType*>(out->scalar().get());
- if (input.is_valid) {
- result->is_valid = true;
- BuilderType builder(input.type, ctx->memory_pool());
- util::string_view s(*input.value);
- RETURN_NOT_OK(SplitString(s, finder, &builder));
- RETURN_NOT_OK(builder.Finish(&result->value));
- }
- return Status::OK();
- }
-
- Status SplitString(const util::string_view& s, SplitFinder* finder,
- BuilderType* builder) {
- const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
- const uint8_t* end = begin + s.length();
-
- int64_t max_splits = options.max_splits;
-    // if there is no maximum number of splits, reversing does not make sense (and is
-    // probably less efficient), but it is useful for testing
- if (options.reverse) {
- // note that i points 1 further than the 'current'
- const uint8_t* i = end;
- // we will record the parts in reverse order
- parts.clear();
- if (max_splits > -1) {
- parts.reserve(max_splits + 1);
- }
- while (max_splits != 0) {
- const uint8_t *separator_begin, *separator_end;
- // find with whatever algo the part we will 'cut out'
- if (finder->FindReverse(begin, i, &separator_begin, &separator_end, options)) {
- parts.emplace_back(reinterpret_cast<const char*>(separator_end),
- i - separator_end);
- i = separator_begin;
- max_splits--;
- } else {
- // if we cannot find a separator, we're done
- break;
- }
- }
- parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
- // now we do the copying
- for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
- RETURN_NOT_OK(builder->Append(*it));
- }
- } else {
- const uint8_t* i = begin;
- while (max_splits != 0) {
- const uint8_t *separator_begin, *separator_end;
- // find with whatever algo the part we will 'cut out'
- if (finder->Find(i, end, &separator_begin, &separator_end, options)) {
- // the part till the beginning of the 'cut'
- RETURN_NOT_OK(
- builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
- i = separator_end;
- max_splits--;
- } else {
- // if we cannot find a separator, we're done
- break;
- }
- }
- // trailing part
- RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
- }
- return Status::OK();
- }
-};
-
-struct SplitPatternFinder : public SplitFinderBase<SplitPatternOptions> {
- using Options = SplitPatternOptions;
-
- Status PreExec(const SplitPatternOptions& options) override {
- if (options.pattern.length() == 0) {
- return Status::Invalid("Empty separator");
- }
- return Status::OK();
- }
-
- static bool Find(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitPatternOptions& options) {
- const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
- const int64_t pattern_length = options.pattern.length();
- const uint8_t* i = begin;
-    // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
-    // the match kernel
- while ((i + pattern_length <= end)) {
- i = std::search(i, end, pattern, pattern + pattern_length);
- if (i != end) {
- *separator_begin = i;
- *separator_end = i + pattern_length;
- return true;
- }
- }
- return false;
- }
-
- static bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitPatternOptions& options) {
- const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
- const int64_t pattern_length = options.pattern.length();
-    // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
-    // the match kernel
- std::reverse_iterator<const uint8_t*> ri(end);
- std::reverse_iterator<const uint8_t*> rend(begin);
- std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
- std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
- while (begin <= ri.base() - pattern_length) {
- ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
- if (ri != rend) {
- *separator_begin = ri.base() - pattern_length;
- *separator_end = ri.base();
- return true;
- }
- }
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitPatternExec = SplitExec<Type, ListType, SplitPatternFinder>;
-
-const FunctionDoc split_pattern_doc(
- "Split string according to separator",
- ("Split each string according to the exact `pattern` defined in\n"
- "SplitPatternOptions. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitPatternOptions."),
- {"strings"}, "SplitPatternOptions");
-
-const FunctionDoc ascii_split_whitespace_doc(
- "Split string according to any ASCII whitespace",
-    ("Split each string according to any non-zero-length sequence of ASCII\n"
- "whitespace characters. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitOptions."),
- {"strings"}, "SplitOptions");
-
-const FunctionDoc utf8_split_whitespace_doc(
- "Split string according to any Unicode whitespace",
-    ("Split each string according to any non-zero-length sequence of Unicode\n"
- "whitespace characters. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitOptions."),
- {"strings"}, "SplitOptions");
-
-void AddSplitPattern(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
- &split_pattern_doc);
- using t32 = SplitPatternExec<StringType, ListType>;
- using t64 = SplitPatternExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-struct SplitWhitespaceAsciiFinder : public SplitFinderBase<SplitOptions> {
- using Options = SplitOptions;
-
- static bool Find(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitOptions& options) {
- const uint8_t* i = begin;
- while (i < end) {
- if (IsSpaceCharacterAscii(*i)) {
- *separator_begin = i;
- do {
- i++;
- } while (IsSpaceCharacterAscii(*i) && i < end);
- *separator_end = i;
- return true;
- }
- i++;
- }
- return false;
- }
-
- static bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitOptions& options) {
- const uint8_t* i = end - 1;
- while ((i >= begin)) {
- if (IsSpaceCharacterAscii(*i)) {
- *separator_end = i + 1;
- do {
- i--;
- } while (IsSpaceCharacterAscii(*i) && i >= begin);
- *separator_begin = i + 1;
- return true;
- }
- i--;
- }
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitWhitespaceAsciiExec = SplitExec<Type, ListType, SplitWhitespaceAsciiFinder>;
-
-void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
- static const SplitOptions default_options{};
- auto func =
- std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
- &ascii_split_whitespace_doc, &default_options);
- using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
- using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-#ifdef ARROW_WITH_UTF8PROC
-struct SplitWhitespaceUtf8Finder : public SplitFinderBase<SplitOptions> {
- using Options = SplitOptions;
-
- Status PreExec(const SplitOptions& options) override {
- EnsureLookupTablesFilled();
- return Status::OK();
- }
-
- bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
- const uint8_t** separator_end, const SplitOptions& options) {
- const uint8_t* i = begin;
- while ((i < end)) {
- uint32_t codepoint = 0;
- *separator_begin = i;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
- return false;
- }
- if (IsSpaceCharacterUnicode(codepoint)) {
- do {
- *separator_end = i;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
- return false;
- }
- } while (IsSpaceCharacterUnicode(codepoint) && i < end);
- return true;
- }
- }
- return false;
- }
-
- bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitOptions& options) {
- const uint8_t* i = end - 1;
- while ((i >= begin)) {
- uint32_t codepoint = 0;
- *separator_end = i + 1;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
- return false;
- }
- if (IsSpaceCharacterUnicode(codepoint)) {
- do {
- *separator_begin = i + 1;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
- return false;
- }
- } while (IsSpaceCharacterUnicode(codepoint) && i >= begin);
- return true;
- }
- }
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitWhitespaceUtf8Exec = SplitExec<Type, ListType, SplitWhitespaceUtf8Finder>;
-
-void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
- static const SplitOptions default_options{};
- auto func =
- std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
- &utf8_split_whitespace_doc, &default_options);
- using t32 = SplitWhitespaceUtf8Exec<StringType, ListType>;
- using t64 = SplitWhitespaceUtf8Exec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-#endif // ARROW_WITH_UTF8PROC
-
-#ifdef ARROW_WITH_RE2
-struct SplitRegexFinder : public SplitFinderBase<SplitPatternOptions> {
- using Options = SplitPatternOptions;
-
- util::optional<RE2> regex_split;
-
- Status PreExec(const SplitPatternOptions& options) override {
- if (options.reverse) {
- return Status::NotImplemented("Cannot split in reverse with regex");
- }
- // RE2 does *not* give you the full match! Must wrap the regex in a capture group
- // There is FindAndConsume, but it would give only the end of the separator
- std::string pattern = "(";
- pattern.reserve(options.pattern.size() + 2);
- pattern += options.pattern;
- pattern += ')';
- regex_split.emplace(std::move(pattern));
- return RegexStatus(*regex_split);
- }
-
- bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
- const uint8_t** separator_end, const SplitPatternOptions& options) {
- re2::StringPiece piece(reinterpret_cast<const char*>(begin),
- std::distance(begin, end));
- // "StringPiece is mutated to point to matched piece"
- re2::StringPiece result;
- if (!re2::RE2::PartialMatch(piece, *regex_split, &result)) {
- return false;
- }
- *separator_begin = reinterpret_cast<const uint8_t*>(result.data());
- *separator_end = reinterpret_cast<const uint8_t*>(result.data() + result.size());
- return true;
- }
-
- bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitPatternOptions& options) {
- // Unsupported (see PreExec)
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitRegexExec = SplitExec<Type, ListType, SplitRegexFinder>;
-
-const FunctionDoc split_pattern_regex_doc(
- "Split string according to regex pattern",
- ("Split each string according to the regex `pattern` defined in\n"
- "SplitPatternOptions. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitPatternOptions."),
- {"strings"}, "SplitPatternOptions");
-
-void AddSplitRegex(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("split_pattern_regex", Arity::Unary(),
- &split_pattern_regex_doc);
- using t32 = SplitRegexExec<StringType, ListType>;
- using t64 = SplitRegexExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-#endif // ARROW_WITH_RE2
-
-void AddSplit(FunctionRegistry* registry) {
- AddSplitPattern(registry);
- AddSplitWhitespaceAscii(registry);
-#ifdef ARROW_WITH_UTF8PROC
- AddSplitWhitespaceUTF8(registry);
-#endif
-#ifdef ARROW_WITH_RE2
- AddSplitRegex(registry);
-#endif
-}
-
-// ----------------------------------------------------------------------
-// Replace substring (plain, regex)
-
-template <typename Type, typename Replacer>
-struct ReplaceSubString {
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- using offset_type = typename Type::offset_type;
- using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
- using OffsetBuilder = TypedBufferBuilder<offset_type>;
- using State = OptionsWrapper<ReplaceSubstringOptions>;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // TODO Cache replacer across invocations (for regex compilation)
- ARROW_ASSIGN_OR_RAISE(auto replacer, Replacer::Make(State::Get(ctx)));
- return Replace(ctx, batch, *replacer, out);
- }
-
- static Status Replace(KernelContext* ctx, const ExecBatch& batch,
- const Replacer& replacer, Datum* out) {
- ValueDataBuilder value_data_builder(ctx->memory_pool());
- OffsetBuilder offset_builder(ctx->memory_pool());
-
- if (batch[0].kind() == Datum::ARRAY) {
- // We already know how many strings we have, so we can use Reserve/UnsafeAppend
- RETURN_NOT_OK(offset_builder.Reserve(batch[0].array()->length + 1));
- offset_builder.UnsafeAppend(0); // offsets start at 0
-
- const ArrayData& input = *batch[0].array();
- RETURN_NOT_OK(VisitArrayDataInline<Type>(
- input,
- [&](util::string_view s) {
- RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
- offset_builder.UnsafeAppend(
- static_cast<offset_type>(value_data_builder.length()));
- return Status::OK();
- },
- [&]() {
- // offset for null value
- offset_builder.UnsafeAppend(
- static_cast<offset_type>(value_data_builder.length()));
- return Status::OK();
- }));
- ArrayData* output = out->mutable_array();
- RETURN_NOT_OK(value_data_builder.Finish(&output->buffers[2]));
- RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1]));
- } else {
- const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
- auto result = std::make_shared<ScalarType>();
- if (input.is_valid) {
- util::string_view s = static_cast<util::string_view>(*input.value);
- RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
- RETURN_NOT_OK(value_data_builder.Finish(&result->value));
- result->is_valid = true;
- }
- out->value = result;
- }
-
- return Status::OK();
- }
-};
-
-struct PlainSubStringReplacer {
- const ReplaceSubstringOptions& options_;
-
- static Result<std::unique_ptr<PlainSubStringReplacer>> Make(
- const ReplaceSubstringOptions& options) {
- return arrow::internal::make_unique<PlainSubStringReplacer>(options);
- }
-
- explicit PlainSubStringReplacer(const ReplaceSubstringOptions& options)
- : options_(options) {}
-
- Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
- const char* i = s.begin();
- const char* end = s.end();
- int64_t max_replacements = options_.max_replacements;
- while ((i < end) && (max_replacements != 0)) {
- const char* pos =
- std::search(i, end, options_.pattern.begin(), options_.pattern.end());
- if (pos == end) {
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i)));
- i = end;
- } else {
- // the string before the pattern
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(pos - i)));
- // the replacement
- RETURN_NOT_OK(
- builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
- options_.replacement.length()));
- // skip pattern
- i = pos + options_.pattern.length();
- max_replacements--;
- }
- }
- // if we exited early due to max_replacements, add the trailing part
- return builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i));
- }
-};
-
-#ifdef ARROW_WITH_RE2
-struct RegexSubStringReplacer {
- const ReplaceSubstringOptions& options_;
- const RE2 regex_find_;
- const RE2 regex_replacement_;
-
- static Result<std::unique_ptr<RegexSubStringReplacer>> Make(
- const ReplaceSubstringOptions& options) {
- auto replacer = arrow::internal::make_unique<RegexSubStringReplacer>(options);
-
- RETURN_NOT_OK(RegexStatus(replacer->regex_find_));
- RETURN_NOT_OK(RegexStatus(replacer->regex_replacement_));
-
- std::string replacement_error;
- if (!replacer->regex_replacement_.CheckRewriteString(replacer->options_.replacement,
- &replacement_error)) {
- return Status::Invalid("Invalid replacement string: ",
- std::move(replacement_error));
- }
-
- return std::move(replacer);
- }
-
-  // Using RE2::FindAndConsume we can only find the pattern if it is a group; therefore
-  // we keep two regexes, one with () around the pattern, one without.
- explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
- : options_(options),
- regex_find_("(" + options_.pattern + ")", RE2::Quiet),
- regex_replacement_(options_.pattern, RE2::Quiet) {}
-
- Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
- re2::StringPiece replacement(options_.replacement);
-
- if (options_.max_replacements == -1) {
- std::string s_copy(s.to_string());
- re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
- return builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
- s_copy.length());
- }
-
- // Since RE2 does not have the concept of max_replacements, we have to do some work
- // ourselves.
-    // We might do this faster, similarly to RE2::GlobalReplace, using Match and Rewrite
- const char* i = s.begin();
- const char* end = s.end();
- re2::StringPiece piece(s.data(), s.length());
-
- int64_t max_replacements = options_.max_replacements;
- while ((i < end) && (max_replacements != 0)) {
- std::string found;
- if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i)));
- i = end;
- } else {
- // wind back to the beginning of the match
- const char* pos = piece.begin() - found.length();
- // the string before the pattern
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(pos - i)));
- // replace the pattern in what we found
- if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
- return Status::Invalid("Regex found, but replacement failed");
- }
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
- static_cast<int64_t>(found.length())));
- // skip pattern
- i = piece.begin();
- max_replacements--;
- }
- }
- // If we exited early due to max_replacements, add the trailing part
- return builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i));
- }
-};
-#endif
-
-template <typename Type>
-using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
-
-const FunctionDoc replace_substring_doc(
- "Replace non-overlapping substrings that match pattern by replacement",
- ("For each string in `strings`, replace non-overlapping substrings that match\n"
- "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
-     "maximum number of replacements made, counting from the left. Null values emit\n"
- "null."),
- {"strings"}, "ReplaceSubstringOptions");
-
-#ifdef ARROW_WITH_RE2
-template <typename Type>
-using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
-
-const FunctionDoc replace_substring_regex_doc(
- "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
- ("For each string in `strings`, replace non-overlapping substrings that match the\n"
- "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
-     "If `max_replacements != -1`, it determines the maximum number of replacements\n"
- "made, counting from the left. Note that if the pattern contains groups,\n"
-     "backreferencing can be used. Null values emit null."),
- {"strings"}, "ReplaceSubstringOptions");
-#endif
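
A hedged usage sketch for the two replacement kernels documented above (standard Arrow
C++ compute API; sample values are illustrative). Both take `ReplaceSubstringOptions`;
they differ only in whether `pattern` is matched literally or as an RE2 regex.

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> ReplaceExample(
        const std::shared_ptr<arrow::Array>& strings) {
      // replace at most one occurrence of "o" with "0", counting from the left
      arrow::compute::ReplaceSubstringOptions options(
          /*pattern=*/"o", /*replacement=*/"0", /*max_replacements=*/1);
      // for the regex variant, call "replace_substring_regex" with the same options
      return arrow::compute::CallFunction("replace_substring", {strings}, &options);
    }
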
-
-// ----------------------------------------------------------------------
-// Replace slice
-
-struct ReplaceSliceTransformBase : public StringTransformBase {
- using State = OptionsWrapper<ReplaceSliceOptions>;
-
- const ReplaceSliceOptions* options;
-
- explicit ReplaceSliceTransformBase(const ReplaceSliceOptions& options)
- : options{&options} {}
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- return ninputs * options->replacement.size() + input_ncodeunits;
- }
-};
-
-struct BinaryReplaceSliceTransform : ReplaceSliceTransformBase {
- using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const auto& opts = *options;
- int64_t before_slice = 0;
- int64_t after_slice = 0;
- uint8_t* output_start = output;
-
- if (opts.start >= 0) {
- // Count from left
- before_slice = std::min<int64_t>(input_string_ncodeunits, opts.start);
- } else {
- // Count from right
- before_slice = std::max<int64_t>(0, input_string_ncodeunits + opts.start);
- }
- // Mimic Pandas: if stop would be before start, treat as 0-length slice
- if (opts.stop >= 0) {
- // Count from left
- after_slice =
- std::min<int64_t>(input_string_ncodeunits, std::max(before_slice, opts.stop));
- } else {
- // Count from right
- after_slice = std::max<int64_t>(before_slice, input_string_ncodeunits + opts.stop);
- }
- output = std::copy(input, input + before_slice, output);
- output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
- output = std::copy(input + after_slice, input + input_string_ncodeunits, output);
- return output - output_start;
- }
-};
-
-struct Utf8ReplaceSliceTransform : ReplaceSliceTransformBase {
- using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const auto& opts = *options;
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t *begin_sliced, *end_sliced;
- uint8_t* output_start = output;
-
- // Mimic Pandas: if stop would be before start, treat as 0-length slice
- if (opts.start >= 0) {
- // Count from left
- if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opts.start)) {
- return kTransformError;
- }
- if (opts.stop > options->start) {
- // Continue counting from left
- const int64_t length = opts.stop - options->start;
- if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)) {
- return kTransformError;
- }
- } else if (opts.stop < 0) {
- // Count from right
- if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
- -opts.stop)) {
- return kTransformError;
- }
- } else {
- // Zero-length slice
- end_sliced = begin_sliced;
- }
- } else {
- // Count from right
- if (!arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
- -opts.start)) {
- return kTransformError;
- }
- if (opts.stop >= 0) {
- // Restart counting from left
- if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opts.stop)) {
- return kTransformError;
- }
- if (end_sliced <= begin_sliced) {
- // Zero-length slice
- end_sliced = begin_sliced;
- }
- } else if ((opts.stop < 0) && (options->stop > options->start)) {
- // Count from right
- if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
- -opts.stop)) {
- return kTransformError;
- }
- } else {
- // zero-length slice
- end_sliced = begin_sliced;
- }
- }
- output = std::copy(begin, begin_sliced, output);
- output = std::copy(opts.replacement.begin(), options->replacement.end(), output);
- output = std::copy(end_sliced, end, output);
- return output - output_start;
- }
-};
-
-template <typename Type>
-using BinaryReplaceSlice =
- StringTransformExecWithState<Type, BinaryReplaceSliceTransform>;
-template <typename Type>
-using Utf8ReplaceSlice = StringTransformExecWithState<Type, Utf8ReplaceSliceTransform>;
-
-const FunctionDoc binary_replace_slice_doc(
- "Replace a slice of a binary string with `replacement`",
-    ("For each string in `strings`, replace a slice of the string defined by `start` "
- "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
- "and both are measured in bytes.\n"
- "Null values emit null."),
- {"strings"}, "ReplaceSliceOptions");
-
-const FunctionDoc utf8_replace_slice_doc(
- "Replace a slice of a string with `replacement`",
-    ("For each string in `strings`, replace a slice of the string defined by `start` "
- "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive, "
- "and both are measured in codeunits.\n"
- "Null values emit null."),
- {"strings"}, "ReplaceSliceOptions");
-
-void AddReplaceSlice(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("binary_replace_slice", Arity::Unary(),
- &binary_replace_slice_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- DCHECK_OK(func->AddKernel({ty}, ty,
- GenerateTypeAgnosticVarBinaryBase<BinaryReplaceSlice>(ty),
- ReplaceSliceTransformBase::State::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-
- {
- auto func = std::make_shared<ScalarFunction>("utf8_replace_slice", Arity::Unary(),
- &utf8_replace_slice_doc);
- DCHECK_OK(func->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice<StringType>::Exec,
- ReplaceSliceTransformBase::State::Init));
- DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(),
- Utf8ReplaceSlice<LargeStringType>::Exec,
- ReplaceSliceTransformBase::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
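
A short sketch of the slice-replacement kernels in use (standard Arrow C++ compute API;
the values are illustrative):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> ReplaceSliceExample(
        const std::shared_ptr<arrow::Array>& strings) {
      // splice "--" over codeunits [1, 3); negative start/stop count from the end
      arrow::compute::ReplaceSliceOptions options(/*start=*/1, /*stop=*/3,
                                                  /*replacement=*/"--");
      // e.g. "foobar" -> "f--bar"; use "binary_replace_slice" to work on raw bytes
      return arrow::compute::CallFunction("utf8_replace_slice", {strings}, &options);
    }
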
-
-// ----------------------------------------------------------------------
-// Extract with regex
-
-#ifdef ARROW_WITH_RE2
-
-// TODO cache this once per ExtractRegexOptions
-struct ExtractRegexData {
- // Use unique_ptr<> because RE2 is non-movable
- std::unique_ptr<RE2> regex;
- std::vector<std::string> group_names;
-
- static Result<ExtractRegexData> Make(const ExtractRegexOptions& options) {
- ExtractRegexData data(options.pattern);
- RETURN_NOT_OK(RegexStatus(*data.regex));
-
- const int group_count = data.regex->NumberOfCapturingGroups();
- const auto& name_map = data.regex->CapturingGroupNames();
- data.group_names.reserve(group_count);
-
- for (int i = 0; i < group_count; i++) {
- auto item = name_map.find(i + 1); // re2 starts counting from 1
- if (item == name_map.end()) {
- // XXX should we instead just create fields with an empty name?
- return Status::Invalid("Regular expression contains unnamed groups");
- }
- data.group_names.emplace_back(item->second);
- }
- return std::move(data);
- }
-
- Result<ValueDescr> ResolveOutputType(const std::vector<ValueDescr>& args) const {
- const auto& input_type = args[0].type;
- if (input_type == nullptr) {
- // No input type specified => propagate shape
- return args[0];
- }
- // Input type is either String or LargeString and is also the type of each
- // field in the output struct type.
- DCHECK(input_type->id() == Type::STRING || input_type->id() == Type::LARGE_STRING);
- FieldVector fields;
- fields.reserve(group_names.size());
- std::transform(group_names.begin(), group_names.end(), std::back_inserter(fields),
- [&](const std::string& name) { return field(name, input_type); });
- return struct_(std::move(fields));
- }
-
- private:
- explicit ExtractRegexData(const std::string& pattern)
- : regex(new RE2(pattern, RE2::Quiet)) {}
-};
-
-Result<ValueDescr> ResolveExtractRegexOutput(KernelContext* ctx,
- const std::vector<ValueDescr>& args) {
- using State = OptionsWrapper<ExtractRegexOptions>;
- ExtractRegexOptions options = State::Get(ctx);
- ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
- return data.ResolveOutputType(args);
-}
-
-struct ExtractRegexBase {
- const ExtractRegexData& data;
- const int group_count;
- std::vector<re2::StringPiece> found_values;
- std::vector<re2::RE2::Arg> args;
- std::vector<const re2::RE2::Arg*> args_pointers;
- const re2::RE2::Arg** args_pointers_start;
- const re2::RE2::Arg* null_arg = nullptr;
-
- explicit ExtractRegexBase(const ExtractRegexData& data)
- : data(data),
- group_count(static_cast<int>(data.group_names.size())),
- found_values(group_count) {
- args.reserve(group_count);
- args_pointers.reserve(group_count);
-
- for (int i = 0; i < group_count; i++) {
- args.emplace_back(&found_values[i]);
- // Since we reserved capacity, we're guaranteed the pointer remains valid
- args_pointers.push_back(&args[i]);
- }
- // Avoid null pointer if there is no capture group
- args_pointers_start = (group_count > 0) ? args_pointers.data() : &null_arg;
- }
-
- bool Match(util::string_view s) {
- return re2::RE2::PartialMatchN(ToStringPiece(s), *data.regex, args_pointers_start,
- group_count);
- }
-};
-
-template <typename Type>
-struct ExtractRegex : public ExtractRegexBase {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- using State = OptionsWrapper<ExtractRegexOptions>;
-
- using ExtractRegexBase::ExtractRegexBase;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ExtractRegexOptions options = State::Get(ctx);
- ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
- return ExtractRegex{data}.Extract(ctx, batch, out);
- }
-
- Status Extract(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(auto descr, data.ResolveOutputType(batch.GetDescriptors()));
- DCHECK_NE(descr.type, nullptr);
- const auto& type = descr.type;
-
- if (batch[0].kind() == Datum::ARRAY) {
- std::unique_ptr<ArrayBuilder> array_builder;
- RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type, &array_builder));
- StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
-
- std::vector<BuilderType*> field_builders;
- field_builders.reserve(group_count);
- for (int i = 0; i < group_count; i++) {
- field_builders.push_back(
- checked_cast<BuilderType*>(struct_builder->field_builder(i)));
- }
-
- auto visit_null = [&]() { return struct_builder->AppendNull(); };
- auto visit_value = [&](util::string_view s) {
- if (Match(s)) {
- for (int i = 0; i < group_count; i++) {
- RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
- }
- return struct_builder->Append();
- } else {
- return struct_builder->AppendNull();
- }
- };
- const ArrayData& input = *batch[0].array();
- RETURN_NOT_OK(VisitArrayDataInline<Type>(input, visit_value, visit_null));
-
- std::shared_ptr<Array> out_array;
- RETURN_NOT_OK(struct_builder->Finish(&out_array));
- *out = std::move(out_array);
- } else {
- const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
- auto result = std::make_shared<StructScalar>(type);
- if (input.is_valid && Match(util::string_view(*input.value))) {
- result->value.reserve(group_count);
- for (int i = 0; i < group_count; i++) {
- result->value.push_back(
- std::make_shared<ScalarType>(found_values[i].as_string()));
- }
- result->is_valid = true;
- } else {
- result->is_valid = false;
- }
- out->value = std::move(result);
- }
-
- return Status::OK();
- }
-};
-
-const FunctionDoc extract_regex_doc(
- "Extract substrings captured by a regex pattern",
- ("For each string in `strings`, match the regular expression and, if\n"
- "successful, emit a struct with field names and values coming from the\n"
- "regular expression's named capture groups. If the input is null or the\n"
- "regular expression fails matching, a null output value is emitted.\n"
- "\n"
- "Regular expression matching is done using the Google RE2 library."),
- {"strings"}, "ExtractRegexOptions");
-
-void AddExtractRegex(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
- &extract_regex_doc);
- using t32 = ExtractRegex<StringType>;
- using t64 = ExtractRegex<LargeStringType>;
- OutputType out_ty(ResolveExtractRegexOutput);
- ScalarKernel kernel;
-
- // Null values will be computed based on regex match or not
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
- kernel.exec = t32::Exec;
- kernel.init = t32::State::Init;
- DCHECK_OK(func->AddKernel(kernel));
- kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
- kernel.exec = t64::Exec;
- kernel.init = t64::State::Init;
- DCHECK_OK(func->AddKernel(kernel));
-
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-#endif // ARROW_WITH_RE2
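
A usage sketch, assuming the standard Arrow C++ compute API (pattern and inputs are
illustrative). Every capture group must be named, and the result is a struct column
with one field per group:

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> ExtractExample(
        const std::shared_ptr<arrow::Array>& strings) {
      // RE2 named groups become struct fields: struct<letter: utf8, digit: utf8>
      arrow::compute::ExtractRegexOptions options("(?P<letter>[ab])(?P<digit>\\d)");
      // rows that are null or fail to match yield a null struct
      return arrow::compute::CallFunction("extract_regex", {strings}, &options);
    }
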
-
+// splitting
+
+template <typename Options>
+struct SplitFinderBase {
+ virtual Status PreExec(const Options& options) { return Status::OK(); }
+
+ // Derived classes should also define these methods:
+ // static bool Find(const uint8_t* begin, const uint8_t* end,
+ // const uint8_t** separator_begin,
+ // const uint8_t** separator_end,
+ // const SplitPatternOptions& options);
+ //
+ // static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ // const uint8_t** separator_begin,
+ // const uint8_t** separator_end,
+ // const SplitPatternOptions& options);
+};
+
+template <typename Type, typename ListType, typename SplitFinder,
+ typename Options = typename SplitFinder::Options>
+struct SplitExec {
+ using string_offset_type = typename Type::offset_type;
+ using list_offset_type = typename ListType::offset_type;
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using ArrayListType = typename TypeTraits<ListType>::ArrayType;
+ using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
+ using State = OptionsWrapper<Options>;
+
+  // Keep the temporary storage across individual values, to minimize reallocations
+ std::vector<util::string_view> parts;
+ Options options;
+
+ explicit SplitExec(const Options& options) : options(options) {}
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return SplitExec{State::Get(ctx)}.Execute(ctx, batch, out);
+ }
+
+ Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ SplitFinder finder;
+ RETURN_NOT_OK(finder.PreExec(options));
+ if (batch[0].kind() == Datum::ARRAY) {
+ return Execute(ctx, &finder, batch[0].array(), out);
+ }
+ DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
+ return Execute(ctx, &finder, batch[0].scalar(), out);
+ }
+
+ Status Execute(KernelContext* ctx, SplitFinder* finder,
+ const std::shared_ptr<ArrayData>& data, Datum* out) {
+ const ArrayType input(data);
+
+ BuilderType builder(input.type(), ctx->memory_pool());
+ // A slight overestimate of the data needed
+ RETURN_NOT_OK(builder.ReserveData(input.total_values_length()));
+    // The minimum number of strings needed
+ RETURN_NOT_OK(builder.Resize(input.length() - input.null_count()));
+
+ ArrayData* output_list = out->mutable_array();
+ // List offsets were preallocated
+ auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
+ DCHECK_NE(list_offsets, nullptr);
+ // Initial value
+ *list_offsets++ = 0;
+ for (int64_t i = 0; i < input.length(); ++i) {
+ if (!input.IsNull(i)) {
+ RETURN_NOT_OK(SplitString(input.GetView(i), finder, &builder));
+ if (ARROW_PREDICT_FALSE(builder.length() >
+ std::numeric_limits<list_offset_type>::max())) {
+          return Status::CapacityError("List offset does not fit into 32 bits");
+ }
+ }
+ *list_offsets++ = static_cast<list_offset_type>(builder.length());
+ }
+ // Assign string array to list child data
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ output_list->child_data.push_back(string_array->data());
+ return Status::OK();
+ }
+
+ Status Execute(KernelContext* ctx, SplitFinder* finder,
+ const std::shared_ptr<Scalar>& scalar, Datum* out) {
+ const auto& input = checked_cast<const ScalarType&>(*scalar);
+ auto result = checked_cast<ListScalarType*>(out->scalar().get());
+ if (input.is_valid) {
+ result->is_valid = true;
+ BuilderType builder(input.type, ctx->memory_pool());
+ util::string_view s(*input.value);
+ RETURN_NOT_OK(SplitString(s, finder, &builder));
+ RETURN_NOT_OK(builder.Finish(&result->value));
+ }
+ return Status::OK();
+ }
+
+ Status SplitString(const util::string_view& s, SplitFinder* finder,
+ BuilderType* builder) {
+ const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
+ const uint8_t* end = begin + s.length();
+
+ int64_t max_splits = options.max_splits;
+    // if there is no maximum number of splits, reversing does not make sense (and is
+    // probably less efficient), but it is useful for testing
+ if (options.reverse) {
+ // note that i points 1 further than the 'current'
+ const uint8_t* i = end;
+ // we will record the parts in reverse order
+ parts.clear();
+ if (max_splits > -1) {
+ parts.reserve(max_splits + 1);
+ }
+ while (max_splits != 0) {
+ const uint8_t *separator_begin, *separator_end;
+ // find with whatever algo the part we will 'cut out'
+ if (finder->FindReverse(begin, i, &separator_begin, &separator_end, options)) {
+ parts.emplace_back(reinterpret_cast<const char*>(separator_end),
+ i - separator_end);
+ i = separator_begin;
+ max_splits--;
+ } else {
+ // if we cannot find a separator, we're done
+ break;
+ }
+ }
+ parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
+ // now we do the copying
+ for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+ RETURN_NOT_OK(builder->Append(*it));
+ }
+ } else {
+ const uint8_t* i = begin;
+ while (max_splits != 0) {
+ const uint8_t *separator_begin, *separator_end;
+ // find with whatever algo the part we will 'cut out'
+ if (finder->Find(i, end, &separator_begin, &separator_end, options)) {
+ // the part till the beginning of the 'cut'
+ RETURN_NOT_OK(
+ builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
+ i = separator_end;
+ max_splits--;
+ } else {
+ // if we cannot find a separator, we're done
+ break;
+ }
+ }
+ // trailing part
+ RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
+ }
+ return Status::OK();
+ }
+};
+
+struct SplitPatternFinder : public SplitFinderBase<SplitPatternOptions> {
+ using Options = SplitPatternOptions;
+
+ Status PreExec(const SplitPatternOptions& options) override {
+ if (options.pattern.length() == 0) {
+ return Status::Invalid("Empty separator");
+ }
+ return Status::OK();
+ }
+
+ static bool Find(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
+ const int64_t pattern_length = options.pattern.length();
+ const uint8_t* i = begin;
+    // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
+    // the match kernel
+ while ((i + pattern_length <= end)) {
+ i = std::search(i, end, pattern, pattern + pattern_length);
+ if (i != end) {
+ *separator_begin = i;
+ *separator_end = i + pattern_length;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
+ const int64_t pattern_length = options.pattern.length();
+    // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
+    // the match kernel
+ std::reverse_iterator<const uint8_t*> ri(end);
+ std::reverse_iterator<const uint8_t*> rend(begin);
+ std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
+ std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
+ while (begin <= ri.base() - pattern_length) {
+ ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
+ if (ri != rend) {
+ *separator_begin = ri.base() - pattern_length;
+ *separator_end = ri.base();
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitPatternExec = SplitExec<Type, ListType, SplitPatternFinder>;
+
+const FunctionDoc split_pattern_doc(
+ "Split string according to separator",
+ ("Split each string according to the exact `pattern` defined in\n"
+ "SplitPatternOptions. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitPatternOptions."),
+ {"strings"}, "SplitPatternOptions");
+
+const FunctionDoc ascii_split_whitespace_doc(
+ "Split string according to any ASCII whitespace",
+    ("Split each string according to any non-zero-length sequence of ASCII\n"
+ "whitespace characters. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitOptions."),
+ {"strings"}, "SplitOptions");
+
+const FunctionDoc utf8_split_whitespace_doc(
+ "Split string according to any Unicode whitespace",
+    ("Split each string according to any non-zero-length sequence of Unicode\n"
+ "whitespace characters. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitOptions."),
+ {"strings"}, "SplitOptions");
+
+void AddSplitPattern(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
+ &split_pattern_doc);
+ using t32 = SplitPatternExec<StringType, ListType>;
+ using t64 = SplitPatternExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+struct SplitWhitespaceAsciiFinder : public SplitFinderBase<SplitOptions> {
+ using Options = SplitOptions;
+
+ static bool Find(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = begin;
+ while (i < end) {
+ if (IsSpaceCharacterAscii(*i)) {
+ *separator_begin = i;
+ do {
+ i++;
+ } while (IsSpaceCharacterAscii(*i) && i < end);
+ *separator_end = i;
+ return true;
+ }
+ i++;
+ }
+ return false;
+ }
+
+ static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = end - 1;
+ while ((i >= begin)) {
+ if (IsSpaceCharacterAscii(*i)) {
+ *separator_end = i + 1;
+ do {
+ i--;
+ } while (IsSpaceCharacterAscii(*i) && i >= begin);
+ *separator_begin = i + 1;
+ return true;
+ }
+ i--;
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitWhitespaceAsciiExec = SplitExec<Type, ListType, SplitWhitespaceAsciiFinder>;
+
+void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
+ static const SplitOptions default_options{};
+ auto func =
+ std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
+ &ascii_split_whitespace_doc, &default_options);
+ using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
+ using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+#ifdef ARROW_WITH_UTF8PROC
+struct SplitWhitespaceUtf8Finder : public SplitFinderBase<SplitOptions> {
+ using Options = SplitOptions;
+
+ Status PreExec(const SplitOptions& options) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
+ const uint8_t** separator_end, const SplitOptions& options) {
+ const uint8_t* i = begin;
+ while ((i < end)) {
+ uint32_t codepoint = 0;
+ *separator_begin = i;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ if (IsSpaceCharacterUnicode(codepoint)) {
+ do {
+ *separator_end = i;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ } while (IsSpaceCharacterUnicode(codepoint) && i < end);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = end - 1;
+ while ((i >= begin)) {
+ uint32_t codepoint = 0;
+ *separator_end = i + 1;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ if (IsSpaceCharacterUnicode(codepoint)) {
+ do {
+ *separator_begin = i + 1;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ } while (IsSpaceCharacterUnicode(codepoint) && i >= begin);
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitWhitespaceUtf8Exec = SplitExec<Type, ListType, SplitWhitespaceUtf8Finder>;
+
+void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
+ static const SplitOptions default_options{};
+ auto func =
+ std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
+ &utf8_split_whitespace_doc, &default_options);
+ using t32 = SplitWhitespaceUtf8Exec<StringType, ListType>;
+ using t64 = SplitWhitespaceUtf8Exec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+#endif // ARROW_WITH_UTF8PROC
+
+#ifdef ARROW_WITH_RE2
+struct SplitRegexFinder : public SplitFinderBase<SplitPatternOptions> {
+ using Options = SplitPatternOptions;
+
+ util::optional<RE2> regex_split;
+
+ Status PreExec(const SplitPatternOptions& options) override {
+ if (options.reverse) {
+ return Status::NotImplemented("Cannot split in reverse with regex");
+ }
+ // RE2 does *not* give you the full match! Must wrap the regex in a capture group
+ // There is FindAndConsume, but it would give only the end of the separator
+ std::string pattern = "(";
+ pattern.reserve(options.pattern.size() + 2);
+ pattern += options.pattern;
+ pattern += ')';
+ regex_split.emplace(std::move(pattern));
+ return RegexStatus(*regex_split);
+ }
+
+ bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
+ const uint8_t** separator_end, const SplitPatternOptions& options) {
+ re2::StringPiece piece(reinterpret_cast<const char*>(begin),
+ std::distance(begin, end));
+ // "StringPiece is mutated to point to matched piece"
+ re2::StringPiece result;
+ if (!re2::RE2::PartialMatch(piece, *regex_split, &result)) {
+ return false;
+ }
+ *separator_begin = reinterpret_cast<const uint8_t*>(result.data());
+ *separator_end = reinterpret_cast<const uint8_t*>(result.data() + result.size());
+ return true;
+ }
+
+ bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ // Unsupported (see PreExec)
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitRegexExec = SplitExec<Type, ListType, SplitRegexFinder>;
+
+const FunctionDoc split_pattern_regex_doc(
+ "Split string according to regex pattern",
+ ("Split each string according to the regex `pattern` defined in\n"
+ "SplitPatternOptions. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitPatternOptions."),
+ {"strings"}, "SplitPatternOptions");
+
+void AddSplitRegex(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("split_pattern_regex", Arity::Unary(),
+ &split_pattern_regex_doc);
+ using t32 = SplitRegexExec<StringType, ListType>;
+ using t64 = SplitRegexExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+#endif // ARROW_WITH_RE2
+
+void AddSplit(FunctionRegistry* registry) {
+ AddSplitPattern(registry);
+ AddSplitWhitespaceAscii(registry);
+#ifdef ARROW_WITH_UTF8PROC
+ AddSplitWhitespaceUTF8(registry);
+#endif
+#ifdef ARROW_WITH_RE2
+ AddSplitRegex(registry);
+#endif
+}
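
To make the registrations above concrete, a minimal sketch using the standard Arrow
C++ compute API (values illustrative); each input string becomes one list<utf8> row:

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> SplitExample(
        const std::shared_ptr<arrow::Array>& strings) {
      // split on "," at most once, scanning from the right: "a,b,c" -> ["a,b", "c"]
      arrow::compute::SplitPatternOptions options(/*pattern=*/",", /*max_splits=*/1,
                                                  /*reverse=*/true);
      return arrow::compute::CallFunction("split_pattern", {strings}, &options);
    }
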
+
// ----------------------------------------------------------------------
+// Replace substring (plain, regex)
+
+template <typename Type, typename Replacer>
+struct ReplaceSubString {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using offset_type = typename Type::offset_type;
+ using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
+ using OffsetBuilder = TypedBufferBuilder<offset_type>;
+ using State = OptionsWrapper<ReplaceSubstringOptions>;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO Cache replacer across invocations (for regex compilation)
+ ARROW_ASSIGN_OR_RAISE(auto replacer, Replacer::Make(State::Get(ctx)));
+ return Replace(ctx, batch, *replacer, out);
+ }
+
+ static Status Replace(KernelContext* ctx, const ExecBatch& batch,
+ const Replacer& replacer, Datum* out) {
+ ValueDataBuilder value_data_builder(ctx->memory_pool());
+ OffsetBuilder offset_builder(ctx->memory_pool());
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ // We already know how many strings we have, so we can use Reserve/UnsafeAppend
+ RETURN_NOT_OK(offset_builder.Reserve(batch[0].array()->length + 1));
+ offset_builder.UnsafeAppend(0); // offsets start at 0
+
+ const ArrayData& input = *batch[0].array();
+ RETURN_NOT_OK(VisitArrayDataInline<Type>(
+ input,
+ [&](util::string_view s) {
+ RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
+ offset_builder.UnsafeAppend(
+ static_cast<offset_type>(value_data_builder.length()));
+ return Status::OK();
+ },
+ [&]() {
+ // offset for null value
+ offset_builder.UnsafeAppend(
+ static_cast<offset_type>(value_data_builder.length()));
+ return Status::OK();
+ }));
+ ArrayData* output = out->mutable_array();
+ RETURN_NOT_OK(value_data_builder.Finish(&output->buffers[2]));
+ RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1]));
+ } else {
+ const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto result = std::make_shared<ScalarType>();
+ if (input.is_valid) {
+ util::string_view s = static_cast<util::string_view>(*input.value);
+ RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
+ RETURN_NOT_OK(value_data_builder.Finish(&result->value));
+ result->is_valid = true;
+ }
+ out->value = result;
+ }
+
+ return Status::OK();
+ }
+};
+
+struct PlainSubStringReplacer {
+ const ReplaceSubstringOptions& options_;
+
+ static Result<std::unique_ptr<PlainSubStringReplacer>> Make(
+ const ReplaceSubstringOptions& options) {
+ return arrow::internal::make_unique<PlainSubStringReplacer>(options);
+ }
+
+ explicit PlainSubStringReplacer(const ReplaceSubstringOptions& options)
+ : options_(options) {}
+
+ Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
+ const char* i = s.begin();
+ const char* end = s.end();
+ int64_t max_replacements = options_.max_replacements;
+ while ((i < end) && (max_replacements != 0)) {
+ const char* pos =
+ std::search(i, end, options_.pattern.begin(), options_.pattern.end());
+ if (pos == end) {
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i)));
+ i = end;
+ } else {
+ // the string before the pattern
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(pos - i)));
+ // the replacement
+ RETURN_NOT_OK(
+ builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
+ options_.replacement.length()));
+ // skip pattern
+ i = pos + options_.pattern.length();
+ max_replacements--;
+ }
+ }
+ // if we exited early due to max_replacements, add the trailing part
+ return builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i));
+ }
+};
+
+#ifdef ARROW_WITH_RE2
+struct RegexSubStringReplacer {
+ const ReplaceSubstringOptions& options_;
+ const RE2 regex_find_;
+ const RE2 regex_replacement_;
+
+ static Result<std::unique_ptr<RegexSubStringReplacer>> Make(
+ const ReplaceSubstringOptions& options) {
+ auto replacer = arrow::internal::make_unique<RegexSubStringReplacer>(options);
+
+ RETURN_NOT_OK(RegexStatus(replacer->regex_find_));
+ RETURN_NOT_OK(RegexStatus(replacer->regex_replacement_));
+
+ std::string replacement_error;
+ if (!replacer->regex_replacement_.CheckRewriteString(replacer->options_.replacement,
+ &replacement_error)) {
+ return Status::Invalid("Invalid replacement string: ",
+ std::move(replacement_error));
+ }
+
+ return std::move(replacer);
+ }
+
+ // RE2::FindAndConsume can only locate the pattern if it is wrapped in a capture
+ // group, so we keep two regexes: one with () around the pattern, one without.
+ explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
+ : options_(options),
+ regex_find_("(" + options_.pattern + ")", RE2::Quiet),
+ regex_replacement_(options_.pattern, RE2::Quiet) {}
+
+ Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
+ re2::StringPiece replacement(options_.replacement);
+
+ if (options_.max_replacements == -1) {
+ std::string s_copy(s.to_string());
+ re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
+ return builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
+ s_copy.length());
+ }
+
+ // RE2 has no built-in notion of max_replacements, so we do the bookkeeping
+ // ourselves. This could be made faster, similar to RE2::GlobalReplace, by using
+ // Match and Rewrite directly.
+ const char* i = s.begin();
+ const char* end = s.end();
+ re2::StringPiece piece(s.data(), s.length());
+
+ int64_t max_replacements = options_.max_replacements;
+ while ((i < end) && (max_replacements != 0)) {
+ std::string found;
+ if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i)));
+ i = end;
+ } else {
+ // wind back to the beginning of the match
+ const char* pos = piece.begin() - found.length();
+ // the string before the pattern
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(pos - i)));
+ // replace the pattern in what we found
+ if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
+ return Status::Invalid("Regex found, but replacement failed");
+ }
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
+ static_cast<int64_t>(found.length())));
+ // skip pattern
+ i = piece.begin();
+ max_replacements--;
+ }
+ }
+ // If we exited early due to max_replacements, add the trailing part
+ return builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i));
+ }
+};
+#endif
+
+template <typename Type>
+using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
+
+const FunctionDoc replace_substring_doc(
+ "Replace non-overlapping substrings that match pattern by replacement",
+ ("For each string in `strings`, replace non-overlapping substrings that match\n"
+ "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
+ "maximum number of replacements made, counting from the left. Null values emit\n"
+ "null."),
+ {"strings"}, "ReplaceSubstringOptions");
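+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum):
+//
+//   arrow::compute::ReplaceSubstringOptions options("aa", "b", /*max_replacements=*/1);
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("replace_substring", {strings}, &options));
+//   // "aaaa" -> "baa": matches are non-overlapping and counted from the left.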
+
+#ifdef ARROW_WITH_RE2
+template <typename Type>
+using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
+
+const FunctionDoc replace_substring_regex_doc(
+ "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
+ ("For each string in `strings`, replace non-overlapping substrings that match the\n"
+ "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
+ "If `max_replacements != -1`, it determines the maximum number of replacements\n"
+ "made, counting from the left. Note that if the pattern contains groups,\n"
+ "backreferencing can be used. Null values emit null."),
+ {"strings"}, "ReplaceSubstringOptions");
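+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum).
+// With capture groups, RE2 rewrite backreferences (\1, \2, ...) can be used in
+// `replacement`:
+//
+//   arrow::compute::ReplaceSubstringOptions options("(\\w+) (\\w+)", "\\2 \\1");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("replace_substring_regex", {strings}, &options));
+//   // "hello world" -> "world hello"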
+#endif
+
+// ----------------------------------------------------------------------
+// Replace slice
+
+struct ReplaceSliceTransformBase : public StringTransformBase {
+ using State = OptionsWrapper<ReplaceSliceOptions>;
+
+ const ReplaceSliceOptions* options;
+
+ explicit ReplaceSliceTransformBase(const ReplaceSliceOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ return ninputs * options->replacement.size() + input_ncodeunits;
+ }
+};
+
+struct BinaryReplaceSliceTransform : ReplaceSliceTransformBase {
+ using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const auto& opts = *options;
+ int64_t before_slice = 0;
+ int64_t after_slice = 0;
+ uint8_t* output_start = output;
+
+ if (opts.start >= 0) {
+ // Count from left
+ before_slice = std::min<int64_t>(input_string_ncodeunits, opts.start);
+ } else {
+ // Count from right
+ before_slice = std::max<int64_t>(0, input_string_ncodeunits + opts.start);
+ }
+ // Mimic Pandas: if stop would be before start, treat as 0-length slice
+ if (opts.stop >= 0) {
+ // Count from left
+ after_slice =
+ std::min<int64_t>(input_string_ncodeunits, std::max(before_slice, opts.stop));
+ } else {
+ // Count from right
+ after_slice = std::max<int64_t>(before_slice, input_string_ncodeunits + opts.stop);
+ }
+ output = std::copy(input, input + before_slice, output);
+ output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
+ output = std::copy(input + after_slice, input + input_string_ncodeunits, output);
+ return output - output_start;
+ }
+};
+
+struct Utf8ReplaceSliceTransform : ReplaceSliceTransformBase {
+ using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const auto& opts = *options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t *begin_sliced, *end_sliced;
+ uint8_t* output_start = output;
+
+ // Mimic Pandas: if stop would be before start, treat as 0-length slice
+ if (opts.start >= 0) {
+ // Count from left
+ if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opts.start)) {
+ return kTransformError;
+ }
+ if (opts.stop > opts.start) {
+ // Continue counting from left
+ const int64_t length = opts.stop - opts.start;
+ if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)) {
+ return kTransformError;
+ }
+ } else if (opts.stop < 0) {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
+ -opts.stop)) {
+ return kTransformError;
+ }
+ } else {
+ // Zero-length slice
+ end_sliced = begin_sliced;
+ }
+ } else {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
+ -opts.start)) {
+ return kTransformError;
+ }
+ if (opts.stop >= 0) {
+ // Restart counting from left
+ if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opts.stop)) {
+ return kTransformError;
+ }
+ if (end_sliced <= begin_sliced) {
+ // Zero-length slice
+ end_sliced = begin_sliced;
+ }
+ } else if ((opts.stop < 0) && (opts.stop > opts.start)) {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
+ -opts.stop)) {
+ return kTransformError;
+ }
+ } else {
+ // zero-length slice
+ end_sliced = begin_sliced;
+ }
+ }
+ output = std::copy(begin, begin_sliced, output);
+ output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
+ output = std::copy(end_sliced, end, output);
+ return output - output_start;
+ }
+};
+
+template <typename Type>
+using BinaryReplaceSlice =
+ StringTransformExecWithState<Type, BinaryReplaceSliceTransform>;
+template <typename Type>
+using Utf8ReplaceSlice = StringTransformExecWithState<Type, Utf8ReplaceSliceTransform>;
+
+const FunctionDoc binary_replace_slice_doc(
+ "Replace a slice of a binary string with `replacement`",
+ ("For each string in `strings`, replace a slice of the string defined by `start`\n"
+ "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive,\n"
+ "and both are measured in bytes.\n"
+ "Null values emit null."),
+ {"strings"}, "ReplaceSliceOptions");
+
+const FunctionDoc utf8_replace_slice_doc(
+ "Replace a slice of a string with `replacement`",
+ ("For each string in `strings`, replace a slice of the string defined by `start`\n"
+ "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive,\n"
+ "and both are measured in codepoints.\n"
+ "Null values emit null."),
+ {"strings"}, "ReplaceSliceOptions");
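+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum),
+// showing a negative `start`/`stop` counted from the right:
+//
+//   arrow::compute::ReplaceSliceOptions options(/*start=*/-3, /*stop=*/-1, "..");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("utf8_replace_slice", {strings}, &options));
+//   // "abcdef" -> "abc..f" (codepoints 3 and 4 are replaced)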
+
+void AddReplaceSlice(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("binary_replace_slice", Arity::Unary(),
+ &binary_replace_slice_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ DCHECK_OK(func->AddKernel({ty}, ty,
+ GenerateTypeAgnosticVarBinaryBase<BinaryReplaceSlice>(ty),
+ ReplaceSliceTransformBase::State::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ auto func = std::make_shared<ScalarFunction>("utf8_replace_slice", Arity::Unary(),
+ &utf8_replace_slice_doc);
+ DCHECK_OK(func->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice<StringType>::Exec,
+ ReplaceSliceTransformBase::State::Init));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(),
+ Utf8ReplaceSlice<LargeStringType>::Exec,
+ ReplaceSliceTransformBase::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
+// ----------------------------------------------------------------------
+// Extract with regex
+
+#ifdef ARROW_WITH_RE2
+
+// TODO cache this once per ExtractRegexOptions
+struct ExtractRegexData {
+ // Use unique_ptr<> because RE2 is non-movable
+ std::unique_ptr<RE2> regex;
+ std::vector<std::string> group_names;
+
+ static Result<ExtractRegexData> Make(const ExtractRegexOptions& options) {
+ ExtractRegexData data(options.pattern);
+ RETURN_NOT_OK(RegexStatus(*data.regex));
+
+ const int group_count = data.regex->NumberOfCapturingGroups();
+ const auto& name_map = data.regex->CapturingGroupNames();
+ data.group_names.reserve(group_count);
+
+ for (int i = 0; i < group_count; i++) {
+ auto item = name_map.find(i + 1); // re2 starts counting from 1
+ if (item == name_map.end()) {
+ // XXX should we instead just create fields with an empty name?
+ return Status::Invalid("Regular expression contains unnamed groups");
+ }
+ data.group_names.emplace_back(item->second);
+ }
+ return std::move(data);
+ }
+
+ Result<ValueDescr> ResolveOutputType(const std::vector<ValueDescr>& args) const {
+ const auto& input_type = args[0].type;
+ if (input_type == nullptr) {
+ // No input type specified => propagate shape
+ return args[0];
+ }
+ // Input type is either String or LargeString and is also the type of each
+ // field in the output struct type.
+ DCHECK(input_type->id() == Type::STRING || input_type->id() == Type::LARGE_STRING);
+ FieldVector fields;
+ fields.reserve(group_names.size());
+ std::transform(group_names.begin(), group_names.end(), std::back_inserter(fields),
+ [&](const std::string& name) { return field(name, input_type); });
+ return struct_(std::move(fields));
+ }
+
+ private:
+ explicit ExtractRegexData(const std::string& pattern)
+ : regex(new RE2(pattern, RE2::Quiet)) {}
+};
+
+Result<ValueDescr> ResolveExtractRegexOutput(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) {
+ using State = OptionsWrapper<ExtractRegexOptions>;
+ ExtractRegexOptions options = State::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
+ return data.ResolveOutputType(args);
+}
+
+struct ExtractRegexBase {
+ const ExtractRegexData& data;
+ const int group_count;
+ std::vector<re2::StringPiece> found_values;
+ std::vector<re2::RE2::Arg> args;
+ std::vector<const re2::RE2::Arg*> args_pointers;
+ const re2::RE2::Arg** args_pointers_start;
+ const re2::RE2::Arg* null_arg = nullptr;
+
+ explicit ExtractRegexBase(const ExtractRegexData& data)
+ : data(data),
+ group_count(static_cast<int>(data.group_names.size())),
+ found_values(group_count) {
+ args.reserve(group_count);
+ args_pointers.reserve(group_count);
+
+ for (int i = 0; i < group_count; i++) {
+ args.emplace_back(&found_values[i]);
+ // Since we reserved capacity, we're guaranteed the pointer remains valid
+ args_pointers.push_back(&args[i]);
+ }
+ // Avoid null pointer if there is no capture group
+ args_pointers_start = (group_count > 0) ? args_pointers.data() : &null_arg;
+ }
+
+ bool Match(util::string_view s) {
+ return re2::RE2::PartialMatchN(ToStringPiece(s), *data.regex, args_pointers_start,
+ group_count);
+ }
+};
+
+template <typename Type>
+struct ExtractRegex : public ExtractRegexBase {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using State = OptionsWrapper<ExtractRegexOptions>;
+
+ using ExtractRegexBase::ExtractRegexBase;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ExtractRegexOptions options = State::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
+ return ExtractRegex{data}.Extract(ctx, batch, out);
+ }
+
+ Status Extract(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto descr, data.ResolveOutputType(batch.GetDescriptors()));
+ DCHECK_NE(descr.type, nullptr);
+ const auto& type = descr.type;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ std::unique_ptr<ArrayBuilder> array_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type, &array_builder));
+ StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
+
+ std::vector<BuilderType*> field_builders;
+ field_builders.reserve(group_count);
+ for (int i = 0; i < group_count; i++) {
+ field_builders.push_back(
+ checked_cast<BuilderType*>(struct_builder->field_builder(i)));
+ }
+
+ auto visit_null = [&]() { return struct_builder->AppendNull(); };
+ auto visit_value = [&](util::string_view s) {
+ if (Match(s)) {
+ for (int i = 0; i < group_count; i++) {
+ RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
+ }
+ return struct_builder->Append();
+ } else {
+ return struct_builder->AppendNull();
+ }
+ };
+ const ArrayData& input = *batch[0].array();
+ RETURN_NOT_OK(VisitArrayDataInline<Type>(input, visit_value, visit_null));
+
+ std::shared_ptr<Array> out_array;
+ RETURN_NOT_OK(struct_builder->Finish(&out_array));
+ *out = std::move(out_array);
+ } else {
+ const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto result = std::make_shared<StructScalar>(type);
+ if (input.is_valid && Match(util::string_view(*input.value))) {
+ result->value.reserve(group_count);
+ for (int i = 0; i < group_count; i++) {
+ result->value.push_back(
+ std::make_shared<ScalarType>(found_values[i].as_string()));
+ }
+ result->is_valid = true;
+ } else {
+ result->is_valid = false;
+ }
+ out->value = std::move(result);
+ }
+
+ return Status::OK();
+ }
+};
+
+const FunctionDoc extract_regex_doc(
+ "Extract substrings captured by a regex pattern",
+ ("For each string in `strings`, match the regular expression and, if\n"
+ "successful, emit a struct with field names and values coming from the\n"
+ "regular expression's named capture groups. If the input is null or the\n"
+ "regular expression fails matching, a null output value is emitted.\n"
+ "\n"
+ "Regular expression matching is done using the Google RE2 library."),
+ {"strings"}, "ExtractRegexOptions");
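+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum).
+// All capture groups must be named; the output is a struct with one field per
+// group:
+//
+//   arrow::compute::ExtractRegexOptions options("(?P<letter>[ab])(?P<digit>\\d)");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("extract_regex", {strings}, &options));
+//   // "a1" -> {letter: "a", digit: "1"}; "c9" -> null (no match)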
+
+void AddExtractRegex(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
+ &extract_regex_doc);
+ using t32 = ExtractRegex<StringType>;
+ using t64 = ExtractRegex<LargeStringType>;
+ OutputType out_ty(ResolveExtractRegexOutput);
+ ScalarKernel kernel;
+
+ // Output validity is computed from whether the regex matched
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
+ kernel.exec = t32::Exec;
+ kernel.init = t32::State::Init;
+ DCHECK_OK(func->AddKernel(kernel));
+ kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
+ kernel.exec = t64::Exec;
+ kernel.init = t64::State::Init;
+ DCHECK_OK(func->AddKernel(kernel));
+
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+#endif // ARROW_WITH_RE2
+
+// ----------------------------------------------------------------------
// strptime string parsing
using StrptimeState = OptionsWrapper<StrptimeOptions>;
@@ -2782,11 +2782,11 @@ struct ParseStrptime {
: parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {}
template <typename... Ignored>
- int64_t Call(KernelContext*, util::string_view val, Status* st) const {
+ int64_t Call(KernelContext*, util::string_view val, Status* st) const {
int64_t result = 0;
if (!(*parser)(val.data(), val.size(), unit, &result)) {
- *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
- TimestampType(unit).ToString());
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ TimestampType(unit).ToString());
}
return result;
}
@@ -2796,7 +2796,7 @@ struct ParseStrptime {
};
template <typename InputType>
-Status StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
applicator::ScalarUnaryNotNullStateful<TimestampType, InputType, ParseStrptime> kernel{
ParseStrptime(StrptimeState::Get(ctx))};
return kernel.Exec(ctx, batch, out);
@@ -2810,471 +2810,471 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDe
return Status::Invalid("strptime does not provide default StrptimeOptions");
}
-// ----------------------------------------------------------------------
-// string padding
-
-template <bool PadLeft, bool PadRight>
-struct AsciiPadTransform : public StringTransformBase {
- using State = OptionsWrapper<PadOptions>;
-
- const PadOptions& options_;
-
- explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- if (options_.padding.size() != 1) {
- return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
- }
- return Status::OK();
- }
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- // This is likely very overallocated but hard to do better without
- // actually looking at each string (because of strings that may be
- // longer than the given width)
- return input_ncodeunits + ninputs * options_.width;
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- if (input_string_ncodeunits >= options_.width) {
- std::copy(input, input + input_string_ncodeunits, output);
- return input_string_ncodeunits;
- }
- const int64_t spaces = options_.width - input_string_ncodeunits;
- int64_t left = 0;
- int64_t right = 0;
- if (PadLeft && PadRight) {
- // If odd number of spaces, put the extra space on the right
- left = spaces / 2;
- right = spaces - left;
- } else if (PadLeft) {
- left = spaces;
- } else if (PadRight) {
- right = spaces;
- } else {
- DCHECK(false) << "unreachable";
- return 0;
- }
- std::fill(output, output + left, options_.padding[0]);
- output += left;
- output = std::copy(input, input + input_string_ncodeunits, output);
- std::fill(output, output + right, options_.padding[0]);
- return options_.width;
- }
-};
-
-template <bool PadLeft, bool PadRight>
-struct Utf8PadTransform : public StringTransformBase {
- using State = OptionsWrapper<PadOptions>;
-
- const PadOptions& options_;
-
- explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
- auto strlen = options_.padding.size();
- if (util::UTF8Length(str, str + strlen) != 1) {
- return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
- "'");
- }
- return Status::OK();
- }
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- // This is likely very overallocated but hard to do better without
- // actually looking at each string (because of strings that may be
- // longer than the given width)
- // One codepoint may be up to 4 bytes
- return input_ncodeunits + 4 * ninputs * options_.width;
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
- if (input_width >= options_.width) {
- std::copy(input, input + input_string_ncodeunits, output);
- return input_string_ncodeunits;
- }
- const int64_t spaces = options_.width - input_width;
- int64_t left = 0;
- int64_t right = 0;
- if (PadLeft && PadRight) {
- // If odd number of spaces, put the extra space on the right
- left = spaces / 2;
- right = spaces - left;
- } else if (PadLeft) {
- left = spaces;
- } else if (PadRight) {
- right = spaces;
- } else {
- DCHECK(false) << "unreachable";
- return 0;
- }
- uint8_t* start = output;
- while (left) {
- output = std::copy(options_.padding.begin(), options_.padding.end(), output);
- left--;
- }
- output = std::copy(input, input + input_string_ncodeunits, output);
- while (right) {
- output = std::copy(options_.padding.begin(), options_.padding.end(), output);
- right--;
- }
- return output - start;
- }
-};
-
-template <typename Type>
-using AsciiLPad = StringTransformExecWithState<Type, AsciiPadTransform<true, false>>;
-template <typename Type>
-using AsciiRPad = StringTransformExecWithState<Type, AsciiPadTransform<false, true>>;
-template <typename Type>
-using AsciiCenter = StringTransformExecWithState<Type, AsciiPadTransform<true, true>>;
-template <typename Type>
-using Utf8LPad = StringTransformExecWithState<Type, Utf8PadTransform<true, false>>;
-template <typename Type>
-using Utf8RPad = StringTransformExecWithState<Type, Utf8PadTransform<false, true>>;
-template <typename Type>
-using Utf8Center = StringTransformExecWithState<Type, Utf8PadTransform<true, true>>;
-
-// ----------------------------------------------------------------------
-// string trimming
-
-#ifdef ARROW_WITH_UTF8PROC
-
-template <bool TrimLeft, bool TrimRight>
-struct UTF8TrimWhitespaceTransform : public StringTransformBase {
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- EnsureLookupTablesFilled();
- return Status::OK();
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
- if (TrimLeft && !ARROW_PREDICT_TRUE(
- arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
- return kTransformError;
- }
- if (TrimRight && begin_trimmed < end) {
- if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
- predicate, &end_trimmed))) {
- return kTransformError;
- }
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using UTF8TrimWhitespace =
- StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, true>>;
-
-template <typename Type>
-using UTF8LTrimWhitespace =
- StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, false>>;
-
-template <typename Type>
-using UTF8RTrimWhitespace =
- StringTransformExec<Type, UTF8TrimWhitespaceTransform<false, true>>;
-
-struct UTF8TrimState {
- TrimOptions options_;
- std::vector<bool> codepoints_;
- Status status_ = Status::OK();
-
- explicit UTF8TrimState(KernelContext* ctx, TrimOptions options)
- : options_(std::move(options)) {
- if (!ARROW_PREDICT_TRUE(
- arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
- codepoints_.resize(
- std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
- codepoints_.at(c) = true;
- }))) {
- status_ = Status::Invalid("Invalid UTF8 sequence in input");
- }
- }
-};
-
-template <bool TrimLeft, bool TrimRight>
-struct UTF8TrimTransform : public StringTransformBase {
- using State = KernelStateFromFunctionOptions<UTF8TrimState, TrimOptions>;
-
- const UTF8TrimState& state_;
-
- explicit UTF8TrimTransform(const UTF8TrimState& state) : state_(state) {}
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- return state_.status_;
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [&](uint32_t c) { return !state_.codepoints_[c]; };
- if (TrimLeft && !ARROW_PREDICT_TRUE(
- arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
- return kTransformError;
- }
- if (TrimRight && begin_trimmed < end) {
- if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
- predicate, &end_trimmed))) {
- return kTransformError;
- }
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using UTF8Trim = StringTransformExecWithState<Type, UTF8TrimTransform<true, true>>;
-
-template <typename Type>
-using UTF8LTrim = StringTransformExecWithState<Type, UTF8TrimTransform<true, false>>;
-
-template <typename Type>
-using UTF8RTrim = StringTransformExecWithState<Type, UTF8TrimTransform<false, true>>;
-
-#endif
-
-template <bool TrimLeft, bool TrimRight>
-struct AsciiTrimWhitespaceTransform : public StringTransformBase {
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
- if (TrimLeft) {
- begin_trimmed = std::find_if(begin, end, predicate);
- }
- if (TrimRight && begin_trimmed < end) {
- std::reverse_iterator<const uint8_t*> rbegin(end);
- std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
- end_trimmed = std::find_if(rbegin, rend, predicate).base();
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using AsciiTrimWhitespace =
- StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, true>>;
-
-template <typename Type>
-using AsciiLTrimWhitespace =
- StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, false>>;
-
-template <typename Type>
-using AsciiRTrimWhitespace =
- StringTransformExec<Type, AsciiTrimWhitespaceTransform<false, true>>;
-
-struct AsciiTrimState {
- TrimOptions options_;
- std::vector<bool> characters_;
-
- explicit AsciiTrimState(KernelContext* ctx, TrimOptions options)
- : options_(std::move(options)), characters_(256) {
- for (const auto c : options_.characters) {
- characters_[static_cast<unsigned char>(c)] = true;
- }
- }
-};
-
-template <bool TrimLeft, bool TrimRight>
-struct AsciiTrimTransform : public StringTransformBase {
- using State = KernelStateFromFunctionOptions<AsciiTrimState, TrimOptions>;
-
- const AsciiTrimState& state_;
-
- explicit AsciiTrimTransform(const AsciiTrimState& state) : state_(state) {}
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [&](uint8_t c) { return !state_.characters_[c]; };
- if (TrimLeft) {
- begin_trimmed = std::find_if(begin, end, predicate);
- }
- if (TrimRight && begin_trimmed < end) {
- std::reverse_iterator<const uint8_t*> rbegin(end);
- std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
- end_trimmed = std::find_if(rbegin, rend, predicate).base();
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using AsciiTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, true>>;
-
-template <typename Type>
-using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, false>>;
-
-template <typename Type>
-using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;
-
-const FunctionDoc utf8_center_doc(
- "Center strings by padding with a given character",
- ("For each string in `strings`, emit a centered string by padding both sides\n"
- "with the given UTF8 codepoint.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc utf8_lpad_doc(
- "Right-align strings by padding with a given character",
- ("For each string in `strings`, emit a right-aligned string by prepending\n"
- "the given UTF8 codepoint.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc utf8_rpad_doc(
- "Left-align strings by padding with a given character",
- ("For each string in `strings`, emit a left-aligned string by appending\n"
- "the given UTF8 codepoint.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc ascii_center_doc(
- utf8_center_doc.summary + "",
- ("For each string in `strings`, emit a centered string by padding both sides\n"
- "with the given ASCII character.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc ascii_lpad_doc(
- utf8_lpad_doc.summary + "",
- ("For each string in `strings`, emit a right-aligned string by prepending\n"
- "the given ASCII character.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc ascii_rpad_doc(
- utf8_rpad_doc.summary + "",
- ("For each string in `strings`, emit a left-aligned string by appending\n"
- "the given ASCII character.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc utf8_trim_whitespace_doc(
- "Trim leading and trailing whitespace characters",
- ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
- "characters removed, where whitespace characters are defined by the Unicode\n"
- "standard. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_ltrim_whitespace_doc(
- "Trim leading whitespace characters",
- ("For each string in `strings`, emit a string with leading whitespace\n"
- "characters removed, where whitespace characters are defined by the Unicode\n"
- "standard. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_rtrim_whitespace_doc(
- "Trim trailing whitespace characters",
- ("For each string in `strings`, emit a string with trailing whitespace\n"
- "characters removed, where whitespace characters are defined by the Unicode\n"
- "standard. Null values emit null."),
- {"strings"});
-
-const FunctionDoc ascii_trim_whitespace_doc(
- "Trim leading and trailing ASCII whitespace characters",
- ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
- "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
- "whitespace characters. Null values emit null."),
- {"strings"});
-
-const FunctionDoc ascii_ltrim_whitespace_doc(
- "Trim leading ASCII whitespace characters",
- ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
- "characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
- "whitespace characters. Null values emit null."),
- {"strings"});
-
-const FunctionDoc ascii_rtrim_whitespace_doc(
- "Trim trailing ASCII whitespace characters",
- ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
- "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
- "whitespace characters. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_trim_doc(
- "Trim leading and trailing characters present in the `characters` argument",
- ("For each string in `strings`, emit a string with leading and trailing\n"
- "characters removed that are present in the `characters` argument. Null values\n"
- "emit null."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc utf8_ltrim_doc(
- "Trim leading characters present in the `characters` argument",
- ("For each string in `strings`, emit a string with leading\n"
- "characters removed that are present in the `characters` argument. Null values\n"
- "emit null."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc utf8_rtrim_doc(
- "Trim trailing characters present in the `characters` argument",
- ("For each string in `strings`, emit a string with trailing\n"
- "characters removed that are present in the `characters` argument. Null values\n"
- "emit null."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc ascii_trim_doc(
- utf8_trim_doc.summary + "",
- utf8_trim_doc.description +
- ("\nBoth the input string and the `characters` argument are interpreted as\n"
- "ASCII characters; to trim non-ASCII characters, use `utf8_trim`."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc ascii_ltrim_doc(
- utf8_ltrim_doc.summary + "",
- utf8_ltrim_doc.description +
- ("\nBoth the input string and the `characters` argument are interpreted as\n"
- "ASCII characters; to trim non-ASCII characters, use `utf8_ltrim`."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc ascii_rtrim_doc(
- utf8_rtrim_doc.summary + "",
- utf8_rtrim_doc.description +
- ("\nBoth the input string and the `characters` argument are interpreted as\n"
- "ASCII characters; to trim non-ASCII characters, use `utf8_rtrim`."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc strptime_doc(
- "Parse timestamps",
- ("For each string in `strings`, parse it as a timestamp.\n"
- "The timestamp unit and the expected string pattern must be given\n"
- "in StrptimeOptions. Null inputs emit null. If a non-null string\n"
- "fails parsing, an error is returned."),
- {"strings"}, "StrptimeOptions");
-
-const FunctionDoc binary_length_doc(
- "Compute string lengths",
- ("For each string in `strings`, emit the number of bytes. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
- ("For each string in `strings`, emit the number of "
- "UTF8 characters. Null values emit null."),
- {"strings"});
-
+// ----------------------------------------------------------------------
+// string padding
+
+template <bool PadLeft, bool PadRight>
+struct AsciiPadTransform : public StringTransformBase {
+ using State = OptionsWrapper<PadOptions>;
+
+ const PadOptions& options_;
+
+ explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ if (options_.padding.size() != 1) {
+ return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
+ }
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ // This is likely very overallocated but hard to do better without
+ // actually looking at each string (because of strings that may be
+ // longer than the given width)
+ return input_ncodeunits + ninputs * options_.width;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ if (input_string_ncodeunits >= options_.width) {
+ std::copy(input, input + input_string_ncodeunits, output);
+ return input_string_ncodeunits;
+ }
+ const int64_t spaces = options_.width - input_string_ncodeunits;
+ int64_t left = 0;
+ int64_t right = 0;
+ if (PadLeft && PadRight) {
+ // If odd number of spaces, put the extra space on the right
+ left = spaces / 2;
+ right = spaces - left;
+ } else if (PadLeft) {
+ left = spaces;
+ } else if (PadRight) {
+ right = spaces;
+ } else {
+ DCHECK(false) << "unreachable";
+ return 0;
+ }
+ std::fill(output, output + left, options_.padding[0]);
+ output += left;
+ output = std::copy(input, input + input_string_ncodeunits, output);
+ std::fill(output, output + right, options_.padding[0]);
+ return options_.width;
+ }
+};
+
+template <bool PadLeft, bool PadRight>
+struct Utf8PadTransform : public StringTransformBase {
+ using State = OptionsWrapper<PadOptions>;
+
+ const PadOptions& options_;
+
+ explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
+ auto strlen = options_.padding.size();
+ if (util::UTF8Length(str, str + strlen) != 1) {
+ return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
+ "'");
+ }
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ // This is likely very overallocated but hard to do better without
+ // actually looking at each string (because of strings that may be
+ // longer than the given width)
+ // One codepoint may be up to 4 bytes
+ return input_ncodeunits + 4 * ninputs * options_.width;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
+ if (input_width >= options_.width) {
+ std::copy(input, input + input_string_ncodeunits, output);
+ return input_string_ncodeunits;
+ }
+ const int64_t spaces = options_.width - input_width;
+ int64_t left = 0;
+ int64_t right = 0;
+ if (PadLeft && PadRight) {
+ // If odd number of spaces, put the extra space on the right
+ left = spaces / 2;
+ right = spaces - left;
+ } else if (PadLeft) {
+ left = spaces;
+ } else if (PadRight) {
+ right = spaces;
+ } else {
+ DCHECK(false) << "unreachable";
+ return 0;
+ }
+ uint8_t* start = output;
+ while (left) {
+ output = std::copy(options_.padding.begin(), options_.padding.end(), output);
+ left--;
+ }
+ output = std::copy(input, input + input_string_ncodeunits, output);
+ while (right) {
+ output = std::copy(options_.padding.begin(), options_.padding.end(), output);
+ right--;
+ }
+ return output - start;
+ }
+};
+
+template <typename Type>
+using AsciiLPad = StringTransformExecWithState<Type, AsciiPadTransform<true, false>>;
+template <typename Type>
+using AsciiRPad = StringTransformExecWithState<Type, AsciiPadTransform<false, true>>;
+template <typename Type>
+using AsciiCenter = StringTransformExecWithState<Type, AsciiPadTransform<true, true>>;
+template <typename Type>
+using Utf8LPad = StringTransformExecWithState<Type, Utf8PadTransform<true, false>>;
+template <typename Type>
+using Utf8RPad = StringTransformExecWithState<Type, Utf8PadTransform<false, true>>;
+template <typename Type>
+using Utf8Center = StringTransformExecWithState<Type, Utf8PadTransform<true, true>>;
+
+// ----------------------------------------------------------------------
+// string trimming
+
+#ifdef ARROW_WITH_UTF8PROC
+
+template <bool TrimLeft, bool TrimRight>
+struct UTF8TrimWhitespaceTransform : public StringTransformBase {
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+ if (TrimLeft && !ARROW_PREDICT_TRUE(
+ arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
+ return kTransformError;
+ }
+ if (TrimRight && begin_trimmed < end) {
+ if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
+ predicate, &end_trimmed))) {
+ return kTransformError;
+ }
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using UTF8TrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, true>>;
+
+template <typename Type>
+using UTF8LTrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, false>>;
+
+template <typename Type>
+using UTF8RTrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<false, true>>;
+
+struct UTF8TrimState {
+ TrimOptions options_;
+ std::vector<bool> codepoints_;
+ Status status_ = Status::OK();
+
+ explicit UTF8TrimState(KernelContext* ctx, TrimOptions options)
+ : options_(std::move(options)) {
+ if (!ARROW_PREDICT_TRUE(
+ arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
+ codepoints_.resize(
+ std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
+ codepoints_.at(c) = true;
+ }))) {
+ status_ = Status::Invalid("Invalid UTF8 sequence in input");
+ }
+ }
+};
+
+template <bool TrimLeft, bool TrimRight>
+struct UTF8TrimTransform : public StringTransformBase {
+ using State = KernelStateFromFunctionOptions<UTF8TrimState, TrimOptions>;
+
+ const UTF8TrimState& state_;
+
+ explicit UTF8TrimTransform(const UTF8TrimState& state) : state_(state) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ return state_.status_;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [&](uint32_t c) { return !state_.codepoints_[c]; };
+ if (TrimLeft && !ARROW_PREDICT_TRUE(
+ arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
+ return kTransformError;
+ }
+ if (TrimRight && begin_trimmed < end) {
+ if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
+ predicate, &end_trimmed))) {
+ return kTransformError;
+ }
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using UTF8Trim = StringTransformExecWithState<Type, UTF8TrimTransform<true, true>>;
+
+template <typename Type>
+using UTF8LTrim = StringTransformExecWithState<Type, UTF8TrimTransform<true, false>>;
+
+template <typename Type>
+using UTF8RTrim = StringTransformExecWithState<Type, UTF8TrimTransform<false, true>>;
+
+#endif
+
+template <bool TrimLeft, bool TrimRight>
+struct AsciiTrimWhitespaceTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+ if (TrimLeft) {
+ begin_trimmed = std::find_if(begin, end, predicate);
+ }
+ if (TrimRight && begin_trimmed < end) {
+ std::reverse_iterator<const uint8_t*> rbegin(end);
+ std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+ end_trimmed = std::find_if(rbegin, rend, predicate).base();
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using AsciiTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, true>>;
+
+template <typename Type>
+using AsciiLTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, false>>;
+
+template <typename Type>
+using AsciiRTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<false, true>>;
+
+struct AsciiTrimState {
+ TrimOptions options_;
+ std::vector<bool> characters_;
+
+ explicit AsciiTrimState(KernelContext* ctx, TrimOptions options)
+ : options_(std::move(options)), characters_(256) {
+ for (const auto c : options_.characters) {
+ characters_[static_cast<unsigned char>(c)] = true;
+ }
+ }
+};
+
+template <bool TrimLeft, bool TrimRight>
+struct AsciiTrimTransform : public StringTransformBase {
+ using State = KernelStateFromFunctionOptions<AsciiTrimState, TrimOptions>;
+
+ const AsciiTrimState& state_;
+
+ explicit AsciiTrimTransform(const AsciiTrimState& state) : state_(state) {}
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [&](uint8_t c) { return !state_.characters_[c]; };
+ if (TrimLeft) {
+ begin_trimmed = std::find_if(begin, end, predicate);
+ }
+ if (TrimRight && begin_trimmed < end) {
+ std::reverse_iterator<const uint8_t*> rbegin(end);
+ std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+ end_trimmed = std::find_if(rbegin, rend, predicate).base();
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using AsciiTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, true>>;
+
+template <typename Type>
+using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, false>>;
+
+template <typename Type>
+using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;
+
+const FunctionDoc utf8_center_doc(
+ "Center strings by padding with a given character",
+ ("For each string in `strings`, emit a centered string by padding both sides\n"
+ "with the given UTF8 codepoint.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_lpad_doc(
+ "Right-align strings by padding with a given character",
+ ("For each string in `strings`, emit a right-aligned string by prepending\n"
+ "the given UTF8 codepoint.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_rpad_doc(
+ "Left-align strings by padding with a given character",
+ ("For each string in `strings`, emit a left-aligned string by appending\n"
+ "the given UTF8 codepoint.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_center_doc(
+ utf8_center_doc.summary + "",
+ ("For each string in `strings`, emit a centered string by padding both sides\n"
+ "with the given ASCII character.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_lpad_doc(
+ utf8_lpad_doc.summary + "",
+ ("For each string in `strings`, emit a right-aligned string by prepending\n"
+ "the given ASCII character.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_rpad_doc(
+ utf8_rpad_doc.summary + "",
+ ("For each string in `strings`, emit a left-aligned string by appending\n"
+ "the given ASCII character.\nNull values emit null."),
+ {"strings"}, "PadOptions");
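+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum):
+//
+//   arrow::compute::PadOptions options(/*width=*/5, /*padding=*/"*");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("utf8_center", {strings}, &options));
+//   // "ab" -> "*ab**": with an odd pad count, the extra character goes on the right.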
+
+const FunctionDoc utf8_trim_whitespace_doc(
+ "Trim leading and trailing whitespace characters",
+ ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_ltrim_whitespace_doc(
+ "Trim leading whitespace characters",
+ ("For each string in `strings`, emit a string with leading whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_rtrim_whitespace_doc(
+ "Trim trailing whitespace characters",
+ ("For each string in `strings`, emit a string with trailing whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_trim_whitespace_doc(
+ "Trim leading and trailing ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
+ "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_ltrim_whitespace_doc(
+ "Trim leading ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
+ "characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_rtrim_whitespace_doc(
+ "Trim trailing ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
+ "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
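+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum).
+// The *_whitespace variants take no options:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("utf8_trim_whitespace", {strings}));
+//   // " \u00a0hi " -> "hi"; ascii_trim_whitespace would leave the U+00A0 in place.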
+
+const FunctionDoc utf8_trim_doc(
+ "Trim leading and trailing characters present in the `characters` argument",
+ ("For each string in `strings`, emit a string with leading and trailing\n"
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc utf8_ltrim_doc(
+ "Trim leading characters present in the `characters` argument",
+ ("For each string in `strings`, emit a string with leading\n"
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc utf8_rtrim_doc(
+ "Trim trailing characters present in the `characters` argument",
+ ("For each string in `strings`, emit a string with trailing\n"
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_trim_doc(
+ utf8_trim_doc.summary + "",
+ utf8_trim_doc.description +
+ ("\nBoth the input string and the `characters` argument are interpreted as\n"
+ "ASCII characters; to trim non-ASCII characters, use `utf8_trim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_ltrim_doc(
+ utf8_ltrim_doc.summary + "",
+ utf8_ltrim_doc.description +
+ ("\nBoth the input string and the `characters` argument are interpreted as\n"
+ "ASCII characters; to trim non-ASCII characters, use `utf8_ltrim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_rtrim_doc(
+ utf8_rtrim_doc.summary + "",
+ utf8_rtrim_doc.description +
+ ("\nBoth the input string and the `characters` argument are interpreted as\n"
+ "ASCII characters; to trim non-ASCII characters, use `utf8_rtrim`."),
+ {"strings"}, "TrimOptions");
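+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum).
+// `characters` is a set of codepoints to strip, not a prefix/suffix pattern:
+//
+//   arrow::compute::TrimOptions options("xy");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("utf8_ltrim", {strings}, &options));
+//   // "xyxhello" -> "hello"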
+
+const FunctionDoc strptime_doc(
+ "Parse timestamps",
+ ("For each string in `strings`, parse it as a timestamp.\n"
+ "The timestamp unit and the expected string pattern must be given\n"
+ "in StrptimeOptions. Null inputs emit null. If a non-null string\n"
+ "fails parsing, an error is returned."),
+ {"strings"}, "StrptimeOptions");
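+
+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum):
+//
+//   arrow::compute::StrptimeOptions options("%Y-%m-%d", arrow::TimeUnit::SECOND);
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("strptime", {strings}, &options));
+//   // "2021-01-01" -> timestamp[s]; a non-null unparsable string raises Invalid.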
+
+const FunctionDoc binary_length_doc(
+ "Compute string lengths",
+ ("For each string in `strings`, emit the number of bytes. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
+ ("For each string in `strings`, emit the number of "
+ "UTF8 characters. Null values emit null."),
+ {"strings"});
+
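+// A minimal usage sketch (editor's addition; `strings` is an assumed Datum)
+// contrasting the two length kernels:
+//
+//   ARROW_ASSIGN_OR_RAISE(arrow::Datum nbytes,
+//                         arrow::compute::CallFunction("binary_length", {strings}));
+//   ARROW_ASSIGN_OR_RAISE(arrow::Datum nchars,
+//                         arrow::compute::CallFunction("utf8_length", {strings}));
+//   // "h\u00e9" -> binary_length 3 (bytes), utf8_length 2 (codepoints)
+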
void AddStrptime(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
+ auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
StrptimeExec<StringType>, StrptimeState::Init));
DCHECK_OK(func->AddKernel({large_utf8()}, OutputType(StrptimeResolve),
@@ -3283,8 +3283,8 @@ void AddStrptime(FunctionRegistry* registry) {
}
void AddBinaryLength(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
- &binary_length_doc);
+ auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
+ &binary_length_doc);
ArrayKernelExec exec_offset_32 =
applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
ArrayKernelExec exec_offset_64 =
@@ -3298,575 +3298,575 @@ void AddBinaryLength(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-void AddUtf8Length(FunctionRegistry* registry) {
- auto func =
- std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
-
- ArrayKernelExec exec_offset_32 =
- applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
-
- ArrayKernelExec exec_offset_64 =
- applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
- DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
-
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-template <typename BinaryType, typename ListType>
-struct BinaryJoin {
- using ArrayType = typename TypeTraits<BinaryType>::ArrayType;
- using ListArrayType = typename TypeTraits<ListType>::ArrayType;
- using ListScalarType = typename TypeTraits<ListType>::ScalarType;
- using ListOffsetType = typename ListArrayType::offset_type;
- using BuilderType = typename TypeTraits<BinaryType>::BuilderType;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (batch[0].kind() == Datum::SCALAR) {
- if (batch[1].kind() == Datum::SCALAR) {
- return ExecScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
- }
- DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
- return ExecScalarArray(ctx, *batch[0].scalar(), batch[1].array(), out);
- }
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- if (batch[1].kind() == Datum::SCALAR) {
- return ExecArrayScalar(ctx, batch[0].array(), *batch[1].scalar(), out);
- }
- DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
- return ExecArrayArray(ctx, batch[0].array(), batch[1].array(), out);
- }
-
- struct ListScalarOffsetLookup {
- const ArrayType& values;
-
- int64_t GetStart(int64_t i) { return 0; }
- int64_t GetStop(int64_t i) { return values.length(); }
- bool IsNull(int64_t i) { return false; }
- };
-
- struct ListArrayOffsetLookup {
- explicit ListArrayOffsetLookup(const ListArrayType& lists)
- : lists_(lists), offsets_(lists.raw_value_offsets()) {}
-
- int64_t GetStart(int64_t i) { return offsets_[i]; }
- int64_t GetStop(int64_t i) { return offsets_[i + 1]; }
- bool IsNull(int64_t i) { return lists_.IsNull(i); }
-
- private:
- const ListArrayType& lists_;
- const ListOffsetType* offsets_;
- };
-
- struct SeparatorScalarLookup {
- const util::string_view separator;
-
- bool IsNull(int64_t i) { return false; }
- util::string_view GetView(int64_t i) { return separator; }
- };
-
- struct SeparatorArrayLookup {
- const ArrayType& separators;
-
- bool IsNull(int64_t i) { return separators.IsNull(i); }
- util::string_view GetView(int64_t i) { return separators.GetView(i); }
- };
-
- // Scalar, scalar -> scalar
- static Status ExecScalarScalar(KernelContext* ctx, const Scalar& left,
- const Scalar& right, Datum* out) {
- const auto& list = checked_cast<const ListScalarType&>(left);
- const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
- if (!list.is_valid || !separator_scalar.is_valid) {
- return Status::OK();
- }
- util::string_view separator(*separator_scalar.value);
-
- const auto& strings = checked_cast<const ArrayType&>(*list.value);
- if (strings.null_count() > 0) {
- out->scalar()->is_valid = false;
- return Status::OK();
- }
-
- TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
- auto Append = [&](util::string_view value) {
- return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
- static_cast<int64_t>(value.size()));
- };
- if (strings.length() > 0) {
- auto data_length =
- strings.total_values_length() + (strings.length() - 1) * separator.length();
- RETURN_NOT_OK(builder.Reserve(data_length));
- RETURN_NOT_OK(Append(strings.GetView(0)));
- for (int64_t j = 1; j < strings.length(); j++) {
- RETURN_NOT_OK(Append(separator));
- RETURN_NOT_OK(Append(strings.GetView(j)));
- }
- }
- auto out_scalar = checked_cast<BaseBinaryScalar*>(out->scalar().get());
- return builder.Finish(&out_scalar->value);
- }
-
- // Scalar, array -> array
- static Status ExecScalarArray(KernelContext* ctx, const Scalar& left,
- const std::shared_ptr<ArrayData>& right, Datum* out) {
- const auto& list_scalar = checked_cast<const BaseListScalar&>(left);
- if (!list_scalar.is_valid) {
- ARROW_ASSIGN_OR_RAISE(
- auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
- *out = *nulls->data();
- return Status::OK();
- }
- const auto& strings = checked_cast<const ArrayType&>(*list_scalar.value);
- if (strings.null_count() != 0) {
- ARROW_ASSIGN_OR_RAISE(
- auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
- *out = *nulls->data();
- return Status::OK();
- }
- const ArrayType separators(right);
-
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(separators.length()));
-
- // Presize data to avoid multiple reallocations when joining strings
- int64_t total_data_length = 0;
- const int64_t list_length = strings.length();
- if (list_length) {
- const int64_t string_length = strings.total_values_length();
- total_data_length +=
- string_length * (separators.length() - separators.null_count());
- for (int64_t i = 0; i < separators.length(); ++i) {
- if (separators.IsNull(i)) {
- continue;
- }
- total_data_length += (list_length - 1) * separators.value_length(i);
- }
- }
- RETURN_NOT_OK(builder.ReserveData(total_data_length));
-
- return JoinStrings(separators.length(), strings, ListScalarOffsetLookup{strings},
- SeparatorArrayLookup{separators}, &builder, out);
- }
-
- // Array, scalar -> array
- static Status ExecArrayScalar(KernelContext* ctx,
- const std::shared_ptr<ArrayData>& left,
- const Scalar& right, Datum* out) {
- const ListArrayType lists(left);
- const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
-
- if (!separator_scalar.is_valid) {
- ARROW_ASSIGN_OR_RAISE(
- auto nulls,
- MakeArrayOfNull(lists.value_type(), lists.length(), ctx->memory_pool()));
- *out = *nulls->data();
- return Status::OK();
- }
-
- util::string_view separator(*separator_scalar.value);
- const auto& strings = checked_cast<const ArrayType&>(*lists.values());
- const auto list_offsets = lists.raw_value_offsets();
-
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(lists.length()));
-
- // Presize data to avoid multiple reallocations when joining strings
- int64_t total_data_length = strings.total_values_length();
- for (int64_t i = 0; i < lists.length(); ++i) {
- const auto start = list_offsets[i], end = list_offsets[i + 1];
- if (end > start && !ValuesContainNull(strings, start, end)) {
- total_data_length += (end - start - 1) * separator.length();
- }
- }
- RETURN_NOT_OK(builder.ReserveData(total_data_length));
-
- return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
- SeparatorScalarLookup{separator}, &builder, out);
- }
-
- // Array, array -> array
- static Status ExecArrayArray(KernelContext* ctx, const std::shared_ptr<ArrayData>& left,
- const std::shared_ptr<ArrayData>& right, Datum* out) {
- const ListArrayType lists(left);
- const auto& strings = checked_cast<const ArrayType&>(*lists.values());
- const auto list_offsets = lists.raw_value_offsets();
- const auto string_offsets = strings.raw_value_offsets();
- const ArrayType separators(right);
-
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(lists.length()));
-
- // Presize data to avoid multiple reallocations when joining strings
- int64_t total_data_length = 0;
- for (int64_t i = 0; i < lists.length(); ++i) {
- if (separators.IsNull(i)) {
- continue;
- }
- const auto start = list_offsets[i], end = list_offsets[i + 1];
- if (end > start && !ValuesContainNull(strings, start, end)) {
- total_data_length += string_offsets[end] - string_offsets[start];
- total_data_length += (end - start - 1) * separators.value_length(i);
- }
- }
- RETURN_NOT_OK(builder.ReserveData(total_data_length));
-
- struct SeparatorLookup {
- const ArrayType& separators;
-
- bool IsNull(int64_t i) { return separators.IsNull(i); }
- util::string_view GetView(int64_t i) { return separators.GetView(i); }
- };
- return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
- SeparatorArrayLookup{separators}, &builder, out);
- }
-
- template <typename ListOffsetLookup, typename SeparatorLookup>
- static Status JoinStrings(int64_t length, const ArrayType& strings,
- ListOffsetLookup&& list_offsets, SeparatorLookup&& separators,
- BuilderType* builder, Datum* out) {
- for (int64_t i = 0; i < length; ++i) {
- if (list_offsets.IsNull(i) || separators.IsNull(i)) {
- builder->UnsafeAppendNull();
- continue;
- }
- const auto j_start = list_offsets.GetStart(i), j_end = list_offsets.GetStop(i);
- if (j_start == j_end) {
- builder->UnsafeAppendEmptyValue();
- continue;
- }
- if (ValuesContainNull(strings, j_start, j_end)) {
- builder->UnsafeAppendNull();
- continue;
- }
- builder->UnsafeAppend(strings.GetView(j_start));
- for (int64_t j = j_start + 1; j < j_end; ++j) {
- builder->UnsafeExtendCurrent(separators.GetView(i));
- builder->UnsafeExtendCurrent(strings.GetView(j));
- }
- }
-
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder->Finish(&string_array));
- *out = *string_array->data();
- // Correct the output type based on the input
- out->mutable_array()->type = strings.type();
- return Status::OK();
- }
-
- static bool ValuesContainNull(const ArrayType& values, int64_t start, int64_t end) {
- if (values.null_count() == 0) {
- return false;
- }
- for (int64_t i = start; i < end; ++i) {
- if (values.IsNull(i)) {
- return true;
- }
- }
- return false;
- }
-};
-
-using BinaryJoinElementWiseState = OptionsWrapper<JoinOptions>;
-
-template <typename Type>
-struct BinaryJoinElementWise {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- using offset_type = typename Type::offset_type;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- JoinOptions options = BinaryJoinElementWiseState::Get(ctx);
- // Last argument is the separator (for consistency with binary_join)
- if (std::all_of(batch.values.begin(), batch.values.end(),
- [](const Datum& d) { return d.is_scalar(); })) {
- return ExecOnlyScalar(ctx, options, batch, out);
- }
- return ExecContainingArrays(ctx, options, batch, out);
- }
-
- static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options,
- const ExecBatch& batch, Datum* out) {
- BaseBinaryScalar* output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
- const size_t num_args = batch.values.size();
- if (num_args == 1) {
- // Only separator, no values
- ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
- output->is_valid = batch.values[0].scalar()->is_valid;
- return Status::OK();
- }
-
- int64_t final_size = CalculateRowSize(options, batch, 0);
- if (final_size < 0) {
- ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
- output->is_valid = false;
- return Status::OK();
- }
- ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
- const auto separator = UnboxScalar<Type>::Unbox(*batch.values.back().scalar());
- uint8_t* buf = output->value->mutable_data();
- bool first = true;
- for (size_t i = 0; i < num_args - 1; i++) {
- const Scalar& scalar = *batch[i].scalar();
- util::string_view s;
- if (scalar.is_valid) {
- s = UnboxScalar<Type>::Unbox(scalar);
- } else {
- switch (options.null_handling) {
- case JoinOptions::EMIT_NULL:
- // Handled by CalculateRowSize
- DCHECK(false) << "unreachable";
- break;
- case JoinOptions::SKIP:
- continue;
- case JoinOptions::REPLACE:
- s = options.null_replacement;
- break;
- }
- }
- if (!first) {
- buf = std::copy(separator.begin(), separator.end(), buf);
- }
- first = false;
- buf = std::copy(s.begin(), s.end(), buf);
- }
- output->is_valid = true;
- DCHECK_EQ(final_size, buf - output->value->mutable_data());
- return Status::OK();
- }
-
- static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options,
- const ExecBatch& batch, Datum* out) {
- // Presize data to avoid reallocations
- int64_t final_size = 0;
- for (int64_t i = 0; i < batch.length; i++) {
- auto size = CalculateRowSize(options, batch, i);
- if (size > 0) final_size += size;
- }
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(batch.length));
- RETURN_NOT_OK(builder.ReserveData(final_size));
-
- std::vector<util::string_view> valid_cols(batch.values.size());
- for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
- size_t num_valid = 0; // Not counting separator
- for (size_t col = 0; col < batch.values.size(); col++) {
- if (batch[col].is_scalar()) {
- const auto& scalar = *batch[col].scalar();
- if (scalar.is_valid) {
- valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
- if (col < batch.values.size() - 1) num_valid++;
- } else {
- valid_cols[col] = util::string_view();
- }
- } else {
- const ArrayData& array = *batch[col].array();
- if (!array.MayHaveNulls() ||
- BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) {
- const offset_type* offsets = array.GetValues<offset_type>(1);
- const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0);
- const int64_t length = offsets[row + 1] - offsets[row];
- valid_cols[col] = util::string_view(
- reinterpret_cast<const char*>(data + offsets[row]), length);
- if (col < batch.values.size() - 1) num_valid++;
- } else {
- valid_cols[col] = util::string_view();
- }
- }
- }
-
- if (!valid_cols.back().data()) {
- // Separator is null
- builder.UnsafeAppendNull();
- continue;
- } else if (batch.values.size() == 1) {
- // Only given separator
- builder.UnsafeAppendEmptyValue();
- continue;
- } else if (num_valid < batch.values.size() - 1) {
- // We had some nulls
- if (options.null_handling == JoinOptions::EMIT_NULL) {
- builder.UnsafeAppendNull();
- continue;
- }
- }
- const auto separator = valid_cols.back();
- bool first = true;
- for (size_t col = 0; col < batch.values.size() - 1; col++) {
- util::string_view value = valid_cols[col];
- if (!value.data()) {
- switch (options.null_handling) {
- case JoinOptions::EMIT_NULL:
- DCHECK(false) << "unreachable";
- break;
- case JoinOptions::SKIP:
- continue;
- case JoinOptions::REPLACE:
- value = options.null_replacement;
- break;
- }
- }
- if (first) {
- builder.UnsafeAppend(value);
- first = false;
- continue;
- }
- builder.UnsafeExtendCurrent(separator);
- builder.UnsafeExtendCurrent(value);
- }
- }
-
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder.Finish(&string_array));
- *out = *string_array->data();
- out->mutable_array()->type = batch[0].type();
- DCHECK_EQ(batch.length, out->array()->length);
- DCHECK_EQ(final_size,
- checked_cast<const ArrayType&>(*string_array).total_values_length());
- return Status::OK();
- }
-
- // Compute the length of the output for the given position, or -1 if it would be null.
- static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch,
- const int64_t index) {
- const auto num_args = batch.values.size();
- int64_t final_size = 0;
- int64_t num_non_null_args = 0;
- for (size_t i = 0; i < num_args; i++) {
- int64_t element_size = 0;
- bool valid = true;
- if (batch[i].is_scalar()) {
- const Scalar& scalar = *batch[i].scalar();
- valid = scalar.is_valid;
- element_size = UnboxScalar<Type>::Unbox(scalar).size();
- } else {
- const ArrayData& array = *batch[i].array();
- valid = !array.MayHaveNulls() ||
- BitUtil::GetBit(array.buffers[0]->data(), array.offset + index);
- const offset_type* offsets = array.GetValues<offset_type>(1);
- element_size = offsets[index + 1] - offsets[index];
- }
- if (i == num_args - 1) {
- if (!valid) return -1;
- if (num_non_null_args > 1) {
- // Add separator size (only if there were values to join)
- final_size += (num_non_null_args - 1) * element_size;
- }
- break;
- }
- if (!valid) {
- switch (options.null_handling) {
- case JoinOptions::EMIT_NULL:
- return -1;
- case JoinOptions::SKIP:
- continue;
- case JoinOptions::REPLACE:
- element_size = options.null_replacement.size();
- break;
- }
- }
- num_non_null_args++;
- final_size += element_size;
- }
- return final_size;
- }
-};
-
-const FunctionDoc binary_join_doc(
- "Join a list of strings together with a `separator` to form a single string",
- ("Insert `separator` between `list` elements, and concatenate them.\n"
- "Any null input and any null `list` element emits a null output.\n"),
- {"list", "separator"});
-
-const FunctionDoc binary_join_element_wise_doc(
- "Join string arguments into one, using the last argument as the separator",
- ("Insert the last argument of `strings` between the rest of the elements, "
- "and concatenate them.\n"
- "Any null separator element emits a null output. Null elements either "
- "emit a null (the default), are skipped, or replaced with a given string.\n"),
- {"*strings"}, "JoinOptions");
-
-const auto kDefaultJoinOptions = JoinOptions::Defaults();
-
-template <typename ListType>
-void AddBinaryJoinForListType(ScalarFunction* func) {
- for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
- auto exec = GenerateTypeAgnosticVarBinaryBase<BinaryJoin, ListType>(*ty);
- auto list_ty = std::make_shared<ListType>(ty);
- DCHECK_OK(func->AddKernel({InputType(list_ty), InputType(ty)}, ty, exec));
- }
-}
-
-void AddBinaryJoin(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("binary_join", Arity::Binary(),
- &binary_join_doc);
- AddBinaryJoinForListType<ListType>(func.get());
- AddBinaryJoinForListType<LargeListType>(func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<ScalarFunction>(
- "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
- &binary_join_element_wise_doc, &kDefaultJoinOptions);
- for (const auto& ty : BaseBinaryTypes()) {
- ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
- GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
- BinaryJoinElementWiseState::Init};
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
-
-template <template <typename> class ExecFunctor>
-void MakeUnaryStringBatchKernel(
- std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
- MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
- {
- auto exec_32 = ExecFunctor<StringType>::Exec;
- ScalarKernel kernel{{utf8()}, utf8(), exec_32};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- {
- auto exec_64 = ExecFunctor<LargeStringType>::Exec;
- ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
+void AddUtf8Length(FunctionRegistry* registry) {
+ auto func =
+ std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
+
+ ArrayKernelExec exec_offset_32 =
+ applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
+
+ ArrayKernelExec exec_offset_64 =
+ applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
+ DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
+
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
+template <typename BinaryType, typename ListType>
+struct BinaryJoin {
+ using ArrayType = typename TypeTraits<BinaryType>::ArrayType;
+ using ListArrayType = typename TypeTraits<ListType>::ArrayType;
+ using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+ using ListOffsetType = typename ListArrayType::offset_type;
+ using BuilderType = typename TypeTraits<BinaryType>::BuilderType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::SCALAR) {
+ if (batch[1].kind() == Datum::SCALAR) {
+ return ExecScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
+ }
+ DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
+ return ExecScalarArray(ctx, *batch[0].scalar(), batch[1].array(), out);
+ }
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+ if (batch[1].kind() == Datum::SCALAR) {
+ return ExecArrayScalar(ctx, batch[0].array(), *batch[1].scalar(), out);
+ }
+ DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
+ return ExecArrayArray(ctx, batch[0].array(), batch[1].array(), out);
+ }
+
+ struct ListScalarOffsetLookup {
+ const ArrayType& values;
+
+ int64_t GetStart(int64_t i) { return 0; }
+ int64_t GetStop(int64_t i) { return values.length(); }
+ bool IsNull(int64_t i) { return false; }
+ };
+
+ struct ListArrayOffsetLookup {
+ explicit ListArrayOffsetLookup(const ListArrayType& lists)
+ : lists_(lists), offsets_(lists.raw_value_offsets()) {}
+
+ int64_t GetStart(int64_t i) { return offsets_[i]; }
+ int64_t GetStop(int64_t i) { return offsets_[i + 1]; }
+ bool IsNull(int64_t i) { return lists_.IsNull(i); }
+
+ private:
+ const ListArrayType& lists_;
+ const ListOffsetType* offsets_;
+ };
+
+ struct SeparatorScalarLookup {
+ const util::string_view separator;
+
+ bool IsNull(int64_t i) { return false; }
+ util::string_view GetView(int64_t i) { return separator; }
+ };
+
+ struct SeparatorArrayLookup {
+ const ArrayType& separators;
+
+ bool IsNull(int64_t i) { return separators.IsNull(i); }
+ util::string_view GetView(int64_t i) { return separators.GetView(i); }
+ };
+
+ // Scalar, scalar -> scalar
+ static Status ExecScalarScalar(KernelContext* ctx, const Scalar& left,
+ const Scalar& right, Datum* out) {
+ const auto& list = checked_cast<const ListScalarType&>(left);
+ const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
+ if (!list.is_valid || !separator_scalar.is_valid) {
+ return Status::OK();
+ }
+ util::string_view separator(*separator_scalar.value);
+
+ const auto& strings = checked_cast<const ArrayType&>(*list.value);
+ if (strings.null_count() > 0) {
+ out->scalar()->is_valid = false;
+ return Status::OK();
+ }
+
+ TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+ auto Append = [&](util::string_view value) {
+ return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<int64_t>(value.size()));
+ };
+ if (strings.length() > 0) {
+ auto data_length =
+ strings.total_values_length() + (strings.length() - 1) * separator.length();
+ RETURN_NOT_OK(builder.Reserve(data_length));
+ RETURN_NOT_OK(Append(strings.GetView(0)));
+ for (int64_t j = 1; j < strings.length(); j++) {
+ RETURN_NOT_OK(Append(separator));
+ RETURN_NOT_OK(Append(strings.GetView(j)));
+ }
+ }
+ auto out_scalar = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ return builder.Finish(&out_scalar->value);
+ }
+
+ // Scalar, array -> array
+ static Status ExecScalarArray(KernelContext* ctx, const Scalar& left,
+ const std::shared_ptr<ArrayData>& right, Datum* out) {
+ const auto& list_scalar = checked_cast<const BaseListScalar&>(left);
+ if (!list_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+ const auto& strings = checked_cast<const ArrayType&>(*list_scalar.value);
+ if (strings.null_count() != 0) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+ const ArrayType separators(right);
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(separators.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = 0;
+ const int64_t list_length = strings.length();
+ if (list_length) {
+ const int64_t string_length = strings.total_values_length();
+ total_data_length +=
+ string_length * (separators.length() - separators.null_count());
+ for (int64_t i = 0; i < separators.length(); ++i) {
+ if (separators.IsNull(i)) {
+ continue;
+ }
+ total_data_length += (list_length - 1) * separators.value_length(i);
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ return JoinStrings(separators.length(), strings, ListScalarOffsetLookup{strings},
+ SeparatorArrayLookup{separators}, &builder, out);
+ }
+
+ // Array, scalar -> array
+ static Status ExecArrayScalar(KernelContext* ctx,
+ const std::shared_ptr<ArrayData>& left,
+ const Scalar& right, Datum* out) {
+ const ListArrayType lists(left);
+ const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
+
+ if (!separator_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls,
+ MakeArrayOfNull(lists.value_type(), lists.length(), ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+
+ util::string_view separator(*separator_scalar.value);
+ const auto& strings = checked_cast<const ArrayType&>(*lists.values());
+ const auto list_offsets = lists.raw_value_offsets();
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(lists.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = strings.total_values_length();
+ for (int64_t i = 0; i < lists.length(); ++i) {
+ const auto start = list_offsets[i], end = list_offsets[i + 1];
+ if (end > start && !ValuesContainNull(strings, start, end)) {
+ total_data_length += (end - start - 1) * separator.length();
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
+ SeparatorScalarLookup{separator}, &builder, out);
+ }
+
+ // Array, array -> array
+ static Status ExecArrayArray(KernelContext* ctx, const std::shared_ptr<ArrayData>& left,
+ const std::shared_ptr<ArrayData>& right, Datum* out) {
+ const ListArrayType lists(left);
+ const auto& strings = checked_cast<const ArrayType&>(*lists.values());
+ const auto list_offsets = lists.raw_value_offsets();
+ const auto string_offsets = strings.raw_value_offsets();
+ const ArrayType separators(right);
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(lists.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = 0;
+ for (int64_t i = 0; i < lists.length(); ++i) {
+ if (separators.IsNull(i)) {
+ continue;
+ }
+ const auto start = list_offsets[i], end = list_offsets[i + 1];
+ if (end > start && !ValuesContainNull(strings, start, end)) {
+ total_data_length += string_offsets[end] - string_offsets[start];
+ total_data_length += (end - start - 1) * separators.value_length(i);
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ struct SeparatorLookup {
+ const ArrayType& separators;
+
+ bool IsNull(int64_t i) { return separators.IsNull(i); }
+ util::string_view GetView(int64_t i) { return separators.GetView(i); }
+ };
+ return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
+ SeparatorArrayLookup{separators}, &builder, out);
+ }
+
+ template <typename ListOffsetLookup, typename SeparatorLookup>
+ static Status JoinStrings(int64_t length, const ArrayType& strings,
+ ListOffsetLookup&& list_offsets, SeparatorLookup&& separators,
+ BuilderType* builder, Datum* out) {
+ for (int64_t i = 0; i < length; ++i) {
+ if (list_offsets.IsNull(i) || separators.IsNull(i)) {
+ builder->UnsafeAppendNull();
+ continue;
+ }
+ const auto j_start = list_offsets.GetStart(i), j_end = list_offsets.GetStop(i);
+ if (j_start == j_end) {
+ builder->UnsafeAppendEmptyValue();
+ continue;
+ }
+ if (ValuesContainNull(strings, j_start, j_end)) {
+ builder->UnsafeAppendNull();
+ continue;
+ }
+ builder->UnsafeAppend(strings.GetView(j_start));
+ for (int64_t j = j_start + 1; j < j_end; ++j) {
+ builder->UnsafeExtendCurrent(separators.GetView(i));
+ builder->UnsafeExtendCurrent(strings.GetView(j));
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder->Finish(&string_array));
+ *out = *string_array->data();
+ // Correct the output type based on the input
+ out->mutable_array()->type = strings.type();
+ return Status::OK();
+ }
+
+ static bool ValuesContainNull(const ArrayType& values, int64_t start, int64_t end) {
+ if (values.null_count() == 0) {
+ return false;
+ }
+ for (int64_t i = start; i < end; ++i) {
+ if (values.IsNull(i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
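// A sketch of the array/scalar dispatch implemented by BinaryJoin above,
// assuming `lists` is a list<utf8> array such as [["a", "b"], ["c"], null]:
#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> JoinWithDash(
    const std::shared_ptr<arrow::Array>& lists) {
  auto separator = std::make_shared<arrow::StringScalar>("-");
  // Lands in BinaryJoin::ExecArrayScalar; the result would be ["a-b", "c", null].
  return arrow::compute::CallFunction("binary_join",
                                      {lists, arrow::Datum(separator)});
}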
+using BinaryJoinElementWiseState = OptionsWrapper<JoinOptions>;
+
+template <typename Type>
+struct BinaryJoinElementWise {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ JoinOptions options = BinaryJoinElementWiseState::Get(ctx);
+ // Last argument is the separator (for consistency with binary_join)
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ BaseBinaryScalar* output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ const size_t num_args = batch.values.size();
+ if (num_args == 1) {
+ // Only separator, no values
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
+ output->is_valid = batch.values[0].scalar()->is_valid;
+ return Status::OK();
+ }
+
+ int64_t final_size = CalculateRowSize(options, batch, 0);
+ if (final_size < 0) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
+ output->is_valid = false;
+ return Status::OK();
+ }
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
+ const auto separator = UnboxScalar<Type>::Unbox(*batch.values.back().scalar());
+ uint8_t* buf = output->value->mutable_data();
+ bool first = true;
+ for (size_t i = 0; i < num_args - 1; i++) {
+ const Scalar& scalar = *batch[i].scalar();
+ util::string_view s;
+ if (scalar.is_valid) {
+ s = UnboxScalar<Type>::Unbox(scalar);
+ } else {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ // Handled by CalculateRowSize
+ DCHECK(false) << "unreachable";
+ break;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ s = options.null_replacement;
+ break;
+ }
+ }
+ if (!first) {
+ buf = std::copy(separator.begin(), separator.end(), buf);
+ }
+ first = false;
+ buf = std::copy(s.begin(), s.end(), buf);
+ }
+ output->is_valid = true;
+ DCHECK_EQ(final_size, buf - output->value->mutable_data());
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations
+ int64_t final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSize(options, batch, i);
+ if (size > 0) final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(final_size));
+
+ std::vector<util::string_view> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0; // Not counting separator
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
+ if (col < batch.values.size() - 1) num_valid++;
+ } else {
+ valid_cols[col] = util::string_view();
+ }
+ } else {
+ const ArrayData& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const offset_type* offsets = array.GetValues<offset_type>(1);
+ const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0);
+ const int64_t length = offsets[row + 1] - offsets[row];
+ valid_cols[col] = util::string_view(
+ reinterpret_cast<const char*>(data + offsets[row]), length);
+ if (col < batch.values.size() - 1) num_valid++;
+ } else {
+ valid_cols[col] = util::string_view();
+ }
+ }
+ }
+
+ if (!valid_cols.back().data()) {
+ // Separator is null
+ builder.UnsafeAppendNull();
+ continue;
+ } else if (batch.values.size() == 1) {
+ // Only given separator
+ builder.UnsafeAppendEmptyValue();
+ continue;
+ } else if (num_valid < batch.values.size() - 1) {
+ // We had some nulls
+ if (options.null_handling == JoinOptions::EMIT_NULL) {
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ }
+ const auto separator = valid_cols.back();
+ bool first = true;
+ for (size_t col = 0; col < batch.values.size() - 1; col++) {
+ util::string_view value = valid_cols[col];
+ if (!value.data()) {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ DCHECK(false) << "unreachable";
+ break;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ value = options.null_replacement;
+ break;
+ }
+ }
+ if (first) {
+ builder.UnsafeAppend(value);
+ first = false;
+ continue;
+ }
+ builder.UnsafeExtendCurrent(separator);
+ builder.UnsafeExtendCurrent(value);
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ DCHECK_EQ(final_size,
+ checked_cast<const ArrayType&>(*string_array).total_values_length());
+ return Status::OK();
+ }
+
+ // Compute the length of the output for the given position, or -1 if it would be null.
+ static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch,
+ const int64_t index) {
+ const auto num_args = batch.values.size();
+ int64_t final_size = 0;
+ int64_t num_non_null_args = 0;
+ for (size_t i = 0; i < num_args; i++) {
+ int64_t element_size = 0;
+ bool valid = true;
+ if (batch[i].is_scalar()) {
+ const Scalar& scalar = *batch[i].scalar();
+ valid = scalar.is_valid;
+ element_size = UnboxScalar<Type>::Unbox(scalar).size();
+ } else {
+ const ArrayData& array = *batch[i].array();
+ valid = !array.MayHaveNulls() ||
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + index);
+ const offset_type* offsets = array.GetValues<offset_type>(1);
+ element_size = offsets[index + 1] - offsets[index];
+ }
+ if (i == num_args - 1) {
+ if (!valid) return -1;
+ if (num_non_null_args > 1) {
+ // Add separator size (only if there were values to join)
+ final_size += (num_non_null_args - 1) * element_size;
+ }
+ break;
+ }
+ if (!valid) {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ return -1;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ element_size = options.null_replacement.size();
+ break;
+ }
+ }
+ num_non_null_args++;
+ final_size += element_size;
+ }
+ return final_size;
+ }
+};
+
+const FunctionDoc binary_join_doc(
+ "Join a list of strings together with a `separator` to form a single string",
+ ("Insert `separator` between `list` elements, and concatenate them.\n"
+ "Any null input and any null `list` element emits a null output.\n"),
+ {"list", "separator"});
+
+const FunctionDoc binary_join_element_wise_doc(
+ "Join string arguments into one, using the last argument as the separator",
+ ("Insert the last argument of `strings` between the rest of the elements, "
+ "and concatenate them.\n"
+ "Any null separator element emits a null output. Null elements either "
+ "emit a null (the default), are skipped, or replaced with a given string.\n"),
+ {"*strings"}, "JoinOptions");
+
+const auto kDefaultJoinOptions = JoinOptions::Defaults();
+
+template <typename ListType>
+void AddBinaryJoinForListType(ScalarFunction* func) {
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ auto exec = GenerateTypeAgnosticVarBinaryBase<BinaryJoin, ListType>(*ty);
+ auto list_ty = std::make_shared<ListType>(ty);
+ DCHECK_OK(func->AddKernel({InputType(list_ty), InputType(ty)}, ty, exec));
+ }
+}
+
+void AddBinaryJoin(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("binary_join", Arity::Binary(),
+ &binary_join_doc);
+ AddBinaryJoinForListType<ListType>(func.get());
+ AddBinaryJoinForListType<LargeListType>(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>(
+ "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
+ &binary_join_element_wise_doc, &kDefaultJoinOptions);
+ for (const auto& ty : BaseBinaryTypes()) {
+ ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
+ GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
+ BinaryJoinElementWiseState::Init};
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
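// A sketch of the element-wise variant registered above: the last argument is
// the separator and JoinOptions controls null handling (names illustrative):
#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> JoinColumns(const std::shared_ptr<arrow::Array>& a,
                                        const std::shared_ptr<arrow::Array>& b) {
  // REPLACE substitutes "?" for null elements instead of emitting a null row.
  arrow::compute::JoinOptions options(arrow::compute::JoinOptions::REPLACE, "?");
  auto sep = std::make_shared<arrow::StringScalar>(", ");
  return arrow::compute::CallFunction("binary_join_element_wise",
                                      {a, b, arrow::Datum(sep)}, &options);
}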
template <template <typename> class ExecFunctor>
-void MakeUnaryStringBatchKernelWithState(
- std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
- MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
- {
- using t32 = ExecFunctor<StringType>;
- ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- {
- using t64 = ExecFunctor<LargeStringType>;
- ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
+void MakeUnaryStringBatchKernel(
+ std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
+ MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ {
+ auto exec_32 = ExecFunctor<StringType>::Exec;
+ ScalarKernel kernel{{utf8()}, utf8(), exec_32};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ {
+ auto exec_64 = ExecFunctor<LargeStringType>::Exec;
+ ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+template <template <typename> class ExecFunctor>
+void MakeUnaryStringBatchKernelWithState(
+ std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
+ MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ {
+ using t32 = ExecFunctor<StringType>;
+ ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ {
+ using t64 = ExecFunctor<LargeStringType>;
+ ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
#ifdef ARROW_WITH_UTF8PROC
template <template <typename> class Transformer>
-void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
@@ -3876,15 +3876,15 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
#endif
-// NOTE: Predicate should only populate 'status' with errors,
-// leave it unmodified to indicate Status::OK()
-using StringPredicate =
- std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
+// NOTE: Predicate should only populate 'status' with errors,
+// leave it unmodified to indicate Status::OK()
+using StringPredicate =
+ std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
template <typename Type>
-Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
- StringPredicate predicate, Datum* out) {
- Status st = Status::OK();
+Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
+ StringPredicate predicate, Datum* out) {
+ Status st = Status::OK();
EnsureLookupTablesFilled();
if (batch[0].kind() == Datum::ARRAY) {
const ArrayData& input = *batch[0].array();
@@ -3894,250 +3894,250 @@ Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
[&]() -> bool {
util::string_view val = input_it();
- return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size(),
- &st);
+ return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size(),
+ &st);
});
} else {
const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
if (input.is_valid) {
- bool boolean_result = predicate(ctx, input.value->data(),
- static_cast<size_t>(input.value->size()), &st);
-      // UTF-8 decoding errors leave 'st' non-OK; only emit a result on success
- if (st.ok()) {
- out->value = std::make_shared<BooleanScalar>(boolean_result);
+ bool boolean_result = predicate(ctx, input.value->data(),
+ static_cast<size_t>(input.value->size()), &st);
+      // UTF-8 decoding errors leave 'st' non-OK; only emit a result on success
+ if (st.ok()) {
+ out->value = std::make_shared<BooleanScalar>(boolean_result);
}
}
}
- return st;
+ return st;
}
template <typename Predicate>
-void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
+ return ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
};
auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
+ return ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
};
DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32)));
DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64)));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
- return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
-}
-
-FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
- bool non_empty) {
- std::string summary, description;
- {
- std::stringstream ss;
- ss << "Classify strings as " << class_summary;
- summary = ss.str();
- }
- {
- std::stringstream ss;
- if (non_empty) {
- ss
- << ("For each string in `strings`, emit true iff the string is non-empty\n"
- "and consists only of ");
- } else {
- ss
- << ("For each string in `strings`, emit true iff the string consists only\n"
- "of ");
- }
- ss << class_desc << ". Null strings emit null.";
- description = ss.str();
- }
- return StringPredicateDoc(std::move(summary), std::move(description));
-}
-
-const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);
-
-const auto ascii_is_alnum_doc =
- StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
-const auto ascii_is_alpha_doc =
- StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
-const auto ascii_is_decimal_doc =
- StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
-const auto ascii_is_lower_doc =
- StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
-const auto ascii_is_printable_doc =
- StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
-const auto ascii_is_space_doc =
- StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
-const auto ascii_is_upper_doc =
- StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
-
-const auto ascii_is_title_doc = StringPredicateDoc(
- "Classify strings as ASCII titlecase",
- ("For each string in `strings`, emit true iff the string is title-cased,\n"
- "i.e. it has at least one cased character, each uppercase character\n"
- "follows a non-cased character, and each lowercase character follows\n"
- "an uppercase character.\n"));
-
-const auto utf8_is_alnum_doc =
- StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
-const auto utf8_is_alpha_doc =
- StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
-const auto utf8_is_decimal_doc =
- StringClassifyDoc("decimal", "decimal Unicode characters", true);
-const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
-const auto utf8_is_lower_doc =
- StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
-const auto utf8_is_numeric_doc =
- StringClassifyDoc("numeric", "numeric Unicode characters", true);
-const auto utf8_is_printable_doc =
- StringClassifyDoc("printable", "printable Unicode characters", true);
-const auto utf8_is_space_doc =
- StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
-const auto utf8_is_upper_doc =
- StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
-
-const auto utf8_is_title_doc = StringPredicateDoc(
- "Classify strings as titlecase",
- ("For each string in `strings`, emit true iff the string is title-cased,\n"
- "i.e. it has at least one cased character, each uppercase character\n"
- "follows a non-cased character, and each lowercase character follows\n"
- "an uppercase character.\n"));
-
-const FunctionDoc ascii_upper_doc(
- "Transform ASCII input to uppercase",
- ("For each string in `strings`, return an uppercase version.\n\n"
- "This function assumes the input is fully ASCII. It it may contain\n"
- "non-ASCII characters, use \"utf8_upper\" instead."),
- {"strings"});
-
-const FunctionDoc ascii_lower_doc(
- "Transform ASCII input to lowercase",
- ("For each string in `strings`, return a lowercase version.\n\n"
- "This function assumes the input is fully ASCII. If it may contain\n"
- "non-ASCII characters, use \"utf8_lower\" instead."),
- {"strings"});
-
-const FunctionDoc utf8_upper_doc(
- "Transform input to uppercase",
- ("For each string in `strings`, return an uppercase version."), {"strings"});
-
-const FunctionDoc utf8_lower_doc(
- "Transform input to lowercase",
- ("For each string in `strings`, return a lowercase version."), {"strings"});
-
-const FunctionDoc ascii_reverse_doc(
- "Reverse ASCII input",
- ("For each ASCII string in `strings`, return a reversed version.\n\n"
- "This function assumes the input is fully ASCII. If it may contain\n"
- "non-ASCII characters, use \"utf8_reverse\" instead."),
- {"strings"});
-
-const FunctionDoc utf8_reverse_doc(
- "Reverse utf8 input",
- ("For each utf8 string in `strings`, return a reversed version.\n\n"
- "This function operates on codepoints/UTF-8 code units, not grapheme\n"
- "clusters. Hence, it will not correctly reverse grapheme clusters\n"
- "composed of multiple codepoints."),
- {"strings"});
-
+FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
+ return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
+}
+
+FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
+ bool non_empty) {
+ std::string summary, description;
+ {
+ std::stringstream ss;
+ ss << "Classify strings as " << class_summary;
+ summary = ss.str();
+ }
+ {
+ std::stringstream ss;
+ if (non_empty) {
+ ss
+ << ("For each string in `strings`, emit true iff the string is non-empty\n"
+ "and consists only of ");
+ } else {
+ ss
+ << ("For each string in `strings`, emit true iff the string consists only\n"
+ "of ");
+ }
+ ss << class_desc << ". Null strings emit null.";
+ description = ss.str();
+ }
+ return StringPredicateDoc(std::move(summary), std::move(description));
+}
+
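// For instance, StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters",
// /*non_empty=*/true) produces:
//   summary:     "Classify strings as ASCII alphabetic"
//   description: "For each string in `strings`, emit true iff the string is non-empty\n"
//                "and consists only of alphabetic ASCII characters. Null strings emit null."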
+const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);
+
+const auto ascii_is_alnum_doc =
+ StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
+const auto ascii_is_alpha_doc =
+ StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
+const auto ascii_is_decimal_doc =
+ StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
+const auto ascii_is_lower_doc =
+ StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
+const auto ascii_is_printable_doc =
+ StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
+const auto ascii_is_space_doc =
+ StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
+const auto ascii_is_upper_doc =
+ StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
+
+const auto ascii_is_title_doc = StringPredicateDoc(
+ "Classify strings as ASCII titlecase",
+ ("For each string in `strings`, emit true iff the string is title-cased,\n"
+ "i.e. it has at least one cased character, each uppercase character\n"
+ "follows a non-cased character, and each lowercase character follows\n"
+ "an uppercase character.\n"));
+
+const auto utf8_is_alnum_doc =
+ StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
+const auto utf8_is_alpha_doc =
+ StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
+const auto utf8_is_decimal_doc =
+ StringClassifyDoc("decimal", "decimal Unicode characters", true);
+const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
+const auto utf8_is_lower_doc =
+ StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
+const auto utf8_is_numeric_doc =
+ StringClassifyDoc("numeric", "numeric Unicode characters", true);
+const auto utf8_is_printable_doc =
+ StringClassifyDoc("printable", "printable Unicode characters", true);
+const auto utf8_is_space_doc =
+ StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
+const auto utf8_is_upper_doc =
+ StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
+
+const auto utf8_is_title_doc = StringPredicateDoc(
+ "Classify strings as titlecase",
+ ("For each string in `strings`, emit true iff the string is title-cased,\n"
+ "i.e. it has at least one cased character, each uppercase character\n"
+ "follows a non-cased character, and each lowercase character follows\n"
+ "an uppercase character.\n"));
+
+const FunctionDoc ascii_upper_doc(
+ "Transform ASCII input to uppercase",
+ ("For each string in `strings`, return an uppercase version.\n\n"
+ "This function assumes the input is fully ASCII. It it may contain\n"
+ "non-ASCII characters, use \"utf8_upper\" instead."),
+ {"strings"});
+
+const FunctionDoc ascii_lower_doc(
+ "Transform ASCII input to lowercase",
+ ("For each string in `strings`, return a lowercase version.\n\n"
+ "This function assumes the input is fully ASCII. If it may contain\n"
+ "non-ASCII characters, use \"utf8_lower\" instead."),
+ {"strings"});
+
+const FunctionDoc utf8_upper_doc(
+ "Transform input to uppercase",
+ ("For each string in `strings`, return an uppercase version."), {"strings"});
+
+const FunctionDoc utf8_lower_doc(
+ "Transform input to lowercase",
+ ("For each string in `strings`, return a lowercase version."), {"strings"});
+
+const FunctionDoc ascii_reverse_doc(
+ "Reverse ASCII input",
+ ("For each ASCII string in `strings`, return a reversed version.\n\n"
+ "This function assumes the input is fully ASCII. If it may contain\n"
+ "non-ASCII characters, use \"utf8_reverse\" instead."),
+ {"strings"});
+
+const FunctionDoc utf8_reverse_doc(
+ "Reverse utf8 input",
+ ("For each utf8 string in `strings`, return a reversed version.\n\n"
+ "This function operates on codepoints/UTF-8 code units, not grapheme\n"
+ "clusters. Hence, it will not correctly reverse grapheme clusters\n"
+ "composed of multiple codepoints."),
+ {"strings"});
+
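// A minimal sketch for the classify predicates registered below; each yields a
// boolean array aligned with its input (the wrapper name is illustrative):
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> ClassifyAlpha(
    const std::shared_ptr<arrow::Array>& strings) {
  // True iff the element is non-empty and fully alphabetic; null stays null.
  return arrow::compute::CallFunction("utf8_is_alpha", {strings});
}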
} // namespace
void RegisterScalarStringAscii(FunctionRegistry* registry) {
- // ascii_upper and ascii_lower are able to reuse the original offsets buffer,
- // so don't preallocate them in the output.
- MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
- MemAllocation::NO_PREALLOCATE);
- MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
- MemAllocation::NO_PREALLOCATE);
- MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
- &ascii_trim_whitespace_doc);
- MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
- &ascii_ltrim_whitespace_doc);
- MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
- &ascii_rtrim_whitespace_doc);
- MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
- MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
-
- MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
- &ascii_center_doc);
- MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
- MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
- MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
- &utf8_center_doc);
- MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
- MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);
-
- MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
- MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
- &ascii_ltrim_doc);
- MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
- &ascii_rtrim_doc);
-
- AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);
-
- AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
- &ascii_is_alnum_doc);
- AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
- AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
- &ascii_is_decimal_doc);
+ // ascii_upper and ascii_lower are able to reuse the original offsets buffer,
+ // so don't preallocate them in the output.
+ MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
+ MemAllocation::NO_PREALLOCATE);
+ MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
+ MemAllocation::NO_PREALLOCATE);
+ MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
+ &ascii_trim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
+ &ascii_ltrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
+ &ascii_rtrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
+ MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
+
+ MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
+ &ascii_center_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
+ &utf8_center_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);
+
+ MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
+ &ascii_ltrim_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
+ &ascii_rtrim_doc);
+
+ AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);
+
+ AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
+ &ascii_is_alnum_doc);
+ AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
+ AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
+ &ascii_is_decimal_doc);
// no is_digit for ascii, since it is the same as is_decimal
- AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
+ AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
// no is_numeric for ascii, since it is the same as is_decimal
- AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
- &ascii_is_printable_doc);
- AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
- AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
- AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);
+ AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
+ &ascii_is_printable_doc);
+ AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
+ AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
+ AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);
#ifdef ARROW_WITH_UTF8PROC
- MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
- MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
- MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
- &utf8_trim_whitespace_doc);
- MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
- &utf8_ltrim_whitespace_doc);
- MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
- &utf8_rtrim_whitespace_doc);
- MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
- MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
- MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);
-
- AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
- &utf8_is_alnum_doc);
- AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
- AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
- &utf8_is_decimal_doc);
- AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
- AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
- AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
- &utf8_is_numeric_doc);
- AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
- &utf8_is_printable_doc);
- AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
- AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
- AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
+ MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
+ MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
+ MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
+ &utf8_trim_whitespace_doc);
+ MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
+ &utf8_ltrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
+ &utf8_rtrim_whitespace_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);
+
+ AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
+ &utf8_is_alnum_doc);
+ AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
+ AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
+ &utf8_is_decimal_doc);
+ AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
+ AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
+ AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
+ &utf8_is_numeric_doc);
+ AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
+ &utf8_is_printable_doc);
+ AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
+ AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
+ AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
#endif
AddBinaryLength(registry);
- AddUtf8Length(registry);
+ AddUtf8Length(registry);
AddMatchSubstring(registry);
- AddFindSubstring(registry);
- AddCountSubstring(registry);
- MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
- "replace_substring", registry, &replace_substring_doc,
- MemAllocation::NO_PREALLOCATE);
-#ifdef ARROW_WITH_RE2
- MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
- "replace_substring_regex", registry, &replace_substring_regex_doc,
- MemAllocation::NO_PREALLOCATE);
- AddExtractRegex(registry);
-#endif
- AddReplaceSlice(registry);
- AddSlice(registry);
- AddSplit(registry);
+ AddFindSubstring(registry);
+ AddCountSubstring(registry);
+ MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
+ "replace_substring", registry, &replace_substring_doc,
+ MemAllocation::NO_PREALLOCATE);
+#ifdef ARROW_WITH_RE2
+ MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
+ "replace_substring_regex", registry, &replace_substring_regex_doc,
+ MemAllocation::NO_PREALLOCATE);
+ AddExtractRegex(registry);
+#endif
+ AddReplaceSlice(registry);
+ AddSlice(registry);
+ AddSplit(registry);
AddStrptime(registry);
- AddBinaryJoin(registry);
+ AddBinaryJoin(registry);
}
} // namespace internal
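
The string kernels registered above are dispatched by name through Arrow's generic
compute entry point. A minimal sketch of invoking one of them from application code
(assuming the compute module is built in; the helper name and sample values below are
illustrative, not part of this change):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>
    #include <iostream>

    // Builds a small utf8 array and runs the "ascii_upper" kernel registered above.
    arrow::Status RunAsciiUpper() {
      arrow::StringBuilder builder;
      ARROW_RETURN_NOT_OK(builder.AppendValues({"arrow", "compute", "kernels"}));
      std::shared_ptr<arrow::Array> input;
      ARROW_RETURN_NOT_OK(builder.Finish(&input));

      // CallFunction looks the kernel up by name in the default FunctionRegistry.
      ARROW_ASSIGN_OR_RAISE(arrow::Datum upper,
                            arrow::compute::CallFunction("ascii_upper", {input}));
      // Prints ["ARROW", "COMPUTE", "KERNELS"]
      std::cout << upper.make_array()->ToString() << std::endl;
      return arrow::Status::OK();
    }
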
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
index f0257772d4a..e9375664a90 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
@@ -1,663 +1,663 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/builder.h"
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/time.h"
-#include "arrow/vendored/datetime.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-namespace compute {
-namespace internal {
-
-namespace {
-
-using arrow_vendored::date::days;
-using arrow_vendored::date::floor;
-using arrow_vendored::date::hh_mm_ss;
-using arrow_vendored::date::sys_time;
-using arrow_vendored::date::trunc;
-using arrow_vendored::date::weekday;
-using arrow_vendored::date::weeks;
-using arrow_vendored::date::year_month_day;
-using arrow_vendored::date::years;
-using arrow_vendored::date::literals::dec;
-using arrow_vendored::date::literals::jan;
-using arrow_vendored::date::literals::last;
-using arrow_vendored::date::literals::mon;
-using arrow_vendored::date::literals::thu;
-using internal::applicator::ScalarUnaryNotNull;
-using internal::applicator::SimpleUnary;
-
-using DayOfWeekState = OptionsWrapper<DayOfWeekOptions>;
-
-const std::string& GetInputTimezone(const Datum& datum) {
- return checked_cast<const TimestampType&>(*datum.type()).timezone();
-}
-
-const std::string& GetInputTimezone(const Scalar& scalar) {
- return checked_cast<const TimestampType&>(*scalar.type).timezone();
-}
-
-const std::string& GetInputTimezone(const ArrayData& array) {
- return checked_cast<const TimestampType&>(*array.type).timezone();
-}
-
-template <typename T>
-Status TemporalComponentExtractCheckTimezone(const T& input) {
- const auto& timezone = GetInputTimezone(input);
- if (!timezone.empty()) {
- return Status::NotImplemented(
- "Cannot extract components from timestamp with specific timezone: ", timezone);
- }
- return Status::OK();
-}
-
-template <typename Op, typename OutType>
-struct TemporalComponentExtract {
- using OutValue = typename internal::GetOutputType<OutType>::T;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
- return ScalarUnaryNotNull<OutType, TimestampType, Op>::Exec(ctx, batch, out);
- }
-};
-
-template <typename Op, typename OutType>
-struct DayOfWeekExec {
- using OutValue = typename internal::GetOutputType<OutType>::T;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const DayOfWeekOptions& options = DayOfWeekState::Get(ctx);
- if (options.week_start < 1 || 7 < options.week_start) {
- return Status::Invalid(
- "week_start must follow ISO convention (Monday=1, Sunday=7). Got week_start=",
- options.week_start);
- }
-
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
- applicator::ScalarUnaryNotNullStateful<OutType, TimestampType, Op> kernel{
- Op(options)};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract year from timestamp
-
-template <typename Duration>
-struct Year {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- return static_cast<T>(static_cast<const int32_t>(
- year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).year()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract month from timestamp
-
-template <typename Duration>
-struct Month {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- return static_cast<T>(static_cast<const uint32_t>(
- year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).month()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract day from timestamp
-
-template <typename Duration>
-struct Day {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- return static_cast<T>(static_cast<const uint32_t>(
- year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).day()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract day of week from timestamp
-//
-// By default, the week starts on Monday (represented by 0) and ends on Sunday
-// (represented by 6). The start day of the week (Monday=1, Sunday=7) and the
-// numbering base (0 or 1) can be set using DayOfWeekOptions.
-
-template <typename Duration>
-struct DayOfWeek {
- explicit DayOfWeek(const DayOfWeekOptions& options) {
- for (int i = 0; i < 7; i++) {
- lookup_table[i] = i + 8 - options.week_start;
- lookup_table[i] = (lookup_table[i] > 6) ? lookup_table[i] - 7 : lookup_table[i];
- lookup_table[i] += options.one_based_numbering;
- }
- }
-
- template <typename T, typename Arg0>
- T Call(KernelContext*, Arg0 arg, Status*) const {
- const auto wd = arrow_vendored::date::year_month_weekday(
- floor<days>(sys_time<Duration>(Duration{arg})))
- .weekday()
- .iso_encoding();
- return lookup_table[wd - 1];
- }
- std::array<int64_t, 7> lookup_table;
-};
-
-// ----------------------------------------------------------------------
-// Extract day of year from timestamp
-
-template <typename Duration>
-struct DayOfYear {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- return static_cast<T>(
- (t - sys_time<days>(year_month_day(t).year() / jan / 0)).count());
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract ISO Year values from timestamp
-//
-// First week of an ISO year has the majority (4 or more) of its days in January.
-// Last week of an ISO year has the year's last Thursday in it.
-
-template <typename Duration>
-struct ISOYear {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- auto y = year_month_day{t + days{3}}.year();
- auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- if (t < start) {
- --y;
- }
- return static_cast<T>(static_cast<int32_t>(y));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract ISO week from timestamp
-//
-// First week of an ISO year has the majority (4 or more) of its days in January.
-// Last week of an ISO year has the year's last Thursday in it.
-// Based on
-// https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503
-template <typename Duration>
-struct ISOWeek {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- auto y = year_month_day{t + days{3}}.year();
- auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- if (t < start) {
- --y;
- start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- }
- return static_cast<T>(trunc<weeks>(t - start).count() + 1);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract quarter from timestamp
-
-template <typename Duration>
-struct Quarter {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto ymd = year_month_day(floor<days>(sys_time<Duration>(Duration{arg})));
- return static_cast<T>((static_cast<const uint32_t>(ymd.month()) - 1) / 3 + 1);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract hour from timestamp
-
-template <typename Duration>
-struct Hour {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>((t - floor<days>(t)) / std::chrono::hours(1));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract minute from timestamp
-
-template <typename Duration>
-struct Minute {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>((t - floor<std::chrono::hours>(t)) / std::chrono::minutes(1));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract second from timestamp
-
-template <typename Duration>
-struct Second {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>((t - floor<std::chrono::minutes>(t)) / std::chrono::seconds(1));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract subsecond from timestamp
-
-template <typename Duration>
-struct Subsecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- (std::chrono::duration<double>(t - floor<std::chrono::seconds>(t)).count()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract milliseconds from timestamp
-
-template <typename Duration>
-struct Millisecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- ((t - floor<std::chrono::seconds>(t)) / std::chrono::milliseconds(1)) % 1000);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract microseconds from timestamp
-
-template <typename Duration>
-struct Microsecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- ((t - floor<std::chrono::seconds>(t)) / std::chrono::microseconds(1)) % 1000);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract nanoseconds from timestamp
-
-template <typename Duration>
-struct Nanosecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- ((t - floor<std::chrono::seconds>(t)) / std::chrono::nanoseconds(1)) % 1000);
- }
-};
-
-template <typename Duration>
-inline std::vector<int64_t> get_iso_calendar(int64_t arg) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- const auto ymd = year_month_day(t);
- auto y = year_month_day{t + days{3}}.year();
- auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- if (t < start) {
- --y;
- start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- }
- return {static_cast<int64_t>(static_cast<int32_t>(y)),
- static_cast<int64_t>(trunc<weeks>(t - start).count() + 1),
- static_cast<int64_t>(weekday(ymd).iso_encoding())};
-}
-
-// ----------------------------------------------------------------------
-// Extract ISO calendar values from timestamp
-
-template <typename Duration>
-struct ISOCalendar {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
- if (in.is_valid) {
- const std::shared_ptr<DataType> iso_calendar_type =
- struct_({field("iso_year", int64()), field("iso_week", int64()),
- field("iso_day_of_week", int64())});
- const auto& in_val = internal::UnboxScalar<const TimestampType>::Unbox(in);
- const auto iso_calendar = get_iso_calendar<Duration>(in_val);
-
- std::vector<std::shared_ptr<Scalar>> values = {
- std::make_shared<Int64Scalar>(iso_calendar[0]),
- std::make_shared<Int64Scalar>(iso_calendar[1]),
- std::make_shared<Int64Scalar>(iso_calendar[2])};
- *checked_cast<StructScalar*>(out) = StructScalar(values, iso_calendar_type);
- } else {
- out->is_valid = false;
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
- using BuilderType = typename TypeTraits<Int64Type>::BuilderType;
-
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
- const std::shared_ptr<DataType> iso_calendar_type =
- struct_({field("iso_year", int64()), field("iso_week", int64()),
- field("iso_day_of_week", int64())});
-
- std::unique_ptr<ArrayBuilder> array_builder;
- RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), iso_calendar_type, &array_builder));
- StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
- RETURN_NOT_OK(struct_builder->Reserve(in.length));
-
- std::vector<BuilderType*> field_builders;
- field_builders.reserve(3);
- for (int i = 0; i < 3; i++) {
- field_builders.push_back(
- checked_cast<BuilderType*>(struct_builder->field_builder(i)));
-      RETURN_NOT_OK(field_builders[i]->Reserve(in.length));
- }
- auto visit_null = [&]() { return struct_builder->AppendNull(); };
- auto visit_value = [&](int64_t arg) {
- const auto iso_calendar = get_iso_calendar<Duration>(arg);
- field_builders[0]->UnsafeAppend(iso_calendar[0]);
- field_builders[1]->UnsafeAppend(iso_calendar[1]);
- field_builders[2]->UnsafeAppend(iso_calendar[2]);
- return struct_builder->Append();
- };
- RETURN_NOT_OK(VisitArrayDataInline<Int64Type>(in, visit_value, visit_null));
-
- std::shared_ptr<Array> out_array;
- RETURN_NOT_OK(struct_builder->Finish(&out_array));
- *out = *std::move(out_array->data());
-
- return Status::OK();
- }
-};
-
-template <template <typename...> class Op, typename OutType>
-std::shared_ptr<ScalarFunction> MakeTemporal(std::string name, const FunctionDoc* doc) {
- const auto& out_type = TypeTraits<OutType>::type_singleton();
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- for (auto unit : internal::AllTimeUnits()) {
- InputType in_type{match::TimestampTypeUnit(unit)};
- switch (unit) {
- case TimeUnit::SECOND: {
- auto exec = TemporalComponentExtract<Op<std::chrono::seconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MILLI: {
- auto exec =
- TemporalComponentExtract<Op<std::chrono::milliseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MICRO: {
- auto exec =
- TemporalComponentExtract<Op<std::chrono::microseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::NANO: {
- auto exec = TemporalComponentExtract<Op<std::chrono::nanoseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- }
- }
- return func;
-}
-
-template <template <typename...> class Op, typename OutType>
-std::shared_ptr<ScalarFunction> MakeTemporalWithOptions(
- std::string name, const FunctionDoc* doc, const DayOfWeekOptions& default_options,
- KernelInit init) {
- const auto& out_type = TypeTraits<OutType>::type_singleton();
- auto func =
- std::make_shared<ScalarFunction>(name, Arity::Unary(), doc, &default_options);
-
- for (auto unit : internal::AllTimeUnits()) {
- InputType in_type{match::TimestampTypeUnit(unit)};
- switch (unit) {
- case TimeUnit::SECOND: {
- auto exec = DayOfWeekExec<Op<std::chrono::seconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- case TimeUnit::MILLI: {
- auto exec = DayOfWeekExec<Op<std::chrono::milliseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- case TimeUnit::MICRO: {
- auto exec = DayOfWeekExec<Op<std::chrono::microseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- case TimeUnit::NANO: {
- auto exec = DayOfWeekExec<Op<std::chrono::nanoseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- }
- }
- return func;
-}
-
-template <template <typename...> class Op>
-std::shared_ptr<ScalarFunction> MakeStructTemporal(std::string name,
- const FunctionDoc* doc) {
- const auto& out_type = struct_({field("iso_year", int64()), field("iso_week", int64()),
- field("iso_day_of_week", int64())});
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- for (auto unit : internal::AllTimeUnits()) {
- InputType in_type{match::TimestampTypeUnit(unit)};
- switch (unit) {
- case TimeUnit::SECOND: {
- auto exec = SimpleUnary<Op<std::chrono::seconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MILLI: {
- auto exec = SimpleUnary<Op<std::chrono::milliseconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MICRO: {
- auto exec = SimpleUnary<Op<std::chrono::microseconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::NANO: {
- auto exec = SimpleUnary<Op<std::chrono::nanoseconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- }
- }
- return func;
-}
-
-const FunctionDoc year_doc{
- "Extract year from timestamp",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc month_doc{
- "Extract month number",
- ("Month is encoded as January=1, December=12.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc day_doc{
- "Extract day number",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc day_of_week_doc{
- "Extract day of the week number",
- ("By default, the week starts on Monday represented by 0 and ends on Sunday "
- "represented by 6.\n"
- "DayOfWeekOptions.week_start can be used to set another starting day using ISO "
-    "convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using the "
- "DayOfWeekOptions.one_based_numbering parameter.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"},
- "DayOfWeekOptions"};
-
-const FunctionDoc day_of_year_doc{
-    "Extract day of year number",
- ("January 1st maps to day number 1, February 1st to 32, etc.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc iso_year_doc{
- "Extract ISO year number",
-    ("First week of an ISO year has the majority (4 or more) of its days in January.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc iso_week_doc{
- "Extract ISO week of year number",
- ("First ISO week has the majority (4 or more) of its days in January.\n"
- "Week of the year starts with 1 and can run up to 53.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc iso_calendar_doc{
- "Extract (ISO year, ISO week, ISO day of week) struct",
- ("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc quarter_doc{
- "Extract quarter of year number",
-    ("First quarter maps to 1 and fourth quarter maps to 4.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc hour_doc{
- "Extract hour value",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc minute_doc{
- "Extract minute values",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc second_doc{
- "Extract second values",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc millisecond_doc{
- "Extract millisecond values",
- ("Millisecond returns number of milliseconds since the last full second.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc microsecond_doc{
- "Extract microsecond values",
-    ("Microsecond returns number of microseconds since the last full millisecond.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc nanosecond_doc{
- "Extract nanosecond values",
- ("Nanosecond returns number of nanoseconds since the last full microsecond.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc subsecond_doc{
- "Extract subsecond values",
- ("Subsecond returns the fraction of a second since the last full second.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-} // namespace
-
-void RegisterScalarTemporal(FunctionRegistry* registry) {
- auto year = MakeTemporal<Year, Int64Type>("year", &year_doc);
- DCHECK_OK(registry->AddFunction(std::move(year)));
-
-  auto month = MakeTemporal<Month, Int64Type>("month", &month_doc);
- DCHECK_OK(registry->AddFunction(std::move(month)));
-
-  auto day = MakeTemporal<Day, Int64Type>("day", &day_doc);
- DCHECK_OK(registry->AddFunction(std::move(day)));
-
- static auto default_day_of_week_options = DayOfWeekOptions::Defaults();
- auto day_of_week = MakeTemporalWithOptions<DayOfWeek, Int64Type>(
- "day_of_week", &day_of_week_doc, default_day_of_week_options, DayOfWeekState::Init);
- DCHECK_OK(registry->AddFunction(std::move(day_of_week)));
-
- auto day_of_year = MakeTemporal<DayOfYear, Int64Type>("day_of_year", &day_of_year_doc);
- DCHECK_OK(registry->AddFunction(std::move(day_of_year)));
-
- auto iso_year = MakeTemporal<ISOYear, Int64Type>("iso_year", &iso_year_doc);
- DCHECK_OK(registry->AddFunction(std::move(iso_year)));
-
- auto iso_week = MakeTemporal<ISOWeek, Int64Type>("iso_week", &iso_week_doc);
- DCHECK_OK(registry->AddFunction(std::move(iso_week)));
-
- auto iso_calendar = MakeStructTemporal<ISOCalendar>("iso_calendar", &iso_calendar_doc);
- DCHECK_OK(registry->AddFunction(std::move(iso_calendar)));
-
- auto quarter = MakeTemporal<Quarter, Int64Type>("quarter", &quarter_doc);
- DCHECK_OK(registry->AddFunction(std::move(quarter)));
-
- auto hour = MakeTemporal<Hour, Int64Type>("hour", &hour_doc);
- DCHECK_OK(registry->AddFunction(std::move(hour)));
-
- auto minute = MakeTemporal<Minute, Int64Type>("minute", &minute_doc);
- DCHECK_OK(registry->AddFunction(std::move(minute)));
-
- auto second = MakeTemporal<Second, Int64Type>("second", &second_doc);
- DCHECK_OK(registry->AddFunction(std::move(second)));
-
- auto millisecond =
- MakeTemporal<Millisecond, Int64Type>("millisecond", &millisecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(millisecond)));
-
- auto microsecond =
- MakeTemporal<Microsecond, Int64Type>("microsecond", &microsecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(microsecond)));
-
- auto nanosecond = MakeTemporal<Nanosecond, Int64Type>("nanosecond", &nanosecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(nanosecond)));
-
- auto subsecond = MakeTemporal<Subsecond, DoubleType>("subsecond", &subsecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(subsecond)));
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/builder.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/time.h"
+#include "arrow/vendored/datetime.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+using arrow_vendored::date::days;
+using arrow_vendored::date::floor;
+using arrow_vendored::date::hh_mm_ss;
+using arrow_vendored::date::sys_time;
+using arrow_vendored::date::trunc;
+using arrow_vendored::date::weekday;
+using arrow_vendored::date::weeks;
+using arrow_vendored::date::year_month_day;
+using arrow_vendored::date::years;
+using arrow_vendored::date::literals::dec;
+using arrow_vendored::date::literals::jan;
+using arrow_vendored::date::literals::last;
+using arrow_vendored::date::literals::mon;
+using arrow_vendored::date::literals::thu;
+using internal::applicator::ScalarUnaryNotNull;
+using internal::applicator::SimpleUnary;
+
+using DayOfWeekState = OptionsWrapper<DayOfWeekOptions>;
+
+const std::string& GetInputTimezone(const Datum& datum) {
+ return checked_cast<const TimestampType&>(*datum.type()).timezone();
+}
+
+const std::string& GetInputTimezone(const Scalar& scalar) {
+ return checked_cast<const TimestampType&>(*scalar.type).timezone();
+}
+
+const std::string& GetInputTimezone(const ArrayData& array) {
+ return checked_cast<const TimestampType&>(*array.type).timezone();
+}
+
+template <typename T>
+Status TemporalComponentExtractCheckTimezone(const T& input) {
+ const auto& timezone = GetInputTimezone(input);
+ if (!timezone.empty()) {
+ return Status::NotImplemented(
+ "Cannot extract components from timestamp with specific timezone: ", timezone);
+ }
+ return Status::OK();
+}
+
+template <typename Op, typename OutType>
+struct TemporalComponentExtract {
+ using OutValue = typename internal::GetOutputType<OutType>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
+ return ScalarUnaryNotNull<OutType, TimestampType, Op>::Exec(ctx, batch, out);
+ }
+};
+
+template <typename Op, typename OutType>
+struct DayOfWeekExec {
+ using OutValue = typename internal::GetOutputType<OutType>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const DayOfWeekOptions& options = DayOfWeekState::Get(ctx);
+ if (options.week_start < 1 || 7 < options.week_start) {
+ return Status::Invalid(
+ "week_start must follow ISO convention (Monday=1, Sunday=7). Got week_start=",
+ options.week_start);
+ }
+
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
+ applicator::ScalarUnaryNotNullStateful<OutType, TimestampType, Op> kernel{
+ Op(options)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract year from timestamp
+
+template <typename Duration>
+struct Year {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const int32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).year()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract month from timestamp
+
+template <typename Duration>
+struct Month {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const uint32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).month()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract day from timestamp
+
+template <typename Duration>
+struct Day {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const uint32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).day()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract day of week from timestamp
+//
+// By default, the week starts on Monday (represented by 0) and ends on Sunday
+// (represented by 6). The start day of the week (Monday=1, Sunday=7) and the
+// numbering base (0 or 1) can be set using DayOfWeekOptions.
+
+template <typename Duration>
+struct DayOfWeek {
+ explicit DayOfWeek(const DayOfWeekOptions& options) {
+ for (int i = 0; i < 7; i++) {
+ lookup_table[i] = i + 8 - options.week_start;
+ lookup_table[i] = (lookup_table[i] > 6) ? lookup_table[i] - 7 : lookup_table[i];
+ lookup_table[i] += options.one_based_numbering;
+ }
+ }
+
+ template <typename T, typename Arg0>
+ T Call(KernelContext*, Arg0 arg, Status*) const {
+ const auto wd = arrow_vendored::date::year_month_weekday(
+ floor<days>(sys_time<Duration>(Duration{arg})))
+ .weekday()
+ .iso_encoding();
+ return lookup_table[wd - 1];
+ }
+ std::array<int64_t, 7> lookup_table;
+};
+
+// ----------------------------------------------------------------------
+// Extract day of year from timestamp
+
+template <typename Duration>
+struct DayOfYear {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ return static_cast<T>(
+ (t - sys_time<days>(year_month_day(t).year() / jan / 0)).count());
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract ISO Year values from timestamp
+//
+// First week of an ISO year has the majority (4 or more) of its days in January.
+// Last week of an ISO year has the year's last Thursday in it.
+
+template <typename Duration>
+struct ISOYear {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ }
+ return static_cast<T>(static_cast<int32_t>(y));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract ISO week from timestamp
+//
+// First week of an ISO year has the majority (4 or more) of its days in January.
+// Last week of an ISO year has the year's last Thursday in it.
+// Based on
+// https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503
+template <typename Duration>
+struct ISOWeek {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ }
+ return static_cast<T>(trunc<weeks>(t - start).count() + 1);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract quarter from timestamp
+
+template <typename Duration>
+struct Quarter {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto ymd = year_month_day(floor<days>(sys_time<Duration>(Duration{arg})));
+ return static_cast<T>((static_cast<const uint32_t>(ymd.month()) - 1) / 3 + 1);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract hour from timestamp
+
+template <typename Duration>
+struct Hour {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<days>(t)) / std::chrono::hours(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract minute from timestamp
+
+template <typename Duration>
+struct Minute {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<std::chrono::hours>(t)) / std::chrono::minutes(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract second from timestamp
+
+template <typename Duration>
+struct Second {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<std::chrono::minutes>(t)) / std::chrono::seconds(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract subsecond from timestamp
+
+template <typename Duration>
+struct Subsecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ (std::chrono::duration<double>(t - floor<std::chrono::seconds>(t)).count()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract milliseconds from timestamp
+
+template <typename Duration>
+struct Millisecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::milliseconds(1)) % 1000);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract microseconds from timestamp
+
+template <typename Duration>
+struct Microsecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::microseconds(1)) % 1000);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract nanoseconds from timestamp
+
+template <typename Duration>
+struct Nanosecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::nanoseconds(1)) % 1000);
+ }
+};
+
+template <typename Duration>
+inline std::vector<int64_t> get_iso_calendar(int64_t arg) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ const auto ymd = year_month_day(t);
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ }
+ return {static_cast<int64_t>(static_cast<int32_t>(y)),
+ static_cast<int64_t>(trunc<weeks>(t - start).count() + 1),
+ static_cast<int64_t>(weekday(ymd).iso_encoding())};
+}
+
+// ----------------------------------------------------------------------
+// Extract ISO calendar values from timestamp
+
+template <typename Duration>
+struct ISOCalendar {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
+ if (in.is_valid) {
+ const std::shared_ptr<DataType> iso_calendar_type =
+ struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+ const auto& in_val = internal::UnboxScalar<const TimestampType>::Unbox(in);
+ const auto iso_calendar = get_iso_calendar<Duration>(in_val);
+
+ std::vector<std::shared_ptr<Scalar>> values = {
+ std::make_shared<Int64Scalar>(iso_calendar[0]),
+ std::make_shared<Int64Scalar>(iso_calendar[1]),
+ std::make_shared<Int64Scalar>(iso_calendar[2])};
+ *checked_cast<StructScalar*>(out) = StructScalar(values, iso_calendar_type);
+ } else {
+ out->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+ using BuilderType = typename TypeTraits<Int64Type>::BuilderType;
+
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
+ const std::shared_ptr<DataType> iso_calendar_type =
+ struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+
+ std::unique_ptr<ArrayBuilder> array_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), iso_calendar_type, &array_builder));
+ StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
+ RETURN_NOT_OK(struct_builder->Reserve(in.length));
+
+ std::vector<BuilderType*> field_builders;
+ field_builders.reserve(3);
+ for (int i = 0; i < 3; i++) {
+ field_builders.push_back(
+ checked_cast<BuilderType*>(struct_builder->field_builder(i)));
+      RETURN_NOT_OK(field_builders[i]->Reserve(in.length));
+ }
+ auto visit_null = [&]() { return struct_builder->AppendNull(); };
+ auto visit_value = [&](int64_t arg) {
+ const auto iso_calendar = get_iso_calendar<Duration>(arg);
+ field_builders[0]->UnsafeAppend(iso_calendar[0]);
+ field_builders[1]->UnsafeAppend(iso_calendar[1]);
+ field_builders[2]->UnsafeAppend(iso_calendar[2]);
+ return struct_builder->Append();
+ };
+ RETURN_NOT_OK(VisitArrayDataInline<Int64Type>(in, visit_value, visit_null));
+
+ std::shared_ptr<Array> out_array;
+ RETURN_NOT_OK(struct_builder->Finish(&out_array));
+ *out = *std::move(out_array->data());
+
+ return Status::OK();
+ }
+};
+
+template <template <typename...> class Op, typename OutType>
+std::shared_ptr<ScalarFunction> MakeTemporal(std::string name, const FunctionDoc* doc) {
+ const auto& out_type = TypeTraits<OutType>::type_singleton();
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = TemporalComponentExtract<Op<std::chrono::seconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec =
+ TemporalComponentExtract<Op<std::chrono::milliseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec =
+ TemporalComponentExtract<Op<std::chrono::microseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = TemporalComponentExtract<Op<std::chrono::nanoseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+template <template <typename...> class Op, typename OutType>
+std::shared_ptr<ScalarFunction> MakeTemporalWithOptions(
+ std::string name, const FunctionDoc* doc, const DayOfWeekOptions& default_options,
+ KernelInit init) {
+ const auto& out_type = TypeTraits<OutType>::type_singleton();
+ auto func =
+ std::make_shared<ScalarFunction>(name, Arity::Unary(), doc, &default_options);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = DayOfWeekExec<Op<std::chrono::seconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec = DayOfWeekExec<Op<std::chrono::milliseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec = DayOfWeekExec<Op<std::chrono::microseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = DayOfWeekExec<Op<std::chrono::nanoseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+template <template <typename...> class Op>
+std::shared_ptr<ScalarFunction> MakeStructTemporal(std::string name,
+ const FunctionDoc* doc) {
+ const auto& out_type = struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = SimpleUnary<Op<std::chrono::seconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec = SimpleUnary<Op<std::chrono::milliseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec = SimpleUnary<Op<std::chrono::microseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = SimpleUnary<Op<std::chrono::nanoseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+const FunctionDoc year_doc{
+ "Extract year from timestamp",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc month_doc{
+ "Extract month number",
+ ("Month is encoded as January=1, December=12.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc day_doc{
+ "Extract day number",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc day_of_week_doc{
+ "Extract day of the week number",
+ ("By default, the week starts on Monday represented by 0 and ends on Sunday "
+ "represented by 6.\n"
+ "DayOfWeekOptions.week_start can be used to set another starting day using ISO "
+    "convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using the "
+ "DayOfWeekOptions.one_based_numbering parameter.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"},
+ "DayOfWeekOptions"};
+
+const FunctionDoc day_of_year_doc{
+    "Extract day of year number",
+ ("January 1st maps to day number 1, February 1st to 32, etc.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_year_doc{
+ "Extract ISO year number",
+    ("First week of an ISO year has the majority (4 or more) of its days in January.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_week_doc{
+ "Extract ISO week of year number",
+ ("First ISO week has the majority (4 or more) of its days in January.\n"
+ "Week of the year starts with 1 and can run up to 53.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_calendar_doc{
+ "Extract (ISO year, ISO week, ISO day of week) struct",
+ ("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc quarter_doc{
+ "Extract quarter of year number",
+    ("First quarter maps to 1 and fourth quarter maps to 4.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc hour_doc{
+ "Extract hour value",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc minute_doc{
+ "Extract minute values",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc second_doc{
+ "Extract second values",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc millisecond_doc{
+ "Extract millisecond values",
+ ("Millisecond returns number of milliseconds since the last full second.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc microsecond_doc{
+ "Extract microsecond values",
+    ("Microsecond returns number of microseconds since the last full millisecond.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc nanosecond_doc{
+ "Extract nanosecond values",
+ ("Nanosecond returns number of nanoseconds since the last full microsecond.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc subsecond_doc{
+ "Extract subsecond values",
+ ("Subsecond returns the fraction of a second since the last full second.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+} // namespace
+
+void RegisterScalarTemporal(FunctionRegistry* registry) {
+ auto year = MakeTemporal<Year, Int64Type>("year", &year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(year)));
+
+  auto month = MakeTemporal<Month, Int64Type>("month", &month_doc);
+ DCHECK_OK(registry->AddFunction(std::move(month)));
+
+  auto day = MakeTemporal<Day, Int64Type>("day", &day_doc);
+ DCHECK_OK(registry->AddFunction(std::move(day)));
+
+ static auto default_day_of_week_options = DayOfWeekOptions::Defaults();
+ auto day_of_week = MakeTemporalWithOptions<DayOfWeek, Int64Type>(
+ "day_of_week", &day_of_week_doc, default_day_of_week_options, DayOfWeekState::Init);
+ DCHECK_OK(registry->AddFunction(std::move(day_of_week)));
+
+ auto day_of_year = MakeTemporal<DayOfYear, Int64Type>("day_of_year", &day_of_year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(day_of_year)));
+
+ auto iso_year = MakeTemporal<ISOYear, Int64Type>("iso_year", &iso_year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_year)));
+
+ auto iso_week = MakeTemporal<ISOWeek, Int64Type>("iso_week", &iso_week_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_week)));
+
+ auto iso_calendar = MakeStructTemporal<ISOCalendar>("iso_calendar", &iso_calendar_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_calendar)));
+
+ auto quarter = MakeTemporal<Quarter, Int64Type>("quarter", &quarter_doc);
+ DCHECK_OK(registry->AddFunction(std::move(quarter)));
+
+ auto hour = MakeTemporal<Hour, Int64Type>("hour", &hour_doc);
+ DCHECK_OK(registry->AddFunction(std::move(hour)));
+
+ auto minute = MakeTemporal<Minute, Int64Type>("minute", &minute_doc);
+ DCHECK_OK(registry->AddFunction(std::move(minute)));
+
+ auto second = MakeTemporal<Second, Int64Type>("second", &second_doc);
+ DCHECK_OK(registry->AddFunction(std::move(second)));
+
+ auto millisecond =
+ MakeTemporal<Millisecond, Int64Type>("millisecond", &millisecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(millisecond)));
+
+ auto microsecond =
+ MakeTemporal<Microsecond, Int64Type>("microsecond", &microsecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(microsecond)));
+
+ auto nanosecond = MakeTemporal<Nanosecond, Int64Type>("nanosecond", &nanosecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(nanosecond)));
+
+ auto subsecond = MakeTemporal<Subsecond, DoubleType>("subsecond", &subsecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(subsecond)));
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
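
The temporal kernels follow the same registration-and-dispatch pattern. A short
sketch of calling them, with DayOfWeekOptions configured as described in
day_of_week_doc (the timestamp value and helper name are illustrative; the options
constructor matches the fields declared in api_scalar.h at this version):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Status RunTemporalKernels() {
      // 2021-03-17T10:30:00Z (a Wednesday), in seconds since the Unix epoch.
      arrow::TimestampBuilder builder(arrow::timestamp(arrow::TimeUnit::SECOND),
                                      arrow::default_memory_pool());
      ARROW_RETURN_NOT_OK(builder.Append(1615977000));
      std::shared_ptr<arrow::Array> ts;
      ARROW_RETURN_NOT_OK(builder.Finish(&ts));

      // Plain component extraction: yields an int64 array [2021].
      ARROW_ASSIGN_OR_RAISE(arrow::Datum year,
                            arrow::compute::CallFunction("year", {ts}));

      // Week starting on Sunday (ISO 7) with one-based numbering, so the
      // DayOfWeek lookup table maps a Wednesday to 4 (Sunday=1, ..., Saturday=7).
      arrow::compute::DayOfWeekOptions options(/*one_based_numbering=*/true,
                                               /*week_start=*/7);
      ARROW_ASSIGN_OR_RAISE(
          arrow::Datum dow,
          arrow::compute::CallFunction("day_of_week", {ts}, &options));
      (void)year;
      (void)dow;
      return arrow::Status::OK();
    }
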
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
index ead88abc0f2..dc63edab12c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-#include <cmath>
-
+#include <cmath>
+
#include "arrow/compute/kernels/common.h"
#include "arrow/util/bit_util.h"
@@ -32,12 +32,12 @@ namespace internal {
namespace {
struct IsValidOperator {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
checked_cast<BooleanScalar*>(out)->value = in.is_valid;
- return Status::OK();
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+ static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
DCHECK_EQ(out->offset, 0);
DCHECK_LE(out->length, arr.length);
if (arr.MayHaveNulls()) {
@@ -49,64 +49,64 @@ struct IsValidOperator {
arr.offset == 0 ? arr.buffers[0]
: SliceBuffer(arr.buffers[0], arr.offset / 8,
BitUtil::BytesForBits(out->length + out->offset));
- return Status::OK();
+ return Status::OK();
}
// Input has no nulls => output is entirely true.
- ARROW_ASSIGN_OR_RAISE(out->buffers[1],
- ctx->AllocateBitmap(out->length + out->offset));
+ ARROW_ASSIGN_OR_RAISE(out->buffers[1],
+ ctx->AllocateBitmap(out->length + out->offset));
BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, true);
- return Status::OK();
- }
-};
-
-struct IsFiniteOperator {
- template <typename OutType, typename InType>
- static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
- return std::isfinite(value);
- }
-};
-
-struct IsInfOperator {
- template <typename OutType, typename InType>
- static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
- return std::isinf(value);
+ return Status::OK();
}
};
+struct IsFiniteOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isfinite(value);
+ }
+};
+
+struct IsInfOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isinf(value);
+ }
+};
+
struct IsNullOperator {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
checked_cast<BooleanScalar*>(out)->value = !in.is_valid;
- return Status::OK();
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+ static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
if (arr.MayHaveNulls()) {
// Input has nulls => output is the inverted null (validity) bitmap.
InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length,
out->buffers[1]->mutable_data(), out->offset);
- } else {
- // Input has no nulls => output is entirely false.
- BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
- false);
+ } else {
+ // Input has no nulls => output is entirely false.
+ BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
+ false);
}
- return Status::OK();
+ return Status::OK();
+ }
+};
+
+struct IsNanOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isnan(value);
}
};
-struct IsNanOperator {
- template <typename OutType, typename InType>
- static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
- return std::isnan(value);
- }
-};
-
-void MakeFunction(std::string name, const FunctionDoc* doc,
- std::vector<InputType> in_types, OutputType out_type,
+void MakeFunction(std::string name, const FunctionDoc* doc,
+ std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, FunctionRegistry* registry,
MemAllocation::type mem_allocation, bool can_write_into_slices) {
Arity arity{static_cast<int>(in_types.size())};
- auto func = std::make_shared<ScalarFunction>(name, arity, doc);
+ auto func = std::make_shared<ScalarFunction>(name, arity, doc);
ScalarKernel kernel(std::move(in_types), out_type, exec);
kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
@@ -117,112 +117,112 @@ void MakeFunction(std::string name, const FunctionDoc* doc,
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-template <typename InType, typename Op>
-void AddFloatValidityKernel(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
- DCHECK_OK(func->AddKernel({ty}, boolean(),
- applicator::ScalarUnary<BooleanType, InType, Op>::Exec));
-}
-
-std::shared_ptr<ScalarFunction> MakeIsFiniteFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- AddFloatValidityKernel<FloatType, IsFiniteOperator>(float32(), func.get());
- AddFloatValidityKernel<DoubleType, IsFiniteOperator>(float64(), func.get());
-
- return func;
-}
-
-std::shared_ptr<ScalarFunction> MakeIsInfFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- AddFloatValidityKernel<FloatType, IsInfOperator>(float32(), func.get());
- AddFloatValidityKernel<DoubleType, IsInfOperator>(float64(), func.get());
-
- return func;
-}
-
-std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- AddFloatValidityKernel<FloatType, IsNanOperator>(float32(), func.get());
- AddFloatValidityKernel<DoubleType, IsNanOperator>(float64(), func.get());
-
- return func;
-}
-
-Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+template <typename InType, typename Op>
+void AddFloatValidityKernel(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
+ DCHECK_OK(func->AddKernel({ty}, boolean(),
+ applicator::ScalarUnary<BooleanType, InType, Op>::Exec));
+}
+
+std::shared_ptr<ScalarFunction> MakeIsFiniteFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsFiniteOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsFiniteOperator>(float64(), func.get());
+
+ return func;
+}
+
+std::shared_ptr<ScalarFunction> MakeIsInfFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsInfOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsInfOperator>(float64(), func.get());
+
+ return func;
+}
+
+std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsNanOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsNanOperator>(float64(), func.get());
+
+ return func;
+}
+
+Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const Datum& arg0 = batch[0];
if (arg0.type()->id() == Type::NA) {
auto false_value = std::make_shared<BooleanScalar>(false);
if (arg0.kind() == Datum::SCALAR) {
- out->value = false_value;
+ out->value = false_value;
} else {
std::shared_ptr<Array> false_values;
- RETURN_NOT_OK(MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool())
- .Value(&false_values));
+ RETURN_NOT_OK(MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool())
+ .Value(&false_values));
out->value = false_values->data();
}
- return Status::OK();
+ return Status::OK();
} else {
- return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
+ return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
}
}
-Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const Datum& arg0 = batch[0];
if (arg0.type()->id() == Type::NA) {
if (arg0.kind() == Datum::SCALAR) {
- out->value = std::make_shared<BooleanScalar>(true);
+ out->value = std::make_shared<BooleanScalar>(true);
} else {
// Data is preallocated
ArrayData* out_arr = out->mutable_array();
BitUtil::SetBitsTo(out_arr->buffers[1]->mutable_data(), out_arr->offset,
out_arr->length, true);
}
- return Status::OK();
+ return Status::OK();
} else {
- return applicator::SimpleUnary<IsNullOperator>(ctx, batch, out);
+ return applicator::SimpleUnary<IsNullOperator>(ctx, batch, out);
}
}
-const FunctionDoc is_valid_doc(
- "Return true if non-null",
- ("For each input value, emit true iff the value is valid (non-null)."), {"values"});
-
-const FunctionDoc is_finite_doc(
- "Return true if value is finite",
- ("For each input value, emit true iff the value is finite (not NaN, inf, or -inf)."),
- {"values"});
-
-const FunctionDoc is_inf_doc(
- "Return true if infinity",
- ("For each input value, emit true iff the value is infinite (inf or -inf)."),
- {"values"});
-
-const FunctionDoc is_null_doc("Return true if null",
- ("For each input value, emit true iff the value is null."),
- {"values"});
-
-const FunctionDoc is_nan_doc("Return true if NaN",
- ("For each input value, emit true iff the value is NaN."),
- {"values"});
-
+const FunctionDoc is_valid_doc(
+ "Return true if non-null",
+ ("For each input value, emit true iff the value is valid (non-null)."), {"values"});
+
+const FunctionDoc is_finite_doc(
+ "Return true if value is finite",
+ ("For each input value, emit true iff the value is finite (not NaN, inf, or -inf)."),
+ {"values"});
+
+const FunctionDoc is_inf_doc(
+ "Return true if infinity",
+ ("For each input value, emit true iff the value is infinite (inf or -inf)."),
+ {"values"});
+
+const FunctionDoc is_null_doc("Return true if null",
+ ("For each input value, emit true iff the value is null."),
+ {"values"});
+
+const FunctionDoc is_nan_doc("Return true if NaN",
+ ("For each input value, emit true iff the value is NaN."),
+ {"values"});
+
} // namespace
void RegisterScalarValidity(FunctionRegistry* registry) {
- MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
- registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false);
+ MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
+ registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false);
- MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
- registry, MemAllocation::PREALLOCATE,
+ MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
+ registry, MemAllocation::PREALLOCATE,
/*can_write_into_slices=*/true);
-
- DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
- DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
- DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc)));
+
+ DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
+ DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
+ DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc)));
}
} // namespace internal
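
This file's diff restores the validity predicates: "is_valid" and "is_null" accept any input type (registered through MakeFunction with OUTPUT_NOT_NULL, so the result itself is never null), while "is_finite", "is_inf" and "is_nan" are added for float32 and float64 only. A usage sketch, assuming caller-side code that is not part of this diff:

    #include "arrow/compute/api.h"

    // Boolean mask that is true exactly where `values` is null.
    arrow::Result<arrow::Datum> NullMask(const arrow::Datum& values) {
      return arrow::compute::CallFunction("is_null", {values});
    }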
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
index 846fa26baf2..0ef0ea6c753 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
@@ -53,30 +53,30 @@ PrimitiveArg GetPrimitiveArg(const ArrayData& arr) {
arg.data += arr.offset * arg.bit_width / 8;
}
// This may be kUnknownNullCount
- arg.null_count = (arg.is_valid != nullptr) ? arr.null_count.load() : 0;
+ arg.null_count = (arg.is_valid != nullptr) ? arr.null_count.load() : 0;
return arg;
}
-ArrayKernelExec TrivialScalarUnaryAsArraysExec(ArrayKernelExec exec,
- NullHandling::type null_handling) {
- return [=](KernelContext* ctx, const ExecBatch& batch, Datum* out) -> Status {
- if (out->is_array()) {
- return exec(ctx, batch, out);
- }
-
- if (null_handling == NullHandling::INTERSECTION && !batch[0].scalar()->is_valid) {
- out->scalar()->is_valid = false;
- return Status::OK();
- }
-
- ARROW_ASSIGN_OR_RAISE(Datum array_in, MakeArrayFromScalar(*batch[0].scalar(), 1));
- ARROW_ASSIGN_OR_RAISE(Datum array_out, MakeArrayFromScalar(*out->scalar(), 1));
- RETURN_NOT_OK(exec(ctx, ExecBatch{{std::move(array_in)}, 1}, &array_out));
- ARROW_ASSIGN_OR_RAISE(*out, array_out.make_array()->GetScalar(0));
- return Status::OK();
- };
-}
-
+ArrayKernelExec TrivialScalarUnaryAsArraysExec(ArrayKernelExec exec,
+ NullHandling::type null_handling) {
+ return [=](KernelContext* ctx, const ExecBatch& batch, Datum* out) -> Status {
+ if (out->is_array()) {
+ return exec(ctx, batch, out);
+ }
+
+ if (null_handling == NullHandling::INTERSECTION && !batch[0].scalar()->is_valid) {
+ out->scalar()->is_valid = false;
+ return Status::OK();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(Datum array_in, MakeArrayFromScalar(*batch[0].scalar(), 1));
+ ARROW_ASSIGN_OR_RAISE(Datum array_out, MakeArrayFromScalar(*out->scalar(), 1));
+ RETURN_NOT_OK(exec(ctx, ExecBatch{{std::move(array_in)}, 1}, &array_out));
+ ARROW_ASSIGN_OR_RAISE(*out, array_out.make_array()->GetScalar(0));
+ return Status::OK();
+ };
+}
+
} // namespace internal
} // namespace compute
} // namespace arrow
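
TrivialScalarUnaryAsArraysExec, restored above, adapts an array-only exec to scalar inputs by boxing the scalar into a length-1 array, running the array path, and unboxing element 0 of the result. The same round trip can be expressed in caller code; a sketch under the assumption of an array-only function named "my_fn" (hypothetical):

    #include "arrow/api.h"
    #include "arrow/compute/api.h"

    arrow::Result<std::shared_ptr<arrow::Scalar>> CallOnScalar(
        const std::shared_ptr<arrow::Scalar>& s) {
      // Box the scalar as a one-element array...
      ARROW_ASSIGN_OR_RAISE(auto arr, arrow::MakeArrayFromScalar(*s, /*length=*/1));
      // ...run the array-only function...
      ARROW_ASSIGN_OR_RAISE(auto out, arrow::compute::CallFunction("my_fn", {arr}));
      // ...then unbox the single result element.
      return out.make_array()->GetScalar(0);
    }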
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
index 394e08da581..8ce321f6b4f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
@@ -18,30 +18,30 @@
#pragma once
#include <cstdint>
-#include <utility>
+#include <utility>
-#include "arrow/array/util.h"
+#include "arrow/array/util.h"
#include "arrow/buffer.h"
-#include "arrow/compute/kernels/codegen_internal.h"
-#include "arrow/compute/type_fwd.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/util/bit_run_reader.h"
namespace arrow {
namespace compute {
namespace internal {
-// Used in some kernels and testing - not provided by default in MSVC
-// and _USE_MATH_DEFINES is not reliable with unity builds
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-#ifndef M_PI_2
-#define M_PI_2 1.57079632679489661923
-#endif
-#ifndef M_PI_4
-#define M_PI_4 0.785398163397448309616
-#endif
-
+// Used in some kernels and testing - not provided by default in MSVC
+// and _USE_MATH_DEFINES is not reliable with unity builds
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#ifndef M_PI_2
+#define M_PI_2 1.57079632679489661923
+#endif
+#ifndef M_PI_4
+#define M_PI_4 0.785398163397448309616
+#endif
+
// An internal data structure for unpacking a primitive argument to pass to a
// kernel implementation
struct PrimitiveArg {
@@ -67,100 +67,100 @@ int GetBitWidth(const DataType& type);
// rather than duplicating compiled code to do all these in each kernel.
PrimitiveArg GetPrimitiveArg(const ArrayData& arr);
-// Augment a unary ArrayKernelExec which supports only array-like inputs with support
-// for scalar inputs. Scalars are transformed to length-1 arrays with the scalar's value
-// (or null if the scalar is null) as the only element. This length-1 array is passed to
-// the original exec, then the only element of the resulting array is extracted as the
-// output scalar. This could be far more efficient, but instead of optimizing this path
-// it'd be better to support scalar inputs "upstream" in the original exec.
-ArrayKernelExec TrivialScalarUnaryAsArraysExec(
- ArrayKernelExec exec, NullHandling::type null_handling = NullHandling::INTERSECTION);
-
-// Return (min, max) of a numerical array, ignoring nulls.
-// For an empty array, return the type's maximum as 'min' and its lowest value as 'max'.
-template <typename T>
-ARROW_NOINLINE std::pair<T, T> GetMinMax(const ArrayData& data) {
- T min = std::numeric_limits<T>::max();
- T max = std::numeric_limits<T>::lowest();
-
- const T* values = data.GetValues<T>(1);
- arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- min = std::min(min, values[pos + i]);
- max = std::max(max, values[pos + i]);
- }
- });
-
- return std::make_pair(min, max);
-}
-
-template <typename T>
-std::pair<T, T> GetMinMax(const Datum& datum) {
- T min = std::numeric_limits<T>::max();
- T max = std::numeric_limits<T>::lowest();
-
- for (const auto& array : datum.chunks()) {
- T local_min, local_max;
- std::tie(local_min, local_max) = GetMinMax<T>(*array->data());
- min = std::min(min, local_min);
- max = std::max(max, local_max);
- }
-
- return std::make_pair(min, max);
-}
-
-// Count value occurrences in an array, ignoring nulls.
-// 'counts' must be zero-initialized and large enough to cover the value range.
-template <typename T>
-ARROW_NOINLINE int64_t CountValues(uint64_t* counts, const ArrayData& data, T min) {
- const int64_t n = data.length - data.GetNullCount();
- if (n > 0) {
- const T* values = data.GetValues<T>(1);
- arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- ++counts[values[pos + i] - min];
- }
- });
- }
- return n;
-}
-
-template <typename T>
-int64_t CountValues(uint64_t* counts, const Datum& datum, T min) {
- int64_t n = 0;
- for (const auto& array : datum.chunks()) {
- n += CountValues<T>(counts, *array->data(), min);
- }
- return n;
-}
-
-// Copy numerical array values to a buffer, ignoring nulls.
-template <typename T>
-ARROW_NOINLINE int64_t CopyNonNullValues(const ArrayData& data, T* out) {
- const int64_t n = data.length - data.GetNullCount();
- if (n > 0) {
- int64_t index = 0;
- const T* values = data.GetValues<T>(1);
- arrow::internal::VisitSetBitRunsVoid(
- data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) {
- memcpy(out + index, values + pos, len * sizeof(T));
- index += len;
- });
- }
- return n;
-}
-
-template <typename T>
-int64_t CopyNonNullValues(const Datum& datum, T* out) {
- int64_t n = 0;
- for (const auto& array : datum.chunks()) {
- n += CopyNonNullValues(*array->data(), out + n);
- }
- return n;
-}
-
+// Augment a unary ArrayKernelExec which supports only array-like inputs with support
+// for scalar inputs. Scalars are transformed to length-1 arrays with the scalar's value
+// (or null if the scalar is null) as the only element. This length-1 array is passed to
+// the original exec, then the only element of the resulting array is extracted as the
+// output scalar. This could be far more efficient, but instead of optimizing this path
+// it'd be better to support scalar inputs "upstream" in the original exec.
+ArrayKernelExec TrivialScalarUnaryAsArraysExec(
+ ArrayKernelExec exec, NullHandling::type null_handling = NullHandling::INTERSECTION);
+
+// Return (min, max) of a numerical array, ignoring nulls.
+// For an empty array, return the type's maximum as 'min' and its lowest value as 'max'.
+template <typename T>
+ARROW_NOINLINE std::pair<T, T> GetMinMax(const ArrayData& data) {
+ T min = std::numeric_limits<T>::max();
+ T max = std::numeric_limits<T>::lowest();
+
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ min = std::min(min, values[pos + i]);
+ max = std::max(max, values[pos + i]);
+ }
+ });
+
+ return std::make_pair(min, max);
+}
+
+template <typename T>
+std::pair<T, T> GetMinMax(const Datum& datum) {
+ T min = std::numeric_limits<T>::max();
+ T max = std::numeric_limits<T>::lowest();
+
+ for (const auto& array : datum.chunks()) {
+ T local_min, local_max;
+ std::tie(local_min, local_max) = GetMinMax<T>(*array->data());
+ min = std::min(min, local_min);
+ max = std::max(max, local_max);
+ }
+
+ return std::make_pair(min, max);
+}
+
+// Count value occurrences in an array, ignoring nulls.
+// 'counts' must be zero-initialized and large enough to cover the value range.
+template <typename T>
+ARROW_NOINLINE int64_t CountValues(uint64_t* counts, const ArrayData& data, T min) {
+ const int64_t n = data.length - data.GetNullCount();
+ if (n > 0) {
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ ++counts[values[pos + i] - min];
+ }
+ });
+ }
+ return n;
+}
+
+template <typename T>
+int64_t CountValues(uint64_t* counts, const Datum& datum, T min) {
+ int64_t n = 0;
+ for (const auto& array : datum.chunks()) {
+ n += CountValues<T>(counts, *array->data(), min);
+ }
+ return n;
+}
+
+// Copy numerical array values to a buffer, ignoring nulls.
+template <typename T>
+ARROW_NOINLINE int64_t CopyNonNullValues(const ArrayData& data, T* out) {
+ const int64_t n = data.length - data.GetNullCount();
+ if (n > 0) {
+ int64_t index = 0;
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(
+ data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) {
+ memcpy(out + index, values + pos, len * sizeof(T));
+ index += len;
+ });
+ }
+ return n;
+}
+
+template <typename T>
+int64_t CopyNonNullValues(const Datum& datum, T* out) {
+ int64_t n = 0;
+ for (const auto& array : datum.chunks()) {
+ n += CopyNonNullValues(*array->data(), out + n);
+ }
+ return n;
+}
+
} // namespace internal
} // namespace compute
} // namespace arrow
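
GetMinMax and CountValues above are the building blocks of counting-sort style kernels: compute the value range once, then histogram into a dense table offset by the minimum. A sketch of how they compose (illustrative only; util_internal.h is an internal header, so this compiles only inside the Arrow source tree, and the array is assumed non-empty):

    #include <cstdint>
    #include <vector>

    #include "arrow/compute/kernels/util_internal.h"

    // Dense histogram of a non-empty int32 array, nulls ignored.
    std::vector<uint64_t> Histogram(const arrow::ArrayData& data) {
      const auto mm = arrow::compute::internal::GetMinMax<int32_t>(data);
      // 'counts' is zero-initialized and spans [min, max] as required.
      std::vector<uint64_t> counts(static_cast<size_t>(mm.second - mm.first) + 1, 0);
      arrow::compute::internal::CountValues<int32_t>(counts.data(), data, mm.first);
      return counts;
    }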
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
index a68e78130f2..224916f5980 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -22,7 +22,7 @@
#include "arrow/array/array_dict.h"
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_primitive.h"
-#include "arrow/array/concatenate.h"
+#include "arrow/array/concatenate.h"
#include "arrow/array/dict_internal.h"
#include "arrow/array/util.h"
#include "arrow/compute/api_vector.h"
@@ -60,10 +60,10 @@ class UniqueAction final : public ActionBase {
static constexpr bool with_error_status = false;
- UniqueAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : ActionBase(type, pool) {}
-
+ UniqueAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : ActionBase(type, pool) {}
+
Status Reset() { return Status::OK(); }
Status Reserve(const int64_t length) { return Status::OK(); }
@@ -80,8 +80,8 @@ class UniqueAction final : public ActionBase {
template <class Index>
void ObserveNotFound(Index index) {}
- bool ShouldEncodeNulls() { return true; }
-
+ bool ShouldEncodeNulls() { return true; }
+
Status Flush(Datum* out) { return Status::OK(); }
Status FlushFinal(Datum* out) { return Status::OK(); }
@@ -96,8 +96,8 @@ class ValueCountsAction final : ActionBase {
static constexpr bool with_error_status = true;
- ValueCountsAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
+ ValueCountsAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
: ActionBase(type, pool), count_builder_(pool) {}
Status Reserve(const int64_t length) {
@@ -153,8 +153,8 @@ class ValueCountsAction final : ActionBase {
}
}
- bool ShouldEncodeNulls() const { return true; }
-
+ bool ShouldEncodeNulls() const { return true; }
+
private:
Int64Builder count_builder_;
};
@@ -168,13 +168,13 @@ class DictEncodeAction final : public ActionBase {
static constexpr bool with_error_status = false;
- DictEncodeAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : ActionBase(type, pool), indices_builder_(pool) {
- if (auto options_ptr = static_cast<const DictionaryEncodeOptions*>(options)) {
- encode_options_ = *options_ptr;
- }
- }
+ DictEncodeAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : ActionBase(type, pool), indices_builder_(pool) {
+ if (auto options_ptr = static_cast<const DictionaryEncodeOptions*>(options)) {
+ encode_options_ = *options_ptr;
+ }
+ }
Status Reset() {
indices_builder_.Reset();
@@ -185,16 +185,16 @@ class DictEncodeAction final : public ActionBase {
template <class Index>
void ObserveNullFound(Index index) {
- if (encode_options_.null_encoding_behavior == DictionaryEncodeOptions::MASK) {
- indices_builder_.UnsafeAppendNull();
- } else {
- indices_builder_.UnsafeAppend(index);
- }
+ if (encode_options_.null_encoding_behavior == DictionaryEncodeOptions::MASK) {
+ indices_builder_.UnsafeAppendNull();
+ } else {
+ indices_builder_.UnsafeAppend(index);
+ }
}
template <class Index>
void ObserveNullNotFound(Index index) {
- ObserveNullFound(index);
+ ObserveNullFound(index);
}
template <class Index>
@@ -207,10 +207,10 @@ class DictEncodeAction final : public ActionBase {
ObserveFound(index);
}
- bool ShouldEncodeNulls() {
- return encode_options_.null_encoding_behavior == DictionaryEncodeOptions::ENCODE;
- }
-
+ bool ShouldEncodeNulls() {
+ return encode_options_.null_encoding_behavior == DictionaryEncodeOptions::ENCODE;
+ }
+
Status Flush(Datum* out) {
std::shared_ptr<ArrayData> result;
RETURN_NOT_OK(indices_builder_.FinishInternal(&result));
@@ -222,14 +222,14 @@ class DictEncodeAction final : public ActionBase {
private:
Int32Builder indices_builder_;
- DictionaryEncodeOptions encode_options_;
+ DictionaryEncodeOptions encode_options_;
};
class HashKernel : public KernelState {
public:
- HashKernel() : options_(nullptr) {}
- explicit HashKernel(const FunctionOptions* options) : options_(options) {}
-
+ HashKernel() : options_(nullptr) {}
+ explicit HashKernel(const FunctionOptions* options) : options_(options) {}
+
// Reset for another run.
virtual Status Reset() = 0;
@@ -253,7 +253,7 @@ class HashKernel : public KernelState {
virtual Status Append(const ArrayData& arr) = 0;
protected:
- const FunctionOptions* options_;
+ const FunctionOptions* options_;
std::mutex lock_;
};
@@ -262,12 +262,12 @@ class HashKernel : public KernelState {
// (NullType has a separate implementation)
template <typename Type, typename Scalar, typename Action,
- bool with_error_status = Action::with_error_status>
+ bool with_error_status = Action::with_error_status>
class RegularHashKernel : public HashKernel {
public:
- RegularHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : HashKernel(options), pool_(pool), type_(type), action_(type, options, pool) {}
+ RegularHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : HashKernel(options), pool_(pool), type_(type), action_(type, options, pool) {}
Status Reset() override {
memo_table_.reset(new MemoTable(pool_, 0));
@@ -307,7 +307,7 @@ class RegularHashKernel : public HashKernel {
&unused_memo_index);
},
[this]() {
- if (action_.ShouldEncodeNulls()) {
+ if (action_.ShouldEncodeNulls()) {
auto on_found = [this](int32_t memo_index) {
action_.ObserveNullFound(memo_index);
};
@@ -343,13 +343,13 @@ class RegularHashKernel : public HashKernel {
[this]() {
// Null
Status s = Status::OK();
- auto on_found = [this](int32_t memo_index) {
- action_.ObserveNullFound(memo_index);
- };
- auto on_not_found = [this, &s](int32_t memo_index) {
- action_.ObserveNullNotFound(memo_index, &s);
- };
- if (action_.ShouldEncodeNulls()) {
+ auto on_found = [this](int32_t memo_index) {
+ action_.ObserveNullFound(memo_index);
+ };
+ auto on_not_found = [this, &s](int32_t memo_index) {
+ action_.ObserveNullNotFound(memo_index, &s);
+ };
+ if (action_.ShouldEncodeNulls()) {
memo_table_->GetOrInsertNull(std::move(on_found), std::move(on_not_found));
}
return s;
@@ -368,23 +368,23 @@ class RegularHashKernel : public HashKernel {
// ----------------------------------------------------------------------
// Hash kernel implementation for nulls
-template <typename Action, bool with_error_status = Action::with_error_status>
+template <typename Action, bool with_error_status = Action::with_error_status>
class NullHashKernel : public HashKernel {
public:
- NullHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : pool_(pool), type_(type), action_(type, options, pool) {}
+ NullHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : pool_(pool), type_(type), action_(type, options, pool) {}
Status Reset() override { return action_.Reset(); }
- Status Append(const ArrayData& arr) override { return DoAppend(arr); }
-
- template <bool HasError = with_error_status>
- enable_if_t<!HasError, Status> DoAppend(const ArrayData& arr) {
+ Status Append(const ArrayData& arr) override { return DoAppend(arr); }
+
+ template <bool HasError = with_error_status>
+ enable_if_t<!HasError, Status> DoAppend(const ArrayData& arr) {
RETURN_NOT_OK(action_.Reserve(arr.length));
for (int64_t i = 0; i < arr.length; ++i) {
if (i == 0) {
- seen_null_ = true;
+ seen_null_ = true;
action_.ObserveNullNotFound(0);
} else {
action_.ObserveNullFound(0);
@@ -393,31 +393,31 @@ class NullHashKernel : public HashKernel {
return Status::OK();
}
- template <bool HasError = with_error_status>
- enable_if_t<HasError, Status> DoAppend(const ArrayData& arr) {
- Status s = Status::OK();
- RETURN_NOT_OK(action_.Reserve(arr.length));
- for (int64_t i = 0; i < arr.length; ++i) {
- if (seen_null_ == false && i == 0) {
- seen_null_ = true;
- action_.ObserveNullNotFound(0, &s);
- } else {
- action_.ObserveNullFound(0);
- }
- }
- return s;
- }
-
+ template <bool HasError = with_error_status>
+ enable_if_t<HasError, Status> DoAppend(const ArrayData& arr) {
+ Status s = Status::OK();
+ RETURN_NOT_OK(action_.Reserve(arr.length));
+ for (int64_t i = 0; i < arr.length; ++i) {
+ if (seen_null_ == false && i == 0) {
+ seen_null_ = true;
+ action_.ObserveNullNotFound(0, &s);
+ } else {
+ action_.ObserveNullFound(0);
+ }
+ }
+ return s;
+ }
+
Status Flush(Datum* out) override { return action_.Flush(out); }
Status FlushFinal(Datum* out) override { return action_.FlushFinal(out); }
Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
- std::shared_ptr<NullArray> null_array;
- if (seen_null_) {
- null_array = std::make_shared<NullArray>(1);
- } else {
- null_array = std::make_shared<NullArray>(0);
- }
+ std::shared_ptr<NullArray> null_array;
+ if (seen_null_) {
+ null_array = std::make_shared<NullArray>(1);
+ } else {
+ null_array = std::make_shared<NullArray>(0);
+ }
*out = null_array->data();
return Status::OK();
}
@@ -427,7 +427,7 @@ class NullHashKernel : public HashKernel {
protected:
MemoryPool* pool_;
std::shared_ptr<DataType> type_;
- bool seen_null_ = false;
+ bool seen_null_ = false;
Action action_;
};
@@ -441,33 +441,33 @@ class DictionaryHashKernel : public HashKernel {
Status Reset() override { return indices_kernel_->Reset(); }
- Status Append(const ArrayData& arr) override {
+ Status Append(const ArrayData& arr) override {
if (!dictionary_) {
- dictionary_ = arr.dictionary;
- } else if (!MakeArray(dictionary_)->Equals(*MakeArray(arr.dictionary))) {
- // NOTE: This approach computes a new dictionary unification per chunk.
- // This is in effect O(n*k) where n is the total chunked array length and
- // k is the number of chunks (therefore O(n**2) if chunks have a fixed size).
- //
- // A better approach may be to run the kernel over each individual chunk,
- // and then hash-aggregate all results (for example sum-group-by for
- // the "value_counts" kernel).
- auto out_dict_type = dictionary_->type;
- std::shared_ptr<Buffer> transpose_map;
- std::shared_ptr<Array> out_dict;
- ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type));
-
- ARROW_CHECK_OK(unifier->Unify(*MakeArray(dictionary_)));
- ARROW_CHECK_OK(unifier->Unify(*MakeArray(arr.dictionary), &transpose_map));
- ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict));
-
- this->dictionary_ = out_dict->data();
- auto transpose = reinterpret_cast<const int32_t*>(transpose_map->data());
- auto in_dict_array = MakeArray(std::make_shared<ArrayData>(arr));
- ARROW_ASSIGN_OR_RAISE(
- auto tmp, arrow::internal::checked_cast<const DictionaryArray&>(*in_dict_array)
- .Transpose(arr.type, out_dict, transpose));
- return indices_kernel_->Append(*tmp->data());
+ dictionary_ = arr.dictionary;
+ } else if (!MakeArray(dictionary_)->Equals(*MakeArray(arr.dictionary))) {
+ // NOTE: This approach computes a new dictionary unification per chunk.
+ // This is in effect O(n*k) where n is the total chunked array length and
+ // k is the number of chunks (therefore O(n**2) if chunks have a fixed size).
+ //
+ // A better approach may be to run the kernel over each individual chunk,
+ // and then hash-aggregate all results (for example sum-group-by for
+ // the "value_counts" kernel).
+ auto out_dict_type = dictionary_->type;
+ std::shared_ptr<Buffer> transpose_map;
+ std::shared_ptr<Array> out_dict;
+ ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type));
+
+ ARROW_CHECK_OK(unifier->Unify(*MakeArray(dictionary_)));
+ ARROW_CHECK_OK(unifier->Unify(*MakeArray(arr.dictionary), &transpose_map));
+ ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict));
+
+ this->dictionary_ = out_dict->data();
+ auto transpose = reinterpret_cast<const int32_t*>(transpose_map->data());
+ auto in_dict_array = MakeArray(std::make_shared<ArrayData>(arr));
+ ARROW_ASSIGN_OR_RAISE(
+ auto tmp, arrow::internal::checked_cast<const DictionaryArray&>(*in_dict_array)
+ .Transpose(arr.type, out_dict, transpose));
+ return indices_kernel_->Append(*tmp->data());
}
return indices_kernel_->Append(arr);
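
The NOTE restored in this hunk is worth underlining: unification runs once per chunk whose dictionary differs, so hashing a chunked dictionary array is O(n*k) overall. In isolation, one unification step looks like this sketch (old_dict/new_dict are placeholder names for two dictionary value arrays):

    #include "arrow/api.h"

    arrow::Status UnifyOnce(const std::shared_ptr<arrow::Array>& old_dict,
                            const std::shared_ptr<arrow::Array>& new_dict) {
      ARROW_ASSIGN_OR_RAISE(auto unifier,
                            arrow::DictionaryUnifier::Make(old_dict->type()));
      ARROW_RETURN_NOT_OK(unifier->Unify(*old_dict));
      // The transpose map remaps new_dict's indices into the unified dictionary.
      std::shared_ptr<arrow::Buffer> transpose_map;
      ARROW_RETURN_NOT_OK(unifier->Unify(*new_dict, &transpose_map));
      std::shared_ptr<arrow::DataType> out_type;
      std::shared_ptr<arrow::Array> out_dict;
      ARROW_RETURN_NOT_OK(unifier->GetResult(&out_type, &out_dict));
      return arrow::Status::OK();
    }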
@@ -513,19 +513,19 @@ struct HashKernelTraits<Type, Action, enable_if_has_string_view<Type>> {
};
template <typename Type, typename Action>
-Result<std::unique_ptr<HashKernel>> HashInitImpl(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<HashKernel>> HashInitImpl(KernelContext* ctx,
+ const KernelInitArgs& args) {
using HashKernelType = typename HashKernelTraits<Type, Action>::HashKernel;
- auto result = ::arrow::internal::make_unique<HashKernelType>(
- args.inputs[0].type, args.options, ctx->memory_pool());
- RETURN_NOT_OK(result->Reset());
+ auto result = ::arrow::internal::make_unique<HashKernelType>(
+ args.inputs[0].type, args.options, ctx->memory_pool());
+ RETURN_NOT_OK(result->Reset());
return std::move(result);
}
template <typename Type, typename Action>
-Result<std::unique_ptr<KernelState>> HashInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- return HashInitImpl<Type, Action>(ctx, args);
+Result<std::unique_ptr<KernelState>> HashInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ return HashInitImpl<Type, Action>(ctx, args);
}
template <typename Action>
@@ -564,8 +564,8 @@ KernelInit GetHashInit(Type::type type_id) {
case Type::LARGE_STRING:
return HashInit<LargeBinaryType, Action>;
case Type::FIXED_SIZE_BINARY:
- case Type::DECIMAL128:
- case Type::DECIMAL256:
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
return HashInit<FixedSizeBinaryType, Action>;
default:
DCHECK(false);
@@ -573,13 +573,13 @@ KernelInit GetHashInit(Type::type type_id) {
}
}
-using DictionaryEncodeState = OptionsWrapper<DictionaryEncodeOptions>;
-
+using DictionaryEncodeState = OptionsWrapper<DictionaryEncodeOptions>;
+
template <typename Action>
-Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
const auto& dict_type = checked_cast<const DictionaryType&>(*args.inputs[0].type);
- Result<std::unique_ptr<HashKernel>> indices_hasher;
+ Result<std::unique_ptr<HashKernel>> indices_hasher;
switch (dict_type.index_type()->id()) {
case Type::INT8:
indices_hasher = HashInitImpl<UInt8Type, Action>(ctx, args);
@@ -597,37 +597,37 @@ Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
DCHECK(false) << "Unsupported dictionary index type";
break;
}
- RETURN_NOT_OK(indices_hasher);
- return ::arrow::internal::make_unique<DictionaryHashKernel>(
- std::move(indices_hasher.ValueOrDie()));
+ RETURN_NOT_OK(indices_hasher);
+ return ::arrow::internal::make_unique<DictionaryHashKernel>(
+ std::move(indices_hasher.ValueOrDie()));
}
-Status HashExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status HashExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
- RETURN_NOT_OK(hash_impl->Append(ctx, *batch[0].array()));
- RETURN_NOT_OK(hash_impl->Flush(out));
- return Status::OK();
+ RETURN_NOT_OK(hash_impl->Append(ctx, *batch[0].array()));
+ RETURN_NOT_OK(hash_impl->Flush(out));
+ return Status::OK();
}
-Status UniqueFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+Status UniqueFinalize(KernelContext* ctx, std::vector<Datum>* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
- RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
*out = {Datum(uniques)};
- return Status::OK();
+ return Status::OK();
}
-Status DictEncodeFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+Status DictEncodeFinalize(KernelContext* ctx, std::vector<Datum>* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
- RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
auto dict_type = dictionary(int32(), uniques->type);
auto dict = MakeArray(uniques);
for (size_t i = 0; i < out->size(); ++i) {
(*out)[i] =
std::make_shared<DictionaryArray>(dict_type, (*out)[i].make_array(), dict);
}
- return Status::OK();
+ return Status::OK();
}
std::shared_ptr<ArrayData> BoxValueCounts(const std::shared_ptr<ArrayData>& uniques,
@@ -638,33 +638,33 @@ std::shared_ptr<ArrayData> BoxValueCounts(const std::shared_ptr<ArrayData>& uniq
return std::make_shared<StructArray>(data_type, uniques->length, children)->data();
}
-Status ValueCountsFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+Status ValueCountsFinalize(KernelContext* ctx, std::vector<Datum>* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
Datum value_counts;
- RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
- RETURN_NOT_OK(hash_impl->FlushFinal(&value_counts));
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash_impl->FlushFinal(&value_counts));
*out = {Datum(BoxValueCounts(uniques, value_counts.array()))};
- return Status::OK();
+ return Status::OK();
}
-Status UniqueFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
- RETURN_NOT_OK(UniqueFinalize(ctx, out));
+Status UniqueFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
+ RETURN_NOT_OK(UniqueFinalize(ctx, out));
auto hash = checked_cast<DictionaryHashKernel*>(ctx->state());
(*out)[0].mutable_array()->dictionary = hash->dictionary();
- return Status::OK();
+ return Status::OK();
}
-Status ValueCountsFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
+Status ValueCountsFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
auto hash = checked_cast<DictionaryHashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
Datum value_counts;
- RETURN_NOT_OK(hash->GetDictionary(&uniques));
- RETURN_NOT_OK(hash->FlushFinal(&value_counts));
+ RETURN_NOT_OK(hash->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash->FlushFinal(&value_counts));
uniques->dictionary = hash->dictionary();
*out = {Datum(BoxValueCounts(uniques, value_counts.array()))};
- return Status::OK();
+ return Status::OK();
}
ValueDescr DictEncodeOutput(KernelContext*, const std::vector<ValueDescr>& descrs) {
@@ -693,31 +693,31 @@ void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType out_ty)
DCHECK_OK(func->AddKernel(base));
}
- for (auto t : {Type::DECIMAL128, Type::DECIMAL256}) {
- base.init = GetHashInit<Action>(t);
- base.signature = KernelSignature::Make({InputType::Array(t)}, out_ty);
- DCHECK_OK(func->AddKernel(base));
- }
+ for (auto t : {Type::DECIMAL128, Type::DECIMAL256}) {
+ base.init = GetHashInit<Action>(t);
+ base.signature = KernelSignature::Make({InputType::Array(t)}, out_ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
}
-const FunctionDoc unique_doc(
- "Compute unique elements",
- ("Return an array with distinct values. Nulls in the input are ignored."),
- {"array"});
-
-const FunctionDoc value_counts_doc(
- "Compute counts of unique elements",
- ("For each distinct value, compute the number of times it occurs in the array.\n"
- "The result is returned as an array of `struct<input type, int64>`.\n"
- "Nulls in the input are ignored."),
- {"array"});
-
-const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults();
-const FunctionDoc dictionary_encode_doc(
- "Dictionary-encode array",
- ("Return a dictionary-encoded version of the input array."), {"array"},
- "DictionaryEncodeOptions");
-
+const FunctionDoc unique_doc(
+ "Compute unique elements",
+ ("Return an array with distinct values. Nulls in the input are ignored."),
+ {"array"});
+
+const FunctionDoc value_counts_doc(
+ "Compute counts of unique elements",
+ ("For each distinct value, compute the number of times it occurs in the array.\n"
+ "The result is returned as an array of `struct<input type, int64>`.\n"
+ "Nulls in the input are ignored."),
+ {"array"});
+
+const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults();
+const FunctionDoc dictionary_encode_doc(
+ "Dictionary-encode array",
+ ("Return a dictionary-encoded version of the input array."), {"array"},
+ "DictionaryEncodeOptions");
+
} // namespace
void RegisterVectorHash(FunctionRegistry* registry) {
@@ -729,7 +729,7 @@ void RegisterVectorHash(FunctionRegistry* registry) {
base.finalize = UniqueFinalize;
base.output_chunked = false;
- auto unique = std::make_shared<VectorFunction>("unique", Arity::Unary(), &unique_doc);
+ auto unique = std::make_shared<VectorFunction>("unique", Arity::Unary(), &unique_doc);
AddHashKernels<UniqueAction>(unique.get(), base, OutputType(FirstType));
// Dictionary unique
@@ -745,8 +745,8 @@ void RegisterVectorHash(FunctionRegistry* registry) {
// value_counts
base.finalize = ValueCountsFinalize;
- auto value_counts =
- std::make_shared<VectorFunction>("value_counts", Arity::Unary(), &value_counts_doc);
+ auto value_counts =
+ std::make_shared<VectorFunction>("value_counts", Arity::Unary(), &value_counts_doc);
AddHashKernels<ValueCountsAction>(value_counts.get(), base,
OutputType(ValueCountsOutput));
@@ -765,9 +765,9 @@ void RegisterVectorHash(FunctionRegistry* registry) {
base.finalize = DictEncodeFinalize;
// Unique and ValueCounts output unchunked arrays
base.output_chunked = true;
- auto dict_encode = std::make_shared<VectorFunction>("dictionary_encode", Arity::Unary(),
- &dictionary_encode_doc,
- &kDefaultDictionaryEncodeOptions);
+ auto dict_encode = std::make_shared<VectorFunction>("dictionary_encode", Arity::Unary(),
+ &dictionary_encode_doc,
+ &kDefaultDictionaryEncodeOptions);
AddHashKernels<DictEncodeAction>(dict_encode.get(), base, OutputType(DictEncodeOutput));
// Calling dictionary_encode on dictionary input not supported, but if it
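
After RegisterVectorHash runs, the three hash functions are callable by name through the generic entry point. A caller sketch (assumed user code; `arr` can be any array of a hashable type):

    #include "arrow/compute/api.h"

    arrow::Status HashDemo(const std::shared_ptr<arrow::Array>& arr) {
      // Distinct values, per-value occurrence counts, and a dictionary-encoded form.
      ARROW_ASSIGN_OR_RAISE(auto uniques,
                            arrow::compute::CallFunction("unique", {arr}));
      ARROW_ASSIGN_OR_RAISE(auto counts,
                            arrow::compute::CallFunction("value_counts", {arr}));
      ARROW_ASSIGN_OR_RAISE(auto encoded,
                            arrow::compute::CallFunction("dictionary_encode", {arr}));
      return arrow::Status::OK();
    }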
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
index b84640854ed..68db6ae04cc 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
@@ -27,15 +27,15 @@ namespace internal {
namespace {
template <typename Type>
-Status ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
typename TypeTraits<Type>::ArrayType list_array(batch[0].array());
- ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool()));
- out->value = result->data();
- return Status::OK();
+ ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool()));
+ out->value = result->data();
+ return Status::OK();
}
template <typename Type, typename offset_type = typename Type::offset_type>
-Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
typename TypeTraits<Type>::ArrayType list(batch[0].array());
ArrayData* out_arr = out->mutable_array();
@@ -44,8 +44,8 @@ Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out)
out_arr->length = values_length;
out_arr->null_count = 0;
- ARROW_ASSIGN_OR_RAISE(out_arr->buffers[1],
- ctx->Allocate(values_length * sizeof(offset_type)));
+ ARROW_ASSIGN_OR_RAISE(out_arr->buffers[1],
+ ctx->Allocate(values_length * sizeof(offset_type)));
auto out_indices = reinterpret_cast<offset_type*>(out_arr->buffers[1]->mutable_data());
for (int64_t i = 0; i < list.length(); ++i) {
// Note: In most cases, null slots are empty, but when they are non-empty
@@ -55,7 +55,7 @@ Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out)
*out_indices++ = static_cast<offset_type>(i);
}
}
- return Status::OK();
+ return Status::OK();
}
Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& args) {
@@ -63,33 +63,33 @@ Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& arg
return ValueDescr::Array(list_type.value_type());
}
-const FunctionDoc list_flatten_doc(
- "Flatten list values",
- ("`lists` must have a list-like type.\n"
- "Return an array with the top list level flattened.\n"
- "Top-level null values in `lists` do not emit anything in the input."),
- {"lists"});
-
-const FunctionDoc list_parent_indices_doc(
- "Compute parent indices of nested list values",
- ("`lists` must have a list-like type.\n"
- "For each value in each list of `lists`, the top-level list index\n"
- "is emitted."),
- {"lists"});
-
+const FunctionDoc list_flatten_doc(
+ "Flatten list values",
+ ("`lists` must have a list-like type.\n"
+ "Return an array with the top list level flattened.\n"
+ "Top-level null values in `lists` do not emit anything in the input."),
+ {"lists"});
+
+const FunctionDoc list_parent_indices_doc(
+ "Compute parent indices of nested list values",
+ ("`lists` must have a list-like type.\n"
+ "For each value in each list of `lists`, the top-level list index\n"
+ "is emitted."),
+ {"lists"});
+
} // namespace
void RegisterVectorNested(FunctionRegistry* registry) {
- auto flatten =
- std::make_shared<VectorFunction>("list_flatten", Arity::Unary(), &list_flatten_doc);
+ auto flatten =
+ std::make_shared<VectorFunction>("list_flatten", Arity::Unary(), &list_flatten_doc);
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ValuesType),
ListFlatten<ListType>));
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
OutputType(ValuesType), ListFlatten<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(flatten)));
- auto list_parent_indices = std::make_shared<VectorFunction>(
- "list_parent_indices", Arity::Unary(), &list_parent_indices_doc);
+ auto list_parent_indices = std::make_shared<VectorFunction>(
+ "list_parent_indices", Arity::Unary(), &list_parent_indices_doc);
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)}, int32(),
ListParentIndices<ListType>));
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)}, int64(),
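
The two nested kernels registered above are complementary: for lists = [[1, 2], null, [3]], "list_flatten" yields [1, 2, 3] while "list_parent_indices" yields [0, 0, 2] (the null slot emits nothing in either output). A caller sketch, assuming user code outside this diff:

    #include "arrow/compute/api.h"

    arrow::Status NestedDemo(const std::shared_ptr<arrow::Array>& lists) {
      ARROW_ASSIGN_OR_RAISE(auto flat,
                            arrow::compute::CallFunction("list_flatten", {lists}));
      ARROW_ASSIGN_OR_RAISE(auto parents,
                            arrow::compute::CallFunction("list_parent_indices", {lists}));
      return arrow::Status::OK();
    }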
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
index 644aec2a4e9..d89f7a6bb40 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
@@ -1,540 +1,540 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bitmap_ops.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-
-namespace {
-
-Status ReplacementArrayTooShort(int64_t expected, int64_t actual) {
- return Status::Invalid("Replacement array must be of appropriate length (expected ",
- expected, " items but got ", actual, " items)");
-}
-
-// Helper to implement replace_with kernel with scalar mask for fixed-width types,
-// using callbacks to handle both bool and byte-sized types
-template <typename Functor>
-Status ReplaceWithScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- Datum source = array;
- if (!mask.is_valid) {
- // Output = null
- source = MakeNullScalar(output->type);
- } else if (mask.value) {
- // Output = replacement
- source = replacements;
- }
- uint8_t* out_bitmap = output->buffers[0]->mutable_data();
- uint8_t* out_values = output->buffers[1]->mutable_data();
- const int64_t out_offset = output->offset;
- if (source.is_array()) {
- const ArrayData& in_data = *source.array();
- if (in_data.length < array.length) {
- return ReplacementArrayTooShort(array.length, in_data.length);
- }
- Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
- array.length);
- if (in_data.MayHaveNulls()) {
- arrow::internal::CopyBitmap(in_data.buffers[0]->data(), in_data.offset,
- array.length, out_bitmap, out_offset);
- } else {
- BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
- }
- } else {
- const Scalar& in_data = *source.scalar();
- Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
- array.length);
- BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, in_data.is_valid);
- }
- return Status::OK();
-}
-
-struct CopyArrayBitmap {
- const uint8_t* in_bitmap;
- int64_t in_offset;
-
- void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
- int64_t length) const {
- arrow::internal::CopyBitmap(in_bitmap, in_offset + offset, length, out_bitmap,
- out_offset);
- }
-
- void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
- BitUtil::SetBitTo(out_bitmap, out_offset,
- BitUtil::GetBit(in_bitmap, in_offset + offset));
- }
-};
-
-struct CopyScalarBitmap {
- const bool is_valid;
-
- void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
- int64_t length) const {
- BitUtil::SetBitsTo(out_bitmap, out_offset, length, is_valid);
- }
-
- void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
- BitUtil::SetBitTo(out_bitmap, out_offset, is_valid);
- }
-};
-
-// Helper to implement replace_with kernel with array mask for fixed-width types,
-// using callbacks to handle both bool and byte-sized types and to handle
-// scalar and array replacements
-template <typename Functor, typename Data, typename CopyBitmap>
-void ReplaceWithArrayMaskImpl(const ArrayData& array, const ArrayData& mask,
- const Data& replacements, bool replacements_bitmap,
- const CopyBitmap& copy_bitmap, const uint8_t* mask_bitmap,
- const uint8_t* mask_values, uint8_t* out_bitmap,
- uint8_t* out_values, const int64_t out_offset) {
- Functor::CopyData(*array.type, out_values, /*out_offset=*/0, array, /*in_offset=*/0,
- array.length);
- arrow::internal::OptionalBinaryBitBlockCounter counter(
- mask_values, mask.offset, mask_bitmap, mask.offset, mask.length);
- int64_t write_offset = 0;
- int64_t replacements_offset = 0;
- while (write_offset < array.length) {
- BitBlockCount block = counter.NextAndBlock();
- if (block.AllSet()) {
- // Copy from replacement array
- Functor::CopyData(*array.type, out_values, out_offset + write_offset, replacements,
- replacements_offset, block.length);
- if (replacements_bitmap) {
- copy_bitmap.CopyBitmap(out_bitmap, out_offset + write_offset, replacements_offset,
- block.length);
- } else if (!replacements_bitmap && out_bitmap) {
- BitUtil::SetBitsTo(out_bitmap, out_offset + write_offset, block.length, true);
- }
- replacements_offset += block.length;
- } else if (block.popcount) {
- for (int64_t i = 0; i < block.length; ++i) {
- if (BitUtil::GetBit(mask_values, write_offset + mask.offset + i) &&
- (!mask_bitmap ||
- BitUtil::GetBit(mask_bitmap, write_offset + mask.offset + i))) {
- Functor::CopyData(*array.type, out_values, out_offset + write_offset + i,
- replacements, replacements_offset, /*length=*/1);
- if (replacements_bitmap) {
- copy_bitmap.SetBit(out_bitmap, out_offset + write_offset + i,
- replacements_offset);
- }
- replacements_offset++;
- }
- }
- }
- write_offset += block.length;
- }
-}
-
-template <typename Functor>
-Status ReplaceWithArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- const int64_t out_offset = output->offset;
- uint8_t* out_bitmap = nullptr;
- uint8_t* out_values = output->buffers[1]->mutable_data();
- const uint8_t* mask_bitmap = mask.MayHaveNulls() ? mask.buffers[0]->data() : nullptr;
- const uint8_t* mask_values = mask.buffers[1]->data();
- const bool replacements_bitmap = replacements.is_array()
- ? replacements.array()->MayHaveNulls()
- : !replacements.scalar()->is_valid;
- if (replacements.is_array()) {
- // Check that we have enough replacement values
- const int64_t replacements_length = replacements.array()->length;
-
- BooleanArray mask_arr(mask.length, mask.buffers[1], mask.buffers[0], mask.null_count,
- mask.offset);
- const int64_t count = mask_arr.true_count();
- if (count > replacements_length) {
- return ReplacementArrayTooShort(count, replacements_length);
- }
- }
- if (array.MayHaveNulls() || mask.MayHaveNulls() || replacements_bitmap) {
- out_bitmap = output->buffers[0]->mutable_data();
- output->null_count = -1;
- if (array.MayHaveNulls()) {
- // Copy array's bitmap
- arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset, array.length,
- out_bitmap, out_offset);
- } else {
- // Array has no bitmap but mask/replacements do, generate an all-valid bitmap
- BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
- }
- } else {
- BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), out_offset, array.length,
- true);
- output->null_count = 0;
- }
-
- if (replacements.is_array()) {
- const ArrayData& array_repl = *replacements.array();
- ReplaceWithArrayMaskImpl<Functor>(
- array, mask, array_repl, replacements_bitmap,
- CopyArrayBitmap{replacements_bitmap ? array_repl.buffers[0]->data() : nullptr,
- array_repl.offset},
- mask_bitmap, mask_values, out_bitmap, out_values, out_offset);
- } else {
- const Scalar& scalar_repl = *replacements.scalar();
- ReplaceWithArrayMaskImpl<Functor>(array, mask, scalar_repl, replacements_bitmap,
- CopyScalarBitmap{scalar_repl.is_valid}, mask_bitmap,
- mask_values, out_bitmap, out_values, out_offset);
- }
-
- if (mask.MayHaveNulls()) {
- arrow::internal::BitmapAnd(out_bitmap, out_offset, mask.buffers[0]->data(),
- mask.offset, array.length, out_offset, out_bitmap);
- }
- return Status::OK();
-}
-
-template <typename Type, typename Enable = void>
-struct ReplaceWithMask {};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_number<Type>> {
- using T = typename TypeTraits<Type>::CType;
-
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * sizeof(T));
- std::memcpy(out + (out_offset * sizeof(T)), in_arr, length * sizeof(T));
- }
-
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- T* begin = reinterpret_cast<T*>(out + (out_offset * sizeof(T)));
- T* end = begin + length;
- std::fill(begin, end, UnboxScalar<Type>::Unbox(in));
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_boolean<Type>> {
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const auto in_arr = in.GetValues<uint8_t>(1, /*absolute_offset=*/0);
- arrow::internal::CopyBitmap(in_arr, in_offset + in.offset, length, out, out_offset);
- }
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- BitUtil::SetBitsTo(out, out_offset, length, in.is_valid);
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_same<Type, FixedSizeBinaryType>> {
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
- std::memcpy(begin, in_arr, length * width);
- }
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(in);
-    // A null scalar may have a null value buffer
- if (!scalar.value) return;
- const Buffer& buffer = *scalar.value;
- const uint8_t* value = buffer.data();
- DCHECK_GE(buffer.size(), width);
- for (int i = 0; i < length; i++) {
- std::memcpy(begin, value, width);
- begin += width;
- }
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_decimal<Type>> {
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
- std::memcpy(begin, in_arr, length * width);
- }
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto& scalar = checked_cast<const ScalarType&>(in);
- const auto value = scalar.value.ToBytes();
- for (int i = 0; i < length; i++) {
- std::memcpy(begin, value.data(), width);
- begin += width;
- }
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_null<Type>> {
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- *output = array;
- return Status::OK();
- }
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- *output = array;
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_base_binary<Type>> {
- using offset_type = typename Type::offset_type;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- if (!mask.is_valid) {
- // Output = null
- ARROW_ASSIGN_OR_RAISE(
- auto replacement_array,
- MakeArrayOfNull(array.type, array.length, ctx->memory_pool()));
- *output = *replacement_array->data();
- } else if (mask.value) {
- // Output = replacement
- if (replacements.is_scalar()) {
- ARROW_ASSIGN_OR_RAISE(auto replacement_array,
- MakeArrayFromScalar(*replacements.scalar(), array.length,
- ctx->memory_pool()));
- *output = *replacement_array->data();
- } else {
- const ArrayData& replacement_array = *replacements.array();
- if (replacement_array.length < array.length) {
- return ReplacementArrayTooShort(array.length, replacement_array.length);
- }
- *output = replacement_array;
- output->length = array.length;
- }
- } else {
- // Output = input
- *output = array;
- }
- return Status::OK();
- }
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- BuilderType builder(array.type, ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(array.length));
- RETURN_NOT_OK(builder.ReserveData(array.buffers[2]->size()));
- int64_t source_offset = 0;
- int64_t replacements_offset = 0;
- RETURN_NOT_OK(VisitArrayDataInline<BooleanType>(
- mask,
- [&](bool replace) {
- if (replace && replacements.is_scalar()) {
- const Scalar& scalar = *replacements.scalar();
- if (scalar.is_valid) {
- RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(scalar)));
- } else {
- RETURN_NOT_OK(builder.AppendNull());
- }
- } else {
- const ArrayData& source = replace ? *replacements.array() : array;
- const int64_t offset = replace ? replacements_offset++ : source_offset;
- if (!source.MayHaveNulls() ||
- BitUtil::GetBit(source.buffers[0]->data(), source.offset + offset)) {
- const uint8_t* data = source.buffers[2]->data();
- const offset_type* offsets = source.GetValues<offset_type>(1);
- const offset_type offset0 = offsets[offset];
- const offset_type offset1 = offsets[offset + 1];
- RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
- } else {
- RETURN_NOT_OK(builder.AppendNull());
- }
- }
- source_offset++;
- return Status::OK();
- },
- [&]() {
- RETURN_NOT_OK(builder.AppendNull());
- source_offset++;
- return Status::OK();
- }));
- std::shared_ptr<Array> temp_output;
- RETURN_NOT_OK(builder.Finish(&temp_output));
- *output = *temp_output->data();
- // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
- output->type = array.type;
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct ReplaceWithMaskFunctor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const ArrayData& array = *batch[0].array();
- const Datum& replacements = batch[2];
- ArrayData* output = out->array().get();
- output->length = array.length;
-
- // Needed for FixedSizeBinary/parameterized types
- if (!array.type->Equals(*replacements.type(), /*check_metadata=*/false)) {
- return Status::Invalid("Replacements must be of same type (expected ",
- array.type->ToString(), " but got ",
- replacements.type()->ToString(), ")");
- }
-
- if (!replacements.is_array() && !replacements.is_scalar()) {
- return Status::Invalid("Replacements must be array or scalar");
- }
-
- if (batch[1].is_scalar()) {
- return ReplaceWithMask<Type>::ExecScalarMask(
- ctx, array, batch[1].scalar_as<BooleanScalar>(), replacements, output);
- }
- const ArrayData& mask = *batch[1].array();
- if (array.length != mask.length) {
- return Status::Invalid("Mask must be of same length as array (expected ",
- array.length, " items but got ", mask.length, " items)");
- }
- return ReplaceWithMask<Type>::ExecArrayMask(ctx, array, mask, replacements, output);
- }
-};
-
-} // namespace
-
-const FunctionDoc replace_with_mask_doc(
-    "Replace items using a mask and replacement values",
-    ("Given an array and a Boolean mask (either scalar or of equal length), "
-     "along with replacement values (either scalar or array), "
-     "each element of the array for which the corresponding mask element is "
-     "true is replaced by the next value from the replacements; "
-     "elements for which the mask is null are replaced with null. "
-     "Hence, for replacement arrays, len(replacements) == sum(mask == true)."),
-    {"values", "mask", "replacements"});
-
-void RegisterVectorReplace(FunctionRegistry* registry) {
- auto func = std::make_shared<VectorFunction>("replace_with_mask", Arity::Ternary(),
- &replace_with_mask_doc);
- auto add_kernel = [&](detail::GetTypeId get_id, ArrayKernelExec exec) {
- VectorKernel kernel;
- kernel.can_execute_chunkwise = false;
- if (is_fixed_width(get_id.id)) {
- kernel.null_handling = NullHandling::type::COMPUTED_PREALLOCATE;
- } else {
- kernel.can_write_into_slices = false;
- kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
- }
- kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
- kernel.signature = KernelSignature::Make(
- {InputType::Array(get_id.id), InputType(boolean()), InputType(get_id.id)},
- OutputType(FirstType));
- kernel.exec = std::move(exec);
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- };
- auto add_primitive_kernel = [&](detail::GetTypeId get_id) {
- add_kernel(get_id, GenerateTypeAgnosticPrimitive<ReplaceWithMaskFunctor>(get_id));
- };
- for (const auto& ty : NumericTypes()) {
- add_primitive_kernel(ty);
- }
- for (const auto& ty : TemporalTypes()) {
- add_primitive_kernel(ty);
- }
- add_primitive_kernel(null());
- add_primitive_kernel(boolean());
- add_primitive_kernel(day_time_interval());
- add_primitive_kernel(month_interval());
- add_kernel(Type::FIXED_SIZE_BINARY, ReplaceWithMaskFunctor<FixedSizeBinaryType>::Exec);
- add_kernel(Type::DECIMAL128, ReplaceWithMaskFunctor<Decimal128Type>::Exec);
- add_kernel(Type::DECIMAL256, ReplaceWithMaskFunctor<Decimal256Type>::Exec);
- for (const auto& ty : BaseBinaryTypes()) {
- add_kernel(ty->id(), GenerateTypeAgnosticVarBinaryBase<ReplaceWithMaskFunctor>(*ty));
- }
- // TODO: list types
- DCHECK_OK(registry->AddFunction(std::move(func)));
-
- // TODO(ARROW-9431): "replace_with_indices"
-}
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+Status ReplacementArrayTooShort(int64_t expected, int64_t actual) {
+ return Status::Invalid("Replacement array must be of appropriate length (expected ",
+ expected, " items but got ", actual, " items)");
+}
+
+// Helper to implement the replace_with kernel with a scalar mask for fixed-width
+// types, using callbacks to handle both bool and byte-sized types
+template <typename Functor>
+Status ReplaceWithScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ Datum source = array;
+ if (!mask.is_valid) {
+ // Output = null
+ source = MakeNullScalar(output->type);
+ } else if (mask.value) {
+ // Output = replacement
+ source = replacements;
+ }
+ uint8_t* out_bitmap = output->buffers[0]->mutable_data();
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ const int64_t out_offset = output->offset;
+ if (source.is_array()) {
+ const ArrayData& in_data = *source.array();
+ if (in_data.length < array.length) {
+ return ReplacementArrayTooShort(array.length, in_data.length);
+ }
+ Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
+ array.length);
+ if (in_data.MayHaveNulls()) {
+ arrow::internal::CopyBitmap(in_data.buffers[0]->data(), in_data.offset,
+ array.length, out_bitmap, out_offset);
+ } else {
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
+ }
+ } else {
+ const Scalar& in_data = *source.scalar();
+ Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
+ array.length);
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, in_data.is_valid);
+ }
+ return Status::OK();
+}
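+// A scalar mask reduces the kernel to one bulk copy: true selects the
+// replacements, false selects the input, and a null mask yields an all-null
+// output. E.g. (illustrative values):
+//   values = [1, 2, 3], mask = true,  replacements = [7, 8, 9] -> [7, 8, 9]
+//   values = [1, 2, 3], mask = false                           -> [1, 2, 3]
+//   values = [1, 2, 3], mask = null                            -> [null, null, null]
+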
+
+struct CopyArrayBitmap {
+ const uint8_t* in_bitmap;
+ int64_t in_offset;
+
+ void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
+ int64_t length) const {
+ arrow::internal::CopyBitmap(in_bitmap, in_offset + offset, length, out_bitmap,
+ out_offset);
+ }
+
+ void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
+ BitUtil::SetBitTo(out_bitmap, out_offset,
+ BitUtil::GetBit(in_bitmap, in_offset + offset));
+ }
+};
+
+struct CopyScalarBitmap {
+ const bool is_valid;
+
+ void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
+ int64_t length) const {
+ BitUtil::SetBitsTo(out_bitmap, out_offset, length, is_valid);
+ }
+
+ void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
+ BitUtil::SetBitTo(out_bitmap, out_offset, is_valid);
+ }
+};
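+// These two functors let the array-mask implementation below stay agnostic to
+// where replacement validity comes from: CopyArrayBitmap reads bits out of a
+// replacement array's validity bitmap, while CopyScalarBitmap broadcasts a
+// single scalar validity flag.
+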
+
+// Helper to implement the replace_with kernel with an array mask for fixed-width
+// types, using callbacks to handle both bool and byte-sized types and both
+// scalar and array replacements
+template <typename Functor, typename Data, typename CopyBitmap>
+void ReplaceWithArrayMaskImpl(const ArrayData& array, const ArrayData& mask,
+ const Data& replacements, bool replacements_bitmap,
+ const CopyBitmap& copy_bitmap, const uint8_t* mask_bitmap,
+ const uint8_t* mask_values, uint8_t* out_bitmap,
+ uint8_t* out_values, const int64_t out_offset) {
+ Functor::CopyData(*array.type, out_values, /*out_offset=*/0, array, /*in_offset=*/0,
+ array.length);
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ mask_values, mask.offset, mask_bitmap, mask.offset, mask.length);
+ int64_t write_offset = 0;
+ int64_t replacements_offset = 0;
+ while (write_offset < array.length) {
+ BitBlockCount block = counter.NextAndBlock();
+ if (block.AllSet()) {
+ // Copy from replacement array
+ Functor::CopyData(*array.type, out_values, out_offset + write_offset, replacements,
+ replacements_offset, block.length);
+ if (replacements_bitmap) {
+ copy_bitmap.CopyBitmap(out_bitmap, out_offset + write_offset, replacements_offset,
+ block.length);
+ } else if (!replacements_bitmap && out_bitmap) {
+ BitUtil::SetBitsTo(out_bitmap, out_offset + write_offset, block.length, true);
+ }
+ replacements_offset += block.length;
+ } else if (block.popcount) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(mask_values, write_offset + mask.offset + i) &&
+ (!mask_bitmap ||
+ BitUtil::GetBit(mask_bitmap, write_offset + mask.offset + i))) {
+ Functor::CopyData(*array.type, out_values, out_offset + write_offset + i,
+ replacements, replacements_offset, /*length=*/1);
+ if (replacements_bitmap) {
+ copy_bitmap.SetBit(out_bitmap, out_offset + write_offset + i,
+ replacements_offset);
+ }
+ replacements_offset++;
+ }
+ }
+ }
+ write_offset += block.length;
+ }
+}
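+// The mask is consumed in bit blocks: a block whose bits (mask AND its validity)
+// are all set is copied from the replacements in bulk, while a partially-set
+// block falls back to a per-bit scan. E.g. a mask word of all ones may become a
+// single memcpy, whereas a word like [1,0,1,0,...] is handled element by element.
+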
+
+template <typename Functor>
+Status ReplaceWithArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ const int64_t out_offset = output->offset;
+ uint8_t* out_bitmap = nullptr;
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ const uint8_t* mask_bitmap = mask.MayHaveNulls() ? mask.buffers[0]->data() : nullptr;
+ const uint8_t* mask_values = mask.buffers[1]->data();
+ const bool replacements_bitmap = replacements.is_array()
+ ? replacements.array()->MayHaveNulls()
+ : !replacements.scalar()->is_valid;
+ if (replacements.is_array()) {
+ // Check that we have enough replacement values
+ const int64_t replacements_length = replacements.array()->length;
+
+ BooleanArray mask_arr(mask.length, mask.buffers[1], mask.buffers[0], mask.null_count,
+ mask.offset);
+ const int64_t count = mask_arr.true_count();
+ if (count > replacements_length) {
+ return ReplacementArrayTooShort(count, replacements_length);
+ }
+ }
+ if (array.MayHaveNulls() || mask.MayHaveNulls() || replacements_bitmap) {
+ out_bitmap = output->buffers[0]->mutable_data();
+ output->null_count = -1;
+ if (array.MayHaveNulls()) {
+ // Copy array's bitmap
+ arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset, array.length,
+ out_bitmap, out_offset);
+ } else {
+ // Array has no bitmap but mask/replacements do, generate an all-valid bitmap
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
+ }
+ } else {
+ BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), out_offset, array.length,
+ true);
+ output->null_count = 0;
+ }
+
+ if (replacements.is_array()) {
+ const ArrayData& array_repl = *replacements.array();
+ ReplaceWithArrayMaskImpl<Functor>(
+ array, mask, array_repl, replacements_bitmap,
+ CopyArrayBitmap{replacements_bitmap ? array_repl.buffers[0]->data() : nullptr,
+ array_repl.offset},
+ mask_bitmap, mask_values, out_bitmap, out_values, out_offset);
+ } else {
+ const Scalar& scalar_repl = *replacements.scalar();
+ ReplaceWithArrayMaskImpl<Functor>(array, mask, scalar_repl, replacements_bitmap,
+ CopyScalarBitmap{scalar_repl.is_valid}, mask_bitmap,
+ mask_values, out_bitmap, out_values, out_offset);
+ }
+
+ if (mask.MayHaveNulls()) {
+ arrow::internal::BitmapAnd(out_bitmap, out_offset, mask.buffers[0]->data(),
+ mask.offset, array.length, out_offset, out_bitmap);
+ }
+ return Status::OK();
+}
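+// The trailing BitmapAnd enforces the "null mask slot => null output" rule:
+// wherever the mask itself is null, the output validity bit is cleared no
+// matter what was written to the value buffer.
+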
+
+template <typename Type, typename Enable = void>
+struct ReplaceWithMask {};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_number<Type>> {
+ using T = typename TypeTraits<Type>::CType;
+
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * sizeof(T));
+ std::memcpy(out + (out_offset * sizeof(T)), in_arr, length * sizeof(T));
+ }
+
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ T* begin = reinterpret_cast<T*>(out + (out_offset * sizeof(T)));
+ T* end = begin + length;
+ std::fill(begin, end, UnboxScalar<Type>::Unbox(in));
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
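+// For numeric types the copies above are flat memcpy/std::fill over
+// sizeof(T)-wide slots; e.g. for int32 data, writing one element at logical
+// index i touches 4 bytes starting at byte offset 4 * (out_offset + i).
+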
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_boolean<Type>> {
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const auto in_arr = in.GetValues<uint8_t>(1, /*absolute_offset=*/0);
+ arrow::internal::CopyBitmap(in_arr, in_offset + in.offset, length, out, out_offset);
+ }
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ BitUtil::SetBitsTo(out, out_offset, length, in.is_valid);
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
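+// Booleans are bit-packed, so the boolean specialization copies through
+// CopyBitmap/SetBitsTo instead of memcpy; each output slot is a single bit.
+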
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_same<Type, FixedSizeBinaryType>> {
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
+ std::memcpy(begin, in_arr, length * width);
+ }
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(in);
+    // A null scalar may have a null value buffer
+ if (!scalar.value) return;
+ const Buffer& buffer = *scalar.value;
+ const uint8_t* value = buffer.data();
+ DCHECK_GE(buffer.size(), width);
+ for (int i = 0; i < length; i++) {
+ std::memcpy(begin, value, width);
+ begin += width;
+ }
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
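+// For fixed-size binary the slot width comes from the type: e.g. with
+// byte_width() == 4, replacing one slot copies exactly 4 bytes, and a scalar
+// replacement is tiled across the selected slots.
+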
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_decimal<Type>> {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
+ std::memcpy(begin, in_arr, length * width);
+ }
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto& scalar = checked_cast<const ScalarType&>(in);
+ const auto value = scalar.value.ToBytes();
+ for (int i = 0; i < length; i++) {
+ std::memcpy(begin, value.data(), width);
+ begin += width;
+ }
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_null<Type>> {
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ *output = array;
+ return Status::OK();
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ *output = array;
+ return Status::OK();
+ }
+};
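+// For NullType every element is null by definition, so replacement is a no-op
+// and both execution paths above simply return the input array.
+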
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_base_binary<Type>> {
+ using offset_type = typename Type::offset_type;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ if (!mask.is_valid) {
+ // Output = null
+ ARROW_ASSIGN_OR_RAISE(
+ auto replacement_array,
+ MakeArrayOfNull(array.type, array.length, ctx->memory_pool()));
+ *output = *replacement_array->data();
+ } else if (mask.value) {
+ // Output = replacement
+ if (replacements.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto replacement_array,
+ MakeArrayFromScalar(*replacements.scalar(), array.length,
+ ctx->memory_pool()));
+ *output = *replacement_array->data();
+ } else {
+ const ArrayData& replacement_array = *replacements.array();
+ if (replacement_array.length < array.length) {
+ return ReplacementArrayTooShort(array.length, replacement_array.length);
+ }
+ *output = replacement_array;
+ output->length = array.length;
+ }
+ } else {
+ // Output = input
+ *output = array;
+ }
+ return Status::OK();
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ BuilderType builder(array.type, ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(array.length));
+ RETURN_NOT_OK(builder.ReserveData(array.buffers[2]->size()));
+ int64_t source_offset = 0;
+ int64_t replacements_offset = 0;
+ RETURN_NOT_OK(VisitArrayDataInline<BooleanType>(
+ mask,
+ [&](bool replace) {
+ if (replace && replacements.is_scalar()) {
+ const Scalar& scalar = *replacements.scalar();
+ if (scalar.is_valid) {
+ RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(scalar)));
+ } else {
+ RETURN_NOT_OK(builder.AppendNull());
+ }
+ } else {
+ const ArrayData& source = replace ? *replacements.array() : array;
+ const int64_t offset = replace ? replacements_offset++ : source_offset;
+ if (!source.MayHaveNulls() ||
+ BitUtil::GetBit(source.buffers[0]->data(), source.offset + offset)) {
+ const uint8_t* data = source.buffers[2]->data();
+ const offset_type* offsets = source.GetValues<offset_type>(1);
+ const offset_type offset0 = offsets[offset];
+ const offset_type offset1 = offsets[offset + 1];
+ RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
+ } else {
+ RETURN_NOT_OK(builder.AppendNull());
+ }
+ }
+ source_offset++;
+ return Status::OK();
+ },
+ [&]() {
+ RETURN_NOT_OK(builder.AppendNull());
+ source_offset++;
+ return Status::OK();
+ }));
+ std::shared_ptr<Array> temp_output;
+ RETURN_NOT_OK(builder.Finish(&temp_output));
+ *output = *temp_output->data();
+ // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
+ output->type = array.type;
+ return Status::OK();
+ }
+};
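+// Variable-width data cannot be patched in place, so the array-mask path above
+// rebuilds the output through a builder: it reserves the input's data size as
+// an initial estimate, then appends either the source slot or the next
+// replacement for every mask bit.
+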
+
+template <typename Type>
+struct ReplaceWithMaskFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& array = *batch[0].array();
+ const Datum& replacements = batch[2];
+ ArrayData* output = out->array().get();
+ output->length = array.length;
+
+ // Needed for FixedSizeBinary/parameterized types
+ if (!array.type->Equals(*replacements.type(), /*check_metadata=*/false)) {
+ return Status::Invalid("Replacements must be of same type (expected ",
+ array.type->ToString(), " but got ",
+ replacements.type()->ToString(), ")");
+ }
+
+ if (!replacements.is_array() && !replacements.is_scalar()) {
+ return Status::Invalid("Replacements must be array or scalar");
+ }
+
+ if (batch[1].is_scalar()) {
+ return ReplaceWithMask<Type>::ExecScalarMask(
+ ctx, array, batch[1].scalar_as<BooleanScalar>(), replacements, output);
+ }
+ const ArrayData& mask = *batch[1].array();
+ if (array.length != mask.length) {
+ return Status::Invalid("Mask must be of same length as array (expected ",
+ array.length, " items but got ", mask.length, " items)");
+ }
+ return ReplaceWithMask<Type>::ExecArrayMask(ctx, array, mask, replacements, output);
+ }
+};
+
+} // namespace
+
+const FunctionDoc replace_with_mask_doc(
+    "Replace items using a mask and replacement values",
+    ("Given an array and a Boolean mask (either scalar or of equal length), "
+     "along with replacement values (either scalar or array), "
+     "each element of the array for which the corresponding mask element is "
+     "true is replaced by the next value from the replacements; "
+     "elements for which the mask is null are replaced with null. "
+     "Hence, for replacement arrays, len(replacements) == sum(mask == true)."),
+    {"values", "mask", "replacements"});
+
+void RegisterVectorReplace(FunctionRegistry* registry) {
+ auto func = std::make_shared<VectorFunction>("replace_with_mask", Arity::Ternary(),
+ &replace_with_mask_doc);
+ auto add_kernel = [&](detail::GetTypeId get_id, ArrayKernelExec exec) {
+ VectorKernel kernel;
+ kernel.can_execute_chunkwise = false;
+ if (is_fixed_width(get_id.id)) {
+ kernel.null_handling = NullHandling::type::COMPUTED_PREALLOCATE;
+ } else {
+ kernel.can_write_into_slices = false;
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ }
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ kernel.signature = KernelSignature::Make(
+ {InputType::Array(get_id.id), InputType(boolean()), InputType(get_id.id)},
+ OutputType(FirstType));
+ kernel.exec = std::move(exec);
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ };
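+  // Fixed-width kernels can write into a preallocated, sliced output, hence
+  // COMPUTED_PREALLOCATE; var-binary kernels build their own buffers, so they
+  // disable slice writing and use COMPUTED_NO_PREALLOCATE.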
+ auto add_primitive_kernel = [&](detail::GetTypeId get_id) {
+ add_kernel(get_id, GenerateTypeAgnosticPrimitive<ReplaceWithMaskFunctor>(get_id));
+ };
+ for (const auto& ty : NumericTypes()) {
+ add_primitive_kernel(ty);
+ }
+ for (const auto& ty : TemporalTypes()) {
+ add_primitive_kernel(ty);
+ }
+ add_primitive_kernel(null());
+ add_primitive_kernel(boolean());
+ add_primitive_kernel(day_time_interval());
+ add_primitive_kernel(month_interval());
+ add_kernel(Type::FIXED_SIZE_BINARY, ReplaceWithMaskFunctor<FixedSizeBinaryType>::Exec);
+ add_kernel(Type::DECIMAL128, ReplaceWithMaskFunctor<Decimal128Type>::Exec);
+ add_kernel(Type::DECIMAL256, ReplaceWithMaskFunctor<Decimal256Type>::Exec);
+ for (const auto& ty : BaseBinaryTypes()) {
+ add_kernel(ty->id(), GenerateTypeAgnosticVarBinaryBase<ReplaceWithMaskFunctor>(*ty));
+ }
+ // TODO: list types
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // TODO(ARROW-9431): "replace_with_indices"
+}
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
index 5845a7ee2d0..b70dadbd146 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
@@ -36,7 +36,7 @@
#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/bitmap_reader.h"
@@ -87,8 +87,8 @@ int64_t GetFilterOutputSize(const ArrayData& filter,
return output_size;
}
-namespace {
-
+namespace {
+
template <typename IndexType>
Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
@@ -96,130 +96,130 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
using T = typename IndexType::c_type;
const uint8_t* filter_data = filter.buffers[1]->data();
- const bool have_filter_nulls = filter.MayHaveNulls();
- const uint8_t* filter_is_valid =
- have_filter_nulls ? filter.buffers[0]->data() : nullptr;
-
- if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) {
- // Most complex case: the filter may have nulls and we don't drop them.
- // The logic is ternary:
- // - filter is null: emit null
- // - filter is valid and true: emit index
- // - filter is valid and false: don't emit anything
-
- typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
-
- // The position relative to the start of the filter
- T position = 0;
- // The current position taking the filter offset into account
- int64_t position_with_offset = filter.offset;
-
- // To count blocks where filter_data[i] || !filter_is_valid[i]
+ const bool have_filter_nulls = filter.MayHaveNulls();
+ const uint8_t* filter_is_valid =
+ have_filter_nulls ? filter.buffers[0]->data() : nullptr;
+
+ if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) {
+ // Most complex case: the filter may have nulls and we don't drop them.
+ // The logic is ternary:
+ // - filter is null: emit null
+ // - filter is valid and true: emit index
+ // - filter is valid and false: don't emit anything
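+    //
+    // E.g. filter = [true, null, false] with EMIT_NULL yields indices [0, null].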
+
+ typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
+
+ // The position relative to the start of the filter
+ T position = 0;
+ // The current position taking the filter offset into account
+ int64_t position_with_offset = filter.offset;
+
+ // To count blocks where filter_data[i] || !filter_is_valid[i]
BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
filter.offset, filter.length);
- BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, filter.length);
- while (position < filter.length) {
- // true OR NOT valid
- BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord();
- if (selected_or_null_block.NoneSet()) {
- position += selected_or_null_block.length;
- position_with_offset += selected_or_null_block.length;
- continue;
+ BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, filter.length);
+ while (position < filter.length) {
+ // true OR NOT valid
+ BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord();
+ if (selected_or_null_block.NoneSet()) {
+ position += selected_or_null_block.length;
+ position_with_offset += selected_or_null_block.length;
+ continue;
}
- RETURN_NOT_OK(builder.Reserve(selected_or_null_block.popcount));
-
- // If the values are all valid and the selected_or_null_block is full,
- // then we can infer that all the values are true and skip the bit checking
- BitBlockCount is_valid_block = is_valid_counter.NextWord();
-
- if (selected_or_null_block.AllSet() && is_valid_block.AllSet()) {
- // All the values are selected and non-null
- for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
- builder.UnsafeAppend(position++);
- }
- position_with_offset += selected_or_null_block.length;
- } else {
- // Some of the values are false or null
- for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
- if (BitUtil::GetBit(filter_is_valid, position_with_offset)) {
- if (BitUtil::GetBit(filter_data, position_with_offset)) {
- builder.UnsafeAppend(position);
+ RETURN_NOT_OK(builder.Reserve(selected_or_null_block.popcount));
+
+ // If the values are all valid and the selected_or_null_block is full,
+ // then we can infer that all the values are true and skip the bit checking
+ BitBlockCount is_valid_block = is_valid_counter.NextWord();
+
+ if (selected_or_null_block.AllSet() && is_valid_block.AllSet()) {
+ // All the values are selected and non-null
+ for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
+ builder.UnsafeAppend(position++);
+ }
+ position_with_offset += selected_or_null_block.length;
+ } else {
+ // Some of the values are false or null
+ for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid, position_with_offset)) {
+ if (BitUtil::GetBit(filter_data, position_with_offset)) {
+ builder.UnsafeAppend(position);
}
- } else {
- // Null slot, so append a null
- builder.UnsafeAppendNull();
+ } else {
+ // Null slot, so append a null
+ builder.UnsafeAppendNull();
}
- ++position;
- ++position_with_offset;
+ ++position;
+ ++position_with_offset;
}
}
}
- std::shared_ptr<ArrayData> result;
- RETURN_NOT_OK(builder.FinishInternal(&result));
- return result;
- }
-
- // Other cases don't emit nulls and are therefore simpler.
- TypedBufferBuilder<T> builder(memory_pool);
-
- if (have_filter_nulls) {
- // The filter may have nulls, so we scan the validity bitmap and the filter
- // data bitmap together.
- DCHECK_EQ(null_selection, FilterOptions::DROP);
-
- // The position relative to the start of the filter
- T position = 0;
- // The current position taking the filter offset into account
- int64_t position_with_offset = filter.offset;
-
- BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
- filter.offset, filter.length);
- while (position < filter.length) {
- BitBlockCount and_block = filter_counter.NextAndWord();
- RETURN_NOT_OK(builder.Reserve(and_block.popcount));
- if (and_block.AllSet()) {
- // All the values are selected and non-null
- for (int64_t i = 0; i < and_block.length; ++i) {
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(builder.FinishInternal(&result));
+ return result;
+ }
+
+ // Other cases don't emit nulls and are therefore simpler.
+ TypedBufferBuilder<T> builder(memory_pool);
+
+ if (have_filter_nulls) {
+ // The filter may have nulls, so we scan the validity bitmap and the filter
+ // data bitmap together.
+ DCHECK_EQ(null_selection, FilterOptions::DROP);
+
+ // The position relative to the start of the filter
+ T position = 0;
+ // The current position taking the filter offset into account
+ int64_t position_with_offset = filter.offset;
+
+ BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
+ filter.offset, filter.length);
+ while (position < filter.length) {
+ BitBlockCount and_block = filter_counter.NextAndWord();
+ RETURN_NOT_OK(builder.Reserve(and_block.popcount));
+ if (and_block.AllSet()) {
+ // All the values are selected and non-null
+ for (int64_t i = 0; i < and_block.length; ++i) {
builder.UnsafeAppend(position++);
}
- position_with_offset += and_block.length;
- } else if (!and_block.NoneSet()) {
- // Some of the values are false or null
- for (int64_t i = 0; i < and_block.length; ++i) {
- if (BitUtil::GetBit(filter_is_valid, position_with_offset) &&
- BitUtil::GetBit(filter_data, position_with_offset)) {
+ position_with_offset += and_block.length;
+ } else if (!and_block.NoneSet()) {
+ // Some of the values are false or null
+ for (int64_t i = 0; i < and_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid, position_with_offset) &&
+ BitUtil::GetBit(filter_data, position_with_offset)) {
builder.UnsafeAppend(position);
}
++position;
++position_with_offset;
}
} else {
- position += and_block.length;
- position_with_offset += and_block.length;
+ position += and_block.length;
+ position_with_offset += and_block.length;
}
}
- } else {
- // The filter has no nulls, so we need only look for true values
- RETURN_NOT_OK(::arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t offset, int64_t length) {
- // Append the consecutive run of indices
- RETURN_NOT_OK(builder.Reserve(length));
- for (int64_t i = 0; i < length; ++i) {
- builder.UnsafeAppend(static_cast<T>(offset + i));
- }
- return Status::OK();
- }));
+ } else {
+ // The filter has no nulls, so we need only look for true values
+ RETURN_NOT_OK(::arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, [&](int64_t offset, int64_t length) {
+ // Append the consecutive run of indices
+ RETURN_NOT_OK(builder.Reserve(length));
+ for (int64_t i = 0; i < length; ++i) {
+ builder.UnsafeAppend(static_cast<T>(offset + i));
+ }
+ return Status::OK();
+ }));
}
-
- const int64_t length = builder.length();
- std::shared_ptr<Buffer> out_buffer;
- RETURN_NOT_OK(builder.Finish(&out_buffer));
- return std::make_shared<ArrayData>(TypeTraits<IndexType>::type_singleton(), length,
- BufferVector{nullptr, out_buffer}, /*null_count=*/0);
+
+ const int64_t length = builder.length();
+ std::shared_ptr<Buffer> out_buffer;
+ RETURN_NOT_OK(builder.Finish(&out_buffer));
+ return std::make_shared<ArrayData>(TypeTraits<IndexType>::type_singleton(), length,
+ BufferVector{nullptr, out_buffer}, /*null_count=*/0);
}
-} // namespace
-
+} // namespace
+
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
MemoryPool* memory_pool) {
@@ -490,9 +490,9 @@ void TakeIndexDispatch(const PrimitiveArg& values, const PrimitiveArg& indices,
}
}
-Status PrimitiveTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status PrimitiveTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (TakeState::Get(ctx).boundscheck) {
- RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
}
PrimitiveArg values = GetPrimitiveArg(*batch[0].array());
@@ -504,29 +504,29 @@ Status PrimitiveTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// allocating the validity bitmap altogether and save time and space. A
// streamlined PrimitiveTakeImpl would need to be written that skips all
// interactions with the output validity bitmap, though.
- RETURN_NOT_OK(PreallocateData(ctx, indices.length, values.bit_width,
- /*allocate_validity=*/true, out_arr));
+ RETURN_NOT_OK(PreallocateData(ctx, indices.length, values.bit_width,
+ /*allocate_validity=*/true, out_arr));
switch (values.bit_width) {
case 1:
- TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
+ break;
case 8:
- TakeIndexDispatch<PrimitiveTakeImpl, int8_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int8_t>(values, indices, out_arr);
+ break;
case 16:
- TakeIndexDispatch<PrimitiveTakeImpl, int16_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int16_t>(values, indices, out_arr);
+ break;
case 32:
- TakeIndexDispatch<PrimitiveTakeImpl, int32_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int32_t>(values, indices, out_arr);
+ break;
case 64:
- TakeIndexDispatch<PrimitiveTakeImpl, int64_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int64_t>(values, indices, out_arr);
+ break;
default:
DCHECK(false) << "Invalid values byte width";
break;
}
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -597,9 +597,9 @@ class PrimitiveFilterImpl {
void ExecNonNull() {
// Fast filter when values and filter are not null
- ::arrow::internal::VisitSetBitRunsVoid(
- filter_data_, filter_offset_, values_length_,
- [&](int64_t position, int64_t length) { WriteValueSegment(position, length); });
+ ::arrow::internal::VisitSetBitRunsVoid(
+ filter_data_, filter_offset_, values_length_,
+ [&](int64_t position, int64_t length) { WriteValueSegment(position, length); });
}
void Exec() {
@@ -783,7 +783,7 @@ inline void PrimitiveFilterImpl<BooleanType>::WriteNull() {
BitUtil::ClearBit(out_data_, out_offset_ + out_position_++);
}
-Status PrimitiveFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status PrimitiveFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
PrimitiveArg values = GetPrimitiveArg(*batch[0].array());
PrimitiveArg filter = GetPrimitiveArg(*batch[1].array());
FilterOptions::NullSelectionBehavior null_selection =
@@ -808,30 +808,30 @@ Status PrimitiveFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// validity bitmap.
bool allocate_validity = values.null_count != 0 || filter.null_count != 0;
- RETURN_NOT_OK(
- PreallocateData(ctx, output_length, values.bit_width, allocate_validity, out_arr));
+ RETURN_NOT_OK(
+ PreallocateData(ctx, output_length, values.bit_width, allocate_validity, out_arr));
switch (values.bit_width) {
case 1:
- PrimitiveFilterImpl<BooleanType>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<BooleanType>(values, filter, null_selection, out_arr).Exec();
+ break;
case 8:
- PrimitiveFilterImpl<UInt8Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt8Type>(values, filter, null_selection, out_arr).Exec();
+ break;
case 16:
- PrimitiveFilterImpl<UInt16Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt16Type>(values, filter, null_selection, out_arr).Exec();
+ break;
case 32:
- PrimitiveFilterImpl<UInt32Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt32Type>(values, filter, null_selection, out_arr).Exec();
+ break;
case 64:
- PrimitiveFilterImpl<UInt64Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt64Type>(values, filter, null_selection, out_arr).Exec();
+ break;
default:
DCHECK(false) << "Invalid values bit width";
break;
}
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -880,25 +880,25 @@ Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArrayData& values,
ArrayData* out) {
using offset_type = typename Type::offset_type;
const auto filter_data = filter.buffers[1]->data();
-
+
BINARY_FILTER_SETUP_COMMON();
- RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t position, int64_t length) {
+ RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, [&](int64_t position, int64_t length) {
// Bulk-append raw data
- const offset_type run_data_bytes =
- (raw_offsets[position + length] - raw_offsets[position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
+ const offset_type run_data_bytes =
+ (raw_offsets[position + length] - raw_offsets[position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
// Append offsets
- offset_type cur_offset = raw_offsets[position];
- for (int64_t i = 0; i < length; ++i) {
+ offset_type cur_offset = raw_offsets[position];
+ for (int64_t i = 0; i < length; ++i) {
offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[i + position + 1] - cur_offset;
- cur_offset = raw_offsets[i + position + 1];
+ offset += raw_offsets[i + position + 1] - cur_offset;
+ cur_offset = raw_offsets[i + position + 1];
}
- return Status::OK();
- }));
-
+ return Status::OK();
+ }));
+
offset_builder.UnsafeAppend(offset);
out->length = output_length;
RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
@@ -936,8 +936,8 @@ Status BinaryFilterImpl(KernelContext* ctx, const ArrayData& values,
BINARY_FILTER_SETUP_COMMON();
- int64_t in_position = 0;
- int64_t out_position = 0;
+ int64_t in_position = 0;
+ int64_t out_position = 0;
while (in_position < filter.length) {
BitBlockCount filter_valid_block = filter_valid_counter.NextWord();
BitBlockCount values_valid_block = values_valid_counter.NextWord();
@@ -1079,7 +1079,7 @@ Status BinaryFilterImpl(KernelContext* ctx, const ArrayData& values,
#undef APPEND_RAW_DATA
#undef APPEND_SINGLE_VALUE
-Status BinaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status BinaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
FilterOptions::NullSelectionBehavior null_selection =
FilterState::Get(ctx).null_selection_behavior;
@@ -1101,100 +1101,100 @@ Status BinaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (values.null_count == 0 && filter.null_count == 0) {
// Faster no-nulls case
if (is_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterNonNullImpl<BinaryType>(
- ctx, values, filter, output_length, null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterNonNullImpl<BinaryType>(
+ ctx, values, filter, output_length, null_selection, out_arr));
} else if (is_large_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterNonNullImpl<LargeBinaryType>(
- ctx, values, filter, output_length, null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterNonNullImpl<LargeBinaryType>(
+ ctx, values, filter, output_length, null_selection, out_arr));
} else {
DCHECK(false);
}
} else {
// Output may have nulls
- RETURN_NOT_OK(ctx->AllocateBitmap(output_length).Value(&out_arr->buffers[0]));
+ RETURN_NOT_OK(ctx->AllocateBitmap(output_length).Value(&out_arr->buffers[0]));
if (is_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterImpl<BinaryType>(ctx, values, filter, output_length,
- null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterImpl<BinaryType>(ctx, values, filter, output_length,
+ null_selection, out_arr));
} else if (is_large_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterImpl<LargeBinaryType>(ctx, values, filter, output_length,
- null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterImpl<LargeBinaryType>(ctx, values, filter, output_length,
+ null_selection, out_arr));
} else {
DCHECK(false);
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
// ----------------------------------------------------------------------
// Null take and filter
-Status NullTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status NullTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (TakeState::Get(ctx).boundscheck) {
- RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
}
// batch.length doesn't take into account the take indices
auto new_length = batch[1].array()->length;
out->value = std::make_shared<NullArray>(new_length)->data();
- return Status::OK();
+ return Status::OK();
}
-Status NullFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status NullFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
int64_t output_length = GetFilterOutputSize(
*batch[1].array(), FilterState::Get(ctx).null_selection_behavior);
out->value = std::make_shared<NullArray>(output_length)->data();
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
// Dictionary take and filter
-Status DictionaryTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status DictionaryTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DictionaryArray values(batch[0].array());
Datum result;
- RETURN_NOT_OK(
- Take(Datum(values.indices()), batch[1], TakeState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(
+ Take(Datum(values.indices()), batch[1], TakeState::Get(ctx), ctx->exec_context())
+ .Value(&result));
DictionaryArray taken_values(values.type(), result.make_array(), values.dictionary());
out->value = taken_values.data();
- return Status::OK();
+ return Status::OK();
}
-Status DictionaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status DictionaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DictionaryArray dict_values(batch[0].array());
Datum result;
- RETURN_NOT_OK(Filter(Datum(dict_values.indices()), batch[1].array(),
- FilterState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(Filter(Datum(dict_values.indices()), batch[1].array(),
+ FilterState::Get(ctx), ctx->exec_context())
+ .Value(&result));
DictionaryArray filtered_values(dict_values.type(), result.make_array(),
dict_values.dictionary());
out->value = filtered_values.data();
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
// Extension take and filter
-Status ExtensionTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ExtensionTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ExtensionArray values(batch[0].array());
Datum result;
- RETURN_NOT_OK(
- Take(Datum(values.storage()), batch[1], TakeState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(
+ Take(Datum(values.storage()), batch[1], TakeState::Get(ctx), ctx->exec_context())
+ .Value(&result));
ExtensionArray taken_values(values.type(), result.make_array());
out->value = taken_values.data();
- return Status::OK();
+ return Status::OK();
}
-Status ExtensionFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ExtensionFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ExtensionArray ext_values(batch[0].array());
Datum result;
- RETURN_NOT_OK(Filter(Datum(ext_values.storage()), batch[1].array(),
- FilterState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(Filter(Datum(ext_values.storage()), batch[1].array(),
+ FilterState::Get(ctx), ctx->exec_context())
+ .Value(&result));
ExtensionArray filtered_values(ext_values.type(), result.make_array());
out->value = filtered_values.data();
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -1668,81 +1668,81 @@ struct ListImpl : public Selection<ListImpl<Type>, Type> {
}
};
-struct DenseUnionImpl : public Selection<DenseUnionImpl, DenseUnionType> {
- using Base = Selection<DenseUnionImpl, DenseUnionType>;
- LIFT_BASE_MEMBERS();
-
- TypedBufferBuilder<int32_t> value_offset_buffer_builder_;
- TypedBufferBuilder<int8_t> child_id_buffer_builder_;
- std::vector<int8_t> type_codes_;
- std::vector<Int32Builder> child_indices_builders_;
-
- DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length,
- Datum* out)
- : Base(ctx, batch, output_length, out),
- value_offset_buffer_builder_(ctx->memory_pool()),
- child_id_buffer_builder_(ctx->memory_pool()),
- type_codes_(checked_cast<const UnionType&>(*this->values->type).type_codes()),
- child_indices_builders_(type_codes_.size()) {
- for (auto& child_indices_builder : child_indices_builders_) {
- child_indices_builder = Int32Builder(ctx->memory_pool());
- }
- }
-
- template <typename Adapter>
- Status GenerateOutput() {
- DenseUnionArray typed_values(this->values);
- Adapter adapter(this);
- RETURN_NOT_OK(adapter.Generate(
- [&](int64_t index) {
- int8_t child_id = typed_values.child_id(index);
- child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
- int32_t value_offset = typed_values.value_offset(index);
- value_offset_buffer_builder_.UnsafeAppend(
- static_cast<int32_t>(child_indices_builders_[child_id].length()));
- RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
- child_indices_builders_[child_id].UnsafeAppend(value_offset);
- return Status::OK();
- },
- [&]() {
- int8_t child_id = 0;
- child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
- value_offset_buffer_builder_.UnsafeAppend(
- static_cast<int32_t>(child_indices_builders_[child_id].length()));
- RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
- child_indices_builders_[child_id].UnsafeAppendNull();
- return Status::OK();
- }));
- return Status::OK();
- }
-
- Status Init() override {
- RETURN_NOT_OK(child_id_buffer_builder_.Reserve(output_length));
- RETURN_NOT_OK(value_offset_buffer_builder_.Reserve(output_length));
- return Status::OK();
- }
-
- Status Finish() override {
- ARROW_ASSIGN_OR_RAISE(auto child_ids_buffer, child_id_buffer_builder_.Finish());
- ARROW_ASSIGN_OR_RAISE(auto value_offsets_buffer,
- value_offset_buffer_builder_.Finish());
- DenseUnionArray typed_values(this->values);
- auto num_fields = typed_values.num_fields();
- auto num_rows = child_ids_buffer->size();
- BufferVector buffers{nullptr, std::move(child_ids_buffer),
- std::move(value_offsets_buffer)};
- *out = ArrayData(typed_values.type(), num_rows, std::move(buffers), 0);
- for (auto i = 0; i < num_fields; i++) {
- ARROW_ASSIGN_OR_RAISE(auto child_indices_array,
- child_indices_builders_[i].Finish());
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> child_array,
- Take(*typed_values.field(i), *child_indices_array));
- out->child_data.push_back(child_array->data());
- }
- return Status::OK();
- }
-};
-
+struct DenseUnionImpl : public Selection<DenseUnionImpl, DenseUnionType> {
+ using Base = Selection<DenseUnionImpl, DenseUnionType>;
+ LIFT_BASE_MEMBERS();
+
+ TypedBufferBuilder<int32_t> value_offset_buffer_builder_;
+ TypedBufferBuilder<int8_t> child_id_buffer_builder_;
+ std::vector<int8_t> type_codes_;
+ std::vector<Int32Builder> child_indices_builders_;
+
+ DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length,
+ Datum* out)
+ : Base(ctx, batch, output_length, out),
+ value_offset_buffer_builder_(ctx->memory_pool()),
+ child_id_buffer_builder_(ctx->memory_pool()),
+ type_codes_(checked_cast<const UnionType&>(*this->values->type).type_codes()),
+ child_indices_builders_(type_codes_.size()) {
+ for (auto& child_indices_builder : child_indices_builders_) {
+ child_indices_builder = Int32Builder(ctx->memory_pool());
+ }
+ }
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ DenseUnionArray typed_values(this->values);
+ Adapter adapter(this);
+ RETURN_NOT_OK(adapter.Generate(
+ [&](int64_t index) {
+ int8_t child_id = typed_values.child_id(index);
+ child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
+ int32_t value_offset = typed_values.value_offset(index);
+ value_offset_buffer_builder_.UnsafeAppend(
+ static_cast<int32_t>(child_indices_builders_[child_id].length()));
+ RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
+ child_indices_builders_[child_id].UnsafeAppend(value_offset);
+ return Status::OK();
+ },
+ [&]() {
+ int8_t child_id = 0;
+ child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
+ value_offset_buffer_builder_.UnsafeAppend(
+ static_cast<int32_t>(child_indices_builders_[child_id].length()));
+ RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
+ child_indices_builders_[child_id].UnsafeAppendNull();
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ Status Init() override {
+ RETURN_NOT_OK(child_id_buffer_builder_.Reserve(output_length));
+ RETURN_NOT_OK(value_offset_buffer_builder_.Reserve(output_length));
+ return Status::OK();
+ }
+
+ Status Finish() override {
+ ARROW_ASSIGN_OR_RAISE(auto child_ids_buffer, child_id_buffer_builder_.Finish());
+ ARROW_ASSIGN_OR_RAISE(auto value_offsets_buffer,
+ value_offset_buffer_builder_.Finish());
+ DenseUnionArray typed_values(this->values);
+ auto num_fields = typed_values.num_fields();
+ auto num_rows = child_ids_buffer->size();
+ BufferVector buffers{nullptr, std::move(child_ids_buffer),
+ std::move(value_offsets_buffer)};
+ *out = ArrayData(typed_values.type(), num_rows, std::move(buffers), 0);
+ for (auto i = 0; i < num_fields; i++) {
+ ARROW_ASSIGN_OR_RAISE(auto child_indices_array,
+ child_indices_builders_[i].Finish());
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> child_array,
+ Take(*typed_values.field(i), *child_indices_array));
+ out->child_data.push_back(child_array->data());
+ }
+ return Status::OK();
+ }
+};
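
As a standalone sketch of the row remapping this kernel performs, with
std::vector standing in for Arrow's builders (all names hypothetical, not
Arrow API): each selected row keeps its child id, its new value offset is the
current length of that child's index list, and the original offset is recorded
so the child array can later be gathered with Take.

    #include <cstdint>
    #include <vector>

    // Hypothetical sketch: remap one selected dense-union row into
    // (child id, new value offset, per-child take index).
    struct DenseUnionTakeSketch {
      std::vector<int8_t> child_ids;       // input: child id of each row
      std::vector<int32_t> value_offsets;  // input: offset into that child

      std::vector<int8_t> out_child_ids;
      std::vector<int32_t> out_value_offsets;
      std::vector<std::vector<int32_t>> child_indices;  // per-child take indices

      explicit DenseUnionTakeSketch(size_t num_children)
          : child_indices(num_children) {}

      void TakeRow(int64_t index) {
        const int8_t child = child_ids[index];
        out_child_ids.push_back(child);  // the real kernel emits type_codes_[child]
        out_value_offsets.push_back(
            static_cast<int32_t>(child_indices[child].size()));
        child_indices[child].push_back(value_offsets[index]);
      }
    };
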
+
struct FSLImpl : public Selection<FSLImpl, FixedSizeListType> {
Int64Builder child_index_builder;
@@ -1827,20 +1827,20 @@ struct StructImpl : public Selection<StructImpl, StructType> {
}
};
-Status StructFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status StructFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Transform filter to selection indices and then use Take.
std::shared_ptr<ArrayData> indices;
- RETURN_NOT_OK(GetTakeIndices(*batch[1].array(),
- FilterState::Get(ctx).null_selection_behavior,
- ctx->memory_pool())
- .Value(&indices));
+ RETURN_NOT_OK(GetTakeIndices(*batch[1].array(),
+ FilterState::Get(ctx).null_selection_behavior,
+ ctx->memory_pool())
+ .Value(&indices));
Datum result;
- RETURN_NOT_OK(
- Take(batch[0], Datum(indices), TakeOptions::NoBoundsCheck(), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(
+ Take(batch[0], Datum(indices), TakeOptions::NoBoundsCheck(), ctx->exec_context())
+ .Value(&result));
out->value = result.array();
- return Status::OK();
+ return Status::OK();
}
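
The "transform filter to selection indices and then use Take" step above,
reduced to a minimal sketch with hypothetical names and std::vector in place
of Arrow buffers (null handling omitted):

    #include <cstdint>
    #include <vector>

    // Emit the position of every set bit in a boolean selection mask.
    std::vector<int64_t> MaskToIndices(const std::vector<bool>& mask) {
      std::vector<int64_t> indices;
      for (int64_t i = 0; i < static_cast<int64_t>(mask.size()); ++i) {
        if (mask[i]) indices.push_back(i);
      }
      return indices;
    }
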
#undef LIFT_BASE_MEMBERS
@@ -1860,15 +1860,15 @@ Result<std::shared_ptr<RecordBatch>> FilterRecordBatch(const RecordBatch& batch,
const auto& filter_opts = *static_cast<const FilterOptions*>(options);
ARROW_ASSIGN_OR_RAISE(
std::shared_ptr<ArrayData> indices,
- GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior,
- ctx->memory_pool()));
+ GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior,
+ ctx->memory_pool()));
std::vector<std::shared_ptr<Array>> columns(batch.num_columns());
for (int i = 0; i < batch.num_columns(); ++i) {
ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices),
TakeOptions::NoBoundsCheck(), ctx));
columns[i] = out.make_array();
}
- return RecordBatch::Make(batch.schema(), indices->length, std::move(columns));
+ return RecordBatch::Make(batch.schema(), indices->length, std::move(columns));
}
Result<std::shared_ptr<Table>> FilterTable(const Table& table, const Datum& filter,
@@ -1877,82 +1877,82 @@ Result<std::shared_ptr<Table>> FilterTable(const Table& table, const Datum& filt
if (table.num_rows() != filter.length()) {
return Status::Invalid("Filter inputs must all be the same length");
}
- if (table.num_rows() == 0) {
- return Table::Make(table.schema(), table.columns(), 0);
- }
-
- // Last input element will be the filter array
- const int num_columns = table.num_columns();
- std::vector<ArrayVector> inputs(num_columns + 1);
-
- // Fetch table columns
- for (int i = 0; i < num_columns; ++i) {
- inputs[i] = table.column(i)->chunks();
- }
- // Fetch filter
- const auto& filter_opts = *static_cast<const FilterOptions*>(options);
- switch (filter.kind()) {
- case Datum::ARRAY:
- inputs.back().push_back(filter.make_array());
- break;
- case Datum::CHUNKED_ARRAY:
- inputs.back() = filter.chunked_array()->chunks();
- break;
- default:
- return Status::NotImplemented("Filter should be array-like");
- }
-
- // Rechunk inputs to allow consistent iteration over their respective chunks
- inputs = arrow::internal::RechunkArraysConsistently(inputs);
-
- // Instead of filtering each column with the boolean filter
- // (which would be slow if the table has a large number of columns: ARROW-10569),
- // convert each filter chunk to indices, and take() the column.
- const int64_t num_chunks = static_cast<int64_t>(inputs.back().size());
- std::vector<ArrayVector> out_columns(num_columns);
- int64_t out_num_rows = 0;
-
- for (int64_t i = 0; i < num_chunks; ++i) {
- const ArrayData& filter_chunk = *inputs.back()[i]->data();
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+
+ // Last input element will be the filter array
+ const int num_columns = table.num_columns();
+ std::vector<ArrayVector> inputs(num_columns + 1);
+
+ // Fetch table columns
+ for (int i = 0; i < num_columns; ++i) {
+ inputs[i] = table.column(i)->chunks();
+ }
+ // Fetch filter
+ const auto& filter_opts = *static_cast<const FilterOptions*>(options);
+ switch (filter.kind()) {
+ case Datum::ARRAY:
+ inputs.back().push_back(filter.make_array());
+ break;
+ case Datum::CHUNKED_ARRAY:
+ inputs.back() = filter.chunked_array()->chunks();
+ break;
+ default:
+ return Status::NotImplemented("Filter should be array-like");
+ }
+
+ // Rechunk inputs to allow consistent iteration over their respective chunks
+ inputs = arrow::internal::RechunkArraysConsistently(inputs);
+
+ // Instead of filtering each column with the boolean filter
+ // (which would be slow if the table has a large number of columns: ARROW-10569),
+ // convert each filter chunk to indices, and take() the column.
+ const int64_t num_chunks = static_cast<int64_t>(inputs.back().size());
+ std::vector<ArrayVector> out_columns(num_columns);
+ int64_t out_num_rows = 0;
+
+ for (int64_t i = 0; i < num_chunks; ++i) {
+ const ArrayData& filter_chunk = *inputs.back()[i]->data();
ARROW_ASSIGN_OR_RAISE(
- const auto indices,
- GetTakeIndices(filter_chunk, filter_opts.null_selection_behavior,
- ctx->memory_pool()));
-
- if (indices->length > 0) {
- // Take from all input columns
- Datum indices_datum{std::move(indices)};
- for (int col = 0; col < num_columns; ++col) {
- const auto& column_chunk = inputs[col][i];
- ARROW_ASSIGN_OR_RAISE(Datum out, Take(column_chunk, indices_datum,
- TakeOptions::NoBoundsCheck(), ctx));
- out_columns[col].push_back(std::move(out).make_array());
- }
- out_num_rows += indices->length;
- }
+ const auto indices,
+ GetTakeIndices(filter_chunk, filter_opts.null_selection_behavior,
+ ctx->memory_pool()));
+
+ if (indices->length > 0) {
+ // Take from all input columns
+ Datum indices_datum{std::move(indices)};
+ for (int col = 0; col < num_columns; ++col) {
+ const auto& column_chunk = inputs[col][i];
+ ARROW_ASSIGN_OR_RAISE(Datum out, Take(column_chunk, indices_datum,
+ TakeOptions::NoBoundsCheck(), ctx));
+ out_columns[col].push_back(std::move(out).make_array());
+ }
+ out_num_rows += indices->length;
+ }
}
-
- ChunkedArrayVector out_chunks(num_columns);
- for (int i = 0; i < num_columns; ++i) {
- out_chunks[i] = std::make_shared<ChunkedArray>(std::move(out_columns[i]),
- table.column(i)->type());
- }
- return Table::Make(table.schema(), std::move(out_chunks), out_num_rows);
+
+ ChunkedArrayVector out_chunks(num_columns);
+ for (int i = 0; i < num_columns; ++i) {
+ out_chunks[i] = std::make_shared<ChunkedArray>(std::move(out_columns[i]),
+ table.column(i)->type());
+ }
+ return Table::Make(table.schema(), std::move(out_chunks), out_num_rows);
}
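
A compact sketch of the strategy above, under the simplifying assumption of a
single filter chunk and plain vectors for columns (hypothetical names, not
Arrow API): the selection indices are computed once and reused for every
column, rather than re-scanning the boolean filter once per column.

    #include <cstdint>
    #include <vector>

    using Column = std::vector<double>;  // stand-in for a column chunk

    std::vector<Column> FilterColumnsSketch(const std::vector<Column>& columns,
                                            const std::vector<bool>& filter) {
      // One pass over the filter to build take indices.
      std::vector<int64_t> indices;
      for (int64_t i = 0; i < static_cast<int64_t>(filter.size()); ++i) {
        if (filter[i]) indices.push_back(i);
      }
      // Reuse the same indices for every column ("take").
      std::vector<Column> out(columns.size());
      for (size_t col = 0; col < columns.size(); ++col) {
        out[col].reserve(indices.size());
        for (int64_t ind : indices) out[col].push_back(columns[col][ind]);
      }
      return out;
    }
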
static auto kDefaultFilterOptions = FilterOptions::Defaults();
-const FunctionDoc filter_doc(
- "Filter with a boolean selection filter",
- ("The output is populated with values from the input at positions\n"
- "where the selection filter is non-zero. Nulls in the selection filter\n"
- "are handled based on FilterOptions."),
- {"input", "selection_filter"}, "FilterOptions");
-
+const FunctionDoc filter_doc(
+ "Filter with a boolean selection filter",
+ ("The output is populated with values from the input at positions\n"
+ "where the selection filter is non-zero. Nulls in the selection filter\n"
+ "are handled based on FilterOptions."),
+ {"input", "selection_filter"}, "FilterOptions");
+
class FilterMetaFunction : public MetaFunction {
public:
FilterMetaFunction()
- : MetaFunction("filter", Arity::Binary(), &filter_doc, &kDefaultFilterOptions) {}
+ : MetaFunction("filter", Arity::Binary(), &filter_doc, &kDefaultFilterOptions) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -2061,7 +2061,7 @@ Result<std::shared_ptr<RecordBatch>> TakeRA(const RecordBatch& batch,
for (int j = 0; j < ncols; j++) {
ARROW_ASSIGN_OR_RAISE(columns[j], TakeAA(*batch.column(j), indices, options, ctx));
}
- return RecordBatch::Make(batch.schema(), nrows, std::move(columns));
+ return RecordBatch::Make(batch.schema(), nrows, std::move(columns));
}
Result<std::shared_ptr<Table>> TakeTA(const Table& table, const Array& indices,
@@ -2072,7 +2072,7 @@ Result<std::shared_ptr<Table>> TakeTA(const Table& table, const Array& indices,
for (int j = 0; j < ncols; j++) {
ARROW_ASSIGN_OR_RAISE(columns[j], TakeCA(*table.column(j), indices, options, ctx));
}
- return Table::Make(table.schema(), std::move(columns));
+ return Table::Make(table.schema(), std::move(columns));
}
Result<std::shared_ptr<Table>> TakeTC(const Table& table, const ChunkedArray& indices,
@@ -2082,17 +2082,17 @@ Result<std::shared_ptr<Table>> TakeTC(const Table& table, const ChunkedArray& in
for (int j = 0; j < ncols; j++) {
ARROW_ASSIGN_OR_RAISE(columns[j], TakeCC(*table.column(j), indices, options, ctx));
}
- return Table::Make(table.schema(), std::move(columns));
+ return Table::Make(table.schema(), std::move(columns));
}
static auto kDefaultTakeOptions = TakeOptions::Defaults();
-const FunctionDoc take_doc(
- "Select values from an input based on indices from another array",
- ("The output is populated with values from the input at positions\n"
- "given by `indices`. Nulls in `indices` emit null in the output."),
- {"input", "indices"}, "TakeOptions");
-
+const FunctionDoc take_doc(
+ "Select values from an input based on indices from another array",
+ ("The output is populated with values from the input at positions\n"
+ "given by `indices`. Nulls in `indices` emit null in the output."),
+ {"input", "indices"}, "TakeOptions");
+
// Metafunction for dispatching to different Take implementations other than
// Array-Array.
//
@@ -2100,8 +2100,8 @@ const FunctionDoc take_doc(
// overly complex dispatching, there is no parallelization.
class TakeMetaFunction : public MetaFunction {
public:
- TakeMetaFunction()
- : MetaFunction("take", Arity::Binary(), &take_doc, &kDefaultTakeOptions) {}
+ TakeMetaFunction()
+ : MetaFunction("take", Arity::Binary(), &take_doc, &kDefaultTakeOptions) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -2149,21 +2149,21 @@ class TakeMetaFunction : public MetaFunction {
// ----------------------------------------------------------------------
template <typename Impl>
-Status FilterExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status FilterExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  // TODO: where is it checked that the values and the filter have equal length?
int64_t output_length = GetFilterOutputSize(
*batch[1].array(), FilterState::Get(ctx).null_selection_behavior);
Impl kernel(ctx, batch, output_length, out);
- return kernel.ExecFilter();
+ return kernel.ExecFilter();
}
template <typename Impl>
-Status TakeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status TakeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (TakeState::Get(ctx).boundscheck) {
- RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
}
Impl kernel(ctx, batch, /*output_length=*/batch[1].length(), out);
- return kernel.ExecTake();
+ return kernel.ExecTake();
}
struct SelectionKernelDescr {
@@ -2171,13 +2171,13 @@ struct SelectionKernelDescr {
ArrayKernelExec exec;
};
-void RegisterSelectionFunction(const std::string& name, const FunctionDoc* doc,
- VectorKernel base_kernel, InputType selection_type,
+void RegisterSelectionFunction(const std::string& name, const FunctionDoc* doc,
+ VectorKernel base_kernel, InputType selection_type,
const std::vector<SelectionKernelDescr>& descrs,
const FunctionOptions* default_options,
FunctionRegistry* registry) {
- auto func =
- std::make_shared<VectorFunction>(name, Arity::Binary(), doc, default_options);
+ auto func =
+ std::make_shared<VectorFunction>(name, Arity::Binary(), doc, default_options);
for (auto& descr : descrs) {
base_kernel.signature = KernelSignature::Make(
{std::move(descr.input), selection_type}, OutputType(FirstType));
@@ -2187,19 +2187,19 @@ void RegisterSelectionFunction(const std::string& name, const FunctionDoc* doc,
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-const FunctionDoc array_filter_doc(
- "Filter with a boolean selection filter",
- ("The output is populated with values from the input `array` at positions\n"
- "where the selection filter is non-zero. Nulls in the selection filter\n"
- "are handled based on FilterOptions."),
- {"array", "selection_filter"}, "FilterOptions");
-
-const FunctionDoc array_take_doc(
- "Select values from an array based on indices from another array",
- ("The output is populated with values from the input array at positions\n"
- "given by `indices`. Nulls in `indices` emit null in the output."),
- {"array", "indices"}, "TakeOptions");
-
+const FunctionDoc array_filter_doc(
+ "Filter with a boolean selection filter",
+ ("The output is populated with values from the input `array` at positions\n"
+ "where the selection filter is non-zero. Nulls in the selection filter\n"
+ "are handled based on FilterOptions."),
+ {"array", "selection_filter"}, "FilterOptions");
+
+const FunctionDoc array_take_doc(
+ "Select values from an array based on indices from another array",
+ ("The output is populated with values from the input array at positions\n"
+ "given by `indices`. Nulls in `indices` emit null in the output."),
+ {"array", "indices"}, "TakeOptions");
+
} // namespace
void RegisterVectorSelection(FunctionRegistry* registry) {
@@ -2216,7 +2216,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
{InputType::Array(Type::LIST), FilterExec<ListImpl<ListType>>},
{InputType::Array(Type::LARGE_LIST), FilterExec<ListImpl<LargeListType>>},
{InputType::Array(Type::FIXED_SIZE_LIST), FilterExec<FSLImpl>},
- {InputType::Array(Type::DENSE_UNION), FilterExec<DenseUnionImpl>},
+ {InputType::Array(Type::DENSE_UNION), FilterExec<DenseUnionImpl>},
{InputType::Array(Type::STRUCT), StructFilter},
// TODO: Reuse ListType kernel for MAP
{InputType::Array(Type::MAP), FilterExec<ListImpl<MapType>>},
@@ -2224,7 +2224,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
VectorKernel filter_base;
filter_base.init = FilterState::Init;
- RegisterSelectionFunction("array_filter", &array_filter_doc, filter_base,
+ RegisterSelectionFunction("array_filter", &array_filter_doc, filter_base,
/*selection_type=*/InputType::Array(boolean()),
filter_kernel_descrs, &kDefaultFilterOptions, registry);
@@ -2239,14 +2239,14 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
TakeExec<VarBinaryImpl<LargeBinaryType>>},
{InputType::Array(Type::FIXED_SIZE_BINARY), TakeExec<FSBImpl>},
{InputType::Array(null()), NullTake},
- {InputType::Array(Type::DECIMAL128), TakeExec<FSBImpl>},
- {InputType::Array(Type::DECIMAL256), TakeExec<FSBImpl>},
+ {InputType::Array(Type::DECIMAL128), TakeExec<FSBImpl>},
+ {InputType::Array(Type::DECIMAL256), TakeExec<FSBImpl>},
{InputType::Array(Type::DICTIONARY), DictionaryTake},
{InputType::Array(Type::EXTENSION), ExtensionTake},
{InputType::Array(Type::LIST), TakeExec<ListImpl<ListType>>},
{InputType::Array(Type::LARGE_LIST), TakeExec<ListImpl<LargeListType>>},
{InputType::Array(Type::FIXED_SIZE_LIST), TakeExec<FSLImpl>},
- {InputType::Array(Type::DENSE_UNION), TakeExec<DenseUnionImpl>},
+ {InputType::Array(Type::DENSE_UNION), TakeExec<DenseUnionImpl>},
{InputType::Array(Type::STRUCT), TakeExec<StructImpl>},
// TODO: Reuse ListType kernel for MAP
{InputType::Array(Type::MAP), TakeExec<ListImpl<MapType>>},
@@ -2256,7 +2256,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
take_base.init = TakeState::Init;
take_base.can_execute_chunkwise = false;
RegisterSelectionFunction(
- "array_take", &array_take_doc, take_base,
+ "array_take", &array_take_doc, take_base,
/*selection_type=*/InputType(match::Integer(), ValueDescr::ARRAY),
take_kernel_descrs, &kDefaultTakeOptions, registry);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
index 7fa43e715d8..b42e9d536f1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -16,305 +16,305 @@
// under the License.
#include <algorithm>
-#include <cmath>
+#include <cmath>
#include <limits>
#include <numeric>
-#include <type_traits>
-#include <utility>
+#include <type_traits>
+#include <utility>
#include "arrow/array/data.h"
#include "arrow/compute/api_vector.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/table.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bitmap.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/checked_cast.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/table.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bitmap.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/optional.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
-
-using internal::checked_cast;
-
+
+using internal::checked_cast;
+
namespace compute {
-namespace internal {
-
-// Visit all physical types for which sorting is implemented.
-#define VISIT_PHYSICAL_TYPES(VISIT) \
- VISIT(BooleanType) \
- VISIT(Int8Type) \
- VISIT(Int16Type) \
- VISIT(Int32Type) \
- VISIT(Int64Type) \
- VISIT(UInt8Type) \
- VISIT(UInt16Type) \
- VISIT(UInt32Type) \
- VISIT(UInt64Type) \
- VISIT(FloatType) \
- VISIT(DoubleType) \
- VISIT(BinaryType) \
- VISIT(LargeBinaryType) \
- VISIT(FixedSizeBinaryType) \
- VISIT(Decimal128Type) \
- VISIT(Decimal256Type)
-
+namespace internal {
+
+// Visit all physical types for which sorting is implemented.
+#define VISIT_PHYSICAL_TYPES(VISIT) \
+ VISIT(BooleanType) \
+ VISIT(Int8Type) \
+ VISIT(Int16Type) \
+ VISIT(Int32Type) \
+ VISIT(Int64Type) \
+ VISIT(UInt8Type) \
+ VISIT(UInt16Type) \
+ VISIT(UInt32Type) \
+ VISIT(UInt64Type) \
+ VISIT(FloatType) \
+ VISIT(DoubleType) \
+ VISIT(BinaryType) \
+ VISIT(LargeBinaryType) \
+ VISIT(FixedSizeBinaryType) \
+ VISIT(Decimal128Type) \
+ VISIT(Decimal256Type)
+
namespace {
-// The target chunk in a chunked array.
-template <typename ArrayType>
-struct ResolvedChunk {
- using V = GetViewType<typename ArrayType::TypeClass>;
- using LogicalValueType = typename V::T;
-
- // The target array in chunked array.
- const ArrayType* array;
- // The index in the target array.
- const int64_t index;
-
- ResolvedChunk(const ArrayType* array, int64_t index) : array(array), index(index) {}
-
- bool IsNull() const { return array->IsNull(index); }
-
- LogicalValueType Value() const { return V::LogicalValue(array->GetView(index)); }
-};
-
-// ResolvedChunk specialization for untyped arrays when all that is needed is a null lookup
-template <>
-struct ResolvedChunk<Array> {
- // The target array in chunked array.
- const Array* array;
- // The index in the target array.
- const int64_t index;
-
- ResolvedChunk(const Array* array, int64_t index) : array(array), index(index) {}
-
- bool IsNull() const { return array->IsNull(index); }
-};
-
-// An object that resolves an array chunk depending on the index.
-struct ChunkedArrayResolver {
- explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
- : num_chunks_(static_cast<int64_t>(chunks.size())),
- chunks_(chunks.data()),
- offsets_(MakeEndOffsets(chunks)),
- cached_chunk_(0) {}
-
- template <typename ArrayType>
- ResolvedChunk<ArrayType> Resolve(int64_t index) const {
- // It is common for the algorithms below to make consecutive accesses at
- // a relatively small distance from each other, hence often falling in
- // the same chunk.
- // This is trivial when merging (assuming each side of the merge uses
- // its own resolver), but also in the inner recursive invocations of
- // partitioning.
- const bool cache_hit =
- (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]);
- if (ARROW_PREDICT_TRUE(cache_hit)) {
- return ResolvedChunk<ArrayType>(
- checked_cast<const ArrayType*>(chunks_[cached_chunk_]),
- index - offsets_[cached_chunk_]);
- } else {
- return ResolveMissBisect<ArrayType>(index);
- }
- }
-
- private:
- template <typename ArrayType>
- ResolvedChunk<ArrayType> ResolveMissBisect(int64_t index) const {
- // Like std::upper_bound(), but hand-written as it can help the compiler.
- const int64_t* raw_offsets = offsets_.data();
- // Search [lo, lo + n)
- int64_t lo = 0, n = num_chunks_;
- while (n > 1) {
- int64_t m = n >> 1;
- int64_t mid = lo + m;
- if (index >= raw_offsets[mid]) {
- lo = mid;
- n -= m;
- } else {
- n = m;
- }
- }
- cached_chunk_ = lo;
- return ResolvedChunk<ArrayType>(checked_cast<const ArrayType*>(chunks_[lo]),
- index - offsets_[lo]);
- }
-
- static std::vector<int64_t> MakeEndOffsets(const std::vector<const Array*>& chunks) {
- std::vector<int64_t> end_offsets(chunks.size() + 1);
- int64_t offset = 0;
- end_offsets[0] = 0;
- std::transform(chunks.begin(), chunks.end(), end_offsets.begin() + 1,
- [&](const Array* chunk) {
- offset += chunk->length();
- return offset;
- });
- return end_offsets;
- }
-
- int64_t num_chunks_;
- const Array* const* chunks_;
- std::vector<int64_t> offsets_;
-
- mutable int64_t cached_chunk_;
-};
-
-// We could try to reproduce the concrete Array classes' facilities
-// (such as cached raw values pointer) in a separate hierarchy of
-// physical accessors, but doing so ends up too cumbersome.
-// Instead, we simply create the desired concrete Array objects.
-std::shared_ptr<Array> GetPhysicalArray(const Array& array,
- const std::shared_ptr<DataType>& physical_type) {
- auto new_data = array.data()->Copy();
- new_data->type = physical_type;
- return MakeArray(std::move(new_data));
-}
-
-ArrayVector GetPhysicalChunks(const ChunkedArray& chunked_array,
- const std::shared_ptr<DataType>& physical_type) {
- const auto& chunks = chunked_array.chunks();
- ArrayVector physical(chunks.size());
- std::transform(chunks.begin(), chunks.end(), physical.begin(),
- [&](const std::shared_ptr<Array>& array) {
- return GetPhysicalArray(*array, physical_type);
- });
- return physical;
-}
-
-std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
- std::vector<const Array*> pointers(arrays.size());
- std::transform(arrays.begin(), arrays.end(), pointers.begin(),
- [&](const std::shared_ptr<Array>& array) { return array.get(); });
- return pointers;
-}
-
-// NOTE: std::partition is usually faster than std::stable_partition.
-
-struct NonStablePartitioner {
- template <typename Predicate>
- uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
- return std::partition(indices_begin, indices_end, std::forward<Predicate>(pred));
- }
-};
-
-struct StablePartitioner {
- template <typename Predicate>
- uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
- return std::stable_partition(indices_begin, indices_end,
- std::forward<Predicate>(pred));
- }
-};
-
-// TODO factor out value comparison and NaN checking?
-
-template <typename TypeClass, typename Enable = void>
-struct NullTraits {
- static constexpr bool has_null_like_values = false;
-};
-
-template <typename TypeClass>
-struct NullTraits<TypeClass, enable_if_floating_point<TypeClass>> {
- static constexpr bool has_null_like_values = true;
-};
-
-// Move nulls (not null-like values) to end of array. Return where null starts.
-//
-// `offset` is used when this is called on a chunk of a chunked array
-template <typename Partitioner>
-uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
- const Array& values, int64_t offset) {
- if (values.null_count() == 0) {
- return indices_end;
- }
- Partitioner partitioner;
- return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
- return !values.IsNull(ind - offset);
- });
-}
-
-// For chunked array.
-template <typename Partitioner>
-uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays,
- int64_t null_count) {
- if (null_count == 0) {
- return indices_end;
- }
- ChunkedArrayResolver resolver(arrays);
- Partitioner partitioner;
- return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
- const auto chunk = resolver.Resolve<Array>(ind);
- return !chunk.IsNull();
- });
-}
-
-// Move non-null null-like values to end of array. Return where null-like starts.
-//
-// `offset` is used when this is called on a chunk of a chunked array
-template <typename ArrayType, typename Partitioner>
-enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset) {
- return indices_end;
-}
-
-// For chunked array.
-template <typename ArrayType, typename Partitioner>
-enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count) {
- return indices_end;
-}
-
-template <typename ArrayType, typename Partitioner>
-enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset) {
- Partitioner partitioner;
- return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
- return !std::isnan(values.GetView(ind - offset));
- });
-}
-
-template <typename ArrayType, typename Partitioner>
-enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count) {
- Partitioner partitioner;
- ChunkedArrayResolver resolver(arrays);
- return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
- const auto chunk = resolver.Resolve<ArrayType>(ind);
- return !std::isnan(chunk.Value());
- });
-}
-
-// Move nulls to end of array. Return where null starts.
-//
-// `offset` is used when this is called on a chunk of a chunked array
-template <typename ArrayType, typename Partitioner>
-uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset) {
- // Partition nulls at end, and null-like values just before
- uint64_t* nulls_begin =
- PartitionNullsOnly<Partitioner>(indices_begin, indices_end, values, offset);
- return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, values,
- offset);
-}
-
-// For chunked array.
-template <typename ArrayType, typename Partitioner>
-uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count) {
- // Partition nulls at end, and null-like values just before
- uint64_t* nulls_begin =
- PartitionNullsOnly<Partitioner>(indices_begin, indices_end, arrays, null_count);
- return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, arrays,
- null_count);
-}
-
+// The target chunk in a chunked array.
+template <typename ArrayType>
+struct ResolvedChunk {
+ using V = GetViewType<typename ArrayType::TypeClass>;
+ using LogicalValueType = typename V::T;
+
+ // The target array in chunked array.
+ const ArrayType* array;
+ // The index in the target array.
+ const int64_t index;
+
+ ResolvedChunk(const ArrayType* array, int64_t index) : array(array), index(index) {}
+
+ bool IsNull() const { return array->IsNull(index); }
+
+ LogicalValueType Value() const { return V::LogicalValue(array->GetView(index)); }
+};
+
+// ResolvedChunk specialization for untyped arrays when all that is needed is a null lookup
+template <>
+struct ResolvedChunk<Array> {
+ // The target array in chunked array.
+ const Array* array;
+ // The index in the target array.
+ const int64_t index;
+
+ ResolvedChunk(const Array* array, int64_t index) : array(array), index(index) {}
+
+ bool IsNull() const { return array->IsNull(index); }
+};
+
+// An object that resolves an array chunk depending on the index.
+struct ChunkedArrayResolver {
+ explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
+ : num_chunks_(static_cast<int64_t>(chunks.size())),
+ chunks_(chunks.data()),
+ offsets_(MakeEndOffsets(chunks)),
+ cached_chunk_(0) {}
+
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> Resolve(int64_t index) const {
+ // It is common for the algorithms below to make consecutive accesses at
+ // a relatively small distance from each other, hence often falling in
+ // the same chunk.
+ // This is trivial when merging (assuming each side of the merge uses
+ // its own resolver), but also in the inner recursive invocations of
+ // partitioning.
+ const bool cache_hit =
+ (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]);
+ if (ARROW_PREDICT_TRUE(cache_hit)) {
+ return ResolvedChunk<ArrayType>(
+ checked_cast<const ArrayType*>(chunks_[cached_chunk_]),
+ index - offsets_[cached_chunk_]);
+ } else {
+ return ResolveMissBisect<ArrayType>(index);
+ }
+ }
+
+ private:
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> ResolveMissBisect(int64_t index) const {
+ // Like std::upper_bound(), but hand-written as it can help the compiler.
+ const int64_t* raw_offsets = offsets_.data();
+ // Search [lo, lo + n)
+ int64_t lo = 0, n = num_chunks_;
+ while (n > 1) {
+ int64_t m = n >> 1;
+ int64_t mid = lo + m;
+ if (index >= raw_offsets[mid]) {
+ lo = mid;
+ n -= m;
+ } else {
+ n = m;
+ }
+ }
+ cached_chunk_ = lo;
+ return ResolvedChunk<ArrayType>(checked_cast<const ArrayType*>(chunks_[lo]),
+ index - offsets_[lo]);
+ }
+
+ static std::vector<int64_t> MakeEndOffsets(const std::vector<const Array*>& chunks) {
+ std::vector<int64_t> end_offsets(chunks.size() + 1);
+ int64_t offset = 0;
+ end_offsets[0] = 0;
+ std::transform(chunks.begin(), chunks.end(), end_offsets.begin() + 1,
+ [&](const Array* chunk) {
+ offset += chunk->length();
+ return offset;
+ });
+ return end_offsets;
+ }
+
+ int64_t num_chunks_;
+ const Array* const* chunks_;
+ std::vector<int64_t> offsets_;
+
+ mutable int64_t cached_chunk_;
+};
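
A self-contained sketch of the cached resolution above, assuming only a
precomputed end-offsets vector (hypothetical names, not Arrow API): a hit on
the cached chunk is O(1), and a miss falls back to the same bisection.

    #include <cstdint>
    #include <utility>
    #include <vector>

    struct ResolverSketch {
      std::vector<int64_t> end_offsets;  // {0, len0, len0 + len1, ...}
      mutable int64_t cached = 0;

      // Returns {chunk, index within chunk} for a logical index.
      std::pair<int64_t, int64_t> Resolve(int64_t index) const {
        if (!(index >= end_offsets[cached] && index < end_offsets[cached + 1])) {
          // Bisect like std::upper_bound over [0, num_chunks).
          int64_t lo = 0, n = static_cast<int64_t>(end_offsets.size()) - 1;
          while (n > 1) {
            const int64_t m = n >> 1;
            if (index >= end_offsets[lo + m]) { lo += m; n -= m; } else { n = m; }
          }
          cached = lo;
        }
        return {cached, index - end_offsets[cached]};
      }
    };
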
+
+// We could try to reproduce the concrete Array classes' facilities
+// (such as cached raw values pointer) in a separate hierarchy of
+// physical accessors, but doing so ends up too cumbersome.
+// Instead, we simply create the desired concrete Array objects.
+std::shared_ptr<Array> GetPhysicalArray(const Array& array,
+ const std::shared_ptr<DataType>& physical_type) {
+ auto new_data = array.data()->Copy();
+ new_data->type = physical_type;
+ return MakeArray(std::move(new_data));
+}
+
+ArrayVector GetPhysicalChunks(const ChunkedArray& chunked_array,
+ const std::shared_ptr<DataType>& physical_type) {
+ const auto& chunks = chunked_array.chunks();
+ ArrayVector physical(chunks.size());
+ std::transform(chunks.begin(), chunks.end(), physical.begin(),
+ [&](const std::shared_ptr<Array>& array) {
+ return GetPhysicalArray(*array, physical_type);
+ });
+ return physical;
+}
+
+std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
+ std::vector<const Array*> pointers(arrays.size());
+ std::transform(arrays.begin(), arrays.end(), pointers.begin(),
+ [&](const std::shared_ptr<Array>& array) { return array.get(); });
+ return pointers;
+}
+
+// NOTE: std::partition is usually faster than std::stable_partition.
+
+struct NonStablePartitioner {
+ template <typename Predicate>
+ uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
+ return std::partition(indices_begin, indices_end, std::forward<Predicate>(pred));
+ }
+};
+
+struct StablePartitioner {
+ template <typename Predicate>
+ uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
+ return std::stable_partition(indices_begin, indices_end,
+ std::forward<Predicate>(pred));
+ }
+};
+
+// TODO factor out value comparison and NaN checking?
+
+template <typename TypeClass, typename Enable = void>
+struct NullTraits {
+ static constexpr bool has_null_like_values = false;
+};
+
+template <typename TypeClass>
+struct NullTraits<TypeClass, enable_if_floating_point<TypeClass>> {
+ static constexpr bool has_null_like_values = true;
+};
+
+// Move nulls (not null-like values) to end of array. Return where null starts.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename Partitioner>
+uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
+ const Array& values, int64_t offset) {
+ if (values.null_count() == 0) {
+ return indices_end;
+ }
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
+ return !values.IsNull(ind - offset);
+ });
+}
+
+// For chunked array.
+template <typename Partitioner>
+uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays,
+ int64_t null_count) {
+ if (null_count == 0) {
+ return indices_end;
+ }
+ ChunkedArrayResolver resolver(arrays);
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
+ const auto chunk = resolver.Resolve<Array>(ind);
+ return !chunk.IsNull();
+ });
+}
+
+// Move non-null null-like values to end of array. Return where null-like starts.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename ArrayType, typename Partitioner>
+enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ return indices_end;
+}
+
+// For chunked array.
+template <typename ArrayType, typename Partitioner>
+enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ return indices_end;
+}
+
+template <typename ArrayType, typename Partitioner>
+enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
+ return !std::isnan(values.GetView(ind - offset));
+ });
+}
+
+template <typename ArrayType, typename Partitioner>
+enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ Partitioner partitioner;
+ ChunkedArrayResolver resolver(arrays);
+ return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
+ const auto chunk = resolver.Resolve<ArrayType>(ind);
+ return !std::isnan(chunk.Value());
+ });
+}
+
+// Move nulls to end of array. Return where null starts.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename ArrayType, typename Partitioner>
+uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ // Partition nulls at end, and null-like values just before
+ uint64_t* nulls_begin =
+ PartitionNullsOnly<Partitioner>(indices_begin, indices_end, values, offset);
+ return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, values,
+ offset);
+}
+
+// For chunked array.
+template <typename ArrayType, typename Partitioner>
+uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ // Partition nulls at end, and null-like values just before
+ uint64_t* nulls_begin =
+ PartitionNullsOnly<Partitioner>(indices_begin, indices_end, arrays, null_count);
+ return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, arrays,
+ null_count);
+}
+
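
The two-step partitioning above, condensed into a standalone sketch for a
float column with an explicit validity mask (hypothetical inputs, not Arrow
API): nulls go last, NaNs sit just before them, and std::stable_partition
keeps the original order within each region.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Returns where the null-like (NaN) region starts.
    uint64_t* PartitionNullsSketch(uint64_t* begin, uint64_t* end,
                                   const std::vector<double>& values,
                                   const std::vector<bool>& valid) {
      uint64_t* nulls_begin = std::stable_partition(
          begin, end, [&](uint64_t i) { return valid[i]; });
      return std::stable_partition(
          begin, nulls_begin, [&](uint64_t i) { return !std::isnan(values[i]); });
    }
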
// ----------------------------------------------------------------------
// partition_nth_indices implementation
@@ -324,116 +324,116 @@ using PartitionNthToIndicesState = internal::OptionsWrapper<PartitionNthOptions>
template <typename OutType, typename InType>
struct PartitionNthToIndices {
using ArrayType = typename TypeTraits<InType>::ArrayType;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- using GetView = GetViewType<InType>;
-
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using GetView = GetViewType<InType>;
+
if (ctx->state() == nullptr) {
- return Status::Invalid("NthToIndices requires PartitionNthOptions");
+ return Status::Invalid("NthToIndices requires PartitionNthOptions");
}
- ArrayType arr(batch[0].array());
+ ArrayType arr(batch[0].array());
int64_t pivot = PartitionNthToIndicesState::Get(ctx).pivot;
if (pivot > arr.length()) {
-      return Status::IndexError("NthToIndices index out of bounds");
+      return Status::IndexError("NthToIndices index out of bounds");
}
ArrayData* out_arr = out->mutable_array();
uint64_t* out_begin = out_arr->GetMutableValues<uint64_t>(1);
uint64_t* out_end = out_begin + arr.length();
std::iota(out_begin, out_end, 0);
if (pivot == arr.length()) {
- return Status::OK();
+ return Status::OK();
}
- auto nulls_begin =
- PartitionNulls<ArrayType, NonStablePartitioner>(out_begin, out_end, arr, 0);
+ auto nulls_begin =
+ PartitionNulls<ArrayType, NonStablePartitioner>(out_begin, out_end, arr, 0);
auto nth_begin = out_begin + pivot;
if (nth_begin < nulls_begin) {
std::nth_element(out_begin, nth_begin, nulls_begin,
[&arr](uint64_t left, uint64_t right) {
- const auto lval = GetView::LogicalValue(arr.GetView(left));
- const auto rval = GetView::LogicalValue(arr.GetView(right));
- return lval < rval;
+ const auto lval = GetView::LogicalValue(arr.GetView(left));
+ const auto rval = GetView::LogicalValue(arr.GetView(right));
+ return lval < rval;
});
}
- return Status::OK();
+ return Status::OK();
}
};
-// ----------------------------------------------------------------------
-// Array sorting implementations
-
+// ----------------------------------------------------------------------
+// Array sorting implementations
+
template <typename ArrayType, typename VisitorNotNull, typename VisitorNull>
inline void VisitRawValuesInline(const ArrayType& values,
VisitorNotNull&& visitor_not_null,
VisitorNull&& visitor_null) {
const auto data = values.raw_values();
- VisitBitBlocksVoid(
- values.null_bitmap(), values.offset(), values.length(),
- [&](int64_t i) { visitor_not_null(data[i]); }, [&]() { visitor_null(); });
-}
-
-template <typename VisitorNotNull, typename VisitorNull>
-inline void VisitRawValuesInline(const BooleanArray& values,
- VisitorNotNull&& visitor_not_null,
- VisitorNull&& visitor_null) {
- if (values.null_count() != 0) {
- const uint8_t* data = values.data()->GetValues<uint8_t>(1, 0);
- VisitBitBlocksVoid(
- values.null_bitmap(), values.offset(), values.length(),
- [&](int64_t i) { visitor_not_null(BitUtil::GetBit(data, values.offset() + i)); },
- [&]() { visitor_null(); });
+ VisitBitBlocksVoid(
+ values.null_bitmap(), values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(data[i]); }, [&]() { visitor_null(); });
+}
+
+template <typename VisitorNotNull, typename VisitorNull>
+inline void VisitRawValuesInline(const BooleanArray& values,
+ VisitorNotNull&& visitor_not_null,
+ VisitorNull&& visitor_null) {
+ if (values.null_count() != 0) {
+ const uint8_t* data = values.data()->GetValues<uint8_t>(1, 0);
+ VisitBitBlocksVoid(
+ values.null_bitmap(), values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(BitUtil::GetBit(data, values.offset() + i)); },
+ [&]() { visitor_null(); });
} else {
- // Can avoid GetBit() overhead in the no-nulls case
- VisitBitBlocksVoid(
- values.data()->buffers[1], values.offset(), values.length(),
- [&](int64_t i) { visitor_not_null(true); }, [&]() { visitor_not_null(false); });
+ // Can avoid GetBit() overhead in the no-nulls case
+ VisitBitBlocksVoid(
+ values.data()->buffers[1], values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(true); }, [&]() { visitor_not_null(false); });
}
}
template <typename ArrowType>
-class ArrayCompareSorter {
+class ArrayCompareSorter {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
- using GetView = GetViewType<ArrowType>;
+ using GetView = GetViewType<ArrowType>;
public:
- // Returns where null starts.
- //
- // `offset` is used when this is called on a chunk of a chunked array
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
- int64_t offset, const ArraySortOptions& options) {
- auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
- indices_begin, indices_end, values, offset);
- if (options.order == SortOrder::Ascending) {
- std::stable_sort(
- indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
- const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
- const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
- return lhs < rhs;
- });
- } else {
- std::stable_sort(
- indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
- const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
- const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
-              // We use 'right < left' rather than 'left > right' so that
-              // only operator< is required of the element type.
- return rhs < lhs;
- });
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
+ auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
+ indices_begin, indices_end, values, offset);
+ if (options.order == SortOrder::Ascending) {
+ std::stable_sort(
+ indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
+ return lhs < rhs;
+ });
+ } else {
+ std::stable_sort(
+ indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
+              // We use 'right < left' rather than 'left > right' so that
+              // only operator< is required of the element type.
+ return rhs < lhs;
+ });
}
- return nulls_begin;
+ return nulls_begin;
}
};
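
A minimal argsort sketch mirroring the comparator trick used in both branches
above (hypothetical names, nulls omitted): descending order is written as
'rhs < lhs' so the element type only needs operator<.

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    std::vector<uint64_t> ArgSortSketch(const std::vector<int32_t>& values,
                                        bool ascending) {
      std::vector<uint64_t> indices(values.size());
      std::iota(indices.begin(), indices.end(), 0);
      std::stable_sort(indices.begin(), indices.end(),
                       [&](uint64_t l, uint64_t r) {
                         return ascending ? values[l] < values[r]
                                          : values[r] < values[l];
                       });
      return indices;
    }
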
template <typename ArrowType>
-class ArrayCountSorter {
+class ArrayCountSorter {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
using c_type = typename ArrowType::c_type;
public:
- ArrayCountSorter() = default;
+ ArrayCountSorter() = default;
- explicit ArrayCountSorter(c_type min, c_type max) { SetMinMax(min, max); }
+ explicit ArrayCountSorter(c_type min, c_type max) { SetMinMax(min, max); }
// Assume: max >= min && (max - min) < 4Gi
void SetMinMax(c_type min, c_type max) {
@@ -441,14 +441,14 @@ class ArrayCountSorter {
value_range_ = static_cast<uint32_t>(max - min) + 1;
}
- // Returns where null starts.
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
- int64_t offset, const ArraySortOptions& options) {
+ // Returns where null starts.
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
    // A 32-bit counter performs much better than a 64-bit one
if (values.length() < (1LL << 32)) {
- return SortInternal<uint32_t>(indices_begin, indices_end, values, offset, options);
+ return SortInternal<uint32_t>(indices_begin, indices_end, values, offset, options);
} else {
- return SortInternal<uint64_t>(indices_begin, indices_end, values, offset, options);
+ return SortInternal<uint64_t>(indices_begin, indices_end, values, offset, options);
}
}
@@ -456,81 +456,81 @@ class ArrayCountSorter {
c_type min_{0};
uint32_t value_range_{0};
- // Returns where null starts.
- //
- // `offset` is used when this is called on a chunk of a chunked array
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
template <typename CounterType>
- uint64_t* SortInternal(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset,
- const ArraySortOptions& options) {
+ uint64_t* SortInternal(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset,
+ const ArraySortOptions& options) {
const uint32_t value_range = value_range_;
// first slot reserved for prefix sum
std::vector<CounterType> counts(1 + value_range);
- if (options.order == SortOrder::Ascending) {
- VisitRawValuesInline(
- values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {});
- for (uint32_t i = 1; i <= value_range; ++i) {
- counts[i] += counts[i - 1];
- }
- auto null_position = counts[value_range];
- auto nulls_begin = indices_begin + null_position;
- int64_t index = offset;
- VisitRawValuesInline(
- values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; },
- [&]() { indices_begin[null_position++] = index++; });
- return nulls_begin;
- } else {
- VisitRawValuesInline(
- values, [&](c_type v) { ++counts[v - min_]; }, []() {});
- for (uint32_t i = value_range; i >= 1; --i) {
- counts[i - 1] += counts[i];
- }
- auto null_position = counts[0];
- auto nulls_begin = indices_begin + null_position;
- int64_t index = offset;
- VisitRawValuesInline(
- values, [&](c_type v) { indices_begin[counts[v - min_ + 1]++] = index++; },
- [&]() { indices_begin[null_position++] = index++; });
- return nulls_begin;
- }
- }
-};
-
-using ::arrow::internal::Bitmap;
-
-template <>
-class ArrayCountSorter<BooleanType> {
- public:
- ArrayCountSorter() = default;
-
- // Returns where null starts.
- // `offset` is used when this is called on a chunk of a chunked array
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
- const BooleanArray& values, int64_t offset,
- const ArraySortOptions& options) {
- std::array<int64_t, 2> counts{0, 0};
-
- const int64_t nulls = values.null_count();
- const int64_t ones = values.true_count();
- const int64_t zeros = values.length() - ones - nulls;
-
- int64_t null_position = values.length() - nulls;
- int64_t index = offset;
- const auto nulls_begin = indices_begin + null_position;
-
- if (options.order == SortOrder::Ascending) {
- // ones start after zeros
- counts[1] = zeros;
- } else {
- // zeros start after ones
- counts[0] = ones;
+ if (options.order == SortOrder::Ascending) {
+ VisitRawValuesInline(
+ values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {});
+ for (uint32_t i = 1; i <= value_range; ++i) {
+ counts[i] += counts[i - 1];
+ }
+ auto null_position = counts[value_range];
+ auto nulls_begin = indices_begin + null_position;
+ int64_t index = offset;
+ VisitRawValuesInline(
+ values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
+ } else {
+ VisitRawValuesInline(
+ values, [&](c_type v) { ++counts[v - min_]; }, []() {});
+ for (uint32_t i = value_range; i >= 1; --i) {
+ counts[i - 1] += counts[i];
+ }
+ auto null_position = counts[0];
+ auto nulls_begin = indices_begin + null_position;
+ int64_t index = offset;
+ VisitRawValuesInline(
+ values, [&](c_type v) { indices_begin[counts[v - min_ + 1]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
+ }
+ }
+};
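
The ascending counting-sort pass above, reduced to a null-free standalone
sketch over uint8_t values (hypothetical names): one histogram pass with the
first slot reserved for the prefix sum, then a scatter pass that fills each
bucket in input order, which is what keeps the sort stable.

    #include <cstdint>
    #include <vector>

    std::vector<uint64_t> CountSortIndicesSketch(const std::vector<uint8_t>& values) {
      std::vector<uint64_t> counts(1 + 256, 0);
      for (uint8_t v : values) ++counts[v + 1];  // counts[v + 1]: size of bucket v
      for (size_t i = 1; i < counts.size(); ++i) counts[i] += counts[i - 1];
      // After the prefix sum, counts[v] is where bucket v starts.
      std::vector<uint64_t> indices(values.size());
      for (uint64_t i = 0; i < values.size(); ++i) {
        indices[counts[values[i]]++] = i;
      }
      return indices;
    }
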
+
+using ::arrow::internal::Bitmap;
+
+template <>
+class ArrayCountSorter<BooleanType> {
+ public:
+ ArrayCountSorter() = default;
+
+ // Returns where null starts.
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
+ const BooleanArray& values, int64_t offset,
+ const ArraySortOptions& options) {
+ std::array<int64_t, 2> counts{0, 0};
+
+ const int64_t nulls = values.null_count();
+ const int64_t ones = values.true_count();
+ const int64_t zeros = values.length() - ones - nulls;
+
+ int64_t null_position = values.length() - nulls;
+ int64_t index = offset;
+ const auto nulls_begin = indices_begin + null_position;
+
+ if (options.order == SortOrder::Ascending) {
+ // ones start after zeros
+ counts[1] = zeros;
+ } else {
+ // zeros start after ones
+ counts[0] = ones;
}
VisitRawValuesInline(
- values, [&](bool v) { indices_begin[counts[v]++] = index++; },
- [&]() { indices_begin[null_position++] = index++; });
- return nulls_begin;
+ values, [&](bool v) { indices_begin[counts[v]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
}
};
@@ -538,35 +538,35 @@ class ArrayCountSorter<BooleanType> {
// - Use O(n) counting sort if values are in a small range
 // - Use O(n log n) std::stable_sort otherwise
template <typename ArrowType>
-class ArrayCountOrCompareSorter {
+class ArrayCountOrCompareSorter {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
using c_type = typename ArrowType::c_type;
public:
- // Returns where null starts.
- //
- // `offset` is used when this is called on a chunk of a chunked array
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
- int64_t offset, const ArraySortOptions& options) {
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
if (values.length() >= countsort_min_len_ && values.length() > values.null_count()) {
- c_type min, max;
- std::tie(min, max) = GetMinMax<c_type>(*values.data());
+ c_type min, max;
+ std::tie(min, max) = GetMinMax<c_type>(*values.data());
// For signed int32/64, (max - min) may overflow and trigger UBSAN.
// Cast to largest unsigned type(uint64_t) before subtraction.
if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <=
countsort_max_range_) {
count_sorter_.SetMinMax(min, max);
- return count_sorter_.Sort(indices_begin, indices_end, values, offset, options);
+ return count_sorter_.Sort(indices_begin, indices_end, values, offset, options);
}
}
- return compare_sorter_.Sort(indices_begin, indices_end, values, offset, options);
+ return compare_sorter_.Sort(indices_begin, indices_end, values, offset, options);
}
private:
- ArrayCompareSorter<ArrowType> compare_sorter_;
- ArrayCountSorter<ArrowType> count_sorter_;
+ ArrayCompareSorter<ArrowType> compare_sorter_;
+ ArrayCountSorter<ArrowType> count_sorter_;
  // Crossover point for preferring counting sort over std::stable_sort (merge sort):
  // - the array to be sorted is longer than "countsort_min_len_"
@@ -582,1257 +582,1257 @@ class ArrayCountOrCompareSorter {
};
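
The dispatch rule above, as a standalone sketch with illustrative threshold
parameters (hypothetical names, not the kernel's actual constants): counting
sort is only chosen when the array is long enough and the value range is
narrow enough for an O(range) histogram to pay off.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    bool ShouldCountSortSketch(const std::vector<int64_t>& values,
                               size_t min_len, uint64_t max_range) {
      if (values.empty() || values.size() < min_len) return false;
      const auto minmax = std::minmax_element(values.begin(), values.end());
      // Cast to unsigned before subtracting so (max - min) cannot overflow.
      return static_cast<uint64_t>(*minmax.second) -
                 static_cast<uint64_t>(*minmax.first) <=
             max_range;
    }
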
template <typename Type, typename Enable = void>
-struct ArraySorter;
-
-template <>
-struct ArraySorter<BooleanType> {
- ArrayCountSorter<BooleanType> impl;
-};
+struct ArraySorter;
template <>
-struct ArraySorter<UInt8Type> {
- ArrayCountSorter<UInt8Type> impl;
- ArraySorter() : impl(0, 255) {}
+struct ArraySorter<BooleanType> {
+ ArrayCountSorter<BooleanType> impl;
};
template <>
-struct ArraySorter<Int8Type> {
- ArrayCountSorter<Int8Type> impl;
- ArraySorter() : impl(-128, 127) {}
+struct ArraySorter<UInt8Type> {
+ ArrayCountSorter<UInt8Type> impl;
+ ArraySorter() : impl(0, 255) {}
};
+template <>
+struct ArraySorter<Int8Type> {
+ ArrayCountSorter<Int8Type> impl;
+ ArraySorter() : impl(-128, 127) {}
+};
+
template <typename Type>
-struct ArraySorter<Type, enable_if_t<(is_integer_type<Type>::value &&
- (sizeof(typename Type::c_type) > 1)) ||
- is_temporal_type<Type>::value>> {
- ArrayCountOrCompareSorter<Type> impl;
+struct ArraySorter<Type, enable_if_t<(is_integer_type<Type>::value &&
+ (sizeof(typename Type::c_type) > 1)) ||
+ is_temporal_type<Type>::value>> {
+ ArrayCountOrCompareSorter<Type> impl;
};
template <typename Type>
-struct ArraySorter<
- Type, enable_if_t<is_floating_type<Type>::value || is_base_binary_type<Type>::value ||
- is_fixed_size_binary_type<Type>::value>> {
- ArrayCompareSorter<Type> impl;
+struct ArraySorter<
+ Type, enable_if_t<is_floating_type<Type>::value || is_base_binary_type<Type>::value ||
+ is_fixed_size_binary_type<Type>::value>> {
+ ArrayCompareSorter<Type> impl;
};
-using ArraySortIndicesState = internal::OptionsWrapper<ArraySortOptions>;
-
+using ArraySortIndicesState = internal::OptionsWrapper<ArraySortOptions>;
+
template <typename OutType, typename InType>
-struct ArraySortIndices {
+struct ArraySortIndices {
using ArrayType = typename TypeTraits<InType>::ArrayType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& options = ArraySortIndicesState::Get(ctx);
-
- ArrayType arr(batch[0].array());
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = ArraySortIndicesState::Get(ctx);
+
+ ArrayType arr(batch[0].array());
ArrayData* out_arr = out->mutable_array();
uint64_t* out_begin = out_arr->GetMutableValues<uint64_t>(1);
uint64_t* out_end = out_begin + arr.length();
- std::iota(out_begin, out_end, 0);
-
- ArraySorter<InType> sorter;
- sorter.impl.Sort(out_begin, out_end, arr, 0, options);
+ std::iota(out_begin, out_end, 0);
- return Status::OK();
+ ArraySorter<InType> sorter;
+ sorter.impl.Sort(out_begin, out_end, arr, 0, options);
+
+ return Status::OK();
}
};
// Sort indices kernels implemented for
//
-// * Boolean type
+// * Boolean type
// * Number types
// * Base binary types
template <template <typename...> class ExecTemplate>
void AddSortingKernels(VectorKernel base, VectorFunction* func) {
- // bool type
- base.signature = KernelSignature::Make({InputType::Array(boolean())}, uint64());
- base.exec = ExecTemplate<UInt64Type, BooleanType>::Exec;
- DCHECK_OK(func->AddKernel(base));
-
+ // bool type
+ base.signature = KernelSignature::Make({InputType::Array(boolean())}, uint64());
+ base.exec = ExecTemplate<UInt64Type, BooleanType>::Exec;
+ DCHECK_OK(func->AddKernel(base));
+
for (const auto& ty : NumericTypes()) {
- auto physical_type = GetPhysicalType(ty);
- base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
- base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
- DCHECK_OK(func->AddKernel(base));
- }
- for (const auto& ty : TemporalTypes()) {
- auto physical_type = GetPhysicalType(ty);
+ auto physical_type = GetPhysicalType(ty);
base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
- base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
- DCHECK_OK(func->AddKernel(base));
- }
- for (const auto id : DecimalTypeIds()) {
- base.signature = KernelSignature::Make({InputType::Array(id)}, uint64());
- base.exec = GenerateDecimal<ExecTemplate, UInt64Type>(id);
+ base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
DCHECK_OK(func->AddKernel(base));
}
+ for (const auto& ty : TemporalTypes()) {
+ auto physical_type = GetPhysicalType(ty);
+ base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
+ base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
+ DCHECK_OK(func->AddKernel(base));
+ }
+ for (const auto id : DecimalTypeIds()) {
+ base.signature = KernelSignature::Make({InputType::Array(id)}, uint64());
+ base.exec = GenerateDecimal<ExecTemplate, UInt64Type>(id);
+ DCHECK_OK(func->AddKernel(base));
+ }
for (const auto& ty : BaseBinaryTypes()) {
- auto physical_type = GetPhysicalType(ty);
+ auto physical_type = GetPhysicalType(ty);
base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
- base.exec = GenerateVarBinaryBase<ExecTemplate, UInt64Type>(*physical_type);
+ base.exec = GenerateVarBinaryBase<ExecTemplate, UInt64Type>(*physical_type);
DCHECK_OK(func->AddKernel(base));
}
- base.signature =
- KernelSignature::Make({InputType::Array(Type::FIXED_SIZE_BINARY)}, uint64());
- base.exec = ExecTemplate<UInt64Type, FixedSizeBinaryType>::Exec;
- DCHECK_OK(func->AddKernel(base));
+ base.signature =
+ KernelSignature::Make({InputType::Array(Type::FIXED_SIZE_BINARY)}, uint64());
+ base.exec = ExecTemplate<UInt64Type, FixedSizeBinaryType>::Exec;
+ DCHECK_OK(func->AddKernel(base));
}
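
AddSortingKernels registers one kernel per physical input type, all sharing the uint64 output signature; the resulting function is then reachable by name through the compute registry. A hedged usage sketch, assuming the usual public compute API of this codebase (exact headers and signatures may differ by version):

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> SortOneArray(const std::shared_ptr<arrow::Array>& arr) {
  // "array_sort_indices" dispatches to one of the kernels added above,
  // selected by the array's physical type.
  arrow::compute::ArraySortOptions options(arrow::compute::SortOrder::Descending);
  return arrow::compute::CallFunction("array_sort_indices",
                                      {arrow::Datum(arr)}, &options);
}
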
-// ----------------------------------------------------------------------
-// ChunkedArray sorting implementations
-
-// Sort a chunked array directly without sorting each array in the
-// chunked array. This is used for processing the second and following
-// sort keys in TableRadixSorter.
-//
-// This uses the same algorithm as ArrayCompareSorter.
-template <typename Type>
-class ChunkedArrayCompareSorter {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- public:
-  // Returns where nulls start.
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count,
- const ArraySortOptions& options) {
- auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
- indices_begin, indices_end, arrays, null_count);
- ChunkedArrayResolver resolver(arrays);
- if (options.order == SortOrder::Ascending) {
- std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
- const auto chunk_left = resolver.Resolve<ArrayType>(left);
- const auto chunk_right = resolver.Resolve<ArrayType>(right);
- return chunk_left.Value() < chunk_right.Value();
- });
- } else {
- std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
- const auto chunk_left = resolver.Resolve<ArrayType>(left);
- const auto chunk_right = resolver.Resolve<ArrayType>(right);
-        // We write 'right < left' instead of 'left > right' so that only
-        // 'operator<' is required of the value type.
- return chunk_right.Value() < chunk_left.Value();
- });
- }
- return nulls_begin;
- }
-};
-
-// Sort a chunked array by sorting each array in the chunked array.
-//
-// TODO: This is a naive implementation. Performance could be improved,
-// for example by sorting each array on a separate thread.
-class ChunkedArraySorter : public TypeVisitor {
- public:
- ChunkedArraySorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
- const ChunkedArray& chunked_array, const SortOrder order,
- bool can_use_array_sorter = true)
- : TypeVisitor(),
- indices_begin_(indices_begin),
- indices_end_(indices_end),
- chunked_array_(chunked_array),
- physical_type_(GetPhysicalType(chunked_array.type())),
- physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
- order_(order),
- can_use_array_sorter_(can_use_array_sorter),
- ctx_(ctx) {}
-
- Status Sort() { return physical_type_->Accept(this); }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- private:
- template <typename Type>
- Status SortInternal() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- ArraySortOptions options(order_);
- const auto num_chunks = chunked_array_.num_chunks();
- if (num_chunks == 0) {
- return Status::OK();
- }
- const auto arrays = GetArrayPointers(physical_chunks_);
- if (can_use_array_sorter_) {
-      // Sort each chunk independently, then merge the sorted indices.
-      // This is a serial implementation.
- ArraySorter<Type> sorter;
- struct SortedChunk {
- int64_t begin_offset;
- int64_t end_offset;
- int64_t nulls_offset;
- };
- std::vector<SortedChunk> sorted(num_chunks);
-
- // First sort all individual chunks
- int64_t begin_offset = 0;
- int64_t end_offset = 0;
- int64_t null_count = 0;
- for (int i = 0; i < num_chunks; ++i) {
- const auto array = checked_cast<const ArrayType*>(arrays[i]);
- end_offset += array->length();
- null_count += array->null_count();
- uint64_t* nulls_begin =
- sorter.impl.Sort(indices_begin_ + begin_offset, indices_begin_ + end_offset,
- *array, begin_offset, options);
- sorted[i] = {begin_offset, end_offset, nulls_begin - indices_begin_};
- begin_offset = end_offset;
- }
- DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
-
- std::unique_ptr<Buffer> temp_buffer;
- uint64_t* temp_indices = nullptr;
- if (sorted.size() > 1) {
- ARROW_ASSIGN_OR_RAISE(
- temp_buffer,
- AllocateBuffer(sizeof(int64_t) * (indices_end_ - indices_begin_ - null_count),
- ctx_->memory_pool()));
- temp_indices = reinterpret_cast<uint64_t*>(temp_buffer->mutable_data());
- }
-
-      // Then merge them pairwise, pass by pass
- while (sorted.size() > 1) {
- auto out_it = sorted.begin();
- auto it = sorted.begin();
- while (it < sorted.end() - 1) {
- const auto& left = *it++;
- const auto& right = *it++;
- DCHECK_EQ(left.end_offset, right.begin_offset);
- DCHECK_GE(left.nulls_offset, left.begin_offset);
- DCHECK_LE(left.nulls_offset, left.end_offset);
- DCHECK_GE(right.nulls_offset, right.begin_offset);
- DCHECK_LE(right.nulls_offset, right.end_offset);
- uint64_t* nulls_begin = Merge<ArrayType>(
- indices_begin_ + left.begin_offset, indices_begin_ + left.end_offset,
- indices_begin_ + right.end_offset, indices_begin_ + left.nulls_offset,
- indices_begin_ + right.nulls_offset, arrays, null_count, order_,
- temp_indices);
- *out_it++ = {left.begin_offset, right.end_offset, nulls_begin - indices_begin_};
- }
- if (it < sorted.end()) {
- *out_it++ = *it++;
- }
- sorted.erase(out_it, sorted.end());
- }
- DCHECK_EQ(sorted.size(), 1);
- DCHECK_EQ(sorted[0].begin_offset, 0);
- DCHECK_EQ(sorted[0].end_offset, chunked_array_.length());
- // Note that "nulls" can also include NaNs, hence the >= check
- DCHECK_GE(chunked_array_.length() - sorted[0].nulls_offset, null_count);
- } else {
-      // Sort the chunked array directly.
- ChunkedArrayCompareSorter<Type> sorter;
- sorter.Sort(indices_begin_, indices_end_, arrays, chunked_array_.null_count(),
- options);
- }
- return Status::OK();
- }
-
-  // Merges two sorted index ranges and returns where nulls start.
-  // The next merge uses that position to locate the already-sorted
-  // indices.
- template <typename ArrayType>
- uint64_t* Merge(uint64_t* indices_begin, uint64_t* indices_middle,
- uint64_t* indices_end, uint64_t* left_nulls_begin,
- uint64_t* right_nulls_begin, const std::vector<const Array*>& arrays,
- int64_t null_count, const SortOrder order, uint64_t* temp_indices) {
- // Input layout:
- // [left non-nulls .... left nulls .... right non-nulls .... right nulls]
- // ^ ^ ^ ^
- // | | | |
- // indices_begin left_nulls_begin indices_middle right_nulls_begin
- auto left_num_non_nulls = left_nulls_begin - indices_begin;
- auto right_num_non_nulls = right_nulls_begin - indices_middle;
-
- // Mutate the input, stably, to obtain the following layout:
- // [left non-nulls .... right non-nulls .... left nulls .... right nulls]
- // ^ ^ ^ ^
- // | | | |
- // indices_begin indices_middle nulls_begin right_nulls_begin
- std::rotate(left_nulls_begin, indices_middle, right_nulls_begin);
- auto nulls_begin = indices_begin + left_num_non_nulls + right_num_non_nulls;
- // If the type has null-like values (such as NaN), ensure those plus regular
- // nulls are partitioned in the right order. Note this assumes that all
- // null-like values (e.g. NaN) are ordered equally.
- if (NullTraits<typename ArrayType::TypeClass>::has_null_like_values) {
- PartitionNullsOnly<StablePartitioner>(nulls_begin, indices_end, arrays, null_count);
- }
-
- // Merge the non-null values into temp area
- indices_middle = indices_begin + left_num_non_nulls;
- indices_end = indices_middle + right_num_non_nulls;
- const ChunkedArrayResolver left_resolver(arrays);
- const ChunkedArrayResolver right_resolver(arrays);
- if (order == SortOrder::Ascending) {
- std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
- [&](uint64_t left, uint64_t right) {
- const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
- const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
- return chunk_left.Value() < chunk_right.Value();
- });
- } else {
- std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
- [&](uint64_t left, uint64_t right) {
- const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
- const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
-                   // We write 'right < left' instead of 'left > right' so
-                   // that only 'operator<' is required of the value type.
- return chunk_right.Value() < chunk_left.Value();
- });
- }
- // Copy back temp area into main buffer
- std::copy(temp_indices, temp_indices + (nulls_begin - indices_begin), indices_begin);
- return nulls_begin;
- }
-
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
- const ChunkedArray& chunked_array_;
- const std::shared_ptr<DataType> physical_type_;
- const ArrayVector physical_chunks_;
- const SortOrder order_;
- const bool can_use_array_sorter_;
- ExecContext* ctx_;
-};
-
-// ----------------------------------------------------------------------
-// Record batch sorting implementation(s)
-
-// Visit contiguous ranges of equal values. All entries are assumed
-// to be non-null.
-template <typename ArrayType, typename Visitor>
-void VisitConstantRanges(const ArrayType& array, uint64_t* indices_begin,
- uint64_t* indices_end, Visitor&& visit) {
- using GetView = GetViewType<typename ArrayType::TypeClass>;
-
- if (indices_begin == indices_end) {
- return;
- }
- auto range_start = indices_begin;
- auto range_cur = range_start;
- auto last_value = GetView::LogicalValue(array.GetView(*range_cur));
- while (++range_cur != indices_end) {
- auto v = GetView::LogicalValue(array.GetView(*range_cur));
- if (v != last_value) {
- visit(range_start, range_cur);
- range_start = range_cur;
- last_value = v;
- }
- }
- if (range_start != range_cur) {
- visit(range_start, range_cur);
- }
-}
-
-// A sorter for a single column of a RecordBatch, deferring to the next column
-// for ranges of equal values.
-class RecordBatchColumnSorter {
- public:
- explicit RecordBatchColumnSorter(RecordBatchColumnSorter* next_column = nullptr)
- : next_column_(next_column) {}
- virtual ~RecordBatchColumnSorter() {}
-
- virtual void SortRange(uint64_t* indices_begin, uint64_t* indices_end) = 0;
-
- protected:
- RecordBatchColumnSorter* next_column_;
-};
-
-template <typename Type>
-class ConcreteRecordBatchColumnSorter : public RecordBatchColumnSorter {
- public:
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- ConcreteRecordBatchColumnSorter(std::shared_ptr<Array> array, SortOrder order,
- RecordBatchColumnSorter* next_column = nullptr)
- : RecordBatchColumnSorter(next_column),
- owned_array_(std::move(array)),
- array_(checked_cast<const ArrayType&>(*owned_array_)),
- order_(order),
- null_count_(array_.null_count()) {}
-
- void SortRange(uint64_t* indices_begin, uint64_t* indices_end) {
- using GetView = GetViewType<Type>;
-
- constexpr int64_t offset = 0;
- uint64_t* nulls_begin;
- if (null_count_ == 0) {
- nulls_begin = indices_end;
- } else {
- // NOTE that null_count_ is merely an upper bound on the number of nulls
- // in this particular range.
- nulls_begin = PartitionNullsOnly<StablePartitioner>(indices_begin, indices_end,
- array_, offset);
- DCHECK_LE(indices_end - nulls_begin, null_count_);
- }
- uint64_t* null_likes_begin = PartitionNullLikes<ArrayType, StablePartitioner>(
- indices_begin, nulls_begin, array_, offset);
-
- // TODO This is roughly the same as ArrayCompareSorter.
- // Also, we would like to use a counting sort if possible. This requires
- // a counting sort compatible with indirect indexing.
- if (order_ == SortOrder::Ascending) {
- std::stable_sort(
- indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
- const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
- const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
- return lhs < rhs;
- });
- } else {
- std::stable_sort(
- indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
-            // We write 'rhs < lhs' instead of 'lhs > rhs' so that only
-            // 'operator<' is required of the value type.
-            const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
-            const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
-            return rhs < lhs;
- });
- }
-
- if (next_column_ != nullptr) {
- // Visit all ranges of equal values in this column and sort them on
- // the next column.
- SortNextColumn(null_likes_begin, nulls_begin);
- SortNextColumn(nulls_begin, indices_end);
- VisitConstantRanges(array_, indices_begin, null_likes_begin,
- [&](uint64_t* range_start, uint64_t* range_end) {
- SortNextColumn(range_start, range_end);
- });
- }
- }
-
- void SortNextColumn(uint64_t* indices_begin, uint64_t* indices_end) {
- // Avoid the cost of a virtual method call in trivial cases
- if (indices_end - indices_begin > 1) {
- next_column_->SortRange(indices_begin, indices_end);
- }
- }
-
- protected:
- const std::shared_ptr<Array> owned_array_;
- const ArrayType& array_;
- const SortOrder order_;
- const int64_t null_count_;
-};
-
-// Sort a batch using a single-pass left-to-right radix sort.
-class RadixRecordBatchSorter {
- public:
- RadixRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
- const RecordBatch& batch, const SortOptions& options)
- : batch_(batch),
- options_(options),
- indices_begin_(indices_begin),
- indices_end_(indices_end) {}
-
- Status Sort() {
- ARROW_ASSIGN_OR_RAISE(const auto sort_keys,
- ResolveSortKeys(batch_, options_.sort_keys));
-
- // Create column sorters from right to left
- std::vector<std::unique_ptr<RecordBatchColumnSorter>> column_sorts(sort_keys.size());
- RecordBatchColumnSorter* next_column = nullptr;
- for (int64_t i = static_cast<int64_t>(sort_keys.size() - 1); i >= 0; --i) {
- ColumnSortFactory factory(sort_keys[i], next_column);
- ARROW_ASSIGN_OR_RAISE(column_sorts[i], factory.MakeColumnSort());
- next_column = column_sorts[i].get();
- }
-
- // Sort from left to right
- column_sorts.front()->SortRange(indices_begin_, indices_end_);
- return Status::OK();
- }
-
- protected:
- struct ResolvedSortKey {
- std::shared_ptr<Array> array;
- SortOrder order;
- };
-
- struct ColumnSortFactory {
- ColumnSortFactory(const ResolvedSortKey& sort_key,
- RecordBatchColumnSorter* next_column)
- : physical_type(GetPhysicalType(sort_key.array->type())),
- array(GetPhysicalArray(*sort_key.array, physical_type)),
- order(sort_key.order),
- next_column(next_column) {}
-
- Result<std::unique_ptr<RecordBatchColumnSorter>> MakeColumnSort() {
- RETURN_NOT_OK(VisitTypeInline(*physical_type, this));
- DCHECK_NE(result, nullptr);
- return std::move(result);
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) { return VisitGeneric(type); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- Status Visit(const DataType& type) {
- return Status::TypeError("Unsupported type for RecordBatch sorting: ",
- type.ToString());
- }
-
- template <typename Type>
- Status VisitGeneric(const Type&) {
- result.reset(new ConcreteRecordBatchColumnSorter<Type>(array, order, next_column));
- return Status::OK();
- }
-
- std::shared_ptr<DataType> physical_type;
- std::shared_ptr<Array> array;
- SortOrder order;
- RecordBatchColumnSorter* next_column;
- std::unique_ptr<RecordBatchColumnSorter> result;
- };
-
- static Result<std::vector<ResolvedSortKey>> ResolveSortKeys(
- const RecordBatch& batch, const std::vector<SortKey>& sort_keys) {
- std::vector<ResolvedSortKey> resolved;
- resolved.reserve(sort_keys.size());
- for (const auto& sort_key : sort_keys) {
- auto array = batch.GetColumnByName(sort_key.name);
- if (!array) {
- return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- }
- resolved.push_back({std::move(array), sort_key.order});
- }
- return resolved;
- }
-
- const RecordBatch& batch_;
- const SortOptions& options_;
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
-};
-
-// Compare two records in the same RecordBatch or Table
-// (indexing is handled through ResolvedSortKey)
-template <typename ResolvedSortKey>
-class MultipleKeyComparator {
- public:
- explicit MultipleKeyComparator(const std::vector<ResolvedSortKey>& sort_keys)
- : sort_keys_(sort_keys) {}
-
- Status status() const { return status_; }
-
- // Returns true if the left-th value should be ordered before the
- // right-th value, false otherwise. The start_sort_key_index-th
- // sort key and subsequent sort keys are used for comparison.
- bool Compare(uint64_t left, uint64_t right, size_t start_sort_key_index) {
- current_left_ = left;
- current_right_ = right;
- current_compared_ = 0;
- auto num_sort_keys = sort_keys_.size();
- for (size_t i = start_sort_key_index; i < num_sort_keys; ++i) {
- current_sort_key_index_ = i;
- status_ = VisitTypeInline(*sort_keys_[i].type, this);
-      // If the left value equals the right value, we need to
-      // continue with the next sort key.
- if (current_compared_ != 0) {
- break;
- }
- }
- return current_compared_ < 0;
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) { \
- current_compared_ = CompareType<TYPE>(); \
- return Status::OK(); \
- }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- Status Visit(const DataType& type) {
- return Status::TypeError("Unsupported type for RecordBatch sorting: ",
- type.ToString());
- }
-
- private:
-  // Compares two records in the same table and returns -1, 0 or 1.
-  //
-  // -1: The left is less than the right.
-  // 0: The left equals the right.
-  // 1: The left is greater than the right.
-  //
-  // This supports null and NaN. Nulls are handled here and NaNs are
-  // handled in CompareTypeValue().
- template <typename Type>
- int32_t CompareType() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- const auto& sort_key = sort_keys_[current_sort_key_index_];
- auto order = sort_key.order;
- const auto chunk_left = sort_key.template GetChunk<ArrayType>(current_left_);
- const auto chunk_right = sort_key.template GetChunk<ArrayType>(current_right_);
- if (sort_key.null_count > 0) {
- auto is_null_left = chunk_left.IsNull();
- auto is_null_right = chunk_right.IsNull();
- if (is_null_left && is_null_right) {
- return 0;
- } else if (is_null_left) {
- return 1;
- } else if (is_null_right) {
- return -1;
- }
- }
- return CompareTypeValue<Type>(chunk_left, chunk_right, order);
- }
-
- // For non-float types. Value is never NaN.
- template <typename Type>
- enable_if_t<!is_floating_type<Type>::value, int32_t> CompareTypeValue(
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
- const SortOrder order) {
- const auto left = chunk_left.Value();
- const auto right = chunk_right.Value();
- int32_t compared;
- if (left == right) {
- compared = 0;
- } else if (left > right) {
- compared = 1;
- } else {
- compared = -1;
- }
- if (order == SortOrder::Descending) {
- compared = -compared;
- }
- return compared;
- }
-
- // For float types. Value may be NaN.
- template <typename Type>
- enable_if_t<is_floating_type<Type>::value, int32_t> CompareTypeValue(
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
- const SortOrder order) {
- const auto left = chunk_left.Value();
- const auto right = chunk_right.Value();
- auto is_nan_left = std::isnan(left);
- auto is_nan_right = std::isnan(right);
- if (is_nan_left && is_nan_right) {
- return 0;
- } else if (is_nan_left) {
- return 1;
- } else if (is_nan_right) {
- return -1;
- }
- int32_t compared;
- if (left == right) {
- compared = 0;
- } else if (left > right) {
- compared = 1;
- } else {
- compared = -1;
- }
- if (order == SortOrder::Descending) {
- compared = -compared;
- }
- return compared;
- }
-
- const std::vector<ResolvedSortKey>& sort_keys_;
- Status status_;
- int64_t current_left_;
- int64_t current_right_;
- size_t current_sort_key_index_;
- int32_t current_compared_;
-};
-
-// Sort a batch using a single sort and multiple-key comparisons.
-class MultipleKeyRecordBatchSorter : public TypeVisitor {
- private:
- // Preprocessed sort key.
- struct ResolvedSortKey {
- ResolvedSortKey(const std::shared_ptr<Array>& array, const SortOrder order)
- : type(GetPhysicalType(array->type())),
- owned_array(GetPhysicalArray(*array, type)),
- array(*owned_array),
- order(order),
- null_count(array->null_count()) {}
-
- template <typename ArrayType>
- ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
- return {&checked_cast<const ArrayType&>(array), index};
- }
-
- const std::shared_ptr<DataType> type;
- std::shared_ptr<Array> owned_array;
- const Array& array;
- SortOrder order;
- int64_t null_count;
- };
-
- using Comparator = MultipleKeyComparator<ResolvedSortKey>;
-
- public:
- MultipleKeyRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
- const RecordBatch& batch, const SortOptions& options)
- : indices_begin_(indices_begin),
- indices_end_(indices_end),
- sort_keys_(ResolveSortKeys(batch, options.sort_keys, &status_)),
- comparator_(sort_keys_) {}
-
-  // This is optimized for the first sort key, which is processed in
-  // this class. The second and following sort keys are processed in
-  // Comparator.
- Status Sort() {
- RETURN_NOT_OK(status_);
- return sort_keys_[0].type->Accept(this);
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- private:
- static std::vector<ResolvedSortKey> ResolveSortKeys(
- const RecordBatch& batch, const std::vector<SortKey>& sort_keys, Status* status) {
- std::vector<ResolvedSortKey> resolved;
- for (const auto& sort_key : sort_keys) {
- auto array = batch.GetColumnByName(sort_key.name);
- if (!array) {
- *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- break;
- }
- resolved.emplace_back(array, sort_key.order);
- }
- return resolved;
- }
-
- template <typename Type>
- Status SortInternal() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- auto& comparator = comparator_;
- const auto& first_sort_key = sort_keys_[0];
- const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
- auto nulls_begin = indices_end_;
- nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
- // Sort first-key non-nulls
- std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
- // Both values are never null nor NaN
- // (otherwise they've been partitioned away above).
- const auto value_left = array.GetView(left);
- const auto value_right = array.GetView(right);
- if (value_left != value_right) {
- bool compared = value_left < value_right;
- if (first_sort_key.order == SortOrder::Ascending) {
- return compared;
- } else {
- return !compared;
- }
- }
-      // If the left value equals the right value,
-      // we need to compare the second and following
-      // sort keys.
- return comparator.Compare(left, right, 1);
- });
- return comparator_.status();
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For non-float types.
- template <typename Type>
- enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- if (first_sort_key.null_count == 0) {
- return indices_end_;
- }
- const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
- StablePartitioner partitioner;
- auto nulls_begin = partitioner(indices_begin_, indices_end_,
- [&](uint64_t index) { return !array.IsNull(index); });
- // Sort all nulls by second and following sort keys
- // TODO: could we instead run an independent sort from the second key on
- // this slice?
- if (nulls_begin != indices_end_) {
- auto& comparator = comparator_;
- std::stable_sort(nulls_begin, indices_end_,
- [&comparator](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- }
- return nulls_begin;
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For float types.
- template <typename Type>
- enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
- StablePartitioner partitioner;
- uint64_t* nulls_begin;
- if (first_sort_key.null_count == 0) {
- nulls_begin = indices_end_;
- } else {
- nulls_begin = partitioner(indices_begin_, indices_end_,
- [&](uint64_t index) { return !array.IsNull(index); });
- }
- uint64_t* nans_and_nulls_begin =
- partitioner(indices_begin_, nulls_begin,
- [&](uint64_t index) { return !std::isnan(array.GetView(index)); });
- auto& comparator = comparator_;
- if (nans_and_nulls_begin != nulls_begin) {
- // Sort all NaNs by the second and following sort keys.
- // TODO: could we instead run an independent sort from the second key on
- // this slice?
- std::stable_sort(nans_and_nulls_begin, nulls_begin,
- [&comparator](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- }
- if (nulls_begin != indices_end_) {
- // Sort all nulls by the second and following sort keys.
- // TODO: could we instead run an independent sort from the second key on
- // this slice?
- std::stable_sort(nulls_begin, indices_end_,
- [&comparator](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- }
- return nans_and_nulls_begin;
- }
-
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
- Status status_;
- std::vector<ResolvedSortKey> sort_keys_;
- Comparator comparator_;
-};
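
The partition-then-sort scheme above reduces to a simple pattern: stably move nulls (and, for float keys, NaNs) to the back, then stable-sort the remaining prefix. A minimal standalone sketch with std::stable_partition over plain data (illustrative only; multi-key tie-breaking is omitted):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch: stably move NaNs to the back, then sort the non-NaN prefix.
// Nulls would be handled the same way, one partition earlier.
std::vector<uint64_t> SortWithNansLast(const std::vector<double>& values) {
  std::vector<uint64_t> indices(values.size());
  std::iota(indices.begin(), indices.end(), 0);
  auto nans_begin = std::stable_partition(
      indices.begin(), indices.end(),
      [&](uint64_t i) { return !std::isnan(values[i]); });
  std::stable_sort(indices.begin(), nans_begin,
                   [&](uint64_t l, uint64_t r) { return values[l] < values[r]; });
  return indices;
}
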
-
-// ----------------------------------------------------------------------
-// Table sorting implementations
-
-// Sort a table using a radix sort-like algorithm.
-// A distinct stable sort is called for each sort key, from the last key to the first.
-class TableRadixSorter {
- public:
- Status Sort(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
- const Table& table, const SortOptions& options) {
- for (auto i = options.sort_keys.size(); i > 0; --i) {
- const auto& sort_key = options.sort_keys[i - 1];
- const auto& chunked_array = table.GetColumnByName(sort_key.name);
- if (!chunked_array) {
- return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- }
- // We can use ArraySorter only for the sort key that is
- // processed first because ArraySorter doesn't care about
- // existing indices.
-      const auto can_use_array_sorter = (i == options.sort_keys.size());
- ChunkedArraySorter sorter(ctx, indices_begin, indices_end, *chunked_array.get(),
- sort_key.order, can_use_array_sorter);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- }
- return Status::OK();
- }
-};
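
TableRadixSorter relies on a classic property: applying a stable sort per key, from the last key to the first, yields lexicographic order, because each later pass preserves the relative order established by earlier passes. A minimal standalone sketch over plain vectors (illustrative only; assumes at least one key):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch: one stable sort per key, least significant (last) key first.
std::vector<uint64_t> RadixSortByKeys(
    const std::vector<const std::vector<int>*>& keys /* first key first */) {
  std::vector<uint64_t> indices(keys.front()->size());
  std::iota(indices.begin(), indices.end(), 0);
  for (auto it = keys.rbegin(); it != keys.rend(); ++it) {
    const std::vector<int>& key = **it;
    // Stability preserves the ordering from previously sorted (later) keys.
    std::stable_sort(indices.begin(), indices.end(),
                     [&](uint64_t l, uint64_t r) { return key[l] < key[r]; });
  }
  return indices;
}
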
-
-// Sort a table using a single sort and multiple-key comparisons.
-class MultipleKeyTableSorter : public TypeVisitor {
- private:
- // TODO instead of resolving chunks for each column independently, we could
- // split the table into RecordBatches and pay the cost of chunked indexing
- // at the first column only.
-
- // Preprocessed sort key.
- struct ResolvedSortKey {
- ResolvedSortKey(const ChunkedArray& chunked_array, const SortOrder order)
- : order(order),
- type(GetPhysicalType(chunked_array.type())),
- chunks(GetPhysicalChunks(chunked_array, type)),
- chunk_pointers(GetArrayPointers(chunks)),
- null_count(chunked_array.null_count()),
- num_chunks(chunked_array.num_chunks()),
- resolver(chunk_pointers) {}
-
-    // Finds the target chunk and the index within that chunk from an
-    // index into the chunked array.
- template <typename ArrayType>
- ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
- return resolver.Resolve<ArrayType>(index);
- }
-
- const SortOrder order;
- const std::shared_ptr<DataType> type;
- const ArrayVector chunks;
- const std::vector<const Array*> chunk_pointers;
- const int64_t null_count;
- const int num_chunks;
- const ChunkedArrayResolver resolver;
- };
-
- using Comparator = MultipleKeyComparator<ResolvedSortKey>;
-
- public:
- MultipleKeyTableSorter(uint64_t* indices_begin, uint64_t* indices_end,
- const Table& table, const SortOptions& options)
- : indices_begin_(indices_begin),
- indices_end_(indices_end),
- sort_keys_(ResolveSortKeys(table, options.sort_keys, &status_)),
- comparator_(sort_keys_) {}
-
-  // This is optimized for the first sort key, which is processed in
-  // this class. The second and following sort keys are processed in
-  // Comparator.
- Status Sort() {
- ARROW_RETURN_NOT_OK(status_);
- return sort_keys_[0].type->Accept(this);
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- private:
- static std::vector<ResolvedSortKey> ResolveSortKeys(
- const Table& table, const std::vector<SortKey>& sort_keys, Status* status) {
- std::vector<ResolvedSortKey> resolved;
- resolved.reserve(sort_keys.size());
- for (const auto& sort_key : sort_keys) {
- const auto& chunked_array = table.GetColumnByName(sort_key.name);
- if (!chunked_array) {
- *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- break;
- }
- resolved.emplace_back(*chunked_array, sort_key.order);
- }
- return resolved;
- }
-
- template <typename Type>
- Status SortInternal() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- auto& comparator = comparator_;
- const auto& first_sort_key = sort_keys_[0];
- auto nulls_begin = indices_end_;
- nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
- std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
- // Both values are never null nor NaN.
- auto chunk_left = first_sort_key.GetChunk<ArrayType>(left);
- auto chunk_right = first_sort_key.GetChunk<ArrayType>(right);
- auto value_left = chunk_left.Value();
- auto value_right = chunk_right.Value();
- if (value_left == value_right) {
-        // If the left value equals the right value,
-        // we need to compare the second and following
-        // sort keys.
- return comparator.Compare(left, right, 1);
- } else {
- auto compared = value_left < value_right;
- if (first_sort_key.order == SortOrder::Ascending) {
- return compared;
- } else {
- return !compared;
- }
- }
- });
- return comparator_.status();
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For non-float types.
- template <typename Type>
- enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- if (first_sort_key.null_count == 0) {
- return indices_end_;
- }
- StablePartitioner partitioner;
- auto nulls_begin =
- partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) {
- const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
- return !chunk.IsNull();
- });
- DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
- auto& comparator = comparator_;
- std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- return nulls_begin;
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For float types.
- template <typename Type>
- enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- StablePartitioner partitioner;
- uint64_t* nulls_begin;
- if (first_sort_key.null_count == 0) {
- nulls_begin = indices_end_;
- } else {
- nulls_begin = partitioner(indices_begin_, indices_end_, [&](uint64_t index) {
- const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
- return !chunk.IsNull();
- });
- }
- DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
- uint64_t* nans_begin = partitioner(indices_begin_, nulls_begin, [&](uint64_t index) {
- const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
- return !std::isnan(chunk.Value());
- });
- auto& comparator = comparator_;
- // Sort all NaNs by the second and following sort keys.
- std::stable_sort(nans_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- // Sort all nulls by the second and following sort keys.
- std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- return nans_begin;
- }
-
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
- Status status_;
- std::vector<ResolvedSortKey> sort_keys_;
- Comparator comparator_;
-};
-
-// ----------------------------------------------------------------------
-// Top-level sort functions
-
-const auto kDefaultSortOptions = SortOptions::Defaults();
-
-const FunctionDoc sort_indices_doc(
- "Return the indices that would sort an array, record batch or table",
- ("This function computes an array of indices that define a stable sort\n"
- "of the input array, record batch or table. Null values are considered\n"
- "greater than any other value and are therefore sorted at the end of the\n"
- "input. For floating-point types, NaNs are considered greater than any\n"
- "other non-null value, but smaller than null values."),
- {"input"}, "SortOptions");
-
-class SortIndicesMetaFunction : public MetaFunction {
- public:
- SortIndicesMetaFunction()
- : MetaFunction("sort_indices", Arity::Unary(), &sort_indices_doc,
- &kDefaultSortOptions) {}
-
- Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
- const FunctionOptions* options,
- ExecContext* ctx) const override {
- const SortOptions& sort_options = static_cast<const SortOptions&>(*options);
- switch (args[0].kind()) {
- case Datum::ARRAY:
- return SortIndices(*args[0].make_array(), sort_options, ctx);
- break;
- case Datum::CHUNKED_ARRAY:
- return SortIndices(*args[0].chunked_array(), sort_options, ctx);
- break;
- case Datum::RECORD_BATCH: {
- return SortIndices(*args[0].record_batch(), sort_options, ctx);
- } break;
- case Datum::TABLE:
- return SortIndices(*args[0].table(), sort_options, ctx);
- break;
- default:
- break;
- }
- return Status::NotImplemented(
- "Unsupported types for sort_indices operation: "
- "values=",
- args[0].ToString());
- }
-
- private:
- Result<Datum> SortIndices(const Array& values, const SortOptions& options,
- ExecContext* ctx) const {
- SortOrder order = SortOrder::Ascending;
- if (!options.sort_keys.empty()) {
- order = options.sort_keys[0].order;
- }
- ArraySortOptions array_options(order);
- return CallFunction("array_sort_indices", {values}, &array_options, ctx);
- }
-
- Result<Datum> SortIndices(const ChunkedArray& chunked_array, const SortOptions& options,
- ExecContext* ctx) const {
- SortOrder order = SortOrder::Ascending;
- if (!options.sort_keys.empty()) {
- order = options.sort_keys[0].order;
- }
-
- auto out_type = uint64();
- auto length = chunked_array.length();
- auto buffer_size = BitUtil::BytesForBits(
- length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
- std::vector<std::shared_ptr<Buffer>> buffers(2);
- ARROW_ASSIGN_OR_RAISE(buffers[1],
- AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
- auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
- auto out_begin = out->GetMutableValues<uint64_t>(1);
- auto out_end = out_begin + length;
- std::iota(out_begin, out_end, 0);
-
- ChunkedArraySorter sorter(ctx, out_begin, out_end, chunked_array, order);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- return Datum(out);
- }
-
- Result<Datum> SortIndices(const RecordBatch& batch, const SortOptions& options,
- ExecContext* ctx) const {
- auto n_sort_keys = options.sort_keys.size();
- if (n_sort_keys == 0) {
- return Status::Invalid("Must specify one or more sort keys");
- }
- if (n_sort_keys == 1) {
- auto array = batch.GetColumnByName(options.sort_keys[0].name);
- if (!array) {
- return Status::Invalid("Nonexistent sort key column: ",
- options.sort_keys[0].name);
- }
- return SortIndices(*array, options, ctx);
- }
-
- auto out_type = uint64();
- auto length = batch.num_rows();
- auto buffer_size = BitUtil::BytesForBits(
- length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
- BufferVector buffers(2);
- ARROW_ASSIGN_OR_RAISE(buffers[1],
- AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
- auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
- auto out_begin = out->GetMutableValues<uint64_t>(1);
- auto out_end = out_begin + length;
- std::iota(out_begin, out_end, 0);
-
- // Radix sorting is consistently faster except when there is a large number
- // of sort keys, in which case it can end up degrading catastrophically.
- // Cut off above 8 sort keys.
- if (n_sort_keys <= 8) {
- RadixRecordBatchSorter sorter(out_begin, out_end, batch, options);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- } else {
- MultipleKeyRecordBatchSorter sorter(out_begin, out_end, batch, options);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- }
- return Datum(out);
- }
-
- Result<Datum> SortIndices(const Table& table, const SortOptions& options,
- ExecContext* ctx) const {
- auto n_sort_keys = options.sort_keys.size();
- if (n_sort_keys == 0) {
- return Status::Invalid("Must specify one or more sort keys");
- }
- if (n_sort_keys == 1) {
- auto chunked_array = table.GetColumnByName(options.sort_keys[0].name);
- if (!chunked_array) {
- return Status::Invalid("Nonexistent sort key column: ",
- options.sort_keys[0].name);
- }
- return SortIndices(*chunked_array, options, ctx);
- }
-
- auto out_type = uint64();
- auto length = table.num_rows();
- auto buffer_size = BitUtil::BytesForBits(
- length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
- std::vector<std::shared_ptr<Buffer>> buffers(2);
- ARROW_ASSIGN_OR_RAISE(buffers[1],
- AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
- auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
- auto out_begin = out->GetMutableValues<uint64_t>(1);
- auto out_end = out_begin + length;
- std::iota(out_begin, out_end, 0);
-
-    // TODO: We should choose a suitable sort implementation
-    // automatically. The current TableRadixSorter implementation is
-    // faster than MultipleKeyTableSorter only when the number of
-    // sort keys is 2 and counting sort is used. So we always use
-    // MultipleKeyTableSorter for now.
- //
- // TableRadixSorter sorter;
- // ARROW_RETURN_NOT_OK(sorter.Sort(ctx, out_begin, out_end, table, options));
- MultipleKeyTableSorter sorter(out_begin, out_end, table, options);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- return Datum(out);
- }
-};
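
For reference, a hedged usage sketch of the meta function above through the generic CallFunction entry point; the column names are illustrative and the exact headers may differ by version:

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> SortTable(const std::shared_ptr<arrow::Table>& table) {
  arrow::compute::SortOptions options(
      {arrow::compute::SortKey("a", arrow::compute::SortOrder::Ascending),
       arrow::compute::SortKey("b", arrow::compute::SortOrder::Descending)});
  // Returns a uint64 array of indices defining a stable sort of the rows.
  return arrow::compute::CallFunction("sort_indices", {arrow::Datum(table)},
                                      &options);
}
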
-
-const auto kDefaultArraySortOptions = ArraySortOptions::Defaults();
-
-const FunctionDoc array_sort_indices_doc(
- "Return the indices that would sort an array",
- ("This function computes an array of indices that define a stable sort\n"
- "of the input array. Null values are considered greater than any\n"
- "other value and are therefore sorted at the end of the array.\n"
- "For floating-point types, NaNs are considered greater than any\n"
- "other non-null value, but smaller than null values."),
- {"array"}, "ArraySortOptions");
-
-const FunctionDoc partition_nth_indices_doc(
- "Return the indices that would partition an array around a pivot",
- ("This functions computes an array of indices that define a non-stable\n"
- "partial sort of the input array.\n"
- "\n"
- "The output is such that the `N`'th index points to the `N`'th element\n"
- "of the input in sorted order, and all indices before the `N`'th point\n"
- "to elements in the input less or equal to elements at or after the `N`'th.\n"
- "\n"
- "Null values are considered greater than any other value and are\n"
- "therefore partitioned towards the end of the array.\n"
- "For floating-point types, NaNs are considered greater than any\n"
- "other non-null value, but smaller than null values.\n"
- "\n"
- "The pivot index `N` must be given in PartitionNthOptions."),
- {"array"}, "PartitionNthOptions");
-
-} // namespace
-
+// ----------------------------------------------------------------------
+// ChunkedArray sorting implementations
+
+// Sort a chunked array directly without sorting each array in the
+// chunked array. This is used for processing the second and following
+// sort keys in TableRadixSorter.
+//
+// This uses the same algorithm as ArrayCompareSorter.
+template <typename Type>
+class ChunkedArrayCompareSorter {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ public:
+  // Returns where nulls start.
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count,
+ const ArraySortOptions& options) {
+ auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
+ indices_begin, indices_end, arrays, null_count);
+ ChunkedArrayResolver resolver(arrays);
+ if (options.order == SortOrder::Ascending) {
+ std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = resolver.Resolve<ArrayType>(right);
+ return chunk_left.Value() < chunk_right.Value();
+ });
+ } else {
+ std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = resolver.Resolve<ArrayType>(right);
+        // We write 'right < left' instead of 'left > right' so that only
+        // 'operator<' is required of the value type.
+ return chunk_right.Value() < chunk_left.Value();
+ });
+ }
+ return nulls_begin;
+ }
+};
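
The sorter above compares across chunk boundaries by resolving each logical index to a (chunk, in-chunk offset) pair. A minimal standalone sketch of that resolution over plain vectors; Arrow's ChunkedArrayResolver amortizes the lookup with a binary search over cached offsets, while the linear scan here just keeps the sketch short (illustrative only):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

struct Resolved {
  const std::vector<int>* chunk;
  size_t offset;
  int Value() const { return (*chunk)[offset]; }
};

// Map a logical index over several chunks to (chunk, offset).
Resolved Resolve(const std::vector<const std::vector<int>*>& chunks,
                 uint64_t index) {
  for (const auto* chunk : chunks) {
    if (index < chunk->size()) return {chunk, static_cast<size_t>(index)};
    index -= chunk->size();
  }
  return {nullptr, 0};  // out of bounds; not expected here
}

// Sort logical indices across all chunks by comparing through the resolver.
std::vector<uint64_t> SortAcrossChunks(
    const std::vector<const std::vector<int>*>& chunks, uint64_t total_length) {
  std::vector<uint64_t> indices(total_length);  // total_length = sum of sizes
  std::iota(indices.begin(), indices.end(), 0);
  std::stable_sort(indices.begin(), indices.end(),
                   [&](uint64_t left, uint64_t right) {
                     return Resolve(chunks, left).Value() <
                            Resolve(chunks, right).Value();
                   });
  return indices;
}
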
+
+// Sort a chunked array by sorting each array in the chunked array.
+//
+// TODO: This is a naive implementation. Performance could be improved,
+// for example by sorting each array on a separate thread.
+class ChunkedArraySorter : public TypeVisitor {
+ public:
+ ChunkedArraySorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+ const ChunkedArray& chunked_array, const SortOrder order,
+ bool can_use_array_sorter = true)
+ : TypeVisitor(),
+ indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ chunked_array_(chunked_array),
+ physical_type_(GetPhysicalType(chunked_array.type())),
+ physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+ order_(order),
+ can_use_array_sorter_(can_use_array_sorter),
+ ctx_(ctx) {}
+
+ Status Sort() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ ArraySortOptions options(order_);
+ const auto num_chunks = chunked_array_.num_chunks();
+ if (num_chunks == 0) {
+ return Status::OK();
+ }
+ const auto arrays = GetArrayPointers(physical_chunks_);
+ if (can_use_array_sorter_) {
+      // Sort each chunk independently, then merge the sorted indices.
+      // This is a serial implementation.
+ ArraySorter<Type> sorter;
+ struct SortedChunk {
+ int64_t begin_offset;
+ int64_t end_offset;
+ int64_t nulls_offset;
+ };
+ std::vector<SortedChunk> sorted(num_chunks);
+
+ // First sort all individual chunks
+ int64_t begin_offset = 0;
+ int64_t end_offset = 0;
+ int64_t null_count = 0;
+ for (int i = 0; i < num_chunks; ++i) {
+ const auto array = checked_cast<const ArrayType*>(arrays[i]);
+ end_offset += array->length();
+ null_count += array->null_count();
+ uint64_t* nulls_begin =
+ sorter.impl.Sort(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+ *array, begin_offset, options);
+ sorted[i] = {begin_offset, end_offset, nulls_begin - indices_begin_};
+ begin_offset = end_offset;
+ }
+ DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+ std::unique_ptr<Buffer> temp_buffer;
+ uint64_t* temp_indices = nullptr;
+ if (sorted.size() > 1) {
+ ARROW_ASSIGN_OR_RAISE(
+ temp_buffer,
+ AllocateBuffer(sizeof(int64_t) * (indices_end_ - indices_begin_ - null_count),
+ ctx_->memory_pool()));
+ temp_indices = reinterpret_cast<uint64_t*>(temp_buffer->mutable_data());
+ }
+
+      // Then merge them pairwise, pass by pass
+ while (sorted.size() > 1) {
+ auto out_it = sorted.begin();
+ auto it = sorted.begin();
+ while (it < sorted.end() - 1) {
+ const auto& left = *it++;
+ const auto& right = *it++;
+ DCHECK_EQ(left.end_offset, right.begin_offset);
+ DCHECK_GE(left.nulls_offset, left.begin_offset);
+ DCHECK_LE(left.nulls_offset, left.end_offset);
+ DCHECK_GE(right.nulls_offset, right.begin_offset);
+ DCHECK_LE(right.nulls_offset, right.end_offset);
+ uint64_t* nulls_begin = Merge<ArrayType>(
+ indices_begin_ + left.begin_offset, indices_begin_ + left.end_offset,
+ indices_begin_ + right.end_offset, indices_begin_ + left.nulls_offset,
+ indices_begin_ + right.nulls_offset, arrays, null_count, order_,
+ temp_indices);
+ *out_it++ = {left.begin_offset, right.end_offset, nulls_begin - indices_begin_};
+ }
+ if (it < sorted.end()) {
+ *out_it++ = *it++;
+ }
+ sorted.erase(out_it, sorted.end());
+ }
+ DCHECK_EQ(sorted.size(), 1);
+ DCHECK_EQ(sorted[0].begin_offset, 0);
+ DCHECK_EQ(sorted[0].end_offset, chunked_array_.length());
+ // Note that "nulls" can also include NaNs, hence the >= check
+ DCHECK_GE(chunked_array_.length() - sorted[0].nulls_offset, null_count);
+ } else {
+      // Sort the chunked array directly.
+ ChunkedArrayCompareSorter<Type> sorter;
+ sorter.Sort(indices_begin_, indices_end_, arrays, chunked_array_.null_count(),
+ options);
+ }
+ return Status::OK();
+ }
+
+  // Merges two sorted index ranges and returns where nulls start.
+  // The next merge uses that position to locate the already-sorted
+  // indices.
+ template <typename ArrayType>
+ uint64_t* Merge(uint64_t* indices_begin, uint64_t* indices_middle,
+ uint64_t* indices_end, uint64_t* left_nulls_begin,
+ uint64_t* right_nulls_begin, const std::vector<const Array*>& arrays,
+ int64_t null_count, const SortOrder order, uint64_t* temp_indices) {
+ // Input layout:
+ // [left non-nulls .... left nulls .... right non-nulls .... right nulls]
+ // ^ ^ ^ ^
+ // | | | |
+ // indices_begin left_nulls_begin indices_middle right_nulls_begin
+ auto left_num_non_nulls = left_nulls_begin - indices_begin;
+ auto right_num_non_nulls = right_nulls_begin - indices_middle;
+
+ // Mutate the input, stably, to obtain the following layout:
+ // [left non-nulls .... right non-nulls .... left nulls .... right nulls]
+ // ^ ^ ^ ^
+ // | | | |
+ // indices_begin indices_middle nulls_begin right_nulls_begin
+ std::rotate(left_nulls_begin, indices_middle, right_nulls_begin);
+ auto nulls_begin = indices_begin + left_num_non_nulls + right_num_non_nulls;
+ // If the type has null-like values (such as NaN), ensure those plus regular
+ // nulls are partitioned in the right order. Note this assumes that all
+ // null-like values (e.g. NaN) are ordered equally.
+ if (NullTraits<typename ArrayType::TypeClass>::has_null_like_values) {
+ PartitionNullsOnly<StablePartitioner>(nulls_begin, indices_end, arrays, null_count);
+ }
+
+ // Merge the non-null values into temp area
+ indices_middle = indices_begin + left_num_non_nulls;
+ indices_end = indices_middle + right_num_non_nulls;
+ const ChunkedArrayResolver left_resolver(arrays);
+ const ChunkedArrayResolver right_resolver(arrays);
+ if (order == SortOrder::Ascending) {
+ std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
+ [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
+ return chunk_left.Value() < chunk_right.Value();
+ });
+ } else {
+ std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
+ [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
+                   // We write 'right < left' instead of 'left > right' so
+                   // that only 'operator<' is required of the value type.
+ return chunk_right.Value() < chunk_left.Value();
+ });
+ }
+ // Copy back temp area into main buffer
+ std::copy(temp_indices, temp_indices + (nulls_begin - indices_begin), indices_begin);
+ return nulls_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ const ChunkedArray& chunked_array_;
+ const std::shared_ptr<DataType> physical_type_;
+ const ArrayVector physical_chunks_;
+ const SortOrder order_;
+ const bool can_use_array_sorter_;
+ ExecContext* ctx_;
+};
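
The merge step above is ordinary two-run merging through a scratch buffer, plus std::rotate bookkeeping for null runs. A minimal standalone sketch of the non-null part (illustrative only):

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch: two adjacent sorted runs of indices, [begin, middle) and
// [middle, end), are merged through a temporary buffer and copied back.
// Null handling (std::rotate of null runs) is omitted.
void MergeAdjacentRuns(std::vector<uint64_t>& indices, size_t begin,
                       size_t middle, size_t end,
                       const std::vector<int>& values,
                       std::vector<uint64_t>& temp) {
  temp.resize(end - begin);
  std::merge(indices.begin() + begin, indices.begin() + middle,
             indices.begin() + middle, indices.begin() + end, temp.begin(),
             [&](uint64_t left, uint64_t right) {
               return values[left] < values[right];
             });
  std::copy(temp.begin(), temp.end(), indices.begin() + begin);
}
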
+
+// ----------------------------------------------------------------------
+// Record batch sorting implementation(s)
+
+// Visit contiguous ranges of equal values. All entries are assumed
+// to be non-null.
+template <typename ArrayType, typename Visitor>
+void VisitConstantRanges(const ArrayType& array, uint64_t* indices_begin,
+ uint64_t* indices_end, Visitor&& visit) {
+ using GetView = GetViewType<typename ArrayType::TypeClass>;
+
+ if (indices_begin == indices_end) {
+ return;
+ }
+ auto range_start = indices_begin;
+ auto range_cur = range_start;
+ auto last_value = GetView::LogicalValue(array.GetView(*range_cur));
+ while (++range_cur != indices_end) {
+ auto v = GetView::LogicalValue(array.GetView(*range_cur));
+ if (v != last_value) {
+ visit(range_start, range_cur);
+ range_start = range_cur;
+ last_value = v;
+ }
+ }
+ if (range_start != range_cur) {
+ visit(range_start, range_cur);
+ }
+}
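
A minimal standalone sketch of the same run-visiting pattern over already-grouped plain data, reporting each [start, end) range of equal values (illustrative only):

#include <cstddef>
#include <vector>

template <typename Visitor>
void VisitRuns(const std::vector<int>& sorted_values, Visitor&& visit) {
  size_t start = 0;
  for (size_t cur = 1; cur <= sorted_values.size(); ++cur) {
    if (cur == sorted_values.size() || sorted_values[cur] != sorted_values[start]) {
      visit(start, cur);  // [start, cur) is one run of equal values
      start = cur;
    }
  }
}

// Usage: VisitRuns({1, 1, 2, 3, 3, 3}, [](size_t b, size_t e) {
//   /* b and e delimit one run: [0,2), [2,3), [3,6) */
// });
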
+
+// A sorter for a single column of a RecordBatch, deferring to the next column
+// for ranges of equal values.
+class RecordBatchColumnSorter {
+ public:
+ explicit RecordBatchColumnSorter(RecordBatchColumnSorter* next_column = nullptr)
+ : next_column_(next_column) {}
+ virtual ~RecordBatchColumnSorter() {}
+
+ virtual void SortRange(uint64_t* indices_begin, uint64_t* indices_end) = 0;
+
+ protected:
+ RecordBatchColumnSorter* next_column_;
+};
+
+template <typename Type>
+class ConcreteRecordBatchColumnSorter : public RecordBatchColumnSorter {
+ public:
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ ConcreteRecordBatchColumnSorter(std::shared_ptr<Array> array, SortOrder order,
+ RecordBatchColumnSorter* next_column = nullptr)
+ : RecordBatchColumnSorter(next_column),
+ owned_array_(std::move(array)),
+ array_(checked_cast<const ArrayType&>(*owned_array_)),
+ order_(order),
+ null_count_(array_.null_count()) {}
+
+ void SortRange(uint64_t* indices_begin, uint64_t* indices_end) {
+ using GetView = GetViewType<Type>;
+
+ constexpr int64_t offset = 0;
+ uint64_t* nulls_begin;
+ if (null_count_ == 0) {
+ nulls_begin = indices_end;
+ } else {
+ // NOTE that null_count_ is merely an upper bound on the number of nulls
+ // in this particular range.
+ nulls_begin = PartitionNullsOnly<StablePartitioner>(indices_begin, indices_end,
+ array_, offset);
+ DCHECK_LE(indices_end - nulls_begin, null_count_);
+ }
+ uint64_t* null_likes_begin = PartitionNullLikes<ArrayType, StablePartitioner>(
+ indices_begin, nulls_begin, array_, offset);
+
+ // TODO This is roughly the same as ArrayCompareSorter.
+ // Also, we would like to use a counting sort if possible. This requires
+ // a counting sort compatible with indirect indexing.
+ if (order_ == SortOrder::Ascending) {
+ std::stable_sort(
+ indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
+ return lhs < rhs;
+ });
+ } else {
+ std::stable_sort(
+ indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
+            // We write 'rhs < lhs' instead of 'lhs > rhs' so that only
+            // 'operator<' is required of the value type.
+            const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
+            const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
+            return rhs < lhs;
+ });
+ }
+
+ if (next_column_ != nullptr) {
+ // Visit all ranges of equal values in this column and sort them on
+ // the next column.
+ SortNextColumn(null_likes_begin, nulls_begin);
+ SortNextColumn(nulls_begin, indices_end);
+ VisitConstantRanges(array_, indices_begin, null_likes_begin,
+ [&](uint64_t* range_start, uint64_t* range_end) {
+ SortNextColumn(range_start, range_end);
+ });
+ }
+ }
+
+ void SortNextColumn(uint64_t* indices_begin, uint64_t* indices_end) {
+ // Avoid the cost of a virtual method call in trivial cases
+ if (indices_end - indices_begin > 1) {
+ next_column_->SortRange(indices_begin, indices_end);
+ }
+ }
+
+ protected:
+ const std::shared_ptr<Array> owned_array_;
+ const ArrayType& array_;
+ const SortOrder order_;
+ const int64_t null_count_;
+};
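
The column sorter above implements MSD-style multi-key sorting: stable-sort on one key, then re-sort each run of equal values on the next key. A minimal standalone two-key sketch over plain vectors (illustrative only):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch: sort indices by key0, then re-sort each run of equal key0
// values by key1. Stability keeps the key0 ordering intact.
std::vector<uint64_t> SortByTwoKeys(const std::vector<int>& key0,
                                    const std::vector<int>& key1) {
  std::vector<uint64_t> indices(key0.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::stable_sort(indices.begin(), indices.end(),
                   [&](uint64_t l, uint64_t r) { return key0[l] < key0[r]; });
  size_t start = 0;
  for (size_t cur = 1; cur <= indices.size(); ++cur) {
    if (cur == indices.size() || key0[indices[cur]] != key0[indices[start]]) {
      // Equal-value range on key0: defer to the next column.
      std::stable_sort(indices.begin() + start, indices.begin() + cur,
                       [&](uint64_t l, uint64_t r) { return key1[l] < key1[r]; });
      start = cur;
    }
  }
  return indices;
}
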
+
+// Sort a batch using a single-pass left-to-right radix sort.
+class RadixRecordBatchSorter {
+ public:
+ RadixRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const RecordBatch& batch, const SortOptions& options)
+ : batch_(batch),
+ options_(options),
+ indices_begin_(indices_begin),
+ indices_end_(indices_end) {}
+
+ Status Sort() {
+ ARROW_ASSIGN_OR_RAISE(const auto sort_keys,
+ ResolveSortKeys(batch_, options_.sort_keys));
+
+ // Create column sorters from right to left
+ std::vector<std::unique_ptr<RecordBatchColumnSorter>> column_sorts(sort_keys.size());
+ RecordBatchColumnSorter* next_column = nullptr;
+ for (int64_t i = static_cast<int64_t>(sort_keys.size() - 1); i >= 0; --i) {
+ ColumnSortFactory factory(sort_keys[i], next_column);
+ ARROW_ASSIGN_OR_RAISE(column_sorts[i], factory.MakeColumnSort());
+ next_column = column_sorts[i].get();
+ }
+
+ // Sort from left to right
+ column_sorts.front()->SortRange(indices_begin_, indices_end_);
+ return Status::OK();
+ }
+
+ protected:
+ struct ResolvedSortKey {
+ std::shared_ptr<Array> array;
+ SortOrder order;
+ };
+
+ struct ColumnSortFactory {
+ ColumnSortFactory(const ResolvedSortKey& sort_key,
+ RecordBatchColumnSorter* next_column)
+ : physical_type(GetPhysicalType(sort_key.array->type())),
+ array(GetPhysicalArray(*sort_key.array, physical_type)),
+ order(sort_key.order),
+ next_column(next_column) {}
+
+ Result<std::unique_ptr<RecordBatchColumnSorter>> MakeColumnSort() {
+ RETURN_NOT_OK(VisitTypeInline(*physical_type, this));
+ DCHECK_NE(result, nullptr);
+ return std::move(result);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) { return VisitGeneric(type); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("Unsupported type for RecordBatch sorting: ",
+ type.ToString());
+ }
+
+ template <typename Type>
+ Status VisitGeneric(const Type&) {
+ result.reset(new ConcreteRecordBatchColumnSorter<Type>(array, order, next_column));
+ return Status::OK();
+ }
+
+ std::shared_ptr<DataType> physical_type;
+ std::shared_ptr<Array> array;
+ SortOrder order;
+ RecordBatchColumnSorter* next_column;
+ std::unique_ptr<RecordBatchColumnSorter> result;
+ };
+
+ static Result<std::vector<ResolvedSortKey>> ResolveSortKeys(
+ const RecordBatch& batch, const std::vector<SortKey>& sort_keys) {
+ std::vector<ResolvedSortKey> resolved;
+ resolved.reserve(sort_keys.size());
+ for (const auto& sort_key : sort_keys) {
+ auto array = batch.GetColumnByName(sort_key.name);
+ if (!array) {
+ return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ }
+ resolved.push_back({std::move(array), sort_key.order});
+ }
+ return resolved;
+ }
+
+ const RecordBatch& batch_;
+ const SortOptions& options_;
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+};
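
The right-to-left construction above is a small but load-bearing detail: each sorter only needs a non-owning pointer to its successor, so the chain can be built back to front while a vector keeps ownership. A minimal structural sketch (illustrative only):

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

struct ColumnSorter {
  explicit ColumnSorter(ColumnSorter* next = nullptr) : next(next) {}
  ColumnSorter* next;  // non-owning pointer to the next column's sorter
  void SortRange(uint64_t* begin, uint64_t* end) {
    // ... sort [begin, end) on this column, recursing into `next`
    // for ranges of equal values, as in the classes above ...
  }
};

std::vector<std::unique_ptr<ColumnSorter>> BuildChain(size_t num_keys) {
  std::vector<std::unique_ptr<ColumnSorter>> sorters(num_keys);
  ColumnSorter* next = nullptr;
  for (size_t i = num_keys; i > 0; --i) {
    sorters[i - 1] = std::make_unique<ColumnSorter>(next);
    next = sorters[i - 1].get();
  }
  return sorters;  // sorters.front() drives the whole sort
}
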
+
+// Compare two records in the same RecordBatch or Table
+// (indexing is handled through ResolvedSortKey)
+template <typename ResolvedSortKey>
+class MultipleKeyComparator {
+ public:
+ explicit MultipleKeyComparator(const std::vector<ResolvedSortKey>& sort_keys)
+ : sort_keys_(sort_keys) {}
+
+ Status status() const { return status_; }
+
+ // Returns true if the left-th value should be ordered before the
+ // right-th value, false otherwise. The start_sort_key_index-th
+ // sort key and subsequent sort keys are used for comparison.
+ bool Compare(uint64_t left, uint64_t right, size_t start_sort_key_index) {
+ current_left_ = left;
+ current_right_ = right;
+ current_compared_ = 0;
+ auto num_sort_keys = sort_keys_.size();
+ for (size_t i = start_sort_key_index; i < num_sort_keys; ++i) {
+ current_sort_key_index_ = i;
+ status_ = VisitTypeInline(*sort_keys_[i].type, this);
+ // If the left value equals the right value, we need to
+ // continue comparing with the next sort key.
+ if (current_compared_ != 0) {
+ break;
+ }
+ }
+ return current_compared_ < 0;
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) { \
+ current_compared_ = CompareType<TYPE>(); \
+ return Status::OK(); \
+ }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("Unsupported type for RecordBatch sorting: ",
+ type.ToString());
+ }
+
+ private:
+ // Compares two records in the same table and returns -1, 0 or 1.
+ //
+ // -1: The left is less than the right.
+ // 0: The left equals the right.
+ // 1: The left is greater than the right.
+ //
+ // This supports null and NaN. Nulls are handled here and NaNs
+ // are handled in CompareTypeValue().
+ template <typename Type>
+ int32_t CompareType() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ const auto& sort_key = sort_keys_[current_sort_key_index_];
+ auto order = sort_key.order;
+ const auto chunk_left = sort_key.template GetChunk<ArrayType>(current_left_);
+ const auto chunk_right = sort_key.template GetChunk<ArrayType>(current_right_);
+ if (sort_key.null_count > 0) {
+ auto is_null_left = chunk_left.IsNull();
+ auto is_null_right = chunk_right.IsNull();
+ if (is_null_left && is_null_right) {
+ return 0;
+ } else if (is_null_left) {
+ return 1;
+ } else if (is_null_right) {
+ return -1;
+ }
+ }
+ return CompareTypeValue<Type>(chunk_left, chunk_right, order);
+ }
+
+ // For non-float types. Value is never NaN.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, int32_t> CompareTypeValue(
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
+ const SortOrder order) {
+ const auto left = chunk_left.Value();
+ const auto right = chunk_right.Value();
+ int32_t compared;
+ if (left == right) {
+ compared = 0;
+ } else if (left > right) {
+ compared = 1;
+ } else {
+ compared = -1;
+ }
+ if (order == SortOrder::Descending) {
+ compared = -compared;
+ }
+ return compared;
+ }
+
+ // For float types. Value may be NaN.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, int32_t> CompareTypeValue(
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
+ const SortOrder order) {
+ const auto left = chunk_left.Value();
+ const auto right = chunk_right.Value();
+ auto is_nan_left = std::isnan(left);
+ auto is_nan_right = std::isnan(right);
+ if (is_nan_left && is_nan_right) {
+ return 0;
+ } else if (is_nan_left) {
+ return 1;
+ } else if (is_nan_right) {
+ return -1;
+ }
+ int32_t compared;
+ if (left == right) {
+ compared = 0;
+ } else if (left > right) {
+ compared = 1;
+ } else {
+ compared = -1;
+ }
+ if (order == SortOrder::Descending) {
+ compared = -compared;
+ }
+ return compared;
+ }
+
+ const std::vector<ResolvedSortKey>& sort_keys_;
+ Status status_;
+ int64_t current_left_;
+ int64_t current_right_;
+ size_t current_sort_key_index_;
+ int32_t current_compared_;
+};
+
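+ // Illustrative sketch (unused) of the total order implemented by
+ // CompareType()/CompareTypeValue() above for one nullable floating-point
+ // key: nulls sort after everything, NaNs after all other non-null values,
+ // and a descending order only flips the comparison of ordinary values.
+ // The bool/double parameters stand in for array slots.
+ inline int ExampleCompareNullableDouble(bool left_valid, double left,
+ bool right_valid, double right, bool descending) {
+ if (!left_valid || !right_valid) {
+ if (left_valid == right_valid) return 0; // both null
+ return left_valid ? -1 : 1; // nulls order last
+ }
+ const bool left_nan = std::isnan(left);
+ const bool right_nan = std::isnan(right);
+ if (left_nan || right_nan) {
+ if (left_nan == right_nan) return 0; // both NaN
+ return left_nan ? 1 : -1; // NaNs order after other non-nulls
+ }
+ const int compared = left == right ? 0 : (left < right ? -1 : 1);
+ return descending ? -compared : compared;
+ }
+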
+// Sort a batch using a single sort and multiple-key comparisons.
+class MultipleKeyRecordBatchSorter : public TypeVisitor {
+ private:
+ // Preprocessed sort key.
+ struct ResolvedSortKey {
+ ResolvedSortKey(const std::shared_ptr<Array>& array, const SortOrder order)
+ : type(GetPhysicalType(array->type())),
+ owned_array(GetPhysicalArray(*array, type)),
+ array(*owned_array),
+ order(order),
+ null_count(array->null_count()) {}
+
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
+ return {&checked_cast<const ArrayType&>(array), index};
+ }
+
+ const std::shared_ptr<DataType> type;
+ std::shared_ptr<Array> owned_array;
+ const Array& array;
+ SortOrder order;
+ int64_t null_count;
+ };
+
+ using Comparator = MultipleKeyComparator<ResolvedSortKey>;
+
+ public:
+ MultipleKeyRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const RecordBatch& batch, const SortOptions& options)
+ : indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ sort_keys_(ResolveSortKeys(batch, options.sort_keys, &status_)),
+ comparator_(sort_keys_) {}
+
+ // This is optimized for the first sort key: the sort on the first
+ // sort key is handled by this class, while the second and following
+ // sort keys are handled by Comparator.
+ Status Sort() {
+ RETURN_NOT_OK(status_);
+ return sort_keys_[0].type->Accept(this);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ static std::vector<ResolvedSortKey> ResolveSortKeys(
+ const RecordBatch& batch, const std::vector<SortKey>& sort_keys, Status* status) {
+ std::vector<ResolvedSortKey> resolved;
+ for (const auto& sort_key : sort_keys) {
+ auto array = batch.GetColumnByName(sort_key.name);
+ if (!array) {
+ *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ break;
+ }
+ resolved.emplace_back(array, sort_key.order);
+ }
+ return resolved;
+ }
+
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ auto& comparator = comparator_;
+ const auto& first_sort_key = sort_keys_[0];
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ auto nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
+ // Sort first-key non-nulls
+ std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
+ // Neither value is null nor NaN
+ // (otherwise they would have been partitioned away above).
+ const auto value_left = array.GetView(left);
+ const auto value_right = array.GetView(right);
+ if (value_left != value_right) {
+ bool compared = value_left < value_right;
+ if (first_sort_key.order == SortOrder::Ascending) {
+ return compared;
+ } else {
+ return !compared;
+ }
+ }
+ // If the left value equals the right value,
+ // we need to compare the second and following
+ // sort keys.
+ return comparator.Compare(left, right, 1);
+ });
+ return comparator_.status();
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For non-float types.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ if (first_sort_key.null_count == 0) {
+ return indices_end_;
+ }
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ StablePartitioner partitioner;
+ auto nulls_begin = partitioner(indices_begin_, indices_end_,
+ [&](uint64_t index) { return !array.IsNull(index); });
+ // Sort all nulls by second and following sort keys
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ if (nulls_begin != indices_end_) {
+ auto& comparator = comparator_;
+ std::stable_sort(nulls_begin, indices_end_,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ return nulls_begin;
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For float types.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ StablePartitioner partitioner;
+ uint64_t* nulls_begin;
+ if (first_sort_key.null_count == 0) {
+ nulls_begin = indices_end_;
+ } else {
+ nulls_begin = partitioner(indices_begin_, indices_end_,
+ [&](uint64_t index) { return !array.IsNull(index); });
+ }
+ uint64_t* nans_and_nulls_begin =
+ partitioner(indices_begin_, nulls_begin,
+ [&](uint64_t index) { return !std::isnan(array.GetView(index)); });
+ auto& comparator = comparator_;
+ if (nans_and_nulls_begin != nulls_begin) {
+ // Sort all NaNs by the second and following sort keys.
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ std::stable_sort(nans_and_nulls_begin, nulls_begin,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ if (nulls_begin != indices_end_) {
+ // Sort all nulls by the second and following sort keys.
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ std::stable_sort(nulls_begin, indices_end_,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ return nans_and_nulls_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ Status status_;
+ std::vector<ResolvedSortKey> sort_keys_;
+ Comparator comparator_;
+};
+
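+ // Sketch of the partitioning step used by PartitionNullsInternal() above,
+ // with std::stable_partition standing in for StablePartitioner and a plain
+ // bool vector standing in for the validity bitmap (illustrative only):
+ //
+ //   std::vector<uint64_t> indices(n);
+ //   std::iota(indices.begin(), indices.end(), 0);
+ //   auto nulls_begin = std::stable_partition(
+ //       indices.begin(), indices.end(),
+ //       [&](uint64_t i) { return valid[i]; });
+ //   // [indices.begin(), nulls_begin) holds the non-null rows in their
+ //   // original relative order; the nulls that follow are then ordered
+ //   // among themselves by the remaining sort keys.
+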
+// ----------------------------------------------------------------------
+// Table sorting implementations
+
+// Sort a table using a radix sort-like algorithm.
+// A distinct stable sort is called for each sort key, from the last key to the first.
+class TableRadixSorter {
+ public:
+ Status Sort(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+ const Table& table, const SortOptions& options) {
+ for (auto i = options.sort_keys.size(); i > 0; --i) {
+ const auto& sort_key = options.sort_keys[i - 1];
+ const auto& chunked_array = table.GetColumnByName(sort_key.name);
+ if (!chunked_array) {
+ return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ }
+ // We can use ArraySorter only for the sort key that is
+ // processed first (the last sort key, handled by the first
+ // iteration of this loop) because ArraySorter doesn't care
+ // about existing indices.
+ const auto can_use_array_sorter = (i == options.sort_keys.size());
+ ChunkedArraySorter sorter(ctx, indices_begin, indices_end, *chunked_array.get(),
+ sort_key.order, can_use_array_sorter);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ }
+ return Status::OK();
+ }
+};
+
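+ // Why sorting from the last key to the first works: each later stable
+ // sort preserves the relative order produced by earlier passes, so rows
+ // that compare equal on key 1 stay ordered by key 2, and so on. A
+ // standalone sketch over two plain key vectors (illustrative only):
+ //
+ //   std::stable_sort(idx.begin(), idx.end(),
+ //                    [&](uint64_t l, uint64_t r) { return key2[l] < key2[r]; });
+ //   std::stable_sort(idx.begin(), idx.end(),
+ //                    [&](uint64_t l, uint64_t r) { return key1[l] < key1[r]; });
+ //   // idx is now ordered by (key1, key2).
+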
+// Sort a table using a single sort and multiple-key comparisons.
+class MultipleKeyTableSorter : public TypeVisitor {
+ private:
+ // TODO instead of resolving chunks for each column independently, we could
+ // split the table into RecordBatches and pay the cost of chunked indexing
+ // at the first column only.
+
+ // Preprocessed sort key.
+ struct ResolvedSortKey {
+ ResolvedSortKey(const ChunkedArray& chunked_array, const SortOrder order)
+ : order(order),
+ type(GetPhysicalType(chunked_array.type())),
+ chunks(GetPhysicalChunks(chunked_array, type)),
+ chunk_pointers(GetArrayPointers(chunks)),
+ null_count(chunked_array.null_count()),
+ num_chunks(chunked_array.num_chunks()),
+ resolver(chunk_pointers) {}
+
+ // Finds the target chunk and the index within that chunk from an
+ // index into the chunked array.
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
+ return resolver.Resolve<ArrayType>(index);
+ }
+
+ const SortOrder order;
+ const std::shared_ptr<DataType> type;
+ const ArrayVector chunks;
+ const std::vector<const Array*> chunk_pointers;
+ const int64_t null_count;
+ const int num_chunks;
+ const ChunkedArrayResolver resolver;
+ };
+
+ using Comparator = MultipleKeyComparator<ResolvedSortKey>;
+
+ public:
+ MultipleKeyTableSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const Table& table, const SortOptions& options)
+ : indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ sort_keys_(ResolveSortKeys(table, options.sort_keys, &status_)),
+ comparator_(sort_keys_) {}
+
+ // This is optimized for the first sort key: the sort on the first
+ // sort key is handled by this class, while the second and following
+ // sort keys are handled by Comparator.
+ Status Sort() {
+ ARROW_RETURN_NOT_OK(status_);
+ return sort_keys_[0].type->Accept(this);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ static std::vector<ResolvedSortKey> ResolveSortKeys(
+ const Table& table, const std::vector<SortKey>& sort_keys, Status* status) {
+ std::vector<ResolvedSortKey> resolved;
+ resolved.reserve(sort_keys.size());
+ for (const auto& sort_key : sort_keys) {
+ const auto& chunked_array = table.GetColumnByName(sort_key.name);
+ if (!chunked_array) {
+ *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ break;
+ }
+ resolved.emplace_back(*chunked_array, sort_key.order);
+ }
+ return resolved;
+ }
+
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ auto& comparator = comparator_;
+ const auto& first_sort_key = sort_keys_[0];
+ auto nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
+ std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
+ // Neither value is null nor NaN here.
+ auto chunk_left = first_sort_key.GetChunk<ArrayType>(left);
+ auto chunk_right = first_sort_key.GetChunk<ArrayType>(right);
+ auto value_left = chunk_left.Value();
+ auto value_right = chunk_right.Value();
+ if (value_left == value_right) {
+ // If the left value equals the right value,
+ // we need to compare the second and following
+ // sort keys.
+ return comparator.Compare(left, right, 1);
+ } else {
+ auto compared = value_left < value_right;
+ if (first_sort_key.order == SortOrder::Ascending) {
+ return compared;
+ } else {
+ return !compared;
+ }
+ }
+ });
+ return comparator_.status();
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For non-float types.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ if (first_sort_key.null_count == 0) {
+ return indices_end_;
+ }
+ StablePartitioner partitioner;
+ auto nulls_begin =
+ partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !chunk.IsNull();
+ });
+ DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
+ auto& comparator = comparator_;
+ std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ return nulls_begin;
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For float types.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ StablePartitioner partitioner;
+ uint64_t* nulls_begin;
+ if (first_sort_key.null_count == 0) {
+ nulls_begin = indices_end_;
+ } else {
+ nulls_begin = partitioner(indices_begin_, indices_end_, [&](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !chunk.IsNull();
+ });
+ }
+ DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
+ uint64_t* nans_begin = partitioner(indices_begin_, nulls_begin, [&](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !std::isnan(chunk.Value());
+ });
+ auto& comparator = comparator_;
+ // Sort all NaNs by the second and following sort keys.
+ std::stable_sort(nans_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ // Sort all nulls by the second and following sort keys.
+ std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ return nans_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ Status status_;
+ std::vector<ResolvedSortKey> sort_keys_;
+ Comparator comparator_;
+};
+
+// ----------------------------------------------------------------------
+// Top-level sort functions
+
+const auto kDefaultSortOptions = SortOptions::Defaults();
+
+const FunctionDoc sort_indices_doc(
+ "Return the indices that would sort an array, record batch or table",
+ ("This function computes an array of indices that define a stable sort\n"
+ "of the input array, record batch or table. Null values are considered\n"
+ "greater than any other value and are therefore sorted at the end of the\n"
+ "input. For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values."),
+ {"input"}, "SortOptions");
+
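+ // A hedged usage sketch of the meta function below (the column name "a"
+ // is illustrative):
+ //
+ //   SortOptions options({SortKey("a", SortOrder::Descending)});
+ //   ARROW_ASSIGN_OR_RAISE(Datum indices,
+ //                         CallFunction("sort_indices", {table}, &options));
+ //   // `indices` is a uint64 array of row positions; materialize the
+ //   // sorted rows with the "take" function.
+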
+class SortIndicesMetaFunction : public MetaFunction {
+ public:
+ SortIndicesMetaFunction()
+ : MetaFunction("sort_indices", Arity::Unary(), &sort_indices_doc,
+ &kDefaultSortOptions) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ const SortOptions& sort_options = static_cast<const SortOptions&>(*options);
+ switch (args[0].kind()) {
+ case Datum::ARRAY:
+ return SortIndices(*args[0].make_array(), sort_options, ctx);
+ case Datum::CHUNKED_ARRAY:
+ return SortIndices(*args[0].chunked_array(), sort_options, ctx);
+ case Datum::RECORD_BATCH:
+ return SortIndices(*args[0].record_batch(), sort_options, ctx);
+ case Datum::TABLE:
+ return SortIndices(*args[0].table(), sort_options, ctx);
+ default:
+ break;
+ }
+ return Status::NotImplemented(
+ "Unsupported types for sort_indices operation: "
+ "values=",
+ args[0].ToString());
+ }
+
+ private:
+ Result<Datum> SortIndices(const Array& values, const SortOptions& options,
+ ExecContext* ctx) const {
+ SortOrder order = SortOrder::Ascending;
+ if (!options.sort_keys.empty()) {
+ order = options.sort_keys[0].order;
+ }
+ ArraySortOptions array_options(order);
+ return CallFunction("array_sort_indices", {values}, &array_options, ctx);
+ }
+
+ Result<Datum> SortIndices(const ChunkedArray& chunked_array, const SortOptions& options,
+ ExecContext* ctx) const {
+ SortOrder order = SortOrder::Ascending;
+ if (!options.sort_keys.empty()) {
+ order = options.sort_keys[0].order;
+ }
+
+ auto out_type = uint64();
+ auto length = chunked_array.length();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ std::vector<std::shared_ptr<Buffer>> buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ ChunkedArraySorter sorter(ctx, out_begin, out_end, chunked_array, order);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ return Datum(out);
+ }
+
+ Result<Datum> SortIndices(const RecordBatch& batch, const SortOptions& options,
+ ExecContext* ctx) const {
+ auto n_sort_keys = options.sort_keys.size();
+ if (n_sort_keys == 0) {
+ return Status::Invalid("Must specify one or more sort keys");
+ }
+ if (n_sort_keys == 1) {
+ auto array = batch.GetColumnByName(options.sort_keys[0].name);
+ if (!array) {
+ return Status::Invalid("Nonexistent sort key column: ",
+ options.sort_keys[0].name);
+ }
+ return SortIndices(*array, options, ctx);
+ }
+
+ auto out_type = uint64();
+ auto length = batch.num_rows();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ BufferVector buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ // Radix sorting is consistently faster except when there is a large number
+ // of sort keys, in which case it can end up degrading catastrophically.
+ // Cut off above 8 sort keys.
+ if (n_sort_keys <= 8) {
+ RadixRecordBatchSorter sorter(out_begin, out_end, batch, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ } else {
+ MultipleKeyRecordBatchSorter sorter(out_begin, out_end, batch, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ }
+ return Datum(out);
+ }
+
+ Result<Datum> SortIndices(const Table& table, const SortOptions& options,
+ ExecContext* ctx) const {
+ auto n_sort_keys = options.sort_keys.size();
+ if (n_sort_keys == 0) {
+ return Status::Invalid("Must specify one or more sort keys");
+ }
+ if (n_sort_keys == 1) {
+ auto chunked_array = table.GetColumnByName(options.sort_keys[0].name);
+ if (!chunked_array) {
+ return Status::Invalid("Nonexistent sort key column: ",
+ options.sort_keys[0].name);
+ }
+ return SortIndices(*chunked_array, options, ctx);
+ }
+
+ auto out_type = uint64();
+ auto length = table.num_rows();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ std::vector<std::shared_ptr<Buffer>> buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ // TODO: We should choose a suitable sort implementation
+ // automatically. The current TableRadixSorter implementation is
+ // faster than MultipleKeyTableSorter only when the number of
+ // sort keys is 2 and counting sort is used. So we always use
+ // MultipleKeyTableSorter for now.
+ //
+ // TableRadixSorter sorter;
+ // ARROW_RETURN_NOT_OK(sorter.Sort(ctx, out_begin, out_end, table, options));
+ MultipleKeyTableSorter sorter(out_begin, out_end, table, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ return Datum(out);
+ }
+};
+
+const auto kDefaultArraySortOptions = ArraySortOptions::Defaults();
+
+const FunctionDoc array_sort_indices_doc(
+ "Return the indices that would sort an array",
+ ("This function computes an array of indices that define a stable sort\n"
+ "of the input array. Null values are considered greater than any\n"
+ "other value and are therefore sorted at the end of the array.\n"
+ "For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values."),
+ {"array"}, "ArraySortOptions");
+
+const FunctionDoc partition_nth_indices_doc(
+ "Return the indices that would partition an array around a pivot",
+ ("This functions computes an array of indices that define a non-stable\n"
+ "partial sort of the input array.\n"
+ "\n"
+ "The output is such that the `N`'th index points to the `N`'th element\n"
+ "of the input in sorted order, and all indices before the `N`'th point\n"
+ "to elements in the input less or equal to elements at or after the `N`'th.\n"
+ "\n"
+ "Null values are considered greater than any other value and are\n"
+ "therefore partitioned towards the end of the array.\n"
+ "For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values.\n"
+ "\n"
+ "The pivot index `N` must be given in PartitionNthOptions."),
+ {"array"}, "PartitionNthOptions");
+
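+ // The contract above mirrors std::nth_element applied to indices; a
+ // standalone sketch (illustrative only, `values` being any comparable,
+ // null-free column):
+ //
+ //   std::vector<uint64_t> idx(values.size());
+ //   std::iota(idx.begin(), idx.end(), 0);
+ //   std::nth_element(idx.begin(), idx.begin() + N, idx.end(),
+ //                    [&](uint64_t l, uint64_t r) { return values[l] < values[r]; });
+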
+} // namespace
+
void RegisterVectorSort(FunctionRegistry* registry) {
// The kernel outputs into preallocated memory and is never null
VectorKernel base;
base.mem_allocation = MemAllocation::PREALLOCATE;
base.null_handling = NullHandling::OUTPUT_NOT_NULL;
- auto array_sort_indices = std::make_shared<VectorFunction>(
- "array_sort_indices", Arity::Unary(), &array_sort_indices_doc,
- &kDefaultArraySortOptions);
- base.init = ArraySortIndicesState::Init;
- AddSortingKernels<ArraySortIndices>(base, array_sort_indices.get());
- DCHECK_OK(registry->AddFunction(std::move(array_sort_indices)));
-
- DCHECK_OK(registry->AddFunction(std::make_shared<SortIndicesMetaFunction>()));
+ auto array_sort_indices = std::make_shared<VectorFunction>(
+ "array_sort_indices", Arity::Unary(), &array_sort_indices_doc,
+ &kDefaultArraySortOptions);
+ base.init = ArraySortIndicesState::Init;
+ AddSortingKernels<ArraySortIndices>(base, array_sort_indices.get());
+ DCHECK_OK(registry->AddFunction(std::move(array_sort_indices)));
+ DCHECK_OK(registry->AddFunction(std::make_shared<SortIndicesMetaFunction>()));
+
// partition_nth_indices has a parameter so needs its init function
- auto part_indices = std::make_shared<VectorFunction>(
- "partition_nth_indices", Arity::Unary(), &partition_nth_indices_doc);
+ auto part_indices = std::make_shared<VectorFunction>(
+ "partition_nth_indices", Arity::Unary(), &partition_nth_indices_doc);
base.init = PartitionNthToIndicesState::Init;
AddSortingKernels<PartitionNthToIndices>(base, part_indices.get());
DCHECK_OK(registry->AddFunction(std::move(part_indices)));
}
-#undef VISIT_PHYSICAL_TYPES
-
+#undef VISIT_PHYSICAL_TYPES
+
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
index ca7b6137306..9f24f7a7008 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
@@ -24,10 +24,10 @@
#include <utility>
#include "arrow/compute/function.h"
-#include "arrow/compute/function_internal.h"
+#include "arrow/compute/function_internal.h"
#include "arrow/compute/registry_internal.h"
#include "arrow/status.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/logging.h"
namespace arrow {
namespace compute {
@@ -35,8 +35,8 @@ namespace compute {
class FunctionRegistry::FunctionRegistryImpl {
public:
Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite) {
- RETURN_NOT_OK(function->Validate());
-
+ RETURN_NOT_OK(function->Validate());
+
std::lock_guard<std::mutex> mutation_guard(lock_);
const std::string& name = function->name();
@@ -59,20 +59,20 @@ class FunctionRegistry::FunctionRegistryImpl {
return Status::OK();
}
- Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
- bool allow_overwrite = false) {
- std::lock_guard<std::mutex> mutation_guard(lock_);
-
- const std::string name = options_type->type_name();
- auto it = name_to_options_type_.find(name);
- if (it != name_to_options_type_.end() && !allow_overwrite) {
- return Status::KeyError(
- "Already have a function options type registered with name: ", name);
- }
- name_to_options_type_[name] = options_type;
- return Status::OK();
- }
-
+ Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite = false) {
+ std::lock_guard<std::mutex> mutation_guard(lock_);
+
+ const std::string name = options_type->type_name();
+ auto it = name_to_options_type_.find(name);
+ if (it != name_to_options_type_.end() && !allow_overwrite) {
+ return Status::KeyError(
+ "Already have a function options type registered with name: ", name);
+ }
+ name_to_options_type_[name] = options_type;
+ return Status::OK();
+ }
+
Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const {
auto it = name_to_function_.find(name);
if (it == name_to_function_.end()) {
@@ -90,21 +90,21 @@ class FunctionRegistry::FunctionRegistryImpl {
return results;
}
- Result<const FunctionOptionsType*> GetFunctionOptionsType(
- const std::string& name) const {
- auto it = name_to_options_type_.find(name);
- if (it == name_to_options_type_.end()) {
- return Status::KeyError("No function options type registered with name: ", name);
- }
- return it->second;
- }
-
+ Result<const FunctionOptionsType*> GetFunctionOptionsType(
+ const std::string& name) const {
+ auto it = name_to_options_type_.find(name);
+ if (it == name_to_options_type_.end()) {
+ return Status::KeyError("No function options type registered with name: ", name);
+ }
+ return it->second;
+ }
+
int num_functions() const { return static_cast<int>(name_to_function_.size()); }
private:
std::mutex lock_;
std::unordered_map<std::string, std::shared_ptr<Function>> name_to_function_;
- std::unordered_map<std::string, const FunctionOptionsType*> name_to_options_type_;
+ std::unordered_map<std::string, const FunctionOptionsType*> name_to_options_type_;
};
std::unique_ptr<FunctionRegistry> FunctionRegistry::Make() {
@@ -125,11 +125,11 @@ Status FunctionRegistry::AddAlias(const std::string& target_name,
return impl_->AddAlias(target_name, source_name);
}
-Status FunctionRegistry::AddFunctionOptionsType(const FunctionOptionsType* options_type,
- bool allow_overwrite) {
- return impl_->AddFunctionOptionsType(options_type, allow_overwrite);
-}
-
+Status FunctionRegistry::AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite) {
+ return impl_->AddFunctionOptionsType(options_type, allow_overwrite);
+}
+
Result<std::shared_ptr<Function>> FunctionRegistry::GetFunction(
const std::string& name) const {
return impl_->GetFunction(name);
@@ -139,11 +139,11 @@ std::vector<std::string> FunctionRegistry::GetFunctionNames() const {
return impl_->GetFunctionNames();
}
-Result<const FunctionOptionsType*> FunctionRegistry::GetFunctionOptionsType(
- const std::string& name) const {
- return impl_->GetFunctionOptionsType(name);
-}
-
+Result<const FunctionOptionsType*> FunctionRegistry::GetFunctionOptionsType(
+ const std::string& name) const {
+ return impl_->GetFunctionOptionsType(name);
+}
+
int FunctionRegistry::num_functions() const { return impl_->num_functions(); }
namespace internal {
@@ -161,30 +161,30 @@ static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
RegisterScalarStringAscii(registry.get());
RegisterScalarValidity(registry.get());
RegisterScalarFillNull(registry.get());
- RegisterScalarIfElse(registry.get());
- RegisterScalarTemporal(registry.get());
+ RegisterScalarIfElse(registry.get());
+ RegisterScalarTemporal(registry.get());
- RegisterScalarOptions(registry.get());
+ RegisterScalarOptions(registry.get());
// Vector functions
RegisterVectorHash(registry.get());
- RegisterVectorReplace(registry.get());
+ RegisterVectorReplace(registry.get());
RegisterVectorSelection(registry.get());
RegisterVectorNested(registry.get());
RegisterVectorSort(registry.get());
- RegisterVectorOptions(registry.get());
-
- // Aggregate functions
- RegisterScalarAggregateBasic(registry.get());
- RegisterScalarAggregateMode(registry.get());
- RegisterScalarAggregateQuantile(registry.get());
- RegisterScalarAggregateTDigest(registry.get());
- RegisterScalarAggregateVariance(registry.get());
- RegisterHashAggregateBasic(registry.get());
-
- RegisterAggregateOptions(registry.get());
-
+ RegisterVectorOptions(registry.get());
+
+ // Aggregate functions
+ RegisterScalarAggregateBasic(registry.get());
+ RegisterScalarAggregateMode(registry.get());
+ RegisterScalarAggregateQuantile(registry.get());
+ RegisterScalarAggregateTDigest(registry.get());
+ RegisterScalarAggregateVariance(registry.get());
+ RegisterHashAggregateBasic(registry.get());
+
+ RegisterAggregateOptions(registry.get());
+
return registry;
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
index e83036db6ac..796eba2fb12 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
@@ -32,7 +32,7 @@ namespace arrow {
namespace compute {
class Function;
-class FunctionOptionsType;
+class FunctionOptionsType;
/// \brief A mutable central function registry for built-in functions as well
/// as user-defined functions. Functions are implementations of
@@ -59,11 +59,11 @@ class ARROW_EXPORT FunctionRegistry {
/// function with the given name is not registered
Status AddAlias(const std::string& target_name, const std::string& source_name);
- /// \brief Add a new function options type to the registry. Returns Status::KeyError if
- /// a function options type with the same name is already registered
- Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
- bool allow_overwrite = false);
-
+ /// \brief Add a new function options type to the registry. Returns Status::KeyError if
+ /// a function options type with the same name is already registered
+ Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite = false);
+
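+ /// A hedged usage sketch (the options-type accessor below is
+ /// illustrative, not an existing API):
+ ///
+ ///   ARROW_RETURN_NOT_OK(
+ ///       registry->AddFunctionOptionsType(GetMyFunctionOptionsType()));
+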
/// \brief Retrieve a function by name from the registry
Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const;
@@ -71,10 +71,10 @@ class ARROW_EXPORT FunctionRegistry {
/// displaying a manifest of available functions
std::vector<std::string> GetFunctionNames() const;
- /// \brief Retrieve a function options type by name from the registry
- Result<const FunctionOptionsType*> GetFunctionOptionsType(
- const std::string& name) const;
-
+ /// \brief Retrieve a function options type by name from the registry
+ Result<const FunctionOptionsType*> GetFunctionOptionsType(
+ const std::string& name) const;
+
/// \brief The number of currently registered functions
int num_functions() const;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
index 892b54341da..bc5a2d734f4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
@@ -34,30 +34,30 @@ void RegisterScalarSetLookup(FunctionRegistry* registry);
void RegisterScalarStringAscii(FunctionRegistry* registry);
void RegisterScalarValidity(FunctionRegistry* registry);
void RegisterScalarFillNull(FunctionRegistry* registry);
-void RegisterScalarIfElse(FunctionRegistry* registry);
-void RegisterScalarTemporal(FunctionRegistry* registry);
-
-void RegisterScalarOptions(FunctionRegistry* registry);
+void RegisterScalarIfElse(FunctionRegistry* registry);
+void RegisterScalarTemporal(FunctionRegistry* registry);
+void RegisterScalarOptions(FunctionRegistry* registry);
+
// Vector functions
void RegisterVectorHash(FunctionRegistry* registry);
-void RegisterVectorReplace(FunctionRegistry* registry);
+void RegisterVectorReplace(FunctionRegistry* registry);
void RegisterVectorSelection(FunctionRegistry* registry);
void RegisterVectorNested(FunctionRegistry* registry);
void RegisterVectorSort(FunctionRegistry* registry);
-void RegisterVectorOptions(FunctionRegistry* registry);
-
+void RegisterVectorOptions(FunctionRegistry* registry);
+
// Aggregate functions
void RegisterScalarAggregateBasic(FunctionRegistry* registry);
-void RegisterScalarAggregateMode(FunctionRegistry* registry);
-void RegisterScalarAggregateQuantile(FunctionRegistry* registry);
-void RegisterScalarAggregateTDigest(FunctionRegistry* registry);
-void RegisterScalarAggregateVariance(FunctionRegistry* registry);
-void RegisterHashAggregateBasic(FunctionRegistry* registry);
-
-void RegisterAggregateOptions(FunctionRegistry* registry);
-
+void RegisterScalarAggregateMode(FunctionRegistry* registry);
+void RegisterScalarAggregateQuantile(FunctionRegistry* registry);
+void RegisterScalarAggregateTDigest(FunctionRegistry* registry);
+void RegisterScalarAggregateVariance(FunctionRegistry* registry);
+void RegisterHashAggregateBasic(FunctionRegistry* registry);
+
+void RegisterAggregateOptions(FunctionRegistry* registry);
+
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
index eebc8c1b678..3a3d2ac4b7d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
@@ -20,16 +20,16 @@
namespace arrow {
struct Datum;
-struct ValueDescr;
+struct ValueDescr;
namespace compute {
-class Function;
-class FunctionOptions;
-
-class CastOptions;
-
-struct ExecBatch;
+class Function;
+class FunctionOptions;
+
+class CastOptions;
+
+struct ExecBatch;
class ExecContext;
class KernelContext;
@@ -38,11 +38,11 @@ struct ScalarKernel;
struct ScalarAggregateKernel;
struct VectorKernel;
-struct KernelState;
-
-class Expression;
-class ExecNode;
-class ExecPlan;
-
+struct KernelState;
+
+class Expression;
+class ExecNode;
+class ExecPlan;
+
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/config.cc b/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
index b93f207161d..7d68f638b6c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
@@ -16,19 +16,19 @@
// under the License.
#include "arrow/config.h"
-
-#include <cstdint>
-
+
+#include <cstdint>
+
#include "arrow/util/config.h"
-#include "arrow/util/cpu_info.h"
+#include "arrow/util/cpu_info.h"
namespace arrow {
-using internal::CpuInfo;
-
-namespace {
-
-const BuildInfo kBuildInfo = {
+using internal::CpuInfo;
+
+namespace {
+
+const BuildInfo kBuildInfo = {
// clang-format off
ARROW_VERSION,
ARROW_VERSION_MAJOR,
@@ -46,33 +46,33 @@ const BuildInfo kBuildInfo = {
// clang-format on
};
-template <typename QueryFlagFunction>
-std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
- if (query_flag(CpuInfo::AVX512)) {
- return "avx512";
- } else if (query_flag(CpuInfo::AVX2)) {
- return "avx2";
- } else if (query_flag(CpuInfo::AVX)) {
- return "avx";
- } else if (query_flag(CpuInfo::SSE4_2)) {
- return "sse4_2";
- } else {
- return "none";
- }
-}
-
-}; // namespace
-
+template <typename QueryFlagFunction>
+std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
+ if (query_flag(CpuInfo::AVX512)) {
+ return "avx512";
+ } else if (query_flag(CpuInfo::AVX2)) {
+ return "avx2";
+ } else if (query_flag(CpuInfo::AVX)) {
+ return "avx";
+ } else if (query_flag(CpuInfo::SSE4_2)) {
+ return "sse4_2";
+ } else {
+ return "none";
+ }
+}
+
+}; // namespace
+
const BuildInfo& GetBuildInfo() { return kBuildInfo; }
-RuntimeInfo GetRuntimeInfo() {
- RuntimeInfo info;
- auto cpu_info = CpuInfo::GetInstance();
- info.simd_level =
- MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); });
- info.detected_simd_level =
- MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); });
- return info;
-}
-
+RuntimeInfo GetRuntimeInfo() {
+ RuntimeInfo info;
+ auto cpu_info = CpuInfo::GetInstance();
+ info.simd_level =
+ MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); });
+ info.detected_simd_level =
+ MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); });
+ return info;
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/config.h b/contrib/libs/apache/arrow/cpp/src/arrow/config.h
index 5ae7e223164..a1abc997984 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/config.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/config.h
@@ -45,17 +45,17 @@ struct BuildInfo {
std::string package_kind;
};
-struct RuntimeInfo {
- /// The enabled SIMD level
- ///
- /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL
- /// environment variable is set to another value.
- std::string simd_level;
-
- /// The SIMD level available on the OS and CPU
- std::string detected_simd_level;
-};
-
+struct RuntimeInfo {
+ /// The enabled SIMD level
+ ///
+ /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL
+ /// environment variable is set to another value.
+ std::string simd_level;
+
+ /// The SIMD level available on the OS and CPU
+ std::string detected_simd_level;
+};
+
/// \brief Get runtime build info.
///
/// The returned values correspond to exact loaded version of the Arrow library,
@@ -64,9 +64,9 @@ struct RuntimeInfo {
ARROW_EXPORT
const BuildInfo& GetBuildInfo();
-/// \brief Get runtime info.
-///
-ARROW_EXPORT
-RuntimeInfo GetRuntimeInfo();
-
+/// \brief Get runtime info.
+///
+ARROW_EXPORT
+RuntimeInfo GetRuntimeInfo();
+
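+ // A hedged usage sketch: the two levels differ when the user caps SIMD
+ // through the ARROW_USER_SIMD_LEVEL environment variable (values are
+ // illustrative):
+ //
+ //   arrow::RuntimeInfo info = arrow::GetRuntimeInfo();
+ //   // e.g. info.detected_simd_level == "avx512" while
+ //   //      info.simd_level == "avx2" under ARROW_USER_SIMD_LEVEL=avx2.
+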
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc b/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
index dd10fce3e4d..5be26f62d6e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
@@ -57,20 +57,20 @@ Datum::Datum(std::shared_ptr<RecordBatch> value) : value(std::move(value)) {}
Datum::Datum(std::shared_ptr<Table> value) : value(std::move(value)) {}
Datum::Datum(std::vector<Datum> value) : value(std::move(value)) {}
-Datum::Datum(bool value) : value(std::make_shared<BooleanScalar>(value)) {}
-Datum::Datum(int8_t value) : value(std::make_shared<Int8Scalar>(value)) {}
-Datum::Datum(uint8_t value) : value(std::make_shared<UInt8Scalar>(value)) {}
-Datum::Datum(int16_t value) : value(std::make_shared<Int16Scalar>(value)) {}
-Datum::Datum(uint16_t value) : value(std::make_shared<UInt16Scalar>(value)) {}
-Datum::Datum(int32_t value) : value(std::make_shared<Int32Scalar>(value)) {}
-Datum::Datum(uint32_t value) : value(std::make_shared<UInt32Scalar>(value)) {}
-Datum::Datum(int64_t value) : value(std::make_shared<Int64Scalar>(value)) {}
-Datum::Datum(uint64_t value) : value(std::make_shared<UInt64Scalar>(value)) {}
-Datum::Datum(float value) : value(std::make_shared<FloatScalar>(value)) {}
-Datum::Datum(double value) : value(std::make_shared<DoubleScalar>(value)) {}
-Datum::Datum(std::string value)
- : value(std::make_shared<StringScalar>(std::move(value))) {}
-Datum::Datum(const char* value) : value(std::make_shared<StringScalar>(value)) {}
+Datum::Datum(bool value) : value(std::make_shared<BooleanScalar>(value)) {}
+Datum::Datum(int8_t value) : value(std::make_shared<Int8Scalar>(value)) {}
+Datum::Datum(uint8_t value) : value(std::make_shared<UInt8Scalar>(value)) {}
+Datum::Datum(int16_t value) : value(std::make_shared<Int16Scalar>(value)) {}
+Datum::Datum(uint16_t value) : value(std::make_shared<UInt16Scalar>(value)) {}
+Datum::Datum(int32_t value) : value(std::make_shared<Int32Scalar>(value)) {}
+Datum::Datum(uint32_t value) : value(std::make_shared<UInt32Scalar>(value)) {}
+Datum::Datum(int64_t value) : value(std::make_shared<Int64Scalar>(value)) {}
+Datum::Datum(uint64_t value) : value(std::make_shared<UInt64Scalar>(value)) {}
+Datum::Datum(float value) : value(std::make_shared<FloatScalar>(value)) {}
+Datum::Datum(double value) : value(std::make_shared<DoubleScalar>(value)) {}
+Datum::Datum(std::string value)
+ : value(std::make_shared<StringScalar>(std::move(value))) {}
+Datum::Datum(const char* value) : value(std::make_shared<StringScalar>(value)) {}
Datum::Datum(const ChunkedArray& value)
: value(std::make_shared<ChunkedArray>(value.chunks(), value.type())) {}
@@ -89,26 +89,26 @@ std::shared_ptr<Array> Datum::make_array() const {
std::shared_ptr<DataType> Datum::type() const {
if (this->kind() == Datum::ARRAY) {
return util::get<std::shared_ptr<ArrayData>>(this->value)->type;
- }
- if (this->kind() == Datum::CHUNKED_ARRAY) {
+ }
+ if (this->kind() == Datum::CHUNKED_ARRAY) {
return util::get<std::shared_ptr<ChunkedArray>>(this->value)->type();
- }
- if (this->kind() == Datum::SCALAR) {
+ }
+ if (this->kind() == Datum::SCALAR) {
return util::get<std::shared_ptr<Scalar>>(this->value)->type;
}
- return nullptr;
-}
-
-std::shared_ptr<Schema> Datum::schema() const {
- if (this->kind() == Datum::RECORD_BATCH) {
- return util::get<std::shared_ptr<RecordBatch>>(this->value)->schema();
- }
- if (this->kind() == Datum::TABLE) {
- return util::get<std::shared_ptr<Table>>(this->value)->schema();
- }
- return nullptr;
+ return nullptr;
}
+std::shared_ptr<Schema> Datum::schema() const {
+ if (this->kind() == Datum::RECORD_BATCH) {
+ return util::get<std::shared_ptr<RecordBatch>>(this->value)->schema();
+ }
+ if (this->kind() == Datum::TABLE) {
+ return util::get<std::shared_ptr<Table>>(this->value)->schema();
+ }
+ return nullptr;
+}
+
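+ // Sketch of how the accessors above dispatch on the variant kind
+ // (illustrative only):
+ //
+ //   Datum d(std::make_shared<Int32Scalar>(42));
+ //   d.kind();   // Datum::SCALAR
+ //   d.type();   // int32()
+ //   d.schema(); // nullptr: only record batches and tables carry a schema
+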
int64_t Datum::length() const {
if (this->kind() == Datum::ARRAY) {
return util::get<std::shared_ptr<ArrayData>>(this->value)->length;
@@ -211,21 +211,21 @@ static std::string FormatValueDescr(const ValueDescr& descr) {
std::string ValueDescr::ToString() const { return FormatValueDescr(*this); }
-std::string ValueDescr::ToString(const std::vector<ValueDescr>& descrs) {
- std::stringstream ss;
- ss << "(";
- for (size_t i = 0; i < descrs.size(); ++i) {
- if (i > 0) {
- ss << ", ";
- }
- ss << descrs[i].ToString();
- }
- ss << ")";
- return ss.str();
-}
-
-void PrintTo(const ValueDescr& descr, std::ostream* os) { *os << descr.ToString(); }
-
+std::string ValueDescr::ToString(const std::vector<ValueDescr>& descrs) {
+ std::stringstream ss;
+ ss << "(";
+ for (size_t i = 0; i < descrs.size(); ++i) {
+ if (i > 0) {
+ ss << ", ";
+ }
+ ss << descrs[i].ToString();
+ }
+ ss << ")";
+ return ss.str();
+}
+
+void PrintTo(const ValueDescr& descr, std::ostream* os) { *os << descr.ToString(); }
+
std::string Datum::ToString() const {
switch (this->kind()) {
case Datum::NONE:
@@ -250,7 +250,7 @@ std::string Datum::ToString() const {
}
ss << values[i].ToString();
}
- ss << ')';
+ ss << ')';
return ss.str();
}
default:
@@ -262,23 +262,23 @@ std::string Datum::ToString() const {
ValueDescr::Shape GetBroadcastShape(const std::vector<ValueDescr>& args) {
for (const auto& descr : args) {
if (descr.shape == ValueDescr::ARRAY) {
- return ValueDescr::ARRAY;
+ return ValueDescr::ARRAY;
}
}
- return ValueDescr::SCALAR;
-}
-
-void PrintTo(const Datum& datum, std::ostream* os) {
- switch (datum.kind()) {
- case Datum::SCALAR:
- *os << datum.scalar()->ToString();
- break;
- case Datum::ARRAY:
- *os << datum.make_array()->ToString();
- break;
- default:
- *os << datum.ToString();
- }
+ return ValueDescr::SCALAR;
}
+void PrintTo(const Datum& datum, std::ostream* os) {
+ switch (datum.kind()) {
+ case Datum::SCALAR:
+ *os << datum.scalar()->ToString();
+ break;
+ case Datum::ARRAY:
+ *os << datum.make_array()->ToString();
+ break;
+ default:
+ *os << datum.ToString();
+ }
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/datum.h b/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
index 6ba6af7f79e..d7f487c273c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
@@ -81,17 +81,17 @@ struct ARROW_EXPORT ValueDescr {
}
bool operator==(const ValueDescr& other) const {
- if (shape != other.shape) return false;
- if (type == other.type) return true;
- return type && type->Equals(other.type);
+ if (shape != other.shape) return false;
+ if (type == other.type) return true;
+ return type && type->Equals(other.type);
}
bool operator!=(const ValueDescr& other) const { return !(*this == other); }
std::string ToString() const;
- static std::string ToString(const std::vector<ValueDescr>&);
-
- ARROW_EXPORT friend void PrintTo(const ValueDescr&, std::ostream*);
+ static std::string ToString(const std::vector<ValueDescr>&);
+
+ ARROW_EXPORT friend void PrintTo(const ValueDescr&, std::ostream*);
};
/// \brief For use with scalar functions, returns the broadcasted Value::Shape
@@ -105,25 +105,25 @@ ValueDescr::Shape GetBroadcastShape(const std::vector<ValueDescr>& args);
struct ARROW_EXPORT Datum {
enum Kind { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION };
- struct Empty {};
-
+ struct Empty {};
+
// Datum variants may have a length. This special value indicates that the
// current variant does not have a length.
static constexpr int64_t kUnknownLength = -1;
- util::Variant<Empty, std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
+ util::Variant<Empty, std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
std::shared_ptr<ChunkedArray>, std::shared_ptr<RecordBatch>,
std::shared_ptr<Table>, std::vector<Datum>>
value;
/// \brief Empty datum, to be populated elsewhere
- Datum() = default;
-
- Datum(const Datum& other) = default;
- Datum& operator=(const Datum& other) = default;
- Datum(Datum&& other) = default;
- Datum& operator=(Datum&& other) = default;
+ Datum() = default;
+ Datum(const Datum& other) = default;
+ Datum& operator=(const Datum& other) = default;
+ Datum(Datum&& other) = default;
+ Datum& operator=(Datum&& other) = default;
+
Datum(std::shared_ptr<Scalar> value) // NOLINT implicit conversion
: value(std::move(value)) {}
@@ -163,8 +163,8 @@ struct ARROW_EXPORT Datum {
explicit Datum(uint64_t value);
explicit Datum(float value);
explicit Datum(double value);
- explicit Datum(std::string value);
- explicit Datum(const char* value);
+ explicit Datum(std::string value);
+ explicit Datum(const char* value);
Datum::Kind kind() const {
switch (this->value.index()) {
@@ -216,11 +216,11 @@ struct ARROW_EXPORT Datum {
}
template <typename ExactType>
- std::shared_ptr<ExactType> array_as() const {
- return internal::checked_pointer_cast<ExactType>(this->make_array());
- }
-
- template <typename ExactType>
+ std::shared_ptr<ExactType> array_as() const {
+ return internal::checked_pointer_cast<ExactType>(this->make_array());
+ }
+
+ template <typename ExactType>
const ExactType& scalar_as() const {
return internal::checked_cast<const ExactType&>(*this->scalar());
}
@@ -253,11 +253,11 @@ struct ARROW_EXPORT Datum {
/// \return nullptr if no type
std::shared_ptr<DataType> type() const;
- /// \brief The schema of the variant, if any
- ///
- /// \return nullptr if no schema
- std::shared_ptr<Schema> schema() const;
-
+ /// \brief The schema of the variant, if any
+ ///
+ /// \return nullptr if no schema
+ std::shared_ptr<Schema> schema() const;
+
/// \brief The value length of the variant, if any
///
/// \return kUnknownLength if no type
@@ -274,8 +274,8 @@ struct ARROW_EXPORT Datum {
bool operator!=(const Datum& other) const { return !Equals(other); }
std::string ToString() const;
-
- ARROW_EXPORT friend void PrintTo(const Datum&, std::ostream*);
+
+ ARROW_EXPORT friend void PrintTo(const Datum&, std::ostream*);
};
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
index 7804c130ca1..86893cb5837 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
@@ -476,14 +476,14 @@ Result<std::shared_ptr<Buffer>> BufferedInputStream::DoRead(int64_t nbytes) {
return impl_->Read(nbytes);
}
-Result<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadata() {
- return impl_->raw()->ReadMetadata();
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadataAsync(
- const IOContext& io_context) {
- return impl_->raw()->ReadMetadataAsync(io_context);
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadata() {
+ return impl_->raw()->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ return impl_->raw()->ReadMetadataAsync(io_context);
+}
+
} // namespace io
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
index 8116613fa4e..3bcc3a82c1c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
@@ -132,9 +132,9 @@ class ARROW_EXPORT BufferedInputStream
// InputStream APIs
bool closed() const override;
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context) override;
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
private:
friend InputStreamConcurrencyWrapper<BufferedInputStream>;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
index 722026ccd9b..8031d897ba5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
@@ -16,11 +16,11 @@
// under the License.
#include <algorithm>
-#include <atomic>
+#include <atomic>
#include <cmath>
-#include <mutex>
+#include <mutex>
#include <utility>
-#include <vector>
+#include <vector>
#include "arrow/buffer.h"
#include "arrow/io/caching.h"
@@ -34,16 +34,16 @@ namespace io {
CacheOptions CacheOptions::Defaults() {
return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
- internal::ReadRangeCache::kDefaultRangeSizeLimit,
- /*lazy=*/false};
-}
-
-CacheOptions CacheOptions::LazyDefaults() {
- return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
- internal::ReadRangeCache::kDefaultRangeSizeLimit,
- /*lazy=*/true};
+ internal::ReadRangeCache::kDefaultRangeSizeLimit,
+ /*lazy=*/false};
}
+CacheOptions CacheOptions::LazyDefaults() {
+ return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
+ internal::ReadRangeCache::kDefaultRangeSizeLimit,
+ /*lazy=*/true};
+}
+
CacheOptions CacheOptions::MakeFromNetworkMetrics(int64_t time_to_first_byte_millis,
int64_t transfer_bandwidth_mib_per_sec,
double ideal_bandwidth_utilization_frac,
@@ -125,7 +125,7 @@ CacheOptions CacheOptions::MakeFromNetworkMetrics(int64_t time_to_first_byte_mil
(1 - ideal_bandwidth_utilization_frac))));
DCHECK_GT(range_size_limit, 0) << "Computed range_size_limit must be > 0";
- return {hole_size_limit, range_size_limit, false};
+ return {hole_size_limit, range_size_limit, false};
}
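+ // A hedged usage sketch (metric values are illustrative; parameter names
+ // follow the declaration in caching.h):
+ //
+ //   auto options = CacheOptions::MakeFromNetworkMetrics(
+ //       /*time_to_first_byte_millis=*/5,
+ //       /*transfer_bandwidth_mib_per_sec=*/100,
+ //       /*ideal_bandwidth_utilization_frac=*/0.9,
+ //       /*max_ideal_request_size_mib=*/64);
+ //   // Slower, higher-latency links yield larger hole/range limits, so
+ //   // more reads are coalesced into fewer requests.
+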
namespace internal {
@@ -134,10 +134,10 @@ struct RangeCacheEntry {
ReadRange range;
Future<std::shared_ptr<Buffer>> future;
- RangeCacheEntry() = default;
- RangeCacheEntry(const ReadRange& range_, Future<std::shared_ptr<Buffer>> future_)
- : range(range_), future(std::move(future_)) {}
-
+ RangeCacheEntry() = default;
+ RangeCacheEntry(const ReadRange& range_, Future<std::shared_ptr<Buffer>> future_)
+ : range(range_), future(std::move(future_)) {}
+
friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry& right) {
return left.range.offset < right.range.offset;
}
@@ -145,36 +145,36 @@ struct RangeCacheEntry {
struct ReadRangeCache::Impl {
std::shared_ptr<RandomAccessFile> file;
- IOContext ctx;
+ IOContext ctx;
CacheOptions options;
// Ordered by offset (so as to find a matching region by binary search)
std::vector<RangeCacheEntry> entries;
- virtual ~Impl() = default;
-
- // Get the future corresponding to a range
- virtual Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) {
- return entry->future;
- }
-
- // Make cache entries for ranges
- virtual std::vector<RangeCacheEntry> MakeCacheEntries(
- const std::vector<ReadRange>& ranges) {
- std::vector<RangeCacheEntry> new_entries;
- new_entries.reserve(ranges.size());
- for (const auto& range : ranges) {
- new_entries.emplace_back(range, file->ReadAsync(ctx, range.offset, range.length));
- }
- return new_entries;
- }
-
- // Add the given ranges to the cache, coalescing them where possible
- virtual Status Cache(std::vector<ReadRange> ranges) {
- ranges = internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
- options.range_size_limit);
- std::vector<RangeCacheEntry> new_entries = MakeCacheEntries(ranges);
- // Add new entries, themselves ordered by offset
+ virtual ~Impl() = default;
+
+ // Get the future corresponding to a range
+ virtual Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) {
+ return entry->future;
+ }
+
+ // Make cache entries for ranges
+ virtual std::vector<RangeCacheEntry> MakeCacheEntries(
+ const std::vector<ReadRange>& ranges) {
+ std::vector<RangeCacheEntry> new_entries;
+ new_entries.reserve(ranges.size());
+ for (const auto& range : ranges) {
+ new_entries.emplace_back(range, file->ReadAsync(ctx, range.offset, range.length));
+ }
+ return new_entries;
+ }
+
+ // Add the given ranges to the cache, coalescing them where possible
+ virtual Status Cache(std::vector<ReadRange> ranges) {
+ ranges = internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
+ options.range_size_limit);
+ std::vector<RangeCacheEntry> new_entries = MakeCacheEntries(ranges);
+ // Add new entries, themselves ordered by offset
if (entries.size() > 0) {
std::vector<RangeCacheEntry> merged(entries.size() + new_entries.size());
std::merge(entries.begin(), entries.end(), new_entries.begin(), new_entries.end(),
@@ -183,134 +183,134 @@ struct ReadRangeCache::Impl {
} else {
entries = std::move(new_entries);
}
- // Prefetch immediately, regardless of executor availability, if possible
- return file->WillNeed(ranges);
- }
-
- // Read the given range from the cache, blocking if needed. Cannot read a range
- // that spans cache entries.
- virtual Result<std::shared_ptr<Buffer>> Read(ReadRange range) {
- if (range.length == 0) {
- static const uint8_t byte = 0;
- return std::make_shared<Buffer>(&byte, 0);
- }
-
- const auto it = std::lower_bound(
- entries.begin(), entries.end(), range,
- [](const RangeCacheEntry& entry, const ReadRange& range) {
- return entry.range.offset + entry.range.length < range.offset + range.length;
- });
- if (it != entries.end() && it->range.Contains(range)) {
- auto fut = MaybeRead(&*it);
- ARROW_ASSIGN_OR_RAISE(auto buf, fut.result());
- return SliceBuffer(std::move(buf), range.offset - it->range.offset, range.length);
- }
- return Status::Invalid("ReadRangeCache did not find matching cache entry");
- }
-
- virtual Future<> Wait() {
- std::vector<Future<>> futures;
- for (auto& entry : entries) {
- futures.emplace_back(MaybeRead(&entry));
- }
- return AllComplete(futures);
- }
-
- // Return a Future that completes when the given ranges have been read.
- virtual Future<> WaitFor(std::vector<ReadRange> ranges) {
- auto end = std::remove_if(ranges.begin(), ranges.end(),
- [](const ReadRange& range) { return range.length == 0; });
- ranges.resize(end - ranges.begin());
- std::vector<Future<>> futures;
- futures.reserve(ranges.size());
- for (auto& range : ranges) {
- const auto it = std::lower_bound(
- entries.begin(), entries.end(), range,
- [](const RangeCacheEntry& entry, const ReadRange& range) {
- return entry.range.offset + entry.range.length < range.offset + range.length;
- });
- if (it != entries.end() && it->range.Contains(range)) {
- futures.push_back(Future<>(MaybeRead(&*it)));
- } else {
- return Status::Invalid("Range was not requested for caching: offset=",
- range.offset, " length=", range.length);
- }
- }
- return AllComplete(futures);
- }
-};
-
-// Don't read ranges when they're first added. Instead, wait until they're requested
-// (either through Read or WaitFor).
-struct ReadRangeCache::LazyImpl : public ReadRangeCache::Impl {
- // Protect against concurrent modification of entries[i]->future
- std::mutex entry_mutex;
-
- virtual ~LazyImpl() = default;
-
- Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) override {
- // Called by superclass Read()/WaitFor() so we have the lock
- if (!entry->future.is_valid()) {
- entry->future = file->ReadAsync(ctx, entry->range.offset, entry->range.length);
- }
- return entry->future;
- }
-
- std::vector<RangeCacheEntry> MakeCacheEntries(
- const std::vector<ReadRange>& ranges) override {
- std::vector<RangeCacheEntry> new_entries;
- new_entries.reserve(ranges.size());
- for (const auto& range : ranges) {
- // In the lazy variant, don't read data here - later, a call to Read or WaitFor
- // will call back to MaybeRead (under the lock) which will fill the future.
- new_entries.emplace_back(range, Future<std::shared_ptr<Buffer>>());
- }
- return new_entries;
- }
-
- Status Cache(std::vector<ReadRange> ranges) override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::Cache(std::move(ranges));
- }
-
- Result<std::shared_ptr<Buffer>> Read(ReadRange range) override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::Read(range);
- }
-
- Future<> Wait() override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::Wait();
- }
-
- Future<> WaitFor(std::vector<ReadRange> ranges) override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::WaitFor(std::move(ranges));
+ // Prefetch immediately, regardless of executor availability, if possible
+ return file->WillNeed(ranges);
}
+
+ // Read the given range from the cache, blocking if needed. Cannot read a range
+ // that spans cache entries.
+ virtual Result<std::shared_ptr<Buffer>> Read(ReadRange range) {
+ if (range.length == 0) {
+ static const uint8_t byte = 0;
+ return std::make_shared<Buffer>(&byte, 0);
+ }
+
+ const auto it = std::lower_bound(
+ entries.begin(), entries.end(), range,
+ [](const RangeCacheEntry& entry, const ReadRange& range) {
+ return entry.range.offset + entry.range.length < range.offset + range.length;
+ });
+ if (it != entries.end() && it->range.Contains(range)) {
+ auto fut = MaybeRead(&*it);
+ ARROW_ASSIGN_OR_RAISE(auto buf, fut.result());
+ return SliceBuffer(std::move(buf), range.offset - it->range.offset, range.length);
+ }
+ return Status::Invalid("ReadRangeCache did not find matching cache entry");
+ }
+
+ virtual Future<> Wait() {
+ std::vector<Future<>> futures;
+ for (auto& entry : entries) {
+ futures.emplace_back(MaybeRead(&entry));
+ }
+ return AllComplete(futures);
+ }
+
+ // Return a Future that completes when the given ranges have been read.
+ virtual Future<> WaitFor(std::vector<ReadRange> ranges) {
+ auto end = std::remove_if(ranges.begin(), ranges.end(),
+ [](const ReadRange& range) { return range.length == 0; });
+ ranges.resize(end - ranges.begin());
+ std::vector<Future<>> futures;
+ futures.reserve(ranges.size());
+ for (auto& range : ranges) {
+ const auto it = std::lower_bound(
+ entries.begin(), entries.end(), range,
+ [](const RangeCacheEntry& entry, const ReadRange& range) {
+ return entry.range.offset + entry.range.length < range.offset + range.length;
+ });
+ if (it != entries.end() && it->range.Contains(range)) {
+ futures.push_back(Future<>(MaybeRead(&*it)));
+ } else {
+ return Status::Invalid("Range was not requested for caching: offset=",
+ range.offset, " length=", range.length);
+ }
+ }
+ return AllComplete(futures);
+ }
};
-ReadRangeCache::ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
+// Don't read ranges when they're first added. Instead, wait until they're requested
+// (either through Read or WaitFor).
+struct ReadRangeCache::LazyImpl : public ReadRangeCache::Impl {
+ // Protect against concurrent modification of entries[i]->future
+ std::mutex entry_mutex;
+
+ virtual ~LazyImpl() = default;
+
+ Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) override {
+ // Called by superclass Read()/WaitFor() so we have the lock
+ if (!entry->future.is_valid()) {
+ entry->future = file->ReadAsync(ctx, entry->range.offset, entry->range.length);
+ }
+ return entry->future;
+ }
+
+ std::vector<RangeCacheEntry> MakeCacheEntries(
+ const std::vector<ReadRange>& ranges) override {
+ std::vector<RangeCacheEntry> new_entries;
+ new_entries.reserve(ranges.size());
+ for (const auto& range : ranges) {
+ // In the lazy variant, don't read data here - later, a call to Read or WaitFor
+ // will call back to MaybeRead (under the lock) which will fill the future.
+ new_entries.emplace_back(range, Future<std::shared_ptr<Buffer>>());
+ }
+ return new_entries;
+ }
+
+ Status Cache(std::vector<ReadRange> ranges) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Cache(std::move(ranges));
+ }
+
+ Result<std::shared_ptr<Buffer>> Read(ReadRange range) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Read(range);
+ }
+
+ Future<> Wait() override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Wait();
+ }
+
+ Future<> WaitFor(std::vector<ReadRange> ranges) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::WaitFor(std::move(ranges));
+ }
+};
+
+ReadRangeCache::ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
CacheOptions options)
- : impl_(options.lazy ? new LazyImpl() : new Impl()) {
+ : impl_(options.lazy ? new LazyImpl() : new Impl()) {
impl_->file = std::move(file);
impl_->ctx = std::move(ctx);
impl_->options = options;
}
-ReadRangeCache::~ReadRangeCache() = default;
+ReadRangeCache::~ReadRangeCache() = default;
Status ReadRangeCache::Cache(std::vector<ReadRange> ranges) {
- return impl_->Cache(std::move(ranges));
+ return impl_->Cache(std::move(ranges));
}
Result<std::shared_ptr<Buffer>> ReadRangeCache::Read(ReadRange range) {
- return impl_->Read(range);
-}
-
-Future<> ReadRangeCache::Wait() { return impl_->Wait(); }
+ return impl_->Read(range);
+}
-Future<> ReadRangeCache::WaitFor(std::vector<ReadRange> ranges) {
- return impl_->WaitFor(std::move(ranges));
+Future<> ReadRangeCache::Wait() { return impl_->Wait(); }
+
+Future<> ReadRangeCache::WaitFor(std::vector<ReadRange> ranges) {
+ return impl_->WaitFor(std::move(ranges));
}
} // namespace internal
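
For orientation, a minimal sketch of the lazy path implemented above; `file` stands for any already-open RandomAccessFile and the ranges are arbitrary (not part of the patch):

#include "arrow/io/caching.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status LazyCacheSketch(std::shared_ptr<arrow::io::RandomAccessFile> file) {
  arrow::io::internal::ReadRangeCache cache(
      file, {}, arrow::io::CacheOptions::LazyDefaults());
  // Cache() coalesces the two adjacent ranges (hole of 0 bytes, so one
  // combined entry) and records them with invalid futures; the only I/O
  // side effect here is the WillNeed() prefetch hint.
  ARROW_RETURN_NOT_OK(cache.Cache({{0, 4096}, {4096, 4096}}));
  // The first Read()/WaitFor() touching an entry fills its future via
  // MaybeRead(), under entry_mutex.
  ARROW_ASSIGN_OR_RAISE(auto buf, cache.Read({0, 4096}));
  return arrow::Status::OK();
}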
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
index 59a9b60e82f..833b36e31a0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
@@ -24,7 +24,7 @@
#include <vector>
#include "arrow/io/interfaces.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -34,19 +34,19 @@ struct ARROW_EXPORT CacheOptions {
static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9;
static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64;
- /// \brief The maximum distance in bytes between two consecutive
+ /// \brief The maximum distance in bytes between two consecutive
/// ranges; beyond this value, ranges are not combined
int64_t hole_size_limit;
- /// \brief The maximum size in bytes of a combined range; if
+ /// \brief The maximum size in bytes of a combined range; if
/// combining two consecutive ranges would produce a range of a
/// size greater than this, they are not combined
int64_t range_size_limit;
- /// \brief A lazy cache does not perform any I/O until requested.
- bool lazy;
+ /// \brief A lazy cache does not perform any I/O until requested.
+ bool lazy;
bool operator==(const CacheOptions& other) const {
return hole_size_limit == other.hole_size_limit &&
- range_size_limit == other.range_size_limit && lazy == other.lazy;
+ range_size_limit == other.range_size_limit && lazy == other.lazy;
}
/// \brief Construct CacheOptions from network storage metrics (e.g. S3).
@@ -69,45 +69,45 @@ struct ARROW_EXPORT CacheOptions {
int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib);
static CacheOptions Defaults();
- static CacheOptions LazyDefaults();
+ static CacheOptions LazyDefaults();
};
namespace internal {
/// \brief A read cache designed to hide IO latencies when reading.
///
-/// This class takes multiple byte ranges that an application expects to read, and
-/// coalesces them into fewer, larger read requests, which benefits performance on some
-/// filesystems, particularly remote ones like Amazon S3. By default, it also issues
-/// these read requests in parallel up front.
-///
-/// To use:
-/// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
-/// the exact offset and length that will later be read. The cache will combine those
-/// ranges according to parameters (see constructor).
-///
-/// By default, the cache will also start fetching the combined ranges in parallel in
-/// the background, unless CacheOptions.lazy is set.
-///
-/// 2. Call WaitFor() to be notified when the given ranges have been read. If
-/// CacheOptions.lazy is set, I/O will be triggered in the background here instead.
-/// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
-/// chunk of the file that can be parsed in parallel).
-///
-/// 3. Call Read() to retrieve the actual data for the given ranges.
-/// A synchronous application may skip WaitFor() and just call Read() - it will still
-/// benefit from coalescing and parallel fetching.
+/// This class takes multiple byte ranges that an application expects to read, and
+/// coalesces them into fewer, larger read requests, which benefits performance on some
+/// filesystems, particularly remote ones like Amazon S3. By default, it also issues
+/// these read requests in parallel up front.
+///
+/// To use:
+/// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
+/// the exact offset and length that will later be read. The cache will combine those
+/// ranges according to parameters (see constructor).
+///
+/// By default, the cache will also start fetching the combined ranges in parallel in
+/// the background, unless CacheOptions.lazy is set.
+///
+/// 2. Call WaitFor() to be notified when the given ranges have been read. If
+/// CacheOptions.lazy is set, I/O will be triggered in the background here instead.
+/// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
+/// chunk of the file that can be parsed in parallel).
+///
+/// 3. Call Read() to retrieve the actual data for the given ranges.
+/// A synchronous application may skip WaitFor() and just call Read() - it will still
+/// benefit from coalescing and parallel fetching.
class ARROW_EXPORT ReadRangeCache {
public:
static constexpr int64_t kDefaultHoleSizeLimit = 8192;
static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024;
/// Construct a read cache with default options
- explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
+ explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
: ReadRangeCache(file, std::move(ctx), CacheOptions::Defaults()) {}
/// Construct a read cache with given options
- explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
+ explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
CacheOptions options);
~ReadRangeCache();
@@ -120,16 +120,16 @@ class ARROW_EXPORT ReadRangeCache {
/// \brief Read a range previously given to Cache().
Result<std::shared_ptr<Buffer>> Read(ReadRange range);
- /// \brief Wait until all ranges added so far have been cached.
- Future<> Wait();
-
- /// \brief Wait until all given ranges have been cached.
- Future<> WaitFor(std::vector<ReadRange> ranges);
-
+ /// \brief Wait until all ranges added so far have been cached.
+ Future<> Wait();
+
+ /// \brief Wait until all given ranges have been cached.
+ Future<> WaitFor(std::vector<ReadRange> ranges);
+
protected:
struct Impl;
- struct LazyImpl;
-
+ struct LazyImpl;
+
std::unique_ptr<Impl> impl_;
};
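
To make the three-step recipe above concrete, a minimal eager-mode sketch (error macros come from arrow/status.h and arrow/result.h; `file` is assumed open):

#include "arrow/io/caching.h"

arrow::Status CachedReadsSketch(std::shared_ptr<arrow::io::RandomAccessFile> file) {
  arrow::io::internal::ReadRangeCache cache(
      file, {}, arrow::io::CacheOptions::Defaults());
  std::vector<arrow::io::ReadRange> ranges = {{0, 65536}, {1 << 20, 65536}};
  ARROW_RETURN_NOT_OK(cache.Cache(ranges));             // 1. coalesce + prefetch
  ARROW_RETURN_NOT_OK(cache.WaitFor(ranges).status());  // 2. block until cached
  ARROW_ASSIGN_OR_RAISE(auto buf, cache.Read(ranges[0]));  // 3. sliced buffer
  return arrow::Status::OK();
}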
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
index 72977f0f297..0e6f4dc339a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
@@ -342,7 +342,7 @@ class CompressedInputStream::Impl {
RETURN_NOT_OK(EnsureCompressedData());
if (compressed_pos_ == compressed_->size()) {
// No more data to decompress
- if (!fresh_decompressor_ && !decompressor_->IsFinished()) {
+ if (!fresh_decompressor_ && !decompressor_->IsFinished()) {
return Status::IOError("Truncated compressed stream");
}
*has_data = false;
@@ -437,14 +437,14 @@ Result<std::shared_ptr<Buffer>> CompressedInputStream::DoRead(int64_t nbytes) {
std::shared_ptr<InputStream> CompressedInputStream::raw() const { return impl_->raw(); }
-Result<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadata() {
- return impl_->raw()->ReadMetadata();
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadataAsync(
- const IOContext& io_context) {
- return impl_->raw()->ReadMetadataAsync(io_context);
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadata() {
+ return impl_->raw()->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ return impl_->raw()->ReadMetadataAsync(io_context);
+}
+
} // namespace io
} // namespace arrow
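
A hedged sketch of the metadata forwarding defined above; Codec::Create and CompressedInputStream::Make are assumed from arrow/util/compression.h and arrow/io/compressed.h:

#include "arrow/io/compressed.h"
#include "arrow/util/compression.h"

arrow::Status MetadataPassThrough(std::shared_ptr<arrow::io::InputStream> raw) {
  ARROW_ASSIGN_OR_RAISE(auto codec,
                        arrow::util::Codec::Create(arrow::Compression::GZIP));
  ARROW_ASSIGN_OR_RAISE(
      auto stream, arrow::io::CompressedInputStream::Make(codec.get(), raw));
  // No decompression happens here: ReadMetadata() forwards straight to raw().
  ARROW_ASSIGN_OR_RAISE(auto metadata, stream->ReadMetadata());
  return arrow::Status::OK();
}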
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
index cd1a7f673ce..9eb5e44139f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
@@ -89,9 +89,9 @@ class ARROW_EXPORT CompressedInputStream
// InputStream interface
bool closed() const override;
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context) override;
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
/// \brief Return the underlying raw input stream.
std::shared_ptr<InputStream> raw() const;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
index 70e15335af2..25308240653 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
@@ -390,11 +390,11 @@ class MemoryMappedFile::MemoryMap
// An object representing the entire memory-mapped region.
// It can be sliced in order to return individual subregions, which
// will then keep the original region alive as long as necessary.
- class Region : public Buffer {
+ class Region : public Buffer {
public:
Region(std::shared_ptr<MemoryMappedFile::MemoryMap> memory_map, uint8_t* data,
int64_t size)
- : Buffer(data, size) {
+ : Buffer(data, size) {
is_mutable_ = memory_map->writable();
}
@@ -539,8 +539,8 @@ class MemoryMappedFile::MemoryMap
void advance(int64_t nbytes) { position_ = position_ + nbytes; }
- uint8_t* data() { return region_ ? region_->data() : nullptr; }
-
+ uint8_t* data() { return region_ ? region_->data() : nullptr; }
+
uint8_t* head() { return data() + position_; }
bool writable() { return file_->mode() != FileMode::READ; }
@@ -696,7 +696,7 @@ Result<std::shared_ptr<Buffer>> MemoryMappedFile::Read(int64_t nbytes) {
return buffer;
}
-Future<std::shared_ptr<Buffer>> MemoryMappedFile::ReadAsync(const IOContext&,
+Future<std::shared_ptr<Buffer>> MemoryMappedFile::ReadAsync(const IOContext&,
int64_t position,
int64_t nbytes) {
return Future<std::shared_ptr<Buffer>>::MakeFinished(ReadAt(position, nbytes));
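
Illustrative only (path and sizes hypothetical): the ReadAsync above is synchronous under the hood, returning an already-finished future around ReadAt, whose result is a Region slice that keeps the mapping alive:

#include "arrow/io/file.h"

arrow::Status MappedReadSketch() {
  ARROW_ASSIGN_OR_RAISE(
      auto mmap, arrow::io::MemoryMappedFile::Open("/tmp/example.bin",
                                                   arrow::io::FileMode::READ));
  ARROW_ASSIGN_OR_RAISE(auto buf, mmap->ReadAt(0, 1024));  // zero-copy Region
  auto fut = mmap->ReadAsync({}, 0, 1024);  // finished future, no thread hop
  return fut.status();
}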
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
index 50d4f2c4dfc..4447f82174f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
@@ -185,7 +185,7 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface {
Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
// Synchronous ReadAsync override
- Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
int64_t nbytes) override;
Status WillNeed(const std::vector<ReadRange>& ranges) override;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
index 954c0f37b2d..cf5d71f2cc8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
@@ -29,11 +29,11 @@
#include "arrow/buffer.h"
#include "arrow/io/concurrency.h"
-#include "arrow/io/type_fwd.h"
+#include "arrow/io/type_fwd.h"
#include "arrow/io/util_internal.h"
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/checked_cast.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/future.h"
#include "arrow/util/iterator.h"
#include "arrow/util/logging.h"
@@ -42,36 +42,36 @@
namespace arrow {
-using internal::checked_pointer_cast;
+using internal::checked_pointer_cast;
using internal::Executor;
using internal::TaskHints;
using internal::ThreadPool;
namespace io {
-static IOContext g_default_io_context{};
+static IOContext g_default_io_context{};
-IOContext::IOContext(MemoryPool* pool, StopToken stop_token)
- : IOContext(pool, internal::GetIOThreadPool(), std::move(stop_token)) {}
-
-const IOContext& default_io_context() { return g_default_io_context; }
-
-int GetIOThreadPoolCapacity() { return internal::GetIOThreadPool()->GetCapacity(); }
-
-Status SetIOThreadPoolCapacity(int threads) {
- return internal::GetIOThreadPool()->SetCapacity(threads);
-}
+IOContext::IOContext(MemoryPool* pool, StopToken stop_token)
+ : IOContext(pool, internal::GetIOThreadPool(), std::move(stop_token)) {}
+const IOContext& default_io_context() { return g_default_io_context; }
+
+int GetIOThreadPoolCapacity() { return internal::GetIOThreadPool()->GetCapacity(); }
+
+Status SetIOThreadPoolCapacity(int threads) {
+ return internal::GetIOThreadPool()->SetCapacity(threads);
+}
+
FileInterface::~FileInterface() = default;
Status FileInterface::Abort() { return Close(); }
-namespace {
-
+namespace {
+
class InputStreamBlockIterator {
public:
InputStreamBlockIterator(std::shared_ptr<InputStream> stream, int64_t block_size)
- : stream_(std::move(stream)), block_size_(block_size) {}
+ : stream_(std::move(stream)), block_size_(block_size) {}
Result<std::shared_ptr<Buffer>> Next() {
if (done_) {
@@ -95,10 +95,10 @@ class InputStreamBlockIterator {
bool done_ = false;
};
-} // namespace
-
-const IOContext& Readable::io_context() const { return g_default_io_context; }
-
+} // namespace
+
+const IOContext& Readable::io_context() const { return g_default_io_context; }
+
Status InputStream::Advance(int64_t nbytes) { return Read(nbytes).status(); }
Result<util::string_view> InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes)) {
@@ -107,22 +107,22 @@ Result<util::string_view> InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes)) {
bool InputStream::supports_zero_copy() const { return false; }
-Result<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadata() {
- return std::shared_ptr<const KeyValueMetadata>{};
-}
-
-// Default ReadMetadataAsync() implementation: simply issue the read on the context's
-// executor
-Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync(
- const IOContext& ctx) {
- auto self = shared_from_this();
- return DeferNotOk(internal::SubmitIO(ctx, [self] { return self->ReadMetadata(); }));
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync() {
- return ReadMetadataAsync(io_context());
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadata() {
+ return std::shared_ptr<const KeyValueMetadata>{};
+}
+
+// Default ReadMetadataAsync() implementation: simply issue the read on the context's
+// executor
+Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync(
+ const IOContext& ctx) {
+ auto self = shared_from_this();
+ return DeferNotOk(internal::SubmitIO(ctx, [self] { return self->ReadMetadata(); }));
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync() {
+ return ReadMetadataAsync(io_context());
+}
+
Result<Iterator<std::shared_ptr<Buffer>>> MakeInputStreamIterator(
std::shared_ptr<InputStream> stream, int64_t block_size) {
if (stream->closed()) {
@@ -132,13 +132,13 @@ Result<Iterator<std::shared_ptr<Buffer>>> MakeInputStreamIterator(
return Iterator<std::shared_ptr<Buffer>>(InputStreamBlockIterator(stream, block_size));
}
-struct RandomAccessFile::Impl {
+struct RandomAccessFile::Impl {
std::mutex lock_;
};
RandomAccessFile::~RandomAccessFile() = default;
-RandomAccessFile::RandomAccessFile() : interface_impl_(new Impl()) {}
+RandomAccessFile::RandomAccessFile() : interface_impl_(new Impl()) {}
Result<int64_t> RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, void* out) {
std::lock_guard<std::mutex> lock(interface_impl_->lock_);
@@ -154,26 +154,26 @@ Result<std::shared_ptr<Buffer>> RandomAccessFile::ReadAt(int64_t position,
}
// Default ReadAsync() implementation: simply issue the read on the context's executor
-Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(const IOContext& ctx,
+Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(const IOContext& ctx,
int64_t position,
int64_t nbytes) {
- auto self = checked_pointer_cast<RandomAccessFile>(shared_from_this());
- return DeferNotOk(internal::SubmitIO(
- ctx, [self, position, nbytes] { return self->ReadAt(position, nbytes); }));
-}
-
-Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(int64_t position,
- int64_t nbytes) {
- return ReadAsync(io_context(), position, nbytes);
+ auto self = checked_pointer_cast<RandomAccessFile>(shared_from_this());
+ return DeferNotOk(internal::SubmitIO(
+ ctx, [self, position, nbytes] { return self->ReadAt(position, nbytes); }));
}
+Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(int64_t position,
+ int64_t nbytes) {
+ return ReadAsync(io_context(), position, nbytes);
+}
+
// Default WillNeed() implementation: no-op
Status RandomAccessFile::WillNeed(const std::vector<ReadRange>& ranges) {
return Status::OK();
}
-Status Writable::Write(util::string_view data) {
- return Write(data.data(), static_cast<int64_t>(data.size()));
+Status Writable::Write(util::string_view data) {
+ return Write(data.data(), static_cast<int64_t>(data.size()));
}
Status Writable::Write(const std::shared_ptr<Buffer>& data) {
@@ -380,15 +380,15 @@ struct ReadRangeCombiner {
auto end = std::remove_if(ranges.begin(), ranges.end(),
[](const ReadRange& range) { return range.length == 0; });
// Sort in position order
- std::sort(ranges.begin(), end,
+ std::sort(ranges.begin(), end,
[](const ReadRange& a, const ReadRange& b) { return a.offset < b.offset; });
- // Remove ranges that overlap 100%
- end = std::unique(ranges.begin(), end,
- [](const ReadRange& left, const ReadRange& right) {
- return right.offset >= left.offset &&
- right.offset + right.length <= left.offset + left.length;
- });
- ranges.resize(end - ranges.begin());
+ // Remove ranges that overlap 100%
+ end = std::unique(ranges.begin(), end,
+ [](const ReadRange& left, const ReadRange& right) {
+ return right.offset >= left.offset &&
+ right.offset + right.length <= left.offset + left.length;
+ });
+ ranges.resize(end - ranges.begin());
// Skip further processing if ranges is empty after removing zero-sized ranges.
if (ranges.empty()) {
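
A worked example of the overlap handling above, using CoalesceReadRanges from util_internal.h (values illustrative):

// Input (offset, length): {0,100}, {10,20}, {200,50}
//  - sorted by offset, std::unique drops {10,20}: contained in {0,100};
//  - the 100-byte hole between offsets 100 and 200 exceeds hole_size_limit,
//    so the survivors stay separate: result is {0,100}, {200,50}.
auto merged = arrow::io::internal::CoalesceReadRanges(
    {{0, 100}, {10, 20}, {200, 50}},
    /*hole_size_limit=*/64, /*range_size_limit=*/1 << 20);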
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
index e524afa99a3..1459b173d89 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
@@ -24,7 +24,7 @@
#include "arrow/io/type_fwd.h"
#include "arrow/type_fwd.h"
-#include "arrow/util/cancel.h"
+#include "arrow/util/cancel.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h"
#include "arrow/util/type_fwd.h"
@@ -49,57 +49,57 @@ struct ReadRange {
}
};
-/// EXPERIMENTAL: options provider for IO tasks
-///
-/// Includes an Executor (which will be used to execute asynchronous reads),
-/// a MemoryPool (which will be used to allocate buffers when zero copy reads
-/// are not possible), and an external id (in case the executor receives tasks from
-/// multiple sources and must distinguish tasks associated with this IOContext).
-struct ARROW_EXPORT IOContext {
- // No specified executor: will use a global IO thread pool
- IOContext() : IOContext(default_memory_pool(), StopToken::Unstoppable()) {}
-
- explicit IOContext(StopToken stop_token)
- : IOContext(default_memory_pool(), std::move(stop_token)) {}
-
- explicit IOContext(MemoryPool* pool, StopToken stop_token = StopToken::Unstoppable());
-
- explicit IOContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
- StopToken stop_token = StopToken::Unstoppable(),
- int64_t external_id = -1)
- : pool_(pool),
- executor_(executor),
- external_id_(external_id),
- stop_token_(std::move(stop_token)) {}
-
- explicit IOContext(::arrow::internal::Executor* executor,
- StopToken stop_token = StopToken::Unstoppable(),
- int64_t external_id = -1)
- : pool_(default_memory_pool()),
- executor_(executor),
- external_id_(external_id),
- stop_token_(std::move(stop_token)) {}
-
- MemoryPool* pool() const { return pool_; }
-
- ::arrow::internal::Executor* executor() const { return executor_; }
-
+/// EXPERIMENTAL: options provider for IO tasks
+///
+/// Includes an Executor (which will be used to execute asynchronous reads),
+/// a MemoryPool (which will be used to allocate buffers when zero copy reads
+/// are not possible), and an external id (in case the executor receives tasks from
+/// multiple sources and must distinguish tasks associated with this IOContext).
+struct ARROW_EXPORT IOContext {
+ // No specified executor: will use a global IO thread pool
+ IOContext() : IOContext(default_memory_pool(), StopToken::Unstoppable()) {}
+
+ explicit IOContext(StopToken stop_token)
+ : IOContext(default_memory_pool(), std::move(stop_token)) {}
+
+ explicit IOContext(MemoryPool* pool, StopToken stop_token = StopToken::Unstoppable());
+
+ explicit IOContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
+ StopToken stop_token = StopToken::Unstoppable(),
+ int64_t external_id = -1)
+ : pool_(pool),
+ executor_(executor),
+ external_id_(external_id),
+ stop_token_(std::move(stop_token)) {}
+
+ explicit IOContext(::arrow::internal::Executor* executor,
+ StopToken stop_token = StopToken::Unstoppable(),
+ int64_t external_id = -1)
+ : pool_(default_memory_pool()),
+ executor_(executor),
+ external_id_(external_id),
+ stop_token_(std::move(stop_token)) {}
+
+ MemoryPool* pool() const { return pool_; }
+
+ ::arrow::internal::Executor* executor() const { return executor_; }
+
// An application-specific ID, forwarded to executor task submissions
- int64_t external_id() const { return external_id_; }
-
- StopToken stop_token() const { return stop_token_; }
-
- private:
- MemoryPool* pool_;
- ::arrow::internal::Executor* executor_;
- int64_t external_id_;
- StopToken stop_token_;
-};
-
-struct ARROW_DEPRECATED("renamed to IOContext in 4.0.0") AsyncContext : public IOContext {
- using IOContext::IOContext;
+ int64_t external_id() const { return external_id_; }
+
+ StopToken stop_token() const { return stop_token_; }
+
+ private:
+ MemoryPool* pool_;
+ ::arrow::internal::Executor* executor_;
+ int64_t external_id_;
+ StopToken stop_token_;
};
+struct ARROW_DEPRECATED("renamed to IOContext in 4.0.0") AsyncContext : public IOContext {
+ using IOContext::IOContext;
+};
+
class ARROW_EXPORT FileInterface {
public:
virtual ~FileInterface() = 0;
@@ -168,7 +168,7 @@ class ARROW_EXPORT Writable {
/// \brief Flush buffered bytes, if any
virtual Status Flush();
- Status Write(util::string_view data);
+ Status Write(util::string_view data);
};
class ARROW_EXPORT Readable {
@@ -189,12 +189,12 @@ class ARROW_EXPORT Readable {
/// In some cases (e.g. a memory-mapped file), this method may avoid a
/// memory copy.
virtual Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) = 0;
-
- /// EXPERIMENTAL: The IOContext associated with this file.
- ///
- /// By default, this is the same as default_io_context(), but it may be
-  /// overridden by subclasses.
- virtual const IOContext& io_context() const;
+
+ /// EXPERIMENTAL: The IOContext associated with this file.
+ ///
+ /// By default, this is the same as default_io_context(), but it may be
+  /// overridden by subclasses.
+ virtual const IOContext& io_context() const;
};
class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable {
@@ -202,9 +202,9 @@ class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable
OutputStream() = default;
};
-class ARROW_EXPORT InputStream : virtual public FileInterface,
- virtual public Readable,
- public std::enable_shared_from_this<InputStream> {
+class ARROW_EXPORT InputStream : virtual public FileInterface,
+ virtual public Readable,
+ public std::enable_shared_from_this<InputStream> {
public:
/// \brief Advance or skip stream indicated number of bytes
/// \param[in] nbytes the number to move forward
@@ -227,23 +227,23 @@ class ARROW_EXPORT InputStream : virtual public FileInterface,
/// Zero copy reads imply the use of Buffer-returning Read() overloads.
virtual bool supports_zero_copy() const;
- /// \brief Read and return stream metadata
- ///
- /// If the stream implementation doesn't support metadata, empty metadata
- /// is returned. Note that it is allowed to return a null pointer rather
- /// than an allocated empty metadata.
- virtual Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
-
- /// \brief Read stream metadata asynchronously
- virtual Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context);
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync();
-
+ /// \brief Read and return stream metadata
+ ///
+ /// If the stream implementation doesn't support metadata, empty metadata
+ /// is returned. Note that it is allowed to return a null pointer rather
+ /// than an allocated empty metadata.
+ virtual Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ /// \brief Read stream metadata asynchronously
+ virtual Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context);
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync();
+
protected:
InputStream() = default;
};
-class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
+class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
public:
/// Necessary because we hold a std::unique_ptr
~RandomAccessFile() override;
@@ -292,12 +292,12 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
virtual Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes);
/// EXPERIMENTAL: Read data asynchronously.
- virtual Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ virtual Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
int64_t nbytes);
- /// EXPERIMENTAL: Read data asynchronously, using the file's IOContext.
- Future<std::shared_ptr<Buffer>> ReadAsync(int64_t position, int64_t nbytes);
-
+ /// EXPERIMENTAL: Read data asynchronously, using the file's IOContext.
+ Future<std::shared_ptr<Buffer>> ReadAsync(int64_t position, int64_t nbytes);
+
/// EXPERIMENTAL: Inform that the given ranges may be read soon.
///
/// Some implementations might arrange to prefetch some of the data.
@@ -309,8 +309,8 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
RandomAccessFile();
private:
- struct ARROW_NO_EXPORT Impl;
- std::unique_ptr<Impl> interface_impl_;
+ struct ARROW_NO_EXPORT Impl;
+ std::unique_ptr<Impl> interface_impl_;
};
class ARROW_EXPORT WritableFile : public OutputStream, public Seekable {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
index 6495242e63b..b52c456fd89 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
@@ -261,10 +261,10 @@ void FixedSizeBufferWriter::set_memcopy_threshold(int64_t threshold) {
// ----------------------------------------------------------------------
// In-memory buffer reader
-BufferReader::BufferReader(std::shared_ptr<Buffer> buffer)
- : buffer_(std::move(buffer)),
- data_(buffer_ ? buffer_->data() : reinterpret_cast<const uint8_t*>("")),
- size_(buffer_ ? buffer_->size() : 0),
+BufferReader::BufferReader(std::shared_ptr<Buffer> buffer)
+ : buffer_(std::move(buffer)),
+ data_(buffer_ ? buffer_->data() : reinterpret_cast<const uint8_t*>("")),
+ size_(buffer_ ? buffer_->size() : 0),
position_(0),
is_open_(true) {}
@@ -320,7 +320,7 @@ Status BufferReader::WillNeed(const std::vector<ReadRange>& ranges) {
return st;
}
-Future<std::shared_ptr<Buffer>> BufferReader::ReadAsync(const IOContext&,
+Future<std::shared_ptr<Buffer>> BufferReader::ReadAsync(const IOContext&,
int64_t position,
int64_t nbytes) {
return Future<std::shared_ptr<Buffer>>::MakeFinished(DoReadAt(position, nbytes));
@@ -344,8 +344,8 @@ Result<std::shared_ptr<Buffer>> BufferReader::DoReadAt(int64_t position, int64_t
DCHECK_GE(nbytes, 0);
// Arrange for data to be paged in
- // RETURN_NOT_OK(::arrow::internal::MemoryAdviseWillNeed(
- // {{const_cast<uint8_t*>(data_ + position), static_cast<size_t>(nbytes)}}));
+ // RETURN_NOT_OK(::arrow::internal::MemoryAdviseWillNeed(
+ // {{const_cast<uint8_t*>(data_ + position), static_cast<size_t>(nbytes)}}));
if (nbytes > 0 && buffer_ != nullptr) {
return SliceBuffer(buffer_, position, nbytes);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
index 8213439ef74..ff9e179d862 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
@@ -88,7 +88,7 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream {
uint8_t* mutable_data_;
};
-/// \brief A helper class to track the size of allocations
+/// \brief A helper class to track the size of allocations
///
/// Writes to this stream do not copy or retain any data, they just bump
/// a size counter that can be later used to know exactly which data size
@@ -145,7 +145,7 @@ class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile {
class ARROW_EXPORT BufferReader
: public internal::RandomAccessFileConcurrencyWrapper<BufferReader> {
public:
- explicit BufferReader(std::shared_ptr<Buffer> buffer);
+ explicit BufferReader(std::shared_ptr<Buffer> buffer);
explicit BufferReader(const Buffer& buffer);
BufferReader(const uint8_t* data, int64_t size);
@@ -160,7 +160,7 @@ class ARROW_EXPORT BufferReader
std::shared_ptr<Buffer> buffer() const { return buffer_; }
// Synchronous ReadAsync override
- Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
int64_t nbytes) override;
Status WillNeed(const std::vector<ReadRange>& ranges) override;
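
A small sketch of the zero-copy path documented above:

#include "arrow/buffer.h"
#include "arrow/io/memory.h"

arrow::Status SliceFromMemorySketch() {
  auto buffer = arrow::Buffer::FromString("hello arrow");
  arrow::io::BufferReader reader(buffer);
  // DoReadAt() slices the backing buffer rather than copying (see memory.cc).
  ARROW_ASSIGN_OR_RAISE(auto slice, reader.ReadAt(6, 5));  // bytes of "arrow"
  return arrow::Status::OK();
}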
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
index 7ef4843a224..48ac06de186 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
@@ -1,95 +1,95 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/io/stdio.h"
-
-#include <iostream>
-
-#include "arrow/buffer.h"
-#include "arrow/result.h"
-
-namespace arrow {
-namespace io {
-
-//
-// StdoutStream implementation
-//
-
-StdoutStream::StdoutStream() : pos_(0) { set_mode(FileMode::WRITE); }
-
-Status StdoutStream::Close() { return Status::OK(); }
-
-bool StdoutStream::closed() const { return false; }
-
-Result<int64_t> StdoutStream::Tell() const { return pos_; }
-
-Status StdoutStream::Write(const void* data, int64_t nbytes) {
- pos_ += nbytes;
- std::cout.write(reinterpret_cast<const char*>(data), nbytes);
- return Status::OK();
-}
-
-//
-// StderrStream implementation
-//
-
-StderrStream::StderrStream() : pos_(0) { set_mode(FileMode::WRITE); }
-
-Status StderrStream::Close() { return Status::OK(); }
-
-bool StderrStream::closed() const { return false; }
-
-Result<int64_t> StderrStream::Tell() const { return pos_; }
-
-Status StderrStream::Write(const void* data, int64_t nbytes) {
- pos_ += nbytes;
- std::cerr.write(reinterpret_cast<const char*>(data), nbytes);
- return Status::OK();
-}
-
-//
-// StdinStream implementation
-//
-
-StdinStream::StdinStream() : pos_(0) { set_mode(FileMode::READ); }
-
-Status StdinStream::Close() { return Status::OK(); }
-
-bool StdinStream::closed() const { return false; }
-
-Result<int64_t> StdinStream::Tell() const { return pos_; }
-
-Result<int64_t> StdinStream::Read(int64_t nbytes, void* out) {
- std::cin.read(reinterpret_cast<char*>(out), nbytes);
- if (std::cin) {
- pos_ += nbytes;
- return nbytes;
- } else {
- return 0;
- }
-}
-
-Result<std::shared_ptr<Buffer>> StdinStream::Read(int64_t nbytes) {
- ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes));
- ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data()));
- ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
- buffer->ZeroPadding();
- return std::move(buffer);
-}
-
-} // namespace io
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/stdio.h"
+
+#include <iostream>
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace io {
+
+//
+// StdoutStream implementation
+//
+
+StdoutStream::StdoutStream() : pos_(0) { set_mode(FileMode::WRITE); }
+
+Status StdoutStream::Close() { return Status::OK(); }
+
+bool StdoutStream::closed() const { return false; }
+
+Result<int64_t> StdoutStream::Tell() const { return pos_; }
+
+Status StdoutStream::Write(const void* data, int64_t nbytes) {
+ pos_ += nbytes;
+ std::cout.write(reinterpret_cast<const char*>(data), nbytes);
+ return Status::OK();
+}
+
+//
+// StderrStream implementation
+//
+
+StderrStream::StderrStream() : pos_(0) { set_mode(FileMode::WRITE); }
+
+Status StderrStream::Close() { return Status::OK(); }
+
+bool StderrStream::closed() const { return false; }
+
+Result<int64_t> StderrStream::Tell() const { return pos_; }
+
+Status StderrStream::Write(const void* data, int64_t nbytes) {
+ pos_ += nbytes;
+ std::cerr.write(reinterpret_cast<const char*>(data), nbytes);
+ return Status::OK();
+}
+
+//
+// StdinStream implementation
+//
+
+StdinStream::StdinStream() : pos_(0) { set_mode(FileMode::READ); }
+
+Status StdinStream::Close() { return Status::OK(); }
+
+bool StdinStream::closed() const { return false; }
+
+Result<int64_t> StdinStream::Tell() const { return pos_; }
+
+Result<int64_t> StdinStream::Read(int64_t nbytes, void* out) {
+ std::cin.read(reinterpret_cast<char*>(out), nbytes);
+ if (std::cin) {
+ pos_ += nbytes;
+ return nbytes;
+ } else {
+ return 0;
+ }
+}
+
+Result<std::shared_ptr<Buffer>> StdinStream::Read(int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes));
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data()));
+ ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
+ buffer->ZeroPadding();
+ return std::move(buffer);
+}
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
index 9484ac77124..6df07d670af 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
@@ -1,82 +1,82 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/io/interfaces.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-namespace io {
-
-// Output stream that just writes to stdout.
-class ARROW_EXPORT StdoutStream : public OutputStream {
- public:
- StdoutStream();
- ~StdoutStream() override {}
-
- Status Close() override;
- bool closed() const override;
-
- Result<int64_t> Tell() const override;
-
- Status Write(const void* data, int64_t nbytes) override;
-
- private:
- int64_t pos_;
-};
-
-// Output stream that just writes to stderr.
-class ARROW_EXPORT StderrStream : public OutputStream {
- public:
- StderrStream();
- ~StderrStream() override {}
-
- Status Close() override;
- bool closed() const override;
-
- Result<int64_t> Tell() const override;
-
- Status Write(const void* data, int64_t nbytes) override;
-
- private:
- int64_t pos_;
-};
-
-// Input stream that just reads from stdin.
-class ARROW_EXPORT StdinStream : public InputStream {
- public:
- StdinStream();
- ~StdinStream() override {}
-
- Status Close() override;
- bool closed() const override;
-
- Result<int64_t> Tell() const override;
-
- Result<int64_t> Read(int64_t nbytes, void* out) override;
-
- Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
-
- private:
- int64_t pos_;
-};
-
-} // namespace io
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+// Output stream that just writes to stdout.
+class ARROW_EXPORT StdoutStream : public OutputStream {
+ public:
+ StdoutStream();
+ ~StdoutStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+// Output stream that just writes to stderr.
+class ARROW_EXPORT StderrStream : public OutputStream {
+ public:
+ StderrStream();
+ ~StderrStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+// Input stream that just reads from stdin.
+class ARROW_EXPORT StdinStream : public InputStream {
+ public:
+ StdinStream();
+ ~StdinStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Result<int64_t> Read(int64_t nbytes, void* out) override;
+
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+} // namespace io
+} // namespace arrow
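
For illustration, the streams above in a copy loop. Note the quirk visible in StdinStream::Read: a short final read trips std::cin's failbit and reports 0 bytes:

#include "arrow/io/stdio.h"

arrow::Status EchoSketch() {
  arrow::io::StdinStream in;
  arrow::io::StdoutStream out;
  while (true) {
    ARROW_ASSIGN_OR_RAISE(auto block, in.Read(65536));
    if (block->size() == 0) break;  // EOF, or a short read per the quirk above
    ARROW_RETURN_NOT_OK(out.Write(block->data(), block->size()));
  }
  return arrow::Status::OK();
}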
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
index 3fdf5a7a9ba..50198ad20ef 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
@@ -145,18 +145,18 @@ Result<int64_t> TransformInputStream::Tell() const {
return impl_->pos_;
}
-Result<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadata() {
- RETURN_NOT_OK(impl_->CheckClosed());
-
- return impl_->wrapped_->ReadMetadata();
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadataAsync(
- const IOContext& io_context) {
- RETURN_NOT_OK(impl_->CheckClosed());
-
- return impl_->wrapped_->ReadMetadataAsync(io_context);
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadata() {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ return impl_->wrapped_->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ return impl_->wrapped_->ReadMetadataAsync(io_context);
+}
+
} // namespace io
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
index c117f275929..6ecaa6d6101 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
@@ -45,10 +45,10 @@ class ARROW_EXPORT TransformInputStream : public InputStream {
Result<int64_t> Read(int64_t nbytes, void* out) override;
Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context) override;
-
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
+
Result<int64_t> Tell() const override;
protected:
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
index a2fd33bf360..632616de1fe 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
@@ -17,9 +17,9 @@
#pragma once
-#include "arrow/type_fwd.h"
-#include "arrow/util/visibility.h"
-
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
namespace arrow {
namespace io {
@@ -27,30 +27,30 @@ struct FileMode {
enum type { READ, WRITE, READWRITE };
};
-struct IOContext;
-struct CacheOptions;
-
-/// EXPERIMENTAL: convenience global singleton for default IOContext settings
-ARROW_EXPORT
-const IOContext& default_io_context();
-
-/// \brief Get the capacity of the global I/O thread pool
-///
-/// Return the number of worker threads in the thread pool to which
-/// Arrow dispatches various I/O-bound tasks. This is an ideal number,
-/// not necessarily the exact number of threads at a given point in time.
-///
-/// You can change this number using SetIOThreadPoolCapacity().
-ARROW_EXPORT int GetIOThreadPoolCapacity();
-
-/// \brief Set the capacity of the global I/O thread pool
-///
-/// Set the number of worker threads in the thread pool to which
-/// Arrow dispatches various I/O-bound tasks.
-///
-/// The current number is returned by GetIOThreadPoolCapacity().
-ARROW_EXPORT Status SetIOThreadPoolCapacity(int threads);
-
+struct IOContext;
+struct CacheOptions;
+
+/// EXPERIMENTAL: convenience global singleton for default IOContext settings
+ARROW_EXPORT
+const IOContext& default_io_context();
+
+/// \brief Get the capacity of the global I/O thread pool
+///
+/// Return the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks. This is an ideal number,
+/// not necessarily the exact number of threads at a given point in time.
+///
+/// You can change this number using SetIOThreadPoolCapacity().
+ARROW_EXPORT int GetIOThreadPoolCapacity();
+
+/// \brief Set the capacity of the global I/O thread pool
+///
+/// Set the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks.
+///
+/// The current number is returned by GetIOThreadPoolCapacity().
+ARROW_EXPORT Status SetIOThreadPoolCapacity(int threads);
+
class FileInterface;
class Seekable;
class Writable;
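
A usage sketch for the capacity knobs declared above (16 is an arbitrary value):

arrow::Status ResizeIOPoolSketch() {
  ARROW_RETURN_NOT_OK(arrow::io::SetIOThreadPoolCapacity(16));
  int capacity = arrow::io::GetIOThreadPoolCapacity();  // ideal count, now 16
  (void)capacity;
  return arrow::Status::OK();
}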
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
index b1d75d1d0bd..dc9d6781ada 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
@@ -18,11 +18,11 @@
#pragma once
#include <memory>
-#include <utility>
+#include <utility>
#include <vector>
#include "arrow/io/interfaces.h"
-#include "arrow/util/thread_pool.h"
+#include "arrow/util/thread_pool.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
@@ -52,15 +52,15 @@ std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
ARROW_EXPORT
::arrow::internal::ThreadPool* GetIOThreadPool();
-template <typename... SubmitArgs>
-auto SubmitIO(IOContext io_context, SubmitArgs&&... submit_args)
- -> decltype(std::declval<::arrow::internal::Executor*>()->Submit(submit_args...)) {
- ::arrow::internal::TaskHints hints;
- hints.external_id = io_context.external_id();
- return io_context.executor()->Submit(hints, io_context.stop_token(),
- std::forward<SubmitArgs>(submit_args)...);
-}
-
+template <typename... SubmitArgs>
+auto SubmitIO(IOContext io_context, SubmitArgs&&... submit_args)
+ -> decltype(std::declval<::arrow::internal::Executor*>()->Submit(submit_args...)) {
+ ::arrow::internal::TaskHints hints;
+ hints.external_id = io_context.external_id();
+ return io_context.executor()->Submit(hints, io_context.stop_token(),
+ std::forward<SubmitArgs>(submit_args)...);
+}
+
} // namespace internal
} // namespace io
} // namespace arrow
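
The default async implementations in interfaces.cc use exactly this helper; a condensed sketch with assumed names:

#include "arrow/io/util_internal.h"

arrow::Future<std::shared_ptr<arrow::Buffer>> ReadLaterSketch(
    const arrow::io::IOContext& ctx,
    std::shared_ptr<arrow::io::RandomAccessFile> file, int64_t position,
    int64_t nbytes) {
  // SubmitIO() forwards the context's external_id and stop token as TaskHints;
  // DeferNotOk() turns a failed submission into a failed future.
  return arrow::DeferNotOk(arrow::io::internal::SubmitIO(
      ctx, [file, position, nbytes] { return file->ReadAt(position, nbytes); }));
}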
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
index 3ab2c8b3847..13b1424ee5e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
@@ -20,14 +20,14 @@
#include <algorithm>
#include <cstdint>
#include <memory>
-#include <set>
+#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/array/concatenate.h"
-#include "arrow/array/validate.h"
+#include "arrow/array/validate.h"
#include "arrow/extension_type.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
@@ -88,16 +88,16 @@ struct DictionaryFieldMapper::Impl {
int num_fields() const { return static_cast<int>(field_path_to_id.size()); }
- int num_dicts() const {
- std::set<int64_t> uniqueIds;
-
- for (auto& kv : field_path_to_id) {
- uniqueIds.insert(kv.second);
- }
-
- return static_cast<int>(uniqueIds.size());
- }
-
+ int num_dicts() const {
+ std::set<int64_t> uniqueIds;
+
+ for (auto& kv : field_path_to_id) {
+ uniqueIds.insert(kv.second);
+ }
+
+ return static_cast<int>(uniqueIds.size());
+ }
+
private:
void ImportFields(const FieldPosition& pos,
const std::vector<std::shared_ptr<Field>>& fields) {
@@ -151,32 +151,32 @@ Result<int64_t> DictionaryFieldMapper::GetFieldId(std::vector<int> field_path) c
int DictionaryFieldMapper::num_fields() const { return impl_->num_fields(); }
-int DictionaryFieldMapper::num_dicts() const { return impl_->num_dicts(); }
-
+int DictionaryFieldMapper::num_dicts() const { return impl_->num_dicts(); }
+
// ----------------------------------------------------------------------
// DictionaryMemo implementation
-namespace {
-
-bool HasUnresolvedNestedDict(const ArrayData& data) {
- if (data.type->id() == Type::DICTIONARY) {
- if (data.dictionary == nullptr) {
- return true;
- }
- if (HasUnresolvedNestedDict(*data.dictionary)) {
- return true;
- }
- }
- for (const auto& child : data.child_data) {
- if (HasUnresolvedNestedDict(*child)) {
- return true;
- }
- }
- return false;
-}
-
-} // namespace
-
+namespace {
+
+bool HasUnresolvedNestedDict(const ArrayData& data) {
+ if (data.type->id() == Type::DICTIONARY) {
+ if (data.dictionary == nullptr) {
+ return true;
+ }
+ if (HasUnresolvedNestedDict(*data.dictionary)) {
+ return true;
+ }
+ }
+ for (const auto& child : data.child_data) {
+ if (HasUnresolvedNestedDict(*child)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace
+
struct DictionaryMemo::Impl {
// Map of dictionary id to dictionary array(s) (several in case of deltas)
std::unordered_map<int64_t, ArrayDataVector> id_to_dictionary_;
@@ -205,12 +205,12 @@ struct DictionaryMemo::Impl {
// corrupted data. Full validation is necessary for certain types
// (for example nested dictionaries).
for (const auto& data : *data_vector) {
- if (HasUnresolvedNestedDict(*data)) {
- return Status::NotImplemented(
- "Encountered delta dictionary with an unresolved nested dictionary");
- }
- RETURN_NOT_OK(::arrow::internal::ValidateArray(*data));
- RETURN_NOT_OK(::arrow::internal::ValidateArrayFull(*data));
+ if (HasUnresolvedNestedDict(*data)) {
+ return Status::NotImplemented(
+ "Encountered delta dictionary with an unresolved nested dictionary");
+ }
+ RETURN_NOT_OK(::arrow::internal::ValidateArray(*data));
+ RETURN_NOT_OK(::arrow::internal::ValidateArrayFull(*data));
to_combine.push_back(MakeArray(data));
}
ARROW_ASSIGN_OR_RAISE(auto combined_dict, Concatenate(to_combine, pool));
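
num_dicts() above deduplicates with a std::set because several field paths can share one dictionary id. The counting idiom in isolation, with field_path_to_id as a hypothetical stand-in for the mapper's internal map:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <string>

    int CountUniqueDictionaries(const std::map<std::string, int64_t>& field_path_to_id) {
      std::set<int64_t> unique_ids;
      for (const auto& kv : field_path_to_id) {
        unique_ids.insert(kv.second);  // distinct paths may map to the same id
      }
      return static_cast<int>(unique_ids.size());
    }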
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
index e4287cb1974..25fa70f0dfb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
@@ -80,10 +80,10 @@ class ARROW_EXPORT DictionaryFieldMapper {
int num_fields() const;
- /// \brief Returns number of unique dictionaries, taking into
- /// account that different fields can share the same dictionary.
- int num_dicts() const;
-
+ /// \brief Returns number of unique dictionaries, taking into
+ /// account that different fields can share the same dictionary.
+ int num_dicts() const;
+
private:
struct Impl;
std::unique_ptr<Impl> impl_;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
index b1c30eec0b3..3354ee930ed 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
@@ -61,15 +61,15 @@ class ExtensionType;
namespace ipc {
namespace feather {
-namespace {
+namespace {
-using FBB = flatbuffers::FlatBufferBuilder;
+using FBB = flatbuffers::FlatBufferBuilder;
-constexpr const char* kFeatherV1MagicBytes = "FEA1";
-constexpr const int kFeatherDefaultAlignment = 8;
-const uint8_t kPaddingBytes[kFeatherDefaultAlignment] = {0};
-
-inline int64_t PaddedLength(int64_t nbytes) {
+constexpr const char* kFeatherV1MagicBytes = "FEA1";
+constexpr const int kFeatherDefaultAlignment = 8;
+const uint8_t kPaddingBytes[kFeatherDefaultAlignment] = {0};
+
+inline int64_t PaddedLength(int64_t nbytes) {
static const int64_t alignment = kFeatherDefaultAlignment;
return ((nbytes + alignment - 1) / alignment) * alignment;
}
@@ -120,14 +120,14 @@ struct ColumnType {
enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME };
};
-inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) {
+inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) {
return static_cast<TimeUnit::type>(static_cast<int>(unit));
}
/// For compatibility, we sometimes need to write placeholder data just to keep
/// producing files that can be read with an older reader.
-Status WritePaddedBlank(io::OutputStream* stream, int64_t length,
- int64_t* bytes_written) {
+Status WritePaddedBlank(io::OutputStream* stream, int64_t length,
+ int64_t* bytes_written) {
const uint8_t null = 0;
for (int64_t i = 0; i < length; i++) {
RETURN_NOT_OK(stream->Write(&null, 1));
@@ -180,7 +180,7 @@ class ReaderV1 : public Reader {
GetDataType(col->values(), col->metadata_type(), col->metadata(), &type));
fields.push_back(::arrow::field(col->name()->str(), type));
}
- schema_ = ::arrow::schema(std::move(fields));
+ schema_ = ::arrow::schema(std::move(fields));
return Status::OK();
}
@@ -343,7 +343,7 @@ class ReaderV1 : public Reader {
columns.emplace_back();
RETURN_NOT_OK(GetColumn(i, &columns.back()));
}
- *out = Table::Make(this->schema(), std::move(columns), this->num_rows());
+ *out = Table::Make(this->schema(), std::move(columns), this->num_rows());
return Status::OK();
}
@@ -360,8 +360,8 @@ class ReaderV1 : public Reader {
RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
fields.push_back(my_schema->field(field_index));
}
- *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
- this->num_rows());
+ *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
+ this->num_rows());
return Status::OK();
}
@@ -380,8 +380,8 @@ class ReaderV1 : public Reader {
RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
fields.push_back(sch->field(field_index));
}
- *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
- this->num_rows());
+ *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
+ this->num_rows());
return Status::OK();
}
@@ -440,14 +440,14 @@ Result<fbs::Type> ToFlatbufferType(const DataType& type) {
}
}
-inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray(
+inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray(
FBB& fbb, const ArrayMetadata& array) {
return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding::PLAIN, array.offset,
array.length, array.null_count, array.total_bytes);
}
// Convert Feather enums to Flatbuffer enums
-inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) {
+inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) {
return static_cast<fbs::TimeUnit>(static_cast<int>(unit));
}
@@ -459,7 +459,7 @@ const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = {
fbs::TypeMetadata::TimeMetadata // TIME
};
-inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) {
+inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) {
return COLUMN_TYPE_ENUM_MAPPING[column_type];
}
@@ -755,8 +755,8 @@ class ReaderV2 : public Reader {
std::shared_ptr<Schema> schema_;
};
-} // namespace
-
+} // namespace
+
Result<std::shared_ptr<Reader>> Reader::Open(
const std::shared_ptr<io::RandomAccessFile>& source) {
// Pathological case where the file is smaller than the combined header and footer
@@ -801,8 +801,8 @@ Status WriteTable(const Table& table, io::OutputStream* dst,
return WriteFeatherV1(table, dst);
} else {
IpcWriteOptions ipc_options = IpcWriteOptions::Defaults();
- ipc_options.unify_dictionaries = true;
- ipc_options.allow_64bit = true;
+ ipc_options.unify_dictionaries = true;
+ ipc_options.allow_64bit = true;
ARROW_ASSIGN_OR_RAISE(
ipc_options.codec,
util::Codec::Create(properties.compression, properties.compression_level));
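
PaddedLength() above rounds a byte count up to the next multiple of kFeatherDefaultAlignment (8). The arithmetic in a self-contained form, with a few spot checks:

    #include <cassert>
    #include <cstdint>

    // ((nbytes + alignment - 1) / alignment) * alignment rounds up to a
    // multiple of `alignment`; with alignment = 8 this mirrors PaddedLength().
    inline int64_t PaddedLength(int64_t nbytes, int64_t alignment = 8) {
      return ((nbytes + alignment - 1) / alignment) * alignment;
    }

    int main() {
      assert(PaddedLength(0) == 0);
      assert(PaddedLength(1) == 8);
      assert(PaddedLength(8) == 8);
      assert(PaddedLength(13) == 16);
      return 0;
    }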
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
index a32ff6d0a5a..3c43cf7cff7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
@@ -25,7 +25,7 @@
#include <string>
#include <vector>
-#include "arrow/type_fwd.h"
+#include "arrow/type_fwd.h"
#include "arrow/util/compression.h"
#include "arrow/util/visibility.h"
@@ -128,7 +128,7 @@ struct ARROW_EXPORT WriteProperties {
Compression::type compression = Compression::UNCOMPRESSED;
/// Compressor-specific compression level
- int compression_level = ::arrow::util::kUseDefaultCompressionLevel;
+ int compression_level = ::arrow::util::kUseDefaultCompressionLevel;
};
ARROW_EXPORT
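
A usage sketch for the properties above, assuming a hypothetical `table` and `dst` and that ZSTD support was compiled into the library; WriteTable consumes the properties as shown in feather.cc:

    // Hypothetical caller; `table` is a std::shared_ptr<arrow::Table> and
    // `dst` a std::shared_ptr<arrow::io::OutputStream>.
    arrow::ipc::feather::WriteProperties props;
    props.compression = arrow::Compression::ZSTD;
    props.compression_level = ::arrow::util::kUseDefaultCompressionLevel;
    ARROW_RETURN_NOT_OK(arrow::ipc::feather::WriteTable(*table, dst.get(), props));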
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
index 4dd3a664aa6..805a0c44354 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
@@ -1,61 +1,61 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Implement a simple JSON representation format for arrays
-
-#pragma once
-
-#include <memory>
-#include <string>
-
-#include "arrow/status.h"
-#include "arrow/util/string_view.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Array;
-class DataType;
-
-namespace ipc {
-namespace internal {
-namespace json {
-
-ARROW_EXPORT
-Status ArrayFromJSON(const std::shared_ptr<DataType>&, const std::string& json,
- std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status ArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
- std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status ArrayFromJSON(const std::shared_ptr<DataType>&, const char* json,
- std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status DictArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view indices_json,
- util::string_view dictionary_json, std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status ScalarFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
- std::shared_ptr<Scalar>* out);
-
-} // namespace json
-} // namespace internal
-} // namespace ipc
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implement a simple JSON representation format for arrays
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+
+namespace ipc {
+namespace internal {
+namespace json {
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, const std::string& json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, const char* json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status DictArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view indices_json,
+ util::string_view dictionary_json, std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ScalarFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
+ std::shared_ptr<Scalar>* out);
+
+} // namespace json
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
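
A usage sketch for the helpers declared above (internal API, shown only to illustrate the out-parameter style; a string literal selects the const char* overload):

    std::shared_ptr<arrow::Array> out;
    arrow::Status st = arrow::ipc::internal::json::ArrayFromJSON(
        arrow::int32(), "[1, 2, null, 4]", &out);
    // On success, `out` holds an Int32Array with a null at index 2.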
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
index 197556efcea..e047e29c201 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
@@ -32,8 +32,8 @@
#include "arrow/ipc/options.h"
#include "arrow/ipc/util.h"
#include "arrow/status.h"
-#include "arrow/util/endian.h"
-#include "arrow/util/future.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/future.h"
#include "arrow/util/logging.h"
#include "arrow/util/ubsan.h"
@@ -269,10 +269,10 @@ std::string FormatMessageType(MessageType type) {
return "record batch";
case MessageType::DICTIONARY_BATCH:
return "dictionary";
- case MessageType::TENSOR:
- return "tensor";
- case MessageType::SPARSE_TENSOR:
- return "sparse tensor";
+ case MessageType::TENSOR:
+ return "tensor";
+ case MessageType::SPARSE_TENSOR:
+ return "sparse tensor";
default:
break;
}
@@ -325,60 +325,60 @@ Result<std::unique_ptr<Message>> ReadMessage(int64_t offset, int32_t metadata_le
}
}
-Future<std::shared_ptr<Message>> ReadMessageAsync(int64_t offset, int32_t metadata_length,
- int64_t body_length,
- io::RandomAccessFile* file,
- const io::IOContext& context) {
- struct State {
- std::unique_ptr<Message> result;
- std::shared_ptr<MessageDecoderListener> listener;
- std::shared_ptr<MessageDecoder> decoder;
- };
- auto state = std::make_shared<State>();
- state->listener = std::make_shared<AssignMessageDecoderListener>(&state->result);
- state->decoder = std::make_shared<MessageDecoder>(state->listener);
-
- if (metadata_length < state->decoder->next_required_size()) {
- return Status::Invalid("metadata_length should be at least ",
- state->decoder->next_required_size());
- }
- return file->ReadAsync(context, offset, metadata_length + body_length)
- .Then([=](std::shared_ptr<Buffer> metadata) -> Result<std::shared_ptr<Message>> {
- if (metadata->size() < metadata_length) {
- return Status::Invalid("Expected to read ", metadata_length,
- " metadata bytes but got ", metadata->size());
- }
- ARROW_RETURN_NOT_OK(
- state->decoder->Consume(SliceBuffer(metadata, 0, metadata_length)));
- switch (state->decoder->state()) {
- case MessageDecoder::State::INITIAL:
- return std::move(state->result);
- case MessageDecoder::State::METADATA_LENGTH:
- return Status::Invalid("metadata length is missing. File offset: ", offset,
- ", metadata length: ", metadata_length);
- case MessageDecoder::State::METADATA:
- return Status::Invalid("flatbuffer size ",
- state->decoder->next_required_size(),
- " invalid. File offset: ", offset,
- ", metadata length: ", metadata_length);
- case MessageDecoder::State::BODY: {
- auto body = SliceBuffer(metadata, metadata_length, body_length);
- if (body->size() < state->decoder->next_required_size()) {
- return Status::IOError("Expected to be able to read ",
- state->decoder->next_required_size(),
- " bytes for message body, got ", body->size());
- }
- RETURN_NOT_OK(state->decoder->Consume(body));
- return std::move(state->result);
- }
- case MessageDecoder::State::EOS:
- return Status::Invalid("Unexpected empty message in IPC file format");
- default:
- return Status::Invalid("Unexpected state: ", state->decoder->state());
- }
- });
-}
-
+Future<std::shared_ptr<Message>> ReadMessageAsync(int64_t offset, int32_t metadata_length,
+ int64_t body_length,
+ io::RandomAccessFile* file,
+ const io::IOContext& context) {
+ struct State {
+ std::unique_ptr<Message> result;
+ std::shared_ptr<MessageDecoderListener> listener;
+ std::shared_ptr<MessageDecoder> decoder;
+ };
+ auto state = std::make_shared<State>();
+ state->listener = std::make_shared<AssignMessageDecoderListener>(&state->result);
+ state->decoder = std::make_shared<MessageDecoder>(state->listener);
+
+ if (metadata_length < state->decoder->next_required_size()) {
+ return Status::Invalid("metadata_length should be at least ",
+ state->decoder->next_required_size());
+ }
+ return file->ReadAsync(context, offset, metadata_length + body_length)
+ .Then([=](std::shared_ptr<Buffer> metadata) -> Result<std::shared_ptr<Message>> {
+ if (metadata->size() < metadata_length) {
+ return Status::Invalid("Expected to read ", metadata_length,
+ " metadata bytes but got ", metadata->size());
+ }
+ ARROW_RETURN_NOT_OK(
+ state->decoder->Consume(SliceBuffer(metadata, 0, metadata_length)));
+ switch (state->decoder->state()) {
+ case MessageDecoder::State::INITIAL:
+ return std::move(state->result);
+ case MessageDecoder::State::METADATA_LENGTH:
+ return Status::Invalid("metadata length is missing. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::METADATA:
+ return Status::Invalid("flatbuffer size ",
+ state->decoder->next_required_size(),
+ " invalid. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::BODY: {
+ auto body = SliceBuffer(metadata, metadata_length, body_length);
+ if (body->size() < state->decoder->next_required_size()) {
+ return Status::IOError("Expected to be able to read ",
+ state->decoder->next_required_size(),
+ " bytes for message body, got ", body->size());
+ }
+ RETURN_NOT_OK(state->decoder->Consume(body));
+ return std::move(state->result);
+ }
+ case MessageDecoder::State::EOS:
+ return Status::Invalid("Unexpected empty message in IPC file format");
+ default:
+ return Status::Invalid("Unexpected state: ", state->decoder->state());
+ }
+ });
+}
+
Status AlignStream(io::InputStream* stream, int32_t alignment) {
ARROW_ASSIGN_OR_RAISE(int64_t position, stream->Tell());
return stream->Advance(PaddedLength(position, alignment) - position);
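
ReadMessageAsync above issues one ReadAsync covering metadata_length + body_length bytes and then slices that single buffer twice. A simplified, non-owning analogue of the slicing arithmetic (View is a hypothetical stand-in for SliceBuffer's result):

    #include <cstdint>
    #include <utility>
    #include <vector>

    struct View { const uint8_t* data; int64_t size; };

    // One contiguous read covers metadata + body; the two views are
    // [0, metadata_length) and [metadata_length, metadata_length + body_length).
    std::pair<View, View> SplitMessage(const std::vector<uint8_t>& buf,
                                       int64_t metadata_length, int64_t body_length) {
      View metadata{buf.data(), metadata_length};
      View body{buf.data() + metadata_length, body_length};
      return {metadata, body};
    }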
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
index b2683259cb4..d437bdfe773 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
@@ -365,7 +365,7 @@ class ARROW_EXPORT MessageDecoder {
/// memcpy(buffer->mutable_data() + current_buffer_size,
/// small_chunk,
/// small_chunk_size);
- /// if (buffer->size() < decoder.next_required_size()) {
+ /// if (buffer->size() < decoder.next_required_size()) {
/// continue;
/// }
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
@@ -459,11 +459,11 @@ Result<std::unique_ptr<Message>> ReadMessage(const int64_t offset,
const int32_t metadata_length,
io::RandomAccessFile* file);
-ARROW_EXPORT
-Future<std::shared_ptr<Message>> ReadMessageAsync(
- const int64_t offset, const int32_t metadata_length, const int64_t body_length,
- io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context());
-
+ARROW_EXPORT
+Future<std::shared_ptr<Message>> ReadMessageAsync(
+ const int64_t offset, const int32_t metadata_length, const int64_t body_length,
+ io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context());
+
/// \brief Advance stream to an 8-byte offset if its position is not a multiple
/// of 8 already
/// \param[in] stream an input stream
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
index 4b332bd9e1e..9d0db6a0d8b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
@@ -271,12 +271,12 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
return Status::OK();
case flatbuf::Type::Decimal: {
auto dec_type = static_cast<const flatbuf::Decimal*>(type_data);
- if (dec_type->bitWidth() == 128) {
- return Decimal128Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
- } else if (dec_type->bitWidth() == 256) {
- return Decimal256Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
- } else {
- return Status::Invalid("Library only supports 128-bit or 256-bit decimal values");
+ if (dec_type->bitWidth() == 128) {
+ return Decimal128Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
+ } else if (dec_type->bitWidth() == 256) {
+ return Decimal256Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
+ } else {
+ return Status::Invalid("Library only supports 128-bit or 256-bit decimal values");
}
}
case flatbuf::Type::Date: {
@@ -428,7 +428,7 @@ static Status GetDictionaryEncoding(FBB& fbb, const std::shared_ptr<Field>& fiel
const DictionaryType& type, int64_t dictionary_id,
DictionaryOffset* out) {
// We assume that the dictionary index type (as an integer) has already been
- // validated elsewhere, so we can safely treat the indices as integers
+ // validated elsewhere, so we can safely treat the indices as integers
const auto& index_type = checked_cast<const IntegerType&>(*type.index_type());
auto index_type_offset =
@@ -594,24 +594,24 @@ class FieldToFlatbufferVisitor {
return Status::OK();
}
- Status Visit(const Decimal128Type& type) {
+ Status Visit(const Decimal128Type& type) {
const auto& dec_type = checked_cast<const Decimal128Type&>(type);
fb_type_ = flatbuf::Type::Decimal;
- type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
- /*bitWidth=*/128)
- .Union();
- return Status::OK();
- }
-
- Status Visit(const Decimal256Type& type) {
- const auto& dec_type = checked_cast<const Decimal256Type&>(type);
- fb_type_ = flatbuf::Type::Decimal;
- type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
-                                          /*bitWidth=*/256)
- .Union();
+ type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
+ /*bitWidth=*/128)
+ .Union();
return Status::OK();
}
+ Status Visit(const Decimal256Type& type) {
+ const auto& dec_type = checked_cast<const Decimal256Type&>(type);
+ fb_type_ = flatbuf::Type::Decimal;
+ type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
+                                          /*bitWidth=*/256)
+ .Union();
+ return Status::OK();
+ }
+
Status Visit(const ListType& type) {
fb_type_ = flatbuf::Type::List;
RETURN_NOT_OK(VisitChildFields(type));
@@ -753,15 +753,15 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos,
// Reconstruct the data type
// 1. Data type children
- FieldVector child_fields;
+ FieldVector child_fields;
const auto& children = field->children();
- // As a tolerance, allow for a null children field meaning "no children" (ARROW-12100)
- if (children != nullptr) {
- child_fields.resize(children->size());
- for (int i = 0; i < static_cast<int>(children->size()); ++i) {
- RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), field_pos.child(i),
- dictionary_memo, &child_fields[i]));
- }
+ // As a tolerance, allow for a null children field meaning "no children" (ARROW-12100)
+ if (children != nullptr) {
+ child_fields.resize(children->size());
+ for (int i = 0; i < static_cast<int>(children->size()); ++i) {
+ RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), field_pos.child(i),
+ dictionary_memo, &child_fields[i]));
+ }
}
// 2. Top-level concrete data type
@@ -871,12 +871,12 @@ Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema,
Result<std::shared_ptr<Buffer>> WriteFBMessage(
FBB& fbb, flatbuf::MessageHeader header_type, flatbuffers::Offset<void> header,
int64_t body_length, MetadataVersion version,
- const std::shared_ptr<const KeyValueMetadata>& custom_metadata, MemoryPool* pool) {
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata, MemoryPool* pool) {
auto message = flatbuf::CreateMessage(fbb, MetadataVersionToFlatbuffer(version),
header_type, header, body_length,
SerializeCustomMetadata(fbb, custom_metadata));
fbb.Finish(message);
- return WriteFlatbufferBuilder(fbb, pool);
+ return WriteFlatbufferBuilder(fbb, pool);
}
using FieldNodeVector =
@@ -1183,8 +1183,8 @@ Status WriteSchemaMessage(const Schema& schema, const DictionaryFieldMapper& map
flatbuffers::Offset<flatbuf::Schema> fb_schema;
RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, mapper, &fb_schema));
return WriteFBMessage(fbb, flatbuf::MessageHeader::Schema, fb_schema.Union(),
- /*body_length=*/0, options.metadata_version,
- /*custom_metadata=*/nullptr, options.memory_pool)
+ /*body_length=*/0, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool)
.Value(out);
}
@@ -1198,8 +1198,8 @@ Status WriteRecordBatchMessage(
RETURN_NOT_OK(
MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch));
return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(),
- body_length, options.metadata_version, custom_metadata,
- options.memory_pool)
+ body_length, options.metadata_version, custom_metadata,
+ options.memory_pool)
.Value(out);
}
@@ -1233,8 +1233,8 @@ Result<std::shared_ptr<Buffer>> WriteTensorMessage(const Tensor& tensor,
flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer);
return WriteFBMessage(fbb, flatbuf::MessageHeader::Tensor, fb_tensor.Union(),
- body_length, options.metadata_version,
- /*custom_metadata=*/nullptr, options.memory_pool);
+ body_length, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool);
}
Result<std::shared_ptr<Buffer>> WriteSparseTensorMessage(
@@ -1245,8 +1245,8 @@ Result<std::shared_ptr<Buffer>> WriteSparseTensorMessage(
RETURN_NOT_OK(
MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor));
return WriteFBMessage(fbb, flatbuf::MessageHeader::SparseTensor,
- fb_sparse_tensor.Union(), body_length, options.metadata_version,
- /*custom_metadata=*/nullptr, options.memory_pool);
+ fb_sparse_tensor.Union(), body_length, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool);
}
Status WriteDictionaryMessage(
@@ -1261,8 +1261,8 @@ Status WriteDictionaryMessage(
auto dictionary_batch =
flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union();
return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch,
- body_length, options.metadata_version, custom_metadata,
- options.memory_pool)
+ body_length, options.metadata_version, custom_metadata,
+ options.memory_pool)
.Value(out);
}
@@ -1338,11 +1338,11 @@ Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo,
std::shared_ptr<KeyValueMetadata> metadata;
RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata));
- // set endianness using the value in flatbuf schema
- auto endianness = schema->endianness() == flatbuf::Endianness::Little
- ? Endianness::Little
- : Endianness::Big;
- *out = ::arrow::schema(std::move(fields), endianness, metadata);
+ // set endianness using the value in flatbuf schema
+ auto endianness = schema->endianness() == flatbuf::Endianness::Little
+ ? Endianness::Little
+ : Endianness::Big;
+ *out = ::arrow::schema(std::move(fields), endianness, metadata);
return Status::OK();
}
@@ -1356,9 +1356,9 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
return Status::IOError("Header-type of flatbuffer-encoded Message is not Tensor.");
}
- flatbuffers::uoffset_t ndim = tensor->shape()->size();
+ flatbuffers::uoffset_t ndim = tensor->shape()->size();
- for (flatbuffers::uoffset_t i = 0; i < ndim; ++i) {
+ for (flatbuffers::uoffset_t i = 0; i < ndim; ++i) {
auto dim = tensor->shape()->Get(i);
shape->push_back(dim->size());
@@ -1366,12 +1366,12 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
}
if (tensor->strides() && tensor->strides()->size() > 0) {
- if (tensor->strides()->size() != ndim) {
- return Status::IOError(
- "The sizes of shape and strides in a tensor are mismatched.");
- }
-
- for (decltype(ndim) i = 0; i < ndim; ++i) {
+ if (tensor->strides()->size() != ndim) {
+ return Status::IOError(
+ "The sizes of shape and strides in a tensor are mismatched.");
+ }
+
+ for (decltype(ndim) i = 0; i < ndim; ++i) {
strides->push_back(tensor->strides()->Get(i));
}
}
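
The Decimal branch above dispatches purely on the serialized bit width. The same dispatch reduced to a free function, using the factories shown in the hunk (usual Arrow headers assumed):

    #include <memory>

    #include "arrow/result.h"
    #include "arrow/status.h"
    #include "arrow/type.h"

    arrow::Result<std::shared_ptr<arrow::DataType>> MakeDecimalType(
        int bit_width, int32_t precision, int32_t scale) {
      if (bit_width == 128) {
        return arrow::Decimal128Type::Make(precision, scale);
      } else if (bit_width == 256) {
        return arrow::Decimal256Type::Make(precision, scale);
      }
      // Any other width is rejected, matching the error above.
      return arrow::Status::Invalid(
          "Library only supports 128-bit or 256-bit decimal values");
    }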
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
index 9cf489dd668..d47b244d324 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
@@ -156,22 +156,22 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>
Status GetKeyValueMetadata(const KVVector* fb_metadata,
std::shared_ptr<KeyValueMetadata>* out);
-template <typename RootType>
-bool VerifyFlatbuffers(const uint8_t* data, int64_t size) {
- // Heuristic: tables in an Arrow flatbuffers buffer must take at least 1 bit
- // each on average (ARROW-11559).
- // In particular, the only recursive table (the `Field` table in Schema.fbs)
- // must have a non-empty `type` member.
- flatbuffers::Verifier verifier(
- data, static_cast<size_t>(size),
- /*max_depth=*/128,
- /*max_tables=*/static_cast<flatbuffers::uoffset_t>(8 * size));
- return verifier.VerifyBuffer<RootType>(nullptr);
-}
-
+template <typename RootType>
+bool VerifyFlatbuffers(const uint8_t* data, int64_t size) {
+ // Heuristic: tables in an Arrow flatbuffers buffer must take at least 1 bit
+ // each on average (ARROW-11559).
+ // In particular, the only recursive table (the `Field` table in Schema.fbs)
+ // must have a non-empty `type` member.
+ flatbuffers::Verifier verifier(
+ data, static_cast<size_t>(size),
+ /*max_depth=*/128,
+ /*max_tables=*/static_cast<flatbuffers::uoffset_t>(8 * size));
+ return verifier.VerifyBuffer<RootType>(nullptr);
+}
+
static inline Status VerifyMessage(const uint8_t* data, int64_t size,
const flatbuf::Message** out) {
- if (!VerifyFlatbuffers<flatbuf::Message>(data, size)) {
+ if (!VerifyFlatbuffers<flatbuf::Message>(data, size)) {
return Status::IOError("Invalid flatbuffers message.");
}
*out = flatbuf::GetMessage(data);
@@ -211,11 +211,11 @@ Status WriteDictionaryMessage(
const IpcWriteOptions& options, std::shared_ptr<Buffer>* out);
static inline Result<std::shared_ptr<Buffer>> WriteFlatbufferBuilder(
- flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference
- MemoryPool* pool = default_memory_pool()) {
+ flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference
+ MemoryPool* pool = default_memory_pool()) {
int32_t size = fbb.GetSize();
- ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(size, pool));
+ ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(size, pool));
uint8_t* dst = result->mutable_data();
memcpy(dst, fbb.GetBufferPointer(), size);
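
The max_tables bound above is a worked budget: a buffer of `size` bytes holds 8 * size bits, and since every table must consume at least one bit on average, no well-formed buffer can contain more than 8 * size tables. Callers use it exactly as VerifyMessage does, validating before touching the buffer (`data` and `size` assumed to hold a serialized message):

    if (!VerifyFlatbuffers<flatbuf::Message>(data, size)) {
      return Status::IOError("Invalid flatbuffers message.");
    }
    const flatbuf::Message* message = flatbuf::GetMessage(data);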
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
index 2e0f800b5ad..2845a61523a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
@@ -39,26 +39,26 @@ constexpr int kMaxNestingDepth = 64;
/// \brief Options for writing Arrow IPC messages
struct ARROW_EXPORT IpcWriteOptions {
- /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
- ///
- /// Some implementations may not be able to parse streams created with this option.
+ /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
+ ///
+ /// Some implementations may not be able to parse streams created with this option.
bool allow_64bit = false;
-
- /// \brief The maximum permitted schema nesting depth.
+
+ /// \brief The maximum permitted schema nesting depth.
int max_recursion_depth = kMaxNestingDepth;
- /// \brief Write padding after memory buffers up to this multiple of bytes.
+ /// \brief Write padding after memory buffers up to this multiple of bytes.
int32_t alignment = 8;
- /// \brief Write the pre-0.15.0 IPC message format
- ///
- /// This legacy format consists of a 4-byte prefix instead of 8-byte.
+ /// \brief Write the pre-0.15.0 IPC message format
+ ///
+ /// This legacy format consists of a 4-byte prefix instead of 8-byte.
bool write_legacy_ipc_format = false;
/// \brief The memory pool to use for allocations made during IPC writing
- ///
- /// While Arrow IPC is predominantly zero-copy, it may have to allocate
- /// memory in some cases (for example if compression is enabled).
+ ///
+ /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+ /// memory in some cases (for example if compression is enabled).
MemoryPool* memory_pool = default_memory_pool();
/// \brief Compression codec to use for record batch body buffers
@@ -70,38 +70,38 @@ struct ARROW_EXPORT IpcWriteOptions {
/// like compression
bool use_threads = true;
- /// \brief Whether to emit dictionary deltas
- ///
- /// If false, a changed dictionary for a given field will emit a full
- /// dictionary replacement.
- /// If true, a changed dictionary will be compared against the previous
- /// version. If possible, a dictionary delta will be emitted, otherwise
- /// a full dictionary replacement.
- ///
- /// Default is false to maximize stream compatibility.
- ///
- /// Also, note that if a changed dictionary is a nested dictionary,
- /// then a delta is never emitted, for compatibility with the read path.
- bool emit_dictionary_deltas = false;
-
- /// \brief Whether to unify dictionaries for the IPC file format
- ///
- /// The IPC file format doesn't support dictionary replacements or deltas.
- /// Therefore, chunks of a column with a dictionary type must have the same
- /// dictionary in each record batch.
- ///
- /// If this option is true, RecordBatchWriter::WriteTable will attempt
- /// to unify dictionaries across each table column. If this option is
- /// false, unequal dictionaries across a table column will simply raise
- /// an error.
- ///
- /// Note that enabling this option has a runtime cost. Also, not all types
- /// currently support dictionary unification.
- ///
- /// This option is ignored for IPC streams, which support dictionary replacement
- /// and deltas.
- bool unify_dictionaries = false;
-
+ /// \brief Whether to emit dictionary deltas
+ ///
+ /// If false, a changed dictionary for a given field will emit a full
+ /// dictionary replacement.
+ /// If true, a changed dictionary will be compared against the previous
+ /// version. If possible, a dictionary delta will be emitted, otherwise
+ /// a full dictionary replacement.
+ ///
+ /// Default is false to maximize stream compatibility.
+ ///
+ /// Also, note that if a changed dictionary is a nested dictionary,
+ /// then a delta is never emitted, for compatibility with the read path.
+ bool emit_dictionary_deltas = false;
+
+ /// \brief Whether to unify dictionaries for the IPC file format
+ ///
+ /// The IPC file format doesn't support dictionary replacements or deltas.
+ /// Therefore, chunks of a column with a dictionary type must have the same
+ /// dictionary in each record batch.
+ ///
+ /// If this option is true, RecordBatchWriter::WriteTable will attempt
+ /// to unify dictionaries across each table column. If this option is
+ /// false, unequal dictionaries across a table column will simply raise
+ /// an error.
+ ///
+ /// Note that enabling this option has a runtime cost. Also, not all types
+ /// currently support dictionary unification.
+ ///
+ /// This option is ignored for IPC streams, which support dictionary replacement
+ /// and deltas.
+ bool unify_dictionaries = false;
+
/// \brief Format version to use for IPC messages and their metadata.
///
/// Presently using V5 version (readable by 1.0.0 and later).
@@ -115,40 +115,40 @@ struct ARROW_EXPORT IpcWriteOptions {
using IpcOptions = IpcWriteOptions;
#endif
-/// \brief Options for reading Arrow IPC messages
+/// \brief Options for reading Arrow IPC messages
struct ARROW_EXPORT IpcReadOptions {
- /// \brief The maximum permitted schema nesting depth.
+ /// \brief The maximum permitted schema nesting depth.
int max_recursion_depth = kMaxNestingDepth;
- /// \brief The memory pool to use for allocations made during IPC reading
- ///
- /// While Arrow IPC is predominantly zero-copy, it may have to allocate
- /// memory in some cases (for example if compression is enabled).
+ /// \brief The memory pool to use for allocations made during IPC reading
+ ///
+ /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+ /// memory in some cases (for example if compression is enabled).
MemoryPool* memory_pool = default_memory_pool();
/// \brief EXPERIMENTAL: Top-level schema fields to include when
- /// deserializing RecordBatch.
- ///
- /// If empty (the default), return all deserialized fields.
- /// If non-empty, the values are the indices of fields in the top-level schema.
+ /// deserializing RecordBatch.
+ ///
+ /// If empty (the default), return all deserialized fields.
+ /// If non-empty, the values are the indices of fields in the top-level schema.
std::vector<int> included_fields;
/// \brief Use global CPU thread pool to parallelize any computational tasks
/// like decompression
bool use_threads = true;
- /// \brief EXPERIMENTAL: Convert incoming data to platform-native endianness
- ///
- /// If the endianness of the received schema is not equal to platform-native
- /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
- /// This includes the value buffers of numeric types, temporal types, decimal
- /// types, as well as the offset buffers of variable-sized binary and list-like
- /// types.
- ///
- /// Endianness conversion is achieved by the RecordBatchFileReader,
- /// RecordBatchStreamReader and StreamDecoder classes.
- bool ensure_native_endian = true;
-
+ /// \brief EXPERIMENTAL: Convert incoming data to platform-native endianness
+ ///
+ /// If the endianness of the received schema is not equal to platform-native
+ /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
+ /// This includes the value buffers of numeric types, temporal types, decimal
+ /// types, as well as the offset buffers of variable-sized binary and list-like
+ /// types.
+ ///
+ /// Endianness conversion is achieved by the RecordBatchFileReader,
+ /// RecordBatchStreamReader and StreamDecoder classes.
+ bool ensure_native_endian = true;
+
static IpcReadOptions Defaults();
};
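
A usage sketch for the read options above, relying only on the fields and the Defaults() factory declared in this header (the resulting options would then be passed to a reader-opening call):

    auto read_options = arrow::ipc::IpcReadOptions::Defaults();
    read_options.included_fields = {0, 2};     // deserialize only schema fields 0 and 2
    read_options.use_threads = true;           // parallelize work such as decompression
    read_options.ensure_native_endian = true;  // byte-swap foreign-endian buffers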
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
index a3c345cc440..5e90be7d4e6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
@@ -31,7 +31,7 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/extension_type.h"
-#include "arrow/io/caching.h"
+#include "arrow/io/caching.h"
#include "arrow/io/interfaces.h"
#include "arrow/io/memory.h"
#include "arrow/ipc/message.h"
@@ -47,14 +47,14 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/compression.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/parallel.h"
-#include "arrow/util/string.h"
-#include "arrow/util/thread_pool.h"
+#include "arrow/util/string.h"
+#include "arrow/util/thread_pool.h"
#include "arrow/util/ubsan.h"
-#include "arrow/util/vector.h"
+#include "arrow/util/vector.h"
#include "arrow/visitor_inline.h"
#include "generated/File_generated.h" // IWYU pragma: export
@@ -112,30 +112,30 @@ Status InvalidMessageType(MessageType expected, MessageType actual) {
// ----------------------------------------------------------------------
// Record batch read path
-/// \brief Structure to keep common arguments to be passed
-struct IpcReadContext {
- IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap,
- MetadataVersion version = MetadataVersion::V5,
- Compression::type kind = Compression::UNCOMPRESSED)
- : dictionary_memo(memo),
- options(option),
- metadata_version(version),
- compression(kind),
- swap_endian(swap) {}
-
- DictionaryMemo* dictionary_memo;
-
- const IpcReadOptions& options;
-
- MetadataVersion metadata_version;
-
- Compression::type compression;
-
- /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements
- /// if this flag is true
- const bool swap_endian;
-};
-
+/// \brief Structure to keep common arguments to be passed
+struct IpcReadContext {
+ IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap,
+ MetadataVersion version = MetadataVersion::V5,
+ Compression::type kind = Compression::UNCOMPRESSED)
+ : dictionary_memo(memo),
+ options(option),
+ metadata_version(version),
+ compression(kind),
+ swap_endian(swap) {}
+
+ DictionaryMemo* dictionary_memo;
+
+ const IpcReadOptions& options;
+
+ MetadataVersion metadata_version;
+
+ Compression::type compression;
+
+ /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements
+ /// if this flag is true
+ const bool swap_endian;
+};
+
/// The field_index and buffer_index are incremented based on how much of the
/// batch is "consumed" (through nested data reconstruction, for example)
class ArrayLoader {
@@ -467,9 +467,9 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op
Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
- const std::vector<bool>* inclusion_mask, const IpcReadContext& context,
- io::RandomAccessFile* file) {
- ArrayLoader loader(metadata, context.metadata_version, context.options, file);
+ const std::vector<bool>* inclusion_mask, const IpcReadContext& context,
+ io::RandomAccessFile* file) {
+ ArrayLoader loader(metadata, context.metadata_version, context.options, file);
ArrayDataVector columns(schema->num_fields());
ArrayDataVector filtered_columns;
@@ -499,8 +499,8 @@ Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
// Dictionary resolution needs to happen on the unfiltered columns,
// because fields are mapped structurally (by path in the original schema).
- RETURN_NOT_OK(ResolveDictionaries(columns, *context.dictionary_memo,
- context.options.memory_pool));
+ RETURN_NOT_OK(ResolveDictionaries(columns, *context.dictionary_memo,
+ context.options.memory_pool));
if (inclusion_mask) {
filtered_schema = ::arrow::schema(std::move(filtered_fields), schema->metadata());
@@ -509,30 +509,30 @@ Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
filtered_schema = schema;
filtered_columns = std::move(columns);
}
- if (context.compression != Compression::UNCOMPRESSED) {
- RETURN_NOT_OK(
- DecompressBuffers(context.compression, context.options, &filtered_columns));
+ if (context.compression != Compression::UNCOMPRESSED) {
+ RETURN_NOT_OK(
+ DecompressBuffers(context.compression, context.options, &filtered_columns));
}
- // swap endian in a set of ArrayData if necessary (swap_endian == true)
- if (context.swap_endian) {
- for (int i = 0; i < static_cast<int>(filtered_columns.size()); ++i) {
- ARROW_ASSIGN_OR_RAISE(filtered_columns[i],
- arrow::internal::SwapEndianArrayData(filtered_columns[i]));
- }
- }
- return RecordBatch::Make(std::move(filtered_schema), metadata->length(),
+ // swap endian in a set of ArrayData if necessary (swap_endian == true)
+ if (context.swap_endian) {
+ for (int i = 0; i < static_cast<int>(filtered_columns.size()); ++i) {
+ ARROW_ASSIGN_OR_RAISE(filtered_columns[i],
+ arrow::internal::SwapEndianArrayData(filtered_columns[i]));
+ }
+ }
+ return RecordBatch::Make(std::move(filtered_schema), metadata->length(),
std::move(filtered_columns));
}
Result<std::shared_ptr<RecordBatch>> LoadRecordBatch(
const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
- const std::vector<bool>& inclusion_mask, const IpcReadContext& context,
- io::RandomAccessFile* file) {
+ const std::vector<bool>& inclusion_mask, const IpcReadContext& context,
+ io::RandomAccessFile* file) {
if (inclusion_mask.size() > 0) {
- return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file);
+ return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file);
} else {
- return LoadRecordBatchSubset(metadata, schema, /*inclusion_mask=*/nullptr, context, file);
+ return LoadRecordBatchSubset(metadata, schema, /*inclusion_mask=*/nullptr, context, file);
}
}
@@ -569,9 +569,9 @@ Status GetCompressionExperimental(const flatbuf::Message* message,
RETURN_NOT_OK(internal::GetKeyValueMetadata(message->custom_metadata(), &metadata));
int index = metadata->FindKey("ARROW:experimental_compression");
if (index != -1) {
- // Arrow 0.17 stored string in upper case, internal utils now require lower case
- auto name = arrow::internal::AsciiToLower(metadata->value(index));
- ARROW_ASSIGN_OR_RAISE(*out, util::Codec::GetCompressionType(name));
+ // Arrow 0.17 stored string in upper case, internal utils now require lower case
+ auto name = arrow::internal::AsciiToLower(metadata->value(index));
+ ARROW_ASSIGN_OR_RAISE(*out, util::Codec::GetCompressionType(name));
}
return internal::CheckCompressionSupported(*out);
}
@@ -610,8 +610,8 @@ Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
Result<std::shared_ptr<RecordBatch>> ReadRecordBatchInternal(
const Buffer& metadata, const std::shared_ptr<Schema>& schema,
- const std::vector<bool>& inclusion_mask, IpcReadContext& context,
- io::RandomAccessFile* file) {
+ const std::vector<bool>& inclusion_mask, IpcReadContext& context,
+ io::RandomAccessFile* file) {
const flatbuf::Message* message = nullptr;
RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
auto batch = message->header_as_RecordBatch();
@@ -622,15 +622,15 @@ Result<std::shared_ptr<RecordBatch>> ReadRecordBatchInternal(
Compression::type compression;
RETURN_NOT_OK(GetCompression(batch, &compression));
- if (context.compression == Compression::UNCOMPRESSED &&
+ if (context.compression == Compression::UNCOMPRESSED &&
message->version() == flatbuf::MetadataVersion::V4) {
// Possibly obtain codec information from experimental serialization format
// in 0.17.x
RETURN_NOT_OK(GetCompressionExperimental(message, &compression));
}
- context.compression = compression;
- context.metadata_version = internal::GetMetadataVersion(message->version());
- return LoadRecordBatch(batch, schema, inclusion_mask, context, file);
+ context.compression = compression;
+ context.metadata_version = internal::GetMetadataVersion(message->version());
+ return LoadRecordBatch(batch, schema, inclusion_mask, context, file);
}
// If we are selecting only certain fields, populate an inclusion mask for fast lookups.
@@ -663,8 +663,8 @@ Status GetInclusionMaskAndOutSchema(const std::shared_ptr<Schema>& full_schema,
included_fields.push_back(full_schema->field(i));
}
- *out_schema = schema(std::move(included_fields), full_schema->endianness(),
- full_schema->metadata());
+ *out_schema = schema(std::move(included_fields), full_schema->endianness(),
+ full_schema->metadata());
return Status::OK();
}
@@ -672,32 +672,32 @@ Status UnpackSchemaMessage(const void* opaque_schema, const IpcReadOptions& opti
DictionaryMemo* dictionary_memo,
std::shared_ptr<Schema>* schema,
std::shared_ptr<Schema>* out_schema,
- std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
+ std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
RETURN_NOT_OK(internal::GetSchema(opaque_schema, dictionary_memo, schema));
// If we are selecting only certain fields, populate the inclusion mask now
// for fast lookups
- RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields,
- field_inclusion_mask, out_schema));
- *swap_endian = options.ensure_native_endian && !out_schema->get()->is_native_endian();
- if (*swap_endian) {
- // create a new schema with native endianness before swapping endian in ArrayData
- *schema = schema->get()->WithEndianness(Endianness::Native);
- *out_schema = out_schema->get()->WithEndianness(Endianness::Native);
- }
- return Status::OK();
+ RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields,
+ field_inclusion_mask, out_schema));
+ *swap_endian = options.ensure_native_endian && !out_schema->get()->is_native_endian();
+ if (*swap_endian) {
+ // create a new schema with native endianness before swapping endian in ArrayData
+ *schema = schema->get()->WithEndianness(Endianness::Native);
+ *out_schema = out_schema->get()->WithEndianness(Endianness::Native);
+ }
+ return Status::OK();
}
Status UnpackSchemaMessage(const Message& message, const IpcReadOptions& options,
DictionaryMemo* dictionary_memo,
std::shared_ptr<Schema>* schema,
std::shared_ptr<Schema>* out_schema,
- std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
+ std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message.type());
CHECK_HAS_NO_BODY(message);
return UnpackSchemaMessage(message.header(), options, dictionary_memo, schema,
- out_schema, field_inclusion_mask, swap_endian);
+ out_schema, field_inclusion_mask, swap_endian);
}
Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
@@ -707,14 +707,14 @@ Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
std::shared_ptr<Schema> out_schema;
// Empty means do not use
std::vector<bool> inclusion_mask;
- IpcReadContext context(const_cast<DictionaryMemo*>(dictionary_memo), options, false);
- RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields,
+ IpcReadContext context(const_cast<DictionaryMemo*>(dictionary_memo), options, false);
+ RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields,
&inclusion_mask, &out_schema));
- return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file);
+ return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file);
}
-Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context,
- DictionaryKind* kind, io::RandomAccessFile* file) {
+Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context,
+ DictionaryKind* kind, io::RandomAccessFile* file) {
const flatbuf::Message* message = nullptr;
RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
const auto dictionary_batch = message->header_as_DictionaryBatch();
@@ -741,46 +741,46 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context,
// Look up the dictionary value type, which must have been added to the
// DictionaryMemo already prior to invoking this function
- ARROW_ASSIGN_OR_RAISE(auto value_type, context.dictionary_memo->GetDictionaryType(id));
+ ARROW_ASSIGN_OR_RAISE(auto value_type, context.dictionary_memo->GetDictionaryType(id));
// Load the dictionary data from the dictionary batch
ArrayLoader loader(batch_meta, internal::GetMetadataVersion(message->version()),
- context.options, file);
- auto dict_data = std::make_shared<ArrayData>();
+ context.options, file);
+ auto dict_data = std::make_shared<ArrayData>();
const Field dummy_field("", value_type);
RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get()));
if (compression != Compression::UNCOMPRESSED) {
ArrayDataVector dict_fields{dict_data};
- RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields));
- }
-
- // swap endian in dict_data if necessary (swap_endian == true)
- if (context.swap_endian) {
- ARROW_ASSIGN_OR_RAISE(dict_data, ::arrow::internal::SwapEndianArrayData(dict_data));
+ RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields));
}
+ // swap endian in dict_data if necessary (swap_endian == true)
+ if (context.swap_endian) {
+ ARROW_ASSIGN_OR_RAISE(dict_data, ::arrow::internal::SwapEndianArrayData(dict_data));
+ }
+
if (dictionary_batch->isDelta()) {
if (kind != nullptr) {
*kind = DictionaryKind::Delta;
}
- return context.dictionary_memo->AddDictionaryDelta(id, dict_data);
+ return context.dictionary_memo->AddDictionaryDelta(id, dict_data);
}
ARROW_ASSIGN_OR_RAISE(bool inserted,
- context.dictionary_memo->AddOrReplaceDictionary(id, dict_data));
+ context.dictionary_memo->AddOrReplaceDictionary(id, dict_data));
if (kind != nullptr) {
*kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement;
}
return Status::OK();
}
-Status ReadDictionary(const Message& message, const IpcReadContext& context,
- DictionaryKind* kind) {
+Status ReadDictionary(const Message& message, const IpcReadContext& context,
+ DictionaryKind* kind) {
// Only invoke this method if we already know we have a dictionary message
DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH);
CHECK_HAS_BODY(message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body()));
- return ReadDictionary(*message.metadata(), context, kind, reader.get());
+ return ReadDictionary(*message.metadata(), context, kind, reader.get());
}
// ----------------------------------------------------------------------
@@ -799,10 +799,10 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
return Status::Invalid("Tried reading schema message, was null or length 0");
}
- RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_,
- &out_schema_, &field_inclusion_mask_,
- &swap_endian_));
- return Status::OK();
+ RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_,
+ &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
+ return Status::OK();
}
Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
@@ -834,9 +834,9 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
CHECK_HAS_BODY(*message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_,
- context, reader.get())
+ context, reader.get())
.Value(batch);
}
@@ -866,8 +866,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
// Read dictionary from dictionary batch
Status ReadDictionary(const Message& message) {
DictionaryKind kind;
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
- RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
switch (kind) {
case DictionaryKind::New:
break;
@@ -888,7 +888,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
// TODO(wesm): In future, we may want to reconcile the ids in the stream with
// those found in the schema
- const auto num_dicts = dictionary_memo_.fields().num_dicts();
+ const auto num_dicts = dictionary_memo_.fields().num_dicts();
for (int i = 0; i < num_dicts; ++i) {
ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());
if (!message) {
@@ -933,8 +933,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
DictionaryMemo dictionary_memo_;
std::shared_ptr<Schema> schema_, out_schema_;
-
- bool swap_endian_;
+
+ bool swap_endian_;
};
// ----------------------------------------------------------------------
@@ -961,94 +961,94 @@ Result<std::shared_ptr<RecordBatchStreamReader>> RecordBatchStreamReader::Open(
// ----------------------------------------------------------------------
// Reader implementation
-// Common functions used in both the random-access file reader and the
-// asynchronous generator
+// Common functions used in both the random-access file reader and the
+// asynchronous generator
static inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) {
return FileBlock{block->offset(), block->metaDataLength(), block->bodyLength()};
}
-static Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block,
- io::RandomAccessFile* file) {
- if (!BitUtil::IsMultipleOf8(block.offset) ||
- !BitUtil::IsMultipleOf8(block.metadata_length) ||
- !BitUtil::IsMultipleOf8(block.body_length)) {
- return Status::Invalid("Unaligned block in IPC file");
- }
-
- // TODO(wesm): this breaks integration tests, see ARROW-3256
- // DCHECK_EQ((*out)->body_length(), block.body_length);
-
- ARROW_ASSIGN_OR_RAISE(auto message,
- ReadMessage(block.offset, block.metadata_length, file));
- return std::move(message);
-}
-
-static Future<std::shared_ptr<Message>> ReadMessageFromBlockAsync(
- const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) {
- if (!BitUtil::IsMultipleOf8(block.offset) ||
- !BitUtil::IsMultipleOf8(block.metadata_length) ||
- !BitUtil::IsMultipleOf8(block.body_length)) {
- return Status::Invalid("Unaligned block in IPC file");
- }
-
- // TODO(wesm): this breaks integration tests, see ARROW-3256
- // DCHECK_EQ((*out)->body_length(), block.body_length);
-
- return ReadMessageAsync(block.offset, block.metadata_length, block.body_length, file,
- io_context);
-}
-
-static Status ReadOneDictionary(Message* message, const IpcReadContext& context) {
- CHECK_HAS_BODY(*message);
- ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- DictionaryKind kind;
- RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, &kind, reader.get()));
- if (kind != DictionaryKind::New) {
- return Status::Invalid(
- "Unsupported dictionary replacement or "
- "dictionary delta in IPC file");
- }
- return Status::OK();
-}
-
-class RecordBatchFileReaderImpl;
-
-/// A generator of record batches.
-///
-/// All batches are yielded in order.
-class ARROW_EXPORT IpcFileRecordBatchGenerator {
- public:
- using Item = std::shared_ptr<RecordBatch>;
-
- explicit IpcFileRecordBatchGenerator(
- std::shared_ptr<RecordBatchFileReaderImpl> state,
- std::shared_ptr<io::internal::ReadRangeCache> cached_source,
- const io::IOContext& io_context, arrow::internal::Executor* executor)
- : state_(std::move(state)),
- cached_source_(std::move(cached_source)),
- io_context_(io_context),
- executor_(executor),
- index_(0) {}
-
- Future<Item> operator()();
- Future<std::shared_ptr<Message>> ReadBlock(const FileBlock& block);
-
- static Status ReadDictionaries(
- RecordBatchFileReaderImpl* state,
- std::vector<std::shared_ptr<Message>> dictionary_messages);
- static Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
- RecordBatchFileReaderImpl* state, Message* message);
-
- private:
- std::shared_ptr<RecordBatchFileReaderImpl> state_;
- std::shared_ptr<io::internal::ReadRangeCache> cached_source_;
- io::IOContext io_context_;
- arrow::internal::Executor* executor_;
- int index_;
- // Odd Future type, but this lets us use All() easily
- Future<> read_dictionaries_;
-};
-
+static Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block,
+ io::RandomAccessFile* file) {
+ if (!BitUtil::IsMultipleOf8(block.offset) ||
+ !BitUtil::IsMultipleOf8(block.metadata_length) ||
+ !BitUtil::IsMultipleOf8(block.body_length)) {
+ return Status::Invalid("Unaligned block in IPC file");
+ }
+
+ // TODO(wesm): this breaks integration tests, see ARROW-3256
+ // DCHECK_EQ((*out)->body_length(), block.body_length);
+
+ ARROW_ASSIGN_OR_RAISE(auto message,
+ ReadMessage(block.offset, block.metadata_length, file));
+ return std::move(message);
+}
+
+static Future<std::shared_ptr<Message>> ReadMessageFromBlockAsync(
+ const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) {
+ if (!BitUtil::IsMultipleOf8(block.offset) ||
+ !BitUtil::IsMultipleOf8(block.metadata_length) ||
+ !BitUtil::IsMultipleOf8(block.body_length)) {
+ return Status::Invalid("Unaligned block in IPC file");
+ }
+
+ // TODO(wesm): this breaks integration tests, see ARROW-3256
+ // DCHECK_EQ((*out)->body_length(), block.body_length);
+
+ return ReadMessageAsync(block.offset, block.metadata_length, block.body_length, file,
+ io_context);
+}
+
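// The IsMultipleOf8 checks in both block readers above are the branch-free
// form of (v % 8 == 0), i.e. (v & 7) == 0; every offset and length recorded
// in the footer must satisfy them because IPC buffers are written with
// 8-byte alignment.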
+static Status ReadOneDictionary(Message* message, const IpcReadContext& context) {
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ DictionaryKind kind;
+ RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, &kind, reader.get()));
+ if (kind != DictionaryKind::New) {
+ return Status::Invalid(
+ "Unsupported dictionary replacement or "
+ "dictionary delta in IPC file");
+ }
+ return Status::OK();
+}
+
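// Invariant enforced by ReadOneDictionary above: every dictionary batch in
// an IPC *file* must be a self-contained DictionaryKind::New entry; deltas
// and replacements are only meaningful for the stream format, where
// messages are consumed strictly in order.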
+class RecordBatchFileReaderImpl;
+
+/// A generator of record batches.
+///
+/// All batches are yielded in order.
+class ARROW_EXPORT IpcFileRecordBatchGenerator {
+ public:
+ using Item = std::shared_ptr<RecordBatch>;
+
+ explicit IpcFileRecordBatchGenerator(
+ std::shared_ptr<RecordBatchFileReaderImpl> state,
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source,
+ const io::IOContext& io_context, arrow::internal::Executor* executor)
+ : state_(std::move(state)),
+ cached_source_(std::move(cached_source)),
+ io_context_(io_context),
+ executor_(executor),
+ index_(0) {}
+
+ Future<Item> operator()();
+ Future<std::shared_ptr<Message>> ReadBlock(const FileBlock& block);
+
+ static Status ReadDictionaries(
+ RecordBatchFileReaderImpl* state,
+ std::vector<std::shared_ptr<Message>> dictionary_messages);
+ static Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ RecordBatchFileReaderImpl* state, Message* message);
+
+ private:
+ std::shared_ptr<RecordBatchFileReaderImpl> state_;
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source_;
+ io::IOContext io_context_;
+ arrow::internal::Executor* executor_;
+ int index_;
+ // Odd Future type, but this lets us use All() easily
+ Future<> read_dictionaries_;
+};
+
class RecordBatchFileReaderImpl : public RecordBatchFileReader {
public:
RecordBatchFileReaderImpl() : file_(NULLPTR), footer_offset_(0), footer_(NULLPTR) {}
@@ -1074,33 +1074,33 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
CHECK_HAS_BODY(*message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
- ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal(
- *message->metadata(), schema_,
- field_inclusion_mask_, context, reader.get()));
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal(
+ *message->metadata(), schema_,
+ field_inclusion_mask_, context, reader.get()));
++stats_.num_record_batches;
return batch;
}
- Result<int64_t> CountRows() override {
- int64_t total = 0;
- for (int i = 0; i < num_record_batches(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto outer_message,
- ReadMessageFromBlock(GetRecordBatchBlock(i)));
- auto metadata = outer_message->metadata();
- const flatbuf::Message* message = nullptr;
- RETURN_NOT_OK(
- internal::VerifyMessage(metadata->data(), metadata->size(), &message));
- auto batch = message->header_as_RecordBatch();
- if (batch == nullptr) {
- return Status::IOError(
- "Header-type of flatbuffer-encoded Message is not RecordBatch.");
- }
- total += batch->length();
- }
- return total;
- }
-
+ Result<int64_t> CountRows() override {
+ int64_t total = 0;
+ for (int i = 0; i < num_record_batches(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto outer_message,
+ ReadMessageFromBlock(GetRecordBatchBlock(i)));
+ auto metadata = outer_message->metadata();
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(
+ internal::VerifyMessage(metadata->data(), metadata->size(), &message));
+ auto batch = message->header_as_RecordBatch();
+ if (batch == nullptr) {
+ return Status::IOError(
+ "Header-type of flatbuffer-encoded Message is not RecordBatch.");
+ }
+ total += batch->length();
+ }
+ return total;
+ }
+
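  // CountRows() above only touches each batch's verified flatbuffer header
  // and sums its length field, so no record-batch body is read or decoded.
  // Caller-side sketch (the path handling is an illustrative assumption,
  // not part of this patch):
  //
  //   ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(path));
  //   ARROW_ASSIGN_OR_RAISE(auto reader,
  //                         arrow::ipc::RecordBatchFileReader::Open(file));
  //   ARROW_ASSIGN_OR_RAISE(int64_t rows, reader->CountRows());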
Status Open(const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
const IpcReadOptions& options) {
owned_file_ = file;
@@ -1116,75 +1116,75 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
// Get the schema and record any observed dictionaries
RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_,
- &schema_, &out_schema_, &field_inclusion_mask_,
- &swap_endian_));
+ &schema_, &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
++stats_.num_messages;
return Status::OK();
}
- Future<> OpenAsync(const std::shared_ptr<io::RandomAccessFile>& file,
- int64_t footer_offset, const IpcReadOptions& options) {
- owned_file_ = file;
- return OpenAsync(file.get(), footer_offset, options);
- }
-
- Future<> OpenAsync(io::RandomAccessFile* file, int64_t footer_offset,
- const IpcReadOptions& options) {
- file_ = file;
- options_ = options;
- footer_offset_ = footer_offset;
- auto cpu_executor = ::arrow::internal::GetCpuThreadPool();
- auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
- return ReadFooterAsync(cpu_executor).Then([self, options]() -> Status {
- // Get the schema and record any observed dictionaries
- RETURN_NOT_OK(UnpackSchemaMessage(
- self->footer_->schema(), options, &self->dictionary_memo_, &self->schema_,
- &self->out_schema_, &self->field_inclusion_mask_, &self->swap_endian_));
- ++self->stats_.num_messages;
- return Status::OK();
- });
- }
-
+ Future<> OpenAsync(const std::shared_ptr<io::RandomAccessFile>& file,
+ int64_t footer_offset, const IpcReadOptions& options) {
+ owned_file_ = file;
+ return OpenAsync(file.get(), footer_offset, options);
+ }
+
+ Future<> OpenAsync(io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ file_ = file;
+ options_ = options;
+ footer_offset_ = footer_offset;
+ auto cpu_executor = ::arrow::internal::GetCpuThreadPool();
+ auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ return ReadFooterAsync(cpu_executor).Then([self, options]() -> Status {
+ // Get the schema and record any observed dictionaries
+ RETURN_NOT_OK(UnpackSchemaMessage(
+ self->footer_->schema(), options, &self->dictionary_memo_, &self->schema_,
+ &self->out_schema_, &self->field_inclusion_mask_, &self->swap_endian_));
+ ++self->stats_.num_messages;
+ return Status::OK();
+ });
+ }
+
std::shared_ptr<Schema> schema() const override { return out_schema_; }
std::shared_ptr<const KeyValueMetadata> metadata() const override { return metadata_; }
ReadStats stats() const override { return stats_; }
- Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
- const bool coalesce, const io::IOContext& io_context,
- const io::CacheOptions cache_options,
- arrow::internal::Executor* executor) override {
- auto state = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
- std::shared_ptr<io::internal::ReadRangeCache> cached_source;
- if (coalesce) {
- if (!owned_file_) return Status::Invalid("Cannot coalesce without an owned file");
- cached_source = std::make_shared<io::internal::ReadRangeCache>(
- owned_file_, io_context, cache_options);
- auto num_dictionaries = this->num_dictionaries();
- auto num_record_batches = this->num_record_batches();
- std::vector<io::ReadRange> ranges(num_dictionaries + num_record_batches);
- for (int i = 0; i < num_dictionaries; i++) {
- auto block = FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i));
- ranges[i].offset = block.offset;
- ranges[i].length = block.metadata_length + block.body_length;
- }
- for (int i = 0; i < num_record_batches; i++) {
- auto block = FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
- ranges[num_dictionaries + i].offset = block.offset;
- ranges[num_dictionaries + i].length = block.metadata_length + block.body_length;
- }
- RETURN_NOT_OK(cached_source->Cache(std::move(ranges)));
- }
- return IpcFileRecordBatchGenerator(std::move(state), std::move(cached_source),
- io_context, executor);
- }
-
+ Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
+ const bool coalesce, const io::IOContext& io_context,
+ const io::CacheOptions cache_options,
+ arrow::internal::Executor* executor) override {
+ auto state = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source;
+ if (coalesce) {
+ if (!owned_file_) return Status::Invalid("Cannot coalesce without an owned file");
+ cached_source = std::make_shared<io::internal::ReadRangeCache>(
+ owned_file_, io_context, cache_options);
+ auto num_dictionaries = this->num_dictionaries();
+ auto num_record_batches = this->num_record_batches();
+ std::vector<io::ReadRange> ranges(num_dictionaries + num_record_batches);
+ for (int i = 0; i < num_dictionaries; i++) {
+ auto block = FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i));
+ ranges[i].offset = block.offset;
+ ranges[i].length = block.metadata_length + block.body_length;
+ }
+ for (int i = 0; i < num_record_batches; i++) {
+ auto block = FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
+ ranges[num_dictionaries + i].offset = block.offset;
+ ranges[num_dictionaries + i].length = block.metadata_length + block.body_length;
+ }
+ RETURN_NOT_OK(cached_source->Cache(std::move(ranges)));
+ }
+ return IpcFileRecordBatchGenerator(std::move(state), std::move(cached_source),
+ io_context, executor);
+ }
+
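  // Usage sketch for GetRecordBatchGenerator() above (synchronous draining;
  // variable names are illustrative): batches come back in file order and
  // the sequence ends with a null batch, per IterationTraits<Item>::End().
  //
  //   ARROW_ASSIGN_OR_RAISE(auto gen,
  //                         reader->GetRecordBatchGenerator(/*coalesce=*/true));
  //   while (true) {
  //     ARROW_ASSIGN_OR_RAISE(auto batch, gen().result());
  //     if (!batch) break;
  //     // ... consume batch ...
  //   }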
private:
- friend AsyncGenerator<std::shared_ptr<Message>> MakeMessageGenerator(
- std::shared_ptr<RecordBatchFileReaderImpl>, const io::IOContext&);
- friend class IpcFileRecordBatchGenerator;
-
+ friend AsyncGenerator<std::shared_ptr<Message>> MakeMessageGenerator(
+ std::shared_ptr<RecordBatchFileReaderImpl>, const io::IOContext&);
+ friend class IpcFileRecordBatchGenerator;
+
FileBlock GetRecordBatchBlock(int i) const {
return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
}
@@ -1194,28 +1194,28 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
}
Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block) {
- ARROW_ASSIGN_OR_RAISE(auto message, arrow::ipc::ReadMessageFromBlock(block, file_));
+ ARROW_ASSIGN_OR_RAISE(auto message, arrow::ipc::ReadMessageFromBlock(block, file_));
++stats_.num_messages;
return std::move(message);
}
Status ReadDictionaries() {
// Read all the dictionaries
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
for (int i = 0; i < num_dictionaries(); ++i) {
ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(GetDictionaryBlock(i)));
- RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
+ RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
++stats_.num_dictionary_batches;
}
return Status::OK();
}
Status ReadFooter() {
- auto fut = ReadFooterAsync(/*executor=*/nullptr);
- return fut.status();
- }
-
- Future<> ReadFooterAsync(arrow::internal::Executor* executor) {
+ auto fut = ReadFooterAsync(/*executor=*/nullptr);
+ return fut.status();
+ }
+
+ Future<> ReadFooterAsync(arrow::internal::Executor* executor) {
const int32_t magic_size = static_cast<int>(strlen(kArrowMagicBytes));
if (footer_offset_ <= magic_size * 2 + 4) {
@@ -1223,53 +1223,53 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
}
int file_end_size = static_cast<int>(magic_size + sizeof(int32_t));
- auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
- auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size, file_end_size);
- if (executor) read_magic = executor->Transfer(std::move(read_magic));
- return read_magic
- .Then([=](const std::shared_ptr<Buffer>& buffer)
- -> Future<std::shared_ptr<Buffer>> {
- const int64_t expected_footer_size = magic_size + sizeof(int32_t);
- if (buffer->size() < expected_footer_size) {
- return Status::Invalid("Unable to read ", expected_footer_size,
-                                         " from end of file");
- }
-
- if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) {
- return Status::Invalid("Not an Arrow file");
- }
-
- int32_t footer_length = BitUtil::FromLittleEndian(
- *reinterpret_cast<const int32_t*>(buffer->data()));
-
- if (footer_length <= 0 ||
- footer_length > self->footer_offset_ - magic_size * 2 - 4) {
- return Status::Invalid("File is smaller than indicated metadata size");
- }
-
- // Now read the footer
- auto read_footer = self->file_->ReadAsync(
- self->footer_offset_ - footer_length - file_end_size, footer_length);
- if (executor) read_footer = executor->Transfer(std::move(read_footer));
- return read_footer;
- })
- .Then([=](const std::shared_ptr<Buffer>& buffer) -> Status {
- self->footer_buffer_ = buffer;
- const auto data = self->footer_buffer_->data();
- const auto size = self->footer_buffer_->size();
- if (!internal::VerifyFlatbuffers<flatbuf::Footer>(data, size)) {
- return Status::IOError("Verification of flatbuffer-encoded Footer failed.");
- }
- self->footer_ = flatbuf::GetFooter(data);
-
- auto fb_metadata = self->footer_->custom_metadata();
- if (fb_metadata != nullptr) {
- std::shared_ptr<KeyValueMetadata> md;
- RETURN_NOT_OK(internal::GetKeyValueMetadata(fb_metadata, &md));
- self->metadata_ = std::move(md); // const-ify
- }
- return Status::OK();
- });
+ auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size, file_end_size);
+ if (executor) read_magic = executor->Transfer(std::move(read_magic));
+ return read_magic
+ .Then([=](const std::shared_ptr<Buffer>& buffer)
+ -> Future<std::shared_ptr<Buffer>> {
+ const int64_t expected_footer_size = magic_size + sizeof(int32_t);
+ if (buffer->size() < expected_footer_size) {
+ return Status::Invalid("Unable to read ", expected_footer_size,
+                                         " from end of file");
+ }
+
+ if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) {
+ return Status::Invalid("Not an Arrow file");
+ }
+
+ int32_t footer_length = BitUtil::FromLittleEndian(
+ *reinterpret_cast<const int32_t*>(buffer->data()));
+
+ if (footer_length <= 0 ||
+ footer_length > self->footer_offset_ - magic_size * 2 - 4) {
+ return Status::Invalid("File is smaller than indicated metadata size");
+ }
+
+ // Now read the footer
+ auto read_footer = self->file_->ReadAsync(
+ self->footer_offset_ - footer_length - file_end_size, footer_length);
+ if (executor) read_footer = executor->Transfer(std::move(read_footer));
+ return read_footer;
+ })
+ .Then([=](const std::shared_ptr<Buffer>& buffer) -> Status {
+ self->footer_buffer_ = buffer;
+ const auto data = self->footer_buffer_->data();
+ const auto size = self->footer_buffer_->size();
+ if (!internal::VerifyFlatbuffers<flatbuf::Footer>(data, size)) {
+ return Status::IOError("Verification of flatbuffer-encoded Footer failed.");
+ }
+ self->footer_ = flatbuf::GetFooter(data);
+
+ auto fb_metadata = self->footer_->custom_metadata();
+ if (fb_metadata != nullptr) {
+ std::shared_ptr<KeyValueMetadata> md;
+ RETURN_NOT_OK(internal::GetKeyValueMetadata(fb_metadata, &md));
+ self->metadata_ = std::move(md); // const-ify
+ }
+ return Status::OK();
+ });
}
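  // Tail layout parsed by ReadFooterAsync above, for a file of size S and
  // magic length M = strlen(kArrowMagicBytes):
  //
  //   ... | footer flatbuffer | int32 footer_length | magic bytes |
  //                           ^ offset S - M - 4    ^ S - M      ^ S
  //
  // hence the footer itself starts at S - M - 4 - footer_length, the offset
  // passed to the second ReadAsync() call.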
int num_dictionaries() const {
@@ -1300,8 +1300,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
std::shared_ptr<Schema> out_schema_;
ReadStats stats_;
-
- bool swap_endian_;
+
+ bool swap_endian_;
};
Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
@@ -1331,109 +1331,109 @@ Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
return result;
}
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file, const IpcReadOptions& options) {
- ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
- return OpenAsync(std::move(file), footer_offset, options);
-}
-
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- io::RandomAccessFile* file, const IpcReadOptions& options) {
- ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
- return OpenAsync(file, footer_offset, options);
-}
-
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
- const IpcReadOptions& options) {
- auto result = std::make_shared<RecordBatchFileReaderImpl>();
- return result->OpenAsync(file, footer_offset, options)
- .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
-}
-
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) {
- auto result = std::make_shared<RecordBatchFileReaderImpl>();
- return result->OpenAsync(file, footer_offset, options)
- .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
-}
-
-Future<IpcFileRecordBatchGenerator::Item> IpcFileRecordBatchGenerator::operator()() {
- auto state = state_;
- if (!read_dictionaries_.is_valid()) {
- std::vector<Future<std::shared_ptr<Message>>> messages(state->num_dictionaries());
- for (int i = 0; i < state->num_dictionaries(); i++) {
- auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i));
- messages[i] = ReadBlock(block);
- }
- auto read_messages = All(std::move(messages));
- if (executor_) read_messages = executor_->Transfer(read_messages);
- read_dictionaries_ = read_messages.Then(
- [=](const std::vector<Result<std::shared_ptr<Message>>>& maybe_messages)
- -> Status {
- ARROW_ASSIGN_OR_RAISE(auto messages,
- arrow::internal::UnwrapOrRaise(maybe_messages));
- return ReadDictionaries(state.get(), std::move(messages));
- });
- }
- if (index_ >= state_->num_record_batches()) {
- return Future<Item>::MakeFinished(IterationTraits<Item>::End());
- }
- auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++));
- auto read_message = ReadBlock(block);
- auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; });
- // Force transfer. This may be wasteful in some cases, but ensures we get off the
- // I/O threads as soon as possible, and ensures we don't decode record batches
- // synchronously in the case that the message read has already finished.
- if (executor_) {
- auto executor = executor_;
- return read_messages.Then(
- [=](const std::shared_ptr<Message>& message) -> Future<Item> {
- return DeferNotOk(executor->Submit(
- [=]() { return ReadRecordBatch(state.get(), message.get()); }));
- });
- }
- return read_messages.Then([=](const std::shared_ptr<Message>& message) -> Result<Item> {
- return ReadRecordBatch(state.get(), message.get());
- });
-}
-
-Future<std::shared_ptr<Message>> IpcFileRecordBatchGenerator::ReadBlock(
- const FileBlock& block) {
- if (cached_source_) {
- auto cached_source = cached_source_;
- io::ReadRange range{block.offset, block.metadata_length + block.body_length};
- auto pool = state_->options_.memory_pool;
- return cached_source->WaitFor({range}).Then(
- [cached_source, pool, range]() -> Result<std::shared_ptr<Message>> {
- ARROW_ASSIGN_OR_RAISE(auto buffer, cached_source->Read(range));
- io::BufferReader stream(std::move(buffer));
- return ReadMessage(&stream, pool);
- });
- } else {
- return ReadMessageFromBlockAsync(block, state_->file_, io_context_);
- }
-}
-
-Status IpcFileRecordBatchGenerator::ReadDictionaries(
- RecordBatchFileReaderImpl* state,
- std::vector<std::shared_ptr<Message>> dictionary_messages) {
- IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
- for (const auto& message : dictionary_messages) {
- RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
- }
- return Status::OK();
-}
-
-Result<std::shared_ptr<RecordBatch>> IpcFileRecordBatchGenerator::ReadRecordBatch(
- RecordBatchFileReaderImpl* state, Message* message) {
- CHECK_HAS_BODY(*message);
- ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
- return ReadRecordBatchInternal(*message->metadata(), state->schema_,
- state->field_inclusion_mask_, context, reader.get());
-}
-
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return OpenAsync(std::move(file), footer_offset, options);
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ io::RandomAccessFile* file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return OpenAsync(file, footer_offset, options);
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ return result->OpenAsync(file, footer_offset, options)
+ .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ return result->OpenAsync(file, footer_offset, options)
+ .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
+}
+
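// Sketch of a non-blocking open through the overloads above; the callback
// shape assumes the arrow::Future API and is not part of this patch.
//
//   RecordBatchFileReader::OpenAsync(file).AddCallback(
//       [](const Result<std::shared_ptr<RecordBatchFileReader>>& maybe_reader) {
//         // Runs once the footer is read and the schema unpacked.
//       });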
+Future<IpcFileRecordBatchGenerator::Item> IpcFileRecordBatchGenerator::operator()() {
+ auto state = state_;
+ if (!read_dictionaries_.is_valid()) {
+ std::vector<Future<std::shared_ptr<Message>>> messages(state->num_dictionaries());
+ for (int i = 0; i < state->num_dictionaries(); i++) {
+ auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i));
+ messages[i] = ReadBlock(block);
+ }
+ auto read_messages = All(std::move(messages));
+ if (executor_) read_messages = executor_->Transfer(read_messages);
+ read_dictionaries_ = read_messages.Then(
+ [=](const std::vector<Result<std::shared_ptr<Message>>>& maybe_messages)
+ -> Status {
+ ARROW_ASSIGN_OR_RAISE(auto messages,
+ arrow::internal::UnwrapOrRaise(maybe_messages));
+ return ReadDictionaries(state.get(), std::move(messages));
+ });
+ }
+ if (index_ >= state_->num_record_batches()) {
+ return Future<Item>::MakeFinished(IterationTraits<Item>::End());
+ }
+ auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++));
+ auto read_message = ReadBlock(block);
+ auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; });
+ // Force transfer. This may be wasteful in some cases, but ensures we get off the
+ // I/O threads as soon as possible, and ensures we don't decode record batches
+ // synchronously in the case that the message read has already finished.
+ if (executor_) {
+ auto executor = executor_;
+ return read_messages.Then(
+ [=](const std::shared_ptr<Message>& message) -> Future<Item> {
+ return DeferNotOk(executor->Submit(
+ [=]() { return ReadRecordBatch(state.get(), message.get()); }));
+ });
+ }
+ return read_messages.Then([=](const std::shared_ptr<Message>& message) -> Result<Item> {
+ return ReadRecordBatch(state.get(), message.get());
+ });
+}
+
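// Scheduling shape of operator() above: the first call fans out one
// ReadBlock() per dictionary and folds the results into the single
// read_dictionaries_ Future<>; every record-batch future is then chained
// behind that future, so batch N decodes as soon as its own read and the
// one-time dictionary pass complete, independently of other batches.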
+Future<std::shared_ptr<Message>> IpcFileRecordBatchGenerator::ReadBlock(
+ const FileBlock& block) {
+ if (cached_source_) {
+ auto cached_source = cached_source_;
+ io::ReadRange range{block.offset, block.metadata_length + block.body_length};
+ auto pool = state_->options_.memory_pool;
+ return cached_source->WaitFor({range}).Then(
+ [cached_source, pool, range]() -> Result<std::shared_ptr<Message>> {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, cached_source->Read(range));
+ io::BufferReader stream(std::move(buffer));
+ return ReadMessage(&stream, pool);
+ });
+ } else {
+ return ReadMessageFromBlockAsync(block, state_->file_, io_context_);
+ }
+}
+
+Status IpcFileRecordBatchGenerator::ReadDictionaries(
+ RecordBatchFileReaderImpl* state,
+ std::vector<std::shared_ptr<Message>> dictionary_messages) {
+ IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
+ for (const auto& message : dictionary_messages) {
+ RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
+ }
+ return Status::OK();
+}
+
+Result<std::shared_ptr<RecordBatch>> IpcFileRecordBatchGenerator::ReadRecordBatch(
+ RecordBatchFileReaderImpl* state, Message* message) {
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
+ return ReadRecordBatchInternal(*message->metadata(), state->schema_,
+ state->field_inclusion_mask_, context, reader.get());
+}
+
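// Both static helpers above rebuild an IpcReadContext per call; the context
// is only a cheap view over the reader's shared dictionary_memo_, options_
// and swap_endian_ state, so dictionaries loaded once by ReadDictionaries
// are visible to every later ReadRecordBatch.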
Status Listener::OnEOS() { return Status::OK(); }
Status Listener::OnSchemaDecoded(std::shared_ptr<Schema> schema) { return Status::OK(); }
@@ -1452,16 +1452,16 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener {
};
public:
- explicit StreamDecoderImpl(std::shared_ptr<Listener> listener, IpcReadOptions options)
- : listener_(std::move(listener)),
- options_(std::move(options)),
+ explicit StreamDecoderImpl(std::shared_ptr<Listener> listener, IpcReadOptions options)
+ : listener_(std::move(listener)),
+ options_(std::move(options)),
state_(State::SCHEMA),
message_decoder_(std::shared_ptr<StreamDecoderImpl>(this, [](void*) {}),
options_.memory_pool),
- n_required_dictionaries_(0) {}
+ n_required_dictionaries_(0) {}
Status OnMessageDecoded(std::unique_ptr<Message> message) override {
- ++stats_.num_messages;
+ ++stats_.num_messages;
switch (state_) {
case State::SCHEMA:
ARROW_RETURN_NOT_OK(OnSchemaMessageDecoded(std::move(message)));
@@ -1495,13 +1495,13 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener {
int64_t next_required_size() const { return message_decoder_.next_required_size(); }
- ReadStats stats() const { return stats_; }
-
+ ReadStats stats() const { return stats_; }
+
private:
Status OnSchemaMessageDecoded(std::unique_ptr<Message> message) {
RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_,
- &out_schema_, &field_inclusion_mask_,
- &swap_endian_));
+ &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
n_required_dictionaries_ = dictionary_memo_.fields().num_fields();
if (n_required_dictionaries_ == 0) {
@@ -1529,54 +1529,54 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener {
}
Status OnRecordBatchMessageDecoded(std::unique_ptr<Message> message) {
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
if (message->type() == MessageType::DICTIONARY_BATCH) {
return ReadDictionary(*message);
} else {
CHECK_HAS_BODY(*message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
ARROW_ASSIGN_OR_RAISE(
auto batch,
ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_,
- context, reader.get()));
- ++stats_.num_record_batches;
+ context, reader.get()));
+ ++stats_.num_record_batches;
return listener_->OnRecordBatchDecoded(std::move(batch));
}
}
// Read dictionary from dictionary batch
Status ReadDictionary(const Message& message) {
- DictionaryKind kind;
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
- RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
- ++stats_.num_dictionary_batches;
- switch (kind) {
- case DictionaryKind::New:
- break;
- case DictionaryKind::Delta:
- ++stats_.num_dictionary_deltas;
- break;
- case DictionaryKind::Replacement:
- ++stats_.num_replaced_dictionaries;
- break;
- }
- return Status::OK();
+ DictionaryKind kind;
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
+ ++stats_.num_dictionary_batches;
+ switch (kind) {
+ case DictionaryKind::New:
+ break;
+ case DictionaryKind::Delta:
+ ++stats_.num_dictionary_deltas;
+ break;
+ case DictionaryKind::Replacement:
+ ++stats_.num_replaced_dictionaries;
+ break;
+ }
+ return Status::OK();
}
std::shared_ptr<Listener> listener_;
- const IpcReadOptions options_;
+ const IpcReadOptions options_;
State state_;
MessageDecoder message_decoder_;
std::vector<bool> field_inclusion_mask_;
int n_required_dictionaries_;
DictionaryMemo dictionary_memo_;
std::shared_ptr<Schema> schema_, out_schema_;
- ReadStats stats_;
- bool swap_endian_;
+ ReadStats stats_;
+ bool swap_endian_;
};
-StreamDecoder::StreamDecoder(std::shared_ptr<Listener> listener, IpcReadOptions options) {
+StreamDecoder::StreamDecoder(std::shared_ptr<Listener> listener, IpcReadOptions options) {
impl_.reset(new StreamDecoderImpl(std::move(listener), options));
}
@@ -1593,8 +1593,8 @@ std::shared_ptr<Schema> StreamDecoder::schema() const { return impl_->schema();
int64_t StreamDecoder::next_required_size() const { return impl_->next_required_size(); }
-ReadStats StreamDecoder::stats() const { return impl_->stats(); }
-
+ReadStats StreamDecoder::stats() const { return impl_->stats(); }
+
Result<std::shared_ptr<Schema>> ReadSchema(io::InputStream* stream,
DictionaryMemo* dictionary_memo) {
std::unique_ptr<MessageReader> reader = MessageReader::Open(stream);
@@ -2059,23 +2059,23 @@ Status FuzzIpcFile(const uint8_t* data, int64_t size) {
return Status::OK();
}
-Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) {
- auto buffer = std::make_shared<Buffer>(data, size);
- io::BufferReader buffer_reader(buffer);
-
- std::shared_ptr<Tensor> tensor;
-
- while (true) {
- ARROW_ASSIGN_OR_RAISE(tensor, ReadTensor(&buffer_reader));
- if (tensor == nullptr) {
- break;
- }
- RETURN_NOT_OK(tensor->Validate());
- }
-
- return Status::OK();
-}
-
+Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<Buffer>(data, size);
+ io::BufferReader buffer_reader(buffer);
+
+ std::shared_ptr<Tensor> tensor;
+
+ while (true) {
+ ARROW_ASSIGN_OR_RAISE(tensor, ReadTensor(&buffer_reader));
+ if (tensor == nullptr) {
+ break;
+ }
+ RETURN_NOT_OK(tensor->Validate());
+ }
+
+ return Status::OK();
+}
+
} // namespace internal
} // namespace ipc
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
index 6f2157557f3..60db2837a68 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
@@ -25,14 +25,14 @@
#include <utility>
#include <vector>
-#include "arrow/io/caching.h"
-#include "arrow/io/type_fwd.h"
+#include "arrow/io/caching.h"
+#include "arrow/io/type_fwd.h"
#include "arrow/ipc/message.h"
#include "arrow/ipc/options.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/async_generator.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/async_generator.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -101,8 +101,8 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader {
};
/// \brief Reads the record batch file format
-class ARROW_EXPORT RecordBatchFileReader
- : public std::enable_shared_from_this<RecordBatchFileReader> {
+class ARROW_EXPORT RecordBatchFileReader
+ : public std::enable_shared_from_this<RecordBatchFileReader> {
public:
virtual ~RecordBatchFileReader() = default;
@@ -150,26 +150,26 @@ class ARROW_EXPORT RecordBatchFileReader
const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
const IpcReadOptions& options = IpcReadOptions::Defaults());
- /// \brief Open a file asynchronously (owns the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
- /// \brief Open a file asynchronously (borrows the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- io::RandomAccessFile* file,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
- /// \brief Open a file asynchronously (owns the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
- /// \brief Open a file asynchronously (borrows the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- io::RandomAccessFile* file, int64_t footer_offset,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
+ /// \brief Open a file asynchronously (owns the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (borrows the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ io::RandomAccessFile* file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (owns the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (borrows the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
/// \brief The schema read from the file
virtual std::shared_ptr<Schema> schema() const = 0;
@@ -192,24 +192,24 @@ class ARROW_EXPORT RecordBatchFileReader
/// \brief Return current read statistics
virtual ReadStats stats() const = 0;
-
- /// \brief Computes the total number of rows in the file.
- virtual Result<int64_t> CountRows() = 0;
-
- /// \brief Get a reentrant generator of record batches.
- ///
- /// \param[in] coalesce If true, enable I/O coalescing.
- /// \param[in] io_context The IOContext to use (controls which thread pool
- /// is used for I/O).
- /// \param[in] cache_options Options for coalescing (if enabled).
- /// \param[in] executor Optionally, an executor to use for decoding record
- /// batches. This is generally only a benefit for very wide and/or
- /// compressed batches.
- virtual Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
- const bool coalesce = false,
- const io::IOContext& io_context = io::default_io_context(),
- const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(),
- arrow::internal::Executor* executor = NULLPTR) = 0;
+
+ /// \brief Computes the total number of rows in the file.
+ virtual Result<int64_t> CountRows() = 0;
+
+ /// \brief Get a reentrant generator of record batches.
+ ///
+ /// \param[in] coalesce If true, enable I/O coalescing.
+ /// \param[in] io_context The IOContext to use (controls which thread pool
+ /// is used for I/O).
+ /// \param[in] cache_options Options for coalescing (if enabled).
+ /// \param[in] executor Optionally, an executor to use for decoding record
+ /// batches. This is generally only a benefit for very wide and/or
+ /// compressed batches.
+ virtual Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
+ const bool coalesce = false,
+ const io::IOContext& io_context = io::default_io_context(),
+ const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(),
+ arrow::internal::Executor* executor = NULLPTR) = 0;
};
/// \brief A general listener class to receive events.
@@ -304,7 +304,7 @@ class ARROW_EXPORT StreamDecoder {
/// Listener::OnRecordBatchDecoded() to receive decoded record batches
/// \param[in] options any IPC reading options (optional)
StreamDecoder(std::shared_ptr<Listener> listener,
- IpcReadOptions options = IpcReadOptions::Defaults());
+ IpcReadOptions options = IpcReadOptions::Defaults());
virtual ~StreamDecoder();
@@ -380,7 +380,7 @@ class ARROW_EXPORT StreamDecoder {
/// memcpy(buffer->mutable_data() + current_buffer_size,
/// small_chunk,
/// small_chunk_size);
- /// if (buffer->size() < decoder.next_required_size()) {
+ /// if (buffer->size() < decoder.next_required_size()) {
/// continue;
/// }
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
@@ -397,9 +397,9 @@ class ARROW_EXPORT StreamDecoder {
/// decoder
int64_t next_required_size() const;
- /// \brief Return current read statistics
- ReadStats stats() const;
-
+ /// \brief Return current read statistics
+ ReadStats stats() const;
+
private:
class StreamDecoderImpl;
std::unique_ptr<StreamDecoderImpl> impl_;
@@ -526,8 +526,8 @@ Result<std::shared_ptr<SparseTensor>> ReadSparseTensorPayload(const IpcPayload&
ARROW_EXPORT
Status FuzzIpcStream(const uint8_t* data, int64_t size);
ARROW_EXPORT
-Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
-ARROW_EXPORT
+Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
+ARROW_EXPORT
Status FuzzIpcFile(const uint8_t* data, int64_t size);
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
index 3493c4f1409..abb1dbc2dd6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
@@ -47,9 +47,9 @@ enum class MessageType {
SPARSE_TENSOR
};
-struct IpcReadOptions;
-struct IpcWriteOptions;
-
+struct IpcReadOptions;
+struct IpcWriteOptions;
+
class MessageReader;
class RecordBatchStreamReader;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
index 7b9254b7e59..7bb86316497 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
@@ -49,7 +49,7 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/compression.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/make_unique.h"
@@ -70,18 +70,18 @@ using internal::kArrowMagicBytes;
namespace {
-bool HasNestedDict(const ArrayData& data) {
- if (data.type->id() == Type::DICTIONARY) {
- return true;
- }
- for (const auto& child : data.child_data) {
- if (HasNestedDict(*child)) {
- return true;
- }
- }
- return false;
-}
-
+bool HasNestedDict(const ArrayData& data) {
+ if (data.type->id() == Type::DICTIONARY) {
+ return true;
+ }
+ for (const auto& child : data.child_data) {
+ if (HasNestedDict(*child)) {
+ return true;
+ }
+ }
+ return false;
+}
+
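// HasNestedDict above recurses through child_data, so it catches
// dictionary-encoded children at any depth. WriteDictionaries() below uses
// it to suppress delta emission for such dictionaries, since the read path
// does not support deltas on nested dictionaries.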
Status GetTruncatedBitmap(int64_t offset, int64_t length,
const std::shared_ptr<Buffer> input, MemoryPool* pool,
std::shared_ptr<Buffer>* buffer) {
@@ -557,7 +557,7 @@ class DictionarySerializer : public RecordBatchSerializer {
Status Assemble(const std::shared_ptr<Array>& dictionary) {
// Make a dummy record batch. A bit tedious as we have to make a schema
auto schema = arrow::schema({arrow::field("dictionary", dictionary->type())});
- auto batch = RecordBatch::Make(std::move(schema), dictionary->length(), {dictionary});
+ auto batch = RecordBatch::Make(std::move(schema), dictionary->length(), {dictionary});
return RecordBatchSerializer::Assemble(*batch);
}
@@ -997,21 +997,21 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
IpcPayload payload;
RETURN_NOT_OK(GetRecordBatchPayload(batch, options_, &payload));
- RETURN_NOT_OK(WritePayload(payload));
- ++stats_.num_record_batches;
- return Status::OK();
- }
-
- Status WriteTable(const Table& table, int64_t max_chunksize) override {
- if (is_file_format_ && options_.unify_dictionaries) {
- ARROW_ASSIGN_OR_RAISE(auto unified_table,
- DictionaryUnifier::UnifyTable(table, options_.memory_pool));
- return RecordBatchWriter::WriteTable(*unified_table, max_chunksize);
- } else {
- return RecordBatchWriter::WriteTable(table, max_chunksize);
- }
+ RETURN_NOT_OK(WritePayload(payload));
+ ++stats_.num_record_batches;
+ return Status::OK();
}
+ Status WriteTable(const Table& table, int64_t max_chunksize) override {
+ if (is_file_format_ && options_.unify_dictionaries) {
+ ARROW_ASSIGN_OR_RAISE(auto unified_table,
+ DictionaryUnifier::UnifyTable(table, options_.memory_pool));
+ return RecordBatchWriter::WriteTable(*unified_table, max_chunksize);
+ } else {
+ return RecordBatchWriter::WriteTable(table, max_chunksize);
+ }
+ }
+
Status Close() override {
RETURN_NOT_OK(CheckStarted());
return payload_writer_->Close();
@@ -1023,11 +1023,11 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
IpcPayload payload;
RETURN_NOT_OK(GetSchemaPayload(schema_, options_, mapper_, &payload));
- return WritePayload(payload);
+ return WritePayload(payload);
}
- WriteStats stats() const override { return stats_; }
-
+ WriteStats stats() const override { return stats_; }
+
protected:
Status CheckStarted() {
if (!started_) {
@@ -1038,7 +1038,7 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
Status WriteDictionaries(const RecordBatch& batch) {
ARROW_ASSIGN_OR_RAISE(const auto dictionaries, CollectDictionaries(batch, mapper_));
- const auto equal_options = EqualOptions().nans_equal(true);
+ const auto equal_options = EqualOptions().nans_equal(true);
for (const auto& pair : dictionaries) {
int64_t dictionary_id = pair.first;
@@ -1047,57 +1047,57 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
// If a dictionary with this id was already emitted, check if it was the same.
auto* last_dictionary = &last_dictionaries_[dictionary_id];
const bool dictionary_exists = (*last_dictionary != nullptr);
- int64_t delta_start = 0;
+ int64_t delta_start = 0;
if (dictionary_exists) {
if ((*last_dictionary)->data() == dictionary->data()) {
// Fast shortcut for a common case.
// Same dictionary data by pointer => no need to emit it again
continue;
}
- const int64_t last_length = (*last_dictionary)->length();
- const int64_t new_length = dictionary->length();
- if (new_length == last_length &&
- ((*last_dictionary)->Equals(dictionary, equal_options))) {
+ const int64_t last_length = (*last_dictionary)->length();
+ const int64_t new_length = dictionary->length();
+ if (new_length == last_length &&
+ ((*last_dictionary)->Equals(dictionary, equal_options))) {
// Same dictionary by value => no need to emit it again
// (while this can have a CPU cost, this code path is required
// for the IPC file format)
continue;
}
- if (is_file_format_) {
- return Status::Invalid(
- "Dictionary replacement detected when writing IPC file format. "
- "Arrow IPC files only support a single dictionary for a given field "
- "across all batches.");
- }
-
- // (the read path doesn't support outer dictionary deltas, don't emit them)
- if (new_length > last_length && options_.emit_dictionary_deltas &&
- !HasNestedDict(*dictionary->data()) &&
- ((*last_dictionary)
- ->RangeEquals(dictionary, 0, last_length, 0, equal_options))) {
- // New dictionary starts with the current dictionary
- delta_start = last_length;
- }
+ if (is_file_format_) {
+ return Status::Invalid(
+ "Dictionary replacement detected when writing IPC file format. "
+ "Arrow IPC files only support a single dictionary for a given field "
+ "across all batches.");
+ }
+
+ // (the read path doesn't support outer dictionary deltas, don't emit them)
+ if (new_length > last_length && options_.emit_dictionary_deltas &&
+ !HasNestedDict(*dictionary->data()) &&
+ ((*last_dictionary)
+ ->RangeEquals(dictionary, 0, last_length, 0, equal_options))) {
+ // New dictionary starts with the current dictionary
+ delta_start = last_length;
+ }
}
- IpcPayload payload;
- if (delta_start) {
- RETURN_NOT_OK(GetDictionaryPayload(dictionary_id, /*is_delta=*/true,
- dictionary->Slice(delta_start), options_,
- &payload));
- } else {
- RETURN_NOT_OK(
- GetDictionaryPayload(dictionary_id, dictionary, options_, &payload));
- }
- RETURN_NOT_OK(WritePayload(payload));
- ++stats_.num_dictionary_batches;
- if (dictionary_exists) {
- if (delta_start) {
- ++stats_.num_dictionary_deltas;
- } else {
- ++stats_.num_replaced_dictionaries;
- }
+ IpcPayload payload;
+ if (delta_start) {
+ RETURN_NOT_OK(GetDictionaryPayload(dictionary_id, /*is_delta=*/true,
+ dictionary->Slice(delta_start), options_,
+ &payload));
+ } else {
+ RETURN_NOT_OK(
+ GetDictionaryPayload(dictionary_id, dictionary, options_, &payload));
}
+ RETURN_NOT_OK(WritePayload(payload));
+ ++stats_.num_dictionary_batches;
+ if (dictionary_exists) {
+ if (delta_start) {
+ ++stats_.num_dictionary_deltas;
+ } else {
+ ++stats_.num_replaced_dictionaries;
+ }
+ }
// Remember dictionary for next batches
*last_dictionary = dictionary;
@@ -1105,12 +1105,12 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
return Status::OK();
}
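  // Worked example of the delta rule above (values illustrative): with
  // IpcWriteOptions::emit_dictionary_deltas set on a *stream* writer,
  //
  //   batch 1 dictionary ["a", "b"]      -> emitted whole (New)
  //   batch 2 dictionary ["a", "b", "c"] -> only the ["c"] tail, is_delta=true
  //   batch 3 dictionary ["x", "y"]      -> replacement; allowed in streams,
  //                                         rejected for files with the
  //                                         Invalid status above.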
- Status WritePayload(const IpcPayload& payload) {
- RETURN_NOT_OK(payload_writer_->WritePayload(payload));
- ++stats_.num_messages;
- return Status::OK();
- }
-
+ Status WritePayload(const IpcPayload& payload) {
+ RETURN_NOT_OK(payload_writer_->WritePayload(payload));
+ ++stats_.num_messages;
+ return Status::OK();
+ }
+
std::unique_ptr<IpcPayloadWriter> payload_writer_;
std::shared_ptr<Schema> shared_schema_;
const Schema& schema_;
@@ -1126,7 +1126,7 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
bool started_ = false;
IpcWriteOptions options_;
- WriteStats stats_;
+ WriteStats stats_;
};
class StreamBookKeeper {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
index 0ea83d7630a..05d62d1bcad 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
@@ -60,23 +60,23 @@ struct IpcPayload {
int64_t body_length = 0;
};
-struct WriteStats {
- /// Number of IPC messages written.
- int64_t num_messages = 0;
- /// Number of record batches written.
- int64_t num_record_batches = 0;
- /// Number of dictionary batches written.
- ///
- /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
- int64_t num_dictionary_batches = 0;
-
- /// Number of dictionary deltas written.
- int64_t num_dictionary_deltas = 0;
- /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
- /// an existing dictionary with an unrelated new dictionary).
- int64_t num_replaced_dictionaries = 0;
-};
-
+struct WriteStats {
+ /// Number of IPC messages written.
+ int64_t num_messages = 0;
+ /// Number of record batches written.
+ int64_t num_record_batches = 0;
+ /// Number of dictionary batches written.
+ ///
+ /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
+ int64_t num_dictionary_batches = 0;
+
+ /// Number of dictionary deltas written.
+ int64_t num_dictionary_deltas = 0;
+ /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
+ /// an existing dictionary with an unrelated new dictionary).
+ int64_t num_replaced_dictionaries = 0;
+};
+
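// Worked example (illustrative): a stream carrying one schema message, one
// dictionary batch and two record batches ends with
//   num_messages == 4, num_record_batches == 2, num_dictionary_batches == 1,
// and the delta/replacement counters stay 0 unless a later batch ships a
// changed dictionary for an already-emitted id.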
/// \class RecordBatchWriter
/// \brief Abstract interface for writing a stream of record batches
class ARROW_EXPORT RecordBatchWriter {
@@ -96,25 +96,25 @@ class ARROW_EXPORT RecordBatchWriter {
/// \brief Write Table with a particular chunksize
/// \param[in] table table to write
- /// \param[in] max_chunksize maximum length of table chunks. To indicate
- /// that no maximum should be enforced, pass -1.
+ /// \param[in] max_chunksize maximum length of table chunks. To indicate
+ /// that no maximum should be enforced, pass -1.
/// \return Status
- virtual Status WriteTable(const Table& table, int64_t max_chunksize);
+ virtual Status WriteTable(const Table& table, int64_t max_chunksize);
/// \brief Perform any logic necessary to finish the stream
///
/// \return Status
virtual Status Close() = 0;
-
- /// \brief Return current write statistics
- virtual WriteStats stats() const = 0;
+
+ /// \brief Return current write statistics
+ virtual WriteStats stats() const = 0;
};
-/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
-/// instances
-///
-/// @{
-
+/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
+/// instances
+///
+/// @{
+
/// Create a new IPC stream writer from stream sink and schema. User is
/// responsible for closing the actual OutputStream.
///
@@ -165,14 +165,14 @@ Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
-/// @}
-
-ARROW_DEPRECATED("Use MakeStreamWriter")
-ARROW_EXPORT
-Result<std::shared_ptr<RecordBatchWriter>> NewStreamWriter(
- io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
- const IpcWriteOptions& options = IpcWriteOptions::Defaults());
-
+/// @}
+
+ARROW_DEPRECATED("Use MakeStreamWriter")
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> NewStreamWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
ARROW_DEPRECATED("Use MakeFileWriter")
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> NewFileWriter(
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
index 2d6f3176224..eb5c3643dd4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
@@ -18,32 +18,32 @@
#include "arrow/memory_pool.h"
#include <algorithm> // IWYU pragma: keep
-#include <atomic>
-#include <cstdlib> // IWYU pragma: keep
-#include <cstring> // IWYU pragma: keep
-#include <iostream> // IWYU pragma: keep
+#include <atomic>
+#include <cstdlib> // IWYU pragma: keep
+#include <cstring> // IWYU pragma: keep
+#include <iostream> // IWYU pragma: keep
#include <limits>
#include <memory>
-#if defined(sun) || defined(__sun)
-#include <stdlib.h>
-#endif
-
-#include "arrow/buffer.h"
-#include "arrow/io/util_internal.h"
-#include "arrow/result.h"
+#if defined(sun) || defined(__sun)
+#include <stdlib.h>
+#endif
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/io_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/io_util.h"
#include "arrow/util/logging.h" // IWYU pragma: keep
-#include "arrow/util/optional.h"
-#include "arrow/util/string.h"
-#include "arrow/util/thread_pool.h"
-
-#ifdef __GLIBC__
-#include <malloc.h>
-#endif
-
+#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
+#include "arrow/util/thread_pool.h"
+
+#ifdef __GLIBC__
+#include <malloc.h>
+#endif
+
#ifdef ARROW_JEMALLOC
// Needed to support jemalloc 3 and 4
#define JEMALLOC_MANGLE
@@ -101,88 +101,88 @@ const char* je_arrow_malloc_conf =
namespace arrow {
-namespace {
-
+namespace {
+
constexpr size_t kAlignment = 64;
-constexpr char kDefaultBackendEnvVar[] = "ARROW_DEFAULT_MEMORY_POOL";
-
-enum class MemoryPoolBackend : uint8_t { System, Jemalloc, Mimalloc };
-
-struct SupportedBackend {
- const char* name;
- MemoryPoolBackend backend;
-};
-
-// See ARROW-12248 for why we use static in-function singletons rather than
-// global constants below (in SupportedBackends() and UserSelectedBackend()).
-// In some contexts (especially R bindings) `default_memory_pool()` may be
-// called before all globals are initialized, and then the ARROW_DEFAULT_MEMORY_POOL
-// environment variable would be ignored.
-
-const std::vector<SupportedBackend>& SupportedBackends() {
- static std::vector<SupportedBackend> backends = {
- // ARROW-12316: Apple => mimalloc first, then jemalloc
- // non-Apple => jemalloc first, then mimalloc
-#if defined(ARROW_JEMALLOC) && !defined(__APPLE__)
- {"jemalloc", MemoryPoolBackend::Jemalloc},
-#endif
-#ifdef ARROW_MIMALLOC
- {"mimalloc", MemoryPoolBackend::Mimalloc},
-#endif
-#if defined(ARROW_JEMALLOC) && defined(__APPLE__)
- {"jemalloc", MemoryPoolBackend::Jemalloc},
-#endif
- {"system", MemoryPoolBackend::System}
- };
- return backends;
-}
-
-// Return the MemoryPoolBackend selected by the user through the
-// ARROW_DEFAULT_MEMORY_POOL environment variable, if any.
-util::optional<MemoryPoolBackend> UserSelectedBackend() {
- static auto user_selected_backend = []() -> util::optional<MemoryPoolBackend> {
- auto unsupported_backend = [](const std::string& name) {
- std::vector<std::string> supported;
- for (const auto backend : SupportedBackends()) {
- supported.push_back(std::string("'") + backend.name + "'");
- }
- ARROW_LOG(WARNING) << "Unsupported backend '" << name << "' specified in "
- << kDefaultBackendEnvVar << " (supported backends are "
- << internal::JoinStrings(supported, ", ") << ")";
- };
-
- auto maybe_name = internal::GetEnvVar(kDefaultBackendEnvVar);
- if (!maybe_name.ok()) {
- return {};
- }
- const auto name = *std::move(maybe_name);
- if (name.empty()) {
- // An empty environment variable is considered missing
- return {};
- }
- const auto found = std::find_if(
- SupportedBackends().begin(), SupportedBackends().end(),
- [&](const SupportedBackend& backend) { return name == backend.name; });
- if (found != SupportedBackends().end()) {
- return found->backend;
- }
- unsupported_backend(name);
- return {};
- }();
-
- return user_selected_backend;
-}
-
-MemoryPoolBackend DefaultBackend() {
- auto backend = UserSelectedBackend();
- if (backend.has_value()) {
- return backend.value();
- }
- struct SupportedBackend default_backend = SupportedBackends().front();
- return default_backend.backend;
-}
-
+constexpr char kDefaultBackendEnvVar[] = "ARROW_DEFAULT_MEMORY_POOL";
+
+enum class MemoryPoolBackend : uint8_t { System, Jemalloc, Mimalloc };
+
+struct SupportedBackend {
+ const char* name;
+ MemoryPoolBackend backend;
+};
+
+// See ARROW-12248 for why we use static in-function singletons rather than
+// global constants below (in SupportedBackends() and UserSelectedBackend()).
+// In some contexts (especially R bindings) `default_memory_pool()` may be
+// called before all globals are initialized, and then the ARROW_DEFAULT_MEMORY_POOL
+// environment variable would be ignored.
+
+const std::vector<SupportedBackend>& SupportedBackends() {
+ static std::vector<SupportedBackend> backends = {
+ // ARROW-12316: Apple => mimalloc first, then jemalloc
+ // non-Apple => jemalloc first, then mimalloc
+#if defined(ARROW_JEMALLOC) && !defined(__APPLE__)
+ {"jemalloc", MemoryPoolBackend::Jemalloc},
+#endif
+#ifdef ARROW_MIMALLOC
+ {"mimalloc", MemoryPoolBackend::Mimalloc},
+#endif
+#if defined(ARROW_JEMALLOC) && defined(__APPLE__)
+ {"jemalloc", MemoryPoolBackend::Jemalloc},
+#endif
+ {"system", MemoryPoolBackend::System}
+ };
+ return backends;
+}
+
+// Return the MemoryPoolBackend selected by the user through the
+// ARROW_DEFAULT_MEMORY_POOL environment variable, if any.
+util::optional<MemoryPoolBackend> UserSelectedBackend() {
+ static auto user_selected_backend = []() -> util::optional<MemoryPoolBackend> {
+ auto unsupported_backend = [](const std::string& name) {
+ std::vector<std::string> supported;
+ for (const auto backend : SupportedBackends()) {
+ supported.push_back(std::string("'") + backend.name + "'");
+ }
+ ARROW_LOG(WARNING) << "Unsupported backend '" << name << "' specified in "
+ << kDefaultBackendEnvVar << " (supported backends are "
+ << internal::JoinStrings(supported, ", ") << ")";
+ };
+
+ auto maybe_name = internal::GetEnvVar(kDefaultBackendEnvVar);
+ if (!maybe_name.ok()) {
+ return {};
+ }
+ const auto name = *std::move(maybe_name);
+ if (name.empty()) {
+ // An empty environment variable is considered missing
+ return {};
+ }
+ const auto found = std::find_if(
+ SupportedBackends().begin(), SupportedBackends().end(),
+ [&](const SupportedBackend& backend) { return name == backend.name; });
+ if (found != SupportedBackends().end()) {
+ return found->backend;
+ }
+ unsupported_backend(name);
+ return {};
+ }();
+
+ return user_selected_backend;
+}
+
+MemoryPoolBackend DefaultBackend() {
+ auto backend = UserSelectedBackend();
+ if (backend.has_value()) {
+ return backend.value();
+ }
+ struct SupportedBackend default_backend = SupportedBackends().front();
+ return default_backend.backend;
+}
+
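
The functions above make the process-wide allocator switchable at startup through ARROW_DEFAULT_MEMORY_POOL, with the choice latched in a function-local static. A minimal sketch of observing the selection, using only the default_memory_pool() / backend_name() entry points visible in this diff:

    // Run as e.g.: ARROW_DEFAULT_MEMORY_POOL=mimalloc ./backend_demo
    // The variable must be set before the first call below, because
    // UserSelectedBackend() caches its answer in a static singleton.
    #include <iostream>

    #include "arrow/memory_pool.h"

    int main() {
      arrow::MemoryPool* pool = arrow::default_memory_pool();
      std::cout << "selected backend: " << pool->backend_name() << std::endl;
      return 0;
    }
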
// A static piece of memory for 0-size allocations, so as to return
// an aligned non-null pointer.
alignas(kAlignment) static uint8_t zero_size_area[1];
@@ -204,11 +204,11 @@ class SystemAllocator {
if (!*out) {
return Status::OutOfMemory("malloc of size ", size, " failed");
}
-#elif defined(sun) || defined(__sun)
- *out = reinterpret_cast<uint8_t*>(memalign(kAlignment, static_cast<size_t>(size)));
- if (!*out) {
- return Status::OutOfMemory("malloc of size ", size, " failed");
- }
+#elif defined(sun) || defined(__sun)
+ *out = reinterpret_cast<uint8_t*>(memalign(kAlignment, static_cast<size_t>(size)));
+ if (!*out) {
+ return Status::OutOfMemory("malloc of size ", size, " failed");
+ }
#else
const int result = posix_memalign(reinterpret_cast<void**>(out), kAlignment,
static_cast<size_t>(size));
@@ -262,14 +262,14 @@ class SystemAllocator {
#endif
}
}
-
- static void ReleaseUnused() {
-#ifdef __GLIBC__
- // The return value of malloc_trim is not an error but to inform
- // you if memory was actually released or not, which we do not care about here
- ARROW_UNUSED(malloc_trim(0));
-#endif
- }
+
+ static void ReleaseUnused() {
+#ifdef __GLIBC__
+    // malloc_trim's return value merely reports whether any memory was
+    // actually released, which we do not care about here
+ ARROW_UNUSED(malloc_trim(0));
+#endif
+ }
};
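
The ReleaseUnused() hook above is deliberately best effort: glibc's malloc_trim(0) may or may not return pages to the OS. A small sketch of driving it through the public MemoryPool interface, using only calls that appear elsewhere in this diff:

    #include <iostream>

    #include "arrow/memory_pool.h"

    // Ask the default pool to hand unused memory back to the OS; a no-op
    // on builds/platforms whose allocator does not support trimming.
    void TrimDefaultPool() {
      arrow::MemoryPool* pool = arrow::default_memory_pool();
      std::cout << pool->bytes_allocated() << " bytes live before trim\n";
      pool->ReleaseUnused();  // best effort, may release nothing
    }
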
#ifdef ARROW_JEMALLOC
@@ -317,10 +317,10 @@ class JemallocAllocator {
dallocx(ptr, MALLOCX_ALIGN(kAlignment));
}
}
-
- static void ReleaseUnused() {
- mallctl("arena." ARROW_STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, 0);
- }
+
+ static void ReleaseUnused() {
+ mallctl("arena." ARROW_STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, 0);
+ }
};
#endif // defined(ARROW_JEMALLOC)
@@ -343,8 +343,8 @@ class MimallocAllocator {
return Status::OK();
}
- static void ReleaseUnused() { mi_collect(true); }
-
+ static void ReleaseUnused() { mi_collect(true); }
+
static Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) {
uint8_t* previous_ptr = *ptr;
if (previous_ptr == zero_size_area) {
@@ -451,8 +451,8 @@ class BaseMemoryPoolImpl : public MemoryPool {
stats_.UpdateAllocatedBytes(-size);
}
- void ReleaseUnused() override { Allocator::ReleaseUnused(); }
-
+ void ReleaseUnused() override { Allocator::ReleaseUnused(); }
+
int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
int64_t max_memory() const override { return stats_.max_memory(); }
@@ -480,46 +480,46 @@ class MimallocMemoryPool : public BaseMemoryPoolImpl<MimallocAllocator> {
};
#endif
-std::unique_ptr<MemoryPool> MemoryPool::CreateDefault() {
- auto backend = DefaultBackend();
- switch (backend) {
- case MemoryPoolBackend::System:
- return std::unique_ptr<MemoryPool>(new SystemMemoryPool);
+std::unique_ptr<MemoryPool> MemoryPool::CreateDefault() {
+ auto backend = DefaultBackend();
+ switch (backend) {
+ case MemoryPoolBackend::System:
+ return std::unique_ptr<MemoryPool>(new SystemMemoryPool);
#ifdef ARROW_JEMALLOC
- case MemoryPoolBackend::Jemalloc:
- return std::unique_ptr<MemoryPool>(new JemallocMemoryPool);
-#endif
-#ifdef ARROW_MIMALLOC
- case MemoryPoolBackend::Mimalloc:
- return std::unique_ptr<MemoryPool>(new MimallocMemoryPool);
+ case MemoryPoolBackend::Jemalloc:
+ return std::unique_ptr<MemoryPool>(new JemallocMemoryPool);
#endif
- default:
- ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
- return nullptr;
- }
+#ifdef ARROW_MIMALLOC
+ case MemoryPoolBackend::Mimalloc:
+ return std::unique_ptr<MemoryPool>(new MimallocMemoryPool);
+#endif
+ default:
+ ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
+ return nullptr;
+ }
}
-static struct GlobalState {
- ~GlobalState() { finalizing.store(true, std::memory_order_relaxed); }
-
- bool is_finalizing() const { return finalizing.load(std::memory_order_relaxed); }
-
- std::atomic<bool> finalizing{false}; // constructed first, destroyed last
-
- SystemMemoryPool system_pool;
+static struct GlobalState {
+ ~GlobalState() { finalizing.store(true, std::memory_order_relaxed); }
+
+ bool is_finalizing() const { return finalizing.load(std::memory_order_relaxed); }
+
+ std::atomic<bool> finalizing{false}; // constructed first, destroyed last
+
+ SystemMemoryPool system_pool;
#ifdef ARROW_JEMALLOC
- JemallocMemoryPool jemalloc_pool;
+ JemallocMemoryPool jemalloc_pool;
#endif
#ifdef ARROW_MIMALLOC
- MimallocMemoryPool mimalloc_pool;
+ MimallocMemoryPool mimalloc_pool;
#endif
-} global_state;
+} global_state;
-MemoryPool* system_memory_pool() { return &global_state.system_pool; }
+MemoryPool* system_memory_pool() { return &global_state.system_pool; }
Status jemalloc_memory_pool(MemoryPool** out) {
#ifdef ARROW_JEMALLOC
- *out = &global_state.jemalloc_pool;
+ *out = &global_state.jemalloc_pool;
return Status::OK();
#else
return Status::NotImplemented("This Arrow build does not enable jemalloc");
@@ -528,7 +528,7 @@ Status jemalloc_memory_pool(MemoryPool** out) {
Status mimalloc_memory_pool(MemoryPool** out) {
#ifdef ARROW_MIMALLOC
- *out = &global_state.mimalloc_pool;
+ *out = &global_state.mimalloc_pool;
return Status::OK();
#else
return Status::NotImplemented("This Arrow build does not enable mimalloc");
@@ -536,22 +536,22 @@ Status mimalloc_memory_pool(MemoryPool** out) {
}
MemoryPool* default_memory_pool() {
- auto backend = DefaultBackend();
- switch (backend) {
- case MemoryPoolBackend::System:
- return &global_state.system_pool;
+ auto backend = DefaultBackend();
+ switch (backend) {
+ case MemoryPoolBackend::System:
+ return &global_state.system_pool;
#ifdef ARROW_JEMALLOC
- case MemoryPoolBackend::Jemalloc:
- return &global_state.jemalloc_pool;
-#endif
-#ifdef ARROW_MIMALLOC
- case MemoryPoolBackend::Mimalloc:
- return &global_state.mimalloc_pool;
+ case MemoryPoolBackend::Jemalloc:
+ return &global_state.jemalloc_pool;
#endif
- default:
- ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
- return nullptr;
- }
+#ifdef ARROW_MIMALLOC
+ case MemoryPoolBackend::Mimalloc:
+ return &global_state.mimalloc_pool;
+#endif
+ default:
+ ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
+ return nullptr;
+ }
}
#define RETURN_IF_JEMALLOC_ERROR(ERR) \
@@ -674,124 +674,124 @@ int64_t ProxyMemoryPool::max_memory() const { return impl_->max_memory(); }
std::string ProxyMemoryPool::backend_name() const { return impl_->backend_name(); }
-std::vector<std::string> SupportedMemoryBackendNames() {
- std::vector<std::string> supported;
- for (const auto backend : SupportedBackends()) {
- supported.push_back(backend.name);
- }
- return supported;
-}
-
-// -----------------------------------------------------------------------
-// Pool buffer and allocation
-
-/// A Buffer whose lifetime is tied to a particular MemoryPool
-class PoolBuffer final : public ResizableBuffer {
- public:
- explicit PoolBuffer(std::shared_ptr<MemoryManager> mm, MemoryPool* pool)
- : ResizableBuffer(nullptr, 0, std::move(mm)), pool_(pool) {}
-
- ~PoolBuffer() override {
- // Avoid calling pool_->Free if the global pools are destroyed
- // (XXX this will not work with user-defined pools)
-
- // This can happen if a Future is destructing on one thread while or
- // after memory pools are destructed on the main thread (as there is
- // no guarantee of destructor order between thread/memory pools)
- uint8_t* ptr = mutable_data();
- if (ptr && !global_state.is_finalizing()) {
- pool_->Free(ptr, capacity_);
- }
- }
-
- Status Reserve(const int64_t capacity) override {
- if (capacity < 0) {
- return Status::Invalid("Negative buffer capacity: ", capacity);
- }
- uint8_t* ptr = mutable_data();
- if (!ptr || capacity > capacity_) {
- int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
- if (ptr) {
- RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
- } else {
- RETURN_NOT_OK(pool_->Allocate(new_capacity, &ptr));
- }
- data_ = ptr;
- capacity_ = new_capacity;
- }
- return Status::OK();
- }
-
- Status Resize(const int64_t new_size, bool shrink_to_fit = true) override {
- if (ARROW_PREDICT_FALSE(new_size < 0)) {
- return Status::Invalid("Negative buffer resize: ", new_size);
- }
- uint8_t* ptr = mutable_data();
- if (ptr && shrink_to_fit && new_size <= size_) {
- // Buffer is non-null and is not growing, so shrink to the requested size without
- // excess space.
- int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size);
- if (capacity_ != new_capacity) {
- // Buffer hasn't got yet the requested size.
- RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
- data_ = ptr;
- capacity_ = new_capacity;
- }
- } else {
- RETURN_NOT_OK(Reserve(new_size));
- }
- size_ = new_size;
-
- return Status::OK();
- }
-
- static std::shared_ptr<PoolBuffer> MakeShared(MemoryPool* pool) {
- std::shared_ptr<MemoryManager> mm;
- if (pool == nullptr) {
- pool = default_memory_pool();
- mm = default_cpu_memory_manager();
- } else {
- mm = CPUDevice::memory_manager(pool);
- }
- return std::make_shared<PoolBuffer>(std::move(mm), pool);
- }
-
- static std::unique_ptr<PoolBuffer> MakeUnique(MemoryPool* pool) {
- std::shared_ptr<MemoryManager> mm;
- if (pool == nullptr) {
- pool = default_memory_pool();
- mm = default_cpu_memory_manager();
- } else {
- mm = CPUDevice::memory_manager(pool);
- }
- return std::unique_ptr<PoolBuffer>(new PoolBuffer(std::move(mm), pool));
- }
-
- private:
- MemoryPool* pool_;
-};
-
-namespace {
-// A utility that does most of the work of the `AllocateBuffer` and
-// `AllocateResizableBuffer` methods. The argument `buffer` should be a smart pointer to
-// a PoolBuffer.
-template <typename BufferPtr, typename PoolBufferPtr>
-inline Result<BufferPtr> ResizePoolBuffer(PoolBufferPtr&& buffer, const int64_t size) {
- RETURN_NOT_OK(buffer->Resize(size));
- buffer->ZeroPadding();
- return std::move(buffer);
-}
-
-} // namespace
-
-Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, MemoryPool* pool) {
- return ResizePoolBuffer<std::unique_ptr<Buffer>>(PoolBuffer::MakeUnique(pool), size);
-}
-
-Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(const int64_t size,
- MemoryPool* pool) {
- return ResizePoolBuffer<std::unique_ptr<ResizableBuffer>>(PoolBuffer::MakeUnique(pool),
- size);
-}
-
+std::vector<std::string> SupportedMemoryBackendNames() {
+ std::vector<std::string> supported;
+ for (const auto backend : SupportedBackends()) {
+ supported.push_back(backend.name);
+ }
+ return supported;
+}
+
+// -----------------------------------------------------------------------
+// Pool buffer and allocation
+
+/// A Buffer whose lifetime is tied to a particular MemoryPool
+class PoolBuffer final : public ResizableBuffer {
+ public:
+ explicit PoolBuffer(std::shared_ptr<MemoryManager> mm, MemoryPool* pool)
+ : ResizableBuffer(nullptr, 0, std::move(mm)), pool_(pool) {}
+
+ ~PoolBuffer() override {
+ // Avoid calling pool_->Free if the global pools are destroyed
+ // (XXX this will not work with user-defined pools)
+
+    // This can happen if a Future is destroyed on one thread while, or
+    // after, the memory pools are destroyed on the main thread (there is
+    // no guaranteed destruction order between threads and memory pools)
+ uint8_t* ptr = mutable_data();
+ if (ptr && !global_state.is_finalizing()) {
+ pool_->Free(ptr, capacity_);
+ }
+ }
+
+ Status Reserve(const int64_t capacity) override {
+ if (capacity < 0) {
+ return Status::Invalid("Negative buffer capacity: ", capacity);
+ }
+ uint8_t* ptr = mutable_data();
+ if (!ptr || capacity > capacity_) {
+ int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
+ if (ptr) {
+ RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
+ } else {
+ RETURN_NOT_OK(pool_->Allocate(new_capacity, &ptr));
+ }
+ data_ = ptr;
+ capacity_ = new_capacity;
+ }
+ return Status::OK();
+ }
+
+ Status Resize(const int64_t new_size, bool shrink_to_fit = true) override {
+ if (ARROW_PREDICT_FALSE(new_size < 0)) {
+ return Status::Invalid("Negative buffer resize: ", new_size);
+ }
+ uint8_t* ptr = mutable_data();
+ if (ptr && shrink_to_fit && new_size <= size_) {
+ // Buffer is non-null and is not growing, so shrink to the requested size without
+ // excess space.
+ int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size);
+ if (capacity_ != new_capacity) {
+        // The buffer does not yet have the requested capacity.
+ RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
+ data_ = ptr;
+ capacity_ = new_capacity;
+ }
+ } else {
+ RETURN_NOT_OK(Reserve(new_size));
+ }
+ size_ = new_size;
+
+ return Status::OK();
+ }
+
+ static std::shared_ptr<PoolBuffer> MakeShared(MemoryPool* pool) {
+ std::shared_ptr<MemoryManager> mm;
+ if (pool == nullptr) {
+ pool = default_memory_pool();
+ mm = default_cpu_memory_manager();
+ } else {
+ mm = CPUDevice::memory_manager(pool);
+ }
+ return std::make_shared<PoolBuffer>(std::move(mm), pool);
+ }
+
+ static std::unique_ptr<PoolBuffer> MakeUnique(MemoryPool* pool) {
+ std::shared_ptr<MemoryManager> mm;
+ if (pool == nullptr) {
+ pool = default_memory_pool();
+ mm = default_cpu_memory_manager();
+ } else {
+ mm = CPUDevice::memory_manager(pool);
+ }
+ return std::unique_ptr<PoolBuffer>(new PoolBuffer(std::move(mm), pool));
+ }
+
+ private:
+ MemoryPool* pool_;
+};
+
+namespace {
+// A utility that does most of the work of the `AllocateBuffer` and
+// `AllocateResizableBuffer` methods. The argument `buffer` should be a smart pointer to
+// a PoolBuffer.
+template <typename BufferPtr, typename PoolBufferPtr>
+inline Result<BufferPtr> ResizePoolBuffer(PoolBufferPtr&& buffer, const int64_t size) {
+ RETURN_NOT_OK(buffer->Resize(size));
+ buffer->ZeroPadding();
+ return std::move(buffer);
+}
+
+} // namespace
+
+Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, MemoryPool* pool) {
+ return ResizePoolBuffer<std::unique_ptr<Buffer>>(PoolBuffer::MakeUnique(pool), size);
+}
+
+Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(const int64_t size,
+ MemoryPool* pool) {
+ return ResizePoolBuffer<std::unique_ptr<ResizableBuffer>>(PoolBuffer::MakeUnique(pool),
+ size);
+}
+
} // namespace arrow
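
AllocateBuffer() and AllocateResizableBuffer() above are the usual entry points to PoolBuffer. A sketch of typical use, relying only on calls shown in this file (capacities round up to 64-byte multiples, and ZeroPadding() is applied for the caller):

    #include <memory>

    #include "arrow/buffer.h"
    #include "arrow/memory_pool.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status UsePoolBuffers() {
      arrow::MemoryPool* pool = arrow::default_memory_pool();
      ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::Buffer> fixed,
                            arrow::AllocateBuffer(/*size=*/100, pool));
      ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ResizableBuffer> growable,
                            arrow::AllocateResizableBuffer(/*size=*/64, pool));
      // Growth goes through Reserve(): capacity becomes
      // RoundUpToMultipleOf64(200) == 256. ZeroPadding() was already
      // applied by the allocation helpers above.
      ARROW_RETURN_NOT_OK(growable->Resize(200));
      return arrow::Status::OK();
    }
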
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
index 81b1b112dc7..45c49ff5cc8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
@@ -63,7 +63,7 @@ class MemoryPoolStats {
/// take care of the required 64-byte alignment.
class ARROW_EXPORT MemoryPool {
public:
- virtual ~MemoryPool() = default;
+ virtual ~MemoryPool() = default;
/// \brief EXPERIMENTAL. Create a new instance of the default MemoryPool
static std::unique_ptr<MemoryPool> CreateDefault();
@@ -87,13 +87,13 @@ class ARROW_EXPORT MemoryPool {
/// faster deallocation if supported by its backend.
virtual void Free(uint8_t* buffer, int64_t size) = 0;
- /// Return unused memory to the OS
- ///
- /// Only applies to allocators that hold onto unused memory. This will be
- /// best effort, a memory pool may not implement this feature or may be
- /// unable to fulfill the request due to fragmentation.
- virtual void ReleaseUnused() {}
-
+ /// Return unused memory to the OS
+ ///
+ /// Only applies to allocators that hold onto unused memory. This will be
+ /// best effort, a memory pool may not implement this feature or may be
+ /// unable to fulfill the request due to fragmentation.
+ virtual void ReleaseUnused() {}
+
  /// The number of bytes that were allocated and not yet freed through
  /// this allocator.
virtual int64_t bytes_allocated() const = 0;
@@ -104,11 +104,11 @@ class ARROW_EXPORT MemoryPool {
/// returns -1
virtual int64_t max_memory() const;
- /// The name of the backend used by this MemoryPool (e.g. "system" or "jemalloc").
+ /// The name of the backend used by this MemoryPool (e.g. "system" or "jemalloc").
virtual std::string backend_name() const = 0;
protected:
- MemoryPool() = default;
+ MemoryPool() = default;
};
class ARROW_EXPORT LoggingMemoryPool : public MemoryPool {
@@ -156,10 +156,10 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
std::unique_ptr<ProxyMemoryPoolImpl> impl_;
};
-/// \brief Return a process-wide memory pool based on the system allocator.
+/// \brief Return a process-wide memory pool based on the system allocator.
ARROW_EXPORT MemoryPool* system_memory_pool();
-/// \brief Return a process-wide memory pool based on jemalloc.
+/// \brief Return a process-wide memory pool based on jemalloc.
///
/// May return NotImplemented if jemalloc is not available.
ARROW_EXPORT Status jemalloc_memory_pool(MemoryPool** out);
@@ -175,11 +175,11 @@ ARROW_EXPORT Status jemalloc_memory_pool(MemoryPool** out);
ARROW_EXPORT
Status jemalloc_set_decay_ms(int ms);
-/// \brief Return a process-wide memory pool based on mimalloc.
+/// \brief Return a process-wide memory pool based on mimalloc.
///
/// May return NotImplemented if mimalloc is not available.
ARROW_EXPORT Status mimalloc_memory_pool(MemoryPool** out);
-ARROW_EXPORT std::vector<std::string> SupportedMemoryBackendNames();
-
+ARROW_EXPORT std::vector<std::string> SupportedMemoryBackendNames();
+
} // namespace arrow
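
A short sketch tying the header's accessors together: enumerate the backends compiled into this build, and note that the pool getters return NotImplemented for allocators disabled at build time (both behaviors are documented above):

    #include <iostream>
    #include <string>

    #include "arrow/memory_pool.h"
    #include "arrow/status.h"

    void ListMemoryBackends() {
      for (const std::string& name : arrow::SupportedMemoryBackendNames()) {
        std::cout << "available: " << name << "\n";
      }
      arrow::MemoryPool* pool = nullptr;
      arrow::Status st = arrow::jemalloc_memory_pool(&pool);
      if (!st.ok()) {
        // e.g. "NotImplemented: This Arrow build does not enable jemalloc"
        std::cout << st.ToString() << "\n";
      }
    }
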
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
index 8d1c16e0ed6..8187af43345 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-#include "arrow/pretty_print.h"
-
+#include "arrow/pretty_print.h"
+
#include <algorithm>
#include <chrono>
#include <cstddef>
@@ -69,12 +69,12 @@ class PrettyPrinter {
};
void PrettyPrinter::OpenArray(const Array& array) {
- if (!options_.skip_new_lines) {
- Indent();
- }
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
(*sink_) << "[";
if (array.length() > 0) {
- Newline();
+ Newline();
indent_ += options_.indent_size;
}
}
@@ -125,15 +125,15 @@ class ArrayPrinter : public PrettyPrinter {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink_) << ",";
- Newline();
- }
- if (!options_.skip_new_lines) {
- Indent();
+ (*sink_) << ",";
+ Newline();
}
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
if ((i >= options_.window) && (i < (array.length() - options_.window))) {
- (*sink_) << "...";
- Newline();
+ (*sink_) << "...";
+ Newline();
i = array.length() - options_.window - 1;
skip_comma = true;
} else if (array.IsNull(i)) {
@@ -142,7 +142,7 @@ class ArrayPrinter : public PrettyPrinter {
func(i);
}
}
- Newline();
+ Newline();
}
Status WriteDataValues(const BooleanArray& array) {
@@ -232,11 +232,11 @@ class ArrayPrinter : public PrettyPrinter {
return Status::OK();
}
- Status WriteDataValues(const Decimal256Array& array) {
- WriteValues(array, [&](int64_t i) { (*sink_) << array.FormatValue(i); });
- return Status::OK();
- }
-
+ Status WriteDataValues(const Decimal256Array& array) {
+ WriteValues(array, [&](int64_t i) { (*sink_) << array.FormatValue(i); });
+ return Status::OK();
+ }
+
template <typename T>
enable_if_list_like<typename T::TypeClass, Status> WriteDataValues(const T& array) {
bool skip_comma = true;
@@ -244,13 +244,13 @@ class ArrayPrinter : public PrettyPrinter {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink_) << ",";
- Newline();
+ (*sink_) << ",";
+ Newline();
}
if ((i >= options_.window) && (i < (array.length() - options_.window))) {
Indent();
- (*sink_) << "...";
- Newline();
+ (*sink_) << "...";
+ Newline();
i = array.length() - options_.window - 1;
skip_comma = true;
} else if (array.IsNull(i)) {
@@ -259,11 +259,11 @@ class ArrayPrinter : public PrettyPrinter {
} else {
std::shared_ptr<Array> slice =
array.values()->Slice(array.value_offset(i), array.value_length(i));
- RETURN_NOT_OK(
- PrettyPrint(*slice, PrettyPrintOptions{indent_, options_.window}, sink_));
+ RETURN_NOT_OK(
+ PrettyPrint(*slice, PrettyPrintOptions{indent_, options_.window}, sink_));
}
}
- Newline();
+ Newline();
return Status::OK();
}
@@ -273,36 +273,36 @@ class ArrayPrinter : public PrettyPrinter {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink_) << ",";
- Newline();
+ (*sink_) << ",";
+ Newline();
}
-
- if (!options_.skip_new_lines) {
- Indent();
- }
-
+
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
+
if ((i >= options_.window) && (i < (array.length() - options_.window))) {
- (*sink_) << "...";
- Newline();
+ (*sink_) << "...";
+ Newline();
i = array.length() - options_.window - 1;
skip_comma = true;
} else if (array.IsNull(i)) {
(*sink_) << options_.null_rep;
} else {
- (*sink_) << "keys:";
- Newline();
+ (*sink_) << "keys:";
+ Newline();
auto keys_slice =
array.keys()->Slice(array.value_offset(i), array.value_length(i));
- RETURN_NOT_OK(PrettyPrint(*keys_slice,
- PrettyPrintOptions{indent_, options_.window}, sink_));
- Newline();
+ RETURN_NOT_OK(PrettyPrint(*keys_slice,
+ PrettyPrintOptions{indent_, options_.window}, sink_));
+ Newline();
Indent();
- (*sink_) << "values:";
- Newline();
+ (*sink_) << "values:";
+ Newline();
auto values_slice =
array.items()->Slice(array.value_offset(i), array.value_length(i));
- RETURN_NOT_OK(PrettyPrint(*values_slice,
- PrettyPrintOptions{indent_, options_.window}, sink_));
+ RETURN_NOT_OK(PrettyPrint(*values_slice,
+ PrettyPrintOptions{indent_, options_.window}, sink_));
}
}
(*sink_) << "\n";
@@ -341,7 +341,7 @@ class ArrayPrinter : public PrettyPrinter {
int64_t length) {
for (size_t i = 0; i < fields.size(); ++i) {
Newline();
- Indent();
+ Indent();
std::stringstream ss;
ss << "-- child " << i << " type: " << fields[i]->type()->ToString() << "\n";
Write(ss.str());
@@ -369,14 +369,14 @@ class ArrayPrinter : public PrettyPrinter {
RETURN_NOT_OK(WriteValidityBitmap(array));
Newline();
- Indent();
+ Indent();
Write("-- type_ids: ");
UInt8Array type_codes(array.length(), array.type_codes(), nullptr, 0, array.offset());
RETURN_NOT_OK(PrettyPrint(type_codes, indent_ + options_.indent_size, sink_));
if (array.mode() == UnionMode::DENSE) {
Newline();
- Indent();
+ Indent();
Write("-- value_offsets: ");
Int32Array value_offsets(
array.length(), checked_cast<const DenseUnionArray&>(array).value_offsets(),
@@ -395,13 +395,13 @@ class ArrayPrinter : public PrettyPrinter {
Status Visit(const DictionaryArray& array) {
Newline();
- Indent();
+ Indent();
Write("-- dictionary:\n");
RETURN_NOT_OK(
PrettyPrint(*array.dictionary(), indent_ + options_.indent_size, sink_));
Newline();
- Indent();
+ Indent();
Write("-- indices:\n");
return PrettyPrint(*array.indices(), indent_ + options_.indent_size, sink_);
}
@@ -452,7 +452,7 @@ Status ArrayPrinter::WriteValidityBitmap(const Array& array) {
if (array.null_count() > 0) {
Newline();
- Indent();
+ Indent();
BooleanArray is_valid(array.length(), array.null_bitmap(), nullptr, 0,
array.offset());
return PrettyPrint(is_valid, indent_ + options_.indent_size, sink_);
@@ -492,28 +492,28 @@ Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& op
for (int i = 0; i < indent; ++i) {
(*sink) << " ";
}
- (*sink) << "[";
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ (*sink) << "[";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
bool skip_comma = true;
for (int i = 0; i < num_chunks; ++i) {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink) << ",";
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ (*sink) << ",";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
}
if ((i >= window) && (i < (num_chunks - window))) {
for (int i = 0; i < indent; ++i) {
(*sink) << " ";
}
- (*sink) << "...";
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ (*sink) << "...";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
i = num_chunks - window - 1;
skip_comma = true;
} else {
@@ -523,9 +523,9 @@ Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& op
RETURN_NOT_OK(printer.Print(*chunked_arr.chunk(i)));
}
}
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
for (int i = 0; i < indent; ++i) {
(*sink) << " ";
@@ -605,7 +605,7 @@ class SchemaPrinter : public PrettyPrinter {
void PrintVerboseMetadata(const KeyValueMetadata& metadata) {
for (int64_t i = 0; i < metadata.size(); ++i) {
Newline();
- Indent();
+ Indent();
Write(metadata.key(i) + ": '" + metadata.value(i) + "'");
}
}
@@ -613,7 +613,7 @@ class SchemaPrinter : public PrettyPrinter {
void PrintTruncatedMetadata(const KeyValueMetadata& metadata) {
for (int64_t i = 0; i < metadata.size(); ++i) {
Newline();
- Indent();
+ Indent();
size_t size = metadata.value(i).size();
size_t truncated_size = std::max<size_t>(10, 70 - metadata.key(i).size() - indent_);
if (size <= truncated_size) {
@@ -629,7 +629,7 @@ class SchemaPrinter : public PrettyPrinter {
void PrintMetadata(const std::string& metadata_type, const KeyValueMetadata& metadata) {
if (metadata.size() > 0) {
Newline();
- Indent();
+ Indent();
Write(metadata_type);
if (options_.truncate_metadata) {
PrintTruncatedMetadata(metadata);
@@ -643,7 +643,7 @@ class SchemaPrinter : public PrettyPrinter {
for (int i = 0; i < schema_.num_fields(); ++i) {
if (i > 0) {
Newline();
- Indent();
+ Indent();
} else {
Indent();
}
@@ -668,7 +668,7 @@ Status SchemaPrinter::PrintType(const DataType& type, bool nullable) {
}
for (int i = 0; i < type.num_fields(); ++i) {
Newline();
- Indent();
+ Indent();
std::stringstream ss;
ss << "child " << i << ", ";
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
index 1bc086a6889..d85684cf460 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
@@ -19,7 +19,7 @@
#include <iosfwd>
#include <string>
-#include <utility>
+#include <utility>
#include "arrow/util/visibility.h"
@@ -35,14 +35,14 @@ class Table;
struct PrettyPrintOptions {
PrettyPrintOptions() = default;
- PrettyPrintOptions(int indent_arg, // NOLINT runtime/explicit
- int window_arg = 10, int indent_size_arg = 2,
+ PrettyPrintOptions(int indent_arg, // NOLINT runtime/explicit
+ int window_arg = 10, int indent_size_arg = 2,
std::string null_rep_arg = "null", bool skip_new_lines_arg = false,
bool truncate_metadata_arg = true)
: indent(indent_arg),
indent_size(indent_size_arg),
window(window_arg),
- null_rep(std::move(null_rep_arg)),
+ null_rep(std::move(null_rep_arg)),
skip_new_lines(skip_new_lines_arg),
truncate_metadata(truncate_metadata_arg) {}
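
The options plumbed through the printer changes above are set as follows; a sketch using the constructor defined here and the PrettyPrint(Array, PrettyPrintOptions, ostream*) overload this diff calls recursively:

    #include <iostream>

    #include "arrow/array.h"
    #include "arrow/pretty_print.h"
    #include "arrow/status.h"

    arrow::Status DumpArray(const arrow::Array& arr) {
      // indent 0; elide the middle once more than 2*5 values are present
      arrow::PrettyPrintOptions options(/*indent_arg=*/0, /*window_arg=*/5);
      options.null_rep = "NA";         // rendering for null slots
      options.skip_new_lines = false;  // the branch exercised by Newline()/Indent()
      return arrow::PrettyPrint(arr, options, &std::cout);
    }
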
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
index 66f9e932b58..21703f3cf24 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
@@ -69,14 +69,14 @@ class SimpleRecordBatch : public RecordBatch {
boxed_columns_.resize(schema_->num_fields());
}
- const std::vector<std::shared_ptr<Array>>& columns() const override {
- for (int i = 0; i < num_columns(); ++i) {
- // Force all columns to be boxed
- column(i);
- }
- return boxed_columns_;
- }
-
+ const std::vector<std::shared_ptr<Array>>& columns() const override {
+ for (int i = 0; i < num_columns(); ++i) {
+ // Force all columns to be boxed
+ column(i);
+ }
+ return boxed_columns_;
+ }
+
std::shared_ptr<Array> column(int i) const override {
std::shared_ptr<Array> result = internal::atomic_load(&boxed_columns_[i]);
if (!result) {
@@ -88,7 +88,7 @@ class SimpleRecordBatch : public RecordBatch {
std::shared_ptr<ArrayData> column_data(int i) const override { return columns_[i]; }
- const ArrayDataVector& column_data() const override { return columns_; }
+ const ArrayDataVector& column_data() const override { return columns_; }
Result<std::shared_ptr<RecordBatch>> AddColumn(
int i, const std::shared_ptr<Field>& field,
@@ -97,9 +97,9 @@ class SimpleRecordBatch : public RecordBatch {
ARROW_CHECK(column != nullptr);
if (!field->type()->Equals(column->type())) {
- return Status::TypeError("Column data type ", field->type()->name(),
- " does not match field data type ",
- column->type()->name());
+ return Status::TypeError("Column data type ", field->type()->name(),
+ " does not match field data type ",
+ column->type()->name());
}
if (column->length() != num_rows_) {
return Status::Invalid(
@@ -108,42 +108,42 @@ class SimpleRecordBatch : public RecordBatch {
}
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->AddField(i, field));
- return RecordBatch::Make(std::move(new_schema), num_rows_,
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
internal::AddVectorElement(columns_, i, column->data()));
}
- Result<std::shared_ptr<RecordBatch>> SetColumn(
- int i, const std::shared_ptr<Field>& field,
- const std::shared_ptr<Array>& column) const override {
- ARROW_CHECK(field != nullptr);
- ARROW_CHECK(column != nullptr);
-
- if (!field->type()->Equals(column->type())) {
- return Status::TypeError("Column data type ", field->type()->name(),
- " does not match field data type ",
- column->type()->name());
- }
- if (column->length() != num_rows_) {
- return Status::Invalid(
- "Added column's length must match record batch's length. Expected length ",
- num_rows_, " but got length ", column->length());
- }
-
- ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field));
- return RecordBatch::Make(std::move(new_schema), num_rows_,
- internal::ReplaceVectorElement(columns_, i, column->data()));
- }
-
+ Result<std::shared_ptr<RecordBatch>> SetColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const override {
+ ARROW_CHECK(field != nullptr);
+ ARROW_CHECK(column != nullptr);
+
+ if (!field->type()->Equals(column->type())) {
+ return Status::TypeError("Column data type ", field->type()->name(),
+ " does not match field data type ",
+ column->type()->name());
+ }
+ if (column->length() != num_rows_) {
+ return Status::Invalid(
+ "Added column's length must match record batch's length. Expected length ",
+ num_rows_, " but got length ", column->length());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field));
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
+ internal::ReplaceVectorElement(columns_, i, column->data()));
+ }
+
Result<std::shared_ptr<RecordBatch>> RemoveColumn(int i) const override {
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->RemoveField(i));
- return RecordBatch::Make(std::move(new_schema), num_rows_,
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
internal::DeleteVectorElement(columns_, i));
}
std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
const std::shared_ptr<const KeyValueMetadata>& metadata) const override {
auto new_schema = schema_->WithMetadata(metadata);
- return RecordBatch::Make(std::move(new_schema), num_rows_, columns_);
+ return RecordBatch::Make(std::move(new_schema), num_rows_, columns_);
}
std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const override {
@@ -191,8 +191,8 @@ std::shared_ptr<RecordBatch> RecordBatch::Make(
Result<std::shared_ptr<RecordBatch>> RecordBatch::FromStructArray(
const std::shared_ptr<Array>& array) {
if (array->type_id() != Type::STRUCT) {
- return Status::TypeError("Cannot construct record batch from array of type ",
- *array->type());
+ return Status::TypeError("Cannot construct record batch from array of type ",
+ *array->type());
}
if (array->null_count() != 0) {
return Status::Invalid(
@@ -251,27 +251,27 @@ bool RecordBatch::ApproxEquals(const RecordBatch& other) const {
return true;
}
-Result<std::shared_ptr<RecordBatch>> RecordBatch::SelectColumns(
- const std::vector<int>& indices) const {
- int n = static_cast<int>(indices.size());
-
- FieldVector fields(n);
- ArrayVector columns(n);
-
- for (int i = 0; i < n; i++) {
- int pos = indices[i];
- if (pos < 0 || pos > num_columns() - 1) {
- return Status::Invalid("Invalid column index ", pos, " to select columns.");
- }
- fields[i] = schema()->field(pos);
- columns[i] = column(pos);
- }
-
- auto new_schema =
- std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
- return RecordBatch::Make(std::move(new_schema), num_rows(), std::move(columns));
-}
-
+Result<std::shared_ptr<RecordBatch>> RecordBatch::SelectColumns(
+ const std::vector<int>& indices) const {
+ int n = static_cast<int>(indices.size());
+
+ FieldVector fields(n);
+ ArrayVector columns(n);
+
+ for (int i = 0; i < n; i++) {
+ int pos = indices[i];
+ if (pos < 0 || pos > num_columns() - 1) {
+ return Status::Invalid("Invalid column index ", pos, " to select columns.");
+ }
+ fields[i] = schema()->field(pos);
+ columns[i] = column(pos);
+ }
+
+ auto new_schema =
+ std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
+ return RecordBatch::Make(std::move(new_schema), num_rows(), std::move(columns));
+}
+
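
SelectColumns() and the restored SetColumn() compose naturally, since each returns a fresh batch and leaves the original untouched. A sketch, where field and column are caller-supplied and must match in type and length, exactly as the checks above enforce:

    #include <memory>

    #include "arrow/record_batch.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status ProjectAndPatch(const std::shared_ptr<arrow::RecordBatch>& batch,
                                  const std::shared_ptr<arrow::Field>& field,
                                  const std::shared_ptr<arrow::Array>& column) {
      // Keep columns 0 and 2; out-of-range indices yield Status::Invalid.
      ARROW_ASSIGN_OR_RAISE(auto projected, batch->SelectColumns({0, 2}));
      // Replace column 0; mismatched type or length is rejected above.
      ARROW_ASSIGN_OR_RAISE(auto patched, projected->SetColumn(0, field, column));
      return patched->ValidateFull();
    }
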
std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset) const {
return Slice(offset, this->num_rows() - offset);
}
@@ -304,7 +304,7 @@ Status RecordBatch::ValidateFull() const {
RETURN_NOT_OK(Validate());
for (int i = 0; i < num_columns(); ++i) {
const auto& array = *this->column(i);
- RETURN_NOT_OK(internal::ValidateArrayFull(array));
+ RETURN_NOT_OK(internal::ValidateArrayFull(array));
}
return Status::OK();
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
index 3dc1f54a083..735d4f6f06b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
@@ -87,10 +87,10 @@ class ARROW_EXPORT RecordBatch {
// \return the table's schema
/// \return true if batches are equal
- const std::shared_ptr<Schema>& schema() const { return schema_; }
+ const std::shared_ptr<Schema>& schema() const { return schema_; }
/// \brief Retrieve all columns at once
- virtual const std::vector<std::shared_ptr<Array>>& columns() const = 0;
+ virtual const std::vector<std::shared_ptr<Array>>& columns() const = 0;
/// \brief Retrieve an array from the record batch
/// \param[in] i field index, does not boundscheck
@@ -108,7 +108,7 @@ class ARROW_EXPORT RecordBatch {
virtual std::shared_ptr<ArrayData> column_data(int i) const = 0;
/// \brief Retrieve all arrays' internal data from the record batch.
- virtual const ArrayDataVector& column_data() const = 0;
+ virtual const ArrayDataVector& column_data() const = 0;
/// \brief Add column to the record batch, producing a new RecordBatch
///
@@ -130,11 +130,11 @@ class ARROW_EXPORT RecordBatch {
virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
int i, std::string field_name, const std::shared_ptr<Array>& column) const;
- /// \brief Replace a column in the table, producing a new Table
- virtual Result<std::shared_ptr<RecordBatch>> SetColumn(
- int i, const std::shared_ptr<Field>& field,
- const std::shared_ptr<Array>& column) const = 0;
-
+ /// \brief Replace a column in the table, producing a new Table
+ virtual Result<std::shared_ptr<RecordBatch>> SetColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const = 0;
+
/// \brief Remove column from the record batch, producing a new RecordBatch
///
/// \param[in] i field index, does boundscheck
@@ -166,10 +166,10 @@ class ARROW_EXPORT RecordBatch {
/// \return PrettyPrint representation suitable for debugging
std::string ToString() const;
- /// \brief Return new record batch with specified columns
- Result<std::shared_ptr<RecordBatch>> SelectColumns(
- const std::vector<int>& indices) const;
-
+ /// \brief Return new record batch with specified columns
+ Result<std::shared_ptr<RecordBatch>> SelectColumns(
+ const std::vector<int>& indices) const;
+
/// \brief Perform cheap validation checks to determine obvious inconsistencies
/// within the record batch's schema and internal data.
///
@@ -199,8 +199,8 @@ class ARROW_EXPORT RecordBatch {
/// \brief Abstract interface for reading stream of record batches
class ARROW_EXPORT RecordBatchReader {
public:
- using ValueType = std::shared_ptr<RecordBatch>;
-
+ using ValueType = std::shared_ptr<RecordBatch>;
+
virtual ~RecordBatchReader() = default;
/// \return the shared schema of the record batches in the stream
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/result.h b/contrib/libs/apache/arrow/cpp/src/arrow/result.h
index cb7437cd242..21483c89533 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/result.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/result.h
@@ -18,7 +18,7 @@
#pragma once
-#include <cstddef>
+#include <cstddef>
#include <new>
#include <string>
#include <type_traits>
@@ -29,9 +29,9 @@
namespace arrow {
-template <typename>
-struct EnsureResult;
-
+template <typename>
+struct EnsureResult;
+
namespace internal {
#if __cplusplus >= 201703L
@@ -317,7 +317,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
return ValueUnsafe();
}
const T& operator*() const& { return ValueOrDie(); }
- const T* operator->() const { return &ValueOrDie(); }
+ const T* operator->() const { return &ValueOrDie(); }
/// Gets a mutable reference to the stored `T` value.
///
@@ -332,7 +332,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
return ValueUnsafe();
}
T& operator*() & { return ValueOrDie(); }
- T* operator->() { return &ValueOrDie(); }
+ T* operator->() { return &ValueOrDie(); }
/// Moves and returns the internally-stored `T` value.
///
@@ -385,7 +385,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
/// Apply a function to the internally stored value to produce a new result or propagate
/// the stored error.
template <typename M>
- typename EnsureResult<typename std::result_of<M && (T)>::type>::type Map(M&& m) && {
+ typename EnsureResult<typename std::result_of<M && (T)>::type>::type Map(M&& m) && {
if (!ok()) {
return status();
}
@@ -395,36 +395,36 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
/// Apply a function to the internally stored value to produce a new result or propagate
/// the stored error.
template <typename M>
- typename EnsureResult<typename std::result_of<M && (const T&)>::type>::type Map(
- M&& m) const& {
+ typename EnsureResult<typename std::result_of<M && (const T&)>::type>::type Map(
+ M&& m) const& {
if (!ok()) {
return status();
}
return std::forward<M>(m)(ValueUnsafe());
}
- /// Cast the internally stored value to produce a new result or propagate the stored
- /// error.
- template <typename U, typename E = typename std::enable_if<
- std::is_constructible<U, T>::value>::type>
- Result<U> As() && {
- if (!ok()) {
- return status();
- }
- return U(MoveValueUnsafe());
- }
-
- /// Cast the internally stored value to produce a new result or propagate the stored
- /// error.
- template <typename U, typename E = typename std::enable_if<
- std::is_constructible<U, const T&>::value>::type>
- Result<U> As() const& {
- if (!ok()) {
- return status();
- }
- return U(ValueUnsafe());
- }
-
+ /// Cast the internally stored value to produce a new result or propagate the stored
+ /// error.
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<U, T>::value>::type>
+ Result<U> As() && {
+ if (!ok()) {
+ return status();
+ }
+ return U(MoveValueUnsafe());
+ }
+
+ /// Cast the internally stored value to produce a new result or propagate the stored
+ /// error.
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<U, const T&>::value>::type>
+ Result<U> As() const& {
+ if (!ok()) {
+ return status();
+ }
+ return U(ValueUnsafe());
+ }
+
const T& ValueUnsafe() const& {
return *internal::launder(reinterpret_cast<const T*>(&data_));
}
@@ -448,16 +448,16 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
void Destroy() {
if (ARROW_PREDICT_TRUE(status_.ok())) {
- static_assert(offsetof(Result<T>, status_) == 0,
- "Status is guaranteed to be at the start of Result<>");
+ static_assert(offsetof(Result<T>, status_) == 0,
+ "Status is guaranteed to be at the start of Result<>");
internal::launder(reinterpret_cast<const T*>(&data_))->~T();
}
}
};
-#define ARROW_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
- auto&& result_name = (rexpr); \
- ARROW_RETURN_IF_(!(result_name).ok(), (result_name).status(), ARROW_STRINGIFY(rexpr)); \
+#define ARROW_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
+ auto&& result_name = (rexpr); \
+ ARROW_RETURN_IF_(!(result_name).ok(), (result_name).status(), ARROW_STRINGIFY(rexpr)); \
lhs = std::move(result_name).ValueUnsafe();
#define ARROW_ASSIGN_OR_RAISE_NAME(x, y) ARROW_CONCAT(x, y)
@@ -475,14 +475,14 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
/// WARNING: ARROW_ASSIGN_OR_RAISE expands into multiple statements;
/// it cannot be used in a single statement (e.g. as the body of an if
/// statement without {})!
-///
-/// WARNING: ARROW_ASSIGN_OR_RAISE `std::move`s its right operand. If you have
-/// an lvalue Result which you *don't* want to move out of cast appropriately.
-///
-/// WARNING: ARROW_ASSIGN_OR_RAISE is not a single expression; it will not
-/// maintain lifetimes of all temporaries in `rexpr` (e.g.
-/// `ARROW_ASSIGN_OR_RAISE(auto x, MakeTemp().GetResultRef());`
-/// will most likely segfault)!
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE `std::move`s its right operand. If you have
+/// an lvalue Result which you *don't* want to move out of, cast appropriately.
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE is not a single expression; it will not
+/// maintain lifetimes of all temporaries in `rexpr` (e.g.
+/// `ARROW_ASSIGN_OR_RAISE(auto x, MakeTemp().GetResultRef());`
+/// will most likely segfault)!
#define ARROW_ASSIGN_OR_RAISE(lhs, rexpr) \
ARROW_ASSIGN_OR_RAISE_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
lhs, rexpr);
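
A sketch of the macro in action, with the caveats from the warnings above; ParsePort is a hypothetical helper introduced only for the illustration:

    #include <string>

    #include "arrow/result.h"
    #include "arrow/status.h"

    // Hypothetical parser used purely to illustrate the macro.
    arrow::Result<int> ParsePort(const std::string& s);

    arrow::Status Connect(const std::string& spec) {
      // Expands to multiple statements (so it needs braces around it in an
      // if-body), moves from its right operand, and returns early with the
      // extracted Status on failure.
      ARROW_ASSIGN_OR_RAISE(int port, ParsePort(spec));
      if (port <= 0) return arrow::Status::Invalid("bad port ", port);
      return arrow::Status::OK();
    }
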
@@ -490,7 +490,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
namespace internal {
template <typename T>
-inline const Status& GenericToStatus(const Result<T>& res) {
+inline const Status& GenericToStatus(const Result<T>& res) {
return res.status();
}
@@ -501,19 +501,19 @@ inline Status GenericToStatus(Result<T>&& res) {
} // namespace internal
-template <typename T, typename R = typename EnsureResult<T>::type>
-R ToResult(T t) {
- return R(std::move(t));
+template <typename T, typename R = typename EnsureResult<T>::type>
+R ToResult(T t) {
+ return R(std::move(t));
}
-template <typename T>
-struct EnsureResult {
- using type = Result<T>;
-};
-
-template <typename T>
-struct EnsureResult<Result<T>> {
- using type = Result<T>;
-};
-
+template <typename T>
+struct EnsureResult {
+ using type = Result<T>;
+};
+
+template <typename T>
+struct EnsureResult<Result<T>> {
+ using type = Result<T>;
+};
+
} // namespace arrow
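
The combinators restored above compose as follows; a sketch using only Map(), As<U>(), and ToResult() as declared in this header:

    #include <string>
    #include <utility>

    #include "arrow/result.h"

    // Map(): transform the stored value, or forward the stored error.
    arrow::Result<std::string> Describe(arrow::Result<int> r) {
      return std::move(r).Map([](int v) { return std::to_string(v); });
    }

    // As<U>(): available whenever U is constructible from T.
    arrow::Result<double> Widen(arrow::Result<int> r) {
      return std::move(r).As<double>();
    }

    // ToResult(): wrap a bare value; a Result input would pass through
    // unchanged thanks to the EnsureResult<Result<T>> specialization.
    arrow::Result<int> TheAnswer() { return arrow::ToResult(42); }
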
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
index cb7755ba3f1..4f9d94a0a38 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
@@ -18,7 +18,7 @@
#include "arrow/scalar.h"
#include <memory>
-#include <sstream>
+#include <sstream>
#include <string>
#include <utility>
@@ -45,10 +45,10 @@ bool Scalar::Equals(const Scalar& other, const EqualOptions& options) const {
return ScalarEquals(*this, other, options);
}
-bool Scalar::ApproxEquals(const Scalar& other, const EqualOptions& options) const {
- return ScalarApproxEquals(*this, other, options);
-}
-
+bool Scalar::ApproxEquals(const Scalar& other, const EqualOptions& options) const {
+ return ScalarApproxEquals(*this, other, options);
+}
+
struct ScalarHashImpl {
static std::hash<std::string> string_hash;
@@ -74,14 +74,14 @@ struct ScalarHashImpl {
return StdHash(s.value.low_bits()) & StdHash(s.value.high_bits());
}
- Status Visit(const Decimal256Scalar& s) {
- Status status = Status::OK();
- for (uint64_t elem : s.value.little_endian_array()) {
- status &= StdHash(elem);
- }
- return status;
- }
-
+ Status Visit(const Decimal256Scalar& s) {
+ Status status = Status::OK();
+ for (uint64_t elem : s.value.little_endian_array()) {
+ status &= StdHash(elem);
+ }
+ return status;
+ }
+
Status Visit(const BaseListScalar& s) { return ArrayHash(*s.value); }
Status Visit(const StructScalar& s) {
@@ -91,11 +91,11 @@ struct ScalarHashImpl {
return Status::OK();
}
- Status Visit(const DictionaryScalar& s) {
- AccumulateHashFrom(*s.value.index);
- return Status::OK();
- }
-
+ Status Visit(const DictionaryScalar& s) {
+ AccumulateHashFrom(*s.value.index);
+ return Status::OK();
+ }
+
// TODO(bkietz) implement less wimpy hashing when these have ValueType
Status Visit(const UnionScalar& s) { return Status::OK(); }
Status Visit(const ExtensionScalar& s) { return Status::OK(); }
@@ -132,21 +132,21 @@ struct ScalarHashImpl {
return Status::OK();
}
- explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) {
- if (scalar.is_valid) {
- AccumulateHashFrom(scalar);
- }
- }
+ explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) {
+ if (scalar.is_valid) {
+ AccumulateHashFrom(scalar);
+ }
+ }
void AccumulateHashFrom(const Scalar& scalar) {
DCHECK_OK(StdHash(scalar.type->fingerprint()));
DCHECK_OK(VisitScalarInline(scalar, this));
}
- size_t hash_;
+ size_t hash_;
};
-size_t Scalar::hash() const { return ScalarHashImpl(*this).hash_; }
+size_t Scalar::hash() const { return ScalarHashImpl(*this).hash_; }
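
Scalar::hash() and the Scalar::Hash functor (declared in scalar.h, also touched by this commit) make scalars usable as keys in unordered containers. A sketch; the equality functor is a hypothetical helper wrapping Equals():

    #include <memory>
    #include <unordered_set>

    #include "arrow/scalar.h"

    // Hypothetical equality functor pairing with arrow::Scalar::Hash.
    struct ScalarPtrEqual {
      bool operator()(const std::shared_ptr<arrow::Scalar>& a,
                      const std::shared_ptr<arrow::Scalar>& b) const {
        return a->Equals(*b);
      }
    };

    using ScalarSet = std::unordered_set<std::shared_ptr<arrow::Scalar>,
                                         arrow::Scalar::Hash, ScalarPtrEqual>;
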
StringScalar::StringScalar(std::string s)
: StringScalar(Buffer::FromString(std::move(s))) {}
@@ -193,20 +193,20 @@ FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr<Array> value)
: BaseListScalar(
value, fixed_size_list(value->type(), static_cast<int32_t>(value->length()))) {}
-Result<std::shared_ptr<StructScalar>> StructScalar::Make(
- ScalarVector values, std::vector<std::string> field_names) {
- if (values.size() != field_names.size()) {
- return Status::Invalid("Mismatching number of field names and child scalars");
- }
-
- FieldVector fields(field_names.size());
- for (size_t i = 0; i < fields.size(); ++i) {
- fields[i] = arrow::field(std::move(field_names[i]), values[i]->type);
- }
-
- return std::make_shared<StructScalar>(std::move(values), struct_(std::move(fields)));
-}
-
+Result<std::shared_ptr<StructScalar>> StructScalar::Make(
+ ScalarVector values, std::vector<std::string> field_names) {
+ if (values.size() != field_names.size()) {
+ return Status::Invalid("Mismatching number of field names and child scalars");
+ }
+
+ FieldVector fields(field_names.size());
+ for (size_t i = 0; i < fields.size(); ++i) {
+ fields[i] = arrow::field(std::move(field_names[i]), values[i]->type);
+ }
+
+ return std::make_shared<StructScalar>(std::move(values), struct_(std::move(fields)));
+}
+
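
StructScalar::Make pairs each child scalar with a field name, failing with Invalid on a count mismatch (checked above); a sketch:

    #include <memory>

    #include "arrow/result.h"
    #include "arrow/scalar.h"

    arrow::Result<std::shared_ptr<arrow::StructScalar>> MakePoint() {
      arrow::ScalarVector values = {std::make_shared<arrow::Int32Scalar>(3),
                                    std::make_shared<arrow::Int32Scalar>(4)};
      // Field types are taken from the child scalars; names from the list.
      return arrow::StructScalar::Make(std::move(values), {"x", "y"});
    }
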
Result<std::shared_ptr<Scalar>> StructScalar::field(FieldRef ref) const {
ARROW_ASSIGN_OR_RAISE(auto path, ref.FindOne(*type));
if (path.indices().size() != 1) {
@@ -277,13 +277,13 @@ Result<std::shared_ptr<Scalar>> DictionaryScalar::GetEncodedValue() const {
return value.dictionary->GetScalar(index_value);
}
-std::shared_ptr<DictionaryScalar> DictionaryScalar::Make(std::shared_ptr<Scalar> index,
- std::shared_ptr<Array> dict) {
- auto type = dictionary(index->type, dict->type());
- return std::make_shared<DictionaryScalar>(ValueType{std::move(index), std::move(dict)},
- std::move(type));
-}
-
+std::shared_ptr<DictionaryScalar> DictionaryScalar::Make(std::shared_ptr<Scalar> index,
+ std::shared_ptr<Array> dict) {
+ auto type = dictionary(index->type, dict->type());
+ return std::make_shared<DictionaryScalar>(ValueType{std::move(index), std::move(dict)},
+ std::move(type));
+}
+
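
DictionaryScalar::Make derives the dictionary(...) type from its two arguments, and GetEncodedValue() (above) resolves the index back into the dictionary; a sketch:

    #include <memory>

    #include "arrow/array.h"
    #include "arrow/scalar.h"

    std::shared_ptr<arrow::DictionaryScalar> FirstEntryScalar(
        const std::shared_ptr<arrow::Array>& dictionary) {
      auto index = std::make_shared<arrow::Int32Scalar>(0);  // entry 0
      return arrow::DictionaryScalar::Make(std::move(index), dictionary);
    }
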
template <typename T>
using scalar_constructor_has_arrow_type =
std::is_constructible<typename TypeTraits<T>::ScalarType, std::shared_ptr<DataType>>;
@@ -551,31 +551,31 @@ Status CastImpl(const ScalarType& from, StringScalar* to) {
return Status::OK();
}
-Status CastImpl(const Decimal128Scalar& from, StringScalar* to) {
- auto from_type = checked_cast<const Decimal128Type*>(from.type.get());
- to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
- return Status::OK();
-}
-
-Status CastImpl(const Decimal256Scalar& from, StringScalar* to) {
- auto from_type = checked_cast<const Decimal256Type*>(from.type.get());
- to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
- return Status::OK();
-}
-
-Status CastImpl(const StructScalar& from, StringScalar* to) {
- std::stringstream ss;
- ss << '{';
- for (int i = 0; static_cast<size_t>(i) < from.value.size(); i++) {
- if (i > 0) ss << ", ";
- ss << from.type->field(i)->name() << ':' << from.type->field(i)->type()->ToString()
- << " = " << from.value[i]->ToString();
- }
- ss << '}';
- to->value = Buffer::FromString(ss.str());
- return Status::OK();
-}
-
+Status CastImpl(const Decimal128Scalar& from, StringScalar* to) {
+ auto from_type = checked_cast<const Decimal128Type*>(from.type.get());
+ to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
+ return Status::OK();
+}
+
+Status CastImpl(const Decimal256Scalar& from, StringScalar* to) {
+ auto from_type = checked_cast<const Decimal256Type*>(from.type.get());
+ to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
+ return Status::OK();
+}
+
+Status CastImpl(const StructScalar& from, StringScalar* to) {
+ std::stringstream ss;
+ ss << '{';
+ for (int i = 0; static_cast<size_t>(i) < from.value.size(); i++) {
+ if (i > 0) ss << ", ";
+ ss << from.type->field(i)->name() << ':' << from.type->field(i)->type()->ToString()
+ << " = " << from.value[i]->ToString();
+ }
+ ss << '}';
+ to->value = Buffer::FromString(ss.str());
+ return Status::OK();
+}
+
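
These CastImpl overloads are what give decimal and struct scalars a utf8 rendering. A sketch through the generic Scalar::CastTo entry point, which is assumed here from this Arrow version's scalar.h rather than shown in this hunk:

    #include <memory>
    #include <string>

    #include "arrow/result.h"
    #include "arrow/scalar.h"
    #include "arrow/type.h"

    arrow::Result<std::string> RenderAsUtf8(const arrow::Scalar& s) {
      // Decimal128/Decimal256 render via ToString(scale); structs as
      // "{name:type = value, ...}", per the overloads above.
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Scalar> str,
                            s.CastTo(arrow::utf8()));
      return str->ToString();
    }
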
struct CastImplVisitor {
Status NotImplemented() {
return Status::NotImplemented("cast to ", *to_type_, " from ", *from_.type);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
index 24744859686..1d5e2c93ff4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
@@ -65,19 +65,19 @@ struct ARROW_EXPORT Scalar : public util::EqualityComparable<Scalar> {
bool Equals(const Scalar& other,
const EqualOptions& options = EqualOptions::Defaults()) const;
- bool ApproxEquals(const Scalar& other,
- const EqualOptions& options = EqualOptions::Defaults()) const;
-
+ bool ApproxEquals(const Scalar& other,
+ const EqualOptions& options = EqualOptions::Defaults()) const;
+
struct ARROW_EXPORT Hash {
- size_t operator()(const Scalar& scalar) const { return scalar.hash(); }
+ size_t operator()(const Scalar& scalar) const { return scalar.hash(); }
size_t operator()(const std::shared_ptr<Scalar>& scalar) const {
- return scalar->hash();
+ return scalar->hash();
}
};
- size_t hash() const;
-
+ size_t hash() const;
+
std::string ToString() const;
static Result<std::shared_ptr<Scalar>> Parse(const std::shared_ptr<DataType>& type,
@@ -350,17 +350,17 @@ struct ARROW_EXPORT Decimal128Scalar : public Scalar {
Decimal128 value;
};
-struct ARROW_EXPORT Decimal256Scalar : public Scalar {
- using Scalar::Scalar;
- using TypeClass = Decimal256Type;
- using ValueType = Decimal256;
-
- Decimal256Scalar(Decimal256 value, std::shared_ptr<DataType> type)
- : Scalar(std::move(type), true), value(value) {}
-
- Decimal256 value;
-};
-
+struct ARROW_EXPORT Decimal256Scalar : public Scalar {
+ using Scalar::Scalar;
+ using TypeClass = Decimal256Type;
+ using ValueType = Decimal256;
+
+ Decimal256Scalar(Decimal256 value, std::shared_ptr<DataType> type)
+ : Scalar(std::move(type), true), value(value) {}
+
+ Decimal256 value;
+};
+
struct ARROW_EXPORT BaseListScalar : public Scalar {
using Scalar::Scalar;
using ValueType = std::shared_ptr<Array>;
@@ -411,9 +411,9 @@ struct ARROW_EXPORT StructScalar : public Scalar {
StructScalar(ValueType value, std::shared_ptr<DataType> type)
: Scalar(std::move(type), true), value(std::move(value)) {}
- static Result<std::shared_ptr<StructScalar>> Make(ValueType value,
- std::vector<std::string> field_names);
-
+ static Result<std::shared_ptr<StructScalar>> Make(ValueType value,
+ std::vector<std::string> field_names);
+
explicit StructScalar(std::shared_ptr<DataType> type) : Scalar(std::move(type)) {}
};
@@ -448,9 +448,9 @@ struct ARROW_EXPORT DictionaryScalar : public Scalar {
DictionaryScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true)
: Scalar(std::move(type), is_valid), value(std::move(value)) {}
- static std::shared_ptr<DictionaryScalar> Make(std::shared_ptr<Scalar> index,
- std::shared_ptr<Array> dict);
-
+ static std::shared_ptr<DictionaryScalar> Make(std::shared_ptr<Scalar> index,
+ std::shared_ptr<Array> dict);
+
Result<std::shared_ptr<Scalar>> GetEncodedValue() const;
};
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/status.cc b/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
index 0f02cb57a23..d6399f8bfce 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
@@ -68,9 +68,9 @@ std::string Status::CodeAsString(StatusCode code) {
case StatusCode::Invalid:
type = "Invalid";
break;
- case StatusCode::Cancelled:
- type = "Cancelled";
- break;
+ case StatusCode::Cancelled:
+ type = "Cancelled";
+ break;
case StatusCode::IOError:
type = "IOError";
break;
@@ -135,7 +135,7 @@ void Status::Abort(const std::string& message) const {
void Status::AddContextLine(const char* filename, int line, const char* expr) {
ARROW_CHECK(!ok()) << "Cannot add context line to ok status";
std::stringstream ss;
- ss << "\n" << filename << ":" << line << " " << expr;
+ ss << "\n" << filename << ":" << line << " " << expr;
state_->msg += ss.str();
}
#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/status.h b/contrib/libs/apache/arrow/cpp/src/arrow/status.h
index 056d60d6f32..9fbc840a541 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/status.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/status.h
@@ -83,7 +83,7 @@ enum class StatusCode : char {
IOError = 5,
CapacityError = 6,
IndexError = 7,
- Cancelled = 8,
+ Cancelled = 8,
UnknownError = 9,
NotImplemented = 10,
SerializationError = 11,
@@ -205,12 +205,12 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<
return Status::FromArgs(StatusCode::Invalid, std::forward<Args>(args)...);
}
- /// Return an error status for cancelled operation
- template <typename... Args>
- static Status Cancelled(Args&&... args) {
- return Status::FromArgs(StatusCode::Cancelled, std::forward<Args>(args)...);
- }
-
+  /// Return an error status for a cancelled operation
+ template <typename... Args>
+ static Status Cancelled(Args&&... args) {
+ return Status::FromArgs(StatusCode::Cancelled, std::forward<Args>(args)...);
+ }
+
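
The new status code in use; a sketch showing construction with the variadic message builder and the matching predicate:

    #include "arrow/status.h"

    arrow::Status RunStep(bool stop_requested, int step) {
      if (stop_requested) {
        return arrow::Status::Cancelled("step ", step, " interrupted by user");
      }
      return arrow::Status::OK();
    }

    bool WasCancelled(const arrow::Status& st) { return st.IsCancelled(); }
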
/// Return an error status when an index is out of bounds
template <typename... Args>
static Status IndexError(Args&&... args) {
@@ -270,8 +270,8 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<
bool IsKeyError() const { return code() == StatusCode::KeyError; }
/// Return true iff the status indicates invalid data.
bool IsInvalid() const { return code() == StatusCode::Invalid; }
- /// Return true iff the status indicates a cancelled operation.
- bool IsCancelled() const { return code() == StatusCode::Cancelled; }
+ /// Return true iff the status indicates a cancelled operation.
+ bool IsCancelled() const { return code() == StatusCode::Cancelled; }
/// Return true iff the status indicates an IO-related failure.
bool IsIOError() const { return code() == StatusCode::IOError; }
/// Return true iff the status indicates a container reaching capacity limits.
@@ -312,10 +312,10 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<
StatusCode code() const { return ok() ? StatusCode::OK : state_->code; }
/// \brief Return the specific error message attached to this status.
- const std::string& message() const {
- static const std::string no_message = "";
- return ok() ? no_message : state_->msg;
- }
+ const std::string& message() const {
+ static const std::string no_message = "";
+ return ok() ? no_message : state_->msg;
+ }
/// \brief Return the status detail attached to this message.
const std::shared_ptr<StatusDetail>& detail() const {
@@ -443,7 +443,7 @@ namespace internal {
// Extract Status from Status or Result<T>
// Useful for the status check macros such as RETURN_NOT_OK.
-inline const Status& GenericToStatus(const Status& st) { return st; }
+inline const Status& GenericToStatus(const Status& st) { return st; }
inline Status GenericToStatus(Status&& st) { return std::move(st); }
} // namespace internal
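
[Illustrative sketch, not part of the diff: how the Cancelled factory and IsCancelled predicate restored above are typically used together. RunStep, WasCancelled, and the message arguments are hypothetical names for illustration.]

#include "arrow/status.h"

arrow::Status RunStep(bool cancel_requested) {
  if (cancel_requested) {
    // Variadic args are stringified and concatenated into the error message.
    return arrow::Status::Cancelled("aborted after ", 42, " rows");
  }
  return arrow::Status::OK();
}

bool WasCancelled(const arrow::Status& st) { return st.IsCancelled(); }
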
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h b/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
index 6225a89aae4..c996923ca67 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
@@ -1,146 +1,146 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstddef>
-#include <iterator>
-#include <utility>
-
-#include "arrow/type_fwd.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/optional.h"
-
-namespace arrow {
-namespace stl {
-
-namespace detail {
-
-template <typename ArrayType>
-struct DefaultValueAccessor {
- using ValueType = decltype(std::declval<ArrayType>().GetView(0));
-
- ValueType operator()(const ArrayType& array, int64_t index) {
- return array.GetView(index);
- }
-};
-
-} // namespace detail
-
-template <typename ArrayType,
- typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
-class ArrayIterator {
- public:
- using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
- using difference_type = int64_t;
- using pointer = value_type*;
- using reference = value_type&;
- using iterator_category = std::random_access_iterator_tag;
-
- // Some algorithms need to default-construct an iterator
- ArrayIterator() : array_(NULLPTR), index_(0) {}
-
- explicit ArrayIterator(const ArrayType& array, int64_t index = 0)
- : array_(&array), index_(index) {}
-
- // Value access
- value_type operator*() const {
- return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
- }
-
- value_type operator[](difference_type n) const {
- return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
- }
-
- int64_t index() const { return index_; }
-
- // Forward / backward
- ArrayIterator& operator++() {
- ++index_;
- return *this;
- }
- ArrayIterator& operator--() {
- --index_;
- return *this;
- }
- ArrayIterator operator++(int) {
- ArrayIterator tmp(*this);
- ++index_;
- return tmp;
- }
- ArrayIterator operator--(int) {
- ArrayIterator tmp(*this);
- --index_;
- return tmp;
- }
-
- // Arithmetic
- difference_type operator-(const ArrayIterator& other) const {
- return index_ - other.index_;
- }
- ArrayIterator operator+(difference_type n) const {
- return ArrayIterator(*array_, index_ + n);
- }
- ArrayIterator operator-(difference_type n) const {
- return ArrayIterator(*array_, index_ - n);
- }
- friend inline ArrayIterator operator+(difference_type diff,
- const ArrayIterator& other) {
- return ArrayIterator(*other.array_, diff + other.index_);
- }
- friend inline ArrayIterator operator-(difference_type diff,
- const ArrayIterator& other) {
- return ArrayIterator(*other.array_, diff - other.index_);
- }
- ArrayIterator& operator+=(difference_type n) {
- index_ += n;
- return *this;
- }
- ArrayIterator& operator-=(difference_type n) {
- index_ -= n;
- return *this;
- }
-
- // Comparisons
- bool operator==(const ArrayIterator& other) const { return index_ == other.index_; }
- bool operator!=(const ArrayIterator& other) const { return index_ != other.index_; }
- bool operator<(const ArrayIterator& other) const { return index_ < other.index_; }
- bool operator>(const ArrayIterator& other) const { return index_ > other.index_; }
- bool operator<=(const ArrayIterator& other) const { return index_ <= other.index_; }
- bool operator>=(const ArrayIterator& other) const { return index_ >= other.index_; }
-
- private:
- const ArrayType* array_;
- int64_t index_;
-};
-
-} // namespace stl
-} // namespace arrow
-
-namespace std {
-
-template <typename ArrayType>
-struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
- using IteratorType = ::arrow::stl::ArrayIterator<ArrayType>;
- using difference_type = typename IteratorType::difference_type;
- using value_type = typename IteratorType::value_type;
- using pointer = typename IteratorType::pointer;
- using reference = typename IteratorType::reference;
- using iterator_category = typename IteratorType::iterator_category;
-};
-
-} // namespace std
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <iterator>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+
+namespace arrow {
+namespace stl {
+
+namespace detail {
+
+template <typename ArrayType>
+struct DefaultValueAccessor {
+ using ValueType = decltype(std::declval<ArrayType>().GetView(0));
+
+ ValueType operator()(const ArrayType& array, int64_t index) {
+ return array.GetView(index);
+ }
+};
+
+} // namespace detail
+
+template <typename ArrayType,
+ typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ArrayIterator {
+ public:
+ using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+ using difference_type = int64_t;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::random_access_iterator_tag;
+
+ // Some algorithms need to default-construct an iterator
+ ArrayIterator() : array_(NULLPTR), index_(0) {}
+
+ explicit ArrayIterator(const ArrayType& array, int64_t index = 0)
+ : array_(&array), index_(index) {}
+
+ // Value access
+ value_type operator*() const {
+ return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+ }
+
+ value_type operator[](difference_type n) const {
+ return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
+ }
+
+ int64_t index() const { return index_; }
+
+ // Forward / backward
+ ArrayIterator& operator++() {
+ ++index_;
+ return *this;
+ }
+ ArrayIterator& operator--() {
+ --index_;
+ return *this;
+ }
+ ArrayIterator operator++(int) {
+ ArrayIterator tmp(*this);
+ ++index_;
+ return tmp;
+ }
+ ArrayIterator operator--(int) {
+ ArrayIterator tmp(*this);
+ --index_;
+ return tmp;
+ }
+
+ // Arithmetic
+ difference_type operator-(const ArrayIterator& other) const {
+ return index_ - other.index_;
+ }
+ ArrayIterator operator+(difference_type n) const {
+ return ArrayIterator(*array_, index_ + n);
+ }
+ ArrayIterator operator-(difference_type n) const {
+ return ArrayIterator(*array_, index_ - n);
+ }
+ friend inline ArrayIterator operator+(difference_type diff,
+ const ArrayIterator& other) {
+ return ArrayIterator(*other.array_, diff + other.index_);
+ }
+ friend inline ArrayIterator operator-(difference_type diff,
+ const ArrayIterator& other) {
+ return ArrayIterator(*other.array_, diff - other.index_);
+ }
+ ArrayIterator& operator+=(difference_type n) {
+ index_ += n;
+ return *this;
+ }
+ ArrayIterator& operator-=(difference_type n) {
+ index_ -= n;
+ return *this;
+ }
+
+ // Comparisons
+ bool operator==(const ArrayIterator& other) const { return index_ == other.index_; }
+ bool operator!=(const ArrayIterator& other) const { return index_ != other.index_; }
+ bool operator<(const ArrayIterator& other) const { return index_ < other.index_; }
+ bool operator>(const ArrayIterator& other) const { return index_ > other.index_; }
+ bool operator<=(const ArrayIterator& other) const { return index_ <= other.index_; }
+ bool operator>=(const ArrayIterator& other) const { return index_ >= other.index_; }
+
+ private:
+ const ArrayType* array_;
+ int64_t index_;
+};
+
+} // namespace stl
+} // namespace arrow
+
+namespace std {
+
+template <typename ArrayType>
+struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
+ using IteratorType = ::arrow::stl::ArrayIterator<ArrayType>;
+ using difference_type = typename IteratorType::difference_type;
+ using value_type = typename IteratorType::value_type;
+ using pointer = typename IteratorType::pointer;
+ using reference = typename IteratorType::reference;
+ using iterator_category = typename IteratorType::iterator_category;
+};
+
+} // namespace std
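
[Illustrative sketch, not part of the diff: the re-added header above defines a random-access iterator whose value_type is an optional, so null slots dereference to an empty optional. CountAbove and its threshold parameter are made up; std::count_if works because of the iterator_traits specialization above.]

#include <algorithm>
#include <cstdint>

#include "arrow/array.h"
#include "arrow/stl_iterator.h"

// Count the non-null values above a threshold; null slots yield an empty
// optional and are skipped by the predicate.
int64_t CountAbove(const arrow::DoubleArray& doubles, double threshold) {
  arrow::stl::ArrayIterator<arrow::DoubleArray> begin(doubles);
  auto end = begin + doubles.length();
  return std::count_if(begin, end, [&](arrow::util::optional<double> v) {
    return v.has_value() && *v > threshold;
  });
}
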
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table.cc b/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
index d4c7802c834..6b5362c873d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
@@ -92,10 +92,10 @@ class SimpleTable : public Table {
std::shared_ptr<ChunkedArray> column(int i) const override { return columns_[i]; }
- const std::vector<std::shared_ptr<ChunkedArray>>& columns() const override {
- return columns_;
- }
-
+ const std::vector<std::shared_ptr<ChunkedArray>>& columns() const override {
+ return columns_;
+ }
+
std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const override {
auto sliced = columns_;
int64_t num_rows = length;
@@ -103,13 +103,13 @@ class SimpleTable : public Table {
column = column->Slice(offset, length);
num_rows = column->length();
}
- return Table::Make(schema_, std::move(sliced), num_rows);
+ return Table::Make(schema_, std::move(sliced), num_rows);
}
Result<std::shared_ptr<Table>> RemoveColumn(int i) const override {
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->RemoveField(i));
- return Table::Make(std::move(new_schema), internal::DeleteVectorElement(columns_, i),
+ return Table::Make(std::move(new_schema), internal::DeleteVectorElement(columns_, i),
this->num_rows());
}
@@ -129,7 +129,7 @@ class SimpleTable : public Table {
}
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->AddField(i, field_arg));
- return Table::Make(std::move(new_schema),
+ return Table::Make(std::move(new_schema),
internal::AddVectorElement(columns_, i, std::move(col)));
}
@@ -149,14 +149,14 @@ class SimpleTable : public Table {
}
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field_arg));
- return Table::Make(std::move(new_schema),
+ return Table::Make(std::move(new_schema),
internal::ReplaceVectorElement(columns_, i, std::move(col)));
}
std::shared_ptr<Table> ReplaceSchemaMetadata(
const std::shared_ptr<const KeyValueMetadata>& metadata) const override {
auto new_schema = schema_->WithMetadata(metadata);
- return Table::Make(std::move(new_schema), columns_);
+ return Table::Make(std::move(new_schema), columns_);
}
Result<std::shared_ptr<Table>> Flatten(MemoryPool* pool) const override {
@@ -374,7 +374,7 @@ Result<std::shared_ptr<Table>> Table::SelectColumns(
auto new_schema =
std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
- return Table::Make(std::move(new_schema), std::move(columns), num_rows());
+ return Table::Make(std::move(new_schema), std::move(columns), num_rows());
}
std::string Table::ToString() const {
@@ -435,7 +435,7 @@ Result<std::shared_ptr<Table>> ConcatenateTables(
}
columns[i] = std::make_shared<ChunkedArray>(column_arrays, schema->field(i)->type());
}
- return Table::Make(std::move(schema), std::move(columns));
+ return Table::Make(std::move(schema), std::move(columns));
}
Result<std::shared_ptr<Table>> PromoteTableToSchema(const std::shared_ptr<Table>& table,
@@ -564,7 +564,7 @@ Result<std::shared_ptr<Table>> Table::CombineChunks(MemoryPool* pool) const {
compacted_columns[i] = std::make_shared<ChunkedArray>(compacted);
}
}
- return Table::Make(schema(), std::move(compacted_columns), num_rows_);
+ return Table::Make(schema(), std::move(compacted_columns), num_rows_);
}
// ----------------------------------------------------------------------
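
[Illustrative sketch, not part of the diff: chaining two of the Table methods whose definitions are touched above. The column indices {0, 2} assume the caller's table has at least three columns; NarrowAndCompact is a made-up name.]

#include <memory>

#include "arrow/memory_pool.h"
#include "arrow/result.h"
#include "arrow/table.h"

// Keep columns 0 and 2 (assumed to exist), then compact each remaining
// column's chunks into a single chunk.
arrow::Result<std::shared_ptr<arrow::Table>> NarrowAndCompact(
    const std::shared_ptr<arrow::Table>& table) {
  ARROW_ASSIGN_OR_RAISE(auto selected, table->SelectColumns({0, 2}));
  return selected->CombineChunks(arrow::default_memory_pool());
}
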
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table.h b/contrib/libs/apache/arrow/cpp/src/arrow/table.h
index f1e5f23eed8..96b50e002f9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/table.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table.h
@@ -98,7 +98,7 @@ class ARROW_EXPORT Table {
virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
/// \brief Return vector of all columns for table
- virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
+ virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
/// Return a column's field by index
std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
@@ -151,7 +151,7 @@ class ARROW_EXPORT Table {
/// \brief Return new table with specified columns
Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;
- /// \brief Replace schema key-value metadata with new metadata
+ /// \brief Replace schema key-value metadata with new metadata
/// \since 0.5.0
///
/// \param[in] metadata new KeyValueMetadata
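
[Illustrative sketch, not part of the diff: ReplaceSchemaMetadata, documented above, returns a new Table sharing the same columns (per the SimpleTable implementation shown earlier, which reuses columns_). The metadata key/value strings and TagOrigin are made up.]

#include <memory>

#include "arrow/table.h"
#include "arrow/util/key_value_metadata.h"

// Attach provenance metadata without copying any column data.
std::shared_ptr<arrow::Table> TagOrigin(
    const std::shared_ptr<arrow::Table>& table) {
  auto metadata = arrow::key_value_metadata({"origin"}, {"example"});
  return table->ReplaceSchemaMetadata(metadata);
}
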
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
index c026c355758..170dfc70c3c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
@@ -21,7 +21,7 @@
#include <utility>
#include "arrow/array/array_base.h"
-#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_base.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/type.h"
@@ -74,9 +74,9 @@ Status RecordBatchBuilder::Flush(bool reset_builders,
}
}
std::shared_ptr<Schema> schema =
- std::make_shared<Schema>(std::move(schema_fields), schema_->metadata());
+ std::make_shared<Schema>(std::move(schema_fields), schema_->metadata());
- *batch = RecordBatch::Make(std::move(schema), length, std::move(fields));
+ *batch = RecordBatch::Make(std::move(schema), length, std::move(fields));
if (reset_builders) {
return InitBuilders();
} else {
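
[Illustrative sketch, not part of the diff: driving RecordBatchBuilder::Flush as restored above. It assumes the schema's field 0 is int64 (GetFieldAs would return null otherwise); BuildBatch and the value 42 are made up.]

#include <memory>

#include "arrow/array/builder_primitive.h"
#include "arrow/memory_pool.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/table_builder.h"

arrow::Status BuildBatch(const std::shared_ptr<arrow::Schema>& schema,
                         std::shared_ptr<arrow::RecordBatch>* out) {
  std::unique_ptr<arrow::RecordBatchBuilder> builder;
  ARROW_RETURN_NOT_OK(arrow::RecordBatchBuilder::Make(
      schema, arrow::default_memory_pool(), &builder));
  // Field 0 is assumed to be int64; append one value to its builder.
  ARROW_RETURN_NOT_OK(
      builder->GetFieldAs<arrow::Int64Builder>(0)->Append(42));
  // Flush assembles the RecordBatch as shown above and optionally resets the
  // per-field builders for reuse.
  return builder->Flush(/*reset_builders=*/true, out);
}
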
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
index d591bacff02..91d5975715b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
@@ -31,7 +31,7 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
-#include "arrow/util/int_util_internal.h"
+#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
@@ -41,72 +41,72 @@ using internal::checked_cast;
namespace internal {
-Status ComputeRowMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides) {
+Status ComputeRowMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides) {
const int byte_width = GetByteWidth(type);
- const size_t ndim = shape.size();
-
- int64_t remaining = 0;
- if (!shape.empty() && shape.front() > 0) {
- remaining = byte_width;
- for (size_t i = 1; i < ndim; ++i) {
- if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) {
- return Status::Invalid(
- "Row-major strides computed from shape would not fit in 64-bit integer");
- }
- }
+ const size_t ndim = shape.size();
+
+ int64_t remaining = 0;
+ if (!shape.empty() && shape.front() > 0) {
+ remaining = byte_width;
+ for (size_t i = 1; i < ndim; ++i) {
+ if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) {
+ return Status::Invalid(
+ "Row-major strides computed from shape would not fit in 64-bit integer");
+ }
+ }
}
if (remaining == 0) {
strides->assign(shape.size(), byte_width);
- return Status::OK();
+ return Status::OK();
}
- strides->push_back(remaining);
- for (size_t i = 1; i < ndim; ++i) {
- remaining /= shape[i];
+ strides->push_back(remaining);
+ for (size_t i = 1; i < ndim; ++i) {
+ remaining /= shape[i];
strides->push_back(remaining);
}
-
- return Status::OK();
+
+ return Status::OK();
}
-Status ComputeColumnMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides) {
- const int byte_width = internal::GetByteWidth(type);
- const size_t ndim = shape.size();
-
- int64_t total = 0;
- if (!shape.empty() && shape.back() > 0) {
- total = byte_width;
- for (size_t i = 0; i < ndim - 1; ++i) {
- if (internal::MultiplyWithOverflow(total, shape[i], &total)) {
- return Status::Invalid(
- "Column-major strides computed from shape would not fit in 64-bit "
- "integer");
- }
+Status ComputeColumnMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides) {
+ const int byte_width = internal::GetByteWidth(type);
+ const size_t ndim = shape.size();
+
+ int64_t total = 0;
+ if (!shape.empty() && shape.back() > 0) {
+ total = byte_width;
+ for (size_t i = 0; i < ndim - 1; ++i) {
+ if (internal::MultiplyWithOverflow(total, shape[i], &total)) {
+ return Status::Invalid(
+ "Column-major strides computed from shape would not fit in 64-bit "
+ "integer");
+ }
}
}
-
- if (total == 0) {
- strides->assign(shape.size(), byte_width);
- return Status::OK();
- }
-
- total = byte_width;
- for (size_t i = 0; i < ndim - 1; ++i) {
+
+ if (total == 0) {
+ strides->assign(shape.size(), byte_width);
+ return Status::OK();
+ }
+
+ total = byte_width;
+ for (size_t i = 0; i < ndim - 1; ++i) {
strides->push_back(total);
- total *= shape[i];
+ total *= shape[i];
}
- strides->push_back(total);
-
- return Status::OK();
+ strides->push_back(total);
+
+ return Status::OK();
}
-} // namespace internal
-
+} // namespace internal
+
namespace {
inline bool IsTensorStridesRowMajor(const std::shared_ptr<DataType>& type,
@@ -114,11 +114,11 @@ inline bool IsTensorStridesRowMajor(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& strides) {
std::vector<int64_t> c_strides;
const auto& fw_type = checked_cast<const FixedWidthType&>(*type);
- if (internal::ComputeRowMajorStrides(fw_type, shape, &c_strides).ok()) {
- return strides == c_strides;
- } else {
- return false;
- }
+ if (internal::ComputeRowMajorStrides(fw_type, shape, &c_strides).ok()) {
+ return strides == c_strides;
+ } else {
+ return false;
+ }
}
inline bool IsTensorStridesColumnMajor(const std::shared_ptr<DataType>& type,
@@ -126,11 +126,11 @@ inline bool IsTensorStridesColumnMajor(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& strides) {
std::vector<int64_t> f_strides;
const auto& fw_type = checked_cast<const FixedWidthType&>(*type);
- if (internal::ComputeColumnMajorStrides(fw_type, shape, &f_strides).ok()) {
- return strides == f_strides;
- } else {
- return false;
- }
+ if (internal::ComputeColumnMajorStrides(fw_type, shape, &f_strides).ok()) {
+ return strides == f_strides;
+ } else {
+ return false;
+ }
}
inline Status CheckTensorValidity(const std::shared_ptr<DataType>& type,
@@ -162,29 +162,29 @@ Status CheckTensorStridesValidity(const std::shared_ptr<Buffer>& data,
return Status::OK();
}
- // Check the largest offset can be computed without overflow
- const size_t ndim = shape.size();
- int64_t largest_offset = 0;
- for (size_t i = 0; i < ndim; ++i) {
- if (shape[i] == 0) continue;
- if (strides[i] < 0) {
- // TODO(mrkn): Support negative strides for sharing views
- return Status::Invalid("negative strides not supported");
- }
-
- int64_t dim_offset;
- if (!internal::MultiplyWithOverflow(shape[i] - 1, strides[i], &dim_offset)) {
- if (!internal::AddWithOverflow(largest_offset, dim_offset, &largest_offset)) {
- continue;
- }
- }
-
- return Status::Invalid(
- "offsets computed from shape and strides would not fit in 64-bit integer");
+ // Check the largest offset can be computed without overflow
+ const size_t ndim = shape.size();
+ int64_t largest_offset = 0;
+ for (size_t i = 0; i < ndim; ++i) {
+ if (shape[i] == 0) continue;
+ if (strides[i] < 0) {
+ // TODO(mrkn): Support negative strides for sharing views
+ return Status::Invalid("negative strides not supported");
+ }
+
+ int64_t dim_offset;
+ if (!internal::MultiplyWithOverflow(shape[i] - 1, strides[i], &dim_offset)) {
+ if (!internal::AddWithOverflow(largest_offset, dim_offset, &largest_offset)) {
+ continue;
+ }
+ }
+
+ return Status::Invalid(
+ "offsets computed from shape and strides would not fit in 64-bit integer");
}
-
+
const int byte_width = internal::GetByteWidth(*type);
- if (largest_offset > data->size() - byte_width) {
+ if (largest_offset > data->size() - byte_width) {
return Status::Invalid("strides must not involve buffer over run");
}
return Status::OK();
@@ -209,10 +209,10 @@ Status ValidateTensorParameters(const std::shared_ptr<DataType>& type,
RETURN_NOT_OK(CheckTensorValidity(type, data, shape));
if (!strides.empty()) {
RETURN_NOT_OK(CheckTensorStridesValidity(data, shape, strides, type));
- } else {
- std::vector<int64_t> tmp_strides;
- RETURN_NOT_OK(ComputeRowMajorStrides(checked_cast<const FixedWidthType&>(*type),
- shape, &tmp_strides));
+ } else {
+ std::vector<int64_t> tmp_strides;
+ RETURN_NOT_OK(ComputeRowMajorStrides(checked_cast<const FixedWidthType&>(*type),
+ shape, &tmp_strides));
}
if (dim_names.size() > shape.size()) {
return Status::Invalid("too many dim_names are supplied");
@@ -229,8 +229,8 @@ Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buff
: type_(type), data_(data), shape_(shape), strides_(strides), dim_names_(dim_names) {
ARROW_CHECK(is_tensor_supported(type->id()));
if (shape.size() > 0 && strides.size() == 0) {
- ARROW_CHECK_OK(internal::ComputeRowMajorStrides(
- checked_cast<const FixedWidthType&>(*type_), shape, &strides_));
+ ARROW_CHECK_OK(internal::ComputeRowMajorStrides(
+ checked_cast<const FixedWidthType&>(*type_), shape, &strides_));
}
}
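
[Illustrative sketch, not part of the diff: relying on the default-stride path in the constructor above. Passing empty strides makes the tensor row-major; for an int32 tensor of shape {2, 3} the computed byte strides are {12, 4}. The data buffer is assumed to hold 6 int32 values (24 bytes); MakeRowMajorTensor is a made-up name.]

#include <memory>

#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/tensor.h"
#include "arrow/type.h"

arrow::Result<std::shared_ptr<arrow::Tensor>> MakeRowMajorTensor(
    std::shared_ptr<arrow::Buffer> data) {  // assumed: 6 int32 values
  // Empty strides trigger ComputeRowMajorStrides, as shown above.
  return arrow::Tensor::Make(arrow::int32(), data, {2, 3});
}
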
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
index 91e9ad26066..eebb488272e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
@@ -56,16 +56,16 @@ static inline bool is_tensor_supported(Type::type type_id) {
namespace internal {
ARROW_EXPORT
-Status ComputeRowMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides);
-
-ARROW_EXPORT
-Status ComputeColumnMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides);
+Status ComputeRowMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides);
ARROW_EXPORT
+Status ComputeColumnMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides);
+
+ARROW_EXPORT
bool IsTensorStridesContiguous(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& shape,
const std::vector<int64_t>& strides);
@@ -180,10 +180,10 @@ class ARROW_EXPORT Tensor {
return *ptr;
}
- Status Validate() const {
- return internal::ValidateTensorParameters(type_, data_, shape_, strides_, dim_names_);
- }
-
+ Status Validate() const {
+ return internal::ValidateTensorParameters(type_, data_, shape_, strides_, dim_names_);
+ }
+
protected:
Tensor() {}
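
[Illustrative sketch, not part of the diff: the exported helper declared above can also be called directly. The shape {2, 3} and int32 element type are illustrative inputs; DemoStrides is a made-up name.]

#include <cstdint>
#include <vector>

#include "arrow/status.h"
#include "arrow/tensor.h"
#include "arrow/type.h"

arrow::Status DemoStrides() {
  std::vector<int64_t> strides;
  const auto& ty =
      static_cast<const arrow::FixedWidthType&>(*arrow::int32());
  // Fills strides with {12, 4} for a 2x3 int32 tensor (bytes, row-major).
  return arrow::internal::ComputeRowMajorStrides(ty, {2, 3}, &strides);
}
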
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
index 2124d0a4e4b..d79739240af 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
@@ -213,9 +213,9 @@ class SparseCOOTensorConverter : private SparseTensorConverterMixin {
// make results
const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
std::vector<int64_t> indices_strides;
- RETURN_NOT_OK(internal::ComputeRowMajorStrides(
+ RETURN_NOT_OK(internal::ComputeRowMajorStrides(
checked_cast<const FixedWidthType&>(*index_value_type_), indices_shape,
- &indices_strides));
+ &indices_strides));
auto coords = std::make_shared<Tensor>(index_value_type_, std::move(indices_buffer),
indices_shape, indices_strides);
ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true));
@@ -305,7 +305,7 @@ Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCOOTensor(
std::fill_n(values, value_elsize * sparse_tensor->size(), 0);
std::vector<int64_t> strides;
- RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides));
+ RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides));
const auto* raw_data = sparse_tensor->raw_data();
const int ndim = sparse_tensor->ndim();
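
[Illustrative sketch, not part of the diff: at the public API level, the converter above backs SparseCOOTensor densification. ToTensor() zero-fills everything except the stored coordinates; Densify is a made-up name, and the ToTensor entry point is standard Arrow API rather than anything shown in this hunk.]

#include <memory>

#include "arrow/result.h"
#include "arrow/sparse_tensor.h"
#include "arrow/tensor.h"

arrow::Result<std::shared_ptr<arrow::Tensor>> Densify(
    const std::shared_ptr<arrow::SparseCOOTensor>& sparse) {
  // Dispatches to MakeTensorFromSparseCOOTensor, shown above.
  return sparse->ToTensor();
}
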
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
index 77a71d8a12e..27173dbc697 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
@@ -211,7 +211,7 @@ class TensorBuilderFromSparseCSFTensor : private SparseTensorConverterMixin {
}
Result<std::shared_ptr<Tensor>> Build() {
- RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_));
+ RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_));
ARROW_ASSIGN_OR_RAISE(values_buffer_,
AllocateBuffer(value_elsize_ * tensor_size_, pool_));
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
index 137b5d3202f..8c71b1efdaf 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
@@ -177,7 +177,7 @@ Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSXMatrix(
std::fill_n(values, value_elsize * tensor_size, 0);
std::vector<int64_t> strides;
- RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides));
+ RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides));
const auto nc = shape[1];
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type.cc b/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
index 41914f43663..6551b31575d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
@@ -68,8 +68,8 @@ constexpr Type::type StructType::type_id;
constexpr Type::type Decimal128Type::type_id;
-constexpr Type::type Decimal256Type::type_id;
-
+constexpr Type::type Decimal256Type::type_id;
+
constexpr Type::type SparseUnionType::type_id;
constexpr Type::type DenseUnionType::type_id;
@@ -130,8 +130,8 @@ std::string ToString(Type::type id) {
TO_STRING_CASE(HALF_FLOAT)
TO_STRING_CASE(FLOAT)
TO_STRING_CASE(DOUBLE)
- TO_STRING_CASE(DECIMAL128)
- TO_STRING_CASE(DECIMAL256)
+ TO_STRING_CASE(DECIMAL128)
+ TO_STRING_CASE(DECIMAL256)
TO_STRING_CASE(DATE32)
TO_STRING_CASE(DATE64)
TO_STRING_CASE(TIME32)
@@ -188,32 +188,32 @@ int GetByteWidth(const DataType& type) {
namespace {
-struct PhysicalTypeVisitor {
- const std::shared_ptr<DataType>& real_type;
- std::shared_ptr<DataType> result;
-
- Status Visit(const DataType&) {
- result = real_type;
- return Status::OK();
- }
-
- template <typename Type, typename PhysicalType = typename Type::PhysicalType>
- Status Visit(const Type&) {
- result = TypeTraits<PhysicalType>::type_singleton();
- return Status::OK();
- }
-};
-
-} // namespace
-
-std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& real_type) {
- PhysicalTypeVisitor visitor{real_type, {}};
- ARROW_CHECK_OK(VisitTypeInline(*real_type, &visitor));
- return std::move(visitor.result);
-}
-
-namespace {
-
+struct PhysicalTypeVisitor {
+ const std::shared_ptr<DataType>& real_type;
+ std::shared_ptr<DataType> result;
+
+ Status Visit(const DataType&) {
+ result = real_type;
+ return Status::OK();
+ }
+
+ template <typename Type, typename PhysicalType = typename Type::PhysicalType>
+ Status Visit(const Type&) {
+ result = TypeTraits<PhysicalType>::type_singleton();
+ return Status::OK();
+ }
+};
+
+} // namespace
+
+std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& real_type) {
+ PhysicalTypeVisitor visitor{real_type, {}};
+ ARROW_CHECK_OK(VisitTypeInline(*real_type, &visitor));
+ return std::move(visitor.result);
+}
+
+namespace {
+
using internal::checked_cast;
// Merges `existing` and `other` if one of them is of NullType, otherwise
@@ -771,44 +771,44 @@ std::vector<std::shared_ptr<Field>> StructType::GetAllFieldsByName(
return result;
}
-Result<std::shared_ptr<DataType>> DecimalType::Make(Type::type type_id, int32_t precision,
- int32_t scale) {
- if (type_id == Type::DECIMAL128) {
- return Decimal128Type::Make(precision, scale);
- } else if (type_id == Type::DECIMAL256) {
- return Decimal256Type::Make(precision, scale);
- } else {
- return Status::Invalid("Not a decimal type_id: ", type_id);
- }
-}
-
-// Taken from the Apache Impala codebase. The comments next
-// to the return values are the maximum value that can be represented in 2's
-// complement with the returned number of bytes.
-int32_t DecimalType::DecimalSize(int32_t precision) {
- DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
- << precision;
-
- // Generated in python with:
- // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
- // >>> [-1] + [decimal_size(i) for i in range(1, 77)]
- constexpr int32_t kBytes[] = {
- -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
- 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
- 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
- 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32};
-
- if (precision <= 76) {
- return kBytes[precision];
- }
- return static_cast<int32_t>(std::ceil((precision / 8.0) * std::log2(10) + 1));
-}
-
+Result<std::shared_ptr<DataType>> DecimalType::Make(Type::type type_id, int32_t precision,
+ int32_t scale) {
+ if (type_id == Type::DECIMAL128) {
+ return Decimal128Type::Make(precision, scale);
+ } else if (type_id == Type::DECIMAL256) {
+ return Decimal256Type::Make(precision, scale);
+ } else {
+ return Status::Invalid("Not a decimal type_id: ", type_id);
+ }
+}
+
+// Taken from the Apache Impala codebase. The comments next
+// to the return values are the maximum value that can be represented in 2's
+// complement with the returned number of bytes.
+int32_t DecimalType::DecimalSize(int32_t precision) {
+ DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
+ << precision;
+
+ // Generated in python with:
+ // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
+ // >>> [-1] + [decimal_size(i) for i in range(1, 77)]
+ constexpr int32_t kBytes[] = {
+ -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
+ 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
+ 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
+ 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32};
+
+ if (precision <= 76) {
+ return kBytes[precision];
+ }
+ return static_cast<int32_t>(std::ceil((precision / 8.0) * std::log2(10) + 1));
+}
+
// ----------------------------------------------------------------------
// Decimal128 type
Decimal128Type::Decimal128Type(int32_t precision, int32_t scale)
- : DecimalType(type_id, 16, precision, scale) {
+ : DecimalType(type_id, 16, precision, scale) {
ARROW_CHECK_GE(precision, kMinPrecision);
ARROW_CHECK_LE(precision, kMaxPrecision);
}
@@ -821,22 +821,22 @@ Result<std::shared_ptr<DataType>> Decimal128Type::Make(int32_t precision, int32_
}
// ----------------------------------------------------------------------
-// Decimal256 type
-
-Decimal256Type::Decimal256Type(int32_t precision, int32_t scale)
- : DecimalType(type_id, 32, precision, scale) {
- ARROW_CHECK_GE(precision, kMinPrecision);
- ARROW_CHECK_LE(precision, kMaxPrecision);
-}
-
-Result<std::shared_ptr<DataType>> Decimal256Type::Make(int32_t precision, int32_t scale) {
- if (precision < kMinPrecision || precision > kMaxPrecision) {
- return Status::Invalid("Decimal precision out of range: ", precision);
- }
- return std::make_shared<Decimal256Type>(precision, scale);
-}
-
-// ----------------------------------------------------------------------
+// Decimal256 type
+
+Decimal256Type::Decimal256Type(int32_t precision, int32_t scale)
+ : DecimalType(type_id, 32, precision, scale) {
+ ARROW_CHECK_GE(precision, kMinPrecision);
+ ARROW_CHECK_LE(precision, kMaxPrecision);
+}
+
+Result<std::shared_ptr<DataType>> Decimal256Type::Make(int32_t precision, int32_t scale) {
+ if (precision < kMinPrecision || precision > kMaxPrecision) {
+ return Status::Invalid("Decimal precision out of range: ", precision);
+ }
+ return std::make_shared<Decimal256Type>(precision, scale);
+}
+
+// ----------------------------------------------------------------------
// Dictionary-encoded type
Status DictionaryType::ValidateParameters(const DataType& index_type,
@@ -894,15 +894,15 @@ size_t FieldPath::hash() const {
}
std::string FieldPath::ToString() const {
- if (this->indices().empty()) {
- return "FieldPath(empty)";
- }
-
+ if (this->indices().empty()) {
+ return "FieldPath(empty)";
+ }
+
std::string repr = "FieldPath(";
for (auto index : this->indices()) {
repr += std::to_string(index) + " ";
}
- repr.back() = ')';
+ repr.back() = ')';
return repr;
}
@@ -964,10 +964,10 @@ struct FieldPathGetImpl {
int depth = 0;
const T* out;
for (int index : path->indices()) {
- if (children == nullptr) {
- return Status::NotImplemented("Get child data of non-struct array");
- }
-
+ if (children == nullptr) {
+ return Status::NotImplemented("Get child data of non-struct array");
+ }
+
if (index < 0 || static_cast<size_t>(index) >= children->size()) {
*out_of_range_depth = depth;
return nullptr;
@@ -1005,11 +1005,11 @@ struct FieldPathGetImpl {
const ArrayDataVector& child_data) {
return FieldPathGetImpl::Get(
path, &child_data,
- [](const std::shared_ptr<ArrayData>& data) -> const ArrayDataVector* {
- if (data->type->id() != Type::STRUCT) {
- return nullptr;
+ [](const std::shared_ptr<ArrayData>& data) -> const ArrayDataVector* {
+ if (data->type->id() != Type::STRUCT) {
+ return nullptr;
}
- return &data->child_data;
+ return &data->child_data;
});
}
};
@@ -1032,21 +1032,21 @@ Result<std::shared_ptr<Field>> FieldPath::Get(const FieldVector& fields) const {
Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const {
ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data()));
- return MakeArray(std::move(data));
+ return MakeArray(std::move(data));
}
-Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const {
- ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data()));
- return MakeArray(std::move(data));
-}
-
-Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const {
- if (data.type->id() != Type::STRUCT) {
- return Status::NotImplemented("Get child data of non-struct array");
- }
- return FieldPathGetImpl::Get(this, data.child_data);
+Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const {
+ ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data()));
+ return MakeArray(std::move(data));
}
+Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const {
+ if (data.type->id() != Type::STRUCT) {
+ return Status::NotImplemented("Get child data of non-struct array");
+ }
+ return FieldPathGetImpl::Get(this, data.child_data);
+}
+
FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {
DCHECK_GT(util::get<FieldPath>(impl_).indices().size(), 0);
}
@@ -1054,13 +1054,13 @@ FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {
void FieldRef::Flatten(std::vector<FieldRef> children) {
// flatten children
struct Visitor {
- void operator()(std::string* name) { *out++ = FieldRef(std::move(*name)); }
+ void operator()(std::string* name) { *out++ = FieldRef(std::move(*name)); }
- void operator()(FieldPath* indices) { *out++ = FieldRef(std::move(*indices)); }
+ void operator()(FieldPath* indices) { *out++ = FieldRef(std::move(*indices)); }
- void operator()(std::vector<FieldRef>* children) {
- for (auto& child : *children) {
- util::visit(*this, &child.impl_);
+ void operator()(std::vector<FieldRef>* children) {
+ for (auto& child : *children) {
+ util::visit(*this, &child.impl_);
}
}
@@ -1069,7 +1069,7 @@ void FieldRef::Flatten(std::vector<FieldRef> children) {
std::vector<FieldRef> out;
Visitor visitor{std::back_inserter(out)};
- visitor(&children);
+ visitor(&children);
DCHECK(!out.empty());
DCHECK(std::none_of(out.begin(), out.end(),
@@ -1195,10 +1195,10 @@ std::string FieldRef::ToString() const {
}
std::vector<FieldPath> FieldRef::FindAll(const Schema& schema) const {
- if (auto name = this->name()) {
- return internal::MapVector([](int i) { return FieldPath{i}; },
- schema.GetAllFieldIndices(*name));
- }
+ if (auto name = this->name()) {
+ return internal::MapVector([](int i) { return FieldPath{i}; },
+ schema.GetAllFieldIndices(*name));
+ }
return FindAll(schema.fields());
}
@@ -1296,11 +1296,11 @@ std::vector<FieldPath> FieldRef::FindAll(const FieldVector& fields) const {
return util::visit(Visitor{fields}, impl_);
}
-std::vector<FieldPath> FieldRef::FindAll(const ArrayData& array) const {
- return FindAll(*array.type);
+std::vector<FieldPath> FieldRef::FindAll(const ArrayData& array) const {
+ return FindAll(*array.type);
}
-std::vector<FieldPath> FieldRef::FindAll(const Array& array) const {
+std::vector<FieldPath> FieldRef::FindAll(const Array& array) const {
return FindAll(*array.type());
}
@@ -1313,56 +1313,56 @@ void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); }
// ----------------------------------------------------------------------
// Schema implementation
-std::string EndiannessToString(Endianness endianness) {
- switch (endianness) {
- case Endianness::Little:
- return "little";
- case Endianness::Big:
- return "big";
- default:
- DCHECK(false) << "invalid endianness";
- return "???";
- }
-}
-
+std::string EndiannessToString(Endianness endianness) {
+ switch (endianness) {
+ case Endianness::Little:
+ return "little";
+ case Endianness::Big:
+ return "big";
+ default:
+ DCHECK(false) << "invalid endianness";
+ return "???";
+ }
+}
+
class Schema::Impl {
public:
- Impl(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ Impl(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
std::shared_ptr<const KeyValueMetadata> metadata)
: fields_(std::move(fields)),
- endianness_(endianness),
+ endianness_(endianness),
name_to_index_(CreateNameToIndexMap(fields_)),
metadata_(std::move(metadata)) {}
std::vector<std::shared_ptr<Field>> fields_;
- Endianness endianness_;
+ Endianness endianness_;
std::unordered_multimap<std::string, int> name_to_index_;
std::shared_ptr<const KeyValueMetadata> metadata_;
};
-Schema::Schema(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
- std::shared_ptr<const KeyValueMetadata> metadata)
- : detail::Fingerprintable(),
- impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {}
-
+Schema::Schema(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata)
+ : detail::Fingerprintable(),
+ impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {}
+
Schema::Schema(std::vector<std::shared_ptr<Field>> fields,
std::shared_ptr<const KeyValueMetadata> metadata)
: detail::Fingerprintable(),
- impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {}
+ impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {}
Schema::Schema(const Schema& schema)
: detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {}
-Schema::~Schema() = default;
-
-std::shared_ptr<Schema> Schema::WithEndianness(Endianness endianness) const {
- return std::make_shared<Schema>(impl_->fields_, endianness, impl_->metadata_);
-}
-
-Endianness Schema::endianness() const { return impl_->endianness_; }
-
-bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; }
+Schema::~Schema() = default;
+std::shared_ptr<Schema> Schema::WithEndianness(Endianness endianness) const {
+ return std::make_shared<Schema>(impl_->fields_, endianness, impl_->metadata_);
+}
+
+Endianness Schema::endianness() const { return impl_->endianness_; }
+
+bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; }
+
int Schema::num_fields() const { return static_cast<int>(impl_->fields_.size()); }
const std::shared_ptr<Field>& Schema::field(int i) const {
@@ -1380,11 +1380,11 @@ bool Schema::Equals(const Schema& other, bool check_metadata) const {
return true;
}
- // checks endianness equality
- if (endianness() != other.endianness()) {
- return false;
- }
-
+ // checks endianness equality
+ if (endianness() != other.endianness()) {
+ return false;
+ }
+
// checks field equality
if (num_fields() != other.num_fields()) {
return false;
@@ -1509,7 +1509,7 @@ std::shared_ptr<Schema> Schema::WithMetadata(
return std::make_shared<Schema>(impl_->fields_, metadata);
}
-const std::shared_ptr<const KeyValueMetadata>& Schema::metadata() const {
+const std::shared_ptr<const KeyValueMetadata>& Schema::metadata() const {
return impl_->metadata_;
}
@@ -1529,10 +1529,10 @@ std::string Schema::ToString(bool show_metadata) const {
++i;
}
- if (impl_->endianness_ != Endianness::Native) {
- buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --";
- }
-
+ if (impl_->endianness_ != Endianness::Native) {
+ buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --";
+ }
+
if (show_metadata && HasMetadata()) {
buffer << impl_->metadata_->ToString();
}
@@ -1712,12 +1712,12 @@ std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
return std::make_shared<Schema>(std::move(fields), std::move(metadata));
}
-std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
- Endianness endianness,
- std::shared_ptr<const KeyValueMetadata> metadata) {
- return std::make_shared<Schema>(std::move(fields), endianness, std::move(metadata));
-}
-
+std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
+ Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Schema>(std::move(fields), endianness, std::move(metadata));
+}
+
Result<std::shared_ptr<Schema>> UnifySchemas(
const std::vector<std::shared_ptr<Schema>>& schemas,
const Field::MergeOptions field_merge_options) {
@@ -1876,7 +1876,7 @@ std::string Schema::ComputeFingerprint() const {
}
ss << field_fingerprint << ";";
}
- ss << (endianness() == Endianness::Little ? "L" : "B");
+ ss << (endianness() == Endianness::Little ? "L" : "B");
ss << "}";
return ss.str();
}
@@ -2248,35 +2248,35 @@ std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
std::move(metadata));
}
-std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
- std::shared_ptr<const KeyValueMetadata> metadata) {
- return std::make_shared<Field>(std::move(name), std::move(type), /*nullable=*/true,
- std::move(metadata));
-}
-
+std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Field>(std::move(name), std::move(type), /*nullable=*/true,
+ std::move(metadata));
+}
+
std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
- return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale)
- : decimal256(precision, scale);
-}
-
-std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale) {
+ return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale)
+ : decimal256(precision, scale);
+}
+
+std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale) {
return std::make_shared<Decimal128Type>(precision, scale);
}
-std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale) {
- return std::make_shared<Decimal256Type>(precision, scale);
-}
-
+std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale) {
+ return std::make_shared<Decimal256Type>(precision, scale);
+}
+
std::string Decimal128Type::ToString() const {
std::stringstream s;
- s << "decimal128(" << precision_ << ", " << scale_ << ")";
- return s.str();
-}
-
-std::string Decimal256Type::ToString() const {
- std::stringstream s;
- s << "decimal256(" << precision_ << ", " << scale_ << ")";
+ s << "decimal128(" << precision_ << ", " << scale_ << ")";
return s.str();
}
+std::string Decimal256Type::ToString() const {
+ std::stringstream s;
+ s << "decimal256(" << precision_ << ", " << scale_ << ")";
+ return s.str();
+}
+
} // namespace arrow
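
[Illustrative sketch, not part of the diff: the decimal factories and DecimalType::DecimalSize restored above. The precisions 10, 50, and 9 are illustrative.]

#include <iostream>

#include "arrow/type.h"

int main() {
  // decimal() selects 128-bit storage up to 38 digits and 256-bit beyond.
  std::cout << arrow::decimal(10, 2)->ToString() << "\n";  // decimal128(10, 2)
  std::cout << arrow::decimal(50, 2)->ToString() << "\n";  // decimal256(50, 2)
  // Bytes needed for 9 significant digits, per the lookup table above: 4.
  std::cout << arrow::DecimalType::DecimalSize(9) << "\n";
  return 0;
}
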
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type.h b/contrib/libs/apache/arrow/cpp/src/arrow/type.h
index b933da66089..eb65603e0ea 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type.h
@@ -30,7 +30,7 @@
#include "arrow/result.h"
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/util/checked_cast.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
#include "arrow/util/variant.h"
#include "arrow/util/visibility.h"
@@ -127,7 +127,7 @@ class ARROW_EXPORT DataType : public detail::Fingerprintable {
ARROW_DEPRECATED("Use field(i)")
const std::shared_ptr<Field>& child(int i) const { return field(i); }
- /// Returns the child-field at index i.
+ /// Returns the child-field at index i.
const std::shared_ptr<Field>& field(int i) const { return children_[i]; }
ARROW_DEPRECATED("Use fields()")
@@ -182,18 +182,18 @@ class ARROW_EXPORT DataType : public detail::Fingerprintable {
ARROW_EXPORT
std::ostream& operator<<(std::ostream& os, const DataType& type);
-/// \brief Return the compatible physical data type
-///
-/// Some types may have distinct logical meanings but the exact same physical
-/// representation. For example, TimestampType has Int64Type as a physical
-/// type (defined as TimestampType::PhysicalType).
-///
-/// The return value is as follows:
-/// - if a `PhysicalType` alias exists in the concrete type class, return
-/// an instance of `PhysicalType`.
-/// - otherwise, return the input type itself.
-std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& type);
-
+/// \brief Return the compatible physical data type
+///
+/// Some types may have distinct logical meanings but the exact same physical
+/// representation. For example, TimestampType has Int64Type as a physical
+/// type (defined as TimestampType::PhysicalType).
+///
+/// The return value is as follows:
+/// - if a `PhysicalType` alias exists in the concrete type class, return
+/// an instance of `PhysicalType`.
+/// - otherwise, return the input type itself.
+std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& type);
+
/// \brief Base class for all fixed-width data types
class ARROW_EXPORT FixedWidthType : public DataType {
public:
@@ -626,10 +626,10 @@ class ARROW_EXPORT LargeListType : public BaseListType {
/// \brief Concrete type class for map data
///
/// Map data is nested data where each value is a variable number of
-/// key-item pairs. Its physical representation is the same as
-/// a list of `{key, item}` structs.
-///
-/// Maps can be recursively nested, for example map(utf8, map(utf8, int32)).
+/// key-item pairs. Its physical representation is the same as
+/// a list of `{key, item}` structs.
+///
+/// Maps can be recursively nested, for example map(utf8, map(utf8, int32)).
class ARROW_EXPORT MapType : public ListType {
public:
static constexpr Type::type type_id = Type::MAP;
@@ -876,22 +876,22 @@ class ARROW_EXPORT StructType : public NestedType {
/// \brief Base type class for (fixed-size) decimal data
class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
public:
- explicit DecimalType(Type::type type_id, int32_t byte_width, int32_t precision,
- int32_t scale)
- : FixedSizeBinaryType(byte_width, type_id), precision_(precision), scale_(scale) {}
-
- /// Constructs concrete decimal types
- static Result<std::shared_ptr<DataType>> Make(Type::type type_id, int32_t precision,
- int32_t scale);
-
+ explicit DecimalType(Type::type type_id, int32_t byte_width, int32_t precision,
+ int32_t scale)
+ : FixedSizeBinaryType(byte_width, type_id), precision_(precision), scale_(scale) {}
+
+ /// Constructs concrete decimal types
+ static Result<std::shared_ptr<DataType>> Make(Type::type type_id, int32_t precision,
+ int32_t scale);
+
int32_t precision() const { return precision_; }
int32_t scale() const { return scale_; }
- /// \brief Returns the number of bytes needed for precision.
- ///
- /// precision must be >= 1
- static int32_t DecimalSize(int32_t precision);
-
+ /// \brief Returns the number of bytes needed for precision.
+ ///
+ /// precision must be >= 1
+ static int32_t DecimalSize(int32_t precision);
+
protected:
std::string ComputeFingerprint() const override;
@@ -900,24 +900,24 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
};
/// \brief Concrete type class for 128-bit decimal data
-///
-/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
-/// integer. The precision is the number of significant digits that the
-/// decimal type can represent; the scale is the number of digits after
-/// the decimal point (note the scale can be negative).
-///
-/// As an example, `Decimal128Type(7, 3)` can exactly represent the numbers
-/// 1234.567 and -1234.567 (encoded internally as the 128-bit integers
-/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
-///
-/// Decimal128Type has a maximum precision of 38 significant digits
-/// (also available as Decimal128Type::kMaxPrecision).
-/// If higher precision is needed, consider using Decimal256Type.
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer. The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// As an example, `Decimal128Type(7, 3)` can exactly represent the numbers
+/// 1234.567 and -1234.567 (encoded internally as the 128-bit integers
+/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
+///
+/// Decimal128Type has a maximum precision of 38 significant digits
+/// (also available as Decimal128Type::kMaxPrecision).
+/// If higher precision is needed, consider using Decimal256Type.
class ARROW_EXPORT Decimal128Type : public DecimalType {
public:
- static constexpr Type::type type_id = Type::DECIMAL128;
+ static constexpr Type::type type_id = Type::DECIMAL128;
- static constexpr const char* type_name() { return "decimal128"; }
+ static constexpr const char* type_name() { return "decimal128"; }
/// Decimal128Type constructor that aborts on invalid input.
explicit Decimal128Type(int32_t precision, int32_t scale);
@@ -926,47 +926,47 @@ class ARROW_EXPORT Decimal128Type : public DecimalType {
static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
std::string ToString() const override;
- std::string name() const override { return "decimal128"; }
+ std::string name() const override { return "decimal128"; }
static constexpr int32_t kMinPrecision = 1;
static constexpr int32_t kMaxPrecision = 38;
- static constexpr int32_t kByteWidth = 16;
-};
-
-/// \brief Concrete type class for 256-bit decimal data
-///
-/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
-/// integer. The precision is the number of significant digits that the
-/// decimal type can represent; the scale is the number of digits after
-/// the decimal point (note the scale can be negative).
-///
-/// Decimal256Type has a maximum precision of 76 significant digits.
-/// (also available as Decimal256Type::kMaxPrecision).
-///
-/// For most use cases, the maximum precision offered by Decimal128Type
-/// is sufficient, and it will result in a more compact and more efficient
-/// encoding.
-class ARROW_EXPORT Decimal256Type : public DecimalType {
- public:
- static constexpr Type::type type_id = Type::DECIMAL256;
-
- static constexpr const char* type_name() { return "decimal256"; }
-
- /// Decimal256Type constructor that aborts on invalid input.
- explicit Decimal256Type(int32_t precision, int32_t scale);
-
- /// Decimal256Type constructor that returns an error on invalid input.
- static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
-
- std::string ToString() const override;
- std::string name() const override { return "decimal256"; }
-
- static constexpr int32_t kMinPrecision = 1;
- static constexpr int32_t kMaxPrecision = 76;
- static constexpr int32_t kByteWidth = 32;
+ static constexpr int32_t kByteWidth = 16;
};
-/// \brief Base type class for union data
+/// \brief Concrete type class for 256-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer. The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// Decimal256Type has a maximum precision of 76 significant digits.
+/// (also available as Decimal256Type::kMaxPrecision).
+///
+/// For most use cases, the maximum precision offered by Decimal128Type
+/// is sufficient, and it will result in a more compact and more efficient
+/// encoding.
+class ARROW_EXPORT Decimal256Type : public DecimalType {
+ public:
+ static constexpr Type::type type_id = Type::DECIMAL256;
+
+ static constexpr const char* type_name() { return "decimal256"; }
+
+ /// Decimal256Type constructor that aborts on invalid input.
+ explicit Decimal256Type(int32_t precision, int32_t scale);
+
+ /// Decimal256Type constructor that returns an error on invalid input.
+ static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+ std::string ToString() const override;
+ std::string name() const override { return "decimal256"; }
+
+ static constexpr int32_t kMinPrecision = 1;
+ static constexpr int32_t kMaxPrecision = 76;
+ static constexpr int32_t kByteWidth = 32;
+};
+
+/// \brief Base type class for union data
class ARROW_EXPORT UnionType : public NestedType {
public:
static constexpr int8_t kMaxTypeCode = 127;
@@ -1014,17 +1014,17 @@ class ARROW_EXPORT UnionType : public NestedType {
std::vector<int> child_ids_;
};
-/// \brief Concrete type class for sparse union data
-///
-/// A sparse union is a nested type where each logical value is taken from
-/// a single child. A buffer of 8-bit type ids indicates which child
-/// a given logical value is to be taken from.
-///
-/// In a sparse union, each child array should have the same length as the
-/// union array, regardless of the actual number of union values that
-/// refer to it.
-///
-/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+/// \brief Concrete type class for sparse union data
+///
+/// A sparse union is a nested type where each logical value is taken from
+/// a single child. A buffer of 8-bit type ids indicates which child
+/// a given logical value is to be taken from.
+///
+/// In a sparse union, each child array should have the same length as the
+/// union array, regardless of the actual number of union values that
+/// refer to it.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
class ARROW_EXPORT SparseUnionType : public UnionType {
public:
static constexpr Type::type type_id = Type::SPARSE_UNION;
@@ -1041,20 +1041,20 @@ class ARROW_EXPORT SparseUnionType : public UnionType {
std::string name() const override { return "sparse_union"; }
};
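A sketch of constructing the union types described here, assuming the sparse_union()/dense_union() convenience factories from type_fwd.h in this tree (their signatures are not part of this hunk):

    // Two children with explicit 8-bit type codes; dense_union() takes the
    // same arguments and additionally stores a 32-bit offsets buffer per value.
    auto u = arrow::sparse_union(
        {arrow::field("i", arrow::int32()), arrow::field("s", arrow::utf8())},
        /*type_codes=*/{0, 1});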
-/// \brief Concrete type class for dense union data
-///
-/// A dense union is a nested type where each logical value is taken from
-/// a single child, at a specific offset. A buffer of 8-bit type ids
-/// indicates which child a given logical value is to be taken from,
-/// and a buffer of 32-bit offsets indicates at which physical position
-/// in the given child array the logical value is to be taken from.
-///
-/// Unlike a sparse union, a dense union allows encoding only the child array
-/// values which are actually referred to by the union array. This is
-/// counterbalanced by the additional footprint of the offsets buffer, and
-/// the additional indirection cost when looking up values.
-///
-/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+/// \brief Concrete type class for dense union data
+///
+/// A dense union is a nested type where each logical value is taken from
+/// a single child, at a specific offset. A buffer of 8-bit type ids
+/// indicates which child a given logical value is to be taken from,
+/// and a buffer of 32-bit offsets indicates at which physical position
+/// in the given child array the logical value is to be taken from.
+///
+/// Unlike a sparse union, a dense union allows encoding only the child array
+/// values which are actually referred to by the union array. This is
+/// counterbalanced by the additional footprint of the offsets buffer, and
+/// the additional indirection cost when looking up values.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
class ARROW_EXPORT DenseUnionType : public UnionType {
public:
static constexpr Type::type type_id = Type::DENSE_UNION;
@@ -1413,7 +1413,7 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
/// FieldPaths provide a number of accessors for drilling down to potentially nested
/// children. They are overloaded for convenience to support Schema (returns a field),
/// DataType (returns a child field), Field (returns a child field of this field's type)
-/// Array (returns a child array), RecordBatch (returns a column).
+/// Array (returns a child array), and RecordBatch (returns a column).
class ARROW_EXPORT FieldPath {
public:
FieldPath() = default;
@@ -1427,11 +1427,11 @@ class ARROW_EXPORT FieldPath {
std::string ToString() const;
size_t hash() const;
- struct Hash {
- size_t operator()(const FieldPath& path) const { return path.hash(); }
- };
+ struct Hash {
+ size_t operator()(const FieldPath& path) const { return path.hash(); }
+ };
- bool empty() const { return indices_.empty(); }
+ bool empty() const { return indices_.empty(); }
bool operator==(const FieldPath& other) const { return indices() == other.indices(); }
bool operator!=(const FieldPath& other) const { return indices() != other.indices(); }
@@ -1449,9 +1449,9 @@ class ARROW_EXPORT FieldPath {
/// \brief Retrieve the referenced column from a RecordBatch or Table
Result<std::shared_ptr<Array>> Get(const RecordBatch& batch) const;
- /// \brief Retrieve the referenced child from an Array or ArrayData
+ /// \brief Retrieve the referenced child from an Array or ArrayData
Result<std::shared_ptr<Array>> Get(const Array& array) const;
- Result<std::shared_ptr<ArrayData>> Get(const ArrayData& data) const;
+ Result<std::shared_ptr<ArrayData>> Get(const ArrayData& data) const;
private:
std::vector<int> indices_;
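A sketch of the drill-down behavior described above; the schema and the indices are invented for illustration, and the vector-of-indices constructor is assumed from the surrounding header:

    auto nested = arrow::schema(
        {arrow::field("s", arrow::struct_({arrow::field("x", arrow::int32())}))});
    arrow::FieldPath path({0, 0});   // field 0 of the schema, then child 0
    auto leaf = path.Get(*nested);   // Result<std::shared_ptr<Field>> holding "x"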
@@ -1543,13 +1543,13 @@ class ARROW_EXPORT FieldRef {
std::string ToString() const;
size_t hash() const;
- struct Hash {
- size_t operator()(const FieldRef& ref) const { return ref.hash(); }
- };
-
- explicit operator bool() const { return Equals(FieldPath{}); }
- bool operator!() const { return !Equals(FieldPath{}); }
+ struct Hash {
+ size_t operator()(const FieldRef& ref) const { return ref.hash(); }
+ };
+ explicit operator bool() const { return Equals(FieldPath{}); }
+ bool operator!() const { return !Equals(FieldPath{}); }
+
bool IsFieldPath() const { return util::holds_alternative<FieldPath>(impl_); }
bool IsName() const { return util::holds_alternative<std::string>(impl_); }
bool IsNested() const {
@@ -1558,13 +1558,13 @@ class ARROW_EXPORT FieldRef {
return true;
}
- const FieldPath* field_path() const {
- return IsFieldPath() ? &util::get<FieldPath>(impl_) : NULLPTR;
- }
- const std::string* name() const {
- return IsName() ? &util::get<std::string>(impl_) : NULLPTR;
- }
-
+ const FieldPath* field_path() const {
+ return IsFieldPath() ? &util::get<FieldPath>(impl_) : NULLPTR;
+ }
+ const std::string* name() const {
+ return IsName() ? &util::get<std::string>(impl_) : NULLPTR;
+ }
+
/// \brief Retrieve FieldPath of every child field which matches this FieldRef.
std::vector<FieldPath> FindAll(const Schema& schema) const;
std::vector<FieldPath> FindAll(const Field& field) const;
@@ -1572,7 +1572,7 @@ class ARROW_EXPORT FieldRef {
std::vector<FieldPath> FindAll(const FieldVector& fields) const;
/// \brief Convenience function which applies FindAll to arg's type or schema.
- std::vector<FieldPath> FindAll(const ArrayData& array) const;
+ std::vector<FieldPath> FindAll(const ArrayData& array) const;
std::vector<FieldPath> FindAll(const Array& array) const;
std::vector<FieldPath> FindAll(const RecordBatch& batch) const;
@@ -1644,16 +1644,16 @@ class ARROW_EXPORT FieldRef {
template <typename T>
Result<GetType<T>> GetOneOrNone(const T& root) const {
ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root));
- if (match.empty()) {
- return static_cast<GetType<T>>(NULLPTR);
+ if (match.empty()) {
+ return static_cast<GetType<T>>(NULLPTR);
}
- return match.Get(root).ValueOrDie();
+ return match.Get(root).ValueOrDie();
}
private:
void Flatten(std::vector<FieldRef> children);
- util::Variant<FieldPath, std::string, std::vector<FieldRef>> impl_;
+ util::Variant<FieldPath, std::string, std::vector<FieldRef>> impl_;
ARROW_EXPORT friend void PrintTo(const FieldRef& ref, std::ostream* os);
};
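Name-based lookup with FieldRef, sketched against the same invented nested schema as the FieldPath sketch above:

    arrow::FieldRef ref("s", "x");           // nested reference by names
    auto paths = ref.FindAll(*nested);       // -> {FieldPath({0, 0})}
    auto field = ref.GetOneOrNone(*nested);  // null if absent, error if ambiguous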
@@ -1661,16 +1661,16 @@ class ARROW_EXPORT FieldRef {
// ----------------------------------------------------------------------
// Schema
-enum class Endianness {
- Little = 0,
- Big = 1,
-#if ARROW_LITTLE_ENDIAN
- Native = Little
-#else
- Native = Big
-#endif
-};
-
+enum class Endianness {
+ Little = 0,
+ Big = 1,
+#if ARROW_LITTLE_ENDIAN
+ Native = Little
+#else
+ Native = Big
+#endif
+};
+
/// \class Schema
/// \brief Sequence of arrow::Field objects describing the columns of a record
/// batch or table data structure
@@ -1678,12 +1678,12 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
public util::EqualityComparable<Schema>,
public util::ToStringOstreamable<Schema> {
public:
- explicit Schema(FieldVector fields, Endianness endianness,
- std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-
- explicit Schema(FieldVector fields,
+ explicit Schema(FieldVector fields, Endianness endianness,
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+ explicit Schema(FieldVector fields,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
Schema(const Schema&);
~Schema() override;
@@ -1692,24 +1692,24 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
bool Equals(const Schema& other, bool check_metadata = false) const;
bool Equals(const std::shared_ptr<Schema>& other, bool check_metadata = false) const;
- /// \brief Set endianness in the schema
- ///
- /// \return new Schema
- std::shared_ptr<Schema> WithEndianness(Endianness endianness) const;
-
- /// \brief Return endianness in the schema
- Endianness endianness() const;
-
- /// \brief Indicate if endianness is equal to platform-native endianness
- bool is_native_endian() const;
-
+ /// \brief Set endianness in the schema
+ ///
+ /// \return new Schema
+ std::shared_ptr<Schema> WithEndianness(Endianness endianness) const;
+
+ /// \brief Return endianness in the schema
+ Endianness endianness() const;
+
+ /// \brief Indicate if endianness is equal to platform-native endianness
+ bool is_native_endian() const;
+
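A sketch of the endianness accessors above:

    auto s = arrow::schema({arrow::field("x", arrow::int32())});
    auto swapped = s->WithEndianness(arrow::Endianness::Big);
    bool native = swapped->is_native_endian();  // false on little-endian hosts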
/// \brief Return the number of fields (columns) in the schema
int num_fields() const;
/// Return the ith schema element. Does not boundscheck
const std::shared_ptr<Field>& field(int i) const;
- const FieldVector& fields() const;
+ const FieldVector& fields() const;
std::vector<std::string> field_names() const;
@@ -1717,7 +1717,7 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
/// \brief Return the indices of all fields having this name in sorted order
- FieldVector GetAllFieldsByName(const std::string& name) const;
+ FieldVector GetAllFieldsByName(const std::string& name) const;
/// Returns -1 if name not found
int GetFieldIndex(const std::string& name) const;
@@ -1731,7 +1731,7 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
/// \brief The custom key-value metadata, if any
///
/// \return metadata may be null
- const std::shared_ptr<const KeyValueMetadata>& metadata() const;
+ const std::shared_ptr<const KeyValueMetadata>& metadata() const;
/// \brief Render a string representation of the schema suitable for debugging
/// \param[in] show_metadata when true, if KeyValueMetadata is non-empty,
@@ -1771,9 +1771,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
std::unique_ptr<Impl> impl_;
};
-ARROW_EXPORT
-std::string EndiannessToString(Endianness endianness);
-
+ARROW_EXPORT
+std::string EndiannessToString(Endianness endianness);
+
// ----------------------------------------------------------------------
/// \brief Convenience class to incrementally construct/merge schemas.
@@ -1802,18 +1802,18 @@ class ARROW_EXPORT SchemaBuilder {
};
/// \brief Construct an empty SchemaBuilder
- /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
SchemaBuilder(
ConflictPolicy conflict_policy = CONFLICT_APPEND,
Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
/// \brief Construct a SchemaBuilder from a list of fields
- /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
SchemaBuilder(
std::vector<std::shared_ptr<Field>> fields,
ConflictPolicy conflict_policy = CONFLICT_APPEND,
Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
/// \brief Construct a SchemaBuilder from a schema, preserving the metadata
- /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
SchemaBuilder(
const std::shared_ptr<Schema>& schema,
ConflictPolicy conflict_policy = CONFLICT_APPEND,
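Rounding out the SchemaBuilder hunk above, a sketch of the CONFLICT_MERGE policy; the AddField() and Finish() members are assumed from the rest of this header, which the hunk does not show:

    arrow::Result<std::shared_ptr<arrow::Schema>> BuildMerged() {
      arrow::SchemaBuilder builder(arrow::SchemaBuilder::CONFLICT_MERGE);
      ARROW_RETURN_NOT_OK(
          builder.AddField(arrow::field("x", arrow::int32(), /*nullable=*/false)));
      // Same name and type: merged under CONFLICT_MERGE, nullability promoted.
      ARROW_RETURN_NOT_OK(builder.AddField(arrow::field("x", arrow::int32())));
      return builder.Finish();
    }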
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
index 7e564106bbe..80b8345b625 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
@@ -29,20 +29,20 @@ namespace arrow {
template <typename T>
class Iterator;
-template <typename T>
-struct IterationTraits;
+template <typename T>
+struct IterationTraits;
template <typename T>
class Result;
class Status;
-namespace internal {
-struct Empty;
-} // namespace internal
-template <typename T = internal::Empty>
-class Future;
-
+namespace internal {
+struct Empty;
+} // namespace internal
+template <typename T = internal::Empty>
+class Future;
+
namespace util {
class Codec;
} // namespace util
@@ -60,7 +60,7 @@ class DataType;
class Field;
class FieldRef;
class KeyValueMetadata;
-enum class Endianness;
+enum class Endianness;
class Schema;
using DataTypeVector = std::vector<std::shared_ptr<DataType>>;
@@ -80,9 +80,9 @@ class RecordBatch;
class RecordBatchReader;
class Table;
-struct Datum;
-struct ValueDescr;
-
+struct Datum;
+struct ValueDescr;
+
using ChunkedArrayVector = std::vector<std::shared_ptr<ChunkedArray>>;
using RecordBatchVector = std::vector<std::shared_ptr<RecordBatch>>;
using RecordBatchIterator = Iterator<std::shared_ptr<RecordBatch>>;
@@ -154,16 +154,16 @@ class StructBuilder;
struct StructScalar;
class Decimal128;
-class Decimal256;
+class Decimal256;
class DecimalType;
class Decimal128Type;
-class Decimal256Type;
+class Decimal256Type;
class Decimal128Array;
-class Decimal256Array;
+class Decimal256Array;
class Decimal128Builder;
-class Decimal256Builder;
+class Decimal256Builder;
struct Decimal128Scalar;
-struct Decimal256Scalar;
+struct Decimal256Scalar;
struct UnionMode {
enum type { SPARSE, DENSE };
@@ -262,9 +262,9 @@ class ExtensionType;
class ExtensionArray;
struct ExtensionScalar;
-class Tensor;
-class SparseTensor;
-
+class Tensor;
+class SparseTensor;
+
// ----------------------------------------------------------------------
struct Type {
@@ -345,15 +345,15 @@ struct Type {
/// DAY_TIME interval in SQL style
INTERVAL_DAY_TIME,
- /// Precision- and scale-based decimal type with 128 bits.
- DECIMAL128,
-
- /// Defined for backward-compatibility.
- DECIMAL = DECIMAL128,
-
- /// Precision- and scale-based decimal type with 256 bits.
- DECIMAL256,
+ /// Precision- and scale-based decimal type with 128 bits.
+ DECIMAL128,
+ /// Defined for backward-compatibility.
+ DECIMAL = DECIMAL128,
+
+ /// Precision- and scale-based decimal type with 256 bits.
+ DECIMAL256,
+
/// A list of some logical data type
LIST,
@@ -447,21 +447,21 @@ std::shared_ptr<DataType> ARROW_EXPORT date64();
ARROW_EXPORT
std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width);
-/// \brief Create a DecimalType instance depending on the precision
-///
-/// If the precision is greater than 38, a Decimal256Type is returned,
-/// otherwise a Decimal128Type.
+/// \brief Create a DecimalType instance depending on the precision
+///
+/// If the precision is greater than 38, a Decimal256Type is returned,
+/// otherwise a Decimal128Type.
ARROW_EXPORT
std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale);
-/// \brief Create a Decimal128Type instance
-ARROW_EXPORT
-std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale);
-
-/// \brief Create a Decimal256Type instance
-ARROW_EXPORT
-std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale);
-
+/// \brief Create a Decimal128Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale);
+
+/// \brief Create a Decimal256Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale);
+
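The precision-based dispatch just described, in a sketch:

    auto narrow = arrow::decimal(/*precision=*/30, /*scale=*/4);  // Decimal128Type
    auto wide = arrow::decimal(/*precision=*/50, /*scale=*/4);    // Decimal256Type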
/// \brief Create a ListType instance from its child Field type
ARROW_EXPORT
std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_type);
@@ -502,7 +502,7 @@ ARROW_EXPORT
std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<DataType>& value_type,
int32_t list_size);
/// \brief Return a Duration instance (naming use _type to avoid namespace conflict with
-/// built in time classes).
+/// built-in time classes).
std::shared_ptr<DataType> ARROW_EXPORT duration(TimeUnit::type unit);
/// \brief Return a DayTimeIntervalType instance
@@ -638,17 +638,17 @@ std::shared_ptr<Field> ARROW_EXPORT
field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-/// \brief Create a Field instance with metadata
-///
-/// The field will be assumed to be nullable.
-///
-/// \param name the field name
-/// \param type the field value type
-/// \param metadata any custom key-value metadata
-std::shared_ptr<Field> ARROW_EXPORT
-field(std::string name, std::shared_ptr<DataType> type,
- std::shared_ptr<const KeyValueMetadata> metadata);
-
+/// \brief Create a Field instance with metadata
+///
+/// The field will be assumed to be nullable.
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param metadata any custom key-value metadata
+std::shared_ptr<Field> ARROW_EXPORT
+field(std::string name, std::shared_ptr<DataType> type,
+ std::shared_ptr<const KeyValueMetadata> metadata);
+
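A sketch of the metadata overload above; the key/value pair is invented, and key_value_metadata() is assumed from arrow/util/key_value_metadata.h:

    #include "arrow/util/key_value_metadata.h"

    auto meta = arrow::key_value_metadata({"unit"}, {"celsius"});
    auto temp = arrow::field("temp", arrow::float64(), std::move(meta));
    // temp is nullable, per the documented default of this overload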
/// \brief Create a Schema instance
///
/// \param fields the schema's fields
@@ -659,17 +659,17 @@ std::shared_ptr<Schema> schema(
std::vector<std::shared_ptr<Field>> fields,
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-/// \brief Create a Schema instance
-///
-/// \param fields the schema's fields
-/// \param endianness the endianness of the data
-/// \param metadata any custom key-value metadata, default null
-/// \return schema shared_ptr to Schema
-ARROW_EXPORT
-std::shared_ptr<Schema> schema(
- std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
- std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields
+/// \param endianness the endianness of the data
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+ std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
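And the endianness-aware overload in a one-line sketch:

    auto be = arrow::schema({arrow::field("x", arrow::int32())},
                            arrow::Endianness::Big);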
/// @}
/// Return the process-wide default memory pool.
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h b/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
index e4d809967f9..c9637e09ed5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
@@ -66,8 +66,8 @@ TYPE_ID_TRAIT(TIMESTAMP, TimestampType)
TYPE_ID_TRAIT(INTERVAL_DAY_TIME, DayTimeIntervalType)
TYPE_ID_TRAIT(INTERVAL_MONTHS, MonthIntervalType)
TYPE_ID_TRAIT(DURATION, DurationType)
-TYPE_ID_TRAIT(DECIMAL128, Decimal128Type)
-TYPE_ID_TRAIT(DECIMAL256, Decimal256Type)
+TYPE_ID_TRAIT(DECIMAL128, Decimal128Type)
+TYPE_ID_TRAIT(DECIMAL256, Decimal256Type)
TYPE_ID_TRAIT(STRUCT, StructType)
TYPE_ID_TRAIT(LIST, ListType)
TYPE_ID_TRAIT(LARGE_LIST, LargeListType)
@@ -233,7 +233,7 @@ struct TypeTraits<MonthIntervalType> {
using ArrayType = MonthIntervalArray;
using BuilderType = MonthIntervalBuilder;
using ScalarType = MonthIntervalScalar;
- using CType = MonthIntervalType::c_type;
+ using CType = MonthIntervalType::c_type;
static constexpr int64_t bytes_required(int64_t elements) {
return elements * static_cast<int64_t>(sizeof(int32_t));
@@ -291,14 +291,14 @@ struct TypeTraits<Decimal128Type> {
};
template <>
-struct TypeTraits<Decimal256Type> {
- using ArrayType = Decimal256Array;
- using BuilderType = Decimal256Builder;
- using ScalarType = Decimal256Scalar;
- constexpr static bool is_parameter_free = false;
-};
-
-template <>
+struct TypeTraits<Decimal256Type> {
+ using ArrayType = Decimal256Array;
+ using BuilderType = Decimal256Builder;
+ using ScalarType = Decimal256Scalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
struct TypeTraits<BinaryType> {
using ArrayType = BinaryArray;
using BuilderType = BinaryBuilder;
@@ -587,18 +587,18 @@ using is_decimal_type = std::is_base_of<DecimalType, T>;
template <typename T, typename R = void>
using enable_if_decimal = enable_if_t<is_decimal_type<T>::value, R>;
-template <typename T>
-using is_decimal128_type = std::is_base_of<Decimal128Type, T>;
-
-template <typename T, typename R = void>
-using enable_if_decimal128 = enable_if_t<is_decimal128_type<T>::value, R>;
-
-template <typename T>
-using is_decimal256_type = std::is_base_of<Decimal256Type, T>;
-
-template <typename T, typename R = void>
-using enable_if_decimal256 = enable_if_t<is_decimal256_type<T>::value, R>;
-
+template <typename T>
+using is_decimal128_type = std::is_base_of<Decimal128Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal128 = enable_if_t<is_decimal128_type<T>::value, R>;
+
+template <typename T>
+using is_decimal256_type = std::is_base_of<Decimal256Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal256 = enable_if_t<is_decimal256_type<T>::value, R>;
+
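A sketch of using these aliases for SFINAE, in the style of the rest of this header (ByteWidthOf is an invented name):

    // Instantiates only for types derived from Decimal256Type.
    template <typename T>
    arrow::enable_if_decimal256<T, int32_t> ByteWidthOf(const T&) {
      return T::kByteWidth;  // 32 for Decimal256Type
    }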
// Nested Types
template <typename T>
@@ -636,7 +636,7 @@ template <typename T>
using is_list_type =
std::integral_constant<bool, std::is_same<T, ListType>::value ||
std::is_same<T, LargeListType>::value ||
- std::is_same<T, FixedSizeListType>::value>;
+ std::is_same<T, FixedSizeListType>::value>;
template <typename T, typename R = void>
using enable_if_list_type = enable_if_t<is_list_type<T>::value, R>;
@@ -846,17 +846,17 @@ static inline bool is_floating(Type::type type_id) {
return false;
}
-static inline bool is_decimal(Type::type type_id) {
- switch (type_id) {
- case Type::DECIMAL128:
- case Type::DECIMAL256:
- return true;
- default:
- break;
- }
- return false;
-}
-
+static inline bool is_decimal(Type::type type_id) {
+ switch (type_id) {
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
static inline bool is_primitive(Type::type type_id) {
switch (type_id) {
case Type::BOOL:
@@ -927,8 +927,8 @@ static inline bool is_dictionary(Type::type type_id) {
static inline bool is_fixed_size_binary(Type::type type_id) {
switch (type_id) {
- case Type::DECIMAL128:
- case Type::DECIMAL256:
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
case Type::FIXED_SIZE_BINARY:
return true;
default:
@@ -941,52 +941,52 @@ static inline bool is_fixed_width(Type::type type_id) {
return is_primitive(type_id) || is_dictionary(type_id) || is_fixed_size_binary(type_id);
}
-static inline int bit_width(Type::type type_id) {
- switch (type_id) {
- case Type::BOOL:
- return 1;
- case Type::UINT8:
- case Type::INT8:
- return 8;
- case Type::UINT16:
- case Type::INT16:
- return 16;
- case Type::UINT32:
- case Type::INT32:
- case Type::DATE32:
- case Type::TIME32:
- return 32;
- case Type::UINT64:
- case Type::INT64:
- case Type::DATE64:
- case Type::TIME64:
- case Type::TIMESTAMP:
- case Type::DURATION:
- return 64;
-
- case Type::HALF_FLOAT:
- return 16;
- case Type::FLOAT:
- return 32;
- case Type::DOUBLE:
- return 64;
-
- case Type::INTERVAL_MONTHS:
- return 32;
- case Type::INTERVAL_DAY_TIME:
- return 64;
-
- case Type::DECIMAL128:
- return 128;
- case Type::DECIMAL256:
- return 256;
-
- default:
- break;
- }
- return 0;
-}
-
+static inline int bit_width(Type::type type_id) {
+ switch (type_id) {
+ case Type::BOOL:
+ return 1;
+ case Type::UINT8:
+ case Type::INT8:
+ return 8;
+ case Type::UINT16:
+ case Type::INT16:
+ return 16;
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::DATE32:
+ case Type::TIME32:
+ return 32;
+ case Type::UINT64:
+ case Type::INT64:
+ case Type::DATE64:
+ case Type::TIME64:
+ case Type::TIMESTAMP:
+ case Type::DURATION:
+ return 64;
+
+ case Type::HALF_FLOAT:
+ return 16;
+ case Type::FLOAT:
+ return 32;
+ case Type::DOUBLE:
+ return 64;
+
+ case Type::INTERVAL_MONTHS:
+ return 32;
+ case Type::INTERVAL_DAY_TIME:
+ return 64;
+
+ case Type::DECIMAL128:
+ return 128;
+ case Type::DECIMAL256:
+ return 256;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
static inline bool is_nested(Type::type type_id) {
switch (type_id) {
case Type::LIST:
@@ -1003,22 +1003,22 @@ static inline bool is_nested(Type::type type_id) {
return false;
}
-static inline int offset_bit_width(Type::type type_id) {
- switch (type_id) {
- case Type::STRING:
- case Type::BINARY:
- case Type::LIST:
- case Type::MAP:
- case Type::DENSE_UNION:
- return 32;
- case Type::LARGE_STRING:
- case Type::LARGE_BINARY:
- case Type::LARGE_LIST:
- return 64;
- default:
- break;
- }
- return 0;
-}
-
+static inline int offset_bit_width(Type::type type_id) {
+ switch (type_id) {
+ case Type::STRING:
+ case Type::BINARY:
+ case Type::LIST:
+ case Type::MAP:
+ case Type::DENSE_UNION:
+ return 32;
+ case Type::LARGE_STRING:
+ case Type::LARGE_BINARY:
+ case Type::LARGE_LIST:
+ return 64;
+ default:
+ break;
+ }
+ return 0;
+}
+
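The two width helpers restored above, in a sketch; both return 0 for types they do not cover:

    int data_bits = arrow::bit_width(arrow::Type::DECIMAL256);           // 256
    int offset_bits = arrow::offset_bit_width(arrow::Type::LARGE_LIST);  // 64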
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
index 2a0e6ba709d..8f9ae1f7706 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
@@ -1,33 +1,33 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/result.h"
-
-namespace arrow {
-
-template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
-Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
- UnaryOperation unary_op) {
- for (; first != last; ++first, (void)++out) {
- ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
- }
- return Status::OK();
-}
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+
+namespace arrow {
+
+template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
+Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
+ UnaryOperation unary_op) {
+ for (; first != last; ++first, (void)++out) {
+ ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
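A sketch of MaybeTransform in use; the caller code is invented, and the loop stops at the first error returned by the operation:

    #include <vector>
    #include "arrow/util/algorithm.h"

    arrow::Status DoubleAll(const std::vector<int>& in, std::vector<int>* out) {
      out->resize(in.size());
      return arrow::MaybeTransform(
          in.begin(), in.end(), out->begin(),
          [](int v) -> arrow::Result<int> { return v * 2; });
    }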
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
index 9d1021edff5..c672ebab778 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
@@ -1,1614 +1,1614 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <atomic>
-#include <cassert>
-#include <cstring>
-#include <deque>
-#include <limits>
-#include <queue>
-
-#include "arrow/util/functional.h"
-#include "arrow/util/future.h"
-#include "arrow/util/io_util.h"
-#include "arrow/util/iterator.h"
-#include "arrow/util/mutex.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/queue.h"
-#include "arrow/util/thread_pool.h"
-
-namespace arrow {
-
-// The methods in this file create, modify, and utilize AsyncGenerator which is an
-// iterator of futures. This allows an asynchronous source (like file input) to be run
-// through a pipeline in the same way that iterators can be used to create pipelined
-// workflows.
-//
-// In order to support pipeline parallelism we introduce the concept of asynchronous
-// reentrancy. This is different than synchronous reentrancy. With synchronous code a
-// function is reentrant if the function can be called again while a previous call to that
-// function is still running. Unless otherwise specified none of these generators are
-// synchronously reentrant. Care should be taken to avoid calling them in such a way (and
-// the utilities Visit/Collect/Await take care to do this).
-//
-// Asynchronous reentrancy on the other hand means the function is called again before the
-// future returned by the function is marked finished (but after the call to get the
-// future returns). Some of these generators are async-reentrant while others (e.g.
-// those that depend on ordered processing like decompression) are not. Read the MakeXYZ
-// function comments to determine which generators support async reentrancy.
-//
-// Note: Generators that are not asynchronously reentrant can still support readahead
-// (\see MakeSerialReadaheadGenerator).
-//
-// Readahead operators, and some other operators, may introduce queueing. Any operators
-// that introduce buffering should detail the amount of buffering they introduce in their
-// MakeXYZ function comments.
-template <typename T>
-using AsyncGenerator = std::function<Future<T>()>;
-
-template <typename T>
-struct IterationTraits<AsyncGenerator<T>> {
- /// \brief by default when iterating through a sequence of AsyncGenerator<T>,
- /// an empty function indicates the end of iteration.
- static AsyncGenerator<T> End() { return AsyncGenerator<T>(); }
-
- static bool IsEnd(const AsyncGenerator<T>& val) { return !val; }
-};
-
-template <typename T>
-Future<T> AsyncGeneratorEnd() {
- return Future<T>::MakeFinished(IterationTraits<T>::End());
-}
-
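The smallest possible AsyncGenerator obeying the contract above, sketched with an invented CountTo helper; note that for int the default end token is int{}, i.e. 0, so real values must avoid it:

    arrow::AsyncGenerator<int> CountTo(int limit) {
      auto next = std::make_shared<int>(1);
      return [next, limit]() -> arrow::Future<int> {
        if (*next > limit) return arrow::AsyncGeneratorEnd<int>();
        return arrow::Future<int>::MakeFinished((*next)++);
      };
    }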
-/// returning a future that completes when all have been visited
-template <typename T, typename Visitor>
-Future<> VisitAsyncGenerator(AsyncGenerator<T> generator, Visitor visitor) {
- struct LoopBody {
- struct Callback {
- Result<ControlFlow<>> operator()(const T& next) {
- if (IsIterationEnd(next)) {
- return Break();
- } else {
- auto visited = visitor(next);
- if (visited.ok()) {
- return Continue();
- } else {
- return visited;
- }
- }
- }
-
- Visitor visitor;
- };
-
- Future<ControlFlow<>> operator()() {
- Callback callback{visitor};
- auto next = generator();
- return next.Then(std::move(callback));
- }
-
- AsyncGenerator<T> generator;
- Visitor visitor;
- };
-
- return Loop(LoopBody{std::move(generator), std::move(visitor)});
-}
-
-/// \brief Waits for an async generator to complete, discarding results.
-template <typename T>
-Future<> DiscardAllFromAsyncGenerator(AsyncGenerator<T> generator) {
- std::function<Status(T)> visitor = [](const T&) { return Status::OK(); };
- return VisitAsyncGenerator(generator, visitor);
-}
-
-/// \brief Collects the results of an async generator into a vector
-template <typename T>
-Future<std::vector<T>> CollectAsyncGenerator(AsyncGenerator<T> generator) {
- auto vec = std::make_shared<std::vector<T>>();
- struct LoopBody {
- Future<ControlFlow<std::vector<T>>> operator()() {
- auto next = generator_();
- auto vec = vec_;
- return next.Then([vec](const T& result) -> Result<ControlFlow<std::vector<T>>> {
- if (IsIterationEnd(result)) {
- return Break(*vec);
- } else {
- vec->push_back(result);
- return Continue();
- }
- });
- }
- AsyncGenerator<T> generator_;
- std::shared_ptr<std::vector<T>> vec_;
- };
- return Loop(LoopBody{std::move(generator), std::move(vec)});
-}
-
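Consuming such a generator with the helpers above, in a sketch; MakeVectorGenerator is defined further down in this header:

    auto gen = arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3});
    arrow::Future<std::vector<int>> all =
        arrow::CollectAsyncGenerator(std::move(gen));  // eventually {1, 2, 3}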
-/// \see MakeMappedGenerator
-template <typename T, typename V>
-class MappingGenerator {
- public:
- MappingGenerator(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
- : state_(std::make_shared<State>(std::move(source), std::move(map))) {}
-
- Future<V> operator()() {
- auto future = Future<V>::Make();
- bool should_trigger;
- {
- auto guard = state_->mutex.Lock();
- if (state_->finished) {
- return AsyncGeneratorEnd<V>();
- }
- should_trigger = state_->waiting_jobs.empty();
- state_->waiting_jobs.push_back(future);
- }
- if (should_trigger) {
- state_->source().AddCallback(Callback{state_});
- }
- return future;
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
- : source(std::move(source)),
- map(std::move(map)),
- waiting_jobs(),
- mutex(),
- finished(false) {}
-
- void Purge() {
- // This might be called by an original callback (if the source iterator fails or
- // ends) or by a mapped callback (if the map function fails or ends prematurely).
- // Either way it should only be called once and after finished is set so there is no
- // need to guard access to `waiting_jobs`.
- while (!waiting_jobs.empty()) {
- waiting_jobs.front().MarkFinished(IterationTraits<V>::End());
- waiting_jobs.pop_front();
- }
- }
-
- AsyncGenerator<T> source;
- std::function<Future<V>(const T&)> map;
- std::deque<Future<V>> waiting_jobs;
- util::Mutex mutex;
- bool finished;
- };
-
- struct Callback;
-
- struct MappedCallback {
- void operator()(const Result<V>& maybe_next) {
- bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
- bool should_purge = false;
- if (end) {
- {
- auto guard = state->mutex.Lock();
- should_purge = !state->finished;
- state->finished = true;
- }
- }
- sink.MarkFinished(maybe_next);
- if (should_purge) {
- state->Purge();
- }
- }
- std::shared_ptr<State> state;
- Future<V> sink;
- };
-
- struct Callback {
- void operator()(const Result<T>& maybe_next) {
- Future<V> sink;
- bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
- bool should_purge = false;
- bool should_trigger;
- {
- auto guard = state->mutex.Lock();
- if (end) {
- should_purge = !state->finished;
- state->finished = true;
- }
- sink = state->waiting_jobs.front();
- state->waiting_jobs.pop_front();
- should_trigger = !end && !state->waiting_jobs.empty();
- }
- if (should_purge) {
- state->Purge();
- }
- if (should_trigger) {
- state->source().AddCallback(Callback{state});
- }
- if (maybe_next.ok()) {
- const T& val = maybe_next.ValueUnsafe();
- if (IsIterationEnd(val)) {
- sink.MarkFinished(IterationTraits<V>::End());
- } else {
- Future<V> mapped_fut = state->map(val);
- mapped_fut.AddCallback(MappedCallback{std::move(state), std::move(sink)});
- }
- } else {
- sink.MarkFinished(maybe_next.status());
- }
- }
-
- std::shared_ptr<State> state;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief Creates a generator that will apply the map function to each element of
-/// source. The map function is not called on the end token.
-///
-/// Note: This function makes a copy of `map` for each item
-/// Note: Errors returned from the `map` function will be propagated
-///
-/// If the source generator is async-reentrant then this generator will be also
-template <typename T, typename MapFn,
- typename Mapped = detail::result_of_t<MapFn(const T&)>,
- typename V = typename EnsureFuture<Mapped>::type::ValueType>
-AsyncGenerator<V> MakeMappedGenerator(AsyncGenerator<T> source_generator, MapFn map) {
- struct MapCallback {
- MapFn map_;
-
- Future<V> operator()(const T& val) { return ToFuture(map_(val)); }
- };
-
- return MappingGenerator<T, V>(std::move(source_generator), MapCallback{std::move(map)});
-}
-
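A sketch of the mapped generator above; errors returned by the map function propagate to the consumer:

    auto doubled = arrow::MakeMappedGenerator(
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3}),
        [](const int& v) -> arrow::Result<int> { return v * 2; });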
-/// \see MakeSequencingGenerator
-template <typename T, typename ComesAfter, typename IsNext>
-class SequencingGenerator {
- public:
- SequencingGenerator(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next,
- T initial_value)
- : state_(std::make_shared<State>(std::move(source), std::move(compare),
- std::move(is_next), std::move(initial_value))) {}
-
- Future<T> operator()() {
- {
- auto guard = state_->mutex.Lock();
- // We can send a result immediately if the top of the queue is either an
- // error or the next item
- if (!state_->queue.empty() &&
- (!state_->queue.top().ok() ||
- state_->is_next(state_->previous_value, *state_->queue.top()))) {
- auto result = std::move(state_->queue.top());
- if (result.ok()) {
- state_->previous_value = *result;
- }
- state_->queue.pop();
- return Future<T>::MakeFinished(result);
- }
- if (state_->finished) {
- return AsyncGeneratorEnd<T>();
- }
- // The next item is not in the queue so we will need to wait
- auto new_waiting_fut = Future<T>::Make();
- state_->waiting_future = new_waiting_fut;
- guard.Unlock();
- state_->source().AddCallback(Callback{state_});
- return new_waiting_fut;
- }
- }
-
- private:
- struct WrappedComesAfter {
- bool operator()(const Result<T>& left, const Result<T>& right) {
- if (!left.ok() || !right.ok()) {
- // Should never happen
- return false;
- }
- return compare(*left, *right);
- }
- ComesAfter compare;
- };
-
- struct State {
- State(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next, T initial_value)
- : source(std::move(source)),
- is_next(std::move(is_next)),
- previous_value(std::move(initial_value)),
- waiting_future(),
- queue(WrappedComesAfter{compare}),
- finished(false),
- mutex() {}
-
- AsyncGenerator<T> source;
- IsNext is_next;
- T previous_value;
- Future<T> waiting_future;
- std::priority_queue<Result<T>, std::vector<Result<T>>, WrappedComesAfter> queue;
- bool finished;
- util::Mutex mutex;
- };
-
- class Callback {
- public:
- explicit Callback(std::shared_ptr<State> state) : state_(std::move(state)) {}
-
- void operator()(const Result<T> result) {
- Future<T> to_deliver;
- bool finished;
- {
- auto guard = state_->mutex.Lock();
- bool ready_to_deliver = false;
- if (!result.ok()) {
- // Clear any cached results
- while (!state_->queue.empty()) {
- state_->queue.pop();
- }
- ready_to_deliver = true;
- state_->finished = true;
- } else if (IsIterationEnd<T>(result.ValueUnsafe())) {
- ready_to_deliver = state_->queue.empty();
- state_->finished = true;
- } else {
- ready_to_deliver = state_->is_next(state_->previous_value, *result);
- }
-
- if (ready_to_deliver && state_->waiting_future.is_valid()) {
- to_deliver = state_->waiting_future;
- if (result.ok()) {
- state_->previous_value = *result;
- }
- } else {
- state_->queue.push(result);
- }
- // Capture state_->finished so we can access it outside the mutex
- finished = state_->finished;
- }
- // Must deliver result outside of the mutex
- if (to_deliver.is_valid()) {
- to_deliver.MarkFinished(result);
- } else {
- // Otherwise, if we didn't get the next item (or a terminal item), we
- // need to keep looking
- if (!finished) {
- state_->source().AddCallback(Callback{state_});
- }
- }
- }
-
- private:
- const std::shared_ptr<State> state_;
- };
-
- const std::shared_ptr<State> state_;
-};
-
-/// \brief Buffers an AsyncGenerator to return values in sequence order ComesAfter
-/// and IsNext determine the sequence order.
-///
-/// ComesAfter should be a BinaryPredicate that only returns true if a comes after b
-///
-/// IsNext should be a BinaryPredicate that returns true, given `a` and `b`, only if
-/// `b` follows immediately after `a`. It should return true given `initial_value` and
-/// `b` if `b` is the first item in the sequence.
-///
-/// This operator will queue unboundedly while waiting for the next item. It is intended
-/// for jittery sources that might scatter an ordered sequence. It is NOT intended to
-/// sort. Using it to try and sort could result in excessive RAM usage. This generator
-/// will queue up to N blocks where N is the max "out of order"ness of the source.
-///
-/// For example, if the source is 1,6,2,5,4,3 it will queue 3 blocks because 3 is 3
-/// blocks beyond where it belongs.
-///
-/// This generator is not async-reentrant but it consists only of a simple log(n)
-/// insertion into a priority queue.
-template <typename T, typename ComesAfter, typename IsNext>
-AsyncGenerator<T> MakeSequencingGenerator(AsyncGenerator<T> source_generator,
- ComesAfter compare, IsNext is_next,
- T initial_value) {
- return SequencingGenerator<T, ComesAfter, IsNext>(
- std::move(source_generator), std::move(compare), std::move(is_next),
- std::move(initial_value));
-}
-
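Resequencing a jittery stream of 1-based sequence numbers with the factory above, sketched with invented predicates:

    auto ordered = arrow::MakeSequencingGenerator(
        arrow::MakeVectorGenerator(std::vector<int>{2, 1, 3}),
        /*compare=*/[](const int& a, const int& b) { return a > b; },
        /*is_next=*/[](const int& prev, const int& next) { return next == prev + 1; },
        /*initial_value=*/0);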
-/// \see MakeTransformedGenerator
-template <typename T, typename V>
-class TransformingGenerator {
- // The transforming generator state will be referenced as an async generator but will
- // also be referenced via callback to various futures. If the async generator owner
- // moves it around we need the state to be consistent for future callbacks.
- struct TransformingGeneratorState
- : std::enable_shared_from_this<TransformingGeneratorState> {
- TransformingGeneratorState(AsyncGenerator<T> generator, Transformer<T, V> transformer)
- : generator_(std::move(generator)),
- transformer_(std::move(transformer)),
- last_value_(),
- finished_() {}
-
- Future<V> operator()() {
- while (true) {
- auto maybe_next_result = Pump();
- if (!maybe_next_result.ok()) {
- return Future<V>::MakeFinished(maybe_next_result.status());
- }
- auto maybe_next = std::move(maybe_next_result).ValueUnsafe();
- if (maybe_next.has_value()) {
- return Future<V>::MakeFinished(*std::move(maybe_next));
- }
-
- auto next_fut = generator_();
- // If finished already, process results immediately inside the loop to avoid
- // stack overflow
- if (next_fut.is_finished()) {
- auto next_result = next_fut.result();
- if (next_result.ok()) {
- last_value_ = *next_result;
- } else {
- return Future<V>::MakeFinished(next_result.status());
- }
- // Otherwise, if not finished immediately, add callback to process results
- } else {
- auto self = this->shared_from_this();
- return next_fut.Then([self](const T& next_result) {
- self->last_value_ = next_result;
- return (*self)();
- });
- }
- }
- }
-
- // See comment on TransformingIterator::Pump
- Result<util::optional<V>> Pump() {
- if (!finished_ && last_value_.has_value()) {
- ARROW_ASSIGN_OR_RAISE(TransformFlow<V> next, transformer_(*last_value_));
- if (next.ReadyForNext()) {
- if (IsIterationEnd(*last_value_)) {
- finished_ = true;
- }
- last_value_.reset();
- }
- if (next.Finished()) {
- finished_ = true;
- }
- if (next.HasValue()) {
- return next.Value();
- }
- }
- if (finished_) {
- return IterationTraits<V>::End();
- }
- return util::nullopt;
- }
-
- AsyncGenerator<T> generator_;
- Transformer<T, V> transformer_;
- util::optional<T> last_value_;
- bool finished_;
- };
-
- public:
- explicit TransformingGenerator(AsyncGenerator<T> generator,
- Transformer<T, V> transformer)
- : state_(std::make_shared<TransformingGeneratorState>(std::move(generator),
- std::move(transformer))) {}
-
- Future<V> operator()() { return (*state_)(); }
-
- protected:
- std::shared_ptr<TransformingGeneratorState> state_;
-};
-
-/// \brief Transforms an async generator using a transformer function returning a new
-/// AsyncGenerator
-///
-/// The transform function here behaves exactly the same as the transform function in
-/// MakeTransformedIterator and you can safely use the same transform function to
-/// transform both synchronous and asynchronous streams.
-///
-/// This generator is not async-reentrant
-///
-/// This generator may queue up to 1 instance of T but will not delay
-template <typename T, typename V>
-AsyncGenerator<V> MakeTransformedGenerator(AsyncGenerator<T> generator,
- Transformer<T, V> transformer) {
- return TransformingGenerator<T, V>(generator, transformer);
-}
-
-/// \see MakeSerialReadaheadGenerator
-template <typename T>
-class SerialReadaheadGenerator {
- public:
- SerialReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
- : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
-
- Future<T> operator()() {
- if (state_->first_) {
- // Lazy generator, need to wait for the first ask to prime the pump
- state_->first_ = false;
- auto next = state_->source_();
- return next.Then(Callback{state_}, ErrCallback{state_});
- }
-
- // This generator is not async-reentrant. We won't be called until the last
- // future finished so we know there is something in the queue
- auto finished = state_->finished_.load();
- if (finished && state_->readahead_queue_.IsEmpty()) {
- return AsyncGeneratorEnd<T>();
- }
-
- std::shared_ptr<Future<T>> next;
- if (!state_->readahead_queue_.Read(next)) {
- return Status::UnknownError("Could not read from readahead_queue");
- }
-
- auto last_available = state_->spaces_available_.fetch_add(1);
- if (last_available == 0 && !finished) {
- // Reader idled out, we need to restart it
- ARROW_RETURN_NOT_OK(state_->Pump(state_));
- }
- return *next;
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source, int max_readahead)
- : first_(true),
- source_(std::move(source)),
- finished_(false),
- // There is one extra "space" for the in-flight request
- spaces_available_(max_readahead + 1),
- // The SPSC queue has size-1 "usable" slots so we need to overallocate 1
- readahead_queue_(max_readahead + 1) {}
-
- Status Pump(const std::shared_ptr<State>& self) {
- // Can't do readahead_queue.write(source().Then(...)) because then the
- // callback might run immediately and add itself to the queue before this gets added
- // to the queue messing up the order.
- auto next_slot = std::make_shared<Future<T>>();
- auto written = readahead_queue_.Write(next_slot);
- if (!written) {
- return Status::UnknownError("Could not write to readahead_queue");
- }
- // If this Pump is being called from a callback it is possible for the source to
- // poll and read from the queue between the Write and this spot where we fill the
- // value in. However, it is not possible for the future to read this value we are
- // writing. That is because this callback (the callback for future X) must be
- // finished before future X is marked complete and this source is not pulled
- // reentrantly so it will not poll for future X+1 until this callback has completed.
- *next_slot = source_().Then(Callback{self}, ErrCallback{self});
- return Status::OK();
- }
-
- // Only accessed by the consumer end
- bool first_;
- // Accessed by both threads
- AsyncGenerator<T> source_;
- std::atomic<bool> finished_;
- // The queue has a size but it is not atomic. We keep track of how many spaces are
- // left in the queue here so we know if we've just written the last value and we need
- // to stop reading ahead or if we've just read from a full queue and we need to
- // restart reading ahead
- std::atomic<uint32_t> spaces_available_;
- // Needs to be a queue of shared_ptr and not Future because we set the value of the
- // future after we add it to the queue
- util::SpscQueue<std::shared_ptr<Future<T>>> readahead_queue_;
- };
-
- struct Callback {
- Result<T> operator()(const T& next) {
- if (IsIterationEnd(next)) {
- state_->finished_.store(true);
- return next;
- }
- auto last_available = state_->spaces_available_.fetch_sub(1);
- if (last_available > 1) {
- ARROW_RETURN_NOT_OK(state_->Pump(state_));
- }
- return next;
- }
-
- std::shared_ptr<State> state_;
- };
-
- struct ErrCallback {
- Result<T> operator()(const Status& st) {
- state_->finished_.store(true);
- return st;
- }
-
- std::shared_ptr<State> state_;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \see MakeFromFuture
-template <typename T>
-class FutureFirstGenerator {
- public:
- explicit FutureFirstGenerator(Future<AsyncGenerator<T>> future)
- : state_(std::make_shared<State>(std::move(future))) {}
-
- Future<T> operator()() {
- if (state_->source_) {
- return state_->source_();
- } else {
- auto state = state_;
- return state_->future_.Then([state](const AsyncGenerator<T>& source) {
- state->source_ = source;
- return state->source_();
- });
- }
- }
-
- private:
- struct State {
- explicit State(Future<AsyncGenerator<T>> future) : future_(future), source_() {}
-
- Future<AsyncGenerator<T>> future_;
- AsyncGenerator<T> source_;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief Transforms a Future<AsyncGenerator<T>> into an AsyncGenerator<T>
-/// that waits for the future to complete as part of the first item.
-///
-/// This generator is not async-reentrant (even if the generator yielded by future is)
-///
-/// This generator does not queue
-template <typename T>
-AsyncGenerator<T> MakeFromFuture(Future<AsyncGenerator<T>> future) {
- return FutureFirstGenerator<T>(std::move(future));
-}
-
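A sketch of MakeFromFuture with an already-finished future; the sub-generator comes from MakeVectorGenerator, defined below:

    auto fut = arrow::Future<arrow::AsyncGenerator<int>>::MakeFinished(
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3}));
    auto gen = arrow::MakeFromFuture(std::move(fut));  // first pull resolves fut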
-/// \brief Creates a generator that will pull from the source into a queue. Unlike
-/// MakeReadaheadGenerator this will not pull reentrantly from the source.
-///
-/// The source generator does not need to be async-reentrant
-///
-/// This generator is not async-reentrant (even if the source is)
-///
-/// This generator may queue up to max_readahead additional instances of T
-template <typename T>
-AsyncGenerator<T> MakeSerialReadaheadGenerator(AsyncGenerator<T> source_generator,
- int max_readahead) {
- return SerialReadaheadGenerator<T>(std::move(source_generator), max_readahead);
-}
-
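Wrapping a non-reentrant source with serial readahead, in a sketch:

    auto buffered = arrow::MakeSerialReadaheadGenerator(
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3}), /*max_readahead=*/2);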
-/// \see MakeReadaheadGenerator
-template <typename T>
-class ReadaheadGenerator {
- public:
- ReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
- : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
-
- Future<T> AddMarkFinishedContinuation(Future<T> fut) {
- auto state = state_;
- return fut.Then(
- [state](const T& result) -> Result<T> {
- state->MarkFinishedIfDone(result);
- return result;
- },
- [state](const Status& err) -> Result<T> {
- state->finished.store(true);
- return err;
- });
- }
-
- Future<T> operator()() {
- if (state_->readahead_queue.empty()) {
- // This is the first request, let's pump the underlying queue
- for (int i = 0; i < state_->max_readahead; i++) {
- auto next = state_->source_generator();
- auto next_after_check = AddMarkFinishedContinuation(std::move(next));
- state_->readahead_queue.push(std::move(next_after_check));
- }
- }
- // Pop one and add one
- auto result = state_->readahead_queue.front();
- state_->readahead_queue.pop();
- if (state_->finished.load()) {
- state_->readahead_queue.push(AsyncGeneratorEnd<T>());
- } else {
- auto back_of_queue = state_->source_generator();
- auto back_of_queue_after_check =
- AddMarkFinishedContinuation(std::move(back_of_queue));
- state_->readahead_queue.push(std::move(back_of_queue_after_check));
- }
- return result;
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source_generator, int max_readahead)
- : source_generator(std::move(source_generator)), max_readahead(max_readahead) {
- finished.store(false);
- }
-
- void MarkFinishedIfDone(const T& next_result) {
- if (IsIterationEnd(next_result)) {
- finished.store(true);
- }
- }
-
- AsyncGenerator<T> source_generator;
- int max_readahead;
- std::atomic<bool> finished;
- std::queue<Future<T>> readahead_queue;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief A generator where the producer pushes items on a queue.
-///
-/// No back-pressure is applied, so this generator is mostly useful when
-/// producing the values is neither CPU- nor memory-expensive (e.g. fetching
-/// filesystem metadata).
-///
-/// This generator is not async-reentrant.
-template <typename T>
-class PushGenerator {
- struct State {
- util::Mutex mutex;
- std::deque<Result<T>> result_q;
- util::optional<Future<T>> consumer_fut;
- bool finished = false;
- };
-
- public:
- /// Producer API for PushGenerator
- class Producer {
- public:
- explicit Producer(const std::shared_ptr<State>& state) : weak_state_(state) {}
-
- /// \brief Push a value on the queue
- ///
- /// True is returned if the value was pushed, false if the generator is
- /// already closed or destroyed. If the latter, it is recommended to stop
- /// producing any further values.
- bool Push(Result<T> result) {
- auto state = weak_state_.lock();
- if (!state) {
- // Generator was destroyed
- return false;
- }
- auto lock = state->mutex.Lock();
- if (state->finished) {
- // Closed early
- return false;
- }
- if (state->consumer_fut.has_value()) {
- auto fut = std::move(state->consumer_fut.value());
- state->consumer_fut.reset();
- lock.Unlock(); // unlock before potentially invoking a callback
- fut.MarkFinished(std::move(result));
- } else {
- state->result_q.push_back(std::move(result));
- }
- return true;
- }
-
- /// \brief Tell the consumer we have finished producing
- ///
- /// It is allowed to call this and later call Push() again ("early close").
- /// In this case, calls to Push() after the queue is closed are silently
- /// ignored. This can help implementing non-trivial cancellation cases.
- ///
- /// True is returned on success, false if the generator is already closed
- /// or destroyed.
- bool Close() {
- auto state = weak_state_.lock();
- if (!state) {
- // Generator was destroyed
- return false;
- }
- auto lock = state->mutex.Lock();
- if (state->finished) {
- // Already closed
- return false;
- }
- state->finished = true;
- if (state->consumer_fut.has_value()) {
- auto fut = std::move(state->consumer_fut.value());
- state->consumer_fut.reset();
- lock.Unlock(); // unlock before potentially invoking a callback
- fut.MarkFinished(IterationTraits<T>::End());
- }
- return true;
- }
-
- /// Return whether the generator was closed or destroyed.
- bool is_closed() const {
- auto state = weak_state_.lock();
- if (!state) {
- // Generator was destroyed
- return true;
- }
- auto lock = state->mutex.Lock();
- return state->finished;
- }
-
- private:
- const std::weak_ptr<State> weak_state_;
- };
-
- PushGenerator() : state_(std::make_shared<State>()) {}
-
- /// Read an item from the queue
- Future<T> operator()() {
- auto lock = state_->mutex.Lock();
- assert(!state_->consumer_fut.has_value()); // Non-reentrant
- if (!state_->result_q.empty()) {
- auto fut = Future<T>::MakeFinished(std::move(state_->result_q.front()));
- state_->result_q.pop_front();
- return fut;
- }
- if (state_->finished) {
- return AsyncGeneratorEnd<T>();
- }
- auto fut = Future<T>::Make();
- state_->consumer_fut = fut;
- return fut;
- }
-
- /// \brief Return producer-side interface
- ///
- /// The returned object must be used by the producer to push values on the queue.
- /// Only a single Producer object should be instantiated.
- Producer producer() { return Producer{state_}; }
-
- private:
- const std::shared_ptr<State> state_;
-};
-
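The producer/consumer handoff described above, in a sketch:

    arrow::PushGenerator<int> gen;
    auto producer = gen.producer();
    producer.Push(1);                  // handed to the next gen() call
    producer.Close();                  // subsequent pulls yield the end token
    arrow::Future<int> first = gen();  // already finished with value 1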
-/// \brief Creates a generator that pulls reentrantly from a source
-/// This generator will pull reentrantly from a source, ensuring that max_readahead
-/// requests are active at any given time.
-///
-/// The source generator must be async-reentrant
-///
-/// This generator itself is async-reentrant.
-///
-/// This generator may queue up to max_readahead instances of T
-template <typename T>
-AsyncGenerator<T> MakeReadaheadGenerator(AsyncGenerator<T> source_generator,
- int max_readahead) {
- return ReadaheadGenerator<T>(std::move(source_generator), max_readahead);
-}
-
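A sketch of reentrant readahead; the source here (MakeVectorGenerator, defined just below) is async-reentrant, as this factory requires:

    auto eager = arrow::MakeReadaheadGenerator(
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3}), /*max_readahead=*/4);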
-/// \brief Creates a generator that will yield finished futures from a vector
-///
-/// This generator is async-reentrant
-template <typename T>
-AsyncGenerator<T> MakeVectorGenerator(std::vector<T> vec) {
- struct State {
- explicit State(std::vector<T> vec_) : vec(std::move(vec_)), vec_idx(0) {}
-
- std::vector<T> vec;
- std::atomic<std::size_t> vec_idx;
- };
-
- auto state = std::make_shared<State>(std::move(vec));
- return [state]() {
- auto idx = state->vec_idx.fetch_add(1);
- if (idx >= state->vec.size()) {
- // Eagerly return memory
- state->vec.clear();
- return AsyncGeneratorEnd<T>();
- }
- return Future<T>::MakeFinished(state->vec[idx]);
- };
-}
-
-/// \see MakeMergedGenerator
-template <typename T>
-class MergedGenerator {
- public:
- explicit MergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
- int max_subscriptions)
- : state_(std::make_shared<State>(std::move(source), max_subscriptions)) {}
-
- Future<T> operator()() {
- Future<T> waiting_future;
- std::shared_ptr<DeliveredJob> delivered_job;
- {
- auto guard = state_->mutex.Lock();
- if (!state_->delivered_jobs.empty()) {
- delivered_job = std::move(state_->delivered_jobs.front());
- state_->delivered_jobs.pop_front();
- } else if (state_->finished) {
- return IterationTraits<T>::End();
- } else {
- waiting_future = Future<T>::Make();
- state_->waiting_jobs.push_back(std::make_shared<Future<T>>(waiting_future));
- }
- }
- if (delivered_job) {
- // deliverer will be invalid if outer callback encounters an error and delivers a
- // failed result
- if (delivered_job->deliverer) {
- delivered_job->deliverer().AddCallback(
- InnerCallback{state_, delivered_job->index});
- }
- return std::move(delivered_job->value);
- }
- if (state_->first) {
- state_->first = false;
- for (std::size_t i = 0; i < state_->active_subscriptions.size(); i++) {
- state_->PullSource().AddCallback(OuterCallback{state_, i});
- }
- }
- return waiting_future;
- }
-
- private:
- struct DeliveredJob {
- explicit DeliveredJob(AsyncGenerator<T> deliverer_, Result<T> value_,
- std::size_t index_)
- : deliverer(deliverer_), value(std::move(value_)), index(index_) {}
-
- AsyncGenerator<T> deliverer;
- Result<T> value;
- std::size_t index;
- };
-
- struct State {
- State(AsyncGenerator<AsyncGenerator<T>> source, int max_subscriptions)
- : source(std::move(source)),
- active_subscriptions(max_subscriptions),
- delivered_jobs(),
- waiting_jobs(),
- mutex(),
- first(true),
- source_exhausted(false),
- finished(false),
- num_active_subscriptions(max_subscriptions) {}
-
- Future<AsyncGenerator<T>> PullSource() {
- // Need to guard access to source() so we don't pull sync-reentrantly which
- // is never valid.
- auto lock = mutex.Lock();
- return source();
- }
-
- AsyncGenerator<AsyncGenerator<T>> source;
- // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
- std::vector<AsyncGenerator<T>> active_subscriptions;
- std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
- // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
- // backpressure
- std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
- util::Mutex mutex;
- bool first;
- bool source_exhausted;
- bool finished;
- int num_active_subscriptions;
- };
-
- struct InnerCallback {
- void operator()(const Result<T>& maybe_next) {
- Future<T> sink;
- bool sub_finished = maybe_next.ok() && IsIterationEnd(*maybe_next);
- {
- auto guard = state->mutex.Lock();
- if (state->finished) {
- // We've errored out so just ignore this result and don't keep pumping
- return;
- }
- if (!sub_finished) {
- if (state->waiting_jobs.empty()) {
- state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
- state->active_subscriptions[index], maybe_next, index));
- } else {
- sink = std::move(*state->waiting_jobs.front());
- state->waiting_jobs.pop_front();
- }
- }
- }
- if (sub_finished) {
- state->PullSource().AddCallback(OuterCallback{state, index});
- } else if (sink.is_valid()) {
- sink.MarkFinished(maybe_next);
- if (maybe_next.ok()) {
- state->active_subscriptions[index]().AddCallback(*this);
- }
- }
- }
- std::shared_ptr<State> state;
- std::size_t index;
- };
-
- struct OuterCallback {
- void operator()(const Result<AsyncGenerator<T>>& maybe_next) {
- bool should_purge = false;
- bool should_continue = false;
- Future<T> error_sink;
- {
- auto guard = state->mutex.Lock();
- if (!maybe_next.ok() || IsIterationEnd(*maybe_next)) {
- state->source_exhausted = true;
- if (!maybe_next.ok() || --state->num_active_subscriptions == 0) {
- state->finished = true;
- should_purge = true;
- }
- if (!maybe_next.ok()) {
- if (state->waiting_jobs.empty()) {
- state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
- AsyncGenerator<T>(), maybe_next.status(), index));
- } else {
- error_sink = std::move(*state->waiting_jobs.front());
- state->waiting_jobs.pop_front();
- }
- }
- } else {
- state->active_subscriptions[index] = *maybe_next;
- should_continue = true;
- }
- }
- if (error_sink.is_valid()) {
- error_sink.MarkFinished(maybe_next.status());
- }
- if (should_continue) {
- (*maybe_next)().AddCallback(InnerCallback{state, index});
- } else if (should_purge) {
- // At this point state->finished has been marked true so no one else
- // will be interacting with waiting_jobs and we can iterate outside lock
- while (!state->waiting_jobs.empty()) {
- state->waiting_jobs.front()->MarkFinished(IterationTraits<T>::End());
- state->waiting_jobs.pop_front();
- }
- }
- }
- std::shared_ptr<State> state;
- std::size_t index;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief Creates a generator that takes in a stream of generators and pulls from up to
-/// max_subscriptions at a time
-///
-/// Note: This may deliver items out of sequence. For example, items from the third
-/// AsyncGenerator generated by the source may be emitted before some items from the first
-/// AsyncGenerator generated by the source.
-///
-/// This generator will pull from source async-reentrantly unless max_subscriptions is 1
-/// This generator will not pull from the individual subscriptions reentrantly. Add
-/// readahead to the individual subscriptions if that is desired.
-/// This generator is async-reentrant
-///
-/// This generator may queue up to max_subscriptions instances of T
-template <typename T>
-AsyncGenerator<T> MakeMergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
- int max_subscriptions) {
- return MergedGenerator<T>(std::move(source), max_subscriptions);
-}
-
-/// \brief Creates a generator that takes in a stream of generators and pulls from each
-/// one in sequence.
-///
-/// This generator is async-reentrant but will never pull from source reentrantly and
-/// will never pull from any subscription reentrantly.
-///
-/// This generator may queue 1 instance of T
-///
-/// TODO: Could potentially make a bespoke implementation instead of MergedGenerator that
-/// forwards async-reentrant requests instead of buffering them (which is what
-/// MergedGenerator does)
-template <typename T>
-AsyncGenerator<T> MakeConcatenatedGenerator(AsyncGenerator<AsyncGenerator<T>> source) {
- return MergedGenerator<T>(std::move(source), 1);
-}
-
-template <typename T>
-struct Enumerated {
- T value;
- int index;
- bool last;
-};
-
-template <typename T>
-struct IterationTraits<Enumerated<T>> {
- static Enumerated<T> End() { return Enumerated<T>{IterationEnd<T>(), -1, false}; }
- static bool IsEnd(const Enumerated<T>& val) { return val.index < 0; }
-};
-
-/// \see MakeEnumeratedGenerator
-template <typename T>
-class EnumeratingGenerator {
- public:
- EnumeratingGenerator(AsyncGenerator<T> source, T initial_value)
- : state_(std::make_shared<State>(std::move(source), std::move(initial_value))) {}
-
- Future<Enumerated<T>> operator()() {
- if (state_->finished) {
- return AsyncGeneratorEnd<Enumerated<T>>();
- } else {
- auto state = state_;
- return state->source().Then([state](const T& next) {
- auto finished = IsIterationEnd<T>(next);
- auto prev = Enumerated<T>{state->prev_value, state->prev_index, finished};
- state->prev_value = next;
- state->prev_index++;
- state->finished = finished;
- return prev;
- });
- }
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source, T initial_value)
- : source(std::move(source)), prev_value(std::move(initial_value)), prev_index(0) {
- finished = IsIterationEnd<T>(prev_value);
- }
-
- AsyncGenerator<T> source;
- T prev_value;
- int prev_index;
- bool finished;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// Wraps items from a source generator with positional information
-///
-/// When used with MakeMergedGenerator and MakeSequencingGenerator this allows items to be
-/// processed in a "first-available" fashion and later resequenced which can reduce the
-/// impact of sources with erratic performance (e.g. a filesystem where some items may
-/// take longer to read than others).
-///
-/// TODO(ARROW-12371) Would require this generator to be async-reentrant
-///
-/// \see MakeSequencingGenerator for an example of putting items back in order
-///
-/// This generator is not async-reentrant
-///
-/// This generator buffers one item (so it knows which item is the last item)
-template <typename T>
-AsyncGenerator<Enumerated<T>> MakeEnumeratedGenerator(AsyncGenerator<T> source) {
- return FutureFirstGenerator<Enumerated<T>>(
- source().Then([source](const T& initial_value) -> AsyncGenerator<Enumerated<T>> {
- return EnumeratingGenerator<T>(std::move(source), initial_value);
- }));
-}
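-
-// Example (sketch): tag each item with its position so that, after an unordered
-// merge, MakeSequencingGenerator can restore the original order by index:
-//
-//   AsyncGenerator<Enumerated<T>> enumerated = MakeEnumeratedGenerator(std::move(source));
-//   // each item carries {value, index, last}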
-
-/// \see MakeTransferredGenerator
-template <typename T>
-class TransferringGenerator {
- public:
- explicit TransferringGenerator(AsyncGenerator<T> source, internal::Executor* executor)
- : source_(std::move(source)), executor_(executor) {}
-
- Future<T> operator()() { return executor_->Transfer(source_()); }
-
- private:
- AsyncGenerator<T> source_;
- internal::Executor* executor_;
-};
-
-/// \brief Transfers a future to an underlying executor.
-///
-/// Continuations run on the returned future will be run on the given executor
-/// if they cannot be run synchronously.
-///
-/// This is often needed to move computation off I/O threads or other external
-/// completion sources and back on to the CPU executor so the I/O thread can
-/// stay busy and focused on I/O
-///
-/// Keep in mind that continuations called on an already completed future will
-/// always be run synchronously and so no transfer will happen in that case.
-///
-/// This generator is async reentrant if the source is
-///
-/// This generator will not queue
-template <typename T>
-AsyncGenerator<T> MakeTransferredGenerator(AsyncGenerator<T> source,
- internal::Executor* executor) {
- return TransferringGenerator<T>(std::move(source), executor);
-}
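-
-// Example (sketch; `io_gen` is illustrative): move continuations off an I/O thread
-// and onto the CPU pool (GetCpuThreadPool is declared in arrow/util/thread_pool.h):
-//
-//   auto on_cpu = MakeTransferredGenerator(std::move(io_gen),
-//                                          internal::GetCpuThreadPool());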
-
-/// \see MakeBackgroundGenerator
-template <typename T>
-class BackgroundGenerator {
- public:
- explicit BackgroundGenerator(Iterator<T> it, internal::Executor* io_executor, int max_q,
- int q_restart)
- : state_(std::make_shared<State>(io_executor, std::move(it), max_q, q_restart)),
- cleanup_(std::make_shared<Cleanup>(state_.get())) {}
-
- Future<T> operator()() {
- auto guard = state_->mutex.Lock();
- Future<T> waiting_future;
- if (state_->queue.empty()) {
- if (state_->finished) {
- return AsyncGeneratorEnd<T>();
- } else {
- waiting_future = Future<T>::Make();
- state_->waiting_future = waiting_future;
- }
- } else {
- auto next = Future<T>::MakeFinished(std::move(state_->queue.front()));
- state_->queue.pop();
- if (state_->NeedsRestart()) {
- return state_->RestartTask(state_, std::move(guard), std::move(next));
- }
- return next;
- }
- // This should only trigger the very first time this method is called
- if (state_->NeedsRestart()) {
- return state_->RestartTask(state_, std::move(guard), std::move(waiting_future));
- }
- return waiting_future;
- }
-
- protected:
- static constexpr uint64_t kUnlikelyThreadId{std::numeric_limits<uint64_t>::max()};
-
- struct State {
- State(internal::Executor* io_executor, Iterator<T> it, int max_q, int q_restart)
- : io_executor(io_executor),
- max_q(max_q),
- q_restart(q_restart),
- it(std::move(it)),
- reading(false),
- finished(false),
- should_shutdown(false) {}
-
- void ClearQueue() {
- while (!queue.empty()) {
- queue.pop();
- }
- }
-
- bool TaskIsRunning() const { return task_finished.is_valid(); }
-
- bool NeedsRestart() const {
- return !finished && !reading && static_cast<int>(queue.size()) <= q_restart;
- }
-
- void DoRestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard) {
- // If we get here we are actually going to start a new task so let's create a
- // task_finished future for it
- state->task_finished = Future<>::Make();
- state->reading = true;
- auto spawn_status = io_executor->Spawn(
- [state]() { BackgroundGenerator::WorkerTask(std::move(state)); });
- if (!spawn_status.ok()) {
- // If we can't spawn a new task then send an error to the consumer (either via a
- // waiting future or the queue) and mark ourselves finished
- state->finished = true;
- state->task_finished = Future<>();
- if (waiting_future.has_value()) {
- auto to_deliver = std::move(waiting_future.value());
- waiting_future.reset();
- guard.Unlock();
- to_deliver.MarkFinished(spawn_status);
- } else {
- ClearQueue();
- queue.push(spawn_status);
- }
- }
- }
-
- Future<T> RestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard,
- Future<T> next) {
- if (TaskIsRunning()) {
- // If the task is still cleaning up we need to wait for it to finish before
- // restarting. We also want to block the consumer until we've restarted the
- // reader to avoid multiple restarts
- return task_finished.Then([state, next]() {
- // This may appear dangerous (recursive mutex) but we should be guaranteed the
- // outer guard has been released by this point. We know...
- // * task_finished is not already finished (it would be invalid in that case)
- // * task_finished will not be marked complete until we've given up the mutex
- auto guard_ = state->mutex.Lock();
- state->DoRestartTask(state, std::move(guard_));
- return next;
- });
- }
- // Otherwise we can restart immediately
- DoRestartTask(std::move(state), std::move(guard));
- return next;
- }
-
- internal::Executor* io_executor;
- const int max_q;
- const int q_restart;
- Iterator<T> it;
- std::atomic<uint64_t> worker_thread_id{kUnlikelyThreadId};
-
- // If true, the task is actively pumping items into the queue and does not need a
- // restart
- bool reading;
- // Set to true when a terminal item arrives
- bool finished;
- // Signal to the background task to end early because consumers have given up on it
- bool should_shutdown;
- // If the queue is empty, the consumer will create a waiting future and wait for it
- std::queue<Result<T>> queue;
- util::optional<Future<T>> waiting_future;
- // Every background task is given a future to complete when it is entirely finished
- // processing and ready for the next task to start or for State to be destroyed
- Future<> task_finished;
- util::Mutex mutex;
- };
-
- // Cleanup task that will be run when all consumer references to the generator are lost
- struct Cleanup {
- explicit Cleanup(State* state) : state(state) {}
- ~Cleanup() {
- /// TODO: Once ARROW-13109 is available then we can force consumers to spawn and
- /// there is no need to perform this check.
- ///
- /// It's a deadlock if we enter cleanup from the worker thread, but that can
- /// happen if the consumer doesn't transfer away
- assert(state->worker_thread_id.load() != ::arrow::internal::GetThreadId());
- Future<> finish_fut;
- {
- auto lock = state->mutex.Lock();
- if (!state->TaskIsRunning()) {
- return;
- }
- // Signal the current task to stop and wait for it to finish
- state->should_shutdown = true;
- finish_fut = state->task_finished;
- }
- // Using future as a condition variable here
- Status st = finish_fut.status();
- ARROW_UNUSED(st);
- }
- State* state;
- };
-
- static void WorkerTask(std::shared_ptr<State> state) {
- state->worker_thread_id.store(::arrow::internal::GetThreadId());
- // We need to capture the state to read while outside the mutex
- bool reading = true;
- while (reading) {
- auto next = state->it.Next();
- // Need to capture state->waiting_future inside the mutex to mark finished outside
- Future<T> waiting_future;
- {
- auto guard = state->mutex.Lock();
-
- if (state->should_shutdown) {
- state->finished = true;
- break;
- }
-
- if (!next.ok() || IsIterationEnd<T>(*next)) {
- // Terminal item. Mark finished to true, send this last item, and quit
- state->finished = true;
- if (!next.ok()) {
- state->ClearQueue();
- }
- }
- // At this point we are going to send an item. Either we will add it to the
- // queue or deliver it to a waiting future.
- if (state->waiting_future.has_value()) {
- waiting_future = std::move(state->waiting_future.value());
- state->waiting_future.reset();
- } else {
- state->queue.push(std::move(next));
- // We just filled up the queue so it is time to quit. We may need to notify
- // a cleanup task so we transition to Quitting
- if (static_cast<int>(state->queue.size()) >= state->max_q) {
- state->reading = false;
- }
- }
- reading = state->reading && !state->finished;
- }
- // This should happen outside the mutex. Presumably there is a
- // transferring generator on the other end that will quickly transfer any
- // callbacks off of this thread so we can continue looping. Still, best not to
- // rely on that
- if (waiting_future.is_valid()) {
- waiting_future.MarkFinished(next);
- }
- }
- // Once we've sent our last item we can notify any waiters that we are done and so
- // either state can be cleaned up or a new background task can be started
- Future<> task_finished;
- {
- auto guard = state->mutex.Lock();
- // After we give up the mutex state can be safely deleted. We will no longer
- // reference it. We can safely transition to idle now.
- task_finished = state->task_finished;
- state->task_finished = Future<>();
- state->worker_thread_id.store(kUnlikelyThreadId);
- }
- task_finished.MarkFinished();
- }
-
- std::shared_ptr<State> state_;
- // state_ is held by both the generator and the background thread so it won't be cleaned
- // up when all consumer references are relinquished. cleanup_ is only held by the
- // generator so it will be destructed when the last consumer reference is gone. We use
- // this to cleanup / stop the background generator in case the consuming end stops
- // listening (e.g. due to a downstream error)
- std::shared_ptr<Cleanup> cleanup_;
-};
-
-constexpr int kDefaultBackgroundMaxQ = 32;
-constexpr int kDefaultBackgroundQRestart = 16;
-
-/// \brief Creates an AsyncGenerator<T> by iterating over an Iterator<T> on a background
-/// thread
-///
-/// The parameters max_q and q_restart control queue size and background thread task
-/// management. If the background task is fast you typically don't want it creating a
-/// thread task for every item. Instead the background thread will run until it fills
-/// up a readahead queue.
-///
-/// Once the queue has filled up the background thread task will terminate (allowing other
-/// I/O tasks to use the thread). Once the queue has been drained enough (specified by
-/// q_restart) then the background thread task will be restarted. If q_restart is too low
-/// then you may exhaust the queue waiting for the background thread task to start running
-/// again. If it is too high then it will be constantly stopping and restarting the
-/// background queue task.
-///
-/// The "background thread" is a logical thread and will run as tasks on the io_executor.
-/// This thread may stop and start when the queue fills up but there will only be one
-/// active background thread task at any given time. You MUST transfer away from this
-/// background generator. Otherwise there could be a race condition if a callback on the
-/// background thread deletes the last consumer reference to the background generator. You
-/// can transfer onto the same executor as the background thread; it is only necessary to
-/// create a new thread task, not to switch executors.
-///
-/// This generator is not async-reentrant
-///
-/// This generator will queue up to max_q blocks
-template <typename T>
-static Result<AsyncGenerator<T>> MakeBackgroundGenerator(
- Iterator<T> iterator, internal::Executor* io_executor,
- int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart) {
- if (max_q < q_restart) {
- return Status::Invalid("max_q must be >= q_restart");
- }
- return BackgroundGenerator<T>(std::move(iterator), io_executor, max_q, q_restart);
-}
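-
-// Example (sketch; `items` is illustrative): adapt a blocking Iterator<T> into an
-// AsyncGenerator<T> pumped on a dedicated I/O thread, then transfer downstream
-// callbacks away as required above (MakeVectorIterator is from arrow/util/iterator.h):
-//
-//   ARROW_ASSIGN_OR_RAISE(auto io_pool, internal::ThreadPool::Make(1));
-//   ARROW_ASSIGN_OR_RAISE(auto bg_gen,
-//                         MakeBackgroundGenerator(MakeVectorIterator(std::move(items)),
-//                                                 io_pool.get()));
-//   auto gen = MakeTransferredGenerator(std::move(bg_gen), internal::GetCpuThreadPool());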
-
-/// \see MakeGeneratorIterator
-template <typename T>
-class GeneratorIterator {
- public:
- explicit GeneratorIterator(AsyncGenerator<T> source) : source_(std::move(source)) {}
-
- Result<T> Next() { return source_().result(); }
-
- private:
- AsyncGenerator<T> source_;
-};
-
-/// \brief Converts an AsyncGenerator<T> to an Iterator<T> by blocking until each future
-/// is finished
-template <typename T>
-Iterator<T> MakeGeneratorIterator(AsyncGenerator<T> source) {
- return Iterator<T>(GeneratorIterator<T>(std::move(source)));
-}
-
-/// \brief Adds readahead to an iterator using a background thread.
-///
-/// Under the hood this is converting the iterator to a generator using
-/// MakeBackgroundGenerator, adding readahead to the converted generator with
-/// MakeReadaheadGenerator, and then converting back to an iterator using
-/// MakeGeneratorIterator.
-template <typename T>
-Result<Iterator<T>> MakeReadaheadIterator(Iterator<T> it, int readahead_queue_size) {
- ARROW_ASSIGN_OR_RAISE(auto io_executor, internal::ThreadPool::Make(1));
- auto max_q = readahead_queue_size;
- auto q_restart = std::max(1, max_q / 2);
- ARROW_ASSIGN_OR_RAISE(
- auto background_generator,
- MakeBackgroundGenerator(std::move(it), io_executor.get(), max_q, q_restart));
- // Capture io_executor to keep it alive as long as owned_bg_generator is still
- // referenced
- AsyncGenerator<T> owned_bg_generator = [io_executor, background_generator]() {
- return background_generator();
- };
- return MakeGeneratorIterator(std::move(owned_bg_generator));
-}
-
-/// \brief Make a generator that returns a single pre-generated future
-///
-/// This generator is async-reentrant.
-template <typename T>
-std::function<Future<T>()> MakeSingleFutureGenerator(Future<T> future) {
- assert(future.is_valid());
- auto state = std::make_shared<Future<T>>(std::move(future));
- return [state]() -> Future<T> {
- auto fut = std::move(*state);
- if (fut.is_valid()) {
- return fut;
- } else {
- return AsyncGeneratorEnd<T>();
- }
- };
-}
-
-/// \brief Make a generator that immediately ends.
-///
-/// This generator is async-reentrant.
-template <typename T>
-std::function<Future<T>()> MakeEmptyGenerator() {
- return []() -> Future<T> { return AsyncGeneratorEnd<T>(); };
-}
-
-/// \brief Make a generator that always fails with a given error
-///
-/// This generator is async-reentrant.
-template <typename T>
-AsyncGenerator<T> MakeFailingGenerator(Status st) {
- assert(!st.ok());
- auto state = std::make_shared<Status>(std::move(st));
- return [state]() -> Future<T> {
- auto st = std::move(*state);
- if (!st.ok()) {
- return std::move(st);
- } else {
- return AsyncGeneratorEnd<T>();
- }
- };
-}
-
-/// \brief Make a generator that always fails with a given error
-///
-/// This overload allows inferring the return type from the argument.
-template <typename T>
-AsyncGenerator<T> MakeFailingGenerator(const Result<T>& result) {
- return MakeFailingGenerator<T>(result.status());
-}
-
-/// \brief Prepends initial_values onto a generator
-///
-/// This generator is async-reentrant but will buffer requests and will not
-/// pull from following_values async-reentrantly.
-template <typename T>
-AsyncGenerator<T> MakeGeneratorStartsWith(std::vector<T> initial_values,
- AsyncGenerator<T> following_values) {
- auto initial_values_vec_gen = MakeVectorGenerator(std::move(initial_values));
- auto gen_gen = MakeVectorGenerator<AsyncGenerator<T>>(
- {std::move(initial_values_vec_gen), std::move(following_values)});
- return MakeConcatenatedGenerator(std::move(gen_gen));
-}
-
-template <typename T>
-struct CancellableGenerator {
- Future<T> operator()() {
- if (stop_token.IsStopRequested()) {
- return stop_token.Poll();
- }
- return source();
- }
-
- AsyncGenerator<T> source;
- StopToken stop_token;
-};
-
-/// \brief Allows an async generator to be cancelled
-///
-/// This generator is async-reentrant
-template <typename T>
-AsyncGenerator<T> MakeCancellable(AsyncGenerator<T> source, StopToken stop_token) {
- return CancellableGenerator<T>{std::move(source), std::move(stop_token)};
-}
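-
-// Example (a sketch assuming StopSource from arrow/util/cancel.h):
-//
-//   StopSource stop_source;
-//   auto cancellable = MakeCancellable(std::move(source), stop_source.token());
-//   // later, from any thread:
-//   stop_source.RequestStop();  // subsequent pulls will fail with a Cancelled status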
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <queue>
+
+#include "arrow/util/functional.h"
+#include "arrow/util/future.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/mutex.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/queue.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+
+// The methods in this file create, modify, and utilize AsyncGenerator which is an
+// iterator of futures. This allows an asynchronous source (like file input) to be run
+// through a pipeline in the same way that iterators can be used to create pipelined
+// workflows.
+//
+// In order to support pipeline parallelism we introduce the concept of asynchronous
+// reentrancy. This is different than synchronous reentrancy. With synchronous code a
+// function is reentrant if the function can be called again while a previous call to that
+// function is still running. Unless otherwise specified none of these generators are
+// synchronously reentrant. Care should be taken to avoid calling them in such a way (and
+// the utilities Visit/Collect/Await take care to do this).
+//
+// Asynchronous reentrancy on the other hand means the function is called again before the
+// future returned by the function is marked finished (but after the call to get the
+// future returns). Some of these generators are async-reentrant while others (e.g.
+// those that depend on ordered processing like decompression) are not. Read the MakeXYZ
+// function comments to determine which generators support async reentrancy.
+//
+// Note: Generators that are not asynchronously reentrant can still support readahead
+// (\see MakeSerialReadaheadGenerator).
+//
+// Readahead operators, and some other operators, may introduce queueing. Any operators
+// that introduce buffering should detail the amount of buffering they introduce in their
+// MakeXYZ function comments.
+template <typename T>
+using AsyncGenerator = std::function<Future<T>()>;
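+
+// A hand-written generator is just a callable returning futures. A minimal sketch
+// (illustrative only, not part of this header); the end of iteration is signalled
+// with IterationTraits<T>::End(), which for a shared_ptr element type is simply a
+// null pointer:
+//
+//   auto i = std::make_shared<int>(0);
+//   AsyncGenerator<std::shared_ptr<int>> gen = [i]() -> Future<std::shared_ptr<int>> {
+//     if (*i == 3) return AsyncGeneratorEnd<std::shared_ptr<int>>();  // end token
+//     return Future<std::shared_ptr<int>>::MakeFinished(std::make_shared<int>((*i)++));
+//   };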
+
+template <typename T>
+struct IterationTraits<AsyncGenerator<T>> {
+ /// \brief By default, when iterating through a sequence of AsyncGenerator<T>,
+ /// an empty function indicates the end of iteration.
+ static AsyncGenerator<T> End() { return AsyncGenerator<T>(); }
+
+ static bool IsEnd(const AsyncGenerator<T>& val) { return !val; }
+};
+
+template <typename T>
+Future<T> AsyncGeneratorEnd() {
+ return Future<T>::MakeFinished(IterationTraits<T>::End());
+}
+
+/// \brief Iterates through a generator of futures, visiting the result of each one and
+/// returning a future that completes when all have been visited
+template <typename T, typename Visitor>
+Future<> VisitAsyncGenerator(AsyncGenerator<T> generator, Visitor visitor) {
+ struct LoopBody {
+ struct Callback {
+ Result<ControlFlow<>> operator()(const T& next) {
+ if (IsIterationEnd(next)) {
+ return Break();
+ } else {
+ auto visited = visitor(next);
+ if (visited.ok()) {
+ return Continue();
+ } else {
+ return visited;
+ }
+ }
+ }
+
+ Visitor visitor;
+ };
+
+ Future<ControlFlow<>> operator()() {
+ Callback callback{visitor};
+ auto next = generator();
+ return next.Then(std::move(callback));
+ }
+
+ AsyncGenerator<T> generator;
+ Visitor visitor;
+ };
+
+ return Loop(LoopBody{std::move(generator), std::move(visitor)});
+}
+
+/// \brief Waits for an async generator to complete, discarding results.
+template <typename T>
+Future<> DiscardAllFromAsyncGenerator(AsyncGenerator<T> generator) {
+ std::function<Status(T)> visitor = [](const T&) { return Status::OK(); };
+ return VisitAsyncGenerator(generator, visitor);
+}
+
+/// \brief Collects the results of an async generator into a vector
+template <typename T>
+Future<std::vector<T>> CollectAsyncGenerator(AsyncGenerator<T> generator) {
+ auto vec = std::make_shared<std::vector<T>>();
+ struct LoopBody {
+ Future<ControlFlow<std::vector<T>>> operator()() {
+ auto next = generator_();
+ auto vec = vec_;
+ return next.Then([vec](const T& result) -> Result<ControlFlow<std::vector<T>>> {
+ if (IsIterationEnd(result)) {
+ return Break(*vec);
+ } else {
+ vec->push_back(result);
+ return Continue();
+ }
+ });
+ }
+ AsyncGenerator<T> generator_;
+ std::shared_ptr<std::vector<T>> vec_;
+ };
+ return Loop(LoopBody{std::move(generator), std::move(vec)});
+}
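+
+// Example (a sketch; `gen` stands for any AsyncGenerator<std::shared_ptr<int>>). A
+// generator is single-consumer, so pick one terminal operation per generator:
+//
+//   Future<std::vector<std::shared_ptr<int>>> all = CollectAsyncGenerator(gen);
+//   // or, visiting each item as it arrives:
+//   Future<> done = VisitAsyncGenerator<std::shared_ptr<int>>(
+//       gen, [](const std::shared_ptr<int>&) { return Status::OK(); });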
+
+/// \see MakeMappedGenerator
+template <typename T, typename V>
+class MappingGenerator {
+ public:
+ MappingGenerator(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+ : state_(std::make_shared<State>(std::move(source), std::move(map))) {}
+
+ Future<V> operator()() {
+ auto future = Future<V>::Make();
+ bool should_trigger;
+ {
+ auto guard = state_->mutex.Lock();
+ if (state_->finished) {
+ return AsyncGeneratorEnd<V>();
+ }
+ should_trigger = state_->waiting_jobs.empty();
+ state_->waiting_jobs.push_back(future);
+ }
+ if (should_trigger) {
+ state_->source().AddCallback(Callback{state_});
+ }
+ return future;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+ : source(std::move(source)),
+ map(std::move(map)),
+ waiting_jobs(),
+ mutex(),
+ finished(false) {}
+
+ void Purge() {
+ // This might be called by an original callback (if the source iterator fails or
+ // ends) or by a mapped callback (if the map function fails or ends prematurely).
+ // Either way it should only be called once and after finished is set so there is no
+ // need to guard access to `waiting_jobs`.
+ while (!waiting_jobs.empty()) {
+ waiting_jobs.front().MarkFinished(IterationTraits<V>::End());
+ waiting_jobs.pop_front();
+ }
+ }
+
+ AsyncGenerator<T> source;
+ std::function<Future<V>(const T&)> map;
+ std::deque<Future<V>> waiting_jobs;
+ util::Mutex mutex;
+ bool finished;
+ };
+
+ struct Callback;
+
+ struct MappedCallback {
+ void operator()(const Result<V>& maybe_next) {
+ bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+ bool should_purge = false;
+ if (end) {
+ {
+ auto guard = state->mutex.Lock();
+ should_purge = !state->finished;
+ state->finished = true;
+ }
+ }
+ sink.MarkFinished(maybe_next);
+ if (should_purge) {
+ state->Purge();
+ }
+ }
+ std::shared_ptr<State> state;
+ Future<V> sink;
+ };
+
+ struct Callback {
+ void operator()(const Result<T>& maybe_next) {
+ Future<V> sink;
+ bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+ bool should_purge = false;
+ bool should_trigger;
+ {
+ auto guard = state->mutex.Lock();
+ if (end) {
+ should_purge = !state->finished;
+ state->finished = true;
+ }
+ sink = state->waiting_jobs.front();
+ state->waiting_jobs.pop_front();
+ should_trigger = !end && !state->waiting_jobs.empty();
+ }
+ if (should_purge) {
+ state->Purge();
+ }
+ if (should_trigger) {
+ state->source().AddCallback(Callback{state});
+ }
+ if (maybe_next.ok()) {
+ const T& val = maybe_next.ValueUnsafe();
+ if (IsIterationEnd(val)) {
+ sink.MarkFinished(IterationTraits<V>::End());
+ } else {
+ Future<V> mapped_fut = state->map(val);
+ mapped_fut.AddCallback(MappedCallback{std::move(state), std::move(sink)});
+ }
+ } else {
+ sink.MarkFinished(maybe_next.status());
+ }
+ }
+
+ std::shared_ptr<State> state;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Creates a generator that will apply the map function to each element of
+/// source. The map function is not called on the end token.
+///
+/// Note: This function makes a copy of `map` for each item
+/// Note: Errors returned from the `map` function will be propagated
+///
+/// If the source generator is async-reentrant then this generator will be also
+template <typename T, typename MapFn,
+ typename Mapped = detail::result_of_t<MapFn(const T&)>,
+ typename V = typename EnsureFuture<Mapped>::type::ValueType>
+AsyncGenerator<V> MakeMappedGenerator(AsyncGenerator<T> source_generator, MapFn map) {
+ struct MapCallback {
+ MapFn map_;
+
+ Future<V> operator()(const T& val) { return ToFuture(map_(val)); }
+ };
+
+ return MappingGenerator<T, V>(std::move(source_generator), MapCallback{std::move(map)});
+}
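+
+// Example (sketch; `source` and the doubling function are illustrative). The map
+// function may return a plain value, a Result, or a Future:
+//
+//   auto doubled = MakeMappedGenerator(
+//       std::move(source),
+//       [](const std::shared_ptr<int>& v) { return std::make_shared<int>(*v * 2); });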
+
+/// \see MakeSequencingGenerator
+template <typename T, typename ComesAfter, typename IsNext>
+class SequencingGenerator {
+ public:
+ SequencingGenerator(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next,
+ T initial_value)
+ : state_(std::make_shared<State>(std::move(source), std::move(compare),
+ std::move(is_next), std::move(initial_value))) {}
+
+ Future<T> operator()() {
+ {
+ auto guard = state_->mutex.Lock();
+ // We can send a result immediately if the top of the queue is either an
+ // error or the next item
+ if (!state_->queue.empty() &&
+ (!state_->queue.top().ok() ||
+ state_->is_next(state_->previous_value, *state_->queue.top()))) {
+ auto result = std::move(state_->queue.top());
+ if (result.ok()) {
+ state_->previous_value = *result;
+ }
+ state_->queue.pop();
+ return Future<T>::MakeFinished(result);
+ }
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ }
+ // The next item is not in the queue so we will need to wait
+ auto new_waiting_fut = Future<T>::Make();
+ state_->waiting_future = new_waiting_fut;
+ guard.Unlock();
+ state_->source().AddCallback(Callback{state_});
+ return new_waiting_fut;
+ }
+ }
+
+ private:
+ struct WrappedComesAfter {
+ bool operator()(const Result<T>& left, const Result<T>& right) {
+ if (!left.ok() || !right.ok()) {
+ // Should never happen
+ return false;
+ }
+ return compare(*left, *right);
+ }
+ ComesAfter compare;
+ };
+
+ struct State {
+ State(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next, T initial_value)
+ : source(std::move(source)),
+ is_next(std::move(is_next)),
+ previous_value(std::move(initial_value)),
+ waiting_future(),
+ queue(WrappedComesAfter{compare}),
+ finished(false),
+ mutex() {}
+
+ AsyncGenerator<T> source;
+ IsNext is_next;
+ T previous_value;
+ Future<T> waiting_future;
+ std::priority_queue<Result<T>, std::vector<Result<T>>, WrappedComesAfter> queue;
+ bool finished;
+ util::Mutex mutex;
+ };
+
+ class Callback {
+ public:
+ explicit Callback(std::shared_ptr<State> state) : state_(std::move(state)) {}
+
+ void operator()(const Result<T> result) {
+ Future<T> to_deliver;
+ bool finished;
+ {
+ auto guard = state_->mutex.Lock();
+ bool ready_to_deliver = false;
+ if (!result.ok()) {
+ // Clear any cached results
+ while (!state_->queue.empty()) {
+ state_->queue.pop();
+ }
+ ready_to_deliver = true;
+ state_->finished = true;
+ } else if (IsIterationEnd<T>(result.ValueUnsafe())) {
+ ready_to_deliver = state_->queue.empty();
+ state_->finished = true;
+ } else {
+ ready_to_deliver = state_->is_next(state_->previous_value, *result);
+ }
+
+ if (ready_to_deliver && state_->waiting_future.is_valid()) {
+ to_deliver = state_->waiting_future;
+ if (result.ok()) {
+ state_->previous_value = *result;
+ }
+ } else {
+ state_->queue.push(result);
+ }
+ // Capture state_->finished so we can access it outside the mutex
+ finished = state_->finished;
+ }
+ // Must deliver result outside of the mutex
+ if (to_deliver.is_valid()) {
+ to_deliver.MarkFinished(result);
+ } else {
+ // Otherwise, if we didn't get the next item (or a terminal item), we
+ // need to keep looking
+ if (!finished) {
+ state_->source().AddCallback(Callback{state_});
+ }
+ }
+ }
+
+ private:
+ const std::shared_ptr<State> state_;
+ };
+
+ const std::shared_ptr<State> state_;
+};
+
+/// \brief Buffers an AsyncGenerator to return values in sequence order; ComesAfter
+/// and IsNext determine the sequence order.
+///
+/// ComesAfter should be a BinaryPredicate that only returns true if `a` comes after `b`.
+///
+/// IsNext should be a BinaryPredicate that returns true, given `a` and `b`, only if
+/// `b` follows immediately after `a`. It should return true given `initial_value` and
+/// `b` if `b` is the first item in the sequence.
+///
+/// This operator will queue unboundedly while waiting for the next item. It is intended
+/// for jittery sources that might scatter an ordered sequence. It is NOT intended to
+/// sort. Using it to try to sort could result in excessive RAM usage. This generator
+/// will queue up to N blocks where N is the max "out of order"ness of the source.
+///
+/// For example, if the source is 1,6,2,5,4,3 it will queue 3 blocks because 3 is 3
+/// blocks beyond where it belongs.
+///
+/// This generator is not async-reentrant but it consists only of a simple log(n)
+/// insertion into a priority queue.
+template <typename T, typename ComesAfter, typename IsNext>
+AsyncGenerator<T> MakeSequencingGenerator(AsyncGenerator<T> source_generator,
+ ComesAfter compare, IsNext is_next,
+ T initial_value) {
+ return SequencingGenerator<T, ComesAfter, IsNext>(
+ std::move(source_generator), std::move(compare), std::move(is_next),
+ std::move(initial_value));
+}
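+
+// Example (sketch): re-ordering an out-of-order stream of Enumerated<T> items (see
+// Enumerated further below) by index; the predicates and initial value are
+// illustrative:
+//
+//   auto in_order = MakeSequencingGenerator(
+//       std::move(scattered),
+//       /*compare=*/[](const Enumerated<T>& a, const Enumerated<T>& b) {
+//         return a.index > b.index;
+//       },
+//       /*is_next=*/[](const Enumerated<T>& prev, const Enumerated<T>& next) {
+//         return next.index == prev.index + 1;
+//       },
+//       /*initial_value=*/Enumerated<T>{{}, -1, false});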
+
+/// \see MakeTransformedGenerator
+template <typename T, typename V>
+class TransformingGenerator {
+ // The transforming generator state will be referenced as an async generator but will
+ // also be referenced via callback to various futures. If the async generator owner
+ // moves it around we need the state to be consistent for future callbacks.
+ struct TransformingGeneratorState
+ : std::enable_shared_from_this<TransformingGeneratorState> {
+ TransformingGeneratorState(AsyncGenerator<T> generator, Transformer<T, V> transformer)
+ : generator_(std::move(generator)),
+ transformer_(std::move(transformer)),
+ last_value_(),
+ finished_() {}
+
+ Future<V> operator()() {
+ while (true) {
+ auto maybe_next_result = Pump();
+ if (!maybe_next_result.ok()) {
+ return Future<V>::MakeFinished(maybe_next_result.status());
+ }
+ auto maybe_next = std::move(maybe_next_result).ValueUnsafe();
+ if (maybe_next.has_value()) {
+ return Future<V>::MakeFinished(*std::move(maybe_next));
+ }
+
+ auto next_fut = generator_();
+ // If finished already, process results immediately inside the loop to avoid
+ // stack overflow
+ if (next_fut.is_finished()) {
+ auto next_result = next_fut.result();
+ if (next_result.ok()) {
+ last_value_ = *next_result;
+ } else {
+ return Future<V>::MakeFinished(next_result.status());
+ }
+ // Otherwise, if not finished immediately, add callback to process results
+ } else {
+ auto self = this->shared_from_this();
+ return next_fut.Then([self](const T& next_result) {
+ self->last_value_ = next_result;
+ return (*self)();
+ });
+ }
+ }
+ }
+
+ // See comment on TransformingIterator::Pump
+ Result<util::optional<V>> Pump() {
+ if (!finished_ && last_value_.has_value()) {
+ ARROW_ASSIGN_OR_RAISE(TransformFlow<V> next, transformer_(*last_value_));
+ if (next.ReadyForNext()) {
+ if (IsIterationEnd(*last_value_)) {
+ finished_ = true;
+ }
+ last_value_.reset();
+ }
+ if (next.Finished()) {
+ finished_ = true;
+ }
+ if (next.HasValue()) {
+ return next.Value();
+ }
+ }
+ if (finished_) {
+ return IterationTraits<V>::End();
+ }
+ return util::nullopt;
+ }
+
+ AsyncGenerator<T> generator_;
+ Transformer<T, V> transformer_;
+ util::optional<T> last_value_;
+ bool finished_;
+ };
+
+ public:
+ explicit TransformingGenerator(AsyncGenerator<T> generator,
+ Transformer<T, V> transformer)
+ : state_(std::make_shared<TransformingGeneratorState>(std::move(generator),
+ std::move(transformer))) {}
+
+ Future<V> operator()() { return (*state_)(); }
+
+ protected:
+ std::shared_ptr<TransformingGeneratorState> state_;
+};
+
+/// \brief Transforms an async generator using a transformer function returning a new
+/// AsyncGenerator
+///
+/// The transform function here behaves exactly the same as the transform function in
+/// MakeTransformedIterator and you can safely use the same transform function to
+/// transform both synchronous and asynchronous streams.
+///
+/// This generator is not async-reentrant
+///
+/// This generator may queue up to 1 instance of T but will not delay
+template <typename T, typename V>
+AsyncGenerator<V> MakeTransformedGenerator(AsyncGenerator<T> generator,
+ Transformer<T, V> transformer) {
+ return TransformingGenerator<T, V>(generator, transformer);
+}
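+
+// Example (a sketch assuming the TransformYield/TransformFinish helpers declared in
+// arrow/util/iterator.h; the limit of 3 is illustrative): a transformer that passes
+// values through until a limit is reached:
+//
+//   int count = 0;
+//   Transformer<T, T> take_three =
+//       [count](T value) mutable -> Result<TransformFlow<T>> {
+//         if (++count > 3) return TransformFinish<T>();
+//         return TransformYield<T>(std::move(value));
+//       };
+//   auto limited = MakeTransformedGenerator(std::move(source), take_three);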
+
+/// \see MakeSerialReadaheadGenerator
+template <typename T>
+class SerialReadaheadGenerator {
+ public:
+ SerialReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+ : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+ Future<T> operator()() {
+ if (state_->first_) {
+ // Lazy generator, need to wait for the first ask to prime the pump
+ state_->first_ = false;
+ auto next = state_->source_();
+ return next.Then(Callback{state_}, ErrCallback{state_});
+ }
+
+ // This generator is not async-reentrant. We won't be called until the last
+ // future has finished, so we know there is something in the queue
+ auto finished = state_->finished_.load();
+ if (finished && state_->readahead_queue_.IsEmpty()) {
+ return AsyncGeneratorEnd<T>();
+ }
+
+ std::shared_ptr<Future<T>> next;
+ if (!state_->readahead_queue_.Read(next)) {
+ return Status::UnknownError("Could not read from readahead_queue");
+ }
+
+ auto last_available = state_->spaces_available_.fetch_add(1);
+ if (last_available == 0 && !finished) {
+ // Reader idled out, we need to restart it
+ ARROW_RETURN_NOT_OK(state_->Pump(state_));
+ }
+ return *next;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, int max_readahead)
+ : first_(true),
+ source_(std::move(source)),
+ finished_(false),
+ // There is one extra "space" for the in-flight request
+ spaces_available_(max_readahead + 1),
+ // The SPSC queue has size-1 "usable" slots so we need to overallocate 1
+ readahead_queue_(max_readahead + 1) {}
+
+ Status Pump(const std::shared_ptr<State>& self) {
+ // Can't do readahead_queue.write(source().Then(...)) because then the
+ // callback might run immediately and add itself to the queue before this gets
+ // added to the queue, messing up the order.
+ auto next_slot = std::make_shared<Future<T>>();
+ auto written = readahead_queue_.Write(next_slot);
+ if (!written) {
+ return Status::UnknownError("Could not write to readahead_queue");
+ }
+ // If this Pump is being called from a callback it is possible for the consumer to
+ // poll and read from the queue between the Write and this spot where we fill the
+ // value in. However, it is not possible for the consumer to read this value we are
+ // writing. That is because this callback (the callback for future X) must be
+ // finished before future X is marked complete and this source is not pulled
+ // reentrantly so it will not poll for future X+1 until this callback has completed.
+ *next_slot = source_().Then(Callback{self}, ErrCallback{self});
+ return Status::OK();
+ }
+
+ // Only accessed by the consumer end
+ bool first_;
+ // Accessed by both threads
+ AsyncGenerator<T> source_;
+ std::atomic<bool> finished_;
+ // The queue has a size but it is not atomic. We keep track of how many spaces are
+ // left in the queue here so we know if we've just written the last value and we need
+ // to stop reading ahead or if we've just read from a full queue and we need to
+ // restart reading ahead
+ std::atomic<uint32_t> spaces_available_;
+ // Needs to be a queue of shared_ptr and not Future because we set the value of the
+ // future after we add it to the queue
+ util::SpscQueue<std::shared_ptr<Future<T>>> readahead_queue_;
+ };
+
+ struct Callback {
+ Result<T> operator()(const T& next) {
+ if (IsIterationEnd(next)) {
+ state_->finished_.store(true);
+ return next;
+ }
+ auto last_available = state_->spaces_available_.fetch_sub(1);
+ if (last_available > 1) {
+ ARROW_RETURN_NOT_OK(state_->Pump(state_));
+ }
+ return next;
+ }
+
+ std::shared_ptr<State> state_;
+ };
+
+ struct ErrCallback {
+ Result<T> operator()(const Status& st) {
+ state_->finished_.store(true);
+ return st;
+ }
+
+ std::shared_ptr<State> state_;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \see MakeFromFuture
+template <typename T>
+class FutureFirstGenerator {
+ public:
+ explicit FutureFirstGenerator(Future<AsyncGenerator<T>> future)
+ : state_(std::make_shared<State>(std::move(future))) {}
+
+ Future<T> operator()() {
+ if (state_->source_) {
+ return state_->source_();
+ } else {
+ auto state = state_;
+ return state_->future_.Then([state](const AsyncGenerator<T>& source) {
+ state->source_ = source;
+ return state->source_();
+ });
+ }
+ }
+
+ private:
+ struct State {
+ explicit State(Future<AsyncGenerator<T>> future) : future_(future), source_() {}
+
+ Future<AsyncGenerator<T>> future_;
+ AsyncGenerator<T> source_;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Transforms a Future<AsyncGenerator<T>> into an AsyncGenerator<T>
+/// that waits for the future to complete as part of the first item.
+///
+/// This generator is not async-reentrant (even if the generator yielded by future is)
+///
+/// This generator does not queue
+template <typename T>
+AsyncGenerator<T> MakeFromFuture(Future<AsyncGenerator<T>> future) {
+ return FutureFirstGenerator<T>(std::move(future));
+}
+
+/// \brief Creates a generator that will pull from the source into a queue. Unlike
+/// MakeReadaheadGenerator this will not pull reentrantly from the source.
+///
+/// The source generator does not need to be async-reentrant
+///
+/// This generator is not async-reentrant (even if the source is)
+///
+/// This generator may queue up to max_readahead additional instances of T
+template <typename T>
+AsyncGenerator<T> MakeSerialReadaheadGenerator(AsyncGenerator<T> source_generator,
+ int max_readahead) {
+ return SerialReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
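+
+// Example (sketch): keep up to 4 items buffered ahead of a slow consumer without
+// ever issuing overlapping calls into the non-reentrant source:
+//
+//   auto buffered =
+//       MakeSerialReadaheadGenerator(std::move(source), /*max_readahead=*/4);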
+
+/// \see MakeReadaheadGenerator
+template <typename T>
+class ReadaheadGenerator {
+ public:
+ ReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+ : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+ Future<T> AddMarkFinishedContinuation(Future<T> fut) {
+ auto state = state_;
+ return fut.Then(
+ [state](const T& result) -> Result<T> {
+ state->MarkFinishedIfDone(result);
+ return result;
+ },
+ [state](const Status& err) -> Result<T> {
+ state->finished.store(true);
+ return err;
+ });
+ }
+
+ Future<T> operator()() {
+ if (state_->readahead_queue.empty()) {
+ // This is the first request, let's pump the underlying queue
+ for (int i = 0; i < state_->max_readahead; i++) {
+ auto next = state_->source_generator();
+ auto next_after_check = AddMarkFinishedContinuation(std::move(next));
+ state_->readahead_queue.push(std::move(next_after_check));
+ }
+ }
+ // Pop one and add one
+ auto result = state_->readahead_queue.front();
+ state_->readahead_queue.pop();
+ if (state_->finished.load()) {
+ state_->readahead_queue.push(AsyncGeneratorEnd<T>());
+ } else {
+ auto back_of_queue = state_->source_generator();
+ auto back_of_queue_after_check =
+ AddMarkFinishedContinuation(std::move(back_of_queue));
+ state_->readahead_queue.push(std::move(back_of_queue_after_check));
+ }
+ return result;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source_generator, int max_readahead)
+ : source_generator(std::move(source_generator)), max_readahead(max_readahead) {
+ finished.store(false);
+ }
+
+ void MarkFinishedIfDone(const T& next_result) {
+ if (IsIterationEnd(next_result)) {
+ finished.store(true);
+ }
+ }
+
+ AsyncGenerator<T> source_generator;
+ int max_readahead;
+ std::atomic<bool> finished;
+ std::queue<Future<T>> readahead_queue;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief A generator where the producer pushes items on a queue.
+///
+/// No back-pressure is applied, so this generator is mostly useful when
+/// producing the values is neither CPU- nor memory-expensive (e.g. fetching
+/// filesystem metadata).
+///
+/// This generator is not async-reentrant.
+template <typename T>
+class PushGenerator {
+ struct State {
+ util::Mutex mutex;
+ std::deque<Result<T>> result_q;
+ util::optional<Future<T>> consumer_fut;
+ bool finished = false;
+ };
+
+ public:
+ /// Producer API for PushGenerator
+ class Producer {
+ public:
+ explicit Producer(const std::shared_ptr<State>& state) : weak_state_(state) {}
+
+ /// \brief Push a value on the queue
+ ///
+ /// True is returned if the value was pushed, false if the generator is
+ /// already closed or destroyed. If the latter, it is recommended to stop
+ /// producing any further values.
+ bool Push(Result<T> result) {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return false;
+ }
+ auto lock = state->mutex.Lock();
+ if (state->finished) {
+ // Closed early
+ return false;
+ }
+ if (state->consumer_fut.has_value()) {
+ auto fut = std::move(state->consumer_fut.value());
+ state->consumer_fut.reset();
+ lock.Unlock(); // unlock before potentially invoking a callback
+ fut.MarkFinished(std::move(result));
+ } else {
+ state->result_q.push_back(std::move(result));
+ }
+ return true;
+ }
+
+ /// \brief Tell the consumer we have finished producing
+ ///
+ /// It is allowed to call this and later call Push() again ("early close").
+ /// In this case, calls to Push() after the queue is closed are silently
+ /// ignored. This can help when implementing non-trivial cancellation cases.
+ ///
+ /// True is returned on success, false if the generator is already closed
+ /// or destroyed.
+ bool Close() {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return false;
+ }
+ auto lock = state->mutex.Lock();
+ if (state->finished) {
+ // Already closed
+ return false;
+ }
+ state->finished = true;
+ if (state->consumer_fut.has_value()) {
+ auto fut = std::move(state->consumer_fut.value());
+ state->consumer_fut.reset();
+ lock.Unlock(); // unlock before potentially invoking a callback
+ fut.MarkFinished(IterationTraits<T>::End());
+ }
+ return true;
+ }
+
+ /// Return whether the generator was closed or destroyed.
+ bool is_closed() const {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return true;
+ }
+ auto lock = state->mutex.Lock();
+ return state->finished;
+ }
+
+ private:
+ const std::weak_ptr<State> weak_state_;
+ };
+
+ PushGenerator() : state_(std::make_shared<State>()) {}
+
+ /// Read an item from the queue
+ Future<T> operator()() {
+ auto lock = state_->mutex.Lock();
+ assert(!state_->consumer_fut.has_value()); // Non-reentrant
+ if (!state_->result_q.empty()) {
+ auto fut = Future<T>::MakeFinished(std::move(state_->result_q.front()));
+ state_->result_q.pop_front();
+ return fut;
+ }
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ }
+ auto fut = Future<T>::Make();
+ state_->consumer_fut = fut;
+ return fut;
+ }
+
+ /// \brief Return producer-side interface
+ ///
+ /// The returned object must be used by the producer to push values on the queue.
+ /// Only a single Producer object should be instantiated.
+ Producer producer() { return Producer{state_}; }
+
+ private:
+ const std::shared_ptr<State> state_;
+};
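+
+// Example (sketch; values are illustrative): a producer pushes values (or errors)
+// while a consumer reads them as futures:
+//
+//   PushGenerator<std::shared_ptr<int>> gen;
+//   auto producer = gen.producer();
+//   producer.Push(std::make_shared<int>(1));
+//   producer.Push(Status::IOError("..."));  // a failed Result<T> can be pushed too
+//   producer.Close();                       // signal the end of the stream
+//   Future<std::shared_ptr<int>> next = gen();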
+
+/// \brief Creates a generator that pulls reentrantly from a source
+///
+/// This generator will pull reentrantly from a source, ensuring that max_readahead
+/// requests are active at any given time.
+///
+/// The source generator must be async-reentrant
+///
+/// This generator itself is async-reentrant.
+///
+/// This generator may queue up to max_readahead instances of T
+template <typename T>
+AsyncGenerator<T> MakeReadaheadGenerator(AsyncGenerator<T> source_generator,
+ int max_readahead) {
+ return ReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
+
+/// \brief Creates a generator that will yield finished futures from a vector
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeVectorGenerator(std::vector<T> vec) {
+ struct State {
+ explicit State(std::vector<T> vec_) : vec(std::move(vec_)), vec_idx(0) {}
+
+ std::vector<T> vec;
+ std::atomic<std::size_t> vec_idx;
+ };
+
+ auto state = std::make_shared<State>(std::move(vec));
+ return [state]() {
+ auto idx = state->vec_idx.fetch_add(1);
+ if (idx >= state->vec.size()) {
+ // Eagerly return memory
+ state->vec.clear();
+ return AsyncGeneratorEnd<T>();
+ }
+ return Future<T>::MakeFinished(state->vec[idx]);
+ };
+}
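+
+// Example (sketch):
+//
+//   auto gen = MakeVectorGenerator<std::shared_ptr<int>>(
+//       {std::make_shared<int>(1), std::make_shared<int>(2)});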
+
+/// \see MakeMergedGenerator
+template <typename T>
+class MergedGenerator {
+ public:
+ explicit MergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+ int max_subscriptions)
+ : state_(std::make_shared<State>(std::move(source), max_subscriptions)) {}
+
+ Future<T> operator()() {
+ Future<T> waiting_future;
+ std::shared_ptr<DeliveredJob> delivered_job;
+ {
+ auto guard = state_->mutex.Lock();
+ if (!state_->delivered_jobs.empty()) {
+ delivered_job = std::move(state_->delivered_jobs.front());
+ state_->delivered_jobs.pop_front();
+ } else if (state_->finished) {
+ return IterationTraits<T>::End();
+ } else {
+ waiting_future = Future<T>::Make();
+ state_->waiting_jobs.push_back(std::make_shared<Future<T>>(waiting_future));
+ }
+ }
+ if (delivered_job) {
+ // deliverer will be invalid if outer callback encounters an error and delivers a
+ // failed result
+ if (delivered_job->deliverer) {
+ delivered_job->deliverer().AddCallback(
+ InnerCallback{state_, delivered_job->index});
+ }
+ return std::move(delivered_job->value);
+ }
+ if (state_->first) {
+ state_->first = false;
+ for (std::size_t i = 0; i < state_->active_subscriptions.size(); i++) {
+ state_->PullSource().AddCallback(OuterCallback{state_, i});
+ }
+ }
+ return waiting_future;
+ }
+
+ private:
+ struct DeliveredJob {
+ explicit DeliveredJob(AsyncGenerator<T> deliverer_, Result<T> value_,
+ std::size_t index_)
+ : deliverer(deliverer_), value(std::move(value_)), index(index_) {}
+
+ AsyncGenerator<T> deliverer;
+ Result<T> value;
+ std::size_t index;
+ };
+
+ struct State {
+ State(AsyncGenerator<AsyncGenerator<T>> source, int max_subscriptions)
+ : source(std::move(source)),
+ active_subscriptions(max_subscriptions),
+ delivered_jobs(),
+ waiting_jobs(),
+ mutex(),
+ first(true),
+ source_exhausted(false),
+ finished(false),
+ num_active_subscriptions(max_subscriptions) {}
+
+ Future<AsyncGenerator<T>> PullSource() {
+ // Need to guard access to source() so we don't pull sync-reentrantly which
+ // is never valid.
+ auto lock = mutex.Lock();
+ return source();
+ }
+
+ AsyncGenerator<AsyncGenerator<T>> source;
+ // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
+ std::vector<AsyncGenerator<T>> active_subscriptions;
+ std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
+ // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
+ // backpressure
+ std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
+ util::Mutex mutex;
+ bool first;
+ bool source_exhausted;
+ bool finished;
+ int num_active_subscriptions;
+ };
+
+ struct InnerCallback {
+ void operator()(const Result<T>& maybe_next) {
+ Future<T> sink;
+ bool sub_finished = maybe_next.ok() && IsIterationEnd(*maybe_next);
+ {
+ auto guard = state->mutex.Lock();
+ if (state->finished) {
+ // We've errored out so just ignore this result and don't keep pumping
+ return;
+ }
+ if (!sub_finished) {
+ if (state->waiting_jobs.empty()) {
+ state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
+ state->active_subscriptions[index], maybe_next, index));
+ } else {
+ sink = std::move(*state->waiting_jobs.front());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ }
+ if (sub_finished) {
+ state->PullSource().AddCallback(OuterCallback{state, index});
+ } else if (sink.is_valid()) {
+ sink.MarkFinished(maybe_next);
+ if (maybe_next.ok()) {
+ state->active_subscriptions[index]().AddCallback(*this);
+ }
+ }
+ }
+ std::shared_ptr<State> state;
+ std::size_t index;
+ };
+
+ struct OuterCallback {
+ void operator()(const Result<AsyncGenerator<T>>& maybe_next) {
+ bool should_purge = false;
+ bool should_continue = false;
+ Future<T> error_sink;
+ {
+ auto guard = state->mutex.Lock();
+ if (!maybe_next.ok() || IsIterationEnd(*maybe_next)) {
+ state->source_exhausted = true;
+ if (!maybe_next.ok() || --state->num_active_subscriptions == 0) {
+ state->finished = true;
+ should_purge = true;
+ }
+ if (!maybe_next.ok()) {
+ if (state->waiting_jobs.empty()) {
+ state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
+ AsyncGenerator<T>(), maybe_next.status(), index));
+ } else {
+ error_sink = std::move(*state->waiting_jobs.front());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ } else {
+ state->active_subscriptions[index] = *maybe_next;
+ should_continue = true;
+ }
+ }
+ if (error_sink.is_valid()) {
+ error_sink.MarkFinished(maybe_next.status());
+ }
+ if (should_continue) {
+ (*maybe_next)().AddCallback(InnerCallback{state, index});
+ } else if (should_purge) {
+ // At this point state->finished has been marked true so no one else
+ // will be interacting with waiting_jobs and we can iterate outside lock
+ while (!state->waiting_jobs.empty()) {
+ state->waiting_jobs.front()->MarkFinished(IterationTraits<T>::End());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ }
+ std::shared_ptr<State> state;
+ std::size_t index;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Creates a generator that takes in a stream of generators and pulls from up to
+/// max_subscriptions at a time
+///
+/// Note: This may deliver items out of sequence. For example, items from the third
+/// AsyncGenerator generated by the source may be emitted before some items from the first
+/// AsyncGenerator generated by the source.
+///
+/// This generator will pull from source async-reentrantly unless max_subscriptions is 1
+/// This generator will not pull from the individual subscriptions reentrantly. Add
+/// readahead to the individual subscriptions if that is desired.
+/// This generator is async-reentrant
+///
+/// This generator may queue up to max_subscriptions instances of T
+template <typename T>
+AsyncGenerator<T> MakeMergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+ int max_subscriptions) {
+ return MergedGenerator<T>(std::move(source), max_subscriptions);
+}
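+
+// Usage (a minimal sketch; it assumes only MakeVectorGenerator from this
+// header, and uses plain int, whose default IterationTraits end sentinel is
+// int{} == 0, so the sample data avoids 0):
+//
+//   auto gen_a = MakeVectorGenerator(std::vector<int>{1, 2, 3});
+//   auto gen_b = MakeVectorGenerator(std::vector<int>{4, 5, 6});
+//   auto sources = MakeVectorGenerator<AsyncGenerator<int>>({gen_a, gen_b});
+//   auto merged = MakeMergedGenerator(std::move(sources),
+//                                     /*max_subscriptions=*/2);
+//   // merged() now pulls from both inner generators; values may interleave.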
+
+/// \brief Creates a generator that takes in a stream of generators and pulls from each
+/// one in sequence.
+///
+/// This generator is async-reentrant but will never pull from source reentrantly and
+/// will never pull from any subscription reentrantly.
+///
+/// This generator may queue 1 instance of T
+///
+/// TODO: Could potentially make a bespoke implementation instead of MergedGenerator that
+/// forwards async-reentrant requests instead of buffering them (which is what
+/// MergedGenerator does)
+template <typename T>
+AsyncGenerator<T> MakeConcatenatedGenerator(AsyncGenerator<AsyncGenerator<T>> source) {
+ return MergedGenerator<T>(std::move(source), 1);
+}
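+
+// Usage (sketch): with gen_a and gen_b built as in the MakeMergedGenerator
+// note above, concatenation preserves order:
+//
+//   auto concatenated = MakeConcatenatedGenerator(std::move(sources));
+//   // Yields 1, 2, 3, 4, 5, 6; gen_b is not pulled until gen_a is exhausted.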
+
+template <typename T>
+struct Enumerated {
+ T value;
+ int index;
+ bool last;
+};
+
+template <typename T>
+struct IterationTraits<Enumerated<T>> {
+ static Enumerated<T> End() { return Enumerated<T>{IterationEnd<T>(), -1, false}; }
+ static bool IsEnd(const Enumerated<T>& val) { return val.index < 0; }
+};
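+
+// Illustration (sketch): the end token is detected purely by the negative
+// index, regardless of the wrapped value:
+//
+//   auto end = IterationTraits<Enumerated<int>>::End();
+//   assert(IsIterationEnd(end));  // end.index == -1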
+
+/// \see MakeEnumeratedGenerator
+template <typename T>
+class EnumeratingGenerator {
+ public:
+ EnumeratingGenerator(AsyncGenerator<T> source, T initial_value)
+ : state_(std::make_shared<State>(std::move(source), std::move(initial_value))) {}
+
+ Future<Enumerated<T>> operator()() {
+ if (state_->finished) {
+ return AsyncGeneratorEnd<Enumerated<T>>();
+ } else {
+ auto state = state_;
+ return state->source().Then([state](const T& next) {
+ auto finished = IsIterationEnd<T>(next);
+ auto prev = Enumerated<T>{state->prev_value, state->prev_index, finished};
+ state->prev_value = next;
+ state->prev_index++;
+ state->finished = finished;
+ return prev;
+ });
+ }
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, T initial_value)
+ : source(std::move(source)), prev_value(std::move(initial_value)), prev_index(0) {
+ finished = IsIterationEnd<T>(prev_value);
+ }
+
+ AsyncGenerator<T> source;
+ T prev_value;
+ int prev_index;
+ bool finished;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// Wraps items from a source generator with positional information
+///
+/// When used with MakeMergedGenerator and MakeSequencingGenerator this allows items to be
+/// processed in a "first-available" fashion and later resequenced which can reduce the
+/// impact of sources with erratic performance (e.g. a filesystem where some items may
+/// take longer to read than others).
+///
+/// TODO(ARROW-12371) Would require this generator be async-reentrant
+///
+/// \see MakeSequencingGenerator for an example of putting items back in order
+///
+/// This generator is not async-reentrant
+///
+/// This generator buffers one item (so it knows which item is the last item)
+template <typename T>
+AsyncGenerator<Enumerated<T>> MakeEnumeratedGenerator(AsyncGenerator<T> source) {
+ return FutureFirstGenerator<Enumerated<T>>(
+ source().Then([source](const T& initial_value) -> AsyncGenerator<Enumerated<T>> {
+ return EnumeratingGenerator<T>(std::move(source), initial_value);
+ }));
+}
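+
+// Usage (sketch, reusing MakeVectorGenerator from this header):
+//
+//   auto source = MakeVectorGenerator(std::vector<int>{7, 8, 9});
+//   auto enumerated = MakeEnumeratedGenerator(std::move(source));
+//   // Successive calls yield {7, 0, false}, {8, 1, false}, {9, 2, true};
+//   // `last` can be set because one item is always buffered ahead.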
+
+/// \see MakeTransferredGenerator
+template <typename T>
+class TransferringGenerator {
+ public:
+ explicit TransferringGenerator(AsyncGenerator<T> source, internal::Executor* executor)
+ : source_(std::move(source)), executor_(executor) {}
+
+ Future<T> operator()() { return executor_->Transfer(source_()); }
+
+ private:
+ AsyncGenerator<T> source_;
+ internal::Executor* executor_;
+};
+
+/// \brief Transfers a future to an underlying executor.
+///
+/// Continuations run on the returned future will be run on the given executor
+/// if they cannot be run synchronously.
+///
+/// This is often needed to move computation off I/O threads or other external
+/// completion sources and back onto the CPU executor so the I/O thread can
+/// stay busy and focused on I/O
+///
+/// Keep in mind that continuations called on an already completed future will
+/// always be run synchronously and so no transfer will happen in that case.
+///
+/// This generator is async reentrant if the source is
+///
+/// This generator will not queue
+template <typename T>
+AsyncGenerator<T> MakeTransferredGenerator(AsyncGenerator<T> source,
+ internal::Executor* executor) {
+ return TransferringGenerator<T>(std::move(source), executor);
+}
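+
+// Usage (sketch, given some AsyncGenerator<int> io_bound_gen completing on an
+// I/O thread; internal::GetCpuThreadPool() is assumed as the CPU executor
+// accessor):
+//
+//   AsyncGenerator<int> on_cpu = MakeTransferredGenerator(
+//       std::move(io_bound_gen), internal::GetCpuThreadPool());
+//   // Continuations added to on_cpu() futures now run on the CPU pool.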
+
+/// \see MakeBackgroundGenerator
+template <typename T>
+class BackgroundGenerator {
+ public:
+ explicit BackgroundGenerator(Iterator<T> it, internal::Executor* io_executor, int max_q,
+ int q_restart)
+ : state_(std::make_shared<State>(io_executor, std::move(it), max_q, q_restart)),
+ cleanup_(std::make_shared<Cleanup>(state_.get())) {}
+
+ Future<T> operator()() {
+ auto guard = state_->mutex.Lock();
+ Future<T> waiting_future;
+ if (state_->queue.empty()) {
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ } else {
+ waiting_future = Future<T>::Make();
+ state_->waiting_future = waiting_future;
+ }
+ } else {
+ auto next = Future<T>::MakeFinished(std::move(state_->queue.front()));
+ state_->queue.pop();
+ if (state_->NeedsRestart()) {
+ return state_->RestartTask(state_, std::move(guard), std::move(next));
+ }
+ return next;
+ }
+ // This should only trigger the very first time this method is called
+ if (state_->NeedsRestart()) {
+ return state_->RestartTask(state_, std::move(guard), std::move(waiting_future));
+ }
+ return waiting_future;
+ }
+
+ protected:
+ static constexpr uint64_t kUnlikelyThreadId{std::numeric_limits<uint64_t>::max()};
+
+ struct State {
+ State(internal::Executor* io_executor, Iterator<T> it, int max_q, int q_restart)
+ : io_executor(io_executor),
+ max_q(max_q),
+ q_restart(q_restart),
+ it(std::move(it)),
+ reading(false),
+ finished(false),
+ should_shutdown(false) {}
+
+ void ClearQueue() {
+ while (!queue.empty()) {
+ queue.pop();
+ }
+ }
+
+ bool TaskIsRunning() const { return task_finished.is_valid(); }
+
+ bool NeedsRestart() const {
+ return !finished && !reading && static_cast<int>(queue.size()) <= q_restart;
+ }
+
+ void DoRestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard) {
+ // If we get here we are actually going to start a new task so let's create a
+ // task_finished future for it
+ state->task_finished = Future<>::Make();
+ state->reading = true;
+ auto spawn_status = io_executor->Spawn(
+ [state]() { BackgroundGenerator::WorkerTask(std::move(state)); });
+ if (!spawn_status.ok()) {
+ // If we can't spawn a new task then send an error to the consumer (either via a
+ // waiting future or the queue) and mark ourselves finished
+ state->finished = true;
+ state->task_finished = Future<>();
+ if (waiting_future.has_value()) {
+ auto to_deliver = std::move(waiting_future.value());
+ waiting_future.reset();
+ guard.Unlock();
+ to_deliver.MarkFinished(spawn_status);
+ } else {
+ ClearQueue();
+ queue.push(spawn_status);
+ }
+ }
+ }
+
+ Future<T> RestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard,
+ Future<T> next) {
+ if (TaskIsRunning()) {
+ // If the task is still cleaning up we need to wait for it to finish before
+ // restarting. We also want to block the consumer until we've restarted the
+ // reader to avoid multiple restarts
+ return task_finished.Then([state, next]() {
+ // This may appear dangerous (recursive mutex) but we should be guaranteed the
+ // outer guard has been released by this point. We know...
+ // * task_finished is not already finished (it would be invalid in that case)
+ // * task_finished will not be marked complete until we've given up the mutex
+ auto guard_ = state->mutex.Lock();
+ state->DoRestartTask(state, std::move(guard_));
+ return next;
+ });
+ }
+ // Otherwise we can restart immediately
+ DoRestartTask(std::move(state), std::move(guard));
+ return next;
+ }
+
+ internal::Executor* io_executor;
+ const int max_q;
+ const int q_restart;
+ Iterator<T> it;
+ std::atomic<uint64_t> worker_thread_id{kUnlikelyThreadId};
+
+    // If true, the task is actively pumping items from the iterator into the
+    // queue and does not need a restart
+ bool reading;
+ // Set to true when a terminal item arrives
+ bool finished;
+ // Signal to the background task to end early because consumers have given up on it
+ bool should_shutdown;
+ // If the queue is empty, the consumer will create a waiting future and wait for it
+ std::queue<Result<T>> queue;
+ util::optional<Future<T>> waiting_future;
+ // Every background task is given a future to complete when it is entirely finished
+ // processing and ready for the next task to start or for State to be destroyed
+ Future<> task_finished;
+ util::Mutex mutex;
+ };
+
+ // Cleanup task that will be run when all consumer references to the generator are lost
+ struct Cleanup {
+ explicit Cleanup(State* state) : state(state) {}
+ ~Cleanup() {
+    /// TODO: Once ARROW-13109 is available we can force consumers to spawn and
+    /// there will be no need to perform this check.
+    ///
+    /// Entering cleanup from the worker thread would be a deadlock; it can
+    /// happen if the consumer doesn't transfer away
+ assert(state->worker_thread_id.load() != ::arrow::internal::GetThreadId());
+ Future<> finish_fut;
+ {
+ auto lock = state->mutex.Lock();
+ if (!state->TaskIsRunning()) {
+ return;
+ }
+ // Signal the current task to stop and wait for it to finish
+ state->should_shutdown = true;
+ finish_fut = state->task_finished;
+ }
+ // Using future as a condition variable here
+ Status st = finish_fut.status();
+ ARROW_UNUSED(st);
+ }
+ State* state;
+ };
+
+ static void WorkerTask(std::shared_ptr<State> state) {
+ state->worker_thread_id.store(::arrow::internal::GetThreadId());
+    // Capture the reading state under the mutex so the loop condition can be
+    // tested outside of it
+ bool reading = true;
+ while (reading) {
+ auto next = state->it.Next();
+ // Need to capture state->waiting_future inside the mutex to mark finished outside
+ Future<T> waiting_future;
+ {
+ auto guard = state->mutex.Lock();
+
+ if (state->should_shutdown) {
+ state->finished = true;
+ break;
+ }
+
+ if (!next.ok() || IsIterationEnd<T>(*next)) {
+          // Terminal item. Set finished to true, send this last item, and quit
+ state->finished = true;
+ if (!next.ok()) {
+ state->ClearQueue();
+ }
+ }
+ // At this point we are going to send an item. Either we will add it to the
+ // queue or deliver it to a waiting future.
+ if (state->waiting_future.has_value()) {
+ waiting_future = std::move(state->waiting_future.value());
+ state->waiting_future.reset();
+ } else {
+ state->queue.push(std::move(next));
+        // We just filled up the queue so it is time to stop reading. We may
+        // need to notify a cleanup task, so clear the reading flag and let the
+        // worker task wind down
+ if (static_cast<int>(state->queue.size()) >= state->max_q) {
+ state->reading = false;
+ }
+ }
+ reading = state->reading && !state->finished;
+ }
+ // This should happen outside the mutex. Presumably there is a
+ // transferring generator on the other end that will quickly transfer any
+ // callbacks off of this thread so we can continue looping. Still, best not to
+ // rely on that
+ if (waiting_future.is_valid()) {
+ waiting_future.MarkFinished(next);
+ }
+ }
+ // Once we've sent our last item we can notify any waiters that we are done and so
+ // either state can be cleaned up or a new background task can be started
+ Future<> task_finished;
+ {
+ auto guard = state->mutex.Lock();
+      // After we give up the mutex, state can be safely deleted. We will no
+      // longer reference it. We can safely transition to idle now.
+ task_finished = state->task_finished;
+ state->task_finished = Future<>();
+ state->worker_thread_id.store(kUnlikelyThreadId);
+ }
+ task_finished.MarkFinished();
+ }
+
+ std::shared_ptr<State> state_;
+ // state_ is held by both the generator and the background thread so it won't be cleaned
+ // up when all consumer references are relinquished. cleanup_ is only held by the
+ // generator so it will be destructed when the last consumer reference is gone. We use
+ // this to cleanup / stop the background generator in case the consuming end stops
+ // listening (e.g. due to a downstream error)
+ std::shared_ptr<Cleanup> cleanup_;
+};
+
+constexpr int kDefaultBackgroundMaxQ = 32;
+constexpr int kDefaultBackgroundQRestart = 16;
+
+/// \brief Creates an AsyncGenerator<T> by iterating over an Iterator<T> on a background
+/// thread
+///
+/// The parameters max_q and q_restart control queue size and background thread task
+/// management. If the background task is fast you typically don't want it creating a
+/// thread task for every item. Instead the background thread will run until it fills
+/// up a readahead queue.
+///
+/// Once the queue has filled up the background thread task will terminate (allowing other
+/// I/O tasks to use the thread). Once the queue has been drained enough (specified by
+/// q_restart) then the background thread task will be restarted. If q_restart is too low
+/// then you may exhaust the queue waiting for the background thread task to start running
+/// again. If it is too high then it will be constantly stopping and restarting the
+/// background queue task
+///
+/// The "background thread" is a logical thread and will run as tasks on the io_executor.
+/// This thread may stop and start when the queue fills up but there will only be one
+/// active background thread task at any given time. You MUST transfer away from this
+/// background generator. Otherwise there could be a race condition if a callback on the
+/// background thread deletes the last consumer reference to the background generator. You
+/// can transfer onto the same executor as the background thread; it is only necessary
+/// create a new thread task, not to switch executors.
+///
+/// This generator is not async-reentrant
+///
+/// This generator will queue up to max_q blocks
+template <typename T>
+static Result<AsyncGenerator<T>> MakeBackgroundGenerator(
+ Iterator<T> iterator, internal::Executor* io_executor,
+ int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart) {
+ if (max_q < q_restart) {
+ return Status::Invalid("max_q must be >= q_restart");
+ }
+ return BackgroundGenerator<T>(std::move(iterator), io_executor, max_q, q_restart);
+}
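+
+// Usage (sketch, given some Iterator<int> it; this mirrors the pattern used
+// by MakeReadaheadIterator below):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto io_pool, internal::ThreadPool::Make(1));
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto gen, MakeBackgroundGenerator(std::move(it), io_pool.get()));
+//   // Mandatory (see above): transfer off the background thread.
+//   gen = MakeTransferredGenerator(std::move(gen),
+//                                  internal::GetCpuThreadPool());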
+
+/// \see MakeGeneratorIterator
+template <typename T>
+class GeneratorIterator {
+ public:
+ explicit GeneratorIterator(AsyncGenerator<T> source) : source_(std::move(source)) {}
+
+ Result<T> Next() { return source_().result(); }
+
+ private:
+ AsyncGenerator<T> source_;
+};
+
+/// \brief Converts an AsyncGenerator<T> to an Iterator<T> by blocking until each future
+/// is finished
+template <typename T>
+Iterator<T> MakeGeneratorIterator(AsyncGenerator<T> source) {
+ return Iterator<T>(GeneratorIterator<T>(std::move(source)));
+}
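+
+// Usage (sketch, given some AsyncGenerator<int> async_gen):
+//
+//   Iterator<int> it = MakeGeneratorIterator(std::move(async_gen));
+//   ARROW_ASSIGN_OR_RAISE(int value, it.Next());  // blocks until ready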
+
+/// \brief Adds readahead to an iterator using a background thread.
+///
+/// Under the hood this is converting the iterator to a generator using
+/// MakeBackgroundGenerator, adding readahead to the converted generator with
+/// MakeReadaheadGenerator, and then converting back to an iterator using
+/// MakeGeneratorIterator.
+template <typename T>
+Result<Iterator<T>> MakeReadaheadIterator(Iterator<T> it, int readahead_queue_size) {
+ ARROW_ASSIGN_OR_RAISE(auto io_executor, internal::ThreadPool::Make(1));
+ auto max_q = readahead_queue_size;
+ auto q_restart = std::max(1, max_q / 2);
+ ARROW_ASSIGN_OR_RAISE(
+ auto background_generator,
+ MakeBackgroundGenerator(std::move(it), io_executor.get(), max_q, q_restart));
+ // Capture io_executor to keep it alive as long as owned_bg_generator is still
+ // referenced
+ AsyncGenerator<T> owned_bg_generator = [io_executor, background_generator]() {
+ return background_generator();
+ };
+ return MakeGeneratorIterator(std::move(owned_bg_generator));
+}
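+
+// Usage (sketch, given some slow Iterator<int> slow_it):
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto fast_it,
+//       MakeReadaheadIterator(std::move(slow_it), /*readahead_queue_size=*/8));
+//   // Next() now returns items prefetched on the internal background thread.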
+
+/// \brief Make a generator that returns a single pre-generated future
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeSingleFutureGenerator(Future<T> future) {
+ assert(future.is_valid());
+ auto state = std::make_shared<Future<T>>(std::move(future));
+ return [state]() -> Future<T> {
+ auto fut = std::move(*state);
+ if (fut.is_valid()) {
+ return fut;
+ } else {
+ return AsyncGeneratorEnd<T>();
+ }
+ };
+}
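+
+// Usage (sketch):
+//
+//   auto gen = MakeSingleFutureGenerator(Future<int>::MakeFinished(42));
+//   // The first call resolves to 42; every later call yields the end token.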
+
+/// \brief Make a generator that immediately ends.
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeEmptyGenerator() {
+ return []() -> Future<T> { return AsyncGeneratorEnd<T>(); };
+}
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This generator is async-reentrant.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(Status st) {
+ assert(!st.ok());
+ auto state = std::make_shared<Status>(std::move(st));
+ return [state]() -> Future<T> {
+ auto st = std::move(*state);
+ if (!st.ok()) {
+ return std::move(st);
+ } else {
+ return AsyncGeneratorEnd<T>();
+ }
+ };
+}
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This overload allows inferring the return type from the argument.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(const Result<T>& result) {
+ return MakeFailingGenerator<T>(result.status());
+}
+
+/// \brief Prepends initial_values onto a generator
+///
+/// This generator is async-reentrant but will buffer requests and will not
+/// pull from following_values async-reentrantly.
+template <typename T>
+AsyncGenerator<T> MakeGeneratorStartsWith(std::vector<T> initial_values,
+ AsyncGenerator<T> following_values) {
+ auto initial_values_vec_gen = MakeVectorGenerator(std::move(initial_values));
+ auto gen_gen = MakeVectorGenerator<AsyncGenerator<T>>(
+ {std::move(initial_values_vec_gen), std::move(following_values)});
+ return MakeConcatenatedGenerator(std::move(gen_gen));
+}
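+
+// Usage (sketch):
+//
+//   auto rest = MakeVectorGenerator(std::vector<int>{3, 4});
+//   auto gen = MakeGeneratorStartsWith(std::vector<int>{1, 2}, std::move(rest));
+//   // Yields 1, 2, 3, 4.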
+
+template <typename T>
+struct CancellableGenerator {
+ Future<T> operator()() {
+ if (stop_token.IsStopRequested()) {
+ return stop_token.Poll();
+ }
+ return source();
+ }
+
+ AsyncGenerator<T> source;
+ StopToken stop_token;
+};
+
+/// \brief Allows an async generator to be cancelled
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeCancellable(AsyncGenerator<T> source, StopToken stop_token) {
+ return CancellableGenerator<T>{std::move(source), std::move(stop_token)};
+}
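+
+// Usage (sketch, given some AsyncGenerator<int> inner; StopSource /
+// StopToken come from arrow/util/cancel.h):
+//
+//   StopSource stop_source;
+//   auto gen = MakeCancellable(std::move(inner), stop_source.token());
+//   stop_source.RequestStop();
+//   // Subsequent gen() calls resolve to a cancellation Status.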
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
index 56809f28165..d6640775c4f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
@@ -28,7 +28,7 @@
#include <string>
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/int128_internal.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
@@ -121,223 +121,223 @@ static const BasicDecimal128 ScaleMultipliersHalf[] = {
BasicDecimal128(271050543121376108LL, 9257742014424809472ULL),
BasicDecimal128(2710505431213761085LL, 343699775700336640ULL)};
-static const BasicDecimal256 ScaleMultipliersDecimal256[] = {
- BasicDecimal256({1ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({7766279631452241920ULL, 5ULL, 0ULL, 0ULL}),
- BasicDecimal256({3875820019684212736ULL, 54ULL, 0ULL, 0ULL}),
- BasicDecimal256({1864712049423024128ULL, 542ULL, 0ULL, 0ULL}),
- BasicDecimal256({200376420520689664ULL, 5421ULL, 0ULL, 0ULL}),
- BasicDecimal256({2003764205206896640ULL, 54210ULL, 0ULL, 0ULL}),
- BasicDecimal256({1590897978359414784ULL, 542101ULL, 0ULL, 0ULL}),
- BasicDecimal256({15908979783594147840ULL, 5421010ULL, 0ULL, 0ULL}),
- BasicDecimal256({11515845246265065472ULL, 54210108ULL, 0ULL, 0ULL}),
- BasicDecimal256({4477988020393345024ULL, 542101086ULL, 0ULL, 0ULL}),
- BasicDecimal256({7886392056514347008ULL, 5421010862ULL, 0ULL, 0ULL}),
- BasicDecimal256({5076944270305263616ULL, 54210108624ULL, 0ULL, 0ULL}),
- BasicDecimal256({13875954555633532928ULL, 542101086242ULL, 0ULL, 0ULL}),
- BasicDecimal256({9632337040368467968ULL, 5421010862427ULL, 0ULL, 0ULL}),
- BasicDecimal256({4089650035136921600ULL, 54210108624275ULL, 0ULL, 0ULL}),
- BasicDecimal256({4003012203950112768ULL, 542101086242752ULL, 0ULL, 0ULL}),
- BasicDecimal256({3136633892082024448ULL, 5421010862427522ULL, 0ULL, 0ULL}),
- BasicDecimal256({12919594847110692864ULL, 54210108624275221ULL, 0ULL, 0ULL}),
- BasicDecimal256({68739955140067328ULL, 542101086242752217ULL, 0ULL, 0ULL}),
- BasicDecimal256({687399551400673280ULL, 5421010862427522170ULL, 0ULL, 0ULL}),
- BasicDecimal256({6873995514006732800ULL, 17316620476856118468ULL, 2ULL, 0ULL}),
- BasicDecimal256({13399722918938673152ULL, 7145508105175220139ULL, 29ULL, 0ULL}),
- BasicDecimal256({4870020673419870208ULL, 16114848830623546549ULL, 293ULL, 0ULL}),
- BasicDecimal256({11806718586779598848ULL, 13574535716559052564ULL, 2938ULL, 0ULL}),
- BasicDecimal256({7386721425538678784ULL, 6618148649623664334ULL, 29387ULL, 0ULL}),
- BasicDecimal256({80237960548581376ULL, 10841254275107988496ULL, 293873ULL, 0ULL}),
- BasicDecimal256({802379605485813760ULL, 16178822382532126880ULL, 2938735ULL, 0ULL}),
- BasicDecimal256({8023796054858137600ULL, 14214271235644855872ULL, 29387358ULL, 0ULL}),
- BasicDecimal256(
- {6450984253743169536ULL, 13015503840481697412ULL, 293873587ULL, 0ULL}),
- BasicDecimal256(
- {9169610316303040512ULL, 1027829888850112811ULL, 2938735877ULL, 0ULL}),
- BasicDecimal256(
- {17909126868192198656ULL, 10278298888501128114ULL, 29387358770ULL, 0ULL}),
- BasicDecimal256(
- {13070572018536022016ULL, 10549268516463523069ULL, 293873587705ULL, 0ULL}),
- BasicDecimal256(
- {1578511669393358848ULL, 13258964796087472617ULL, 2938735877055ULL, 0ULL}),
- BasicDecimal256(
- {15785116693933588480ULL, 3462439444907864858ULL, 29387358770557ULL, 0ULL}),
- BasicDecimal256(
- {10277214349659471872ULL, 16177650375369096972ULL, 293873587705571ULL, 0ULL}),
- BasicDecimal256(
- {10538423128046960640ULL, 14202551164014556797ULL, 2938735877055718ULL, 0ULL}),
- BasicDecimal256(
- {13150510911921848320ULL, 12898303124178706663ULL, 29387358770557187ULL, 0ULL}),
- BasicDecimal256(
- {2377900603251621888ULL, 18302566799529756941ULL, 293873587705571876ULL, 0ULL}),
- BasicDecimal256(
- {5332261958806667264ULL, 17004971331911604867ULL, 2938735877055718769ULL, 0ULL}),
- BasicDecimal256(
- {16429131440647569408ULL, 4029016655730084128ULL, 10940614696847636083ULL, 1ULL}),
- BasicDecimal256({16717361816799281152ULL, 3396678409881738056ULL,
- 17172426599928602752ULL, 15ULL}),
- BasicDecimal256({1152921504606846976ULL, 15520040025107828953ULL,
- 5703569335900062977ULL, 159ULL}),
- BasicDecimal256({11529215046068469760ULL, 7626447661401876602ULL,
- 1695461137871974930ULL, 1593ULL}),
- BasicDecimal256({4611686018427387904ULL, 2477500319180559562ULL,
- 16954611378719749304ULL, 15930ULL}),
- BasicDecimal256({9223372036854775808ULL, 6328259118096044006ULL,
- 3525417123811528497ULL, 159309ULL}),
- BasicDecimal256({0ULL, 7942358959831785217ULL, 16807427164405733357ULL, 1593091ULL}),
- BasicDecimal256({0ULL, 5636613303479645706ULL, 2053574980671369030ULL, 15930919ULL}),
- BasicDecimal256({0ULL, 1025900813667802212ULL, 2089005733004138687ULL, 159309191ULL}),
- BasicDecimal256(
- {0ULL, 10259008136678022120ULL, 2443313256331835254ULL, 1593091911ULL}),
- BasicDecimal256(
- {0ULL, 10356360998232463120ULL, 5986388489608800929ULL, 15930919111ULL}),
- BasicDecimal256(
- {0ULL, 11329889613776873120ULL, 4523652674959354447ULL, 159309191113ULL}),
- BasicDecimal256(
- {0ULL, 2618431695511421504ULL, 8343038602174441244ULL, 1593091911132ULL}),
- BasicDecimal256(
- {0ULL, 7737572881404663424ULL, 9643409726906205977ULL, 15930919111324ULL}),
- BasicDecimal256(
- {0ULL, 3588752519208427776ULL, 4200376900514301694ULL, 159309191113245ULL}),
- BasicDecimal256(
- {0ULL, 17440781118374726144ULL, 5110280857723913709ULL, 1593091911132452ULL}),
- BasicDecimal256(
- {0ULL, 8387114520361296896ULL, 14209320429820033867ULL, 15930919111324522ULL}),
- BasicDecimal256(
- {0ULL, 10084168908774762496ULL, 12965995782233477362ULL, 159309191113245227ULL}),
- BasicDecimal256(
- {0ULL, 8607968719199866880ULL, 532749306367912313ULL, 1593091911132452277ULL})};
-
-static const BasicDecimal256 ScaleMultipliersHalfDecimal256[] = {
- BasicDecimal256({0ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({13106511852580896768ULL, 2ULL, 0ULL, 0ULL}),
- BasicDecimal256({1937910009842106368ULL, 27ULL, 0ULL, 0ULL}),
- BasicDecimal256({932356024711512064ULL, 271ULL, 0ULL, 0ULL}),
- BasicDecimal256({9323560247115120640ULL, 2710ULL, 0ULL, 0ULL}),
- BasicDecimal256({1001882102603448320ULL, 27105ULL, 0ULL, 0ULL}),
- BasicDecimal256({10018821026034483200ULL, 271050ULL, 0ULL, 0ULL}),
- BasicDecimal256({7954489891797073920ULL, 2710505ULL, 0ULL, 0ULL}),
- BasicDecimal256({5757922623132532736ULL, 27105054ULL, 0ULL, 0ULL}),
- BasicDecimal256({2238994010196672512ULL, 271050543ULL, 0ULL, 0ULL}),
- BasicDecimal256({3943196028257173504ULL, 2710505431ULL, 0ULL, 0ULL}),
- BasicDecimal256({2538472135152631808ULL, 27105054312ULL, 0ULL, 0ULL}),
- BasicDecimal256({6937977277816766464ULL, 271050543121ULL, 0ULL, 0ULL}),
- BasicDecimal256({14039540557039009792ULL, 2710505431213ULL, 0ULL, 0ULL}),
- BasicDecimal256({11268197054423236608ULL, 27105054312137ULL, 0ULL, 0ULL}),
- BasicDecimal256({2001506101975056384ULL, 271050543121376ULL, 0ULL, 0ULL}),
- BasicDecimal256({1568316946041012224ULL, 2710505431213761ULL, 0ULL, 0ULL}),
- BasicDecimal256({15683169460410122240ULL, 27105054312137610ULL, 0ULL, 0ULL}),
- BasicDecimal256({9257742014424809472ULL, 271050543121376108ULL, 0ULL, 0ULL}),
- BasicDecimal256({343699775700336640ULL, 2710505431213761085ULL, 0ULL, 0ULL}),
- BasicDecimal256({3436997757003366400ULL, 8658310238428059234ULL, 1ULL, 0ULL}),
- BasicDecimal256({15923233496324112384ULL, 12796126089442385877ULL, 14ULL, 0ULL}),
- BasicDecimal256({11658382373564710912ULL, 17280796452166549082ULL, 146ULL, 0ULL}),
- BasicDecimal256({5903359293389799424ULL, 6787267858279526282ULL, 1469ULL, 0ULL}),
- BasicDecimal256({3693360712769339392ULL, 12532446361666607975ULL, 14693ULL, 0ULL}),
- BasicDecimal256({40118980274290688ULL, 14643999174408770056ULL, 146936ULL, 0ULL}),
- BasicDecimal256({401189802742906880ULL, 17312783228120839248ULL, 1469367ULL, 0ULL}),
- BasicDecimal256({4011898027429068800ULL, 7107135617822427936ULL, 14693679ULL, 0ULL}),
- BasicDecimal256(
- {3225492126871584768ULL, 15731123957095624514ULL, 146936793ULL, 0ULL}),
- BasicDecimal256(
- {13808177195006296064ULL, 9737286981279832213ULL, 1469367938ULL, 0ULL}),
- BasicDecimal256(
- {8954563434096099328ULL, 5139149444250564057ULL, 14693679385ULL, 0ULL}),
- BasicDecimal256(
- {15758658046122786816ULL, 14498006295086537342ULL, 146936793852ULL, 0ULL}),
- BasicDecimal256(
- {10012627871551455232ULL, 15852854434898512116ULL, 1469367938527ULL, 0ULL}),
- BasicDecimal256(
- {7892558346966794240ULL, 10954591759308708237ULL, 14693679385278ULL, 0ULL}),
- BasicDecimal256(
- {5138607174829735936ULL, 17312197224539324294ULL, 146936793852785ULL, 0ULL}),
- BasicDecimal256(
- {14492583600878256128ULL, 7101275582007278398ULL, 1469367938527859ULL, 0ULL}),
- BasicDecimal256(
- {15798627492815699968ULL, 15672523598944129139ULL, 14693679385278593ULL, 0ULL}),
- BasicDecimal256(
- {10412322338480586752ULL, 9151283399764878470ULL, 146936793852785938ULL, 0ULL}),
- BasicDecimal256(
- {11889503016258109440ULL, 17725857702810578241ULL, 1469367938527859384ULL, 0ULL}),
- BasicDecimal256(
- {8214565720323784704ULL, 11237880364719817872ULL, 14693679385278593849ULL, 0ULL}),
- BasicDecimal256(
- {8358680908399640576ULL, 1698339204940869028ULL, 17809585336819077184ULL, 7ULL}),
- BasicDecimal256({9799832789158199296ULL, 16983392049408690284ULL,
- 12075156704804807296ULL, 79ULL}),
- BasicDecimal256({5764607523034234880ULL, 3813223830700938301ULL,
- 10071102605790763273ULL, 796ULL}),
- BasicDecimal256({2305843009213693952ULL, 1238750159590279781ULL,
- 8477305689359874652ULL, 7965ULL}),
- BasicDecimal256({4611686018427387904ULL, 12387501595902797811ULL,
- 10986080598760540056ULL, 79654ULL}),
- BasicDecimal256({9223372036854775808ULL, 13194551516770668416ULL,
- 17627085619057642486ULL, 796545ULL}),
- BasicDecimal256({0ULL, 2818306651739822853ULL, 10250159527190460323ULL, 7965459ULL}),
- BasicDecimal256({0ULL, 9736322443688676914ULL, 10267874903356845151ULL, 79654595ULL}),
- BasicDecimal256(
- {0ULL, 5129504068339011060ULL, 10445028665020693435ULL, 796545955ULL}),
- BasicDecimal256(
- {0ULL, 14401552535971007368ULL, 12216566281659176272ULL, 7965459555ULL}),
- BasicDecimal256(
- {0ULL, 14888316843743212368ULL, 11485198374334453031ULL, 79654595556ULL}),
- BasicDecimal256(
- {0ULL, 1309215847755710752ULL, 4171519301087220622ULL, 796545955566ULL}),
- BasicDecimal256(
- {0ULL, 13092158477557107520ULL, 4821704863453102988ULL, 7965459555662ULL}),
- BasicDecimal256(
- {0ULL, 1794376259604213888ULL, 11323560487111926655ULL, 79654595556622ULL}),
- BasicDecimal256(
- {0ULL, 17943762596042138880ULL, 2555140428861956854ULL, 796545955566226ULL}),
- BasicDecimal256(
- {0ULL, 13416929297035424256ULL, 7104660214910016933ULL, 7965459555662261ULL}),
- BasicDecimal256(
- {0ULL, 5042084454387381248ULL, 15706369927971514489ULL, 79654595556622613ULL}),
- BasicDecimal256(
- {0ULL, 13527356396454709248ULL, 9489746690038731964ULL, 796545955566226138ULL})};
-
+static const BasicDecimal256 ScaleMultipliersDecimal256[] = {
+ BasicDecimal256({1ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7766279631452241920ULL, 5ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3875820019684212736ULL, 54ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1864712049423024128ULL, 542ULL, 0ULL, 0ULL}),
+ BasicDecimal256({200376420520689664ULL, 5421ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2003764205206896640ULL, 54210ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1590897978359414784ULL, 542101ULL, 0ULL, 0ULL}),
+ BasicDecimal256({15908979783594147840ULL, 5421010ULL, 0ULL, 0ULL}),
+ BasicDecimal256({11515845246265065472ULL, 54210108ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4477988020393345024ULL, 542101086ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7886392056514347008ULL, 5421010862ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5076944270305263616ULL, 54210108624ULL, 0ULL, 0ULL}),
+ BasicDecimal256({13875954555633532928ULL, 542101086242ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9632337040368467968ULL, 5421010862427ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4089650035136921600ULL, 54210108624275ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4003012203950112768ULL, 542101086242752ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3136633892082024448ULL, 5421010862427522ULL, 0ULL, 0ULL}),
+ BasicDecimal256({12919594847110692864ULL, 54210108624275221ULL, 0ULL, 0ULL}),
+ BasicDecimal256({68739955140067328ULL, 542101086242752217ULL, 0ULL, 0ULL}),
+ BasicDecimal256({687399551400673280ULL, 5421010862427522170ULL, 0ULL, 0ULL}),
+ BasicDecimal256({6873995514006732800ULL, 17316620476856118468ULL, 2ULL, 0ULL}),
+ BasicDecimal256({13399722918938673152ULL, 7145508105175220139ULL, 29ULL, 0ULL}),
+ BasicDecimal256({4870020673419870208ULL, 16114848830623546549ULL, 293ULL, 0ULL}),
+ BasicDecimal256({11806718586779598848ULL, 13574535716559052564ULL, 2938ULL, 0ULL}),
+ BasicDecimal256({7386721425538678784ULL, 6618148649623664334ULL, 29387ULL, 0ULL}),
+ BasicDecimal256({80237960548581376ULL, 10841254275107988496ULL, 293873ULL, 0ULL}),
+ BasicDecimal256({802379605485813760ULL, 16178822382532126880ULL, 2938735ULL, 0ULL}),
+ BasicDecimal256({8023796054858137600ULL, 14214271235644855872ULL, 29387358ULL, 0ULL}),
+ BasicDecimal256(
+ {6450984253743169536ULL, 13015503840481697412ULL, 293873587ULL, 0ULL}),
+ BasicDecimal256(
+ {9169610316303040512ULL, 1027829888850112811ULL, 2938735877ULL, 0ULL}),
+ BasicDecimal256(
+ {17909126868192198656ULL, 10278298888501128114ULL, 29387358770ULL, 0ULL}),
+ BasicDecimal256(
+ {13070572018536022016ULL, 10549268516463523069ULL, 293873587705ULL, 0ULL}),
+ BasicDecimal256(
+ {1578511669393358848ULL, 13258964796087472617ULL, 2938735877055ULL, 0ULL}),
+ BasicDecimal256(
+ {15785116693933588480ULL, 3462439444907864858ULL, 29387358770557ULL, 0ULL}),
+ BasicDecimal256(
+ {10277214349659471872ULL, 16177650375369096972ULL, 293873587705571ULL, 0ULL}),
+ BasicDecimal256(
+ {10538423128046960640ULL, 14202551164014556797ULL, 2938735877055718ULL, 0ULL}),
+ BasicDecimal256(
+ {13150510911921848320ULL, 12898303124178706663ULL, 29387358770557187ULL, 0ULL}),
+ BasicDecimal256(
+ {2377900603251621888ULL, 18302566799529756941ULL, 293873587705571876ULL, 0ULL}),
+ BasicDecimal256(
+ {5332261958806667264ULL, 17004971331911604867ULL, 2938735877055718769ULL, 0ULL}),
+ BasicDecimal256(
+ {16429131440647569408ULL, 4029016655730084128ULL, 10940614696847636083ULL, 1ULL}),
+ BasicDecimal256({16717361816799281152ULL, 3396678409881738056ULL,
+ 17172426599928602752ULL, 15ULL}),
+ BasicDecimal256({1152921504606846976ULL, 15520040025107828953ULL,
+ 5703569335900062977ULL, 159ULL}),
+ BasicDecimal256({11529215046068469760ULL, 7626447661401876602ULL,
+ 1695461137871974930ULL, 1593ULL}),
+ BasicDecimal256({4611686018427387904ULL, 2477500319180559562ULL,
+ 16954611378719749304ULL, 15930ULL}),
+ BasicDecimal256({9223372036854775808ULL, 6328259118096044006ULL,
+ 3525417123811528497ULL, 159309ULL}),
+ BasicDecimal256({0ULL, 7942358959831785217ULL, 16807427164405733357ULL, 1593091ULL}),
+ BasicDecimal256({0ULL, 5636613303479645706ULL, 2053574980671369030ULL, 15930919ULL}),
+ BasicDecimal256({0ULL, 1025900813667802212ULL, 2089005733004138687ULL, 159309191ULL}),
+ BasicDecimal256(
+ {0ULL, 10259008136678022120ULL, 2443313256331835254ULL, 1593091911ULL}),
+ BasicDecimal256(
+ {0ULL, 10356360998232463120ULL, 5986388489608800929ULL, 15930919111ULL}),
+ BasicDecimal256(
+ {0ULL, 11329889613776873120ULL, 4523652674959354447ULL, 159309191113ULL}),
+ BasicDecimal256(
+ {0ULL, 2618431695511421504ULL, 8343038602174441244ULL, 1593091911132ULL}),
+ BasicDecimal256(
+ {0ULL, 7737572881404663424ULL, 9643409726906205977ULL, 15930919111324ULL}),
+ BasicDecimal256(
+ {0ULL, 3588752519208427776ULL, 4200376900514301694ULL, 159309191113245ULL}),
+ BasicDecimal256(
+ {0ULL, 17440781118374726144ULL, 5110280857723913709ULL, 1593091911132452ULL}),
+ BasicDecimal256(
+ {0ULL, 8387114520361296896ULL, 14209320429820033867ULL, 15930919111324522ULL}),
+ BasicDecimal256(
+ {0ULL, 10084168908774762496ULL, 12965995782233477362ULL, 159309191113245227ULL}),
+ BasicDecimal256(
+ {0ULL, 8607968719199866880ULL, 532749306367912313ULL, 1593091911132452277ULL})};
+
+static const BasicDecimal256 ScaleMultipliersHalfDecimal256[] = {
+ BasicDecimal256({0ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({13106511852580896768ULL, 2ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1937910009842106368ULL, 27ULL, 0ULL, 0ULL}),
+ BasicDecimal256({932356024711512064ULL, 271ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9323560247115120640ULL, 2710ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1001882102603448320ULL, 27105ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10018821026034483200ULL, 271050ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7954489891797073920ULL, 2710505ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5757922623132532736ULL, 27105054ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2238994010196672512ULL, 271050543ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3943196028257173504ULL, 2710505431ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2538472135152631808ULL, 27105054312ULL, 0ULL, 0ULL}),
+ BasicDecimal256({6937977277816766464ULL, 271050543121ULL, 0ULL, 0ULL}),
+ BasicDecimal256({14039540557039009792ULL, 2710505431213ULL, 0ULL, 0ULL}),
+ BasicDecimal256({11268197054423236608ULL, 27105054312137ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2001506101975056384ULL, 271050543121376ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1568316946041012224ULL, 2710505431213761ULL, 0ULL, 0ULL}),
+ BasicDecimal256({15683169460410122240ULL, 27105054312137610ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9257742014424809472ULL, 271050543121376108ULL, 0ULL, 0ULL}),
+ BasicDecimal256({343699775700336640ULL, 2710505431213761085ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3436997757003366400ULL, 8658310238428059234ULL, 1ULL, 0ULL}),
+ BasicDecimal256({15923233496324112384ULL, 12796126089442385877ULL, 14ULL, 0ULL}),
+ BasicDecimal256({11658382373564710912ULL, 17280796452166549082ULL, 146ULL, 0ULL}),
+ BasicDecimal256({5903359293389799424ULL, 6787267858279526282ULL, 1469ULL, 0ULL}),
+ BasicDecimal256({3693360712769339392ULL, 12532446361666607975ULL, 14693ULL, 0ULL}),
+ BasicDecimal256({40118980274290688ULL, 14643999174408770056ULL, 146936ULL, 0ULL}),
+ BasicDecimal256({401189802742906880ULL, 17312783228120839248ULL, 1469367ULL, 0ULL}),
+ BasicDecimal256({4011898027429068800ULL, 7107135617822427936ULL, 14693679ULL, 0ULL}),
+ BasicDecimal256(
+ {3225492126871584768ULL, 15731123957095624514ULL, 146936793ULL, 0ULL}),
+ BasicDecimal256(
+ {13808177195006296064ULL, 9737286981279832213ULL, 1469367938ULL, 0ULL}),
+ BasicDecimal256(
+ {8954563434096099328ULL, 5139149444250564057ULL, 14693679385ULL, 0ULL}),
+ BasicDecimal256(
+ {15758658046122786816ULL, 14498006295086537342ULL, 146936793852ULL, 0ULL}),
+ BasicDecimal256(
+ {10012627871551455232ULL, 15852854434898512116ULL, 1469367938527ULL, 0ULL}),
+ BasicDecimal256(
+ {7892558346966794240ULL, 10954591759308708237ULL, 14693679385278ULL, 0ULL}),
+ BasicDecimal256(
+ {5138607174829735936ULL, 17312197224539324294ULL, 146936793852785ULL, 0ULL}),
+ BasicDecimal256(
+ {14492583600878256128ULL, 7101275582007278398ULL, 1469367938527859ULL, 0ULL}),
+ BasicDecimal256(
+ {15798627492815699968ULL, 15672523598944129139ULL, 14693679385278593ULL, 0ULL}),
+ BasicDecimal256(
+ {10412322338480586752ULL, 9151283399764878470ULL, 146936793852785938ULL, 0ULL}),
+ BasicDecimal256(
+ {11889503016258109440ULL, 17725857702810578241ULL, 1469367938527859384ULL, 0ULL}),
+ BasicDecimal256(
+ {8214565720323784704ULL, 11237880364719817872ULL, 14693679385278593849ULL, 0ULL}),
+ BasicDecimal256(
+ {8358680908399640576ULL, 1698339204940869028ULL, 17809585336819077184ULL, 7ULL}),
+ BasicDecimal256({9799832789158199296ULL, 16983392049408690284ULL,
+ 12075156704804807296ULL, 79ULL}),
+ BasicDecimal256({5764607523034234880ULL, 3813223830700938301ULL,
+ 10071102605790763273ULL, 796ULL}),
+ BasicDecimal256({2305843009213693952ULL, 1238750159590279781ULL,
+ 8477305689359874652ULL, 7965ULL}),
+ BasicDecimal256({4611686018427387904ULL, 12387501595902797811ULL,
+ 10986080598760540056ULL, 79654ULL}),
+ BasicDecimal256({9223372036854775808ULL, 13194551516770668416ULL,
+ 17627085619057642486ULL, 796545ULL}),
+ BasicDecimal256({0ULL, 2818306651739822853ULL, 10250159527190460323ULL, 7965459ULL}),
+ BasicDecimal256({0ULL, 9736322443688676914ULL, 10267874903356845151ULL, 79654595ULL}),
+ BasicDecimal256(
+ {0ULL, 5129504068339011060ULL, 10445028665020693435ULL, 796545955ULL}),
+ BasicDecimal256(
+ {0ULL, 14401552535971007368ULL, 12216566281659176272ULL, 7965459555ULL}),
+ BasicDecimal256(
+ {0ULL, 14888316843743212368ULL, 11485198374334453031ULL, 79654595556ULL}),
+ BasicDecimal256(
+ {0ULL, 1309215847755710752ULL, 4171519301087220622ULL, 796545955566ULL}),
+ BasicDecimal256(
+ {0ULL, 13092158477557107520ULL, 4821704863453102988ULL, 7965459555662ULL}),
+ BasicDecimal256(
+ {0ULL, 1794376259604213888ULL, 11323560487111926655ULL, 79654595556622ULL}),
+ BasicDecimal256(
+ {0ULL, 17943762596042138880ULL, 2555140428861956854ULL, 796545955566226ULL}),
+ BasicDecimal256(
+ {0ULL, 13416929297035424256ULL, 7104660214910016933ULL, 7965459555662261ULL}),
+ BasicDecimal256(
+ {0ULL, 5042084454387381248ULL, 15706369927971514489ULL, 79654595556622613ULL}),
+ BasicDecimal256(
+ {0ULL, 13527356396454709248ULL, 9489746690038731964ULL, 796545955566226138ULL})};
+
#ifdef ARROW_USE_NATIVE_INT128
static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF;
#else
-static constexpr uint64_t kInt32Mask = 0xFFFFFFFF;
+static constexpr uint64_t kInt32Mask = 0xFFFFFFFF;
#endif
// same as ScaleMultipliers[38] - 1
@@ -468,127 +468,127 @@ BasicDecimal128& BasicDecimal128::operator>>=(uint32_t bits) {
namespace {
-// Convenience wrapper type over 128 bit unsigned integers. We opt not to
-// replace the uint128_t type in int128_internal.h because it would require
-// significantly more implementation work to be done. This class merely
-// provides the minimum necessary set of functions to perform 128+ bit
-// multiplication operations when there may or may not be native support.
+// Convenience wrapper type over 128 bit unsigned integers. We opt not to
+// replace the uint128_t type in int128_internal.h because it would require
+// significantly more implementation work to be done. This class merely
+// provides the minimum necessary set of functions to perform 128+ bit
+// multiplication operations when there may or may not be native support.
#ifdef ARROW_USE_NATIVE_INT128
-struct uint128_t {
- uint128_t() {}
- uint128_t(uint64_t hi, uint64_t lo) : val_((static_cast<__uint128_t>(hi) << 64) | lo) {}
- explicit uint128_t(const BasicDecimal128& decimal) {
- val_ = (static_cast<__uint128_t>(decimal.high_bits()) << 64) | decimal.low_bits();
- }
-
- explicit uint128_t(uint64_t value) : val_(value) {}
-
- uint64_t hi() { return val_ >> 64; }
- uint64_t lo() { return val_ & kInt64Mask; }
-
- uint128_t& operator+=(const uint128_t& other) {
- val_ += other.val_;
- return *this;
- }
-
- uint128_t& operator*=(const uint128_t& other) {
- val_ *= other.val_;
- return *this;
- }
-
- __uint128_t val_;
-};
-
+struct uint128_t {
+ uint128_t() {}
+ uint128_t(uint64_t hi, uint64_t lo) : val_((static_cast<__uint128_t>(hi) << 64) | lo) {}
+ explicit uint128_t(const BasicDecimal128& decimal) {
+ val_ = (static_cast<__uint128_t>(decimal.high_bits()) << 64) | decimal.low_bits();
+ }
+
+ explicit uint128_t(uint64_t value) : val_(value) {}
+
+ uint64_t hi() { return val_ >> 64; }
+ uint64_t lo() { return val_ & kInt64Mask; }
+
+ uint128_t& operator+=(const uint128_t& other) {
+ val_ += other.val_;
+ return *this;
+ }
+
+ uint128_t& operator*=(const uint128_t& other) {
+ val_ *= other.val_;
+ return *this;
+ }
+
+ __uint128_t val_;
+};
+
#else
-// Multiply two 64 bit word components into a 128 bit result, with high bits
-// stored in hi and low bits in lo.
-inline void ExtendAndMultiply(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) {
- // Perform multiplication on two 64 bit words x and y into a 128 bit result
+// Multiply two 64 bit word components into a 128 bit result, with high bits
+// stored in hi and low bits in lo.
+inline void ExtendAndMultiply(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) {
+ // Perform multiplication on two 64 bit words x and y into a 128 bit result
// by splitting up x and y into 32 bit high/low bit components,
// allowing us to represent the multiplication as
// x * y = x_lo * y_lo + x_hi * y_lo * 2^32 + y_hi * x_lo * 2^32
- // + x_hi * y_hi * 2^64
+ // + x_hi * y_hi * 2^64
//
- // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi
+ // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi
// Therefore,
// lo_lo is (x_lo * y_lo)_lo,
// lo_hi is ((x_lo * y_lo)_hi + (x_hi * y_lo)_lo + (x_lo * y_hi)_lo)_lo,
// hi_lo is ((x_hi * y_hi)_lo + (x_hi * y_lo)_hi + (x_lo * y_hi)_hi)_hi,
// hi_hi is (x_hi * y_hi)_hi
- const uint64_t x_lo = x & kInt32Mask;
- const uint64_t y_lo = y & kInt32Mask;
+ const uint64_t x_lo = x & kInt32Mask;
+ const uint64_t y_lo = y & kInt32Mask;
const uint64_t x_hi = x >> 32;
const uint64_t y_hi = y >> 32;
const uint64_t t = x_lo * y_lo;
- const uint64_t t_lo = t & kInt32Mask;
+ const uint64_t t_lo = t & kInt32Mask;
const uint64_t t_hi = t >> 32;
const uint64_t u = x_hi * y_lo + t_hi;
- const uint64_t u_lo = u & kInt32Mask;
+ const uint64_t u_lo = u & kInt32Mask;
const uint64_t u_hi = u >> 32;
const uint64_t v = x_lo * y_hi + u_lo;
const uint64_t v_hi = v >> 32;
*hi = x_hi * y_hi + u_hi + v_hi;
- *lo = (v << 32) + t_lo;
-}
-
-struct uint128_t {
- uint128_t() {}
- uint128_t(uint64_t hi, uint64_t lo) : hi_(hi), lo_(lo) {}
- explicit uint128_t(const BasicDecimal128& decimal) {
- hi_ = decimal.high_bits();
- lo_ = decimal.low_bits();
- }
-
- uint64_t hi() const { return hi_; }
- uint64_t lo() const { return lo_; }
-
- uint128_t& operator+=(const uint128_t& other) {
- // To deduce the carry bit, we perform "65 bit" addition on the low bits and
- // seeing if the resulting high bit is 1. This is accomplished by shifting the
- // low bits to the right by 1 (chopping off the lowest bit), then adding 1 if the
- // result of adding the two chopped bits would have produced a carry.
- uint64_t carry = (((lo_ & other.lo_) & 1) + (lo_ >> 1) + (other.lo_ >> 1)) >> 63;
- hi_ += other.hi_ + carry;
- lo_ += other.lo_;
- return *this;
- }
-
- uint128_t& operator*=(const uint128_t& other) {
- uint128_t r;
- ExtendAndMultiply(lo_, other.lo_, &r.hi_, &r.lo_);
- r.hi_ += (hi_ * other.lo_) + (lo_ * other.hi_);
- *this = r;
- return *this;
- }
-
- uint64_t hi_;
- uint64_t lo_;
-};
+ *lo = (v << 32) + t_lo;
+}
+
+struct uint128_t {
+ uint128_t() {}
+ uint128_t(uint64_t hi, uint64_t lo) : hi_(hi), lo_(lo) {}
+ explicit uint128_t(const BasicDecimal128& decimal) {
+ hi_ = decimal.high_bits();
+ lo_ = decimal.low_bits();
+ }
+
+ uint64_t hi() const { return hi_; }
+ uint64_t lo() const { return lo_; }
+
+ uint128_t& operator+=(const uint128_t& other) {
+ // To deduce the carry bit, we perform "65 bit" addition on the low bits and
+ // seeing if the resulting high bit is 1. This is accomplished by shifting the
+ // low bits to the right by 1 (chopping off the lowest bit), then adding 1 if the
+ // result of adding the two chopped bits would have produced a carry.
+ uint64_t carry = (((lo_ & other.lo_) & 1) + (lo_ >> 1) + (other.lo_ >> 1)) >> 63;
+ hi_ += other.hi_ + carry;
+ lo_ += other.lo_;
+ return *this;
+ }
+
+ uint128_t& operator*=(const uint128_t& other) {
+ uint128_t r;
+ ExtendAndMultiply(lo_, other.lo_, &r.hi_, &r.lo_);
+ r.hi_ += (hi_ * other.lo_) + (lo_ * other.hi_);
+ *this = r;
+ return *this;
+ }
+
+ uint64_t hi_;
+ uint64_t lo_;
+};
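+
+// Worked example (illustrative) of the carry detection in operator+= above,
+// with lo_ == other.lo_ == 1ULL << 63:
+//   (lo_ & other.lo_ & 1) == 0 and (lo_ >> 1) + (other.lo_ >> 1) == 1ULL << 63,
+//   so carry == (1ULL << 63) >> 63 == 1, matching the overflow of lo_ + other.lo_.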
#endif
-// Multiplies two N * 64 bit unsigned integer types, represented by a uint64_t
-// array into a same sized output. Elements in the array should be in
-// little endian order, and output will be the same. Overflow in multiplication
-// will result in the lower N * 64 bits of the result being set.
-template <int N>
-inline void MultiplyUnsignedArray(const std::array<uint64_t, N>& lh,
- const std::array<uint64_t, N>& rh,
- std::array<uint64_t, N>* result) {
- for (int j = 0; j < N; ++j) {
- uint64_t carry = 0;
- for (int i = 0; i < N - j; ++i) {
- uint128_t tmp(lh[i]);
- tmp *= uint128_t(rh[j]);
- tmp += uint128_t((*result)[i + j]);
- tmp += uint128_t(carry);
- (*result)[i + j] = tmp.lo();
- carry = tmp.hi();
- }
- }
+// Multiplies two N * 64 bit unsigned integer types, represented by a uint64_t
+// array into a same sized output. Elements in the array should be in
+// little endian order, and output will be the same. Overflow in multiplication
+// will result in the lower N * 64 bits of the result being set.
+template <int N>
+inline void MultiplyUnsignedArray(const std::array<uint64_t, N>& lh,
+ const std::array<uint64_t, N>& rh,
+ std::array<uint64_t, N>* result) {
+ for (int j = 0; j < N; ++j) {
+ uint64_t carry = 0;
+ for (int i = 0; i < N - j; ++i) {
+ uint128_t tmp(lh[i]);
+ tmp *= uint128_t(rh[j]);
+ tmp += uint128_t((*result)[i + j]);
+ tmp += uint128_t(carry);
+ (*result)[i + j] = tmp.lo();
+ carry = tmp.hi();
+ }
+ }
}
} // namespace
@@ -599,62 +599,62 @@ BasicDecimal128& BasicDecimal128::operator*=(const BasicDecimal128& right) {
const bool negate = Sign() != right.Sign();
BasicDecimal128 x = BasicDecimal128::Abs(*this);
BasicDecimal128 y = BasicDecimal128::Abs(right);
- uint128_t r(x);
- r *= uint128_t{y};
- high_bits_ = r.hi();
- low_bits_ = r.lo();
+ uint128_t r(x);
+ r *= uint128_t{y};
+ high_bits_ = r.hi();
+ low_bits_ = r.lo();
if (negate) {
Negate();
}
return *this;
}
-/// Expands the given little endian array of uint64_t into a big endian array of
-/// uint32_t. The value of input array is expected to be non-negative. The result_array
-/// will remove leading zeros from the input array.
-/// \param value_array a little endian array to represent the value
-/// \param result_array a big endian array of length N*2 to set with the value
-/// \result the output length of the array
-template <size_t N>
-static int64_t FillInArray(const std::array<uint64_t, N>& value_array,
- uint32_t* result_array) {
- int64_t next_index = 0;
- // 1st loop to find out 1st non-negative value in input
- int64_t i = N - 1;
- for (; i >= 0; i--) {
- if (value_array[i] != 0) {
- if (value_array[i] <= std::numeric_limits<uint32_t>::max()) {
- result_array[next_index++] = static_cast<uint32_t>(value_array[i]);
- i--;
- }
- break;
- }
- }
- // 2nd loop to fill in the rest of the array.
- for (int64_t j = i; j >= 0; j--) {
- result_array[next_index++] = static_cast<uint32_t>(value_array[j] >> 32);
- result_array[next_index++] = static_cast<uint32_t>(value_array[j]);
- }
- return next_index;
-}
-
-/// Expands the given value into a big endian array of ints so that we can work on
-/// it. The array will be converted to an absolute value and the was_negative
+/// Expands the given little endian array of uint64_t into a big endian array of
+/// uint32_t. The value of input array is expected to be non-negative. The result_array
+/// will remove leading zeros from the input array.
+/// \param value_array a little endian array to represent the value
+/// \param result_array a big endian array of length N*2 to set with the value
+/// \result the output length of the array
+template <size_t N>
+static int64_t FillInArray(const std::array<uint64_t, N>& value_array,
+ uint32_t* result_array) {
+ int64_t next_index = 0;
+ // 1st loop to find out 1st non-negative value in input
+ int64_t i = N - 1;
+ for (; i >= 0; i--) {
+ if (value_array[i] != 0) {
+ if (value_array[i] <= std::numeric_limits<uint32_t>::max()) {
+ result_array[next_index++] = static_cast<uint32_t>(value_array[i]);
+ i--;
+ }
+ break;
+ }
+ }
+ // 2nd loop to fill in the rest of the array.
+ for (int64_t j = i; j >= 0; j--) {
+ result_array[next_index++] = static_cast<uint32_t>(value_array[j] >> 32);
+ result_array[next_index++] = static_cast<uint32_t>(value_array[j]);
+ }
+ return next_index;
+}
+
+/// Expands the given value into a big endian array of ints so that we can work on
+/// it. The array will be converted to an absolute value and the was_negative
/// flag will be set appropriately. The array will remove leading zeros from
/// the value.
-/// \param array a big endian array of length 4 to set with the value
+/// \param array a big endian array of length 4 to set with the value
/// \param was_negative a flag for whether the value was original negative
/// \result the output length of the array
static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
bool& was_negative) {
- BasicDecimal128 abs_value = BasicDecimal128::Abs(value);
- was_negative = value.high_bits() < 0;
- uint64_t high = static_cast<uint64_t>(abs_value.high_bits());
- uint64_t low = abs_value.low_bits();
-
- // FillInArray(std::array<uint64_t, N>& value_array, uint32_t* result_array) is not
- // called here as the following code has better performance, to avoid regression on
- // BasicDecimal128 Division.
+ BasicDecimal128 abs_value = BasicDecimal128::Abs(value);
+ was_negative = value.high_bits() < 0;
+ uint64_t high = static_cast<uint64_t>(abs_value.high_bits());
+ uint64_t low = abs_value.low_bits();
+
+ // FillInArray(std::array<uint64_t, N>& value_array, uint32_t* result_array) is not
+ // called here as the following code has better performance, to avoid regression on
+ // BasicDecimal128 Division.
if (high != 0) {
if (high > std::numeric_limits<uint32_t>::max()) {
array[0] = static_cast<uint32_t>(high >> 32);
@@ -670,7 +670,7 @@ static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
return 3;
}
- if (low > std::numeric_limits<uint32_t>::max()) {
+ if (low > std::numeric_limits<uint32_t>::max()) {
array[0] = static_cast<uint32_t>(low >> 32);
array[1] = static_cast<uint32_t>(low);
return 2;
@@ -684,24 +684,24 @@ static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
return 1;
}
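// Worked example: for the value 2^40, high is 0 and low is 0x10000000000,
// which exceeds UINT32_MAX, so the array becomes {0x00000100, 0x00000000}
// and the returned length is 2.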
-/// Expands the given value into a big endian array of ints so that we can work on
-/// it. The array will be converted to an absolute value and the was_negative
-/// flag will be set appropriately. The array will remove leading zeros from
-/// the value.
-/// \param array a big endian array of length 8 to set with the value
-/// \param was_negative a flag for whether the value was original negative
-/// \result the output length of the array
-static int64_t FillInArray(const BasicDecimal256& value, uint32_t* array,
- bool& was_negative) {
- BasicDecimal256 positive_value = value;
- was_negative = false;
- if (positive_value.IsNegative()) {
- positive_value.Negate();
- was_negative = true;
- }
- return FillInArray<4>(positive_value.little_endian_array(), array);
-}
-
+/// Expands the given value into a big endian array of ints so that we can work on
+/// it. The value will be converted to its absolute value and the was_negative
+/// flag will be set appropriately. Leading zeros will be stripped from
+/// the resulting array.
+/// \param array a big endian array of length 8 to fill with the value
+/// \param was_negative a flag for whether the value was originally negative
+/// \return the output length of the array
+static int64_t FillInArray(const BasicDecimal256& value, uint32_t* array,
+ bool& was_negative) {
+ BasicDecimal256 positive_value = value;
+ was_negative = false;
+ if (positive_value.IsNegative()) {
+ positive_value.Negate();
+ was_negative = true;
+ }
+ return FillInArray<4>(positive_value.little_endian_array(), array);
+}
+
/// Shift the number in the array left by bits positions.
/// \param array the number to shift, must have length elements
/// \param length the number of entries in the array
@@ -719,7 +719,7 @@ static void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
/// \param array the number to shift, must have length elements
/// \param length the number of entries in the array
/// \param bits the number of bits to shift (0 <= bits < 32)
-static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
+static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
if (length > 0 && bits != 0) {
for (int64_t i = length - 1; i > 0; --i) {
array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits));
@@ -730,10 +730,10 @@ static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits
/// \brief Fix the signs of the result and remainder at the end of the division based on
/// the signs of the dividend and divisor.
-template <class DecimalClass>
-static inline void FixDivisionSigns(DecimalClass* result, DecimalClass* remainder,
- bool dividend_was_negative,
- bool divisor_was_negative) {
+template <class DecimalClass>
+static inline void FixDivisionSigns(DecimalClass* result, DecimalClass* remainder,
+ bool dividend_was_negative,
+ bool divisor_was_negative) {
if (dividend_was_negative != divisor_was_negative) {
result->Negate();
}
@@ -743,65 +743,65 @@ static inline void FixDivisionSigns(DecimalClass* result, DecimalClass* remainde
}
}
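// Worked example: for -21 / 5 the magnitudes divide to 4 remainder 1; the
// differing signs negate the quotient to -4 and the negative dividend
// negates the remainder to -1, matching the round-toward-zero contract
// documented on Divide().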
-/// \brief Build a little endian array of uint64_t from a big endian array of uint32_t.
-template <size_t N>
-static DecimalStatus BuildFromArray(std::array<uint64_t, N>* result_array,
- const uint32_t* array, int64_t length) {
- for (int64_t i = length - 2 * N - 1; i >= 0; i--) {
- if (array[i] != 0) {
+/// \brief Build a little endian array of uint64_t from a big endian array of uint32_t.
+template <size_t N>
+static DecimalStatus BuildFromArray(std::array<uint64_t, N>* result_array,
+ const uint32_t* array, int64_t length) {
+ for (int64_t i = length - 2 * N - 1; i >= 0; i--) {
+ if (array[i] != 0) {
return DecimalStatus::kOverflow;
- }
- }
- int64_t next_index = length - 1;
- size_t i = 0;
- for (; i < N && next_index >= 0; i++) {
- uint64_t lower_bits = array[next_index--];
- (*result_array)[i] =
- (next_index < 0)
- ? lower_bits
- : ((static_cast<uint64_t>(array[next_index--]) << 32) + lower_bits);
- }
- for (; i < N; i++) {
- (*result_array)[i] = 0;
+ }
}
+ int64_t next_index = length - 1;
+ size_t i = 0;
+ for (; i < N && next_index >= 0; i++) {
+ uint64_t lower_bits = array[next_index--];
+ (*result_array)[i] =
+ (next_index < 0)
+ ? lower_bits
+ : ((static_cast<uint64_t>(array[next_index--]) << 32) + lower_bits);
+ }
+ for (; i < N; i++) {
+ (*result_array)[i] = 0;
+ }
+ return DecimalStatus::kSuccess;
+}
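+// Worked example: rebuilding from the big endian words {0x00000001,
+// 0x23456789} with length = 2 and N = 2 packs them back into
+// (*result_array)[0] = 0x0000000123456789 and zero-fills the remaining
+// element; any non-zero word above the low 2 * N words reports kOverflow.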
+
+/// \brief Build a BasicDecimal128 from a big endian array of uint32_t.
+static DecimalStatus BuildFromArray(BasicDecimal128* value, const uint32_t* array,
+ int64_t length) {
+ std::array<uint64_t, 2> result_array;
+ auto status = BuildFromArray(&result_array, array, length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+ *value = {static_cast<int64_t>(result_array[1]), result_array[0]};
return DecimalStatus::kSuccess;
}
-/// \brief Build a BasicDecimal128 from a big endian array of uint32_t.
-static DecimalStatus BuildFromArray(BasicDecimal128* value, const uint32_t* array,
- int64_t length) {
- std::array<uint64_t, 2> result_array;
- auto status = BuildFromArray(&result_array, array, length);
- if (status != DecimalStatus::kSuccess) {
- return status;
- }
- *value = {static_cast<int64_t>(result_array[1]), result_array[0]};
- return DecimalStatus::kSuccess;
-}
-
-/// \brief Build a BasicDecimal256 from a big endian array of uint32_t.
-static DecimalStatus BuildFromArray(BasicDecimal256* value, const uint32_t* array,
- int64_t length) {
- std::array<uint64_t, 4> result_array;
- auto status = BuildFromArray(&result_array, array, length);
- if (status != DecimalStatus::kSuccess) {
- return status;
- }
- *value = result_array;
- return DecimalStatus::kSuccess;
-}
-
+/// \brief Build a BasicDecimal256 from a big endian array of uint32_t.
+static DecimalStatus BuildFromArray(BasicDecimal256* value, const uint32_t* array,
+ int64_t length) {
+ std::array<uint64_t, 4> result_array;
+ auto status = BuildFromArray(&result_array, array, length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+ *value = result_array;
+ return DecimalStatus::kSuccess;
+}
+
/// \brief Do a division where the divisor fits into a single 32 bit value.
-template <class DecimalClass>
-static inline DecimalStatus SingleDivide(const uint32_t* dividend,
- int64_t dividend_length, uint32_t divisor,
- DecimalClass* remainder,
- bool dividend_was_negative,
- bool divisor_was_negative,
- DecimalClass* result) {
+template <class DecimalClass>
+static inline DecimalStatus SingleDivide(const uint32_t* dividend,
+ int64_t dividend_length, uint32_t divisor,
+ DecimalClass* remainder,
+ bool dividend_was_negative,
+ bool divisor_was_negative,
+ DecimalClass* result) {
uint64_t r = 0;
- constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t) + 1;
- uint32_t result_array[kDecimalArrayLength];
+ constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t) + 1;
+ uint32_t result_array[kDecimalArrayLength];
for (int64_t j = 0; j < dividend_length; j++) {
r <<= 32;
r += dividend[j];
@@ -818,27 +818,27 @@ static inline DecimalStatus SingleDivide(const uint32_t* dividend,
return DecimalStatus::kSuccess;
}
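// Worked example: dividing the big endian words {0x00000001, 0x00000000}
// (i.e. 2^32) by 3: after j = 0, r = 1; at j = 1, r = 0x100000000 and the
// digit is 0x55555555, leaving r = 1, so the quotient is 1431655765 with
// remainder 1.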
-/// \brief Do a decimal division with remainder.
-template <class DecimalClass>
-static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
- const DecimalClass& divisor,
- DecimalClass* result, DecimalClass* remainder) {
- constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t);
+/// \brief Do a decimal division with remainder.
+template <class DecimalClass>
+static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
+ const DecimalClass& divisor,
+ DecimalClass* result, DecimalClass* remainder) {
+ constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t);
// Split the dividend and divisor into integer pieces so that we can
// work on them.
- uint32_t dividend_array[kDecimalArrayLength + 1];
- uint32_t divisor_array[kDecimalArrayLength];
+ uint32_t dividend_array[kDecimalArrayLength + 1];
+ uint32_t divisor_array[kDecimalArrayLength];
bool dividend_was_negative;
bool divisor_was_negative;
// leave an extra zero before the dividend
dividend_array[0] = 0;
int64_t dividend_length =
- FillInArray(dividend, dividend_array + 1, dividend_was_negative) + 1;
+ FillInArray(dividend, dividend_array + 1, dividend_was_negative) + 1;
int64_t divisor_length = FillInArray(divisor, divisor_array, divisor_was_negative);
// Handle some of the easy cases.
if (dividend_length <= divisor_length) {
- *remainder = dividend;
+ *remainder = dividend;
*result = 0;
return DecimalStatus::kSuccess;
}
@@ -853,8 +853,8 @@ static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
}
int64_t result_length = dividend_length - divisor_length;
- uint32_t result_array[kDecimalArrayLength];
- DCHECK_LE(result_length, kDecimalArrayLength);
+ uint32_t result_array[kDecimalArrayLength];
+ DCHECK_LE(result_length, kDecimalArrayLength);
// Normalize by shifting both by a multiple of 2 so that
// the digit guessing is better. The requirement is that
@@ -933,12 +933,12 @@ static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
return DecimalStatus::kSuccess;
}
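// A minimal usage sketch of the Divide entry point defined below, using only
// the (high, low) constructor and comparisons from basic_decimal.h:
//
//   arrow::BasicDecimal128 quotient, remainder;
//   arrow::BasicDecimal128 dividend(0, 21), divisor(0, 5);
//   auto status = dividend.Divide(divisor, &quotient, &remainder);
//   // status == DecimalStatus::kSuccess, quotient == 4, remainder == 1;
//   // the quotient rounds toward zero and the remainder keeps the
//   // dividend's sign.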
-DecimalStatus BasicDecimal128::Divide(const BasicDecimal128& divisor,
- BasicDecimal128* result,
- BasicDecimal128* remainder) const {
- return DecimalDivide(*this, divisor, result, remainder);
-}
-
+DecimalStatus BasicDecimal128::Divide(const BasicDecimal128& divisor,
+ BasicDecimal128* result,
+ BasicDecimal128* remainder) const {
+ return DecimalDivide(*this, divisor, result, remainder);
+}
+
bool operator==(const BasicDecimal128& left, const BasicDecimal128& right) {
return left.high_bits() == right.high_bits() && left.low_bits() == right.low_bits();
}
@@ -1008,13 +1008,13 @@ BasicDecimal128 operator%(const BasicDecimal128& left, const BasicDecimal128& ri
return remainder;
}
-template <class DecimalClass>
-static bool RescaleWouldCauseDataLoss(const DecimalClass& value, int32_t delta_scale,
- const DecimalClass& multiplier,
- DecimalClass* result) {
+template <class DecimalClass>
+static bool RescaleWouldCauseDataLoss(const DecimalClass& value, int32_t delta_scale,
+ const DecimalClass& multiplier,
+ DecimalClass* result) {
if (delta_scale < 0) {
DCHECK_NE(multiplier, 0);
- DecimalClass remainder;
+ DecimalClass remainder;
auto status = value.Divide(multiplier, result, &remainder);
DCHECK_EQ(status, DecimalStatus::kSuccess);
return remainder != 0;
@@ -1024,23 +1024,23 @@ static bool RescaleWouldCauseDataLoss(const DecimalClass& value, int32_t delta_s
return (value < 0) ? *result > value : *result < value;
}
-template <class DecimalClass>
-DecimalStatus DecimalRescale(const DecimalClass& value, int32_t original_scale,
- int32_t new_scale, DecimalClass* out) {
+template <class DecimalClass>
+DecimalStatus DecimalRescale(const DecimalClass& value, int32_t original_scale,
+ int32_t new_scale, DecimalClass* out) {
DCHECK_NE(out, nullptr);
if (original_scale == new_scale) {
- *out = value;
+ *out = value;
return DecimalStatus::kSuccess;
}
const int32_t delta_scale = new_scale - original_scale;
const int32_t abs_delta_scale = std::abs(delta_scale);
- DecimalClass multiplier = DecimalClass::GetScaleMultiplier(abs_delta_scale);
+ DecimalClass multiplier = DecimalClass::GetScaleMultiplier(abs_delta_scale);
const bool rescale_would_cause_data_loss =
- RescaleWouldCauseDataLoss(value, delta_scale, multiplier, out);
+ RescaleWouldCauseDataLoss(value, delta_scale, multiplier, out);
// Fail if we overflow or truncate
if (ARROW_PREDICT_FALSE(rescale_would_cause_data_loss)) {
@@ -1050,11 +1050,11 @@ DecimalStatus DecimalRescale(const DecimalClass& value, int32_t original_scale,
return DecimalStatus::kSuccess;
}
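// A minimal usage sketch of the Rescale entry point defined below:
//
//   arrow::BasicDecimal128 out;
//   auto ok = arrow::BasicDecimal128(0, 123).Rescale(0, 2, &out);
//   // ok == DecimalStatus::kSuccess and out == 12300 (multiplied by 10^2)
//   auto lossy = arrow::BasicDecimal128(0, 12345).Rescale(2, 1, &out);
//   // dividing by 10 leaves remainder 5, so a non-success status reports
//   // the data loss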
-DecimalStatus BasicDecimal128::Rescale(int32_t original_scale, int32_t new_scale,
- BasicDecimal128* out) const {
- return DecimalRescale(*this, original_scale, new_scale, out);
-}
-
+DecimalStatus BasicDecimal128::Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal128* out) const {
+ return DecimalRescale(*this, original_scale, new_scale, out);
+}
+
void BasicDecimal128::GetWholeAndFraction(int scale, BasicDecimal128* whole,
BasicDecimal128* fraction) const {
DCHECK_GE(scale, 0);
@@ -1117,228 +1117,228 @@ int32_t BasicDecimal128::CountLeadingBinaryZeros() const {
}
}
-#if ARROW_LITTLE_ENDIAN
-BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
- : little_endian_array_(
- std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[0],
- reinterpret_cast<const uint64_t*>(bytes)[1],
- reinterpret_cast<const uint64_t*>(bytes)[2],
- reinterpret_cast<const uint64_t*>(bytes)[3]})) {}
-#else
-BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
- : little_endian_array_(
- std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[3],
- reinterpret_cast<const uint64_t*>(bytes)[2],
- reinterpret_cast<const uint64_t*>(bytes)[1],
- reinterpret_cast<const uint64_t*>(bytes)[0]})) {}
-#endif
-
-BasicDecimal256& BasicDecimal256::Negate() {
- uint64_t carry = 1;
- for (uint64_t& elem : little_endian_array_) {
- elem = ~elem + carry;
- carry &= (elem == 0);
- }
- return *this;
-}
-
-BasicDecimal256& BasicDecimal256::Abs() { return *this < 0 ? Negate() : *this; }
-
-BasicDecimal256 BasicDecimal256::Abs(const BasicDecimal256& in) {
- BasicDecimal256 result(in);
- return result.Abs();
-}
-
-BasicDecimal256& BasicDecimal256::operator+=(const BasicDecimal256& right) {
- uint64_t carry = 0;
- for (size_t i = 0; i < little_endian_array_.size(); i++) {
- const uint64_t right_value = right.little_endian_array_[i];
- uint64_t sum = right_value + carry;
- carry = 0;
- if (sum < right_value) {
- carry += 1;
- }
- sum += little_endian_array_[i];
- if (sum < little_endian_array_[i]) {
- carry += 1;
- }
- little_endian_array_[i] = sum;
- }
- return *this;
-}
-
-BasicDecimal256& BasicDecimal256::operator-=(const BasicDecimal256& right) {
- *this += -right;
- return *this;
-}
-
-BasicDecimal256& BasicDecimal256::operator<<=(uint32_t bits) {
- if (bits == 0) {
- return *this;
- }
- int cross_word_shift = bits / 64;
- if (static_cast<size_t>(cross_word_shift) >= little_endian_array_.size()) {
- little_endian_array_ = {0, 0, 0, 0};
- return *this;
- }
- uint32_t in_word_shift = bits % 64;
- for (int i = static_cast<int>(little_endian_array_.size() - 1); i >= cross_word_shift;
- i--) {
- // Account for shifts larger then 64 bits
- little_endian_array_[i] = little_endian_array_[i - cross_word_shift];
- little_endian_array_[i] <<= in_word_shift;
- if (in_word_shift != 0 && i >= cross_word_shift + 1) {
- little_endian_array_[i] |=
- little_endian_array_[i - (cross_word_shift + 1)] >> (64 - in_word_shift);
- }
- }
- for (int i = cross_word_shift - 1; i >= 0; i--) {
- little_endian_array_[i] = 0;
- }
- return *this;
-}
-
-std::array<uint8_t, 32> BasicDecimal256::ToBytes() const {
- std::array<uint8_t, 32> out{{0}};
- ToBytes(out.data());
- return out;
-}
-
-void BasicDecimal256::ToBytes(uint8_t* out) const {
- DCHECK_NE(out, nullptr);
-#if ARROW_LITTLE_ENDIAN
- reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[0];
- reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[1];
- reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[2];
- reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[3];
-#else
- reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[3];
- reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[2];
- reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[1];
- reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[0];
-#endif
-}
-
-BasicDecimal256& BasicDecimal256::operator*=(const BasicDecimal256& right) {
- // Since the max value of BasicDecimal256 is supposed to be 1e76 - 1 and the
- // min the negation taking the absolute values here should always be safe.
- const bool negate = Sign() != right.Sign();
- BasicDecimal256 x = BasicDecimal256::Abs(*this);
- BasicDecimal256 y = BasicDecimal256::Abs(right);
-
- uint128_t r_hi;
- uint128_t r_lo;
- std::array<uint64_t, 4> res{0, 0, 0, 0};
- MultiplyUnsignedArray<4>(x.little_endian_array_, y.little_endian_array_, &res);
- little_endian_array_ = res;
- if (negate) {
- Negate();
- }
- return *this;
-}
-
-DecimalStatus BasicDecimal256::Divide(const BasicDecimal256& divisor,
- BasicDecimal256* result,
- BasicDecimal256* remainder) const {
- return DecimalDivide(*this, divisor, result, remainder);
-}
-
-DecimalStatus BasicDecimal256::Rescale(int32_t original_scale, int32_t new_scale,
- BasicDecimal256* out) const {
- return DecimalRescale(*this, original_scale, new_scale, out);
-}
-
-BasicDecimal256 BasicDecimal256::IncreaseScaleBy(int32_t increase_by) const {
- DCHECK_GE(increase_by, 0);
- DCHECK_LE(increase_by, 76);
-
- return (*this) * ScaleMultipliersDecimal256[increase_by];
-}
-
-BasicDecimal256 BasicDecimal256::ReduceScaleBy(int32_t reduce_by, bool round) const {
- DCHECK_GE(reduce_by, 0);
- DCHECK_LE(reduce_by, 76);
-
- if (reduce_by == 0) {
- return *this;
- }
-
- BasicDecimal256 divisor(ScaleMultipliersDecimal256[reduce_by]);
- BasicDecimal256 result;
- BasicDecimal256 remainder;
- auto s = Divide(divisor, &result, &remainder);
- DCHECK_EQ(s, DecimalStatus::kSuccess);
- if (round) {
- auto divisor_half = ScaleMultipliersHalfDecimal256[reduce_by];
- if (remainder.Abs() >= divisor_half) {
- if (result > 0) {
- result += 1;
- } else {
- result -= 1;
- }
- }
- }
- return result;
-}
-
-bool BasicDecimal256::FitsInPrecision(int32_t precision) const {
- DCHECK_GT(precision, 0);
- DCHECK_LE(precision, 76);
- return BasicDecimal256::Abs(*this) < ScaleMultipliersDecimal256[precision];
-}
-
-const BasicDecimal256& BasicDecimal256::GetScaleMultiplier(int32_t scale) {
- DCHECK_GE(scale, 0);
- DCHECK_LE(scale, 76);
-
- return ScaleMultipliersDecimal256[scale];
-}
-
-BasicDecimal256 operator*(const BasicDecimal256& left, const BasicDecimal256& right) {
- BasicDecimal256 result = left;
- result *= right;
- return result;
-}
-
-bool operator<(const BasicDecimal256& left, const BasicDecimal256& right) {
- const std::array<uint64_t, 4>& lhs = left.little_endian_array();
- const std::array<uint64_t, 4>& rhs = right.little_endian_array();
- return lhs[3] != rhs[3]
- ? static_cast<int64_t>(lhs[3]) < static_cast<int64_t>(rhs[3])
- : lhs[2] != rhs[2] ? lhs[2] < rhs[2]
- : lhs[1] != rhs[1] ? lhs[1] < rhs[1] : lhs[0] < rhs[0];
-}
-
-BasicDecimal256 operator-(const BasicDecimal256& operand) {
- BasicDecimal256 result(operand);
- return result.Negate();
-}
-
-BasicDecimal256 operator~(const BasicDecimal256& operand) {
- const std::array<uint64_t, 4>& arr = operand.little_endian_array();
- BasicDecimal256 result({~arr[0], ~arr[1], ~arr[2], ~arr[3]});
- return result;
-}
-
-BasicDecimal256& BasicDecimal256::operator/=(const BasicDecimal256& right) {
- BasicDecimal256 remainder;
- auto s = Divide(right, this, &remainder);
- DCHECK_EQ(s, DecimalStatus::kSuccess);
- return *this;
-}
-
-BasicDecimal256 operator+(const BasicDecimal256& left, const BasicDecimal256& right) {
- BasicDecimal256 sum = left;
- sum += right;
- return sum;
-}
-
-BasicDecimal256 operator/(const BasicDecimal256& left, const BasicDecimal256& right) {
- BasicDecimal256 remainder;
- BasicDecimal256 result;
- auto s = left.Divide(right, &result, &remainder);
- DCHECK_EQ(s, DecimalStatus::kSuccess);
- return result;
-}
-
+#if ARROW_LITTLE_ENDIAN
+BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
+ : little_endian_array_(
+ std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[0],
+ reinterpret_cast<const uint64_t*>(bytes)[1],
+ reinterpret_cast<const uint64_t*>(bytes)[2],
+ reinterpret_cast<const uint64_t*>(bytes)[3]})) {}
+#else
+BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
+ : little_endian_array_(
+ std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[3],
+ reinterpret_cast<const uint64_t*>(bytes)[2],
+ reinterpret_cast<const uint64_t*>(bytes)[1],
+ reinterpret_cast<const uint64_t*>(bytes)[0]})) {}
+#endif
+
+BasicDecimal256& BasicDecimal256::Negate() {
+ uint64_t carry = 1;
+ for (uint64_t& elem : little_endian_array_) {
+ elem = ~elem + carry;
+ carry &= (elem == 0);
+ }
+ return *this;
+}
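+// Worked example: negating 1 (words {1, 0, 0, 0}) turns word 0 into
+// 0xFFFFFFFFFFFFFFFF, which consumes the carry, and the remaining words
+// become all ones, i.e. the two's complement encoding of -1.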
+
+BasicDecimal256& BasicDecimal256::Abs() { return *this < 0 ? Negate() : *this; }
+
+BasicDecimal256 BasicDecimal256::Abs(const BasicDecimal256& in) {
+ BasicDecimal256 result(in);
+ return result.Abs();
+}
+
+BasicDecimal256& BasicDecimal256::operator+=(const BasicDecimal256& right) {
+ uint64_t carry = 0;
+ for (size_t i = 0; i < little_endian_array_.size(); i++) {
+ const uint64_t right_value = right.little_endian_array_[i];
+ uint64_t sum = right_value + carry;
+ carry = 0;
+ if (sum < right_value) {
+ carry += 1;
+ }
+ sum += little_endian_array_[i];
+ if (sum < little_endian_array_[i]) {
+ carry += 1;
+ }
+ little_endian_array_[i] = sum;
+ }
+ return *this;
+}
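+// Worked example: adding 1 to {0xFFFFFFFFFFFFFFFF, 0, 0, 0} overflows word 0
+// to zero and carries into word 1, giving {0, 1, 0, 0}, i.e. 2^64; any carry
+// out of word 3 is discarded, truncating the sum to 256 bits as documented.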
+
+BasicDecimal256& BasicDecimal256::operator-=(const BasicDecimal256& right) {
+ *this += -right;
+ return *this;
+}
+
+BasicDecimal256& BasicDecimal256::operator<<=(uint32_t bits) {
+ if (bits == 0) {
+ return *this;
+ }
+ int cross_word_shift = bits / 64;
+ if (static_cast<size_t>(cross_word_shift) >= little_endian_array_.size()) {
+ little_endian_array_ = {0, 0, 0, 0};
+ return *this;
+ }
+ uint32_t in_word_shift = bits % 64;
+ for (int i = static_cast<int>(little_endian_array_.size() - 1); i >= cross_word_shift;
+ i--) {
+    // Account for shifts larger than 64 bits
+ little_endian_array_[i] = little_endian_array_[i - cross_word_shift];
+ little_endian_array_[i] <<= in_word_shift;
+ if (in_word_shift != 0 && i >= cross_word_shift + 1) {
+ little_endian_array_[i] |=
+ little_endian_array_[i - (cross_word_shift + 1)] >> (64 - in_word_shift);
+ }
+ }
+ for (int i = cross_word_shift - 1; i >= 0; i--) {
+ little_endian_array_[i] = 0;
+ }
+ return *this;
+}
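+// Worked example: shifting {1, 0, 0, 0} left by 65 bits uses
+// cross_word_shift = 1 and in_word_shift = 1, producing {0, 2, 0, 0},
+// i.e. 2^65; shifts of 256 bits or more clear the value to zero.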
+
+std::array<uint8_t, 32> BasicDecimal256::ToBytes() const {
+ std::array<uint8_t, 32> out{{0}};
+ ToBytes(out.data());
+ return out;
+}
+
+void BasicDecimal256::ToBytes(uint8_t* out) const {
+ DCHECK_NE(out, nullptr);
+#if ARROW_LITTLE_ENDIAN
+ reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[0];
+ reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[1];
+ reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[2];
+ reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[3];
+#else
+ reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[3];
+ reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[2];
+ reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[1];
+ reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[0];
+#endif
+}
+
+BasicDecimal256& BasicDecimal256::operator*=(const BasicDecimal256& right) {
+ // Since the max value of BasicDecimal256 is supposed to be 1e76 - 1 and the
+  // min is its negation, taking the absolute values here should always be safe.
+ const bool negate = Sign() != right.Sign();
+ BasicDecimal256 x = BasicDecimal256::Abs(*this);
+ BasicDecimal256 y = BasicDecimal256::Abs(right);
+
+ std::array<uint64_t, 4> res{0, 0, 0, 0};
+ MultiplyUnsignedArray<4>(x.little_endian_array_, y.little_endian_array_, &res);
+ little_endian_array_ = res;
+ if (negate) {
+ Negate();
+ }
+ return *this;
+}
+
+DecimalStatus BasicDecimal256::Divide(const BasicDecimal256& divisor,
+ BasicDecimal256* result,
+ BasicDecimal256* remainder) const {
+ return DecimalDivide(*this, divisor, result, remainder);
+}
+
+DecimalStatus BasicDecimal256::Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal256* out) const {
+ return DecimalRescale(*this, original_scale, new_scale, out);
+}
+
+BasicDecimal256 BasicDecimal256::IncreaseScaleBy(int32_t increase_by) const {
+ DCHECK_GE(increase_by, 0);
+ DCHECK_LE(increase_by, 76);
+
+ return (*this) * ScaleMultipliersDecimal256[increase_by];
+}
+
+BasicDecimal256 BasicDecimal256::ReduceScaleBy(int32_t reduce_by, bool round) const {
+ DCHECK_GE(reduce_by, 0);
+ DCHECK_LE(reduce_by, 76);
+
+ if (reduce_by == 0) {
+ return *this;
+ }
+
+ BasicDecimal256 divisor(ScaleMultipliersDecimal256[reduce_by]);
+ BasicDecimal256 result;
+ BasicDecimal256 remainder;
+ auto s = Divide(divisor, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ if (round) {
+ auto divisor_half = ScaleMultipliersHalfDecimal256[reduce_by];
+ if (remainder.Abs() >= divisor_half) {
+ if (result > 0) {
+ result += 1;
+ } else {
+ result -= 1;
+ }
+ }
+ }
+ return result;
+}
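+// Worked example: 12345.ReduceScaleBy(2) divides by 100, giving result 123
+// and remainder 45; |45| is below the halfway value 50, so no rounding
+// occurs. For 12355 the remainder 55 reaches the halfway value, so the
+// result is rounded to 124.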
+
+bool BasicDecimal256::FitsInPrecision(int32_t precision) const {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 76);
+ return BasicDecimal256::Abs(*this) < ScaleMultipliersDecimal256[precision];
+}
+
+const BasicDecimal256& BasicDecimal256::GetScaleMultiplier(int32_t scale) {
+ DCHECK_GE(scale, 0);
+ DCHECK_LE(scale, 76);
+
+ return ScaleMultipliersDecimal256[scale];
+}
+
+BasicDecimal256 operator*(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 result = left;
+ result *= right;
+ return result;
+}
+
+bool operator<(const BasicDecimal256& left, const BasicDecimal256& right) {
+ const std::array<uint64_t, 4>& lhs = left.little_endian_array();
+ const std::array<uint64_t, 4>& rhs = right.little_endian_array();
+ return lhs[3] != rhs[3]
+ ? static_cast<int64_t>(lhs[3]) < static_cast<int64_t>(rhs[3])
+ : lhs[2] != rhs[2] ? lhs[2] < rhs[2]
+ : lhs[1] != rhs[1] ? lhs[1] < rhs[1] : lhs[0] < rhs[0];
+}
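+// Worked example: for -1 (all words 0xFF...FF) versus +1, the most
+// significant words differ, so the signed compare of word 3 decides
+// (-1 < 0) and -1 < 1 holds; lower words compare unsigned.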
+
+BasicDecimal256 operator-(const BasicDecimal256& operand) {
+ BasicDecimal256 result(operand);
+ return result.Negate();
+}
+
+BasicDecimal256 operator~(const BasicDecimal256& operand) {
+ const std::array<uint64_t, 4>& arr = operand.little_endian_array();
+ BasicDecimal256 result({~arr[0], ~arr[1], ~arr[2], ~arr[3]});
+ return result;
+}
+
+BasicDecimal256& BasicDecimal256::operator/=(const BasicDecimal256& right) {
+ BasicDecimal256 remainder;
+ auto s = Divide(right, this, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return *this;
+}
+
+BasicDecimal256 operator+(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 sum = left;
+ sum += right;
+ return sum;
+}
+
+BasicDecimal256 operator/(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 remainder;
+ BasicDecimal256 result;
+ auto s = left.Divide(right, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return result;
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
index acc8ea4930f..a8f61c73c87 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
@@ -42,8 +42,8 @@ enum class DecimalStatus {
/// streams and boost.
class ARROW_EXPORT BasicDecimal128 {
public:
- static constexpr int bit_width = 128;
-
+ static constexpr int bit_width = 128;
+
/// \brief Create a BasicDecimal128 from the two's complement representation.
constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
: low_bits_(low), high_bits_(high) {}
@@ -111,10 +111,10 @@ class ARROW_EXPORT BasicDecimal128 {
BasicDecimal128& operator>>=(uint32_t bits);
/// \brief Get the high bits of the two's complement representation of the number.
- inline constexpr int64_t high_bits() const { return high_bits_; }
+ inline constexpr int64_t high_bits() const { return high_bits_; }
/// \brief Get the low bits of the two's complement representation of the number.
- inline constexpr uint64_t low_bits() const { return low_bits_; }
+ inline constexpr uint64_t low_bits() const { return low_bits_; }
/// \brief Return the raw bytes of the value in native-endian byte order.
std::array<uint8_t, 16> ToBytes() const;
@@ -180,163 +180,163 @@ ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left,
ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left,
const BasicDecimal128& right);
-class ARROW_EXPORT BasicDecimal256 {
- private:
- // Due to a bug in clang, we have to declare the extend method prior to its
- // usage.
- template <typename T>
- inline static constexpr uint64_t extend(T low_bits) noexcept {
- return low_bits >= T() ? uint64_t{0} : ~uint64_t{0};
- }
-
- public:
- static constexpr int bit_width = 256;
-
- /// \brief Create a BasicDecimal256 from the two's complement representation.
- constexpr BasicDecimal256(const std::array<uint64_t, 4>& little_endian_array) noexcept
- : little_endian_array_(little_endian_array) {}
-
- /// \brief Empty constructor creates a BasicDecimal256 with a value of 0.
- constexpr BasicDecimal256() noexcept : little_endian_array_({0, 0, 0, 0}) {}
-
- /// \brief Convert any integer value into a BasicDecimal256.
- template <typename T,
- typename = typename std::enable_if<
- std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
- constexpr BasicDecimal256(T value) noexcept
- : little_endian_array_({static_cast<uint64_t>(value), extend(value), extend(value),
- extend(value)}) {}
-
- constexpr BasicDecimal256(const BasicDecimal128& value) noexcept
- : little_endian_array_({value.low_bits(), static_cast<uint64_t>(value.high_bits()),
- extend(value.high_bits()), extend(value.high_bits())}) {}
-
- /// \brief Create a BasicDecimal256 from an array of bytes. Bytes are assumed to be in
- /// native-endian byte order.
- explicit BasicDecimal256(const uint8_t* bytes);
-
- /// \brief Negate the current value (in-place)
- BasicDecimal256& Negate();
-
- /// \brief Absolute value (in-place)
- BasicDecimal256& Abs();
-
- /// \brief Absolute value
- static BasicDecimal256 Abs(const BasicDecimal256& left);
-
- /// \brief Add a number to this one. The result is truncated to 256 bits.
- BasicDecimal256& operator+=(const BasicDecimal256& right);
-
- /// \brief Subtract a number from this one. The result is truncated to 256 bits.
- BasicDecimal256& operator-=(const BasicDecimal256& right);
-
- /// \brief Get the bits of the two's complement representation of the number. The 4
- /// elements are in little endian order. The bits within each uint64_t element are in
- /// native endian order. For example,
- /// BasicDecimal256(123).little_endian_array() = {123, 0, 0, 0};
- /// BasicDecimal256(-2).little_endian_array() = {0xFF...FE, 0xFF...FF, 0xFF...FF,
- /// 0xFF...FF}.
- inline const std::array<uint64_t, 4>& little_endian_array() const {
- return little_endian_array_;
- }
-
- /// \brief Get the lowest bits of the two's complement representation of the number.
- inline constexpr uint64_t low_bits() const { return little_endian_array_[0]; }
-
- /// \brief Return the raw bytes of the value in native-endian byte order.
- std::array<uint8_t, 32> ToBytes() const;
- void ToBytes(uint8_t* out) const;
-
- /// \brief Scale multiplier for given scale value.
- static const BasicDecimal256& GetScaleMultiplier(int32_t scale);
-
- /// \brief Convert BasicDecimal256 from one scale to another
- DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
- BasicDecimal256* out) const;
-
- /// \brief Scale up.
- BasicDecimal256 IncreaseScaleBy(int32_t increase_by) const;
-
- /// \brief Scale down.
- /// - If 'round' is true, the right-most digits are dropped and the result value is
- /// rounded up (+1 for positive, -1 for negative) based on the value of the
- /// dropped digits (>= 10^reduce_by / 2).
- /// - If 'round' is false, the right-most digits are simply dropped.
- BasicDecimal256 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
-
- /// \brief Whether this number fits in the given precision
- ///
- /// Return true if the number of significant digits is less or equal to `precision`.
- bool FitsInPrecision(int32_t precision) const;
-
- inline int64_t Sign() const {
- return 1 | (static_cast<int64_t>(little_endian_array_[3]) >> 63);
- }
-
- inline int64_t IsNegative() const {
- return static_cast<int64_t>(little_endian_array_[3]) < 0;
- }
-
- /// \brief Multiply this number by another number. The result is truncated to 256 bits.
- BasicDecimal256& operator*=(const BasicDecimal256& right);
-
- /// Divide this number by right and return the result.
- ///
- /// This operation is not destructive.
- /// The answer rounds to zero. Signs work like:
- /// 21 / 5 -> 4, 1
- /// -21 / 5 -> -4, -1
- /// 21 / -5 -> -4, 1
- /// -21 / -5 -> 4, -1
- /// \param[in] divisor the number to divide by
- /// \param[out] result the quotient
- /// \param[out] remainder the remainder after the division
- DecimalStatus Divide(const BasicDecimal256& divisor, BasicDecimal256* result,
- BasicDecimal256* remainder) const;
-
- /// \brief Shift left by the given number of bits.
- BasicDecimal256& operator<<=(uint32_t bits);
-
- /// \brief In-place division.
- BasicDecimal256& operator/=(const BasicDecimal256& right);
-
- private:
- std::array<uint64_t, 4> little_endian_array_;
-};
-
-ARROW_EXPORT inline bool operator==(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return left.little_endian_array() == right.little_endian_array();
-}
-
-ARROW_EXPORT inline bool operator!=(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return left.little_endian_array() != right.little_endian_array();
-}
-
-ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& right);
-
-ARROW_EXPORT inline bool operator<=(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return !operator<(right, left);
-}
-
-ARROW_EXPORT inline bool operator>(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return operator<(right, left);
-}
-
-ARROW_EXPORT inline bool operator>=(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return !operator<(left, right);
-}
-
-ARROW_EXPORT BasicDecimal256 operator-(const BasicDecimal256& operand);
-ARROW_EXPORT BasicDecimal256 operator~(const BasicDecimal256& operand);
-ARROW_EXPORT BasicDecimal256 operator+(const BasicDecimal256& left,
- const BasicDecimal256& right);
-ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
- const BasicDecimal256& right);
-ARROW_EXPORT BasicDecimal256 operator/(const BasicDecimal256& left,
- const BasicDecimal256& right);
+class ARROW_EXPORT BasicDecimal256 {
+ private:
+ // Due to a bug in clang, we have to declare the extend method prior to its
+ // usage.
+ template <typename T>
+ inline static constexpr uint64_t extend(T low_bits) noexcept {
+ return low_bits >= T() ? uint64_t{0} : ~uint64_t{0};
+ }
+
+ public:
+ static constexpr int bit_width = 256;
+
+ /// \brief Create a BasicDecimal256 from the two's complement representation.
+ constexpr BasicDecimal256(const std::array<uint64_t, 4>& little_endian_array) noexcept
+ : little_endian_array_(little_endian_array) {}
+
+ /// \brief Empty constructor creates a BasicDecimal256 with a value of 0.
+ constexpr BasicDecimal256() noexcept : little_endian_array_({0, 0, 0, 0}) {}
+
+ /// \brief Convert any integer value into a BasicDecimal256.
+ template <typename T,
+ typename = typename std::enable_if<
+ std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
+ constexpr BasicDecimal256(T value) noexcept
+ : little_endian_array_({static_cast<uint64_t>(value), extend(value), extend(value),
+ extend(value)}) {}
+
+ constexpr BasicDecimal256(const BasicDecimal128& value) noexcept
+ : little_endian_array_({value.low_bits(), static_cast<uint64_t>(value.high_bits()),
+ extend(value.high_bits()), extend(value.high_bits())}) {}
+
+ /// \brief Create a BasicDecimal256 from an array of bytes. Bytes are assumed to be in
+ /// native-endian byte order.
+ explicit BasicDecimal256(const uint8_t* bytes);
+
+ /// \brief Negate the current value (in-place)
+ BasicDecimal256& Negate();
+
+ /// \brief Absolute value (in-place)
+ BasicDecimal256& Abs();
+
+ /// \brief Absolute value
+ static BasicDecimal256 Abs(const BasicDecimal256& left);
+
+ /// \brief Add a number to this one. The result is truncated to 256 bits.
+ BasicDecimal256& operator+=(const BasicDecimal256& right);
+
+ /// \brief Subtract a number from this one. The result is truncated to 256 bits.
+ BasicDecimal256& operator-=(const BasicDecimal256& right);
+
+ /// \brief Get the bits of the two's complement representation of the number. The 4
+ /// elements are in little endian order. The bits within each uint64_t element are in
+ /// native endian order. For example,
+ /// BasicDecimal256(123).little_endian_array() = {123, 0, 0, 0};
+ /// BasicDecimal256(-2).little_endian_array() = {0xFF...FE, 0xFF...FF, 0xFF...FF,
+ /// 0xFF...FF}.
+ inline const std::array<uint64_t, 4>& little_endian_array() const {
+ return little_endian_array_;
+ }
+
+ /// \brief Get the lowest bits of the two's complement representation of the number.
+ inline constexpr uint64_t low_bits() const { return little_endian_array_[0]; }
+
+ /// \brief Return the raw bytes of the value in native-endian byte order.
+ std::array<uint8_t, 32> ToBytes() const;
+ void ToBytes(uint8_t* out) const;
+
+ /// \brief Scale multiplier for given scale value.
+ static const BasicDecimal256& GetScaleMultiplier(int32_t scale);
+
+ /// \brief Convert BasicDecimal256 from one scale to another
+ DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal256* out) const;
+
+ /// \brief Scale up.
+ BasicDecimal256 IncreaseScaleBy(int32_t increase_by) const;
+
+ /// \brief Scale down.
+  /// - If 'round' is true, the right-most digits are dropped and the result value is
+  ///   rounded away from zero (+1 for positive, -1 for negative) if the dropped
+  ///   digits are >= 10^reduce_by / 2.
+ /// - If 'round' is false, the right-most digits are simply dropped.
+ BasicDecimal256 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+ /// \brief Whether this number fits in the given precision
+ ///
+  /// Return true if the number of significant digits is less than or equal to
+  /// `precision`.
+ bool FitsInPrecision(int32_t precision) const;
+
+ inline int64_t Sign() const {
+ return 1 | (static_cast<int64_t>(little_endian_array_[3]) >> 63);
+ }
+
+ inline int64_t IsNegative() const {
+ return static_cast<int64_t>(little_endian_array_[3]) < 0;
+ }
+
+ /// \brief Multiply this number by another number. The result is truncated to 256 bits.
+ BasicDecimal256& operator*=(const BasicDecimal256& right);
+
+  /// Divide this number by the given divisor and return the result.
+ ///
+ /// This operation is not destructive.
+  /// The quotient rounds toward zero. Signs work like:
+ /// 21 / 5 -> 4, 1
+ /// -21 / 5 -> -4, -1
+ /// 21 / -5 -> -4, 1
+ /// -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \param[out] result the quotient
+ /// \param[out] remainder the remainder after the division
+ DecimalStatus Divide(const BasicDecimal256& divisor, BasicDecimal256* result,
+ BasicDecimal256* remainder) const;
+
+ /// \brief Shift left by the given number of bits.
+ BasicDecimal256& operator<<=(uint32_t bits);
+
+ /// \brief In-place division.
+ BasicDecimal256& operator/=(const BasicDecimal256& right);
+
+ private:
+ std::array<uint64_t, 4> little_endian_array_;
+};
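+// A minimal sketch of the sign extension performed by the integral
+// constructor above:
+//
+//   arrow::BasicDecimal256 minus_two(-2);
+//   minus_two.little_endian_array();  // {0xFF...FE, 0xFF...FF, 0xFF...FF, 0xFF...FF}
+//   minus_two.IsNegative();           // nonzero (negative)
+//   minus_two.Sign();                 // -1; Sign() returns +1 for values >= 0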
+
+ARROW_EXPORT inline bool operator==(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return left.little_endian_array() == right.little_endian_array();
+}
+
+ARROW_EXPORT inline bool operator!=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return left.little_endian_array() != right.little_endian_array();
+}
+
+ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& right);
+
+ARROW_EXPORT inline bool operator<=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return !operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return !operator<(left, right);
+}
+
+ARROW_EXPORT BasicDecimal256 operator-(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator~(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator+(const BasicDecimal256& left,
+ const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
+ const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator/(const BasicDecimal256& left,
+ const BasicDecimal256& right);
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
index c67cedc4a06..c7c97676f7c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
@@ -27,7 +27,7 @@
namespace arrow {
namespace internal {
-BitBlockCount BitBlockCounter::GetBlockSlow(int64_t block_size) noexcept {
+BitBlockCount BitBlockCounter::GetBlockSlow(int64_t block_size) noexcept {
const int16_t run_length = static_cast<int16_t>(std::min(bits_remaining_, block_size));
int16_t popcount = static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length));
bits_remaining_ -= run_length;
@@ -37,11 +37,11 @@ BitBlockCount BitBlockCounter::GetBlockSlow(int64_t block_size) noexcept {
return {run_length, popcount};
}
-// Prevent pointer arithmetic on nullptr, which is undefined behavior even if the pointer
-// is never dereferenced.
-inline const uint8_t* EnsureNotNull(const uint8_t* ptr) {
- static const uint8_t byte{};
- return ptr == nullptr ? &byte : ptr;
+// Prevent pointer arithmetic on nullptr, which is undefined behavior even if the pointer
+// is never dereferenced.
+inline const uint8_t* EnsureNotNull(const uint8_t* ptr) {
+ static const uint8_t byte{};
+ return ptr == nullptr ? &byte : ptr;
}
OptionalBitBlockCounter::OptionalBitBlockCounter(const uint8_t* validity_bitmap,
@@ -49,7 +49,7 @@ OptionalBitBlockCounter::OptionalBitBlockCounter(const uint8_t* validity_bitmap,
: has_bitmap_(validity_bitmap != nullptr),
position_(0),
length_(length),
- counter_(EnsureNotNull(validity_bitmap), offset, length) {}
+ counter_(EnsureNotNull(validity_bitmap), offset, length) {}
OptionalBitBlockCounter::OptionalBitBlockCounter(
const std::shared_ptr<Buffer>& validity_bitmap, int64_t offset, int64_t length)
@@ -64,10 +64,10 @@ OptionalBinaryBitBlockCounter::OptionalBinaryBitBlockCounter(const uint8_t* left
: has_bitmap_(HasBitmapFromBitmaps(left_bitmap != nullptr, right_bitmap != nullptr)),
position_(0),
length_(length),
- unary_counter_(EnsureNotNull(left_bitmap != nullptr ? left_bitmap : right_bitmap),
+ unary_counter_(EnsureNotNull(left_bitmap != nullptr ? left_bitmap : right_bitmap),
left_bitmap != nullptr ? left_offset : right_offset, length),
- binary_counter_(EnsureNotNull(left_bitmap), left_offset,
- EnsureNotNull(right_bitmap), right_offset, length) {}
+ binary_counter_(EnsureNotNull(left_bitmap), left_offset,
+ EnsureNotNull(right_bitmap), right_offset, length) {}
OptionalBinaryBitBlockCounter::OptionalBinaryBitBlockCounter(
const std::shared_ptr<Buffer>& left_bitmap, int64_t left_offset,
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
index 63036af52a4..5a14031cf0e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
@@ -25,26 +25,26 @@
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
-#include "arrow/util/ubsan.h"
+#include "arrow/util/ubsan.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
namespace detail {
-inline uint64_t LoadWord(const uint8_t* bytes) {
- return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
-}
-
-inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
- if (shift == 0) {
- return current;
- }
- return (current >> shift) | (next << (64 - shift));
-}
-
+inline uint64_t LoadWord(const uint8_t* bytes) {
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
+}
+
+inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
+ if (shift == 0) {
+ return current;
+ }
+ return (current >> shift) | (next << (64 - shift));
+}
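+// Worked example: ShiftWord(current, next, 8) keeps bits 8..63 of `current`
+// and fills the top 8 bits from `next`, forming the unaligned 64-bit window
+// that starts 8 bits in; the shift == 0 early return avoids the undefined
+// behavior of `next << 64`.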
+
// These templates are here to help with unit tests
template <typename T>
@@ -58,16 +58,16 @@ struct BitBlockAnd<bool> {
};
template <typename T>
-struct BitBlockAndNot {
- static T Call(T left, T right) { return left & ~right; }
-};
-
-template <>
-struct BitBlockAndNot<bool> {
- static bool Call(bool left, bool right) { return left && !right; }
-};
-
-template <typename T>
+struct BitBlockAndNot {
+ static T Call(T left, T right) { return left & ~right; }
+};
+
+template <>
+struct BitBlockAndNot<bool> {
+ static bool Call(bool left, bool right) { return left && !right; }
+};
+
+template <typename T>
struct BitBlockOr {
static T Call(T left, T right) { return left | right; }
};
@@ -120,82 +120,82 @@ class ARROW_EXPORT BitBlockCounter {
/// block will have a length less than 256 if the bitmap length is not a
/// multiple of 256, and will return 0-length blocks in subsequent
/// invocations.
- BitBlockCount NextFourWords() {
- using detail::LoadWord;
- using detail::ShiftWord;
-
- if (!bits_remaining_) {
- return {0, 0};
- }
- int64_t total_popcount = 0;
- if (offset_ == 0) {
- if (bits_remaining_ < kFourWordsBits) {
- return GetBlockSlow(kFourWordsBits);
- }
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_));
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 8));
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 16));
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 24));
- } else {
- // When the offset is > 0, we need there to be a word beyond the last
- // aligned word in the bitmap for the bit shifting logic.
- if (bits_remaining_ < 5 * kFourWordsBits - offset_) {
- return GetBlockSlow(kFourWordsBits);
- }
- auto current = LoadWord(bitmap_);
- auto next = LoadWord(bitmap_ + 8);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- current = next;
- next = LoadWord(bitmap_ + 16);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- current = next;
- next = LoadWord(bitmap_ + 24);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- current = next;
- next = LoadWord(bitmap_ + 32);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- }
- bitmap_ += BitUtil::BytesForBits(kFourWordsBits);
- bits_remaining_ -= kFourWordsBits;
- return {256, static_cast<int16_t>(total_popcount)};
- }
-
+ BitBlockCount NextFourWords() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ int64_t total_popcount = 0;
+ if (offset_ == 0) {
+ if (bits_remaining_ < kFourWordsBits) {
+ return GetBlockSlow(kFourWordsBits);
+ }
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 8));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 16));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 24));
+ } else {
+ // When the offset is > 0, we need there to be a word beyond the last
+ // aligned word in the bitmap for the bit shifting logic.
+ if (bits_remaining_ < 5 * kFourWordsBits - offset_) {
+ return GetBlockSlow(kFourWordsBits);
+ }
+ auto current = LoadWord(bitmap_);
+ auto next = LoadWord(bitmap_ + 8);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 16);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 24);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 32);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ }
+ bitmap_ += BitUtil::BytesForBits(kFourWordsBits);
+ bits_remaining_ -= kFourWordsBits;
+ return {256, static_cast<int16_t>(total_popcount)};
+ }
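+  // With a non-zero offset_, each of the four logical words above is
+  // assembled from two adjacent 64-bit loads via detail::ShiftWord, and the
+  // fifth load at bitmap_ + 32 supplies the high bits of the last word; the
+  // guard falls back to GetBlockSlow whenever too few bits remain for those
+  // five loads.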
+
/// \brief Return the next run of available bits, usually 64. The returned
/// pair contains the size of run and the number of true values. The last
/// block will have a length less than 64 if the bitmap length is not a
/// multiple of 64, and will return 0-length blocks in subsequent
/// invocations.
- BitBlockCount NextWord() {
- using detail::LoadWord;
- using detail::ShiftWord;
-
- if (!bits_remaining_) {
- return {0, 0};
- }
- int64_t popcount = 0;
- if (offset_ == 0) {
- if (bits_remaining_ < kWordBits) {
- return GetBlockSlow(kWordBits);
- }
- popcount = BitUtil::PopCount(LoadWord(bitmap_));
- } else {
- // When the offset is > 0, we need there to be a word beyond the last
- // aligned word in the bitmap for the bit shifting logic.
- if (bits_remaining_ < 2 * kWordBits - offset_) {
- return GetBlockSlow(kWordBits);
- }
- popcount =
- BitUtil::PopCount(ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
- }
- bitmap_ += kWordBits / 8;
- bits_remaining_ -= kWordBits;
- return {64, static_cast<int16_t>(popcount)};
- }
-
+ BitBlockCount NextWord() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ int64_t popcount = 0;
+ if (offset_ == 0) {
+ if (bits_remaining_ < kWordBits) {
+ return GetBlockSlow(kWordBits);
+ }
+ popcount = BitUtil::PopCount(LoadWord(bitmap_));
+ } else {
+ // When the offset is > 0, we need there to be a word beyond the last
+ // aligned word in the bitmap for the bit shifting logic.
+ if (bits_remaining_ < 2 * kWordBits - offset_) {
+ return GetBlockSlow(kWordBits);
+ }
+ popcount =
+ BitUtil::PopCount(ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
+ }
+ bitmap_ += kWordBits / 8;
+ bits_remaining_ -= kWordBits;
+ return {64, static_cast<int16_t>(popcount)};
+ }
+
private:
/// \brief Return block with the requested size when doing word-wise
/// computation is not possible due to inadequate bits remaining.
- BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
+ BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
const uint8_t* bitmap_;
int64_t bits_remaining_;
@@ -274,67 +274,67 @@ class ARROW_EXPORT BinaryBitBlockCounter {
/// the number of true values. The last block will have a length less than 64
/// if the bitmap length is not a multiple of 64, and will return 0-length
/// blocks in subsequent invocations.
- BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
-
- /// \brief Computes "x & ~y" block for each available run of bits.
- BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }
+ BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
+ /// \brief Computes "x & ~y" block for each available run of bits.
+ BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }
+
/// \brief Computes "x | y" block for each available run of bits.
- BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
+ BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
/// \brief Computes "x | ~y" block for each available run of bits.
- BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
+ BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
private:
template <template <typename T> class Op>
- BitBlockCount NextWord() {
- using detail::LoadWord;
- using detail::ShiftWord;
-
- if (!bits_remaining_) {
- return {0, 0};
- }
- // When the offset is > 0, we need there to be a word beyond the last aligned
- // word in the bitmap for the bit shifting logic.
- constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
- const int64_t bits_required_to_use_words =
- std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
- right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
- if (bits_remaining_ < bits_required_to_use_words) {
- const int16_t run_length =
- static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
- int16_t popcount = 0;
- for (int64_t i = 0; i < run_length; ++i) {
- if (Op<bool>::Call(BitUtil::GetBit(left_bitmap_, left_offset_ + i),
- BitUtil::GetBit(right_bitmap_, right_offset_ + i))) {
- ++popcount;
- }
- }
- // This code path should trigger _at most_ 2 times. In the "two times"
- // case, the first time the run length will be a multiple of 8.
- left_bitmap_ += run_length / 8;
- right_bitmap_ += run_length / 8;
- bits_remaining_ -= run_length;
- return {run_length, popcount};
- }
-
- int64_t popcount = 0;
- if (left_offset_ == 0 && right_offset_ == 0) {
- popcount = BitUtil::PopCount(
- Op<uint64_t>::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
- } else {
- auto left_word =
- ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
- auto right_word =
- ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
- popcount = BitUtil::PopCount(Op<uint64_t>::Call(left_word, right_word));
- }
- left_bitmap_ += kWordBits / 8;
- right_bitmap_ += kWordBits / 8;
- bits_remaining_ -= kWordBits;
- return {64, static_cast<int16_t>(popcount)};
- }
-
+ BitBlockCount NextWord() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ // When the offset is > 0, we need there to be a word beyond the last aligned
+ // word in the bitmap for the bit shifting logic.
+ constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
+ const int64_t bits_required_to_use_words =
+ std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
+ right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
+ if (bits_remaining_ < bits_required_to_use_words) {
+ const int16_t run_length =
+ static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
+ int16_t popcount = 0;
+ for (int64_t i = 0; i < run_length; ++i) {
+ if (Op<bool>::Call(BitUtil::GetBit(left_bitmap_, left_offset_ + i),
+ BitUtil::GetBit(right_bitmap_, right_offset_ + i))) {
+ ++popcount;
+ }
+ }
+ // This code path should trigger _at most_ 2 times. In the "two times"
+ // case, the first time the run length will be a multiple of 8.
+ left_bitmap_ += run_length / 8;
+ right_bitmap_ += run_length / 8;
+ bits_remaining_ -= run_length;
+ return {run_length, popcount};
+ }
+
+ int64_t popcount = 0;
+ if (left_offset_ == 0 && right_offset_ == 0) {
+ popcount = BitUtil::PopCount(
+ Op<uint64_t>::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
+ } else {
+ auto left_word =
+ ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
+ auto right_word =
+ ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
+ popcount = BitUtil::PopCount(Op<uint64_t>::Call(left_word, right_word));
+ }
+ left_bitmap_ += kWordBits / 8;
+ right_bitmap_ += kWordBits / 8;
+ bits_remaining_ -= kWordBits;
+ return {64, static_cast<int16_t>(popcount)};
+ }
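+  // The bit-by-bit loop above runs only near the end of the bitmaps: e.g.
+  // with left_offset_ == 0 and right_offset_ == 5, at least
+  // max(64, 64 + 59) = 123 bits must remain before the word path is taken.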
+
const uint8_t* left_bitmap_;
int64_t left_offset_;
const uint8_t* right_bitmap_;
@@ -379,30 +379,30 @@ class ARROW_EXPORT OptionalBinaryBitBlockCounter {
}
}
- BitBlockCount NextOrNotBlock() {
- static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
- switch (has_bitmap_) {
- case HasBitmap::BOTH: {
- BitBlockCount block = binary_counter_.NextOrNotWord();
- position_ += block.length;
- return block;
- }
- case HasBitmap::ONE: {
- BitBlockCount block = unary_counter_.NextWord();
- position_ += block.length;
- return block;
- }
- case HasBitmap::NONE:
- default: {
- const int16_t block_size =
- static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
- position_ += block_size;
- // All values are non-null
- return {block_size, block_size};
- }
- }
- }
-
+ BitBlockCount NextOrNotBlock() {
+ static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+ switch (has_bitmap_) {
+ case HasBitmap::BOTH: {
+ BitBlockCount block = binary_counter_.NextOrNotWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::ONE: {
+ BitBlockCount block = unary_counter_.NextWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::NONE:
+ default: {
+ const int16_t block_size =
+ static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+ position_ += block_size;
+ // All values are non-null
+ return {block_size, block_size};
+ }
+ }
+ }
+
private:
enum class HasBitmap : int { BOTH, ONE, NONE };
@@ -427,9 +427,9 @@ class ARROW_EXPORT OptionalBinaryBitBlockCounter {
// Functional-style bit block visitors.
template <typename VisitNotNull, typename VisitNull>
-static Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
- int64_t length, VisitNotNull&& visit_not_null,
- VisitNull&& visit_null) {
+static Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
+ int64_t length, VisitNotNull&& visit_not_null,
+ VisitNull&& visit_null) {
const uint8_t* bitmap = NULLPTR;
if (bitmap_buf != NULLPTR) {
bitmap = bitmap_buf->data();
@@ -460,9 +460,9 @@ static Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t
}
template <typename VisitNotNull, typename VisitNull>
-static void VisitBitBlocksVoid(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
- int64_t length, VisitNotNull&& visit_not_null,
- VisitNull&& visit_null) {
+static void VisitBitBlocksVoid(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
+ int64_t length, VisitNotNull&& visit_not_null,
+ VisitNull&& visit_null) {
const uint8_t* bitmap = NULLPTR;
if (bitmap_buf != NULLPTR) {
bitmap = bitmap_buf->data();
@@ -492,11 +492,11 @@ static void VisitBitBlocksVoid(const std::shared_ptr<Buffer>& bitmap_buf, int64_
}
template <typename VisitNotNull, typename VisitNull>
-static void VisitTwoBitBlocksVoid(const std::shared_ptr<Buffer>& left_bitmap_buf,
- int64_t left_offset,
- const std::shared_ptr<Buffer>& right_bitmap_buf,
- int64_t right_offset, int64_t length,
- VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
+static void VisitTwoBitBlocksVoid(const std::shared_ptr<Buffer>& left_bitmap_buf,
+ int64_t left_offset,
+ const std::shared_ptr<Buffer>& right_bitmap_buf,
+ int64_t right_offset, int64_t length,
+ VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
if (left_bitmap_buf == NULLPTR || right_bitmap_buf == NULLPTR) {
// At most one bitmap is present
if (left_bitmap_buf == NULLPTR) {
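For orientation before the next file: the fast path in NextWord() above replaces 64 per-bit tests with a single word-sized combine and popcount. A minimal self-contained sketch of that idea, assuming byte-aligned bitmaps and the GCC/Clang popcount intrinsic (AndedPopcount is a hypothetical name, not Arrow's API):

#include <cstdint>
#include <cstring>

// Counts the bits set in (left AND right) across n_words 64-bit words:
// one load per bitmap, one AND, one popcount per word.
inline int64_t AndedPopcount(const uint8_t* left, const uint8_t* right,
                             int64_t n_words) {
  int64_t total = 0;
  for (int64_t i = 0; i < n_words; ++i) {
    uint64_t l, r;
    std::memcpy(&l, left + i * 8, sizeof(l));  // memcpy sidesteps unaligned loads
    std::memcpy(&r, right + i * 8, sizeof(r));
    total += __builtin_popcountll(l & r);      // GCC/Clang intrinsic (assumed)
  }
  return total;
}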
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
index eda6088eb32..1114ec61f19 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
@@ -45,7 +45,7 @@ BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t
// Prepare for inversion in NextRun.
// Clear out any preceding bits.
- word_ = word_ & ~BitUtil::LeastSignificantBitMask(position_);
+ word_ = word_ & ~BitUtil::LeastSignificantBitMask(position_);
}
#endif
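As a worked instance of the masking above: LeastSignificantBitMask(i) evaluates to (1 << i) - 1, so with position_ = 3 the mask is 0b111 and `word_ & ~0b111` zeroes bits 0 through 2 while leaving the bits at and above position_ untouched.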
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
index 3e196628477..10155687a20 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
@@ -17,14 +17,14 @@
#pragma once
-#include <cassert>
+#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -42,14 +42,14 @@ struct BitRun {
}
};
-inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
return lhs.length == rhs.length && lhs.set == rhs.set;
}
-inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
- return lhs.length != rhs.length || lhs.set != rhs.set;
-}
-
+inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
+ return lhs.length != rhs.length || lhs.set != rhs.set;
+}
+
class BitRunReaderLinear {
public:
BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
@@ -70,7 +70,7 @@ class BitRunReaderLinear {
};
#if ARROW_LITTLE_ENDIAN
-/// A convenience class for counting the number of contiguous set/unset bits
+/// A convenience class for counting the number of contiguous set/unset bits
/// in a bitmap.
class ARROW_EXPORT BitRunReader {
public:
@@ -102,7 +102,7 @@ class ARROW_EXPORT BitRunReader {
int64_t start_bit_offset = start_position & 63;
// Invert the word for proper use of CountTrailingZeros and
    // clear bits so CountTrailingZeros can do its magic.
- word_ = ~word_ & ~BitUtil::LeastSignificantBitMask(start_bit_offset);
+ word_ = ~word_ & ~BitUtil::LeastSignificantBitMask(start_bit_offset);
// Go forward until the next change from unset to set.
int64_t new_bits = BitUtil::CountTrailingZeros(word_) - start_bit_offset;
@@ -151,7 +151,7 @@ class ARROW_EXPORT BitRunReader {
}
// Two cases:
- // 1. For unset, CountTrailingZeros works naturally so we don't
+ // 1. For unset, CountTrailingZeros works naturally so we don't
// invert the word.
// 2. Otherwise invert so we can use CountTrailingZeros.
if (current_run_bit_set_) {
@@ -168,348 +168,348 @@ class ARROW_EXPORT BitRunReader {
using BitRunReader = BitRunReaderLinear;
#endif
-struct SetBitRun {
- int64_t position;
- int64_t length;
-
- bool AtEnd() const { return length == 0; }
-
- std::string ToString() const {
- return std::string("{pos=") + std::to_string(position) +
- ", len=" + std::to_string(length) + "}";
- }
-
- bool operator==(const SetBitRun& other) const {
- return position == other.position && length == other.length;
- }
- bool operator!=(const SetBitRun& other) const {
- return position != other.position || length != other.length;
- }
-};
-
-template <bool Reverse>
-class BaseSetBitRunReader {
- public:
- /// \brief Constructs a new SetBitRunReader.
- ///
- /// \param[in] bitmap source data
- /// \param[in] start_offset bit offset into the source data
- /// \param[in] length number of bits to read
- ARROW_NOINLINE
- BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
- : bitmap_(bitmap),
- length_(length),
- remaining_(length_),
- current_word_(0),
- current_num_bits_(0) {
- if (Reverse) {
- bitmap_ += (start_offset + length) / 8;
- const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
- if (length > 0 && end_bit_offset) {
- // Get LSBs from last byte
- ++bitmap_;
- current_num_bits_ =
- std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
- current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
- }
- } else {
- bitmap_ += start_offset / 8;
- const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
- if (length > 0 && bit_offset) {
- // Get MSBs from first byte
- current_num_bits_ =
- std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
- current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
- }
- }
- }
-
- ARROW_NOINLINE
- SetBitRun NextRun() {
- int64_t pos = 0;
- int64_t len = 0;
- if (current_num_bits_) {
- const auto run = FindCurrentRun();
- assert(remaining_ >= 0);
- if (run.length && current_num_bits_) {
- // The run ends in current_word_
- return AdjustRun(run);
- }
- pos = run.position;
- len = run.length;
- }
- if (!len) {
- // We didn't get any ones in current_word_, so we can skip any zeros
- // in the following words
- SkipNextZeros();
- if (remaining_ == 0) {
- return {0, 0};
- }
- assert(current_num_bits_);
- pos = position();
- } else if (!current_num_bits_) {
- if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
- current_word_ = LoadFullWord();
- current_num_bits_ = 64;
- } else if (remaining_ > 0) {
- current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
- current_num_bits_ = static_cast<int32_t>(remaining_);
- } else {
- // No bits remaining, perhaps we found a run?
- return AdjustRun({pos, len});
- }
- // If current word starts with a zero, we got a full run
- if (!(current_word_ & kFirstBit)) {
- return AdjustRun({pos, len});
- }
- }
- // Current word should now start with a set bit
- len += CountNextOnes();
- return AdjustRun({pos, len});
- }
-
- protected:
- int64_t position() const {
- if (Reverse) {
- return remaining_;
- } else {
- return length_ - remaining_;
- }
- }
-
- SetBitRun AdjustRun(SetBitRun run) {
- if (Reverse) {
- assert(run.position >= run.length);
- run.position -= run.length;
- }
- return run;
- }
-
- uint64_t LoadFullWord() {
- uint64_t word;
- if (Reverse) {
- bitmap_ -= 8;
- }
- memcpy(&word, bitmap_, 8);
- if (!Reverse) {
- bitmap_ += 8;
- }
- return BitUtil::ToLittleEndian(word);
- }
-
- uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
- assert(num_bits > 0);
- uint64_t word = 0;
- const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
- if (Reverse) {
- // Read in the most significant bytes of the word
- bitmap_ -= num_bytes;
- memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
- // XXX MostSignificantBitmask
- return (BitUtil::ToLittleEndian(word) << bit_offset) &
- ~BitUtil::LeastSignificantBitMask(64 - num_bits);
- } else {
- memcpy(&word, bitmap_, num_bytes);
- bitmap_ += num_bytes;
- return (BitUtil::ToLittleEndian(word) >> bit_offset) &
- BitUtil::LeastSignificantBitMask(num_bits);
- }
- }
-
- void SkipNextZeros() {
- assert(current_num_bits_ == 0);
- while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
- current_word_ = LoadFullWord();
- const auto num_zeros = CountFirstZeros(current_word_);
- if (num_zeros < 64) {
- // Run of zeros ends here
- current_word_ = ConsumeBits(current_word_, num_zeros);
- current_num_bits_ = 64 - num_zeros;
- remaining_ -= num_zeros;
- assert(remaining_ >= 0);
- assert(current_num_bits_ >= 0);
- return;
- }
- remaining_ -= 64;
- }
- // Run of zeros continues in last bitmap word
- if (remaining_ > 0) {
- current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
- current_num_bits_ = static_cast<int32_t>(remaining_);
- const auto num_zeros =
- std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
- current_word_ = ConsumeBits(current_word_, num_zeros);
- current_num_bits_ -= num_zeros;
- remaining_ -= num_zeros;
- assert(remaining_ >= 0);
- assert(current_num_bits_ >= 0);
- }
- }
-
- int64_t CountNextOnes() {
- assert(current_word_ & kFirstBit);
-
- int64_t len;
- if (~current_word_) {
- const auto num_ones = CountFirstZeros(~current_word_);
- assert(num_ones <= current_num_bits_);
- assert(num_ones <= remaining_);
- remaining_ -= num_ones;
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ -= num_ones;
- if (current_num_bits_) {
- // Run of ones ends here
- return num_ones;
- }
- len = num_ones;
- } else {
- // current_word_ is all ones
- remaining_ -= 64;
- current_num_bits_ = 0;
- len = 64;
- }
-
- while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
- current_word_ = LoadFullWord();
- const auto num_ones = CountFirstZeros(~current_word_);
- len += num_ones;
- remaining_ -= num_ones;
- if (num_ones < 64) {
- // Run of ones ends here
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ = 64 - num_ones;
- return len;
- }
- }
- // Run of ones continues in last bitmap word
- if (remaining_ > 0) {
- current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
- current_num_bits_ = static_cast<int32_t>(remaining_);
- const auto num_ones = CountFirstZeros(~current_word_);
- assert(num_ones <= current_num_bits_);
- assert(num_ones <= remaining_);
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ -= num_ones;
- remaining_ -= num_ones;
- len += num_ones;
- }
- return len;
- }
-
- SetBitRun FindCurrentRun() {
- // Skip any pending zeros
- const auto num_zeros = CountFirstZeros(current_word_);
- if (num_zeros >= current_num_bits_) {
- remaining_ -= current_num_bits_;
- current_word_ = 0;
- current_num_bits_ = 0;
- return {0, 0};
- }
- assert(num_zeros <= remaining_);
- current_word_ = ConsumeBits(current_word_, num_zeros);
- current_num_bits_ -= num_zeros;
- remaining_ -= num_zeros;
- const int64_t pos = position();
- // Count any ones
- const auto num_ones = CountFirstZeros(~current_word_);
- assert(num_ones <= current_num_bits_);
- assert(num_ones <= remaining_);
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ -= num_ones;
- remaining_ -= num_ones;
- return {pos, num_ones};
- }
-
- inline int CountFirstZeros(uint64_t word);
- inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);
-
- const uint8_t* bitmap_;
- const int64_t length_;
- int64_t remaining_;
- uint64_t current_word_;
- int32_t current_num_bits_;
-
- static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
-};
-
-template <>
-inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
- return BitUtil::CountTrailingZeros(word);
-}
-
-template <>
-inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
- return BitUtil::CountLeadingZeros(word);
-}
-
-template <>
-inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
- return word >> num_bits;
-}
-
-template <>
-inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
- return word << num_bits;
-}
-
-using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
-using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
-
-// Functional-style bit run visitors.
-
-// XXX: Try to make this function small so the compiler can inline and optimize
-// the `visit` function, which is normally a hot loop with vectorizable code.
-// - don't inline SetBitRunReader constructor, it doesn't hurt performance
-// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases
-template <typename Visit>
-inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
- Visit&& visit) {
- if (bitmap == NULLPTR) {
- // Assume all bits set (a missing validity bitmap means all-valid)
- return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
- }
- SetBitRunReader reader(bitmap, offset, length);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- ARROW_RETURN_NOT_OK(visit(run.position, run.length));
- }
- return Status::OK();
-}
-
-template <typename Visit>
-inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
- Visit&& visit) {
- if (bitmap == NULLPTR) {
- // Assume all bits set (a missing validity bitmap means all-valid)
- visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
- return;
- }
- SetBitRunReader reader(bitmap, offset, length);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- visit(run.position, run.length);
- }
-}
-
-template <typename Visit>
-inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
- int64_t length, Visit&& visit) {
- return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
- std::forward<Visit>(visit));
-}
-
-template <typename Visit>
-inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
- int64_t length, Visit&& visit) {
- VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
- std::forward<Visit>(visit));
-}
-
+struct SetBitRun {
+ int64_t position;
+ int64_t length;
+
+ bool AtEnd() const { return length == 0; }
+
+ std::string ToString() const {
+ return std::string("{pos=") + std::to_string(position) +
+ ", len=" + std::to_string(length) + "}";
+ }
+
+ bool operator==(const SetBitRun& other) const {
+ return position == other.position && length == other.length;
+ }
+ bool operator!=(const SetBitRun& other) const {
+ return position != other.position || length != other.length;
+ }
+};
+
+template <bool Reverse>
+class BaseSetBitRunReader {
+ public:
+ /// \brief Constructs a new SetBitRunReader.
+ ///
+ /// \param[in] bitmap source data
+ /// \param[in] start_offset bit offset into the source data
+ /// \param[in] length number of bits to read
+ ARROW_NOINLINE
+ BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap),
+ length_(length),
+ remaining_(length_),
+ current_word_(0),
+ current_num_bits_(0) {
+ if (Reverse) {
+ bitmap_ += (start_offset + length) / 8;
+ const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
+ if (length > 0 && end_bit_offset) {
+ // Get LSBs from last byte
+ ++bitmap_;
+ current_num_bits_ =
+ std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
+ current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
+ }
+ } else {
+ bitmap_ += start_offset / 8;
+ const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
+ if (length > 0 && bit_offset) {
+ // Get MSBs from first byte
+ current_num_bits_ =
+ std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
+ current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
+ }
+ }
+ }
+
+ ARROW_NOINLINE
+ SetBitRun NextRun() {
+ int64_t pos = 0;
+ int64_t len = 0;
+ if (current_num_bits_) {
+ const auto run = FindCurrentRun();
+ assert(remaining_ >= 0);
+ if (run.length && current_num_bits_) {
+ // The run ends in current_word_
+ return AdjustRun(run);
+ }
+ pos = run.position;
+ len = run.length;
+ }
+ if (!len) {
+ // We didn't get any ones in current_word_, so we can skip any zeros
+ // in the following words
+ SkipNextZeros();
+ if (remaining_ == 0) {
+ return {0, 0};
+ }
+ assert(current_num_bits_);
+ pos = position();
+ } else if (!current_num_bits_) {
+ if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ current_num_bits_ = 64;
+ } else if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ } else {
+ // No bits remaining, perhaps we found a run?
+ return AdjustRun({pos, len});
+ }
+ // If current word starts with a zero, we got a full run
+ if (!(current_word_ & kFirstBit)) {
+ return AdjustRun({pos, len});
+ }
+ }
+ // Current word should now start with a set bit
+ len += CountNextOnes();
+ return AdjustRun({pos, len});
+ }
+
+ protected:
+ int64_t position() const {
+ if (Reverse) {
+ return remaining_;
+ } else {
+ return length_ - remaining_;
+ }
+ }
+
+ SetBitRun AdjustRun(SetBitRun run) {
+ if (Reverse) {
+ assert(run.position >= run.length);
+ run.position -= run.length;
+ }
+ return run;
+ }
+
+ uint64_t LoadFullWord() {
+ uint64_t word;
+ if (Reverse) {
+ bitmap_ -= 8;
+ }
+ memcpy(&word, bitmap_, 8);
+ if (!Reverse) {
+ bitmap_ += 8;
+ }
+ return BitUtil::ToLittleEndian(word);
+ }
+
+ uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+ assert(num_bits > 0);
+ uint64_t word = 0;
+ const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
+ if (Reverse) {
+ // Read in the most significant bytes of the word
+ bitmap_ -= num_bytes;
+ memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
+ // XXX MostSignificantBitmask
+ return (BitUtil::ToLittleEndian(word) << bit_offset) &
+ ~BitUtil::LeastSignificantBitMask(64 - num_bits);
+ } else {
+ memcpy(&word, bitmap_, num_bytes);
+ bitmap_ += num_bytes;
+ return (BitUtil::ToLittleEndian(word) >> bit_offset) &
+ BitUtil::LeastSignificantBitMask(num_bits);
+ }
+ }
+
+ void SkipNextZeros() {
+ assert(current_num_bits_ == 0);
+ while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ const auto num_zeros = CountFirstZeros(current_word_);
+ if (num_zeros < 64) {
+ // Run of zeros ends here
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ = 64 - num_zeros;
+ remaining_ -= num_zeros;
+ assert(remaining_ >= 0);
+ assert(current_num_bits_ >= 0);
+ return;
+ }
+ remaining_ -= 64;
+ }
+ // Run of zeros continues in last bitmap word
+ if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ const auto num_zeros =
+ std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ -= num_zeros;
+ remaining_ -= num_zeros;
+ assert(remaining_ >= 0);
+ assert(current_num_bits_ >= 0);
+ }
+ }
+
+ int64_t CountNextOnes() {
+ assert(current_word_ & kFirstBit);
+
+ int64_t len;
+ if (~current_word_) {
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ remaining_ -= num_ones;
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ if (current_num_bits_) {
+ // Run of ones ends here
+ return num_ones;
+ }
+ len = num_ones;
+ } else {
+ // current_word_ is all ones
+ remaining_ -= 64;
+ current_num_bits_ = 0;
+ len = 64;
+ }
+
+ while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ const auto num_ones = CountFirstZeros(~current_word_);
+ len += num_ones;
+ remaining_ -= num_ones;
+ if (num_ones < 64) {
+ // Run of ones ends here
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ = 64 - num_ones;
+ return len;
+ }
+ }
+ // Run of ones continues in last bitmap word
+ if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ remaining_ -= num_ones;
+ len += num_ones;
+ }
+ return len;
+ }
+
+ SetBitRun FindCurrentRun() {
+ // Skip any pending zeros
+ const auto num_zeros = CountFirstZeros(current_word_);
+ if (num_zeros >= current_num_bits_) {
+ remaining_ -= current_num_bits_;
+ current_word_ = 0;
+ current_num_bits_ = 0;
+ return {0, 0};
+ }
+ assert(num_zeros <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ -= num_zeros;
+ remaining_ -= num_zeros;
+ const int64_t pos = position();
+ // Count any ones
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ remaining_ -= num_ones;
+ return {pos, num_ones};
+ }
+
+ inline int CountFirstZeros(uint64_t word);
+ inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);
+
+ const uint8_t* bitmap_;
+ const int64_t length_;
+ int64_t remaining_;
+ uint64_t current_word_;
+ int32_t current_num_bits_;
+
+ static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
+};
+
+template <>
+inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
+ return BitUtil::CountTrailingZeros(word);
+}
+
+template <>
+inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
+ return BitUtil::CountLeadingZeros(word);
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
+ return word >> num_bits;
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
+ return word << num_bits;
+}
+
+using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
+using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
+
+// Functional-style bit run visitors.
+
+// XXX: Try to make this function small so the compiler can inline and optimize
+// the `visit` function, which is normally a hot loop with vectorizable code.
+// - don't inline SetBitRunReader constructor, it doesn't hurt performance
+// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases
+template <typename Visit>
+inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
+ Visit&& visit) {
+ if (bitmap == NULLPTR) {
+ // Assume all bits set (a missing validity bitmap means all-valid)
+ return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+ }
+ SetBitRunReader reader(bitmap, offset, length);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ ARROW_RETURN_NOT_OK(visit(run.position, run.length));
+ }
+ return Status::OK();
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
+ Visit&& visit) {
+ if (bitmap == NULLPTR) {
+ // Assume all bits set (a missing validity bitmap means all-valid)
+ visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+ return;
+ }
+ SetBitRunReader reader(bitmap, offset, length);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ visit(run.position, run.length);
+ }
+}
+
+template <typename Visit>
+inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+ int64_t length, Visit&& visit) {
+ return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
+ std::forward<Visit>(visit));
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+ int64_t length, Visit&& visit) {
+ VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
+ std::forward<Visit>(visit));
+}
+
} // namespace internal
} // namespace arrow
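To make the restored reader API concrete, a minimal usage sketch (PrintSetRuns is a hypothetical helper; it assumes the bitmap buffer is valid for offset + length bits):

#include <cstdint>
#include <cstdio>
#include "arrow/util/bit_run_reader.h"

// Prints every run of set bits in bitmap[offset, offset + length).
void PrintSetRuns(const uint8_t* bitmap, int64_t offset, int64_t length) {
  arrow::internal::SetBitRunReader reader(bitmap, offset, length);
  while (true) {
    const arrow::internal::SetBitRun run = reader.NextRun();
    if (run.AtEnd()) break;                       // a zero-length run means done
    std::printf("%s\n", run.ToString().c_str());  // e.g. "{pos=3, len=5}"
  }
}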
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
index b9e695dfcb0..cdd3683557c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
@@ -1,433 +1,433 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala (incubating) as of 2016-01-29
-
-#pragma once
-
-#include <string.h>
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bpacking.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace BitUtil {
-
-/// Utility class to write bit/byte streams. This class can write data to either be
-/// bit packed or byte aligned (and a single stream that has a mix of both).
-/// This class does not allocate memory.
-class BitWriter {
- public:
- /// buffer: buffer to write bits to. Buffer should be preallocated with
- /// 'buffer_len' bytes.
- BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
- Clear();
- }
-
- void Clear() {
- buffered_values_ = 0;
- byte_offset_ = 0;
- bit_offset_ = 0;
- }
-
- /// The number of bytes written so far, including the current byte (which may be
- /// only partially written). Includes buffered values.
- int bytes_written() const {
- return byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_));
- }
- uint8_t* buffer() const { return buffer_; }
- int buffer_len() const { return max_bytes_; }
-
- /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
- /// packed. Returns false if there was not enough space. num_bits must be <= 32.
- bool PutValue(uint64_t v, int num_bits);
-
- /// Writes v to the next aligned byte using num_bytes. If T is larger than
- /// num_bytes, the extra high-order bytes will be ignored. Returns false if
- /// there was not enough space.
- /// Assumes v is stored in buffer_ in little-endian format.
- template <typename T>
- bool PutAligned(T v, int num_bytes);
-
- /// Write a Vlq encoded int to the buffer. Returns false if there was not enough
- /// room. The value is written byte aligned.
- /// For more details on vlq:
- /// en.wikipedia.org/wiki/Variable-length_quantity
- bool PutVlqInt(uint32_t v);
-
- // Writes a zigzag-encoded int.
- bool PutZigZagVlqInt(int32_t v);
-
- /// Get a pointer to the next aligned byte and advance the underlying buffer
- /// by num_bytes.
- /// Returns NULL if there was not enough space.
- uint8_t* GetNextBytePtr(int num_bytes = 1);
-
- /// Flushes all buffered values to the buffer. Call this when done writing to
- /// the buffer. If 'align' is true, buffered_values_ is reset and any future
- /// writes will be written to the next byte boundary.
- void Flush(bool align = false);
-
- private:
- uint8_t* buffer_;
- int max_bytes_;
-
- /// Bit-packed values are initially written to this variable before being memcpy'd to
- /// buffer_. This is faster than writing values byte by byte directly to buffer_.
- uint64_t buffered_values_;
-
- int byte_offset_; // Offset in buffer_
- int bit_offset_; // Offset in buffered_values_
-};
-
-/// Utility class to read bit/byte stream. This class can read bits or bytes
-/// that are either byte aligned or not. It also has utilities to read multiple
-/// bytes in one read (e.g. encoded int).
-class BitReader {
- public:
- /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
- BitReader(const uint8_t* buffer, int buffer_len)
- : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
- int num_bytes = std::min(8, max_bytes_ - byte_offset_);
- memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
- buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
- }
-
- BitReader()
- : buffer_(NULL),
- max_bytes_(0),
- buffered_values_(0),
- byte_offset_(0),
- bit_offset_(0) {}
-
- void Reset(const uint8_t* buffer, int buffer_len) {
- buffer_ = buffer;
- max_bytes_ = buffer_len;
- byte_offset_ = 0;
- bit_offset_ = 0;
- int num_bytes = std::min(8, max_bytes_ - byte_offset_);
- memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
- buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
- }
-
- /// Gets the next value from the buffer. Returns true if 'v' could be read or false if
- /// there are not enough bytes left. num_bits must be <= 32.
- template <typename T>
- bool GetValue(int num_bits, T* v);
-
- /// Get a number of values from the buffer. Return the number of values actually read.
- template <typename T>
- int GetBatch(int num_bits, T* v, int batch_size);
-
- /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
- /// needs to be a little-endian native type and big enough to store
- /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
- /// be advanced to the start of the next byte before 'v' is read. Returns
- /// false if there are not enough bytes left.
- /// Assumes v was stored in buffer_ in little-endian format.
- template <typename T>
- bool GetAligned(int num_bytes, T* v);
-
- /// Reads a vlq encoded int from the stream. The encoded int must start at
- /// the beginning of a byte. Returns false if there are not enough bytes in
- /// the buffer.
- bool GetVlqInt(uint32_t* v);
-
- // Reads a zigzag-encoded int into `v`.
- bool GetZigZagVlqInt(int32_t* v);
-
- /// Returns the number of bytes left in the stream, not including the current
- /// byte (i.e., there may be an additional fraction of a byte).
- int bytes_left() {
- return max_bytes_ -
- (byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_)));
- }
-
- /// Maximum byte length of a vlq encoded int
- static constexpr int kMaxVlqByteLength = 5;
-
- private:
- const uint8_t* buffer_;
- int max_bytes_;
-
- /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
- /// faster than reading values byte by byte directly from buffer_.
- uint64_t buffered_values_;
-
- int byte_offset_; // Offset in buffer_
- int bit_offset_; // Offset in buffered_values_
-};
-
-inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
- // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
- DCHECK_LE(num_bits, 32);
- DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
-
- if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8))
- return false;
-
- buffered_values_ |= v << bit_offset_;
- bit_offset_ += num_bits;
-
- if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
- // Flush buffered_values_ and write out bits of v that did not fit
- buffered_values_ = arrow::BitUtil::ToLittleEndian(buffered_values_);
- memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
- buffered_values_ = 0;
- byte_offset_ += 8;
- bit_offset_ -= 64;
- buffered_values_ = v >> (num_bits - bit_offset_);
- }
- DCHECK_LT(bit_offset_, 64);
- return true;
-}
-
-inline void BitWriter::Flush(bool align) {
- int num_bytes = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
- DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
- auto buffered_values = arrow::BitUtil::ToLittleEndian(buffered_values_);
- memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);
-
- if (align) {
- buffered_values_ = 0;
- byte_offset_ += num_bytes;
- bit_offset_ = 0;
- }
-}
-
-inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
- Flush(/* align */ true);
- DCHECK_LE(byte_offset_, max_bytes_);
- if (byte_offset_ + num_bytes > max_bytes_) return NULL;
- uint8_t* ptr = buffer_ + byte_offset_;
- byte_offset_ += num_bytes;
- return ptr;
-}
-
-template <typename T>
-inline bool BitWriter::PutAligned(T val, int num_bytes) {
- uint8_t* ptr = GetNextBytePtr(num_bytes);
- if (ptr == NULL) return false;
- val = arrow::BitUtil::ToLittleEndian(val);
- memcpy(ptr, &val, num_bytes);
- return true;
-}
-
-namespace detail {
-
-template <typename T>
-inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
- int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
- *v = static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
- *bit_offset);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- *bit_offset += num_bits;
- if (*bit_offset >= 64) {
- *byte_offset += 8;
- *bit_offset -= 64;
-
- int bytes_remaining = max_bytes - *byte_offset;
- if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
- memcpy(buffered_values, buffer + *byte_offset, 8);
- } else {
- memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
- }
- *buffered_values = arrow::BitUtil::FromLittleEndian(*buffered_values);
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800 4805)
-#endif
- // Read bits of v that crossed into new buffered_values_
- *v = *v | static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset)
- << (num_bits - *bit_offset));
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- DCHECK_LE(*bit_offset, 64);
- }
-}
-
-} // namespace detail
-
-template <typename T>
-inline bool BitReader::GetValue(int num_bits, T* v) {
- return GetBatch(num_bits, v, 1) == 1;
-}
-
-template <typename T>
-inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
- DCHECK(buffer_ != NULL);
- // TODO: revisit this limit if necessary
- DCHECK_LE(num_bits, 32);
- DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
-
- int bit_offset = bit_offset_;
- int byte_offset = byte_offset_;
- uint64_t buffered_values = buffered_values_;
- int max_bytes = max_bytes_;
- const uint8_t* buffer = buffer_;
-
- uint64_t needed_bits = static_cast<uint64_t>(num_bits) * batch_size;  // avoid int overflow
- constexpr uint64_t kBitsPerByte = 8;
- uint64_t remaining_bits = (max_bytes - byte_offset) * kBitsPerByte - bit_offset;
- if (remaining_bits < needed_bits) {
- batch_size = static_cast<int>(remaining_bits) / num_bits;
- }
-
- int i = 0;
- if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
- for (; i < batch_size && bit_offset != 0; ++i) {
- detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
- &buffered_values);
- }
- }
-
- if (sizeof(T) == 4) {
- int num_unpacked =
- internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
- reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
- i += num_unpacked;
- byte_offset += num_unpacked * num_bits / 8;
- } else {
- const int buffer_size = 1024;
- uint32_t unpack_buffer[buffer_size];
- while (i < batch_size) {
- int unpack_size = std::min(buffer_size, batch_size - i);
- int num_unpacked =
- internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
- unpack_buffer, unpack_size, num_bits);
- if (num_unpacked == 0) {
- break;
- }
- for (int k = 0; k < num_unpacked; ++k) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
- v[i + k] = static_cast<T>(unpack_buffer[k]);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- }
- i += num_unpacked;
- byte_offset += num_unpacked * num_bits / 8;
- }
- }
-
- int bytes_remaining = max_bytes - byte_offset;
- if (bytes_remaining >= 8) {
- memcpy(&buffered_values, buffer + byte_offset, 8);
- } else {
- memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
- }
- buffered_values = arrow::BitUtil::FromLittleEndian(buffered_values);
-
- for (; i < batch_size; ++i) {
- detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
- &buffered_values);
- }
-
- bit_offset_ = bit_offset;
- byte_offset_ = byte_offset;
- buffered_values_ = buffered_values;
-
- return batch_size;
-}
-
-template <typename T>
-inline bool BitReader::GetAligned(int num_bytes, T* v) {
- if (ARROW_PREDICT_FALSE(num_bytes > static_cast<int>(sizeof(T)))) {
- return false;
- }
-
- int bytes_read = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
- if (ARROW_PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) {
- return false;
- }
-
- // Advance byte_offset to next unread byte and read num_bytes
- byte_offset_ += bytes_read;
- memcpy(v, buffer_ + byte_offset_, num_bytes);
- *v = arrow::BitUtil::FromLittleEndian(*v);
- byte_offset_ += num_bytes;
-
- // Reset buffered_values_
- bit_offset_ = 0;
- int bytes_remaining = max_bytes_ - byte_offset_;
- if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
- memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
- } else {
- memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
- }
- buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
- return true;
-}
-
-inline bool BitWriter::PutVlqInt(uint32_t v) {
- bool result = true;
- while ((v & 0xFFFFFF80UL) != 0UL) {
- result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
- v >>= 7;
- }
- result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
- return result;
-}
-
-inline bool BitReader::GetVlqInt(uint32_t* v) {
- uint32_t tmp = 0;
-
- for (int i = 0; i < kMaxVlqByteLength; i++) {
- uint8_t byte = 0;
- if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
- return false;
- }
- tmp |= static_cast<uint32_t>(byte & 0x7F) << (7 * i);
-
- if ((byte & 0x80) == 0) {
- *v = tmp;
- return true;
- }
- }
-
- return false;
-}
-
-inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
- auto u_v = ::arrow::util::SafeCopy<uint32_t>(v);
- return PutVlqInt((u_v << 1) ^ (u_v >> 31));
-}
-
-inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
- uint32_t u;
- if (!GetVlqInt(&u)) return false;
- *v = ::arrow::util::SafeCopy<int32_t>((u >> 1) ^ (u << 31));
- return true;
-}
-
-} // namespace BitUtil
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29
+
+#pragma once
+
+#include <string.h>
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bpacking.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace BitUtil {
+
+/// Utility class to write bit/byte streams. This class can write data to either be
+/// bit packed or byte aligned (and a single stream that has a mix of both).
+/// This class does not allocate memory.
+class BitWriter {
+ public:
+ /// buffer: buffer to write bits to. Buffer should be preallocated with
+ /// 'buffer_len' bytes.
+ BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
+ Clear();
+ }
+
+ void Clear() {
+ buffered_values_ = 0;
+ byte_offset_ = 0;
+ bit_offset_ = 0;
+ }
+
+ /// The number of bytes written so far, including the current byte (which may be
+ /// only partially written). Includes buffered values.
+ int bytes_written() const {
+ return byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ }
+ uint8_t* buffer() const { return buffer_; }
+ int buffer_len() const { return max_bytes_; }
+
+ /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
+ /// packed. Returns false if there was not enough space. num_bits must be <= 32.
+ bool PutValue(uint64_t v, int num_bits);
+
+ /// Writes v to the next aligned byte using num_bytes. If T is larger than
+ /// num_bytes, the extra high-order bytes will be ignored. Returns false if
+ /// there was not enough space.
+ /// Assumes v is stored in buffer_ in little-endian format.
+ template <typename T>
+ bool PutAligned(T v, int num_bytes);
+
+ /// Write a Vlq encoded int to the buffer. Returns false if there was not enough
+ /// room. The value is written byte aligned.
+ /// For more details on vlq:
+ /// en.wikipedia.org/wiki/Variable-length_quantity
+ bool PutVlqInt(uint32_t v);
+
+ // Writes a zigzag-encoded int.
+ bool PutZigZagVlqInt(int32_t v);
+
+ /// Get a pointer to the next aligned byte and advance the underlying buffer
+ /// by num_bytes.
+ /// Returns NULL if there was not enough space.
+ uint8_t* GetNextBytePtr(int num_bytes = 1);
+
+ /// Flushes all buffered values to the buffer. Call this when done writing to
+ /// the buffer. If 'align' is true, buffered_values_ is reset and any future
+ /// writes will be written to the next byte boundary.
+ void Flush(bool align = false);
+
+ private:
+ uint8_t* buffer_;
+ int max_bytes_;
+
+ /// Bit-packed values are initially written to this variable before being memcpy'd to
+ /// buffer_. This is faster than writing values byte by byte directly to buffer_.
+ uint64_t buffered_values_;
+
+ int byte_offset_; // Offset in buffer_
+ int bit_offset_; // Offset in buffered_values_
+};
+
+/// Utility class to read bit/byte stream. This class can read bits or bytes
+/// that are either byte aligned or not. It also has utilities to read multiple
+/// bytes in one read (e.g. encoded int).
+class BitReader {
+ public:
+ /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
+ BitReader(const uint8_t* buffer, int buffer_len)
+ : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
+ int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+ memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ }
+
+ BitReader()
+ : buffer_(NULL),
+ max_bytes_(0),
+ buffered_values_(0),
+ byte_offset_(0),
+ bit_offset_(0) {}
+
+ void Reset(const uint8_t* buffer, int buffer_len) {
+ buffer_ = buffer;
+ max_bytes_ = buffer_len;
+ byte_offset_ = 0;
+ bit_offset_ = 0;
+ int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+ memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ }
+
+ /// Gets the next value from the buffer. Returns true if 'v' could be read or false if
+ /// there are not enough bytes left. num_bits must be <= 32.
+ template <typename T>
+ bool GetValue(int num_bits, T* v);
+
+ /// Get a number of values from the buffer. Return the number of values actually read.
+ template <typename T>
+ int GetBatch(int num_bits, T* v, int batch_size);
+
+ /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
+ /// needs to be a little-endian native type and big enough to store
+ /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
+ /// be advanced to the start of the next byte before 'v' is read. Returns
+ /// false if there are not enough bytes left.
+ /// Assumes v was stored in buffer_ in little-endian format.
+ template <typename T>
+ bool GetAligned(int num_bytes, T* v);
+
+ /// Reads a vlq encoded int from the stream. The encoded int must start at
+ /// the beginning of a byte. Returns false if there are not enough bytes in
+ /// the buffer.
+ bool GetVlqInt(uint32_t* v);
+
+ // Reads a zigzag-encoded int into `v`.
+ bool GetZigZagVlqInt(int32_t* v);
+
+ /// Returns the number of bytes left in the stream, not including the current
+ /// byte (i.e., there may be an additional fraction of a byte).
+ int bytes_left() {
+ return max_bytes_ -
+ (byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_)));
+ }
+
+ /// Maximum byte length of a vlq encoded int
+ static constexpr int kMaxVlqByteLength = 5;
+
+ private:
+ const uint8_t* buffer_;
+ int max_bytes_;
+
+ /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
+ /// faster than reading values byte by byte directly from buffer_.
+ uint64_t buffered_values_;
+
+ int byte_offset_; // Offset in buffer_
+ int bit_offset_; // Offset in buffered_values_
+};
+
+inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
+ // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
+ DCHECK_LE(num_bits, 32);
+ DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
+
+ if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8))
+ return false;
+
+ buffered_values_ |= v << bit_offset_;
+ bit_offset_ += num_bits;
+
+ if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
+ // Flush buffered_values_ and write out bits of v that did not fit
+ buffered_values_ = arrow::BitUtil::ToLittleEndian(buffered_values_);
+ memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
+ buffered_values_ = 0;
+ byte_offset_ += 8;
+ bit_offset_ -= 64;
+ buffered_values_ = v >> (num_bits - bit_offset_);
+ }
+ DCHECK_LT(bit_offset_, 64);
+ return true;
+}
+
+inline void BitWriter::Flush(bool align) {
+ int num_bytes = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
+ auto buffered_values = arrow::BitUtil::ToLittleEndian(buffered_values_);
+ memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);
+
+ if (align) {
+ buffered_values_ = 0;
+ byte_offset_ += num_bytes;
+ bit_offset_ = 0;
+ }
+}
+
+inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
+ Flush(/* align */ true);
+ DCHECK_LE(byte_offset_, max_bytes_);
+ if (byte_offset_ + num_bytes > max_bytes_) return NULL;
+ uint8_t* ptr = buffer_ + byte_offset_;
+ byte_offset_ += num_bytes;
+ return ptr;
+}
+
+template <typename T>
+inline bool BitWriter::PutAligned(T val, int num_bytes) {
+ uint8_t* ptr = GetNextBytePtr(num_bytes);
+ if (ptr == NULL) return false;
+ val = arrow::BitUtil::ToLittleEndian(val);
+ memcpy(ptr, &val, num_bytes);
+ return true;
+}
+
+namespace detail {
+
+template <typename T>
+inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
+ int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+ *v = static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
+ *bit_offset);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ *bit_offset += num_bits;
+ if (*bit_offset >= 64) {
+ *byte_offset += 8;
+ *bit_offset -= 64;
+
+ int bytes_remaining = max_bytes - *byte_offset;
+ if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+ memcpy(buffered_values, buffer + *byte_offset, 8);
+ } else {
+ memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
+ }
+ *buffered_values = arrow::BitUtil::FromLittleEndian(*buffered_values);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800 4805)
+#endif
+ // Read bits of v that crossed into new buffered_values_
+ *v = *v | static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset)
+ << (num_bits - *bit_offset));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ DCHECK_LE(*bit_offset, 64);
+ }
+}
+
+} // namespace detail
+
+template <typename T>
+inline bool BitReader::GetValue(int num_bits, T* v) {
+ return GetBatch(num_bits, v, 1) == 1;
+}
+
+template <typename T>
+inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
+ DCHECK(buffer_ != NULL);
+ // TODO: revisit this limit if necessary
+ DCHECK_LE(num_bits, 32);
+ DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+ int bit_offset = bit_offset_;
+ int byte_offset = byte_offset_;
+ uint64_t buffered_values = buffered_values_;
+ int max_bytes = max_bytes_;
+ const uint8_t* buffer = buffer_;
+
+ uint64_t needed_bits = static_cast<uint64_t>(num_bits) * batch_size;  // avoid int overflow
+ constexpr uint64_t kBitsPerByte = 8;
+ uint64_t remaining_bits = (max_bytes - byte_offset) * kBitsPerByte - bit_offset;
+ if (remaining_bits < needed_bits) {
+ batch_size = static_cast<int>(remaining_bits) / num_bits;
+ }
+
+ int i = 0;
+ if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
+ for (; i < batch_size && bit_offset != 0; ++i) {
+ detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
+ &buffered_values);
+ }
+ }
+
+ if (sizeof(T) == 4) {
+ int num_unpacked =
+ internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+ reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
+ i += num_unpacked;
+ byte_offset += num_unpacked * num_bits / 8;
+ } else {
+ const int buffer_size = 1024;
+ uint32_t unpack_buffer[buffer_size];
+ while (i < batch_size) {
+ int unpack_size = std::min(buffer_size, batch_size - i);
+ int num_unpacked =
+ internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+ unpack_buffer, unpack_size, num_bits);
+ if (num_unpacked == 0) {
+ break;
+ }
+ for (int k = 0; k < num_unpacked; ++k) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+ v[i + k] = static_cast<T>(unpack_buffer[k]);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ }
+ i += num_unpacked;
+ byte_offset += num_unpacked * num_bits / 8;
+ }
+ }
+
+ int bytes_remaining = max_bytes - byte_offset;
+ if (bytes_remaining >= 8) {
+ memcpy(&buffered_values, buffer + byte_offset, 8);
+ } else {
+ memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
+ }
+ buffered_values = arrow::BitUtil::FromLittleEndian(buffered_values);
+
+ for (; i < batch_size; ++i) {
+ detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
+ &buffered_values);
+ }
+
+ bit_offset_ = bit_offset;
+ byte_offset_ = byte_offset;
+ buffered_values_ = buffered_values;
+
+ return batch_size;
+}
+
+template <typename T>
+inline bool BitReader::GetAligned(int num_bytes, T* v) {
+ if (ARROW_PREDICT_FALSE(num_bytes > static_cast<int>(sizeof(T)))) {
+ return false;
+ }
+
+ int bytes_read = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ if (ARROW_PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) {
+ return false;
+ }
+
+ // Advance byte_offset to next unread byte and read num_bytes
+ byte_offset_ += bytes_read;
+ memcpy(v, buffer_ + byte_offset_, num_bytes);
+ *v = arrow::BitUtil::FromLittleEndian(*v);
+ byte_offset_ += num_bytes;
+
+ // Reset buffered_values_
+ bit_offset_ = 0;
+ int bytes_remaining = max_bytes_ - byte_offset_;
+ if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+ memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+ } else {
+ memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+ }
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ return true;
+}
+
+inline bool BitWriter::PutVlqInt(uint32_t v) {
+ bool result = true;
+ while ((v & 0xFFFFFF80UL) != 0UL) {
+ result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
+ v >>= 7;
+ }
+ result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
+ return result;
+}
+
+inline bool BitReader::GetVlqInt(uint32_t* v) {
+ uint32_t tmp = 0;
+
+ for (int i = 0; i < kMaxVlqByteLength; i++) {
+ uint8_t byte = 0;
+ if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
+ return false;
+ }
+ tmp |= static_cast<uint32_t>(byte & 0x7F) << (7 * i);
+
+ if ((byte & 0x80) == 0) {
+ *v = tmp;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+ auto u_v = ::arrow::util::SafeCopy<uint32_t>(v);
+ return PutVlqInt((u_v << 1) ^ (u_v >> 31));
+}
+
+inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
+ uint32_t u;
+ if (!GetVlqInt(&u)) return false;
+ *v = ::arrow::util::SafeCopy<int32_t>((u >> 1) ^ (u << 31));
+ return true;
+}
+
+} // namespace BitUtil
+} // namespace arrow
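A short round-trip sketch of the writer/reader pair restored above (RoundTrip, the 16-byte buffer, and the chosen values are illustrative assumptions, not part of the header):

#include <cstdint>
#include "arrow/util/bit_stream_utils.h"

void RoundTrip() {
  uint8_t buf[16] = {0};

  arrow::BitUtil::BitWriter writer(buf, sizeof(buf));
  for (uint64_t v = 0; v < 8; ++v) {
    writer.PutValue(v, /*num_bits=*/3);  // bit-packed; returns false when full
  }
  writer.PutVlqInt(300);                 // byte-aligned VLQ: bytes 0xAC, 0x02
  writer.Flush();

  arrow::BitUtil::BitReader reader(buf, sizeof(buf));
  uint32_t unpacked[8];
  reader.GetBatch(/*num_bits=*/3, unpacked, 8);  // unpacked[i] == i
  uint32_t vlq = 0;
  reader.GetVlqInt(&vlq);                        // vlq == 300
}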
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
index ee4bcde7713..1b123f4153a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
@@ -20,8 +20,8 @@
#include <cstdint>
#include <cstring>
-#include "arrow/util/logging.h"
-
+#include "arrow/util/logging.h"
+
namespace arrow {
namespace BitUtil {
@@ -69,59 +69,59 @@ void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_ar
bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
}
-template <bool value>
-void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
- // offset length
- // data |<------------->|
- // |--------|...|--------|...|--------|
- // |<--->| |<--->|
- // pro epi
- if (ARROW_PREDICT_FALSE(length == 0)) {
- return;
- }
-
- constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
-
- auto prologue = static_cast<int32_t>(BitUtil::RoundUp(offset, 8) - offset);
- DCHECK_LT(prologue, 8);
-
- if (length < prologue) { // special case where a mask is required
- // offset length
- // data |<->|
- // |--------|...|--------|...
- // mask --> |111|
- // |<---->|
- // pro
- uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
- BitUtil::kPrecedingBitmask[8 - prologue + length];
- data[offset / 8] = value ? data[offset / 8] | mask : data[offset / 8] & ~mask;
- return;
- }
-
- // align to a byte boundary
- data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
- offset += prologue;
- length -= prologue;
-
- // set values per byte
- DCHECK_EQ(offset % 8, 0);
- std::memset(data + offset / 8, set_byte, length / 8);
- offset += BitUtil::RoundDown(length, 8);
- length -= BitUtil::RoundDown(length, 8);
-
- // clean up
- DCHECK_LT(length, 8);
- data[offset / 8] =
- BitUtil::SpliceWord(static_cast<int32_t>(length), set_byte, data[offset / 8]);
-}
-
-void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
- SetBitmapImpl<true>(data, offset, length);
-}
-
-void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
- SetBitmapImpl<false>(data, offset, length);
-}
-
+template <bool value>
+void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
+ // offset length
+ // data |<------------->|
+ // |--------|...|--------|...|--------|
+ // |<--->| |<--->|
+ // pro epi
+ if (ARROW_PREDICT_FALSE(length == 0)) {
+ return;
+ }
+
+ constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
+
+ auto prologue = static_cast<int32_t>(BitUtil::RoundUp(offset, 8) - offset);
+ DCHECK_LT(prologue, 8);
+
+ if (length < prologue) { // special case where a mask is required
+ // offset length
+ // data |<->|
+ // |--------|...|--------|...
+ // mask --> |111|
+ // |<---->|
+ // pro
+ uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
+ BitUtil::kPrecedingBitmask[8 - prologue + length];
+ data[offset / 8] = value ? data[offset / 8] | mask : data[offset / 8] & ~mask;
+ return;
+ }
+
+ // align to a byte boundary
+ data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
+ offset += prologue;
+ length -= prologue;
+
+ // set values per byte
+ DCHECK_EQ(offset % 8, 0);
+ std::memset(data + offset / 8, set_byte, length / 8);
+ offset += BitUtil::RoundDown(length, 8);
+ length -= BitUtil::RoundDown(length, 8);
+
+ // clean up
+ DCHECK_LT(length, 8);
+ data[offset / 8] =
+ BitUtil::SpliceWord(static_cast<int32_t>(length), set_byte, data[offset / 8]);
+}
+
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
+ SetBitmapImpl<true>(data, offset, length);
+}
+
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
+ SetBitmapImpl<false>(data, offset, length);
+}
+
} // namespace BitUtil
} // namespace arrow
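SetBitmapImpl above splits the range into an unaligned prologue, a run of whole bytes handled by a single memset, and a short epilogue. A standalone sketch of that split, assuming LSB-first bit numbering as in Arrow bitmaps (illustrative only, not the library implementation):

    #include <cstdint>
    #include <cstring>

    // Set bits [offset, offset + length) of `data` to 1.
    void SetRange(uint8_t* data, int64_t offset, int64_t length) {
      // prologue: head bits up to the next byte boundary
      while (length > 0 && (offset % 8) != 0) {
        data[offset / 8] |= static_cast<uint8_t>(1u << (offset % 8));
        ++offset;
        --length;
      }
      // body: whole bytes in one call
      std::memset(data + offset / 8, 0xFF, static_cast<size_t>(length / 8));
      offset += (length / 8) * 8;
      length %= 8;
      // epilogue: trailing bits
      while (length-- > 0) {
        data[offset / 8] |= static_cast<uint8_t>(1u << (offset % 8));
        ++offset;
      }
    }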
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
index c306ce7821b..216cf9fba88 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
@@ -112,7 +112,7 @@ constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
// Returns a mask for the bit_index lower order bits.
// Only valid for bit_index in the range [0, 64).
-constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
+constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
return (static_cast<uint64_t>(1) << bit_index) - 1;
}
@@ -290,14 +290,14 @@ static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31, 63
// the bitwise complement version of kPrecedingBitmask
static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128};
-static constexpr bool GetBit(const uint8_t* bits, uint64_t i) {
+static constexpr bool GetBit(const uint8_t* bits, uint64_t i) {
return (bits[i >> 3] >> (i & 0x07)) & 1;
}
// Gets the i-th bit from a byte. Should only be used with i <= 7.
-static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
- return byte & kBitmask[i];
-}
+static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
+ return byte & kBitmask[i];
+}
static inline void ClearBit(uint8_t* bits, int64_t i) {
bits[i / 8] &= kFlippedBitmask[i % 8];
@@ -318,37 +318,37 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
ARROW_EXPORT
void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
-/// \brief Sets all bits in the bitmap to true
-ARROW_EXPORT
-void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
-
-/// \brief Clears all bits in the bitmap (set to false)
-ARROW_EXPORT
-void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
-
-/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
-/// returned
-/// ex:
-/// ref: https://stackoverflow.com/a/59523400
-template <typename Word>
-constexpr Word PrecedingWordBitmask(unsigned int const i) {
- return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
-}
-static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
-static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
-static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
-static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
-
-/// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
-/// from `high`.
-/// Word ret
-/// for (i = 0; i < sizeof(Word)*8; i++){
-/// ret[i]= i < n ? low[i]: high[i];
-/// }
-template <typename Word>
-constexpr Word SpliceWord(int n, Word low, Word high) {
- return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
-}
-
+/// \brief Sets all bits in the bitmap to true
+ARROW_EXPORT
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// \brief Clears all bits in the bitmap (set to false)
+ARROW_EXPORT
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
+/// returned
+/// ex:
+/// ref: https://stackoverflow.com/a/59523400
+template <typename Word>
+constexpr Word PrecedingWordBitmask(unsigned int const i) {
+ return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
+}
+static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
+static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
+static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
+static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
+
+/// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
+/// from `high`.
+/// Word ret
+/// for (i = 0; i < sizeof(Word)*8; i++){
+/// ret[i]= i < n ? low[i]: high[i];
+/// }
+template <typename Word>
+constexpr Word SpliceWord(int n, Word low, Word high) {
+ return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
+}
+
} // namespace BitUtil
} // namespace arrow
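The static_asserts above pin down PrecedingWordBitmask; SpliceWord composes it so the low n bits come from one operand and the rest from the other. A few more illustrative compile-time checks (assuming bit_util.h is included):

    #include "arrow/util/bit_util.h"

    static_assert(arrow::BitUtil::SpliceWord<uint8_t>(0, 0xAA, 0x55) == 0x55,
                  "n == 0: every bit taken from high");
    static_assert(arrow::BitUtil::SpliceWord<uint8_t>(4, 0xAA, 0x55) == 0x5A,
                  "low nibble from low, high nibble from high");
    static_assert(arrow::BitUtil::SpliceWord<uint8_t>(8, 0xAA, 0x55) == 0xAA,
                  "n == 8: every bit taken from low");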
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
index 33d1dee1957..bd389138316 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
@@ -46,16 +46,16 @@ std::string Bitmap::Diff(const Bitmap& other) const {
return ToArray()->Diff(*other.ToArray());
}
-void Bitmap::CopyFrom(const Bitmap& other) {
- ::arrow::internal::CopyBitmap(other.buffer_->data(), other.offset_, other.length_,
- buffer_->mutable_data(), offset_);
-}
-
-void Bitmap::CopyFromInverted(const Bitmap& other) {
- ::arrow::internal::InvertBitmap(other.buffer_->data(), other.offset_, other.length_,
- buffer_->mutable_data(), offset_);
-}
-
+void Bitmap::CopyFrom(const Bitmap& other) {
+ ::arrow::internal::CopyBitmap(other.buffer_->data(), other.offset_, other.length_,
+ buffer_->mutable_data(), offset_);
+}
+
+void Bitmap::CopyFromInverted(const Bitmap& other) {
+ ::arrow::internal::InvertBitmap(other.buffer_->data(), other.offset_, other.length_,
+ buffer_->mutable_data(), offset_);
+}
+
bool Bitmap::Equals(const Bitmap& other) const {
if (length_ != other.length_) {
return false;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
index 141f863c0b8..13e7c5dc00a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
@@ -29,11 +29,11 @@
#include "arrow/buffer.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_reader.h"
-#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
#include "arrow/util/compare.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/functional.h"
#include "arrow/util/string_builder.h"
#include "arrow/util/string_view.h"
@@ -90,13 +90,13 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
BitUtil::SetBitTo(buffer_->mutable_data(), i + offset_, v);
}
- void SetBitsTo(bool v) {
- BitUtil::SetBitsTo(buffer_->mutable_data(), offset_, length_, v);
- }
-
- void CopyFrom(const Bitmap& other);
- void CopyFromInverted(const Bitmap& other);
-
+ void SetBitsTo(bool v) {
+ BitUtil::SetBitsTo(buffer_->mutable_data(), offset_, length_, v);
+ }
+
+ void CopyFrom(const Bitmap& other);
+ void CopyFromInverted(const Bitmap& other);
+
/// \brief Visit bits from each bitmap as bitset<N>
///
/// All bitmaps must have identical length.
@@ -112,21 +112,21 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
}
}
- /// \brief Visit bits from each bitmap as bitset<N>
- ///
- /// All bitmaps must have identical length.
- template <size_t N, typename Visitor>
- static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
- int64_t bit_length = BitLength(bitmaps);
- std::bitset<N> bits;
- for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
- for (size_t i = 0; i < N; ++i) {
- bits[i] = bitmaps[i].GetBit(bit_i);
- }
- visitor(bits);
- }
- }
-
+ /// \brief Visit bits from each bitmap as bitset<N>
+ ///
+ /// All bitmaps must have identical length.
+ template <size_t N, typename Visitor>
+ static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
+ int64_t bit_length = BitLength(bitmaps);
+ std::bitset<N> bits;
+ for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+ for (size_t i = 0; i < N; ++i) {
+ bits[i] = bitmaps[i].GetBit(bit_i);
+ }
+ visitor(bits);
+ }
+ }
+
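VisitBits walks all N bitmaps in lock-step and hands the visitor one std::bitset<N> per bit position. A hedged usage sketch (a and b are assumed to be valid, equal-length arrow::internal::Bitmap instances):

    // Count positions where both bitmaps are set.
    int64_t both = 0;
    arrow::internal::Bitmap::VisitBits<2>(
        {a, b}, [&](const std::bitset<2>& bits) { both += bits[0] && bits[1]; });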
/// \brief Visit words of bits from each bitmap as array<Word, N>
///
/// All bitmaps must have identical length. The first bit in a visited bitmap
@@ -135,14 +135,14 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
/// returned.
///
/// TODO(bkietz) allow for early termination
- // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
- // It also has a large prolog / epilog overhead and should be used
- // carefully in other cases.
- // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
- // and BitmapUInt64Reader.
+ // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+ // It also has a large prolog / epilog overhead and should be used
+ // carefully in other cases.
+ // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+ // and BitmapUInt64Reader.
template <size_t N, typename Visitor,
- typename Word = typename std::decay<
- internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
constexpr int64_t kBitWidth = sizeof(Word) * 8;
@@ -243,132 +243,132 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
return min_offset;
}
- template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
- typename Word = typename std::decay<
- internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
- static void RunVisitWordsAndWriteLoop(int64_t bit_length,
- std::array<ReaderT, N>& readers,
- std::array<WriterT, M>& writers,
- Visitor&& visitor) {
- constexpr int64_t kBitWidth = sizeof(Word) * 8;
-
- std::array<Word, N> visited_words;
- std::array<Word, M> output_words;
-
-    // every reader will have the same number of words, since they all have the same length
-    // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
-    // a Word boundary, every Word would have to be created from 2 adjoining Words
- auto n_words = readers[0].words();
- bit_length -= n_words * kBitWidth;
- while (n_words--) {
- // first collect all words to visited_words array
- for (size_t i = 0; i < N; i++) {
- visited_words[i] = readers[i].NextWord();
- }
- visitor(visited_words, &output_words);
- for (size_t i = 0; i < M; i++) {
- writers[i].PutNextWord(output_words[i]);
- }
- }
-
-    // every reader will have the same number of trailing bytes, for the reason above
-    // the trailing portion could be more than one word! (ref: BitmapWordReader constructor)
-    // remaining full/partial words to write
-
- if (bit_length) {
- // convert the word visitor lambda to a byte_visitor
- auto byte_visitor = [&](const std::array<uint8_t, N>& in,
- std::array<uint8_t, M>* out) {
- std::array<Word, N> in_words;
- std::array<Word, M> out_words;
- std::copy(in.begin(), in.end(), in_words.begin());
- visitor(in_words, &out_words);
- for (size_t i = 0; i < M; i++) {
- out->at(i) = static_cast<uint8_t>(out_words[i]);
- }
- };
-
- std::array<uint8_t, N> visited_bytes;
- std::array<uint8_t, M> output_bytes;
- int n_bytes = readers[0].trailing_bytes();
- while (n_bytes--) {
- visited_bytes.fill(0);
- output_bytes.fill(0);
- int valid_bits;
- for (size_t i = 0; i < N; i++) {
- visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
- }
- byte_visitor(visited_bytes, &output_bytes);
- for (size_t i = 0; i < M; i++) {
- writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
- }
- }
- }
- }
-
- /// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
- /// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
- ///
- /// All bitmaps must have identical length. The first bit in a visited bitmap
- /// may be offset within the first visited word, but words will otherwise contain
- /// densely packed bits loaded from the bitmap. That offset within the first word is
- /// returned.
- /// Visitor is expected to have the following signature
- /// [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
- ///
- // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
- // It also has a large prolog / epilog overhead and should be used
- // carefully in other cases.
- // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
- // and BitmapUInt64Reader.
- template <size_t N, size_t M, typename Visitor,
- typename Word = typename std::decay<
- internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
- static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
- std::array<Bitmap, M>* out_bitmaps_arg,
- Visitor&& visitor) {
- int64_t bit_length = BitLength(bitmaps_arg);
- assert(bit_length == BitLength(*out_bitmaps_arg));
-
- // if both input and output bitmaps have no byte offset, then use special template
- if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
- [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
- std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
- [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
- std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
- for (size_t i = 0; i < N; ++i) {
- const Bitmap& in_bitmap = bitmaps_arg[i];
- readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
- in_bitmap.buffer_->data(), in_bitmap.offset_, in_bitmap.length_);
- }
-
- std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
- for (size_t i = 0; i < M; ++i) {
- const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
- writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
- out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_);
- }
-
- RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
- } else {
- std::array<BitmapWordReader<Word>, N> readers;
- for (size_t i = 0; i < N; ++i) {
- const Bitmap& in_bitmap = bitmaps_arg[i];
- readers[i] = BitmapWordReader<Word>(in_bitmap.buffer_->data(), in_bitmap.offset_,
- in_bitmap.length_);
- }
-
- std::array<BitmapWordWriter<Word>, M> writers;
- for (size_t i = 0; i < M; ++i) {
- const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
- writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
- out_bitmap.offset_, out_bitmap.length_);
- }
-
- RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
- }
- }
-
+ template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ static void RunVisitWordsAndWriteLoop(int64_t bit_length,
+ std::array<ReaderT, N>& readers,
+ std::array<WriterT, M>& writers,
+ Visitor&& visitor) {
+ constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+ std::array<Word, N> visited_words;
+ std::array<Word, M> output_words;
+
+    // every reader will have the same number of words, since they all have the same length
+    // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
+    // a Word boundary, every Word would have to be created from 2 adjoining Words
+ auto n_words = readers[0].words();
+ bit_length -= n_words * kBitWidth;
+ while (n_words--) {
+ // first collect all words to visited_words array
+ for (size_t i = 0; i < N; i++) {
+ visited_words[i] = readers[i].NextWord();
+ }
+ visitor(visited_words, &output_words);
+ for (size_t i = 0; i < M; i++) {
+ writers[i].PutNextWord(output_words[i]);
+ }
+ }
+
+    // every reader will have the same number of trailing bytes, for the reason above
+    // the trailing portion could be more than one word! (ref: BitmapWordReader constructor)
+    // remaining full/partial words to write
+
+ if (bit_length) {
+ // convert the word visitor lambda to a byte_visitor
+ auto byte_visitor = [&](const std::array<uint8_t, N>& in,
+ std::array<uint8_t, M>* out) {
+ std::array<Word, N> in_words;
+ std::array<Word, M> out_words;
+ std::copy(in.begin(), in.end(), in_words.begin());
+ visitor(in_words, &out_words);
+ for (size_t i = 0; i < M; i++) {
+ out->at(i) = static_cast<uint8_t>(out_words[i]);
+ }
+ };
+
+ std::array<uint8_t, N> visited_bytes;
+ std::array<uint8_t, M> output_bytes;
+ int n_bytes = readers[0].trailing_bytes();
+ while (n_bytes--) {
+ visited_bytes.fill(0);
+ output_bytes.fill(0);
+ int valid_bits;
+ for (size_t i = 0; i < N; i++) {
+ visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
+ }
+ byte_visitor(visited_bytes, &output_bytes);
+ for (size_t i = 0; i < M; i++) {
+ writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
+ }
+ }
+ }
+ }
+
+ /// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
+ /// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
+ ///
+ /// All bitmaps must have identical length. The first bit in a visited bitmap
+ /// may be offset within the first visited word, but words will otherwise contain
+ /// densely packed bits loaded from the bitmap. That offset within the first word is
+ /// returned.
+ /// Visitor is expected to have the following signature
+ /// [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
+ ///
+ // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+ // It also has a large prolog / epilog overhead and should be used
+ // carefully in other cases.
+ // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+ // and BitmapUInt64Reader.
+ template <size_t N, size_t M, typename Visitor,
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+ std::array<Bitmap, M>* out_bitmaps_arg,
+ Visitor&& visitor) {
+ int64_t bit_length = BitLength(bitmaps_arg);
+ assert(bit_length == BitLength(*out_bitmaps_arg));
+
+ // if both input and output bitmaps have no byte offset, then use special template
+ if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
+ [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
+ std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
+ [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
+ std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
+ for (size_t i = 0; i < N; ++i) {
+ const Bitmap& in_bitmap = bitmaps_arg[i];
+ readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
+ in_bitmap.buffer_->data(), in_bitmap.offset_, in_bitmap.length_);
+ }
+
+ std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
+ for (size_t i = 0; i < M; ++i) {
+ const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+ writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
+ out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_);
+ }
+
+ RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+ } else {
+ std::array<BitmapWordReader<Word>, N> readers;
+ for (size_t i = 0; i < N; ++i) {
+ const Bitmap& in_bitmap = bitmaps_arg[i];
+ readers[i] = BitmapWordReader<Word>(in_bitmap.buffer_->data(), in_bitmap.offset_,
+ in_bitmap.length_);
+ }
+
+ std::array<BitmapWordWriter<Word>, M> writers;
+ for (size_t i = 0; i < M; ++i) {
+ const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+ writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
+ out_bitmap.offset_, out_bitmap.length_);
+ }
+
+ RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+ }
+ }
+
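VisitWordsAndWrite generalizes the word visitors to M outputs: the visitor fills one output word per input word, and the byte-aligned specialization is chosen when no bitmap carries a bit offset. A hedged sketch of a two-input AND kernel (in0, in1 and out are assumed to be valid, equal-length arrow::internal::Bitmap instances):

    std::array<arrow::internal::Bitmap, 2> ins{in0, in1};
    std::array<arrow::internal::Bitmap, 1> outs{out};
    arrow::internal::Bitmap::VisitWordsAndWrite(
        ins, &outs,
        [](const std::array<uint64_t, 2>& in, std::array<uint64_t, 1>* o) {
          o->at(0) = in[0] & in[1];
        });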
const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
/// offset of first bit relative to buffer().data()
@@ -445,14 +445,14 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
/// assert bitmaps have identical length and return that length
static int64_t BitLength(const Bitmap* bitmaps, size_t N);
- template <size_t N>
- static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
- for (size_t i = 1; i < N; ++i) {
- assert(bitmaps[i].length() == bitmaps[0].length());
- }
- return bitmaps[0].length();
- }
-
+ template <size_t N>
+ static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
+ for (size_t i = 1; i < N; ++i) {
+ assert(bitmaps[i].length() == bitmaps[0].length());
+ }
+ return bitmaps[0].length();
+ }
+
std::shared_ptr<Buffer> buffer_;
int64_t offset_ = 0, length_ = 0;
};
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
index 129fa913231..68a9016d8a0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
@@ -62,9 +62,9 @@ void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generat
template <class Generator>
void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
Generator&& g) {
- static_assert(std::is_same<typename std::result_of<Generator && ()>::type, bool>::value,
- "Functor passed to GenerateBitsUnrolled must return bool");
-
+ static_assert(std::is_same<typename std::result_of<Generator && ()>::type, bool>::value,
+ "Functor passed to GenerateBitsUnrolled must return bool");
+
if (length == 0) {
return;
}
@@ -77,7 +77,7 @@ void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
if (bit_mask != 0x01) {
current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset];
while (bit_mask != 0 && remaining > 0) {
- current_byte |= g() * bit_mask;
+ current_byte |= g() * bit_mask;
bit_mask = static_cast<uint8_t>(bit_mask << 1);
--remaining;
}
@@ -85,14 +85,14 @@ void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
}
int64_t remaining_bytes = remaining / 8;
- uint8_t out_results[8];
+ uint8_t out_results[8];
while (remaining_bytes-- > 0) {
- for (int i = 0; i < 8; ++i) {
- out_results[i] = g();
- }
- *cur++ = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 |
- out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 |
- out_results[6] << 6 | out_results[7] << 7);
+ for (int i = 0; i < 8; ++i) {
+ out_results[i] = g();
+ }
+ *cur++ = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 |
+ out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 |
+ out_results[6] << 6 | out_results[7] << 7);
}
int64_t remaining_bits = remaining % 8;
@@ -100,7 +100,7 @@ void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
current_byte = 0;
bit_mask = 0x01;
while (remaining_bits-- > 0) {
- current_byte |= g() * bit_mask;
+ current_byte |= g() * bit_mask;
bit_mask = static_cast<uint8_t>(bit_mask << 1);
}
*cur++ = current_byte;
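GenerateBitsUnrolled consumes eight generator results at a time and packs them into one byte, falling back to bit-by-bit code only at the unaligned edges. A hedged usage sketch (assuming arrow::internal from bitmap_generate.h):

    // Write 100 alternating bits starting at bit offset 3.
    uint8_t bitmap[16] = {};
    int64_t i = 0;
    arrow::internal::GenerateBitsUnrolled(
        bitmap, /*start_offset=*/3, /*length=*/100,
        [&]() -> bool { return (i++ % 2) == 0; });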
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
index 63c8b008f4a..afbad3f8aba 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
@@ -172,7 +172,7 @@ Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* data
}
Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* data,
- int64_t offset, int64_t length) {
+ int64_t offset, int64_t length) {
return TransferBitmap<TransferMode::Invert>(pool, data, offset, length);
}
@@ -215,26 +215,26 @@ bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right
return true;
}
-bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length) {
- if (left == nullptr && right == nullptr) {
- return true;
- } else if (left != nullptr && right != nullptr) {
- return BitmapEquals(left, left_offset, right, right_offset, length);
- } else if (left != nullptr) {
- return CountSetBits(left, left_offset, length) == length;
- } else {
- return CountSetBits(right, right_offset, length) == length;
- }
-}
-
-bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
- const std::shared_ptr<Buffer>& right, int64_t right_offset,
- int64_t length) {
- return OptionalBitmapEquals(left ? left->data() : nullptr, left_offset,
- right ? right->data() : nullptr, right_offset, length);
-}
-
+bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length) {
+ if (left == nullptr && right == nullptr) {
+ return true;
+ } else if (left != nullptr && right != nullptr) {
+ return BitmapEquals(left, left_offset, right, right_offset, length);
+ } else if (left != nullptr) {
+ return CountSetBits(left, left_offset, length) == length;
+ } else {
+ return CountSetBits(right, right_offset, length) == length;
+ }
+}
+
+bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
+ const std::shared_ptr<Buffer>& right, int64_t right_offset,
+ int64_t length) {
+ return OptionalBitmapEquals(left ? left->data() : nullptr, left_offset,
+ right ? right->data() : nullptr, right_offset, length);
+}
+
namespace {
template <template <typename> class BitOp>
@@ -346,42 +346,42 @@ void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
BitmapOp<std::bit_xor>(left, left_offset, right, right_offset, length, out_offset, out);
}
-template <typename T>
-struct AndNotOp {
- constexpr T operator()(const T& l, const T& r) const { return l & ~r; }
-};
-
-Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset) {
- return BitmapOp<AndNotOp>(pool, left, left_offset, right, right_offset, length,
- out_offset);
-}
-
-void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset,
- uint8_t* out) {
- BitmapOp<AndNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
-}
-
-template <typename T>
-struct OrNotOp {
- constexpr T operator()(const T& l, const T& r) const { return l | ~r; }
-};
-
-Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset) {
- return BitmapOp<OrNotOp>(pool, left, left_offset, right, right_offset, length,
- out_offset);
-}
-
-void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
- BitmapOp<OrNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
-}
-
+template <typename T>
+struct AndNotOp {
+ constexpr T operator()(const T& l, const T& r) const { return l & ~r; }
+};
+
+Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<AndNotOp>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset,
+ uint8_t* out) {
+ BitmapOp<AndNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
+template <typename T>
+struct OrNotOp {
+ constexpr T operator()(const T& l, const T& r) const { return l | ~r; }
+};
+
+Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<OrNotOp>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
+ BitmapOp<OrNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
} // namespace internal
} // namespace arrow
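Per word, the two new kernels compute l & ~r and l | ~r. Two compile-time checks of those operators on 4-bit values (illustrative only):

    //   left = 0b1100, right = 0b1010
    static_assert((0b1100 & ~0b1010 & 0xF) == 0b0100,
                  "and-not keeps bits set in left and clear in right");
    static_assert(((0b1100 | ~0b1010) & 0xF) == 0b1101,
                  "or-not keeps left's bits plus right's clear bits");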
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
index 40a7797a239..ad9990459de 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
@@ -96,17 +96,17 @@ ARROW_EXPORT
bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
int64_t right_offset, int64_t length);
-// Same as BitmapEquals, but considers a NULL bitmap pointer the same as an
-// all-ones bitmap.
-ARROW_EXPORT
-bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length);
-
-ARROW_EXPORT
-bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
- const std::shared_ptr<Buffer>& right, int64_t right_offset,
- int64_t length);
-
+// Same as BitmapEquals, but considers a NULL bitmap pointer the same as an
+// all-ones bitmap.
+ARROW_EXPORT
+bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length);
+
+ARROW_EXPORT
+bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
+ const std::shared_ptr<Buffer>& right, int64_t right_offset,
+ int64_t length);
+
/// \brief Do a "bitmap and" on right and left buffers starting at
/// their respective bit-offsets for the given bit-length and put
/// the results in out_buffer starting at the given bit-offset.
@@ -164,43 +164,43 @@ ARROW_EXPORT
void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-/// \brief Do a "bitmap and not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out_buffer starting at the given bit-offset.
-///
-/// out_buffer will be allocated and initialized to zeros using pool before
-/// the operation.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset);
-
-/// \brief Do a "bitmap and not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out starting at the given bit-offset.
-ARROW_EXPORT
-void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-
-/// \brief Do a "bitmap or not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out_buffer starting at the given bit-offset.
-///
-/// out_buffer will be allocated and initialized to zeros using pool before
-/// the operation.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset);
-
-/// \brief Do a "bitmap or not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out starting at the given bit-offset.
-ARROW_EXPORT
-void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
} // namespace internal
} // namespace arrow
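The null-tolerant overloads treat a missing validity bitmap as all-valid, so a null pointer compares equal to a present bitmap only when every bit of the latter is set. A hedged sketch (assuming bitmap_ops.h and <cassert> are included):

    uint8_t all_set[]   = {0xFF};
    uint8_t one_clear[] = {0xFE};
    assert(arrow::internal::OptionalBitmapEquals(nullptr, 0, all_set, 0, 8));
    assert(!arrow::internal::OptionalBitmapEquals(nullptr, 0, one_clear, 0, 8));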
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
index 7c43747fafb..c0f08ff249c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
@@ -22,7 +22,7 @@
#include "arrow/buffer.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
namespace arrow {
@@ -70,190 +70,190 @@ class BitmapReader {
int64_t bit_offset_;
};
-// XXX Cannot name it BitmapWordReader because the name is already used
-// in bitmap_ops.cc
-
-class BitmapUInt64Reader {
- public:
- BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
- : bitmap_(bitmap + start_offset / 8),
- num_carry_bits_(8 - start_offset % 8),
- length_(length),
- remaining_length_(length_) {
- if (length_ > 0) {
- // Load carry bits from the first byte's MSBs
- if (length_ >= num_carry_bits_) {
- carry_bits_ =
- LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
- } else {
- carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
- }
- }
- }
-
- uint64_t NextWord() {
- if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
- // We can load a full word
- uint64_t next_word = LoadFullWord();
- // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
- uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
- carry_bits_ = next_word >> (64 - num_carry_bits_);
- remaining_length_ -= 64;
- return word;
- } else if (remaining_length_ > num_carry_bits_) {
- // We can load a partial word
- uint64_t next_word =
- LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
- uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
- carry_bits_ = next_word >> (64 - num_carry_bits_);
- remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
- return word;
- } else {
- remaining_length_ = 0;
- return carry_bits_;
- }
- }
-
- int64_t position() const { return length_ - remaining_length_; }
-
- int64_t length() const { return length_; }
-
- private:
- uint64_t LoadFullWord() {
- uint64_t word;
- memcpy(&word, bitmap_, 8);
- bitmap_ += 8;
- return BitUtil::ToLittleEndian(word);
- }
-
- uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
- uint64_t word = 0;
- const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
- memcpy(&word, bitmap_, num_bytes);
- bitmap_ += num_bytes;
- return (BitUtil::ToLittleEndian(word) >> bit_offset) &
- BitUtil::LeastSignificantBitMask(num_bits);
- }
-
- const uint8_t* bitmap_;
- const int64_t num_carry_bits_; // in [1, 8]
- const int64_t length_;
- int64_t remaining_length_;
- uint64_t carry_bits_;
-};
-
-// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
-// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
-// and should probably not be used for small bitmaps.
-
-template <typename Word, bool may_have_byte_offset = true>
-class BitmapWordReader {
- public:
- BitmapWordReader() = default;
- BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
- : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
- bitmap_(bitmap + offset / 8),
- bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
- // decrement word count by one as we may touch two adjacent words in one iteration
- nwords_ = length / (sizeof(Word) * 8) - 1;
- if (nwords_ < 0) {
- nwords_ = 0;
- }
- trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
- trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
-
- if (nwords_ > 0) {
- current_word_ = load<Word>(bitmap_);
- } else if (length > 0) {
- current_byte_ = load<uint8_t>(bitmap_);
- }
- }
-
- Word NextWord() {
- bitmap_ += sizeof(Word);
- const Word next_word = load<Word>(bitmap_);
- Word word = current_word_;
- if (may_have_byte_offset && offset_) {
- // combine two adjacent words into one word
- // |<------ next ----->|<---- current ---->|
- // +-------------+-----+-------------+-----+
- // | --- | A | B | --- |
- // +-------------+-----+-------------+-----+
- // | | offset
- // v v
- // +-----+-------------+
- // | A | B |
- // +-----+-------------+
- // |<------ word ----->|
- word >>= offset_;
- word |= next_word << (sizeof(Word) * 8 - offset_);
- }
- current_word_ = next_word;
- return word;
- }
-
- uint8_t NextTrailingByte(int& valid_bits) {
- uint8_t byte;
- assert(trailing_bits_ > 0);
-
- if (trailing_bits_ <= 8) {
- // last byte
- valid_bits = trailing_bits_;
- trailing_bits_ = 0;
- byte = 0;
- internal::BitmapReader reader(bitmap_, offset_, valid_bits);
- for (int i = 0; i < valid_bits; ++i) {
- byte >>= 1;
- if (reader.IsSet()) {
- byte |= 0x80;
- }
- reader.Next();
- }
- byte >>= (8 - valid_bits);
- } else {
- ++bitmap_;
- const uint8_t next_byte = load<uint8_t>(bitmap_);
- byte = current_byte_;
- if (may_have_byte_offset && offset_) {
- byte >>= offset_;
- byte |= next_byte << (8 - offset_);
- }
- current_byte_ = next_byte;
- trailing_bits_ -= 8;
- trailing_bytes_--;
- valid_bits = 8;
- }
- return byte;
- }
-
- int64_t words() const { return nwords_; }
- int trailing_bytes() const { return trailing_bytes_; }
-
- private:
- int64_t offset_;
- const uint8_t* bitmap_;
-
- const uint8_t* bitmap_end_;
- int64_t nwords_;
- int trailing_bits_;
- int trailing_bytes_;
- union {
- Word current_word_;
- struct {
-#if ARROW_LITTLE_ENDIAN == 0
- uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
- uint8_t current_byte_;
- };
- };
-
- template <typename DType>
- DType load(const uint8_t* bitmap) {
- assert(bitmap + sizeof(DType) <= bitmap_end_);
- return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
- }
-};
-
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc
+
+class BitmapUInt64Reader {
+ public:
+ BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap + start_offset / 8),
+ num_carry_bits_(8 - start_offset % 8),
+ length_(length),
+ remaining_length_(length_) {
+ if (length_ > 0) {
+ // Load carry bits from the first byte's MSBs
+ if (length_ >= num_carry_bits_) {
+ carry_bits_ =
+ LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
+ } else {
+ carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
+ }
+ }
+ }
+
+ uint64_t NextWord() {
+ if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
+ // We can load a full word
+ uint64_t next_word = LoadFullWord();
+ // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
+ uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+ carry_bits_ = next_word >> (64 - num_carry_bits_);
+ remaining_length_ -= 64;
+ return word;
+ } else if (remaining_length_ > num_carry_bits_) {
+ // We can load a partial word
+ uint64_t next_word =
+ LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
+ uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+ carry_bits_ = next_word >> (64 - num_carry_bits_);
+ remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
+ return word;
+ } else {
+ remaining_length_ = 0;
+ return carry_bits_;
+ }
+ }
+
+ int64_t position() const { return length_ - remaining_length_; }
+
+ int64_t length() const { return length_; }
+
+ private:
+ uint64_t LoadFullWord() {
+ uint64_t word;
+ memcpy(&word, bitmap_, 8);
+ bitmap_ += 8;
+ return BitUtil::ToLittleEndian(word);
+ }
+
+ uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+ uint64_t word = 0;
+ const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
+ memcpy(&word, bitmap_, num_bytes);
+ bitmap_ += num_bytes;
+ return (BitUtil::ToLittleEndian(word) >> bit_offset) &
+ BitUtil::LeastSignificantBitMask(num_bits);
+ }
+
+ const uint8_t* bitmap_;
+ const int64_t num_carry_bits_; // in [1, 8]
+ const int64_t length_;
+ int64_t remaining_length_;
+ uint64_t carry_bits_;
+};
+
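BitmapUInt64Reader always yields 64 bits per NextWord call, zero-padding past the end, so a caller can loop on position() alone. A hedged sketch counting set bits (assuming bitmap_reader.h and <bitset> are included):

    int64_t CountSet(const uint8_t* bitmap, int64_t offset, int64_t length) {
      arrow::internal::BitmapUInt64Reader reader(bitmap, offset, length);
      int64_t n = 0;
      while (reader.position() < reader.length()) {
        n += static_cast<int64_t>(std::bitset<64>(reader.NextWord()).count());
      }
      return n;
    }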
+// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
+// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
+// and should probably not be used for small bitmaps.
+
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordReader {
+ public:
+ BitmapWordReader() = default;
+ BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
+ : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+ bitmap_(bitmap + offset / 8),
+ bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
+ // decrement word count by one as we may touch two adjacent words in one iteration
+ nwords_ = length / (sizeof(Word) * 8) - 1;
+ if (nwords_ < 0) {
+ nwords_ = 0;
+ }
+ trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
+ trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
+
+ if (nwords_ > 0) {
+ current_word_ = load<Word>(bitmap_);
+ } else if (length > 0) {
+ current_byte_ = load<uint8_t>(bitmap_);
+ }
+ }
+
+ Word NextWord() {
+ bitmap_ += sizeof(Word);
+ const Word next_word = load<Word>(bitmap_);
+ Word word = current_word_;
+ if (may_have_byte_offset && offset_) {
+ // combine two adjacent words into one word
+ // |<------ next ----->|<---- current ---->|
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // | | offset
+ // v v
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // |<------ word ----->|
+ word >>= offset_;
+ word |= next_word << (sizeof(Word) * 8 - offset_);
+ }
+ current_word_ = next_word;
+ return word;
+ }
+
+ uint8_t NextTrailingByte(int& valid_bits) {
+ uint8_t byte;
+ assert(trailing_bits_ > 0);
+
+ if (trailing_bits_ <= 8) {
+ // last byte
+ valid_bits = trailing_bits_;
+ trailing_bits_ = 0;
+ byte = 0;
+ internal::BitmapReader reader(bitmap_, offset_, valid_bits);
+ for (int i = 0; i < valid_bits; ++i) {
+ byte >>= 1;
+ if (reader.IsSet()) {
+ byte |= 0x80;
+ }
+ reader.Next();
+ }
+ byte >>= (8 - valid_bits);
+ } else {
+ ++bitmap_;
+ const uint8_t next_byte = load<uint8_t>(bitmap_);
+ byte = current_byte_;
+ if (may_have_byte_offset && offset_) {
+ byte >>= offset_;
+ byte |= next_byte << (8 - offset_);
+ }
+ current_byte_ = next_byte;
+ trailing_bits_ -= 8;
+ trailing_bytes_--;
+ valid_bits = 8;
+ }
+ return byte;
+ }
+
+ int64_t words() const { return nwords_; }
+ int trailing_bytes() const { return trailing_bytes_; }
+
+ private:
+ int64_t offset_;
+ const uint8_t* bitmap_;
+
+ const uint8_t* bitmap_end_;
+ int64_t nwords_;
+ int trailing_bits_;
+ int trailing_bytes_;
+ union {
+ Word current_word_;
+ struct {
+#if ARROW_LITTLE_ENDIAN == 0
+ uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+ uint8_t current_byte_;
+ };
+ };
+
+ template <typename DType>
+ DType load(const uint8_t* bitmap) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+ }
+};
+
/// \brief Index into a possibly non-existent bitmap
struct OptionalBitIndexer {
const uint8_t* bitmap;
@@ -263,7 +263,7 @@ struct OptionalBitIndexer {
: bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}
bool operator[](int64_t i) const {
- return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
+ return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
}
};
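The intended BitmapWordReader loop drains words() full words, then trailing_bytes() partial bytes of which only the low valid_bits bits are meaningful. The same count as the sketch above, expressed through that two-phase contract (hedged, illustrative only):

    arrow::internal::BitmapWordReader<uint64_t> reader(bitmap, offset, length);
    int64_t ones = 0;
    for (int64_t n = reader.words(); n-- > 0;) {
      ones += static_cast<int64_t>(std::bitset<64>(reader.NextWord()).count());
    }
    for (int n = reader.trailing_bytes(); n-- > 0;) {
      int valid_bits;
      uint8_t byte = reader.NextTrailingByte(valid_bits);
      ones += static_cast<int64_t>(std::bitset<8>(byte).count());  // unused high bits are zero
    }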
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
index 8a16993e052..dc495d1135b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
@@ -1,88 +1,88 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_reader.h"
-
-namespace arrow {
-namespace internal {
-
-// A function that visits each bit in a bitmap and calls a visitor function with a
-// boolean representation of that bit. This is intended to be analogous to
-// GenerateBits.
-template <class Visitor>
-void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
- Visitor&& visit) {
- BitmapReader reader(bitmap, start_offset, length);
- for (int64_t index = 0; index < length; ++index) {
- visit(reader.IsSet());
- reader.Next();
- }
-}
-
-// Like VisitBits(), but unrolls its main loop for better performance.
-template <class Visitor>
-void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
- Visitor&& visit) {
- if (length == 0) {
- return;
- }
-
- // Start by visiting any bits preceding the first full byte.
- int64_t num_bits_before_full_bytes =
- BitUtil::RoundUpToMultipleOf8(start_offset) - start_offset;
- // Truncate num_bits_before_full_bytes if it is greater than length.
- if (num_bits_before_full_bytes > length) {
- num_bits_before_full_bytes = length;
- }
- // Use the non loop-unrolled VisitBits since we don't want to add branches
- VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
-
- // Shift the start pointer to the first full byte and compute the
- // number of full bytes to be read.
- const uint8_t* first_full_byte = bitmap + BitUtil::CeilDiv(start_offset, 8);
- const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
-
- // Iterate over each full byte of the input bitmap and call the visitor in
- // a loop-unrolled manner.
- for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
- // Get the current bit-packed byte value from the bitmap.
- const uint8_t byte = *(first_full_byte + byte_index);
-
- // Execute the visitor function on each bit of the current byte.
- visit(BitUtil::GetBitFromByte(byte, 0));
- visit(BitUtil::GetBitFromByte(byte, 1));
- visit(BitUtil::GetBitFromByte(byte, 2));
- visit(BitUtil::GetBitFromByte(byte, 3));
- visit(BitUtil::GetBitFromByte(byte, 4));
- visit(BitUtil::GetBitFromByte(byte, 5));
- visit(BitUtil::GetBitFromByte(byte, 6));
- visit(BitUtil::GetBitFromByte(byte, 7));
- }
-
- // Write any leftover bits in the last byte.
- const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
- VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
- visit);
-}
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+
+namespace arrow {
+namespace internal {
+
+// A function that visits each bit in a bitmap and calls a visitor function with a
+// boolean representation of that bit. This is intended to be analogous to
+// GenerateBits.
+template <class Visitor>
+void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+ Visitor&& visit) {
+ BitmapReader reader(bitmap, start_offset, length);
+ for (int64_t index = 0; index < length; ++index) {
+ visit(reader.IsSet());
+ reader.Next();
+ }
+}
+
+// Like VisitBits(), but unrolls its main loop for better performance.
+template <class Visitor>
+void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+ Visitor&& visit) {
+ if (length == 0) {
+ return;
+ }
+
+ // Start by visiting any bits preceding the first full byte.
+ int64_t num_bits_before_full_bytes =
+ BitUtil::RoundUpToMultipleOf8(start_offset) - start_offset;
+ // Truncate num_bits_before_full_bytes if it is greater than length.
+ if (num_bits_before_full_bytes > length) {
+ num_bits_before_full_bytes = length;
+ }
+ // Use the non loop-unrolled VisitBits since we don't want to add branches
+ VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
+
+ // Shift the start pointer to the first full byte and compute the
+ // number of full bytes to be read.
+ const uint8_t* first_full_byte = bitmap + BitUtil::CeilDiv(start_offset, 8);
+ const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
+
+ // Iterate over each full byte of the input bitmap and call the visitor in
+ // a loop-unrolled manner.
+ for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
+ // Get the current bit-packed byte value from the bitmap.
+ const uint8_t byte = *(first_full_byte + byte_index);
+
+ // Execute the visitor function on each bit of the current byte.
+ visit(BitUtil::GetBitFromByte(byte, 0));
+ visit(BitUtil::GetBitFromByte(byte, 1));
+ visit(BitUtil::GetBitFromByte(byte, 2));
+ visit(BitUtil::GetBitFromByte(byte, 3));
+ visit(BitUtil::GetBitFromByte(byte, 4));
+ visit(BitUtil::GetBitFromByte(byte, 5));
+ visit(BitUtil::GetBitFromByte(byte, 6));
+ visit(BitUtil::GetBitFromByte(byte, 7));
+ }
+
+ // Write any leftover bits in the last byte.
+ const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
+ VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
+ visit);
+}
+
+} // namespace internal
+} // namespace arrow
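A hedged usage sketch of the unrolled visitor (assuming bitmap_visit.h is included and bitmap points at a valid buffer of the given length):

    int64_t count = 0;
    arrow::internal::VisitBitsUnrolled(bitmap, offset, length,
                                       [&](bool set) { count += set; });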
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
index d5c6d909df0..096cfc8655a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
@@ -21,7 +21,7 @@
#include <cstring>
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
namespace arrow {
@@ -180,106 +180,106 @@ class FirstTimeBitmapWriter {
int64_t byte_offset_;
};
-template <typename Word, bool may_have_byte_offset = true>
-class BitmapWordWriter {
- public:
- BitmapWordWriter() = default;
- BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
- : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
- bitmap_(bitmap + offset / 8),
- bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
- mask_((1U << offset_) - 1) {
- if (offset_) {
- if (length >= static_cast<int>(sizeof(Word) * 8)) {
- current_word_ = load<Word>(bitmap_);
- } else if (length > 0) {
- current_byte_ = load<uint8_t>(bitmap_);
- }
- }
- }
-
- void PutNextWord(Word word) {
- if (may_have_byte_offset && offset_) {
- // split one word into two adjacent words, don't touch unused bits
- // |<------ word ----->|
- // +-----+-------------+
- // | A | B |
- // +-----+-------------+
- // | |
- // v v offset
- // +-------------+-----+-------------+-----+
- // | --- | A | B | --- |
- // +-------------+-----+-------------+-----+
- // |<------ next ----->|<---- current ---->|
- word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
- Word next_word = load<Word>(bitmap_ + sizeof(Word));
- current_word_ = (current_word_ & mask_) | (word & ~mask_);
- next_word = (next_word & ~mask_) | (word & mask_);
- store<Word>(bitmap_, current_word_);
- store<Word>(bitmap_ + sizeof(Word), next_word);
- current_word_ = next_word;
- } else {
- store<Word>(bitmap_, word);
- }
- bitmap_ += sizeof(Word);
- }
-
- void PutNextTrailingByte(uint8_t byte, int valid_bits) {
- if (valid_bits == 8) {
- if (may_have_byte_offset && offset_) {
- byte = (byte << offset_) | (byte >> (8 - offset_));
- uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
- current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
- next_byte = (next_byte & ~mask_) | (byte & mask_);
- store<uint8_t>(bitmap_, current_byte_);
- store<uint8_t>(bitmap_ + 1, next_byte);
- current_byte_ = next_byte;
- } else {
- store<uint8_t>(bitmap_, byte);
- }
- ++bitmap_;
- } else {
- assert(valid_bits > 0);
- assert(valid_bits < 8);
- assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
- internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
- for (int i = 0; i < valid_bits; ++i) {
- (byte & 0x01) ? writer.Set() : writer.Clear();
- writer.Next();
- byte >>= 1;
- }
- writer.Finish();
- }
- }
-
- private:
- int64_t offset_;
- uint8_t* bitmap_;
-
- const uint8_t* bitmap_end_;
- uint64_t mask_;
- union {
- Word current_word_;
- struct {
-#if ARROW_LITTLE_ENDIAN == 0
- uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
- uint8_t current_byte_;
- };
- };
-
- template <typename DType>
- DType load(const uint8_t* bitmap) {
- assert(bitmap + sizeof(DType) <= bitmap_end_);
- return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
- }
-
- template <typename DType>
- void store(uint8_t* bitmap, DType data) {
- assert(bitmap + sizeof(DType) <= bitmap_end_);
- util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
- }
-};
-
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordWriter {
+ public:
+ BitmapWordWriter() = default;
+ BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
+ : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+ bitmap_(bitmap + offset / 8),
+ bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
+ mask_((1U << offset_) - 1) {
+ if (offset_) {
+ if (length >= static_cast<int>(sizeof(Word) * 8)) {
+ current_word_ = load<Word>(bitmap_);
+ } else if (length > 0) {
+ current_byte_ = load<uint8_t>(bitmap_);
+ }
+ }
+ }
+
+ void PutNextWord(Word word) {
+ if (may_have_byte_offset && offset_) {
+ // split one word into two adjacent words, don't touch unused bits
+ // |<------ word ----->|
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // | |
+ // v v offset
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // |<------ next ----->|<---- current ---->|
+ word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
+ Word next_word = load<Word>(bitmap_ + sizeof(Word));
+ current_word_ = (current_word_ & mask_) | (word & ~mask_);
+ next_word = (next_word & ~mask_) | (word & mask_);
+ store<Word>(bitmap_, current_word_);
+ store<Word>(bitmap_ + sizeof(Word), next_word);
+ current_word_ = next_word;
+ } else {
+ store<Word>(bitmap_, word);
+ }
+ bitmap_ += sizeof(Word);
+ }
+
+ void PutNextTrailingByte(uint8_t byte, int valid_bits) {
+ if (valid_bits == 8) {
+ if (may_have_byte_offset && offset_) {
+ byte = (byte << offset_) | (byte >> (8 - offset_));
+ uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
+ current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
+ next_byte = (next_byte & ~mask_) | (byte & mask_);
+ store<uint8_t>(bitmap_, current_byte_);
+ store<uint8_t>(bitmap_ + 1, next_byte);
+ current_byte_ = next_byte;
+ } else {
+ store<uint8_t>(bitmap_, byte);
+ }
+ ++bitmap_;
+ } else {
+ assert(valid_bits > 0);
+ assert(valid_bits < 8);
+ assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
+ internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
+ for (int i = 0; i < valid_bits; ++i) {
+ (byte & 0x01) ? writer.Set() : writer.Clear();
+ writer.Next();
+ byte >>= 1;
+ }
+ writer.Finish();
+ }
+ }
+
+ private:
+ int64_t offset_;
+ uint8_t* bitmap_;
+
+ const uint8_t* bitmap_end_;
+ uint64_t mask_;
+ union {
+ Word current_word_;
+ struct {
+#if ARROW_LITTLE_ENDIAN == 0
+ uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+ uint8_t current_byte_;
+ };
+ };
+
+ template <typename DType>
+ DType load(const uint8_t* bitmap) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+ }
+
+ template <typename DType>
+ void store(uint8_t* bitmap, DType data) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
+ }
+};
+
} // namespace internal
} // namespace arrow
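
Aside for readers: the PutNextWord diagram above is easiest to verify on plain integers. Below is a minimal standalone sketch of the same split arithmetic (PutWordAtOffset is a hypothetical helper, not part of this diff; it also ignores the little-endian byte-swapping the real class performs in load/store):

#include <cstdint>
#include <cstdio>

// Write `word` at bit position `offset` (0 < offset < 64) across two
// adjacent 64-bit slots, preserving the bits outside the written range.
void PutWordAtOffset(uint64_t* slots, unsigned offset, uint64_t word) {
  const uint64_t mask = (uint64_t{1} << offset) - 1;  // low `offset` bits
  // Rotate left by `offset`: part B lands at bit `offset` of the current
  // slot, part A wraps around into the low bits for the next slot.
  const uint64_t rotated = (word << offset) | (word >> (64 - offset));
  slots[0] = (slots[0] & mask) | (rotated & ~mask);  // keep existing low bits
  slots[1] = (slots[1] & ~mask) | (rotated & mask);  // keep existing high bits
}

int main() {
  uint64_t slots[2] = {0x0000000000000005ULL, 0xF000000000000000ULL};
  PutWordAtOffset(slots, 3, ~uint64_t{0});  // write 64 one-bits at bit 3
  // Bits 0..2 of slots[0] and bits 3..63 of slots[1] are untouched:
  // prints "fffffffffffffffd f000000000000007".
  std::printf("%016llx %016llx\n", (unsigned long long)slots[0],
              (unsigned long long)slots[1]);
  return 0;
}
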
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
index d9cafd602a2..538b7382e43 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
@@ -27,9 +27,9 @@
#if defined(ARROW_HAVE_RUNTIME_AVX512)
#error #include "arrow/util/bpacking_avx512.h"
#endif
-#if defined(ARROW_HAVE_NEON)
-#error #include "arrow/util/bpacking_neon.h"
-#endif
+#if defined(ARROW_HAVE_NEON)
+#error #include "arrow/util/bpacking_neon.h"
+#endif
namespace arrow {
namespace internal {
@@ -166,12 +166,12 @@ struct Unpack32DynamicFunction {
} // namespace
int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
-#if defined(ARROW_HAVE_NEON)
- return unpack32_neon(in, out, batch_size, num_bits);
-#else
+#if defined(ARROW_HAVE_NEON)
+ return unpack32_neon(in, out, batch_size, num_bits);
+#else
static DynamicDispatch<Unpack32DynamicFunction> dispatch;
return dispatch.func(in, out, batch_size, num_bits);
-#endif
+#endif
}
} // namespace internal
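
Context for the hunk above: unpack32 expands batch_size bit-packed values of num_bits bits each into full uint32_t values, going through the NEON kernel when available and otherwise through the dynamic-dispatch path. A rough scalar sketch of that contract, assuming the LSB-first, little-endian layout used by Parquet-style bit-packing (UnpackScalar is a hypothetical name, not the function the dispatcher calls):

#include <cstdint>
#include <vector>

std::vector<uint32_t> UnpackScalar(const uint32_t* in, int batch_size,
                                   int num_bits) {
  // Assumes 1 <= num_bits <= 32.
  std::vector<uint32_t> out(batch_size);
  const uint64_t value_mask =
      (num_bits == 32) ? 0xFFFFFFFFu : ((1ULL << num_bits) - 1);
  int64_t bit_pos = 0;
  for (int i = 0; i < batch_size; ++i) {
    const int64_t word = bit_pos / 32;
    const int shift = static_cast<int>(bit_pos % 32);
    // A value may straddle two 32-bit words: widen to 64 bits, then shift.
    uint64_t window = in[word];
    if (shift + num_bits > 32) {
      window |= static_cast<uint64_t>(in[word + 1]) << 32;
    }
    out[i] = static_cast<uint32_t>((window >> shift) & value_mask);
    bit_pos += num_bits;
  }
  return out;
}
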
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
index e5a4dbbed89..7f4ca3e384c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
@@ -17,7 +17,7 @@
#pragma once
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/visibility.h"
#include <stdint.h>
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
index 28dcce52bb8..53627aee18a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
@@ -1,626 +1,626 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/util/simd.h"
-#include "arrow/util/ubsan.h"
-
-#include <stdint.h>
-#include <algorithm>
-
-#ifdef ARROW_HAVE_SSE4_2
-// Enable the SIMD for ByteStreamSplit Encoder/Decoder
-#define ARROW_HAVE_SIMD_SPLIT
-#endif // ARROW_HAVE_SSE4_2
-
-namespace arrow {
-namespace util {
-namespace internal {
-
-#if defined(ARROW_HAVE_SSE4_2)
-template <typename T>
-void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
-
- const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
- const int64_t num_blocks = size / kBlockSize;
- uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
-
- // First handle suffix.
-  // This helps catch cases where the SIMD-based processing overflows into
-  // the suffix, since a test would then almost surely fail.
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- gathered_byte_data[b] = data[byte_index];
- }
- out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
- }
-
- // The blocks get processed hierarchically using the unpack intrinsics.
- // Example with four streams:
- // Stage 1: AAAA BBBB CCCC DDDD
- // Stage 2: ACAC ACAC BDBD BDBD
- // Stage 3: ABCD ABCD ABCD ABCD
- __m128i stage[kNumStreamsLog2 + 1U][kNumStreams];
- constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
-
- for (int64_t i = 0; i < num_blocks; ++i) {
- for (size_t j = 0; j < kNumStreams; ++j) {
- stage[0][j] = _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
- }
- for (size_t step = 0; step < kNumStreamsLog2; ++step) {
- for (size_t j = 0; j < kNumStreamsHalf; ++j) {
- stage[step + 1U][j * 2] =
- _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- stage[step + 1U][j * 2 + 1U] =
- _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- }
- }
- for (size_t j = 0; j < kNumStreams; ++j) {
- _mm_storeu_si128(reinterpret_cast<__m128i*>(
- &output_data[(i * kNumStreams + j) * sizeof(__m128i)]),
- stage[kNumStreamsLog2][j]);
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- __m128i stage[3][kNumStreams];
- __m128i final_result[kNumStreams];
-
- const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
- const size_t num_blocks = size / kBlockSize;
- const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
- __m128i* output_buffer_streams[kNumStreams];
- for (size_t i = 0; i < kNumStreams; ++i) {
- output_buffer_streams[i] =
- reinterpret_cast<__m128i*>(&output_buffer_raw[num_values * i]);
- }
-
- // First handle suffix.
- const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
- for (size_t i = num_processed_elements; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
- // The current shuffling algorithm diverges for float and double types but the compiler
- // should be able to remove the branch since only one path is taken for each template
- // instantiation.
- // Example run for floats:
- // Step 0, copy:
- // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
-  // Step 1: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
-  //   0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ...
-  // Step 2: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
-  //   0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ...
-  // Step 3: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
-  //   0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ...
-  // Step 4: _mm_unpacklo_epi64 and _mm_unpackhi_epi64:
- // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ...
- for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
- // First copy the data to stage 0.
- for (size_t i = 0; i < kNumStreams; ++i) {
- stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * kNumStreams + i]);
- }
-
- // The shuffling of bytes is performed through the unpack intrinsics.
-    // In my measurements this gives better performance than an implementation
- // which uses the shuffle intrinsics.
- for (size_t stage_lvl = 0; stage_lvl < 2U; ++stage_lvl) {
- for (size_t i = 0; i < kNumStreams / 2U; ++i) {
- stage[stage_lvl + 1][i * 2] =
- _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- stage[stage_lvl + 1][i * 2 + 1] =
- _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- }
- }
- if (kNumStreams == 8U) {
- // This is the path for double.
- __m128i tmp[8];
- for (size_t i = 0; i < 4; ++i) {
- tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]);
- tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]);
- }
-
- for (size_t i = 0; i < 4; ++i) {
- final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]);
- final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]);
- }
- } else {
-      // This is the path for float.
- __m128i tmp[4];
- for (size_t i = 0; i < 2; ++i) {
- tmp[i * 2] = _mm_unpacklo_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
- tmp[i * 2 + 1] = _mm_unpackhi_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
- }
- for (size_t i = 0; i < 2; ++i) {
- final_result[i * 2] = _mm_unpacklo_epi64(tmp[i], tmp[i + 2]);
- final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]);
- }
- }
- for (size_t i = 0; i < kNumStreams; ++i) {
- _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]);
- }
- }
-}
-#endif // ARROW_HAVE_SSE4_2
-
-#if defined(ARROW_HAVE_AVX2)
-template <typename T>
-void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
-
- const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
- if (size < kBlockSize) // Back to SSE for small size
- return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
- const int64_t num_blocks = size / kBlockSize;
- uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
-
- // First handle suffix.
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- gathered_byte_data[b] = data[byte_index];
- }
- out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
- }
-
- // Processed hierarchically using unpack intrinsics, then permute intrinsics.
- __m256i stage[kNumStreamsLog2 + 1U][kNumStreams];
- __m256i final_result[kNumStreams];
- constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
-
- for (int64_t i = 0; i < num_blocks; ++i) {
- for (size_t j = 0; j < kNumStreams; ++j) {
- stage[0][j] = _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(&data[i * sizeof(__m256i) + j * stride]));
- }
-
- for (size_t step = 0; step < kNumStreamsLog2; ++step) {
- for (size_t j = 0; j < kNumStreamsHalf; ++j) {
- stage[step + 1U][j * 2] =
- _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- stage[step + 1U][j * 2 + 1U] =
- _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- }
- }
-
- if (kNumStreams == 8U) {
- // path for double, 128i index:
- // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
- // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
- final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00100000);
- final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00100000);
- final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b00100000);
- final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b00100000);
- final_result[4] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00110001);
- final_result[5] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00110001);
- final_result[6] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b00110001);
- final_result[7] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b00110001);
- } else {
- // path for float, 128i index:
- // {0x00, 0x04}, {0x01, 0x05}, {0x02, 0x06}, {0x03, 0x07}
- final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00100000);
- final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00100000);
- final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00110001);
- final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00110001);
- }
-
- for (size_t j = 0; j < kNumStreams; ++j) {
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(
- &output_data[(i * kNumStreams + j) * sizeof(__m256i)]),
- final_result[j]);
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- if (kNumStreams == 8U) // Back to SSE, currently no path for double.
- return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
-
- const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
- if (size < kBlockSize) // Back to SSE for small size
- return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
- const size_t num_blocks = size / kBlockSize;
- const __m256i* raw_values_simd = reinterpret_cast<const __m256i*>(raw_values);
- __m256i* output_buffer_streams[kNumStreams];
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- output_buffer_streams[i] =
- reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]);
- }
-
- // First handle suffix.
- const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
- for (size_t i = num_processed_elements; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
-
- // Path for float.
-  // 1. Processed hierarchically to 32i block using the unpack intrinsics.
- // 2. Pack 128i block using _mm256_permutevar8x32_epi32.
- // 3. Pack final 256i block with _mm256_permute2x128_si256.
- constexpr size_t kNumUnpack = 3U;
- __m256i stage[kNumUnpack + 1][kNumStreams];
- static const __m256i kPermuteMask =
- _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
- __m256i permute[kNumStreams];
- __m256i final_result[kNumStreams];
-
- for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
- for (size_t i = 0; i < kNumStreams; ++i) {
- stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]);
- }
-
- for (size_t stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) {
- for (size_t i = 0; i < kNumStreams / 2U; ++i) {
- stage[stage_lvl + 1][i * 2] =
- _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- stage[stage_lvl + 1][i * 2 + 1] =
- _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- }
- }
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask);
- }
-
- final_result[0] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00100000);
- final_result[1] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00110001);
- final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000);
- final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001);
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]);
- }
- }
-}
-#endif // ARROW_HAVE_AVX2
-
-#if defined(ARROW_HAVE_AVX512)
-template <typename T>
-void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
-
- const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
- if (size < kBlockSize) // Back to AVX2 for small size
- return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
- const int64_t num_blocks = size / kBlockSize;
- uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
-
- // First handle suffix.
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- gathered_byte_data[b] = data[byte_index];
- }
- out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
- }
-
-  // Processed hierarchically using the unpack intrinsics, then two shuffles.
- __m512i stage[kNumStreamsLog2 + 1U][kNumStreams];
- __m512i shuffle[kNumStreams];
- __m512i final_result[kNumStreams];
- constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
-
- for (int64_t i = 0; i < num_blocks; ++i) {
- for (size_t j = 0; j < kNumStreams; ++j) {
- stage[0][j] = _mm512_loadu_si512(
- reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
- }
-
- for (size_t step = 0; step < kNumStreamsLog2; ++step) {
- for (size_t j = 0; j < kNumStreamsHalf; ++j) {
- stage[step + 1U][j * 2] =
- _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- stage[step + 1U][j * 2 + 1U] =
- _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- }
- }
-
- if (kNumStreams == 8U) {
- // path for double, 128i index:
- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
- shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b01000100);
- shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b01000100);
- shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b01000100);
- shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b01000100);
- shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b11101110);
- shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b11101110);
- shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b11101110);
- shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b11101110);
-
- final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
- final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
- final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
- final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
- final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
- final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
- final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
- final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
- } else {
- // path for float, 128i index:
- // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
- // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
- shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b01000100);
- shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b01000100);
- shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b11101110);
- shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b11101110);
-
- final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
- final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
- final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
- final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
- }
-
- for (size_t j = 0; j < kNumStreams; ++j) {
- _mm512_storeu_si512(reinterpret_cast<__m512i*>(
- &output_data[(i * kNumStreams + j) * sizeof(__m512i)]),
- final_result[j]);
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
- if (size < kBlockSize) // Back to AVX2 for small size
- return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
-
- const size_t num_blocks = size / kBlockSize;
- const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
- __m512i* output_buffer_streams[kNumStreams];
- for (size_t i = 0; i < kNumStreams; ++i) {
- output_buffer_streams[i] =
- reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
- }
-
- // First handle suffix.
- const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
- for (size_t i = num_processed_elements; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
-
- constexpr size_t KNumUnpack = (kNumStreams == 8U) ? 2U : 3U;
- __m512i final_result[kNumStreams];
- __m512i unpack[KNumUnpack + 1][kNumStreams];
- __m512i permutex[kNumStreams];
- __m512i permutex_mask;
- if (kNumStreams == 8U) {
-    // Use _mm512_set_epi32 here: some older gcc versions lack _mm512_set_epi16.
- permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
- 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
- 0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
- 0x00190011, 0x00090001, 0x00180010, 0x00080000);
- } else {
- permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
- 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
- }
-
- for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
- for (size_t i = 0; i < kNumStreams; ++i) {
- unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
- }
-
- for (size_t unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) {
- for (size_t i = 0; i < kNumStreams / 2U; ++i) {
- unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
- unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
- unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
- unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
- }
- }
-
- if (kNumStreams == 8U) {
- // path for double
- // 1. unpack to epi16 block
- // 2. permutexvar_epi16 to 128i block
- // 3. shuffle 128i to final 512i target, index:
- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
- for (size_t i = 0; i < kNumStreams; ++i)
- permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]);
-
- __m512i shuffle[kNumStreams];
- shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
- shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
- shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
- shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
- shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
- shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
- shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
- shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);
-
- final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
- final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
- final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
- final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
- final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
- final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
- final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
- final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
- } else {
-      // Path for float.
-      // 1. Processed hierarchically to 32i block using the unpack intrinsics.
-      // 2. Pack 128i blocks using _mm512_permutexvar_epi32.
-      // 3. Pack the final 512i result with _mm512_shuffle_i32x4.
- for (size_t i = 0; i < kNumStreams; ++i)
- permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]);
-
- final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
- final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
- final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
- final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
- }
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
- }
- }
-}
-#endif // ARROW_HAVE_AVX512
-
-#if defined(ARROW_HAVE_SIMD_SPLIT)
-template <typename T>
-void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
- int64_t stride, T* out) {
-#if defined(ARROW_HAVE_AVX512)
- return ByteStreamSplitDecodeAvx512(data, num_values, stride, out);
-#elif defined(ARROW_HAVE_AVX2)
- return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
-#elif defined(ARROW_HAVE_SSE4_2)
- return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
-#else
-#error "ByteStreamSplitDecodeSimd not implemented"
-#endif
-}
-
-template <typename T>
-void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
-#if defined(ARROW_HAVE_AVX512)
- return ByteStreamSplitEncodeAvx512<T>(raw_values, num_values, output_buffer_raw);
-#elif defined(ARROW_HAVE_AVX2)
- return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
-#elif defined(ARROW_HAVE_SSE4_2)
- return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
-#else
-#error "ByteStreamSplitEncodeSimd not implemented"
-#endif
-}
-#endif
-
-template <typename T>
-void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- for (size_t i = 0U; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- auto output_buffer_raw = reinterpret_cast<uint8_t*>(out);
-
- for (int64_t i = 0; i < num_values; ++i) {
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- output_buffer_raw[i * kNumStreams + b] = data[byte_index];
- }
- }
-}
-
-template <typename T>
-void inline ByteStreamSplitEncode(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
-#if defined(ARROW_HAVE_SIMD_SPLIT)
- return ByteStreamSplitEncodeSimd<T>(raw_values, num_values, output_buffer_raw);
-#else
- return ByteStreamSplitEncodeScalar<T>(raw_values, num_values, output_buffer_raw);
-#endif
-}
-
-template <typename T>
-void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
-#if defined(ARROW_HAVE_SIMD_SPLIT)
- return ByteStreamSplitDecodeSimd(data, num_values, stride, out);
-#else
- return ByteStreamSplitDecodeScalar(data, num_values, stride, out);
-#endif
-}
-
-} // namespace internal
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/simd.h"
+#include "arrow/util/ubsan.h"
+
+#include <stdint.h>
+#include <algorithm>
+
+#ifdef ARROW_HAVE_SSE4_2
+// Enable the SIMD for ByteStreamSplit Encoder/Decoder
+#define ARROW_HAVE_SIMD_SPLIT
+#endif // ARROW_HAVE_SSE4_2
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+#if defined(ARROW_HAVE_SSE4_2)
+template <typename T>
+void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+  // This helps catch cases where the SIMD-based processing overflows into
+  // the suffix, since a test would then almost surely fail.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+ // The blocks get processed hierarchically using the unpack intrinsics.
+ // Example with four streams:
+ // Stage 1: AAAA BBBB CCCC DDDD
+ // Stage 2: ACAC ACAC BDBD BDBD
+ // Stage 3: ABCD ABCD ABCD ABCD
+ __m128i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm_loadu_si128(
+ reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
+ }
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m128i)]),
+ stage[kNumStreamsLog2][j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ __m128i stage[3][kNumStreams];
+ __m128i final_result[kNumStreams];
+
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
+ const size_t num_blocks = size / kBlockSize;
+ const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
+ __m128i* output_buffer_streams[kNumStreams];
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m128i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+ // The current shuffling algorithm diverges for float and double types but the compiler
+ // should be able to remove the branch since only one path is taken for each template
+ // instantiation.
+ // Example run for floats:
+ // Step 0, copy:
+ // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
+  // Step 1: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+  //   0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ...
+  // Step 2: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+  //   0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ...
+  // Step 3: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+  //   0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ...
+  // Step 4: _mm_unpacklo_epi64 and _mm_unpackhi_epi64:
+ // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ...
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ // First copy the data to stage 0.
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * kNumStreams + i]);
+ }
+
+ // The shuffling of bytes is performed through the unpack intrinsics.
+    // In my measurements this gives better performance than an implementation
+ // which uses the shuffle intrinsics.
+ for (size_t stage_lvl = 0; stage_lvl < 2U; ++stage_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ stage[stage_lvl + 1][i * 2] =
+ _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ stage[stage_lvl + 1][i * 2 + 1] =
+ _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ }
+ }
+ if (kNumStreams == 8U) {
+ // This is the path for double.
+ __m128i tmp[8];
+ for (size_t i = 0; i < 4; ++i) {
+ tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]);
+ tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]);
+ }
+
+ for (size_t i = 0; i < 4; ++i) {
+ final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]);
+ final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]);
+ }
+ } else {
+      // This is the path for float.
+ __m128i tmp[4];
+ for (size_t i = 0; i < 2; ++i) {
+ tmp[i * 2] = _mm_unpacklo_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
+ tmp[i * 2 + 1] = _mm_unpackhi_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
+ }
+ for (size_t i = 0; i < 2; ++i) {
+ final_result[i * 2] = _mm_unpacklo_epi64(tmp[i], tmp[i + 2]);
+ final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]);
+ }
+ }
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_SSE4_2
+
+#if defined(ARROW_HAVE_AVX2)
+template <typename T>
+void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
+ if (size < kBlockSize) // Back to SSE for small size
+ return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+ // Processed hierarchically using unpack intrinsics, then permute intrinsics.
+ __m256i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ __m256i final_result[kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(&data[i * sizeof(__m256i) + j * stride]));
+ }
+
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double, 128i index:
+ // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
+ // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
+ final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00100000);
+ final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b00100000);
+ final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b00100000);
+ final_result[4] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00110001);
+ final_result[5] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00110001);
+ final_result[6] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b00110001);
+ final_result[7] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b00110001);
+ } else {
+ // path for float, 128i index:
+ // {0x00, 0x04}, {0x01, 0x05}, {0x02, 0x06}, {0x03, 0x07}
+      // Split one word into two adjacent words; don't touch the unused bits.
+ stage[kNumStreamsLog2][1], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00100000);
+ final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00110001);
+ final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00110001);
+ }
+
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m256i)]),
+ final_result[j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ if (kNumStreams == 8U) // Back to SSE, currently no path for double.
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
+ if (size < kBlockSize) // Back to SSE for small size
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+ const size_t num_blocks = size / kBlockSize;
+ const __m256i* raw_values_simd = reinterpret_cast<const __m256i*>(raw_values);
+ __m256i* output_buffer_streams[kNumStreams];
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+
+ // Path for float.
+  // 1. Processed hierarchically to 32i block using the unpack intrinsics.
+ // 2. Pack 128i block using _mm256_permutevar8x32_epi32.
+ // 3. Pack final 256i block with _mm256_permute2x128_si256.
+ constexpr size_t kNumUnpack = 3U;
+ __m256i stage[kNumUnpack + 1][kNumStreams];
+ static const __m256i kPermuteMask =
+ _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+ __m256i permute[kNumStreams];
+ __m256i final_result[kNumStreams];
+
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]);
+ }
+
+ for (size_t stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ stage[stage_lvl + 1][i * 2] =
+ _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ stage[stage_lvl + 1][i * 2 + 1] =
+ _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ }
+ }
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask);
+ }
+
+ final_result[0] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00110001);
+ final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000);
+ final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001);
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_AVX2
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
+ if (size < kBlockSize) // Back to AVX2 for small size
+ return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+  // Processed hierarchically using the unpack intrinsics, then two shuffles.
+ __m512i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ __m512i shuffle[kNumStreams];
+ __m512i final_result[kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm512_loadu_si512(
+ reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
+ }
+
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double, 128i index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+ // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+ // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+ shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b01000100);
+ shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b01000100);
+ shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b11101110);
+ shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b11101110);
+ shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b11101110);
+ shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+ final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+ final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+ final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+ } else {
+ // path for float, 128i index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
+ shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b11101110);
+ shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ }
+
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm512_storeu_si512(reinterpret_cast<__m512i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m512i)]),
+ final_result[j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
+ if (size < kBlockSize) // Back to AVX2 for small size
+ return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+
+ const size_t num_blocks = size / kBlockSize;
+ const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
+ __m512i* output_buffer_streams[kNumStreams];
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+
+ constexpr size_t KNumUnpack = (kNumStreams == 8U) ? 2U : 3U;
+ __m512i final_result[kNumStreams];
+ __m512i unpack[KNumUnpack + 1][kNumStreams];
+ __m512i permutex[kNumStreams];
+ __m512i permutex_mask;
+ if (kNumStreams == 8U) {
+    // Use _mm512_set_epi32 here: some older gcc versions lack _mm512_set_epi16.
+ permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
+ 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
+ 0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
+ 0x00190011, 0x00090001, 0x00180010, 0x00080000);
+ } else {
+ permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
+ 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
+ }
+
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
+ }
+
+ for (size_t unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
+ unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+ unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
+ unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double
+ // 1. unpack to epi16 block
+ // 2. permutexvar_epi16 to 128i block
+ // 3. shuffle 128i to final 512i target, index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+ // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+ // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+ for (size_t i = 0; i < kNumStreams; ++i)
+ permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]);
+
+ __m512i shuffle[kNumStreams];
+ shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+ shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
+ shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+ shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
+ shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+ shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+ final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+ final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+ final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+ } else {
+      // Path for float.
+      // 1. Processed hierarchically to 32i block using the unpack intrinsics.
+      // 2. Pack 128i blocks using _mm512_permutexvar_epi32.
+      // 3. Pack the final 512i result with _mm512_shuffle_i32x4.
+ for (size_t i = 0; i < kNumStreams; ++i)
+ permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]);
+
+ final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+ final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+ final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+ final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+ }
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_AVX512
+
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+template <typename T>
+void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
+ int64_t stride, T* out) {
+#if defined(ARROW_HAVE_AVX512)
+ return ByteStreamSplitDecodeAvx512(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_AVX2)
+ return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_SSE4_2)
+ return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+#else
+#error "ByteStreamSplitDecodeSimd not implemented"
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_AVX512)
+ return ByteStreamSplitEncodeAvx512<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_AVX2)
+ return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_SSE4_2)
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+#else
+#error "ByteStreamSplitEncodeSimd not implemented"
+#endif
+}
+#endif
+
+template <typename T>
+void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ for (size_t i = 0U; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ auto output_buffer_raw = reinterpret_cast<uint8_t*>(out);
+
+ for (int64_t i = 0; i < num_values; ++i) {
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ output_buffer_raw[i * kNumStreams + b] = data[byte_index];
+ }
+ }
+}
+
+template <typename T>
+void inline ByteStreamSplitEncode(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ return ByteStreamSplitEncodeSimd<T>(raw_values, num_values, output_buffer_raw);
+#else
+ return ByteStreamSplitEncodeScalar<T>(raw_values, num_values, output_buffer_raw);
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ return ByteStreamSplitDecodeSimd(data, num_values, stride, out);
+#else
+ return ByteStreamSplitDecodeScalar(data, num_values, stride, out);
+#endif
+}
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
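
To make the layout concrete: encoding scatters byte j of value i to position j * num_values + i, so each byte position of the original values becomes one contiguous stream; decoding gathers them back with stride = num_values. A self-contained round-trip sketch of those two scalar loops (it re-inlines them rather than calling the header's templates):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const float values[4] = {1.5f, -2.25f, 3.0f, 0.125f};
  const size_t num_values = 4;
  const size_t num_streams = sizeof(float);  // one stream per byte of the type

  // Encode: all first bytes, then all second bytes, and so on.
  std::vector<uint8_t> encoded(num_values * num_streams);
  const uint8_t* raw = reinterpret_cast<const uint8_t*>(values);
  for (size_t i = 0; i < num_values; ++i) {
    for (size_t j = 0; j < num_streams; ++j) {
      encoded[j * num_values + i] = raw[i * num_streams + j];
    }
  }

  // Decode: gather value i's bytes back from the num_streams streams.
  float decoded[4];
  uint8_t* out = reinterpret_cast<uint8_t*>(decoded);
  for (size_t i = 0; i < num_values; ++i) {
    for (size_t b = 0; b < num_streams; ++b) {
      out[i * num_streams + b] = encoded[b * num_values + i];
    }
  }

  std::printf("round trip ok: %d\n",
              std::memcmp(values, decoded, sizeof(values)) == 0);
  return 0;
}

Grouping equal byte positions together is what makes this encoding compress well for floating-point data: the high bytes (sign and exponent) of similar values are often identical, so they form long runs.
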
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
index 874b2c2c886..671280a0a17 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
@@ -1,226 +1,226 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/util/cancel.h"
-
-#include <atomic>
-#include <mutex>
-#include <sstream>
-#include <utility>
-
-#include "arrow/result.h"
-#include "arrow/util/atomic_shared_ptr.h"
-#include "arrow/util/io_util.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-#if ATOMIC_INT_LOCK_FREE != 2
-#error Lock-free atomic int required for signal safety
-#endif
-
-using internal::ReinstateSignalHandler;
-using internal::SetSignalHandler;
-using internal::SignalHandler;
-
-// NOTE: We care mainly about making the common case (not cancelled) fast.
-
-struct StopSourceImpl {
- std::atomic<int> requested_{0}; // will be -1 or signal number if requested
- std::mutex mutex_;
- Status cancel_error_;
-};
-
-StopSource::StopSource() : impl_(new StopSourceImpl) {}
-
-StopSource::~StopSource() = default;
-
-void StopSource::RequestStop() { RequestStop(Status::Cancelled("Operation cancelled")); }
-
-void StopSource::RequestStop(Status st) {
- std::lock_guard<std::mutex> lock(impl_->mutex_);
- DCHECK(!st.ok());
- if (!impl_->requested_) {
- impl_->requested_ = -1;
- impl_->cancel_error_ = std::move(st);
- }
-}
-
-void StopSource::RequestStopFromSignal(int signum) {
- // Only async-signal-safe code allowed here
- impl_->requested_.store(signum);
-}
-
-void StopSource::Reset() {
- std::lock_guard<std::mutex> lock(impl_->mutex_);
- impl_->cancel_error_ = Status::OK();
- impl_->requested_.store(0);
-}
-
-StopToken StopSource::token() { return StopToken(impl_); }
-
-bool StopToken::IsStopRequested() const {
- if (!impl_) {
- return false;
- }
- return impl_->requested_.load() != 0;
-}
-
-Status StopToken::Poll() const {
- if (!impl_) {
- return Status::OK();
- }
- if (!impl_->requested_.load()) {
- return Status::OK();
- }
-
- std::lock_guard<std::mutex> lock(impl_->mutex_);
- if (impl_->cancel_error_.ok()) {
- auto signum = impl_->requested_.load();
- DCHECK_GT(signum, 0);
- impl_->cancel_error_ = internal::CancelledFromSignal(signum, "Operation cancelled");
- }
- return impl_->cancel_error_;
-}
-
-namespace {
-
-struct SignalStopState {
- struct SavedSignalHandler {
- int signum;
- SignalHandler handler;
- };
-
- Status RegisterHandlers(const std::vector<int>& signals) {
- if (!saved_handlers_.empty()) {
- return Status::Invalid("Signal handlers already registered");
- }
- for (int signum : signals) {
- ARROW_ASSIGN_OR_RAISE(auto handler,
- SetSignalHandler(signum, SignalHandler{&HandleSignal}));
- saved_handlers_.push_back({signum, handler});
- }
- return Status::OK();
- }
-
- void UnregisterHandlers() {
- auto handlers = std::move(saved_handlers_);
- for (const auto& h : handlers) {
- ARROW_CHECK_OK(SetSignalHandler(h.signum, h.handler).status());
- }
- }
-
- ~SignalStopState() {
- UnregisterHandlers();
- Disable();
- }
-
- StopSource* stop_source() { return stop_source_.get(); }
-
- bool enabled() { return stop_source_ != nullptr; }
-
- void Enable() {
- // Before creating a new StopSource, delete any lingering reference to
- // the previous one in the trash can. See DoHandleSignal() for details.
- EmptyTrashCan();
- internal::atomic_store(&stop_source_, std::make_shared<StopSource>());
- }
-
- void Disable() { internal::atomic_store(&stop_source_, NullSource()); }
-
- static SignalStopState* instance() { return &instance_; }
-
- private:
- // For readability
- std::shared_ptr<StopSource> NullSource() { return nullptr; }
-
- void EmptyTrashCan() { internal::atomic_store(&trash_can_, NullSource()); }
-
- static void HandleSignal(int signum) { instance_.DoHandleSignal(signum); }
-
- void DoHandleSignal(int signum) {
- // async-signal-safe code only
- auto source = internal::atomic_load(&stop_source_);
- if (source) {
- source->RequestStopFromSignal(signum);
- // Disable() may have been called in the meantime, but we can't
- // deallocate a shared_ptr here, so instead move it to a "trash can".
- // This minimizes the possibility of running a deallocator here;
- // however, it doesn't entirely preclude it.
- //
- // Possible case:
- // - a signal handler (A) starts running, fetches the current source
- // - Disable() then Enable() are called, emptying the trash can and
- // replacing the current source
- // - a signal handler (B) starts running, fetches the current source
- // - signal handler A resumes, moves its source (the old source) into
- // the trash can (the only remaining reference)
- // - signal handler B resumes, moves its source (the current source)
- // into the trash can. This triggers deallocation of the old source,
- // since the trash can had the only remaining reference to it.
- //
- // This case should be sufficiently unlikely, but we cannot entirely
- // rule it out. The problem might be solved properly with a lock-free
- // linked list of StopSources.
- internal::atomic_store(&trash_can_, std::move(source));
- }
- ReinstateSignalHandler(signum, &HandleSignal);
- }
-
- std::shared_ptr<StopSource> stop_source_;
- std::shared_ptr<StopSource> trash_can_;
-
- std::vector<SavedSignalHandler> saved_handlers_;
-
- static SignalStopState instance_;
-};
-
-SignalStopState SignalStopState::instance_{};
-
-} // namespace
-
-Result<StopSource*> SetSignalStopSource() {
- auto stop_state = SignalStopState::instance();
- if (stop_state->enabled()) {
- return Status::Invalid("Signal stop source already set up");
- }
- stop_state->Enable();
- return stop_state->stop_source();
-}
-
-void ResetSignalStopSource() {
- auto stop_state = SignalStopState::instance();
- DCHECK(stop_state->enabled());
- stop_state->Disable();
-}
-
-Status RegisterCancellingSignalHandler(const std::vector<int>& signals) {
- auto stop_state = SignalStopState::instance();
- if (!stop_state->enabled()) {
- return Status::Invalid("Signal stop source was not set up");
- }
- return stop_state->RegisterHandlers(signals);
-}
-
-void UnregisterCancellingSignalHandler() {
- auto stop_state = SignalStopState::instance();
- DCHECK(stop_state->enabled());
- stop_state->UnregisterHandlers();
-}
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/cancel.h"
+
+#include <atomic>
+#include <mutex>
+#include <sstream>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/util/atomic_shared_ptr.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+#if ATOMIC_INT_LOCK_FREE != 2
+#error Lock-free atomic int required for signal safety
+#endif
+
+using internal::ReinstateSignalHandler;
+using internal::SetSignalHandler;
+using internal::SignalHandler;
+
+// NOTE: We care mainly about making the common case (not cancelled) fast.
+
+struct StopSourceImpl {
+ std::atomic<int> requested_{0}; // will be -1 or signal number if requested
+ std::mutex mutex_;
+ Status cancel_error_;
+};
+
+StopSource::StopSource() : impl_(new StopSourceImpl) {}
+
+StopSource::~StopSource() = default;
+
+void StopSource::RequestStop() { RequestStop(Status::Cancelled("Operation cancelled")); }
+
+void StopSource::RequestStop(Status st) {
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ DCHECK(!st.ok());
+ if (!impl_->requested_) {
+ impl_->requested_ = -1;
+ impl_->cancel_error_ = std::move(st);
+ }
+}
+
+void StopSource::RequestStopFromSignal(int signum) {
+ // Only async-signal-safe code allowed here
+ impl_->requested_.store(signum);
+}
+
+void StopSource::Reset() {
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ impl_->cancel_error_ = Status::OK();
+ impl_->requested_.store(0);
+}
+
+StopToken StopSource::token() { return StopToken(impl_); }
+
+bool StopToken::IsStopRequested() const {
+ if (!impl_) {
+ return false;
+ }
+ return impl_->requested_.load() != 0;
+}
+
+Status StopToken::Poll() const {
+ if (!impl_) {
+ return Status::OK();
+ }
+ if (!impl_->requested_.load()) {
+ return Status::OK();
+ }
+
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ if (impl_->cancel_error_.ok()) {
+ auto signum = impl_->requested_.load();
+ DCHECK_GT(signum, 0);
+ impl_->cancel_error_ = internal::CancelledFromSignal(signum, "Operation cancelled");
+ }
+ return impl_->cancel_error_;
+}
+
+namespace {
+
+struct SignalStopState {
+ struct SavedSignalHandler {
+ int signum;
+ SignalHandler handler;
+ };
+
+ Status RegisterHandlers(const std::vector<int>& signals) {
+ if (!saved_handlers_.empty()) {
+ return Status::Invalid("Signal handlers already registered");
+ }
+ for (int signum : signals) {
+ ARROW_ASSIGN_OR_RAISE(auto handler,
+ SetSignalHandler(signum, SignalHandler{&HandleSignal}));
+ saved_handlers_.push_back({signum, handler});
+ }
+ return Status::OK();
+ }
+
+ void UnregisterHandlers() {
+ auto handlers = std::move(saved_handlers_);
+ for (const auto& h : handlers) {
+ ARROW_CHECK_OK(SetSignalHandler(h.signum, h.handler).status());
+ }
+ }
+
+ ~SignalStopState() {
+ UnregisterHandlers();
+ Disable();
+ }
+
+ StopSource* stop_source() { return stop_source_.get(); }
+
+ bool enabled() { return stop_source_ != nullptr; }
+
+ void Enable() {
+ // Before creating a new StopSource, delete any lingering reference to
+ // the previous one in the trash can. See DoHandleSignal() for details.
+ EmptyTrashCan();
+ internal::atomic_store(&stop_source_, std::make_shared<StopSource>());
+ }
+
+ void Disable() { internal::atomic_store(&stop_source_, NullSource()); }
+
+ static SignalStopState* instance() { return &instance_; }
+
+ private:
+ // For readability
+ std::shared_ptr<StopSource> NullSource() { return nullptr; }
+
+ void EmptyTrashCan() { internal::atomic_store(&trash_can_, NullSource()); }
+
+ static void HandleSignal(int signum) { instance_.DoHandleSignal(signum); }
+
+ void DoHandleSignal(int signum) {
+ // async-signal-safe code only
+ auto source = internal::atomic_load(&stop_source_);
+ if (source) {
+ source->RequestStopFromSignal(signum);
+ // Disable() may have been called in the meantime, but we can't
+ // deallocate a shared_ptr here, so instead move it to a "trash can".
+ // This minimizes the possibility of running a deallocator here;
+ // however, it doesn't entirely preclude it.
+ //
+ // Possible case:
+ // - a signal handler (A) starts running, fetches the current source
+ // - Disable() then Enable() are called, emptying the trash can and
+ // replacing the current source
+ // - a signal handler (B) starts running, fetches the current source
+ // - signal handler A resumes, moves its source (the old source) into
+ // the trash can (the only remaining reference)
+ // - signal handler B resumes, moves its source (the current source)
+ // into the trash can. This triggers deallocation of the old source,
+ // since the trash can had the only remaining reference to it.
+ //
+ // This case should be sufficiently unlikely, but we cannot entirely
+ // rule it out. The problem might be solved properly with a lock-free
+ // linked list of StopSources.
+ internal::atomic_store(&trash_can_, std::move(source));
+ }
+ ReinstateSignalHandler(signum, &HandleSignal);
+ }
+
+ std::shared_ptr<StopSource> stop_source_;
+ std::shared_ptr<StopSource> trash_can_;
+
+ std::vector<SavedSignalHandler> saved_handlers_;
+
+ static SignalStopState instance_;
+};
+
+SignalStopState SignalStopState::instance_{};
+
+} // namespace
+
+Result<StopSource*> SetSignalStopSource() {
+ auto stop_state = SignalStopState::instance();
+ if (stop_state->enabled()) {
+ return Status::Invalid("Signal stop source already set up");
+ }
+ stop_state->Enable();
+ return stop_state->stop_source();
+}
+
+void ResetSignalStopSource() {
+ auto stop_state = SignalStopState::instance();
+ DCHECK(stop_state->enabled());
+ stop_state->Disable();
+}
+
+Status RegisterCancellingSignalHandler(const std::vector<int>& signals) {
+ auto stop_state = SignalStopState::instance();
+ if (!stop_state->enabled()) {
+ return Status::Invalid("Signal stop source was not set up");
+ }
+ return stop_state->RegisterHandlers(signals);
+}
+
+void UnregisterCancellingSignalHandler() {
+ auto stop_state = SignalStopState::instance();
+ DCHECK(stop_state->enabled());
+ stop_state->UnregisterHandlers();
+}
+
+} // namespace arrow
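
Aside: a sketch of how the StopSource/StopToken pair above is meant to be used — the consumer side owns the StopSource, the producer side only polls the token. RunUntilStopped is a hypothetical worker, not part of this file:

#include "arrow/status.h"
#include "arrow/util/cancel.h"

// Hypothetical producer loop: it only ever sees the token; the consumer
// side keeps the StopSource and decides when to cancel.
arrow::Status RunUntilStopped(arrow::StopToken token) {
  for (;;) {
    ARROW_RETURN_NOT_OK(token.Poll());  // fast path: a single atomic load
    // ... one unit of work ...
  }
}

void Example() {
  arrow::StopSource source;
  arrow::StopToken token = source.token();
  // Typically from another thread:
  source.RequestStop();  // default Status::Cancelled("Operation cancelled")
  // token.Poll() now returns that Status; token.IsStopRequested() is true.
}
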
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
index 9e00f673a21..7c755c02d68 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
@@ -1,102 +1,102 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/status.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class StopToken;
-
-struct StopSourceImpl;
-
-/// EXPERIMENTAL
-class ARROW_EXPORT StopSource {
- public:
- StopSource();
- ~StopSource();
-
- // Consumer API (the side that stops)
- void RequestStop();
- void RequestStop(Status error);
- void RequestStopFromSignal(int signum);
-
- StopToken token();
-
- // For internal use only
- void Reset();
-
- protected:
- std::shared_ptr<StopSourceImpl> impl_;
-};
-
-/// EXPERIMENTAL
-class ARROW_EXPORT StopToken {
- public:
- // Public for Cython
- StopToken() {}
-
- explicit StopToken(std::shared_ptr<StopSourceImpl> impl) : impl_(std::move(impl)) {}
-
- // A trivial token that never propagates any stop request
- static StopToken Unstoppable() { return StopToken(); }
-
- // Producer API (the side that gets asked to stop)
- Status Poll() const;
- bool IsStopRequested() const;
-
- protected:
- std::shared_ptr<StopSourceImpl> impl_;
-};
-
-/// EXPERIMENTAL: Set a global StopSource that can receive signals
-///
-/// The only allowed order of calls is the following:
-/// - SetSignalStopSource()
-/// - any number of pairs of (RegisterCancellingSignalHandler,
-/// UnregisterCancellingSignalHandler) calls
-/// - ResetSignalStopSource()
-///
-/// Beware that these settings are process-wide. Typically, only one
-/// thread should call these APIs, even in a multithreaded setting.
-ARROW_EXPORT
-Result<StopSource*> SetSignalStopSource();
-
-/// EXPERIMENTAL: Reset the global signal-receiving StopSource
-///
-/// This will invalidate the pointer returned by SetSignalStopSource.
-ARROW_EXPORT
-void ResetSignalStopSource();
-
-/// EXPERIMENTAL: Register signal handler triggering the signal-receiving StopSource
-ARROW_EXPORT
-Status RegisterCancellingSignalHandler(const std::vector<int>& signals);
-
-/// EXPERIMENTAL: Unregister signal handler set up by RegisterCancellingSignalHandler
-ARROW_EXPORT
-void UnregisterCancellingSignalHandler();
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class StopToken;
+
+struct StopSourceImpl;
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopSource {
+ public:
+ StopSource();
+ ~StopSource();
+
+ // Consumer API (the side that stops)
+ void RequestStop();
+ void RequestStop(Status error);
+ void RequestStopFromSignal(int signum);
+
+ StopToken token();
+
+ // For internal use only
+ void Reset();
+
+ protected:
+ std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopToken {
+ public:
+ // Public for Cython
+ StopToken() {}
+
+ explicit StopToken(std::shared_ptr<StopSourceImpl> impl) : impl_(std::move(impl)) {}
+
+ // A trivial token that never propagates any stop request
+ static StopToken Unstoppable() { return StopToken(); }
+
+ // Producer API (the side that gets asked to stop)
+ Status Poll() const;
+ bool IsStopRequested() const;
+
+ protected:
+ std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL: Set a global StopSource that can receive signals
+///
+/// The only allowed order of calls is the following:
+/// - SetSignalStopSource()
+/// - any number of pairs of (RegisterCancellingSignalHandler,
+/// UnregisterCancellingSignalHandler) calls
+/// - ResetSignalStopSource()
+///
+/// Beware that these settings are process-wide. Typically, only one
+/// thread should call these APIs, even in a multithreaded setting.
+ARROW_EXPORT
+Result<StopSource*> SetSignalStopSource();
+
+/// EXPERIMENTAL: Reset the global signal-receiving StopSource
+///
+/// This will invalidate the pointer returned by SetSignalStopSource.
+ARROW_EXPORT
+void ResetSignalStopSource();
+
+/// EXPERIMENTAL: Register signal handler triggering the signal-receiving StopSource
+ARROW_EXPORT
+Status RegisterCancellingSignalHandler(const std::vector<int>& signals);
+
+/// EXPERIMENTAL: Unregister signal handler set up by RegisterCancellingSignalHandler
+ARROW_EXPORT
+void UnregisterCancellingSignalHandler();
+
+} // namespace arrow
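
Aside: the required call order documented above, sketched end to end. SIGINT and the DoWork worker are illustrative assumptions, not part of this header:

#include <csignal>

#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/cancel.h"

arrow::Status DoWork(arrow::StopToken token);  // hypothetical worker that polls the token

arrow::Status CancellableSection() {
  // 1. Install the process-wide signal-receiving StopSource.
  ARROW_ASSIGN_OR_RAISE(arrow::StopSource* source, arrow::SetSignalStopSource());
  // 2. Route SIGINT into it while the work runs.
  ARROW_RETURN_NOT_OK(arrow::RegisterCancellingSignalHandler({SIGINT}));
  arrow::Status st = DoWork(source->token());
  arrow::UnregisterCancellingSignalHandler();
  // 3. Tear down in reverse order.
  arrow::ResetSignalStopSource();
  return st;
}
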
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
index 8db199b4e76..41109e80faa 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
@@ -29,18 +29,18 @@
namespace arrow {
namespace util {
-namespace {
-
-Status CheckSupportsCompressionLevel(Compression::type type) {
- if (!Codec::SupportsCompressionLevel(type)) {
- return Status::Invalid(
- "The specified codec does not support the compression level parameter");
- }
- return Status::OK();
-}
-
-} // namespace
-
+namespace {
+
+Status CheckSupportsCompressionLevel(Compression::type type) {
+ if (!Codec::SupportsCompressionLevel(type)) {
+ return Status::Invalid(
+ "The specified codec does not support the compression level parameter");
+ }
+ return Status::OK();
+}
+
+} // namespace
+
int Codec::UseDefaultCompressionLevel() { return kUseDefaultCompressionLevel; }
Status Codec::Init() { return Status::OK(); }
@@ -115,24 +115,24 @@ bool Codec::SupportsCompressionLevel(Compression::type codec) {
}
}
-Result<int> Codec::MaximumCompressionLevel(Compression::type codec_type) {
- RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
- ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
- return codec->maximum_compression_level();
-}
-
-Result<int> Codec::MinimumCompressionLevel(Compression::type codec_type) {
- RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
- ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
- return codec->minimum_compression_level();
-}
-
-Result<int> Codec::DefaultCompressionLevel(Compression::type codec_type) {
- RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
- ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
- return codec->default_compression_level();
-}
-
+Result<int> Codec::MaximumCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->maximum_compression_level();
+}
+
+Result<int> Codec::MinimumCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->minimum_compression_level();
+}
+
+Result<int> Codec::DefaultCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->default_compression_level();
+}
+
Result<std::unique_ptr<Codec>> Codec::Create(Compression::type codec_type,
int compression_level) {
if (!IsAvailable(codec_type)) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
index 0832e82a606..bd5f2d1c647 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
@@ -24,13 +24,13 @@
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace util {
-constexpr int kUseDefaultCompressionLevel = std::numeric_limits<int>::min();
+constexpr int kUseDefaultCompressionLevel = std::numeric_limits<int>::min();
/// \brief Streaming compressor interface
///
@@ -132,27 +132,27 @@ class ARROW_EXPORT Codec {
/// \brief Return true if indicated codec supports setting a compression level
static bool SupportsCompressionLevel(Compression::type codec);
- /// \brief Return the smallest supported compression level for the codec
- /// Note: This function creates a temporary Codec instance
- static Result<int> MinimumCompressionLevel(Compression::type codec);
-
- /// \brief Return the largest supported compression level for the codec
- /// Note: This function creates a temporary Codec instance
- static Result<int> MaximumCompressionLevel(Compression::type codec);
-
- /// \brief Return the default compression level
- /// Note: This function creates a temporary Codec instance
- static Result<int> DefaultCompressionLevel(Compression::type codec);
-
- /// \brief Return the smallest supported compression level
- virtual int minimum_compression_level() const = 0;
-
- /// \brief Return the largest supported compression level
- virtual int maximum_compression_level() const = 0;
-
- /// \brief Return the default compression level
- virtual int default_compression_level() const = 0;
-
+ /// \brief Return the smallest supported compression level for the codec
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> MinimumCompressionLevel(Compression::type codec);
+
+ /// \brief Return the largest supported compression level for the codec
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> MaximumCompressionLevel(Compression::type codec);
+
+ /// \brief Return the default compression level
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> DefaultCompressionLevel(Compression::type codec);
+
+ /// \brief Return the smallest supported compression level
+ virtual int minimum_compression_level() const = 0;
+
+ /// \brief Return the largest supported compression level
+ virtual int maximum_compression_level() const = 0;
+
+ /// \brief Return the default compression level
+ virtual int default_compression_level() const = 0;
+
/// \brief One-shot decompression function
///
/// output_buffer_len must be correct and therefore be obtained in advance.
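
Aside: a sketch of probing a codec's level range with the static queries restored above. DescribeLevels is a hypothetical helper; as the header notes, each static query creates (and discards) a temporary Codec instance:

#include <iostream>

#include "arrow/result.h"
#include "arrow/util/compression.h"

arrow::Status DescribeLevels(arrow::Compression::type type) {
  using arrow::util::Codec;
  if (!Codec::SupportsCompressionLevel(type)) {
    std::cout << "codec has no compression-level parameter\n";
    return arrow::Status::OK();
  }
  // Each static query below instantiates a temporary Codec internally.
  ARROW_ASSIGN_OR_RAISE(int lo, Codec::MinimumCompressionLevel(type));
  ARROW_ASSIGN_OR_RAISE(int hi, Codec::MaximumCompressionLevel(type));
  ARROW_ASSIGN_OR_RAISE(int dflt, Codec::DefaultCompressionLevel(type));
  std::cout << "levels " << lo << ".." << hi << " (default " << dflt << ")\n";
  ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(type, dflt));
  (void)codec;  // a ready-to-use codec at the default level
  return arrow::Status::OK();
}
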
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
index cb547c2c8cf..cc41ce43f91 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
@@ -224,11 +224,11 @@ class BrotliCodec : public Codec {
Compression::type compression_type() const override { return Compression::BROTLI; }
int compression_level() const override { return compression_level_; }
- int minimum_compression_level() const override { return BROTLI_MIN_QUALITY; }
- int maximum_compression_level() const override { return BROTLI_MAX_QUALITY; }
- int default_compression_level() const override {
- return kBrotliDefaultCompressionLevel;
- }
+ int minimum_compression_level() const override { return BROTLI_MIN_QUALITY; }
+ int maximum_compression_level() const override { return BROTLI_MAX_QUALITY; }
+ int default_compression_level() const override {
+ return kBrotliDefaultCompressionLevel;
+ }
private:
const int compression_level_;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
index c783e405590..04c13cc4c5f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
@@ -27,7 +27,7 @@
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
@@ -300,9 +300,9 @@ class Lz4FrameCodec : public Codec {
}
Compression::type compression_type() const override { return Compression::LZ4_FRAME; }
- int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
protected:
const LZ4F_preferences_t prefs_;
@@ -353,9 +353,9 @@ class Lz4Codec : public Codec {
}
Compression::type compression_type() const override { return Compression::LZ4; }
- int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
};
// ----------------------------------------------------------------------
@@ -424,52 +424,52 @@ class Lz4HadoopCodec : public Lz4Codec {
int64_t TryDecompressHadoop(int64_t input_len, const uint8_t* input,
int64_t output_buffer_len, uint8_t* output_buffer) {
- // Parquet files written with the Hadoop Lz4Codec use their own framing.
- // The input buffer can contain an arbitrary number of "frames", each
- // with the following structure:
- // - bytes 0..3: big-endian uint32_t representing the frame decompressed size
- // - bytes 4..7: big-endian uint32_t representing the frame compressed size
- // - bytes 8...: frame compressed data
+ // Parquet files written with the Hadoop Lz4Codec use their own framing.
+ // The input buffer can contain an arbitrary number of "frames", each
+ // with the following structure:
+ // - bytes 0..3: big-endian uint32_t representing the frame decompressed size
+ // - bytes 4..7: big-endian uint32_t representing the frame compressed size
+ // - bytes 8...: frame compressed data
//
// The Hadoop Lz4Codec source code can be found here:
// https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc
- int64_t total_decompressed_size = 0;
-
- while (input_len >= kPrefixLength) {
- const uint32_t expected_decompressed_size =
- BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input));
- const uint32_t expected_compressed_size =
- BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input + sizeof(uint32_t)));
- input += kPrefixLength;
- input_len -= kPrefixLength;
-
- if (input_len < expected_compressed_size) {
- // Not enough bytes for Hadoop "frame"
- return kNotHadoop;
+ int64_t total_decompressed_size = 0;
+
+ while (input_len >= kPrefixLength) {
+ const uint32_t expected_decompressed_size =
+ BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input));
+ const uint32_t expected_compressed_size =
+ BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input + sizeof(uint32_t)));
+ input += kPrefixLength;
+ input_len -= kPrefixLength;
+
+ if (input_len < expected_compressed_size) {
+ // Not enough bytes for Hadoop "frame"
+ return kNotHadoop;
}
- if (output_buffer_len < expected_decompressed_size) {
- // Not enough bytes to hold advertised output => probably not Hadoop
- return kNotHadoop;
- }
- // Try decompressing and compare with expected decompressed length
- auto maybe_decompressed_size = Lz4Codec::Decompress(
- expected_compressed_size, input, output_buffer_len, output_buffer);
- if (!maybe_decompressed_size.ok() ||
- *maybe_decompressed_size != expected_decompressed_size) {
- return kNotHadoop;
- }
- input += expected_compressed_size;
- input_len -= expected_compressed_size;
- output_buffer += expected_decompressed_size;
- output_buffer_len -= expected_decompressed_size;
- total_decompressed_size += expected_decompressed_size;
+ if (output_buffer_len < expected_decompressed_size) {
+ // Not enough bytes to hold advertised output => probably not Hadoop
+ return kNotHadoop;
+ }
+ // Try decompressing and compare with expected decompressed length
+ auto maybe_decompressed_size = Lz4Codec::Decompress(
+ expected_compressed_size, input, output_buffer_len, output_buffer);
+ if (!maybe_decompressed_size.ok() ||
+ *maybe_decompressed_size != expected_decompressed_size) {
+ return kNotHadoop;
+ }
+ input += expected_compressed_size;
+ input_len -= expected_compressed_size;
+ output_buffer += expected_decompressed_size;
+ output_buffer_len -= expected_decompressed_size;
+ total_decompressed_size += expected_decompressed_size;
}
- if (input_len == 0) {
- return total_decompressed_size;
- } else {
- return kNotHadoop;
- }
+ if (input_len == 0) {
+ return total_decompressed_size;
+ } else {
+ return kNotHadoop;
+ }
}
};
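
Aside: each Hadoop "frame" above is an 8-byte big-endian size pair followed by raw LZ4 data, and TryDecompressHadoop walks frames until the input is exhausted. A standalone sketch of just the prefix parsing (big-endian decode written out by hand, no Arrow helpers; HadoopFramePrefix and ParsePrefix are illustrative names):

#include <cstdint>
#include <optional>

struct HadoopFramePrefix {
  uint32_t decompressed_size;  // bytes 0..3, big-endian
  uint32_t compressed_size;    // bytes 4..7, big-endian
};

static uint32_t LoadBigEndian32(const uint8_t* p) {
  return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
         (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

// Returns the prefix if at least 8 bytes remain, otherwise nothing; the
// caller must still check compressed_size against the remaining input,
// as the loop above does before concluding the data is Hadoop-framed.
std::optional<HadoopFramePrefix> ParsePrefix(const uint8_t* input, int64_t len) {
  if (len < 8) return std::nullopt;
  return HadoopFramePrefix{LoadBigEndian32(input), LoadBigEndian32(input + 4)};
}
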
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
index 3756f957d04..da00607d13b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
@@ -86,9 +86,9 @@ class SnappyCodec : public Codec {
}
Compression::type compression_type() const override { return Compression::SNAPPY; }
- int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
};
} // namespace
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
index e9cb2470ee2..51373cc227c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
@@ -52,9 +52,9 @@ constexpr int GZIP_CODEC = 16;
// Determine if this is libz or gzip from header.
constexpr int DETECT_CODEC = 32;
-constexpr int kGZipMinCompressionLevel = 1;
-constexpr int kGZipMaxCompressionLevel = 9;
-
+constexpr int kGZipMinCompressionLevel = 1;
+constexpr int kGZipMaxCompressionLevel = 9;
+
int CompressionWindowBitsForFormat(GZipFormat::type format) {
int window_bits = WINDOW_BITS;
switch (format) {
@@ -249,9 +249,9 @@ class GZipCompressor : public Compressor {
// again with the same value of the flush parameter and more output space
// (updated avail_out), until the flush is complete (deflate returns
// with non-zero avail_out)."
- // "Note that Z_BUF_ERROR is not fatal, and deflate() can be called again
- // with more input and more output space to continue compressing."
- return FlushResult{bytes_written, stream_.avail_out == 0};
+ // "Note that Z_BUF_ERROR is not fatal, and deflate() can be called again
+ // with more input and more output space to continue compressing."
+ return FlushResult{bytes_written, stream_.avail_out == 0};
}
Result<EndResult> End(int64_t output_len, uint8_t* output) override {
@@ -471,9 +471,9 @@ class GZipCodec : public Codec {
Compression::type compression_type() const override { return Compression::GZIP; }
int compression_level() const override { return compression_level_; }
- int minimum_compression_level() const override { return kGZipMinCompressionLevel; }
- int maximum_compression_level() const override { return kGZipMaxCompressionLevel; }
- int default_compression_level() const override { return kGZipDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kGZipMinCompressionLevel; }
+ int maximum_compression_level() const override { return kGZipMaxCompressionLevel; }
+ int default_compression_level() const override { return kGZipDefaultCompressionLevel; }
private:
// zlib is stateful and the z_stream state variable must be initialized
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
index e15ecb4e1fe..715b6e7374a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
@@ -228,9 +228,9 @@ class ZSTDCodec : public Codec {
}
Compression::type compression_type() const override { return Compression::ZSTD; }
- int minimum_compression_level() const override { return ZSTD_minCLevel(); }
- int maximum_compression_level() const override { return ZSTD_maxCLevel(); }
- int default_compression_level() const override { return kZSTDDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return ZSTD_minCLevel(); }
+ int maximum_compression_level() const override { return ZSTD_maxCLevel(); }
+ int default_compression_level() const override { return kZSTDDefaultCompressionLevel; }
int compression_level() const override { return compression_level_; }
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
index d803521a2d9..1f54969539a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
@@ -31,11 +31,11 @@
#endif
#ifdef _WIN32
-#include <immintrin.h>
+#include <immintrin.h>
#include <intrin.h>
#include <array>
#include <bitset>
-
+
#include "arrow/util/windows_compatibility.h"
#endif
@@ -51,19 +51,19 @@
#include "arrow/result.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
+#include "arrow/util/optional.h"
#include "arrow/util/string.h"
-namespace arrow {
-namespace internal {
-
-namespace {
-
+namespace arrow {
+namespace internal {
+
+namespace {
+
using std::max;
-constexpr int64_t kDefaultL1CacheSize = 32 * 1024; // Level 1: 32k
-constexpr int64_t kDefaultL2CacheSize = 256 * 1024; // Level 2: 256k
-constexpr int64_t kDefaultL3CacheSize = 3072 * 1024; // Level 3: 3M
+constexpr int64_t kDefaultL1CacheSize = 32 * 1024; // Level 1: 32k
+constexpr int64_t kDefaultL2CacheSize = 256 * 1024; // Level 2: 256k
+constexpr int64_t kDefaultL3CacheSize = 3072 * 1024; // Level 3: 3M
#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5
void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {
@@ -72,31 +72,31 @@ void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {
"=d"(CPUInfo[3])
: "a"(function_id), "c"(subfunction_id));
}
-
-int64_t _xgetbv(int xcr) {
- int out = 0;
- __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx");
- return out;
-}
-#endif
-
-#ifdef __APPLE__
-util::optional<int64_t> IntegerSysCtlByName(const char* name) {
- size_t len = sizeof(int64_t);
- int64_t data = 0;
- if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) {
- return data;
- }
- // ENOENT is the official errno value for non-existing sysctl's,
- // but EINVAL and ENOTSUP have been seen in the wild.
- if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) {
- auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'");
- ARROW_LOG(WARNING) << st.ToString();
- }
- return util::nullopt;
-}
+
+int64_t _xgetbv(int xcr) {
+ int out = 0;
+ __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx");
+ return out;
+}
#endif
+#ifdef __APPLE__
+util::optional<int64_t> IntegerSysCtlByName(const char* name) {
+ size_t len = sizeof(int64_t);
+ int64_t data = 0;
+ if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) {
+ return data;
+ }
+ // ENOENT is the official errno value for non-existing sysctls,
+ // but EINVAL and ENOTSUP have been seen in the wild.
+ if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) {
+ auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'");
+ ARROW_LOG(WARNING) << st.ToString();
+ }
+ return util::nullopt;
+}
+#endif
+
#if defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
// There is no direct instruction to get cache size on Arm64 like '__cpuid' on x86;
// Get Arm64 cache size by reading '/sys/devices/system/cpu/cpu0/cache/index*/size';
@@ -105,11 +105,11 @@ util::optional<int64_t> IntegerSysCtlByName(const char* name) {
// index1: L1 Icache
// index2: L2 cache
// index3: L3 cache
-const char* kL1CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index0/size";
-const char* kL2CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index2/size";
-const char* kL3CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index3/size";
+const char* kL1CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index0/size";
+const char* kL2CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index2/size";
+const char* kL3CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index3/size";
-int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
+int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
char* content = nullptr;
char* last_char = nullptr;
size_t file_len = 0;
@@ -148,8 +148,8 @@ int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
}
#endif
-#if !defined(_WIN32) && !defined(__APPLE__)
-struct {
+#if !defined(_WIN32) && !defined(__APPLE__)
+struct {
std::string name;
int64_t flag;
} flag_mappings[] = {
@@ -166,7 +166,7 @@ struct {
{"asimd", CpuInfo::ASIMD},
#endif
};
-const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
// Helper function to parse for hardware flags.
// values contains a list of space-separated flags. check to see if the flags we
@@ -274,13 +274,13 @@ bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name,
}
}
- bool zmm_enabled = false;
- if (features_ECX[27]) { // OSXSAVE
- // Query if the OS supports saving ZMM registers when switching contexts
- int64_t xcr0 = _xgetbv(0);
- zmm_enabled = (xcr0 & 0xE0) == 0xE0;
- }
-
+ bool zmm_enabled = false;
+ if (features_ECX[27]) { // OSXSAVE
+ // Query if the OS supports saving ZMM registers when switching contexts
+ int64_t xcr0 = _xgetbv(0);
+ zmm_enabled = (xcr0 & 0xE0) == 0xE0;
+ }
+
if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
@@ -296,22 +296,22 @@ bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name,
if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
- // ARROW-11427: only use AVX512 if enabled by the OS
- if (zmm_enabled) {
- if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
- if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
- if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
- if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
- if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
- }
+ // ARROW-11427: only use AVX512 if enabled by the OS
+ if (zmm_enabled) {
+ if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
+ if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
+ if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
+ if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
+ if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
+ }
}
return true;
}
#endif
-} // namespace
-
+} // namespace
+
CpuInfo::CpuInfo()
: hardware_flags_(0),
num_cores_(1),
@@ -348,37 +348,37 @@ void CpuInfo::Init() {
if (QueryPerformanceFrequency(&performance_frequency)) {
max_mhz = static_cast<float>(performance_frequency.QuadPart);
}
-#elif defined(__APPLE__)
- // On macOS, get CPU information from system information base
- struct SysCtlCpuFeature {
- const char* name;
- int64_t flag;
- };
- std::vector<SysCtlCpuFeature> features = {
-#if defined(__aarch64__)
- // ARM64 (note that this is exposed under Rosetta as well)
- {"hw.optional.neon", ASIMD},
-#else
- // x86
- {"hw.optional.sse4_2", SSSE3 | SSE4_1 | SSE4_2 | POPCNT},
- {"hw.optional.avx1_0", AVX},
- {"hw.optional.avx2_0", AVX2},
- {"hw.optional.bmi1", BMI1},
- {"hw.optional.bmi2", BMI2},
- {"hw.optional.avx512f", AVX512F},
- {"hw.optional.avx512cd", AVX512CD},
- {"hw.optional.avx512dq", AVX512DQ},
- {"hw.optional.avx512bw", AVX512BW},
- {"hw.optional.avx512vl", AVX512VL},
-#endif
- };
- for (const auto& feature : features) {
- auto v = IntegerSysCtlByName(feature.name);
- if (v.value_or(0)) {
- hardware_flags_ |= feature.flag;
- }
- }
+#elif defined(__APPLE__)
+ // On macOS, get CPU information from system information base
+ struct SysCtlCpuFeature {
+ const char* name;
+ int64_t flag;
+ };
+ std::vector<SysCtlCpuFeature> features = {
+#if defined(__aarch64__)
+ // ARM64 (note that this is exposed under Rosetta as well)
+ {"hw.optional.neon", ASIMD},
#else
+ // x86
+ {"hw.optional.sse4_2", SSSE3 | SSE4_1 | SSE4_2 | POPCNT},
+ {"hw.optional.avx1_0", AVX},
+ {"hw.optional.avx2_0", AVX2},
+ {"hw.optional.bmi1", BMI1},
+ {"hw.optional.bmi2", BMI2},
+ {"hw.optional.avx512f", AVX512F},
+ {"hw.optional.avx512cd", AVX512CD},
+ {"hw.optional.avx512dq", AVX512DQ},
+ {"hw.optional.avx512bw", AVX512BW},
+ {"hw.optional.avx512vl", AVX512VL},
+#endif
+ };
+ for (const auto& feature : features) {
+ auto v = IntegerSysCtlByName(feature.name);
+ if (v.value_or(0)) {
+ hardware_flags_ |= feature.flag;
+ }
+ }
+#else
// Read from /proc/cpuinfo
std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
while (cpuinfo) {
@@ -413,20 +413,20 @@ void CpuInfo::Init() {
#endif
#ifdef __APPLE__
- // On macOS, get cache size from system information base
- SetDefaultCacheSize();
- auto c = IntegerSysCtlByName("hw.l1dcachesize");
- if (c.has_value()) {
- cache_sizes_[0] = *c;
- }
- c = IntegerSysCtlByName("hw.l2cachesize");
- if (c.has_value()) {
- cache_sizes_[1] = *c;
- }
- c = IntegerSysCtlByName("hw.l3cachesize");
- if (c.has_value()) {
- cache_sizes_[2] = *c;
- }
+ // On macOS, get cache size from system information base
+ SetDefaultCacheSize();
+ auto c = IntegerSysCtlByName("hw.l1dcachesize");
+ if (c.has_value()) {
+ cache_sizes_[0] = *c;
+ }
+ c = IntegerSysCtlByName("hw.l2cachesize");
+ if (c.has_value()) {
+ cache_sizes_[1] = *c;
+ }
+ c = IntegerSysCtlByName("hw.l3cachesize");
+ if (c.has_value()) {
+ cache_sizes_[2] = *c;
+ }
#elif _WIN32
if (!RetrieveCacheSize(cache_sizes_)) {
SetDefaultCacheSize();
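
Aside: the ARROW-11427 check above gates AVX-512 on the OS actually saving ZMM state across context switches. A standalone sketch of the same XCR0 test, assuming a GCC/Clang x86-64 toolchain whose assembler accepts the xgetbv mnemonic:

#include <cstdint>

#if defined(__GNUC__) && defined(__x86_64__)
// Read an extended control register; XCR0 (xcr = 0) describes which
// register state the OS saves on a context switch.
static uint64_t ReadXcr(uint32_t xcr) {
  uint32_t eax, edx;
  __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr));
  return (uint64_t(edx) << 32) | eax;
}

// AVX-512 is only usable when the OS saves the opmask registers (bit 5)
// and the upper ZMM halves (bits 6-7), i.e. XCR0 & 0xE0 == 0xE0 — the
// same test performed on zmm_enabled above.
static bool OsSupportsZmm() { return (ReadXcr(0) & 0xE0) == 0xE0; }
#endif
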
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
index 83819c25519..7b434229c1b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
@@ -70,18 +70,18 @@ class ARROW_EXPORT CpuInfo {
/// Returns all the flags for this cpu
int64_t hardware_flags();
- /// \brief Returns whether or not the given feature is enabled.
- ///
- /// IsSupported() is true iff IsDetected() is also true and the feature
- /// wasn't disabled by the user (for example by setting the ARROW_USER_SIMD_LEVEL
- /// environment variable).
+ /// \brief Returns whether or not the given feature is enabled.
+ ///
+ /// IsSupported() is true iff IsDetected() is also true and the feature
+ /// wasn't disabled by the user (for example by setting the ARROW_USER_SIMD_LEVEL
+ /// environment variable).
bool IsSupported(int64_t flags) const { return (hardware_flags_ & flags) == flags; }
- /// Returns whether or not the given feature is available on the CPU.
- bool IsDetected(int64_t flags) const {
- return (original_hardware_flags_ & flags) == flags;
- }
-
+ /// Returns whether or not the given feature is available on the CPU.
+ bool IsDetected(int64_t flags) const {
+ return (original_hardware_flags_ & flags) == flags;
+ }
+
/// \brief The processor supports SSE4.2 and the Arrow libraries are built
/// with support for it
bool CanUseSSE4_2() const;
@@ -113,15 +113,15 @@ class ARROW_EXPORT CpuInfo {
private:
CpuInfo();
- enum UserSimdLevel {
- USER_SIMD_NONE = 0,
- USER_SIMD_SSE4_2,
- USER_SIMD_AVX,
- USER_SIMD_AVX2,
- USER_SIMD_AVX512,
- USER_SIMD_MAX,
- };
-
+ enum UserSimdLevel {
+ USER_SIMD_NONE = 0,
+ USER_SIMD_SSE4_2,
+ USER_SIMD_AVX,
+ USER_SIMD_AVX2,
+ USER_SIMD_AVX512,
+ USER_SIMD_MAX,
+ };
+
void Init();
/// Inits CPU cache size variables with default values
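
Aside: a sketch of dispatching on the IsSupported()/IsDetected() distinction documented above — IsSupported() means detected in hardware and not disabled by the user, while IsDetected() ignores the ARROW_USER_SIMD_LEVEL override. It assumes the usual CpuInfo::GetInstance() singleton accessor and the AVX2 flag constant from this header:

#include "arrow/util/cpu_info.h"

void ChooseKernel() {
  const auto* cpu = arrow::internal::CpuInfo::GetInstance();
  if (cpu->IsSupported(arrow::internal::CpuInfo::AVX2)) {
    // Detected in hardware and not disabled via ARROW_USER_SIMD_LEVEL.
    // ... dispatch to the AVX2 kernel ...
  } else {
    // ... scalar fallback ...
  }
  // IsDetected() ignores the user override: useful for diagnostics only.
  bool in_hardware = cpu->IsDetected(arrow::internal::CpuInfo::AVX2);
  (void)in_hardware;
}
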
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
index 7aefd1ab9cd..bbbb11c7252 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
@@ -30,7 +30,7 @@
#include "arrow/status.h"
#include "arrow/util/decimal.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/formatting.h"
#include "arrow/util/int128_internal.h"
#include "arrow/util/int_util_internal.h"
@@ -94,47 +94,47 @@ static constexpr double kDoublePowersOfTen[2 * 38 + 1] = {
1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27,
1e28, 1e29, 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38};
-// On the Windows R toolchain, INFINITY is of type double instead of float
-static constexpr float kFloatInf = std::numeric_limits<float>::infinity();
-static constexpr float kFloatPowersOfTen76[2 * 76 + 1] = {
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1e-45f, 1e-44f, 1e-43f, 1e-42f,
- 1e-41f, 1e-40f, 1e-39f, 1e-38f, 1e-37f, 1e-36f, 1e-35f,
- 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f, 1e-28f,
- 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f,
- 1e-20f, 1e-19f, 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f,
- 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f, 1e-8f, 1e-7f,
- 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f,
- 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f,
- 1e8f, 1e9f, 1e10f, 1e11f, 1e12f, 1e13f, 1e14f,
- 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f,
- 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f,
- 1e29f, 1e30f, 1e31f, 1e32f, 1e33f, 1e34f, 1e35f,
- 1e36f, 1e37f, 1e38f, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf};
-
-static constexpr double kDoublePowersOfTen76[2 * 76 + 1] = {
- 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65,
- 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53,
- 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41,
- 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29,
- 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17,
- 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5,
- 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
- 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
- 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31,
- 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43,
- 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55,
- 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67,
- 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76};
-
+// On the Windows R toolchain, INFINITY is of type double instead of float
+static constexpr float kFloatInf = std::numeric_limits<float>::infinity();
+static constexpr float kFloatPowersOfTen76[2 * 76 + 1] = {
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1e-45f, 1e-44f, 1e-43f, 1e-42f,
+ 1e-41f, 1e-40f, 1e-39f, 1e-38f, 1e-37f, 1e-36f, 1e-35f,
+ 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f, 1e-28f,
+ 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f,
+ 1e-20f, 1e-19f, 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f,
+ 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f, 1e-8f, 1e-7f,
+ 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f,
+ 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f,
+ 1e8f, 1e9f, 1e10f, 1e11f, 1e12f, 1e13f, 1e14f,
+ 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f,
+ 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f,
+ 1e29f, 1e30f, 1e31f, 1e32f, 1e33f, 1e34f, 1e35f,
+ 1e36f, 1e37f, 1e38f, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf};
+
+static constexpr double kDoublePowersOfTen76[2 * 76 + 1] = {
+ 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65,
+ 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53,
+ 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41,
+ 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29,
+ 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17,
+ 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5,
+ 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
+ 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+ 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31,
+ 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43,
+ 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55,
+ 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67,
+ 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76};
+
namespace {
template <typename Real, typename Derived>
@@ -267,7 +267,7 @@ static void AppendLittleEndianArrayToString(const std::array<uint64_t, n>& array
// *elem = dividend / 1e9;
// remainder = dividend % 1e9.
uint32_t hi = static_cast<uint32_t>(*elem >> 32);
- uint32_t lo = static_cast<uint32_t>(*elem & BitUtil::LeastSignificantBitMask(32));
+ uint32_t lo = static_cast<uint32_t>(*elem & BitUtil::LeastSignificantBitMask(32));
uint64_t dividend_hi = (static_cast<uint64_t>(remainder) << 32) | hi;
uint64_t quotient_hi = dividend_hi / k1e9;
remainder = static_cast<uint32_t>(dividend_hi % k1e9);
@@ -486,24 +486,24 @@ bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out)
return pos == size;
}
-inline Status ToArrowStatus(DecimalStatus dstatus, int num_bits) {
- switch (dstatus) {
- case DecimalStatus::kSuccess:
- return Status::OK();
-
- case DecimalStatus::kDivideByZero:
- return Status::Invalid("Division by 0 in Decimal", num_bits);
-
- case DecimalStatus::kOverflow:
- return Status::Invalid("Overflow occurred during Decimal", num_bits, " operation.");
-
- case DecimalStatus::kRescaleDataLoss:
- return Status::Invalid("Rescaling Decimal", num_bits,
- " value would cause data loss");
- }
- return Status::OK();
-}
-
+inline Status ToArrowStatus(DecimalStatus dstatus, int num_bits) {
+ switch (dstatus) {
+ case DecimalStatus::kSuccess:
+ return Status::OK();
+
+ case DecimalStatus::kDivideByZero:
+ return Status::Invalid("Division by 0 in Decimal", num_bits);
+
+ case DecimalStatus::kOverflow:
+ return Status::Invalid("Overflow occurred during Decimal", num_bits, " operation.");
+
+ case DecimalStatus::kRescaleDataLoss:
+ return Status::Invalid("Rescaling Decimal", num_bits,
+ " value would cause data loss");
+ }
+ return Status::OK();
+}
+
} // namespace
Status Decimal128::FromString(const util::string_view& s, Decimal128* out,
@@ -609,7 +609,7 @@ Result<Decimal128> Decimal128::FromBigEndian(const uint8_t* bytes, int32_t lengt
int64_t high, low;
- if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
+ if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ",
"was ", length, ", but must be between ", kMinDecimalBytes,
" and ", kMaxDecimalBytes);
@@ -657,275 +657,275 @@ Result<Decimal128> Decimal128::FromBigEndian(const uint8_t* bytes, int32_t lengt
}
Status Decimal128::ToArrowStatus(DecimalStatus dstatus) const {
- return arrow::ToArrowStatus(dstatus, 128);
-}
-
-std::ostream& operator<<(std::ostream& os, const Decimal128& decimal) {
- os << decimal.ToIntegerString();
- return os;
-}
-
-Decimal256::Decimal256(const std::string& str) : Decimal256() {
- *this = Decimal256::FromString(str).ValueOrDie();
-}
-
-std::string Decimal256::ToIntegerString() const {
- std::string result;
- if (static_cast<int64_t>(little_endian_array()[3]) < 0) {
- result.push_back('-');
- Decimal256 abs = *this;
- abs.Negate();
- AppendLittleEndianArrayToString(abs.little_endian_array(), &result);
- } else {
- AppendLittleEndianArrayToString(little_endian_array(), &result);
- }
- return result;
-}
-
-std::string Decimal256::ToString(int32_t scale) const {
- std::string str(ToIntegerString());
- AdjustIntegerStringWithScale(scale, &str);
- return str;
-}
-
-Status Decimal256::FromString(const util::string_view& s, Decimal256* out,
- int32_t* precision, int32_t* scale) {
- if (s.empty()) {
- return Status::Invalid("Empty string cannot be converted to decimal");
- }
-
- DecimalComponents dec;
- if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
- return Status::Invalid("The string '", s, "' is not a valid decimal number");
- }
-
- // Count number of significant digits (without leading zeros)
- size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
- size_t significant_digits = dec.fractional_digits.size();
- if (first_non_zero != std::string::npos) {
- significant_digits += dec.whole_digits.size() - first_non_zero;
- }
-
- if (precision != nullptr) {
- *precision = static_cast<int32_t>(significant_digits);
- }
-
- if (scale != nullptr) {
- if (dec.has_exponent) {
- auto adjusted_exponent = dec.exponent;
- auto len = static_cast<int32_t>(significant_digits);
- *scale = -adjusted_exponent + len - 1;
- } else {
- *scale = static_cast<int32_t>(dec.fractional_digits.size());
- }
- }
-
- if (out != nullptr) {
- std::array<uint64_t, 4> little_endian_array = {0, 0, 0, 0};
- ShiftAndAdd(dec.whole_digits, little_endian_array.data(), little_endian_array.size());
- ShiftAndAdd(dec.fractional_digits, little_endian_array.data(),
- little_endian_array.size());
- *out = Decimal256(little_endian_array);
-
- if (dec.sign == '-') {
- out->Negate();
- }
- }
-
- return Status::OK();
-}
-
-Status Decimal256::FromString(const std::string& s, Decimal256* out, int32_t* precision,
- int32_t* scale) {
- return FromString(util::string_view(s), out, precision, scale);
-}
-
-Status Decimal256::FromString(const char* s, Decimal256* out, int32_t* precision,
- int32_t* scale) {
- return FromString(util::string_view(s), out, precision, scale);
-}
-
-Result<Decimal256> Decimal256::FromString(const util::string_view& s) {
- Decimal256 out;
- RETURN_NOT_OK(FromString(s, &out, nullptr, nullptr));
- return std::move(out);
-}
-
-Result<Decimal256> Decimal256::FromString(const std::string& s) {
- return FromString(util::string_view(s));
-}
-
-Result<Decimal256> Decimal256::FromString(const char* s) {
- return FromString(util::string_view(s));
-}
-
-Result<Decimal256> Decimal256::FromBigEndian(const uint8_t* bytes, int32_t length) {
- static constexpr int32_t kMinDecimalBytes = 1;
- static constexpr int32_t kMaxDecimalBytes = 32;
-
- std::array<uint64_t, 4> little_endian_array;
-
- if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
- return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ",
- "was ", length, ", but must be between ", kMinDecimalBytes,
- " and ", kMaxDecimalBytes);
- }
-
- // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the
- // sign bit.
- const bool is_negative = static_cast<int8_t>(bytes[0]) < 0;
-
- for (int word_idx = 0; word_idx < 4; word_idx++) {
- const int32_t word_length = std::min(length, static_cast<int32_t>(sizeof(uint64_t)));
-
- if (word_length == 8) {
- // Full words can be assigned as is (and are UB with the shift below).
- little_endian_array[word_idx] =
- UInt64FromBigEndian(bytes + length - word_length, word_length);
- } else {
- // Sign extend the word its if necessary
- uint64_t word = -1 * is_negative;
- if (length > 0) {
- // Incorporate the actual values if present.
- // Shift left enough bits to make room for the incoming int64_t
- word = SafeLeftShift(word, word_length * CHAR_BIT);
- // Preserve the upper bits by inplace OR-ing the int64_t
- word |= UInt64FromBigEndian(bytes + length - word_length, word_length);
- }
- little_endian_array[word_idx] = word;
- }
- // Move on to the next word.
- length -= word_length;
- }
-
- return Decimal256(little_endian_array);
-}
-
-Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const {
- return arrow::ToArrowStatus(dstatus, 256);
-}
-
-namespace {
-
-template <typename Real, typename Derived>
-struct Decimal256RealConversion {
- static Result<Decimal256> FromPositiveReal(Real real, int32_t precision,
- int32_t scale) {
- auto x = real;
- if (scale >= -76 && scale <= 76) {
- x *= Derived::powers_of_ten()[scale + 76];
- } else {
- x *= std::pow(static_cast<Real>(10), static_cast<Real>(scale));
- }
- x = std::nearbyint(x);
- const auto max_abs = Derived::powers_of_ten()[precision + 76];
- if (x >= max_abs) {
- return Status::Invalid("Cannot convert ", real,
- " to Decimal256(precision = ", precision,
- ", scale = ", scale, "): overflow");
- }
- // Extract parts
- const auto part3 = std::floor(std::ldexp(x, -192));
- x -= std::ldexp(part3, 192);
- const auto part2 = std::floor(std::ldexp(x, -128));
- x -= std::ldexp(part2, 128);
- const auto part1 = std::floor(std::ldexp(x, -64));
- x -= std::ldexp(part1, 64);
- const auto part0 = x;
-
- DCHECK_GE(part3, 0);
- DCHECK_LT(part3, 1.8446744073709552e+19); // 2**64
- DCHECK_GE(part2, 0);
- DCHECK_LT(part2, 1.8446744073709552e+19); // 2**64
- DCHECK_GE(part1, 0);
- DCHECK_LT(part1, 1.8446744073709552e+19); // 2**64
- DCHECK_GE(part0, 0);
- DCHECK_LT(part0, 1.8446744073709552e+19); // 2**64
- return Decimal256(std::array<uint64_t, 4>{
- static_cast<uint64_t>(part0), static_cast<uint64_t>(part1),
- static_cast<uint64_t>(part2), static_cast<uint64_t>(part3)});
- }
-
- static Result<Decimal256> FromReal(Real x, int32_t precision, int32_t scale) {
- DCHECK_GT(precision, 0);
- DCHECK_LE(precision, 76);
-
- if (!std::isfinite(x)) {
- return Status::Invalid("Cannot convert ", x, " to Decimal256");
- }
- if (x < 0) {
- ARROW_ASSIGN_OR_RAISE(auto dec, FromPositiveReal(-x, precision, scale));
- return dec.Negate();
- } else {
- // Includes negative zero
- return FromPositiveReal(x, precision, scale);
- }
- }
-
- static Real ToRealPositive(const Decimal256& decimal, int32_t scale) {
- DCHECK_GE(decimal, 0);
- Real x = 0;
- const auto& parts = decimal.little_endian_array();
- x += Derived::two_to_192(static_cast<Real>(parts[3]));
- x += Derived::two_to_128(static_cast<Real>(parts[2]));
- x += Derived::two_to_64(static_cast<Real>(parts[1]));
- x += static_cast<Real>(parts[0]);
- if (scale >= -76 && scale <= 76) {
- x *= Derived::powers_of_ten()[-scale + 76];
- } else {
- x *= std::pow(static_cast<Real>(10), static_cast<Real>(-scale));
- }
- return x;
- }
-
- static Real ToReal(Decimal256 decimal, int32_t scale) {
- if (decimal.little_endian_array()[3] & (1ULL << 63)) {
- // Convert the absolute value to avoid precision loss
- decimal.Negate();
- return -ToRealPositive(decimal, scale);
- } else {
- return ToRealPositive(decimal, scale);
- }
- }
-};
-
-struct Decimal256FloatConversion
- : public Decimal256RealConversion<float, Decimal256FloatConversion> {
- static constexpr const float* powers_of_ten() { return kFloatPowersOfTen76; }
-
- static float two_to_64(float x) { return x * 1.8446744e+19f; }
- static float two_to_128(float x) { return x == 0 ? 0 : INFINITY; }
- static float two_to_192(float x) { return x == 0 ? 0 : INFINITY; }
-};
-
-struct Decimal256DoubleConversion
- : public Decimal256RealConversion<double, Decimal256DoubleConversion> {
- static constexpr const double* powers_of_ten() { return kDoublePowersOfTen76; }
-
- static double two_to_64(double x) { return x * 1.8446744073709552e+19; }
- static double two_to_128(double x) { return x * 3.402823669209385e+38; }
- static double two_to_192(double x) { return x * 6.277101735386681e+57; }
-};
-
-} // namespace
-
-Result<Decimal256> Decimal256::FromReal(float x, int32_t precision, int32_t scale) {
- return Decimal256FloatConversion::FromReal(x, precision, scale);
-}
-
-Result<Decimal256> Decimal256::FromReal(double x, int32_t precision, int32_t scale) {
- return Decimal256DoubleConversion::FromReal(x, precision, scale);
-}
-
-float Decimal256::ToFloat(int32_t scale) const {
- return Decimal256FloatConversion::ToReal(*this, scale);
-}
-
-double Decimal256::ToDouble(int32_t scale) const {
- return Decimal256DoubleConversion::ToReal(*this, scale);
-}
-
-std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) {
+ return arrow::ToArrowStatus(dstatus, 128);
+}
+
+std::ostream& operator<<(std::ostream& os, const Decimal128& decimal) {
+ os << decimal.ToIntegerString();
+ return os;
+}
+
+Decimal256::Decimal256(const std::string& str) : Decimal256() {
+ *this = Decimal256::FromString(str).ValueOrDie();
+}
+
+std::string Decimal256::ToIntegerString() const {
+ std::string result;
+ if (static_cast<int64_t>(little_endian_array()[3]) < 0) {
+ result.push_back('-');
+ Decimal256 abs = *this;
+ abs.Negate();
+ AppendLittleEndianArrayToString(abs.little_endian_array(), &result);
+ } else {
+ AppendLittleEndianArrayToString(little_endian_array(), &result);
+ }
+ return result;
+}
+
+std::string Decimal256::ToString(int32_t scale) const {
+ std::string str(ToIntegerString());
+ AdjustIntegerStringWithScale(scale, &str);
+ return str;
+}
+
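As a quick aside for review, a minimal standalone sketch of how the two string forms above relate (assuming only the public header shown in this diff):

#include <iostream>
#include "arrow/util/decimal.h"

int main() {
  // The unscaled integer 12345 rendered with scale 2 gives "123.45".
  arrow::Decimal256 d("12345");
  std::cout << d.ToIntegerString() << "\n";  // prints 12345
  std::cout << d.ToString(2) << "\n";        // prints 123.45
  return 0;
}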
+Status Decimal256::FromString(const util::string_view& s, Decimal256* out,
+ int32_t* precision, int32_t* scale) {
+ if (s.empty()) {
+ return Status::Invalid("Empty string cannot be converted to decimal");
+ }
+
+ DecimalComponents dec;
+ if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
+ return Status::Invalid("The string '", s, "' is not a valid decimal number");
+ }
+
+ // Count number of significant digits (without leading zeros)
+ size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
+ size_t significant_digits = dec.fractional_digits.size();
+ if (first_non_zero != std::string::npos) {
+ significant_digits += dec.whole_digits.size() - first_non_zero;
+ }
+
+ if (precision != nullptr) {
+ *precision = static_cast<int32_t>(significant_digits);
+ }
+
+ if (scale != nullptr) {
+ if (dec.has_exponent) {
+ auto adjusted_exponent = dec.exponent;
+ auto len = static_cast<int32_t>(significant_digits);
+ *scale = -adjusted_exponent + len - 1;
+ } else {
+ *scale = static_cast<int32_t>(dec.fractional_digits.size());
+ }
+ }
+
+ if (out != nullptr) {
+ std::array<uint64_t, 4> little_endian_array = {0, 0, 0, 0};
+ ShiftAndAdd(dec.whole_digits, little_endian_array.data(), little_endian_array.size());
+ ShiftAndAdd(dec.fractional_digits, little_endian_array.data(),
+ little_endian_array.size());
+ *out = Decimal256(little_endian_array);
+
+ if (dec.sign == '-') {
+ out->Negate();
+ }
+ }
+
+ return Status::OK();
+}
+
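To make the precision/scale bookkeeping above concrete, a hedged sketch (error handling via the macros this file already uses):

#include "arrow/status.h"
#include "arrow/util/decimal.h"

arrow::Status ParseExample() {
  arrow::Decimal256 value;
  int32_t precision = 0;
  int32_t scale = 0;
  ARROW_RETURN_NOT_OK(
      arrow::Decimal256::FromString("123.45", &value, &precision, &scale));
  // precision == 5 (significant digits), scale == 2 (fractional digits);
  // `value` holds the unscaled integer 12345.
  return arrow::Status::OK();
}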
+Status Decimal256::FromString(const std::string& s, Decimal256* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Status Decimal256::FromString(const char* s, Decimal256* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Result<Decimal256> Decimal256::FromString(const util::string_view& s) {
+ Decimal256 out;
+ RETURN_NOT_OK(FromString(s, &out, nullptr, nullptr));
+ return std::move(out);
+}
+
+Result<Decimal256> Decimal256::FromString(const std::string& s) {
+ return FromString(util::string_view(s));
+}
+
+Result<Decimal256> Decimal256::FromString(const char* s) {
+ return FromString(util::string_view(s));
+}
+
+Result<Decimal256> Decimal256::FromBigEndian(const uint8_t* bytes, int32_t length) {
+ static constexpr int32_t kMinDecimalBytes = 1;
+ static constexpr int32_t kMaxDecimalBytes = 32;
+
+ std::array<uint64_t, 4> little_endian_array;
+
+ if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
+ return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ",
+ "was ", length, ", but must be between ", kMinDecimalBytes,
+ " and ", kMaxDecimalBytes);
+ }
+
+ // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the
+ // sign bit.
+ const bool is_negative = static_cast<int8_t>(bytes[0]) < 0;
+
+ for (int word_idx = 0; word_idx < 4; word_idx++) {
+ const int32_t word_length = std::min(length, static_cast<int32_t>(sizeof(uint64_t)));
+
+ if (word_length == 8) {
+      // Full words can be assigned as-is (the shift below would be UB for a full word).
+ little_endian_array[word_idx] =
+ UInt64FromBigEndian(bytes + length - word_length, word_length);
+ } else {
+      // Sign-extend the word if necessary.
+ uint64_t word = -1 * is_negative;
+ if (length > 0) {
+ // Incorporate the actual values if present.
+        // Shift left enough bits to make room for the incoming bytes.
+        word = SafeLeftShift(word, word_length * CHAR_BIT);
+        // Preserve the upper bits by OR-ing in the loaded word in place.
+ word |= UInt64FromBigEndian(bytes + length - word_length, word_length);
+ }
+ little_endian_array[word_idx] = word;
+ }
+ // Move on to the next word.
+ length -= word_length;
+ }
+
+ return Decimal256(little_endian_array);
+}
+
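A hedged sketch of the sign extension performed above: an all-0xFF big-endian buffer decodes to -1 regardless of its length, because the words not covered by the input are filled with ones.

#include <cstdint>
#include "arrow/util/decimal.h"

arrow::Result<arrow::Decimal256> NegativeOneFromTwoBytes() {
  // 0xFFFF is -1 in big-endian two's complement; the remaining 30 bytes
  // of the Decimal256 are supplied by sign extension.
  const uint8_t bytes[] = {0xFF, 0xFF};
  return arrow::Decimal256::FromBigEndian(bytes, 2);
}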
+Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const {
+ return arrow::ToArrowStatus(dstatus, 256);
+}
+
+namespace {
+
+template <typename Real, typename Derived>
+struct Decimal256RealConversion {
+ static Result<Decimal256> FromPositiveReal(Real real, int32_t precision,
+ int32_t scale) {
+ auto x = real;
+ if (scale >= -76 && scale <= 76) {
+ x *= Derived::powers_of_ten()[scale + 76];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(scale));
+ }
+ x = std::nearbyint(x);
+ const auto max_abs = Derived::powers_of_ten()[precision + 76];
+ if (x >= max_abs) {
+ return Status::Invalid("Cannot convert ", real,
+ " to Decimal256(precision = ", precision,
+ ", scale = ", scale, "): overflow");
+ }
+ // Extract parts
+ const auto part3 = std::floor(std::ldexp(x, -192));
+ x -= std::ldexp(part3, 192);
+ const auto part2 = std::floor(std::ldexp(x, -128));
+ x -= std::ldexp(part2, 128);
+ const auto part1 = std::floor(std::ldexp(x, -64));
+ x -= std::ldexp(part1, 64);
+ const auto part0 = x;
+
+ DCHECK_GE(part3, 0);
+ DCHECK_LT(part3, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part2, 0);
+ DCHECK_LT(part2, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part1, 0);
+ DCHECK_LT(part1, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part0, 0);
+ DCHECK_LT(part0, 1.8446744073709552e+19); // 2**64
+ return Decimal256(std::array<uint64_t, 4>{
+ static_cast<uint64_t>(part0), static_cast<uint64_t>(part1),
+ static_cast<uint64_t>(part2), static_cast<uint64_t>(part3)});
+ }
+
+ static Result<Decimal256> FromReal(Real x, int32_t precision, int32_t scale) {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 76);
+
+ if (!std::isfinite(x)) {
+ return Status::Invalid("Cannot convert ", x, " to Decimal256");
+ }
+ if (x < 0) {
+ ARROW_ASSIGN_OR_RAISE(auto dec, FromPositiveReal(-x, precision, scale));
+ return dec.Negate();
+ } else {
+ // Includes negative zero
+ return FromPositiveReal(x, precision, scale);
+ }
+ }
+
+ static Real ToRealPositive(const Decimal256& decimal, int32_t scale) {
+ DCHECK_GE(decimal, 0);
+ Real x = 0;
+ const auto& parts = decimal.little_endian_array();
+ x += Derived::two_to_192(static_cast<Real>(parts[3]));
+ x += Derived::two_to_128(static_cast<Real>(parts[2]));
+ x += Derived::two_to_64(static_cast<Real>(parts[1]));
+ x += static_cast<Real>(parts[0]);
+ if (scale >= -76 && scale <= 76) {
+ x *= Derived::powers_of_ten()[-scale + 76];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(-scale));
+ }
+ return x;
+ }
+
+ static Real ToReal(Decimal256 decimal, int32_t scale) {
+ if (decimal.little_endian_array()[3] & (1ULL << 63)) {
+ // Convert the absolute value to avoid precision loss
+ decimal.Negate();
+ return -ToRealPositive(decimal, scale);
+ } else {
+ return ToRealPositive(decimal, scale);
+ }
+ }
+};
+
+struct Decimal256FloatConversion
+ : public Decimal256RealConversion<float, Decimal256FloatConversion> {
+ static constexpr const float* powers_of_ten() { return kFloatPowersOfTen76; }
+
+ static float two_to_64(float x) { return x * 1.8446744e+19f; }
+ static float two_to_128(float x) { return x == 0 ? 0 : INFINITY; }
+ static float two_to_192(float x) { return x == 0 ? 0 : INFINITY; }
+};
+
+struct Decimal256DoubleConversion
+ : public Decimal256RealConversion<double, Decimal256DoubleConversion> {
+ static constexpr const double* powers_of_ten() { return kDoublePowersOfTen76; }
+
+ static double two_to_64(double x) { return x * 1.8446744073709552e+19; }
+ static double two_to_128(double x) { return x * 3.402823669209385e+38; }
+ static double two_to_192(double x) { return x * 6.277101735386681e+57; }
+};
+
+} // namespace
+
+Result<Decimal256> Decimal256::FromReal(float x, int32_t precision, int32_t scale) {
+ return Decimal256FloatConversion::FromReal(x, precision, scale);
+}
+
+Result<Decimal256> Decimal256::FromReal(double x, int32_t precision, int32_t scale) {
+ return Decimal256DoubleConversion::FromReal(x, precision, scale);
+}
+
+float Decimal256::ToFloat(int32_t scale) const {
+ return Decimal256FloatConversion::ToReal(*this, scale);
+}
+
+double Decimal256::ToDouble(int32_t scale) const {
+ return Decimal256DoubleConversion::ToReal(*this, scale);
+}
+
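Tying the conversions above together, a minimal round-trip sketch using a value that is exactly representable in binary:

#include "arrow/result.h"
#include "arrow/util/decimal.h"

arrow::Status RoundTripExample() {
  // 1.5 at scale 1 becomes the unscaled integer 15 (precision 2 suffices).
  ARROW_ASSIGN_OR_RAISE(
      auto dec, arrow::Decimal256::FromReal(1.5, /*precision=*/2, /*scale=*/1));
  double back = dec.ToDouble(/*scale=*/1);  // back == 1.5
  (void)back;
  return arrow::Status::OK();
}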
+std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) {
os << decimal.ToIntegerString();
return os;
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
index 4a158728833..4c6cc9dd1db 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
@@ -55,8 +55,8 @@ class ARROW_EXPORT Decimal128 : public BasicDecimal128 {
/// \endcond
/// \brief constructor creates a Decimal128 from a BasicDecimal128.
- constexpr Decimal128(const BasicDecimal128& value) noexcept // NOLINT runtime/explicit
- : BasicDecimal128(value) {}
+ constexpr Decimal128(const BasicDecimal128& value) noexcept // NOLINT runtime/explicit
+ : BasicDecimal128(value) {}
/// \brief Parse the number from a base 10 string representation.
explicit Decimal128(const std::string& value);
@@ -173,119 +173,119 @@ struct Decimal128::ToRealConversion<double> {
}
};
-/// Represents a signed 256-bit integer in two's complement.
-/// The max decimal precision that can be safely represented is
-/// 76 significant digits.
-///
-/// The implementation is split into two parts :
-///
-/// 1. BasicDecimal256
-/// - can be safely compiled to IR without references to libstdc++.
-/// 2. Decimal256
-/// - (TODO) has additional functionality on top of BasicDecimal256 to deal with
-/// strings and streams.
-class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
- public:
- /// \cond FALSE
- // (need to avoid a duplicate definition in Sphinx)
- using BasicDecimal256::BasicDecimal256;
- /// \endcond
-
- /// \brief constructor creates a Decimal256 from a BasicDecimal256.
- constexpr Decimal256(const BasicDecimal256& value) noexcept : BasicDecimal256(value) {}
-
- /// \brief Parse the number from a base 10 string representation.
- explicit Decimal256(const std::string& value);
-
- /// \brief Empty constructor creates a Decimal256 with a value of 0.
- // This is required on some older compilers.
- constexpr Decimal256() noexcept : BasicDecimal256() {}
-
- /// \brief Convert the Decimal256 value to a base 10 decimal string with the given
- /// scale.
- std::string ToString(int32_t scale) const;
-
- /// \brief Convert the value to an integer string
- std::string ToIntegerString() const;
-
- /// \brief Convert a decimal string to a Decimal256 value, optionally including
- /// precision and scale if they're passed in and not null.
- static Status FromString(const util::string_view& s, Decimal256* out,
- int32_t* precision, int32_t* scale = NULLPTR);
- static Status FromString(const std::string& s, Decimal256* out, int32_t* precision,
- int32_t* scale = NULLPTR);
- static Status FromString(const char* s, Decimal256* out, int32_t* precision,
- int32_t* scale = NULLPTR);
- static Result<Decimal256> FromString(const util::string_view& s);
- static Result<Decimal256> FromString(const std::string& s);
- static Result<Decimal256> FromString(const char* s);
-
- /// \brief Convert Decimal256 from one scale to another
- Result<Decimal256> Rescale(int32_t original_scale, int32_t new_scale) const {
- Decimal256 out;
- auto dstatus = BasicDecimal256::Rescale(original_scale, new_scale, &out);
- ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
- return std::move(out);
- }
-
- /// Divide this number by right and return the result.
- ///
- /// This operation is not destructive.
- /// The answer rounds to zero. Signs work like:
- /// 21 / 5 -> 4, 1
- /// -21 / 5 -> -4, -1
- /// 21 / -5 -> -4, 1
- /// -21 / -5 -> 4, -1
- /// \param[in] divisor the number to divide by
- /// \return the pair of the quotient and the remainder
- Result<std::pair<Decimal256, Decimal256>> Divide(const Decimal256& divisor) const {
- std::pair<Decimal256, Decimal256> result;
- auto dstatus = BasicDecimal256::Divide(divisor, &result.first, &result.second);
- ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
- return std::move(result);
- }
-
- /// \brief Convert from a big-endian byte representation. The length must be
- /// between 1 and 32.
- /// \return error status if the length is an invalid value
- static Result<Decimal256> FromBigEndian(const uint8_t* data, int32_t length);
-
- static Result<Decimal256> FromReal(double real, int32_t precision, int32_t scale);
- static Result<Decimal256> FromReal(float real, int32_t precision, int32_t scale);
-
- /// \brief Convert to a floating-point number (scaled).
- /// May return infinity in case of overflow.
- float ToFloat(int32_t scale) const;
- /// \brief Convert to a floating-point number (scaled)
- double ToDouble(int32_t scale) const;
-
- /// \brief Convert to a floating-point number (scaled)
- template <typename T>
- T ToReal(int32_t scale) const {
- return ToRealConversion<T>::ToReal(*this, scale);
- }
-
- friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
- const Decimal256& decimal);
-
- private:
- /// Converts internal error code to Status
- Status ToArrowStatus(DecimalStatus dstatus) const;
-
- template <typename T>
- struct ToRealConversion {};
-};
-
-template <>
-struct Decimal256::ToRealConversion<float> {
- static float ToReal(const Decimal256& dec, int32_t scale) { return dec.ToFloat(scale); }
-};
-
-template <>
-struct Decimal256::ToRealConversion<double> {
- static double ToReal(const Decimal256& dec, int32_t scale) {
- return dec.ToDouble(scale);
- }
-};
-
+/// Represents a signed 256-bit integer in two's complement.
+/// The max decimal precision that can be safely represented is
+/// 76 significant digits.
+///
+/// The implementation is split into two parts:
+///
+/// 1. BasicDecimal256
+/// - can be safely compiled to IR without references to libstdc++.
+/// 2. Decimal256
+/// - (TODO) has additional functionality on top of BasicDecimal256 to deal with
+/// strings and streams.
+class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
+ public:
+ /// \cond FALSE
+ // (need to avoid a duplicate definition in Sphinx)
+ using BasicDecimal256::BasicDecimal256;
+ /// \endcond
+
+ /// \brief constructor creates a Decimal256 from a BasicDecimal256.
+ constexpr Decimal256(const BasicDecimal256& value) noexcept : BasicDecimal256(value) {}
+
+ /// \brief Parse the number from a base 10 string representation.
+ explicit Decimal256(const std::string& value);
+
+ /// \brief Empty constructor creates a Decimal256 with a value of 0.
+ // This is required on some older compilers.
+ constexpr Decimal256() noexcept : BasicDecimal256() {}
+
+ /// \brief Convert the Decimal256 value to a base 10 decimal string with the given
+ /// scale.
+ std::string ToString(int32_t scale) const;
+
+ /// \brief Convert the value to an integer string
+ std::string ToIntegerString() const;
+
+ /// \brief Convert a decimal string to a Decimal256 value, optionally including
+ /// precision and scale if they're passed in and not null.
+ static Status FromString(const util::string_view& s, Decimal256* out,
+ int32_t* precision, int32_t* scale = NULLPTR);
+ static Status FromString(const std::string& s, Decimal256* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Status FromString(const char* s, Decimal256* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Result<Decimal256> FromString(const util::string_view& s);
+ static Result<Decimal256> FromString(const std::string& s);
+ static Result<Decimal256> FromString(const char* s);
+
+ /// \brief Convert Decimal256 from one scale to another
+ Result<Decimal256> Rescale(int32_t original_scale, int32_t new_scale) const {
+ Decimal256 out;
+ auto dstatus = BasicDecimal256::Rescale(original_scale, new_scale, &out);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(out);
+ }
+
+  /// Divide this number by the given divisor and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The quotient rounds toward zero. Signs work like:
+ /// 21 / 5 -> 4, 1
+ /// -21 / 5 -> -4, -1
+ /// 21 / -5 -> -4, 1
+ /// -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \return the pair of the quotient and the remainder
+ Result<std::pair<Decimal256, Decimal256>> Divide(const Decimal256& divisor) const {
+ std::pair<Decimal256, Decimal256> result;
+ auto dstatus = BasicDecimal256::Divide(divisor, &result.first, &result.second);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(result);
+ }
+
+ /// \brief Convert from a big-endian byte representation. The length must be
+ /// between 1 and 32.
+ /// \return error status if the length is an invalid value
+ static Result<Decimal256> FromBigEndian(const uint8_t* data, int32_t length);
+
+ static Result<Decimal256> FromReal(double real, int32_t precision, int32_t scale);
+ static Result<Decimal256> FromReal(float real, int32_t precision, int32_t scale);
+
+ /// \brief Convert to a floating-point number (scaled).
+ /// May return infinity in case of overflow.
+ float ToFloat(int32_t scale) const;
+ /// \brief Convert to a floating-point number (scaled)
+ double ToDouble(int32_t scale) const;
+
+ /// \brief Convert to a floating-point number (scaled)
+ template <typename T>
+ T ToReal(int32_t scale) const {
+ return ToRealConversion<T>::ToReal(*this, scale);
+ }
+
+ friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
+ const Decimal256& decimal);
+
+ private:
+ /// Converts internal error code to Status
+ Status ToArrowStatus(DecimalStatus dstatus) const;
+
+ template <typename T>
+ struct ToRealConversion {};
+};
+
+template <>
+struct Decimal256::ToRealConversion<float> {
+ static float ToReal(const Decimal256& dec, int32_t scale) { return dec.ToFloat(scale); }
+};
+
+template <>
+struct Decimal256::ToRealConversion<double> {
+ static double ToReal(const Decimal256& dec, int32_t scale) {
+ return dec.ToDouble(scale);
+ }
+};
+
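A hedged usage sketch for the Rescale and Divide wrappers declared above (standalone; errors propagate through Result exactly as in the header):

#include "arrow/result.h"
#include "arrow/util/decimal.h"

arrow::Status DivideExample() {
  arrow::Decimal256 numerator("21");
  arrow::Decimal256 divisor("5");
  ARROW_ASSIGN_OR_RAISE(auto qr, numerator.Divide(divisor));
  // Truncating division, as documented above: quotient 4, remainder 1.
  ARROW_ASSIGN_OR_RAISE(auto rescaled, qr.first.Rescale(/*original_scale=*/0,
                                                        /*new_scale=*/2));
  // `rescaled` now holds the unscaled integer 400, i.e. 4.00 at scale 2.
  (void)qr.second;
  (void)rescaled;
  return arrow::Status::OK();
}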
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
index fe1b6ea3126..a499fdd2562 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
@@ -17,7 +17,7 @@
#include "arrow/util/delimiting.h"
#include "arrow/buffer.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/logging.h"
namespace arrow {
@@ -61,35 +61,35 @@ class NewlineBoundaryFinder : public BoundaryFinder {
return Status::OK();
}
- Status FindNth(util::string_view partial, util::string_view block, int64_t count,
- int64_t* out_pos, int64_t* num_found) override {
- DCHECK(partial.find_first_of(newline_delimiters) == util::string_view::npos);
-
- int64_t found = 0;
- int64_t pos = kNoDelimiterFound;
-
- auto cur_pos = block.find_first_of(newline_delimiters);
- while (cur_pos != util::string_view::npos) {
- if (block[cur_pos] == '\r' && cur_pos + 1 < block.length() &&
- block[cur_pos + 1] == '\n') {
- cur_pos += 2;
- } else {
- ++cur_pos;
- }
-
- pos = static_cast<int64_t>(cur_pos);
- if (++found >= count) {
- break;
- }
-
- cur_pos = block.find_first_of(newline_delimiters, cur_pos);
- }
-
- *out_pos = pos;
- *num_found = found;
- return Status::OK();
- }
-
+ Status FindNth(util::string_view partial, util::string_view block, int64_t count,
+ int64_t* out_pos, int64_t* num_found) override {
+ DCHECK(partial.find_first_of(newline_delimiters) == util::string_view::npos);
+
+ int64_t found = 0;
+ int64_t pos = kNoDelimiterFound;
+
+ auto cur_pos = block.find_first_of(newline_delimiters);
+ while (cur_pos != util::string_view::npos) {
+ if (block[cur_pos] == '\r' && cur_pos + 1 < block.length() &&
+ block[cur_pos + 1] == '\n') {
+ cur_pos += 2;
+ } else {
+ ++cur_pos;
+ }
+
+ pos = static_cast<int64_t>(cur_pos);
+ if (++found >= count) {
+ break;
+ }
+
+ cur_pos = block.find_first_of(newline_delimiters, cur_pos);
+ }
+
+ *out_pos = pos;
+ *num_found = found;
+ return Status::OK();
+ }
+
protected:
static constexpr const char* newline_delimiters = "\r\n";
};
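A hedged sketch of the FindNth contract implemented above. MakeNewlineBoundaryFinder is assumed to be the factory declared in delimiting.h, as in upstream Arrow; treat that name as an assumption if this tree differs:

#include "arrow/status.h"
#include "arrow/util/delimiting.h"

arrow::Status FindSecondNewline() {
  auto finder = arrow::MakeNewlineBoundaryFinder();
  int64_t pos = -1;
  int64_t num_found = 0;
  ARROW_RETURN_NOT_OK(finder->FindNth(/*partial=*/"", "a\nbb\nccc\n",
                                      /*count=*/2, &pos, &num_found));
  // num_found == 2; pos == 5, i.e. one past the second '\n'.
  return arrow::Status::OK();
}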
@@ -168,26 +168,26 @@ Status Chunker::ProcessFinal(std::shared_ptr<Buffer> partial,
return Status::OK();
}
-Status Chunker::ProcessSkip(std::shared_ptr<Buffer> partial,
- std::shared_ptr<Buffer> block, bool final, int64_t* count,
- std::shared_ptr<Buffer>* rest) {
- DCHECK_GT(*count, 0);
- int64_t pos;
- int64_t num_found;
- ARROW_RETURN_NOT_OK(boundary_finder_->FindNth(
- util::string_view(*partial), util::string_view(*block), *count, &pos, &num_found));
- if (pos == BoundaryFinder::kNoDelimiterFound) {
- return StraddlingTooLarge();
- }
- if (ARROW_PREDICT_FALSE(final && *count > num_found && block->size() != pos)) {
- // Skip the last row in the final block which does not have a delimiter
- ++num_found;
- *rest = SliceBuffer(block, 0, 0);
- } else {
- *rest = SliceBuffer(block, pos);
- }
- *count -= num_found;
- return Status::OK();
-}
-
+Status Chunker::ProcessSkip(std::shared_ptr<Buffer> partial,
+ std::shared_ptr<Buffer> block, bool final, int64_t* count,
+ std::shared_ptr<Buffer>* rest) {
+ DCHECK_GT(*count, 0);
+ int64_t pos;
+ int64_t num_found;
+ ARROW_RETURN_NOT_OK(boundary_finder_->FindNth(
+ util::string_view(*partial), util::string_view(*block), *count, &pos, &num_found));
+ if (pos == BoundaryFinder::kNoDelimiterFound) {
+ return StraddlingTooLarge();
+ }
+ if (ARROW_PREDICT_FALSE(final && *count > num_found && block->size() != pos)) {
+ // Skip the last row in the final block which does not have a delimiter
+ ++num_found;
+ *rest = SliceBuffer(block, 0, 0);
+ } else {
+ *rest = SliceBuffer(block, pos);
+ }
+ *count -= num_found;
+ return Status::OK();
+}
+
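A hedged sketch of how ProcessSkip is meant to be driven; construction of a concrete Chunker is out of scope for this diff, so the chunker is taken by reference here:

#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/util/delimiting.h"

arrow::Status SkipRows(arrow::Chunker& chunker,
                       std::shared_ptr<arrow::Buffer> partial,
                       std::shared_ptr<arrow::Buffer> block, bool final,
                       int64_t* count) {
  std::shared_ptr<arrow::Buffer> rest;
  ARROW_RETURN_NOT_OK(chunker.ProcessSkip(partial, block, final, count, &rest));
  // If *count > 0, `rest` becomes the next `partial`; otherwise `rest` holds
  // delimited data that still needs to be parsed.
  return arrow::Status::OK();
}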
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
index b4b868340db..0ffe652441d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
@@ -53,19 +53,19 @@ class ARROW_EXPORT BoundaryFinder {
/// `out_pos` will be -1 if no delimiter is found.
virtual Status FindLast(util::string_view block, int64_t* out_pos) = 0;
- /// \brief Find the position of the Nth delimiter inside the block
- ///
- /// `partial` is taken to be the beginning of the block, and `block`
- /// its continuation. Also, `partial` doesn't contain a delimiter.
- ///
- /// The returned `out_pos` is relative to `block`'s start and should point
- /// to the first character after the first delimiter.
- /// `out_pos` will be -1 if no delimiter is found.
- ///
- /// The returned `num_found` is the number of delimiters actually found
- virtual Status FindNth(util::string_view partial, util::string_view block,
- int64_t count, int64_t* out_pos, int64_t* num_found) = 0;
-
+ /// \brief Find the position of the Nth delimiter inside the block
+ ///
+ /// `partial` is taken to be the beginning of the block, and `block`
+ /// its continuation. Also, `partial` doesn't contain a delimiter.
+ ///
+ /// The returned `out_pos` is relative to `block`'s start and should point
+  /// to the first character after the Nth delimiter (or after the last
+  /// delimiter found, if fewer than `count` are present).
+  /// `out_pos` will be -1 if no delimiter is found.
+  ///
+  /// The returned `num_found` is the number of delimiters actually found.
+ virtual Status FindNth(util::string_view partial, util::string_view block,
+ int64_t count, int64_t* out_pos, int64_t* num_found) = 0;
+
static constexpr int64_t kNoDelimiterFound = -1;
protected:
@@ -151,27 +151,27 @@ class ARROW_EXPORT Chunker {
Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest);
- /// \brief Skip count number of rows
- /// Pre-conditions:
- /// - `partial` is the start of a valid block of delimited data
- /// (i.e. starts just after a delimiter)
- /// - `block` follows `partial` in file order
- ///
- /// Post-conditions:
- /// - `count` is updated to indicate the number of rows that still need to be skipped
- /// - If `count` is > 0 then `rest` is an incomplete block that should be a future
- /// `partial`
- /// - Else `rest` could be one or more valid blocks of delimited data which need to be
- /// parsed
- ///
- /// \param[in] partial incomplete delimited data
- /// \param[in] block delimited data following partial
- /// \param[in] final whether this is the final chunk
- /// \param[in,out] count number of rows that need to be skipped
- /// \param[out] rest subrange of block containing what was not skipped
- Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
- bool final, int64_t* count, std::shared_ptr<Buffer>* rest);
-
+  /// \brief Skip `count` rows
+ /// Pre-conditions:
+ /// - `partial` is the start of a valid block of delimited data
+ /// (i.e. starts just after a delimiter)
+ /// - `block` follows `partial` in file order
+ ///
+ /// Post-conditions:
+ /// - `count` is updated to indicate the number of rows that still need to be skipped
+ /// - If `count` is > 0 then `rest` is an incomplete block that should be a future
+ /// `partial`
+ /// - Else `rest` could be one or more valid blocks of delimited data which need to be
+ /// parsed
+ ///
+ /// \param[in] partial incomplete delimited data
+ /// \param[in] block delimited data following partial
+ /// \param[in] final whether this is the final chunk
+ /// \param[in,out] count number of rows that need to be skipped
+ /// \param[out] rest subrange of block containing what was not skipped
+ Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
+ bool final, int64_t* count, std::shared_ptr<Buffer>* rest);
+
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
index 0cb2e44d275..a1d953d12ad 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
@@ -1,181 +1,181 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#ifdef _WIN32
-#define ARROW_LITTLE_ENDIAN 1
-#else
-#if defined(__APPLE__) || defined(__FreeBSD__)
-#include <machine/endian.h> // IWYU pragma: keep
-#elif defined(sun) || defined(__sun)
-#include <sys/byteorder.h> // IWYU pragma: keep
-#else
-#include <endian.h> // IWYU pragma: keep
-#endif
-#
-#ifndef __BYTE_ORDER__
-#error "__BYTE_ORDER__ not defined"
-#endif
-#
-#ifndef __ORDER_LITTLE_ENDIAN__
-#error "__ORDER_LITTLE_ENDIAN__ not defined"
-#endif
-#
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define ARROW_LITTLE_ENDIAN 1
-#else
-#define ARROW_LITTLE_ENDIAN 0
-#endif
-#endif
-
-#if defined(_MSC_VER)
-#include <intrin.h> // IWYU pragma: keep
-#define ARROW_BYTE_SWAP64 _byteswap_uint64
-#define ARROW_BYTE_SWAP32 _byteswap_ulong
-#else
-#define ARROW_BYTE_SWAP64 __builtin_bswap64
-#define ARROW_BYTE_SWAP32 __builtin_bswap32
-#endif
-
-#include "arrow/util/type_traits.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace BitUtil {
-
-//
-// Byte-swap 16-bit, 32-bit and 64-bit values
-//
-
-// Swap the byte order (i.e. endianness)
-static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
-static inline uint64_t ByteSwap(uint64_t value) {
- return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
-}
-static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
-static inline uint32_t ByteSwap(uint32_t value) {
- return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
-}
-static inline int16_t ByteSwap(int16_t value) {
- constexpr auto m = static_cast<int16_t>(0xff);
- return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
-}
-static inline uint16_t ByteSwap(uint16_t value) {
- return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
-}
-static inline uint8_t ByteSwap(uint8_t value) { return value; }
-static inline int8_t ByteSwap(int8_t value) { return value; }
-static inline double ByteSwap(double value) {
- const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy<uint64_t>(value));
- return util::SafeCopy<double>(swapped);
-}
-static inline float ByteSwap(float value) {
- const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy<uint32_t>(value));
- return util::SafeCopy<float>(swapped);
-}
-
-// Write the swapped bytes into dst. Src and dst cannot overlap.
-static inline void ByteSwap(void* dst, const void* src, int len) {
- switch (len) {
- case 1:
- *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
- return;
- case 2:
- *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
- return;
- case 4:
- *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
- return;
- case 8:
- *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
- return;
- default:
- break;
- }
-
- auto d = reinterpret_cast<uint8_t*>(dst);
- auto s = reinterpret_cast<const uint8_t*>(src);
- for (int i = 0; i < len; ++i) {
- d[i] = s[len - i - 1];
- }
-}
-
-// Convert to little/big endian format from the machine's native endian format.
-#if ARROW_LITTLE_ENDIAN
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToBigEndian(T value) {
- return ByteSwap(value);
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToLittleEndian(T value) {
- return value;
-}
-#else
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToBigEndian(T value) {
- return value;
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToLittleEndian(T value) {
- return ByteSwap(value);
-}
-#endif
-
-// Convert from big/little endian format to the machine's native endian format.
-#if ARROW_LITTLE_ENDIAN
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromBigEndian(T value) {
- return ByteSwap(value);
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromLittleEndian(T value) {
- return value;
-}
-#else
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromBigEndian(T value) {
- return value;
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromLittleEndian(T value) {
- return ByteSwap(value);
-}
-#endif
-
-} // namespace BitUtil
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef _WIN32
+#define ARROW_LITTLE_ENDIAN 1
+#else
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <machine/endian.h> // IWYU pragma: keep
+#elif defined(sun) || defined(__sun)
+#include <sys/byteorder.h> // IWYU pragma: keep
+#else
+#include <endian.h> // IWYU pragma: keep
+#endif
+#
+#ifndef __BYTE_ORDER__
+#error "__BYTE_ORDER__ not defined"
+#endif
+#
+#ifndef __ORDER_LITTLE_ENDIAN__
+#error "__ORDER_LITTLE_ENDIAN__ not defined"
+#endif
+#
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define ARROW_LITTLE_ENDIAN 1
+#else
+#define ARROW_LITTLE_ENDIAN 0
+#endif
+#endif
+
+#if defined(_MSC_VER)
+#include <intrin.h> // IWYU pragma: keep
+#define ARROW_BYTE_SWAP64 _byteswap_uint64
+#define ARROW_BYTE_SWAP32 _byteswap_ulong
+#else
+#define ARROW_BYTE_SWAP64 __builtin_bswap64
+#define ARROW_BYTE_SWAP32 __builtin_bswap32
+#endif
+
+#include "arrow/util/type_traits.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace BitUtil {
+
+//
+// Byte-swap 16-bit, 32-bit and 64-bit values
+//
+
+// Swap the byte order (i.e. endianness)
+static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
+static inline uint64_t ByteSwap(uint64_t value) {
+ return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
+}
+static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
+static inline uint32_t ByteSwap(uint32_t value) {
+ return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
+}
+static inline int16_t ByteSwap(int16_t value) {
+ constexpr auto m = static_cast<int16_t>(0xff);
+ return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
+}
+static inline uint16_t ByteSwap(uint16_t value) {
+ return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+}
+static inline uint8_t ByteSwap(uint8_t value) { return value; }
+static inline int8_t ByteSwap(int8_t value) { return value; }
+static inline double ByteSwap(double value) {
+ const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy<uint64_t>(value));
+ return util::SafeCopy<double>(swapped);
+}
+static inline float ByteSwap(float value) {
+ const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy<uint32_t>(value));
+ return util::SafeCopy<float>(swapped);
+}
+
+// Write the swapped bytes into dst. Src and dst cannot overlap.
+static inline void ByteSwap(void* dst, const void* src, int len) {
+ switch (len) {
+ case 1:
+ *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
+ return;
+ case 2:
+ *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
+ return;
+ case 4:
+ *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
+ return;
+ case 8:
+ *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
+ return;
+ default:
+ break;
+ }
+
+ auto d = reinterpret_cast<uint8_t*>(dst);
+ auto s = reinterpret_cast<const uint8_t*>(src);
+ for (int i = 0; i < len; ++i) {
+ d[i] = s[len - i - 1];
+ }
+}
+
+// Convert to little/big endian format from the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToBigEndian(T value) {
+ return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToLittleEndian(T value) {
+ return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToBigEndian(T value) {
+ return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToLittleEndian(T value) {
+ return ByteSwap(value);
+}
+#endif
+
+// Convert from big/little endian format to the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromBigEndian(T value) {
+ return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromLittleEndian(T value) {
+ return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromBigEndian(T value) {
+ return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromLittleEndian(T value) {
+ return ByteSwap(value);
+}
+#endif
+
+} // namespace BitUtil
+} // namespace arrow
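A small self-check sketch of the round-trip property these templates guarantee on any host (one direction is a byte swap, the other an identity):

#include <cassert>
#include <cstdint>
#include "arrow/util/endian.h"

int main() {
  uint32_t x = 0x11223344;
  assert(arrow::BitUtil::FromLittleEndian(arrow::BitUtil::ToLittleEndian(x)) == x);
  assert(arrow::BitUtil::FromBigEndian(arrow::BitUtil::ToBigEndian(x)) == x);
  return 0;
}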
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
index c16d42ce5cf..efa8a997efe 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
@@ -43,29 +43,29 @@ struct FloatToStringFormatter::Impl {
: converter_(DoubleToStringConverter::EMIT_POSITIVE_EXPONENT_SIGN, "inf", "nan",
'e', -6, 10, 6, 0) {}
- Impl(int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
- int decimal_in_shortest_low, int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode)
- : converter_(flags, inf_symbol, nan_symbol, exp_character, decimal_in_shortest_low,
- decimal_in_shortest_high, max_leading_padding_zeroes_in_precision_mode,
- max_trailing_padding_zeroes_in_precision_mode) {}
-
+ Impl(int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
+ int decimal_in_shortest_low, int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : converter_(flags, inf_symbol, nan_symbol, exp_character, decimal_in_shortest_low,
+ decimal_in_shortest_high, max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode) {}
+
DoubleToStringConverter converter_;
};
FloatToStringFormatter::FloatToStringFormatter() : impl_(new Impl()) {}
-FloatToStringFormatter::FloatToStringFormatter(
- int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
- int decimal_in_shortest_low, int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode)
- : impl_(new Impl(flags, inf_symbol, nan_symbol, exp_character,
- decimal_in_shortest_low, decimal_in_shortest_high,
- max_leading_padding_zeroes_in_precision_mode,
- max_trailing_padding_zeroes_in_precision_mode)) {}
-
+FloatToStringFormatter::FloatToStringFormatter(
+ int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
+ int decimal_in_shortest_low, int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : impl_(new Impl(flags, inf_symbol, nan_symbol, exp_character,
+ decimal_in_shortest_low, decimal_in_shortest_high,
+ max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode)) {}
+
FloatToStringFormatter::~FloatToStringFormatter() {}
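A hedged sketch of the buffer-based API around FormatFloat below (the arrow::internal namespace is assumed, matching upstream Arrow):

#include <iostream>
#include "arrow/util/formatting.h"

void FormatExample() {
  arrow::internal::FloatToStringFormatter formatter;  // default flags
  char buffer[32];
  int n = formatter.FormatFloat(0.25f, buffer, static_cast<int>(sizeof(buffer)));
  std::cout.write(buffer, n) << "\n";  // prints the shortest form, "0.25"
}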
int FloatToStringFormatter::FormatFloat(float v, char* out_buffer, int out_size) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
index 566c9795f83..ac91ec6a123 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
@@ -31,7 +31,7 @@
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/double_conversion.h"
+#include "arrow/util/double_conversion.h"
#include "arrow/util/string_view.h"
#include "arrow/util/time.h"
#include "arrow/util/visibility.h"
@@ -220,11 +220,11 @@ class StringFormatter<UInt64Type> : public IntToStringFormatterMixin<UInt64Type>
class ARROW_EXPORT FloatToStringFormatter {
public:
FloatToStringFormatter();
- FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
- char exp_character, int decimal_in_shortest_low,
- int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode);
+ FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
+ char exp_character, int decimal_in_shortest_low,
+ int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode);
~FloatToStringFormatter();
// Returns the number of characters written
@@ -245,16 +245,16 @@ class FloatToStringFormatterMixin : public FloatToStringFormatter {
explicit FloatToStringFormatterMixin(const std::shared_ptr<DataType>& = NULLPTR) {}
- FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
- char exp_character, int decimal_in_shortest_low,
- int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode)
- : FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
- decimal_in_shortest_low, decimal_in_shortest_high,
- max_leading_padding_zeroes_in_precision_mode,
- max_trailing_padding_zeroes_in_precision_mode) {}
-
+ FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
+ char exp_character, int decimal_in_shortest_low,
+ int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
+ decimal_in_shortest_low, decimal_in_shortest_high,
+ max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode) {}
+
template <typename Appender>
Return<Appender> operator()(value_type value, Appender&& append) {
char buffer[buffer_size];
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
index 9da79046fec..30b3066d06e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
@@ -17,27 +17,27 @@
#pragma once
-#include <memory>
+#include <memory>
#include <tuple>
#include <type_traits>
-#include "arrow/result.h"
+#include "arrow/result.h"
#include "arrow/util/macros.h"
namespace arrow {
namespace internal {
-struct Empty {
- static Result<Empty> ToResult(Status s) {
- if (ARROW_PREDICT_TRUE(s.ok())) {
- return Empty{};
- }
- return s;
- }
-};
-
+struct Empty {
+ static Result<Empty> ToResult(Status s) {
+ if (ARROW_PREDICT_TRUE(s.ok())) {
+ return Empty{};
+ }
+ return s;
+ }
+};
+
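A hedged note on what Empty::ToResult above is for: lifting a plain Status into the Result-based continuation machinery, as sketched here.

#include <utility>
#include "arrow/result.h"
#include "arrow/util/functional.h"

arrow::Result<arrow::internal::Empty> Lift(arrow::Status st) {
  return arrow::internal::Empty::ToResult(std::move(st));
}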
/// Helper struct for examining lambdas and other callables.
-/// TODO(ARROW-12655) support function pointers
+/// TODO(ARROW-12655) support function pointers
struct call_traits {
public:
template <typename R, typename... A>
@@ -63,20 +63,20 @@ struct call_traits {
static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
R (F::*)(A...) const);
- template <std::size_t I, typename F, typename R, typename... A>
- static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
- R (F::*)(A...) &&);
-
- template <typename F, typename R, typename... A>
- static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...));
-
- template <typename F, typename R, typename... A>
- static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...)
- const);
-
- template <typename F, typename R, typename... A>
- static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...) &&);
-
+ template <std::size_t I, typename F, typename R, typename... A>
+ static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+ R (F::*)(A...) &&);
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...));
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...)
+ const);
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...) &&);
+
/// bool constant indicating whether F is a callable with more than one possible
/// signature. Will be true_type for objects which define multiple operator() or which
/// define a template operator()
@@ -97,64 +97,64 @@ struct call_traits {
using argument_type = decltype(argument_type_impl<I>(&std::decay<F>::type::operator()));
template <typename F>
- using argument_count = decltype(argument_count_impl(&std::decay<F>::type::operator()));
-
- template <typename F>
+ using argument_count = decltype(argument_count_impl(&std::decay<F>::type::operator()));
+
+ template <typename F>
using return_type = decltype(return_type_impl(&std::decay<F>::type::operator()));
template <typename F, typename T, typename RT = T>
using enable_if_return =
typename std::enable_if<std::is_same<return_type<F>, T>::value, RT>;
-
- template <typename T, typename R = void>
- using enable_if_empty = typename std::enable_if<std::is_same<T, Empty>::value, R>::type;
-
- template <typename T, typename R = void>
- using enable_if_not_empty =
- typename std::enable_if<!std::is_same<T, Empty>::value, R>::type;
-};
-
-/// A type erased callable object which may only be invoked once.
-/// It can be constructed from any lambda which matches the provided call signature.
-/// Invoking it results in destruction of the lambda, freeing any state/references
-/// immediately. Invoking a default constructed FnOnce or one which has already been
-/// invoked will segfault.
-template <typename Signature>
-class FnOnce;
-
-template <typename R, typename... A>
-class FnOnce<R(A...)> {
- public:
- FnOnce() = default;
-
- template <typename Fn,
- typename = typename std::enable_if<std::is_convertible<
- typename std::result_of<Fn && (A...)>::type, R>::value>::type>
- FnOnce(Fn fn) : impl_(new FnImpl<Fn>(std::move(fn))) { // NOLINT runtime/explicit
- }
-
- explicit operator bool() const { return impl_ != NULLPTR; }
-
- R operator()(A... a) && {
- auto bye = std::move(impl_);
- return bye->invoke(std::forward<A&&>(a)...);
- }
-
- private:
- struct Impl {
- virtual ~Impl() = default;
- virtual R invoke(A&&... a) = 0;
- };
-
- template <typename Fn>
- struct FnImpl : Impl {
- explicit FnImpl(Fn fn) : fn_(std::move(fn)) {}
- R invoke(A&&... a) override { return std::move(fn_)(std::forward<A&&>(a)...); }
- Fn fn_;
- };
-
- std::unique_ptr<Impl> impl_;
+
+ template <typename T, typename R = void>
+ using enable_if_empty = typename std::enable_if<std::is_same<T, Empty>::value, R>::type;
+
+ template <typename T, typename R = void>
+ using enable_if_not_empty =
+ typename std::enable_if<!std::is_same<T, Empty>::value, R>::type;
};
+/// A type erased callable object which may only be invoked once.
+/// It can be constructed from any lambda which matches the provided call signature.
+/// Invoking it results in destruction of the lambda, freeing any state/references
+/// immediately. Invoking a default constructed FnOnce or one which has already been
+/// invoked will segfault.
+template <typename Signature>
+class FnOnce;
+
+template <typename R, typename... A>
+class FnOnce<R(A...)> {
+ public:
+ FnOnce() = default;
+
+ template <typename Fn,
+ typename = typename std::enable_if<std::is_convertible<
+ typename std::result_of<Fn && (A...)>::type, R>::value>::type>
+ FnOnce(Fn fn) : impl_(new FnImpl<Fn>(std::move(fn))) { // NOLINT runtime/explicit
+ }
+
+ explicit operator bool() const { return impl_ != NULLPTR; }
+
+ R operator()(A... a) && {
+ auto bye = std::move(impl_);
+ return bye->invoke(std::forward<A&&>(a)...);
+ }
+
+ private:
+ struct Impl {
+ virtual ~Impl() = default;
+ virtual R invoke(A&&... a) = 0;
+ };
+
+ template <typename Fn>
+ struct FnImpl : Impl {
+ explicit FnImpl(Fn fn) : fn_(std::move(fn)) {}
+ R invoke(A&&... a) override { return std::move(fn_)(std::forward<A&&>(a)...); }
+ Fn fn_;
+ };
+
+ std::unique_ptr<Impl> impl_;
+};
+
} // namespace internal
} // namespace arrow
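
A minimal usage sketch of the FnOnce described above (editor's illustration, not part of the diff; it assumes the header is reachable as "arrow/util/functional.h"):

#include <cassert>
#include <iostream>
#include <memory>
#include <string>

#include "arrow/util/functional.h"

int main() {
  auto payload = std::make_unique<std::string>("hello");
  // The lambda owns `payload`; FnOnce type-erases it behind void(int).
  arrow::internal::FnOnce<void(int)> once =
      [p = std::move(payload)](int x) { std::cout << *p << ' ' << x << '\n'; };
  assert(once);          // holds a callable
  std::move(once)(42);   // rvalue invocation; the lambda (and payload) are freed
  assert(!once);         // now empty: invoking again would crash, as documented
  return 0;
}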
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
index f288a15be3f..640ff63655a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
@@ -26,7 +26,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
-#include "arrow/util/thread_pool.h"
+#include "arrow/util/thread_pool.h"
namespace arrow {
@@ -40,8 +40,8 @@ using internal::checked_cast;
// should ideally not limit scalability.
static std::mutex global_waiter_mutex;
-const double FutureWaiter::kInfinity = HUGE_VAL;
-
+const double FutureWaiter::kInfinity = HUGE_VAL;
+
class FutureWaiterImpl : public FutureWaiter {
public:
FutureWaiterImpl(Kind kind, std::vector<FutureImpl*> futures)
@@ -76,7 +76,7 @@ class FutureWaiterImpl : public FutureWaiter {
}
}
- ~FutureWaiterImpl() override {
+ ~FutureWaiterImpl() override {
for (auto future : futures_) {
future->RemoveWaiter(this);
}
@@ -177,9 +177,9 @@ FutureWaiterImpl* GetConcreteWaiter(FutureWaiter* waiter) {
} // namespace
-FutureWaiter::FutureWaiter() = default;
+FutureWaiter::FutureWaiter() = default;
-FutureWaiter::~FutureWaiter() = default;
+FutureWaiter::~FutureWaiter() = default;
std::unique_ptr<FutureWaiter> FutureWaiter::Make(Kind kind,
std::vector<FutureImpl*> futures) {
@@ -232,70 +232,70 @@ class ConcreteFutureImpl : public FutureImpl {
void DoMarkFailed() { DoMarkFinishedOrFailed(FutureState::FAILURE); }
- void CheckOptions(const CallbackOptions& opts) {
- if (opts.should_schedule != ShouldSchedule::Never) {
- DCHECK_NE(opts.executor, nullptr)
- << "An executor must be specified when adding a callback that might schedule";
- }
- }
-
- void AddCallback(Callback callback, CallbackOptions opts) {
- CheckOptions(opts);
- std::unique_lock<std::mutex> lock(mutex_);
- CallbackRecord callback_record{std::move(callback), opts};
- if (IsFutureFinished(state_)) {
- lock.unlock();
- RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/true);
- } else {
- callbacks_.push_back(std::move(callback_record));
- }
- }
-
- bool TryAddCallback(const std::function<Callback()>& callback_factory,
- CallbackOptions opts) {
- CheckOptions(opts);
- std::unique_lock<std::mutex> lock(mutex_);
- if (IsFutureFinished(state_)) {
- return false;
- } else {
- callbacks_.push_back({callback_factory(), opts});
- return true;
- }
- }
-
- bool ShouldScheduleCallback(const CallbackRecord& callback_record,
- bool in_add_callback) {
- switch (callback_record.options.should_schedule) {
- case ShouldSchedule::Never:
- return false;
- case ShouldSchedule::Always:
- return true;
- case ShouldSchedule::IfUnfinished:
- return !in_add_callback;
- case ShouldSchedule::IfDifferentExecutor:
- return !callback_record.options.executor->OwnsThisThread();
- default:
- DCHECK(false) << "Unrecognized ShouldSchedule option";
- return false;
- }
- }
-
- void RunOrScheduleCallback(CallbackRecord&& callback_record, bool in_add_callback) {
- if (ShouldScheduleCallback(callback_record, in_add_callback)) {
- struct CallbackTask {
- void operator()() { std::move(callback)(*self); }
-
- Callback callback;
- std::shared_ptr<FutureImpl> self;
- };
- // Need to keep `this` alive until the callback has a chance to be scheduled.
- CallbackTask task{std::move(callback_record.callback), shared_from_this()};
- DCHECK_OK(callback_record.options.executor->Spawn(std::move(task)));
- } else {
- std::move(callback_record.callback)(*this);
- }
- }
-
+ void CheckOptions(const CallbackOptions& opts) {
+ if (opts.should_schedule != ShouldSchedule::Never) {
+ DCHECK_NE(opts.executor, nullptr)
+ << "An executor must be specified when adding a callback that might schedule";
+ }
+ }
+
+ void AddCallback(Callback callback, CallbackOptions opts) {
+ CheckOptions(opts);
+ std::unique_lock<std::mutex> lock(mutex_);
+ CallbackRecord callback_record{std::move(callback), opts};
+ if (IsFutureFinished(state_)) {
+ lock.unlock();
+ RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/true);
+ } else {
+ callbacks_.push_back(std::move(callback_record));
+ }
+ }
+
+ bool TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts) {
+ CheckOptions(opts);
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (IsFutureFinished(state_)) {
+ return false;
+ } else {
+ callbacks_.push_back({callback_factory(), opts});
+ return true;
+ }
+ }
+
+ bool ShouldScheduleCallback(const CallbackRecord& callback_record,
+ bool in_add_callback) {
+ switch (callback_record.options.should_schedule) {
+ case ShouldSchedule::Never:
+ return false;
+ case ShouldSchedule::Always:
+ return true;
+ case ShouldSchedule::IfUnfinished:
+ return !in_add_callback;
+ case ShouldSchedule::IfDifferentExecutor:
+ return !callback_record.options.executor->OwnsThisThread();
+ default:
+ DCHECK(false) << "Unrecognized ShouldSchedule option";
+ return false;
+ }
+ }
+
+ void RunOrScheduleCallback(CallbackRecord&& callback_record, bool in_add_callback) {
+ if (ShouldScheduleCallback(callback_record, in_add_callback)) {
+ struct CallbackTask {
+ void operator()() { std::move(callback)(*self); }
+
+ Callback callback;
+ std::shared_ptr<FutureImpl> self;
+ };
+ // Need to keep `this` alive until the callback has a chance to be scheduled.
+ CallbackTask task{std::move(callback_record.callback), shared_from_this()};
+ DCHECK_OK(callback_record.options.executor->Spawn(std::move(task)));
+ } else {
+ std::move(callback_record.callback)(*this);
+ }
+ }
+
void DoMarkFinishedOrFailed(FutureState state) {
{
// Lock the hypothetical waiter first, and the future after.
@@ -310,17 +310,17 @@ class ConcreteFutureImpl : public FutureImpl {
}
}
cv_.notify_all();
-
- // run callbacks, lock not needed since the future is finished by this
- // point so nothing else can modify the callbacks list and it is safe
- // to iterate.
- //
- // In fact, it is important not to hold the locks because the callback
- // may be slow or do its own locking on other resources
- for (auto& callback_record : callbacks_) {
- RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/false);
- }
- callbacks_.clear();
+
+ // run callbacks, lock not needed since the future is finished by this
+ // point so nothing else can modify the callbacks list and it is safe
+ // to iterate.
+ //
+ // In fact, it is important not to hold the locks because the callback
+ // may be slow or do its own locking on other resources
+ for (auto& callback_record : callbacks_) {
+ RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/false);
+ }
+ callbacks_.clear();
}
void DoWait() {
@@ -355,12 +355,12 @@ std::unique_ptr<FutureImpl> FutureImpl::Make() {
return std::unique_ptr<FutureImpl>(new ConcreteFutureImpl());
}
-std::unique_ptr<FutureImpl> FutureImpl::MakeFinished(FutureState state) {
- std::unique_ptr<ConcreteFutureImpl> ptr(new ConcreteFutureImpl());
- ptr->state_ = state;
- return std::move(ptr);
-}
-
+std::unique_ptr<FutureImpl> FutureImpl::MakeFinished(FutureState state) {
+ std::unique_ptr<ConcreteFutureImpl> ptr(new ConcreteFutureImpl());
+ ptr->state_ = state;
+ return std::move(ptr);
+}
+
FutureImpl::FutureImpl() : state_(FutureState::PENDING) {}
FutureState FutureImpl::SetWaiter(FutureWaiter* w, int future_num) {
@@ -379,43 +379,43 @@ void FutureImpl::MarkFinished() { GetConcreteFuture(this)->DoMarkFinished(); }
void FutureImpl::MarkFailed() { GetConcreteFuture(this)->DoMarkFailed(); }
-void FutureImpl::AddCallback(Callback callback, CallbackOptions opts) {
- GetConcreteFuture(this)->AddCallback(std::move(callback), opts);
-}
-
-bool FutureImpl::TryAddCallback(const std::function<Callback()>& callback_factory,
- CallbackOptions opts) {
- return GetConcreteFuture(this)->TryAddCallback(callback_factory, opts);
-}
-
-Future<> AllComplete(const std::vector<Future<>>& futures) {
- struct State {
- explicit State(int64_t n_futures) : mutex(), n_remaining(n_futures) {}
-
- std::mutex mutex;
- std::atomic<size_t> n_remaining;
- };
-
- if (futures.empty()) {
- return Future<>::MakeFinished();
- }
-
- auto state = std::make_shared<State>(futures.size());
- auto out = Future<>::Make();
- for (const auto& future : futures) {
- future.AddCallback([state, out](const Status& status) mutable {
- if (!status.ok()) {
- std::unique_lock<std::mutex> lock(state->mutex);
- if (!out.is_finished()) {
- out.MarkFinished(status);
- }
- return;
- }
- if (state->n_remaining.fetch_sub(1) != 1) return;
- out.MarkFinished();
- });
- }
- return out;
-}
-
+void FutureImpl::AddCallback(Callback callback, CallbackOptions opts) {
+ GetConcreteFuture(this)->AddCallback(std::move(callback), opts);
+}
+
+bool FutureImpl::TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts) {
+ return GetConcreteFuture(this)->TryAddCallback(callback_factory, opts);
+}
+
+Future<> AllComplete(const std::vector<Future<>>& futures) {
+ struct State {
+ explicit State(int64_t n_futures) : mutex(), n_remaining(n_futures) {}
+
+ std::mutex mutex;
+ std::atomic<size_t> n_remaining;
+ };
+
+ if (futures.empty()) {
+ return Future<>::MakeFinished();
+ }
+
+ auto state = std::make_shared<State>(futures.size());
+ auto out = Future<>::Make();
+ for (const auto& future : futures) {
+ future.AddCallback([state, out](const Status& status) mutable {
+ if (!status.ok()) {
+ std::unique_lock<std::mutex> lock(state->mutex);
+ if (!out.is_finished()) {
+ out.MarkFinished(status);
+ }
+ return;
+ }
+ if (state->n_remaining.fetch_sub(1) != 1) return;
+ out.MarkFinished();
+ });
+ }
+ return out;
+}
+
} // namespace arrow
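
A hedged usage sketch of AllComplete() as implemented above (editor's illustration; assumes the header is available as "arrow/util/future.h"). Each input future's callback decrements n_remaining, and the last one to finish marks the output future complete:

#include "arrow/status.h"
#include "arrow/util/future.h"

int main() {
  auto a = arrow::Future<>::Make();
  auto b = arrow::Future<>::Make();
  arrow::Future<> all = arrow::AllComplete({a, b});
  a.MarkFinished();  // `all` is still pending: n_remaining is 1
  b.MarkFinished();  // the last completion marks `all` finished
  // A failure in either input would instead have finished `all`
  // early with that input's status.
  return all.status().ok() ? 0 : 1;
}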
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
index d9e0a939f25..9352de6596f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
@@ -18,8 +18,8 @@
#pragma once
#include <atomic>
-#include <cmath>
-#include <functional>
+#include <cmath>
+#include <functional>
#include <memory>
#include <type_traits>
#include <utility>
@@ -27,263 +27,263 @@
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/functional.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/functional.h"
#include "arrow/util/macros.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
-template <typename>
-struct EnsureFuture;
-
-namespace detail {
-
-template <typename>
-struct is_future : std::false_type {};
-
-template <typename T>
-struct is_future<Future<T>> : std::true_type {};
-
-template <typename Signature>
-using result_of_t = typename std::result_of<Signature>::type;
-
-// Helper to find the synchronous counterpart for a Future
-template <typename T>
-struct SyncType {
- using type = Result<T>;
-};
-
-template <>
-struct SyncType<internal::Empty> {
- using type = Status;
-};
-
-template <typename Fn>
-using first_arg_is_status =
- std::is_same<typename std::decay<internal::call_traits::argument_type<0, Fn>>::type,
- Status>;
-
-template <typename Fn, typename Then, typename Else,
- typename Count = internal::call_traits::argument_count<Fn>>
-using if_has_no_args = typename std::conditional<Count::value == 0, Then, Else>::type;
-
-/// Creates a callback that can be added to a future to mark a `dest` future finished
-template <typename Source, typename Dest, bool SourceEmpty = Source::is_empty,
- bool DestEmpty = Dest::is_empty>
-struct MarkNextFinished {};
-
-/// If the source and dest are both empty we can pass on the status
-template <typename Source, typename Dest>
-struct MarkNextFinished<Source, Dest, true, true> {
- void operator()(const Status& status) && { next.MarkFinished(status); }
- Dest next;
-};
-
-/// If the source is not empty but the dest is then we can take the
-/// status out of the result
-template <typename Source, typename Dest>
-struct MarkNextFinished<Source, Dest, false, true> {
- void operator()(const Result<typename Source::ValueType>& res) && {
- next.MarkFinished(internal::Empty::ToResult(res.status()));
- }
- Dest next;
-};
-
-/// If neither are empty we pass on the result
-template <typename Source, typename Dest>
-struct MarkNextFinished<Source, Dest, false, false> {
- void operator()(const Result<typename Source::ValueType>& res) && {
- next.MarkFinished(res);
- }
- Dest next;
-};
-
-/// Helper that contains information about how to apply a continuation
-struct ContinueFuture {
- template <typename Return>
- struct ForReturnImpl;
-
- template <typename Return>
- using ForReturn = typename ForReturnImpl<Return>::type;
-
- template <typename Signature>
- using ForSignature = ForReturn<result_of_t<Signature>>;
-
- // If the callback returns void then we return a Future<> that always finishes OK.
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<std::is_void<ContinueResult>::value>::type operator()(
- NextFuture next, ContinueFunc&& f, Args&&... a) const {
- std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
- next.MarkFinished();
- }
-
- /// If the callback returns a non-future then we return Future<T>
- /// and mark the future finished with the callback result. It will get promoted
- /// to Result<T> as part of MarkFinished if it isn't already.
- ///
- /// If the callback returns Status and we return Future<> then also send the callback
- /// result as-is to the destination future.
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<
- !std::is_void<ContinueResult>::value && !is_future<ContinueResult>::value &&
- (!NextFuture::is_empty || std::is_same<ContinueResult, Status>::value)>::type
- operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
- next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...));
- }
-
- /// If the callback returns a Result and the next future is Future<> then we mark
- /// the future finished with the callback result.
- ///
- /// It may seem odd that the next future is Future<> when the callback returns a
- /// result but this can occur if the OnFailure callback returns a result while the
- /// OnSuccess callback is void/Status (e.g. you would get this calling the one-arg
- /// version of Then with an OnSuccess callback that returns void)
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<!std::is_void<ContinueResult>::value &&
- !is_future<ContinueResult>::value && NextFuture::is_empty &&
- !std::is_same<ContinueResult, Status>::value>::type
- operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
- next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...).status());
- }
-
- /// If the callback returns a Future<T> then we return Future<T>. We create a new
- /// future and add a callback to the future given to us by the user that forwards the
- /// result to the future we just created
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<is_future<ContinueResult>::value>::type operator()(
- NextFuture next, ContinueFunc&& f, Args&&... a) const {
- ContinueResult signal_to_complete_next =
- std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
- MarkNextFinished<ContinueResult, NextFuture> callback{std::move(next)};
- signal_to_complete_next.AddCallback(std::move(callback));
- }
-
- /// Helpers to conditionally ignore arguments to ContinueFunc
- template <typename ContinueFunc, typename NextFuture, typename... Args>
- void IgnoringArgsIf(std::true_type, NextFuture&& next, ContinueFunc&& f,
- Args&&...) const {
- operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f));
- }
- template <typename ContinueFunc, typename NextFuture, typename... Args>
- void IgnoringArgsIf(std::false_type, NextFuture&& next, ContinueFunc&& f,
- Args&&... a) const {
- operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f),
- std::forward<Args>(a)...);
- }
-};
-
-/// Helper struct which tells us what kind of Future gets returned from `Then` based on
-/// the return type of the OnSuccess callback
-template <>
-struct ContinueFuture::ForReturnImpl<void> {
- using type = Future<>;
-};
-
-template <>
-struct ContinueFuture::ForReturnImpl<Status> {
- using type = Future<>;
-};
-
-template <typename R>
-struct ContinueFuture::ForReturnImpl {
- using type = Future<R>;
-};
-
-template <typename T>
-struct ContinueFuture::ForReturnImpl<Result<T>> {
- using type = Future<T>;
-};
-
-template <typename T>
-struct ContinueFuture::ForReturnImpl<Future<T>> {
- using type = Future<T>;
-};
-
-} // namespace detail
-
+template <typename>
+struct EnsureFuture;
+
+namespace detail {
+
+template <typename>
+struct is_future : std::false_type {};
+
+template <typename T>
+struct is_future<Future<T>> : std::true_type {};
+
+template <typename Signature>
+using result_of_t = typename std::result_of<Signature>::type;
+
+// Helper to find the synchronous counterpart for a Future
+template <typename T>
+struct SyncType {
+ using type = Result<T>;
+};
+
+template <>
+struct SyncType<internal::Empty> {
+ using type = Status;
+};
+
+template <typename Fn>
+using first_arg_is_status =
+ std::is_same<typename std::decay<internal::call_traits::argument_type<0, Fn>>::type,
+ Status>;
+
+template <typename Fn, typename Then, typename Else,
+ typename Count = internal::call_traits::argument_count<Fn>>
+using if_has_no_args = typename std::conditional<Count::value == 0, Then, Else>::type;
+
+/// Creates a callback that can be added to a future to mark a `dest` future finished
+template <typename Source, typename Dest, bool SourceEmpty = Source::is_empty,
+ bool DestEmpty = Dest::is_empty>
+struct MarkNextFinished {};
+
+/// If the source and dest are both empty we can pass on the status
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, true, true> {
+ void operator()(const Status& status) && { next.MarkFinished(status); }
+ Dest next;
+};
+
+/// If the source is not empty but the dest is then we can take the
+/// status out of the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, true> {
+ void operator()(const Result<typename Source::ValueType>& res) && {
+ next.MarkFinished(internal::Empty::ToResult(res.status()));
+ }
+ Dest next;
+};
+
+/// If neither are empty we pass on the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, false> {
+ void operator()(const Result<typename Source::ValueType>& res) && {
+ next.MarkFinished(res);
+ }
+ Dest next;
+};
+
+/// Helper that contains information about how to apply a continuation
+struct ContinueFuture {
+ template <typename Return>
+ struct ForReturnImpl;
+
+ template <typename Return>
+ using ForReturn = typename ForReturnImpl<Return>::type;
+
+ template <typename Signature>
+ using ForSignature = ForReturn<result_of_t<Signature>>;
+
+ // If the callback returns void then we return a Future<> that always finishes OK.
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<std::is_void<ContinueResult>::value>::type operator()(
+ NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+ next.MarkFinished();
+ }
+
+ /// If the callback returns a non-future then we return Future<T>
+ /// and mark the future finished with the callback result. It will get promoted
+ /// to Result<T> as part of MarkFinished if it isn't already.
+ ///
+ /// If the callback returns Status and we return Future<> then also send the callback
+ /// result as-is to the destination future.
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<
+ !std::is_void<ContinueResult>::value && !is_future<ContinueResult>::value &&
+ (!NextFuture::is_empty || std::is_same<ContinueResult, Status>::value)>::type
+ operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...));
+ }
+
+ /// If the callback returns a Result and the next future is Future<> then we mark
+ /// the future finished with the callback result.
+ ///
+ /// It may seem odd that the next future is Future<> when the callback returns a
+ /// result but this can occur if the OnFailure callback returns a result while the
+ /// OnSuccess callback is void/Status (e.g. you would get this calling the one-arg
+ /// version of Then with an OnSuccess callback that returns void)
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<!std::is_void<ContinueResult>::value &&
+ !is_future<ContinueResult>::value && NextFuture::is_empty &&
+ !std::is_same<ContinueResult, Status>::value>::type
+ operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...).status());
+ }
+
+ /// If the callback returns a Future<T> then we return Future<T>. We create a new
+ /// future and add a callback to the future given to us by the user that forwards the
+ /// result to the future we just created
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<is_future<ContinueResult>::value>::type operator()(
+ NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ ContinueResult signal_to_complete_next =
+ std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+ MarkNextFinished<ContinueResult, NextFuture> callback{std::move(next)};
+ signal_to_complete_next.AddCallback(std::move(callback));
+ }
+
+ /// Helpers to conditionally ignore arguments to ContinueFunc
+ template <typename ContinueFunc, typename NextFuture, typename... Args>
+ void IgnoringArgsIf(std::true_type, NextFuture&& next, ContinueFunc&& f,
+ Args&&...) const {
+ operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f));
+ }
+ template <typename ContinueFunc, typename NextFuture, typename... Args>
+ void IgnoringArgsIf(std::false_type, NextFuture&& next, ContinueFunc&& f,
+ Args&&... a) const {
+ operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f),
+ std::forward<Args>(a)...);
+ }
+};
+
+/// Helper struct which tells us what kind of Future gets returned from `Then` based on
+/// the return type of the OnSuccess callback
+template <>
+struct ContinueFuture::ForReturnImpl<void> {
+ using type = Future<>;
+};
+
+template <>
+struct ContinueFuture::ForReturnImpl<Status> {
+ using type = Future<>;
+};
+
+template <typename R>
+struct ContinueFuture::ForReturnImpl {
+ using type = Future<R>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Result<T>> {
+ using type = Future<T>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Future<T>> {
+ using type = Future<T>;
+};
+
+} // namespace detail
+
/// A Future's execution or completion status
enum class FutureState : int8_t { PENDING, SUCCESS, FAILURE };
inline bool IsFutureFinished(FutureState state) { return state != FutureState::PENDING; }
-/// \brief Describe whether the callback should be scheduled or run synchronously
-enum class ShouldSchedule {
- /// Always run the callback synchronously (the default)
- Never = 0,
- /// Schedule a new task only if the future is not finished when the
- /// callback is added
- IfUnfinished = 1,
- /// Always schedule the callback as a new task
- Always = 2,
- /// Schedule a new task only if it would run on an executor other than
- /// the specified executor.
- IfDifferentExecutor = 3,
-};
-
-/// \brief Options that control how a continuation is run
-struct CallbackOptions {
- /// Describe whether the callback should be run synchronously or scheduled
- ShouldSchedule should_schedule = ShouldSchedule::Never;
- /// If the callback is scheduled then this is the executor it should be scheduled
- /// on. If this is NULL then should_schedule must be Never
- internal::Executor* executor = NULLPTR;
-
- static CallbackOptions Defaults() { return {}; }
-};
-
-// Untyped private implementation
-class ARROW_EXPORT FutureImpl : public std::enable_shared_from_this<FutureImpl> {
+/// \brief Describe whether the callback should be scheduled or run synchronously
+enum class ShouldSchedule {
+ /// Always run the callback synchronously (the default)
+ Never = 0,
+ /// Schedule a new task only if the future is not finished when the
+ /// callback is added
+ IfUnfinished = 1,
+ /// Always schedule the callback as a new task
+ Always = 2,
+ /// Schedule a new task only if it would run on an executor other than
+ /// the specified executor.
+ IfDifferentExecutor = 3,
+};
+
+/// \brief Options that control how a continuation is run
+struct CallbackOptions {
+ /// Describe whether the callback should be run synchronously or scheduled
+ ShouldSchedule should_schedule = ShouldSchedule::Never;
+ /// If the callback is scheduled then this is the executor it should be scheduled
+ /// on. If this is NULL then should_schedule must be Never
+ internal::Executor* executor = NULLPTR;
+
+ static CallbackOptions Defaults() { return {}; }
+};
+
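
An illustrative sketch of filling in these options (editor's addition; it assumes "arrow/util/thread_pool.h" provides arrow::internal::GetCpuThreadPool(), which models internal::Executor):

#include "arrow/util/future.h"
#include "arrow/util/thread_pool.h"

arrow::CallbackOptions ScheduledOpts() {
  arrow::CallbackOptions opts = arrow::CallbackOptions::Defaults();
  opts.should_schedule = arrow::ShouldSchedule::Always;  // never run inline
  // An executor is mandatory whenever should_schedule != Never
  // (ConcreteFutureImpl::CheckOptions in future.cc DCHECKs this).
  opts.executor = arrow::internal::GetCpuThreadPool();
  return opts;
}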
+// Untyped private implementation
+class ARROW_EXPORT FutureImpl : public std::enable_shared_from_this<FutureImpl> {
public:
- FutureImpl();
+ FutureImpl();
virtual ~FutureImpl() = default;
FutureState state() { return state_.load(); }
static std::unique_ptr<FutureImpl> Make();
- static std::unique_ptr<FutureImpl> MakeFinished(FutureState state);
+ static std::unique_ptr<FutureImpl> MakeFinished(FutureState state);
// Future API
void MarkFinished();
void MarkFailed();
void Wait();
bool Wait(double seconds);
- template <typename ValueType>
- Result<ValueType>* CastResult() const {
- return static_cast<Result<ValueType>*>(result_.get());
- }
-
- using Callback = internal::FnOnce<void(const FutureImpl& impl)>;
- void AddCallback(Callback callback, CallbackOptions opts);
- bool TryAddCallback(const std::function<Callback()>& callback_factory,
- CallbackOptions opts);
-
+ template <typename ValueType>
+ Result<ValueType>* CastResult() const {
+ return static_cast<Result<ValueType>*>(result_.get());
+ }
+
+ using Callback = internal::FnOnce<void(const FutureImpl& impl)>;
+ void AddCallback(Callback callback, CallbackOptions opts);
+ bool TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts);
+
// Waiter API
inline FutureState SetWaiter(FutureWaiter* w, int future_num);
inline void RemoveWaiter(FutureWaiter* w);
- std::atomic<FutureState> state_{FutureState::PENDING};
-
- // Type erased storage for arbitrary results
- // XXX small objects could be stored inline instead of boxed in a pointer
- using Storage = std::unique_ptr<void, void (*)(void*)>;
- Storage result_{NULLPTR, NULLPTR};
-
- struct CallbackRecord {
- Callback callback;
- CallbackOptions options;
- };
- std::vector<CallbackRecord> callbacks_;
+ std::atomic<FutureState> state_{FutureState::PENDING};
+
+ // Type erased storage for arbitrary results
+ // XXX small objects could be stored inline instead of boxed in a pointer
+ using Storage = std::unique_ptr<void, void (*)(void*)>;
+ Storage result_{NULLPTR, NULLPTR};
+
+ struct CallbackRecord {
+ Callback callback;
+ CallbackOptions options;
+ };
+ std::vector<CallbackRecord> callbacks_;
};
// An object that waits on multiple futures at once. Only one waiter
@@ -292,9 +292,9 @@ class ARROW_EXPORT FutureWaiter {
public:
enum Kind : int8_t { ANY, ALL, ALL_OR_FIRST_FAILED, ITERATE };
- // HUGE_VAL isn't constexpr on Windows
- // https://social.msdn.microsoft.com/Forums/vstudio/en-US/47e8b9ff-b205-4189-968e-ee3bc3e2719f/constexpr-compile-error?forum=vclanguage
- static const double kInfinity;
+ // HUGE_VAL isn't constexpr on Windows
+ // https://social.msdn.microsoft.com/Forums/vstudio/en-US/47e8b9ff-b205-4189-968e-ee3bc3e2719f/constexpr-compile-error?forum=vclanguage
+ static const double kInfinity;
static std::unique_ptr<FutureWaiter> Make(Kind kind, std::vector<FutureImpl*> futures);
@@ -318,7 +318,7 @@ class ARROW_EXPORT FutureWaiter {
static std::vector<FutureImpl*> ExtractFutures(const std::vector<FutureType>& futures) {
std::vector<FutureImpl*> base_futures(futures.size());
for (int i = 0; i < static_cast<int>(futures.size()); ++i) {
- base_futures[i] = futures[i].impl_.get();
+ base_futures[i] = futures[i].impl_.get();
}
return base_futures;
}
@@ -329,7 +329,7 @@ class ARROW_EXPORT FutureWaiter {
const std::vector<FutureType*>& futures) {
std::vector<FutureImpl*> base_futures(futures.size());
for (int i = 0; i < static_cast<int>(futures.size()); ++i) {
- base_futures[i] = futures[i]->impl_.get();
+ base_futures[i] = futures[i]->impl_.get();
}
return base_futures;
}
@@ -358,11 +358,11 @@ class ARROW_EXPORT FutureWaiter {
/// to complete, or wait on multiple Futures at once (using WaitForAll,
/// WaitForAny or AsCompletedIterator).
template <typename T>
-class ARROW_MUST_USE_TYPE Future {
+class ARROW_MUST_USE_TYPE Future {
public:
- using ValueType = T;
- using SyncType = typename detail::SyncType<T>::type;
- static constexpr bool is_empty = std::is_same<T, internal::Empty>::value;
+ using ValueType = T;
+ using SyncType = typename detail::SyncType<T>::type;
+ static constexpr bool is_empty = std::is_same<T, internal::Empty>::value;
// The default constructor creates an invalid Future. Use Future::Make()
// for a valid Future. This constructor is mostly for the convenience
// of being able to presize a vector of Futures.
@@ -370,7 +370,7 @@ class ARROW_MUST_USE_TYPE Future {
// Consumer API
- bool is_valid() const { return impl_ != NULLPTR; }
+ bool is_valid() const { return impl_ != NULLPTR; }
/// \brief Return the Future's current state
///
@@ -381,41 +381,41 @@ class ARROW_MUST_USE_TYPE Future {
return impl_->state();
}
- /// \brief Whether the Future is finished
+ /// \brief Whether the Future is finished
///
- /// A false return value is only indicative, as the Future can complete
- /// concurrently. A true return value is definitive, though.
- bool is_finished() const {
+ /// A false return value is only indicative, as the Future can complete
+ /// concurrently. A true return value is definitive, though.
+ bool is_finished() const {
CheckValid();
- return IsFutureFinished(impl_->state());
- }
-
- /// \brief Wait for the Future to complete and return its Result
- const Result<ValueType>& result() const& {
+ return IsFutureFinished(impl_->state());
+ }
+
+ /// \brief Wait for the Future to complete and return its Result
+ const Result<ValueType>& result() const& {
Wait();
- return *GetResult();
+ return *GetResult();
}
- /// \brief Returns an rvalue reference to the result. This method is potentially unsafe
- ///
- /// The future is not the unique owner of the result, copies of a future will
- /// also point to the same result. You must make sure that no other copies
- /// of the future exist. Attempts to add callbacks after you move the result
- /// will result in undefined behavior.
- Result<ValueType>&& MoveResult() {
+ /// \brief Returns an rvalue reference to the result. This method is potentially unsafe
+ ///
+ /// The future is not the unique owner of the result, copies of a future will
+ /// also point to the same result. You must make sure that no other copies
+ /// of the future exist. Attempts to add callbacks after you move the result
+ /// will result in undefined behavior.
+ Result<ValueType>&& MoveResult() {
Wait();
- return std::move(*GetResult());
+ return std::move(*GetResult());
}
/// \brief Wait for the Future to complete and return its Status
- const Status& status() const { return result().status(); }
-
- /// \brief Future<T> is convertible to Future<>, which views only the
- /// Status of the original. Marking the returned Future Finished is not supported.
- explicit operator Future<>() const {
- Future<> status_future;
- status_future.impl_ = impl_;
- return status_future;
+ const Status& status() const { return result().status(); }
+
+ /// \brief Future<T> is convertible to Future<>, which views only the
+ /// Status of the original. Marking the returned Future Finished is not supported.
+ explicit operator Future<>() const {
+ Future<> status_future;
+ status_future.impl_ = impl_;
+ return status_future;
}
/// \brief Wait for the Future to complete
@@ -441,270 +441,270 @@ class ARROW_MUST_USE_TYPE Future {
// Producer API
- /// \brief Producer API: mark Future finished
+ /// \brief Producer API: mark Future finished
///
- /// The Future's result is set to `res`.
- void MarkFinished(Result<ValueType> res) { DoMarkFinished(std::move(res)); }
-
- /// \brief Mark a Future<> completed with the provided Status.
- template <typename E = ValueType, typename = typename std::enable_if<
- std::is_same<E, internal::Empty>::value>::type>
- void MarkFinished(Status s = Status::OK()) {
- return DoMarkFinished(E::ToResult(std::move(s)));
+ /// The Future's result is set to `res`.
+ void MarkFinished(Result<ValueType> res) { DoMarkFinished(std::move(res)); }
+
+ /// \brief Mark a Future<> completed with the provided Status.
+ template <typename E = ValueType, typename = typename std::enable_if<
+ std::is_same<E, internal::Empty>::value>::type>
+ void MarkFinished(Status s = Status::OK()) {
+ return DoMarkFinished(E::ToResult(std::move(s)));
}
/// \brief Producer API: instantiate a valid Future
///
- /// The Future's state is initialized with PENDING. If you are creating a future with
- /// this method you must ensure that the future is eventually completed (with success or
- /// failure). Creating a future, returning it, and never completing the future can lead
- /// to memory leaks (for example, see Loop).
+ /// The Future's state is initialized with PENDING. If you are creating a future with
+ /// this method you must ensure that the future is eventually completed (with success or
+ /// failure). Creating a future, returning it, and never completing the future can lead
+ /// to memory leaks (for example, see Loop).
static Future Make() {
Future fut;
- fut.impl_ = FutureImpl::Make();
+ fut.impl_ = FutureImpl::Make();
return fut;
}
/// \brief Producer API: instantiate a finished Future
- static Future<ValueType> MakeFinished(Result<ValueType> res) {
- Future<ValueType> fut;
- fut.InitializeFromResult(std::move(res));
+ static Future<ValueType> MakeFinished(Result<ValueType> res) {
+ Future<ValueType> fut;
+ fut.InitializeFromResult(std::move(res));
return fut;
}
- /// \brief Make a finished Future<> with the provided Status.
- template <typename E = ValueType, typename = typename std::enable_if<
- std::is_same<E, internal::Empty>::value>::type>
- static Future<> MakeFinished(Status s = Status::OK()) {
- return MakeFinished(E::ToResult(std::move(s)));
- }
-
- struct WrapResultyOnComplete {
- template <typename OnComplete>
- struct Callback {
- void operator()(const FutureImpl& impl) && {
- std::move(on_complete)(*impl.CastResult<ValueType>());
- }
- OnComplete on_complete;
- };
- };
-
- struct WrapStatusyOnComplete {
- template <typename OnComplete>
- struct Callback {
- static_assert(std::is_same<internal::Empty, ValueType>::value,
- "Only callbacks for Future<> should accept Status and not Result");
-
- void operator()(const FutureImpl& impl) && {
- std::move(on_complete)(impl.CastResult<ValueType>()->status());
- }
- OnComplete on_complete;
- };
- };
-
- template <typename OnComplete>
- using WrapOnComplete = typename std::conditional<
- detail::first_arg_is_status<OnComplete>::value, WrapStatusyOnComplete,
- WrapResultyOnComplete>::type::template Callback<OnComplete>;
-
- /// \brief Consumer API: Register a callback to run when this future completes
- ///
- /// The callback should receive the result of the future (const Result<T>&)
- /// For a void or statusy future this should be (const Status&)
- ///
- /// There is no guarantee to the order in which callbacks will run. In
- /// particular, callbacks added while the future is being marked complete
- /// may be executed immediately, ahead of, or even at the same time as, other
- /// callbacks that have been previously added.
- ///
- /// WARNING: callbacks may hold arbitrary references, including cyclic references.
- /// Since callbacks will only be destroyed after they are invoked, this can lead to
- /// memory leaks if a Future is never marked finished (abandoned):
- ///
- /// {
- /// auto fut = Future<>::Make();
- /// fut.AddCallback([fut]() {});
- /// }
- ///
- /// In this example `fut` falls out of scope but is not destroyed because it holds a
- /// cyclic reference to itself through the callback.
- template <typename OnComplete, typename Callback = WrapOnComplete<OnComplete>>
- void AddCallback(OnComplete on_complete,
- CallbackOptions opts = CallbackOptions::Defaults()) const {
- // We know impl_ will not be dangling when invoking callbacks because at least one
- // thread will be waiting for MarkFinished to return. Thus it's safe to keep a
- // weak reference to impl_ here
- impl_->AddCallback(Callback{std::move(on_complete)}, opts);
- }
-
- /// \brief Overload of AddCallback that will return false instead of running
- /// synchronously
- ///
- /// This overload will guarantee the callback is never run synchronously. If the future
- /// is already finished then it will simply return false. This can be useful to avoid
- /// stack overflow in a situation where you have recursive Futures. For an example
- /// see the Loop function
- ///
- /// Takes in a callback factory function to allow moving callbacks (the factory function
- /// will only be called if the callback can successfully be added)
- ///
- /// Returns true if a callback was actually added and false if the callback failed
- /// to add because the future was marked complete.
- template <typename CallbackFactory,
- typename OnComplete = detail::result_of_t<CallbackFactory()>,
- typename Callback = WrapOnComplete<OnComplete>>
- bool TryAddCallback(const CallbackFactory& callback_factory,
- CallbackOptions opts = CallbackOptions::Defaults()) const {
- return impl_->TryAddCallback([&]() { return Callback{callback_factory()}; }, opts);
- }
-
- template <typename OnSuccess, typename OnFailure>
- struct ThenOnComplete {
- static constexpr bool has_no_args =
- internal::call_traits::argument_count<OnSuccess>::value == 0;
-
- using ContinuedFuture = detail::ContinueFuture::ForSignature<
- detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
-
- static_assert(
- std::is_same<detail::ContinueFuture::ForSignature<OnFailure && (const Status&)>,
- ContinuedFuture>::value,
- "OnSuccess and OnFailure must continue with the same future type");
-
- struct DummyOnSuccess {
- void operator()(const T&);
- };
- using OnSuccessArg = typename std::decay<internal::call_traits::argument_type<
- 0, detail::if_has_no_args<OnSuccess, DummyOnSuccess, OnSuccess>>>::type;
-
- static_assert(
- !std::is_same<OnSuccessArg, typename EnsureResult<OnSuccessArg>::type>::value,
- "OnSuccess' argument should not be a Result");
-
- void operator()(const Result<T>& result) && {
- detail::ContinueFuture continue_future;
- if (ARROW_PREDICT_TRUE(result.ok())) {
- // move on_failure to a(n immediately destroyed) temporary to free its resources
- ARROW_UNUSED(OnFailure(std::move(on_failure)));
- continue_future.IgnoringArgsIf(
- detail::if_has_no_args<OnSuccess, std::true_type, std::false_type>{},
- std::move(next), std::move(on_success), result.ValueOrDie());
- } else {
- ARROW_UNUSED(OnSuccess(std::move(on_success)));
- continue_future(std::move(next), std::move(on_failure), result.status());
- }
- }
-
- OnSuccess on_success;
- OnFailure on_failure;
- ContinuedFuture next;
- };
-
- template <typename OnSuccess>
- struct PassthruOnFailure {
- using ContinuedFuture = detail::ContinueFuture::ForSignature<
- detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
-
- Result<typename ContinuedFuture::ValueType> operator()(const Status& s) { return s; }
- };
-
- /// \brief Consumer API: Register a continuation to run when this future completes
- ///
- /// The continuation will run in the same thread that called MarkFinished (whatever
- /// callback is registered with this function will run before MarkFinished returns).
- /// Avoid long-running callbacks in favor of submitting a task to an Executor and
- /// returning the future.
- ///
- /// Two callbacks are supported:
- ///  - OnSuccess, called with the result (const ValueType&) on successful completion.
- ///    For an empty future this will be called with no arguments ().
- /// - OnFailure, called with the error (const Status&) on failed completion.
- /// This callback is optional and defaults to a passthru of any errors.
- ///
- /// Then() returns a Future whose ValueType is derived from the return type of the
- /// callbacks. If a callback returns:
- ///  - void, a Future<> will be returned which will complete successfully as soon
- /// as the callback runs.
- /// - Status, a Future<> will be returned which will complete with the returned Status
- /// as soon as the callback runs.
- /// - V or Result<V>, a Future<V> will be returned which will complete with the result
- /// of invoking the callback as soon as the callback runs.
- /// - Future<V>, a Future<V> will be returned which will be marked complete when the
- /// future returned by the callback completes (and will complete with the same
- /// result).
- ///
- /// The continued Future type must be the same for both callbacks.
- ///
- /// Note that OnFailure can swallow errors, allowing continued Futures to successfully
- /// complete even if this Future fails.
- ///
- /// If this future is already completed then the callback will be run immediately
- /// and the returned future may already be marked complete.
- ///
- /// See AddCallback for general considerations when writing callbacks.
- template <typename OnSuccess, typename OnFailure = PassthruOnFailure<OnSuccess>,
- typename OnComplete = ThenOnComplete<OnSuccess, OnFailure>,
- typename ContinuedFuture = typename OnComplete::ContinuedFuture>
- ContinuedFuture Then(OnSuccess on_success, OnFailure on_failure = {},
- CallbackOptions options = CallbackOptions::Defaults()) const {
- auto next = ContinuedFuture::Make();
- AddCallback(OnComplete{std::forward<OnSuccess>(on_success),
- std::forward<OnFailure>(on_failure), next},
- options);
- return next;
- }
-
- /// \brief Implicit constructor to create a finished future from a value
- Future(ValueType val) : Future() { // NOLINT runtime/explicit
- impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
- SetResult(std::move(val));
- }
-
- /// \brief Implicit constructor to create a future from a Result, enabling use
- /// of macros like ARROW_ASSIGN_OR_RAISE.
- Future(Result<ValueType> res) : Future() { // NOLINT runtime/explicit
- if (ARROW_PREDICT_TRUE(res.ok())) {
- impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
- } else {
- impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
- }
- SetResult(std::move(res));
- }
-
- /// \brief Implicit constructor to create a future from a Status, enabling use
- /// of macros like ARROW_RETURN_NOT_OK.
- Future(Status s) // NOLINT runtime/explicit
- : Future(Result<ValueType>(std::move(s))) {}
-
+ /// \brief Make a finished Future<> with the provided Status.
+ template <typename E = ValueType, typename = typename std::enable_if<
+ std::is_same<E, internal::Empty>::value>::type>
+ static Future<> MakeFinished(Status s = Status::OK()) {
+ return MakeFinished(E::ToResult(std::move(s)));
+ }
+
+ struct WrapResultyOnComplete {
+ template <typename OnComplete>
+ struct Callback {
+ void operator()(const FutureImpl& impl) && {
+ std::move(on_complete)(*impl.CastResult<ValueType>());
+ }
+ OnComplete on_complete;
+ };
+ };
+
+ struct WrapStatusyOnComplete {
+ template <typename OnComplete>
+ struct Callback {
+ static_assert(std::is_same<internal::Empty, ValueType>::value,
+ "Only callbacks for Future<> should accept Status and not Result");
+
+ void operator()(const FutureImpl& impl) && {
+ std::move(on_complete)(impl.CastResult<ValueType>()->status());
+ }
+ OnComplete on_complete;
+ };
+ };
+
+ template <typename OnComplete>
+ using WrapOnComplete = typename std::conditional<
+ detail::first_arg_is_status<OnComplete>::value, WrapStatusyOnComplete,
+ WrapResultyOnComplete>::type::template Callback<OnComplete>;
+
+ /// \brief Consumer API: Register a callback to run when this future completes
+ ///
+ /// The callback should receive the result of the future (const Result<T>&)
+ /// For a void or statusy future this should be (const Status&)
+ ///
+ /// There is no guarantee to the order in which callbacks will run. In
+ /// particular, callbacks added while the future is being marked complete
+ /// may be executed immediately, ahead of, or even at the same time as, other
+ /// callbacks that have been previously added.
+ ///
+ /// WARNING: callbacks may hold arbitrary references, including cyclic references.
+ /// Since callbacks will only be destroyed after they are invoked, this can lead to
+ /// memory leaks if a Future is never marked finished (abandoned):
+ ///
+ /// {
+ /// auto fut = Future<>::Make();
+ /// fut.AddCallback([fut]() {});
+ /// }
+ ///
+ /// In this example `fut` falls out of scope but is not destroyed because it holds a
+ /// cyclic reference to itself through the callback.
+ template <typename OnComplete, typename Callback = WrapOnComplete<OnComplete>>
+ void AddCallback(OnComplete on_complete,
+ CallbackOptions opts = CallbackOptions::Defaults()) const {
+ // We know impl_ will not be dangling when invoking callbacks because at least one
+ // thread will be waiting for MarkFinished to return. Thus it's safe to keep a
+ // weak reference to impl_ here
+ impl_->AddCallback(Callback{std::move(on_complete)}, opts);
+ }
+
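
A minimal sketch of AddCallback() per the documentation above (editor's illustration under the assumed header path "arrow/util/future.h"):

#include <iostream>
#include "arrow/util/future.h"

int main() {
  auto fut = arrow::Future<int>::Make();
  // The callback receives const Result<int>& since this future is not empty.
  fut.AddCallback([](const arrow::Result<int>& r) {
    std::cout << (r.ok() ? *r : -1) << '\n';
  });
  fut.MarkFinished(7);  // runs the callback synchronously on this thread
  return 0;
}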
+ /// \brief Overload of AddCallback that will return false instead of running
+ /// synchronously
+ ///
+ /// This overload will guarantee the callback is never run synchronously. If the future
+ /// is already finished then it will simply return false. This can be useful to avoid
+ /// stack overflow in a situation where you have recursive Futures. For an example
+ /// see the Loop function
+ ///
+ /// Takes in a callback factory function to allow moving callbacks (the factory function
+ /// will only be called if the callback can successfully be added)
+ ///
+ /// Returns true if a callback was actually added and false if the callback failed
+ /// to add because the future was marked complete.
+ template <typename CallbackFactory,
+ typename OnComplete = detail::result_of_t<CallbackFactory()>,
+ typename Callback = WrapOnComplete<OnComplete>>
+ bool TryAddCallback(const CallbackFactory& callback_factory,
+ CallbackOptions opts = CallbackOptions::Defaults()) const {
+ return impl_->TryAddCallback([&]() { return Callback{callback_factory()}; }, opts);
+ }
+
+ template <typename OnSuccess, typename OnFailure>
+ struct ThenOnComplete {
+ static constexpr bool has_no_args =
+ internal::call_traits::argument_count<OnSuccess>::value == 0;
+
+ using ContinuedFuture = detail::ContinueFuture::ForSignature<
+ detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+ static_assert(
+ std::is_same<detail::ContinueFuture::ForSignature<OnFailure && (const Status&)>,
+ ContinuedFuture>::value,
+ "OnSuccess and OnFailure must continue with the same future type");
+
+ struct DummyOnSuccess {
+ void operator()(const T&);
+ };
+ using OnSuccessArg = typename std::decay<internal::call_traits::argument_type<
+ 0, detail::if_has_no_args<OnSuccess, DummyOnSuccess, OnSuccess>>>::type;
+
+ static_assert(
+ !std::is_same<OnSuccessArg, typename EnsureResult<OnSuccessArg>::type>::value,
+ "OnSuccess' argument should not be a Result");
+
+ void operator()(const Result<T>& result) && {
+ detail::ContinueFuture continue_future;
+ if (ARROW_PREDICT_TRUE(result.ok())) {
+ // move on_failure to a(n immediately destroyed) temporary to free its resources
+ ARROW_UNUSED(OnFailure(std::move(on_failure)));
+ continue_future.IgnoringArgsIf(
+ detail::if_has_no_args<OnSuccess, std::true_type, std::false_type>{},
+ std::move(next), std::move(on_success), result.ValueOrDie());
+ } else {
+ ARROW_UNUSED(OnSuccess(std::move(on_success)));
+ continue_future(std::move(next), std::move(on_failure), result.status());
+ }
+ }
+
+ OnSuccess on_success;
+ OnFailure on_failure;
+ ContinuedFuture next;
+ };
+
+ template <typename OnSuccess>
+ struct PassthruOnFailure {
+ using ContinuedFuture = detail::ContinueFuture::ForSignature<
+ detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+ Result<typename ContinuedFuture::ValueType> operator()(const Status& s) { return s; }
+ };
+
+ /// \brief Consumer API: Register a continuation to run when this future completes
+ ///
+ /// The continuation will run in the same thread that called MarkFinished (whatever
+ /// callback is registered with this function will run before MarkFinished returns).
+ /// Avoid long-running callbacks in favor of submitting a task to an Executor and
+ /// returning the future.
+ ///
+ /// Two callbacks are supported:
+ ///  - OnSuccess, called with the result (const ValueType&) on successful completion.
+ ///    For an empty future this will be called with no arguments ().
+ /// - OnFailure, called with the error (const Status&) on failed completion.
+ /// This callback is optional and defaults to a passthru of any errors.
+ ///
+ /// Then() returns a Future whose ValueType is derived from the return type of the
+ /// callbacks. If a callback returns:
+ ///  - void, a Future<> will be returned which will complete successfully as soon
+ /// as the callback runs.
+ /// - Status, a Future<> will be returned which will complete with the returned Status
+ /// as soon as the callback runs.
+ /// - V or Result<V>, a Future<V> will be returned which will complete with the result
+ /// of invoking the callback as soon as the callback runs.
+ /// - Future<V>, a Future<V> will be returned which will be marked complete when the
+ /// future returned by the callback completes (and will complete with the same
+ /// result).
+ ///
+ /// The continued Future type must be the same for both callbacks.
+ ///
+ /// Note that OnFailure can swallow errors, allowing continued Futures to successfully
+ /// complete even if this Future fails.
+ ///
+ /// If this future is already completed then the callback will be run immediately
+ /// and the returned future may already be marked complete.
+ ///
+ /// See AddCallback for general considerations when writing callbacks.
+ template <typename OnSuccess, typename OnFailure = PassthruOnFailure<OnSuccess>,
+ typename OnComplete = ThenOnComplete<OnSuccess, OnFailure>,
+ typename ContinuedFuture = typename OnComplete::ContinuedFuture>
+ ContinuedFuture Then(OnSuccess on_success, OnFailure on_failure = {},
+ CallbackOptions options = CallbackOptions::Defaults()) const {
+ auto next = ContinuedFuture::Make();
+ AddCallback(OnComplete{std::forward<OnSuccess>(on_success),
+ std::forward<OnFailure>(on_failure), next},
+ options);
+ return next;
+ }
+
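
A hedged sketch of Then() chaining per the rules above (editor's illustration; assumed header path "arrow/util/future.h"):

#include <string>
#include "arrow/util/future.h"

int main() {
  auto fut = arrow::Future<int>::Make();
  // OnSuccess returns std::string, so Then() yields Future<std::string>.
  arrow::Future<std::string> next = fut.Then(
      [](const int& v) { return std::to_string(v); },
      // Optional OnFailure: must continue with the same future type.
      [](const arrow::Status& s) { return arrow::Result<std::string>(s); });
  fut.MarkFinished(41);  // runs the continuation on this thread
  return next.result().ValueOrDie() == "41" ? 0 : 1;
}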
+ /// \brief Implicit constructor to create a finished future from a value
+ Future(ValueType val) : Future() { // NOLINT runtime/explicit
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ SetResult(std::move(val));
+ }
+
+ /// \brief Implicit constructor to create a future from a Result, enabling use
+ /// of macros like ARROW_ASSIGN_OR_RAISE.
+ Future(Result<ValueType> res) : Future() { // NOLINT runtime/explicit
+ if (ARROW_PREDICT_TRUE(res.ok())) {
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ } else {
+ impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+ }
+ SetResult(std::move(res));
+ }
+
+ /// \brief Implicit constructor to create a future from a Status, enabling use
+ /// of macros like ARROW_RETURN_NOT_OK.
+ Future(Status s) // NOLINT runtime/explicit
+ : Future(Result<ValueType>(std::move(s))) {}
+
protected:
- void InitializeFromResult(Result<ValueType> res) {
- if (ARROW_PREDICT_TRUE(res.ok())) {
- impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
- } else {
- impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
- }
- SetResult(std::move(res));
- }
-
- void Initialize() { impl_ = FutureImpl::Make(); }
-
- Result<ValueType>* GetResult() const { return impl_->CastResult<ValueType>(); }
-
- void SetResult(Result<ValueType> res) {
- impl_->result_ = {new Result<ValueType>(std::move(res)),
- [](void* p) { delete static_cast<Result<ValueType>*>(p); }};
- }
-
- void DoMarkFinished(Result<ValueType> res) {
- SetResult(std::move(res));
-
- if (ARROW_PREDICT_TRUE(GetResult()->ok())) {
- impl_->MarkFinished();
- } else {
- impl_->MarkFailed();
- }
- }
-
+ void InitializeFromResult(Result<ValueType> res) {
+ if (ARROW_PREDICT_TRUE(res.ok())) {
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ } else {
+ impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+ }
+ SetResult(std::move(res));
+ }
+
+ void Initialize() { impl_ = FutureImpl::Make(); }
+
+ Result<ValueType>* GetResult() const { return impl_->CastResult<ValueType>(); }
+
+ void SetResult(Result<ValueType> res) {
+ impl_->result_ = {new Result<ValueType>(std::move(res)),
+ [](void* p) { delete static_cast<Result<ValueType>*>(p); }};
+ }
+
+ void DoMarkFinished(Result<ValueType> res) {
+ SetResult(std::move(res));
+
+ if (ARROW_PREDICT_TRUE(GetResult()->ok())) {
+ impl_->MarkFinished();
+ } else {
+ impl_->MarkFailed();
+ }
+ }
+
void CheckValid() const {
#ifndef NDEBUG
if (!is_valid()) {
@@ -713,54 +713,54 @@ class ARROW_MUST_USE_TYPE Future {
#endif
}
- explicit Future(std::shared_ptr<FutureImpl> impl) : impl_(std::move(impl)) {}
-
- std::shared_ptr<FutureImpl> impl_;
+ explicit Future(std::shared_ptr<FutureImpl> impl) : impl_(std::move(impl)) {}
+ std::shared_ptr<FutureImpl> impl_;
+
friend class FutureWaiter;
- friend struct detail::ContinueFuture;
-
- template <typename U>
- friend class Future;
- friend class WeakFuture<T>;
-
- FRIEND_TEST(FutureRefTest, ChainRemoved);
- FRIEND_TEST(FutureRefTest, TailRemoved);
- FRIEND_TEST(FutureRefTest, HeadRemoved);
+ friend struct detail::ContinueFuture;
+
+ template <typename U>
+ friend class Future;
+ friend class WeakFuture<T>;
+
+ FRIEND_TEST(FutureRefTest, ChainRemoved);
+ FRIEND_TEST(FutureRefTest, TailRemoved);
+ FRIEND_TEST(FutureRefTest, HeadRemoved);
};
-template <typename T>
-typename Future<T>::SyncType FutureToSync(const Future<T>& fut) {
- return fut.result();
-}
-
-template <>
-inline typename Future<internal::Empty>::SyncType FutureToSync<internal::Empty>(
- const Future<internal::Empty>& fut) {
- return fut.status();
-}
-
-template <typename T>
-class WeakFuture {
- public:
- explicit WeakFuture(const Future<T>& future) : impl_(future.impl_) {}
-
- Future<T> get() { return Future<T>{impl_.lock()}; }
-
- private:
- std::weak_ptr<FutureImpl> impl_;
-};
-
-/// If a Result<Future> holds an error instead of a Future, construct a finished Future
-/// holding that error.
-template <typename T>
-static Future<T> DeferNotOk(Result<Future<T>> maybe_future) {
- if (ARROW_PREDICT_FALSE(!maybe_future.ok())) {
- return Future<T>::MakeFinished(std::move(maybe_future).status());
- }
- return std::move(maybe_future).MoveValueUnsafe();
-}
-
+template <typename T>
+typename Future<T>::SyncType FutureToSync(const Future<T>& fut) {
+ return fut.result();
+}
+
+template <>
+inline typename Future<internal::Empty>::SyncType FutureToSync<internal::Empty>(
+ const Future<internal::Empty>& fut) {
+ return fut.status();
+}
+
+template <typename T>
+class WeakFuture {
+ public:
+ explicit WeakFuture(const Future<T>& future) : impl_(future.impl_) {}
+
+ Future<T> get() { return Future<T>{impl_.lock()}; }
+
+ private:
+ std::weak_ptr<FutureImpl> impl_;
+};
+
+/// If a Result<Future> holds an error instead of a Future, construct a finished Future
+/// holding that error.
+template <typename T>
+static Future<T> DeferNotOk(Result<Future<T>> maybe_future) {
+ if (ARROW_PREDICT_FALSE(!maybe_future.ok())) {
+ return Future<T>::MakeFinished(std::move(maybe_future).status());
+ }
+ return std::move(maybe_future).MoveValueUnsafe();
+}
+
/// \brief Wait for all the futures to end, or for the given timeout to expire.
///
/// `true` is returned if all the futures completed before the timeout was reached,
@@ -783,53 +783,53 @@ inline bool WaitForAll(const std::vector<Future<T>*>& futures,
return waiter->Wait(seconds);
}
-/// \brief Create a Future which completes when all of `futures` complete.
-///
-/// The future's result is a vector of the results of `futures`.
-/// Note that this future will never be marked "failed"; failed results
-/// will be stored in the result vector alongside successful results.
-template <typename T>
-Future<std::vector<Result<T>>> All(std::vector<Future<T>> futures) {
- struct State {
- explicit State(std::vector<Future<T>> f)
- : futures(std::move(f)), n_remaining(futures.size()) {}
-
- std::vector<Future<T>> futures;
- std::atomic<size_t> n_remaining;
- };
-
- if (futures.size() == 0) {
- return {std::vector<Result<T>>{}};
- }
-
- auto state = std::make_shared<State>(std::move(futures));
-
- auto out = Future<std::vector<Result<T>>>::Make();
- for (const Future<T>& future : state->futures) {
- future.AddCallback([state, out](const Result<T>&) mutable {
- if (state->n_remaining.fetch_sub(1) != 1) return;
-
- std::vector<Result<T>> results(state->futures.size());
- for (size_t i = 0; i < results.size(); ++i) {
- results[i] = state->futures[i].result();
- }
- out.MarkFinished(std::move(results));
- });
- }
- return out;
-}
-
-template <>
-inline Future<>::Future(Status s) : Future(internal::Empty::ToResult(std::move(s))) {}
-
-/// \brief Create a Future which completes when all of `futures` complete.
-///
-/// The future will be marked complete if all `futures` complete
-/// successfully. Otherwise, it will be marked failed with the status of
-/// the first failing future.
-ARROW_EXPORT
-Future<> AllComplete(const std::vector<Future<>>& futures);
-
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future's result is a vector of the results of `futures`.
+/// Note that this future will never be marked "failed"; failed results
+/// will be stored in the result vector alongside successful results.
+template <typename T>
+Future<std::vector<Result<T>>> All(std::vector<Future<T>> futures) {
+ struct State {
+ explicit State(std::vector<Future<T>> f)
+ : futures(std::move(f)), n_remaining(futures.size()) {}
+
+ std::vector<Future<T>> futures;
+ std::atomic<size_t> n_remaining;
+ };
+
+ if (futures.size() == 0) {
+ return {std::vector<Result<T>>{}};
+ }
+
+ auto state = std::make_shared<State>(std::move(futures));
+
+ auto out = Future<std::vector<Result<T>>>::Make();
+ for (const Future<T>& future : state->futures) {
+ future.AddCallback([state, out](const Result<T>&) mutable {
+ if (state->n_remaining.fetch_sub(1) != 1) return;
+
+ std::vector<Result<T>> results(state->futures.size());
+ for (size_t i = 0; i < results.size(); ++i) {
+ results[i] = state->futures[i].result();
+ }
+ out.MarkFinished(std::move(results));
+ });
+ }
+ return out;
+}
+
+template <>
+inline Future<>::Future(Status s) : Future(internal::Empty::ToResult(std::move(s))) {}
+
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future will be marked complete if all `futures` complete
+/// successfully. Otherwise, it will be marked failed with the status of
+/// the first failing future.
+ARROW_EXPORT
+Future<> AllComplete(const std::vector<Future<>>& futures);
+
/// \brief Wait for one of the futures to end, or for the given timeout to expire.
///
/// The indices of all completed futures are returned. Note that some futures
@@ -854,104 +854,104 @@ inline std::vector<int> WaitForAny(const std::vector<Future<T>*>& futures,
return waiter->MoveFinishedFutures();
}
-struct Continue {
- template <typename T>
- operator util::optional<T>() && { // NOLINT explicit
- return {};
- }
-};
-
-template <typename T = internal::Empty>
-util::optional<T> Break(T break_value = {}) {
- return util::optional<T>{std::move(break_value)};
-}
-
-template <typename T = internal::Empty>
-using ControlFlow = util::optional<T>;
-
-/// \brief Loop through an asynchronous sequence
-///
-/// \param[in] iterate A generator of Future<ControlFlow<BreakValue>>. On completion
-/// of each yielded future the resulting ControlFlow will be examined. A Break will
-/// terminate the loop, while a Continue will re-invoke `iterate`.
-///
-/// \return A future which will complete when a Future returned by iterate completes with
-/// a Break
-template <typename Iterate,
- typename Control = typename detail::result_of_t<Iterate()>::ValueType,
- typename BreakValueType = typename Control::value_type>
-Future<BreakValueType> Loop(Iterate iterate) {
- struct Callback {
- bool CheckForTermination(const Result<Control>& control_res) {
- if (!control_res.ok()) {
- break_fut.MarkFinished(control_res.status());
- return true;
- }
- if (control_res->has_value()) {
- break_fut.MarkFinished(**control_res);
- return true;
- }
- return false;
- }
-
- void operator()(const Result<Control>& maybe_control) && {
- if (CheckForTermination(maybe_control)) return;
-
- auto control_fut = iterate();
- while (true) {
- if (control_fut.TryAddCallback([this]() { return *this; })) {
- // Adding a callback succeeded; control_fut was not finished
- // and we must wait to CheckForTermination.
- return;
- }
- // Adding a callback failed; control_fut was finished and we
- // can CheckForTermination immediately. This also avoids recursion and potential
- // stack overflow.
- if (CheckForTermination(control_fut.result())) return;
-
- control_fut = iterate();
- }
- }
-
- Iterate iterate;
-
- // If control_fut is never completed then we will be hanging on
- // to break_fut forever even if the listener has given up listening on it. Instead we
- // rely on the fact that a producer (the caller of Future<>::Make) is always
- // responsible for completing the futures they create.
- // TODO: Could avoid this kind of situation with "future abandonment" similar to mesos
- Future<BreakValueType> break_fut;
- };
-
- auto break_fut = Future<BreakValueType>::Make();
- auto control_fut = iterate();
- control_fut.AddCallback(Callback{std::move(iterate), break_fut});
-
- return break_fut;
-}
-
-inline Future<> ToFuture(Status status) {
- return Future<>::MakeFinished(std::move(status));
-}
-
-template <typename T>
-Future<T> ToFuture(T value) {
- return Future<T>::MakeFinished(std::move(value));
-}
-
-template <typename T>
-Future<T> ToFuture(Result<T> maybe_value) {
- return Future<T>::MakeFinished(std::move(maybe_value));
-}
-
-template <typename T>
-Future<T> ToFuture(Future<T> fut) {
- return std::move(fut);
-}
-
-template <typename T>
-struct EnsureFuture {
- using type = decltype(ToFuture(std::declval<T>()));
-};
-
+struct Continue {
+ template <typename T>
+ operator util::optional<T>() && { // NOLINT explicit
+ return {};
+ }
+};
+
+template <typename T = internal::Empty>
+util::optional<T> Break(T break_value = {}) {
+ return util::optional<T>{std::move(break_value)};
+}
+
+template <typename T = internal::Empty>
+using ControlFlow = util::optional<T>;
+
+/// \brief Loop through an asynchronous sequence
+///
+/// \param[in] iterate A generator of Future<ControlFlow<BreakValue>>. On completion
+/// of each yielded future the resulting ControlFlow will be examined. A Break will
+/// terminate the loop, while a Continue will re-invoke `iterate`.
+///
+/// \return A future which will complete when a Future returned by iterate completes with
+/// a Break
+template <typename Iterate,
+ typename Control = typename detail::result_of_t<Iterate()>::ValueType,
+ typename BreakValueType = typename Control::value_type>
+Future<BreakValueType> Loop(Iterate iterate) {
+ struct Callback {
+ bool CheckForTermination(const Result<Control>& control_res) {
+ if (!control_res.ok()) {
+ break_fut.MarkFinished(control_res.status());
+ return true;
+ }
+ if (control_res->has_value()) {
+ break_fut.MarkFinished(**control_res);
+ return true;
+ }
+ return false;
+ }
+
+ void operator()(const Result<Control>& maybe_control) && {
+ if (CheckForTermination(maybe_control)) return;
+
+ auto control_fut = iterate();
+ while (true) {
+ if (control_fut.TryAddCallback([this]() { return *this; })) {
+ // Adding a callback succeeded; control_fut was not finished
+ // and we must wait to CheckForTermination.
+ return;
+ }
+ // Adding a callback failed; control_fut was finished and we
+ // can CheckForTermination immediately. This also avoids recursion and potential
+ // stack overflow.
+ if (CheckForTermination(control_fut.result())) return;
+
+ control_fut = iterate();
+ }
+ }
+
+ Iterate iterate;
+
+ // If control_fut is never completed then we will be hanging on
+ // to break_fut forever even if the listener has given up listening on it. Instead we
+ // rely on the fact that a producer (the caller of Future<>::Make) is always
+ // responsible for completing the futures they create.
+ // TODO: Could avoid this kind of situation with "future abandonment" similar to mesos
+ Future<BreakValueType> break_fut;
+ };
+
+ auto break_fut = Future<BreakValueType>::Make();
+ auto control_fut = iterate();
+ control_fut.AddCallback(Callback{std::move(iterate), break_fut});
+
+ return break_fut;
+}
+
+inline Future<> ToFuture(Status status) {
+ return Future<>::MakeFinished(std::move(status));
+}
+
+template <typename T>
+Future<T> ToFuture(T value) {
+ return Future<T>::MakeFinished(std::move(value));
+}
+
+template <typename T>
+Future<T> ToFuture(Result<T> maybe_value) {
+ return Future<T>::MakeFinished(std::move(maybe_value));
+}
+
+template <typename T>
+Future<T> ToFuture(Future<T> fut) {
+ return std::move(fut);
+}
+
+template <typename T>
+struct EnsureFuture {
+ using type = decltype(ToFuture(std::declval<T>()));
+};
+
} // namespace arrow
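
Taken together, the restored combinators compose as below. This is a minimal sketch, assuming only the public Future API visible in the hunks above (Make/MarkFinished, the implicit Result/Status constructors, Then, All, and Loop with ControlFlow/Break); the header path is inferred from this diff's tree.

#include <vector>
#include "arrow/util/future.h"

void Sketch() {
  auto fut = arrow::Future<int>::Make();
  // Then() chains a continuation; on_success is invoked with the value once fut completes.
  arrow::Future<int> doubled = fut.Then([](const int& v) { return v * 2; });
  fut.MarkFinished(21);  // doubled now completes with 42

  // All() never fails as a whole: per-future errors stay inside the result vector.
  auto gathered = arrow::All(std::vector<arrow::Future<int>>{doubled});

  // Loop() re-invokes the callable until a yielded future resolves to Break(value).
  int i = 0;
  auto last = arrow::Loop([&]() -> arrow::Future<arrow::ControlFlow<int>> {
    if (++i < 3) return arrow::ControlFlow<int>{};  // empty optional == Continue
    return arrow::Break(i);
  });
}
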
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
index ac1adcfb13e..2b887cfbfeb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
@@ -39,7 +39,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
@@ -329,7 +329,7 @@ class HashTable {
// Stash old entries and seal builder, effectively resetting the Buffer
const Entry* old_entries = entries_;
- ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
+ ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
// Allocate new buffer
RETURN_NOT_OK(UpsizeBuffer(new_capacity));
@@ -460,13 +460,13 @@ class ScalarMemoTable : public MemoTable {
out_data[index] = entry->payload.value;
}
});
- // Zero-initialize the null entry
- if (null_index_ != kKeyNotFound) {
- int32_t index = null_index_ - start;
- if (index >= 0) {
- out_data[index] = Scalar{};
- }
- }
+ // Zero-initialize the null entry
+ if (null_index_ != kKeyNotFound) {
+ int32_t index = null_index_ - start;
+ if (index >= 0) {
+ out_data[index] = Scalar{};
+ }
+ }
}
void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }
@@ -697,8 +697,8 @@ class BinaryMemoTable : public MemoTable {
DCHECK_LE(start, size());
const builder_offset_type* offsets = binary_builder_.offsets_data();
- const builder_offset_type delta =
- start < binary_builder_.length() ? offsets[start] : 0;
+ const builder_offset_type delta =
+ start < binary_builder_.length() ? offsets[start] : 0;
for (int32_t i = start; i < size(); ++i) {
const builder_offset_type adjusted_offset = offsets[i] - delta;
Offset cast_offset = static_cast<Offset>(adjusted_offset);
@@ -781,8 +781,8 @@ class BinaryMemoTable : public MemoTable {
if (left_size > 0) {
memcpy(out_data, in_data + left_offset, left_size);
}
- // Zero-initialize the null entry
- memset(out_data + left_size, 0, width_size);
+ // Zero-initialize the null entry
+ memset(out_data + left_size, 0, width_size);
auto right_size = values_size() - static_cast<size_t>(null_data_offset);
if (right_size > 0) {
@@ -852,8 +852,8 @@ struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};
-template <typename T>
-struct HashTraits<T, enable_if_decimal<T>> {
+template <typename T>
+struct HashTraits<T, enable_if_decimal<T>> {
using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};
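
The memo tables touched above underpin dictionary encoding: every distinct key gets a dense integer id, and CopyValues() materializes the dictionary, zero-filling the slot reserved for null. A toy stand-in illustrating that contract (a plain std::unordered_map, not Arrow's real open-addressing internals; GetOrInsert here is a hypothetical name):

#include <cstdint>
#include <unordered_map>
#include <vector>

template <typename Scalar>
struct ToyMemoTable {
  // Return the dense memo index for key, inserting it on first sight.
  int32_t GetOrInsert(const Scalar& key) {
    auto it = ids_.find(key);
    if (it != ids_.end()) return it->second;
    const int32_t id = static_cast<int32_t>(values_.size());
    ids_.emplace(key, id);
    values_.push_back(key);
    return id;
  }
  // index -> key, the inverse mapping that CopyValues() writes out above.
  void CopyValues(Scalar* out) const {
    for (size_t i = 0; i < values_.size(); ++i) out[i] = values_[i];
  }
  std::unordered_map<Scalar, int32_t> ids_;
  std::vector<Scalar> values_;
};
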
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
index 1d494671a9f..34665dcf00c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
@@ -16,10 +16,10 @@
// under the License.
#pragma once
-#include "arrow/util/config.h"
+#include "arrow/util/config.h"
#include "arrow/util/macros.h"
-#ifndef ARROW_USE_NATIVE_INT128
+#ifndef ARROW_USE_NATIVE_INT128
#include <boost/multiprecision/cpp_int.hpp>
#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
index 24c5fe56eff..91ab77c64c7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
@@ -26,13 +26,13 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
namespace internal {
@@ -59,7 +59,7 @@ static const uint64_t max_uints[] = {0, max_uint8, max_uint16, 0, max_ui
0, 0, 0, max_uint64};
// Check if we would need to expand the underlying storage type
-static inline uint8_t ExpandedUIntWidth(uint64_t val, uint8_t current_width) {
+static inline uint8_t ExpandedUIntWidth(uint64_t val, uint8_t current_width) {
// Optimize for the common case where width doesn't change
if (ARROW_PREDICT_TRUE(val <= max_uints[current_width])) {
return current_width;
@@ -366,7 +366,7 @@ width8:
}
template <typename Source, typename Dest>
-static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t length) {
+static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t length) {
while (length >= 4) {
dest[0] = static_cast<Dest>(src[0]);
dest[1] = static_cast<Dest>(src[1]);
@@ -383,15 +383,15 @@ static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t lengt
}
void DowncastInts(const int64_t* source, int8_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastInts(const int64_t* source, int16_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastInts(const int64_t* source, int32_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastInts(const int64_t* source, int64_t* dest, int64_t length) {
@@ -399,25 +399,25 @@ void DowncastInts(const int64_t* source, int64_t* dest, int64_t length) {
}
void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) {
memcpy(dest, source, length * sizeof(int64_t));
}
-void UpcastInts(const int32_t* source, int64_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
-}
-
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
template <typename InputInt, typename OutputInt>
void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length,
const int32_t* transpose_map) {
@@ -466,72 +466,72 @@ INSTANTIATE_ALL()
#undef INSTANTIATE_ALL
#undef INSTANTIATE_ALL_DEST
-namespace {
-
-template <typename SrcType>
-struct TransposeIntsDest {
- const SrcType* src;
- uint8_t* dest;
- int64_t dest_offset;
- int64_t length;
- const int32_t* transpose_map;
-
- template <typename T>
- enable_if_integer<T, Status> Visit(const T&) {
- using DestType = typename T::c_type;
- TransposeInts(src, reinterpret_cast<DestType*>(dest) + dest_offset, length,
- transpose_map);
- return Status::OK();
- }
-
- Status Visit(const DataType& type) {
- return Status::TypeError("TransposeInts received non-integer dest_type");
- }
-
- Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
-};
-
-struct TransposeIntsSrc {
- const uint8_t* src;
- uint8_t* dest;
- int64_t src_offset;
- int64_t dest_offset;
- int64_t length;
- const int32_t* transpose_map;
- const DataType& dest_type;
-
- template <typename T>
- enable_if_integer<T, Status> Visit(const T&) {
- using SrcType = typename T::c_type;
- return TransposeIntsDest<SrcType>{reinterpret_cast<const SrcType*>(src) + src_offset,
- dest, dest_offset, length,
- transpose_map}(dest_type);
- }
-
- Status Visit(const DataType& type) {
- return Status::TypeError("TransposeInts received non-integer dest_type");
- }
-
- Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
-};
-
-} // namespace
-
-Status TransposeInts(const DataType& src_type, const DataType& dest_type,
- const uint8_t* src, uint8_t* dest, int64_t src_offset,
- int64_t dest_offset, int64_t length, const int32_t* transpose_map) {
- TransposeIntsSrc transposer{src, dest, src_offset, dest_offset,
- length, transpose_map, dest_type};
- return transposer(src_type);
-}
-
+namespace {
+
+template <typename SrcType>
+struct TransposeIntsDest {
+ const SrcType* src;
+ uint8_t* dest;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using DestType = typename T::c_type;
+ TransposeInts(src, reinterpret_cast<DestType*>(dest) + dest_offset, length,
+ transpose_map);
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("TransposeInts received non-integer dest_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+struct TransposeIntsSrc {
+ const uint8_t* src;
+ uint8_t* dest;
+ int64_t src_offset;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+ const DataType& dest_type;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using SrcType = typename T::c_type;
+ return TransposeIntsDest<SrcType>{reinterpret_cast<const SrcType*>(src) + src_offset,
+ dest, dest_offset, length,
+ transpose_map}(dest_type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("TransposeInts received non-integer dest_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+} // namespace
+
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+ const uint8_t* src, uint8_t* dest, int64_t src_offset,
+ int64_t dest_offset, int64_t length, const int32_t* transpose_map) {
+ TransposeIntsSrc transposer{src, dest, src_offset, dest_offset,
+ length, transpose_map, dest_type};
+ return transposer(src_type);
+}
+
template <typename T>
-static std::string FormatInt(T val) {
+static std::string FormatInt(T val) {
return std::to_string(val);
}
template <typename IndexCType, bool IsSigned = std::is_signed<IndexCType>::value>
-static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limit) {
+static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limit) {
// For unsigned integers, if the values array is larger than the maximum
// index value (e.g. especially for UINT8 / UINT16), then there is no need to
// boundscheck.
@@ -549,22 +549,22 @@ static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limi
return ((IsSigned && val < 0) ||
(val >= 0 && static_cast<uint64_t>(val) >= upper_limit));
};
- return VisitSetBitRuns(
- bitmap, indices.offset, indices.length, [&](int64_t offset, int64_t length) {
- bool block_out_of_bounds = false;
- for (int64_t i = 0; i < length; ++i) {
- block_out_of_bounds |= IsOutOfBounds(indices_data[offset + i]);
+ return VisitSetBitRuns(
+ bitmap, indices.offset, indices.length, [&](int64_t offset, int64_t length) {
+ bool block_out_of_bounds = false;
+ for (int64_t i = 0; i < length; ++i) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[offset + i]);
}
- if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
- for (int64_t i = 0; i < length; ++i) {
- if (IsOutOfBounds(indices_data[offset + i])) {
- return Status::IndexError("Index ", FormatInt(indices_data[offset + i]),
- " out of bounds");
- }
+ if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
+ for (int64_t i = 0; i < length; ++i) {
+ if (IsOutOfBounds(indices_data[offset + i])) {
+ return Status::IndexError("Index ", FormatInt(indices_data[offset + i]),
+ " out of bounds");
+ }
}
}
- return Status::OK();
- });
+ return Status::OK();
+ });
}
/// \brief Branchless boundschecking of the indices. Processes batches of
@@ -596,8 +596,8 @@ Status CheckIndexBounds(const ArrayData& indices, uint64_t upper_limit) {
// ----------------------------------------------------------------------
// Utilities for casting from one integer type to another
-namespace {
-
+namespace {
+
template <typename InType, typename CType = typename InType::c_type>
Status IntegersInRange(const Datum& datum, CType bound_lower, CType bound_upper) {
if (std::numeric_limits<CType>::lowest() >= bound_lower &&
@@ -696,8 +696,8 @@ Status CheckIntegersInRangeImpl(const Datum& datum, const Scalar& bound_lower,
checked_cast<const ScalarType&>(bound_upper).value);
}
-} // namespace
-
+} // namespace
+
Status CheckIntegersInRange(const Datum& datum, const Scalar& bound_lower,
const Scalar& bound_upper) {
Type::type type_id = datum.type()->id();
@@ -729,8 +729,8 @@ Status CheckIntegersInRange(const Datum& datum, const Scalar& bound_lower,
}
}
-namespace {
-
+namespace {
+
template <typename O, typename I, typename Enable = void>
struct is_number_downcast {
static constexpr bool value = false;
@@ -919,8 +919,8 @@ Status IntegersCanFitImpl(const Datum& datum, const DataType& target_type) {
return CheckIntegersInRange(datum, ScalarType(bound_min), ScalarType(bound_max));
}
-} // namespace
-
+} // namespace
+
Status IntegersCanFit(const Datum& datum, const DataType& target_type) {
if (!is_integer(target_type.id())) {
return Status::Invalid("Target type is not an integer type: ", target_type);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
index bf9226cdf12..145a83b3171 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
@@ -18,7 +18,7 @@
#pragma once
#include <cstdint>
-#include <type_traits>
+#include <type_traits>
#include "arrow/status.h"
#include "arrow/util/visibility.h"
@@ -70,30 +70,30 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length);
ARROW_EXPORT
void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length);
-ARROW_EXPORT
-void UpcastInts(const int32_t* source, int64_t* dest, int64_t length);
-
-template <typename InputInt, typename OutputInt>
-inline typename std::enable_if<(sizeof(InputInt) >= sizeof(OutputInt))>::type CastInts(
- const InputInt* source, OutputInt* dest, int64_t length) {
- DowncastInts(source, dest, length);
-}
-
-template <typename InputInt, typename OutputInt>
-inline typename std::enable_if<(sizeof(InputInt) < sizeof(OutputInt))>::type CastInts(
- const InputInt* source, OutputInt* dest, int64_t length) {
- UpcastInts(source, dest, length);
-}
-
+ARROW_EXPORT
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length);
+
template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) >= sizeof(OutputInt))>::type CastInts(
+ const InputInt* source, OutputInt* dest, int64_t length) {
+ DowncastInts(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) < sizeof(OutputInt))>::type CastInts(
+ const InputInt* source, OutputInt* dest, int64_t length) {
+ UpcastInts(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length,
const int32_t* transpose_map);
-ARROW_EXPORT
-Status TransposeInts(const DataType& src_type, const DataType& dest_type,
- const uint8_t* src, uint8_t* dest, int64_t src_offset,
- int64_t dest_offset, int64_t length, const int32_t* transpose_map);
-
+ARROW_EXPORT
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+ const uint8_t* src, uint8_t* dest, int64_t src_offset,
+ int64_t dest_offset, int64_t length, const int32_t* transpose_map);
+
/// \brief Do vectorized boundschecking of integer-type array indices. The
/// indices must be non-negative and strictly less than the passed upper
/// limit (which is usually the length of an array that is being indexed-into).
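
CastInts(), restored just above, dispatches at compile time on operand widths: same-or-narrower destinations route to DowncastInts, wider ones to UpcastInts. A standalone demo of that enable_if dispatch (function names hypothetical; Arrow's real overloads forward to the unrolled CastIntsInternal):

#include <cstdint>
#include <type_traits>

template <typename In, typename Out>
typename std::enable_if<(sizeof(In) >= sizeof(Out))>::type
CastDemo(const In* src, Out* dst, int64_t n) {
  // Narrowing (or same-width) path; Arrow routes this case to DowncastInts.
  for (int64_t i = 0; i < n; ++i) dst[i] = static_cast<Out>(src[i]);
}

template <typename In, typename Out>
typename std::enable_if<(sizeof(In) < sizeof(Out))>::type
CastDemo(const In* src, Out* dst, int64_t n) {
  // Widening path; Arrow routes this case to UpcastInts.
  for (int64_t i = 0; i < n; ++i) dst[i] = static_cast<Out>(src[i]);
}
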
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
index 4136706629f..3760d03c9ff 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
@@ -63,27 +63,27 @@ OPS_WITH_OVERFLOW(DivideWithOverflow, div)
#undef OP_WITH_OVERFLOW
#undef OPS_WITH_OVERFLOW
-// Define function NegateWithOverflow with the signature `bool(T u, T* out)`
-// where T is a signed integer type. On overflow, these functions return true.
-// Otherwise, false is returned and `out` is updated with the result of the
-// operation.
-
-#define UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type) \
- static inline bool _func_name(_type u, _type* out) { \
- return !psnip_safe_##_psnip_type##_##_psnip_op(out, u); \
- }
-
-#define SIGNED_UNARY_OPS_WITH_OVERFLOW(_func_name, _psnip_op) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64)
-
-SIGNED_UNARY_OPS_WITH_OVERFLOW(NegateWithOverflow, neg)
-
-#undef UNARY_OP_WITH_OVERFLOW
-#undef SIGNED_UNARY_OPS_WITH_OVERFLOW
-
+// Define function NegateWithOverflow with the signature `bool(T u, T* out)`
+// where T is a signed integer type. On overflow, these functions return true.
+// Otherwise, false is returned and `out` is updated with the result of the
+// operation.
+
+#define UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type) \
+ static inline bool _func_name(_type u, _type* out) { \
+ return !psnip_safe_##_psnip_type##_##_psnip_op(out, u); \
+ }
+
+#define SIGNED_UNARY_OPS_WITH_OVERFLOW(_func_name, _psnip_op) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64)
+
+SIGNED_UNARY_OPS_WITH_OVERFLOW(NegateWithOverflow, neg)
+
+#undef UNARY_OP_WITH_OVERFLOW
+#undef SIGNED_UNARY_OPS_WITH_OVERFLOW
+
/// Signed addition with well-defined behaviour on overflow (as unsigned)
template <typename SignedInt>
SignedInt SafeSignedAdd(SignedInt u, SignedInt v) {
@@ -100,13 +100,13 @@ SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) {
static_cast<UnsignedInt>(v));
}
-/// Signed negation with well-defined behaviour on overflow (as unsigned)
-template <typename SignedInt>
-SignedInt SafeSignedNegate(SignedInt u) {
- using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
- return static_cast<SignedInt>(~static_cast<UnsignedInt>(u) + 1);
-}
-
+/// Signed negation with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedNegate(SignedInt u) {
+ using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+ return static_cast<SignedInt>(~static_cast<UnsignedInt>(u) + 1);
+}
+
/// Signed left shift with well-defined behaviour on negative numbers or overflow
template <typename SignedInt, typename Shift>
SignedInt SafeLeftShift(SignedInt u, Shift shift) {
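
The Safe* helpers above avoid signed-overflow UB by doing the arithmetic in the unsigned domain, where wraparound is defined, then casting back; the *WithOverflow macros instead report overflow to the caller. Both idioms in a self-contained sketch (NegateChecked replaces the psnip_safe_* call with an explicit INT32_MIN test):

#include <cstdint>
#include <limits>
#include <type_traits>

// Wraparound-defined signed addition, same trick as SafeSignedAdd above.
template <typename S>
S SafeAddDemo(S u, S v) {
  using U = typename std::make_unsigned<S>::type;
  return static_cast<S>(static_cast<U>(u) + static_cast<U>(v));
}

// Overflow-reporting negation, analogous to NegateWithOverflow: returns true
// on overflow, otherwise writes the result to *out and returns false.
inline bool NegateChecked(int32_t u, int32_t* out) {
  if (u == std::numeric_limits<int32_t>::min()) return true;  // -INT32_MIN overflows
  *out = -u;
  return false;
}
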
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
index f6566ea7e36..85f3843f715 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
@@ -22,15 +22,15 @@
#define _FILE_OFFSET_BITS 64
-#if defined(sun) || defined(__sun)
-// According to https://bugs.python.org/issue1759169#msg82201, __EXTENSIONS__
-// is the best way to enable modern POSIX APIs, such as posix_madvise(), on Solaris.
-// (see also
-// https://github.com/illumos/illumos-gate/blob/master/usr/src/uts/common/sys/mman.h)
-#undef __EXTENSIONS__
-#define __EXTENSIONS__
-#endif
-
+#if defined(sun) || defined(__sun)
+// According to https://bugs.python.org/issue1759169#msg82201, __EXTENSIONS__
+// is the best way to enable modern POSIX APIs, such as posix_madvise(), on Solaris.
+// (see also
+// https://github.com/illumos/illumos-gate/blob/master/usr/src/uts/common/sys/mman.h)
+#undef __EXTENSIONS__
+#define __EXTENSIONS__
+#endif
+
#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep
#include <algorithm>
@@ -41,7 +41,7 @@
#include <random>
#include <sstream>
#include <string>
-#include <thread>
+#include <thread>
#include <utility>
#include <vector>
@@ -244,26 +244,26 @@ class WinErrorDetail : public StatusDetail {
};
#endif
-const char kSignalDetailTypeId[] = "arrow::SignalDetail";
-
-class SignalDetail : public StatusDetail {
- public:
- explicit SignalDetail(int signum) : signum_(signum) {}
-
- const char* type_id() const override { return kSignalDetailTypeId; }
-
- std::string ToString() const override {
- std::stringstream ss;
- ss << "received signal " << signum_;
- return ss.str();
- }
-
- int signum() const { return signum_; }
-
- protected:
- int signum_;
-};
-
+const char kSignalDetailTypeId[] = "arrow::SignalDetail";
+
+class SignalDetail : public StatusDetail {
+ public:
+ explicit SignalDetail(int signum) : signum_(signum) {}
+
+ const char* type_id() const override { return kSignalDetailTypeId; }
+
+ std::string ToString() const override {
+ std::stringstream ss;
+ ss << "received signal " << signum_;
+ return ss.str();
+ }
+
+ int signum() const { return signum_; }
+
+ protected:
+ int signum_;
+};
+
} // namespace
std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum) {
@@ -276,10 +276,10 @@ std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum) {
}
#endif
-std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum) {
- return std::make_shared<SignalDetail>(signum);
-}
-
+std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum) {
+ return std::make_shared<SignalDetail>(signum);
+}
+
int ErrnoFromStatus(const Status& status) {
const auto detail = status.detail();
if (detail != nullptr && detail->type_id() == kErrnoDetailTypeId) {
@@ -298,14 +298,14 @@ int WinErrorFromStatus(const Status& status) {
return 0;
}
-int SignalFromStatus(const Status& status) {
- const auto detail = status.detail();
- if (detail != nullptr && detail->type_id() == kSignalDetailTypeId) {
- return checked_cast<const SignalDetail&>(*detail).signum();
- }
- return 0;
-}
-
+int SignalFromStatus(const Status& status) {
+ const auto detail = status.detail();
+ if (detail != nullptr && detail->type_id() == kSignalDetailTypeId) {
+ return checked_cast<const SignalDetail&>(*detail).signum();
+ }
+ return 0;
+}
+
//
// PlatformFilename implementation
//
@@ -403,18 +403,18 @@ namespace {
Result<bool> DoCreateDir(const PlatformFilename& dir_path, bool create_parents) {
#ifdef _WIN32
- const auto s = dir_path.ToNative().c_str();
- if (CreateDirectoryW(s, nullptr)) {
+ const auto s = dir_path.ToNative().c_str();
+ if (CreateDirectoryW(s, nullptr)) {
return true;
}
int errnum = GetLastError();
if (errnum == ERROR_ALREADY_EXISTS) {
- const auto attrs = GetFileAttributesW(s);
- if (attrs == INVALID_FILE_ATTRIBUTES || !(attrs & FILE_ATTRIBUTE_DIRECTORY)) {
- // Note we propagate the original error, not the GetFileAttributesW() error
- return IOErrorFromWinError(ERROR_ALREADY_EXISTS, "Cannot create directory '",
- dir_path.ToString(), "': non-directory entry exists");
- }
+ const auto attrs = GetFileAttributesW(s);
+ if (attrs == INVALID_FILE_ATTRIBUTES || !(attrs & FILE_ATTRIBUTE_DIRECTORY)) {
+ // Note we propagate the original error, not the GetFileAttributesW() error
+ return IOErrorFromWinError(ERROR_ALREADY_EXISTS, "Cannot create directory '",
+ dir_path.ToString(), "': non-directory entry exists");
+ }
return false;
}
if (create_parents && errnum == ERROR_PATH_NOT_FOUND) {
@@ -427,17 +427,17 @@ Result<bool> DoCreateDir(const PlatformFilename& dir_path, bool create_parents)
return IOErrorFromWinError(GetLastError(), "Cannot create directory '",
dir_path.ToString(), "'");
#else
- const auto s = dir_path.ToNative().c_str();
- if (mkdir(s, S_IRWXU | S_IRWXG | S_IRWXO) == 0) {
+ const auto s = dir_path.ToNative().c_str();
+ if (mkdir(s, S_IRWXU | S_IRWXG | S_IRWXO) == 0) {
return true;
}
if (errno == EEXIST) {
- struct stat st;
- if (stat(s, &st) || !S_ISDIR(st.st_mode)) {
- // Note we propagate the original errno, not the stat() errno
- return IOErrorFromErrno(EEXIST, "Cannot create directory '", dir_path.ToString(),
- "': non-directory entry exists");
- }
+ struct stat st;
+ if (stat(s, &st) || !S_ISDIR(st.st_mode)) {
+ // Note we propagate the original errno, not the stat() errno
+ return IOErrorFromErrno(EEXIST, "Cannot create directory '", dir_path.ToString(),
+ "': non-directory entry exists");
+ }
return false;
}
if (create_parents && errno == ENOENT) {
@@ -1019,15 +1019,15 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes,
return StatusFromMmapErrno("MapViewOfFile failed");
}
return Status::OK();
-#elif defined(__linux__)
- if (ftruncate(fildes, new_size) == -1) {
- return StatusFromMmapErrno("ftruncate failed");
- }
- *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE);
- if (*new_addr == MAP_FAILED) {
- return StatusFromMmapErrno("mremap failed");
- }
- return Status::OK();
+#elif defined(__linux__)
+ if (ftruncate(fildes, new_size) == -1) {
+ return StatusFromMmapErrno("ftruncate failed");
+ }
+ *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE);
+ if (*new_addr == MAP_FAILED) {
+ return StatusFromMmapErrno("mremap failed");
+ }
+ return Status::OK();
#else
// we have to close the mmap first, truncate the file to the new size
// and recreate the mmap
@@ -1089,7 +1089,7 @@ Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions) {
}
}
return Status::OK();
-#elif defined(POSIX_MADV_WILLNEED)
+#elif defined(POSIX_MADV_WILLNEED)
for (const auto& region : regions) {
if (region.size != 0) {
const auto aligned = align_region(region);
@@ -1103,8 +1103,8 @@ Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions) {
}
}
return Status::OK();
-#else
- return Status::OK();
+#else
+ return Status::OK();
#endif
}
@@ -1468,51 +1468,51 @@ std::string MakeRandomName(int num_chars) {
} // namespace
Result<std::unique_ptr<TemporaryDir>> TemporaryDir::Make(const std::string& prefix) {
- const int kNumChars = 8;
-
+ const int kNumChars = 8;
+
NativePathString base_name;
- auto MakeBaseName = [&]() {
- std::string suffix = MakeRandomName(kNumChars);
- return StringToNative(prefix + suffix);
- };
-
- auto TryCreatingDirectory =
- [&](const NativePathString& base_dir) -> Result<std::unique_ptr<TemporaryDir>> {
- Status st;
- for (int attempt = 0; attempt < 3; ++attempt) {
- PlatformFilename fn(base_dir + kNativeSep + base_name + kNativeSep);
- auto result = CreateDir(fn);
- if (!result.ok()) {
- // Probably a permissions error or a non-existing base_dir
- return nullptr;
- }
- if (*result) {
- return std::unique_ptr<TemporaryDir>(new TemporaryDir(std::move(fn)));
- }
- // The random name already exists in base_dir, try with another name
- st = Status::IOError("Path already exists: '", fn.ToString(), "'");
- ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
- }
- return st;
- };
-
- ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
-
+ auto MakeBaseName = [&]() {
+ std::string suffix = MakeRandomName(kNumChars);
+ return StringToNative(prefix + suffix);
+ };
+
+ auto TryCreatingDirectory =
+ [&](const NativePathString& base_dir) -> Result<std::unique_ptr<TemporaryDir>> {
+ Status st;
+ for (int attempt = 0; attempt < 3; ++attempt) {
+ PlatformFilename fn(base_dir + kNativeSep + base_name + kNativeSep);
+ auto result = CreateDir(fn);
+ if (!result.ok()) {
+ // Probably a permissions error or a non-existing base_dir
+ return nullptr;
+ }
+ if (*result) {
+ return std::unique_ptr<TemporaryDir>(new TemporaryDir(std::move(fn)));
+ }
+ // The random name already exists in base_dir, try with another name
+ st = Status::IOError("Path already exists: '", fn.ToString(), "'");
+ ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
+ }
+ return st;
+ };
+
+ ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
+
auto base_dirs = GetPlatformTemporaryDirs();
DCHECK_NE(base_dirs.size(), 0);
- for (const auto& base_dir : base_dirs) {
- ARROW_ASSIGN_OR_RAISE(auto ptr, TryCreatingDirectory(base_dir));
- if (ptr) {
- return std::move(ptr);
+ for (const auto& base_dir : base_dirs) {
+ ARROW_ASSIGN_OR_RAISE(auto ptr, TryCreatingDirectory(base_dir));
+ if (ptr) {
+ return std::move(ptr);
}
- // Cannot create in this directory, try the next one
+ // Cannot create in this directory, try the next one
}
- return Status::IOError(
- "Cannot create temporary subdirectory in any "
- "of the platform temporary directories");
+ return Status::IOError(
+ "Cannot create temporary subdirectory in any "
+ "of the platform temporary directories");
}
TemporaryDir::TemporaryDir(PlatformFilename&& path) : path_(std::move(path)) {}
@@ -1594,64 +1594,64 @@ Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler)
return Status::OK();
}
-void ReinstateSignalHandler(int signum, SignalHandler::Callback handler) {
-#if !ARROW_HAVE_SIGACTION
- // Cannot report any errors from signal() (but there shouldn't be any)
- signal(signum, handler);
-#endif
-}
-
-Status SendSignal(int signum) {
- if (raise(signum) == 0) {
- return Status::OK();
- }
- if (errno == EINVAL) {
- return Status::Invalid("Invalid signal number ", signum);
- }
- return IOErrorFromErrno(errno, "Failed to raise signal");
-}
-
-Status SendSignalToThread(int signum, uint64_t thread_id) {
-#ifdef _WIN32
- return Status::NotImplemented("Cannot send signal to specific thread on Windows");
-#else
- // Have to use a C-style cast because pthread_t can be a pointer *or* integer type
- int r = pthread_kill((pthread_t)thread_id, signum); // NOLINT readability-casting
- if (r == 0) {
- return Status::OK();
- }
- if (r == EINVAL) {
- return Status::Invalid("Invalid signal number ", signum);
- }
- return IOErrorFromErrno(r, "Failed to raise signal");
-#endif
-}
-
+void ReinstateSignalHandler(int signum, SignalHandler::Callback handler) {
+#if !ARROW_HAVE_SIGACTION
+ // Cannot report any errors from signal() (but there shouldn't be any)
+ signal(signum, handler);
+#endif
+}
+
+Status SendSignal(int signum) {
+ if (raise(signum) == 0) {
+ return Status::OK();
+ }
+ if (errno == EINVAL) {
+ return Status::Invalid("Invalid signal number ", signum);
+ }
+ return IOErrorFromErrno(errno, "Failed to raise signal");
+}
+
+Status SendSignalToThread(int signum, uint64_t thread_id) {
+#ifdef _WIN32
+ return Status::NotImplemented("Cannot send signal to specific thread on Windows");
+#else
+ // Have to use a C-style cast because pthread_t can be a pointer *or* integer type
+ int r = pthread_kill((pthread_t)thread_id, signum); // NOLINT readability-casting
+ if (r == 0) {
+ return Status::OK();
+ }
+ if (r == EINVAL) {
+ return Status::Invalid("Invalid signal number ", signum);
+ }
+ return IOErrorFromErrno(r, "Failed to raise signal");
+#endif
+}
+
namespace {
-int64_t GetPid() {
-#ifdef _WIN32
- return GetCurrentProcessId();
-#else
- return getpid();
-#endif
-}
-
+int64_t GetPid() {
+#ifdef _WIN32
+ return GetCurrentProcessId();
+#else
+ return getpid();
+#endif
+}
+
std::mt19937_64 GetSeedGenerator() {
// Initialize Mersenne Twister PRNG with a true random seed.
- // Make sure to mix in process id to minimize risks of clashes when parallel testing.
+ // Make sure to mix in process id to minimize risks of clashes when parallel testing.
#ifdef ARROW_VALGRIND
// Valgrind can crash, hang or enter an infinite loop on std::random_device,
// use a crude initializer instead.
const uint8_t dummy = 0;
ARROW_UNUSED(dummy);
std::mt19937_64 seed_gen(reinterpret_cast<uintptr_t>(&dummy) ^
- static_cast<uintptr_t>(GetPid()));
+ static_cast<uintptr_t>(GetPid()));
#else
std::random_device true_random;
std::mt19937_64 seed_gen(static_cast<uint64_t>(true_random()) ^
- (static_cast<uint64_t>(true_random()) << 32) ^
- static_cast<uint64_t>(GetPid()));
+ (static_cast<uint64_t>(true_random()) << 32) ^
+ static_cast<uint64_t>(GetPid()));
#endif
return seed_gen;
}
@@ -1665,21 +1665,21 @@ int64_t GetRandomSeed() {
return static_cast<int64_t>(seed_gen());
}
-uint64_t GetThreadId() {
- uint64_t equiv{0};
- // std::thread::id is trivially copyable as per C++ spec,
- // so type punning as a uint64_t should work
- static_assert(sizeof(std::thread::id) <= sizeof(uint64_t),
- "std::thread::id can't fit into uint64_t");
- const auto tid = std::this_thread::get_id();
- memcpy(&equiv, reinterpret_cast<const void*>(&tid), sizeof(tid));
- return equiv;
-}
-
-uint64_t GetOptionalThreadId() {
- auto tid = GetThreadId();
- return (tid == 0) ? tid - 1 : tid;
-}
-
+uint64_t GetThreadId() {
+ uint64_t equiv{0};
+ // std::thread::id is trivially copyable as per C++ spec,
+ // so type punning as a uint64_t should work
+ static_assert(sizeof(std::thread::id) <= sizeof(uint64_t),
+ "std::thread::id can't fit into uint64_t");
+ const auto tid = std::this_thread::get_id();
+ memcpy(&equiv, reinterpret_cast<const void*>(&tid), sizeof(tid));
+ return equiv;
+}
+
+uint64_t GetOptionalThreadId() {
+ auto tid = GetThreadId();
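+ // A tid of 0 wraps to UINT64_MAX below, presumably reserving 0 as a "no thread" sentinel.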
+ return (tid == 0) ? tid - 1 : tid;
+}
+
} // namespace internal
} // namespace arrow
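
GetThreadId() above type-puns std::thread::id into a uint64_t via memcpy, which is valid because std::thread::id is trivially copyable (the static_assert guards the size). One plausible use, combining it with GetRandomSeed() to seed a per-thread PRNG; a sketch assuming the arrow/util/io_util.h header and the arrow::internal namespace shown in this diff:

#include <cstdint>
#include <random>
#include "arrow/util/io_util.h"

std::mt19937_64& ThreadLocalRng() {
  // Mix the unpredictable seed with the thread id so threads diverge.
  thread_local std::mt19937_64 rng(
      static_cast<uint64_t>(arrow::internal::GetRandomSeed()) ^
      arrow::internal::GetThreadId());
  return rng;
}
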
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
index 4255dd37105..7aa26f0819e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
@@ -209,8 +209,8 @@ std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum);
ARROW_EXPORT
std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum);
#endif
-ARROW_EXPORT
-std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum);
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum);
template <typename... Args>
Status StatusFromErrno(int errnum, StatusCode code, Args&&... args) {
@@ -236,17 +236,17 @@ Status IOErrorFromWinError(int errnum, Args&&... args) {
}
#endif
-template <typename... Args>
-Status StatusFromSignal(int signum, StatusCode code, Args&&... args) {
- return Status::FromDetailAndArgs(code, StatusDetailFromSignal(signum),
- std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-Status CancelledFromSignal(int signum, Args&&... args) {
- return StatusFromSignal(signum, StatusCode::Cancelled, std::forward<Args>(args)...);
-}
-
+template <typename... Args>
+Status StatusFromSignal(int signum, StatusCode code, Args&&... args) {
+ return Status::FromDetailAndArgs(code, StatusDetailFromSignal(signum),
+ std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status CancelledFromSignal(int signum, Args&&... args) {
+ return StatusFromSignal(signum, StatusCode::Cancelled, std::forward<Args>(args)...);
+}
+
ARROW_EXPORT
int ErrnoFromStatus(const Status&);
@@ -254,9 +254,9 @@ int ErrnoFromStatus(const Status&);
ARROW_EXPORT
int WinErrorFromStatus(const Status&);
-ARROW_EXPORT
-int SignalFromStatus(const Status&);
-
+ARROW_EXPORT
+int SignalFromStatus(const Status&);
+
class ARROW_EXPORT TemporaryDir {
public:
~TemporaryDir();
@@ -309,26 +309,26 @@ Result<SignalHandler> GetSignalHandler(int signum);
ARROW_EXPORT
Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler);
-/// \brief Reinstate the signal handler
-///
-/// For use in signal handlers. This is needed on platforms without sigaction()
-/// such as Windows, as the default signal handler is restored there as
-/// soon as a signal is raised.
-ARROW_EXPORT
-void ReinstateSignalHandler(int signum, SignalHandler::Callback handler);
-
-/// \brief Send a signal to the current process
-///
-/// The thread which will receive the signal is unspecified.
-ARROW_EXPORT
-Status SendSignal(int signum);
-
-/// \brief Send a signal to the given thread
-///
-/// This function isn't supported on Windows.
-ARROW_EXPORT
-Status SendSignalToThread(int signum, uint64_t thread_id);
-
+/// \brief Reinstate the signal handler
+///
+/// For use in signal handlers. This is needed on platforms without sigaction()
+/// such as Windows, as the default signal handler is restored there as
+/// soon as a signal is raised.
+ARROW_EXPORT
+void ReinstateSignalHandler(int signum, SignalHandler::Callback handler);
+
+/// \brief Send a signal to the current process
+///
+/// The thread which will receive the signal is unspecified.
+ARROW_EXPORT
+Status SendSignal(int signum);
+
+/// \brief Send a signal to the given thread
+///
+/// This function isn't supported on Windows.
+ARROW_EXPORT
+Status SendSignalToThread(int signum, uint64_t thread_id);
+
/// \brief Get an unpredictable random seed
///
/// This function may be slightly costly, so should only be used to initialize
@@ -338,12 +338,12 @@ Status SendSignalToThread(int signum, uint64_t thread_id);
ARROW_EXPORT
int64_t GetRandomSeed();
-/// \brief Get the current thread id
-///
- /// In addition to having the same properties as std::thread::id, the returned value
-/// is a regular integer value, which is more convenient than an opaque type.
-ARROW_EXPORT
-uint64_t GetThreadId();
-
+/// \brief Get the current thread id
+///
+ /// In addition to having the same properties as std::thread::id, the returned value
+/// is a regular integer value, which is more convenient than an opaque type.
+ARROW_EXPORT
+uint64_t GetThreadId();
+
} // namespace internal
} // namespace arrow
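
The signal helpers restored here form a round trip: CancelledFromSignal() builds a Status carrying a SignalDetail, and SignalFromStatus() recovers the signal number (0 when the status carries none). A minimal sketch under those signatures:

#include <csignal>
#include "arrow/util/io_util.h"

void SignalRoundTrip() {
  arrow::Status st =
      arrow::internal::CancelledFromSignal(SIGINT, "operation interrupted");
  const int signum = arrow::internal::SignalFromStatus(st);  // == SIGINT
  (void)signum;
}
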
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
index 2f42803d26f..374ac1afd4e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
@@ -43,40 +43,40 @@ struct IterationTraits {
/// \brief a reserved value which indicates the end of iteration. By
/// default this is NULLPTR since most iterators yield pointer types.
/// Specialize IterationTraits if different end semantics are required.
- ///
- /// Note: This should not be used to determine if a given value is a
- /// terminal value. Use IsIterationEnd (which uses IsEnd) instead. This
- /// is only for returning terminal values.
+ ///
+ /// Note: This should not be used to determine if a given value is a
+ /// terminal value. Use IsIterationEnd (which uses IsEnd) instead. This
+ /// is only for returning terminal values.
static T End() { return T(NULLPTR); }
-
- /// \brief Checks to see if the value is a terminal value.
- /// A method is used here since T is not necessarily comparable in many
- /// cases even though it has a distinct final value
- static bool IsEnd(const T& val) { return val == End(); }
+
+ /// \brief Checks to see if the value is a terminal value.
+ /// A method is used here since T is not necessarily comparable in many
+ /// cases even though it has a distinct final value
+ static bool IsEnd(const T& val) { return val == End(); }
};
template <typename T>
-T IterationEnd() {
- return IterationTraits<T>::End();
-}
-
-template <typename T>
-bool IsIterationEnd(const T& val) {
- return IterationTraits<T>::IsEnd(val);
-}
-
-template <typename T>
+T IterationEnd() {
+ return IterationTraits<T>::End();
+}
+
+template <typename T>
+bool IsIterationEnd(const T& val) {
+ return IterationTraits<T>::IsEnd(val);
+}
+
+template <typename T>
struct IterationTraits<util::optional<T>> {
/// \brief by default when iterating through a sequence of optional,
/// nullopt indicates the end of iteration.
/// Specialize IterationTraits if different end semantics are required.
static util::optional<T> End() { return util::nullopt; }
- /// \brief by default when iterating through a sequence of optional,
- /// nullopt (!has_value()) indicates the end of iteration.
- /// Specialize IterationTraits if different end semantics are required.
- static bool IsEnd(const util::optional<T>& val) { return !val.has_value(); }
-
+ /// \brief by default when iterating through a sequence of optional,
+ /// nullopt (!has_value()) indicates the end of iteration.
+ /// Specialize IterationTraits if different end semantics are required.
+ static bool IsEnd(const util::optional<T>& val) { return !val.has_value(); }
+
// TODO(bkietz) The range-for loop over Iterator<optional<T>> yields
// Result<optional<T>> which is unnecessary (since only the unyielded end optional
// is nullopt). Add IterationTraits::GetRangeElement() to handle this case
@@ -87,8 +87,8 @@ template <typename T>
class Iterator : public util::EqualityComparable<Iterator<T>> {
public:
/// \brief Iterator may be constructed from any type which has a member function
- /// with signature Result<T> Next();
- /// End of iterator is signalled by returning IteratorTraits<T>::End();
+ /// with signature Result<T> Next();
+ /// End of iterator is signalled by returning IteratorTraits<T>::End();
///
/// The argument is moved or copied to the heap and kept in a unique_ptr<void>. Only
/// its destructor and its Next method (which are stored in function pointers) are
@@ -116,7 +116,7 @@ class Iterator : public util::EqualityComparable<Iterator<T>> {
for (;;) {
ARROW_ASSIGN_OR_RAISE(auto value, Next());
- if (IsIterationEnd(value)) break;
+ if (IsIterationEnd(value)) break;
ARROW_RETURN_NOT_OK(visitor(std::move(value)));
}
@@ -210,132 +210,132 @@ class Iterator : public util::EqualityComparable<Iterator<T>> {
};
template <typename T>
-struct TransformFlow {
- using YieldValueType = T;
-
- TransformFlow(YieldValueType value, bool ready_for_next)
- : finished_(false),
- ready_for_next_(ready_for_next),
- yield_value_(std::move(value)) {}
- TransformFlow(bool finished, bool ready_for_next)
- : finished_(finished), ready_for_next_(ready_for_next), yield_value_() {}
-
- bool HasValue() const { return yield_value_.has_value(); }
- bool Finished() const { return finished_; }
- bool ReadyForNext() const { return ready_for_next_; }
- T Value() const { return *yield_value_; }
-
- bool finished_ = false;
- bool ready_for_next_ = false;
- util::optional<YieldValueType> yield_value_;
-};
-
-struct TransformFinish {
- template <typename T>
- operator TransformFlow<T>() && { // NOLINT explicit
- return TransformFlow<T>(true, true);
- }
-};
-
-struct TransformSkip {
- template <typename T>
- operator TransformFlow<T>() && { // NOLINT explicit
- return TransformFlow<T>(false, true);
- }
-};
-
-template <typename T>
-TransformFlow<T> TransformYield(T value = {}, bool ready_for_next = true) {
- return TransformFlow<T>(std::move(value), ready_for_next);
-}
-
-template <typename T, typename V>
-using Transformer = std::function<Result<TransformFlow<V>>(T)>;
-
-template <typename T, typename V>
-class TransformIterator {
- public:
- explicit TransformIterator(Iterator<T> it, Transformer<T, V> transformer)
- : it_(std::move(it)),
- transformer_(std::move(transformer)),
- last_value_(),
- finished_() {}
-
- Result<V> Next() {
- while (!finished_) {
- ARROW_ASSIGN_OR_RAISE(util::optional<V> next, Pump());
- if (next.has_value()) {
- return std::move(*next);
- }
- ARROW_ASSIGN_OR_RAISE(last_value_, it_.Next());
- }
- return IterationTraits<V>::End();
- }
-
- private:
- // Calls the transform function on the current value. Can return in several ways
- // * If the next value is requested (e.g. skip) it will return an empty optional
- // * If an invalid status is encountered that will be returned
- // * If finished it will return IterationTraits<V>::End()
- // * If a value is returned by the transformer that will be returned
- Result<util::optional<V>> Pump() {
- if (!finished_ && last_value_.has_value()) {
- auto next_res = transformer_(*last_value_);
- if (!next_res.ok()) {
- finished_ = true;
- return next_res.status();
- }
- auto next = *next_res;
- if (next.ReadyForNext()) {
- if (IsIterationEnd(*last_value_)) {
- finished_ = true;
- }
- last_value_.reset();
- }
- if (next.Finished()) {
- finished_ = true;
- }
- if (next.HasValue()) {
- return next.Value();
- }
- }
- if (finished_) {
- return IterationTraits<V>::End();
- }
- return util::nullopt;
- }
-
- Iterator<T> it_;
- Transformer<T, V> transformer_;
- util::optional<T> last_value_;
- bool finished_ = false;
-};
-
-/// \brief Transforms an iterator according to a transformer, returning a new Iterator.
-///
-/// The transformer will be called on each element of the source iterator and for each
-/// call it can yield a value, skip, or finish the iteration. When yielding a value the
-/// transformer can choose to consume the source item (the default, ready_for_next = true)
-/// or to keep it and it will be called again on the same value.
-///
-/// This is essentially a more generic form of the map operation that can return 0, 1, or
-/// many values for each of the source items.
-///
-/// The transformer will be exposed to the end of the source sequence
-/// (IterationTraits::End) in case it needs to return some penultimate item(s).
-///
-/// Any invalid status returned by the transformer will be returned immediately.
-template <typename T, typename V>
-Iterator<V> MakeTransformedIterator(Iterator<T> it, Transformer<T, V> op) {
- return Iterator<V>(TransformIterator<T, V>(std::move(it), std::move(op)));
-}
-
-template <typename T>
+struct TransformFlow {
+ using YieldValueType = T;
+
+ TransformFlow(YieldValueType value, bool ready_for_next)
+ : finished_(false),
+ ready_for_next_(ready_for_next),
+ yield_value_(std::move(value)) {}
+ TransformFlow(bool finished, bool ready_for_next)
+ : finished_(finished), ready_for_next_(ready_for_next), yield_value_() {}
+
+ bool HasValue() const { return yield_value_.has_value(); }
+ bool Finished() const { return finished_; }
+ bool ReadyForNext() const { return ready_for_next_; }
+ T Value() const { return *yield_value_; }
+
+ bool finished_ = false;
+ bool ready_for_next_ = false;
+ util::optional<YieldValueType> yield_value_;
+};
+
+struct TransformFinish {
+ template <typename T>
+ operator TransformFlow<T>() && { // NOLINT explicit
+ return TransformFlow<T>(true, true);
+ }
+};
+
+struct TransformSkip {
+ template <typename T>
+ operator TransformFlow<T>() && { // NOLINT explicit
+ return TransformFlow<T>(false, true);
+ }
+};
+
+template <typename T>
+TransformFlow<T> TransformYield(T value = {}, bool ready_for_next = true) {
+ return TransformFlow<T>(std::move(value), ready_for_next);
+}
+
+template <typename T, typename V>
+using Transformer = std::function<Result<TransformFlow<V>>(T)>;
+
+template <typename T, typename V>
+class TransformIterator {
+ public:
+ explicit TransformIterator(Iterator<T> it, Transformer<T, V> transformer)
+ : it_(std::move(it)),
+ transformer_(std::move(transformer)),
+ last_value_(),
+ finished_() {}
+
+ Result<V> Next() {
+ while (!finished_) {
+ ARROW_ASSIGN_OR_RAISE(util::optional<V> next, Pump());
+ if (next.has_value()) {
+ return std::move(*next);
+ }
+ ARROW_ASSIGN_OR_RAISE(last_value_, it_.Next());
+ }
+ return IterationTraits<V>::End();
+ }
+
+ private:
+  // Calls the transform function on the current value. Can return in several ways:
+  // * If the next value is requested (e.g. skip) it will return an empty optional
+  // * If an invalid status is encountered, that status will be returned
+  // * If finished, it will return IterationTraits<V>::End()
+  // * If the transformer yields a value, that value will be returned
+ Result<util::optional<V>> Pump() {
+ if (!finished_ && last_value_.has_value()) {
+ auto next_res = transformer_(*last_value_);
+ if (!next_res.ok()) {
+ finished_ = true;
+ return next_res.status();
+ }
+ auto next = *next_res;
+ if (next.ReadyForNext()) {
+ if (IsIterationEnd(*last_value_)) {
+ finished_ = true;
+ }
+ last_value_.reset();
+ }
+ if (next.Finished()) {
+ finished_ = true;
+ }
+ if (next.HasValue()) {
+ return next.Value();
+ }
+ }
+ if (finished_) {
+ return IterationTraits<V>::End();
+ }
+ return util::nullopt;
+ }
+
+ Iterator<T> it_;
+ Transformer<T, V> transformer_;
+ util::optional<T> last_value_;
+ bool finished_ = false;
+};
+
+/// \brief Transforms an iterator according to a transformer, returning a new Iterator.
+///
+/// The transformer will be called on each element of the source iterator and for each
+/// call it can yield a value, skip, or finish the iteration. When yielding a value the
+/// transformer can choose to consume the source item (the default, ready_for_next = true)
+/// or to keep it, in which case the transformer will be called again with the same value.
+///
+/// This is essentially a more generic form of the map operation that can return 0, 1, or
+/// many values for each of the source items.
+///
+/// The transformer will be exposed to the end of the source sequence
+/// (IterationTraits::End) in case it needs to emit some final item(s).
+///
+/// Any invalid status returned by the transformer will be returned immediately.
+template <typename T, typename V>
+Iterator<V> MakeTransformedIterator(Iterator<T> it, Transformer<T, V> op) {
+ return Iterator<V>(TransformIterator<T, V>(std::move(it), std::move(op)));
+}
+
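
For context, a hedged sketch of a transformer built on the machinery above; `source`, the element type, and the doubling behavior are illustrative, not part of this change:

    // Emits every source item twice. On the first call the transformer keeps
    // the source item (ready_for_next = false), so it is called again with the
    // same value; on the second call it consumes it.
    Transformer<std::shared_ptr<int>, std::shared_ptr<int>> duplicate =
        [consume = true](std::shared_ptr<int> v) mutable
        -> Result<TransformFlow<std::shared_ptr<int>>> {
      if (IsIterationEnd(v)) return TransformFinish();
      consume = !consume;  // alternate keep / consume
      return TransformYield<std::shared_ptr<int>>(std::move(v),
                                                  /*ready_for_next=*/consume);
    };
    auto doubled = MakeTransformedIterator(std::move(source), duplicate);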
+template <typename T>
struct IterationTraits<Iterator<T>> {
// The end condition for an Iterator of Iterators is a default constructed (null)
// Iterator.
static Iterator<T> End() { return Iterator<T>(); }
- static bool IsEnd(const Iterator<T>& val) { return !val; }
+ static bool IsEnd(const Iterator<T>& val) { return !val; }
};
template <typename Fn, typename T>
@@ -427,7 +427,7 @@ class MapIterator {
Result<O> Next() {
ARROW_ASSIGN_OR_RAISE(I i, it_.Next());
- if (IsIterationEnd(i)) {
+ if (IsIterationEnd(i)) {
return IterationTraits<O>::End();
}
@@ -489,7 +489,7 @@ struct FilterIterator {
for (;;) {
ARROW_ASSIGN_OR_RAISE(From i, it_.Next());
- if (IsIterationEnd(i)) {
+ if (IsIterationEnd(i)) {
return IterationTraits<To>::End();
}
@@ -525,12 +525,12 @@ class FlattenIterator {
explicit FlattenIterator(Iterator<Iterator<T>> it) : parent_(std::move(it)) {}
Result<T> Next() {
- if (IsIterationEnd(child_)) {
+ if (IsIterationEnd(child_)) {
// Pop from parent's iterator.
ARROW_ASSIGN_OR_RAISE(child_, parent_.Next());
// Check if final iteration reached.
- if (IsIterationEnd(child_)) {
+ if (IsIterationEnd(child_)) {
return IterationTraits<T>::End();
}
@@ -539,7 +539,7 @@ class FlattenIterator {
// Pop from child_ and check for depletion.
ARROW_ASSIGN_OR_RAISE(T out, child_.Next());
- if (IsIterationEnd(out)) {
+ if (IsIterationEnd(out)) {
// Reset state such that we pop from parent on the recursive call
child_ = IterationTraits<Iterator<T>>::End();
@@ -559,10 +559,10 @@ Iterator<T> MakeFlattenIterator(Iterator<Iterator<T>> it) {
return Iterator<T>(FlattenIterator<T>(std::move(it)));
}
-template <typename Reader>
-Iterator<typename Reader::ValueType> MakeIteratorFromReader(
- const std::shared_ptr<Reader>& reader) {
- return MakeFunctionIterator([reader] { return reader->Next(); });
+template <typename Reader>
+Iterator<typename Reader::ValueType> MakeIteratorFromReader(
+ const std::shared_ptr<Reader>& reader) {
+ return MakeFunctionIterator([reader] { return reader->Next(); });
}
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
index ad3b686a9bd..c4a3ac64aab 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
@@ -70,11 +70,11 @@ KeyValueMetadata::KeyValueMetadata(std::vector<std::string> keys,
ARROW_CHECK_EQ(keys.size(), values.size());
}
-std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Make(
- std::vector<std::string> keys, std::vector<std::string> values) {
- return std::make_shared<KeyValueMetadata>(std::move(keys), std::move(values));
-}
-
+std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Make(
+ std::vector<std::string> keys, std::vector<std::string> values) {
+ return std::make_shared<KeyValueMetadata>(std::move(keys), std::move(values));
+}
+
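
A brief usage sketch of the factory above (keys and values are illustrative):

    auto metadata = KeyValueMetadata::Make({"origin", "version"}, {"example", "1"});
    std::unordered_map<std::string, std::string> map;
    metadata->ToUnorderedMap(&map);  // map now holds the same two pairs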
void KeyValueMetadata::ToUnorderedMap(
std::unordered_map<std::string, std::string>* out) const {
DCHECK_NE(out, nullptr);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
index d42ab78f667..9835b1739c7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
@@ -39,9 +39,9 @@ class ARROW_EXPORT KeyValueMetadata {
explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map);
virtual ~KeyValueMetadata() = default;
- static std::shared_ptr<KeyValueMetadata> Make(std::vector<std::string> keys,
- std::vector<std::string> values);
-
+ static std::shared_ptr<KeyValueMetadata> Make(std::vector<std::string> keys,
+ std::vector<std::string> values);
+
void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const;
void Append(const std::string& key, const std::string& value);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
index 65359b44081..314b277a821 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
@@ -24,31 +24,31 @@
#include <iostream>
#ifdef ARROW_USE_GLOG
-
+
#include <signal.h>
#include <vector>
-
+
#error #include "glog/logging.h"
-
-// Restore our versions of DCHECK and friends, as GLog defines its own
-#undef DCHECK
-#undef DCHECK_OK
-#undef DCHECK_EQ
-#undef DCHECK_NE
-#undef DCHECK_LE
-#undef DCHECK_LT
-#undef DCHECK_GE
-#undef DCHECK_GT
-
-#define DCHECK ARROW_DCHECK
-#define DCHECK_OK ARROW_DCHECK_OK
-#define DCHECK_EQ ARROW_DCHECK_EQ
-#define DCHECK_NE ARROW_DCHECK_NE
-#define DCHECK_LE ARROW_DCHECK_LE
-#define DCHECK_LT ARROW_DCHECK_LT
-#define DCHECK_GE ARROW_DCHECK_GE
-#define DCHECK_GT ARROW_DCHECK_GT
-
+
+// Restore our versions of DCHECK and friends, as GLog defines its own
+#undef DCHECK
+#undef DCHECK_OK
+#undef DCHECK_EQ
+#undef DCHECK_NE
+#undef DCHECK_LE
+#undef DCHECK_LT
+#undef DCHECK_GE
+#undef DCHECK_GT
+
+#define DCHECK ARROW_DCHECK
+#define DCHECK_OK ARROW_DCHECK_OK
+#define DCHECK_EQ ARROW_DCHECK_EQ
+#define DCHECK_NE ARROW_DCHECK_NE
+#define DCHECK_LE ARROW_DCHECK_LE
+#define DCHECK_LT ARROW_DCHECK_LT
+#define DCHECK_GE ARROW_DCHECK_GE
+#define DCHECK_GT ARROW_DCHECK_GT
+
#endif
namespace arrow {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
index 15a0188ab76..286cca361b0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
@@ -92,33 +92,33 @@ enum class ArrowLogLevel : int {
// CAUTION: DCHECK_OK() always evaluates its argument, but other DCHECK*() macros
// only do so in debug mode.
-#define ARROW_DCHECK(condition) \
+#define ARROW_DCHECK(condition) \
while (false) ARROW_IGNORE_EXPR(condition); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_OK(s) \
- ARROW_IGNORE_EXPR(s); \
+#define ARROW_DCHECK_OK(s) \
+ ARROW_IGNORE_EXPR(s); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_EQ(val1, val2) \
+#define ARROW_DCHECK_EQ(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_NE(val1, val2) \
+#define ARROW_DCHECK_NE(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_LE(val1, val2) \
+#define ARROW_DCHECK_LE(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_LT(val1, val2) \
+#define ARROW_DCHECK_LT(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_GE(val1, val2) \
+#define ARROW_DCHECK_GE(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_GT(val1, val2) \
+#define ARROW_DCHECK_GT(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
@@ -126,26 +126,26 @@ enum class ArrowLogLevel : int {
#else
#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_FATAL
-#define ARROW_DCHECK ARROW_CHECK
-#define ARROW_DCHECK_OK ARROW_CHECK_OK
-#define ARROW_DCHECK_EQ ARROW_CHECK_EQ
-#define ARROW_DCHECK_NE ARROW_CHECK_NE
-#define ARROW_DCHECK_LE ARROW_CHECK_LE
-#define ARROW_DCHECK_LT ARROW_CHECK_LT
-#define ARROW_DCHECK_GE ARROW_CHECK_GE
-#define ARROW_DCHECK_GT ARROW_CHECK_GT
+#define ARROW_DCHECK ARROW_CHECK
+#define ARROW_DCHECK_OK ARROW_CHECK_OK
+#define ARROW_DCHECK_EQ ARROW_CHECK_EQ
+#define ARROW_DCHECK_NE ARROW_CHECK_NE
+#define ARROW_DCHECK_LE ARROW_CHECK_LE
+#define ARROW_DCHECK_LT ARROW_CHECK_LT
+#define ARROW_DCHECK_GE ARROW_CHECK_GE
+#define ARROW_DCHECK_GT ARROW_CHECK_GT
#endif // NDEBUG
-#define DCHECK ARROW_DCHECK
-#define DCHECK_OK ARROW_DCHECK_OK
-#define DCHECK_EQ ARROW_DCHECK_EQ
-#define DCHECK_NE ARROW_DCHECK_NE
-#define DCHECK_LE ARROW_DCHECK_LE
-#define DCHECK_LT ARROW_DCHECK_LT
-#define DCHECK_GE ARROW_DCHECK_GE
-#define DCHECK_GT ARROW_DCHECK_GT
-
+#define DCHECK ARROW_DCHECK
+#define DCHECK_OK ARROW_DCHECK_OK
+#define DCHECK_EQ ARROW_DCHECK_EQ
+#define DCHECK_NE ARROW_DCHECK_NE
+#define DCHECK_LE ARROW_DCHECK_LE
+#define DCHECK_LT ARROW_DCHECK_LT
+#define DCHECK_GE ARROW_DCHECK_GE
+#define DCHECK_GT ARROW_DCHECK_GT
+
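
As the restored comments note, DCHECK_OK() always evaluates its argument while the other DCHECK*() macros only do so in debug builds; a hedged sketch of the consequence (variable names are illustrative):

    // Safe: the comparison is side-effect free, so losing it under NDEBUG
    // changes nothing.
    DCHECK_EQ(values_read, batch_size);
    // DCHECK_OK always evaluates its argument, but only checks it in debug
    // builds; don't lean on either behavior for required side effects.
    DCHECK_OK(writer->Flush());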
// This code is adapted from
// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.h.
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
index 6c80be380ae..3f665c01838 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
@@ -37,7 +37,7 @@ class ARROW_EXPORT Mutex {
/// A Guard is falsy if a lock could not be acquired.
class ARROW_EXPORT Guard {
public:
- Guard() : locked_(NULLPTR, [](Mutex* /* mutex */) {}) {}
+ Guard() : locked_(NULLPTR, [](Mutex* /* mutex */) {}) {}
Guard(Guard&&) = default;
Guard& operator=(Guard&&) = default;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
index 80f60fbdb36..b4858f0bf96 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
@@ -21,9 +21,9 @@
#include <vector>
#include "arrow/status.h"
-#include "arrow/util/functional.h"
+#include "arrow/util/functional.h"
#include "arrow/util/thread_pool.h"
-#include "arrow/util/vector.h"
+#include "arrow/util/vector.h"
namespace arrow {
namespace internal {
@@ -32,12 +32,12 @@ namespace internal {
// arguments between 0 and `num_tasks - 1`, on an arbitrary number of threads.
template <class FUNCTION>
-Status ParallelFor(int num_tasks, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
- std::vector<Future<>> futures(num_tasks);
+Status ParallelFor(int num_tasks, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ std::vector<Future<>> futures(num_tasks);
for (int i = 0; i < num_tasks; ++i) {
- ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i));
+ ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i));
}
auto st = Status::OK();
for (auto& fut : futures) {
@@ -46,30 +46,30 @@ Status ParallelFor(int num_tasks, FUNCTION&& func,
return st;
}
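
A hedged usage sketch of ParallelFor (`num_chunks` and `SumChunk` are hypothetical); each task writes a disjoint slot, so no extra synchronization is needed:

    std::vector<int64_t> partial_sums(num_chunks);
    RETURN_NOT_OK(internal::ParallelFor(num_chunks, [&](int i) -> Status {
      partial_sums[i] = SumChunk(i);
      return Status::OK();
    }));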
-template <class FUNCTION, typename T,
- typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
-Future<std::vector<R>> ParallelForAsync(
- std::vector<T> inputs, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
- std::vector<Future<R>> futures(inputs.size());
- for (size_t i = 0; i < inputs.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i])));
- }
- return All(std::move(futures))
- .Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
- return UnwrapOrRaise(results);
- });
-}
-
+template <class FUNCTION, typename T,
+ typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> ParallelForAsync(
+ std::vector<T> inputs, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ std::vector<Future<R>> futures(inputs.size());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i])));
+ }
+ return All(std::move(futures))
+ .Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
+ return UnwrapOrRaise(results);
+ });
+}
+
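
A matching sketch for the async variant (`ParseLine`, `ParsedLine`, and `lines` are hypothetical); the returned Future resolves once every submitted task completes:

    Future<std::vector<ParsedLine>> parsed = internal::ParallelForAsync(
        std::move(lines),
        [](size_t /*i*/, std::string line) -> Result<ParsedLine> {
          return ParseLine(std::move(line));
        });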
// A parallelizer that takes a `Status(int)` function and calls it with
// arguments between 0 and `num_tasks - 1`, in sequence or in parallel,
// depending on the input boolean.
template <class FUNCTION>
-Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
+Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
if (use_threads) {
- return ParallelFor(num_tasks, std::forward<FUNCTION>(func), executor);
+ return ParallelFor(num_tasks, std::forward<FUNCTION>(func), executor);
} else {
for (int i = 0; i < num_tasks; ++i) {
RETURN_NOT_OK(func(i));
@@ -78,25 +78,25 @@ Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
}
}
-// A parallelizer that takes a `Result<R>(int index, T item)` function and
-// calls it with each item from the input array, in sequence or in parallel,
-// depending on the input boolean.
-
-template <class FUNCTION, typename T,
- typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
-Future<std::vector<R>> OptionalParallelForAsync(
- bool use_threads, std::vector<T> inputs, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
- if (use_threads) {
- return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor);
- } else {
- std::vector<R> result(inputs.size());
- for (size_t i = 0; i < inputs.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i]));
- }
- return result;
- }
-}
-
+// A parallelizer that takes a `Result<R>(int index, T item)` function and
+// calls it with each item from the input array, in sequence or in parallel,
+// depending on the input boolean.
+
+template <class FUNCTION, typename T,
+ typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> OptionalParallelForAsync(
+ bool use_threads, std::vector<T> inputs, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ if (use_threads) {
+ return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor);
+ } else {
+ std::vector<R> result(inputs.size());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i]));
+ }
+ return result;
+ }
+}
+
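
And a sketch of the optional variant's serial path (`lines` and `parse_fn` as above, hypothetical): with use_threads = false the same callable runs in input order, which keeps side effects deterministic:

    auto future = internal::OptionalParallelForAsync(
        /*use_threads=*/false, std::move(lines), parse_fn);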
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
index 6c71fa6e155..677778774e3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
@@ -1,29 +1,29 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/vendored/ProducerConsumerQueue.h"
-
-namespace arrow {
-namespace util {
-
-template <typename T>
-using SpscQueue = arrow_vendored::folly::ProducerConsumerQueue<T>;
-
-}
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/vendored/ProducerConsumerQueue.h"
+
+namespace arrow {
+namespace util {
+
+template <typename T>
+using SpscQueue = arrow_vendored::folly::ProducerConsumerQueue<T>;
+
+}
+} // namespace arrow
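
A hedged sketch of the vendored queue's API, assuming folly's ProducerConsumerQueue semantics (exactly one producer thread and one consumer thread):

    arrow::util::SpscQueue<int> queue(/*size=*/16);
    if (!queue.write(42)) {
      // write() returns false when the queue is full
    }
    int value = 0;
    if (queue.read(value)) {
      // read() returns false when empty; here value == 42
    }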
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
index 0440a2eb563..d9598a6eb34 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
@@ -1,133 +1,133 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <string>
-#include <tuple>
-#include <utility>
-
-#include "arrow/type_traits.h"
-#include "arrow/util/string_view.h"
-
-namespace arrow {
-namespace internal {
-
-template <size_t...>
-struct index_sequence {};
-
-template <size_t N, size_t Head = N, size_t... Tail>
-struct make_index_sequence_impl;
-
-template <size_t N>
-using make_index_sequence = typename make_index_sequence_impl<N>::type;
-
-template <typename... T>
-using index_sequence_for = make_index_sequence<sizeof...(T)>;
-
-template <size_t N, size_t... I>
-struct make_index_sequence_impl<N, 0, I...> {
- using type = index_sequence<I...>;
-};
-
-template <size_t N, size_t H, size_t... I>
-struct make_index_sequence_impl : make_index_sequence_impl<N, H - 1, H - 1, I...> {};
-
-static_assert(std::is_same<index_sequence<>, make_index_sequence<0>>::value, "");
-static_assert(std::is_same<index_sequence<0, 1, 2>, make_index_sequence<3>>::value, "");
-
-template <typename...>
-struct all_same : std::true_type {};
-
-template <typename One>
-struct all_same<One> : std::true_type {};
-
-template <typename Same, typename... Rest>
-struct all_same<Same, Same, Rest...> : all_same<Same, Rest...> {};
-
-template <typename One, typename Other, typename... Rest>
-struct all_same<One, Other, Rest...> : std::false_type {};
-
-template <size_t... I, typename... T, typename Fn>
-void ForEachTupleMemberImpl(const std::tuple<T...>& tup, Fn&& fn, index_sequence<I...>) {
- (void)std::make_tuple((fn(std::get<I>(tup), I), std::ignore)...);
-}
-
-template <typename... T, typename Fn>
-void ForEachTupleMember(const std::tuple<T...>& tup, Fn&& fn) {
- ForEachTupleMemberImpl(tup, fn, index_sequence_for<T...>());
-}
-
-template <typename C, typename T>
-struct DataMemberProperty {
- using Class = C;
- using Type = T;
-
- constexpr const Type& get(const Class& obj) const { return obj.*ptr_; }
-
- void set(Class* obj, Type value) const { (*obj).*ptr_ = std::move(value); }
-
- constexpr util::string_view name() const { return name_; }
-
- util::string_view name_;
- Type Class::*ptr_;
-};
-
-template <typename Class, typename Type>
-constexpr DataMemberProperty<Class, Type> DataMember(util::string_view name,
- Type Class::*ptr) {
- return {name, ptr};
-}
-
-template <typename... Properties>
-struct PropertyTuple {
- template <typename Fn>
- void ForEach(Fn&& fn) const {
- ForEachTupleMember(props_, fn);
- }
-
- static_assert(all_same<typename Properties::Class...>::value,
- "All properties must be properties of the same class");
-
- size_t size() const { return sizeof...(Properties); }
-
- std::tuple<Properties...> props_;
-};
-
-template <typename... Properties>
-PropertyTuple<Properties...> MakeProperties(Properties... props) {
- return {std::make_tuple(props...)};
-}
-
-template <typename Enum>
-struct EnumTraits {};
-
-template <typename Enum, Enum... Values>
-struct BasicEnumTraits {
- using CType = typename std::underlying_type<Enum>::type;
- using Type = typename CTypeTraits<CType>::ArrowType;
- static std::array<Enum, sizeof...(Values)> values() { return {Values...}; }
-};
-
-template <typename T, typename Enable = void>
-struct has_enum_traits : std::false_type {};
-
-template <typename T>
-struct has_enum_traits<T, void_t<typename EnumTraits<T>::Type>> : std::true_type {};
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <tuple>
+#include <utility>
+
+#include "arrow/type_traits.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace internal {
+
+template <size_t...>
+struct index_sequence {};
+
+template <size_t N, size_t Head = N, size_t... Tail>
+struct make_index_sequence_impl;
+
+template <size_t N>
+using make_index_sequence = typename make_index_sequence_impl<N>::type;
+
+template <typename... T>
+using index_sequence_for = make_index_sequence<sizeof...(T)>;
+
+template <size_t N, size_t... I>
+struct make_index_sequence_impl<N, 0, I...> {
+ using type = index_sequence<I...>;
+};
+
+template <size_t N, size_t H, size_t... I>
+struct make_index_sequence_impl : make_index_sequence_impl<N, H - 1, H - 1, I...> {};
+
+static_assert(std::is_same<index_sequence<>, make_index_sequence<0>>::value, "");
+static_assert(std::is_same<index_sequence<0, 1, 2>, make_index_sequence<3>>::value, "");
+
+template <typename...>
+struct all_same : std::true_type {};
+
+template <typename One>
+struct all_same<One> : std::true_type {};
+
+template <typename Same, typename... Rest>
+struct all_same<Same, Same, Rest...> : all_same<Same, Rest...> {};
+
+template <typename One, typename Other, typename... Rest>
+struct all_same<One, Other, Rest...> : std::false_type {};
+
+template <size_t... I, typename... T, typename Fn>
+void ForEachTupleMemberImpl(const std::tuple<T...>& tup, Fn&& fn, index_sequence<I...>) {
+ (void)std::make_tuple((fn(std::get<I>(tup), I), std::ignore)...);
+}
+
+template <typename... T, typename Fn>
+void ForEachTupleMember(const std::tuple<T...>& tup, Fn&& fn) {
+ ForEachTupleMemberImpl(tup, fn, index_sequence_for<T...>());
+}
+
+template <typename C, typename T>
+struct DataMemberProperty {
+ using Class = C;
+ using Type = T;
+
+ constexpr const Type& get(const Class& obj) const { return obj.*ptr_; }
+
+ void set(Class* obj, Type value) const { (*obj).*ptr_ = std::move(value); }
+
+ constexpr util::string_view name() const { return name_; }
+
+ util::string_view name_;
+ Type Class::*ptr_;
+};
+
+template <typename Class, typename Type>
+constexpr DataMemberProperty<Class, Type> DataMember(util::string_view name,
+ Type Class::*ptr) {
+ return {name, ptr};
+}
+
+template <typename... Properties>
+struct PropertyTuple {
+ template <typename Fn>
+ void ForEach(Fn&& fn) const {
+ ForEachTupleMember(props_, fn);
+ }
+
+ static_assert(all_same<typename Properties::Class...>::value,
+ "All properties must be properties of the same class");
+
+ size_t size() const { return sizeof...(Properties); }
+
+ std::tuple<Properties...> props_;
+};
+
+template <typename... Properties>
+PropertyTuple<Properties...> MakeProperties(Properties... props) {
+ return {std::make_tuple(props...)};
+}
+
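
A hedged sketch of the property machinery above (`Point` is hypothetical): DataMember ties a field name to a member pointer, and MakeProperties bundles them so generic code can visit each field:

    struct Point {
      int x;
      int y;
    };
    static const auto kPointProperties =
        MakeProperties(DataMember("x", &Point::x), DataMember("y", &Point::y));
    // kPointProperties.ForEach([](const auto& prop, size_t i) {
    //   // prop.name() yields "x"/"y"; prop.get(point) reads the field
    // });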
+template <typename Enum>
+struct EnumTraits {};
+
+template <typename Enum, Enum... Values>
+struct BasicEnumTraits {
+ using CType = typename std::underlying_type<Enum>::type;
+ using Type = typename CTypeTraits<CType>::ArrowType;
+ static std::array<Enum, sizeof...(Values)> values() { return {Values...}; }
+};
+
+template <typename T, typename Enable = void>
+struct has_enum_traits : std::false_type {};
+
+template <typename T>
+struct has_enum_traits<T, void_t<typename EnumTraits<T>::Type>> : std::true_type {};
+
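
A hedged sketch of opting an enum into this machinery (`Color` is hypothetical); BasicEnumTraits supplies CType, the matching Arrow Type, and values(), so has_enum_traits then holds:

    enum class Color : int8_t { kRed = 0, kBlue = 1 };
    template <>
    struct EnumTraits<Color> : BasicEnumTraits<Color, Color::kRed, Color::kBlue> {};
    static_assert(has_enum_traits<Color>::value, "");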
+} // namespace internal
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
index 68d29930666..cf13264e41e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
@@ -1,826 +1,826 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use
-// in parquet-cpp, Arrow
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <vector>
-
-#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/macros.h"
-
-namespace arrow {
-namespace util {
-
-/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs
-/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
-/// (literal encoding).
-/// For both types of runs, there is a byte-aligned indicator which encodes the length
-/// of the run and the type of the run.
-/// This encoding has the benefit that when there aren't any long enough runs, values
-/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
-/// the run length are byte aligned. This allows for very efficient decoding
-/// implementations.
-/// The encoding is:
-/// encoded-block := run*
-/// run := literal-run | repeated-run
-/// literal-run := literal-indicator < literal bytes >
-/// repeated-run := repeated-indicator < repeated value. padded to byte boundary >
-/// literal-indicator := varint_encode( number_of_groups << 1 | 1)
-/// repeated-indicator := varint_encode( number_of_repetitions << 1 )
-//
-/// Each run is preceded by a varint. The varint's least significant bit is
-/// used to indicate whether the run is a literal run or a repeated run. The rest
-/// of the varint is used to determine the length of the run (e.g. how many times the
-/// value repeats).
-//
-/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
-/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
-/// on a byte boundary without padding.
-/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
-/// the actual number of encoded ints. (This means that the total number of encoded values
-/// cannot be determined from the encoded data, since the number of values in the last
-/// group may not be a multiple of 8). For the last group of literal runs, we pad
-/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
-/// without the need for additional checks.
-//
-/// There is a break-even point when it is more storage efficient to do run length
-/// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes
-/// for both the repeated encoding or the literal encoding. This value can always
-/// be computed based on the bit-width.
-/// TODO: think about how to use this for strings. The bit packing isn't quite the same.
-//
-/// Examples with bit-width 1 (e.g. encoding booleans):
-/// ----------------------------------------
-/// 100 1s followed by 100 0s:
-/// <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
-/// - (total 4 bytes)
-//
-/// alternating 1s and 0s (200 total):
-/// 200 ints = 25 groups of 8
-/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
-/// (total 26 bytes, 1 byte overhead)
-//
-
-/// Decoder class for RLE encoded data.
-class RleDecoder {
- public:
- /// Create a decoder object. buffer/buffer_len is the decoded data.
- /// bit_width is the width of each value (before encoding).
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
- : bit_reader_(buffer, buffer_len),
- bit_width_(bit_width),
- current_value_(0),
- repeat_count_(0),
- literal_count_(0) {
- DCHECK_GE(bit_width_, 0);
- DCHECK_LE(bit_width_, 64);
- }
-
- RleDecoder() : bit_width_(-1) {}
-
- void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
- DCHECK_GE(bit_width, 0);
- DCHECK_LE(bit_width, 64);
- bit_reader_.Reset(buffer, buffer_len);
- bit_width_ = bit_width;
- current_value_ = 0;
- repeat_count_ = 0;
- literal_count_ = 0;
- }
-
- /// Gets the next value. Returns false if there are no more.
- template <typename T>
- bool Get(T* val);
-
- /// Gets a batch of values. Returns the number of decoded elements.
- template <typename T>
- int GetBatch(T* values, int batch_size);
-
- /// Like GetBatch but add spacing for null entries
- template <typename T>
- int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset, T* out);
-
- /// Like GetBatch but the values are then decoded using the provided dictionary
- template <typename T>
- int GetBatchWithDict(const T* dictionary, int32_t dictionary_length, T* values,
- int batch_size);
-
- /// Like GetBatchWithDict but add spacing for null entries
- ///
- /// Null entries will be zero-initialized in `values` to avoid leaking
- /// private data.
- template <typename T>
- int GetBatchWithDictSpaced(const T* dictionary, int32_t dictionary_length, T* values,
- int batch_size, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset);
-
- protected:
- BitUtil::BitReader bit_reader_;
- /// Number of bits needed to encode the value. Must be between 0 and 64.
- int bit_width_;
- uint64_t current_value_;
- int32_t repeat_count_;
- int32_t literal_count_;
-
- private:
- /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
- /// are no more.
- template <typename T>
- bool NextCounts();
-
- /// Utility methods for retrieving spaced values.
- template <typename T, typename RunType, typename Converter>
- int GetSpaced(Converter converter, int batch_size, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset, T* out);
-};
-
-/// Class to incrementally build the rle data. This class does not allocate any memory.
-/// The encoding has two modes: encoding repeated runs and literal runs.
-/// If the run is sufficiently short, it is more efficient to encode as a literal run.
-/// This class does so by buffering 8 values at a time. If they are not all the same
-/// they are added to the literal run. If they are the same, they are added to the
-/// repeated run. When we switch modes, the previous run is flushed out.
-class RleEncoder {
- public:
- /// buffer/buffer_len: preallocated output buffer.
- /// bit_width: max number of bits for value.
- /// TODO: consider adding a min_repeated_run_length so the caller can control
-  /// when values should be encoded as repeated runs. Currently this is derived
-  /// from the bit_width, which yields a storage-optimal choice.
- /// TODO: allow 0 bit_width (and have dict encoder use it)
- RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
- : bit_width_(bit_width), bit_writer_(buffer, buffer_len) {
- DCHECK_GE(bit_width_, 0);
- DCHECK_LE(bit_width_, 64);
- max_run_byte_size_ = MinBufferSize(bit_width);
- DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
- Clear();
- }
-
- /// Returns the minimum buffer size needed to use the encoder for 'bit_width'
- /// This is the maximum length of a single run for 'bit_width'.
- /// It is not valid to pass a buffer less than this length.
- static int MinBufferSize(int bit_width) {
- /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
- int max_literal_run_size =
- 1 +
- static_cast<int>(BitUtil::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width));
- /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value.
- int max_repeated_run_size = BitUtil::BitReader::kMaxVlqByteLength +
- static_cast<int>(BitUtil::BytesForBits(bit_width));
- return std::max(max_literal_run_size, max_repeated_run_size);
- }
-
- /// Returns the maximum byte size it could take to encode 'num_values'.
- static int MaxBufferSize(int bit_width, int num_values) {
- // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
- // and then a repeated run of length 8".
- // 8 values per smallest run, 8 bits per byte
- int bytes_per_run = bit_width;
- int num_runs = static_cast<int>(BitUtil::CeilDiv(num_values, 8));
- int literal_max_size = num_runs + num_runs * bytes_per_run;
-
- // In the very worst case scenario, the data is a concatenation of repeated
-    // runs of 8 values. Each repeated run has a 1-byte varint followed by the
- // bit-packed repeated value
- int min_repeated_run_size = 1 + static_cast<int>(BitUtil::BytesForBits(bit_width));
- int repeated_max_size =
- static_cast<int>(BitUtil::CeilDiv(num_values, 8)) * min_repeated_run_size;
-
- return std::max(literal_max_size, repeated_max_size);
- }
-
- /// Encode value. Returns true if the value fits in buffer, false otherwise.
- /// This value must be representable with bit_width_ bits.
- bool Put(uint64_t value);
-
- /// Flushes any pending values to the underlying buffer.
- /// Returns the total number of bytes written
- int Flush();
-
- /// Resets all the state in the encoder.
- void Clear();
-
- /// Returns pointer to underlying buffer
- uint8_t* buffer() { return bit_writer_.buffer(); }
- int32_t len() { return bit_writer_.bytes_written(); }
-
- private:
- /// Flushes any buffered values. If this is part of a repeated run, this is largely
- /// a no-op.
- /// If it is part of a literal run, this will call FlushLiteralRun, which writes
- /// out the buffered literal values.
-  /// If 'done' is true, the current run is written out even if it would
-  /// normally have been buffered further. This should only be called at the
-  /// end, once the encoder has received all values.
- void FlushBufferedValues(bool done);
-
- /// Flushes literal values to the underlying buffer. If update_indicator_byte,
- /// then the current literal run is complete and the indicator byte is updated.
- void FlushLiteralRun(bool update_indicator_byte);
-
- /// Flushes a repeated run to the underlying buffer.
- void FlushRepeatedRun();
-
- /// Checks and sets buffer_full_. This must be called after flushing a run to
- /// make sure there are enough bytes remaining to encode the next run.
- void CheckBufferFull();
-
- /// The maximum number of values in a single literal run
- /// (number of groups encodable by a 1-byte indicator * 8)
- static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
-
- /// Number of bits needed to encode the value. Must be between 0 and 64.
- const int bit_width_;
-
- /// Underlying buffer.
- BitUtil::BitWriter bit_writer_;
-
-  /// If true, the buffer is full and subsequent calls to Put() will fail.
- bool buffer_full_;
-
- /// The maximum byte size a single run can take.
- int max_run_byte_size_;
-
- /// We need to buffer at most 8 values for literals. This happens when the
- /// bit_width is 1 (so 8 values fit in one byte).
- /// TODO: generalize this to other bit widths
- int64_t buffered_values_[8];
-
- /// Number of values in buffered_values_
- int num_buffered_values_;
-
- /// The current (also last) value that was written and the count of how
- /// many times in a row that value has been seen. This is maintained even
-  /// if we are in a literal run. If the repeat_count_ gets high enough, we switch
- /// to encoding repeated runs.
- uint64_t current_value_;
- int repeat_count_;
-
- /// Number of literals in the current run. This does not include the literals
- /// that might be in buffered_values_. Only after we've got a group big enough
-  /// can we decide if they should be part of the literal_count_ or repeat_count_.
- int literal_count_;
-
- /// Pointer to a byte in the underlying buffer that stores the indicator byte.
- /// This is reserved as soon as we need a literal run but the value is written
- /// when the literal run is complete.
- uint8_t* literal_indicator_byte_;
-};
-
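
A hedged round-trip sketch of the two classes above (sizes and values are illustrative; note MaxBufferSize can be smaller than MinBufferSize, and the encoder requires at least the latter):

    int capacity = std::max(RleEncoder::MinBufferSize(/*bit_width=*/3),
                            RleEncoder::MaxBufferSize(/*bit_width=*/3, 100));
    std::vector<uint8_t> buffer(capacity);
    RleEncoder encoder(buffer.data(), capacity, /*bit_width=*/3);
    for (uint64_t i = 0; i < 100; ++i) {
      encoder.Put(i % 8);  // each value must fit in 3 bits
    }
    int encoded_bytes = encoder.Flush();
    RleDecoder decoder(buffer.data(), encoded_bytes, /*bit_width=*/3);
    uint32_t decoded[100];
    int values_read = decoder.GetBatch(decoded, 100);  // 100 on success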
-template <typename T>
-inline bool RleDecoder::Get(T* val) {
- return GetBatch(val, 1) == 1;
-}
-
-template <typename T>
-inline int RleDecoder::GetBatch(T* values, int batch_size) {
- DCHECK_GE(bit_width_, 0);
- int values_read = 0;
-
- auto* out = values;
-
- while (values_read < batch_size) {
- int remaining = batch_size - values_read;
-
- if (repeat_count_ > 0) { // Repeated value case.
- int repeat_batch = std::min(remaining, repeat_count_);
- std::fill(out, out + repeat_batch, static_cast<T>(current_value_));
-
- repeat_count_ -= repeat_batch;
- values_read += repeat_batch;
- out += repeat_batch;
- } else if (literal_count_ > 0) {
- int literal_batch = std::min(remaining, literal_count_);
- int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch);
- if (actual_read != literal_batch) {
- return values_read;
- }
-
- literal_count_ -= literal_batch;
- values_read += literal_batch;
- out += literal_batch;
- } else {
- if (!NextCounts<T>()) return values_read;
- }
- }
-
- return values_read;
-}
-
-template <typename T, typename RunType, typename Converter>
-inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset,
- T* out) {
- if (ARROW_PREDICT_FALSE(null_count == batch_size)) {
- converter.FillZero(out, out + batch_size);
- return batch_size;
- }
-
- DCHECK_GE(bit_width_, 0);
- int values_read = 0;
- int values_remaining = batch_size - null_count;
-
- // Assume no bits to start.
- arrow::internal::BitRunReader bit_reader(valid_bits, valid_bits_offset,
- /*length=*/batch_size);
- arrow::internal::BitRun valid_run = bit_reader.NextRun();
- while (values_read < batch_size) {
- if (ARROW_PREDICT_FALSE(valid_run.length == 0)) {
- valid_run = bit_reader.NextRun();
- }
-
- DCHECK_GT(batch_size, 0);
- DCHECK_GT(valid_run.length, 0);
-
- if (valid_run.set) {
- if ((repeat_count_ == 0) && (literal_count_ == 0)) {
- if (!NextCounts<RunType>()) return values_read;
- DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0));
- }
-
- if (repeat_count_ > 0) {
- int repeat_batch = 0;
-        // Consume the entire repeat count, incrementing repeat_batch to
-        // the total of nulls + values consumed; we only need the total
-        // count because we can fill in the same value for nulls and
-        // non-nulls. This proves to be a big efficiency win.
- while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
- DCHECK_GT(valid_run.length, 0);
- if (valid_run.set) {
- int update_size = std::min(static_cast<int>(valid_run.length), repeat_count_);
- repeat_count_ -= update_size;
- repeat_batch += update_size;
- valid_run.length -= update_size;
- values_remaining -= update_size;
- } else {
- // We can consume all nulls here because we would do so on
- // the next loop anyways.
- repeat_batch += static_cast<int>(valid_run.length);
- valid_run.length = 0;
- }
- if (valid_run.length == 0) {
- valid_run = bit_reader.NextRun();
- }
- }
- RunType current_value = static_cast<RunType>(current_value_);
- if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) {
- return values_read;
- }
- converter.Fill(out, out + repeat_batch, current_value);
- out += repeat_batch;
- values_read += repeat_batch;
- } else if (literal_count_ > 0) {
- int literal_batch = std::min(values_remaining, literal_count_);
- DCHECK_GT(literal_batch, 0);
-
- // Decode the literals
- constexpr int kBufferSize = 1024;
- RunType indices[kBufferSize];
- literal_batch = std::min(literal_batch, kBufferSize);
- int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
- if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
- return values_read;
- }
- if (!converter.IsValid(indices, /*length=*/actual_read)) {
- return values_read;
- }
- int skipped = 0;
- int literals_read = 0;
- while (literals_read < literal_batch) {
- if (valid_run.set) {
- int update_size = std::min(literal_batch - literals_read,
- static_cast<int>(valid_run.length));
- converter.Copy(out, indices + literals_read, update_size);
- literals_read += update_size;
- out += update_size;
- valid_run.length -= update_size;
- } else {
- converter.FillZero(out, out + valid_run.length);
- out += valid_run.length;
- skipped += static_cast<int>(valid_run.length);
- valid_run.length = 0;
- }
- if (valid_run.length == 0) {
- valid_run = bit_reader.NextRun();
- }
- }
- literal_count_ -= literal_batch;
- values_remaining -= literal_batch;
- values_read += literal_batch + skipped;
- }
- } else {
- converter.FillZero(out, out + valid_run.length);
- out += valid_run.length;
- values_read += static_cast<int>(valid_run.length);
- valid_run.length = 0;
- }
- }
- DCHECK_EQ(valid_run.length, 0);
- DCHECK_EQ(values_remaining, 0);
- return values_read;
-}
-
-// Converter for GetSpaced that handles runs that get returned
-// directly as output.
-template <typename T>
-struct PlainRleConverter {
- T kZero = {};
- inline bool IsValid(const T& values) const { return true; }
- inline bool IsValid(const T* values, int32_t length) const { return true; }
- inline void Fill(T* begin, T* end, const T& run_value) const {
- std::fill(begin, end, run_value);
- }
- inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
- inline void Copy(T* out, const T* values, int length) const {
- std::memcpy(out, values, length * sizeof(T));
- }
-};
-
-template <typename T>
-inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset, T* out) {
- if (null_count == 0) {
- return GetBatch<T>(out, batch_size);
- }
-
- PlainRleConverter<T> converter;
- arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
- batch_size);
-
- int total_processed = 0;
- int processed = 0;
- arrow::internal::BitBlockCount block;
-
- do {
- block = block_counter.NextFourWords();
- if (block.length == 0) {
- break;
- }
- if (block.AllSet()) {
- processed = GetBatch<T>(out, block.length);
- } else if (block.NoneSet()) {
- converter.FillZero(out, out + block.length);
- processed = block.length;
- } else {
- processed = GetSpaced<T, /*RunType=*/T, PlainRleConverter<T>>(
- converter, block.length, block.length - block.popcount, valid_bits,
- valid_bits_offset, out);
- }
- total_processed += processed;
- out += block.length;
- valid_bits_offset += block.length;
- } while (processed == block.length);
- return total_processed;
-}
-
-static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) {
- return idx >= 0 && idx < dictionary_length;
-}
-
-// Converter for GetSpaced that handles runs of returned dictionary
-// indices.
-template <typename T>
-struct DictionaryConverter {
- T kZero = {};
- const T* dictionary;
- int32_t dictionary_length;
-
- inline bool IsValid(int32_t value) { return IndexInRange(value, dictionary_length); }
-
- inline bool IsValid(const int32_t* values, int32_t length) const {
- using IndexType = int32_t;
- IndexType min_index = std::numeric_limits<IndexType>::max();
- IndexType max_index = std::numeric_limits<IndexType>::min();
- for (int x = 0; x < length; x++) {
- min_index = std::min(values[x], min_index);
- max_index = std::max(values[x], max_index);
- }
-
- return IndexInRange(min_index, dictionary_length) &&
- IndexInRange(max_index, dictionary_length);
- }
- inline void Fill(T* begin, T* end, const int32_t& run_value) const {
- std::fill(begin, end, dictionary[run_value]);
- }
- inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
-
- inline void Copy(T* out, const int32_t* values, int length) const {
- for (int x = 0; x < length; x++) {
- out[x] = dictionary[values[x]];
- }
- }
-};
-
-template <typename T>
-inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_length,
- T* values, int batch_size) {
- // Per https://github.com/apache/parquet-format/blob/master/Encodings.md,
- // the maximum dictionary index width in Parquet is 32 bits.
- using IndexType = int32_t;
- DictionaryConverter<T> converter;
- converter.dictionary = dictionary;
- converter.dictionary_length = dictionary_length;
-
- DCHECK_GE(bit_width_, 0);
- int values_read = 0;
-
- auto* out = values;
-
- while (values_read < batch_size) {
- int remaining = batch_size - values_read;
-
- if (repeat_count_ > 0) {
- auto idx = static_cast<IndexType>(current_value_);
- if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) {
- return values_read;
- }
- T val = dictionary[idx];
-
- int repeat_batch = std::min(remaining, repeat_count_);
- std::fill(out, out + repeat_batch, val);
-
- /* Upkeep counters */
- repeat_count_ -= repeat_batch;
- values_read += repeat_batch;
- out += repeat_batch;
- } else if (literal_count_ > 0) {
- constexpr int kBufferSize = 1024;
- IndexType indices[kBufferSize];
-
- int literal_batch = std::min(remaining, literal_count_);
- literal_batch = std::min(literal_batch, kBufferSize);
-
- int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
- if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
- return values_read;
- }
- if (ARROW_PREDICT_FALSE(!converter.IsValid(indices, /*length=*/literal_batch))) {
- return values_read;
- }
- converter.Copy(out, indices, literal_batch);
-
- /* Upkeep counters */
- literal_count_ -= literal_batch;
- values_read += literal_batch;
- out += literal_batch;
- } else {
- if (!NextCounts<IndexType>()) return values_read;
- }
- }
-
- return values_read;
-}
-
-template <typename T>
-inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,
- int32_t dictionary_length, T* out,
- int batch_size, int null_count,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) {
- if (null_count == 0) {
- return GetBatchWithDict<T>(dictionary, dictionary_length, out, batch_size);
- }
- arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
- batch_size);
- using IndexType = int32_t;
- DictionaryConverter<T> converter;
- converter.dictionary = dictionary;
- converter.dictionary_length = dictionary_length;
-
- int total_processed = 0;
- int processed = 0;
- arrow::internal::BitBlockCount block;
- do {
- block = block_counter.NextFourWords();
- if (block.length == 0) {
- break;
- }
- if (block.AllSet()) {
- processed = GetBatchWithDict<T>(dictionary, dictionary_length, out, block.length);
- } else if (block.NoneSet()) {
- converter.FillZero(out, out + block.length);
- processed = block.length;
- } else {
- processed = GetSpaced<T, /*RunType=*/IndexType, DictionaryConverter<T>>(
- converter, block.length, block.length - block.popcount, valid_bits,
- valid_bits_offset, out);
- }
- total_processed += processed;
- out += block.length;
- valid_bits_offset += block.length;
- } while (processed == block.length);
- return total_processed;
-}
-
-template <typename T>
-bool RleDecoder::NextCounts() {
-  // Read the next run's indicator int; it could be a literal or repeated run.
-  // The indicator is vlq-encoded.
- uint32_t indicator_value = 0;
- if (!bit_reader_.GetVlqInt(&indicator_value)) return false;
-
- // lsb indicates if it is a literal run or repeated run
- bool is_literal = indicator_value & 1;
- uint32_t count = indicator_value >> 1;
- if (is_literal) {
- if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX) / 8)) {
- return false;
- }
- literal_count_ = count * 8;
- } else {
- if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX))) {
- return false;
- }
- repeat_count_ = count;
- T value = {};
- if (!bit_reader_.GetAligned<T>(static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)),
- &value)) {
- return false;
- }
- current_value_ = static_cast<uint64_t>(value);
- }
- return true;
-}
-
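
Two worked examples of the indicator decoding above:

    // indicator_value = 51 (binary 110011): lsb = 1 -> literal run;
    // count = 51 >> 1 = 25 groups of 8 = 200 bit-packed literal values.
    // indicator_value = 200 (binary 11001000): lsb = 0 -> repeated run;
    // count = 200 >> 1 = 100 repetitions of one byte-aligned value.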
-/// This function buffers input values 8 at a time. After seeing all 8 values,
-/// it decides whether they should be encoded as a literal or repeated run.
-inline bool RleEncoder::Put(uint64_t value) {
- DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_));
- if (ARROW_PREDICT_FALSE(buffer_full_)) return false;
-
- if (ARROW_PREDICT_TRUE(current_value_ == value)) {
- ++repeat_count_;
- if (repeat_count_ > 8) {
- // This is just a continuation of the current run, no need to buffer the
- // values.
- // Note that this is the fast path for long repeated runs.
- return true;
- }
- } else {
- if (repeat_count_ >= 8) {
- // We had a run that was long enough but it has ended. Flush the
- // current repeated run.
- DCHECK_EQ(literal_count_, 0);
- FlushRepeatedRun();
- }
- repeat_count_ = 1;
- current_value_ = value;
- }
-
- buffered_values_[num_buffered_values_] = value;
- if (++num_buffered_values_ == 8) {
- DCHECK_EQ(literal_count_ % 8, 0);
- FlushBufferedValues(false);
- }
- return true;
-}
-
-inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
- if (literal_indicator_byte_ == NULL) {
- // The literal indicator byte has not been reserved yet, get one now.
- literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
- DCHECK(literal_indicator_byte_ != NULL);
- }
-
- // Write all the buffered values as bit packed literals
- for (int i = 0; i < num_buffered_values_; ++i) {
- bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
- DCHECK(success) << "There is a bug in using CheckBufferFull()";
- }
- num_buffered_values_ = 0;
-
- if (update_indicator_byte) {
- // At this point we need to write the indicator byte for the literal run.
- // We only reserve one byte, to allow for streaming writes of literal values.
- // The logic makes sure we flush literal runs often enough to not overrun
- // the 1 byte.
- DCHECK_EQ(literal_count_ % 8, 0);
- int num_groups = literal_count_ / 8;
- int32_t indicator_value = (num_groups << 1) | 1;
- DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
- *literal_indicator_byte_ = static_cast<uint8_t>(indicator_value);
- literal_indicator_byte_ = NULL;
- literal_count_ = 0;
- CheckBufferFull();
- }
-}
-
-inline void RleEncoder::FlushRepeatedRun() {
- DCHECK_GT(repeat_count_, 0);
- bool result = true;
- // The lsb of 0 indicates this is a repeated run
- int32_t indicator_value = repeat_count_ << 1 | 0;
- result &= bit_writer_.PutVlqInt(indicator_value);
- result &= bit_writer_.PutAligned(current_value_,
- static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)));
- DCHECK(result);
- num_buffered_values_ = 0;
- repeat_count_ = 0;
- CheckBufferFull();
-}
-
-/// Flush the values that have been buffered. At this point we decide whether
-/// we need to switch between the run types or continue the current one.
-inline void RleEncoder::FlushBufferedValues(bool done) {
- if (repeat_count_ >= 8) {
- // Clear the buffered values. They are part of the repeated run now and we
- // don't want to flush them out as literals.
- num_buffered_values_ = 0;
- if (literal_count_ != 0) {
- // There was a current literal run. All the values in it have been flushed
- // but we still need to update the indicator byte.
- DCHECK_EQ(literal_count_ % 8, 0);
- DCHECK_EQ(repeat_count_, 8);
- FlushLiteralRun(true);
- }
- DCHECK_EQ(literal_count_, 0);
- return;
- }
-
- literal_count_ += num_buffered_values_;
- DCHECK_EQ(literal_count_ % 8, 0);
- int num_groups = literal_count_ / 8;
- if (num_groups + 1 >= (1 << 6)) {
- // We need to start a new literal run because the indicator byte we've reserved
- // cannot store more values.
- DCHECK(literal_indicator_byte_ != NULL);
- FlushLiteralRun(true);
- } else {
- FlushLiteralRun(done);
- }
- repeat_count_ = 0;
-}
-
-inline int RleEncoder::Flush() {
- if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
- bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ ||
- num_buffered_values_ == 0);
- // There is something pending, figure out if it's a repeated or literal run
- if (repeat_count_ > 0 && all_repeat) {
- FlushRepeatedRun();
- } else {
- DCHECK_EQ(literal_count_ % 8, 0);
- // Buffer the last group of literals to 8 by padding with 0s.
- for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
- ++num_buffered_values_) {
- buffered_values_[num_buffered_values_] = 0;
- }
- literal_count_ += num_buffered_values_;
- FlushLiteralRun(true);
- repeat_count_ = 0;
- }
- }
- bit_writer_.Flush();
- DCHECK_EQ(num_buffered_values_, 0);
- DCHECK_EQ(literal_count_, 0);
- DCHECK_EQ(repeat_count_, 0);
-
- return bit_writer_.bytes_written();
-}
-
-inline void RleEncoder::CheckBufferFull() {
- int bytes_written = bit_writer_.bytes_written();
- if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
- buffer_full_ = true;
- }
-}
-
-inline void RleEncoder::Clear() {
- buffer_full_ = false;
- current_value_ = 0;
- repeat_count_ = 0;
- num_buffered_values_ = 0;
- literal_count_ = 0;
- literal_indicator_byte_ = NULL;
- bit_writer_.Clear();
-}
-
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use
+// in parquet-cpp, Arrow
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+
+/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs
+/// are sufficiently long, RLE is used; otherwise the values are just bit-packed
+/// (literal encoding).
+/// For both types of runs, there is a byte-aligned indicator which encodes the length
+/// of the run and the type of the run.
+/// This encoding has the benefit that when there aren't any long enough runs, values
+/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
+/// the run length are byte aligned. This allows for very efficient decoding
+/// implementations.
+/// The encoding is:
+/// encoded-block := run*
+/// run := literal-run | repeated-run
+/// literal-run := literal-indicator < literal bytes >
+/// repeated-run := repeated-indicator < repeated value. padded to byte boundary >
+/// literal-indicator := varint_encode( number_of_groups << 1 | 1)
+/// repeated-indicator := varint_encode( number_of_repetitions << 1 )
+//
+/// Each run is preceded by a varint. The varint's least significant bit is
+/// used to indicate whether the run is a literal run or a repeated run. The rest
+/// of the varint is used to determine the length of the run (e.g. how many times the
+/// value repeats).
+//
+/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
+/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
+/// on a byte boundary without padding.
+/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
+/// the actual number of encoded ints. (This means that the total number of encoded values
+/// cannot be determined from the encoded data, since the number of values in the last
+/// group may not be a multiple of 8.) For the last group of literal runs, we pad
+/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
+/// without the need for additional checks.
+//
+/// There is a break-even point at which run length encoding becomes the more
+/// storage-efficient choice. For 1-bit-width values, that point is 8 values: both
+/// the repeated encoding and the literal encoding then require 2 bytes. This point
+/// can always be computed based on the bit-width.
+/// TODO: think about how to use this for strings. The bit packing isn't quite the same.
+//
+/// Examples with bit-width 1 (e.g. encoding booleans):
+/// ----------------------------------------
+/// 100 1s followed by 100 0s:
+/// <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
+/// (total 4 bytes)
+//
+/// alternating 1s and 0s (200 total):
+/// 200 ints = 25 groups of 8
+/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
+/// (total 26 bytes, 1 byte overhead)
+//
+
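+/// A minimal round-trip sketch (illustrative only; the buffer size and the
+/// choice of values are hypothetical):
+///
+///   uint8_t buf[128];
+///   RleEncoder encoder(buf, sizeof(buf), /*bit_width=*/1);
+///   for (int i = 0; i < 100; ++i) encoder.Put(1);  // one long repeated run
+///   int encoded_len = encoder.Flush();
+///
+///   RleDecoder decoder(buf, encoded_len, /*bit_width=*/1);
+///   uint64_t values[100];
+///   int n = decoder.GetBatch(values, 100);  // n == 100, all values are 1
+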
+/// Decoder class for RLE encoded data.
+class RleDecoder {
+ public:
+  /// Create a decoder object. buffer/buffer_len is the RLE-encoded data.
+ /// bit_width is the width of each value (before encoding).
+ RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
+ : bit_reader_(buffer, buffer_len),
+ bit_width_(bit_width),
+ current_value_(0),
+ repeat_count_(0),
+ literal_count_(0) {
+ DCHECK_GE(bit_width_, 0);
+ DCHECK_LE(bit_width_, 64);
+ }
+
+ RleDecoder() : bit_width_(-1) {}
+
+ void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
+ DCHECK_GE(bit_width, 0);
+ DCHECK_LE(bit_width, 64);
+ bit_reader_.Reset(buffer, buffer_len);
+ bit_width_ = bit_width;
+ current_value_ = 0;
+ repeat_count_ = 0;
+ literal_count_ = 0;
+ }
+
+ /// Gets the next value. Returns false if there are no more.
+ template <typename T>
+ bool Get(T* val);
+
+ /// Gets a batch of values. Returns the number of decoded elements.
+ template <typename T>
+ int GetBatch(T* values, int batch_size);
+
+  /// Like GetBatch, but adds spacing for null entries
+ template <typename T>
+ int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* out);
+
+ /// Like GetBatch but the values are then decoded using the provided dictionary
+ template <typename T>
+ int GetBatchWithDict(const T* dictionary, int32_t dictionary_length, T* values,
+ int batch_size);
+
+  /// Like GetBatchWithDict, but adds spacing for null entries
+ ///
+ /// Null entries will be zero-initialized in `values` to avoid leaking
+ /// private data.
+ template <typename T>
+ int GetBatchWithDictSpaced(const T* dictionary, int32_t dictionary_length, T* values,
+ int batch_size, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset);
+
+ protected:
+ BitUtil::BitReader bit_reader_;
+ /// Number of bits needed to encode the value. Must be between 0 and 64.
+ int bit_width_;
+ uint64_t current_value_;
+ int32_t repeat_count_;
+ int32_t literal_count_;
+
+ private:
+ /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
+ /// are no more.
+ template <typename T>
+ bool NextCounts();
+
+ /// Utility methods for retrieving spaced values.
+ template <typename T, typename RunType, typename Converter>
+ int GetSpaced(Converter converter, int batch_size, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset, T* out);
+};
+
+/// Class to incrementally build the rle data. This class does not allocate any memory.
+/// The encoding has two modes: encoding repeated runs and literal runs.
+/// If the run is sufficiently short, it is more efficient to encode as a literal run.
+/// This class does so by buffering 8 values at a time. If they are not all the same
+/// they are added to the literal run. If they are the same, they are added to the
+/// repeated run. When we switch modes, the previous run is flushed out.
+class RleEncoder {
+ public:
+ /// buffer/buffer_len: preallocated output buffer.
+ /// bit_width: max number of bits for value.
+  /// TODO: consider adding a min_repeated_run_length so the caller can control
+  /// when values should be encoded as repeated runs. Currently this is derived
+  /// from the bit_width, which yields a storage-optimal choice.
+ /// TODO: allow 0 bit_width (and have dict encoder use it)
+ RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
+ : bit_width_(bit_width), bit_writer_(buffer, buffer_len) {
+ DCHECK_GE(bit_width_, 0);
+ DCHECK_LE(bit_width_, 64);
+ max_run_byte_size_ = MinBufferSize(bit_width);
+ DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
+ Clear();
+ }
+
+  /// Returns the minimum buffer size needed to use the encoder for 'bit_width'.
+  /// This is the maximum length of a single run for 'bit_width'.
+  /// It is not valid to pass a buffer shorter than this length.
+ static int MinBufferSize(int bit_width) {
+    // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
+ int max_literal_run_size =
+ 1 +
+ static_cast<int>(BitUtil::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width));
+    // Up to kMaxVlqByteLength indicator bytes and a single 'bit_width' value.
+ int max_repeated_run_size = BitUtil::BitReader::kMaxVlqByteLength +
+ static_cast<int>(BitUtil::BytesForBits(bit_width));
+ return std::max(max_literal_run_size, max_repeated_run_size);
+ }
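+
+  // Worked example for MinBufferSize (illustrative; assumes
+  // BitUtil::BitReader::kMaxVlqByteLength == 5): for bit_width = 1,
+  // MAX_VALUES_PER_LITERAL_RUN = 512, so max_literal_run_size =
+  // 1 + BytesForBits(512) = 65 and max_repeated_run_size = 5 + 1 = 6;
+  // the minimum buffer size is therefore 65 bytes.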
+
+ /// Returns the maximum byte size it could take to encode 'num_values'.
+ static int MaxBufferSize(int bit_width, int num_values) {
+ // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
+ // and then a repeated run of length 8".
+ // 8 values per smallest run, 8 bits per byte
+ int bytes_per_run = bit_width;
+ int num_runs = static_cast<int>(BitUtil::CeilDiv(num_values, 8));
+ int literal_max_size = num_runs + num_runs * bytes_per_run;
+
+    // In the very worst case scenario, the data is a concatenation of repeated
+    // runs of 8 values. Each repeated run has a 1-byte varint followed by the
+    // byte-aligned repeated value.
+ int min_repeated_run_size = 1 + static_cast<int>(BitUtil::BytesForBits(bit_width));
+ int repeated_max_size =
+ static_cast<int>(BitUtil::CeilDiv(num_values, 8)) * min_repeated_run_size;
+
+ return std::max(literal_max_size, repeated_max_size);
+ }
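+
+  // Worked example for MaxBufferSize (illustrative): for bit_width = 8 and
+  // num_values = 1024, num_runs = 128, so literal_max_size = 128 + 128 * 8 =
+  // 1152 bytes, while repeated_max_size = 128 * (1 + 1) = 256 bytes;
+  // MaxBufferSize returns 1152.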
+
+ /// Encode value. Returns true if the value fits in buffer, false otherwise.
+ /// This value must be representable with bit_width_ bits.
+ bool Put(uint64_t value);
+
+ /// Flushes any pending values to the underlying buffer.
+ /// Returns the total number of bytes written
+ int Flush();
+
+ /// Resets all the state in the encoder.
+ void Clear();
+
+ /// Returns pointer to underlying buffer
+ uint8_t* buffer() { return bit_writer_.buffer(); }
+ int32_t len() { return bit_writer_.bytes_written(); }
+
+ private:
+ /// Flushes any buffered values. If this is part of a repeated run, this is largely
+ /// a no-op.
+ /// If it is part of a literal run, this will call FlushLiteralRun, which writes
+ /// out the buffered literal values.
+  /// If 'done' is true, the current run is written out even if it would normally
+  /// have been buffered further. This should only be called at the end, once the
+  /// encoder has received all values.
+ void FlushBufferedValues(bool done);
+
+ /// Flushes literal values to the underlying buffer. If update_indicator_byte,
+ /// then the current literal run is complete and the indicator byte is updated.
+ void FlushLiteralRun(bool update_indicator_byte);
+
+ /// Flushes a repeated run to the underlying buffer.
+ void FlushRepeatedRun();
+
+ /// Checks and sets buffer_full_. This must be called after flushing a run to
+ /// make sure there are enough bytes remaining to encode the next run.
+ void CheckBufferFull();
+
+ /// The maximum number of values in a single literal run
+ /// (number of groups encodable by a 1-byte indicator * 8)
+ static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
+
+ /// Number of bits needed to encode the value. Must be between 0 and 64.
+ const int bit_width_;
+
+ /// Underlying buffer.
+ BitUtil::BitWriter bit_writer_;
+
+  /// If true, the buffer is full and subsequent calls to Put() will fail.
+ bool buffer_full_;
+
+ /// The maximum byte size a single run can take.
+ int max_run_byte_size_;
+
+ /// We need to buffer at most 8 values for literals. This happens when the
+ /// bit_width is 1 (so 8 values fit in one byte).
+ /// TODO: generalize this to other bit widths
+ int64_t buffered_values_[8];
+
+ /// Number of values in buffered_values_
+ int num_buffered_values_;
+
+  /// The current (also last) value that was written and the count of how
+  /// many times in a row that value has been seen. This is maintained even
+  /// if we are in a literal run. If repeat_count_ gets high enough, we switch
+  /// to encoding repeated runs.
+ uint64_t current_value_;
+ int repeat_count_;
+
+  /// Number of literals in the current run. This does not include the literals
+  /// that might be in buffered_values_. Only after we've got a group big enough
+  /// can we decide whether they should be part of literal_count_ or repeat_count_.
+ int literal_count_;
+
+ /// Pointer to a byte in the underlying buffer that stores the indicator byte.
+ /// This is reserved as soon as we need a literal run but the value is written
+ /// when the literal run is complete.
+ uint8_t* literal_indicator_byte_;
+};
+
+template <typename T>
+inline bool RleDecoder::Get(T* val) {
+ return GetBatch(val, 1) == 1;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatch(T* values, int batch_size) {
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+
+ auto* out = values;
+
+ while (values_read < batch_size) {
+ int remaining = batch_size - values_read;
+
+ if (repeat_count_ > 0) { // Repeated value case.
+ int repeat_batch = std::min(remaining, repeat_count_);
+ std::fill(out, out + repeat_batch, static_cast<T>(current_value_));
+
+ repeat_count_ -= repeat_batch;
+ values_read += repeat_batch;
+ out += repeat_batch;
+ } else if (literal_count_ > 0) {
+ int literal_batch = std::min(remaining, literal_count_);
+ int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch);
+ if (actual_read != literal_batch) {
+ return values_read;
+ }
+
+ literal_count_ -= literal_batch;
+ values_read += literal_batch;
+ out += literal_batch;
+ } else {
+ if (!NextCounts<T>()) return values_read;
+ }
+ }
+
+ return values_read;
+}
+
+template <typename T, typename RunType, typename Converter>
+inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset,
+ T* out) {
+ if (ARROW_PREDICT_FALSE(null_count == batch_size)) {
+ converter.FillZero(out, out + batch_size);
+ return batch_size;
+ }
+
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+ int values_remaining = batch_size - null_count;
+
+ // Assume no bits to start.
+ arrow::internal::BitRunReader bit_reader(valid_bits, valid_bits_offset,
+ /*length=*/batch_size);
+ arrow::internal::BitRun valid_run = bit_reader.NextRun();
+ while (values_read < batch_size) {
+ if (ARROW_PREDICT_FALSE(valid_run.length == 0)) {
+ valid_run = bit_reader.NextRun();
+ }
+
+ DCHECK_GT(batch_size, 0);
+ DCHECK_GT(valid_run.length, 0);
+
+ if (valid_run.set) {
+ if ((repeat_count_ == 0) && (literal_count_ == 0)) {
+ if (!NextCounts<RunType>()) return values_read;
+ DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0));
+ }
+
+ if (repeat_count_ > 0) {
+ int repeat_batch = 0;
+        // Consume the entire repeat count, incrementing repeat_batch to
+        // the total of nulls + values consumed; we only need the
+        // total count because we can fill in the same value for
+        // nulls and non-nulls. This proves to be a big efficiency win.
+ while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
+ DCHECK_GT(valid_run.length, 0);
+ if (valid_run.set) {
+ int update_size = std::min(static_cast<int>(valid_run.length), repeat_count_);
+ repeat_count_ -= update_size;
+ repeat_batch += update_size;
+ valid_run.length -= update_size;
+ values_remaining -= update_size;
+ } else {
+            // We can consume all nulls here because we would do so on
+            // the next loop iteration anyway.
+ repeat_batch += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ if (valid_run.length == 0) {
+ valid_run = bit_reader.NextRun();
+ }
+ }
+ RunType current_value = static_cast<RunType>(current_value_);
+ if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) {
+ return values_read;
+ }
+ converter.Fill(out, out + repeat_batch, current_value);
+ out += repeat_batch;
+ values_read += repeat_batch;
+ } else if (literal_count_ > 0) {
+ int literal_batch = std::min(values_remaining, literal_count_);
+ DCHECK_GT(literal_batch, 0);
+
+ // Decode the literals
+ constexpr int kBufferSize = 1024;
+ RunType indices[kBufferSize];
+ literal_batch = std::min(literal_batch, kBufferSize);
+ int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
+ if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
+ return values_read;
+ }
+ if (!converter.IsValid(indices, /*length=*/actual_read)) {
+ return values_read;
+ }
+ int skipped = 0;
+ int literals_read = 0;
+ while (literals_read < literal_batch) {
+ if (valid_run.set) {
+ int update_size = std::min(literal_batch - literals_read,
+ static_cast<int>(valid_run.length));
+ converter.Copy(out, indices + literals_read, update_size);
+ literals_read += update_size;
+ out += update_size;
+ valid_run.length -= update_size;
+ } else {
+ converter.FillZero(out, out + valid_run.length);
+ out += valid_run.length;
+ skipped += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ if (valid_run.length == 0) {
+ valid_run = bit_reader.NextRun();
+ }
+ }
+ literal_count_ -= literal_batch;
+ values_remaining -= literal_batch;
+ values_read += literal_batch + skipped;
+ }
+ } else {
+ converter.FillZero(out, out + valid_run.length);
+ out += valid_run.length;
+ values_read += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ }
+ DCHECK_EQ(valid_run.length, 0);
+ DCHECK_EQ(values_remaining, 0);
+ return values_read;
+}
+
+// Converter for GetSpaced that handles runs whose values are returned
+// directly as output.
+template <typename T>
+struct PlainRleConverter {
+ T kZero = {};
+ inline bool IsValid(const T& values) const { return true; }
+ inline bool IsValid(const T* values, int32_t length) const { return true; }
+ inline void Fill(T* begin, T* end, const T& run_value) const {
+ std::fill(begin, end, run_value);
+ }
+ inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
+ inline void Copy(T* out, const T* values, int length) const {
+ std::memcpy(out, values, length * sizeof(T));
+ }
+};
+
+template <typename T>
+inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* out) {
+ if (null_count == 0) {
+ return GetBatch<T>(out, batch_size);
+ }
+
+ PlainRleConverter<T> converter;
+ arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
+ batch_size);
+
+ int total_processed = 0;
+ int processed = 0;
+ arrow::internal::BitBlockCount block;
+
+ do {
+ block = block_counter.NextFourWords();
+ if (block.length == 0) {
+ break;
+ }
+ if (block.AllSet()) {
+ processed = GetBatch<T>(out, block.length);
+ } else if (block.NoneSet()) {
+ converter.FillZero(out, out + block.length);
+ processed = block.length;
+ } else {
+ processed = GetSpaced<T, /*RunType=*/T, PlainRleConverter<T>>(
+ converter, block.length, block.length - block.popcount, valid_bits,
+ valid_bits_offset, out);
+ }
+ total_processed += processed;
+ out += block.length;
+ valid_bits_offset += block.length;
+ } while (processed == block.length);
+ return total_processed;
+}
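+
+// Illustrative example: with batch_size = 8 and valid_bits = 0b11011001 (five
+// valid slots, so null_count = 3), GetBatchSpaced decodes five values into the
+// valid slots, zero-fills the three null slots, and returns 8.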
+
+static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) {
+ return idx >= 0 && idx < dictionary_length;
+}
+
+// Converter for GetSpaced that handles runs of dictionary indices,
+// mapping each index to its dictionary value.
+template <typename T>
+struct DictionaryConverter {
+ T kZero = {};
+ const T* dictionary;
+ int32_t dictionary_length;
+
+ inline bool IsValid(int32_t value) { return IndexInRange(value, dictionary_length); }
+
+ inline bool IsValid(const int32_t* values, int32_t length) const {
+ using IndexType = int32_t;
+ IndexType min_index = std::numeric_limits<IndexType>::max();
+ IndexType max_index = std::numeric_limits<IndexType>::min();
+ for (int x = 0; x < length; x++) {
+ min_index = std::min(values[x], min_index);
+ max_index = std::max(values[x], max_index);
+ }
+
+ return IndexInRange(min_index, dictionary_length) &&
+ IndexInRange(max_index, dictionary_length);
+ }
+ inline void Fill(T* begin, T* end, const int32_t& run_value) const {
+ std::fill(begin, end, dictionary[run_value]);
+ }
+ inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
+
+ inline void Copy(T* out, const int32_t* values, int length) const {
+ for (int x = 0; x < length; x++) {
+ out[x] = dictionary[values[x]];
+ }
+ }
+};
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_length,
+ T* values, int batch_size) {
+ // Per https://github.com/apache/parquet-format/blob/master/Encodings.md,
+ // the maximum dictionary index width in Parquet is 32 bits.
+ using IndexType = int32_t;
+ DictionaryConverter<T> converter;
+ converter.dictionary = dictionary;
+ converter.dictionary_length = dictionary_length;
+
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+
+ auto* out = values;
+
+ while (values_read < batch_size) {
+ int remaining = batch_size - values_read;
+
+ if (repeat_count_ > 0) {
+ auto idx = static_cast<IndexType>(current_value_);
+ if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) {
+ return values_read;
+ }
+ T val = dictionary[idx];
+
+ int repeat_batch = std::min(remaining, repeat_count_);
+ std::fill(out, out + repeat_batch, val);
+
+      // Update the bookkeeping counters
+ repeat_count_ -= repeat_batch;
+ values_read += repeat_batch;
+ out += repeat_batch;
+ } else if (literal_count_ > 0) {
+ constexpr int kBufferSize = 1024;
+ IndexType indices[kBufferSize];
+
+ int literal_batch = std::min(remaining, literal_count_);
+ literal_batch = std::min(literal_batch, kBufferSize);
+
+ int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
+ if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
+ return values_read;
+ }
+ if (ARROW_PREDICT_FALSE(!converter.IsValid(indices, /*length=*/literal_batch))) {
+ return values_read;
+ }
+ converter.Copy(out, indices, literal_batch);
+
+      // Update the bookkeeping counters
+ literal_count_ -= literal_batch;
+ values_read += literal_batch;
+ out += literal_batch;
+ } else {
+ if (!NextCounts<IndexType>()) return values_read;
+ }
+ }
+
+ return values_read;
+}
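+
+// Illustrative sketch (buf/buf_len are hypothetical): decoding RLE-encoded
+// dictionary indices into their values, with a 3-entry dictionary of doubles:
+//
+//   const double dict[] = {0.5, 1.5, 2.5};
+//   RleDecoder decoder(buf, buf_len, /*bit_width=*/2);
+//   double out[16];
+//   int n = decoder.GetBatchWithDict(dict, /*dictionary_length=*/3, out, 16);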
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,
+ int32_t dictionary_length, T* out,
+ int batch_size, int null_count,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) {
+ if (null_count == 0) {
+ return GetBatchWithDict<T>(dictionary, dictionary_length, out, batch_size);
+ }
+ arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
+ batch_size);
+ using IndexType = int32_t;
+ DictionaryConverter<T> converter;
+ converter.dictionary = dictionary;
+ converter.dictionary_length = dictionary_length;
+
+ int total_processed = 0;
+ int processed = 0;
+ arrow::internal::BitBlockCount block;
+ do {
+ block = block_counter.NextFourWords();
+ if (block.length == 0) {
+ break;
+ }
+ if (block.AllSet()) {
+ processed = GetBatchWithDict<T>(dictionary, dictionary_length, out, block.length);
+ } else if (block.NoneSet()) {
+ converter.FillZero(out, out + block.length);
+ processed = block.length;
+ } else {
+ processed = GetSpaced<T, /*RunType=*/IndexType, DictionaryConverter<T>>(
+ converter, block.length, block.length - block.popcount, valid_bits,
+ valid_bits_offset, out);
+ }
+ total_processed += processed;
+ out += block.length;
+ valid_bits_offset += block.length;
+ } while (processed == block.length);
+ return total_processed;
+}
+
+template <typename T>
+bool RleDecoder::NextCounts() {
+  // Read the next run's indicator int; it could be a literal or repeated run.
+ // The int is encoded as a vlq-encoded value.
+ uint32_t indicator_value = 0;
+ if (!bit_reader_.GetVlqInt(&indicator_value)) return false;
+
+ // lsb indicates if it is a literal run or repeated run
+ bool is_literal = indicator_value & 1;
+ uint32_t count = indicator_value >> 1;
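+  // For example, indicator_value = 51 (0b110011) has lsb 1, so it announces a
+  // literal run of count = 25 groups, i.e. 25 * 8 = 200 literal values.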
+ if (is_literal) {
+ if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX) / 8)) {
+ return false;
+ }
+ literal_count_ = count * 8;
+ } else {
+ if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX))) {
+ return false;
+ }
+ repeat_count_ = count;
+ T value = {};
+ if (!bit_reader_.GetAligned<T>(static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)),
+ &value)) {
+ return false;
+ }
+ current_value_ = static_cast<uint64_t>(value);
+ }
+ return true;
+}
+
+/// This function buffers input values 8 at a time. After seeing all 8 values,
+/// it decides whether they should be encoded as a literal or repeated run.
+inline bool RleEncoder::Put(uint64_t value) {
+ DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_));
+ if (ARROW_PREDICT_FALSE(buffer_full_)) return false;
+
+ if (ARROW_PREDICT_TRUE(current_value_ == value)) {
+ ++repeat_count_;
+ if (repeat_count_ > 8) {
+      // This is just a continuation of the current run; no need to buffer the
+      // values.
+ // Note that this is the fast path for long repeated runs.
+ return true;
+ }
+ } else {
+ if (repeat_count_ >= 8) {
+      // We had a run that was long enough, but it has ended. Flush the
+ // current repeated run.
+ DCHECK_EQ(literal_count_, 0);
+ FlushRepeatedRun();
+ }
+ repeat_count_ = 1;
+ current_value_ = value;
+ }
+
+ buffered_values_[num_buffered_values_] = value;
+ if (++num_buffered_values_ == 8) {
+ DCHECK_EQ(literal_count_ % 8, 0);
+ FlushBufferedValues(false);
+ }
+ return true;
+}
+
+inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
+ if (literal_indicator_byte_ == NULL) {
+ // The literal indicator byte has not been reserved yet, get one now.
+ literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
+ DCHECK(literal_indicator_byte_ != NULL);
+ }
+
+ // Write all the buffered values as bit packed literals
+ for (int i = 0; i < num_buffered_values_; ++i) {
+ bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
+ DCHECK(success) << "There is a bug in using CheckBufferFull()";
+ }
+ num_buffered_values_ = 0;
+
+ if (update_indicator_byte) {
+ // At this point we need to write the indicator byte for the literal run.
+ // We only reserve one byte, to allow for streaming writes of literal values.
+ // The logic makes sure we flush literal runs often enough to not overrun
+ // the 1 byte.
+ DCHECK_EQ(literal_count_ % 8, 0);
+ int num_groups = literal_count_ / 8;
+ int32_t indicator_value = (num_groups << 1) | 1;
+ DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
+ *literal_indicator_byte_ = static_cast<uint8_t>(indicator_value);
+ literal_indicator_byte_ = NULL;
+ literal_count_ = 0;
+ CheckBufferFull();
+ }
+}
+
+inline void RleEncoder::FlushRepeatedRun() {
+ DCHECK_GT(repeat_count_, 0);
+ bool result = true;
+  // An lsb of 0 indicates that this is a repeated run
+ int32_t indicator_value = repeat_count_ << 1 | 0;
+ result &= bit_writer_.PutVlqInt(indicator_value);
+ result &= bit_writer_.PutAligned(current_value_,
+ static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)));
+ DCHECK(result);
+ num_buffered_values_ = 0;
+ repeat_count_ = 0;
+ CheckBufferFull();
+}
+
+/// Flush the values that have been buffered. At this point we decide whether
+/// we need to switch between the run types or continue the current one.
+inline void RleEncoder::FlushBufferedValues(bool done) {
+ if (repeat_count_ >= 8) {
+ // Clear the buffered values. They are part of the repeated run now and we
+ // don't want to flush them out as literals.
+ num_buffered_values_ = 0;
+ if (literal_count_ != 0) {
+ // There was a current literal run. All the values in it have been flushed
+ // but we still need to update the indicator byte.
+ DCHECK_EQ(literal_count_ % 8, 0);
+ DCHECK_EQ(repeat_count_, 8);
+ FlushLiteralRun(true);
+ }
+ DCHECK_EQ(literal_count_, 0);
+ return;
+ }
+
+ literal_count_ += num_buffered_values_;
+ DCHECK_EQ(literal_count_ % 8, 0);
+ int num_groups = literal_count_ / 8;
+ if (num_groups + 1 >= (1 << 6)) {
+ // We need to start a new literal run because the indicator byte we've reserved
+ // cannot store more values.
+ DCHECK(literal_indicator_byte_ != NULL);
+ FlushLiteralRun(true);
+ } else {
+ FlushLiteralRun(done);
+ }
+ repeat_count_ = 0;
+}
+
+inline int RleEncoder::Flush() {
+ if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
+ bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ ||
+ num_buffered_values_ == 0);
+    // There is something pending; figure out if it's a repeated or literal run
+ if (repeat_count_ > 0 && all_repeat) {
+ FlushRepeatedRun();
+ } else {
+ DCHECK_EQ(literal_count_ % 8, 0);
+      // Pad the last group of literals out to 8 values with 0s.
+ for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
+ ++num_buffered_values_) {
+ buffered_values_[num_buffered_values_] = 0;
+ }
+ literal_count_ += num_buffered_values_;
+ FlushLiteralRun(true);
+ repeat_count_ = 0;
+ }
+ }
+ bit_writer_.Flush();
+ DCHECK_EQ(num_buffered_values_, 0);
+ DCHECK_EQ(literal_count_, 0);
+ DCHECK_EQ(repeat_count_, 0);
+
+ return bit_writer_.bytes_written();
+}
+
+inline void RleEncoder::CheckBufferFull() {
+ int bytes_written = bit_writer_.bytes_written();
+ if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
+ buffer_full_ = true;
+ }
+}
+
+inline void RleEncoder::Clear() {
+ buffer_full_ = false;
+ current_value_ = 0;
+ repeat_count_ = 0;
+ num_buffered_values_ = 0;
+ literal_count_ = 0;
+ literal_indicator_byte_ = NULL;
+ bit_writer_.Clear();
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
index 8265e1d22ae..9414984663f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
@@ -1,98 +1,98 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-
-#include "arrow/util/bit_run_reader.h"
-
-namespace arrow {
-namespace util {
-namespace internal {
-
-/// \brief Compress the buffer to spaced, excluding the null entries.
-///
-/// \param[in] src the source buffer
-/// \param[in] num_values the size of source buffer
-/// \param[in] valid_bits bitmap data indicating position of valid slots
-/// \param[in] valid_bits_offset offset into valid_bits
-/// \param[out] output the output buffer spaced
-/// \return The size of spaced buffer.
-template <typename T>
-inline int SpacedCompress(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset, T* output) {
- int num_valid_values = 0;
-
- arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, num_values);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- std::memcpy(output + num_valid_values, src + run.position, run.length * sizeof(T));
- num_valid_values += static_cast<int32_t>(run.length);
- }
-
- return num_valid_values;
-}
-
-/// \brief Relocate values in buffer into positions of non-null values as indicated by
-/// a validity bitmap.
-///
-/// \param[in, out] buffer the in-place buffer
-/// \param[in] num_values total size of buffer including null slots
-/// \param[in] null_count number of null slots
-/// \param[in] valid_bits bitmap data indicating position of valid slots
-/// \param[in] valid_bits_offset offset into valid_bits
-/// \return The number of values expanded, including nulls.
-template <typename T>
-inline int SpacedExpand(T* buffer, int num_values, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset) {
- // Point to end as we add the spacing from the back.
- int idx_decode = num_values - null_count;
-
- // Depending on the number of nulls, some of the value slots in buffer may
- // be uninitialized, and this will cause valgrind warnings / potentially UB
- std::memset(static_cast<void*>(buffer + idx_decode), 0, null_count * sizeof(T));
- if (idx_decode == 0) {
- // All nulls, nothing more to do
- return num_values;
- }
-
- arrow::internal::ReverseSetBitRunReader reader(valid_bits, valid_bits_offset,
- num_values);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- idx_decode -= static_cast<int32_t>(run.length);
- assert(idx_decode >= 0);
- std::memmove(buffer + run.position, buffer + idx_decode, run.length * sizeof(T));
- }
-
- // Otherwise caller gave an incorrect null_count
- assert(idx_decode == 0);
- return num_values;
-}
-
-} // namespace internal
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_run_reader.h"
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+/// \brief Compress a spaced buffer (one slot per value, including null slots)
+/// into a dense buffer holding only the valid entries.
+///
+/// \param[in] src the spaced source buffer
+/// \param[in] num_values the size of the source buffer, including null slots
+/// \param[in] valid_bits bitmap data indicating the positions of valid slots
+/// \param[in] valid_bits_offset offset into valid_bits
+/// \param[out] output the dense output buffer
+/// \return The number of valid values written to output.
+template <typename T>
+inline int SpacedCompress(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* output) {
+ int num_valid_values = 0;
+
+ arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, num_values);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ std::memcpy(output + num_valid_values, src + run.position, run.length * sizeof(T));
+ num_valid_values += static_cast<int32_t>(run.length);
+ }
+
+ return num_valid_values;
+}
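+
+// Illustrative example: with src = {1, 0, 3, 0}, num_values = 4 and
+// valid_bits = 0b0101 (slots 0 and 2 valid), SpacedCompress writes the dense
+// output {1, 3} and returns 2.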
+
+/// \brief Relocate values in buffer into positions of non-null values as indicated by
+/// a validity bitmap.
+///
+/// \param[in, out] buffer the in-place buffer
+/// \param[in] num_values total size of buffer including null slots
+/// \param[in] null_count number of null slots
+/// \param[in] valid_bits bitmap data indicating position of valid slots
+/// \param[in] valid_bits_offset offset into valid_bits
+/// \return The number of values expanded, including nulls.
+template <typename T>
+inline int SpacedExpand(T* buffer, int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ // Point to end as we add the spacing from the back.
+ int idx_decode = num_values - null_count;
+
+ // Depending on the number of nulls, some of the value slots in buffer may
+ // be uninitialized, and this will cause valgrind warnings / potentially UB
+ std::memset(static_cast<void*>(buffer + idx_decode), 0, null_count * sizeof(T));
+ if (idx_decode == 0) {
+ // All nulls, nothing more to do
+ return num_values;
+ }
+
+ arrow::internal::ReverseSetBitRunReader reader(valid_bits, valid_bits_offset,
+ num_values);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ idx_decode -= static_cast<int32_t>(run.length);
+ assert(idx_decode >= 0);
+ std::memmove(buffer + run.position, buffer + idx_decode, run.length * sizeof(T));
+ }
+
+ // Otherwise caller gave an incorrect null_count
+ assert(idx_decode == 0);
+ return num_values;
+}
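+
+// Illustrative example: with buffer = {1, 3, _, _}, num_values = 4,
+// null_count = 2 and valid_bits = 0b0101 (slots 0 and 2 valid), SpacedExpand
+// moves the values in place so that buffer[0] == 1 and buffer[2] == 3, leaves
+// the null slots unspecified, and returns 4.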
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
index d922311df1c..5abb2feb446 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
@@ -92,23 +92,23 @@ Status ParseHexValue(const char* data, uint8_t* out) {
namespace internal {
-std::vector<util::string_view> SplitString(util::string_view v, char delimiter) {
- std::vector<util::string_view> parts;
- size_t start = 0, end;
- while (true) {
- end = v.find(delimiter, start);
- parts.push_back(v.substr(start, end - start));
- if (end == std::string::npos) {
- break;
- }
- start = end + 1;
- }
- return parts;
-}
-
-template <typename StringLike>
-static std::string JoinStringLikes(const std::vector<StringLike>& strings,
- util::string_view delimiter) {
+std::vector<util::string_view> SplitString(util::string_view v, char delimiter) {
+ std::vector<util::string_view> parts;
+ size_t start = 0, end;
+ while (true) {
+ end = v.find(delimiter, start);
+ parts.push_back(v.substr(start, end - start));
+ if (end == std::string::npos) {
+ break;
+ }
+ start = end + 1;
+ }
+ return parts;
+}
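+
+// Illustrative example: SplitString("a,b,,c", ',') yields {"a", "b", "", "c"};
+// empty fields between consecutive delimiters are preserved.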
+
+template <typename StringLike>
+static std::string JoinStringLikes(const std::vector<StringLike>& strings,
+ util::string_view delimiter) {
if (strings.size() == 0) {
return "";
}
@@ -120,18 +120,18 @@ static std::string JoinStringLikes(const std::vector<StringLike>& strings,
return out;
}
-std::string JoinStrings(const std::vector<util::string_view>& strings,
- util::string_view delimiter) {
- return JoinStringLikes(strings, delimiter);
-}
-
-std::string JoinStrings(const std::vector<std::string>& strings,
- util::string_view delimiter) {
- return JoinStringLikes(strings, delimiter);
-}
-
-static constexpr bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
-
+std::string JoinStrings(const std::vector<util::string_view>& strings,
+ util::string_view delimiter) {
+ return JoinStringLikes(strings, delimiter);
+}
+
+std::string JoinStrings(const std::vector<std::string>& strings,
+ util::string_view delimiter) {
+ return JoinStringLikes(strings, delimiter);
+}
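+
+// Illustrative example: JoinStrings({"a", "b", "c"}, "-") returns "a-b-c";
+// joining an empty vector returns "".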
+
+static constexpr bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
std::string TrimString(std::string value) {
size_t ltrim_chars = 0;
while (ltrim_chars < value.size() && IsWhitespace(value[ltrim_chars])) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
index 68b8a54e313..932e599fc21 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
@@ -42,20 +42,20 @@ ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out);
namespace internal {
-/// \brief Split a string with a delimiter
-ARROW_EXPORT
-std::vector<util::string_view> SplitString(util::string_view v, char delim);
-
+/// \brief Split a string with a delimiter
+ARROW_EXPORT
+std::vector<util::string_view> SplitString(util::string_view v, char delim);
+
/// \brief Join strings with a delimiter
ARROW_EXPORT
std::string JoinStrings(const std::vector<util::string_view>& strings,
util::string_view delimiter);
-/// \brief Join strings with a delimiter
-ARROW_EXPORT
-std::string JoinStrings(const std::vector<std::string>& strings,
- util::string_view delimiter);
-
+/// \brief Join strings with a delimiter
+ARROW_EXPORT
+std::string JoinStrings(const std::vector<std::string>& strings,
+ util::string_view delimiter);
+
/// \brief Trim whitespace from left and right sides of string
ARROW_EXPORT
std::string TrimString(std::string value);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
index 7e8ab64b703..04a6d95cacb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
@@ -30,29 +30,29 @@
namespace arrow {
namespace internal {
-namespace {
-
+namespace {
+
////////////////////////////////////////////////////////////////////////
// Serial TaskGroup implementation
class SerialTaskGroup : public TaskGroup {
public:
- explicit SerialTaskGroup(StopToken stop_token) : stop_token_(std::move(stop_token)) {}
-
- void AppendReal(FnOnce<Status()> task) override {
+ explicit SerialTaskGroup(StopToken stop_token) : stop_token_(std::move(stop_token)) {}
+
+ void AppendReal(FnOnce<Status()> task) override {
DCHECK(!finished_);
- if (stop_token_.IsStopRequested()) {
- status_ &= stop_token_.Poll();
- return;
- }
+ if (stop_token_.IsStopRequested()) {
+ status_ &= stop_token_.Poll();
+ return;
+ }
if (status_.ok()) {
- status_ &= std::move(task)();
+ status_ &= std::move(task)();
}
}
Status current_status() override { return status_; }
- bool ok() const override { return status_.ok(); }
+ bool ok() const override { return status_.ok(); }
Status Finish() override {
if (!finished_) {
@@ -61,11 +61,11 @@ class SerialTaskGroup : public TaskGroup {
return status_;
}
- Future<> FinishAsync() override { return Future<>::MakeFinished(Finish()); }
-
+ Future<> FinishAsync() override { return Future<>::MakeFinished(Finish()); }
+
int parallelism() override { return 1; }
- StopToken stop_token_;
+ StopToken stop_token_;
Status status_;
bool finished_ = false;
};
@@ -75,11 +75,11 @@ class SerialTaskGroup : public TaskGroup {
class ThreadedTaskGroup : public TaskGroup {
public:
- ThreadedTaskGroup(Executor* executor, StopToken stop_token)
- : executor_(executor),
- stop_token_(std::move(stop_token)),
- nremaining_(0),
- ok_(true) {}
+ ThreadedTaskGroup(Executor* executor, StopToken stop_token)
+ : executor_(executor),
+ stop_token_(std::move(stop_token)),
+ nremaining_(0),
+ ok_(true) {}
~ThreadedTaskGroup() override {
// Make sure all pending tasks are finished, so that dangling references
@@ -87,42 +87,42 @@ class ThreadedTaskGroup : public TaskGroup {
ARROW_UNUSED(Finish());
}
- void AppendReal(FnOnce<Status()> task) override {
- DCHECK(!finished_);
- if (stop_token_.IsStopRequested()) {
- UpdateStatus(stop_token_.Poll());
- return;
- }
-
+ void AppendReal(FnOnce<Status()> task) override {
+ DCHECK(!finished_);
+ if (stop_token_.IsStopRequested()) {
+ UpdateStatus(stop_token_.Poll());
+ return;
+ }
+
// The hot path is unlocked thanks to atomics
// Only if an error occurs is the lock taken
if (ok_.load(std::memory_order_acquire)) {
nremaining_.fetch_add(1, std::memory_order_acquire);
auto self = checked_pointer_cast<ThreadedTaskGroup>(shared_from_this());
-
- struct Callable {
- void operator()() {
- if (self_->ok_.load(std::memory_order_acquire)) {
- Status st;
- if (stop_token_.IsStopRequested()) {
- st = stop_token_.Poll();
- } else {
- // XXX what about exceptions?
- st = std::move(task_)();
- }
- self_->UpdateStatus(std::move(st));
- }
- self_->OneTaskDone();
+
+ struct Callable {
+ void operator()() {
+ if (self_->ok_.load(std::memory_order_acquire)) {
+ Status st;
+ if (stop_token_.IsStopRequested()) {
+ st = stop_token_.Poll();
+ } else {
+ // XXX what about exceptions?
+ st = std::move(task_)();
+ }
+ self_->UpdateStatus(std::move(st));
+ }
+ self_->OneTaskDone();
}
-
- std::shared_ptr<ThreadedTaskGroup> self_;
- FnOnce<Status()> task_;
- StopToken stop_token_;
- };
-
- Status st =
- executor_->Spawn(Callable{std::move(self), std::move(task), stop_token_});
+
+ std::shared_ptr<ThreadedTaskGroup> self_;
+ FnOnce<Status()> task_;
+ StopToken stop_token_;
+ };
+
+ Status st =
+ executor_->Spawn(Callable{std::move(self), std::move(task), stop_token_});
UpdateStatus(std::move(st));
}
}
@@ -132,7 +132,7 @@ class ThreadedTaskGroup : public TaskGroup {
return status_;
}
- bool ok() const override { return ok_.load(); }
+ bool ok() const override { return ok_.load(); }
Status Finish() override {
std::unique_lock<std::mutex> lock(mutex_);
@@ -144,20 +144,20 @@ class ThreadedTaskGroup : public TaskGroup {
return status_;
}
- Future<> FinishAsync() override {
+ Future<> FinishAsync() override {
std::lock_guard<std::mutex> lock(mutex_);
- if (!completion_future_.has_value()) {
- if (nremaining_.load() == 0) {
- completion_future_ = Future<>::MakeFinished(status_);
- } else {
- completion_future_ = Future<>::Make();
- }
- }
- return *completion_future_;
+ if (!completion_future_.has_value()) {
+ if (nremaining_.load() == 0) {
+ completion_future_ = Future<>::MakeFinished(status_);
+ } else {
+ completion_future_ = Future<>::Make();
+ }
+ }
+ return *completion_future_;
}
- int parallelism() override { return executor_->GetCapacity(); }
-
+ int parallelism() override { return executor_->GetCapacity(); }
+
protected:
void UpdateStatus(Status&& st) {
// Must be called unlocked, only locks on error
@@ -177,27 +177,27 @@ class ThreadedTaskGroup : public TaskGroup {
// before cv.notify_one() has returned
std::unique_lock<std::mutex> lock(mutex_);
cv_.notify_one();
- if (completion_future_.has_value()) {
- // MarkFinished could be slow. We don't want to call it while we are holding
- // the lock.
- auto& future = *completion_future_;
- const auto finished = completion_future_->is_finished();
- const auto& status = status_;
- // This will be redundant if the user calls Finish and not FinishAsync
- if (!finished && !finished_) {
- finished_ = true;
- lock.unlock();
- future.MarkFinished(status);
- } else {
- lock.unlock();
- }
- }
+ if (completion_future_.has_value()) {
+ // MarkFinished could be slow. We don't want to call it while we are holding
+ // the lock.
+ auto& future = *completion_future_;
+ const auto finished = completion_future_->is_finished();
+ const auto& status = status_;
+ // This will be redundant if the user calls Finish and not FinishAsync
+ if (!finished && !finished_) {
+ finished_ = true;
+ lock.unlock();
+ future.MarkFinished(status);
+ } else {
+ lock.unlock();
+ }
+ }
}
}
// These members are usable unlocked
Executor* executor_;
- StopToken stop_token_;
+ StopToken stop_token_;
std::atomic<int32_t> nremaining_;
std::atomic<bool> ok_;
@@ -206,18 +206,18 @@ class ThreadedTaskGroup : public TaskGroup {
std::condition_variable cv_;
Status status_;
bool finished_ = false;
- util::optional<Future<>> completion_future_;
+ util::optional<Future<>> completion_future_;
};
-} // namespace
-
-std::shared_ptr<TaskGroup> TaskGroup::MakeSerial(StopToken stop_token) {
- return std::shared_ptr<TaskGroup>(new SerialTaskGroup{stop_token});
+} // namespace
+
+std::shared_ptr<TaskGroup> TaskGroup::MakeSerial(StopToken stop_token) {
+ return std::shared_ptr<TaskGroup>(new SerialTaskGroup{stop_token});
}
-std::shared_ptr<TaskGroup> TaskGroup::MakeThreaded(Executor* thread_pool,
- StopToken stop_token) {
- return std::shared_ptr<TaskGroup>(new ThreadedTaskGroup{thread_pool, stop_token});
+std::shared_ptr<TaskGroup> TaskGroup::MakeThreaded(Executor* thread_pool,
+ StopToken stop_token) {
+ return std::shared_ptr<TaskGroup>(new ThreadedTaskGroup{thread_pool, stop_token});
}
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
index 3bb72f0d9cb..b3692cbcfeb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
@@ -21,9 +21,9 @@
#include <utility>
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/cancel.h"
-#include "arrow/util/functional.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
#include "arrow/util/macros.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
@@ -38,18 +38,18 @@ namespace internal {
/// implementation. When Finish() returns, it is guaranteed that all
/// tasks have finished, or at least one has errored.
///
-/// Once an error has occurred any tasks that are submitted to the task group
-/// will not run. The call to Append will simply return without scheduling the
-/// task.
-///
-/// If the task group is parallel it is possible that multiple tasks could be
-/// running at the same time and one of those tasks fails. This will put the
-/// task group in a failure state (so additional tasks cannot be run) however
-/// it will not interrupt running tasks. Finish will not complete
-/// until all running tasks have finished, even if one task fails.
-///
-/// Once a task group has finished new tasks may not be added to it. If you need to start
-/// a new batch of work then you should create a new task group.
+/// Once an error has occurred, any tasks that are submitted to the task group
+/// will not run. The call to Append will simply return without scheduling the
+/// task.
+///
+/// If the task group is parallel, it is possible that multiple tasks are
+/// running at the same time and one of those tasks fails. This will put the
+/// task group in a failure state (so additional tasks cannot be run); however,
+/// it will not interrupt running tasks. Finish will not complete
+/// until all running tasks have finished, even if one task fails.
+///
+/// Once a task group has finished, new tasks may not be added to it. If you
+/// need to start a new batch of work, create a new task group.
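+///
+/// A minimal usage sketch (illustrative; using GetCpuThreadPool() as the
+/// executor is an assumption):
+///
+///   auto group = TaskGroup::MakeThreaded(internal::GetCpuThreadPool());
+///   group->Append([] { return Status::OK(); });
+///   Status st = group->Finish();  // blocks until all tasks have completed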
class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
public:
/// Add a Status-returning function to execute. Execution order is
@@ -65,33 +65,33 @@ class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
/// task (or subgroup).
virtual Status Finish() = 0;
- /// Returns a future that will complete the first time all tasks are finished.
- /// This should be called only after all top level tasks
- /// have been added to the task group.
- ///
- /// If you are using a TaskGroup asynchronously there are a few considerations to keep
- /// in mind. The tasks should not block on I/O, etc (defeats the purpose of using
- /// futures) and should not be doing any nested locking or you run the risk of the tasks
- /// getting stuck in the thread pool waiting for tasks which cannot get scheduled.
- ///
- /// Primarily this call is intended to help migrate existing work written with TaskGroup
- /// in mind to using futures without having to do a complete conversion on the first
- /// pass.
- virtual Future<> FinishAsync() = 0;
-
+ /// Returns a future that will complete the first time all tasks are finished.
+ /// This should be called only after all top level tasks
+ /// have been added to the task group.
+ ///
+  /// If you are using a TaskGroup asynchronously, there are a few considerations to keep
+  /// in mind. The tasks should not block on I/O, etc. (that defeats the purpose of using
+  /// futures), and should not do any nested locking, or you run the risk of tasks
+  /// getting stuck in the thread pool waiting for tasks that cannot get scheduled.
+ ///
+ /// Primarily this call is intended to help migrate existing work written with TaskGroup
+ /// in mind to using futures without having to do a complete conversion on the first
+ /// pass.
+ virtual Future<> FinishAsync() = 0;
+
/// The current aggregate error Status. Non-blocking, useful for stopping early.
virtual Status current_status() = 0;
- /// Whether some tasks have already failed. Non-blocking, useful for stopping early.
- virtual bool ok() const = 0;
+ /// Whether some tasks have already failed. Non-blocking, useful for stopping early.
+ virtual bool ok() const = 0;
/// How many tasks can typically be executed in parallel.
/// This is only a hint, useful for testing or debugging.
virtual int parallelism() = 0;
- static std::shared_ptr<TaskGroup> MakeSerial(StopToken = StopToken::Unstoppable());
- static std::shared_ptr<TaskGroup> MakeThreaded(internal::Executor*,
- StopToken = StopToken::Unstoppable());
+ static std::shared_ptr<TaskGroup> MakeSerial(StopToken = StopToken::Unstoppable());
+ static std::shared_ptr<TaskGroup> MakeThreaded(internal::Executor*,
+ StopToken = StopToken::Unstoppable());
virtual ~TaskGroup() = default;
@@ -99,7 +99,7 @@ class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
TaskGroup() = default;
ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup);
- virtual void AppendReal(FnOnce<Status()> task) = 0;
+ virtual void AppendReal(FnOnce<Status()> task) = 0;
};
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
index 99b771ca0f2..93527f0c1f7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
@@ -1,417 +1,417 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/util/tdigest.h"
-
-#include <algorithm>
-#include <cmath>
-#include <iostream>
-#include <limits>
-#include <queue>
-#include <tuple>
-#include <vector>
-
-#include "arrow/status.h"
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-namespace arrow {
-namespace internal {
-
-namespace {
-
-// a numerically stable lerp is unbelievably complex
-// but we are *approximating* the quantile, so let's keep it simple
-double Lerp(double a, double b, double t) { return a + t * (b - a); }
-
-// histogram bin
-struct Centroid {
- double mean;
- double weight; // # data points in this bin
-
- // merge with another centroid
- void Merge(const Centroid& centroid) {
- weight += centroid.weight;
- mean += (centroid.mean - mean) * centroid.weight / weight;
- }
-};
-
-// scale function K0: linear function, as baseline
-struct ScalerK0 {
- explicit ScalerK0(uint32_t delta) : delta_norm(delta / 2.0) {}
-
- double K(double q) const { return delta_norm * q; }
- double Q(double k) const { return k / delta_norm; }
-
- const double delta_norm;
-};
-
-// scale function K1
-struct ScalerK1 {
- explicit ScalerK1(uint32_t delta) : delta_norm(delta / (2.0 * M_PI)) {}
-
- double K(double q) const { return delta_norm * std::asin(2 * q - 1); }
- double Q(double k) const { return (std::sin(k / delta_norm) + 1) / 2; }
-
- const double delta_norm;
-};
-
-// implements t-digest merging algorithm
-template <class T = ScalerK1>
-class TDigestMerger : private T {
- public:
- explicit TDigestMerger(uint32_t delta) : T(delta) { Reset(0, nullptr); }
-
- void Reset(double total_weight, std::vector<Centroid>* tdigest) {
- total_weight_ = total_weight;
- tdigest_ = tdigest;
- if (tdigest_) {
- tdigest_->resize(0);
- }
- weight_so_far_ = 0;
- weight_limit_ = -1; // trigger first centroid merge
- }
-
- // merge one centroid from a sorted centroid stream
- void Add(const Centroid& centroid) {
- auto& td = *tdigest_;
- const double weight = weight_so_far_ + centroid.weight;
- if (weight <= weight_limit_) {
- td.back().Merge(centroid);
- } else {
- const double quantile = weight_so_far_ / total_weight_;
- const double next_weight_limit = total_weight_ * this->Q(this->K(quantile) + 1);
- // weight limit should be strictly increasing, until the last centroid
- if (next_weight_limit <= weight_limit_) {
- weight_limit_ = total_weight_;
- } else {
- weight_limit_ = next_weight_limit;
- }
- td.push_back(centroid); // should never exceed capacity and trigger reallocation
- }
- weight_so_far_ = weight;
- }
-
- // validate k-size of a tdigest
- Status Validate(const std::vector<Centroid>& tdigest, double total_weight) const {
- double q_prev = 0, k_prev = this->K(0);
- for (size_t i = 0; i < tdigest.size(); ++i) {
- const double q = q_prev + tdigest[i].weight / total_weight;
- const double k = this->K(q);
- if (tdigest[i].weight != 1 && (k - k_prev) > 1.001) {
- return Status::Invalid("oversized centroid: ", k - k_prev);
- }
- k_prev = k;
- q_prev = q;
- }
- return Status::OK();
- }
-
- private:
- double total_weight_; // total weight of this tdigest
- double weight_so_far_; // accumulated weight till current bin
- double weight_limit_; // max accumulated weight to move to next bin
- std::vector<Centroid>* tdigest_;
-};
-
-} // namespace
-
-class TDigest::TDigestImpl {
- public:
- explicit TDigestImpl(uint32_t delta)
- : delta_(delta > 10 ? delta : 10), merger_(delta_) {
- tdigests_[0].reserve(delta_);
- tdigests_[1].reserve(delta_);
- Reset();
- }
-
- void Reset() {
- tdigests_[0].resize(0);
- tdigests_[1].resize(0);
- current_ = 0;
- total_weight_ = 0;
- min_ = std::numeric_limits<double>::max();
- max_ = std::numeric_limits<double>::lowest();
- merger_.Reset(0, nullptr);
- }
-
- Status Validate() const {
- // check weight, centroid order
- double total_weight = 0, prev_mean = std::numeric_limits<double>::lowest();
- for (const auto& centroid : tdigests_[current_]) {
- if (std::isnan(centroid.mean) || std::isnan(centroid.weight)) {
- return Status::Invalid("NAN found in tdigest");
- }
- if (centroid.mean < prev_mean) {
- return Status::Invalid("centroid mean decreases");
- }
- if (centroid.weight < 1) {
- return Status::Invalid("invalid centroid weight");
- }
- prev_mean = centroid.mean;
- total_weight += centroid.weight;
- }
- if (total_weight != total_weight_) {
- return Status::Invalid("tdigest total weight mismatch");
- }
- // check if buffer expanded
- if (tdigests_[0].capacity() > delta_ || tdigests_[1].capacity() > delta_) {
- return Status::Invalid("oversized tdigest buffer");
- }
- // check k-size
- return merger_.Validate(tdigests_[current_], total_weight_);
- }
-
- void Dump() const {
- const auto& td = tdigests_[current_];
- for (size_t i = 0; i < td.size(); ++i) {
- std::cerr << i << ": mean = " << td[i].mean << ", weight = " << td[i].weight
- << std::endl;
- }
- std::cerr << "min = " << min_ << ", max = " << max_ << std::endl;
- }
-
- // merge with other tdigests
- void Merge(const std::vector<const TDigestImpl*>& tdigest_impls) {
- // current and end iterator
- using CentroidIter = std::vector<Centroid>::const_iterator;
- using CentroidIterPair = std::pair<CentroidIter, CentroidIter>;
- // use a min-heap to find next minimal centroid from all tdigests
- auto centroid_gt = [](const CentroidIterPair& lhs, const CentroidIterPair& rhs) {
- return lhs.first->mean > rhs.first->mean;
- };
- using CentroidQueue =
- std::priority_queue<CentroidIterPair, std::vector<CentroidIterPair>,
- decltype(centroid_gt)>;
-
- // trivial dynamic memory allocated at runtime
- std::vector<CentroidIterPair> queue_buffer;
- queue_buffer.reserve(tdigest_impls.size() + 1);
- CentroidQueue queue(std::move(centroid_gt), std::move(queue_buffer));
-
- const auto& this_tdigest = tdigests_[current_];
- if (this_tdigest.size() > 0) {
- queue.emplace(this_tdigest.cbegin(), this_tdigest.cend());
- }
- for (const TDigestImpl* td : tdigest_impls) {
- const auto& other_tdigest = td->tdigests_[td->current_];
- if (other_tdigest.size() > 0) {
- queue.emplace(other_tdigest.cbegin(), other_tdigest.cend());
- total_weight_ += td->total_weight_;
- min_ = std::min(min_, td->min_);
- max_ = std::max(max_, td->max_);
- }
- }
-
- merger_.Reset(total_weight_, &tdigests_[1 - current_]);
- CentroidIter current_iter, end_iter;
- // do k-way merge till one buffer left
- while (queue.size() > 1) {
- std::tie(current_iter, end_iter) = queue.top();
- merger_.Add(*current_iter);
- queue.pop();
- if (++current_iter != end_iter) {
- queue.emplace(current_iter, end_iter);
- }
- }
- // merge last buffer
- if (!queue.empty()) {
- std::tie(current_iter, end_iter) = queue.top();
- while (current_iter != end_iter) {
- merger_.Add(*current_iter++);
- }
- }
- merger_.Reset(0, nullptr);
-
- current_ = 1 - current_;
- }
-
- // merge input data with current tdigest
- void MergeInput(std::vector<double>& input) {
- total_weight_ += input.size();
-
- std::sort(input.begin(), input.end());
- min_ = std::min(min_, input.front());
- max_ = std::max(max_, input.back());
-
- // pick next minimal centroid from input and tdigest, feed to merger
- merger_.Reset(total_weight_, &tdigests_[1 - current_]);
- const auto& td = tdigests_[current_];
- uint32_t tdigest_index = 0, input_index = 0;
- while (tdigest_index < td.size() && input_index < input.size()) {
- if (td[tdigest_index].mean < input[input_index]) {
- merger_.Add(td[tdigest_index++]);
- } else {
- merger_.Add(Centroid{input[input_index++], 1});
- }
- }
- while (tdigest_index < td.size()) {
- merger_.Add(td[tdigest_index++]);
- }
- while (input_index < input.size()) {
- merger_.Add(Centroid{input[input_index++], 1});
- }
- merger_.Reset(0, nullptr);
-
- input.resize(0);
- current_ = 1 - current_;
- }
-
- double Quantile(double q) const {
- const auto& td = tdigests_[current_];
-
- if (q < 0 || q > 1 || td.size() == 0) {
- return NAN;
- }
-
- const double index = q * total_weight_;
- if (index <= 1) {
- return min_;
- } else if (index >= total_weight_ - 1) {
- return max_;
- }
-
-    // find the centroid that contains the index
- uint32_t ci = 0;
- double weight_sum = 0;
- for (; ci < td.size(); ++ci) {
- weight_sum += td[ci].weight;
- if (index <= weight_sum) {
- break;
- }
- }
- DCHECK_LT(ci, td.size());
-
- // deviation of index from the centroid center
- double diff = index + td[ci].weight / 2 - weight_sum;
-
-    // index happens to be in a unit-weight centroid
- if (td[ci].weight == 1 && std::abs(diff) < 0.5) {
- return td[ci].mean;
- }
-
- // find adjacent centroids for interpolation
- uint32_t ci_left = ci, ci_right = ci;
- if (diff > 0) {
- if (ci_right == td.size() - 1) {
- // index larger than center of last bin
- DCHECK_EQ(weight_sum, total_weight_);
- const Centroid* c = &td[ci_right];
- DCHECK_GE(c->weight, 2);
- return Lerp(c->mean, max_, diff / (c->weight / 2));
- }
- ++ci_right;
- } else {
- if (ci_left == 0) {
- // index smaller than center of first bin
- const Centroid* c = &td[0];
- DCHECK_GE(c->weight, 2);
- return Lerp(min_, c->mean, index / (c->weight / 2));
- }
- --ci_left;
- diff += td[ci_left].weight / 2 + td[ci_right].weight / 2;
- }
-
- // interpolate from adjacent centroids
- diff /= (td[ci_left].weight / 2 + td[ci_right].weight / 2);
- return Lerp(td[ci_left].mean, td[ci_right].mean, diff);
- }
-
- double Mean() const {
- double sum = 0;
- for (const auto& centroid : tdigests_[current_]) {
- sum += centroid.mean * centroid.weight;
- }
- return total_weight_ == 0 ? NAN : sum / total_weight_;
- }
-
- double total_weight() const { return total_weight_; }
-
- private:
-  // must be declared before merger_, see constructor initialization list
- const uint32_t delta_;
-
- TDigestMerger<> merger_;
- double total_weight_;
- double min_, max_;
-
- // ping-pong buffer holds two tdigests, size = 2 * delta * sizeof(Centroid)
- std::vector<Centroid> tdigests_[2];
- // index of active tdigest buffer, 0 or 1
- int current_;
-};
-
-TDigest::TDigest(uint32_t delta, uint32_t buffer_size) : impl_(new TDigestImpl(delta)) {
- input_.reserve(buffer_size);
- Reset();
-}
-
-TDigest::~TDigest() = default;
-TDigest::TDigest(TDigest&&) = default;
-TDigest& TDigest::operator=(TDigest&&) = default;
-
-void TDigest::Reset() {
- input_.resize(0);
- impl_->Reset();
-}
-
-Status TDigest::Validate() {
- MergeInput();
- return impl_->Validate();
-}
-
-void TDigest::Dump() {
- MergeInput();
- impl_->Dump();
-}
-
-void TDigest::Merge(std::vector<TDigest>* tdigests) {
- MergeInput();
-
- std::vector<const TDigestImpl*> tdigest_impls;
- tdigest_impls.reserve(tdigests->size());
- for (auto& td : *tdigests) {
- td.MergeInput();
- tdigest_impls.push_back(td.impl_.get());
- }
- impl_->Merge(tdigest_impls);
-}
-
-double TDigest::Quantile(double q) {
- MergeInput();
- return impl_->Quantile(q);
-}
-
-double TDigest::Mean() {
- MergeInput();
- return impl_->Mean();
-}
-
-bool TDigest::is_empty() const {
- return input_.size() == 0 && impl_->total_weight() == 0;
-}
-
-void TDigest::MergeInput() {
- if (input_.size() > 0) {
- impl_->MergeInput(input_); // will mutate input_
- }
-}
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/tdigest.h"
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <queue>
+#include <tuple>
+#include <vector>
+
+#include "arrow/status.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+// a numerically stable lerp is unbelievably complex
+// but we are *approximating* the quantile, so let's keep it simple
+double Lerp(double a, double b, double t) { return a + t * (b - a); }
+
+// histogram bin
+struct Centroid {
+ double mean;
+ double weight; // # data points in this bin
+
+ // merge with another centroid
+ void Merge(const Centroid& centroid) {
+ weight += centroid.weight;
+ mean += (centroid.mean - mean) * centroid.weight / weight;
+ }
+};
+
+// scale function K0: linear function, as baseline
+struct ScalerK0 {
+ explicit ScalerK0(uint32_t delta) : delta_norm(delta / 2.0) {}
+
+ double K(double q) const { return delta_norm * q; }
+ double Q(double k) const { return k / delta_norm; }
+
+ const double delta_norm;
+};
+
+// scale function K1
+struct ScalerK1 {
+ explicit ScalerK1(uint32_t delta) : delta_norm(delta / (2.0 * M_PI)) {}
+
+ double K(double q) const { return delta_norm * std::asin(2 * q - 1); }
+ double Q(double k) const { return (std::sin(k / delta_norm) + 1) / 2; }
+
+ const double delta_norm;
+};
+
+// implements t-digest merging algorithm
+template <class T = ScalerK1>
+class TDigestMerger : private T {
+ public:
+ explicit TDigestMerger(uint32_t delta) : T(delta) { Reset(0, nullptr); }
+
+ void Reset(double total_weight, std::vector<Centroid>* tdigest) {
+ total_weight_ = total_weight;
+ tdigest_ = tdigest;
+ if (tdigest_) {
+ tdigest_->resize(0);
+ }
+ weight_so_far_ = 0;
+ weight_limit_ = -1; // trigger first centroid merge
+ }
+
+ // merge one centroid from a sorted centroid stream
+ void Add(const Centroid& centroid) {
+ auto& td = *tdigest_;
+ const double weight = weight_so_far_ + centroid.weight;
+ if (weight <= weight_limit_) {
+ td.back().Merge(centroid);
+ } else {
+ const double quantile = weight_so_far_ / total_weight_;
+ const double next_weight_limit = total_weight_ * this->Q(this->K(quantile) + 1);
+ // weight limit should be strictly increasing, until the last centroid
+ if (next_weight_limit <= weight_limit_) {
+ weight_limit_ = total_weight_;
+ } else {
+ weight_limit_ = next_weight_limit;
+ }
+ td.push_back(centroid); // should never exceed capacity and trigger reallocation
+ }
+ weight_so_far_ = weight;
+ }
+
+ // validate k-size of a tdigest
+ Status Validate(const std::vector<Centroid>& tdigest, double total_weight) const {
+ double q_prev = 0, k_prev = this->K(0);
+ for (size_t i = 0; i < tdigest.size(); ++i) {
+ const double q = q_prev + tdigest[i].weight / total_weight;
+ const double k = this->K(q);
+ if (tdigest[i].weight != 1 && (k - k_prev) > 1.001) {
+ return Status::Invalid("oversized centroid: ", k - k_prev);
+ }
+ k_prev = k;
+ q_prev = q;
+ }
+ return Status::OK();
+ }
+
+ private:
+ double total_weight_; // total weight of this tdigest
+ double weight_so_far_; // accumulated weight till current bin
+ double weight_limit_; // max accumulated weight to move to next bin
+ std::vector<Centroid>* tdigest_;
+};
+
+} // namespace
+
+class TDigest::TDigestImpl {
+ public:
+ explicit TDigestImpl(uint32_t delta)
+ : delta_(delta > 10 ? delta : 10), merger_(delta_) {
+ tdigests_[0].reserve(delta_);
+ tdigests_[1].reserve(delta_);
+ Reset();
+ }
+
+ void Reset() {
+ tdigests_[0].resize(0);
+ tdigests_[1].resize(0);
+ current_ = 0;
+ total_weight_ = 0;
+ min_ = std::numeric_limits<double>::max();
+ max_ = std::numeric_limits<double>::lowest();
+ merger_.Reset(0, nullptr);
+ }
+
+ Status Validate() const {
+ // check weight, centroid order
+ double total_weight = 0, prev_mean = std::numeric_limits<double>::lowest();
+ for (const auto& centroid : tdigests_[current_]) {
+ if (std::isnan(centroid.mean) || std::isnan(centroid.weight)) {
+ return Status::Invalid("NAN found in tdigest");
+ }
+ if (centroid.mean < prev_mean) {
+ return Status::Invalid("centroid mean decreases");
+ }
+ if (centroid.weight < 1) {
+ return Status::Invalid("invalid centroid weight");
+ }
+ prev_mean = centroid.mean;
+ total_weight += centroid.weight;
+ }
+ if (total_weight != total_weight_) {
+ return Status::Invalid("tdigest total weight mismatch");
+ }
+ // check if buffer expanded
+ if (tdigests_[0].capacity() > delta_ || tdigests_[1].capacity() > delta_) {
+ return Status::Invalid("oversized tdigest buffer");
+ }
+ // check k-size
+ return merger_.Validate(tdigests_[current_], total_weight_);
+ }
+
+ void Dump() const {
+ const auto& td = tdigests_[current_];
+ for (size_t i = 0; i < td.size(); ++i) {
+ std::cerr << i << ": mean = " << td[i].mean << ", weight = " << td[i].weight
+ << std::endl;
+ }
+ std::cerr << "min = " << min_ << ", max = " << max_ << std::endl;
+ }
+
+ // merge with other tdigests
+ void Merge(const std::vector<const TDigestImpl*>& tdigest_impls) {
+ // current and end iterator
+ using CentroidIter = std::vector<Centroid>::const_iterator;
+ using CentroidIterPair = std::pair<CentroidIter, CentroidIter>;
+ // use a min-heap to find next minimal centroid from all tdigests
+ auto centroid_gt = [](const CentroidIterPair& lhs, const CentroidIterPair& rhs) {
+ return lhs.first->mean > rhs.first->mean;
+ };
+ using CentroidQueue =
+ std::priority_queue<CentroidIterPair, std::vector<CentroidIterPair>,
+ decltype(centroid_gt)>;
+
+ // trivial dynamic memory allocated at runtime
+ std::vector<CentroidIterPair> queue_buffer;
+ queue_buffer.reserve(tdigest_impls.size() + 1);
+ CentroidQueue queue(std::move(centroid_gt), std::move(queue_buffer));
+
+ const auto& this_tdigest = tdigests_[current_];
+ if (this_tdigest.size() > 0) {
+ queue.emplace(this_tdigest.cbegin(), this_tdigest.cend());
+ }
+ for (const TDigestImpl* td : tdigest_impls) {
+ const auto& other_tdigest = td->tdigests_[td->current_];
+ if (other_tdigest.size() > 0) {
+ queue.emplace(other_tdigest.cbegin(), other_tdigest.cend());
+ total_weight_ += td->total_weight_;
+ min_ = std::min(min_, td->min_);
+ max_ = std::max(max_, td->max_);
+ }
+ }
+
+ merger_.Reset(total_weight_, &tdigests_[1 - current_]);
+ CentroidIter current_iter, end_iter;
+ // do k-way merge till one buffer left
+ while (queue.size() > 1) {
+ std::tie(current_iter, end_iter) = queue.top();
+ merger_.Add(*current_iter);
+ queue.pop();
+ if (++current_iter != end_iter) {
+ queue.emplace(current_iter, end_iter);
+ }
+ }
+ // merge last buffer
+ if (!queue.empty()) {
+ std::tie(current_iter, end_iter) = queue.top();
+ while (current_iter != end_iter) {
+ merger_.Add(*current_iter++);
+ }
+ }
+ merger_.Reset(0, nullptr);
+
+ current_ = 1 - current_;
+ }
+
+ // merge input data with current tdigest
+ void MergeInput(std::vector<double>& input) {
+ total_weight_ += input.size();
+
+ std::sort(input.begin(), input.end());
+ min_ = std::min(min_, input.front());
+ max_ = std::max(max_, input.back());
+
+ // pick next minimal centroid from input and tdigest, feed to merger
+ merger_.Reset(total_weight_, &tdigests_[1 - current_]);
+ const auto& td = tdigests_[current_];
+ uint32_t tdigest_index = 0, input_index = 0;
+ while (tdigest_index < td.size() && input_index < input.size()) {
+ if (td[tdigest_index].mean < input[input_index]) {
+ merger_.Add(td[tdigest_index++]);
+ } else {
+ merger_.Add(Centroid{input[input_index++], 1});
+ }
+ }
+ while (tdigest_index < td.size()) {
+ merger_.Add(td[tdigest_index++]);
+ }
+ while (input_index < input.size()) {
+ merger_.Add(Centroid{input[input_index++], 1});
+ }
+ merger_.Reset(0, nullptr);
+
+ input.resize(0);
+ current_ = 1 - current_;
+ }
+
+ double Quantile(double q) const {
+ const auto& td = tdigests_[current_];
+
+ if (q < 0 || q > 1 || td.size() == 0) {
+ return NAN;
+ }
+
+ const double index = q * total_weight_;
+ if (index <= 1) {
+ return min_;
+ } else if (index >= total_weight_ - 1) {
+ return max_;
+ }
+
+    // find the centroid that contains the index
+ uint32_t ci = 0;
+ double weight_sum = 0;
+ for (; ci < td.size(); ++ci) {
+ weight_sum += td[ci].weight;
+ if (index <= weight_sum) {
+ break;
+ }
+ }
+ DCHECK_LT(ci, td.size());
+
+ // deviation of index from the centroid center
+ double diff = index + td[ci].weight / 2 - weight_sum;
+
+    // index happens to be in a unit-weight centroid
+ if (td[ci].weight == 1 && std::abs(diff) < 0.5) {
+ return td[ci].mean;
+ }
+
+ // find adjacent centroids for interpolation
+ uint32_t ci_left = ci, ci_right = ci;
+ if (diff > 0) {
+ if (ci_right == td.size() - 1) {
+ // index larger than center of last bin
+ DCHECK_EQ(weight_sum, total_weight_);
+ const Centroid* c = &td[ci_right];
+ DCHECK_GE(c->weight, 2);
+ return Lerp(c->mean, max_, diff / (c->weight / 2));
+ }
+ ++ci_right;
+ } else {
+ if (ci_left == 0) {
+ // index smaller than center of first bin
+ const Centroid* c = &td[0];
+ DCHECK_GE(c->weight, 2);
+ return Lerp(min_, c->mean, index / (c->weight / 2));
+ }
+ --ci_left;
+ diff += td[ci_left].weight / 2 + td[ci_right].weight / 2;
+ }
+
+ // interpolate from adjacent centroids
+ diff /= (td[ci_left].weight / 2 + td[ci_right].weight / 2);
+ return Lerp(td[ci_left].mean, td[ci_right].mean, diff);
+ }
+
+ double Mean() const {
+ double sum = 0;
+ for (const auto& centroid : tdigests_[current_]) {
+ sum += centroid.mean * centroid.weight;
+ }
+ return total_weight_ == 0 ? NAN : sum / total_weight_;
+ }
+
+ double total_weight() const { return total_weight_; }
+
+ private:
+  // must be declared before merger_, see constructor initialization list
+ const uint32_t delta_;
+
+ TDigestMerger<> merger_;
+ double total_weight_;
+ double min_, max_;
+
+ // ping-pong buffer holds two tdigests, size = 2 * delta * sizeof(Centroid)
+ std::vector<Centroid> tdigests_[2];
+ // index of active tdigest buffer, 0 or 1
+ int current_;
+};
+
+TDigest::TDigest(uint32_t delta, uint32_t buffer_size) : impl_(new TDigestImpl(delta)) {
+ input_.reserve(buffer_size);
+ Reset();
+}
+
+TDigest::~TDigest() = default;
+TDigest::TDigest(TDigest&&) = default;
+TDigest& TDigest::operator=(TDigest&&) = default;
+
+void TDigest::Reset() {
+ input_.resize(0);
+ impl_->Reset();
+}
+
+Status TDigest::Validate() {
+ MergeInput();
+ return impl_->Validate();
+}
+
+void TDigest::Dump() {
+ MergeInput();
+ impl_->Dump();
+}
+
+void TDigest::Merge(std::vector<TDigest>* tdigests) {
+ MergeInput();
+
+ std::vector<const TDigestImpl*> tdigest_impls;
+ tdigest_impls.reserve(tdigests->size());
+ for (auto& td : *tdigests) {
+ td.MergeInput();
+ tdigest_impls.push_back(td.impl_.get());
+ }
+ impl_->Merge(tdigest_impls);
+}
+
+double TDigest::Quantile(double q) {
+ MergeInput();
+ return impl_->Quantile(q);
+}
+
+double TDigest::Mean() {
+ MergeInput();
+ return impl_->Mean();
+}
+
+bool TDigest::is_empty() const {
+ return input_.size() == 0 && impl_->total_weight() == 0;
+}
+
+void TDigest::MergeInput() {
+ if (input_.size() > 0) {
+ impl_->MergeInput(input_); // will mutate input_
+ }
+}
+
+} // namespace internal
+} // namespace arrow
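As a standalone illustration of the merge criterion above (not part of the diff): ScalerK1 maps a quantile q to k(q) = delta_norm * asin(2q - 1) with delta_norm = delta / (2*pi), and Add() caps each centroid once the accumulated weight reaches total_weight * Q(K(q) + 1). Bins therefore stay narrow near the extreme quantiles and grow wide near the median. A self-contained sketch, assuming delta = 100 and a total weight of 10000:

#include <cmath>
#include <cstdio>

int main() {
  const double pi = std::acos(-1.0);
  const double delta_norm = 100 / (2.0 * pi);  // ScalerK1 with delta = 100
  auto K = [&](double q) { return delta_norm * std::asin(2 * q - 1); };
  auto Q = [&](double k) { return (std::sin(k / delta_norm) + 1) / 2; };

  const double total_weight = 10000;
  double weight_so_far = 0;
  for (int bin = 0; bin < 5; ++bin) {
    const double q = weight_so_far / total_weight;
    const double limit = total_weight * Q(K(q) + 1);
    std::printf("bin %d is capped at accumulated weight %.1f\n", bin, limit);
    weight_so_far = limit;  // the next bin starts where this one was capped
  }
  return 0;
}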
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
index ae42ce48e7d..361d176bff4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
@@ -1,103 +1,103 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// approximate quantiles from an arbitrary-length dataset with O(1) space
-// based on 'Computing Extremely Accurate Quantiles Using t-Digests' by Dunning & Ertl
-// - https://arxiv.org/abs/1902.04023
-// - https://github.com/tdunning/t-digest
-
-#pragma once
-
-#include <cmath>
-#include <memory>
-#include <vector>
-
-#include "arrow/util/logging.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Status;
-
-namespace internal {
-
-class ARROW_EXPORT TDigest {
- public:
- explicit TDigest(uint32_t delta = 100, uint32_t buffer_size = 500);
- ~TDigest();
- TDigest(TDigest&&);
- TDigest& operator=(TDigest&&);
-
- // reset and re-use this tdigest
- void Reset();
-
- // validate data integrity
- Status Validate();
-
- // dump internal data, only for debug
- void Dump();
-
- // buffer a single data point, consume internal buffer if full
-  // this function is called intensively and is performance-critical
-  // call it only if you are sure no NAN exists in the input data
- void Add(double value) {
- DCHECK(!std::isnan(value)) << "cannot add NAN";
- if (ARROW_PREDICT_FALSE(input_.size() == input_.capacity())) {
- MergeInput();
- }
- input_.push_back(value);
- }
-
- // skip NAN on adding
- template <typename T>
- typename std::enable_if<std::is_floating_point<T>::value>::type NanAdd(T value) {
- if (!std::isnan(value)) Add(value);
- }
-
- template <typename T>
- typename std::enable_if<std::is_integral<T>::value>::type NanAdd(T value) {
- Add(static_cast<double>(value));
- }
-
- // merge with other t-digests, called infrequently
- void Merge(std::vector<TDigest>* tdigests);
-
- // calculate quantile
- double Quantile(double q);
-
- double Min() { return Quantile(0); }
- double Max() { return Quantile(1); }
- double Mean();
-
- // check if this tdigest contains no valid data points
- bool is_empty() const;
-
- private:
- // merge input data with current tdigest
- void MergeInput();
-
- // input buffer, size = buffer_size * sizeof(double)
- std::vector<double> input_;
-
- // hide other members with pimpl
- class TDigestImpl;
- std::unique_ptr<TDigestImpl> impl_;
-};
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// approximate quantiles from an arbitrary-length dataset with O(1) space
+// based on 'Computing Extremely Accurate Quantiles Using t-Digests' by Dunning & Ertl
+// - https://arxiv.org/abs/1902.04023
+// - https://github.com/tdunning/t-digest
+
+#pragma once
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Status;
+
+namespace internal {
+
+class ARROW_EXPORT TDigest {
+ public:
+ explicit TDigest(uint32_t delta = 100, uint32_t buffer_size = 500);
+ ~TDigest();
+ TDigest(TDigest&&);
+ TDigest& operator=(TDigest&&);
+
+ // reset and re-use this tdigest
+ void Reset();
+
+ // validate data integrity
+ Status Validate();
+
+ // dump internal data, only for debug
+ void Dump();
+
+ // buffer a single data point, consume internal buffer if full
+  // this function is called intensively and is performance-critical
+  // call it only if you are sure no NAN exists in the input data
+ void Add(double value) {
+ DCHECK(!std::isnan(value)) << "cannot add NAN";
+ if (ARROW_PREDICT_FALSE(input_.size() == input_.capacity())) {
+ MergeInput();
+ }
+ input_.push_back(value);
+ }
+
+ // skip NAN on adding
+ template <typename T>
+ typename std::enable_if<std::is_floating_point<T>::value>::type NanAdd(T value) {
+ if (!std::isnan(value)) Add(value);
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_integral<T>::value>::type NanAdd(T value) {
+ Add(static_cast<double>(value));
+ }
+
+ // merge with other t-digests, called infrequently
+ void Merge(std::vector<TDigest>* tdigests);
+
+ // calculate quantile
+ double Quantile(double q);
+
+ double Min() { return Quantile(0); }
+ double Max() { return Quantile(1); }
+ double Mean();
+
+ // check if this tdigest contains no valid data points
+ bool is_empty() const;
+
+ private:
+ // merge input data with current tdigest
+ void MergeInput();
+
+ // input buffer, size = buffer_size * sizeof(double)
+ std::vector<double> input_;
+
+ // hide other members with pimpl
+ class TDigestImpl;
+ std::unique_ptr<TDigestImpl> impl_;
+};
+
+} // namespace internal
+} // namespace arrow
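A hypothetical end-to-end use of the class above (not part of the diff), exercising only the API declared in this header; the exact answers for this input would be about 50000 and 99000:

#include <cstdio>

#include "arrow/util/tdigest.h"

int main() {
  arrow::internal::TDigest td;  // delta = 100, buffer_size = 500
  for (int i = 1; i <= 100000; ++i) {
    td.Add(static_cast<double>(i));  // caller guarantees no NAN
  }
  // Quantile() flushes the internal input buffer before answering.
  std::printf("p50 ~ %.1f  p99 ~ %.1f\n", td.Quantile(0.5), td.Quantile(0.99));
  return 0;
}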
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
index 758295d01ed..ee480d0dec9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
@@ -32,88 +32,88 @@
namespace arrow {
namespace internal {
-Executor::~Executor() = default;
-
-namespace {
-
-struct Task {
- FnOnce<void()> callable;
- StopToken stop_token;
- Executor::StopCallback stop_callback;
-};
-
-} // namespace
-
-struct SerialExecutor::State {
- std::deque<Task> task_queue;
- std::mutex mutex;
- std::condition_variable wait_for_tasks;
- bool finished{false};
-};
-
-SerialExecutor::SerialExecutor() : state_(std::make_shared<State>()) {}
-
-SerialExecutor::~SerialExecutor() = default;
-
-Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce<void()> task,
- StopToken stop_token, StopCallback&& stop_callback) {
- // While the SerialExecutor runs tasks synchronously on its main thread,
- // SpawnReal may be called from external threads (e.g. when transferring back
- // from blocking I/O threads), so we need to keep the state alive *and* to
- // lock its contents.
- //
- // Note that holding the lock while notifying the condition variable may
- // not be sufficient, as some exit paths in the main thread are unlocked.
- auto state = state_;
- {
- std::lock_guard<std::mutex> lk(state->mutex);
- state->task_queue.push_back(
- Task{std::move(task), std::move(stop_token), std::move(stop_callback)});
- }
- state->wait_for_tasks.notify_one();
- return Status::OK();
-}
-
-void SerialExecutor::MarkFinished() {
- // Same comment as SpawnReal above
- auto state = state_;
- {
- std::lock_guard<std::mutex> lk(state->mutex);
- state->finished = true;
- }
- state->wait_for_tasks.notify_one();
-}
-
-void SerialExecutor::RunLoop() {
- // This is called from the SerialExecutor's main thread, so the
- // state is guaranteed to be kept alive.
- std::unique_lock<std::mutex> lk(state_->mutex);
-
- while (!state_->finished) {
- while (!state_->task_queue.empty()) {
- Task task = std::move(state_->task_queue.front());
- state_->task_queue.pop_front();
- lk.unlock();
- if (!task.stop_token.IsStopRequested()) {
- std::move(task.callable)();
- } else {
- if (task.stop_callback) {
- std::move(task.stop_callback)(task.stop_token.Poll());
- }
- // Can't break here because there may be cleanup tasks down the chain we still
- // need to run.
- }
- lk.lock();
- }
- // In this case we must be waiting on work from external (e.g. I/O) executors. Wait
- // for tasks to arrive (typically via transferred futures).
- state_->wait_for_tasks.wait(
- lk, [&] { return state_->finished || !state_->task_queue.empty(); });
- }
-}
-
+Executor::~Executor() = default;
+
+namespace {
+
+struct Task {
+ FnOnce<void()> callable;
+ StopToken stop_token;
+ Executor::StopCallback stop_callback;
+};
+
+} // namespace
+
+struct SerialExecutor::State {
+ std::deque<Task> task_queue;
+ std::mutex mutex;
+ std::condition_variable wait_for_tasks;
+ bool finished{false};
+};
+
+SerialExecutor::SerialExecutor() : state_(std::make_shared<State>()) {}
+
+SerialExecutor::~SerialExecutor() = default;
+
+Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce<void()> task,
+ StopToken stop_token, StopCallback&& stop_callback) {
+ // While the SerialExecutor runs tasks synchronously on its main thread,
+ // SpawnReal may be called from external threads (e.g. when transferring back
+ // from blocking I/O threads), so we need to keep the state alive *and* to
+ // lock its contents.
+ //
+ // Note that holding the lock while notifying the condition variable may
+ // not be sufficient, as some exit paths in the main thread are unlocked.
+ auto state = state_;
+ {
+ std::lock_guard<std::mutex> lk(state->mutex);
+ state->task_queue.push_back(
+ Task{std::move(task), std::move(stop_token), std::move(stop_callback)});
+ }
+ state->wait_for_tasks.notify_one();
+ return Status::OK();
+}
+
+void SerialExecutor::MarkFinished() {
+ // Same comment as SpawnReal above
+ auto state = state_;
+ {
+ std::lock_guard<std::mutex> lk(state->mutex);
+ state->finished = true;
+ }
+ state->wait_for_tasks.notify_one();
+}
+
+void SerialExecutor::RunLoop() {
+ // This is called from the SerialExecutor's main thread, so the
+ // state is guaranteed to be kept alive.
+ std::unique_lock<std::mutex> lk(state_->mutex);
+
+ while (!state_->finished) {
+ while (!state_->task_queue.empty()) {
+ Task task = std::move(state_->task_queue.front());
+ state_->task_queue.pop_front();
+ lk.unlock();
+ if (!task.stop_token.IsStopRequested()) {
+ std::move(task.callable)();
+ } else {
+ if (task.stop_callback) {
+ std::move(task.stop_callback)(task.stop_token.Poll());
+ }
+ // Can't break here because there may be cleanup tasks down the chain we still
+ // need to run.
+ }
+ lk.lock();
+ }
+ // In this case we must be waiting on work from external (e.g. I/O) executors. Wait
+ // for tasks to arrive (typically via transferred futures).
+ state_->wait_for_tasks.wait(
+ lk, [&] { return state_->finished || !state_->task_queue.empty(); });
+ }
+}
+
struct ThreadPool::State {
- State() = default;
+ State() = default;
// NOTE: in case locking becomes too expensive, we can investigate lock-free FIFOs
// such as https://github.com/cameron314/concurrentqueue
@@ -125,17 +125,17 @@ struct ThreadPool::State {
std::list<std::thread> workers_;
// Trashcan for finished threads
std::vector<std::thread> finished_workers_;
- std::deque<Task> pending_tasks_;
+ std::deque<Task> pending_tasks_;
// Desired number of threads
- int desired_capacity_ = 0;
-
- // Total number of tasks that are either queued or running
- int tasks_queued_or_running_ = 0;
-
+ int desired_capacity_ = 0;
+
+ // Total number of tasks that are either queued or running
+ int tasks_queued_or_running_ = 0;
+
// Are we shutting down?
- bool please_shutdown_ = false;
- bool quick_shutdown_ = false;
+ bool please_shutdown_ = false;
+ bool quick_shutdown_ = false;
};
// The worker loop is an independent function so that it can keep running
@@ -165,24 +165,24 @@ static void WorkerLoop(std::shared_ptr<ThreadPool::State> state,
if (should_secede()) {
break;
}
-
- DCHECK_GE(state->tasks_queued_or_running_, 0);
+
+ DCHECK_GE(state->tasks_queued_or_running_, 0);
{
- Task task = std::move(state->pending_tasks_.front());
+ Task task = std::move(state->pending_tasks_.front());
state->pending_tasks_.pop_front();
- StopToken* stop_token = &task.stop_token;
+ StopToken* stop_token = &task.stop_token;
lock.unlock();
- if (!stop_token->IsStopRequested()) {
- std::move(task.callable)();
- } else {
- if (task.stop_callback) {
- std::move(task.stop_callback)(stop_token->Poll());
- }
- }
- ARROW_UNUSED(std::move(task)); // release resources before waiting for lock
- lock.lock();
+ if (!stop_token->IsStopRequested()) {
+ std::move(task.callable)();
+ } else {
+ if (task.stop_callback) {
+ std::move(task.stop_callback)(stop_token->Poll());
+ }
+ }
+ ARROW_UNUSED(std::move(task)); // release resources before waiting for lock
+ lock.lock();
}
- state->tasks_queued_or_running_--;
+ state->tasks_queued_or_running_--;
}
// Now either the queue is empty *or* a quick shutdown was requested
if (state->please_shutdown_ || should_secede()) {
@@ -191,7 +191,7 @@ static void WorkerLoop(std::shared_ptr<ThreadPool::State> state,
// Wait for next wakeup
state->cv_.wait(lock);
}
- DCHECK_GE(state->tasks_queued_or_running_, 0);
+ DCHECK_GE(state->tasks_queued_or_running_, 0);
// We're done. Move our thread object to the trashcan of finished
// workers. This has two motivations:
@@ -262,14 +262,14 @@ Status ThreadPool::SetCapacity(int threads) {
CollectFinishedWorkersUnlocked();
state_->desired_capacity_ = threads;
- // See if we need to increase or decrease the number of running threads
- const int required = std::min(static_cast<int>(state_->pending_tasks_.size()),
- threads - static_cast<int>(state_->workers_.size()));
- if (required > 0) {
- // Some tasks are pending, spawn the number of needed threads immediately
- LaunchWorkersUnlocked(required);
- } else if (required < 0) {
- // Excess threads are running, wake them so that they stop
+ // See if we need to increase or decrease the number of running threads
+ const int required = std::min(static_cast<int>(state_->pending_tasks_.size()),
+ threads - static_cast<int>(state_->workers_.size()));
+ if (required > 0) {
+ // Some tasks are pending, spawn the number of needed threads immediately
+ LaunchWorkersUnlocked(required);
+ } else if (required < 0) {
+ // Excess threads are running, wake them so that they stop
state_->cv_.notify_all();
}
return Status::OK();
@@ -281,12 +281,12 @@ int ThreadPool::GetCapacity() {
return state_->desired_capacity_;
}
-int ThreadPool::GetNumTasks() {
- ProtectAgainstFork();
- std::unique_lock<std::mutex> lock(state_->mutex_);
- return state_->tasks_queued_or_running_;
-}
-
+int ThreadPool::GetNumTasks() {
+ ProtectAgainstFork();
+ std::unique_lock<std::mutex> lock(state_->mutex_);
+ return state_->tasks_queued_or_running_;
+}
+
int ThreadPool::GetActualCapacity() {
ProtectAgainstFork();
std::unique_lock<std::mutex> lock(state_->mutex_);
@@ -321,25 +321,25 @@ void ThreadPool::CollectFinishedWorkersUnlocked() {
state_->finished_workers_.clear();
}
-thread_local ThreadPool* current_thread_pool_ = nullptr;
-
-bool ThreadPool::OwnsThisThread() { return current_thread_pool_ == this; }
-
+thread_local ThreadPool* current_thread_pool_ = nullptr;
+
+bool ThreadPool::OwnsThisThread() { return current_thread_pool_ == this; }
+
void ThreadPool::LaunchWorkersUnlocked(int threads) {
std::shared_ptr<State> state = sp_state_;
for (int i = 0; i < threads; i++) {
state_->workers_.emplace_back();
auto it = --(state_->workers_.end());
- *it = std::thread([this, state, it] {
- current_thread_pool_ = this;
- WorkerLoop(state, it);
- });
+ *it = std::thread([this, state, it] {
+ current_thread_pool_ = this;
+ WorkerLoop(state, it);
+ });
}
}
-Status ThreadPool::SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken stop_token,
- StopCallback&& stop_callback) {
+Status ThreadPool::SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken stop_token,
+ StopCallback&& stop_callback) {
{
ProtectAgainstFork();
std::lock_guard<std::mutex> lock(state_->mutex_);
@@ -347,14 +347,14 @@ Status ThreadPool::SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken sto
return Status::Invalid("operation forbidden during or after shutdown");
}
CollectFinishedWorkersUnlocked();
- state_->tasks_queued_or_running_++;
- if (static_cast<int>(state_->workers_.size()) < state_->tasks_queued_or_running_ &&
- state_->desired_capacity_ > static_cast<int>(state_->workers_.size())) {
- // We can still spin up more workers so spin up a new worker
- LaunchWorkersUnlocked(/*threads=*/1);
- }
- state_->pending_tasks_.push_back(
- {std::move(task), std::move(stop_token), std::move(stop_callback)});
+ state_->tasks_queued_or_running_++;
+ if (static_cast<int>(state_->workers_.size()) < state_->tasks_queued_or_running_ &&
+ state_->desired_capacity_ > static_cast<int>(state_->workers_.size())) {
+ // We can still spin up more workers so spin up a new worker
+ LaunchWorkersUnlocked(/*threads=*/1);
+ }
+ state_->pending_tasks_.push_back(
+ {std::move(task), std::move(stop_token), std::move(stop_callback)});
}
state_->cv_.notify_one();
return Status::OK();
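For context, a hedged sketch of driving this pool through its public API (not part of the diff). ThreadPool::Make and Submit are declared in thread_pool.h below; Shutdown belongs to the same class in the full header:

#include <cstdio>

#include "arrow/util/thread_pool.h"

int main() {
  auto pool = arrow::internal::ThreadPool::Make(/*threads=*/4).ValueOrDie();
  // The callable runs on a worker; Submit returns Result<Future<int>> here.
  auto fut = pool->Submit([] { return 6 * 7; }).ValueOrDie();
  std::printf("answer = %d\n", fut.result().ValueOrDie());
  (void)pool->Shutdown();  // wait for queued tasks, then join the workers
  return 0;
}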
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
index 9ac8e36a3d8..5d866601ab1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
@@ -23,14 +23,14 @@
#include <cstdint>
#include <memory>
-#include <queue>
+#include <queue>
#include <type_traits>
#include <utility>
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/cancel.h"
-#include "arrow/util/functional.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
#include "arrow/util/future.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -76,229 +76,229 @@ struct TaskHints {
class ARROW_EXPORT Executor {
public:
- using StopCallback = internal::FnOnce<void(const Status&)>;
-
+ using StopCallback = internal::FnOnce<void(const Status&)>;
+
virtual ~Executor();
// Spawn a fire-and-forget task.
template <typename Function>
Status Spawn(Function&& func) {
- return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(),
- StopCallback{});
- }
- template <typename Function>
- Status Spawn(Function&& func, StopToken stop_token) {
- return SpawnReal(TaskHints{}, std::forward<Function>(func), std::move(stop_token),
- StopCallback{});
+ return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(),
+ StopCallback{});
}
template <typename Function>
+ Status Spawn(Function&& func, StopToken stop_token) {
+ return SpawnReal(TaskHints{}, std::forward<Function>(func), std::move(stop_token),
+ StopCallback{});
+ }
+ template <typename Function>
Status Spawn(TaskHints hints, Function&& func) {
- return SpawnReal(hints, std::forward<Function>(func), StopToken::Unstoppable(),
- StopCallback{});
- }
- template <typename Function>
- Status Spawn(TaskHints hints, Function&& func, StopToken stop_token) {
- return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
- StopCallback{});
- }
- template <typename Function>
- Status Spawn(TaskHints hints, Function&& func, StopToken stop_token,
- StopCallback stop_callback) {
- return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
- std::move(stop_callback));
- }
-
- // Transfers a future to this executor. Any continuations added to the
- // returned future will run in this executor. Otherwise they would run
- // on the same thread that called MarkFinished.
- //
- // This is necessary when (for example) an I/O task is completing a future.
- // The continuations of that future should run on the CPU thread pool keeping
- // CPU heavy work off the I/O thread pool. So the I/O task should transfer
- // the future to the CPU executor before returning.
- //
- // By default this method will only transfer if the future is not already completed. If
- // the future is already completed then any callback would be run synchronously and so
- // no transfer is typically necessary. However, in cases where you want to force a
- // transfer (e.g. to help the scheduler break up units of work across multiple cores)
- // then you can override this behavior with `always_transfer`.
- template <typename T>
- Future<T> Transfer(Future<T> future) {
- return DoTransfer(std::move(future), false);
+ return SpawnReal(hints, std::forward<Function>(func), StopToken::Unstoppable(),
+ StopCallback{});
}
-
- // Overload of Transfer which will always schedule callbacks on new threads even if the
- // future is finished when the callback is added.
- //
- // This can be useful in cases where you want to ensure parallelism
- template <typename T>
- Future<T> TransferAlways(Future<T> future) {
- return DoTransfer(std::move(future), true);
- }
-
+ template <typename Function>
+ Status Spawn(TaskHints hints, Function&& func, StopToken stop_token) {
+ return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(TaskHints hints, Function&& func, StopToken stop_token,
+ StopCallback stop_callback) {
+ return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+ std::move(stop_callback));
+ }
+
+ // Transfers a future to this executor. Any continuations added to the
+ // returned future will run in this executor. Otherwise they would run
+ // on the same thread that called MarkFinished.
+ //
+ // This is necessary when (for example) an I/O task is completing a future.
+ // The continuations of that future should run on the CPU thread pool keeping
+ // CPU heavy work off the I/O thread pool. So the I/O task should transfer
+ // the future to the CPU executor before returning.
+ //
+ // By default this method will only transfer if the future is not already completed. If
+ // the future is already completed then any callback would be run synchronously and so
+ // no transfer is typically necessary. However, in cases where you want to force a
+ // transfer (e.g. to help the scheduler break up units of work across multiple cores)
+ // then you can override this behavior with `always_transfer`.
+ template <typename T>
+ Future<T> Transfer(Future<T> future) {
+ return DoTransfer(std::move(future), false);
+ }
+
+ // Overload of Transfer which will always schedule callbacks on new threads even if the
+ // future is finished when the callback is added.
+ //
+ // This can be useful in cases where you want to ensure parallelism
+ template <typename T>
+ Future<T> TransferAlways(Future<T> future) {
+ return DoTransfer(std::move(future), true);
+ }
+
// Submit a callable and arguments for execution. Return a future that
// will return the callable's result value once.
// The callable's arguments are copied before execution.
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(TaskHints hints, StopToken stop_token, Function&& func,
- Args&&... args) {
- using ValueType = typename FutureType::ValueType;
-
- auto future = FutureType::Make();
- auto task = std::bind(::arrow::detail::ContinueFuture{}, future,
- std::forward<Function>(func), std::forward<Args>(args)...);
- struct {
- WeakFuture<ValueType> weak_fut;
-
- void operator()(const Status& st) {
- auto fut = weak_fut.get();
- if (fut.is_valid()) {
- fut.MarkFinished(st);
- }
- }
- } stop_callback{WeakFuture<ValueType>(future)};
- ARROW_RETURN_NOT_OK(SpawnReal(hints, std::move(task), std::move(stop_token),
- std::move(stop_callback)));
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(TaskHints hints, StopToken stop_token, Function&& func,
+ Args&&... args) {
+ using ValueType = typename FutureType::ValueType;
+
+ auto future = FutureType::Make();
+ auto task = std::bind(::arrow::detail::ContinueFuture{}, future,
+ std::forward<Function>(func), std::forward<Args>(args)...);
+ struct {
+ WeakFuture<ValueType> weak_fut;
+
+ void operator()(const Status& st) {
+ auto fut = weak_fut.get();
+ if (fut.is_valid()) {
+ fut.MarkFinished(st);
+ }
+ }
+ } stop_callback{WeakFuture<ValueType>(future)};
+ ARROW_RETURN_NOT_OK(SpawnReal(hints, std::move(task), std::move(stop_token),
+ std::move(stop_callback)));
return future;
}
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(StopToken stop_token, Function&& func, Args&&... args) {
- return Submit(TaskHints{}, stop_token, std::forward<Function>(func),
- std::forward<Args>(args)...);
- }
-
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(TaskHints hints, Function&& func, Args&&... args) {
- return Submit(std::move(hints), StopToken::Unstoppable(),
- std::forward<Function>(func), std::forward<Args>(args)...);
- }
-
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(Function&& func, Args&&... args) {
- return Submit(TaskHints{}, StopToken::Unstoppable(), std::forward<Function>(func),
- std::forward<Args>(args)...);
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(StopToken stop_token, Function&& func, Args&&... args) {
+ return Submit(TaskHints{}, stop_token, std::forward<Function>(func),
+ std::forward<Args>(args)...);
}
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(TaskHints hints, Function&& func, Args&&... args) {
+ return Submit(std::move(hints), StopToken::Unstoppable(),
+ std::forward<Function>(func), std::forward<Args>(args)...);
+ }
+
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(Function&& func, Args&&... args) {
+ return Submit(TaskHints{}, StopToken::Unstoppable(), std::forward<Function>(func),
+ std::forward<Args>(args)...);
+ }
+
// Return the level of parallelism (the number of tasks that may be executed
// concurrently). This may be an approximate number.
virtual int GetCapacity() = 0;
- // Return true if the thread from which this function is called is owned by this
- // Executor. Returns false if this Executor does not support this property.
- virtual bool OwnsThisThread() { return false; }
-
+ // Return true if the thread from which this function is called is owned by this
+ // Executor. Returns false if this Executor does not support this property.
+ virtual bool OwnsThisThread() { return false; }
+
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(Executor);
Executor() = default;
- template <typename T, typename FT = Future<T>, typename FTSync = typename FT::SyncType>
- Future<T> DoTransfer(Future<T> future, bool always_transfer = false) {
- auto transferred = Future<T>::Make();
- if (always_transfer) {
- CallbackOptions callback_options = CallbackOptions::Defaults();
- callback_options.should_schedule = ShouldSchedule::Always;
- callback_options.executor = this;
- auto sync_callback = [transferred](const FTSync& result) mutable {
- transferred.MarkFinished(result);
- };
- future.AddCallback(sync_callback, callback_options);
- return transferred;
- }
-
- // We could use AddCallback's ShouldSchedule::IfUnfinished but we can save a bit of
- // work by doing the test here.
- auto callback = [this, transferred](const FTSync& result) mutable {
- auto spawn_status =
- Spawn([transferred, result]() mutable { transferred.MarkFinished(result); });
- if (!spawn_status.ok()) {
- transferred.MarkFinished(spawn_status);
- }
- };
- auto callback_factory = [&callback]() { return callback; };
- if (future.TryAddCallback(callback_factory)) {
- return transferred;
- }
- // If the future is already finished and we aren't going to force spawn a thread
- // then we don't need to add another layer of callback and can return the original
- // future
- return future;
- }
-
+ template <typename T, typename FT = Future<T>, typename FTSync = typename FT::SyncType>
+ Future<T> DoTransfer(Future<T> future, bool always_transfer = false) {
+ auto transferred = Future<T>::Make();
+ if (always_transfer) {
+ CallbackOptions callback_options = CallbackOptions::Defaults();
+ callback_options.should_schedule = ShouldSchedule::Always;
+ callback_options.executor = this;
+ auto sync_callback = [transferred](const FTSync& result) mutable {
+ transferred.MarkFinished(result);
+ };
+ future.AddCallback(sync_callback, callback_options);
+ return transferred;
+ }
+
+ // We could use AddCallback's ShouldSchedule::IfUnfinished but we can save a bit of
+ // work by doing the test here.
+ auto callback = [this, transferred](const FTSync& result) mutable {
+ auto spawn_status =
+ Spawn([transferred, result]() mutable { transferred.MarkFinished(result); });
+ if (!spawn_status.ok()) {
+ transferred.MarkFinished(spawn_status);
+ }
+ };
+ auto callback_factory = [&callback]() { return callback; };
+ if (future.TryAddCallback(callback_factory)) {
+ return transferred;
+ }
+ // If the future is already finished and we aren't going to force spawn a thread
+ // then we don't need to add another layer of callback and can return the original
+ // future
+ return future;
+ }
+
// Subclassing API
- virtual Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
- StopCallback&&) = 0;
+ virtual Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) = 0;
};
-/// \brief An executor implementation that runs all tasks on a single thread using an
-/// event loop.
-///
-/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
-/// fine but if one task needs to wait for another task it must be expressed as an
-/// asynchronous continuation.
-class ARROW_EXPORT SerialExecutor : public Executor {
- public:
- template <typename T = ::arrow::internal::Empty>
- using TopLevelTask = internal::FnOnce<Future<T>(Executor*)>;
-
- ~SerialExecutor() override;
-
-  int GetCapacity() override { return 1; }
- Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
- StopCallback&&) override;
-
- /// \brief Runs the TopLevelTask and any scheduled tasks
- ///
- /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid
- /// status or call the finish signal. Failure to do this will result in a deadlock. For
- /// this reason it is preferable (if possible) to use the helper methods (below)
-  /// RunSynchronously/RunSerially, which delegate the responsibility to a Future
-  /// producer's existing obligation to always mark a future finished (which can
- /// someday be aided by ARROW-12207).
- template <typename T = internal::Empty, typename FT = Future<T>,
- typename FTSync = typename FT::SyncType>
- static FTSync RunInSerialExecutor(TopLevelTask<T> initial_task) {
- Future<T> fut = SerialExecutor().Run<T>(std::move(initial_task));
- return FutureToSync(fut);
- }
-
- private:
- SerialExecutor();
-
- // State uses mutex
- struct State;
- std::shared_ptr<State> state_;
-
- template <typename T, typename FTSync = typename Future<T>::SyncType>
- Future<T> Run(TopLevelTask<T> initial_task) {
- auto final_fut = std::move(initial_task)(this);
- if (final_fut.is_finished()) {
- return final_fut;
- }
- final_fut.AddCallback([this](const FTSync&) { MarkFinished(); });
- RunLoop();
- return final_fut;
- }
- void RunLoop();
- void MarkFinished();
-};
-
-/// An Executor implementation spawning tasks in FIFO manner on a fixed-size
-/// pool of worker threads.
-///
-/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
-/// fine but if one task needs to wait for another task it must be expressed as an
-/// asynchronous continuation.
+/// \brief An executor implementation that runs all tasks on a single thread using an
+/// event loop.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
+class ARROW_EXPORT SerialExecutor : public Executor {
+ public:
+ template <typename T = ::arrow::internal::Empty>
+ using TopLevelTask = internal::FnOnce<Future<T>(Executor*)>;
+
+ ~SerialExecutor() override;
+
+  int GetCapacity() override { return 1; }
+ Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) override;
+
+ /// \brief Runs the TopLevelTask and any scheduled tasks
+ ///
+ /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid
+ /// status or call the finish signal. Failure to do this will result in a deadlock. For
+ /// this reason it is preferable (if possible) to use the helper methods (below)
+  /// RunSynchronously/RunSerially, which delegate the responsibility to a Future
+  /// producer's existing obligation to always mark a future finished (which can
+ /// someday be aided by ARROW-12207).
+ template <typename T = internal::Empty, typename FT = Future<T>,
+ typename FTSync = typename FT::SyncType>
+ static FTSync RunInSerialExecutor(TopLevelTask<T> initial_task) {
+ Future<T> fut = SerialExecutor().Run<T>(std::move(initial_task));
+ return FutureToSync(fut);
+ }
+
+ private:
+ SerialExecutor();
+
+ // State uses mutex
+ struct State;
+ std::shared_ptr<State> state_;
+
+ template <typename T, typename FTSync = typename Future<T>::SyncType>
+ Future<T> Run(TopLevelTask<T> initial_task) {
+ auto final_fut = std::move(initial_task)(this);
+ if (final_fut.is_finished()) {
+ return final_fut;
+ }
+ final_fut.AddCallback([this](const FTSync&) { MarkFinished(); });
+ RunLoop();
+ return final_fut;
+ }
+ void RunLoop();
+ void MarkFinished();
+};
+
+/// An Executor implementation spawning tasks in FIFO manner on a fixed-size
+/// pool of worker threads.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
class ARROW_EXPORT ThreadPool : public Executor {
public:
// Construct a thread pool with the given number of worker threads
@@ -309,25 +309,25 @@ class ARROW_EXPORT ThreadPool : public Executor {
static Result<std::shared_ptr<ThreadPool>> MakeEternal(int threads);
// Destroy thread pool; the pool will first be shut down
- ~ThreadPool() override;
+ ~ThreadPool() override;
// Return the desired number of worker threads.
// The actual number of workers may lag a bit before being adjusted to
// match this value.
int GetCapacity() override;
- bool OwnsThisThread() override;
-
- // Return the number of tasks either running or in the queue.
- int GetNumTasks();
-
+ bool OwnsThisThread() override;
+
+ // Return the number of tasks either running or in the queue.
+ int GetNumTasks();
+
// Dynamically change the number of worker threads.
- //
- // This function always returns immediately.
- // If fewer threads are running than this number, new threads are spawned
- // on-demand when needed for task execution.
- // If more threads are running than this number, excess threads are reaped
- // as soon as possible.
+ //
+ // This function always returns immediately.
+ // If fewer threads are running than this number, new threads are spawned
+ // on-demand when needed for task execution.
+ // If more threads are running than this number, excess threads are reaped
+ // as soon as possible.
Status SetCapacity(int threads);
// Heuristic for the default capacity of a thread pool for CPU-bound tasks.
@@ -350,8 +350,8 @@ class ARROW_EXPORT ThreadPool : public Executor {
ThreadPool();
- Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
- StopCallback&&) override;
+ Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) override;
// Collect finished worker threads, making sure the OS threads have exited
void CollectFinishedWorkersUnlocked();
@@ -375,24 +375,24 @@ class ARROW_EXPORT ThreadPool : public Executor {
// Return the process-global thread pool for CPU-bound tasks.
ARROW_EXPORT ThreadPool* GetCpuThreadPool();
-/// \brief Potentially run an async operation serially (if use_threads is false)
-/// \see RunSerially
-///
-/// If `use_threads` is true, the global CPU executor is used.
-/// If `use_threads` is false, a temporary SerialExecutor is used.
-/// `get_future` is called (from this thread) with the chosen executor and must
-/// return a future that will eventually finish. This function returns once the
-/// future has finished.
-template <typename Fut, typename ValueType = typename Fut::ValueType>
-typename Fut::SyncType RunSynchronously(FnOnce<Fut(Executor*)> get_future,
- bool use_threads) {
- if (use_threads) {
- auto fut = std::move(get_future)(GetCpuThreadPool());
- return FutureToSync(fut);
- } else {
- return SerialExecutor::RunInSerialExecutor<ValueType>(std::move(get_future));
- }
-}
-
+/// \brief Potentially run an async operation serially (if use_threads is false)
+/// \see RunSerially
+///
+/// If `use_threads` is true, the global CPU executor is used.
+/// If `use_threads` is false, a temporary SerialExecutor is used.
+/// `get_future` is called (from this thread) with the chosen executor and must
+/// return a future that will eventually finish. This function returns once the
+/// future has finished.
+template <typename Fut, typename ValueType = typename Fut::ValueType>
+typename Fut::SyncType RunSynchronously(FnOnce<Fut(Executor*)> get_future,
+ bool use_threads) {
+ if (use_threads) {
+ auto fut = std::move(get_future)(GetCpuThreadPool());
+ return FutureToSync(fut);
+ } else {
+ return SerialExecutor::RunInSerialExecutor<ValueType>(std::move(get_future));
+ }
+}
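[Editorial note] A hedged usage sketch of RunSynchronously as documented above; the same future-producing callable runs on the global CPU pool or on a temporary SerialExecutor depending on the flag:

    // Hypothetical sketch (names as in this header): use_threads=false
    // routes through SerialExecutor::RunInSerialExecutor, use_threads=true
    // through GetCpuThreadPool().
    arrow::Status st = arrow::internal::RunSynchronously<arrow::Future<>>(
        [](arrow::internal::Executor*) {
          return arrow::Future<>::MakeFinished(arrow::Status::OK());
        },
        /*use_threads=*/false);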
+
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
index b250cca647d..ed73fdc6b04 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
@@ -116,7 +116,7 @@ std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
class ARROW_EXPORT Trie {
using index_type = int16_t;
using fast_index_type = int_fast16_t;
- static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
+ static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
public:
Trie() : size_(0) {}
@@ -126,9 +126,9 @@ class ARROW_EXPORT Trie {
int32_t Find(util::string_view s) const {
const Node* node = &nodes_[0];
fast_index_type pos = 0;
- if (s.length() > static_cast<size_t>(kMaxIndex)) {
- return -1;
- }
+ if (s.length() > static_cast<size_t>(kMaxIndex)) {
+ return -1;
+ }
fast_index_type remaining = static_cast<fast_index_type>(s.length());
while (remaining > 0) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
index ca107c2c69d..b3e69aa632f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
@@ -19,16 +19,16 @@
namespace arrow {
-namespace internal {
-struct Empty;
-} // namespace internal
-
-template <typename T = internal::Empty>
-class WeakFuture;
-class FutureWaiter;
-
-class TimestampParser;
-
+namespace internal {
+struct Empty;
+} // namespace internal
+
+template <typename T = internal::Empty>
+class WeakFuture;
+class FutureWaiter;
+
+class TimestampParser;
+
namespace internal {
class Executor;
@@ -36,27 +36,27 @@ class TaskGroup;
class ThreadPool;
} // namespace internal
-
-struct Compression {
- /// \brief Compression algorithm
- enum type {
- UNCOMPRESSED,
- SNAPPY,
- GZIP,
- BROTLI,
- ZSTD,
- LZ4,
- LZ4_FRAME,
- LZO,
- BZ2,
- LZ4_HADOOP
- };
-};
-
-namespace util {
-class Compressor;
-class Decompressor;
-class Codec;
-} // namespace util
-
+
+struct Compression {
+ /// \brief Compression algorithm
+ enum type {
+ UNCOMPRESSED,
+ SNAPPY,
+ GZIP,
+ BROTLI,
+ ZSTD,
+ LZ4,
+ LZ4_FRAME,
+ LZO,
+ BZ2,
+ LZ4_HADOOP
+ };
+};
+
+namespace util {
+class Compressor;
+class Decompressor;
+class Codec;
+} // namespace util
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
index 80cc6297e39..a8cfec5cc04 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
@@ -17,7 +17,7 @@
#pragma once
-#include <cstdint>
+#include <cstdint>
#include <type_traits>
namespace arrow {
@@ -42,45 +42,45 @@ template <typename T>
struct is_null_pointer : std::is_same<std::nullptr_t, typename std::remove_cv<T>::type> {
};
-#ifdef __GLIBCXX__
-
-// An aligned_union backport, because old libstdc++ versions don't include it.
-
-constexpr std::size_t max_size(std::size_t a, std::size_t b) { return (a > b) ? a : b; }
-
-template <typename...>
-struct max_size_traits;
-
-template <typename H, typename... T>
-struct max_size_traits<H, T...> {
- static constexpr std::size_t max_sizeof() {
- return max_size(sizeof(H), max_size_traits<T...>::max_sizeof());
- }
- static constexpr std::size_t max_alignof() {
- return max_size(alignof(H), max_size_traits<T...>::max_alignof());
- }
-};
-
-template <>
-struct max_size_traits<> {
- static constexpr std::size_t max_sizeof() { return 0; }
- static constexpr std::size_t max_alignof() { return 0; }
-};
-
-template <std::size_t Len, typename... T>
-struct aligned_union {
- static constexpr std::size_t alignment_value = max_size_traits<T...>::max_alignof();
- static constexpr std::size_t size_value =
- max_size(Len, max_size_traits<T...>::max_sizeof());
- using type = typename std::aligned_storage<size_value, alignment_value>::type;
-};
-
-#else
-
-template <std::size_t Len, typename... T>
-using aligned_union = std::aligned_union<Len, T...>;
-
-#endif
-
+#ifdef __GLIBCXX__
+
+// An aligned_union backport, because old libstdc++ versions don't include it.
+
+constexpr std::size_t max_size(std::size_t a, std::size_t b) { return (a > b) ? a : b; }
+
+template <typename...>
+struct max_size_traits;
+
+template <typename H, typename... T>
+struct max_size_traits<H, T...> {
+ static constexpr std::size_t max_sizeof() {
+ return max_size(sizeof(H), max_size_traits<T...>::max_sizeof());
+ }
+ static constexpr std::size_t max_alignof() {
+ return max_size(alignof(H), max_size_traits<T...>::max_alignof());
+ }
+};
+
+template <>
+struct max_size_traits<> {
+ static constexpr std::size_t max_sizeof() { return 0; }
+ static constexpr std::size_t max_alignof() { return 0; }
+};
+
+template <std::size_t Len, typename... T>
+struct aligned_union {
+ static constexpr std::size_t alignment_value = max_size_traits<T...>::max_alignof();
+ static constexpr std::size_t size_value =
+ max_size(Len, max_size_traits<T...>::max_sizeof());
+ using type = typename std::aligned_storage<size_value, alignment_value>::type;
+};
+
+#else
+
+template <std::size_t Len, typename... T>
+using aligned_union = std::aligned_union<Len, T...>;
+
+#endif
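[Editorial note] A small sketch of what the backport provides, assuming only this header; the type names are illustrative:

    // Hypothetical sketch: storage large enough and suitably aligned for any
    // of the listed types, matching std::aligned_union on other toolchains.
    using Storage = arrow::internal::aligned_union<0, int32_t, double>::type;
    static_assert(sizeof(Storage) >= sizeof(double), "fits the largest member");
    static_assert(alignof(Storage) >= alignof(double), "aligned for each member");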
+
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
index c19a7bc2eee..f644f73fd8e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
@@ -23,7 +23,7 @@
#include "arrow/util/string_view.h"
#include "arrow/util/value_parsing.h"
-#include "contrib/restricted/uriparser/include/uriparser/Uri.h"
+#include "contrib/restricted/uriparser/include/uriparser/Uri.h"
namespace arrow {
namespace internal {
@@ -71,28 +71,28 @@ std::string UriEscape(const std::string& s) {
return escaped;
}
-std::string UriUnescape(const util::string_view s) {
- std::string result(s);
- if (!result.empty()) {
- auto end = uriUnescapeInPlaceA(&result[0]);
- result.resize(end - &result[0]);
- }
- return result;
-}
-
-std::string UriEncodeHost(const std::string& host) {
- // Fairly naive check: if it contains a ':', it's IPv6 and needs
- // brackets, else it's OK
- if (host.find(":") != std::string::npos) {
- std::string result = "[";
- result += host;
- result += ']';
- return result;
- } else {
- return host;
- }
-}
-
+std::string UriUnescape(const util::string_view s) {
+ std::string result(s);
+ if (!result.empty()) {
+ auto end = uriUnescapeInPlaceA(&result[0]);
+ result.resize(end - &result[0]);
+ }
+ return result;
+}
+
+std::string UriEncodeHost(const std::string& host) {
+ // Fairly naive check: if it contains a ':', it's IPv6 and needs
+ // brackets, else it's OK
+ if (host.find(":") != std::string::npos) {
+ std::string result = "[";
+ result += host;
+ result += ']';
+ return result;
+ } else {
+ return host;
+ }
+}
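[Editorial note] For illustration, the intended behavior of the two helpers above; the values shown are assumptions read off the code, not captured output:

    // Hypothetical sketch:
    //   UriUnescape("a%20b")        -> "a b"
    //   UriEncodeHost("localhost")  -> "localhost"  (no ':', returned as-is)
    //   UriEncodeHost("::1")        -> "[::1]"      (contains ':', bracketed)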
+
struct Uri::Impl {
Impl() : string_rep_(""), port_(-1) { memset(&uri_, 0, sizeof(uri_)); }
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
index b4ffbb04dec..35a9400f92b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
@@ -24,7 +24,7 @@
#include <vector>
#include "arrow/type_fwd.h"
-#include "arrow/util/string_view.h"
+#include "arrow/util/string_view.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -92,13 +92,13 @@ class ARROW_EXPORT Uri {
ARROW_EXPORT
std::string UriEscape(const std::string& s);
-ARROW_EXPORT
-std::string UriUnescape(const arrow::util::string_view s);
-
-/// Encode a host for use within a URI, such as "localhost",
-/// "127.0.0.1", or "[::1]".
-ARROW_EXPORT
-std::string UriEncodeHost(const std::string& host);
-
+ARROW_EXPORT
+std::string UriUnescape(const arrow::util::string_view s);
+
+/// Encode a host for use within a URI, such as "localhost",
+/// "127.0.0.1", or "[::1]".
+ARROW_EXPORT
+std::string UriEncodeHost(const std::string& host);
+
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
index 11394d2e64c..af850dfc523 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
@@ -64,8 +64,8 @@ const uint8_t utf8_small_table[] = { // NOLINT
uint16_t utf8_large_table[9 * 256] = {0xffff};
-const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
-
+const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+
static void InitializeLargeTable() {
for (uint32_t state = 0; state < 9; ++state) {
for (uint32_t byte = 0; byte < 256; ++byte) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
index 0ec3538b95c..54ee9a2820b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
@@ -23,15 +23,15 @@
#include <memory>
#include <string>
-#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
-#error #include <xsimd/xsimd.hpp>
-#endif
-
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+#error #include <xsimd/xsimd.hpp>
+#endif
+
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/simd.h"
#include "arrow/util/string_view.h"
-#include "arrow/util/ubsan.h"
+#include "arrow/util/ubsan.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -65,8 +65,8 @@ static constexpr uint8_t kUTF8DecodeReject = 12;
// In this table states are multiples of 256.
ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256];
-ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16];
-
+ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16];
+
// Success / reject states when looked up in the large table
static constexpr uint16_t kUTF8ValidateAccept = 0;
static constexpr uint16_t kUTF8ValidateReject = 256;
@@ -94,9 +94,9 @@ ARROW_EXPORT void InitializeUTF8();
inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL;
- static constexpr uint32_t high_bits_32 = 0x80808080UL;
- static constexpr uint16_t high_bits_16 = 0x8080U;
- static constexpr uint8_t high_bits_8 = 0x80U;
+ static constexpr uint32_t high_bits_32 = 0x80808080UL;
+ static constexpr uint16_t high_bits_16 = 0x8080U;
+ static constexpr uint8_t high_bits_8 = 0x80U;
#ifndef NDEBUG
internal::CheckUTF8Initialized();
@@ -106,8 +106,8 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
// XXX This is doing an unaligned access. Contemporary architectures
// (x86-64, AArch64, PPC64) support it natively and often have good
// performance nevertheless.
- uint64_t mask64 = SafeLoadAs<uint64_t>(data);
- if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
+ uint64_t mask64 = SafeLoadAs<uint64_t>(data);
+ if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
// 8 bytes of pure ASCII, move forward
size -= 8;
data += 8;
@@ -162,50 +162,50 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
return false;
}
- // Check if string tail is full ASCII (common case, fast)
- if (size >= 4) {
- uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
- uint32_t head_mask = SafeLoadAs<uint32_t>(data);
- if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
- return true;
- }
- } else if (size >= 2) {
- uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
- uint16_t head_mask = SafeLoadAs<uint16_t>(data);
- if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
- return true;
- }
- } else if (size == 1) {
- if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) {
- return true;
- }
- } else {
- /* size == 0 */
- return true;
- }
-
- // Fall back to UTF8 validation of tail string.
+ // Check if string tail is full ASCII (common case, fast)
+ if (size >= 4) {
+ uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
+ uint32_t head_mask = SafeLoadAs<uint32_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
+ return true;
+ }
+ } else if (size >= 2) {
+ uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
+ uint16_t head_mask = SafeLoadAs<uint16_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
+ return true;
+ }
+ } else if (size == 1) {
+ if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) {
+ return true;
+ }
+ } else {
+ /* size == 0 */
+ return true;
+ }
+
+ // Fall back to UTF8 validation of tail string.
// Note the state table is designed so that, once in the reject state,
// we remain in that state until the end. So we needn't check for
// rejection at each char (we don't gain much by short-circuiting here).
uint16_t state = internal::kUTF8ValidateAccept;
- switch (size) {
- case 7:
- state = internal::ValidateOneUTF8Byte(data[size - 7], state);
- case 6:
- state = internal::ValidateOneUTF8Byte(data[size - 6], state);
- case 5:
- state = internal::ValidateOneUTF8Byte(data[size - 5], state);
- case 4:
- state = internal::ValidateOneUTF8Byte(data[size - 4], state);
- case 3:
- state = internal::ValidateOneUTF8Byte(data[size - 3], state);
- case 2:
- state = internal::ValidateOneUTF8Byte(data[size - 2], state);
- case 1:
- state = internal::ValidateOneUTF8Byte(data[size - 1], state);
- default:
- break;
+ switch (size) {
+ case 7:
+ state = internal::ValidateOneUTF8Byte(data[size - 7], state);
+ case 6:
+ state = internal::ValidateOneUTF8Byte(data[size - 6], state);
+ case 5:
+ state = internal::ValidateOneUTF8Byte(data[size - 5], state);
+ case 4:
+ state = internal::ValidateOneUTF8Byte(data[size - 4], state);
+ case 3:
+ state = internal::ValidateOneUTF8Byte(data[size - 3], state);
+ case 2:
+ state = internal::ValidateOneUTF8Byte(data[size - 2], state);
+ case 1:
+ state = internal::ValidateOneUTF8Byte(data[size - 1], state);
+ default:
+ break;
}
return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
}
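[Editorial note] A minimal sketch of calling the validator above; InitializeUTF8() is required first, per the CheckUTF8Initialized debug check, and the input bytes are a hypothetical example:

    // Hypothetical sketch: a 3-byte UTF-8 sequence (U+20AC, EURO SIGN).
    arrow::util::InitializeUTF8();
    const uint8_t data[] = {0xE2, 0x82, 0xAC};
    bool ok = arrow::util::ValidateUTF8(data, sizeof(data));  // expected true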
@@ -246,26 +246,26 @@ inline bool ValidateAsciiSw(const uint8_t* data, int64_t len) {
}
}
-#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
inline bool ValidateAsciiSimd(const uint8_t* data, int64_t len) {
- using simd_batch = xsimd::batch<int8_t, 16>;
+ using simd_batch = xsimd::batch<int8_t, 16>;
if (len >= 32) {
- const simd_batch zero(static_cast<int8_t>(0));
+ const simd_batch zero(static_cast<int8_t>(0));
const uint8_t* data2 = data + 16;
- simd_batch or1 = zero, or2 = zero;
+ simd_batch or1 = zero, or2 = zero;
while (len >= 32) {
- or1 |= simd_batch(reinterpret_cast<const int8_t*>(data), xsimd::unaligned_mode{});
- or2 |= simd_batch(reinterpret_cast<const int8_t*>(data2), xsimd::unaligned_mode{});
+ or1 |= simd_batch(reinterpret_cast<const int8_t*>(data), xsimd::unaligned_mode{});
+ or2 |= simd_batch(reinterpret_cast<const int8_t*>(data2), xsimd::unaligned_mode{});
data += 32;
data2 += 32;
len -= 32;
}
- // To test for upper bit in all bytes, test whether any of them is negative
- or1 |= or2;
- if (xsimd::any(or1 < zero)) {
+ // To test for upper bit in all bytes, test whether any of them is negative
+ or1 |= or2;
+ if (xsimd::any(or1 < zero)) {
return false;
}
}
@@ -295,34 +295,34 @@ Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size);
static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000;
-// size of a valid UTF8 character can be determined by looking at the leading 4 bits of BYTE1
-// utf8_byte_size_table[0..7] --> pure ascii chars --> 1B length
-// utf8_byte_size_table[8..11] --> internal bytes --> 1B length
-// utf8_byte_size_table[12,13] --> 2B long UTF8 chars
-// utf8_byte_size_table[14] --> 3B long UTF8 chars
-// utf8_byte_size_table[15] --> 4B long UTF8 chars
-// NOTE: Results for invalid/malformed utf-8 sequences are undefined.
-// ex: \xFF... returns 4B
-static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) {
- return internal::utf8_byte_size_table[*codeunit >> 4];
-}
-
+// size of a valid UTF8 character can be determined by looking at the leading 4 bits of BYTE1
+// utf8_byte_size_table[0..7] --> pure ascii chars --> 1B length
+// utf8_byte_size_table[8..11] --> internal bytes --> 1B length
+// utf8_byte_size_table[12,13] --> 2B long UTF8 chars
+// utf8_byte_size_table[14] --> 3B long UTF8 chars
+// utf8_byte_size_table[15] --> 4B long UTF8 chars
+// NOTE: Results for invalid/malformed utf-8 sequences are undefined.
+// ex: \xFF... returns 4B
+static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) {
+ return internal::utf8_byte_size_table[*codeunit >> 4];
+}
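[Editorial note] A worked example of the table lookup above; the values follow from the comment and should be treated as an assumption:

    // Hypothetical sketch: 0xE2 has leading bits 1110 (table slot 14), so
    // the helper reports a 3-byte sequence; an ASCII byte like 'a' reports 1.
    const uint8_t lead = 0xE2;
    uint8_t n = arrow::util::ValidUtf8CodepointByteSize(&lead);  // n == 3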
+
static inline bool Utf8IsContinuation(const uint8_t codeunit) {
return (codeunit & 0xC0) == 0x80; // upper two bits should be 10
}
-static inline bool Utf8Is2ByteStart(const uint8_t codeunit) {
- return (codeunit & 0xE0) == 0xC0; // upper three bits should be 110
-}
-
-static inline bool Utf8Is3ByteStart(const uint8_t codeunit) {
- return (codeunit & 0xF0) == 0xE0; // upper four bits should be 1110
-}
-
-static inline bool Utf8Is4ByteStart(const uint8_t codeunit) {
- return (codeunit & 0xF8) == 0xF0; // upper five bits should be 11110
-}
-
+static inline bool Utf8Is2ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xE0) == 0xC0; // upper three bits should be 110
+}
+
+static inline bool Utf8Is3ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xF0) == 0xE0; // upper four bits should be 1110
+}
+
+static inline bool Utf8Is4ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xF8) == 0xF0; // upper five bits should be 11110
+}
+
static inline uint8_t* UTF8Encode(uint8_t* str, uint32_t codepoint) {
if (codepoint < 0x80) {
*str++ = codepoint;
@@ -346,7 +346,7 @@ static inline uint8_t* UTF8Encode(uint8_t* str, uint32_t codepoint) {
static inline bool UTF8Decode(const uint8_t** data, uint32_t* codepoint) {
const uint8_t* str = *data;
- if (*str < 0x80) { // ascii
+ if (*str < 0x80) { // ascii
*codepoint = *str++;
} else if (ARROW_PREDICT_FALSE(*str < 0xC0)) { // invalid non-ascii char
return false;
@@ -391,45 +391,45 @@ static inline bool UTF8Decode(const uint8_t** data, uint32_t* codepoint) {
return true;
}
-static inline bool UTF8DecodeReverse(const uint8_t** data, uint32_t* codepoint) {
- const uint8_t* str = *data;
- if (*str < 0x80) { // ascii
- *codepoint = *str--;
- } else {
- if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
- return false;
- }
- uint8_t code_unit_N = (*str--) & 0x3F; // take last 6 bits
- if (Utf8Is2ByteStart(*str)) {
- uint8_t code_unit_1 = (*str--) & 0x1F; // take last 5 bits
- *codepoint = (code_unit_1 << 6) + code_unit_N;
- } else {
- if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
- return false;
- }
- uint8_t code_unit_Nmin1 = (*str--) & 0x3F; // take last 6 bits
- if (Utf8Is3ByteStart(*str)) {
- uint8_t code_unit_1 = (*str--) & 0x0F; // take last 4 bits
- *codepoint = (code_unit_1 << 12) + (code_unit_Nmin1 << 6) + code_unit_N;
- } else {
- if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
- return false;
- }
- uint8_t code_unit_Nmin2 = (*str--) & 0x3F; // take last 6 bits
- if (ARROW_PREDICT_TRUE(Utf8Is4ByteStart(*str))) {
- uint8_t code_unit_1 = (*str--) & 0x07; // take last 3 bits
- *codepoint = (code_unit_1 << 18) + (code_unit_Nmin2 << 12) +
- (code_unit_Nmin1 << 6) + code_unit_N;
- } else {
- return false;
- }
- }
- }
- }
- *data = str;
- return true;
-}
-
+static inline bool UTF8DecodeReverse(const uint8_t** data, uint32_t* codepoint) {
+ const uint8_t* str = *data;
+ if (*str < 0x80) { // ascii
+ *codepoint = *str--;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_N = (*str--) & 0x3F; // take last 6 bits
+ if (Utf8Is2ByteStart(*str)) {
+ uint8_t code_unit_1 = (*str--) & 0x1F; // take last 5 bits
+ *codepoint = (code_unit_1 << 6) + code_unit_N;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_Nmin1 = (*str--) & 0x3F; // take last 6 bits
+ if (Utf8Is3ByteStart(*str)) {
+ uint8_t code_unit_1 = (*str--) & 0x0F; // take last 4 bits
+ *codepoint = (code_unit_1 << 12) + (code_unit_Nmin1 << 6) + code_unit_N;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_Nmin2 = (*str--) & 0x3F; // take last 6 bits
+ if (ARROW_PREDICT_TRUE(Utf8Is4ByteStart(*str))) {
+ uint8_t code_unit_1 = (*str--) & 0x07; // take last 3 bits
+ *codepoint = (code_unit_1 << 18) + (code_unit_Nmin2 << 12) +
+ (code_unit_Nmin1 << 6) + code_unit_N;
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+ *data = str;
+ return true;
+}
+
template <class UnaryOperation>
static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
uint8_t** destination, UnaryOperation&& unary_op) {
@@ -446,97 +446,97 @@ static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
return true;
}
-template <class Predicate>
-static inline bool UTF8FindIf(const uint8_t* first, const uint8_t* last,
- Predicate&& predicate, const uint8_t** position) {
- const uint8_t* i = first;
- while (i < last) {
- uint32_t codepoint = 0;
- const uint8_t* current = i;
- if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
- return false;
- }
- if (predicate(codepoint)) {
- *position = current;
- return true;
- }
- }
- *position = last;
- return true;
-}
-
-// Same semantics as std::find_if using reverse iterators with the return value
-// having the same semantics as std::reverse_iterator<..>.base()
-// A reverse iterator physically points to the next address, e.g.:
-// &*reverse_iterator(i) == &*(i + 1)
-template <class Predicate>
-static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last,
- Predicate&& predicate, const uint8_t** position) {
- // converts to a normal pointer
- const uint8_t* i = last - 1;
- while (i >= first) {
- uint32_t codepoint = 0;
- const uint8_t* current = i;
- if (ARROW_PREDICT_FALSE(!UTF8DecodeReverse(&i, &codepoint))) {
- return false;
- }
- if (predicate(codepoint)) {
- // converts normal pointer to 'reverse iterator semantics'.
- *position = current + 1;
- return true;
- }
- }
- // similar to how an end pointer points to one past the last element, reverse
- // iterators point to the 'first' pointer to indicate out of range.
- *position = first;
- return true;
-}
-
-static inline bool UTF8AdvanceCodepoints(const uint8_t* first, const uint8_t* last,
- const uint8_t** destination, int64_t n) {
- return UTF8FindIf(
- first, last,
- [&](uint32_t codepoint) {
- bool done = n == 0;
- n--;
- return done;
- },
- destination);
-}
-
-static inline bool UTF8AdvanceCodepointsReverse(const uint8_t* first, const uint8_t* last,
- const uint8_t** destination, int64_t n) {
- return UTF8FindIfReverse(
- first, last,
- [&](uint32_t codepoint) {
- bool done = n == 0;
- n--;
- return done;
- },
- destination);
-}
-
-template <class UnaryFunction>
-static inline bool UTF8ForEach(const uint8_t* first, const uint8_t* last,
- UnaryFunction&& f) {
- const uint8_t* i = first;
- while (i < last) {
- uint32_t codepoint = 0;
- if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
- return false;
- }
- f(codepoint);
- }
- return true;
-}
-
-template <class UnaryFunction>
-static inline bool UTF8ForEach(const std::string& s, UnaryFunction&& f) {
- return UTF8ForEach(reinterpret_cast<const uint8_t*>(s.data()),
- reinterpret_cast<const uint8_t*>(s.data() + s.length()),
- std::forward<UnaryFunction>(f));
-}
-
+template <class Predicate>
+static inline bool UTF8FindIf(const uint8_t* first, const uint8_t* last,
+ Predicate&& predicate, const uint8_t** position) {
+ const uint8_t* i = first;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ const uint8_t* current = i;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ if (predicate(codepoint)) {
+ *position = current;
+ return true;
+ }
+ }
+ *position = last;
+ return true;
+}
+
+// Same semantics as std::find_if using reverse iterators with the return value
+// having the same semantics as std::reverse_iterator<..>.base()
+// A reverse iterator physically points to the next address, e.g.:
+// &*reverse_iterator(i) == &*(i + 1)
+template <class Predicate>
+static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last,
+ Predicate&& predicate, const uint8_t** position) {
+ // converts to a normal pointer
+ const uint8_t* i = last - 1;
+ while (i >= first) {
+ uint32_t codepoint = 0;
+ const uint8_t* current = i;
+ if (ARROW_PREDICT_FALSE(!UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ if (predicate(codepoint)) {
+ // converts normal pointer to 'reverse iterator semantics'.
+ *position = current + 1;
+ return true;
+ }
+ }
+ // similar to how an end pointer points to one past the last element, reverse
+ // iterators point to the 'first' pointer to indicate out of range.
+ *position = first;
+ return true;
+}
+
+static inline bool UTF8AdvanceCodepoints(const uint8_t* first, const uint8_t* last,
+ const uint8_t** destination, int64_t n) {
+ return UTF8FindIf(
+ first, last,
+ [&](uint32_t codepoint) {
+ bool done = n == 0;
+ n--;
+ return done;
+ },
+ destination);
+}
+
+static inline bool UTF8AdvanceCodepointsReverse(const uint8_t* first, const uint8_t* last,
+ const uint8_t** destination, int64_t n) {
+ return UTF8FindIfReverse(
+ first, last,
+ [&](uint32_t codepoint) {
+ bool done = n == 0;
+ n--;
+ return done;
+ },
+ destination);
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(const uint8_t* first, const uint8_t* last,
+ UnaryFunction&& f) {
+ const uint8_t* i = first;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ f(codepoint);
+ }
+ return true;
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(const std::string& s, UnaryFunction&& f) {
+ return UTF8ForEach(reinterpret_cast<const uint8_t*>(s.data()),
+ reinterpret_cast<const uint8_t*>(s.data() + s.length()),
+ std::forward<UnaryFunction>(f));
+}
+
template <class UnaryPredicate>
static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* result,
UnaryPredicate&& predicate) {
@@ -556,15 +556,15 @@ static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* re
return true;
}
-/// Count the number of codepoints in the given string (assuming it is valid UTF8).
-static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) {
- int64_t length = 0;
- while (first != last) {
- length += ((*first & 0xc0) != 0x80);
- ++first;
- }
- return length;
-}
-
+/// Count the number of codepoints in the given string (assuming it is valid UTF8).
+static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) {
+ int64_t length = 0;
+ while (first != last) {
+ length += ((*first & 0xc0) != 0x80);
+ ++first;
+ }
+ return length;
+}
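[Editorial note] A quick sketch of the codepoint count above: continuation bytes (top bits 10) are skipped, so byte length and codepoint length differ for non-ASCII input. The string below is a hypothetical example:

    // Hypothetical sketch: "h\xC3\xA9llo" ("héllo") is 6 bytes but 5
    // codepoints; the continuation byte 0xA9 is not counted.
    const std::string s = "h\xC3\xA9llo";
    int64_t n = arrow::util::UTF8Length(
        reinterpret_cast<const uint8_t*>(s.data()),
        reinterpret_cast<const uint8_t*>(s.data() + s.size()));  // n == 5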
+
} // namespace util
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
index 3b147366636..5460dfb91f9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
@@ -20,19 +20,19 @@
#include <string>
#include <utility>
-#include "contrib/restricted/fast_float/include/fast_float/fast_float.h"
+#include "contrib/restricted/fast_float/include/fast_float/fast_float.h"
namespace arrow {
namespace internal {
bool StringToFloat(const char* s, size_t length, float* out) {
- const auto res = fast_float::from_chars(s, s + length, *out);
- return res.ec == std::errc() && res.ptr == s + length;
+ const auto res = fast_float::from_chars(s, s + length, *out);
+ return res.ec == std::errc() && res.ptr == s + length;
}
bool StringToFloat(const char* s, size_t length, double* out) {
- const auto res = fast_float::from_chars(s, s + length, *out);
- return res.ec == std::errc() && res.ptr == s + length;
+ const auto res = fast_float::from_chars(s, s + length, *out);
+ return res.ec == std::errc() && res.ptr == s + length;
}
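[Editorial note] A sketch of the strict, whole-string contract encoded by the fast_float checks above; the inputs are hypothetical:

    // Hypothetical sketch: success requires errc() and full consumption.
    double d = 0.0;
    bool ok  = arrow::internal::StringToFloat("3.25", 4, &d);   // true, d == 3.25
    bool bad = arrow::internal::StringToFloat("3.25x", 5, &d);  // false: trailing 'x'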
// ----------------------------------------------------------------------
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
index 00295d1b51f..e8de13287c1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
@@ -486,80 +486,80 @@ static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type unit,
uint32_t* out) {
- // The decimal point has been peeled off at this point
-
- // Fail if number of decimal places provided exceeds what the unit can hold.
- // Calculate how many trailing decimal places are omitted for the unit
- // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
- size_t omitted = 0;
- switch (unit) {
- case TimeUnit::MILLI:
- if (ARROW_PREDICT_FALSE(length > 3)) {
- return false;
- }
- if (length < 3) {
- omitted = 3 - length;
- }
+ // The decimal point has been peeled off at this point
+
+ // Fail if number of decimal places provided exceeds what the unit can hold.
+ // Calculate how many trailing decimal places are omitted for the unit
+ // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
+ size_t omitted = 0;
+ switch (unit) {
+ case TimeUnit::MILLI:
+ if (ARROW_PREDICT_FALSE(length > 3)) {
+ return false;
+ }
+ if (length < 3) {
+ omitted = 3 - length;
+ }
break;
- case TimeUnit::MICRO:
- if (ARROW_PREDICT_FALSE(length > 6)) {
- return false;
- }
- if (length < 6) {
- omitted = 6 - length;
- }
+ case TimeUnit::MICRO:
+ if (ARROW_PREDICT_FALSE(length > 6)) {
+ return false;
+ }
+ if (length < 6) {
+ omitted = 6 - length;
+ }
break;
- case TimeUnit::NANO:
- if (ARROW_PREDICT_FALSE(length > 9)) {
- return false;
- }
- if (length < 9) {
- omitted = 9 - length;
- }
+ case TimeUnit::NANO:
+ if (ARROW_PREDICT_FALSE(length > 9)) {
+ return false;
+ }
+ if (length < 9) {
+ omitted = 9 - length;
+ }
break;
default:
return false;
}
- if (ARROW_PREDICT_TRUE(omitted == 0)) {
- return ParseUnsigned(s, length, out);
- } else {
- uint32_t subseconds;
- bool success = ParseUnsigned(s, length, &subseconds);
- if (ARROW_PREDICT_TRUE(success)) {
- switch (omitted) {
- case 1:
- *out = subseconds * 10;
- break;
- case 2:
- *out = subseconds * 100;
- break;
- case 3:
- *out = subseconds * 1000;
- break;
- case 4:
- *out = subseconds * 10000;
- break;
- case 5:
- *out = subseconds * 100000;
- break;
- case 6:
- *out = subseconds * 1000000;
- break;
- case 7:
- *out = subseconds * 10000000;
- break;
- case 8:
- *out = subseconds * 100000000;
- break;
- default:
- // Impossible case
- break;
- }
- return true;
- } else {
- return false;
- }
+ if (ARROW_PREDICT_TRUE(omitted == 0)) {
+ return ParseUnsigned(s, length, out);
+ } else {
+ uint32_t subseconds;
+ bool success = ParseUnsigned(s, length, &subseconds);
+ if (ARROW_PREDICT_TRUE(success)) {
+ switch (omitted) {
+ case 1:
+ *out = subseconds * 10;
+ break;
+ case 2:
+ *out = subseconds * 100;
+ break;
+ case 3:
+ *out = subseconds * 1000;
+ break;
+ case 4:
+ *out = subseconds * 10000;
+ break;
+ case 5:
+ *out = subseconds * 100000;
+ break;
+ case 6:
+ *out = subseconds * 1000000;
+ break;
+ case 7:
+ *out = subseconds * 10000000;
+ break;
+ case 8:
+ *out = subseconds * 100000000;
+ break;
+ default:
+ // Impossible case
+ break;
+ }
+ return true;
+ } else {
+ return false;
+ }
}
}
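[Editorial note] A worked trace of the scaling above, as an editorial illustration:

    // Hypothetical sketch: for ".1234" with unit == MICRO, length == 4, so
    // omitted == 6 - 4 == 2; ParseUnsigned yields 1234, which the case-2
    // branch scales by 100 to out == 123400 microseconds.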
@@ -572,21 +572,21 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
// We allow the following formats for all units:
// - "YYYY-MM-DD"
- // - "YYYY-MM-DD[ T]hhZ?"
- // - "YYYY-MM-DD[ T]hh:mmZ?"
- // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
+ // - "YYYY-MM-DD[ T]hhZ?"
+ // - "YYYY-MM-DD[ T]hh:mmZ?"
+ // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
//
- // We allow the following formats for unit == MILLI, MICRO, or NANO:
- // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
+ // We allow the following formats for unit == MILLI, MICRO, or NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
//
- // We allow the following formats for unit == MICRO, or NANO:
- // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
+ // We allow the following formats for unit == MICRO, or NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
//
- // We allow the following formats for unit == NANO:
- // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
+ // We allow the following formats for unit == NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
//
// UTC is always assumed, and the DataType's timezone is ignored.
- //
+ //
if (ARROW_PREDICT_FALSE(length < 10)) return false;
@@ -621,15 +621,15 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
}
break;
case 19: // YYYY-MM-DD[ T]hh:mm:ss
- case 21: // YYYY-MM-DD[ T]hh:mm:ss.s
- case 22: // YYYY-MM-DD[ T]hh:mm:ss.ss
- case 23: // YYYY-MM-DD[ T]hh:mm:ss.sss
- case 24: // YYYY-MM-DD[ T]hh:mm:ss.ssss
- case 25: // YYYY-MM-DD[ T]hh:mm:ss.sssss
- case 26: // YYYY-MM-DD[ T]hh:mm:ss.ssssss
- case 27: // YYYY-MM-DD[ T]hh:mm:ss.sssssss
- case 28: // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
- case 29: // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
+ case 21: // YYYY-MM-DD[ T]hh:mm:ss.s
+ case 22: // YYYY-MM-DD[ T]hh:mm:ss.ss
+ case 23: // YYYY-MM-DD[ T]hh:mm:ss.sss
+ case 24: // YYYY-MM-DD[ T]hh:mm:ss.ssss
+ case 25: // YYYY-MM-DD[ T]hh:mm:ss.sssss
+ case 26: // YYYY-MM-DD[ T]hh:mm:ss.ssssss
+ case 27: // YYYY-MM-DD[ T]hh:mm:ss.sssssss
+ case 28: // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
+ case 29: // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s + 11, &seconds_since_midnight))) {
return false;
}
@@ -645,13 +645,13 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
return true;
}
- if (ARROW_PREDICT_FALSE(s[19] != '.')) {
- return false;
- }
-
+ if (ARROW_PREDICT_FALSE(s[19] != '.')) {
+ return false;
+ }
+
uint32_t subseconds = 0;
if (ARROW_PREDICT_FALSE(
- !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
+ !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
return false;
}
@@ -753,7 +753,7 @@ struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
uint32_t subseconds_count = 0;
if (ARROW_PREDICT_FALSE(
- !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
+ !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
return false;
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
index b4b0d8f6f31..8f8d23c2b76 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
@@ -17,423 +17,423 @@
#pragma once
-#include <cstddef>
-#include <exception>
-#include <type_traits>
-#include <utility>
-
-#include "arrow/util/macros.h"
-#include "arrow/util/type_traits.h"
-
+#include <cstddef>
+#include <exception>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
+
namespace arrow {
namespace util {
-/// \brief a std::variant-like discriminated union
-///
-/// Simplifications from std::variant:
-///
-/// - Strictly defaultable. The first type of T... should be nothrow default constructible
-/// and it will be used for default Variants.
-///
-/// - Never valueless_by_exception. std::variant supports a state outside those specified
-/// by T... to which it can return in the event that a constructor throws. If a Variant
-/// would become valueless_by_exception it will instead return to its default state.
-///
-/// - Strictly nothrow move constructible and assignable
-///
-/// - Less sophisticated type deduction. std::variant<bool, std::string>("hello") will
-/// intelligently construct std::string while Variant<bool, std::string>("hello") will
-/// construct bool.
-///
-/// - Either both copy constructible and assignable or neither (std::variant independently
-/// enables copy construction and copy assignment). Variant is copy constructible if
-/// each of T... is copy constructible and assignable.
-///
-/// - Slimmer interface; several members of std::variant are omitted.
-///
-/// - Throws no exceptions; if a bad_variant_access would be thrown Variant will instead
-/// segfault (nullptr dereference).
-///
-/// - Mutable visit takes a pointer instead of mutable reference or rvalue reference,
-/// which is more conformant with our code style.
-template <typename... T>
-class Variant;
-
-namespace detail {
-
-template <typename T, typename = void>
-struct is_equality_comparable : std::false_type {};
-
-template <typename T>
-struct is_equality_comparable<
- T, typename std::enable_if<std::is_convertible<
- decltype(std::declval<T>() == std::declval<T>()), bool>::value>::type>
- : std::true_type {};
-
-template <bool C, typename T, typename E>
-using conditional_t = typename std::conditional<C, T, E>::type;
-
-template <typename T>
-struct type_constant {
- using type = T;
-};
-
-template <typename...>
-struct first;
-
-template <typename H, typename... T>
-struct first<H, T...> {
- using type = H;
-};
-
-template <typename T>
-using decay_t = typename std::decay<T>::type;
-
-template <bool...>
-struct all : std::true_type {};
-
-template <bool H, bool... T>
-struct all<H, T...> : conditional_t<H, all<T...>, std::false_type> {};
-
-struct delete_copy_constructor {
- template <typename>
- struct type {
- type() = default;
- type(const type& other) = delete;
- type& operator=(const type& other) = delete;
- };
-};
-
-struct explicit_copy_constructor {
- template <typename Copyable>
- struct type {
- type() = default;
- type(const type& other) { static_cast<const Copyable&>(other).copy_to(this); }
- type& operator=(const type& other) {
- static_cast<Copyable*>(this)->destroy();
- static_cast<const Copyable&>(other).copy_to(this);
- return *this;
- }
- };
-};
-
-template <typename... T>
-struct VariantStorage {
- VariantStorage() = default;
- VariantStorage(const VariantStorage&) {}
- VariantStorage& operator=(const VariantStorage&) { return *this; }
- VariantStorage(VariantStorage&&) noexcept {}
- VariantStorage& operator=(VariantStorage&&) noexcept { return *this; }
- ~VariantStorage() {
- static_assert(offsetof(VariantStorage, data_) == 0,
- "(void*)&VariantStorage::data_ == (void*)this");
- }
-
- typename arrow::internal::aligned_union<0, T...>::type data_;
- uint8_t index_ = 0;
-};
-
-template <typename V, typename...>
-struct VariantImpl;
-
-template <typename... T>
-struct VariantImpl<Variant<T...>> : VariantStorage<T...> {
- static void index_of() noexcept {}
- void destroy() noexcept {}
- void move_to(...) noexcept {}
- void copy_to(...) const {}
-
- template <typename R, typename Visitor>
- [[noreturn]] R visit_const(Visitor&& /* visitor */) const {
- std::terminate();
- }
- template <typename R, typename Visitor>
- [[noreturn]] R visit_mutable(Visitor&& /* visitor */) {
- std::terminate();
- }
-};
-
-template <typename... M, typename H, typename... T>
-struct VariantImpl<Variant<M...>, H, T...> : VariantImpl<Variant<M...>, T...> {
- using VariantType = Variant<M...>;
- using Impl = VariantImpl<VariantType, T...>;
-
- static constexpr uint8_t kIndex = sizeof...(M) - sizeof...(T) - 1;
-
- VariantImpl() = default;
-
- using VariantImpl<VariantType, T...>::VariantImpl;
- using Impl::operator=;
- using Impl::index_of;
-
- explicit VariantImpl(H value) {
- new (this) H(std::move(value));
- this->index_ = kIndex;
- }
-
- VariantImpl& operator=(H value) {
- static_cast<VariantType*>(this)->destroy();
- new (this) H(std::move(value));
- this->index_ = kIndex;
- return *this;
- }
-
- H& cast_this() { return *reinterpret_cast<H*>(this); }
- const H& cast_this() const { return *reinterpret_cast<const H*>(this); }
-
- void move_to(VariantType* target) noexcept {
- if (this->index_ == kIndex) {
- new (target) H(std::move(cast_this()));
- target->index_ = kIndex;
- } else {
- Impl::move_to(target);
- }
- }
-
- // Templated to avoid instantiation in case H is not copy constructible
- template <typename Void>
- void copy_to(Void* generic_target) const {
- const auto target = static_cast<VariantType*>(generic_target);
- try {
- if (this->index_ == kIndex) {
- new (target) H(cast_this());
- target->index_ = kIndex;
- } else {
- Impl::copy_to(target);
- }
- } catch (...) {
- target->construct_default();
- throw;
- }
- }
-
- void destroy() noexcept {
- if (this->index_ == kIndex) {
- if (!std::is_trivially_destructible<H>::value) {
- cast_this().~H();
- }
- } else {
- Impl::destroy();
- }
- }
-
- static constexpr std::integral_constant<uint8_t, kIndex> index_of(
- const type_constant<H>&) {
- return {};
- }
-
- template <typename R, typename Visitor>
- R visit_const(Visitor&& visitor) const {
- if (this->index_ == kIndex) {
- return std::forward<Visitor>(visitor)(cast_this());
- }
- return Impl::template visit_const<R>(std::forward<Visitor>(visitor));
- }
-
- template <typename R, typename Visitor>
- R visit_mutable(Visitor&& visitor) {
- if (this->index_ == kIndex) {
- return std::forward<Visitor>(visitor)(&cast_this());
- }
- return Impl::template visit_mutable<R>(std::forward<Visitor>(visitor));
- }
-};
-
-} // namespace detail
-
-template <typename... T>
-class Variant : detail::VariantImpl<Variant<T...>, T...>,
- detail::conditional_t<
- detail::all<(std::is_copy_constructible<T>::value &&
- std::is_copy_assignable<T>::value)...>::value,
- detail::explicit_copy_constructor,
- detail::delete_copy_constructor>::template type<Variant<T...>> {
- template <typename U>
- static constexpr uint8_t index_of() {
- return Impl::index_of(detail::type_constant<U>{});
- }
-
- using Impl = detail::VariantImpl<Variant<T...>, T...>;
-
- public:
- using default_type = typename util::detail::first<T...>::type;
-
- Variant() noexcept { construct_default(); }
-
- Variant(const Variant& other) = default;
- Variant& operator=(const Variant& other) = default;
- Variant& operator=(Variant&& other) noexcept {
- this->destroy();
- other.move_to(this);
- return *this;
- }
-
- using Impl::Impl;
- using Impl::operator=;
-
- Variant(Variant&& other) noexcept { other.move_to(this); }
-
- ~Variant() {
- static_assert(offsetof(Variant, data_) == 0, "(void*)&Variant::data_ == (void*)this");
- this->destroy();
- }
-
- /// \brief Return the zero-based type index of the value held by the variant
- uint8_t index() const noexcept { return this->index_; }
-
- /// \brief Get a const pointer to the value held by the variant
- ///
- /// If the type given as template argument doesn't match, a null pointer is returned.
- template <typename U, uint8_t I = index_of<U>()>
- const U* get() const noexcept {
- return index() == I ? reinterpret_cast<const U*>(this) : NULLPTR;
- }
-
- /// \brief Get a pointer to the value held by the variant
- ///
- /// If the type given as template argument doesn't match, a null pointer is returned.
- template <typename U, uint8_t I = index_of<U>()>
- U* get() noexcept {
- return index() == I ? reinterpret_cast<U*>(this) : NULLPTR;
- }
-
- /// \brief Replace the value held by the variant
- ///
- /// The intended type must be given as a template argument.
- /// The value is constructed in-place using the given function arguments.
- template <typename U, typename... A, uint8_t I = index_of<U>()>
- void emplace(A&&... args) try {
- this->destroy();
- new (this) U(std::forward<A>(args)...);
- this->index_ = I;
- } catch (...) {
- construct_default();
- throw;
- }
-
- template <typename U, typename E, typename... A, uint8_t I = index_of<U>()>
- void emplace(std::initializer_list<E> il, A&&... args) try {
- this->destroy();
- new (this) U(il, std::forward<A>(args)...);
- this->index_ = I;
- } catch (...) {
- construct_default();
- throw;
- }
-
- /// \brief Swap with another variant's contents
- void swap(Variant& other) noexcept { // NOLINT google-runtime-references
- Variant tmp = std::move(other);
- other = std::move(*this);
- *this = std::move(tmp);
- }
-
- using Impl::visit_const;
- using Impl::visit_mutable;
-
- private:
- void construct_default() noexcept {
- new (this) default_type();
- this->index_ = 0;
- }
-
- template <typename V>
- friend struct detail::explicit_copy_constructor::type;
-
- template <typename V, typename...>
- friend struct detail::VariantImpl;
-};
-
-/// \brief Call polymorphic visitor on a const variant's value
-///
-/// The visitor will receive a const reference to the value held by the variant.
-/// It must define overloads for each possible variant type.
-/// The overloads should all return the same type (no attempt
-/// is made to find a generalized return type).
-template <typename Visitor, typename... T,
- typename R = decltype(std::declval<Visitor&&>()(
- std::declval<const typename Variant<T...>::default_type&>()))>
-R visit(Visitor&& visitor, const util::Variant<T...>& v) {
- return v.template visit_const<R>(std::forward<Visitor>(visitor));
-}
-
-/// \brief Call polymorphic visitor on a non-const variant's value
-///
-/// The visitor will receive a pointer to the value held by the variant.
-/// It must define overloads for each possible variant type.
-/// The overloads should all return the same type (no attempt
-/// is made to find a generalized return type).
-template <typename Visitor, typename... T,
- typename R = decltype(std::declval<Visitor&&>()(
- std::declval<typename Variant<T...>::default_type*>()))>
-R visit(Visitor&& visitor, util::Variant<T...>* v) {
- return v->template visit_mutable<R>(std::forward<Visitor>(visitor));
-}
-
-/// \brief Get a const reference to the value held by the variant
-///
-/// If the type given as template argument doesn't match, behavior is undefined
-/// (a null pointer will be dereferenced).
-template <typename U, typename... T>
-const U& get(const Variant<T...>& v) {
- return *v.template get<U>();
-}
-
-/// \brief Get a reference to the value held by the variant
-///
-/// If the type given as template argument doesn't match, behavior is undefined
-/// (a null pointer will be dereferenced).
-template <typename U, typename... T>
-U& get(Variant<T...>& v) {
- return *v.template get<U>();
-}
-
-/// \brief Get a const pointer to the value held by the variant
-///
-/// If the type given as template argument doesn't match, a nullptr is returned.
-template <typename U, typename... T>
-const U* get_if(const Variant<T...>* v) {
- return v->template get<U>();
-}
-
-/// \brief Get a pointer to the value held by the variant
-///
-/// If the type given as template argument doesn't match, a nullptr is returned.
-template <typename U, typename... T>
-U* get_if(Variant<T...>* v) {
- return v->template get<U>();
-}
-
-namespace detail {
-
-template <typename... T>
-struct VariantsEqual {
- template <typename U>
- bool operator()(const U& r) const {
- return get<U>(l_) == r;
- }
- const Variant<T...>& l_;
-};
-
-} // namespace detail
-
-template <typename... T, typename = typename std::enable_if<detail::all<
- detail::is_equality_comparable<T>::value...>::value>>
-bool operator==(const Variant<T...>& l, const Variant<T...>& r) {
- if (l.index() != r.index()) return false;
- return visit(detail::VariantsEqual<T...>{l}, r);
-}
-
-template <typename... T>
-auto operator!=(const Variant<T...>& l, const Variant<T...>& r) -> decltype(l == r) {
- return !(l == r);
-}
-
-/// \brief Return whether the variant holds a value of the given type
-template <typename U, typename... T>
-bool holds_alternative(const Variant<T...>& v) {
- return v.template get<U>();
-}
-
+/// \brief a std::variant-like discriminated union
+///
+/// Simplifications from std::variant:
+///
+/// - Strictly defaultable. The first type of T... should be nothrow default constructible
+/// and it will be used for default Variants.
+///
+/// - Never valueless_by_exception. std::variant supports a state outside those specified
+/// by T... to which it can return in the event that a constructor throws. If a Variant
+/// would become valueless_by_exception it will instead return to its default state.
+///
+/// - Strictly nothrow move constructible and assignable
+///
+/// - Less sophisticated type deduction. std::variant<bool, std::string>("hello") will
+/// intelligently construct std::string while Variant<bool, std::string>("hello") will
+/// construct bool.
+///
+/// - Either both copy constructible and assignable or neither (std::variant independently
+/// enables copy construction and copy assignment). Variant is copy constructible if
+/// each of T... is copy constructible and assignable.
+///
+/// - Slimmer interface; several members of std::variant are omitted.
+///
+/// - Throws no exceptions; if a bad_variant_access would be thrown Variant will instead
+/// segfault (nullptr dereference).
+///
+/// - Mutable visit takes a pointer instead of mutable reference or rvalue reference,
+/// which is more conformant with our code style.
+template <typename... T>
+class Variant;
+
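[Editorial note] An interface sketch matching the description above; the values are hypothetical:

    // Hypothetical sketch: defaults to the first alternative (bool); get<U>()
    // returns a null pointer rather than throwing on a type mismatch.
    arrow::util::Variant<bool, std::string> v;    // holds bool, index() == 0
    v = std::string("hello");                     // now holds std::string
    const std::string* s = v.get<std::string>();  // non-null
    const bool* b = v.get<bool>();                // nullptr: wrong alternative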
+namespace detail {
+
+template <typename T, typename = void>
+struct is_equality_comparable : std::false_type {};
+
+template <typename T>
+struct is_equality_comparable<
+ T, typename std::enable_if<std::is_convertible<
+ decltype(std::declval<T>() == std::declval<T>()), bool>::value>::type>
+ : std::true_type {};
+
+template <bool C, typename T, typename E>
+using conditional_t = typename std::conditional<C, T, E>::type;
+
+template <typename T>
+struct type_constant {
+ using type = T;
+};
+
+template <typename...>
+struct first;
+
+template <typename H, typename... T>
+struct first<H, T...> {
+ using type = H;
+};
+
+template <typename T>
+using decay_t = typename std::decay<T>::type;
+
+template <bool...>
+struct all : std::true_type {};
+
+template <bool H, bool... T>
+struct all<H, T...> : conditional_t<H, all<T...>, std::false_type> {};
+
+struct delete_copy_constructor {
+ template <typename>
+ struct type {
+ type() = default;
+ type(const type& other) = delete;
+ type& operator=(const type& other) = delete;
+ };
+};
+
+struct explicit_copy_constructor {
+ template <typename Copyable>
+ struct type {
+ type() = default;
+ type(const type& other) { static_cast<const Copyable&>(other).copy_to(this); }
+ type& operator=(const type& other) {
+ static_cast<Copyable*>(this)->destroy();
+ static_cast<const Copyable&>(other).copy_to(this);
+ return *this;
+ }
+ };
+};
+
+template <typename... T>
+struct VariantStorage {
+ VariantStorage() = default;
+ VariantStorage(const VariantStorage&) {}
+ VariantStorage& operator=(const VariantStorage&) { return *this; }
+ VariantStorage(VariantStorage&&) noexcept {}
+ VariantStorage& operator=(VariantStorage&&) noexcept { return *this; }
+ ~VariantStorage() {
+ static_assert(offsetof(VariantStorage, data_) == 0,
+ "(void*)&VariantStorage::data_ == (void*)this");
+ }
+
+ typename arrow::internal::aligned_union<0, T...>::type data_;
+ uint8_t index_ = 0;
+};
+
+template <typename V, typename...>
+struct VariantImpl;
+
+template <typename... T>
+struct VariantImpl<Variant<T...>> : VariantStorage<T...> {
+ static void index_of() noexcept {}
+ void destroy() noexcept {}
+ void move_to(...) noexcept {}
+ void copy_to(...) const {}
+
+ template <typename R, typename Visitor>
+ [[noreturn]] R visit_const(Visitor&& /* visitor */) const {
+ std::terminate();
+ }
+ template <typename R, typename Visitor>
+ [[noreturn]] R visit_mutable(Visitor&& /* visitor */) {
+ std::terminate();
+ }
+};
+
+template <typename... M, typename H, typename... T>
+struct VariantImpl<Variant<M...>, H, T...> : VariantImpl<Variant<M...>, T...> {
+ using VariantType = Variant<M...>;
+ using Impl = VariantImpl<VariantType, T...>;
+
+ static constexpr uint8_t kIndex = sizeof...(M) - sizeof...(T) - 1;
+
+ VariantImpl() = default;
+
+ using VariantImpl<VariantType, T...>::VariantImpl;
+ using Impl::operator=;
+ using Impl::index_of;
+
+ explicit VariantImpl(H value) {
+ new (this) H(std::move(value));
+ this->index_ = kIndex;
+ }
+
+ VariantImpl& operator=(H value) {
+ static_cast<VariantType*>(this)->destroy();
+ new (this) H(std::move(value));
+ this->index_ = kIndex;
+ return *this;
+ }
+
+ H& cast_this() { return *reinterpret_cast<H*>(this); }
+ const H& cast_this() const { return *reinterpret_cast<const H*>(this); }
+
+ void move_to(VariantType* target) noexcept {
+ if (this->index_ == kIndex) {
+ new (target) H(std::move(cast_this()));
+ target->index_ = kIndex;
+ } else {
+ Impl::move_to(target);
+ }
+ }
+
+ // Templated to avoid instantiation in case H is not copy constructible
+ template <typename Void>
+ void copy_to(Void* generic_target) const {
+ const auto target = static_cast<VariantType*>(generic_target);
+ try {
+ if (this->index_ == kIndex) {
+ new (target) H(cast_this());
+ target->index_ = kIndex;
+ } else {
+ Impl::copy_to(target);
+ }
+ } catch (...) {
+ target->construct_default();
+ throw;
+ }
+ }
+
+ void destroy() noexcept {
+ if (this->index_ == kIndex) {
+ if (!std::is_trivially_destructible<H>::value) {
+ cast_this().~H();
+ }
+ } else {
+ Impl::destroy();
+ }
+ }
+
+ static constexpr std::integral_constant<uint8_t, kIndex> index_of(
+ const type_constant<H>&) {
+ return {};
+ }
+
+ template <typename R, typename Visitor>
+ R visit_const(Visitor&& visitor) const {
+ if (this->index_ == kIndex) {
+ return std::forward<Visitor>(visitor)(cast_this());
+ }
+ return Impl::template visit_const<R>(std::forward<Visitor>(visitor));
+ }
+
+ template <typename R, typename Visitor>
+ R visit_mutable(Visitor&& visitor) {
+ if (this->index_ == kIndex) {
+ return std::forward<Visitor>(visitor)(&cast_this());
+ }
+ return Impl::template visit_mutable<R>(std::forward<Visitor>(visitor));
+ }
+};
+
+} // namespace detail
+
+template <typename... T>
+class Variant : detail::VariantImpl<Variant<T...>, T...>,
+ detail::conditional_t<
+ detail::all<(std::is_copy_constructible<T>::value &&
+ std::is_copy_assignable<T>::value)...>::value,
+ detail::explicit_copy_constructor,
+ detail::delete_copy_constructor>::template type<Variant<T...>> {
+ template <typename U>
+ static constexpr uint8_t index_of() {
+ return Impl::index_of(detail::type_constant<U>{});
+ }
+
+ using Impl = detail::VariantImpl<Variant<T...>, T...>;
+
+ public:
+ using default_type = typename util::detail::first<T...>::type;
+
+ Variant() noexcept { construct_default(); }
+
+ Variant(const Variant& other) = default;
+ Variant& operator=(const Variant& other) = default;
+ Variant& operator=(Variant&& other) noexcept {
+ this->destroy();
+ other.move_to(this);
+ return *this;
+ }
+
+ using Impl::Impl;
+ using Impl::operator=;
+
+ Variant(Variant&& other) noexcept { other.move_to(this); }
+
+ ~Variant() {
+ static_assert(offsetof(Variant, data_) == 0, "(void*)&Variant::data_ == (void*)this");
+ this->destroy();
+ }
+
+ /// \brief Return the zero-based type index of the value held by the variant
+ uint8_t index() const noexcept { return this->index_; }
+
+ /// \brief Get a const pointer to the value held by the variant
+ ///
+ /// If the type given as template argument doesn't match, a null pointer is returned.
+ template <typename U, uint8_t I = index_of<U>()>
+ const U* get() const noexcept {
+ return index() == I ? reinterpret_cast<const U*>(this) : NULLPTR;
+ }
+
+ /// \brief Get a pointer to the value held by the variant
+ ///
+ /// If the type given as template argument doesn't match, a null pointer is returned.
+ template <typename U, uint8_t I = index_of<U>()>
+ U* get() noexcept {
+ return index() == I ? reinterpret_cast<U*>(this) : NULLPTR;
+ }
+
+ /// \brief Replace the value held by the variant
+ ///
+ /// The intended type must be given as a template argument.
+ /// The value is constructed in-place using the given function arguments.
+ template <typename U, typename... A, uint8_t I = index_of<U>()>
+ void emplace(A&&... args) try {
+ this->destroy();
+ new (this) U(std::forward<A>(args)...);
+ this->index_ = I;
+ } catch (...) {
+ construct_default();
+ throw;
+ }
+
+ template <typename U, typename E, typename... A, uint8_t I = index_of<U>()>
+ void emplace(std::initializer_list<E> il, A&&... args) try {
+ this->destroy();
+ new (this) U(il, std::forward<A>(args)...);
+ this->index_ = I;
+ } catch (...) {
+ construct_default();
+ throw;
+ }
+
+ /// \brief Swap with another variant's contents
+ void swap(Variant& other) noexcept { // NOLINT google-runtime-references
+ Variant tmp = std::move(other);
+ other = std::move(*this);
+ *this = std::move(tmp);
+ }
+
+ using Impl::visit_const;
+ using Impl::visit_mutable;
+
+ private:
+ void construct_default() noexcept {
+ new (this) default_type();
+ this->index_ = 0;
+ }
+
+ template <typename V>
+ friend struct detail::explicit_copy_constructor::type;
+
+ template <typename V, typename...>
+ friend struct detail::VariantImpl;
+};
+
+/// \brief Call polymorphic visitor on a const variant's value
+///
+/// The visitor will receive a const reference to the value held by the variant.
+/// It must define overloads for each possible variant type.
+/// The overloads should all return the same type (no attempt
+/// is made to find a generalized return type).
+template <typename Visitor, typename... T,
+ typename R = decltype(std::declval<Visitor&&>()(
+ std::declval<const typename Variant<T...>::default_type&>()))>
+R visit(Visitor&& visitor, const util::Variant<T...>& v) {
+ return v.template visit_const<R>(std::forward<Visitor>(visitor));
+}
+
+/// \brief Call polymorphic visitor on a non-const variant's value
+///
+/// The visitor will receive a pointer to the value held by the variant.
+/// It must define overloads for each possible variant type.
+/// The overloads should all return the same type (no attempt
+/// is made to find a generalized return type).
+template <typename Visitor, typename... T,
+ typename R = decltype(std::declval<Visitor&&>()(
+ std::declval<typename Variant<T...>::default_type*>()))>
+R visit(Visitor&& visitor, util::Variant<T...>* v) {
+ return v->template visit_mutable<R>(std::forward<Visitor>(visitor));
+}
+
+/// \brief Get a const reference to the value held by the variant
+///
+/// If the type given as template argument doesn't match, behavior is undefined
+/// (a null pointer will be dereferenced).
+template <typename U, typename... T>
+const U& get(const Variant<T...>& v) {
+ return *v.template get<U>();
+}
+
+/// \brief Get a reference to the value held by the variant
+///
+/// If the type given as template argument doesn't match, behavior is undefined
+/// (a null pointer will be dereferenced).
+template <typename U, typename... T>
+U& get(Variant<T...>& v) {
+ return *v.template get<U>();
+}
+
+/// \brief Get a const pointer to the value held by the variant
+///
+/// If the type given as template argument doesn't match, a null pointer is returned.
+template <typename U, typename... T>
+const U* get_if(const Variant<T...>* v) {
+ return v->template get<U>();
+}
+
+/// \brief Get a pointer to the value held by the variant
+///
+/// If the type given as template argument doesn't match, a null pointer is returned.
+template <typename U, typename... T>
+U* get_if(Variant<T...>* v) {
+ return v->template get<U>();
+}
+
+namespace detail {
+
+template <typename... T>
+struct VariantsEqual {
+ template <typename U>
+ bool operator()(const U& r) const {
+ return get<U>(l_) == r;
+ }
+ const Variant<T...>& l_;
+};
+
+} // namespace detail
+
+template <typename... T, typename = typename std::enable_if<detail::all<
+ detail::is_equality_comparable<T>::value...>::value>>
+bool operator==(const Variant<T...>& l, const Variant<T...>& r) {
+ if (l.index() != r.index()) return false;
+ return visit(detail::VariantsEqual<T...>{l}, r);
+}
+
+template <typename... T>
+auto operator!=(const Variant<T...>& l, const Variant<T...>& r) -> decltype(l == r) {
+ return !(l == r);
+}
+
+/// \brief Return whether the variant holds a value of the given type
+template <typename U, typename... T>
+bool holds_alternative(const Variant<T...>& v) {
+ return v.template get<U>();
+}
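+
+// A minimal usage sketch of the API above (the names `v`, `Stringify`,
+// `is_str` and `repr` are illustrative, not part of this header):
+//
+//   Variant<int, std::string> v{42};  // index() == 0: int is alternative 0
+//   if (const int* i = v.get<int>()) { /* non-null: v currently holds int */ }
+//   v.emplace<std::string>("hello");  // in-place construction; index() == 1
+//   bool is_str = holds_alternative<std::string>(v);  // true
+//
+//   struct Stringify {
+//     std::string operator()(const int& i) const { return std::to_string(i); }
+//     std::string operator()(const std::string& s) const { return s; }
+//   };
+//   std::string repr = visit(Stringify{}, v);  // dispatches on index()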
+
} // namespace util
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
index 041bdb424a7..8bb6f44a4d5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
@@ -17,20 +17,20 @@
#pragma once
-#include <algorithm>
+#include <algorithm>
#include <utility>
#include <vector>
-#include "arrow/result.h"
-#include "arrow/util/algorithm.h"
-#include "arrow/util/functional.h"
+#include "arrow/result.h"
+#include "arrow/util/algorithm.h"
+#include "arrow/util/functional.h"
#include "arrow/util/logging.h"
namespace arrow {
namespace internal {
template <typename T>
-std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
+std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
DCHECK(!values.empty());
DCHECK_LT(index, values.size());
std::vector<T> out;
@@ -45,8 +45,8 @@ std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
}
template <typename T>
-std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
- T new_element) {
+std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
+ T new_element) {
DCHECK_LE(index, values.size());
std::vector<T> out;
out.reserve(values.size() + 1);
@@ -61,8 +61,8 @@ std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
}
template <typename T>
-std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
- T new_element) {
+std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
+ T new_element) {
DCHECK_LE(index, values.size());
std::vector<T> out;
out.reserve(values.size());
@@ -76,97 +76,97 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
return out;
}
-template <typename T, typename Predicate>
-std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
- auto new_end =
- std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
- values.erase(new_end, values.end());
- return values;
-}
-
-template <typename Fn, typename From,
- typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
-std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
- std::vector<To> out;
- out.reserve(source.size());
- std::transform(source.begin(), source.end(), std::back_inserter(out),
- std::forward<Fn>(map));
- return out;
-}
-
-template <typename Fn, typename From,
- typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
-std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
- std::vector<To> out;
- out.reserve(source.size());
- std::transform(std::make_move_iterator(source.begin()),
- std::make_move_iterator(source.end()), std::back_inserter(out),
- std::forward<Fn>(map));
- return out;
-}
-
-/// \brief Like MapVector, but where the function can fail.
-template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
- typename To = typename internal::call_traits::return_type<Fn>::ValueType>
-Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& source) {
- std::vector<To> out;
- out.reserve(source.size());
- ARROW_RETURN_NOT_OK(MaybeTransform(source.begin(), source.end(),
- std::back_inserter(out), std::forward<Fn>(map)));
- return std::move(out);
-}
-
-template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
- typename To = typename internal::call_traits::return_type<Fn>::ValueType>
-Result<std::vector<To>> MaybeMapVector(Fn&& map, std::vector<From>&& source) {
- std::vector<To> out;
- out.reserve(source.size());
- ARROW_RETURN_NOT_OK(MaybeTransform(std::make_move_iterator(source.begin()),
- std::make_move_iterator(source.end()),
- std::back_inserter(out), std::forward<Fn>(map)));
- return std::move(out);
-}
-
-template <typename T>
-std::vector<T> FlattenVectors(const std::vector<std::vector<T>>& vecs) {
- std::size_t sum = 0;
- for (const auto& vec : vecs) {
- sum += vec.size();
- }
- std::vector<T> out;
- out.reserve(sum);
- for (const auto& vec : vecs) {
- out.insert(out.end(), vec.begin(), vec.end());
- }
- return out;
-}
-
-template <typename T>
-Result<std::vector<T>> UnwrapOrRaise(std::vector<Result<T>>&& results) {
- std::vector<T> out;
- out.reserve(results.size());
- auto end = std::make_move_iterator(results.end());
- for (auto it = std::make_move_iterator(results.begin()); it != end; it++) {
- if (!it->ok()) {
- return it->status();
- }
- out.push_back(it->MoveValueUnsafe());
- }
- return std::move(out);
-}
-
-template <typename T>
-Result<std::vector<T>> UnwrapOrRaise(const std::vector<Result<T>>& results) {
- std::vector<T> out;
- out.reserve(results.size());
- for (const auto& result : results) {
- if (!result.ok()) {
- return result.status();
- }
- out.push_back(result.ValueUnsafe());
- }
- return std::move(out);
-}
-
+template <typename T, typename Predicate>
+std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
+ auto new_end =
+ std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+ values.erase(new_end, values.end());
+ return values;
+}
+
+template <typename Fn, typename From,
+ typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ std::transform(source.begin(), source.end(), std::back_inserter(out),
+ std::forward<Fn>(map));
+ return out;
+}
+
+template <typename Fn, typename From,
+ typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ std::transform(std::make_move_iterator(source.begin()),
+ std::make_move_iterator(source.end()), std::back_inserter(out),
+ std::forward<Fn>(map));
+ return out;
+}
+
+/// \brief Like MapVector, but where the function can fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ ARROW_RETURN_NOT_OK(MaybeTransform(source.begin(), source.end(),
+ std::back_inserter(out), std::forward<Fn>(map)));
+ return std::move(out);
+}
+
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, std::vector<From>&& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ ARROW_RETURN_NOT_OK(MaybeTransform(std::make_move_iterator(source.begin()),
+ std::make_move_iterator(source.end()),
+ std::back_inserter(out), std::forward<Fn>(map)));
+ return std::move(out);
+}
+
+template <typename T>
+std::vector<T> FlattenVectors(const std::vector<std::vector<T>>& vecs) {
+ std::size_t sum = 0;
+ for (const auto& vec : vecs) {
+ sum += vec.size();
+ }
+ std::vector<T> out;
+ out.reserve(sum);
+ for (const auto& vec : vecs) {
+ out.insert(out.end(), vec.begin(), vec.end());
+ }
+ return out;
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(std::vector<Result<T>>&& results) {
+ std::vector<T> out;
+ out.reserve(results.size());
+ auto end = std::make_move_iterator(results.end());
+ for (auto it = std::make_move_iterator(results.begin()); it != end; it++) {
+ if (!it->ok()) {
+ return it->status();
+ }
+ out.push_back(it->MoveValueUnsafe());
+ }
+ return std::move(out);
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(const std::vector<Result<T>>& results) {
+ std::vector<T> out;
+ out.reserve(results.size());
+ for (const auto& result : results) {
+ if (!result.ok()) {
+ return result.status();
+ }
+ out.push_back(result.ValueUnsafe());
+ }
+ return std::move(out);
+}
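+
+// A minimal usage sketch of the helpers above (the names `words`, `lengths`
+// and `parsed` are illustrative, not part of this header):
+//
+//   std::vector<std::string> words{"a", "bb", "ccc"};
+//
+//   // MapVector: transform every element, preserving order.
+//   std::vector<size_t> lengths =
+//       MapVector([](const std::string& s) { return s.size(); }, words);
+//
+//   // MaybeMapVector: the callable returns Result<To>; the first error
+//   // short-circuits and is returned as the overall Status.
+//   Result<std::vector<int>> parsed =
+//       MaybeMapVector([](std::string s) -> Result<int> {
+//         if (s.empty()) return Status::Invalid("empty string");
+//         return static_cast<int>(s.size());
+//       }, words);
+//
+//   // FilterVector: erases every element for which the predicate is true,
+//   // so only "a" survives here.
+//   words = FilterVector(std::move(words),
+//                        [](const std::string& s) { return s.size() > 1; });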
+
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
index 2949ac4ab76..6cb5a5e66be 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
@@ -19,13 +19,13 @@
#ifdef _WIN32
-#ifdef max
-#undef max
-#endif
-#ifdef min
-#undef min
-#endif
-
+#ifdef max
+#undef max
+#endif
+#ifdef min
+#undef min
+#endif
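+// (These are defined by windows.h unless NOMINMAX is set, and they break
+// std::min, std::max and std::numeric_limits<T>::max().)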
+
// The Windows API defines macros from *File resolving to either
// *FileA or *FileW. Need to undo them.
#ifdef CopyFile
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
index 0b7cfa1cb16..bdd776bfc4c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
@@ -1,217 +1,217 @@
-// Vendored from git tag v2021.02.15.00
-
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// @author Bo Hu ([email protected])
-// @author Jordan DeLong ([email protected])
-
-// This file has been modified as part of Apache Arrow to conform to
-// Apache Arrow's coding conventions
-
-#pragma once
-
-#include <atomic>
-#include <cassert>
-#include <cstdlib>
-#include <memory>
-#include <stdexcept>
-#include <type_traits>
-#include <utility>
-
-namespace arrow_vendored {
-namespace folly {
-
-// Vendored from folly/Portability.h
-namespace {
-#if defined(__arm__)
-#define FOLLY_ARM 1
-#else
-#define FOLLY_ARM 0
-#endif
-
-#if defined(__s390x__)
-#define FOLLY_S390X 1
-#else
-#define FOLLY_S390X 0
-#endif
-
-constexpr bool kIsArchArm = FOLLY_ARM == 1;
-constexpr bool kIsArchS390X = FOLLY_S390X == 1;
-} // namespace
-
-// Vendored from folly/lang/Align.h
-namespace {
-
-constexpr std::size_t hardware_destructive_interference_size =
- (kIsArchArm || kIsArchS390X) ? 64 : 128;
-
-} // namespace
-
-/*
- * ProducerConsumerQueue is a one producer and one consumer queue
- * without locks.
- */
-template <class T>
-struct ProducerConsumerQueue {
- typedef T value_type;
-
- ProducerConsumerQueue(const ProducerConsumerQueue&) = delete;
- ProducerConsumerQueue& operator=(const ProducerConsumerQueue&) = delete;
-
- // size must be >= 2.
- //
- // Also, note that the number of usable slots in the queue at any
- // given time is actually (size-1), so if you start with an empty queue,
- // IsFull() will return true after size-1 insertions.
- explicit ProducerConsumerQueue(uint32_t size)
- : size_(size),
- records_(static_cast<T*>(std::malloc(sizeof(T) * size))),
- readIndex_(0),
- writeIndex_(0) {
- assert(size >= 2);
- if (!records_) {
- throw std::bad_alloc();
- }
- }
-
- ~ProducerConsumerQueue() {
- // We need to destruct anything that may still exist in our queue.
- // (No real synchronization needed at destructor time: only one
- // thread can be doing this.)
- if (!std::is_trivially_destructible<T>::value) {
- size_t readIndex = readIndex_;
- size_t endIndex = writeIndex_;
- while (readIndex != endIndex) {
- records_[readIndex].~T();
- if (++readIndex == size_) {
- readIndex = 0;
- }
- }
- }
-
- std::free(records_);
- }
-
- template <class... Args>
- bool Write(Args&&... recordArgs) {
- auto const currentWrite = writeIndex_.load(std::memory_order_relaxed);
- auto nextRecord = currentWrite + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
- new (&records_[currentWrite]) T(std::forward<Args>(recordArgs)...);
- writeIndex_.store(nextRecord, std::memory_order_release);
- return true;
- }
-
- // queue is full
- return false;
- }
-
- // move the value at the front of the queue to the given variable
- bool Read(T& record) {
- auto const currentRead = readIndex_.load(std::memory_order_relaxed);
- if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
- // queue is empty
- return false;
- }
-
- auto nextRecord = currentRead + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- record = std::move(records_[currentRead]);
- records_[currentRead].~T();
- readIndex_.store(nextRecord, std::memory_order_release);
- return true;
- }
-
- // pointer to the value at the front of the queue (for use in-place) or
- // nullptr if empty.
- T* FrontPtr() {
- auto const currentRead = readIndex_.load(std::memory_order_relaxed);
- if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
- // queue is empty
- return nullptr;
- }
- return &records_[currentRead];
- }
-
- // queue must not be empty
- void PopFront() {
- auto const currentRead = readIndex_.load(std::memory_order_relaxed);
- assert(currentRead != writeIndex_.load(std::memory_order_acquire));
-
- auto nextRecord = currentRead + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- records_[currentRead].~T();
- readIndex_.store(nextRecord, std::memory_order_release);
- }
-
- bool IsEmpty() const {
- return readIndex_.load(std::memory_order_acquire) ==
- writeIndex_.load(std::memory_order_acquire);
- }
-
- bool IsFull() const {
- auto nextRecord = writeIndex_.load(std::memory_order_acquire) + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
- return false;
- }
- // queue is full
- return true;
- }
-
- // * If called by consumer, then true size may be more (because producer may
- // be adding items concurrently).
- // * If called by producer, then true size may be less (because consumer may
- // be removing items concurrently).
- // * It is undefined to call this from any other thread.
- size_t SizeGuess() const {
- int ret = writeIndex_.load(std::memory_order_acquire) -
- readIndex_.load(std::memory_order_acquire);
- if (ret < 0) {
- ret += size_;
- }
- return ret;
- }
-
- // maximum number of items in the queue.
- size_t capacity() const { return size_ - 1; }
-
- private:
- using AtomicIndex = std::atomic<unsigned int>;
-
- char pad0_[hardware_destructive_interference_size];
- const uint32_t size_;
- T* const records_;
-
- AtomicIndex readIndex_;
- char pad1_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
- AtomicIndex writeIndex_;
-
- char pad2_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
-};
-
-} // namespace folly
-} // namespace arrow_vendored
+// Vendored from git tag v2021.02.15.00
+
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Bo Hu ([email protected])
+// @author Jordan DeLong ([email protected])
+
+// This file has been modified as part of Apache Arrow to conform to
+// Apache Arrow's coding conventions
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace arrow_vendored {
+namespace folly {
+
+// Vendored from folly/Portability.h
+namespace {
+#if defined(__arm__)
+#define FOLLY_ARM 1
+#else
+#define FOLLY_ARM 0
+#endif
+
+#if defined(__s390x__)
+#define FOLLY_S390X 1
+#else
+#define FOLLY_S390X 0
+#endif
+
+constexpr bool kIsArchArm = FOLLY_ARM == 1;
+constexpr bool kIsArchS390X = FOLLY_S390X == 1;
+} // namespace
+
+// Vendored from folly/lang/Align.h
+namespace {
+
+constexpr std::size_t hardware_destructive_interference_size =
+ (kIsArchArm || kIsArchS390X) ? 64 : 128;
+
+} // namespace
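+
+// The queue below pads its read and write indices to this size so that the
+// producer and the consumer do not false-share a cache line.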
+
+/*
+ * ProducerConsumerQueue is a one producer and one consumer queue
+ * without locks.
+ */
+template <class T>
+struct ProducerConsumerQueue {
+ typedef T value_type;
+
+ ProducerConsumerQueue(const ProducerConsumerQueue&) = delete;
+ ProducerConsumerQueue& operator=(const ProducerConsumerQueue&) = delete;
+
+ // size must be >= 2.
+ //
+ // Also, note that the number of usable slots in the queue at any
+ // given time is actually (size-1), so if you start with an empty queue,
+ // IsFull() will return true after size-1 insertions.
+ explicit ProducerConsumerQueue(uint32_t size)
+ : size_(size),
+ records_(static_cast<T*>(std::malloc(sizeof(T) * size))),
+ readIndex_(0),
+ writeIndex_(0) {
+ assert(size >= 2);
+ if (!records_) {
+ throw std::bad_alloc();
+ }
+ }
+
+ ~ProducerConsumerQueue() {
+ // We need to destruct anything that may still exist in our queue.
+ // (No real synchronization needed at destructor time: only one
+ // thread can be doing this.)
+ if (!std::is_trivially_destructible<T>::value) {
+ size_t readIndex = readIndex_;
+ size_t endIndex = writeIndex_;
+ while (readIndex != endIndex) {
+ records_[readIndex].~T();
+ if (++readIndex == size_) {
+ readIndex = 0;
+ }
+ }
+ }
+
+ std::free(records_);
+ }
+
+ template <class... Args>
+ bool Write(Args&&... recordArgs) {
+ auto const currentWrite = writeIndex_.load(std::memory_order_relaxed);
+ auto nextRecord = currentWrite + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+ new (&records_[currentWrite]) T(std::forward<Args>(recordArgs)...);
+ writeIndex_.store(nextRecord, std::memory_order_release);
+ return true;
+ }
+
+ // queue is full
+ return false;
+ }
+
+ // move the value at the front of the queue to the given variable
+ bool Read(T& record) {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+ // queue is empty
+ return false;
+ }
+
+ auto nextRecord = currentRead + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ record = std::move(records_[currentRead]);
+ records_[currentRead].~T();
+ readIndex_.store(nextRecord, std::memory_order_release);
+ return true;
+ }
+
+ // pointer to the value at the front of the queue (for use in-place) or
+ // nullptr if empty.
+ T* FrontPtr() {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+ // queue is empty
+ return nullptr;
+ }
+ return &records_[currentRead];
+ }
+
+ // queue must not be empty
+ void PopFront() {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ assert(currentRead != writeIndex_.load(std::memory_order_acquire));
+
+ auto nextRecord = currentRead + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ records_[currentRead].~T();
+ readIndex_.store(nextRecord, std::memory_order_release);
+ }
+
+ bool IsEmpty() const {
+ return readIndex_.load(std::memory_order_acquire) ==
+ writeIndex_.load(std::memory_order_acquire);
+ }
+
+ bool IsFull() const {
+ auto nextRecord = writeIndex_.load(std::memory_order_acquire) + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+ return false;
+ }
+ // queue is full
+ return true;
+ }
+
+ // * If called by consumer, then true size may be more (because producer may
+ // be adding items concurrently).
+ // * If called by producer, then true size may be less (because consumer may
+ // be removing items concurrently).
+ // * It is undefined to call this from any other thread.
+ size_t SizeGuess() const {
+ int ret = writeIndex_.load(std::memory_order_acquire) -
+ readIndex_.load(std::memory_order_acquire);
+ if (ret < 0) {
+ ret += size_;
+ }
+ return ret;
+ }
+
+ // maximum number of items in the queue.
+ size_t capacity() const { return size_ - 1; }
+
+ private:
+ using AtomicIndex = std::atomic<unsigned int>;
+
+ char pad0_[hardware_destructive_interference_size];
+ const uint32_t size_;
+ T* const records_;
+
+ AtomicIndex readIndex_;
+ char pad1_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+ AtomicIndex writeIndex_;
+
+ char pad2_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+};
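+
+// A minimal usage sketch (one producer thread, one consumer thread; the
+// name `queue` and the spin loops are illustrative, not part of this file):
+//
+//   ProducerConsumerQueue<int> queue(1024);  // capacity() == 1023
+//
+//   // Producer thread: Write() returns false while the queue is full.
+//   while (!queue.Write(42)) { /* spin or back off */ }
+//
+//   // Consumer thread: Read() returns false while the queue is empty.
+//   int value;
+//   while (!queue.Read(value)) { /* spin or back off */ }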
+
+} // namespace folly
+} // namespace arrow_vendored
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
index 7f6426ac765..6430a57af29 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
@@ -58,8 +58,8 @@
# define PSNIP_SAFE__FUNCTION PSNIP_SAFE__COMPILER_ATTRIBUTES static PSNIP_SAFE__INLINE
#endif
-// !defined(__cplusplus) added for Solaris support
-#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+// !defined(__cplusplus) added for Solaris support
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
# define psnip_safe_bool _Bool
#else
# define psnip_safe_bool int
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
index 851785081c7..83d4de210d3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
@@ -67,7 +67,7 @@ ARRAY_VISITOR_DEFAULT(SparseUnionArray)
ARRAY_VISITOR_DEFAULT(DenseUnionArray)
ARRAY_VISITOR_DEFAULT(DictionaryArray)
ARRAY_VISITOR_DEFAULT(Decimal128Array)
-ARRAY_VISITOR_DEFAULT(Decimal256Array)
+ARRAY_VISITOR_DEFAULT(Decimal256Array)
ARRAY_VISITOR_DEFAULT(ExtensionArray)
#undef ARRAY_VISITOR_DEFAULT
@@ -107,7 +107,7 @@ TYPE_VISITOR_DEFAULT(DayTimeIntervalType)
TYPE_VISITOR_DEFAULT(MonthIntervalType)
TYPE_VISITOR_DEFAULT(DurationType)
TYPE_VISITOR_DEFAULT(Decimal128Type)
-TYPE_VISITOR_DEFAULT(Decimal256Type)
+TYPE_VISITOR_DEFAULT(Decimal256Type)
TYPE_VISITOR_DEFAULT(ListType)
TYPE_VISITOR_DEFAULT(LargeListType)
TYPE_VISITOR_DEFAULT(MapType)
@@ -156,7 +156,7 @@ SCALAR_VISITOR_DEFAULT(DayTimeIntervalScalar)
SCALAR_VISITOR_DEFAULT(MonthIntervalScalar)
SCALAR_VISITOR_DEFAULT(DurationScalar)
SCALAR_VISITOR_DEFAULT(Decimal128Scalar)
-SCALAR_VISITOR_DEFAULT(Decimal256Scalar)
+SCALAR_VISITOR_DEFAULT(Decimal256Scalar)
SCALAR_VISITOR_DEFAULT(ListScalar)
SCALAR_VISITOR_DEFAULT(LargeListScalar)
SCALAR_VISITOR_DEFAULT(MapScalar)
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
index 0382e461199..fe49f51ce3d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
@@ -54,7 +54,7 @@ class ARROW_EXPORT ArrayVisitor {
virtual Status Visit(const MonthIntervalArray& array);
virtual Status Visit(const DurationArray& array);
virtual Status Visit(const Decimal128Array& array);
- virtual Status Visit(const Decimal256Array& array);
+ virtual Status Visit(const Decimal256Array& array);
virtual Status Visit(const ListArray& array);
virtual Status Visit(const LargeListArray& array);
virtual Status Visit(const MapArray& array);
@@ -97,7 +97,7 @@ class ARROW_EXPORT TypeVisitor {
virtual Status Visit(const DayTimeIntervalType& type);
virtual Status Visit(const DurationType& type);
virtual Status Visit(const Decimal128Type& type);
- virtual Status Visit(const Decimal256Type& type);
+ virtual Status Visit(const Decimal256Type& type);
virtual Status Visit(const ListType& type);
virtual Status Visit(const LargeListType& type);
virtual Status Visit(const MapType& type);
@@ -140,7 +140,7 @@ class ARROW_EXPORT ScalarVisitor {
virtual Status Visit(const MonthIntervalScalar& scalar);
virtual Status Visit(const DurationScalar& scalar);
virtual Status Visit(const Decimal128Scalar& scalar);
- virtual Status Visit(const Decimal256Scalar& scalar);
+ virtual Status Visit(const Decimal256Scalar& scalar);
virtual Status Visit(const ListScalar& scalar);
virtual Status Visit(const LargeListScalar& scalar);
virtual Status Visit(const MapScalar& scalar);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h b/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
index 132c35aeaa1..69d443b48cd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
@@ -68,7 +68,7 @@ namespace arrow {
ACTION(MonthInterval); \
ACTION(DayTimeInterval); \
ACTION(Decimal128); \
- ACTION(Decimal256); \
+ ACTION(Decimal256); \
ACTION(List); \
ACTION(LargeList); \
ACTION(Map); \
@@ -199,9 +199,9 @@ struct ArrayDataInlineVisitor<T, enable_if_base_binary<T>> {
using offset_type = typename T::offset_type;
constexpr char empty_value = 0;
- if (arr.length == 0) {
- return Status::OK();
- }
+ if (arr.length == 0) {
+ return Status::OK();
+ }
const offset_type* offsets = arr.GetValues<offset_type>(1);
const char* data;
if (!arr.buffers[2]) {
@@ -232,9 +232,9 @@ struct ArrayDataInlineVisitor<T, enable_if_base_binary<T>> {
using offset_type = typename T::offset_type;
constexpr uint8_t empty_value = 0;
- if (arr.length == 0) {
- return;
- }
+ if (arr.length == 0) {
+ return;
+ }
const offset_type* offsets = arr.GetValues<offset_type>(1);
const uint8_t* data;
if (!arr.buffers[2]) {
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
index b1b4ce62673..5df101f4369 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
@@ -1,17 +1,17 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#include "parquet_constants.h"
-
-namespace parquet { namespace format {
-
-const parquetConstants g_parquet_constants;
-
-parquetConstants::parquetConstants() {
-}
-
-}} // namespace
-
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#include "parquet_constants.h"
+
+namespace parquet { namespace format {
+
+const parquetConstants g_parquet_constants;
+
+parquetConstants::parquetConstants() {
+}
+
+}} // namespace
+
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
index 1e288c7cd1f..98df7236774 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
@@ -1,24 +1,24 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#ifndef parquet_CONSTANTS_H
-#define parquet_CONSTANTS_H
-
-#include "parquet_types.h"
-
-namespace parquet { namespace format {
-
-class parquetConstants {
- public:
- parquetConstants();
-
-};
-
-extern const parquetConstants g_parquet_constants;
-
-}} // namespace
-
-#endif
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#ifndef parquet_CONSTANTS_H
+#define parquet_CONSTANTS_H
+
+#include "parquet_types.h"
+
+namespace parquet { namespace format {
+
+class parquetConstants {
+ public:
+ parquetConstants();
+
+};
+
+extern const parquetConstants g_parquet_constants;
+
+}} // namespace
+
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
index 7c7289658ee..ca55e9ab0ae 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
@@ -1,7415 +1,7415 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#include "parquet_types.h"
-
-#include <algorithm>
-#include <ostream>
-
-#include <thrift/TToString.h>
-
-namespace parquet { namespace format {
-
-int _kTypeValues[] = {
- Type::BOOLEAN,
- Type::INT32,
- Type::INT64,
- Type::INT96,
- Type::FLOAT,
- Type::DOUBLE,
- Type::BYTE_ARRAY,
- Type::FIXED_LEN_BYTE_ARRAY
-};
-const char* _kTypeNames[] = {
- "BOOLEAN",
- "INT32",
- "INT64",
- "INT96",
- "FLOAT",
- "DOUBLE",
- "BYTE_ARRAY",
- "FIXED_LEN_BYTE_ARRAY"
-};
-const std::map<int, const char*> _Type_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kTypeValues, _kTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const Type::type& val) {
- std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
- if (it != _Type_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const Type::type& val) {
- std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
- if (it != _Type_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kConvertedTypeValues[] = {
- ConvertedType::UTF8,
- ConvertedType::MAP,
- ConvertedType::MAP_KEY_VALUE,
- ConvertedType::LIST,
- ConvertedType::ENUM,
- ConvertedType::DECIMAL,
- ConvertedType::DATE,
- ConvertedType::TIME_MILLIS,
- ConvertedType::TIME_MICROS,
- ConvertedType::TIMESTAMP_MILLIS,
- ConvertedType::TIMESTAMP_MICROS,
- ConvertedType::UINT_8,
- ConvertedType::UINT_16,
- ConvertedType::UINT_32,
- ConvertedType::UINT_64,
- ConvertedType::INT_8,
- ConvertedType::INT_16,
- ConvertedType::INT_32,
- ConvertedType::INT_64,
- ConvertedType::JSON,
- ConvertedType::BSON,
- ConvertedType::INTERVAL
-};
-const char* _kConvertedTypeNames[] = {
- "UTF8",
- "MAP",
- "MAP_KEY_VALUE",
- "LIST",
- "ENUM",
- "DECIMAL",
- "DATE",
- "TIME_MILLIS",
- "TIME_MICROS",
- "TIMESTAMP_MILLIS",
- "TIMESTAMP_MICROS",
- "UINT_8",
- "UINT_16",
- "UINT_32",
- "UINT_64",
- "INT_8",
- "INT_16",
- "INT_32",
- "INT_64",
- "JSON",
- "BSON",
- "INTERVAL"
-};
-const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(22, _kConvertedTypeValues, _kConvertedTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val) {
- std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
- if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const ConvertedType::type& val) {
- std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
- if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kFieldRepetitionTypeValues[] = {
- FieldRepetitionType::REQUIRED,
- FieldRepetitionType::OPTIONAL,
- FieldRepetitionType::REPEATED
-};
-const char* _kFieldRepetitionTypeNames[] = {
- "REQUIRED",
- "OPTIONAL",
- "REPEATED"
-};
-const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val) {
- std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
- if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const FieldRepetitionType::type& val) {
- std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
- if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kEncodingValues[] = {
- Encoding::PLAIN,
- Encoding::PLAIN_DICTIONARY,
- Encoding::RLE,
- Encoding::BIT_PACKED,
- Encoding::DELTA_BINARY_PACKED,
- Encoding::DELTA_LENGTH_BYTE_ARRAY,
- Encoding::DELTA_BYTE_ARRAY,
- Encoding::RLE_DICTIONARY,
- Encoding::BYTE_STREAM_SPLIT
-};
-const char* _kEncodingNames[] = {
- "PLAIN",
- "PLAIN_DICTIONARY",
- "RLE",
- "BIT_PACKED",
- "DELTA_BINARY_PACKED",
- "DELTA_LENGTH_BYTE_ARRAY",
- "DELTA_BYTE_ARRAY",
- "RLE_DICTIONARY",
- "BYTE_STREAM_SPLIT"
-};
-const std::map<int, const char*> _Encoding_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(9, _kEncodingValues, _kEncodingNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const Encoding::type& val) {
- std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
- if (it != _Encoding_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const Encoding::type& val) {
- std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
- if (it != _Encoding_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kCompressionCodecValues[] = {
- CompressionCodec::UNCOMPRESSED,
- CompressionCodec::SNAPPY,
- CompressionCodec::GZIP,
- CompressionCodec::LZO,
- CompressionCodec::BROTLI,
- CompressionCodec::LZ4,
- CompressionCodec::ZSTD,
- CompressionCodec::LZ4_RAW
-};
-const char* _kCompressionCodecNames[] = {
- "UNCOMPRESSED",
- "SNAPPY",
- "GZIP",
- "LZO",
- "BROTLI",
- "LZ4",
- "ZSTD",
- "LZ4_RAW"
-};
-const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kCompressionCodecValues, _kCompressionCodecNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val) {
- std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
- if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const CompressionCodec::type& val) {
- std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
- if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kPageTypeValues[] = {
- PageType::DATA_PAGE,
- PageType::INDEX_PAGE,
- PageType::DICTIONARY_PAGE,
- PageType::DATA_PAGE_V2
-};
-const char* _kPageTypeNames[] = {
- "DATA_PAGE",
- "INDEX_PAGE",
- "DICTIONARY_PAGE",
- "DATA_PAGE_V2"
-};
-const std::map<int, const char*> _PageType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(4, _kPageTypeValues, _kPageTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const PageType::type& val) {
- std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
- if (it != _PageType_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const PageType::type& val) {
- std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
- if (it != _PageType_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kBoundaryOrderValues[] = {
- BoundaryOrder::UNORDERED,
- BoundaryOrder::ASCENDING,
- BoundaryOrder::DESCENDING
-};
-const char* _kBoundaryOrderNames[] = {
- "UNORDERED",
- "ASCENDING",
- "DESCENDING"
-};
-const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kBoundaryOrderValues, _kBoundaryOrderNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val) {
- std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
- if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const BoundaryOrder::type& val) {
- std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
- if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-
-Statistics::~Statistics() noexcept {
-}
-
-
-void Statistics::__set_max(const std::string& val) {
- this->max = val;
-__isset.max = true;
-}
-
-void Statistics::__set_min(const std::string& val) {
- this->min = val;
-__isset.min = true;
-}
-
-void Statistics::__set_null_count(const int64_t val) {
- this->null_count = val;
-__isset.null_count = true;
-}
-
-void Statistics::__set_distinct_count(const int64_t val) {
- this->distinct_count = val;
-__isset.distinct_count = true;
-}
-
-void Statistics::__set_max_value(const std::string& val) {
- this->max_value = val;
-__isset.max_value = true;
-}
-
-void Statistics::__set_min_value(const std::string& val) {
- this->min_value = val;
-__isset.min_value = true;
-}
-std::ostream& operator<<(std::ostream& out, const Statistics& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->max);
- this->__isset.max = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->min);
- this->__isset.min = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->null_count);
- this->__isset.null_count = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->distinct_count);
- this->__isset.distinct_count = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->max_value);
- this->__isset.max_value = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->min_value);
- this->__isset.min_value = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("Statistics");
-
- if (this->__isset.max) {
- xfer += oprot->writeFieldBegin("max", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeBinary(this->max);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.min) {
- xfer += oprot->writeFieldBegin("min", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->min);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.null_count) {
- xfer += oprot->writeFieldBegin("null_count", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->null_count);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.distinct_count) {
- xfer += oprot->writeFieldBegin("distinct_count", ::apache::thrift::protocol::T_I64, 4);
- xfer += oprot->writeI64(this->distinct_count);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.max_value) {
- xfer += oprot->writeFieldBegin("max_value", ::apache::thrift::protocol::T_STRING, 5);
- xfer += oprot->writeBinary(this->max_value);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.min_value) {
- xfer += oprot->writeFieldBegin("min_value", ::apache::thrift::protocol::T_STRING, 6);
- xfer += oprot->writeBinary(this->min_value);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(Statistics &a, Statistics &b) {
- using ::std::swap;
- swap(a.max, b.max);
- swap(a.min, b.min);
- swap(a.null_count, b.null_count);
- swap(a.distinct_count, b.distinct_count);
- swap(a.max_value, b.max_value);
- swap(a.min_value, b.min_value);
- swap(a.__isset, b.__isset);
-}
-
-Statistics::Statistics(const Statistics& other0) {
- max = other0.max;
- min = other0.min;
- null_count = other0.null_count;
- distinct_count = other0.distinct_count;
- max_value = other0.max_value;
- min_value = other0.min_value;
- __isset = other0.__isset;
-}
-Statistics& Statistics::operator=(const Statistics& other1) {
- max = other1.max;
- min = other1.min;
- null_count = other1.null_count;
- distinct_count = other1.distinct_count;
- max_value = other1.max_value;
- min_value = other1.min_value;
- __isset = other1.__isset;
- return *this;
-}
-void Statistics::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "Statistics(";
- out << "max="; (__isset.max ? (out << to_string(max)) : (out << "<null>"));
- out << ", " << "min="; (__isset.min ? (out << to_string(min)) : (out << "<null>"));
- out << ", " << "null_count="; (__isset.null_count ? (out << to_string(null_count)) : (out << "<null>"));
- out << ", " << "distinct_count="; (__isset.distinct_count ? (out << to_string(distinct_count)) : (out << "<null>"));
- out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "<null>"));
- out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "<null>"));
- out << ")";
-}
-
-
-StringType::~StringType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const StringType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t StringType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t StringType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("StringType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(StringType &a, StringType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-StringType::StringType(const StringType& other2) {
- (void) other2;
-}
-StringType& StringType::operator=(const StringType& other3) {
- (void) other3;
- return *this;
-}
-void StringType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "StringType(";
- out << ")";
-}
-
-
-UUIDType::~UUIDType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const UUIDType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t UUIDType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t UUIDType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("UUIDType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(UUIDType &a, UUIDType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-UUIDType::UUIDType(const UUIDType& other4) {
- (void) other4;
-}
-UUIDType& UUIDType::operator=(const UUIDType& other5) {
- (void) other5;
- return *this;
-}
-void UUIDType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "UUIDType(";
- out << ")";
-}
-
-
-MapType::~MapType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const MapType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t MapType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t MapType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("MapType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(MapType &a, MapType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-MapType::MapType(const MapType& other6) {
- (void) other6;
-}
-MapType& MapType::operator=(const MapType& other7) {
- (void) other7;
- return *this;
-}
-void MapType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "MapType(";
- out << ")";
-}
-
-
-ListType::~ListType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const ListType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ListType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t ListType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ListType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ListType &a, ListType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-ListType::ListType(const ListType& other8) {
- (void) other8;
-}
-ListType& ListType::operator=(const ListType& other9) {
- (void) other9;
- return *this;
-}
-void ListType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ListType(";
- out << ")";
-}
-
-
-EnumType::~EnumType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const EnumType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EnumType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t EnumType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EnumType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EnumType &a, EnumType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-EnumType::EnumType(const EnumType& other10) {
- (void) other10;
-}
-EnumType& EnumType::operator=(const EnumType& other11) {
- (void) other11;
- return *this;
-}
-void EnumType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EnumType(";
- out << ")";
-}
-
-
-DateType::~DateType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const DateType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DateType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t DateType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DateType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DateType &a, DateType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-DateType::DateType(const DateType& other12) {
- (void) other12;
-}
-DateType& DateType::operator=(const DateType& other13) {
- (void) other13;
- return *this;
-}
-void DateType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DateType(";
- out << ")";
-}
-
-
-NullType::~NullType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const NullType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t NullType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t NullType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("NullType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(NullType &a, NullType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-NullType::NullType(const NullType& other14) {
- (void) other14;
-}
-NullType& NullType::operator=(const NullType& other15) {
- (void) other15;
- return *this;
-}
-void NullType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "NullType(";
- out << ")";
-}
-
-
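-// [editorial note, not part of the generated file] DecimalType annotates a
-// primitive column as DECIMAL(precision, scale). Both fields are required
-// in parquet.thrift, which is why read() tracks isset_scale /
-// isset_precision and throws TProtocolException::INVALID_DATA when either
-// is missing from the wire.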
-DecimalType::~DecimalType() noexcept {
-}
-
-
-void DecimalType::__set_scale(const int32_t val) {
- this->scale = val;
-}
-
-void DecimalType::__set_precision(const int32_t val) {
- this->precision = val;
-}
-std::ostream& operator<<(std::ostream& out, const DecimalType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DecimalType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_scale = false;
- bool isset_precision = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->scale);
- isset_scale = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->precision);
- isset_precision = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_scale)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_precision)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DecimalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DecimalType");
-
- xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->scale);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->precision);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DecimalType &a, DecimalType &b) {
- using ::std::swap;
- swap(a.scale, b.scale);
- swap(a.precision, b.precision);
-}
-
-DecimalType::DecimalType(const DecimalType& other16) {
- scale = other16.scale;
- precision = other16.precision;
-}
-DecimalType& DecimalType::operator=(const DecimalType& other17) {
- scale = other17.scale;
- precision = other17.precision;
- return *this;
-}
-void DecimalType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DecimalType(";
- out << "scale=" << to_string(scale);
- out << ", " << "precision=" << to_string(precision);
- out << ")";
-}
-
-
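-// [editorial note, not part of the generated file] MilliSeconds,
-// MicroSeconds and NanoSeconds are three more empty marker structs;
-// exactly one of them is carried inside TimeUnit to select the resolution
-// of TIME and TIMESTAMP columns.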
-MilliSeconds::~MilliSeconds() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t MilliSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t MilliSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("MilliSeconds");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(MilliSeconds &a, MilliSeconds &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-MilliSeconds::MilliSeconds(const MilliSeconds& other18) {
- (void) other18;
-}
-MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other19) {
- (void) other19;
- return *this;
-}
-void MilliSeconds::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "MilliSeconds(";
- out << ")";
-}
-
-
-MicroSeconds::~MicroSeconds() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t MicroSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t MicroSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("MicroSeconds");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(MicroSeconds &a, MicroSeconds &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-MicroSeconds::MicroSeconds(const MicroSeconds& other20) {
- (void) other20;
-}
-MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other21) {
- (void) other21;
- return *this;
-}
-void MicroSeconds::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "MicroSeconds(";
- out << ")";
-}
-
-
-NanoSeconds::~NanoSeconds() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t NanoSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t NanoSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("NanoSeconds");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(NanoSeconds &a, NanoSeconds &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-NanoSeconds::NanoSeconds(const NanoSeconds& other22) {
- (void) other22;
-}
-NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other23) {
- (void) other23;
- return *this;
-}
-void NanoSeconds::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "NanoSeconds(";
- out << ")";
-}
-
-
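-// [editorial note, not part of the generated file] TimeUnit is declared as
-// a union in parquet.thrift; the C++ generator emulates unions with a
-// plain struct plus an __isset bitfield, so callers are responsible for
-// setting exactly one of MILLIS / MICROS / NANOS via the __set_* helpers.
-//
-// A minimal sketch of hypothetical caller code:
-//   TimeUnit unit;
-//   unit.__set_MICROS(MicroSeconds());  // also flips __isset.MICROS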
-TimeUnit::~TimeUnit() noexcept {
-}
-
-
-void TimeUnit::__set_MILLIS(const MilliSeconds& val) {
-  this->MILLIS = val;
-  __isset.MILLIS = true;
-}
-
-void TimeUnit::__set_MICROS(const MicroSeconds& val) {
-  this->MICROS = val;
-  __isset.MICROS = true;
-}
-
-void TimeUnit::__set_NANOS(const NanoSeconds& val) {
-  this->NANOS = val;
-  __isset.NANOS = true;
-}
-std::ostream& operator<<(std::ostream& out, const TimeUnit& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TimeUnit::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->MILLIS.read(iprot);
- this->__isset.MILLIS = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->MICROS.read(iprot);
- this->__isset.MICROS = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->NANOS.read(iprot);
- this->__isset.NANOS = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t TimeUnit::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TimeUnit");
-
- if (this->__isset.MILLIS) {
- xfer += oprot->writeFieldBegin("MILLIS", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->MILLIS.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.MICROS) {
- xfer += oprot->writeFieldBegin("MICROS", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->MICROS.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.NANOS) {
- xfer += oprot->writeFieldBegin("NANOS", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->NANOS.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TimeUnit &a, TimeUnit &b) {
- using ::std::swap;
- swap(a.MILLIS, b.MILLIS);
- swap(a.MICROS, b.MICROS);
- swap(a.NANOS, b.NANOS);
- swap(a.__isset, b.__isset);
-}
-
-TimeUnit::TimeUnit(const TimeUnit& other24) {
- MILLIS = other24.MILLIS;
- MICROS = other24.MICROS;
- NANOS = other24.NANOS;
- __isset = other24.__isset;
-}
-TimeUnit& TimeUnit::operator=(const TimeUnit& other25) {
- MILLIS = other25.MILLIS;
- MICROS = other25.MICROS;
- NANOS = other25.NANOS;
- __isset = other25.__isset;
- return *this;
-}
-void TimeUnit::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TimeUnit(";
- out << "MILLIS="; (__isset.MILLIS ? (out << to_string(MILLIS)) : (out << "<null>"));
- out << ", " << "MICROS="; (__isset.MICROS ? (out << to_string(MICROS)) : (out << "<null>"));
- out << ", " << "NANOS="; (__isset.NANOS ? (out << to_string(NANOS)) : (out << "<null>"));
- out << ")";
-}
-
-
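-// [editorial note, not part of the generated file] TimestampType pairs the
-// required isAdjustedToUTC flag (per the Parquet spec: true for UTC-based
-// instant semantics, false for zone-less local semantics) with a required
-// TimeUnit. TimeType below has the same shape for time-of-day values.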
-TimestampType::~TimestampType() noexcept {
-}
-
-
-void TimestampType::__set_isAdjustedToUTC(const bool val) {
- this->isAdjustedToUTC = val;
-}
-
-void TimestampType::__set_unit(const TimeUnit& val) {
- this->unit = val;
-}
-std::ostream& operator<<(std::ostream& out, const TimestampType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TimestampType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_isAdjustedToUTC = false;
- bool isset_unit = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->isAdjustedToUTC);
- isset_isAdjustedToUTC = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->unit.read(iprot);
- isset_unit = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_isAdjustedToUTC)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_unit)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t TimestampType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TimestampType");
-
- xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
- xfer += oprot->writeBool(this->isAdjustedToUTC);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->unit.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TimestampType &a, TimestampType &b) {
- using ::std::swap;
- swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
- swap(a.unit, b.unit);
-}
-
-TimestampType::TimestampType(const TimestampType& other26) {
- isAdjustedToUTC = other26.isAdjustedToUTC;
- unit = other26.unit;
-}
-TimestampType& TimestampType::operator=(const TimestampType& other27) {
- isAdjustedToUTC = other27.isAdjustedToUTC;
- unit = other27.unit;
- return *this;
-}
-void TimestampType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TimestampType(";
- out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
- out << ", " << "unit=" << to_string(unit);
- out << ")";
-}
-
-
-TimeType::~TimeType() noexcept {
-}
-
-
-void TimeType::__set_isAdjustedToUTC(const bool val) {
- this->isAdjustedToUTC = val;
-}
-
-void TimeType::__set_unit(const TimeUnit& val) {
- this->unit = val;
-}
-std::ostream& operator<<(std::ostream& out, const TimeType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TimeType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_isAdjustedToUTC = false;
- bool isset_unit = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->isAdjustedToUTC);
- isset_isAdjustedToUTC = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->unit.read(iprot);
- isset_unit = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_isAdjustedToUTC)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_unit)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t TimeType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TimeType");
-
- xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
- xfer += oprot->writeBool(this->isAdjustedToUTC);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->unit.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TimeType &a, TimeType &b) {
- using ::std::swap;
- swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
- swap(a.unit, b.unit);
-}
-
-TimeType::TimeType(const TimeType& other28) {
- isAdjustedToUTC = other28.isAdjustedToUTC;
- unit = other28.unit;
-}
-TimeType& TimeType::operator=(const TimeType& other29) {
- isAdjustedToUTC = other29.isAdjustedToUTC;
- unit = other29.unit;
- return *this;
-}
-void TimeType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TimeType(";
- out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
- out << ", " << "unit=" << to_string(unit);
- out << ")";
-}
-
-
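-// [editorial note, not part of the generated file] IntType describes the
-// INTEGER logical annotation: bitWidth travels as a single T_BYTE (the
-// spec allows 8, 16, 32 or 64) and isSigned distinguishes signed from
-// unsigned columns. Both fields are required, hence the INVALID_DATA
-// checks at the end of read().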
-IntType::~IntType() noexcept {
-}
-
-
-void IntType::__set_bitWidth(const int8_t val) {
- this->bitWidth = val;
-}
-
-void IntType::__set_isSigned(const bool val) {
- this->isSigned = val;
-}
-std::ostream& operator<<(std::ostream& out, const IntType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t IntType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_bitWidth = false;
- bool isset_isSigned = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_BYTE) {
- xfer += iprot->readByte(this->bitWidth);
- isset_bitWidth = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->isSigned);
- isset_isSigned = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_bitWidth)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_isSigned)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t IntType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("IntType");
-
- xfer += oprot->writeFieldBegin("bitWidth", ::apache::thrift::protocol::T_BYTE, 1);
- xfer += oprot->writeByte(this->bitWidth);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("isSigned", ::apache::thrift::protocol::T_BOOL, 2);
- xfer += oprot->writeBool(this->isSigned);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(IntType &a, IntType &b) {
- using ::std::swap;
- swap(a.bitWidth, b.bitWidth);
- swap(a.isSigned, b.isSigned);
-}
-
-IntType::IntType(const IntType& other30) {
- bitWidth = other30.bitWidth;
- isSigned = other30.isSigned;
-}
-IntType& IntType::operator=(const IntType& other31) {
- bitWidth = other31.bitWidth;
- isSigned = other31.isSigned;
- return *this;
-}
-void IntType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "IntType(";
- out << "bitWidth=" << to_string(bitWidth);
- out << ", " << "isSigned=" << to_string(isSigned);
- out << ")";
-}
-
-
-JsonType::~JsonType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const JsonType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t JsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t JsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("JsonType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(JsonType &a, JsonType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-JsonType::JsonType(const JsonType& other32) {
- (void) other32;
-}
-JsonType& JsonType::operator=(const JsonType& other33) {
- (void) other33;
- return *this;
-}
-void JsonType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "JsonType(";
- out << ")";
-}
-
-
-BsonType::~BsonType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const BsonType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BsonType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BsonType &a, BsonType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-BsonType::BsonType(const BsonType& other34) {
- (void) other34;
-}
-BsonType& BsonType::operator=(const BsonType& other35) {
- (void) other35;
- return *this;
-}
-void BsonType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BsonType(";
- out << ")";
-}
-
-
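-// [editorial note, not part of the generated file] LogicalType is the
-// union of all logical-type annotations in parquet.thrift. Note that the
-// field ids jump from 8 (TIMESTAMP) to 10 (INTEGER): id 9 appears to be
-// reserved (for the never-finalized INTERVAL member), so the switch in
-// read() has no case 9 and such a field would simply be skipped.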
-LogicalType::~LogicalType() noexcept {
-}
-
-
-void LogicalType::__set_STRING(const StringType& val) {
-  this->STRING = val;
-  __isset.STRING = true;
-}
-
-void LogicalType::__set_MAP(const MapType& val) {
-  this->MAP = val;
-  __isset.MAP = true;
-}
-
-void LogicalType::__set_LIST(const ListType& val) {
-  this->LIST = val;
-  __isset.LIST = true;
-}
-
-void LogicalType::__set_ENUM(const EnumType& val) {
-  this->ENUM = val;
-  __isset.ENUM = true;
-}
-
-void LogicalType::__set_DECIMAL(const DecimalType& val) {
-  this->DECIMAL = val;
-  __isset.DECIMAL = true;
-}
-
-void LogicalType::__set_DATE(const DateType& val) {
-  this->DATE = val;
-  __isset.DATE = true;
-}
-
-void LogicalType::__set_TIME(const TimeType& val) {
-  this->TIME = val;
-  __isset.TIME = true;
-}
-
-void LogicalType::__set_TIMESTAMP(const TimestampType& val) {
-  this->TIMESTAMP = val;
-  __isset.TIMESTAMP = true;
-}
-
-void LogicalType::__set_INTEGER(const IntType& val) {
-  this->INTEGER = val;
-  __isset.INTEGER = true;
-}
-
-void LogicalType::__set_UNKNOWN(const NullType& val) {
-  this->UNKNOWN = val;
-  __isset.UNKNOWN = true;
-}
-
-void LogicalType::__set_JSON(const JsonType& val) {
-  this->JSON = val;
-  __isset.JSON = true;
-}
-
-void LogicalType::__set_BSON(const BsonType& val) {
-  this->BSON = val;
-  __isset.BSON = true;
-}
-
-void LogicalType::__set_UUID(const UUIDType& val) {
-  this->UUID = val;
-  __isset.UUID = true;
-}
-std::ostream& operator<<(std::ostream& out, const LogicalType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t LogicalType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->STRING.read(iprot);
- this->__isset.STRING = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->MAP.read(iprot);
- this->__isset.MAP = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->LIST.read(iprot);
- this->__isset.LIST = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->ENUM.read(iprot);
- this->__isset.ENUM = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->DECIMAL.read(iprot);
- this->__isset.DECIMAL = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->DATE.read(iprot);
- this->__isset.DATE = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->TIME.read(iprot);
- this->__isset.TIME = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->TIMESTAMP.read(iprot);
- this->__isset.TIMESTAMP = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 10:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->INTEGER.read(iprot);
- this->__isset.INTEGER = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 11:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->UNKNOWN.read(iprot);
- this->__isset.UNKNOWN = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 12:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->JSON.read(iprot);
- this->__isset.JSON = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 13:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->BSON.read(iprot);
- this->__isset.BSON = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 14:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->UUID.read(iprot);
- this->__isset.UUID = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t LogicalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("LogicalType");
-
- if (this->__isset.STRING) {
- xfer += oprot->writeFieldBegin("STRING", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->STRING.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.MAP) {
- xfer += oprot->writeFieldBegin("MAP", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->MAP.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.LIST) {
- xfer += oprot->writeFieldBegin("LIST", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->LIST.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.ENUM) {
- xfer += oprot->writeFieldBegin("ENUM", ::apache::thrift::protocol::T_STRUCT, 4);
- xfer += this->ENUM.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.DECIMAL) {
- xfer += oprot->writeFieldBegin("DECIMAL", ::apache::thrift::protocol::T_STRUCT, 5);
- xfer += this->DECIMAL.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.DATE) {
- xfer += oprot->writeFieldBegin("DATE", ::apache::thrift::protocol::T_STRUCT, 6);
- xfer += this->DATE.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.TIME) {
- xfer += oprot->writeFieldBegin("TIME", ::apache::thrift::protocol::T_STRUCT, 7);
- xfer += this->TIME.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.TIMESTAMP) {
- xfer += oprot->writeFieldBegin("TIMESTAMP", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->TIMESTAMP.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.INTEGER) {
- xfer += oprot->writeFieldBegin("INTEGER", ::apache::thrift::protocol::T_STRUCT, 10);
- xfer += this->INTEGER.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.UNKNOWN) {
- xfer += oprot->writeFieldBegin("UNKNOWN", ::apache::thrift::protocol::T_STRUCT, 11);
- xfer += this->UNKNOWN.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.JSON) {
- xfer += oprot->writeFieldBegin("JSON", ::apache::thrift::protocol::T_STRUCT, 12);
- xfer += this->JSON.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.BSON) {
- xfer += oprot->writeFieldBegin("BSON", ::apache::thrift::protocol::T_STRUCT, 13);
- xfer += this->BSON.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.UUID) {
- xfer += oprot->writeFieldBegin("UUID", ::apache::thrift::protocol::T_STRUCT, 14);
- xfer += this->UUID.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(LogicalType &a, LogicalType &b) {
- using ::std::swap;
- swap(a.STRING, b.STRING);
- swap(a.MAP, b.MAP);
- swap(a.LIST, b.LIST);
- swap(a.ENUM, b.ENUM);
- swap(a.DECIMAL, b.DECIMAL);
- swap(a.DATE, b.DATE);
- swap(a.TIME, b.TIME);
- swap(a.TIMESTAMP, b.TIMESTAMP);
- swap(a.INTEGER, b.INTEGER);
- swap(a.UNKNOWN, b.UNKNOWN);
- swap(a.JSON, b.JSON);
- swap(a.BSON, b.BSON);
- swap(a.UUID, b.UUID);
- swap(a.__isset, b.__isset);
-}
-
-LogicalType::LogicalType(const LogicalType& other36) {
- STRING = other36.STRING;
- MAP = other36.MAP;
- LIST = other36.LIST;
- ENUM = other36.ENUM;
- DECIMAL = other36.DECIMAL;
- DATE = other36.DATE;
- TIME = other36.TIME;
- TIMESTAMP = other36.TIMESTAMP;
- INTEGER = other36.INTEGER;
- UNKNOWN = other36.UNKNOWN;
- JSON = other36.JSON;
- BSON = other36.BSON;
- UUID = other36.UUID;
- __isset = other36.__isset;
-}
-LogicalType& LogicalType::operator=(const LogicalType& other37) {
- STRING = other37.STRING;
- MAP = other37.MAP;
- LIST = other37.LIST;
- ENUM = other37.ENUM;
- DECIMAL = other37.DECIMAL;
- DATE = other37.DATE;
- TIME = other37.TIME;
- TIMESTAMP = other37.TIMESTAMP;
- INTEGER = other37.INTEGER;
- UNKNOWN = other37.UNKNOWN;
- JSON = other37.JSON;
- BSON = other37.BSON;
- UUID = other37.UUID;
- __isset = other37.__isset;
- return *this;
-}
-void LogicalType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "LogicalType(";
- out << "STRING="; (__isset.STRING ? (out << to_string(STRING)) : (out << "<null>"));
- out << ", " << "MAP="; (__isset.MAP ? (out << to_string(MAP)) : (out << "<null>"));
- out << ", " << "LIST="; (__isset.LIST ? (out << to_string(LIST)) : (out << "<null>"));
- out << ", " << "ENUM="; (__isset.ENUM ? (out << to_string(ENUM)) : (out << "<null>"));
- out << ", " << "DECIMAL="; (__isset.DECIMAL ? (out << to_string(DECIMAL)) : (out << "<null>"));
- out << ", " << "DATE="; (__isset.DATE ? (out << to_string(DATE)) : (out << "<null>"));
- out << ", " << "TIME="; (__isset.TIME ? (out << to_string(TIME)) : (out << "<null>"));
- out << ", " << "TIMESTAMP="; (__isset.TIMESTAMP ? (out << to_string(TIMESTAMP)) : (out << "<null>"));
- out << ", " << "INTEGER="; (__isset.INTEGER ? (out << to_string(INTEGER)) : (out << "<null>"));
- out << ", " << "UNKNOWN="; (__isset.UNKNOWN ? (out << to_string(UNKNOWN)) : (out << "<null>"));
- out << ", " << "JSON="; (__isset.JSON ? (out << to_string(JSON)) : (out << "<null>"));
- out << ", " << "BSON="; (__isset.BSON ? (out << to_string(BSON)) : (out << "<null>"));
- out << ", " << "UUID="; (__isset.UUID ? (out << to_string(UUID)) : (out << "<null>"));
- out << ")";
-}
-
-
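-// [editorial note, not part of the generated file] SchemaElement is one
-// node of the flattened, depth-first schema tree stored in the file
-// footer. Only name is required; the physical type, repetition,
-// num_children (for group nodes), the legacy converted_type and the newer
-// logicalType are all optional and guarded by __isset.
-//
-// A minimal round-trip sketch (hypothetical caller code, assuming the
-// usual thrift transport/protocol headers):
-//   using apache::thrift::transport::TMemoryBuffer;
-//   using apache::thrift::protocol::TCompactProtocol;
-//   auto buf = std::make_shared<TMemoryBuffer>();
-//   TCompactProtocol proto(buf);
-//   SchemaElement elem;
-//   elem.__set_name("my_column");
-//   elem.write(&proto);    // serialize into the in-memory buffer
-//   SchemaElement decoded;
-//   decoded.read(&proto);  // would throw INVALID_DATA if name were absent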
-SchemaElement::~SchemaElement() noexcept {
-}
-
-
-void SchemaElement::__set_type(const Type::type val) {
-  this->type = val;
-  __isset.type = true;
-}
-
-void SchemaElement::__set_type_length(const int32_t val) {
-  this->type_length = val;
-  __isset.type_length = true;
-}
-
-void SchemaElement::__set_repetition_type(const FieldRepetitionType::type val) {
-  this->repetition_type = val;
-  __isset.repetition_type = true;
-}
-
-void SchemaElement::__set_name(const std::string& val) {
-  this->name = val;
-}
-
-void SchemaElement::__set_num_children(const int32_t val) {
-  this->num_children = val;
-  __isset.num_children = true;
-}
-
-void SchemaElement::__set_converted_type(const ConvertedType::type val) {
-  this->converted_type = val;
-  __isset.converted_type = true;
-}
-
-void SchemaElement::__set_scale(const int32_t val) {
-  this->scale = val;
-  __isset.scale = true;
-}
-
-void SchemaElement::__set_precision(const int32_t val) {
-  this->precision = val;
-  __isset.precision = true;
-}
-
-void SchemaElement::__set_field_id(const int32_t val) {
-  this->field_id = val;
-  __isset.field_id = true;
-}
-
-void SchemaElement::__set_logicalType(const LogicalType& val) {
-  this->logicalType = val;
-  __isset.logicalType = true;
-}
-std::ostream& operator<<(std::ostream& out, const SchemaElement& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_name = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast38;
- xfer += iprot->readI32(ecast38);
- this->type = (Type::type)ecast38;
- this->__isset.type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->type_length);
- this->__isset.type_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast39;
- xfer += iprot->readI32(ecast39);
- this->repetition_type = (FieldRepetitionType::type)ecast39;
- this->__isset.repetition_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->name);
- isset_name = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_children);
- this->__isset.num_children = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast40;
- xfer += iprot->readI32(ecast40);
- this->converted_type = (ConvertedType::type)ecast40;
- this->__isset.converted_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->scale);
- this->__isset.scale = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->precision);
- this->__isset.precision = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->field_id);
- this->__isset.field_id = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 10:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->logicalType.read(iprot);
- this->__isset.logicalType = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_name)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t SchemaElement::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("SchemaElement");
-
- if (this->__isset.type) {
- xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->type);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.type_length) {
- xfer += oprot->writeFieldBegin("type_length", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->type_length);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.repetition_type) {
- xfer += oprot->writeFieldBegin("repetition_type", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32((int32_t)this->repetition_type);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldBegin("name", ::apache::thrift::protocol::T_STRING, 4);
- xfer += oprot->writeString(this->name);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.num_children) {
- xfer += oprot->writeFieldBegin("num_children", ::apache::thrift::protocol::T_I32, 5);
- xfer += oprot->writeI32(this->num_children);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.converted_type) {
- xfer += oprot->writeFieldBegin("converted_type", ::apache::thrift::protocol::T_I32, 6);
- xfer += oprot->writeI32((int32_t)this->converted_type);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.scale) {
- xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 7);
- xfer += oprot->writeI32(this->scale);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.precision) {
- xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 8);
- xfer += oprot->writeI32(this->precision);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.field_id) {
- xfer += oprot->writeFieldBegin("field_id", ::apache::thrift::protocol::T_I32, 9);
- xfer += oprot->writeI32(this->field_id);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.logicalType) {
- xfer += oprot->writeFieldBegin("logicalType", ::apache::thrift::protocol::T_STRUCT, 10);
- xfer += this->logicalType.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(SchemaElement &a, SchemaElement &b) {
- using ::std::swap;
- swap(a.type, b.type);
- swap(a.type_length, b.type_length);
- swap(a.repetition_type, b.repetition_type);
- swap(a.name, b.name);
- swap(a.num_children, b.num_children);
- swap(a.converted_type, b.converted_type);
- swap(a.scale, b.scale);
- swap(a.precision, b.precision);
- swap(a.field_id, b.field_id);
- swap(a.logicalType, b.logicalType);
- swap(a.__isset, b.__isset);
-}
-
-SchemaElement::SchemaElement(const SchemaElement& other41) {
- type = other41.type;
- type_length = other41.type_length;
- repetition_type = other41.repetition_type;
- name = other41.name;
- num_children = other41.num_children;
- converted_type = other41.converted_type;
- scale = other41.scale;
- precision = other41.precision;
- field_id = other41.field_id;
- logicalType = other41.logicalType;
- __isset = other41.__isset;
-}
-SchemaElement& SchemaElement::operator=(const SchemaElement& other42) {
- type = other42.type;
- type_length = other42.type_length;
- repetition_type = other42.repetition_type;
- name = other42.name;
- num_children = other42.num_children;
- converted_type = other42.converted_type;
- scale = other42.scale;
- precision = other42.precision;
- field_id = other42.field_id;
- logicalType = other42.logicalType;
- __isset = other42.__isset;
- return *this;
-}
-void SchemaElement::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "SchemaElement(";
- out << "type="; (__isset.type ? (out << to_string(type)) : (out << "<null>"));
- out << ", " << "type_length="; (__isset.type_length ? (out << to_string(type_length)) : (out << "<null>"));
- out << ", " << "repetition_type="; (__isset.repetition_type ? (out << to_string(repetition_type)) : (out << "<null>"));
- out << ", " << "name=" << to_string(name);
- out << ", " << "num_children="; (__isset.num_children ? (out << to_string(num_children)) : (out << "<null>"));
- out << ", " << "converted_type="; (__isset.converted_type ? (out << to_string(converted_type)) : (out << "<null>"));
- out << ", " << "scale="; (__isset.scale ? (out << to_string(scale)) : (out << "<null>"));
- out << ", " << "precision="; (__isset.precision ? (out << to_string(precision)) : (out << "<null>"));
- out << ", " << "field_id="; (__isset.field_id ? (out << to_string(field_id)) : (out << "<null>"));
- out << ", " << "logicalType="; (__isset.logicalType ? (out << to_string(logicalType)) : (out << "<null>"));
- out << ")";
-}
-
-
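-// [editorial note, not part of the generated file] DataPageHeader is the
-// V1 data-page header: num_values plus three required Encoding enums (one
-// for the values, one each for the definition and repetition levels),
-// with per-page Statistics as the only optional field.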
-DataPageHeader::~DataPageHeader() noexcept {
-}
-
-
-void DataPageHeader::__set_num_values(const int32_t val) {
- this->num_values = val;
-}
-
-void DataPageHeader::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void DataPageHeader::__set_definition_level_encoding(const Encoding::type val) {
- this->definition_level_encoding = val;
-}
-
-void DataPageHeader::__set_repetition_level_encoding(const Encoding::type val) {
- this->repetition_level_encoding = val;
-}
-
-void DataPageHeader::__set_statistics(const Statistics& val) {
-  this->statistics = val;
-  __isset.statistics = true;
-}
-std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_num_values = false;
- bool isset_encoding = false;
- bool isset_definition_level_encoding = false;
- bool isset_repetition_level_encoding = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast43;
- xfer += iprot->readI32(ecast43);
- this->encoding = (Encoding::type)ecast43;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast44;
- xfer += iprot->readI32(ecast44);
- this->definition_level_encoding = (Encoding::type)ecast44;
- isset_definition_level_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast45;
- xfer += iprot->readI32(ecast45);
- this->repetition_level_encoding = (Encoding::type)ecast45;
- isset_repetition_level_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->statistics.read(iprot);
- this->__isset.statistics = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_definition_level_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_repetition_level_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DataPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DataPageHeader");
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("definition_level_encoding", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32((int32_t)this->definition_level_encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("repetition_level_encoding", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->repetition_level_encoding);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.statistics) {
- xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 5);
- xfer += this->statistics.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DataPageHeader &a, DataPageHeader &b) {
- using ::std::swap;
- swap(a.num_values, b.num_values);
- swap(a.encoding, b.encoding);
- swap(a.definition_level_encoding, b.definition_level_encoding);
- swap(a.repetition_level_encoding, b.repetition_level_encoding);
- swap(a.statistics, b.statistics);
- swap(a.__isset, b.__isset);
-}
-
-DataPageHeader::DataPageHeader(const DataPageHeader& other46) {
- num_values = other46.num_values;
- encoding = other46.encoding;
- definition_level_encoding = other46.definition_level_encoding;
- repetition_level_encoding = other46.repetition_level_encoding;
- statistics = other46.statistics;
- __isset = other46.__isset;
-}
-DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other47) {
- num_values = other47.num_values;
- encoding = other47.encoding;
- definition_level_encoding = other47.definition_level_encoding;
- repetition_level_encoding = other47.repetition_level_encoding;
- statistics = other47.statistics;
- __isset = other47.__isset;
- return *this;
-}
-void DataPageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DataPageHeader(";
- out << "num_values=" << to_string(num_values);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "definition_level_encoding=" << to_string(definition_level_encoding);
- out << ", " << "repetition_level_encoding=" << to_string(repetition_level_encoding);
- out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
- out << ")";
-}
-
-
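-// [editorial note, not part of the generated file] IndexPageHeader is
-// still an empty placeholder in this version of parquet.thrift.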
-IndexPageHeader::~IndexPageHeader() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t IndexPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t IndexPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("IndexPageHeader");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(IndexPageHeader &a, IndexPageHeader &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-IndexPageHeader::IndexPageHeader(const IndexPageHeader& other48) {
- (void) other48;
-}
-IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other49) {
- (void) other49;
- return *this;
-}
-void IndexPageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "IndexPageHeader(";
- out << ")";
-}
-
-
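-// [editorial note, not part of the generated file] DictionaryPageHeader
-// describes a dictionary page: num_values and encoding are required,
-// while the optional is_sorted flag records whether the dictionary
-// entries were written in sorted order.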
-DictionaryPageHeader::~DictionaryPageHeader() noexcept {
-}
-
-
-void DictionaryPageHeader::__set_num_values(const int32_t val) {
- this->num_values = val;
-}
-
-void DictionaryPageHeader::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void DictionaryPageHeader::__set_is_sorted(const bool val) {
-  this->is_sorted = val;
-  __isset.is_sorted = true;
-}
-std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_num_values = false;
- bool isset_encoding = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast50;
- xfer += iprot->readI32(ecast50);
- this->encoding = (Encoding::type)ecast50;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->is_sorted);
- this->__isset.is_sorted = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DictionaryPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DictionaryPageHeader");
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.is_sorted) {
- xfer += oprot->writeFieldBegin("is_sorted", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->is_sorted);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) {
- using ::std::swap;
- swap(a.num_values, b.num_values);
- swap(a.encoding, b.encoding);
- swap(a.is_sorted, b.is_sorted);
- swap(a.__isset, b.__isset);
-}
-
-DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other51) {
- num_values = other51.num_values;
- encoding = other51.encoding;
- is_sorted = other51.is_sorted;
- __isset = other51.__isset;
-}
-DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other52) {
- num_values = other52.num_values;
- encoding = other52.encoding;
- is_sorted = other52.is_sorted;
- __isset = other52.__isset;
- return *this;
-}
-void DictionaryPageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DictionaryPageHeader(";
- out << "num_values=" << to_string(num_values);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "is_sorted="; (__isset.is_sorted ? (out << to_string(is_sorted)) : (out << "<null>"));
- out << ")";
-}
-
-
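-// [editorial note, not part of the generated file] DataPageHeaderV2
-// extends the V1 header with required num_nulls / num_rows counts and the
-// byte lengths of the definition and repetition levels (always
-// RLE-encoded in V2, per the spec), so readers can locate the value
-// section without decoding the levels first. is_compressed is optional
-// and defaults to true in parquet.thrift.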
-DataPageHeaderV2::~DataPageHeaderV2() noexcept {
-}
-
-
-void DataPageHeaderV2::__set_num_values(const int32_t val) {
- this->num_values = val;
-}
-
-void DataPageHeaderV2::__set_num_nulls(const int32_t val) {
- this->num_nulls = val;
-}
-
-void DataPageHeaderV2::__set_num_rows(const int32_t val) {
- this->num_rows = val;
-}
-
-void DataPageHeaderV2::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void DataPageHeaderV2::__set_definition_levels_byte_length(const int32_t val) {
- this->definition_levels_byte_length = val;
-}
-
-void DataPageHeaderV2::__set_repetition_levels_byte_length(const int32_t val) {
- this->repetition_levels_byte_length = val;
-}
-
-void DataPageHeaderV2::__set_is_compressed(const bool val) {
- this->is_compressed = val;
-__isset.is_compressed = true;
-}
-
-void DataPageHeaderV2::__set_statistics(const Statistics& val) {
- this->statistics = val;
- __isset.statistics = true;
-}
-std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_num_values = false;
- bool isset_num_nulls = false;
- bool isset_num_rows = false;
- bool isset_encoding = false;
- bool isset_definition_levels_byte_length = false;
- bool isset_repetition_levels_byte_length = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_nulls);
- isset_num_nulls = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_rows);
- isset_num_rows = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast53;
- xfer += iprot->readI32(ecast53);
- this->encoding = (Encoding::type)ecast53;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->definition_levels_byte_length);
- isset_definition_levels_byte_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->repetition_levels_byte_length);
- isset_repetition_levels_byte_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->is_compressed);
- this->__isset.is_compressed = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->statistics.read(iprot);
- this->__isset.statistics = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_nulls)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_rows)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_definition_levels_byte_length)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_repetition_levels_byte_length)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DataPageHeaderV2::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DataPageHeaderV2");
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_nulls", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->num_nulls);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32(this->num_rows);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("definition_levels_byte_length", ::apache::thrift::protocol::T_I32, 5);
- xfer += oprot->writeI32(this->definition_levels_byte_length);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("repetition_levels_byte_length", ::apache::thrift::protocol::T_I32, 6);
- xfer += oprot->writeI32(this->repetition_levels_byte_length);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.is_compressed) {
- xfer += oprot->writeFieldBegin("is_compressed", ::apache::thrift::protocol::T_BOOL, 7);
- xfer += oprot->writeBool(this->is_compressed);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.statistics) {
- xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->statistics.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) {
- using ::std::swap;
- swap(a.num_values, b.num_values);
- swap(a.num_nulls, b.num_nulls);
- swap(a.num_rows, b.num_rows);
- swap(a.encoding, b.encoding);
- swap(a.definition_levels_byte_length, b.definition_levels_byte_length);
- swap(a.repetition_levels_byte_length, b.repetition_levels_byte_length);
- swap(a.is_compressed, b.is_compressed);
- swap(a.statistics, b.statistics);
- swap(a.__isset, b.__isset);
-}
-
-DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other54) {
- num_values = other54.num_values;
- num_nulls = other54.num_nulls;
- num_rows = other54.num_rows;
- encoding = other54.encoding;
- definition_levels_byte_length = other54.definition_levels_byte_length;
- repetition_levels_byte_length = other54.repetition_levels_byte_length;
- is_compressed = other54.is_compressed;
- statistics = other54.statistics;
- __isset = other54.__isset;
-}
-DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other55) {
- num_values = other55.num_values;
- num_nulls = other55.num_nulls;
- num_rows = other55.num_rows;
- encoding = other55.encoding;
- definition_levels_byte_length = other55.definition_levels_byte_length;
- repetition_levels_byte_length = other55.repetition_levels_byte_length;
- is_compressed = other55.is_compressed;
- statistics = other55.statistics;
- __isset = other55.__isset;
- return *this;
-}
-void DataPageHeaderV2::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DataPageHeaderV2(";
- out << "num_values=" << to_string(num_values);
- out << ", " << "num_nulls=" << to_string(num_nulls);
- out << ", " << "num_rows=" << to_string(num_rows);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "definition_levels_byte_length=" << to_string(definition_levels_byte_length);
- out << ", " << "repetition_levels_byte_length=" << to_string(repetition_levels_byte_length);
- out << ", " << "is_compressed="; (__isset.is_compressed ? (out << to_string(is_compressed)) : (out << "<null>"));
- out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
- out << ")";
-}
-
-
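-// SplitBlockAlgorithm: empty marker struct selecting the split-block Bloom
-// filter (SBBF) algorithm. With no fields, read() simply skips to T_STOP and
-// write() emits an empty struct.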
-SplitBlockAlgorithm::~SplitBlockAlgorithm() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t SplitBlockAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t SplitBlockAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("SplitBlockAlgorithm");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other56) {
- (void) other56;
-}
-SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other57) {
- (void) other57;
- return *this;
-}
-void SplitBlockAlgorithm::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "SplitBlockAlgorithm(";
- out << ")";
-}
-
-
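-// BloomFilterAlgorithm: Thrift union over supported Bloom filter algorithms;
-// BLOCK (SplitBlockAlgorithm) is currently the only member.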
-BloomFilterAlgorithm::~BloomFilterAlgorithm() noexcept {
-}
-
-
-void BloomFilterAlgorithm::__set_BLOCK(const SplitBlockAlgorithm& val) {
- this->BLOCK = val;
- __isset.BLOCK = true;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->BLOCK.read(iprot);
- this->__isset.BLOCK = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BloomFilterAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterAlgorithm");
-
- if (this->__isset.BLOCK) {
- xfer += oprot->writeFieldBegin("BLOCK", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->BLOCK.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) {
- using ::std::swap;
- swap(a.BLOCK, b.BLOCK);
- swap(a.__isset, b.__isset);
-}
-
-BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other58) {
- BLOCK = other58.BLOCK;
- __isset = other58.__isset;
-}
-BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other59) {
- BLOCK = other59.BLOCK;
- __isset = other59.__isset;
- return *this;
-}
-void BloomFilterAlgorithm::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterAlgorithm(";
- out << "BLOCK="; (__isset.BLOCK ? (out << to_string(BLOCK)) : (out << "<null>"));
- out << ")";
-}
-
-
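-// XxHash: empty marker struct selecting xxHash as the Bloom filter hash
-// function (XXH64 with seed 0, per the parquet.thrift spec).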
-XxHash::~XxHash() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const XxHash& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t XxHash::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t XxHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("XxHash");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(XxHash &a, XxHash &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-XxHash::XxHash(const XxHash& other60) {
- (void) other60;
-}
-XxHash& XxHash::operator=(const XxHash& other61) {
- (void) other61;
- return *this;
-}
-void XxHash::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "XxHash(";
- out << ")";
-}
-
-
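-// BloomFilterHash: Thrift union over supported Bloom filter hash functions;
-// XXHASH is currently the only member.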
-BloomFilterHash::~BloomFilterHash() noexcept {
-}
-
-
-void BloomFilterHash::__set_XXHASH(const XxHash& val) {
- this->XXHASH = val;
- __isset.XXHASH = true;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterHash::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->XXHASH.read(iprot);
- this->__isset.XXHASH = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BloomFilterHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterHash");
-
- if (this->__isset.XXHASH) {
- xfer += oprot->writeFieldBegin("XXHASH", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->XXHASH.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterHash &a, BloomFilterHash &b) {
- using ::std::swap;
- swap(a.XXHASH, b.XXHASH);
- swap(a.__isset, b.__isset);
-}
-
-BloomFilterHash::BloomFilterHash(const BloomFilterHash& other62) {
- XXHASH = other62.XXHASH;
- __isset = other62.__isset;
-}
-BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other63) {
- XXHASH = other63.XXHASH;
- __isset = other63.__isset;
- return *this;
-}
-void BloomFilterHash::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterHash(";
- out << "XXHASH="; (__isset.XXHASH ? (out << to_string(XXHASH)) : (out << "<null>"));
- out << ")";
-}
-
-
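-// Uncompressed: empty marker struct; currently the only codec defined for
-// serialized Bloom filter bitsets.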
-Uncompressed::~Uncompressed() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const Uncompressed& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t Uncompressed::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t Uncompressed::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("Uncompressed");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(Uncompressed &a, Uncompressed &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-Uncompressed::Uncompressed(const Uncompressed& other64) {
- (void) other64;
-}
-Uncompressed& Uncompressed::operator=(const Uncompressed& other65) {
- (void) other65;
- return *this;
-}
-void Uncompressed::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "Uncompressed(";
- out << ")";
-}
-
-
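-// BloomFilterCompression: Thrift union over Bloom filter bitset codecs;
-// UNCOMPRESSED is currently the only member.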
-BloomFilterCompression::~BloomFilterCompression() noexcept {
-}
-
-
-void BloomFilterCompression::__set_UNCOMPRESSED(const Uncompressed& val) {
- this->UNCOMPRESSED = val;
- __isset.UNCOMPRESSED = true;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterCompression::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->UNCOMPRESSED.read(iprot);
- this->__isset.UNCOMPRESSED = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BloomFilterCompression::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterCompression");
-
- if (this->__isset.UNCOMPRESSED) {
- xfer += oprot->writeFieldBegin("UNCOMPRESSED", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->UNCOMPRESSED.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterCompression &a, BloomFilterCompression &b) {
- using ::std::swap;
- swap(a.UNCOMPRESSED, b.UNCOMPRESSED);
- swap(a.__isset, b.__isset);
-}
-
-BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other66) {
- UNCOMPRESSED = other66.UNCOMPRESSED;
- __isset = other66.__isset;
-}
-BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other67) {
- UNCOMPRESSED = other67.UNCOMPRESSED;
- __isset = other67.__isset;
- return *this;
-}
-void BloomFilterCompression::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterCompression(";
- out << "UNCOMPRESSED="; (__isset.UNCOMPRESSED ? (out << to_string(UNCOMPRESSED)) : (out << "<null>"));
- out << ")";
-}
-
-
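-// BloomFilterHeader: stored immediately before the Bloom filter bitset. All
-// four fields (numBytes, algorithm, hash, compression) are required, so
-// read() checks every isset_* flag before returning.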
-BloomFilterHeader::~BloomFilterHeader() noexcept {
-}
-
-
-void BloomFilterHeader::__set_numBytes(const int32_t val) {
- this->numBytes = val;
-}
-
-void BloomFilterHeader::__set_algorithm(const BloomFilterAlgorithm& val) {
- this->algorithm = val;
-}
-
-void BloomFilterHeader::__set_hash(const BloomFilterHash& val) {
- this->hash = val;
-}
-
-void BloomFilterHeader::__set_compression(const BloomFilterCompression& val) {
- this->compression = val;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_numBytes = false;
- bool isset_algorithm = false;
- bool isset_hash = false;
- bool isset_compression = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->numBytes);
- isset_numBytes = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->algorithm.read(iprot);
- isset_algorithm = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->hash.read(iprot);
- isset_hash = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->compression.read(iprot);
- isset_compression = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_numBytes)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_algorithm)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_hash)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_compression)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t BloomFilterHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterHeader");
-
- xfer += oprot->writeFieldBegin("numBytes", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->numBytes);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("algorithm", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->algorithm.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("hash", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->hash.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("compression", ::apache::thrift::protocol::T_STRUCT, 4);
- xfer += this->compression.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterHeader &a, BloomFilterHeader &b) {
- using ::std::swap;
- swap(a.numBytes, b.numBytes);
- swap(a.algorithm, b.algorithm);
- swap(a.hash, b.hash);
- swap(a.compression, b.compression);
-}
-
-BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other68) {
- numBytes = other68.numBytes;
- algorithm = other68.algorithm;
- hash = other68.hash;
- compression = other68.compression;
-}
-BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other69) {
- numBytes = other69.numBytes;
- algorithm = other69.algorithm;
- hash = other69.hash;
- compression = other69.compression;
- return *this;
-}
-void BloomFilterHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterHeader(";
- out << "numBytes=" << to_string(numBytes);
- out << ", " << "algorithm=" << to_string(algorithm);
- out << ", " << "hash=" << to_string(hash);
- out << ", " << "compression=" << to_string(compression);
- out << ")";
-}
-
-
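-// PageHeader: written in front of every page in a column chunk. type and the
-// two page sizes are required; exactly one of the per-type headers (fields
-// 5-8) is expected to be set, matching type. crc is an optional 32-bit
-// checksum of the page.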
-PageHeader::~PageHeader() noexcept {
-}
-
-
-void PageHeader::__set_type(const PageType::type val) {
- this->type = val;
-}
-
-void PageHeader::__set_uncompressed_page_size(const int32_t val) {
- this->uncompressed_page_size = val;
-}
-
-void PageHeader::__set_compressed_page_size(const int32_t val) {
- this->compressed_page_size = val;
-}
-
-void PageHeader::__set_crc(const int32_t val) {
- this->crc = val;
- __isset.crc = true;
-}
-
-void PageHeader::__set_data_page_header(const DataPageHeader& val) {
- this->data_page_header = val;
- __isset.data_page_header = true;
-}
-
-void PageHeader::__set_index_page_header(const IndexPageHeader& val) {
- this->index_page_header = val;
- __isset.index_page_header = true;
-}
-
-void PageHeader::__set_dictionary_page_header(const DictionaryPageHeader& val) {
- this->dictionary_page_header = val;
- __isset.dictionary_page_header = true;
-}
-
-void PageHeader::__set_data_page_header_v2(const DataPageHeaderV2& val) {
- this->data_page_header_v2 = val;
- __isset.data_page_header_v2 = true;
-}
-std::ostream& operator<<(std::ostream& out, const PageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_type = false;
- bool isset_uncompressed_page_size = false;
- bool isset_compressed_page_size = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast70;
- xfer += iprot->readI32(ecast70);
- this->type = (PageType::type)ecast70;
- isset_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->uncompressed_page_size);
- isset_uncompressed_page_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->compressed_page_size);
- isset_compressed_page_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->crc);
- this->__isset.crc = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->data_page_header.read(iprot);
- this->__isset.data_page_header = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->index_page_header.read(iprot);
- this->__isset.index_page_header = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->dictionary_page_header.read(iprot);
- this->__isset.dictionary_page_header = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->data_page_header_v2.read(iprot);
- this->__isset.data_page_header_v2 = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_type)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_uncompressed_page_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_compressed_page_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t PageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("PageHeader");
-
- xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->type);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("uncompressed_page_size", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->uncompressed_page_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32(this->compressed_page_size);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.crc) {
- xfer += oprot->writeFieldBegin("crc", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32(this->crc);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.data_page_header) {
- xfer += oprot->writeFieldBegin("data_page_header", ::apache::thrift::protocol::T_STRUCT, 5);
- xfer += this->data_page_header.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.index_page_header) {
- xfer += oprot->writeFieldBegin("index_page_header", ::apache::thrift::protocol::T_STRUCT, 6);
- xfer += this->index_page_header.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.dictionary_page_header) {
- xfer += oprot->writeFieldBegin("dictionary_page_header", ::apache::thrift::protocol::T_STRUCT, 7);
- xfer += this->dictionary_page_header.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.data_page_header_v2) {
- xfer += oprot->writeFieldBegin("data_page_header_v2", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->data_page_header_v2.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(PageHeader &a, PageHeader &b) {
- using ::std::swap;
- swap(a.type, b.type);
- swap(a.uncompressed_page_size, b.uncompressed_page_size);
- swap(a.compressed_page_size, b.compressed_page_size);
- swap(a.crc, b.crc);
- swap(a.data_page_header, b.data_page_header);
- swap(a.index_page_header, b.index_page_header);
- swap(a.dictionary_page_header, b.dictionary_page_header);
- swap(a.data_page_header_v2, b.data_page_header_v2);
- swap(a.__isset, b.__isset);
-}
-
-PageHeader::PageHeader(const PageHeader& other71) {
- type = other71.type;
- uncompressed_page_size = other71.uncompressed_page_size;
- compressed_page_size = other71.compressed_page_size;
- crc = other71.crc;
- data_page_header = other71.data_page_header;
- index_page_header = other71.index_page_header;
- dictionary_page_header = other71.dictionary_page_header;
- data_page_header_v2 = other71.data_page_header_v2;
- __isset = other71.__isset;
-}
-PageHeader& PageHeader::operator=(const PageHeader& other72) {
- type = other72.type;
- uncompressed_page_size = other72.uncompressed_page_size;
- compressed_page_size = other72.compressed_page_size;
- crc = other72.crc;
- data_page_header = other72.data_page_header;
- index_page_header = other72.index_page_header;
- dictionary_page_header = other72.dictionary_page_header;
- data_page_header_v2 = other72.data_page_header_v2;
- __isset = other72.__isset;
- return *this;
-}
-void PageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "PageHeader(";
- out << "type=" << to_string(type);
- out << ", " << "uncompressed_page_size=" << to_string(uncompressed_page_size);
- out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
- out << ", " << "crc="; (__isset.crc ? (out << to_string(crc)) : (out << "<null>"));
- out << ", " << "data_page_header="; (__isset.data_page_header ? (out << to_string(data_page_header)) : (out << "<null>"));
- out << ", " << "index_page_header="; (__isset.index_page_header ? (out << to_string(index_page_header)) : (out << "<null>"));
- out << ", " << "dictionary_page_header="; (__isset.dictionary_page_header ? (out << to_string(dictionary_page_header)) : (out << "<null>"));
- out << ", " << "data_page_header_v2="; (__isset.data_page_header_v2 ? (out << to_string(data_page_header_v2)) : (out << "<null>"));
- out << ")";
-}
-
-
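-// KeyValue: generic string pair for application-defined metadata; key is
-// required, value is optional.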
-KeyValue::~KeyValue() noexcept {
-}
-
-
-void KeyValue::__set_key(const std::string& val) {
- this->key = val;
-}
-
-void KeyValue::__set_value(const std::string& val) {
- this->value = val;
- __isset.value = true;
-}
-std::ostream& operator<<(std::ostream& out, const KeyValue& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t KeyValue::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_key = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->key);
- isset_key = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->value);
- this->__isset.value = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_key)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t KeyValue::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("KeyValue");
-
- xfer += oprot->writeFieldBegin("key", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeString(this->key);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.value) {
- xfer += oprot->writeFieldBegin("value", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeString(this->value);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(KeyValue &a, KeyValue &b) {
- using ::std::swap;
- swap(a.key, b.key);
- swap(a.value, b.value);
- swap(a.__isset, b.__isset);
-}
-
-KeyValue::KeyValue(const KeyValue& other73) {
- key = other73.key;
- value = other73.value;
- __isset = other73.__isset;
-}
-KeyValue& KeyValue::operator=(const KeyValue& other74) {
- key = other74.key;
- value = other74.value;
- __isset = other74.__isset;
- return *this;
-}
-void KeyValue::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "KeyValue(";
- out << "key=" << to_string(key);
- out << ", " << "value="; (__isset.value ? (out << to_string(value)) : (out << "<null>"));
- out << ")";
-}
-
-
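-// SortingColumn: one entry of a row group's sort order; column_idx,
-// descending, and nulls_first are all required.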
-SortingColumn::~SortingColumn() noexcept {
-}
-
-
-void SortingColumn::__set_column_idx(const int32_t val) {
- this->column_idx = val;
-}
-
-void SortingColumn::__set_descending(const bool val) {
- this->descending = val;
-}
-
-void SortingColumn::__set_nulls_first(const bool val) {
- this->nulls_first = val;
-}
-std::ostream& operator<<(std::ostream& out, const SortingColumn& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t SortingColumn::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_column_idx = false;
- bool isset_descending = false;
- bool isset_nulls_first = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->column_idx);
- isset_column_idx = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->descending);
- isset_descending = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->nulls_first);
- isset_nulls_first = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_column_idx)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_descending)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_nulls_first)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t SortingColumn::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("SortingColumn");
-
- xfer += oprot->writeFieldBegin("column_idx", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->column_idx);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("descending", ::apache::thrift::protocol::T_BOOL, 2);
- xfer += oprot->writeBool(this->descending);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("nulls_first", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->nulls_first);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(SortingColumn &a, SortingColumn &b) {
- using ::std::swap;
- swap(a.column_idx, b.column_idx);
- swap(a.descending, b.descending);
- swap(a.nulls_first, b.nulls_first);
-}
-
-SortingColumn::SortingColumn(const SortingColumn& other75) {
- column_idx = other75.column_idx;
- descending = other75.descending;
- nulls_first = other75.nulls_first;
-}
-SortingColumn& SortingColumn::operator=(const SortingColumn& other76) {
- column_idx = other76.column_idx;
- descending = other76.descending;
- nulls_first = other76.nulls_first;
- return *this;
-}
-void SortingColumn::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "SortingColumn(";
- out << "column_idx=" << to_string(column_idx);
- out << ", " << "descending=" << to_string(descending);
- out << ", " << "nulls_first=" << to_string(nulls_first);
- out << ")";
-}
-
-
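-// PageEncodingStats: counts how many pages of a given page_type used a given
-// encoding within a column chunk, e.g. to detect fully dictionary-encoded
-// chunks.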
-PageEncodingStats::~PageEncodingStats() noexcept {
-}
-
-
-void PageEncodingStats::__set_page_type(const PageType::type val) {
- this->page_type = val;
-}
-
-void PageEncodingStats::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void PageEncodingStats::__set_count(const int32_t val) {
- this->count = val;
-}
-std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_page_type = false;
- bool isset_encoding = false;
- bool isset_count = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast77;
- xfer += iprot->readI32(ecast77);
- this->page_type = (PageType::type)ecast77;
- isset_page_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast78;
- xfer += iprot->readI32(ecast78);
- this->encoding = (Encoding::type)ecast78;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->count);
- isset_count = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_page_type)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_count)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t PageEncodingStats::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("PageEncodingStats");
-
- xfer += oprot->writeFieldBegin("page_type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->page_type);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("count", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32(this->count);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(PageEncodingStats &a, PageEncodingStats &b) {
- using ::std::swap;
- swap(a.page_type, b.page_type);
- swap(a.encoding, b.encoding);
- swap(a.count, b.count);
-}
-
-PageEncodingStats::PageEncodingStats(const PageEncodingStats& other79) {
- page_type = other79.page_type;
- encoding = other79.encoding;
- count = other79.count;
-}
-PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other80) {
- page_type = other80.page_type;
- encoding = other80.encoding;
- count = other80.count;
- return *this;
-}
-void PageEncodingStats::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "PageEncodingStats(";
- out << "page_type=" << to_string(page_type);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "count=" << to_string(count);
- out << ")";
-}
-
-
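-// ColumnMetaData: per-column-chunk metadata. type, encodings, path_in_schema,
-// codec, num_values, total_uncompressed_size, total_compressed_size, and
-// data_page_offset are required; index/dictionary page offsets, statistics,
-// encoding_stats, and bloom_filter_offset are optional.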
-ColumnMetaData::~ColumnMetaData() noexcept {
-}
-
-
-void ColumnMetaData::__set_type(const Type::type val) {
- this->type = val;
-}
-
-void ColumnMetaData::__set_encodings(const std::vector<Encoding::type> & val) {
- this->encodings = val;
-}
-
-void ColumnMetaData::__set_path_in_schema(const std::vector<std::string> & val) {
- this->path_in_schema = val;
-}
-
-void ColumnMetaData::__set_codec(const CompressionCodec::type val) {
- this->codec = val;
-}
-
-void ColumnMetaData::__set_num_values(const int64_t val) {
- this->num_values = val;
-}
-
-void ColumnMetaData::__set_total_uncompressed_size(const int64_t val) {
- this->total_uncompressed_size = val;
-}
-
-void ColumnMetaData::__set_total_compressed_size(const int64_t val) {
- this->total_compressed_size = val;
-}
-
-void ColumnMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
- this->key_value_metadata = val;
- __isset.key_value_metadata = true;
-}
-
-void ColumnMetaData::__set_data_page_offset(const int64_t val) {
- this->data_page_offset = val;
-}
-
-void ColumnMetaData::__set_index_page_offset(const int64_t val) {
- this->index_page_offset = val;
- __isset.index_page_offset = true;
-}
-
-void ColumnMetaData::__set_dictionary_page_offset(const int64_t val) {
- this->dictionary_page_offset = val;
- __isset.dictionary_page_offset = true;
-}
-
-void ColumnMetaData::__set_statistics(const Statistics& val) {
- this->statistics = val;
- __isset.statistics = true;
-}
-
-void ColumnMetaData::__set_encoding_stats(const std::vector<PageEncodingStats> & val) {
- this->encoding_stats = val;
- __isset.encoding_stats = true;
-}
-
-void ColumnMetaData::__set_bloom_filter_offset(const int64_t val) {
- this->bloom_filter_offset = val;
- __isset.bloom_filter_offset = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_type = false;
- bool isset_encodings = false;
- bool isset_path_in_schema = false;
- bool isset_codec = false;
- bool isset_num_values = false;
- bool isset_total_uncompressed_size = false;
- bool isset_total_compressed_size = false;
- bool isset_data_page_offset = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast81;
- xfer += iprot->readI32(ecast81);
- this->type = (Type::type)ecast81;
- isset_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->encodings.clear();
- uint32_t _size82;
- ::apache::thrift::protocol::TType _etype85;
- xfer += iprot->readListBegin(_etype85, _size82);
- this->encodings.resize(_size82);
- uint32_t _i86;
- for (_i86 = 0; _i86 < _size82; ++_i86)
- {
- int32_t ecast87;
- xfer += iprot->readI32(ecast87);
- this->encodings[_i86] = (Encoding::type)ecast87;
- }
- xfer += iprot->readListEnd();
- }
- isset_encodings = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->path_in_schema.clear();
- uint32_t _size88;
- ::apache::thrift::protocol::TType _etype91;
- xfer += iprot->readListBegin(_etype91, _size88);
- this->path_in_schema.resize(_size88);
- uint32_t _i92;
- for (_i92 = 0; _i92 < _size88; ++_i92)
- {
- xfer += iprot->readString(this->path_in_schema[_i92]);
- }
- xfer += iprot->readListEnd();
- }
- isset_path_in_schema = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast93;
- xfer += iprot->readI32(ecast93);
- this->codec = (CompressionCodec::type)ecast93;
- isset_codec = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_uncompressed_size);
- isset_total_uncompressed_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_compressed_size);
- isset_total_compressed_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->key_value_metadata.clear();
- uint32_t _size94;
- ::apache::thrift::protocol::TType _etype97;
- xfer += iprot->readListBegin(_etype97, _size94);
- this->key_value_metadata.resize(_size94);
- uint32_t _i98;
- for (_i98 = 0; _i98 < _size94; ++_i98)
- {
- xfer += this->key_value_metadata[_i98].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.key_value_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->data_page_offset);
- isset_data_page_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 10:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->index_page_offset);
- this->__isset.index_page_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 11:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->dictionary_page_offset);
- this->__isset.dictionary_page_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 12:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->statistics.read(iprot);
- this->__isset.statistics = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 13:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->encoding_stats.clear();
- uint32_t _size99;
- ::apache::thrift::protocol::TType _etype102;
- xfer += iprot->readListBegin(_etype102, _size99);
- this->encoding_stats.resize(_size99);
- uint32_t _i103;
- for (_i103 = 0; _i103 < _size99; ++_i103)
- {
- xfer += this->encoding_stats[_i103].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.encoding_stats = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 14:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->bloom_filter_offset);
- this->__isset.bloom_filter_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_type)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encodings)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_path_in_schema)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_codec)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_total_uncompressed_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_total_compressed_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_data_page_offset)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnMetaData");
-
- xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->type);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast<uint32_t>(this->encodings.size()));
- std::vector<Encoding::type> ::const_iterator _iter104;
- for (_iter104 = this->encodings.begin(); _iter104 != this->encodings.end(); ++_iter104)
- {
- xfer += oprot->writeI32((int32_t)(*_iter104));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
- std::vector<std::string> ::const_iterator _iter105;
- for (_iter105 = this->path_in_schema.begin(); _iter105 != this->path_in_schema.end(); ++_iter105)
- {
- xfer += oprot->writeString((*_iter105));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("codec", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->codec);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I64, 5);
- xfer += oprot->writeI64(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("total_uncompressed_size", ::apache::thrift::protocol::T_I64, 6);
- xfer += oprot->writeI64(this->total_uncompressed_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 7);
- xfer += oprot->writeI64(this->total_compressed_size);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_value_metadata) {
- xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
- std::vector<KeyValue> ::const_iterator _iter106;
- for (_iter106 = this->key_value_metadata.begin(); _iter106 != this->key_value_metadata.end(); ++_iter106)
- {
- xfer += (*_iter106).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldBegin("data_page_offset", ::apache::thrift::protocol::T_I64, 9);
- xfer += oprot->writeI64(this->data_page_offset);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.index_page_offset) {
- xfer += oprot->writeFieldBegin("index_page_offset", ::apache::thrift::protocol::T_I64, 10);
- xfer += oprot->writeI64(this->index_page_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.dictionary_page_offset) {
- xfer += oprot->writeFieldBegin("dictionary_page_offset", ::apache::thrift::protocol::T_I64, 11);
- xfer += oprot->writeI64(this->dictionary_page_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.statistics) {
- xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 12);
- xfer += this->statistics.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.encoding_stats) {
- xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->encoding_stats.size()));
- std::vector<PageEncodingStats> ::const_iterator _iter107;
- for (_iter107 = this->encoding_stats.begin(); _iter107 != this->encoding_stats.end(); ++_iter107)
- {
- xfer += (*_iter107).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.bloom_filter_offset) {
- xfer += oprot->writeFieldBegin("bloom_filter_offset", ::apache::thrift::protocol::T_I64, 14);
- xfer += oprot->writeI64(this->bloom_filter_offset);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnMetaData &a, ColumnMetaData &b) {
- using ::std::swap;
- swap(a.type, b.type);
- swap(a.encodings, b.encodings);
- swap(a.path_in_schema, b.path_in_schema);
- swap(a.codec, b.codec);
- swap(a.num_values, b.num_values);
- swap(a.total_uncompressed_size, b.total_uncompressed_size);
- swap(a.total_compressed_size, b.total_compressed_size);
- swap(a.key_value_metadata, b.key_value_metadata);
- swap(a.data_page_offset, b.data_page_offset);
- swap(a.index_page_offset, b.index_page_offset);
- swap(a.dictionary_page_offset, b.dictionary_page_offset);
- swap(a.statistics, b.statistics);
- swap(a.encoding_stats, b.encoding_stats);
- swap(a.bloom_filter_offset, b.bloom_filter_offset);
- swap(a.__isset, b.__isset);
-}
-
-ColumnMetaData::ColumnMetaData(const ColumnMetaData& other108) {
- type = other108.type;
- encodings = other108.encodings;
- path_in_schema = other108.path_in_schema;
- codec = other108.codec;
- num_values = other108.num_values;
- total_uncompressed_size = other108.total_uncompressed_size;
- total_compressed_size = other108.total_compressed_size;
- key_value_metadata = other108.key_value_metadata;
- data_page_offset = other108.data_page_offset;
- index_page_offset = other108.index_page_offset;
- dictionary_page_offset = other108.dictionary_page_offset;
- statistics = other108.statistics;
- encoding_stats = other108.encoding_stats;
- bloom_filter_offset = other108.bloom_filter_offset;
- __isset = other108.__isset;
-}
-ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other109) {
- type = other109.type;
- encodings = other109.encodings;
- path_in_schema = other109.path_in_schema;
- codec = other109.codec;
- num_values = other109.num_values;
- total_uncompressed_size = other109.total_uncompressed_size;
- total_compressed_size = other109.total_compressed_size;
- key_value_metadata = other109.key_value_metadata;
- data_page_offset = other109.data_page_offset;
- index_page_offset = other109.index_page_offset;
- dictionary_page_offset = other109.dictionary_page_offset;
- statistics = other109.statistics;
- encoding_stats = other109.encoding_stats;
- bloom_filter_offset = other109.bloom_filter_offset;
- __isset = other109.__isset;
- return *this;
-}
-void ColumnMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnMetaData(";
- out << "type=" << to_string(type);
- out << ", " << "encodings=" << to_string(encodings);
- out << ", " << "path_in_schema=" << to_string(path_in_schema);
- out << ", " << "codec=" << to_string(codec);
- out << ", " << "num_values=" << to_string(num_values);
- out << ", " << "total_uncompressed_size=" << to_string(total_uncompressed_size);
- out << ", " << "total_compressed_size=" << to_string(total_compressed_size);
- out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
- out << ", " << "data_page_offset=" << to_string(data_page_offset);
- out << ", " << "index_page_offset="; (__isset.index_page_offset ? (out << to_string(index_page_offset)) : (out << "<null>"));
- out << ", " << "dictionary_page_offset="; (__isset.dictionary_page_offset ? (out << to_string(dictionary_page_offset)) : (out << "<null>"));
- out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
- out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "<null>"));
- out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "<null>"));
- out << ")";
-}
-
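
Every struct in this generated file repeats the pattern ColumnMetaData just completed: read(), write(), a free swap(), copy construction/assignment, and printTo(). Optional fields ride on the companion __isset bitfield: the generated __set_* setters flip the bit, read() flips it when the field arrives on the wire, and write()/printTo() consult it before emitting anything. A minimal sketch of those semantics, assuming the generated header is named parquet_types.h and the namespace is parquet::format (neither appears in this hunk):

    #include "parquet_types.h"  // assumed name of the header this .cpp implements

    void bloom_filter_offset_demo() {
      parquet::format::ColumnMetaData md;            // namespace is an assumption
      // bloom_filter_offset (field id 14 above) is optional: absent until set.
      bool before = md.__isset.bloom_filter_offset;  // false
      md.__set_bloom_filter_offset(8192);            // setter also flips the bit
      bool after = md.__isset.bloom_filter_offset;   // true
      (void)before; (void)after;
      // write() now emits field 14, and printTo() prints the value
      // instead of "<null>".
    }
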
-
-EncryptionWithFooterKey::~EncryptionWithFooterKey() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EncryptionWithFooterKey::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t EncryptionWithFooterKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EncryptionWithFooterKey");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other110) {
- (void) other110;
-}
-EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other111) {
- (void) other111;
- return *this;
-}
-void EncryptionWithFooterKey::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EncryptionWithFooterKey(";
- out << ")";
-}
-
-
-EncryptionWithColumnKey::~EncryptionWithColumnKey() noexcept {
-}
-
-
-void EncryptionWithColumnKey::__set_path_in_schema(const std::vector<std::string> & val) {
- this->path_in_schema = val;
-}
-
-void EncryptionWithColumnKey::__set_key_metadata(const std::string& val) {
- this->key_metadata = val;
-__isset.key_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_path_in_schema = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->path_in_schema.clear();
- uint32_t _size112;
- ::apache::thrift::protocol::TType _etype115;
- xfer += iprot->readListBegin(_etype115, _size112);
- this->path_in_schema.resize(_size112);
- uint32_t _i116;
- for (_i116 = 0; _i116 < _size112; ++_i116)
- {
- xfer += iprot->readString(this->path_in_schema[_i116]);
- }
- xfer += iprot->readListEnd();
- }
- isset_path_in_schema = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->key_metadata);
- this->__isset.key_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_path_in_schema)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EncryptionWithColumnKey");
-
- xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
- std::vector<std::string> ::const_iterator _iter117;
- for (_iter117 = this->path_in_schema.begin(); _iter117 != this->path_in_schema.end(); ++_iter117)
- {
- xfer += oprot->writeString((*_iter117));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_metadata) {
- xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->key_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) {
- using ::std::swap;
- swap(a.path_in_schema, b.path_in_schema);
- swap(a.key_metadata, b.key_metadata);
- swap(a.__isset, b.__isset);
-}
-
-EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other118) {
- path_in_schema = other118.path_in_schema;
- key_metadata = other118.key_metadata;
- __isset = other118.__isset;
-}
-EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other119) {
- path_in_schema = other119.path_in_schema;
- key_metadata = other119.key_metadata;
- __isset = other119.__isset;
- return *this;
-}
-void EncryptionWithColumnKey::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EncryptionWithColumnKey(";
- out << "path_in_schema=" << to_string(path_in_schema);
- out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
- out << ")";
-}
-
-
-ColumnCryptoMetaData::~ColumnCryptoMetaData() noexcept {
-}
-
-
-void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val) {
- this->ENCRYPTION_WITH_FOOTER_KEY = val;
-__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
-}
-
-void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val) {
- this->ENCRYPTION_WITH_COLUMN_KEY = val;
-__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->ENCRYPTION_WITH_FOOTER_KEY.read(iprot);
- this->__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->ENCRYPTION_WITH_COLUMN_KEY.read(iprot);
- this->__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t ColumnCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnCryptoMetaData");
-
- if (this->__isset.ENCRYPTION_WITH_FOOTER_KEY) {
- xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_FOOTER_KEY", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->ENCRYPTION_WITH_FOOTER_KEY.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.ENCRYPTION_WITH_COLUMN_KEY) {
- xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_COLUMN_KEY", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->ENCRYPTION_WITH_COLUMN_KEY.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) {
- using ::std::swap;
- swap(a.ENCRYPTION_WITH_FOOTER_KEY, b.ENCRYPTION_WITH_FOOTER_KEY);
- swap(a.ENCRYPTION_WITH_COLUMN_KEY, b.ENCRYPTION_WITH_COLUMN_KEY);
- swap(a.__isset, b.__isset);
-}
-
-ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other120) {
- ENCRYPTION_WITH_FOOTER_KEY = other120.ENCRYPTION_WITH_FOOTER_KEY;
- ENCRYPTION_WITH_COLUMN_KEY = other120.ENCRYPTION_WITH_COLUMN_KEY;
- __isset = other120.__isset;
-}
-ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other121) {
- ENCRYPTION_WITH_FOOTER_KEY = other121.ENCRYPTION_WITH_FOOTER_KEY;
- ENCRYPTION_WITH_COLUMN_KEY = other121.ENCRYPTION_WITH_COLUMN_KEY;
- __isset = other121.__isset;
- return *this;
-}
-void ColumnCryptoMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnCryptoMetaData(";
- out << "ENCRYPTION_WITH_FOOTER_KEY="; (__isset.ENCRYPTION_WITH_FOOTER_KEY ? (out << to_string(ENCRYPTION_WITH_FOOTER_KEY)) : (out << "<null>"));
- out << ", " << "ENCRYPTION_WITH_COLUMN_KEY="; (__isset.ENCRYPTION_WITH_COLUMN_KEY ? (out << to_string(ENCRYPTION_WITH_COLUMN_KEY)) : (out << "<null>"));
- out << ")";
-}
-
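
ColumnCryptoMetaData is a Thrift union rendered as a plain struct: at most one arm is populated, and __isset records which. A dispatch sketch (only the member names come from this hunk; everything else is assumed):

    void crypto_dispatch(const parquet::format::ColumnCryptoMetaData& crypto) {
      if (crypto.__isset.ENCRYPTION_WITH_FOOTER_KEY) {
        // Column encrypted with the file's footer key; the arm itself
        // carries no fields (see EncryptionWithFooterKey above).
      } else if (crypto.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
        const auto& arm = crypto.ENCRYPTION_WITH_COLUMN_KEY;
        // Column has its own key: path_in_schema names the column and the
        // optional key_metadata helps a reader retrieve that key.
        (void)arm;
      } else {
        // Neither arm set: invalid for an encrypted column, treat as an error.
      }
    }
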
-
-ColumnChunk::~ColumnChunk() noexcept {
-}
-
-
-void ColumnChunk::__set_file_path(const std::string& val) {
- this->file_path = val;
-__isset.file_path = true;
-}
-
-void ColumnChunk::__set_file_offset(const int64_t val) {
- this->file_offset = val;
-}
-
-void ColumnChunk::__set_meta_data(const ColumnMetaData& val) {
- this->meta_data = val;
-__isset.meta_data = true;
-}
-
-void ColumnChunk::__set_offset_index_offset(const int64_t val) {
- this->offset_index_offset = val;
-__isset.offset_index_offset = true;
-}
-
-void ColumnChunk::__set_offset_index_length(const int32_t val) {
- this->offset_index_length = val;
-__isset.offset_index_length = true;
-}
-
-void ColumnChunk::__set_column_index_offset(const int64_t val) {
- this->column_index_offset = val;
-__isset.column_index_offset = true;
-}
-
-void ColumnChunk::__set_column_index_length(const int32_t val) {
- this->column_index_length = val;
-__isset.column_index_length = true;
-}
-
-void ColumnChunk::__set_crypto_metadata(const ColumnCryptoMetaData& val) {
- this->crypto_metadata = val;
-__isset.crypto_metadata = true;
-}
-
-void ColumnChunk::__set_encrypted_column_metadata(const std::string& val) {
- this->encrypted_column_metadata = val;
-__isset.encrypted_column_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnChunk::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_file_offset = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->file_path);
- this->__isset.file_path = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->file_offset);
- isset_file_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->meta_data.read(iprot);
- this->__isset.meta_data = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->offset_index_offset);
- this->__isset.offset_index_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->offset_index_length);
- this->__isset.offset_index_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->column_index_offset);
- this->__isset.column_index_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->column_index_length);
- this->__isset.column_index_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->crypto_metadata.read(iprot);
- this->__isset.crypto_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->encrypted_column_metadata);
- this->__isset.encrypted_column_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_file_offset)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t ColumnChunk::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnChunk");
-
- if (this->__isset.file_path) {
- xfer += oprot->writeFieldBegin("file_path", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeString(this->file_path);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 2);
- xfer += oprot->writeI64(this->file_offset);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.meta_data) {
- xfer += oprot->writeFieldBegin("meta_data", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->meta_data.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.offset_index_offset) {
- xfer += oprot->writeFieldBegin("offset_index_offset", ::apache::thrift::protocol::T_I64, 4);
- xfer += oprot->writeI64(this->offset_index_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.offset_index_length) {
- xfer += oprot->writeFieldBegin("offset_index_length", ::apache::thrift::protocol::T_I32, 5);
- xfer += oprot->writeI32(this->offset_index_length);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.column_index_offset) {
- xfer += oprot->writeFieldBegin("column_index_offset", ::apache::thrift::protocol::T_I64, 6);
- xfer += oprot->writeI64(this->column_index_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.column_index_length) {
- xfer += oprot->writeFieldBegin("column_index_length", ::apache::thrift::protocol::T_I32, 7);
- xfer += oprot->writeI32(this->column_index_length);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.crypto_metadata) {
- xfer += oprot->writeFieldBegin("crypto_metadata", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->crypto_metadata.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.encrypted_column_metadata) {
- xfer += oprot->writeFieldBegin("encrypted_column_metadata", ::apache::thrift::protocol::T_STRING, 9);
- xfer += oprot->writeBinary(this->encrypted_column_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnChunk &a, ColumnChunk &b) {
- using ::std::swap;
- swap(a.file_path, b.file_path);
- swap(a.file_offset, b.file_offset);
- swap(a.meta_data, b.meta_data);
- swap(a.offset_index_offset, b.offset_index_offset);
- swap(a.offset_index_length, b.offset_index_length);
- swap(a.column_index_offset, b.column_index_offset);
- swap(a.column_index_length, b.column_index_length);
- swap(a.crypto_metadata, b.crypto_metadata);
- swap(a.encrypted_column_metadata, b.encrypted_column_metadata);
- swap(a.__isset, b.__isset);
-}
-
-ColumnChunk::ColumnChunk(const ColumnChunk& other122) {
- file_path = other122.file_path;
- file_offset = other122.file_offset;
- meta_data = other122.meta_data;
- offset_index_offset = other122.offset_index_offset;
- offset_index_length = other122.offset_index_length;
- column_index_offset = other122.column_index_offset;
- column_index_length = other122.column_index_length;
- crypto_metadata = other122.crypto_metadata;
- encrypted_column_metadata = other122.encrypted_column_metadata;
- __isset = other122.__isset;
-}
-ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other123) {
- file_path = other123.file_path;
- file_offset = other123.file_offset;
- meta_data = other123.meta_data;
- offset_index_offset = other123.offset_index_offset;
- offset_index_length = other123.offset_index_length;
- column_index_offset = other123.column_index_offset;
- column_index_length = other123.column_index_length;
- crypto_metadata = other123.crypto_metadata;
- encrypted_column_metadata = other123.encrypted_column_metadata;
- __isset = other123.__isset;
- return *this;
-}
-void ColumnChunk::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnChunk(";
- out << "file_path="; (__isset.file_path ? (out << to_string(file_path)) : (out << "<null>"));
- out << ", " << "file_offset=" << to_string(file_offset);
- out << ", " << "meta_data="; (__isset.meta_data ? (out << to_string(meta_data)) : (out << "<null>"));
- out << ", " << "offset_index_offset="; (__isset.offset_index_offset ? (out << to_string(offset_index_offset)) : (out << "<null>"));
- out << ", " << "offset_index_length="; (__isset.offset_index_length ? (out << to_string(offset_index_length)) : (out << "<null>"));
- out << ", " << "column_index_offset="; (__isset.column_index_offset ? (out << to_string(column_index_offset)) : (out << "<null>"));
- out << ", " << "column_index_length="; (__isset.column_index_length ? (out << to_string(column_index_length)) : (out << "<null>"));
- out << ", " << "crypto_metadata="; (__isset.crypto_metadata ? (out << to_string(crypto_metadata)) : (out << "<null>"));
- out << ", " << "encrypted_column_metadata="; (__isset.encrypted_column_metadata ? (out << to_string(encrypted_column_metadata)) : (out << "<null>"));
- out << ")";
-}
-
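
ColumnChunk::read() tracks its single required field through the local isset_file_offset flag and throws TProtocolException(INVALID_DATA) after the T_STOP marker if the field never appeared, so deserializing untrusted bytes deserves a guard. A hedged sketch, given a protocol already positioned at a serialized ColumnChunk:

    #include <thrift/protocol/TProtocol.h>
    #include <thrift/protocol/TProtocolException.h>

    bool try_read_chunk(::apache::thrift::protocol::TProtocol* iprot,
                        parquet::format::ColumnChunk* chunk) {
      try {
        chunk->read(iprot);
        return true;
      } catch (const ::apache::thrift::protocol::TProtocolException&) {
        // Reached when a required field (here: file_offset) was missing
        // (INVALID_DATA) or the bytes were otherwise malformed.
        return false;
      }
    }
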
-
-RowGroup::~RowGroup() noexcept {
-}
-
-
-void RowGroup::__set_columns(const std::vector<ColumnChunk> & val) {
- this->columns = val;
-}
-
-void RowGroup::__set_total_byte_size(const int64_t val) {
- this->total_byte_size = val;
-}
-
-void RowGroup::__set_num_rows(const int64_t val) {
- this->num_rows = val;
-}
-
-void RowGroup::__set_sorting_columns(const std::vector<SortingColumn> & val) {
- this->sorting_columns = val;
-__isset.sorting_columns = true;
-}
-
-void RowGroup::__set_file_offset(const int64_t val) {
- this->file_offset = val;
-__isset.file_offset = true;
-}
-
-void RowGroup::__set_total_compressed_size(const int64_t val) {
- this->total_compressed_size = val;
-__isset.total_compressed_size = true;
-}
-
-void RowGroup::__set_ordinal(const int16_t val) {
- this->ordinal = val;
-__isset.ordinal = true;
-}
-std::ostream& operator<<(std::ostream& out, const RowGroup& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_columns = false;
- bool isset_total_byte_size = false;
- bool isset_num_rows = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->columns.clear();
- uint32_t _size124;
- ::apache::thrift::protocol::TType _etype127;
- xfer += iprot->readListBegin(_etype127, _size124);
- this->columns.resize(_size124);
- uint32_t _i128;
- for (_i128 = 0; _i128 < _size124; ++_i128)
- {
- xfer += this->columns[_i128].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_columns = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_byte_size);
- isset_total_byte_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->num_rows);
- isset_num_rows = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->sorting_columns.clear();
- uint32_t _size129;
- ::apache::thrift::protocol::TType _etype132;
- xfer += iprot->readListBegin(_etype132, _size129);
- this->sorting_columns.resize(_size129);
- uint32_t _i133;
- for (_i133 = 0; _i133 < _size129; ++_i133)
- {
- xfer += this->sorting_columns[_i133].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.sorting_columns = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->file_offset);
- this->__isset.file_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_compressed_size);
- this->__isset.total_compressed_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I16) {
- xfer += iprot->readI16(this->ordinal);
- this->__isset.ordinal = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_columns)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_total_byte_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_rows)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("RowGroup");
-
- xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->columns.size()));
- std::vector<ColumnChunk> ::const_iterator _iter134;
- for (_iter134 = this->columns.begin(); _iter134 != this->columns.end(); ++_iter134)
- {
- xfer += (*_iter134).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("total_byte_size", ::apache::thrift::protocol::T_I64, 2);
- xfer += oprot->writeI64(this->total_byte_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->num_rows);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.sorting_columns) {
- xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->sorting_columns.size()));
- std::vector<SortingColumn> ::const_iterator _iter135;
- for (_iter135 = this->sorting_columns.begin(); _iter135 != this->sorting_columns.end(); ++_iter135)
- {
- xfer += (*_iter135).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.file_offset) {
- xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 5);
- xfer += oprot->writeI64(this->file_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.total_compressed_size) {
- xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 6);
- xfer += oprot->writeI64(this->total_compressed_size);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.ordinal) {
- xfer += oprot->writeFieldBegin("ordinal", ::apache::thrift::protocol::T_I16, 7);
- xfer += oprot->writeI16(this->ordinal);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(RowGroup &a, RowGroup &b) {
- using ::std::swap;
- swap(a.columns, b.columns);
- swap(a.total_byte_size, b.total_byte_size);
- swap(a.num_rows, b.num_rows);
- swap(a.sorting_columns, b.sorting_columns);
- swap(a.file_offset, b.file_offset);
- swap(a.total_compressed_size, b.total_compressed_size);
- swap(a.ordinal, b.ordinal);
- swap(a.__isset, b.__isset);
-}
-
-RowGroup::RowGroup(const RowGroup& other136) {
- columns = other136.columns;
- total_byte_size = other136.total_byte_size;
- num_rows = other136.num_rows;
- sorting_columns = other136.sorting_columns;
- file_offset = other136.file_offset;
- total_compressed_size = other136.total_compressed_size;
- ordinal = other136.ordinal;
- __isset = other136.__isset;
-}
-RowGroup& RowGroup::operator=(const RowGroup& other137) {
- columns = other137.columns;
- total_byte_size = other137.total_byte_size;
- num_rows = other137.num_rows;
- sorting_columns = other137.sorting_columns;
- file_offset = other137.file_offset;
- total_compressed_size = other137.total_compressed_size;
- ordinal = other137.ordinal;
- __isset = other137.__isset;
- return *this;
-}
-void RowGroup::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "RowGroup(";
- out << "columns=" << to_string(columns);
- out << ", " << "total_byte_size=" << to_string(total_byte_size);
- out << ", " << "num_rows=" << to_string(num_rows);
- out << ", " << "sorting_columns="; (__isset.sorting_columns ? (out << to_string(sorting_columns)) : (out << "<null>"));
- out << ", " << "file_offset="; (__isset.file_offset ? (out << to_string(file_offset)) : (out << "<null>"));
- out << ", " << "total_compressed_size="; (__isset.total_compressed_size ? (out << to_string(total_compressed_size)) : (out << "<null>"));
- out << ", " << "ordinal="; (__isset.ordinal ? (out << to_string(ordinal)) : (out << "<null>"));
- out << ")";
-}
-
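
RowGroup shows the generated list handling on both sides: read() does readListBegin / resize / per-element read / readListEnd, and write() mirrors it with an explicit element count. Assembling one by hand might look like this (values illustrative, names beyond this hunk assumed):

    #include <vector>

    void make_row_group(parquet::format::RowGroup* rg) {
      std::vector<parquet::format::ColumnChunk> cols(1);
      cols[0].__set_file_offset(4);        // the one required ColumnChunk field
      rg->__set_columns(cols);             // required: read() enforces presence
      rg->__set_total_byte_size(1 << 20);  // required
      rg->__set_num_rows(1000);            // required
      rg->__set_ordinal(0);                // optional: also flips __isset.ordinal
    }
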
-
-TypeDefinedOrder::~TypeDefinedOrder() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TypeDefinedOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t TypeDefinedOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TypeDefinedOrder");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other138) {
- (void) other138;
-}
-TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other139) {
- (void) other139;
- return *this;
-}
-void TypeDefinedOrder::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TypeDefinedOrder(";
- out << ")";
-}
-
-
-ColumnOrder::~ColumnOrder() noexcept {
-}
-
-
-void ColumnOrder::__set_TYPE_ORDER(const TypeDefinedOrder& val) {
- this->TYPE_ORDER = val;
-__isset.TYPE_ORDER = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->TYPE_ORDER.read(iprot);
- this->__isset.TYPE_ORDER = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t ColumnOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnOrder");
-
- if (this->__isset.TYPE_ORDER) {
- xfer += oprot->writeFieldBegin("TYPE_ORDER", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->TYPE_ORDER.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnOrder &a, ColumnOrder &b) {
- using ::std::swap;
- swap(a.TYPE_ORDER, b.TYPE_ORDER);
- swap(a.__isset, b.__isset);
-}
-
-ColumnOrder::ColumnOrder(const ColumnOrder& other140) {
- TYPE_ORDER = other140.TYPE_ORDER;
- __isset = other140.__isset;
-}
-ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other141) {
- TYPE_ORDER = other141.TYPE_ORDER;
- __isset = other141.__isset;
- return *this;
-}
-void ColumnOrder::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnOrder(";
- out << "TYPE_ORDER="; (__isset.TYPE_ORDER ? (out << to_string(TYPE_ORDER)) : (out << "<null>"));
- out << ")";
-}
-
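
Every default: arm in these readers calls iprot->skip(ftype), which is what keeps them forward compatible: a field id the reader does not know, say one introduced by a newer writer, is consumed and ignored. ColumnOrder makes that easy to demonstrate, since hand-writing a single unknown field is enough. Sketch, assuming a Thrift runtime built around std::shared_ptr:

    #include <memory>
    #include <thrift/protocol/TCompactProtocol.h>
    #include <thrift/transport/TBufferTransports.h>

    void skip_unknown_field_demo() {
      using ::apache::thrift::protocol::TCompactProtocol;
      using ::apache::thrift::protocol::T_I32;
      using ::apache::thrift::transport::TMemoryBuffer;

      auto buf = std::make_shared<TMemoryBuffer>();
      TCompactProtocol proto(buf);

      // Hand-write a struct whose only field has an id (99) that
      // ColumnOrder::read() has never heard of.
      proto.writeStructBegin("ColumnOrder");
      proto.writeFieldBegin("future_field", T_I32, 99);
      proto.writeI32(7);
      proto.writeFieldEnd();
      proto.writeFieldStop();
      proto.writeStructEnd();

      parquet::format::ColumnOrder order;
      order.read(&proto);  // field 99 falls into default: and is skipped
      // order.__isset.TYPE_ORDER stays false; nothing was populated.
    }
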
-
-PageLocation::~PageLocation() noexcept {
-}
-
-
-void PageLocation::__set_offset(const int64_t val) {
- this->offset = val;
-}
-
-void PageLocation::__set_compressed_page_size(const int32_t val) {
- this->compressed_page_size = val;
-}
-
-void PageLocation::__set_first_row_index(const int64_t val) {
- this->first_row_index = val;
-}
-std::ostream& operator<<(std::ostream& out, const PageLocation& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t PageLocation::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_offset = false;
- bool isset_compressed_page_size = false;
- bool isset_first_row_index = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->offset);
- isset_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->compressed_page_size);
- isset_compressed_page_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->first_row_index);
- isset_first_row_index = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_offset)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_compressed_page_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_first_row_index)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t PageLocation::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("PageLocation");
-
- xfer += oprot->writeFieldBegin("offset", ::apache::thrift::protocol::T_I64, 1);
- xfer += oprot->writeI64(this->offset);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->compressed_page_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("first_row_index", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->first_row_index);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(PageLocation &a, PageLocation &b) {
- using ::std::swap;
- swap(a.offset, b.offset);
- swap(a.compressed_page_size, b.compressed_page_size);
- swap(a.first_row_index, b.first_row_index);
-}
-
-PageLocation::PageLocation(const PageLocation& other142) {
- offset = other142.offset;
- compressed_page_size = other142.compressed_page_size;
- first_row_index = other142.first_row_index;
-}
-PageLocation& PageLocation::operator=(const PageLocation& other143) {
- offset = other143.offset;
- compressed_page_size = other143.compressed_page_size;
- first_row_index = other143.first_row_index;
- return *this;
-}
-void PageLocation::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "PageLocation(";
- out << "offset=" << to_string(offset);
- out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
- out << ", " << "first_row_index=" << to_string(first_row_index);
- out << ")";
-}
-
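
With only required fields, PageLocation is the handiest struct here for a round-trip smoke test. Same assumptions as above (header name, parquet::format namespace, std::shared_ptr-based Thrift runtime):

    #include <cassert>
    #include <memory>
    #include <thrift/protocol/TCompactProtocol.h>
    #include <thrift/transport/TBufferTransports.h>

    void page_location_round_trip() {
      using ::apache::thrift::protocol::TCompactProtocol;
      using ::apache::thrift::transport::TMemoryBuffer;

      auto buf = std::make_shared<TMemoryBuffer>();
      TCompactProtocol proto(buf);

      parquet::format::PageLocation in;
      in.__set_offset(4);                   // byte position of the page
      in.__set_compressed_page_size(1024);
      in.__set_first_row_index(0);
      in.write(&proto);                     // serialize into the memory buffer

      parquet::format::PageLocation out;
      out.read(&proto);                     // read back from the same buffer
      assert(out.offset == in.offset);
      assert(out.compressed_page_size == in.compressed_page_size);
      assert(out.first_row_index == in.first_row_index);
    }
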
-
-OffsetIndex::~OffsetIndex() noexcept {
-}
-
-
-void OffsetIndex::__set_page_locations(const std::vector<PageLocation> & val) {
- this->page_locations = val;
-}
-std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_page_locations = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->page_locations.clear();
- uint32_t _size144;
- ::apache::thrift::protocol::TType _etype147;
- xfer += iprot->readListBegin(_etype147, _size144);
- this->page_locations.resize(_size144);
- uint32_t _i148;
- for (_i148 = 0; _i148 < _size144; ++_i148)
- {
- xfer += this->page_locations[_i148].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_page_locations = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_page_locations)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("OffsetIndex");
-
- xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->page_locations.size()));
- std::vector<PageLocation> ::const_iterator _iter149;
- for (_iter149 = this->page_locations.begin(); _iter149 != this->page_locations.end(); ++_iter149)
- {
- xfer += (*_iter149).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(OffsetIndex &a, OffsetIndex &b) {
- using ::std::swap;
- swap(a.page_locations, b.page_locations);
-}
-
-OffsetIndex::OffsetIndex(const OffsetIndex& other150) {
- page_locations = other150.page_locations;
-}
-OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other151) {
- page_locations = other151.page_locations;
- return *this;
-}
-void OffsetIndex::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "OffsetIndex(";
- out << "page_locations=" << to_string(page_locations);
- out << ")";
-}
-
-
-ColumnIndex::~ColumnIndex() noexcept {
-}
-
-
-void ColumnIndex::__set_null_pages(const std::vector<bool> & val) {
- this->null_pages = val;
-}
-
-void ColumnIndex::__set_min_values(const std::vector<std::string> & val) {
- this->min_values = val;
-}
-
-void ColumnIndex::__set_max_values(const std::vector<std::string> & val) {
- this->max_values = val;
-}
-
-void ColumnIndex::__set_boundary_order(const BoundaryOrder::type val) {
- this->boundary_order = val;
-}
-
-void ColumnIndex::__set_null_counts(const std::vector<int64_t> & val) {
- this->null_counts = val;
-__isset.null_counts = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_null_pages = false;
- bool isset_min_values = false;
- bool isset_max_values = false;
- bool isset_boundary_order = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->null_pages.clear();
- uint32_t _size152;
- ::apache::thrift::protocol::TType _etype155;
- xfer += iprot->readListBegin(_etype155, _size152);
- this->null_pages.resize(_size152);
- uint32_t _i156;
- for (_i156 = 0; _i156 < _size152; ++_i156)
- {
- bool result;
- xfer += iprot->readBool(result);
- this->null_pages[_i156] = result;
- }
- xfer += iprot->readListEnd();
- }
- isset_null_pages = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->min_values.clear();
- uint32_t _size157;
- ::apache::thrift::protocol::TType _etype160;
- xfer += iprot->readListBegin(_etype160, _size157);
- this->min_values.resize(_size157);
- uint32_t _i161;
- for (_i161 = 0; _i161 < _size157; ++_i161)
- {
- xfer += iprot->readBinary(this->min_values[_i161]);
- }
- xfer += iprot->readListEnd();
- }
- isset_min_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->max_values.clear();
- uint32_t _size162;
- ::apache::thrift::protocol::TType _etype165;
- xfer += iprot->readListBegin(_etype165, _size162);
- this->max_values.resize(_size162);
- uint32_t _i166;
- for (_i166 = 0; _i166 < _size162; ++_i166)
- {
- xfer += iprot->readBinary(this->max_values[_i166]);
- }
- xfer += iprot->readListEnd();
- }
- isset_max_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast167;
- xfer += iprot->readI32(ecast167);
- this->boundary_order = (BoundaryOrder::type)ecast167;
- isset_boundary_order = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->null_counts.clear();
- uint32_t _size168;
- ::apache::thrift::protocol::TType _etype171;
- xfer += iprot->readListBegin(_etype171, _size168);
- this->null_counts.resize(_size168);
- uint32_t _i172;
- for (_i172 = 0; _i172 < _size168; ++_i172)
- {
- xfer += iprot->readI64(this->null_counts[_i172]);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.null_counts = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_null_pages)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_min_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_max_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_boundary_order)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnIndex");
-
- xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast<uint32_t>(this->null_pages.size()));
- std::vector<bool> ::const_iterator _iter173;
- for (_iter173 = this->null_pages.begin(); _iter173 != this->null_pages.end(); ++_iter173)
- {
- xfer += oprot->writeBool((*_iter173));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->min_values.size()));
- std::vector<std::string> ::const_iterator _iter174;
- for (_iter174 = this->min_values.begin(); _iter174 != this->min_values.end(); ++_iter174)
- {
- xfer += oprot->writeBinary((*_iter174));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->max_values.size()));
- std::vector<std::string> ::const_iterator _iter175;
- for (_iter175 = this->max_values.begin(); _iter175 != this->max_values.end(); ++_iter175)
- {
- xfer += oprot->writeBinary((*_iter175));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("boundary_order", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->boundary_order);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.null_counts) {
- xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->null_counts.size()));
- std::vector<int64_t> ::const_iterator _iter176;
- for (_iter176 = this->null_counts.begin(); _iter176 != this->null_counts.end(); ++_iter176)
- {
- xfer += oprot->writeI64((*_iter176));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnIndex &a, ColumnIndex &b) {
- using ::std::swap;
- swap(a.null_pages, b.null_pages);
- swap(a.min_values, b.min_values);
- swap(a.max_values, b.max_values);
- swap(a.boundary_order, b.boundary_order);
- swap(a.null_counts, b.null_counts);
- swap(a.__isset, b.__isset);
-}
-
-ColumnIndex::ColumnIndex(const ColumnIndex& other177) {
- null_pages = other177.null_pages;
- min_values = other177.min_values;
- max_values = other177.max_values;
- boundary_order = other177.boundary_order;
- null_counts = other177.null_counts;
- __isset = other177.__isset;
-}
-ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other178) {
- null_pages = other178.null_pages;
- min_values = other178.min_values;
- max_values = other178.max_values;
- boundary_order = other178.boundary_order;
- null_counts = other178.null_counts;
- __isset = other178.__isset;
- return *this;
-}
-void ColumnIndex::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnIndex(";
- out << "null_pages=" << to_string(null_pages);
- out << ", " << "min_values=" << to_string(min_values);
- out << ", " << "max_values=" << to_string(max_values);
- out << ", " << "boundary_order=" << to_string(boundary_order);
- out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "<null>"));
- out << ")";
-}
-
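
One detail worth noticing in ColumnIndex::read(): the null_pages loop reads into a named bool and assigns it, unlike every other list loop here. std::vector<bool> is the bit-packed specialization, so null_pages[_i156] yields a proxy object rather than a bool lvalue, and the readBool(bool&) overload cannot bind a reference to it; the temporary restores a real bool with an address. The same shape in isolation:

    #include <cstddef>
    #include <vector>
    #include <thrift/protocol/TProtocol.h>

    void read_bool_list(::apache::thrift::protocol::TProtocol* iprot,
                        std::vector<bool>* v) {
      for (std::size_t i = 0; i < v->size(); ++i) {
        bool tmp = false;   // real bool that a bool& parameter can bind to
        iprot->readBool(tmp);
        (*v)[i] = tmp;      // assigning through the proxy is fine
      }
    }
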
-
-AesGcmV1::~AesGcmV1() noexcept {
-}
-
-
-void AesGcmV1::__set_aad_prefix(const std::string& val) {
- this->aad_prefix = val;
-__isset.aad_prefix = true;
-}
-
-void AesGcmV1::__set_aad_file_unique(const std::string& val) {
- this->aad_file_unique = val;
-__isset.aad_file_unique = true;
-}
-
-void AesGcmV1::__set_supply_aad_prefix(const bool val) {
- this->supply_aad_prefix = val;
-__isset.supply_aad_prefix = true;
-}
-std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t AesGcmV1::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_prefix);
- this->__isset.aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_file_unique);
- this->__isset.aad_file_unique = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->supply_aad_prefix);
- this->__isset.supply_aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t AesGcmV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("AesGcmV1");
-
- if (this->__isset.aad_prefix) {
- xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeBinary(this->aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.aad_file_unique) {
- xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->aad_file_unique);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.supply_aad_prefix) {
- xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->supply_aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(AesGcmV1 &a, AesGcmV1 &b) {
- using ::std::swap;
- swap(a.aad_prefix, b.aad_prefix);
- swap(a.aad_file_unique, b.aad_file_unique);
- swap(a.supply_aad_prefix, b.supply_aad_prefix);
- swap(a.__isset, b.__isset);
-}
-
-AesGcmV1::AesGcmV1(const AesGcmV1& other179) {
- aad_prefix = other179.aad_prefix;
- aad_file_unique = other179.aad_file_unique;
- supply_aad_prefix = other179.supply_aad_prefix;
- __isset = other179.__isset;
-}
-AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other180) {
- aad_prefix = other180.aad_prefix;
- aad_file_unique = other180.aad_file_unique;
- supply_aad_prefix = other180.supply_aad_prefix;
- __isset = other180.__isset;
- return *this;
-}
-void AesGcmV1::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "AesGcmV1(";
- out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
- out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
- out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
- out << ")";
-}
-
-
-AesGcmCtrV1::~AesGcmCtrV1() noexcept {
-}
-
-
-void AesGcmCtrV1::__set_aad_prefix(const std::string& val) {
- this->aad_prefix = val;
-__isset.aad_prefix = true;
-}
-
-void AesGcmCtrV1::__set_aad_file_unique(const std::string& val) {
- this->aad_file_unique = val;
-__isset.aad_file_unique = true;
-}
-
-void AesGcmCtrV1::__set_supply_aad_prefix(const bool val) {
- this->supply_aad_prefix = val;
-__isset.supply_aad_prefix = true;
-}
-std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t AesGcmCtrV1::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_prefix);
- this->__isset.aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_file_unique);
- this->__isset.aad_file_unique = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->supply_aad_prefix);
- this->__isset.supply_aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t AesGcmCtrV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("AesGcmCtrV1");
-
- if (this->__isset.aad_prefix) {
- xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeBinary(this->aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.aad_file_unique) {
- xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->aad_file_unique);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.supply_aad_prefix) {
- xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->supply_aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) {
- using ::std::swap;
- swap(a.aad_prefix, b.aad_prefix);
- swap(a.aad_file_unique, b.aad_file_unique);
- swap(a.supply_aad_prefix, b.supply_aad_prefix);
- swap(a.__isset, b.__isset);
-}
-
-AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other181) {
- aad_prefix = other181.aad_prefix;
- aad_file_unique = other181.aad_file_unique;
- supply_aad_prefix = other181.supply_aad_prefix;
- __isset = other181.__isset;
-}
-AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other182) {
- aad_prefix = other182.aad_prefix;
- aad_file_unique = other182.aad_file_unique;
- supply_aad_prefix = other182.supply_aad_prefix;
- __isset = other182.__isset;
- return *this;
-}
-void AesGcmCtrV1::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "AesGcmCtrV1(";
- out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
- out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
- out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
- out << ")";
-}
-
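
AesGcmV1 and AesGcmCtrV1 are identical shapes; note that their AAD fields are Thrift binary, which the C++ generator maps to std::string and moves through readBinary/writeBinary rather than readString/writeString. The payload is raw bytes, so build it length-aware. Sketch (names beyond this hunk assumed):

    #include <string>

    void set_aad_demo(parquet::format::AesGcmV1* alg) {
      // Length-aware constructor: embedded NUL bytes survive, because the
      // field is binary, not text.
      alg->__set_aad_file_unique(std::string("\x00\x01\x02", 3));
      alg->__set_supply_aad_prefix(false);  // also flips __isset.supply_aad_prefix
      // write() emits the AAD fields via writeBinary; readers must not assume UTF-8.
    }
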
-
-EncryptionAlgorithm::~EncryptionAlgorithm() noexcept {
-}
-
-
-void EncryptionAlgorithm::__set_AES_GCM_V1(const AesGcmV1& val) {
- this->AES_GCM_V1 = val;
-__isset.AES_GCM_V1 = true;
-}
-
-void EncryptionAlgorithm::__set_AES_GCM_CTR_V1(const AesGcmCtrV1& val) {
- this->AES_GCM_CTR_V1 = val;
-__isset.AES_GCM_CTR_V1 = true;
-}
-std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EncryptionAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->AES_GCM_V1.read(iprot);
- this->__isset.AES_GCM_V1 = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->AES_GCM_CTR_V1.read(iprot);
- this->__isset.AES_GCM_CTR_V1 = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t EncryptionAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EncryptionAlgorithm");
-
- if (this->__isset.AES_GCM_V1) {
- xfer += oprot->writeFieldBegin("AES_GCM_V1", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->AES_GCM_V1.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.AES_GCM_CTR_V1) {
- xfer += oprot->writeFieldBegin("AES_GCM_CTR_V1", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->AES_GCM_CTR_V1.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) {
- using ::std::swap;
- swap(a.AES_GCM_V1, b.AES_GCM_V1);
- swap(a.AES_GCM_CTR_V1, b.AES_GCM_CTR_V1);
- swap(a.__isset, b.__isset);
-}
-
-EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other183) {
- AES_GCM_V1 = other183.AES_GCM_V1;
- AES_GCM_CTR_V1 = other183.AES_GCM_CTR_V1;
- __isset = other183.__isset;
-}
-EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other184) {
- AES_GCM_V1 = other184.AES_GCM_V1;
- AES_GCM_CTR_V1 = other184.AES_GCM_CTR_V1;
- __isset = other184.__isset;
- return *this;
-}
-void EncryptionAlgorithm::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EncryptionAlgorithm(";
- out << "AES_GCM_V1="; (__isset.AES_GCM_V1 ? (out << to_string(AES_GCM_V1)) : (out << "<null>"));
- out << ", " << "AES_GCM_CTR_V1="; (__isset.AES_GCM_CTR_V1 ? (out << to_string(AES_GCM_CTR_V1)) : (out << "<null>"));
- out << ")";
-}
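
Annotation (not part of the generated file): EncryptionAlgorithm is generated from a Thrift union, so exactly one of AES_GCM_V1 or AES_GCM_CTR_V1 is expected to be populated, and the __isset bitfield records which arm was chosen. A minimal usage sketch, assuming the generated parquet_types.h that this file implements is on the include path; the values are illustrative only:

#include "parquet_types.h"  // the generated header this file implements
#include <iostream>

int main() {
  parquet::format::EncryptionAlgorithm algo;
  parquet::format::AesGcmV1 gcm;   // aad_prefix etc. are optional; defaults suffice
  algo.__set_AES_GCM_V1(gcm);      // stores the member and flips __isset.AES_GCM_V1
  std::cout << algo << std::endl;  // operator<< delegates to printTo()
  return 0;
}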
-
-
-FileMetaData::~FileMetaData() noexcept {
-}
-
-
-void FileMetaData::__set_version(const int32_t val) {
- this->version = val;
-}
-
-void FileMetaData::__set_schema(const std::vector<SchemaElement> & val) {
- this->schema = val;
-}
-
-void FileMetaData::__set_num_rows(const int64_t val) {
- this->num_rows = val;
-}
-
-void FileMetaData::__set_row_groups(const std::vector<RowGroup> & val) {
- this->row_groups = val;
-}
-
-void FileMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
- this->key_value_metadata = val;
-__isset.key_value_metadata = true;
-}
-
-void FileMetaData::__set_created_by(const std::string& val) {
- this->created_by = val;
-__isset.created_by = true;
-}
-
-void FileMetaData::__set_column_orders(const std::vector<ColumnOrder> & val) {
- this->column_orders = val;
-__isset.column_orders = true;
-}
-
-void FileMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
- this->encryption_algorithm = val;
-__isset.encryption_algorithm = true;
-}
-
-void FileMetaData::__set_footer_signing_key_metadata(const std::string& val) {
- this->footer_signing_key_metadata = val;
-__isset.footer_signing_key_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const FileMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_version = false;
- bool isset_schema = false;
- bool isset_num_rows = false;
- bool isset_row_groups = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->version);
- isset_version = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->schema.clear();
- uint32_t _size185;
- ::apache::thrift::protocol::TType _etype188;
- xfer += iprot->readListBegin(_etype188, _size185);
- this->schema.resize(_size185);
- uint32_t _i189;
- for (_i189 = 0; _i189 < _size185; ++_i189)
- {
- xfer += this->schema[_i189].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_schema = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->num_rows);
- isset_num_rows = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->row_groups.clear();
- uint32_t _size190;
- ::apache::thrift::protocol::TType _etype193;
- xfer += iprot->readListBegin(_etype193, _size190);
- this->row_groups.resize(_size190);
- uint32_t _i194;
- for (_i194 = 0; _i194 < _size190; ++_i194)
- {
- xfer += this->row_groups[_i194].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_row_groups = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->key_value_metadata.clear();
- uint32_t _size195;
- ::apache::thrift::protocol::TType _etype198;
- xfer += iprot->readListBegin(_etype198, _size195);
- this->key_value_metadata.resize(_size195);
- uint32_t _i199;
- for (_i199 = 0; _i199 < _size195; ++_i199)
- {
- xfer += this->key_value_metadata[_i199].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.key_value_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->created_by);
- this->__isset.created_by = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->column_orders.clear();
- uint32_t _size200;
- ::apache::thrift::protocol::TType _etype203;
- xfer += iprot->readListBegin(_etype203, _size200);
- this->column_orders.resize(_size200);
- uint32_t _i204;
- for (_i204 = 0; _i204 < _size200; ++_i204)
- {
- xfer += this->column_orders[_i204].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.column_orders = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->encryption_algorithm.read(iprot);
- this->__isset.encryption_algorithm = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->footer_signing_key_metadata);
- this->__isset.footer_signing_key_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_version)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_schema)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_rows)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_row_groups)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("FileMetaData");
-
- xfer += oprot->writeFieldBegin("version", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->version);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->schema.size()));
- std::vector<SchemaElement> ::const_iterator _iter205;
- for (_iter205 = this->schema.begin(); _iter205 != this->schema.end(); ++_iter205)
- {
- xfer += (*_iter205).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->num_rows);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->row_groups.size()));
- std::vector<RowGroup> ::const_iterator _iter206;
- for (_iter206 = this->row_groups.begin(); _iter206 != this->row_groups.end(); ++_iter206)
- {
- xfer += (*_iter206).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_value_metadata) {
- xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
- std::vector<KeyValue> ::const_iterator _iter207;
- for (_iter207 = this->key_value_metadata.begin(); _iter207 != this->key_value_metadata.end(); ++_iter207)
- {
- xfer += (*_iter207).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.created_by) {
- xfer += oprot->writeFieldBegin("created_by", ::apache::thrift::protocol::T_STRING, 6);
- xfer += oprot->writeString(this->created_by);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.column_orders) {
- xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->column_orders.size()));
- std::vector<ColumnOrder> ::const_iterator _iter208;
- for (_iter208 = this->column_orders.begin(); _iter208 != this->column_orders.end(); ++_iter208)
- {
- xfer += (*_iter208).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.encryption_algorithm) {
- xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->encryption_algorithm.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.footer_signing_key_metadata) {
- xfer += oprot->writeFieldBegin("footer_signing_key_metadata", ::apache::thrift::protocol::T_STRING, 9);
- xfer += oprot->writeBinary(this->footer_signing_key_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(FileMetaData &a, FileMetaData &b) {
- using ::std::swap;
- swap(a.version, b.version);
- swap(a.schema, b.schema);
- swap(a.num_rows, b.num_rows);
- swap(a.row_groups, b.row_groups);
- swap(a.key_value_metadata, b.key_value_metadata);
- swap(a.created_by, b.created_by);
- swap(a.column_orders, b.column_orders);
- swap(a.encryption_algorithm, b.encryption_algorithm);
- swap(a.footer_signing_key_metadata, b.footer_signing_key_metadata);
- swap(a.__isset, b.__isset);
-}
-
-FileMetaData::FileMetaData(const FileMetaData& other209) {
- version = other209.version;
- schema = other209.schema;
- num_rows = other209.num_rows;
- row_groups = other209.row_groups;
- key_value_metadata = other209.key_value_metadata;
- created_by = other209.created_by;
- column_orders = other209.column_orders;
- encryption_algorithm = other209.encryption_algorithm;
- footer_signing_key_metadata = other209.footer_signing_key_metadata;
- __isset = other209.__isset;
-}
-FileMetaData& FileMetaData::operator=(const FileMetaData& other210) {
- version = other210.version;
- schema = other210.schema;
- num_rows = other210.num_rows;
- row_groups = other210.row_groups;
- key_value_metadata = other210.key_value_metadata;
- created_by = other210.created_by;
- column_orders = other210.column_orders;
- encryption_algorithm = other210.encryption_algorithm;
- footer_signing_key_metadata = other210.footer_signing_key_metadata;
- __isset = other210.__isset;
- return *this;
-}
-void FileMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "FileMetaData(";
- out << "version=" << to_string(version);
- out << ", " << "schema=" << to_string(schema);
- out << ", " << "num_rows=" << to_string(num_rows);
- out << ", " << "row_groups=" << to_string(row_groups);
- out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
- out << ", " << "created_by="; (__isset.created_by ? (out << to_string(created_by)) : (out << "<null>"));
- out << ", " << "column_orders="; (__isset.column_orders ? (out << to_string(column_orders)) : (out << "<null>"));
- out << ", " << "encryption_algorithm="; (__isset.encryption_algorithm ? (out << to_string(encryption_algorithm)) : (out << "<null>"));
- out << ", " << "footer_signing_key_metadata="; (__isset.footer_signing_key_metadata ? (out << to_string(footer_signing_key_metadata)) : (out << "<null>"));
- out << ")";
-}
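
Annotation (not part of the generated file): FileMetaData::read tracks the four required footer fields (version, schema, num_rows, row_groups) with local isset_* flags and throws TProtocolException::INVALID_DATA if any is absent, while write emits them unconditionally. In an actual Parquet file the footer metadata is encoded with the Thrift compact protocol; a round-trip sketch using standard Thrift APIs (the helper name round_trip is hypothetical):

#include "parquet_types.h"
#include <thrift/protocol/TCompactProtocol.h>
#include <thrift/transport/TBufferTransports.h>
#include <memory>

// Serialize and immediately re-read a FileMetaData through an in-memory buffer.
void round_trip(const parquet::format::FileMetaData& in,
                parquet::format::FileMetaData* out) {
  using apache::thrift::protocol::TCompactProtocol;
  using apache::thrift::transport::TMemoryBuffer;
  auto buf = std::make_shared<TMemoryBuffer>();
  TCompactProtocol proto(buf);
  in.write(&proto);   // required fields (ids 1-4) are written unconditionally
  out->read(&proto);  // throws TProtocolException if any required field is absent
}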
-
-
-FileCryptoMetaData::~FileCryptoMetaData() noexcept {
-}
-
-
-void FileCryptoMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
- this->encryption_algorithm = val;
-}
-
-void FileCryptoMetaData::__set_key_metadata(const std::string& val) {
- this->key_metadata = val;
-__isset.key_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t FileCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_encryption_algorithm = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->encryption_algorithm.read(iprot);
- isset_encryption_algorithm = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->key_metadata);
- this->__isset.key_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_encryption_algorithm)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t FileCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("FileCryptoMetaData");
-
- xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->encryption_algorithm.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_metadata) {
- xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->key_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) {
- using ::std::swap;
- swap(a.encryption_algorithm, b.encryption_algorithm);
- swap(a.key_metadata, b.key_metadata);
- swap(a.__isset, b.__isset);
-}
-
-FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other211) {
- encryption_algorithm = other211.encryption_algorithm;
- key_metadata = other211.key_metadata;
- __isset = other211.__isset;
-}
-FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other212) {
- encryption_algorithm = other212.encryption_algorithm;
- key_metadata = other212.key_metadata;
- __isset = other212.__isset;
- return *this;
-}
-void FileCryptoMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "FileCryptoMetaData(";
- out << "encryption_algorithm=" << to_string(encryption_algorithm);
- out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
- out << ")";
-}
-
-}} // namespace
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#include "parquet_types.h"
+
+#include <algorithm>
+#include <ostream>
+
+#include <thrift/TToString.h>
+
+namespace parquet { namespace format {
+
+int _kTypeValues[] = {
+ Type::BOOLEAN,
+ Type::INT32,
+ Type::INT64,
+ Type::INT96,
+ Type::FLOAT,
+ Type::DOUBLE,
+ Type::BYTE_ARRAY,
+ Type::FIXED_LEN_BYTE_ARRAY
+};
+const char* _kTypeNames[] = {
+ "BOOLEAN",
+ "INT32",
+ "INT64",
+ "INT96",
+ "FLOAT",
+ "DOUBLE",
+ "BYTE_ARRAY",
+ "FIXED_LEN_BYTE_ARRAY"
+};
+const std::map<int, const char*> _Type_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kTypeValues, _kTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const Type::type& val) {
+ std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
+ if (it != _Type_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const Type::type& val) {
+ std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
+ if (it != _Type_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kConvertedTypeValues[] = {
+ ConvertedType::UTF8,
+ ConvertedType::MAP,
+ ConvertedType::MAP_KEY_VALUE,
+ ConvertedType::LIST,
+ ConvertedType::ENUM,
+ ConvertedType::DECIMAL,
+ ConvertedType::DATE,
+ ConvertedType::TIME_MILLIS,
+ ConvertedType::TIME_MICROS,
+ ConvertedType::TIMESTAMP_MILLIS,
+ ConvertedType::TIMESTAMP_MICROS,
+ ConvertedType::UINT_8,
+ ConvertedType::UINT_16,
+ ConvertedType::UINT_32,
+ ConvertedType::UINT_64,
+ ConvertedType::INT_8,
+ ConvertedType::INT_16,
+ ConvertedType::INT_32,
+ ConvertedType::INT_64,
+ ConvertedType::JSON,
+ ConvertedType::BSON,
+ ConvertedType::INTERVAL
+};
+const char* _kConvertedTypeNames[] = {
+ "UTF8",
+ "MAP",
+ "MAP_KEY_VALUE",
+ "LIST",
+ "ENUM",
+ "DECIMAL",
+ "DATE",
+ "TIME_MILLIS",
+ "TIME_MICROS",
+ "TIMESTAMP_MILLIS",
+ "TIMESTAMP_MICROS",
+ "UINT_8",
+ "UINT_16",
+ "UINT_32",
+ "UINT_64",
+ "INT_8",
+ "INT_16",
+ "INT_32",
+ "INT_64",
+ "JSON",
+ "BSON",
+ "INTERVAL"
+};
+const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(22, _kConvertedTypeValues, _kConvertedTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val) {
+ std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
+ if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const ConvertedType::type& val) {
+ std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
+ if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kFieldRepetitionTypeValues[] = {
+ FieldRepetitionType::REQUIRED,
+ FieldRepetitionType::OPTIONAL,
+ FieldRepetitionType::REPEATED
+};
+const char* _kFieldRepetitionTypeNames[] = {
+ "REQUIRED",
+ "OPTIONAL",
+ "REPEATED"
+};
+const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val) {
+ std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
+ if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const FieldRepetitionType::type& val) {
+ std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
+ if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kEncodingValues[] = {
+ Encoding::PLAIN,
+ Encoding::PLAIN_DICTIONARY,
+ Encoding::RLE,
+ Encoding::BIT_PACKED,
+ Encoding::DELTA_BINARY_PACKED,
+ Encoding::DELTA_LENGTH_BYTE_ARRAY,
+ Encoding::DELTA_BYTE_ARRAY,
+ Encoding::RLE_DICTIONARY,
+ Encoding::BYTE_STREAM_SPLIT
+};
+const char* _kEncodingNames[] = {
+ "PLAIN",
+ "PLAIN_DICTIONARY",
+ "RLE",
+ "BIT_PACKED",
+ "DELTA_BINARY_PACKED",
+ "DELTA_LENGTH_BYTE_ARRAY",
+ "DELTA_BYTE_ARRAY",
+ "RLE_DICTIONARY",
+ "BYTE_STREAM_SPLIT"
+};
+const std::map<int, const char*> _Encoding_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(9, _kEncodingValues, _kEncodingNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const Encoding::type& val) {
+ std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
+ if (it != _Encoding_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const Encoding::type& val) {
+ std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
+ if (it != _Encoding_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kCompressionCodecValues[] = {
+ CompressionCodec::UNCOMPRESSED,
+ CompressionCodec::SNAPPY,
+ CompressionCodec::GZIP,
+ CompressionCodec::LZO,
+ CompressionCodec::BROTLI,
+ CompressionCodec::LZ4,
+ CompressionCodec::ZSTD,
+ CompressionCodec::LZ4_RAW
+};
+const char* _kCompressionCodecNames[] = {
+ "UNCOMPRESSED",
+ "SNAPPY",
+ "GZIP",
+ "LZO",
+ "BROTLI",
+ "LZ4",
+ "ZSTD",
+ "LZ4_RAW"
+};
+const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kCompressionCodecValues, _kCompressionCodecNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val) {
+ std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
+ if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const CompressionCodec::type& val) {
+ std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
+ if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kPageTypeValues[] = {
+ PageType::DATA_PAGE,
+ PageType::INDEX_PAGE,
+ PageType::DICTIONARY_PAGE,
+ PageType::DATA_PAGE_V2
+};
+const char* _kPageTypeNames[] = {
+ "DATA_PAGE",
+ "INDEX_PAGE",
+ "DICTIONARY_PAGE",
+ "DATA_PAGE_V2"
+};
+const std::map<int, const char*> _PageType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(4, _kPageTypeValues, _kPageTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const PageType::type& val) {
+ std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
+ if (it != _PageType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const PageType::type& val) {
+ std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
+ if (it != _PageType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kBoundaryOrderValues[] = {
+ BoundaryOrder::UNORDERED,
+ BoundaryOrder::ASCENDING,
+ BoundaryOrder::DESCENDING
+};
+const char* _kBoundaryOrderNames[] = {
+ "UNORDERED",
+ "ASCENDING",
+ "DESCENDING"
+};
+const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kBoundaryOrderValues, _kBoundaryOrderNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val) {
+ std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
+ if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const BoundaryOrder::type& val) {
+ std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
+ if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
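
Annotation (not part of the generated file): every enum in this file gets the same three helpers — a value-to-name map built from parallel arrays via TEnumIterator, an operator<< that falls back to the numeric value for names it does not know (which keeps readers forward-compatible with files written by newer format revisions), and a matching to_string overload. A brief usage sketch:

#include "parquet_types.h"
#include <iostream>

int main() {
  using namespace parquet::format;
  std::cout << to_string(Type::BYTE_ARRAY) << "\n";        // "BYTE_ARRAY"
  std::cout << to_string(CompressionCodec::ZSTD) << "\n";  // "ZSTD"
  // Values outside the map (e.g. from a newer format revision) fall back
  // to their numeric representation:
  std::cout << static_cast<Encoding::type>(42) << "\n";    // "42"
  return 0;
}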
+
+
+Statistics::~Statistics() noexcept {
+}
+
+
+void Statistics::__set_max(const std::string& val) {
+ this->max = val;
+__isset.max = true;
+}
+
+void Statistics::__set_min(const std::string& val) {
+ this->min = val;
+__isset.min = true;
+}
+
+void Statistics::__set_null_count(const int64_t val) {
+ this->null_count = val;
+__isset.null_count = true;
+}
+
+void Statistics::__set_distinct_count(const int64_t val) {
+ this->distinct_count = val;
+__isset.distinct_count = true;
+}
+
+void Statistics::__set_max_value(const std::string& val) {
+ this->max_value = val;
+__isset.max_value = true;
+}
+
+void Statistics::__set_min_value(const std::string& val) {
+ this->min_value = val;
+__isset.min_value = true;
+}
+std::ostream& operator<<(std::ostream& out, const Statistics& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->max);
+ this->__isset.max = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->min);
+ this->__isset.min = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->null_count);
+ this->__isset.null_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->distinct_count);
+ this->__isset.distinct_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->max_value);
+ this->__isset.max_value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->min_value);
+ this->__isset.min_value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("Statistics");
+
+ if (this->__isset.max) {
+ xfer += oprot->writeFieldBegin("max", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->max);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.min) {
+ xfer += oprot->writeFieldBegin("min", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->min);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.null_count) {
+ xfer += oprot->writeFieldBegin("null_count", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->null_count);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.distinct_count) {
+ xfer += oprot->writeFieldBegin("distinct_count", ::apache::thrift::protocol::T_I64, 4);
+ xfer += oprot->writeI64(this->distinct_count);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.max_value) {
+ xfer += oprot->writeFieldBegin("max_value", ::apache::thrift::protocol::T_STRING, 5);
+ xfer += oprot->writeBinary(this->max_value);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.min_value) {
+ xfer += oprot->writeFieldBegin("min_value", ::apache::thrift::protocol::T_STRING, 6);
+ xfer += oprot->writeBinary(this->min_value);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(Statistics &a, Statistics &b) {
+ using ::std::swap;
+ swap(a.max, b.max);
+ swap(a.min, b.min);
+ swap(a.null_count, b.null_count);
+ swap(a.distinct_count, b.distinct_count);
+ swap(a.max_value, b.max_value);
+ swap(a.min_value, b.min_value);
+ swap(a.__isset, b.__isset);
+}
+
+Statistics::Statistics(const Statistics& other0) {
+ max = other0.max;
+ min = other0.min;
+ null_count = other0.null_count;
+ distinct_count = other0.distinct_count;
+ max_value = other0.max_value;
+ min_value = other0.min_value;
+ __isset = other0.__isset;
+}
+Statistics& Statistics::operator=(const Statistics& other1) {
+ max = other1.max;
+ min = other1.min;
+ null_count = other1.null_count;
+ distinct_count = other1.distinct_count;
+ max_value = other1.max_value;
+ min_value = other1.min_value;
+ __isset = other1.__isset;
+ return *this;
+}
+void Statistics::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "Statistics(";
+ out << "max="; (__isset.max ? (out << to_string(max)) : (out << "<null>"));
+ out << ", " << "min="; (__isset.min ? (out << to_string(min)) : (out << "<null>"));
+ out << ", " << "null_count="; (__isset.null_count ? (out << to_string(null_count)) : (out << "<null>"));
+ out << ", " << "distinct_count="; (__isset.distinct_count ? (out << to_string(distinct_count)) : (out << "<null>"));
+ out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "<null>"));
+ out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "<null>"));
+ out << ")";
+}
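
Annotation (not part of the generated file): all six Statistics fields are optional, and min/max versus min_value/max_value are distinct on the wire — fields 1-2 are the legacy statistics whose ordering semantics were underspecified, while fields 5-6 carry the order-aware values defined by the column's ColumnOrder. A construction sketch (the function name and byte strings are illustrative placeholders):

#include "parquet_types.h"  // generated header from this diff
#include <string>

parquet::format::Statistics make_int32_stats() {
  parquet::format::Statistics stats;
  stats.__set_null_count(0);
  // min_value/max_value are raw bytes (readBinary/writeBinary), shown here
  // as little-endian INT32 placeholders:
  stats.__set_min_value(std::string("\x01\x00\x00\x00", 4));
  stats.__set_max_value(std::string("\x09\x00\x00\x00", 4));
  return stats;  // only fields with __isset set are emitted by write()
}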
+
+
+StringType::~StringType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const StringType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t StringType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t StringType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("StringType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(StringType &a, StringType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+StringType::StringType(const StringType& other2) {
+ (void) other2;
+}
+StringType& StringType::operator=(const StringType& other3) {
+ (void) other3;
+ return *this;
+}
+void StringType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "StringType(";
+ out << ")";
+}
+
+
+UUIDType::~UUIDType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const UUIDType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t UUIDType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t UUIDType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("UUIDType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(UUIDType &a, UUIDType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+UUIDType::UUIDType(const UUIDType& other4) {
+ (void) other4;
+}
+UUIDType& UUIDType::operator=(const UUIDType& other5) {
+ (void) other5;
+ return *this;
+}
+void UUIDType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "UUIDType(";
+ out << ")";
+}
+
+
+MapType::~MapType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MapType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MapType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MapType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MapType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MapType &a, MapType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MapType::MapType(const MapType& other6) {
+ (void) other6;
+}
+MapType& MapType::operator=(const MapType& other7) {
+ (void) other7;
+ return *this;
+}
+void MapType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MapType(";
+ out << ")";
+}
+
+
+ListType::~ListType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const ListType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ListType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ListType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ListType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ListType &a, ListType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+ListType::ListType(const ListType& other8) {
+ (void) other8;
+}
+ListType& ListType::operator=(const ListType& other9) {
+ (void) other9;
+ return *this;
+}
+void ListType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ListType(";
+ out << ")";
+}
+
+
+EnumType::~EnumType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const EnumType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EnumType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EnumType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EnumType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EnumType &a, EnumType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+EnumType::EnumType(const EnumType& other10) {
+ (void) other10;
+}
+EnumType& EnumType::operator=(const EnumType& other11) {
+ (void) other11;
+ return *this;
+}
+void EnumType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EnumType(";
+ out << ")";
+}
+
+
+DateType::~DateType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const DateType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DateType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t DateType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DateType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DateType &a, DateType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+DateType::DateType(const DateType& other12) {
+ (void) other12;
+}
+DateType& DateType::operator=(const DateType& other13) {
+ (void) other13;
+ return *this;
+}
+void DateType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DateType(";
+ out << ")";
+}
+
+
+NullType::~NullType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const NullType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t NullType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t NullType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("NullType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(NullType &a, NullType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+NullType::NullType(const NullType& other14) {
+ (void) other14;
+}
+NullType& NullType::operator=(const NullType& other15) {
+ (void) other15;
+ return *this;
+}
+void NullType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "NullType(";
+ out << ")";
+}
+
+
+DecimalType::~DecimalType() noexcept {
+}
+
+
+void DecimalType::__set_scale(const int32_t val) {
+ this->scale = val;
+}
+
+void DecimalType::__set_precision(const int32_t val) {
+ this->precision = val;
+}
+std::ostream& operator<<(std::ostream& out, const DecimalType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DecimalType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_scale = false;
+ bool isset_precision = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->scale);
+ isset_scale = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->precision);
+ isset_precision = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_scale)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_precision)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DecimalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DecimalType");
+
+ xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->scale);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->precision);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DecimalType &a, DecimalType &b) {
+ using ::std::swap;
+ swap(a.scale, b.scale);
+ swap(a.precision, b.precision);
+}
+
+DecimalType::DecimalType(const DecimalType& other16) {
+ scale = other16.scale;
+ precision = other16.precision;
+}
+DecimalType& DecimalType::operator=(const DecimalType& other17) {
+ scale = other17.scale;
+ precision = other17.precision;
+ return *this;
+}
+void DecimalType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DecimalType(";
+ out << "scale=" << to_string(scale);
+ out << ", " << "precision=" << to_string(precision);
+ out << ")";
+}
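
Annotation (not part of the generated file): DecimalType is one of the few logical-type annotations with required payload fields — read() throws unless both scale and precision arrive. A column annotated this way stores unscaled integers interpreted as unscaled × 10^-scale, so with scale=2 the stored value 12345 means 123.45. A construction sketch (the function name is hypothetical):

#include "parquet_types.h"

parquet::format::DecimalType make_decimal_9_2() {
  parquet::format::DecimalType dec;
  dec.__set_scale(2);      // value = unscaled * 10^-2, so 12345 -> 123.45
  dec.__set_precision(9);  // at most 9 decimal digits
  return dec;              // write() emits both; read() rejects missing ones
}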
+
+
+MilliSeconds::~MilliSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MilliSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MilliSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MilliSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MilliSeconds &a, MilliSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MilliSeconds::MilliSeconds(const MilliSeconds& other18) {
+ (void) other18;
+}
+MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other19) {
+ (void) other19;
+ return *this;
+}
+void MilliSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MilliSeconds(";
+ out << ")";
+}
+
+
+MicroSeconds::~MicroSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MicroSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MicroSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MicroSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MicroSeconds &a, MicroSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MicroSeconds::MicroSeconds(const MicroSeconds& other20) {
+ (void) other20;
+}
+MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other21) {
+ (void) other21;
+ return *this;
+}
+void MicroSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MicroSeconds(";
+ out << ")";
+}
+
+
+NanoSeconds::~NanoSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t NanoSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t NanoSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("NanoSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(NanoSeconds &a, NanoSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+NanoSeconds::NanoSeconds(const NanoSeconds& other22) {
+ (void) other22;
+}
+NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other23) {
+ (void) other23;
+ return *this;
+}
+void NanoSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "NanoSeconds(";
+ out << ")";
+}
+
+
+TimeUnit::~TimeUnit() noexcept {
+}
+
+
+void TimeUnit::__set_MILLIS(const MilliSeconds& val) {
+ this->MILLIS = val;
+__isset.MILLIS = true;
+}
+
+void TimeUnit::__set_MICROS(const MicroSeconds& val) {
+ this->MICROS = val;
+__isset.MICROS = true;
+}
+
+void TimeUnit::__set_NANOS(const NanoSeconds& val) {
+ this->NANOS = val;
+__isset.NANOS = true;
+}
+std::ostream& operator<<(std::ostream& out, const TimeUnit& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimeUnit::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MILLIS.read(iprot);
+ this->__isset.MILLIS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MICROS.read(iprot);
+ this->__isset.MICROS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->NANOS.read(iprot);
+ this->__isset.NANOS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t TimeUnit::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimeUnit");
+
+ if (this->__isset.MILLIS) {
+ xfer += oprot->writeFieldBegin("MILLIS", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->MILLIS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.MICROS) {
+ xfer += oprot->writeFieldBegin("MICROS", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->MICROS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.NANOS) {
+ xfer += oprot->writeFieldBegin("NANOS", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->NANOS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimeUnit &a, TimeUnit &b) {
+ using ::std::swap;
+ swap(a.MILLIS, b.MILLIS);
+ swap(a.MICROS, b.MICROS);
+ swap(a.NANOS, b.NANOS);
+ swap(a.__isset, b.__isset);
+}
+
+TimeUnit::TimeUnit(const TimeUnit& other24) {
+ MILLIS = other24.MILLIS;
+ MICROS = other24.MICROS;
+ NANOS = other24.NANOS;
+ __isset = other24.__isset;
+}
+TimeUnit& TimeUnit::operator=(const TimeUnit& other25) {
+ MILLIS = other25.MILLIS;
+ MICROS = other25.MICROS;
+ NANOS = other25.NANOS;
+ __isset = other25.__isset;
+ return *this;
+}
+void TimeUnit::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimeUnit(";
+ out << "MILLIS="; (__isset.MILLIS ? (out << to_string(MILLIS)) : (out << "<null>"));
+ out << ", " << "MICROS="; (__isset.MICROS ? (out << to_string(MICROS)) : (out << "<null>"));
+ out << ", " << "NANOS="; (__isset.NANOS ? (out << to_string(NANOS)) : (out << "<null>"));
+ out << ")";
+}
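
Annotation (not part of the generated file): TimeUnit is another union-style struct — MilliSeconds, MicroSeconds and NanoSeconds are empty marker structs, and the populated __isset member is what actually selects the unit. TimestampType and TimeType below each embed one. A sketch composing a microsecond, UTC-adjusted timestamp annotation (the function name is hypothetical):

#include "parquet_types.h"

parquet::format::TimestampType micros_utc_timestamp() {
  parquet::format::TimeUnit unit;
  unit.__set_MICROS(parquet::format::MicroSeconds());  // __isset.MICROS selects the unit
  parquet::format::TimestampType ts;
  ts.__set_isAdjustedToUTC(true);
  ts.__set_unit(unit);
  return ts;
}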
+
+
+TimestampType::~TimestampType() noexcept {
+}
+
+
+void TimestampType::__set_isAdjustedToUTC(const bool val) {
+ this->isAdjustedToUTC = val;
+}
+
+void TimestampType::__set_unit(const TimeUnit& val) {
+ this->unit = val;
+}
+std::ostream& operator<<(std::ostream& out, const TimestampType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimestampType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_isAdjustedToUTC = false;
+ bool isset_unit = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isAdjustedToUTC);
+ isset_isAdjustedToUTC = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->unit.read(iprot);
+ isset_unit = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_isAdjustedToUTC)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_unit)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t TimestampType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimestampType");
+
+ xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
+ xfer += oprot->writeBool(this->isAdjustedToUTC);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->unit.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimestampType &a, TimestampType &b) {
+ using ::std::swap;
+ swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
+ swap(a.unit, b.unit);
+}
+
+TimestampType::TimestampType(const TimestampType& other26) {
+ isAdjustedToUTC = other26.isAdjustedToUTC;
+ unit = other26.unit;
+}
+TimestampType& TimestampType::operator=(const TimestampType& other27) {
+ isAdjustedToUTC = other27.isAdjustedToUTC;
+ unit = other27.unit;
+ return *this;
+}
+void TimestampType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimestampType(";
+ out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
+ out << ", " << "unit=" << to_string(unit);
+ out << ")";
+}
+
+
+TimeType::~TimeType() noexcept {
+}
+
+
+void TimeType::__set_isAdjustedToUTC(const bool val) {
+ this->isAdjustedToUTC = val;
+}
+
+void TimeType::__set_unit(const TimeUnit& val) {
+ this->unit = val;
+}
+std::ostream& operator<<(std::ostream& out, const TimeType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimeType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_isAdjustedToUTC = false;
+ bool isset_unit = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isAdjustedToUTC);
+ isset_isAdjustedToUTC = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->unit.read(iprot);
+ isset_unit = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_isAdjustedToUTC)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_unit)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t TimeType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimeType");
+
+ xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
+ xfer += oprot->writeBool(this->isAdjustedToUTC);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->unit.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimeType &a, TimeType &b) {
+ using ::std::swap;
+ swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
+ swap(a.unit, b.unit);
+}
+
+TimeType::TimeType(const TimeType& other28) {
+ isAdjustedToUTC = other28.isAdjustedToUTC;
+ unit = other28.unit;
+}
+TimeType& TimeType::operator=(const TimeType& other29) {
+ isAdjustedToUTC = other29.isAdjustedToUTC;
+ unit = other29.unit;
+ return *this;
+}
+void TimeType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimeType(";
+ out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
+ out << ", " << "unit=" << to_string(unit);
+ out << ")";
+}
+
+
+IntType::~IntType() noexcept {
+}
+
+
+void IntType::__set_bitWidth(const int8_t val) {
+ this->bitWidth = val;
+}
+
+void IntType::__set_isSigned(const bool val) {
+ this->isSigned = val;
+}
+std::ostream& operator<<(std::ostream& out, const IntType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t IntType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_bitWidth = false;
+ bool isset_isSigned = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BYTE) {
+ xfer += iprot->readByte(this->bitWidth);
+ isset_bitWidth = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isSigned);
+ isset_isSigned = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_bitWidth)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_isSigned)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t IntType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("IntType");
+
+ xfer += oprot->writeFieldBegin("bitWidth", ::apache::thrift::protocol::T_BYTE, 1);
+ xfer += oprot->writeByte(this->bitWidth);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("isSigned", ::apache::thrift::protocol::T_BOOL, 2);
+ xfer += oprot->writeBool(this->isSigned);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(IntType &a, IntType &b) {
+ using ::std::swap;
+ swap(a.bitWidth, b.bitWidth);
+ swap(a.isSigned, b.isSigned);
+}
+
+IntType::IntType(const IntType& other30) {
+ bitWidth = other30.bitWidth;
+ isSigned = other30.isSigned;
+}
+IntType& IntType::operator=(const IntType& other31) {
+ bitWidth = other31.bitWidth;
+ isSigned = other31.isSigned;
+ return *this;
+}
+void IntType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "IntType(";
+ out << "bitWidth=" << to_string(bitWidth);
+ out << ", " << "isSigned=" << to_string(isSigned);
+ out << ")";
+}
+
+
+JsonType::~JsonType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const JsonType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
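+// JsonType is a field-less marker struct: read() just consumes and skips
+// whatever is on the wire until T_STOP. BsonType, IndexPageHeader,
+// SplitBlockAlgorithm, XxHash and Uncompressed below have the same shape.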
+uint32_t JsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t JsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("JsonType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(JsonType &a, JsonType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+JsonType::JsonType(const JsonType& other32) {
+ (void) other32;
+}
+JsonType& JsonType::operator=(const JsonType& other33) {
+ (void) other33;
+ return *this;
+}
+void JsonType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "JsonType(";
+ out << ")";
+}
+
+
+BsonType::~BsonType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const BsonType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BsonType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BsonType &a, BsonType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+BsonType::BsonType(const BsonType& other34) {
+ (void) other34;
+}
+BsonType& BsonType::operator=(const BsonType& other35) {
+ (void) other35;
+ return *this;
+}
+void BsonType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BsonType(";
+ out << ")";
+}
+
+
+LogicalType::~LogicalType() noexcept {
+}
+
+
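+// LogicalType is a Thrift union lowered to a struct: exactly one alternative
+// is expected to be set at a time. Each setter records its choice in the
+// matching __isset flag, and write() emits only flagged fields.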
+void LogicalType::__set_STRING(const StringType& val) {
+  this->STRING = val;
+  __isset.STRING = true;
+}
+
+void LogicalType::__set_MAP(const MapType& val) {
+  this->MAP = val;
+  __isset.MAP = true;
+}
+
+void LogicalType::__set_LIST(const ListType& val) {
+  this->LIST = val;
+  __isset.LIST = true;
+}
+
+void LogicalType::__set_ENUM(const EnumType& val) {
+  this->ENUM = val;
+  __isset.ENUM = true;
+}
+
+void LogicalType::__set_DECIMAL(const DecimalType& val) {
+  this->DECIMAL = val;
+  __isset.DECIMAL = true;
+}
+
+void LogicalType::__set_DATE(const DateType& val) {
+  this->DATE = val;
+  __isset.DATE = true;
+}
+
+void LogicalType::__set_TIME(const TimeType& val) {
+  this->TIME = val;
+  __isset.TIME = true;
+}
+
+void LogicalType::__set_TIMESTAMP(const TimestampType& val) {
+  this->TIMESTAMP = val;
+  __isset.TIMESTAMP = true;
+}
+
+void LogicalType::__set_INTEGER(const IntType& val) {
+  this->INTEGER = val;
+  __isset.INTEGER = true;
+}
+
+void LogicalType::__set_UNKNOWN(const NullType& val) {
+  this->UNKNOWN = val;
+  __isset.UNKNOWN = true;
+}
+
+void LogicalType::__set_JSON(const JsonType& val) {
+  this->JSON = val;
+  __isset.JSON = true;
+}
+
+void LogicalType::__set_BSON(const BsonType& val) {
+  this->BSON = val;
+  __isset.BSON = true;
+}
+
+void LogicalType::__set_UUID(const UUIDType& val) {
+  this->UUID = val;
+  __isset.UUID = true;
+}
+std::ostream& operator<<(std::ostream& out, const LogicalType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t LogicalType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->STRING.read(iprot);
+ this->__isset.STRING = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MAP.read(iprot);
+ this->__isset.MAP = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->LIST.read(iprot);
+ this->__isset.LIST = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENUM.read(iprot);
+ this->__isset.ENUM = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->DECIMAL.read(iprot);
+ this->__isset.DECIMAL = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->DATE.read(iprot);
+ this->__isset.DATE = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TIME.read(iprot);
+ this->__isset.TIME = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TIMESTAMP.read(iprot);
+ this->__isset.TIMESTAMP = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->INTEGER.read(iprot);
+ this->__isset.INTEGER = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 11:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UNKNOWN.read(iprot);
+ this->__isset.UNKNOWN = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 12:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->JSON.read(iprot);
+ this->__isset.JSON = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 13:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->BSON.read(iprot);
+ this->__isset.BSON = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 14:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UUID.read(iprot);
+ this->__isset.UUID = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t LogicalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("LogicalType");
+
+ if (this->__isset.STRING) {
+ xfer += oprot->writeFieldBegin("STRING", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->STRING.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.MAP) {
+ xfer += oprot->writeFieldBegin("MAP", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->MAP.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.LIST) {
+ xfer += oprot->writeFieldBegin("LIST", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->LIST.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ENUM) {
+ xfer += oprot->writeFieldBegin("ENUM", ::apache::thrift::protocol::T_STRUCT, 4);
+ xfer += this->ENUM.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.DECIMAL) {
+ xfer += oprot->writeFieldBegin("DECIMAL", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->DECIMAL.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.DATE) {
+ xfer += oprot->writeFieldBegin("DATE", ::apache::thrift::protocol::T_STRUCT, 6);
+ xfer += this->DATE.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.TIME) {
+ xfer += oprot->writeFieldBegin("TIME", ::apache::thrift::protocol::T_STRUCT, 7);
+ xfer += this->TIME.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.TIMESTAMP) {
+ xfer += oprot->writeFieldBegin("TIMESTAMP", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->TIMESTAMP.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.INTEGER) {
+ xfer += oprot->writeFieldBegin("INTEGER", ::apache::thrift::protocol::T_STRUCT, 10);
+ xfer += this->INTEGER.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.UNKNOWN) {
+ xfer += oprot->writeFieldBegin("UNKNOWN", ::apache::thrift::protocol::T_STRUCT, 11);
+ xfer += this->UNKNOWN.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.JSON) {
+ xfer += oprot->writeFieldBegin("JSON", ::apache::thrift::protocol::T_STRUCT, 12);
+ xfer += this->JSON.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.BSON) {
+ xfer += oprot->writeFieldBegin("BSON", ::apache::thrift::protocol::T_STRUCT, 13);
+ xfer += this->BSON.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.UUID) {
+ xfer += oprot->writeFieldBegin("UUID", ::apache::thrift::protocol::T_STRUCT, 14);
+ xfer += this->UUID.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(LogicalType &a, LogicalType &b) {
+ using ::std::swap;
+ swap(a.STRING, b.STRING);
+ swap(a.MAP, b.MAP);
+ swap(a.LIST, b.LIST);
+ swap(a.ENUM, b.ENUM);
+ swap(a.DECIMAL, b.DECIMAL);
+ swap(a.DATE, b.DATE);
+ swap(a.TIME, b.TIME);
+ swap(a.TIMESTAMP, b.TIMESTAMP);
+ swap(a.INTEGER, b.INTEGER);
+ swap(a.UNKNOWN, b.UNKNOWN);
+ swap(a.JSON, b.JSON);
+ swap(a.BSON, b.BSON);
+ swap(a.UUID, b.UUID);
+ swap(a.__isset, b.__isset);
+}
+
+LogicalType::LogicalType(const LogicalType& other36) {
+ STRING = other36.STRING;
+ MAP = other36.MAP;
+ LIST = other36.LIST;
+ ENUM = other36.ENUM;
+ DECIMAL = other36.DECIMAL;
+ DATE = other36.DATE;
+ TIME = other36.TIME;
+ TIMESTAMP = other36.TIMESTAMP;
+ INTEGER = other36.INTEGER;
+ UNKNOWN = other36.UNKNOWN;
+ JSON = other36.JSON;
+ BSON = other36.BSON;
+ UUID = other36.UUID;
+ __isset = other36.__isset;
+}
+LogicalType& LogicalType::operator=(const LogicalType& other37) {
+ STRING = other37.STRING;
+ MAP = other37.MAP;
+ LIST = other37.LIST;
+ ENUM = other37.ENUM;
+ DECIMAL = other37.DECIMAL;
+ DATE = other37.DATE;
+ TIME = other37.TIME;
+ TIMESTAMP = other37.TIMESTAMP;
+ INTEGER = other37.INTEGER;
+ UNKNOWN = other37.UNKNOWN;
+ JSON = other37.JSON;
+ BSON = other37.BSON;
+ UUID = other37.UUID;
+ __isset = other37.__isset;
+ return *this;
+}
+void LogicalType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "LogicalType(";
+ out << "STRING="; (__isset.STRING ? (out << to_string(STRING)) : (out << "<null>"));
+ out << ", " << "MAP="; (__isset.MAP ? (out << to_string(MAP)) : (out << "<null>"));
+ out << ", " << "LIST="; (__isset.LIST ? (out << to_string(LIST)) : (out << "<null>"));
+ out << ", " << "ENUM="; (__isset.ENUM ? (out << to_string(ENUM)) : (out << "<null>"));
+ out << ", " << "DECIMAL="; (__isset.DECIMAL ? (out << to_string(DECIMAL)) : (out << "<null>"));
+ out << ", " << "DATE="; (__isset.DATE ? (out << to_string(DATE)) : (out << "<null>"));
+ out << ", " << "TIME="; (__isset.TIME ? (out << to_string(TIME)) : (out << "<null>"));
+ out << ", " << "TIMESTAMP="; (__isset.TIMESTAMP ? (out << to_string(TIMESTAMP)) : (out << "<null>"));
+ out << ", " << "INTEGER="; (__isset.INTEGER ? (out << to_string(INTEGER)) : (out << "<null>"));
+ out << ", " << "UNKNOWN="; (__isset.UNKNOWN ? (out << to_string(UNKNOWN)) : (out << "<null>"));
+ out << ", " << "JSON="; (__isset.JSON ? (out << to_string(JSON)) : (out << "<null>"));
+ out << ", " << "BSON="; (__isset.BSON ? (out << to_string(BSON)) : (out << "<null>"));
+ out << ", " << "UUID="; (__isset.UUID ? (out << to_string(UUID)) : (out << "<null>"));
+ out << ")";
+}
+
+
+SchemaElement::~SchemaElement() noexcept {
+}
+
+
+void SchemaElement::__set_type(const Type::type val) {
+  this->type = val;
+  __isset.type = true;
+}
+
+void SchemaElement::__set_type_length(const int32_t val) {
+  this->type_length = val;
+  __isset.type_length = true;
+}
+
+void SchemaElement::__set_repetition_type(const FieldRepetitionType::type val) {
+  this->repetition_type = val;
+  __isset.repetition_type = true;
+}
+
+void SchemaElement::__set_name(const std::string& val) {
+  this->name = val;
+}
+
+void SchemaElement::__set_num_children(const int32_t val) {
+  this->num_children = val;
+  __isset.num_children = true;
+}
+
+void SchemaElement::__set_converted_type(const ConvertedType::type val) {
+  this->converted_type = val;
+  __isset.converted_type = true;
+}
+
+void SchemaElement::__set_scale(const int32_t val) {
+  this->scale = val;
+  __isset.scale = true;
+}
+
+void SchemaElement::__set_precision(const int32_t val) {
+  this->precision = val;
+  __isset.precision = true;
+}
+
+void SchemaElement::__set_field_id(const int32_t val) {
+  this->field_id = val;
+  __isset.field_id = true;
+}
+
+void SchemaElement::__set_logicalType(const LogicalType& val) {
+  this->logicalType = val;
+  __isset.logicalType = true;
+}
+std::ostream& operator<<(std::ostream& out, const SchemaElement& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
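+// SchemaElement is one node of the flattened Parquet schema tree. Only `name`
+// (field 4) is required; enum-valued fields arrive as i32 and are cast through
+// ecastNN temporaries without range checking.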
+uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_name = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast38;
+ xfer += iprot->readI32(ecast38);
+ this->type = (Type::type)ecast38;
+ this->__isset.type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->type_length);
+ this->__isset.type_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast39;
+ xfer += iprot->readI32(ecast39);
+ this->repetition_type = (FieldRepetitionType::type)ecast39;
+ this->__isset.repetition_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->name);
+ isset_name = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_children);
+ this->__isset.num_children = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast40;
+ xfer += iprot->readI32(ecast40);
+ this->converted_type = (ConvertedType::type)ecast40;
+ this->__isset.converted_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->scale);
+ this->__isset.scale = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->precision);
+ this->__isset.precision = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->field_id);
+ this->__isset.field_id = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->logicalType.read(iprot);
+ this->__isset.logicalType = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_name)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t SchemaElement::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SchemaElement");
+
+ if (this->__isset.type) {
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.type_length) {
+ xfer += oprot->writeFieldBegin("type_length", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->type_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.repetition_type) {
+ xfer += oprot->writeFieldBegin("repetition_type", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32((int32_t)this->repetition_type);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("name", ::apache::thrift::protocol::T_STRING, 4);
+ xfer += oprot->writeString(this->name);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.num_children) {
+ xfer += oprot->writeFieldBegin("num_children", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->num_children);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.converted_type) {
+ xfer += oprot->writeFieldBegin("converted_type", ::apache::thrift::protocol::T_I32, 6);
+ xfer += oprot->writeI32((int32_t)this->converted_type);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.scale) {
+ xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 7);
+ xfer += oprot->writeI32(this->scale);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.precision) {
+ xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 8);
+ xfer += oprot->writeI32(this->precision);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.field_id) {
+ xfer += oprot->writeFieldBegin("field_id", ::apache::thrift::protocol::T_I32, 9);
+ xfer += oprot->writeI32(this->field_id);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.logicalType) {
+ xfer += oprot->writeFieldBegin("logicalType", ::apache::thrift::protocol::T_STRUCT, 10);
+ xfer += this->logicalType.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SchemaElement &a, SchemaElement &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.type_length, b.type_length);
+ swap(a.repetition_type, b.repetition_type);
+ swap(a.name, b.name);
+ swap(a.num_children, b.num_children);
+ swap(a.converted_type, b.converted_type);
+ swap(a.scale, b.scale);
+ swap(a.precision, b.precision);
+ swap(a.field_id, b.field_id);
+ swap(a.logicalType, b.logicalType);
+ swap(a.__isset, b.__isset);
+}
+
+SchemaElement::SchemaElement(const SchemaElement& other41) {
+ type = other41.type;
+ type_length = other41.type_length;
+ repetition_type = other41.repetition_type;
+ name = other41.name;
+ num_children = other41.num_children;
+ converted_type = other41.converted_type;
+ scale = other41.scale;
+ precision = other41.precision;
+ field_id = other41.field_id;
+ logicalType = other41.logicalType;
+ __isset = other41.__isset;
+}
+SchemaElement& SchemaElement::operator=(const SchemaElement& other42) {
+ type = other42.type;
+ type_length = other42.type_length;
+ repetition_type = other42.repetition_type;
+ name = other42.name;
+ num_children = other42.num_children;
+ converted_type = other42.converted_type;
+ scale = other42.scale;
+ precision = other42.precision;
+ field_id = other42.field_id;
+ logicalType = other42.logicalType;
+ __isset = other42.__isset;
+ return *this;
+}
+void SchemaElement::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SchemaElement(";
+ out << "type="; (__isset.type ? (out << to_string(type)) : (out << "<null>"));
+ out << ", " << "type_length="; (__isset.type_length ? (out << to_string(type_length)) : (out << "<null>"));
+ out << ", " << "repetition_type="; (__isset.repetition_type ? (out << to_string(repetition_type)) : (out << "<null>"));
+ out << ", " << "name=" << to_string(name);
+ out << ", " << "num_children="; (__isset.num_children ? (out << to_string(num_children)) : (out << "<null>"));
+ out << ", " << "converted_type="; (__isset.converted_type ? (out << to_string(converted_type)) : (out << "<null>"));
+ out << ", " << "scale="; (__isset.scale ? (out << to_string(scale)) : (out << "<null>"));
+ out << ", " << "precision="; (__isset.precision ? (out << to_string(precision)) : (out << "<null>"));
+ out << ", " << "field_id="; (__isset.field_id ? (out << to_string(field_id)) : (out << "<null>"));
+ out << ", " << "logicalType="; (__isset.logicalType ? (out << to_string(logicalType)) : (out << "<null>"));
+ out << ")";
+}
+
+
+DataPageHeader::~DataPageHeader() noexcept {
+}
+
+
+void DataPageHeader::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DataPageHeader::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DataPageHeader::__set_definition_level_encoding(const Encoding::type val) {
+ this->definition_level_encoding = val;
+}
+
+void DataPageHeader::__set_repetition_level_encoding(const Encoding::type val) {
+ this->repetition_level_encoding = val;
+}
+
+void DataPageHeader::__set_statistics(const Statistics& val) {
+  this->statistics = val;
+  __isset.statistics = true;
+}
+std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
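+// V1 data page header: num_values and all three encodings are required;
+// statistics is the only optional field.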
+uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_encoding = false;
+ bool isset_definition_level_encoding = false;
+ bool isset_repetition_level_encoding = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast43;
+ xfer += iprot->readI32(ecast43);
+ this->encoding = (Encoding::type)ecast43;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast44;
+ xfer += iprot->readI32(ecast44);
+ this->definition_level_encoding = (Encoding::type)ecast44;
+ isset_definition_level_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast45;
+ xfer += iprot->readI32(ecast45);
+ this->repetition_level_encoding = (Encoding::type)ecast45;
+ isset_repetition_level_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_definition_level_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_repetition_level_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DataPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DataPageHeader");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("definition_level_encoding", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32((int32_t)this->definition_level_encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("repetition_level_encoding", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->repetition_level_encoding);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DataPageHeader &a, DataPageHeader &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.encoding, b.encoding);
+ swap(a.definition_level_encoding, b.definition_level_encoding);
+ swap(a.repetition_level_encoding, b.repetition_level_encoding);
+ swap(a.statistics, b.statistics);
+ swap(a.__isset, b.__isset);
+}
+
+DataPageHeader::DataPageHeader(const DataPageHeader& other46) {
+ num_values = other46.num_values;
+ encoding = other46.encoding;
+ definition_level_encoding = other46.definition_level_encoding;
+ repetition_level_encoding = other46.repetition_level_encoding;
+ statistics = other46.statistics;
+ __isset = other46.__isset;
+}
+DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other47) {
+ num_values = other47.num_values;
+ encoding = other47.encoding;
+ definition_level_encoding = other47.definition_level_encoding;
+ repetition_level_encoding = other47.repetition_level_encoding;
+ statistics = other47.statistics;
+ __isset = other47.__isset;
+ return *this;
+}
+void DataPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DataPageHeader(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "definition_level_encoding=" << to_string(definition_level_encoding);
+ out << ", " << "repetition_level_encoding=" << to_string(repetition_level_encoding);
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ")";
+}
+
+
+IndexPageHeader::~IndexPageHeader() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t IndexPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t IndexPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("IndexPageHeader");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(IndexPageHeader &a, IndexPageHeader &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+IndexPageHeader::IndexPageHeader(const IndexPageHeader& other48) {
+ (void) other48;
+}
+IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other49) {
+ (void) other49;
+ return *this;
+}
+void IndexPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "IndexPageHeader(";
+ out << ")";
+}
+
+
+DictionaryPageHeader::~DictionaryPageHeader() noexcept {
+}
+
+
+void DictionaryPageHeader::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DictionaryPageHeader::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DictionaryPageHeader::__set_is_sorted(const bool val) {
+  this->is_sorted = val;
+  __isset.is_sorted = true;
+}
+std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
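+// Dictionary page header: num_values and encoding are required; is_sorted is
+// optional and tracked through __isset.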
+uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_encoding = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast50;
+ xfer += iprot->readI32(ecast50);
+ this->encoding = (Encoding::type)ecast50;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->is_sorted);
+ this->__isset.is_sorted = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DictionaryPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DictionaryPageHeader");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.is_sorted) {
+ xfer += oprot->writeFieldBegin("is_sorted", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->is_sorted);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.encoding, b.encoding);
+ swap(a.is_sorted, b.is_sorted);
+ swap(a.__isset, b.__isset);
+}
+
+DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other51) {
+ num_values = other51.num_values;
+ encoding = other51.encoding;
+ is_sorted = other51.is_sorted;
+ __isset = other51.__isset;
+}
+DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other52) {
+ num_values = other52.num_values;
+ encoding = other52.encoding;
+ is_sorted = other52.is_sorted;
+ __isset = other52.__isset;
+ return *this;
+}
+void DictionaryPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DictionaryPageHeader(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "is_sorted="; (__isset.is_sorted ? (out << to_string(is_sorted)) : (out << "<null>"));
+ out << ")";
+}
+
+
+DataPageHeaderV2::~DataPageHeaderV2() noexcept {
+}
+
+
+void DataPageHeaderV2::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DataPageHeaderV2::__set_num_nulls(const int32_t val) {
+ this->num_nulls = val;
+}
+
+void DataPageHeaderV2::__set_num_rows(const int32_t val) {
+ this->num_rows = val;
+}
+
+void DataPageHeaderV2::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DataPageHeaderV2::__set_definition_levels_byte_length(const int32_t val) {
+ this->definition_levels_byte_length = val;
+}
+
+void DataPageHeaderV2::__set_repetition_levels_byte_length(const int32_t val) {
+ this->repetition_levels_byte_length = val;
+}
+
+void DataPageHeaderV2::__set_is_compressed(const bool val) {
+  this->is_compressed = val;
+  __isset.is_compressed = true;
+}
+
+void DataPageHeaderV2::__set_statistics(const Statistics& val) {
+  this->statistics = val;
+  __isset.statistics = true;
+}
+std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
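+// V2 data page header: unlike V1 it records the exact byte length of the
+// definition and repetition level runs, and is_compressed refers to the data
+// section only (levels in a V2 page are stored uncompressed).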
+uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_num_nulls = false;
+ bool isset_num_rows = false;
+ bool isset_encoding = false;
+ bool isset_definition_levels_byte_length = false;
+ bool isset_repetition_levels_byte_length = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_nulls);
+ isset_num_nulls = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast53;
+ xfer += iprot->readI32(ecast53);
+ this->encoding = (Encoding::type)ecast53;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->definition_levels_byte_length);
+ isset_definition_levels_byte_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->repetition_levels_byte_length);
+ isset_repetition_levels_byte_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->is_compressed);
+ this->__isset.is_compressed = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_nulls)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_definition_levels_byte_length)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_repetition_levels_byte_length)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DataPageHeaderV2::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DataPageHeaderV2");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_nulls", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->num_nulls);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("definition_levels_byte_length", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->definition_levels_byte_length);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("repetition_levels_byte_length", ::apache::thrift::protocol::T_I32, 6);
+ xfer += oprot->writeI32(this->repetition_levels_byte_length);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.is_compressed) {
+ xfer += oprot->writeFieldBegin("is_compressed", ::apache::thrift::protocol::T_BOOL, 7);
+ xfer += oprot->writeBool(this->is_compressed);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.num_nulls, b.num_nulls);
+ swap(a.num_rows, b.num_rows);
+ swap(a.encoding, b.encoding);
+ swap(a.definition_levels_byte_length, b.definition_levels_byte_length);
+ swap(a.repetition_levels_byte_length, b.repetition_levels_byte_length);
+ swap(a.is_compressed, b.is_compressed);
+ swap(a.statistics, b.statistics);
+ swap(a.__isset, b.__isset);
+}
+
+DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other54) {
+ num_values = other54.num_values;
+ num_nulls = other54.num_nulls;
+ num_rows = other54.num_rows;
+ encoding = other54.encoding;
+ definition_levels_byte_length = other54.definition_levels_byte_length;
+ repetition_levels_byte_length = other54.repetition_levels_byte_length;
+ is_compressed = other54.is_compressed;
+ statistics = other54.statistics;
+ __isset = other54.__isset;
+}
+DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other55) {
+ num_values = other55.num_values;
+ num_nulls = other55.num_nulls;
+ num_rows = other55.num_rows;
+ encoding = other55.encoding;
+ definition_levels_byte_length = other55.definition_levels_byte_length;
+ repetition_levels_byte_length = other55.repetition_levels_byte_length;
+ is_compressed = other55.is_compressed;
+ statistics = other55.statistics;
+ __isset = other55.__isset;
+ return *this;
+}
+void DataPageHeaderV2::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DataPageHeaderV2(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "num_nulls=" << to_string(num_nulls);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "definition_levels_byte_length=" << to_string(definition_levels_byte_length);
+ out << ", " << "repetition_levels_byte_length=" << to_string(repetition_levels_byte_length);
+ out << ", " << "is_compressed="; (__isset.is_compressed ? (out << to_string(is_compressed)) : (out << "<null>"));
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ")";
+}
+
+
+SplitBlockAlgorithm::~SplitBlockAlgorithm() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t SplitBlockAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t SplitBlockAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SplitBlockAlgorithm");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other56) {
+ (void) other56;
+}
+SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other57) {
+ (void) other57;
+ return *this;
+}
+void SplitBlockAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SplitBlockAlgorithm(";
+ out << ")";
+}
+
+
+BloomFilterAlgorithm::~BloomFilterAlgorithm() noexcept {
+}
+
+
+void BloomFilterAlgorithm::__set_BLOCK(const SplitBlockAlgorithm& val) {
+  this->BLOCK = val;
+  __isset.BLOCK = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->BLOCK.read(iprot);
+ this->__isset.BLOCK = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterAlgorithm");
+
+ if (this->__isset.BLOCK) {
+ xfer += oprot->writeFieldBegin("BLOCK", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->BLOCK.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) {
+ using ::std::swap;
+ swap(a.BLOCK, b.BLOCK);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other58) {
+ BLOCK = other58.BLOCK;
+ __isset = other58.__isset;
+}
+BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other59) {
+ BLOCK = other59.BLOCK;
+ __isset = other59.__isset;
+ return *this;
+}
+void BloomFilterAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterAlgorithm(";
+ out << "BLOCK="; (__isset.BLOCK ? (out << to_string(BLOCK)) : (out << "<null>"));
+ out << ")";
+}
+
+
+XxHash::~XxHash() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const XxHash& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t XxHash::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t XxHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("XxHash");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(XxHash &a, XxHash &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+XxHash::XxHash(const XxHash& other60) {
+ (void) other60;
+}
+XxHash& XxHash::operator=(const XxHash& other61) {
+ (void) other61;
+ return *this;
+}
+void XxHash::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "XxHash(";
+ out << ")";
+}
+
+
+BloomFilterHash::~BloomFilterHash() noexcept {
+}
+
+
+void BloomFilterHash::__set_XXHASH(const XxHash& val) {
+  this->XXHASH = val;
+  __isset.XXHASH = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterHash::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->XXHASH.read(iprot);
+ this->__isset.XXHASH = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterHash");
+
+ if (this->__isset.XXHASH) {
+ xfer += oprot->writeFieldBegin("XXHASH", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->XXHASH.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterHash &a, BloomFilterHash &b) {
+ using ::std::swap;
+ swap(a.XXHASH, b.XXHASH);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterHash::BloomFilterHash(const BloomFilterHash& other62) {
+ XXHASH = other62.XXHASH;
+ __isset = other62.__isset;
+}
+BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other63) {
+ XXHASH = other63.XXHASH;
+ __isset = other63.__isset;
+ return *this;
+}
+void BloomFilterHash::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterHash(";
+ out << "XXHASH="; (__isset.XXHASH ? (out << to_string(XXHASH)) : (out << "<null>"));
+ out << ")";
+}
+
+
+Uncompressed::~Uncompressed() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const Uncompressed& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t Uncompressed::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t Uncompressed::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("Uncompressed");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(Uncompressed &a, Uncompressed &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+Uncompressed::Uncompressed(const Uncompressed& other64) {
+ (void) other64;
+}
+Uncompressed& Uncompressed::operator=(const Uncompressed& other65) {
+ (void) other65;
+ return *this;
+}
+void Uncompressed::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "Uncompressed(";
+ out << ")";
+}
+
+
+BloomFilterCompression::~BloomFilterCompression() noexcept {
+}
+
+
+void BloomFilterCompression::__set_UNCOMPRESSED(const Uncompressed& val) {
+  this->UNCOMPRESSED = val;
+  __isset.UNCOMPRESSED = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterCompression::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UNCOMPRESSED.read(iprot);
+ this->__isset.UNCOMPRESSED = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterCompression::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterCompression");
+
+ if (this->__isset.UNCOMPRESSED) {
+ xfer += oprot->writeFieldBegin("UNCOMPRESSED", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->UNCOMPRESSED.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterCompression &a, BloomFilterCompression &b) {
+ using ::std::swap;
+ swap(a.UNCOMPRESSED, b.UNCOMPRESSED);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other66) {
+ UNCOMPRESSED = other66.UNCOMPRESSED;
+ __isset = other66.__isset;
+}
+BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other67) {
+ UNCOMPRESSED = other67.UNCOMPRESSED;
+ __isset = other67.__isset;
+ return *this;
+}
+void BloomFilterCompression::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterCompression(";
+ out << "UNCOMPRESSED="; (__isset.UNCOMPRESSED ? (out << to_string(UNCOMPRESSED)) : (out << "<null>"));
+ out << ")";
+}
+
+
+BloomFilterHeader::~BloomFilterHeader() noexcept {
+}
+
+
+void BloomFilterHeader::__set_numBytes(const int32_t val) {
+ this->numBytes = val;
+}
+
+void BloomFilterHeader::__set_algorithm(const BloomFilterAlgorithm& val) {
+ this->algorithm = val;
+}
+
+void BloomFilterHeader::__set_hash(const BloomFilterHash& val) {
+ this->hash = val;
+}
+
+void BloomFilterHeader::__set_compression(const BloomFilterCompression& val) {
+ this->compression = val;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
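+// Bloom filter page header: all four fields are required. algorithm, hash and
+// compression are union-style wrappers; the only alternatives defined in this
+// file are SplitBlockAlgorithm, XxHash and Uncompressed.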
+uint32_t BloomFilterHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_numBytes = false;
+ bool isset_algorithm = false;
+ bool isset_hash = false;
+ bool isset_compression = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->numBytes);
+ isset_numBytes = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->algorithm.read(iprot);
+ isset_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->hash.read(iprot);
+ isset_hash = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->compression.read(iprot);
+ isset_compression = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_numBytes)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_algorithm)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_hash)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compression)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t BloomFilterHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterHeader");
+
+ xfer += oprot->writeFieldBegin("numBytes", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->numBytes);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("algorithm", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("hash", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->hash.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compression", ::apache::thrift::protocol::T_STRUCT, 4);
+ xfer += this->compression.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterHeader &a, BloomFilterHeader &b) {
+ using ::std::swap;
+ swap(a.numBytes, b.numBytes);
+ swap(a.algorithm, b.algorithm);
+ swap(a.hash, b.hash);
+ swap(a.compression, b.compression);
+}
+
+BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other68) {
+ numBytes = other68.numBytes;
+ algorithm = other68.algorithm;
+ hash = other68.hash;
+ compression = other68.compression;
+}
+BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other69) {
+ numBytes = other69.numBytes;
+ algorithm = other69.algorithm;
+ hash = other69.hash;
+ compression = other69.compression;
+ return *this;
+}
+void BloomFilterHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterHeader(";
+ out << "numBytes=" << to_string(numBytes);
+ out << ", " << "algorithm=" << to_string(algorithm);
+ out << ", " << "hash=" << to_string(hash);
+ out << ", " << "compression=" << to_string(compression);
+ out << ")";
+}
+
+
+PageHeader::~PageHeader() noexcept {
+}
+
+
+void PageHeader::__set_type(const PageType::type val) {
+ this->type = val;
+}
+
+void PageHeader::__set_uncompressed_page_size(const int32_t val) {
+ this->uncompressed_page_size = val;
+}
+
+void PageHeader::__set_compressed_page_size(const int32_t val) {
+ this->compressed_page_size = val;
+}
+
+void PageHeader::__set_crc(const int32_t val) {
+ this->crc = val;
+  __isset.crc = true;
+}
+
+void PageHeader::__set_data_page_header(const DataPageHeader& val) {
+ this->data_page_header = val;
+  __isset.data_page_header = true;
+}
+
+void PageHeader::__set_index_page_header(const IndexPageHeader& val) {
+ this->index_page_header = val;
+  __isset.index_page_header = true;
+}
+
+void PageHeader::__set_dictionary_page_header(const DictionaryPageHeader& val) {
+ this->dictionary_page_header = val;
+  __isset.dictionary_page_header = true;
+}
+
+void PageHeader::__set_data_page_header_v2(const DataPageHeaderV2& val) {
+ this->data_page_header_v2 = val;
+  __isset.data_page_header_v2 = true;
+}
+std::ostream& operator<<(std::ostream& out, const PageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
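+// PageHeader: type and the two page sizes are required; crc and the four
+// page-header variants (data/index/dictionary/data-v2) are optional and are
+// recorded in this->__isset as they arrive. Unknown field ids are skipped,
+// which keeps older readers forward-compatible with newer writers.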
+uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_type = false;
+ bool isset_uncompressed_page_size = false;
+ bool isset_compressed_page_size = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast70;
+ xfer += iprot->readI32(ecast70);
+ this->type = (PageType::type)ecast70;
+ isset_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->uncompressed_page_size);
+ isset_uncompressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->compressed_page_size);
+ isset_compressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->crc);
+ this->__isset.crc = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->data_page_header.read(iprot);
+ this->__isset.data_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->index_page_header.read(iprot);
+ this->__isset.index_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->dictionary_page_header.read(iprot);
+ this->__isset.dictionary_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->data_page_header_v2.read(iprot);
+ this->__isset.data_page_header_v2 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_uncompressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageHeader");
+
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("uncompressed_page_size", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->uncompressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->compressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.crc) {
+ xfer += oprot->writeFieldBegin("crc", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32(this->crc);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.data_page_header) {
+ xfer += oprot->writeFieldBegin("data_page_header", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->data_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.index_page_header) {
+ xfer += oprot->writeFieldBegin("index_page_header", ::apache::thrift::protocol::T_STRUCT, 6);
+ xfer += this->index_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.dictionary_page_header) {
+ xfer += oprot->writeFieldBegin("dictionary_page_header", ::apache::thrift::protocol::T_STRUCT, 7);
+ xfer += this->dictionary_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.data_page_header_v2) {
+ xfer += oprot->writeFieldBegin("data_page_header_v2", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->data_page_header_v2.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageHeader &a, PageHeader &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.uncompressed_page_size, b.uncompressed_page_size);
+ swap(a.compressed_page_size, b.compressed_page_size);
+ swap(a.crc, b.crc);
+ swap(a.data_page_header, b.data_page_header);
+ swap(a.index_page_header, b.index_page_header);
+ swap(a.dictionary_page_header, b.dictionary_page_header);
+ swap(a.data_page_header_v2, b.data_page_header_v2);
+ swap(a.__isset, b.__isset);
+}
+
+PageHeader::PageHeader(const PageHeader& other71) {
+ type = other71.type;
+ uncompressed_page_size = other71.uncompressed_page_size;
+ compressed_page_size = other71.compressed_page_size;
+ crc = other71.crc;
+ data_page_header = other71.data_page_header;
+ index_page_header = other71.index_page_header;
+ dictionary_page_header = other71.dictionary_page_header;
+ data_page_header_v2 = other71.data_page_header_v2;
+ __isset = other71.__isset;
+}
+PageHeader& PageHeader::operator=(const PageHeader& other72) {
+ type = other72.type;
+ uncompressed_page_size = other72.uncompressed_page_size;
+ compressed_page_size = other72.compressed_page_size;
+ crc = other72.crc;
+ data_page_header = other72.data_page_header;
+ index_page_header = other72.index_page_header;
+ dictionary_page_header = other72.dictionary_page_header;
+ data_page_header_v2 = other72.data_page_header_v2;
+ __isset = other72.__isset;
+ return *this;
+}
+void PageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageHeader(";
+ out << "type=" << to_string(type);
+ out << ", " << "uncompressed_page_size=" << to_string(uncompressed_page_size);
+ out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
+ out << ", " << "crc="; (__isset.crc ? (out << to_string(crc)) : (out << "<null>"));
+ out << ", " << "data_page_header="; (__isset.data_page_header ? (out << to_string(data_page_header)) : (out << "<null>"));
+ out << ", " << "index_page_header="; (__isset.index_page_header ? (out << to_string(index_page_header)) : (out << "<null>"));
+ out << ", " << "dictionary_page_header="; (__isset.dictionary_page_header ? (out << to_string(dictionary_page_header)) : (out << "<null>"));
+ out << ", " << "data_page_header_v2="; (__isset.data_page_header_v2 ? (out << to_string(data_page_header_v2)) : (out << "<null>"));
+ out << ")";
+}
+
+
+KeyValue::~KeyValue() noexcept {
+}
+
+
+void KeyValue::__set_key(const std::string& val) {
+ this->key = val;
+}
+
+void KeyValue::__set_value(const std::string& val) {
+ this->value = val;
+  __isset.value = true;
+}
+std::ostream& operator<<(std::ostream& out, const KeyValue& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
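+// KeyValue: key is required (enforced after T_STOP); value is optional and
+// is only written back out when __isset.value is true.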
+uint32_t KeyValue::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_key = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->key);
+ isset_key = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->value);
+ this->__isset.value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_key)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t KeyValue::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("KeyValue");
+
+ xfer += oprot->writeFieldBegin("key", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeString(this->key);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.value) {
+ xfer += oprot->writeFieldBegin("value", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeString(this->value);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(KeyValue &a, KeyValue &b) {
+ using ::std::swap;
+ swap(a.key, b.key);
+ swap(a.value, b.value);
+ swap(a.__isset, b.__isset);
+}
+
+KeyValue::KeyValue(const KeyValue& other73) {
+ key = other73.key;
+ value = other73.value;
+ __isset = other73.__isset;
+}
+KeyValue& KeyValue::operator=(const KeyValue& other74) {
+ key = other74.key;
+ value = other74.value;
+ __isset = other74.__isset;
+ return *this;
+}
+void KeyValue::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "KeyValue(";
+ out << "key=" << to_string(key);
+ out << ", " << "value="; (__isset.value ? (out << to_string(value)) : (out << "<null>"));
+ out << ")";
+}
+
+
+SortingColumn::~SortingColumn() noexcept {
+}
+
+
+void SortingColumn::__set_column_idx(const int32_t val) {
+ this->column_idx = val;
+}
+
+void SortingColumn::__set_descending(const bool val) {
+ this->descending = val;
+}
+
+void SortingColumn::__set_nulls_first(const bool val) {
+ this->nulls_first = val;
+}
+std::ostream& operator<<(std::ostream& out, const SortingColumn& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
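+// SortingColumn: column_idx, descending and nulls_first are all required,
+// so each gets a local isset flag that is checked once the struct is read.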
+uint32_t SortingColumn::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_column_idx = false;
+ bool isset_descending = false;
+ bool isset_nulls_first = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->column_idx);
+ isset_column_idx = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->descending);
+ isset_descending = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->nulls_first);
+ isset_nulls_first = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_column_idx)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_descending)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_nulls_first)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t SortingColumn::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SortingColumn");
+
+ xfer += oprot->writeFieldBegin("column_idx", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->column_idx);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("descending", ::apache::thrift::protocol::T_BOOL, 2);
+ xfer += oprot->writeBool(this->descending);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("nulls_first", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->nulls_first);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SortingColumn &a, SortingColumn &b) {
+ using ::std::swap;
+ swap(a.column_idx, b.column_idx);
+ swap(a.descending, b.descending);
+ swap(a.nulls_first, b.nulls_first);
+}
+
+SortingColumn::SortingColumn(const SortingColumn& other75) {
+ column_idx = other75.column_idx;
+ descending = other75.descending;
+ nulls_first = other75.nulls_first;
+}
+SortingColumn& SortingColumn::operator=(const SortingColumn& other76) {
+ column_idx = other76.column_idx;
+ descending = other76.descending;
+ nulls_first = other76.nulls_first;
+ return *this;
+}
+void SortingColumn::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SortingColumn(";
+ out << "column_idx=" << to_string(column_idx);
+ out << ", " << "descending=" << to_string(descending);
+ out << ", " << "nulls_first=" << to_string(nulls_first);
+ out << ")";
+}
+
+
+PageEncodingStats::~PageEncodingStats() noexcept {
+}
+
+
+void PageEncodingStats::__set_page_type(const PageType::type val) {
+ this->page_type = val;
+}
+
+void PageEncodingStats::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void PageEncodingStats::__set_count(const int32_t val) {
+ this->count = val;
+}
+std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
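+// PageEncodingStats: the two enum fields arrive on the wire as plain i32
+// values and are cast back to PageType::type / Encoding::type; all three
+// fields are required.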
+uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_page_type = false;
+ bool isset_encoding = false;
+ bool isset_count = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast77;
+ xfer += iprot->readI32(ecast77);
+ this->page_type = (PageType::type)ecast77;
+ isset_page_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast78;
+ xfer += iprot->readI32(ecast78);
+ this->encoding = (Encoding::type)ecast78;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->count);
+ isset_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_page_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_count)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageEncodingStats::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageEncodingStats");
+
+ xfer += oprot->writeFieldBegin("page_type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->page_type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("count", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->count);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageEncodingStats &a, PageEncodingStats &b) {
+ using ::std::swap;
+ swap(a.page_type, b.page_type);
+ swap(a.encoding, b.encoding);
+ swap(a.count, b.count);
+}
+
+PageEncodingStats::PageEncodingStats(const PageEncodingStats& other79) {
+ page_type = other79.page_type;
+ encoding = other79.encoding;
+ count = other79.count;
+}
+PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other80) {
+ page_type = other80.page_type;
+ encoding = other80.encoding;
+ count = other80.count;
+ return *this;
+}
+void PageEncodingStats::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageEncodingStats(";
+ out << "page_type=" << to_string(page_type);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "count=" << to_string(count);
+ out << ")";
+}
+
+
+ColumnMetaData::~ColumnMetaData() noexcept {
+}
+
+
+void ColumnMetaData::__set_type(const Type::type val) {
+ this->type = val;
+}
+
+void ColumnMetaData::__set_encodings(const std::vector<Encoding::type> & val) {
+ this->encodings = val;
+}
+
+void ColumnMetaData::__set_path_in_schema(const std::vector<std::string> & val) {
+ this->path_in_schema = val;
+}
+
+void ColumnMetaData::__set_codec(const CompressionCodec::type val) {
+ this->codec = val;
+}
+
+void ColumnMetaData::__set_num_values(const int64_t val) {
+ this->num_values = val;
+}
+
+void ColumnMetaData::__set_total_uncompressed_size(const int64_t val) {
+ this->total_uncompressed_size = val;
+}
+
+void ColumnMetaData::__set_total_compressed_size(const int64_t val) {
+ this->total_compressed_size = val;
+}
+
+void ColumnMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
+ this->key_value_metadata = val;
+  __isset.key_value_metadata = true;
+}
+
+void ColumnMetaData::__set_data_page_offset(const int64_t val) {
+ this->data_page_offset = val;
+}
+
+void ColumnMetaData::__set_index_page_offset(const int64_t val) {
+ this->index_page_offset = val;
+  __isset.index_page_offset = true;
+}
+
+void ColumnMetaData::__set_dictionary_page_offset(const int64_t val) {
+ this->dictionary_page_offset = val;
+  __isset.dictionary_page_offset = true;
+}
+
+void ColumnMetaData::__set_statistics(const Statistics& val) {
+ this->statistics = val;
+  __isset.statistics = true;
+}
+
+void ColumnMetaData::__set_encoding_stats(const std::vector<PageEncodingStats> & val) {
+ this->encoding_stats = val;
+  __isset.encoding_stats = true;
+}
+
+void ColumnMetaData::__set_bloom_filter_offset(const int64_t val) {
+ this->bloom_filter_offset = val;
+  __isset.bloom_filter_offset = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
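+// ColumnMetaData: fields 1-7 and 9 (type, encodings, path_in_schema, codec,
+// num_values, the two sizes, data_page_offset) are required; the remaining
+// fields are optional and tracked via this->__isset. List fields follow the
+// usual generated pattern: readListBegin, resize(), an element-wise read
+// loop, then readListEnd.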
+uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_type = false;
+ bool isset_encodings = false;
+ bool isset_path_in_schema = false;
+ bool isset_codec = false;
+ bool isset_num_values = false;
+ bool isset_total_uncompressed_size = false;
+ bool isset_total_compressed_size = false;
+ bool isset_data_page_offset = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast81;
+ xfer += iprot->readI32(ecast81);
+ this->type = (Type::type)ecast81;
+ isset_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->encodings.clear();
+ uint32_t _size82;
+ ::apache::thrift::protocol::TType _etype85;
+ xfer += iprot->readListBegin(_etype85, _size82);
+ this->encodings.resize(_size82);
+ uint32_t _i86;
+ for (_i86 = 0; _i86 < _size82; ++_i86)
+ {
+ int32_t ecast87;
+ xfer += iprot->readI32(ecast87);
+ this->encodings[_i86] = (Encoding::type)ecast87;
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_encodings = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->path_in_schema.clear();
+ uint32_t _size88;
+ ::apache::thrift::protocol::TType _etype91;
+ xfer += iprot->readListBegin(_etype91, _size88);
+ this->path_in_schema.resize(_size88);
+ uint32_t _i92;
+ for (_i92 = 0; _i92 < _size88; ++_i92)
+ {
+ xfer += iprot->readString(this->path_in_schema[_i92]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_path_in_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast93;
+ xfer += iprot->readI32(ecast93);
+ this->codec = (CompressionCodec::type)ecast93;
+ isset_codec = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_uncompressed_size);
+ isset_total_uncompressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_compressed_size);
+ isset_total_compressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->key_value_metadata.clear();
+ uint32_t _size94;
+ ::apache::thrift::protocol::TType _etype97;
+ xfer += iprot->readListBegin(_etype97, _size94);
+ this->key_value_metadata.resize(_size94);
+ uint32_t _i98;
+ for (_i98 = 0; _i98 < _size94; ++_i98)
+ {
+ xfer += this->key_value_metadata[_i98].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.key_value_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->data_page_offset);
+ isset_data_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->index_page_offset);
+ this->__isset.index_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 11:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->dictionary_page_offset);
+ this->__isset.dictionary_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 12:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 13:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->encoding_stats.clear();
+ uint32_t _size99;
+ ::apache::thrift::protocol::TType _etype102;
+ xfer += iprot->readListBegin(_etype102, _size99);
+ this->encoding_stats.resize(_size99);
+ uint32_t _i103;
+ for (_i103 = 0; _i103 < _size99; ++_i103)
+ {
+ xfer += this->encoding_stats[_i103].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.encoding_stats = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 14:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->bloom_filter_offset);
+ this->__isset.bloom_filter_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encodings)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_path_in_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_codec)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_uncompressed_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_compressed_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_data_page_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnMetaData");
+
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast<uint32_t>(this->encodings.size()));
+ std::vector<Encoding::type> ::const_iterator _iter104;
+ for (_iter104 = this->encodings.begin(); _iter104 != this->encodings.end(); ++_iter104)
+ {
+ xfer += oprot->writeI32((int32_t)(*_iter104));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
+ std::vector<std::string> ::const_iterator _iter105;
+ for (_iter105 = this->path_in_schema.begin(); _iter105 != this->path_in_schema.end(); ++_iter105)
+ {
+ xfer += oprot->writeString((*_iter105));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("codec", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->codec);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I64, 5);
+ xfer += oprot->writeI64(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_uncompressed_size", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->total_uncompressed_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 7);
+ xfer += oprot->writeI64(this->total_compressed_size);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_value_metadata) {
+ xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
+ std::vector<KeyValue> ::const_iterator _iter106;
+ for (_iter106 = this->key_value_metadata.begin(); _iter106 != this->key_value_metadata.end(); ++_iter106)
+ {
+ xfer += (*_iter106).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("data_page_offset", ::apache::thrift::protocol::T_I64, 9);
+ xfer += oprot->writeI64(this->data_page_offset);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.index_page_offset) {
+ xfer += oprot->writeFieldBegin("index_page_offset", ::apache::thrift::protocol::T_I64, 10);
+ xfer += oprot->writeI64(this->index_page_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.dictionary_page_offset) {
+ xfer += oprot->writeFieldBegin("dictionary_page_offset", ::apache::thrift::protocol::T_I64, 11);
+ xfer += oprot->writeI64(this->dictionary_page_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 12);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encoding_stats) {
+ xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->encoding_stats.size()));
+ std::vector<PageEncodingStats> ::const_iterator _iter107;
+ for (_iter107 = this->encoding_stats.begin(); _iter107 != this->encoding_stats.end(); ++_iter107)
+ {
+ xfer += (*_iter107).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.bloom_filter_offset) {
+ xfer += oprot->writeFieldBegin("bloom_filter_offset", ::apache::thrift::protocol::T_I64, 14);
+ xfer += oprot->writeI64(this->bloom_filter_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnMetaData &a, ColumnMetaData &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.encodings, b.encodings);
+ swap(a.path_in_schema, b.path_in_schema);
+ swap(a.codec, b.codec);
+ swap(a.num_values, b.num_values);
+ swap(a.total_uncompressed_size, b.total_uncompressed_size);
+ swap(a.total_compressed_size, b.total_compressed_size);
+ swap(a.key_value_metadata, b.key_value_metadata);
+ swap(a.data_page_offset, b.data_page_offset);
+ swap(a.index_page_offset, b.index_page_offset);
+ swap(a.dictionary_page_offset, b.dictionary_page_offset);
+ swap(a.statistics, b.statistics);
+ swap(a.encoding_stats, b.encoding_stats);
+ swap(a.bloom_filter_offset, b.bloom_filter_offset);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnMetaData::ColumnMetaData(const ColumnMetaData& other108) {
+ type = other108.type;
+ encodings = other108.encodings;
+ path_in_schema = other108.path_in_schema;
+ codec = other108.codec;
+ num_values = other108.num_values;
+ total_uncompressed_size = other108.total_uncompressed_size;
+ total_compressed_size = other108.total_compressed_size;
+ key_value_metadata = other108.key_value_metadata;
+ data_page_offset = other108.data_page_offset;
+ index_page_offset = other108.index_page_offset;
+ dictionary_page_offset = other108.dictionary_page_offset;
+ statistics = other108.statistics;
+ encoding_stats = other108.encoding_stats;
+ bloom_filter_offset = other108.bloom_filter_offset;
+ __isset = other108.__isset;
+}
+ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other109) {
+ type = other109.type;
+ encodings = other109.encodings;
+ path_in_schema = other109.path_in_schema;
+ codec = other109.codec;
+ num_values = other109.num_values;
+ total_uncompressed_size = other109.total_uncompressed_size;
+ total_compressed_size = other109.total_compressed_size;
+ key_value_metadata = other109.key_value_metadata;
+ data_page_offset = other109.data_page_offset;
+ index_page_offset = other109.index_page_offset;
+ dictionary_page_offset = other109.dictionary_page_offset;
+ statistics = other109.statistics;
+ encoding_stats = other109.encoding_stats;
+ bloom_filter_offset = other109.bloom_filter_offset;
+ __isset = other109.__isset;
+ return *this;
+}
+void ColumnMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnMetaData(";
+ out << "type=" << to_string(type);
+ out << ", " << "encodings=" << to_string(encodings);
+ out << ", " << "path_in_schema=" << to_string(path_in_schema);
+ out << ", " << "codec=" << to_string(codec);
+ out << ", " << "num_values=" << to_string(num_values);
+ out << ", " << "total_uncompressed_size=" << to_string(total_uncompressed_size);
+ out << ", " << "total_compressed_size=" << to_string(total_compressed_size);
+ out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
+ out << ", " << "data_page_offset=" << to_string(data_page_offset);
+ out << ", " << "index_page_offset="; (__isset.index_page_offset ? (out << to_string(index_page_offset)) : (out << "<null>"));
+ out << ", " << "dictionary_page_offset="; (__isset.dictionary_page_offset ? (out << to_string(dictionary_page_offset)) : (out << "<null>"));
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "<null>"));
+ out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "<null>"));
+ out << ")";
+}
+
+
+EncryptionWithFooterKey::~EncryptionWithFooterKey() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
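+// EncryptionWithFooterKey carries no fields; read() simply skips everything
+// it encounters until T_STOP, and write() emits an empty struct.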
+uint32_t EncryptionWithFooterKey::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EncryptionWithFooterKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionWithFooterKey");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other110) {
+ (void) other110;
+}
+EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other111) {
+ (void) other111;
+ return *this;
+}
+void EncryptionWithFooterKey::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionWithFooterKey(";
+ out << ")";
+}
+
+
+EncryptionWithColumnKey::~EncryptionWithColumnKey() noexcept {
+}
+
+
+void EncryptionWithColumnKey::__set_path_in_schema(const std::vector<std::string> & val) {
+ this->path_in_schema = val;
+}
+
+void EncryptionWithColumnKey::__set_key_metadata(const std::string& val) {
+ this->key_metadata = val;
+  __isset.key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
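+// EncryptionWithColumnKey: path_in_schema is a required list of strings;
+// key_metadata is optional and, being declared as Thrift binary, is read
+// with readBinary() even though its wire type is T_STRING.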
+uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_path_in_schema = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->path_in_schema.clear();
+ uint32_t _size112;
+ ::apache::thrift::protocol::TType _etype115;
+ xfer += iprot->readListBegin(_etype115, _size112);
+ this->path_in_schema.resize(_size112);
+ uint32_t _i116;
+ for (_i116 = 0; _i116 < _size112; ++_i116)
+ {
+ xfer += iprot->readString(this->path_in_schema[_i116]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_path_in_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->key_metadata);
+ this->__isset.key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_path_in_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionWithColumnKey");
+
+ xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
+ std::vector<std::string> ::const_iterator _iter117;
+ for (_iter117 = this->path_in_schema.begin(); _iter117 != this->path_in_schema.end(); ++_iter117)
+ {
+ xfer += oprot->writeString((*_iter117));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_metadata) {
+ xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) {
+ using ::std::swap;
+ swap(a.path_in_schema, b.path_in_schema);
+ swap(a.key_metadata, b.key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other118) {
+ path_in_schema = other118.path_in_schema;
+ key_metadata = other118.key_metadata;
+ __isset = other118.__isset;
+}
+EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other119) {
+ path_in_schema = other119.path_in_schema;
+ key_metadata = other119.key_metadata;
+ __isset = other119.__isset;
+ return *this;
+}
+void EncryptionWithColumnKey::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionWithColumnKey(";
+ out << "path_in_schema=" << to_string(path_in_schema);
+ out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+
+ColumnCryptoMetaData::~ColumnCryptoMetaData() noexcept {
+}
+
+
+void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val) {
+ this->ENCRYPTION_WITH_FOOTER_KEY = val;
+  __isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+}
+
+void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val) {
+ this->ENCRYPTION_WITH_COLUMN_KEY = val;
+  __isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
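+// ColumnCryptoMetaData is generated from a Thrift union, so both members are
+// optional here and callers are expected to set exactly one of them; the
+// generated code itself does not enforce that invariant.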
+uint32_t ColumnCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENCRYPTION_WITH_FOOTER_KEY.read(iprot);
+ this->__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENCRYPTION_WITH_COLUMN_KEY.read(iprot);
+ this->__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ColumnCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnCryptoMetaData");
+
+ if (this->__isset.ENCRYPTION_WITH_FOOTER_KEY) {
+ xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_FOOTER_KEY", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->ENCRYPTION_WITH_FOOTER_KEY.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+ xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_COLUMN_KEY", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->ENCRYPTION_WITH_COLUMN_KEY.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) {
+ using ::std::swap;
+ swap(a.ENCRYPTION_WITH_FOOTER_KEY, b.ENCRYPTION_WITH_FOOTER_KEY);
+ swap(a.ENCRYPTION_WITH_COLUMN_KEY, b.ENCRYPTION_WITH_COLUMN_KEY);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other120) {
+ ENCRYPTION_WITH_FOOTER_KEY = other120.ENCRYPTION_WITH_FOOTER_KEY;
+ ENCRYPTION_WITH_COLUMN_KEY = other120.ENCRYPTION_WITH_COLUMN_KEY;
+ __isset = other120.__isset;
+}
+ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other121) {
+ ENCRYPTION_WITH_FOOTER_KEY = other121.ENCRYPTION_WITH_FOOTER_KEY;
+ ENCRYPTION_WITH_COLUMN_KEY = other121.ENCRYPTION_WITH_COLUMN_KEY;
+ __isset = other121.__isset;
+ return *this;
+}
+void ColumnCryptoMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnCryptoMetaData(";
+ out << "ENCRYPTION_WITH_FOOTER_KEY="; (__isset.ENCRYPTION_WITH_FOOTER_KEY ? (out << to_string(ENCRYPTION_WITH_FOOTER_KEY)) : (out << "<null>"));
+ out << ", " << "ENCRYPTION_WITH_COLUMN_KEY="; (__isset.ENCRYPTION_WITH_COLUMN_KEY ? (out << to_string(ENCRYPTION_WITH_COLUMN_KEY)) : (out << "<null>"));
+ out << ")";
+}
+
+
+ColumnChunk::~ColumnChunk() noexcept {
+}
+
+
+void ColumnChunk::__set_file_path(const std::string& val) {
+ this->file_path = val;
+  __isset.file_path = true;
+}
+
+void ColumnChunk::__set_file_offset(const int64_t val) {
+ this->file_offset = val;
+}
+
+void ColumnChunk::__set_meta_data(const ColumnMetaData& val) {
+ this->meta_data = val;
+  __isset.meta_data = true;
+}
+
+void ColumnChunk::__set_offset_index_offset(const int64_t val) {
+ this->offset_index_offset = val;
+  __isset.offset_index_offset = true;
+}
+
+void ColumnChunk::__set_offset_index_length(const int32_t val) {
+ this->offset_index_length = val;
+  __isset.offset_index_length = true;
+}
+
+void ColumnChunk::__set_column_index_offset(const int64_t val) {
+ this->column_index_offset = val;
+  __isset.column_index_offset = true;
+}
+
+void ColumnChunk::__set_column_index_length(const int32_t val) {
+ this->column_index_length = val;
+  __isset.column_index_length = true;
+}
+
+void ColumnChunk::__set_crypto_metadata(const ColumnCryptoMetaData& val) {
+ this->crypto_metadata = val;
+  __isset.crypto_metadata = true;
+}
+
+void ColumnChunk::__set_encrypted_column_metadata(const std::string& val) {
+ this->encrypted_column_metadata = val;
+  __isset.encrypted_column_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
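+// ColumnChunk: only file_offset is required; everything else (file_path,
+// meta_data, the index offsets/lengths, crypto_metadata and the encrypted
+// column metadata blob) is optional and gated on __isset when writing.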
+uint32_t ColumnChunk::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_file_offset = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->file_path);
+ this->__isset.file_path = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->file_offset);
+ isset_file_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->meta_data.read(iprot);
+ this->__isset.meta_data = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->offset_index_offset);
+ this->__isset.offset_index_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->offset_index_length);
+ this->__isset.offset_index_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->column_index_offset);
+ this->__isset.column_index_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->column_index_length);
+ this->__isset.column_index_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->crypto_metadata.read(iprot);
+ this->__isset.crypto_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->encrypted_column_metadata);
+ this->__isset.encrypted_column_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_file_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnChunk::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnChunk");
+
+ if (this->__isset.file_path) {
+ xfer += oprot->writeFieldBegin("file_path", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeString(this->file_path);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 2);
+ xfer += oprot->writeI64(this->file_offset);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.meta_data) {
+ xfer += oprot->writeFieldBegin("meta_data", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->meta_data.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.offset_index_offset) {
+ xfer += oprot->writeFieldBegin("offset_index_offset", ::apache::thrift::protocol::T_I64, 4);
+ xfer += oprot->writeI64(this->offset_index_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.offset_index_length) {
+ xfer += oprot->writeFieldBegin("offset_index_length", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->offset_index_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_index_offset) {
+ xfer += oprot->writeFieldBegin("column_index_offset", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->column_index_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_index_length) {
+ xfer += oprot->writeFieldBegin("column_index_length", ::apache::thrift::protocol::T_I32, 7);
+ xfer += oprot->writeI32(this->column_index_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.crypto_metadata) {
+ xfer += oprot->writeFieldBegin("crypto_metadata", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->crypto_metadata.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encrypted_column_metadata) {
+ xfer += oprot->writeFieldBegin("encrypted_column_metadata", ::apache::thrift::protocol::T_STRING, 9);
+ xfer += oprot->writeBinary(this->encrypted_column_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnChunk &a, ColumnChunk &b) {
+ using ::std::swap;
+ swap(a.file_path, b.file_path);
+ swap(a.file_offset, b.file_offset);
+ swap(a.meta_data, b.meta_data);
+ swap(a.offset_index_offset, b.offset_index_offset);
+ swap(a.offset_index_length, b.offset_index_length);
+ swap(a.column_index_offset, b.column_index_offset);
+ swap(a.column_index_length, b.column_index_length);
+ swap(a.crypto_metadata, b.crypto_metadata);
+ swap(a.encrypted_column_metadata, b.encrypted_column_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnChunk::ColumnChunk(const ColumnChunk& other122) {
+ file_path = other122.file_path;
+ file_offset = other122.file_offset;
+ meta_data = other122.meta_data;
+ offset_index_offset = other122.offset_index_offset;
+ offset_index_length = other122.offset_index_length;
+ column_index_offset = other122.column_index_offset;
+ column_index_length = other122.column_index_length;
+ crypto_metadata = other122.crypto_metadata;
+ encrypted_column_metadata = other122.encrypted_column_metadata;
+ __isset = other122.__isset;
+}
+ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other123) {
+ file_path = other123.file_path;
+ file_offset = other123.file_offset;
+ meta_data = other123.meta_data;
+ offset_index_offset = other123.offset_index_offset;
+ offset_index_length = other123.offset_index_length;
+ column_index_offset = other123.column_index_offset;
+ column_index_length = other123.column_index_length;
+ crypto_metadata = other123.crypto_metadata;
+ encrypted_column_metadata = other123.encrypted_column_metadata;
+ __isset = other123.__isset;
+ return *this;
+}
+void ColumnChunk::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnChunk(";
+ out << "file_path="; (__isset.file_path ? (out << to_string(file_path)) : (out << "<null>"));
+ out << ", " << "file_offset=" << to_string(file_offset);
+ out << ", " << "meta_data="; (__isset.meta_data ? (out << to_string(meta_data)) : (out << "<null>"));
+ out << ", " << "offset_index_offset="; (__isset.offset_index_offset ? (out << to_string(offset_index_offset)) : (out << "<null>"));
+ out << ", " << "offset_index_length="; (__isset.offset_index_length ? (out << to_string(offset_index_length)) : (out << "<null>"));
+ out << ", " << "column_index_offset="; (__isset.column_index_offset ? (out << to_string(column_index_offset)) : (out << "<null>"));
+ out << ", " << "column_index_length="; (__isset.column_index_length ? (out << to_string(column_index_length)) : (out << "<null>"));
+ out << ", " << "crypto_metadata="; (__isset.crypto_metadata ? (out << to_string(crypto_metadata)) : (out << "<null>"));
+ out << ", " << "encrypted_column_metadata="; (__isset.encrypted_column_metadata ? (out << to_string(encrypted_column_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+
+RowGroup::~RowGroup() noexcept {
+}
+
+
+void RowGroup::__set_columns(const std::vector<ColumnChunk> & val) {
+ this->columns = val;
+}
+
+void RowGroup::__set_total_byte_size(const int64_t val) {
+ this->total_byte_size = val;
+}
+
+void RowGroup::__set_num_rows(const int64_t val) {
+ this->num_rows = val;
+}
+
+void RowGroup::__set_sorting_columns(const std::vector<SortingColumn> & val) {
+ this->sorting_columns = val;
+  __isset.sorting_columns = true;
+}
+
+void RowGroup::__set_file_offset(const int64_t val) {
+ this->file_offset = val;
+  __isset.file_offset = true;
+}
+
+void RowGroup::__set_total_compressed_size(const int64_t val) {
+ this->total_compressed_size = val;
+  __isset.total_compressed_size = true;
+}
+
+void RowGroup::__set_ordinal(const int16_t val) {
+ this->ordinal = val;
+  __isset.ordinal = true;
+}
+std::ostream& operator<<(std::ostream& out, const RowGroup& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
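+// RowGroup: columns, total_byte_size and num_rows are required; the
+// sorting_columns list, file_offset, total_compressed_size and ordinal are
+// optional.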
+uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_columns = false;
+ bool isset_total_byte_size = false;
+ bool isset_num_rows = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->columns.clear();
+ uint32_t _size124;
+ ::apache::thrift::protocol::TType _etype127;
+ xfer += iprot->readListBegin(_etype127, _size124);
+ this->columns.resize(_size124);
+ uint32_t _i128;
+ for (_i128 = 0; _i128 < _size124; ++_i128)
+ {
+ xfer += this->columns[_i128].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_columns = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_byte_size);
+ isset_total_byte_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->sorting_columns.clear();
+ uint32_t _size129;
+ ::apache::thrift::protocol::TType _etype132;
+ xfer += iprot->readListBegin(_etype132, _size129);
+ this->sorting_columns.resize(_size129);
+ uint32_t _i133;
+ for (_i133 = 0; _i133 < _size129; ++_i133)
+ {
+ xfer += this->sorting_columns[_i133].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.sorting_columns = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->file_offset);
+ this->__isset.file_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_compressed_size);
+ this->__isset.total_compressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I16) {
+ xfer += iprot->readI16(this->ordinal);
+ this->__isset.ordinal = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_columns)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_byte_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("RowGroup");
+
+ xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->columns.size()));
+ std::vector<ColumnChunk> ::const_iterator _iter134;
+ for (_iter134 = this->columns.begin(); _iter134 != this->columns.end(); ++_iter134)
+ {
+ xfer += (*_iter134).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_byte_size", ::apache::thrift::protocol::T_I64, 2);
+ xfer += oprot->writeI64(this->total_byte_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.sorting_columns) {
+ xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->sorting_columns.size()));
+ std::vector<SortingColumn> ::const_iterator _iter135;
+ for (_iter135 = this->sorting_columns.begin(); _iter135 != this->sorting_columns.end(); ++_iter135)
+ {
+ xfer += (*_iter135).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.file_offset) {
+ xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 5);
+ xfer += oprot->writeI64(this->file_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.total_compressed_size) {
+ xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->total_compressed_size);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ordinal) {
+ xfer += oprot->writeFieldBegin("ordinal", ::apache::thrift::protocol::T_I16, 7);
+ xfer += oprot->writeI16(this->ordinal);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(RowGroup &a, RowGroup &b) {
+ using ::std::swap;
+ swap(a.columns, b.columns);
+ swap(a.total_byte_size, b.total_byte_size);
+ swap(a.num_rows, b.num_rows);
+ swap(a.sorting_columns, b.sorting_columns);
+ swap(a.file_offset, b.file_offset);
+ swap(a.total_compressed_size, b.total_compressed_size);
+ swap(a.ordinal, b.ordinal);
+ swap(a.__isset, b.__isset);
+}
+
+RowGroup::RowGroup(const RowGroup& other136) {
+ columns = other136.columns;
+ total_byte_size = other136.total_byte_size;
+ num_rows = other136.num_rows;
+ sorting_columns = other136.sorting_columns;
+ file_offset = other136.file_offset;
+ total_compressed_size = other136.total_compressed_size;
+ ordinal = other136.ordinal;
+ __isset = other136.__isset;
+}
+RowGroup& RowGroup::operator=(const RowGroup& other137) {
+ columns = other137.columns;
+ total_byte_size = other137.total_byte_size;
+ num_rows = other137.num_rows;
+ sorting_columns = other137.sorting_columns;
+ file_offset = other137.file_offset;
+ total_compressed_size = other137.total_compressed_size;
+ ordinal = other137.ordinal;
+ __isset = other137.__isset;
+ return *this;
+}
+void RowGroup::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "RowGroup(";
+ out << "columns=" << to_string(columns);
+ out << ", " << "total_byte_size=" << to_string(total_byte_size);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "sorting_columns="; (__isset.sorting_columns ? (out << to_string(sorting_columns)) : (out << "<null>"));
+ out << ", " << "file_offset="; (__isset.file_offset ? (out << to_string(file_offset)) : (out << "<null>"));
+ out << ", " << "total_compressed_size="; (__isset.total_compressed_size ? (out << to_string(total_compressed_size)) : (out << "<null>"));
+ out << ", " << "ordinal="; (__isset.ordinal ? (out << to_string(ordinal)) : (out << "<null>"));
+ out << ")";
+}
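
[editor's note] RowGroup above shows the generated API's split between required and optional fields: fields 1–3 (columns, total_byte_size, num_rows) are written unconditionally and enforced by the isset_* checks in read(), while the rest are gated by __isset on write. A minimal construction sketch — not part of the commit, assuming the generated parquet_types.h is on the include path:

#include <iostream>
#include "parquet_types.h"  // generated header; adjust the path to your build

int main() {
  parquet::format::RowGroup rg;

  // Required fields: RowGroup::write() always emits these, and
  // RowGroup::read() throws TProtocolException::INVALID_DATA if any is absent.
  rg.__set_columns({});          // field 1
  rg.__set_total_byte_size(0);   // field 2
  rg.__set_num_rows(0);          // field 3

  // Optional field: the __set_* helper also flips the __isset bit, which is
  // what gates both write() and printTo() (unset fields print as "<null>").
  rg.__set_ordinal(0);           // field 7

  std::cout << rg << "\n";       // uses the generated operator<< / printTo
  return 0;
}
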
+
+
+TypeDefinedOrder::~TypeDefinedOrder() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TypeDefinedOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t TypeDefinedOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TypeDefinedOrder");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other138) {
+ (void) other138;
+}
+TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other139) {
+ (void) other139;
+ return *this;
+}
+void TypeDefinedOrder::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TypeDefinedOrder(";
+ out << ")";
+}
+
+
+ColumnOrder::~ColumnOrder() noexcept {
+}
+
+
+void ColumnOrder::__set_TYPE_ORDER(const TypeDefinedOrder& val) {
+ this->TYPE_ORDER = val;
+__isset.TYPE_ORDER = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TYPE_ORDER.read(iprot);
+ this->__isset.TYPE_ORDER = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ColumnOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnOrder");
+
+ if (this->__isset.TYPE_ORDER) {
+ xfer += oprot->writeFieldBegin("TYPE_ORDER", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->TYPE_ORDER.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnOrder &a, ColumnOrder &b) {
+ using ::std::swap;
+ swap(a.TYPE_ORDER, b.TYPE_ORDER);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnOrder::ColumnOrder(const ColumnOrder& other140) {
+ TYPE_ORDER = other140.TYPE_ORDER;
+ __isset = other140.__isset;
+}
+ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other141) {
+ TYPE_ORDER = other141.TYPE_ORDER;
+ __isset = other141.__isset;
+ return *this;
+}
+void ColumnOrder::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnOrder(";
+ out << "TYPE_ORDER="; (__isset.TYPE_ORDER ? (out << to_string(TYPE_ORDER)) : (out << "<null>"));
+ out << ")";
+}
+
+
+PageLocation::~PageLocation() noexcept {
+}
+
+
+void PageLocation::__set_offset(const int64_t val) {
+ this->offset = val;
+}
+
+void PageLocation::__set_compressed_page_size(const int32_t val) {
+ this->compressed_page_size = val;
+}
+
+void PageLocation::__set_first_row_index(const int64_t val) {
+ this->first_row_index = val;
+}
+std::ostream& operator<<(std::ostream& out, const PageLocation& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t PageLocation::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_offset = false;
+ bool isset_compressed_page_size = false;
+ bool isset_first_row_index = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->offset);
+ isset_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->compressed_page_size);
+ isset_compressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->first_row_index);
+ isset_first_row_index = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_first_row_index)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageLocation::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageLocation");
+
+ xfer += oprot->writeFieldBegin("offset", ::apache::thrift::protocol::T_I64, 1);
+ xfer += oprot->writeI64(this->offset);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->compressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("first_row_index", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->first_row_index);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageLocation &a, PageLocation &b) {
+ using ::std::swap;
+ swap(a.offset, b.offset);
+ swap(a.compressed_page_size, b.compressed_page_size);
+ swap(a.first_row_index, b.first_row_index);
+}
+
+PageLocation::PageLocation(const PageLocation& other142) {
+ offset = other142.offset;
+ compressed_page_size = other142.compressed_page_size;
+ first_row_index = other142.first_row_index;
+}
+PageLocation& PageLocation::operator=(const PageLocation& other143) {
+ offset = other143.offset;
+ compressed_page_size = other143.compressed_page_size;
+ first_row_index = other143.first_row_index;
+ return *this;
+}
+void PageLocation::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageLocation(";
+ out << "offset=" << to_string(offset);
+ out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
+ out << ", " << "first_row_index=" << to_string(first_row_index);
+ out << ")";
+}
+
+
+OffsetIndex::~OffsetIndex() noexcept {
+}
+
+
+void OffsetIndex::__set_page_locations(const std::vector<PageLocation> & val) {
+ this->page_locations = val;
+}
+std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_page_locations = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->page_locations.clear();
+ uint32_t _size144;
+ ::apache::thrift::protocol::TType _etype147;
+ xfer += iprot->readListBegin(_etype147, _size144);
+ this->page_locations.resize(_size144);
+ uint32_t _i148;
+ for (_i148 = 0; _i148 < _size144; ++_i148)
+ {
+ xfer += this->page_locations[_i148].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_page_locations = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_page_locations)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("OffsetIndex");
+
+ xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->page_locations.size()));
+ std::vector<PageLocation> ::const_iterator _iter149;
+ for (_iter149 = this->page_locations.begin(); _iter149 != this->page_locations.end(); ++_iter149)
+ {
+ xfer += (*_iter149).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(OffsetIndex &a, OffsetIndex &b) {
+ using ::std::swap;
+ swap(a.page_locations, b.page_locations);
+}
+
+OffsetIndex::OffsetIndex(const OffsetIndex& other150) {
+ page_locations = other150.page_locations;
+}
+OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other151) {
+ page_locations = other151.page_locations;
+ return *this;
+}
+void OffsetIndex::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "OffsetIndex(";
+ out << "page_locations=" << to_string(page_locations);
+ out << ")";
+}
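
[editor's note] Together, PageLocation and OffsetIndex give byte-level and row-level addressing for each data page of a column chunk. A minimal sketch of populating them — not part of the commit; the field values are invented for illustration:

#include <vector>
#include "parquet_types.h"  // generated header; adjust the path to your build

int main() {
  // All three PageLocation fields are required (see the isset_* checks in
  // PageLocation::read above), so each is set explicitly.
  parquet::format::PageLocation page;
  page.__set_offset(4);                  // absolute byte offset of the page
  page.__set_compressed_page_size(1024); // size as stored on disk
  page.__set_first_row_index(0);         // row index within the row group

  parquet::format::OffsetIndex index;
  index.__set_page_locations(std::vector<parquet::format::PageLocation>{page});
  return 0;
}
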
+
+
+ColumnIndex::~ColumnIndex() noexcept {
+}
+
+
+void ColumnIndex::__set_null_pages(const std::vector<bool> & val) {
+ this->null_pages = val;
+}
+
+void ColumnIndex::__set_min_values(const std::vector<std::string> & val) {
+ this->min_values = val;
+}
+
+void ColumnIndex::__set_max_values(const std::vector<std::string> & val) {
+ this->max_values = val;
+}
+
+void ColumnIndex::__set_boundary_order(const BoundaryOrder::type val) {
+ this->boundary_order = val;
+}
+
+void ColumnIndex::__set_null_counts(const std::vector<int64_t> & val) {
+ this->null_counts = val;
+__isset.null_counts = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_null_pages = false;
+ bool isset_min_values = false;
+ bool isset_max_values = false;
+ bool isset_boundary_order = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->null_pages.clear();
+ uint32_t _size152;
+ ::apache::thrift::protocol::TType _etype155;
+ xfer += iprot->readListBegin(_etype155, _size152);
+ this->null_pages.resize(_size152);
+ uint32_t _i156;
+ for (_i156 = 0; _i156 < _size152; ++_i156)
+ {
+ bool result;
+ xfer += iprot->readBool(result);
+ this->null_pages[_i156] = result;
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_null_pages = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->min_values.clear();
+ uint32_t _size157;
+ ::apache::thrift::protocol::TType _etype160;
+ xfer += iprot->readListBegin(_etype160, _size157);
+ this->min_values.resize(_size157);
+ uint32_t _i161;
+ for (_i161 = 0; _i161 < _size157; ++_i161)
+ {
+ xfer += iprot->readBinary(this->min_values[_i161]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_min_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->max_values.clear();
+ uint32_t _size162;
+ ::apache::thrift::protocol::TType _etype165;
+ xfer += iprot->readListBegin(_etype165, _size162);
+ this->max_values.resize(_size162);
+ uint32_t _i166;
+ for (_i166 = 0; _i166 < _size162; ++_i166)
+ {
+ xfer += iprot->readBinary(this->max_values[_i166]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_max_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast167;
+ xfer += iprot->readI32(ecast167);
+ this->boundary_order = (BoundaryOrder::type)ecast167;
+ isset_boundary_order = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->null_counts.clear();
+ uint32_t _size168;
+ ::apache::thrift::protocol::TType _etype171;
+ xfer += iprot->readListBegin(_etype171, _size168);
+ this->null_counts.resize(_size168);
+ uint32_t _i172;
+ for (_i172 = 0; _i172 < _size168; ++_i172)
+ {
+ xfer += iprot->readI64(this->null_counts[_i172]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.null_counts = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_null_pages)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_min_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_max_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_boundary_order)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnIndex");
+
+ xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast<uint32_t>(this->null_pages.size()));
+ std::vector<bool> ::const_iterator _iter173;
+ for (_iter173 = this->null_pages.begin(); _iter173 != this->null_pages.end(); ++_iter173)
+ {
+ xfer += oprot->writeBool((*_iter173));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->min_values.size()));
+ std::vector<std::string> ::const_iterator _iter174;
+ for (_iter174 = this->min_values.begin(); _iter174 != this->min_values.end(); ++_iter174)
+ {
+ xfer += oprot->writeBinary((*_iter174));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->max_values.size()));
+ std::vector<std::string> ::const_iterator _iter175;
+ for (_iter175 = this->max_values.begin(); _iter175 != this->max_values.end(); ++_iter175)
+ {
+ xfer += oprot->writeBinary((*_iter175));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("boundary_order", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->boundary_order);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.null_counts) {
+ xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->null_counts.size()));
+ std::vector<int64_t> ::const_iterator _iter176;
+ for (_iter176 = this->null_counts.begin(); _iter176 != this->null_counts.end(); ++_iter176)
+ {
+ xfer += oprot->writeI64((*_iter176));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnIndex &a, ColumnIndex &b) {
+ using ::std::swap;
+ swap(a.null_pages, b.null_pages);
+ swap(a.min_values, b.min_values);
+ swap(a.max_values, b.max_values);
+ swap(a.boundary_order, b.boundary_order);
+ swap(a.null_counts, b.null_counts);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnIndex::ColumnIndex(const ColumnIndex& other177) {
+ null_pages = other177.null_pages;
+ min_values = other177.min_values;
+ max_values = other177.max_values;
+ boundary_order = other177.boundary_order;
+ null_counts = other177.null_counts;
+ __isset = other177.__isset;
+}
+ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other178) {
+ null_pages = other178.null_pages;
+ min_values = other178.min_values;
+ max_values = other178.max_values;
+ boundary_order = other178.boundary_order;
+ null_counts = other178.null_counts;
+ __isset = other178.__isset;
+ return *this;
+}
+void ColumnIndex::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnIndex(";
+ out << "null_pages=" << to_string(null_pages);
+ out << ", " << "min_values=" << to_string(min_values);
+ out << ", " << "max_values=" << to_string(max_values);
+ out << ", " << "boundary_order=" << to_string(boundary_order);
+ out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "<null>"));
+ out << ")";
+}
+
+
+AesGcmV1::~AesGcmV1() noexcept {
+}
+
+
+void AesGcmV1::__set_aad_prefix(const std::string& val) {
+ this->aad_prefix = val;
+__isset.aad_prefix = true;
+}
+
+void AesGcmV1::__set_aad_file_unique(const std::string& val) {
+ this->aad_file_unique = val;
+__isset.aad_file_unique = true;
+}
+
+void AesGcmV1::__set_supply_aad_prefix(const bool val) {
+ this->supply_aad_prefix = val;
+__isset.supply_aad_prefix = true;
+}
+std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t AesGcmV1::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_prefix);
+ this->__isset.aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_file_unique);
+ this->__isset.aad_file_unique = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->supply_aad_prefix);
+ this->__isset.supply_aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t AesGcmV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("AesGcmV1");
+
+ if (this->__isset.aad_prefix) {
+ xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.aad_file_unique) {
+ xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->aad_file_unique);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.supply_aad_prefix) {
+ xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->supply_aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(AesGcmV1 &a, AesGcmV1 &b) {
+ using ::std::swap;
+ swap(a.aad_prefix, b.aad_prefix);
+ swap(a.aad_file_unique, b.aad_file_unique);
+ swap(a.supply_aad_prefix, b.supply_aad_prefix);
+ swap(a.__isset, b.__isset);
+}
+
+AesGcmV1::AesGcmV1(const AesGcmV1& other179) {
+ aad_prefix = other179.aad_prefix;
+ aad_file_unique = other179.aad_file_unique;
+ supply_aad_prefix = other179.supply_aad_prefix;
+ __isset = other179.__isset;
+}
+AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other180) {
+ aad_prefix = other180.aad_prefix;
+ aad_file_unique = other180.aad_file_unique;
+ supply_aad_prefix = other180.supply_aad_prefix;
+ __isset = other180.__isset;
+ return *this;
+}
+void AesGcmV1::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "AesGcmV1(";
+ out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
+ out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
+ out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
+ out << ")";
+}
+
+
+AesGcmCtrV1::~AesGcmCtrV1() noexcept {
+}
+
+
+void AesGcmCtrV1::__set_aad_prefix(const std::string& val) {
+ this->aad_prefix = val;
+__isset.aad_prefix = true;
+}
+
+void AesGcmCtrV1::__set_aad_file_unique(const std::string& val) {
+ this->aad_file_unique = val;
+__isset.aad_file_unique = true;
+}
+
+void AesGcmCtrV1::__set_supply_aad_prefix(const bool val) {
+ this->supply_aad_prefix = val;
+__isset.supply_aad_prefix = true;
+}
+std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t AesGcmCtrV1::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_prefix);
+ this->__isset.aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_file_unique);
+ this->__isset.aad_file_unique = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->supply_aad_prefix);
+ this->__isset.supply_aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t AesGcmCtrV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("AesGcmCtrV1");
+
+ if (this->__isset.aad_prefix) {
+ xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.aad_file_unique) {
+ xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->aad_file_unique);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.supply_aad_prefix) {
+ xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->supply_aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) {
+ using ::std::swap;
+ swap(a.aad_prefix, b.aad_prefix);
+ swap(a.aad_file_unique, b.aad_file_unique);
+ swap(a.supply_aad_prefix, b.supply_aad_prefix);
+ swap(a.__isset, b.__isset);
+}
+
+AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other181) {
+ aad_prefix = other181.aad_prefix;
+ aad_file_unique = other181.aad_file_unique;
+ supply_aad_prefix = other181.supply_aad_prefix;
+ __isset = other181.__isset;
+}
+AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other182) {
+ aad_prefix = other182.aad_prefix;
+ aad_file_unique = other182.aad_file_unique;
+ supply_aad_prefix = other182.supply_aad_prefix;
+ __isset = other182.__isset;
+ return *this;
+}
+void AesGcmCtrV1::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "AesGcmCtrV1(";
+ out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
+ out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
+ out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
+ out << ")";
+}
+
+
+EncryptionAlgorithm::~EncryptionAlgorithm() noexcept {
+}
+
+
+void EncryptionAlgorithm::__set_AES_GCM_V1(const AesGcmV1& val) {
+ this->AES_GCM_V1 = val;
+__isset.AES_GCM_V1 = true;
+}
+
+void EncryptionAlgorithm::__set_AES_GCM_CTR_V1(const AesGcmCtrV1& val) {
+ this->AES_GCM_CTR_V1 = val;
+__isset.AES_GCM_CTR_V1 = true;
+}
+std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EncryptionAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->AES_GCM_V1.read(iprot);
+ this->__isset.AES_GCM_V1 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->AES_GCM_CTR_V1.read(iprot);
+ this->__isset.AES_GCM_CTR_V1 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EncryptionAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionAlgorithm");
+
+ if (this->__isset.AES_GCM_V1) {
+ xfer += oprot->writeFieldBegin("AES_GCM_V1", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->AES_GCM_V1.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.AES_GCM_CTR_V1) {
+ xfer += oprot->writeFieldBegin("AES_GCM_CTR_V1", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->AES_GCM_CTR_V1.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) {
+ using ::std::swap;
+ swap(a.AES_GCM_V1, b.AES_GCM_V1);
+ swap(a.AES_GCM_CTR_V1, b.AES_GCM_CTR_V1);
+ swap(a.__isset, b.__isset);
+}
+
+EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other183) {
+ AES_GCM_V1 = other183.AES_GCM_V1;
+ AES_GCM_CTR_V1 = other183.AES_GCM_CTR_V1;
+ __isset = other183.__isset;
+}
+EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other184) {
+ AES_GCM_V1 = other184.AES_GCM_V1;
+ AES_GCM_CTR_V1 = other184.AES_GCM_CTR_V1;
+ __isset = other184.__isset;
+ return *this;
+}
+void EncryptionAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionAlgorithm(";
+ out << "AES_GCM_V1="; (__isset.AES_GCM_V1 ? (out << to_string(AES_GCM_V1)) : (out << "<null>"));
+ out << ", " << "AES_GCM_CTR_V1="; (__isset.AES_GCM_CTR_V1 ? (out << to_string(AES_GCM_CTR_V1)) : (out << "<null>"));
+ out << ")";
+}
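
[editor's note] EncryptionAlgorithm is declared as a union in the Parquet Thrift IDL, so the generated struct carries both alternatives and relies on __isset to record which branch is active; setting exactly one is the caller's responsibility. A minimal sketch, not part of the commit:

#include "parquet_types.h"  // generated header; adjust the path to your build

int main() {
  parquet::format::AesGcmV1 gcm;
  gcm.__set_supply_aad_prefix(false);  // optional; flips __isset.supply_aad_prefix

  parquet::format::EncryptionAlgorithm algo;
  algo.__set_AES_GCM_V1(gcm);  // marks __isset.AES_GCM_V1; AES_GCM_CTR_V1 stays unset

  // write() only emits the branch whose __isset bit is true, so a correctly
  // populated value serializes as a single field.
  return 0;
}
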
+
+
+FileMetaData::~FileMetaData() noexcept {
+}
+
+
+void FileMetaData::__set_version(const int32_t val) {
+ this->version = val;
+}
+
+void FileMetaData::__set_schema(const std::vector<SchemaElement> & val) {
+ this->schema = val;
+}
+
+void FileMetaData::__set_num_rows(const int64_t val) {
+ this->num_rows = val;
+}
+
+void FileMetaData::__set_row_groups(const std::vector<RowGroup> & val) {
+ this->row_groups = val;
+}
+
+void FileMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
+ this->key_value_metadata = val;
+__isset.key_value_metadata = true;
+}
+
+void FileMetaData::__set_created_by(const std::string& val) {
+ this->created_by = val;
+__isset.created_by = true;
+}
+
+void FileMetaData::__set_column_orders(const std::vector<ColumnOrder> & val) {
+ this->column_orders = val;
+__isset.column_orders = true;
+}
+
+void FileMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
+ this->encryption_algorithm = val;
+__isset.encryption_algorithm = true;
+}
+
+void FileMetaData::__set_footer_signing_key_metadata(const std::string& val) {
+ this->footer_signing_key_metadata = val;
+__isset.footer_signing_key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const FileMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_version = false;
+ bool isset_schema = false;
+ bool isset_num_rows = false;
+ bool isset_row_groups = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->version);
+ isset_version = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->schema.clear();
+ uint32_t _size185;
+ ::apache::thrift::protocol::TType _etype188;
+ xfer += iprot->readListBegin(_etype188, _size185);
+ this->schema.resize(_size185);
+ uint32_t _i189;
+ for (_i189 = 0; _i189 < _size185; ++_i189)
+ {
+ xfer += this->schema[_i189].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->row_groups.clear();
+ uint32_t _size190;
+ ::apache::thrift::protocol::TType _etype193;
+ xfer += iprot->readListBegin(_etype193, _size190);
+ this->row_groups.resize(_size190);
+ uint32_t _i194;
+ for (_i194 = 0; _i194 < _size190; ++_i194)
+ {
+ xfer += this->row_groups[_i194].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_row_groups = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->key_value_metadata.clear();
+ uint32_t _size195;
+ ::apache::thrift::protocol::TType _etype198;
+ xfer += iprot->readListBegin(_etype198, _size195);
+ this->key_value_metadata.resize(_size195);
+ uint32_t _i199;
+ for (_i199 = 0; _i199 < _size195; ++_i199)
+ {
+ xfer += this->key_value_metadata[_i199].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.key_value_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->created_by);
+ this->__isset.created_by = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->column_orders.clear();
+ uint32_t _size200;
+ ::apache::thrift::protocol::TType _etype203;
+ xfer += iprot->readListBegin(_etype203, _size200);
+ this->column_orders.resize(_size200);
+ uint32_t _i204;
+ for (_i204 = 0; _i204 < _size200; ++_i204)
+ {
+ xfer += this->column_orders[_i204].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.column_orders = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->encryption_algorithm.read(iprot);
+ this->__isset.encryption_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->footer_signing_key_metadata);
+ this->__isset.footer_signing_key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_version)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_row_groups)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("FileMetaData");
+
+ xfer += oprot->writeFieldBegin("version", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->version);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->schema.size()));
+ std::vector<SchemaElement> ::const_iterator _iter205;
+ for (_iter205 = this->schema.begin(); _iter205 != this->schema.end(); ++_iter205)
+ {
+ xfer += (*_iter205).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->row_groups.size()));
+ std::vector<RowGroup> ::const_iterator _iter206;
+ for (_iter206 = this->row_groups.begin(); _iter206 != this->row_groups.end(); ++_iter206)
+ {
+ xfer += (*_iter206).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_value_metadata) {
+ xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
+ std::vector<KeyValue> ::const_iterator _iter207;
+ for (_iter207 = this->key_value_metadata.begin(); _iter207 != this->key_value_metadata.end(); ++_iter207)
+ {
+ xfer += (*_iter207).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.created_by) {
+ xfer += oprot->writeFieldBegin("created_by", ::apache::thrift::protocol::T_STRING, 6);
+ xfer += oprot->writeString(this->created_by);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_orders) {
+ xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->column_orders.size()));
+ std::vector<ColumnOrder> ::const_iterator _iter208;
+ for (_iter208 = this->column_orders.begin(); _iter208 != this->column_orders.end(); ++_iter208)
+ {
+ xfer += (*_iter208).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encryption_algorithm) {
+ xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->encryption_algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.footer_signing_key_metadata) {
+ xfer += oprot->writeFieldBegin("footer_signing_key_metadata", ::apache::thrift::protocol::T_STRING, 9);
+ xfer += oprot->writeBinary(this->footer_signing_key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(FileMetaData &a, FileMetaData &b) {
+ using ::std::swap;
+ swap(a.version, b.version);
+ swap(a.schema, b.schema);
+ swap(a.num_rows, b.num_rows);
+ swap(a.row_groups, b.row_groups);
+ swap(a.key_value_metadata, b.key_value_metadata);
+ swap(a.created_by, b.created_by);
+ swap(a.column_orders, b.column_orders);
+ swap(a.encryption_algorithm, b.encryption_algorithm);
+ swap(a.footer_signing_key_metadata, b.footer_signing_key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+FileMetaData::FileMetaData(const FileMetaData& other209) {
+ version = other209.version;
+ schema = other209.schema;
+ num_rows = other209.num_rows;
+ row_groups = other209.row_groups;
+ key_value_metadata = other209.key_value_metadata;
+ created_by = other209.created_by;
+ column_orders = other209.column_orders;
+ encryption_algorithm = other209.encryption_algorithm;
+ footer_signing_key_metadata = other209.footer_signing_key_metadata;
+ __isset = other209.__isset;
+}
+FileMetaData& FileMetaData::operator=(const FileMetaData& other210) {
+ version = other210.version;
+ schema = other210.schema;
+ num_rows = other210.num_rows;
+ row_groups = other210.row_groups;
+ key_value_metadata = other210.key_value_metadata;
+ created_by = other210.created_by;
+ column_orders = other210.column_orders;
+ encryption_algorithm = other210.encryption_algorithm;
+ footer_signing_key_metadata = other210.footer_signing_key_metadata;
+ __isset = other210.__isset;
+ return *this;
+}
+void FileMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "FileMetaData(";
+ out << "version=" << to_string(version);
+ out << ", " << "schema=" << to_string(schema);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "row_groups=" << to_string(row_groups);
+ out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
+ out << ", " << "created_by="; (__isset.created_by ? (out << to_string(created_by)) : (out << "<null>"));
+ out << ", " << "column_orders="; (__isset.column_orders ? (out << to_string(column_orders)) : (out << "<null>"));
+ out << ", " << "encryption_algorithm="; (__isset.encryption_algorithm ? (out << to_string(encryption_algorithm)) : (out << "<null>"));
+ out << ", " << "footer_signing_key_metadata="; (__isset.footer_signing_key_metadata ? (out << to_string(footer_signing_key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+
+FileCryptoMetaData::~FileCryptoMetaData() noexcept {
+}
+
+
+void FileCryptoMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
+ this->encryption_algorithm = val;
+}
+
+void FileCryptoMetaData::__set_key_metadata(const std::string& val) {
+ this->key_metadata = val;
+__isset.key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t FileCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_encryption_algorithm = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->encryption_algorithm.read(iprot);
+ isset_encryption_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->key_metadata);
+ this->__isset.key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_encryption_algorithm)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t FileCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("FileCryptoMetaData");
+
+ xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->encryption_algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_metadata) {
+ xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) {
+ using ::std::swap;
+ swap(a.encryption_algorithm, b.encryption_algorithm);
+ swap(a.key_metadata, b.key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other211) {
+ encryption_algorithm = other211.encryption_algorithm;
+ key_metadata = other211.key_metadata;
+ __isset = other211.__isset;
+}
+FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other212) {
+ encryption_algorithm = other212.encryption_algorithm;
+ key_metadata = other212.key_metadata;
+ __isset = other212.__isset;
+ return *this;
+}
+void FileCryptoMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "FileCryptoMetaData(";
+ out << "encryption_algorithm=" << to_string(encryption_algorithm);
+ out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+}} // namespace
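
[editor's note] The close of the generated .cpp is a natural point to show how these types are exercised end to end. Parquet footers are encoded with Thrift's compact protocol, and the generated write()/read() methods plug directly into it. A hedged round-trip sketch — not part of the commit, assuming Thrift 0.13 C++ headers and the generated parquet_types.h are available:

#include <memory>
#include <thrift/protocol/TCompactProtocol.h>
#include <thrift/transport/TBufferTransports.h>
#include "parquet_types.h"  // generated header; adjust the path to your build

int main() {
  using apache::thrift::protocol::TCompactProtocol;
  using apache::thrift::transport::TMemoryBuffer;

  parquet::format::FileMetaData meta;
  meta.__set_version(2);
  meta.__set_schema({});      // required fields 1-4 are always written,
  meta.__set_num_rows(0);     // so even a skeletal value round-trips
  meta.__set_row_groups({});

  // Serialize into an in-memory transport; write() returns bytes written.
  auto buf = std::make_shared<TMemoryBuffer>();
  TCompactProtocol out(buf);
  meta.write(&out);

  // Deserialize from the same buffer. read() enforces the required fields
  // (version, schema, num_rows, row_groups) and throws a TProtocolException
  // with INVALID_DATA if any is missing from the stream.
  parquet::format::FileMetaData decoded;
  TCompactProtocol in(buf);
  decoded.read(&in);
  return 0;
}
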
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
index 3d7edd40983..c48383fa4d5 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
@@ -1,2917 +1,2917 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#ifndef parquet_TYPES_H
-#define parquet_TYPES_H
-
-#include <iosfwd>
-
-#include <thrift/Thrift.h>
-#include <thrift/TApplicationException.h>
-#include <thrift/TBase.h>
-#include <thrift/protocol/TProtocol.h>
-#include <thrift/transport/TTransport.h>
-
-#include <functional>
-#include <memory>
-
-#include "parquet/windows_compatibility.h"
-
-namespace parquet { namespace format {
-
-struct Type {
- enum type {
- BOOLEAN = 0,
- INT32 = 1,
- INT64 = 2,
- INT96 = 3,
- FLOAT = 4,
- DOUBLE = 5,
- BYTE_ARRAY = 6,
- FIXED_LEN_BYTE_ARRAY = 7
- };
-};
-
-extern const std::map<int, const char*> _Type_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const Type::type& val);
-
-std::string to_string(const Type::type& val);
-
-struct ConvertedType {
- enum type {
- UTF8 = 0,
- MAP = 1,
- MAP_KEY_VALUE = 2,
- LIST = 3,
- ENUM = 4,
- DECIMAL = 5,
- DATE = 6,
- TIME_MILLIS = 7,
- TIME_MICROS = 8,
- TIMESTAMP_MILLIS = 9,
- TIMESTAMP_MICROS = 10,
- UINT_8 = 11,
- UINT_16 = 12,
- UINT_32 = 13,
- UINT_64 = 14,
- INT_8 = 15,
- INT_16 = 16,
- INT_32 = 17,
- INT_64 = 18,
- JSON = 19,
- BSON = 20,
- INTERVAL = 21
- };
-};
-
-extern const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val);
-
-std::string to_string(const ConvertedType::type& val);
-
-struct FieldRepetitionType {
- enum type {
- REQUIRED = 0,
- OPTIONAL = 1,
- REPEATED = 2
- };
-};
-
-extern const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val);
-
-std::string to_string(const FieldRepetitionType::type& val);
-
-struct Encoding {
- enum type {
- PLAIN = 0,
- PLAIN_DICTIONARY = 2,
- RLE = 3,
- BIT_PACKED = 4,
- DELTA_BINARY_PACKED = 5,
- DELTA_LENGTH_BYTE_ARRAY = 6,
- DELTA_BYTE_ARRAY = 7,
- RLE_DICTIONARY = 8,
- BYTE_STREAM_SPLIT = 9
- };
-};
-
-extern const std::map<int, const char*> _Encoding_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const Encoding::type& val);
-
-std::string to_string(const Encoding::type& val);
-
-struct CompressionCodec {
- enum type {
- UNCOMPRESSED = 0,
- SNAPPY = 1,
- GZIP = 2,
- LZO = 3,
- BROTLI = 4,
- LZ4 = 5,
- ZSTD = 6,
- LZ4_RAW = 7
- };
-};
-
-extern const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val);
-
-std::string to_string(const CompressionCodec::type& val);
-
-struct PageType {
- enum type {
- DATA_PAGE = 0,
- INDEX_PAGE = 1,
- DICTIONARY_PAGE = 2,
- DATA_PAGE_V2 = 3
- };
-};
-
-extern const std::map<int, const char*> _PageType_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const PageType::type& val);
-
-std::string to_string(const PageType::type& val);
-
-struct BoundaryOrder {
- enum type {
- UNORDERED = 0,
- ASCENDING = 1,
- DESCENDING = 2
- };
-};
-
-extern const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val);
-
-std::string to_string(const BoundaryOrder::type& val);
-
-class Statistics;
-
-class StringType;
-
-class UUIDType;
-
-class MapType;
-
-class ListType;
-
-class EnumType;
-
-class DateType;
-
-class NullType;
-
-class DecimalType;
-
-class MilliSeconds;
-
-class MicroSeconds;
-
-class NanoSeconds;
-
-class TimeUnit;
-
-class TimestampType;
-
-class TimeType;
-
-class IntType;
-
-class JsonType;
-
-class BsonType;
-
-class LogicalType;
-
-class SchemaElement;
-
-class DataPageHeader;
-
-class IndexPageHeader;
-
-class DictionaryPageHeader;
-
-class DataPageHeaderV2;
-
-class SplitBlockAlgorithm;
-
-class BloomFilterAlgorithm;
-
-class XxHash;
-
-class BloomFilterHash;
-
-class Uncompressed;
-
-class BloomFilterCompression;
-
-class BloomFilterHeader;
-
-class PageHeader;
-
-class KeyValue;
-
-class SortingColumn;
-
-class PageEncodingStats;
-
-class ColumnMetaData;
-
-class EncryptionWithFooterKey;
-
-class EncryptionWithColumnKey;
-
-class ColumnCryptoMetaData;
-
-class ColumnChunk;
-
-class RowGroup;
-
-class TypeDefinedOrder;
-
-class ColumnOrder;
-
-class PageLocation;
-
-class OffsetIndex;
-
-class ColumnIndex;
-
-class AesGcmV1;
-
-class AesGcmCtrV1;
-
-class EncryptionAlgorithm;
-
-class FileMetaData;
-
-class FileCryptoMetaData;
-
-typedef struct _Statistics__isset {
- _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false) {}
- bool max :1;
- bool min :1;
- bool null_count :1;
- bool distinct_count :1;
- bool max_value :1;
- bool min_value :1;
-} _Statistics__isset;
-
-class Statistics : public virtual ::apache::thrift::TBase {
- public:
-
- Statistics(const Statistics&);
- Statistics& operator=(const Statistics&);
- Statistics() : max(), min(), null_count(0), distinct_count(0), max_value(), min_value() {
- }
-
- virtual ~Statistics() noexcept;
- std::string max;
- std::string min;
- int64_t null_count;
- int64_t distinct_count;
- std::string max_value;
- std::string min_value;
-
- _Statistics__isset __isset;
-
- void __set_max(const std::string& val);
-
- void __set_min(const std::string& val);
-
- void __set_null_count(const int64_t val);
-
- void __set_distinct_count(const int64_t val);
-
- void __set_max_value(const std::string& val);
-
- void __set_min_value(const std::string& val);
-
- bool operator == (const Statistics & rhs) const
- {
- if (__isset.max != rhs.__isset.max)
- return false;
- else if (__isset.max && !(max == rhs.max))
- return false;
- if (__isset.min != rhs.__isset.min)
- return false;
- else if (__isset.min && !(min == rhs.min))
- return false;
- if (__isset.null_count != rhs.__isset.null_count)
- return false;
- else if (__isset.null_count && !(null_count == rhs.null_count))
- return false;
- if (__isset.distinct_count != rhs.__isset.distinct_count)
- return false;
- else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count))
- return false;
- if (__isset.max_value != rhs.__isset.max_value)
- return false;
- else if (__isset.max_value && !(max_value == rhs.max_value))
- return false;
- if (__isset.min_value != rhs.__isset.min_value)
- return false;
- else if (__isset.min_value && !(min_value == rhs.min_value))
- return false;
- return true;
- }
- bool operator != (const Statistics &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const Statistics & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(Statistics &a, Statistics &b);
-
-std::ostream& operator<<(std::ostream& out, const Statistics& obj);
-
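-// --- Editor's sketch; illustrative comments, not Thrift-compiler output. ---
-// Each optional field above is paired with a bit in _Statistics__isset: the
-// __set_* helpers assign the member and flip its bit, and operator== only
-// compares members whose bits agree. Note that min/max are the deprecated
-// statistics fields; min_value/max_value are their ColumnOrder-aware
-// replacements. Hypothetical usage (names and literals are invented):
-//
-//   Statistics stats;
-//   stats.__set_null_count(0);           // also sets __isset.null_count
-//   stats.__set_min_value("aardvark");   // raw encoded bytes of the minimum
-//   bool bounded = stats.__isset.min_value && stats.__isset.max_value;  // false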
-
-class StringType : public virtual ::apache::thrift::TBase {
- public:
-
- StringType(const StringType&);
- StringType& operator=(const StringType&);
- StringType() {
- }
-
- virtual ~StringType() noexcept;
-
- bool operator == (const StringType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const StringType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const StringType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(StringType &a, StringType &b);
-
-std::ostream& operator<<(std::ostream& out, const StringType& obj);
-
-
-class UUIDType : public virtual ::apache::thrift::TBase {
- public:
-
- UUIDType(const UUIDType&);
- UUIDType& operator=(const UUIDType&);
- UUIDType() {
- }
-
- virtual ~UUIDType() noexcept;
-
- bool operator == (const UUIDType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const UUIDType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const UUIDType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(UUIDType &a, UUIDType &b);
-
-std::ostream& operator<<(std::ostream& out, const UUIDType& obj);
-
-
-class MapType : public virtual ::apache::thrift::TBase {
- public:
-
- MapType(const MapType&);
- MapType& operator=(const MapType&);
- MapType() {
- }
-
- virtual ~MapType() noexcept;
-
- bool operator == (const MapType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const MapType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const MapType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(MapType &a, MapType &b);
-
-std::ostream& operator<<(std::ostream& out, const MapType& obj);
-
-
-class ListType : public virtual ::apache::thrift::TBase {
- public:
-
- ListType(const ListType&);
- ListType& operator=(const ListType&);
- ListType() {
- }
-
- virtual ~ListType() noexcept;
-
- bool operator == (const ListType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const ListType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ListType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ListType &a, ListType &b);
-
-std::ostream& operator<<(std::ostream& out, const ListType& obj);
-
-
-class EnumType : public virtual ::apache::thrift::TBase {
- public:
-
- EnumType(const EnumType&);
- EnumType& operator=(const EnumType&);
- EnumType() {
- }
-
- virtual ~EnumType() noexcept;
-
- bool operator == (const EnumType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const EnumType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EnumType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EnumType &a, EnumType &b);
-
-std::ostream& operator<<(std::ostream& out, const EnumType& obj);
-
-
-class DateType : public virtual ::apache::thrift::TBase {
- public:
-
- DateType(const DateType&);
- DateType& operator=(const DateType&);
- DateType() {
- }
-
- virtual ~DateType() noexcept;
-
- bool operator == (const DateType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const DateType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DateType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DateType &a, DateType &b);
-
-std::ostream& operator<<(std::ostream& out, const DateType& obj);
-
-
-class NullType : public virtual ::apache::thrift::TBase {
- public:
-
- NullType(const NullType&);
- NullType& operator=(const NullType&);
- NullType() {
- }
-
- virtual ~NullType() noexcept;
-
- bool operator == (const NullType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const NullType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const NullType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(NullType &a, NullType &b);
-
-std::ostream& operator<<(std::ostream& out, const NullType& obj);
-
-
-class DecimalType : public virtual ::apache::thrift::TBase {
- public:
-
- DecimalType(const DecimalType&);
- DecimalType& operator=(const DecimalType&);
- DecimalType() : scale(0), precision(0) {
- }
-
- virtual ~DecimalType() noexcept;
- int32_t scale;
- int32_t precision;
-
- void __set_scale(const int32_t val);
-
- void __set_precision(const int32_t val);
-
- bool operator == (const DecimalType & rhs) const
- {
- if (!(scale == rhs.scale))
- return false;
- if (!(precision == rhs.precision))
- return false;
- return true;
- }
- bool operator != (const DecimalType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DecimalType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DecimalType &a, DecimalType &b);
-
-std::ostream& operator<<(std::ostream& out, const DecimalType& obj);
-
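-// Editor's note (sketch): DecimalType describes fixed-point interpretation
-// of an integer or binary column: the stored unscaled value represents
-// unscaled * 10^(-scale), constrained to at most `precision` digits. For
-// example (hypothetical numbers), precision = 5 and scale = 2 let an
-// unscaled value of 12345 stand for 123.45.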
-
-class MilliSeconds : public virtual ::apache::thrift::TBase {
- public:
-
- MilliSeconds(const MilliSeconds&);
- MilliSeconds& operator=(const MilliSeconds&);
- MilliSeconds() {
- }
-
- virtual ~MilliSeconds() noexcept;
-
- bool operator == (const MilliSeconds & /* rhs */) const
- {
- return true;
- }
- bool operator != (const MilliSeconds &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const MilliSeconds & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(MilliSeconds &a, MilliSeconds &b);
-
-std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj);
-
-
-class MicroSeconds : public virtual ::apache::thrift::TBase {
- public:
-
- MicroSeconds(const MicroSeconds&);
- MicroSeconds& operator=(const MicroSeconds&);
- MicroSeconds() {
- }
-
- virtual ~MicroSeconds() noexcept;
-
- bool operator == (const MicroSeconds & /* rhs */) const
- {
- return true;
- }
- bool operator != (const MicroSeconds &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const MicroSeconds & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(MicroSeconds &a, MicroSeconds &b);
-
-std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj);
-
-
-class NanoSeconds : public virtual ::apache::thrift::TBase {
- public:
-
- NanoSeconds(const NanoSeconds&);
- NanoSeconds& operator=(const NanoSeconds&);
- NanoSeconds() {
- }
-
- virtual ~NanoSeconds() noexcept;
-
- bool operator == (const NanoSeconds & /* rhs */) const
- {
- return true;
- }
- bool operator != (const NanoSeconds &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const NanoSeconds & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(NanoSeconds &a, NanoSeconds &b);
-
-std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj);
-
-typedef struct _TimeUnit__isset {
- _TimeUnit__isset() : MILLIS(false), MICROS(false), NANOS(false) {}
- bool MILLIS :1;
- bool MICROS :1;
- bool NANOS :1;
-} _TimeUnit__isset;
-
-class TimeUnit : public virtual ::apache::thrift::TBase {
- public:
-
- TimeUnit(const TimeUnit&);
- TimeUnit& operator=(const TimeUnit&);
- TimeUnit() {
- }
-
- virtual ~TimeUnit() noexcept;
- MilliSeconds MILLIS;
- MicroSeconds MICROS;
- NanoSeconds NANOS;
-
- _TimeUnit__isset __isset;
-
- void __set_MILLIS(const MilliSeconds& val);
-
- void __set_MICROS(const MicroSeconds& val);
-
- void __set_NANOS(const NanoSeconds& val);
-
- bool operator == (const TimeUnit & rhs) const
- {
- if (__isset.MILLIS != rhs.__isset.MILLIS)
- return false;
- else if (__isset.MILLIS && !(MILLIS == rhs.MILLIS))
- return false;
- if (__isset.MICROS != rhs.__isset.MICROS)
- return false;
- else if (__isset.MICROS && !(MICROS == rhs.MICROS))
- return false;
- if (__isset.NANOS != rhs.__isset.NANOS)
- return false;
- else if (__isset.NANOS && !(NANOS == rhs.NANOS))
- return false;
- return true;
- }
- bool operator != (const TimeUnit &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TimeUnit & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TimeUnit &a, TimeUnit &b);
-
-std::ostream& operator<<(std::ostream& out, const TimeUnit& obj);
-
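-// Editor's sketch (not generated code): TimeUnit models a Thrift union, so
-// exactly one of MILLIS/MICROS/NANOS is expected to be set; the members are
-// empty marker structs and only the __isset bit carries information.
-// Hypothetical usage:
-//
-//   TimeUnit unit;
-//   unit.__set_MICROS(MicroSeconds());   // choose microsecond resolution
-//   bool micros = unit.__isset.MICROS;   // true; MILLIS and NANOS stay unset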
-
-class TimestampType : public virtual ::apache::thrift::TBase {
- public:
-
- TimestampType(const TimestampType&);
- TimestampType& operator=(const TimestampType&);
- TimestampType() : isAdjustedToUTC(0) {
- }
-
- virtual ~TimestampType() noexcept;
- bool isAdjustedToUTC;
- TimeUnit unit;
-
- void __set_isAdjustedToUTC(const bool val);
-
- void __set_unit(const TimeUnit& val);
-
- bool operator == (const TimestampType & rhs) const
- {
- if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
- return false;
- if (!(unit == rhs.unit))
- return false;
- return true;
- }
- bool operator != (const TimestampType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TimestampType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TimestampType &a, TimestampType &b);
-
-std::ostream& operator<<(std::ostream& out, const TimestampType& obj);
-
-
-class TimeType : public virtual ::apache::thrift::TBase {
- public:
-
- TimeType(const TimeType&);
- TimeType& operator=(const TimeType&);
- TimeType() : isAdjustedToUTC(0) {
- }
-
- virtual ~TimeType() noexcept;
- bool isAdjustedToUTC;
- TimeUnit unit;
-
- void __set_isAdjustedToUTC(const bool val);
-
- void __set_unit(const TimeUnit& val);
-
- bool operator == (const TimeType & rhs) const
- {
- if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
- return false;
- if (!(unit == rhs.unit))
- return false;
- return true;
- }
- bool operator != (const TimeType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TimeType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TimeType &a, TimeType &b);
-
-std::ostream& operator<<(std::ostream& out, const TimeType& obj);
-
-
-class IntType : public virtual ::apache::thrift::TBase {
- public:
-
- IntType(const IntType&);
- IntType& operator=(const IntType&);
- IntType() : bitWidth(0), isSigned(0) {
- }
-
- virtual ~IntType() noexcept;
- int8_t bitWidth;
- bool isSigned;
-
- void __set_bitWidth(const int8_t val);
-
- void __set_isSigned(const bool val);
-
- bool operator == (const IntType & rhs) const
- {
- if (!(bitWidth == rhs.bitWidth))
- return false;
- if (!(isSigned == rhs.isSigned))
- return false;
- return true;
- }
- bool operator != (const IntType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const IntType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(IntType &a, IntType &b);
-
-std::ostream& operator<<(std::ostream& out, const IntType& obj);
-
-
-class JsonType : public virtual ::apache::thrift::TBase {
- public:
-
- JsonType(const JsonType&);
- JsonType& operator=(const JsonType&);
- JsonType() {
- }
-
- virtual ~JsonType() noexcept;
-
- bool operator == (const JsonType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const JsonType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const JsonType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(JsonType &a, JsonType &b);
-
-std::ostream& operator<<(std::ostream& out, const JsonType& obj);
-
-
-class BsonType : public virtual ::apache::thrift::TBase {
- public:
-
- BsonType(const BsonType&);
- BsonType& operator=(const BsonType&);
- BsonType() {
- }
-
- virtual ~BsonType() noexcept;
-
- bool operator == (const BsonType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const BsonType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BsonType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BsonType &a, BsonType &b);
-
-std::ostream& operator<<(std::ostream& out, const BsonType& obj);
-
-typedef struct _LogicalType__isset {
- _LogicalType__isset() : STRING(false), MAP(false), LIST(false), ENUM(false), DECIMAL(false), DATE(false), TIME(false), TIMESTAMP(false), INTEGER(false), UNKNOWN(false), JSON(false), BSON(false), UUID(false) {}
- bool STRING :1;
- bool MAP :1;
- bool LIST :1;
- bool ENUM :1;
- bool DECIMAL :1;
- bool DATE :1;
- bool TIME :1;
- bool TIMESTAMP :1;
- bool INTEGER :1;
- bool UNKNOWN :1;
- bool JSON :1;
- bool BSON :1;
- bool UUID :1;
-} _LogicalType__isset;
-
-class LogicalType : public virtual ::apache::thrift::TBase {
- public:
-
- LogicalType(const LogicalType&);
- LogicalType& operator=(const LogicalType&);
- LogicalType() {
- }
-
- virtual ~LogicalType() noexcept;
- StringType STRING;
- MapType MAP;
- ListType LIST;
- EnumType ENUM;
- DecimalType DECIMAL;
- DateType DATE;
- TimeType TIME;
- TimestampType TIMESTAMP;
- IntType INTEGER;
- NullType UNKNOWN;
- JsonType JSON;
- BsonType BSON;
- UUIDType UUID;
-
- _LogicalType__isset __isset;
-
- void __set_STRING(const StringType& val);
-
- void __set_MAP(const MapType& val);
-
- void __set_LIST(const ListType& val);
-
- void __set_ENUM(const EnumType& val);
-
- void __set_DECIMAL(const DecimalType& val);
-
- void __set_DATE(const DateType& val);
-
- void __set_TIME(const TimeType& val);
-
- void __set_TIMESTAMP(const TimestampType& val);
-
- void __set_INTEGER(const IntType& val);
-
- void __set_UNKNOWN(const NullType& val);
-
- void __set_JSON(const JsonType& val);
-
- void __set_BSON(const BsonType& val);
-
- void __set_UUID(const UUIDType& val);
-
- bool operator == (const LogicalType & rhs) const
- {
- if (__isset.STRING != rhs.__isset.STRING)
- return false;
- else if (__isset.STRING && !(STRING == rhs.STRING))
- return false;
- if (__isset.MAP != rhs.__isset.MAP)
- return false;
- else if (__isset.MAP && !(MAP == rhs.MAP))
- return false;
- if (__isset.LIST != rhs.__isset.LIST)
- return false;
- else if (__isset.LIST && !(LIST == rhs.LIST))
- return false;
- if (__isset.ENUM != rhs.__isset.ENUM)
- return false;
- else if (__isset.ENUM && !(ENUM == rhs.ENUM))
- return false;
- if (__isset.DECIMAL != rhs.__isset.DECIMAL)
- return false;
- else if (__isset.DECIMAL && !(DECIMAL == rhs.DECIMAL))
- return false;
- if (__isset.DATE != rhs.__isset.DATE)
- return false;
- else if (__isset.DATE && !(DATE == rhs.DATE))
- return false;
- if (__isset.TIME != rhs.__isset.TIME)
- return false;
- else if (__isset.TIME && !(TIME == rhs.TIME))
- return false;
- if (__isset.TIMESTAMP != rhs.__isset.TIMESTAMP)
- return false;
- else if (__isset.TIMESTAMP && !(TIMESTAMP == rhs.TIMESTAMP))
- return false;
- if (__isset.INTEGER != rhs.__isset.INTEGER)
- return false;
- else if (__isset.INTEGER && !(INTEGER == rhs.INTEGER))
- return false;
- if (__isset.UNKNOWN != rhs.__isset.UNKNOWN)
- return false;
- else if (__isset.UNKNOWN && !(UNKNOWN == rhs.UNKNOWN))
- return false;
- if (__isset.JSON != rhs.__isset.JSON)
- return false;
- else if (__isset.JSON && !(JSON == rhs.JSON))
- return false;
- if (__isset.BSON != rhs.__isset.BSON)
- return false;
- else if (__isset.BSON && !(BSON == rhs.BSON))
- return false;
- if (__isset.UUID != rhs.__isset.UUID)
- return false;
- else if (__isset.UUID && !(UUID == rhs.UUID))
- return false;
- return true;
- }
- bool operator != (const LogicalType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const LogicalType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(LogicalType &a, LogicalType &b);
-
-std::ostream& operator<<(std::ostream& out, const LogicalType& obj);
-
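-// Editor's sketch (illustrative only): LogicalType is the one-of wrapper a
-// writer fills with exactly one member. Assembling the logical type for a
-// UTC-normalized millisecond timestamp might look like:
-//
-//   TimeUnit unit;
-//   unit.__set_MILLIS(MilliSeconds());
-//   TimestampType ts;
-//   ts.__set_isAdjustedToUTC(true);
-//   ts.__set_unit(unit);
-//   LogicalType logical;
-//   logical.__set_TIMESTAMP(ts);
-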
-typedef struct _SchemaElement__isset {
- _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false), field_id(false), logicalType(false) {}
- bool type :1;
- bool type_length :1;
- bool repetition_type :1;
- bool num_children :1;
- bool converted_type :1;
- bool scale :1;
- bool precision :1;
- bool field_id :1;
- bool logicalType :1;
-} _SchemaElement__isset;
-
-class SchemaElement : public virtual ::apache::thrift::TBase {
- public:
-
- SchemaElement(const SchemaElement&);
- SchemaElement& operator=(const SchemaElement&);
- SchemaElement() : type((Type::type)0), type_length(0), repetition_type((FieldRepetitionType::type)0), name(), num_children(0), converted_type((ConvertedType::type)0), scale(0), precision(0), field_id(0) {
- }
-
- virtual ~SchemaElement() noexcept;
- Type::type type;
- int32_t type_length;
- FieldRepetitionType::type repetition_type;
- std::string name;
- int32_t num_children;
- ConvertedType::type converted_type;
- int32_t scale;
- int32_t precision;
- int32_t field_id;
- LogicalType logicalType;
-
- _SchemaElement__isset __isset;
-
- void __set_type(const Type::type val);
-
- void __set_type_length(const int32_t val);
-
- void __set_repetition_type(const FieldRepetitionType::type val);
-
- void __set_name(const std::string& val);
-
- void __set_num_children(const int32_t val);
-
- void __set_converted_type(const ConvertedType::type val);
-
- void __set_scale(const int32_t val);
-
- void __set_precision(const int32_t val);
-
- void __set_field_id(const int32_t val);
-
- void __set_logicalType(const LogicalType& val);
-
- bool operator == (const SchemaElement & rhs) const
- {
- if (__isset.type != rhs.__isset.type)
- return false;
- else if (__isset.type && !(type == rhs.type))
- return false;
- if (__isset.type_length != rhs.__isset.type_length)
- return false;
- else if (__isset.type_length && !(type_length == rhs.type_length))
- return false;
- if (__isset.repetition_type != rhs.__isset.repetition_type)
- return false;
- else if (__isset.repetition_type && !(repetition_type == rhs.repetition_type))
- return false;
- if (!(name == rhs.name))
- return false;
- if (__isset.num_children != rhs.__isset.num_children)
- return false;
- else if (__isset.num_children && !(num_children == rhs.num_children))
- return false;
- if (__isset.converted_type != rhs.__isset.converted_type)
- return false;
- else if (__isset.converted_type && !(converted_type == rhs.converted_type))
- return false;
- if (__isset.scale != rhs.__isset.scale)
- return false;
- else if (__isset.scale && !(scale == rhs.scale))
- return false;
- if (__isset.precision != rhs.__isset.precision)
- return false;
- else if (__isset.precision && !(precision == rhs.precision))
- return false;
- if (__isset.field_id != rhs.__isset.field_id)
- return false;
- else if (__isset.field_id && !(field_id == rhs.field_id))
- return false;
- if (__isset.logicalType != rhs.__isset.logicalType)
- return false;
- else if (__isset.logicalType && !(logicalType == rhs.logicalType))
- return false;
- return true;
- }
- bool operator != (const SchemaElement &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const SchemaElement & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(SchemaElement &a, SchemaElement &b);
-
-std::ostream& operator<<(std::ostream& out, const SchemaElement& obj);
-
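-// Editor's sketch (hypothetical values): a leaf SchemaElement combines a
-// physical type, a repetition level and, optionally, a LogicalType, while a
-// group node sets num_children instead of type. E.g. a required INT64
-// timestamp leaf, reusing `logical` from the sketch above:
-//
-//   SchemaElement elem;
-//   elem.__set_name("event_time");
-//   elem.__set_type(Type::INT64);
-//   elem.__set_repetition_type(FieldRepetitionType::REQUIRED);
-//   elem.__set_logicalType(logical);
-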
-typedef struct _DataPageHeader__isset {
- _DataPageHeader__isset() : statistics(false) {}
- bool statistics :1;
-} _DataPageHeader__isset;
-
-class DataPageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- DataPageHeader(const DataPageHeader&);
- DataPageHeader& operator=(const DataPageHeader&);
- DataPageHeader() : num_values(0), encoding((Encoding::type)0), definition_level_encoding((Encoding::type)0), repetition_level_encoding((Encoding::type)0) {
- }
-
- virtual ~DataPageHeader() noexcept;
- int32_t num_values;
- Encoding::type encoding;
- Encoding::type definition_level_encoding;
- Encoding::type repetition_level_encoding;
- Statistics statistics;
-
- _DataPageHeader__isset __isset;
-
- void __set_num_values(const int32_t val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_definition_level_encoding(const Encoding::type val);
-
- void __set_repetition_level_encoding(const Encoding::type val);
-
- void __set_statistics(const Statistics& val);
-
- bool operator == (const DataPageHeader & rhs) const
- {
- if (!(num_values == rhs.num_values))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (!(definition_level_encoding == rhs.definition_level_encoding))
- return false;
- if (!(repetition_level_encoding == rhs.repetition_level_encoding))
- return false;
- if (__isset.statistics != rhs.__isset.statistics)
- return false;
- else if (__isset.statistics && !(statistics == rhs.statistics))
- return false;
- return true;
- }
- bool operator != (const DataPageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DataPageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DataPageHeader &a, DataPageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj);
-
-
-class IndexPageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- IndexPageHeader(const IndexPageHeader&);
- IndexPageHeader& operator=(const IndexPageHeader&);
- IndexPageHeader() {
- }
-
- virtual ~IndexPageHeader() noexcept;
-
- bool operator == (const IndexPageHeader & /* rhs */) const
- {
- return true;
- }
- bool operator != (const IndexPageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const IndexPageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(IndexPageHeader &a, IndexPageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj);
-
-typedef struct _DictionaryPageHeader__isset {
- _DictionaryPageHeader__isset() : is_sorted(false) {}
- bool is_sorted :1;
-} _DictionaryPageHeader__isset;
-
-class DictionaryPageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- DictionaryPageHeader(const DictionaryPageHeader&);
- DictionaryPageHeader& operator=(const DictionaryPageHeader&);
- DictionaryPageHeader() : num_values(0), encoding((Encoding::type)0), is_sorted(0) {
- }
-
- virtual ~DictionaryPageHeader() noexcept;
- int32_t num_values;
- Encoding::type encoding;
- bool is_sorted;
-
- _DictionaryPageHeader__isset __isset;
-
- void __set_num_values(const int32_t val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_is_sorted(const bool val);
-
- bool operator == (const DictionaryPageHeader & rhs) const
- {
- if (!(num_values == rhs.num_values))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (__isset.is_sorted != rhs.__isset.is_sorted)
- return false;
- else if (__isset.is_sorted && !(is_sorted == rhs.is_sorted))
- return false;
- return true;
- }
- bool operator != (const DictionaryPageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DictionaryPageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DictionaryPageHeader &a, DictionaryPageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj);
-
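-// Editorial note: unlike the other flags in these __isset structs,
-// is_compressed is initialized to true because the Thrift field declares a
-// default of true, so it counts as "set" even if a writer never assigns it.
-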
-typedef struct _DataPageHeaderV2__isset {
- _DataPageHeaderV2__isset() : is_compressed(true), statistics(false) {}
- bool is_compressed :1;
- bool statistics :1;
-} _DataPageHeaderV2__isset;
-
-class DataPageHeaderV2 : public virtual ::apache::thrift::TBase {
- public:
-
- DataPageHeaderV2(const DataPageHeaderV2&);
- DataPageHeaderV2& operator=(const DataPageHeaderV2&);
- DataPageHeaderV2() : num_values(0), num_nulls(0), num_rows(0), encoding((Encoding::type)0), definition_levels_byte_length(0), repetition_levels_byte_length(0), is_compressed(true) {
- }
-
- virtual ~DataPageHeaderV2() noexcept;
- int32_t num_values;
- int32_t num_nulls;
- int32_t num_rows;
- Encoding::type encoding;
- int32_t definition_levels_byte_length;
- int32_t repetition_levels_byte_length;
- bool is_compressed;
- Statistics statistics;
-
- _DataPageHeaderV2__isset __isset;
-
- void __set_num_values(const int32_t val);
-
- void __set_num_nulls(const int32_t val);
-
- void __set_num_rows(const int32_t val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_definition_levels_byte_length(const int32_t val);
-
- void __set_repetition_levels_byte_length(const int32_t val);
-
- void __set_is_compressed(const bool val);
-
- void __set_statistics(const Statistics& val);
-
- bool operator == (const DataPageHeaderV2 & rhs) const
- {
- if (!(num_values == rhs.num_values))
- return false;
- if (!(num_nulls == rhs.num_nulls))
- return false;
- if (!(num_rows == rhs.num_rows))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (!(definition_levels_byte_length == rhs.definition_levels_byte_length))
- return false;
- if (!(repetition_levels_byte_length == rhs.repetition_levels_byte_length))
- return false;
- if (__isset.is_compressed != rhs.__isset.is_compressed)
- return false;
- else if (__isset.is_compressed && !(is_compressed == rhs.is_compressed))
- return false;
- if (__isset.statistics != rhs.__isset.statistics)
- return false;
- else if (__isset.statistics && !(statistics == rhs.statistics))
- return false;
- return true;
- }
- bool operator != (const DataPageHeaderV2 &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DataPageHeaderV2 & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b);
-
-std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj);
-
-
-class SplitBlockAlgorithm : public virtual ::apache::thrift::TBase {
- public:
-
- SplitBlockAlgorithm(const SplitBlockAlgorithm&);
- SplitBlockAlgorithm& operator=(const SplitBlockAlgorithm&);
- SplitBlockAlgorithm() {
- }
-
- virtual ~SplitBlockAlgorithm() noexcept;
-
- bool operator == (const SplitBlockAlgorithm & /* rhs */) const
- {
- return true;
- }
- bool operator != (const SplitBlockAlgorithm &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const SplitBlockAlgorithm & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b);
-
-std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj);
-
-typedef struct _BloomFilterAlgorithm__isset {
- _BloomFilterAlgorithm__isset() : BLOCK(false) {}
- bool BLOCK :1;
-} _BloomFilterAlgorithm__isset;
-
-class BloomFilterAlgorithm : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterAlgorithm(const BloomFilterAlgorithm&);
- BloomFilterAlgorithm& operator=(const BloomFilterAlgorithm&);
- BloomFilterAlgorithm() {
- }
-
- virtual ~BloomFilterAlgorithm() noexcept;
- SplitBlockAlgorithm BLOCK;
-
- _BloomFilterAlgorithm__isset __isset;
-
- void __set_BLOCK(const SplitBlockAlgorithm& val);
-
- bool operator == (const BloomFilterAlgorithm & rhs) const
- {
- if (__isset.BLOCK != rhs.__isset.BLOCK)
- return false;
- else if (__isset.BLOCK && !(BLOCK == rhs.BLOCK))
- return false;
- return true;
- }
- bool operator != (const BloomFilterAlgorithm &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterAlgorithm & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj);
-
-
-class XxHash : public virtual ::apache::thrift::TBase {
- public:
-
- XxHash(const XxHash&);
- XxHash& operator=(const XxHash&);
- XxHash() {
- }
-
- virtual ~XxHash() noexcept;
-
- bool operator == (const XxHash & /* rhs */) const
- {
- return true;
- }
- bool operator != (const XxHash &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const XxHash & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(XxHash &a, XxHash &b);
-
-std::ostream& operator<<(std::ostream& out, const XxHash& obj);
-
-typedef struct _BloomFilterHash__isset {
- _BloomFilterHash__isset() : XXHASH(false) {}
- bool XXHASH :1;
-} _BloomFilterHash__isset;
-
-class BloomFilterHash : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterHash(const BloomFilterHash&);
- BloomFilterHash& operator=(const BloomFilterHash&);
- BloomFilterHash() {
- }
-
- virtual ~BloomFilterHash() noexcept;
- XxHash XXHASH;
-
- _BloomFilterHash__isset __isset;
-
- void __set_XXHASH(const XxHash& val);
-
- bool operator == (const BloomFilterHash & rhs) const
- {
- if (__isset.XXHASH != rhs.__isset.XXHASH)
- return false;
- else if (__isset.XXHASH && !(XXHASH == rhs.XXHASH))
- return false;
- return true;
- }
- bool operator != (const BloomFilterHash &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterHash & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterHash &a, BloomFilterHash &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj);
-
-
-class Uncompressed : public virtual ::apache::thrift::TBase {
- public:
-
- Uncompressed(const Uncompressed&);
- Uncompressed& operator=(const Uncompressed&);
- Uncompressed() {
- }
-
- virtual ~Uncompressed() noexcept;
-
- bool operator == (const Uncompressed & /* rhs */) const
- {
- return true;
- }
- bool operator != (const Uncompressed &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const Uncompressed & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(Uncompressed &a, Uncompressed &b);
-
-std::ostream& operator<<(std::ostream& out, const Uncompressed& obj);
-
-typedef struct _BloomFilterCompression__isset {
- _BloomFilterCompression__isset() : UNCOMPRESSED(false) {}
- bool UNCOMPRESSED :1;
-} _BloomFilterCompression__isset;
-
-class BloomFilterCompression : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterCompression(const BloomFilterCompression&);
- BloomFilterCompression& operator=(const BloomFilterCompression&);
- BloomFilterCompression() {
- }
-
- virtual ~BloomFilterCompression() noexcept;
- Uncompressed UNCOMPRESSED;
-
- _BloomFilterCompression__isset __isset;
-
- void __set_UNCOMPRESSED(const Uncompressed& val);
-
- bool operator == (const BloomFilterCompression & rhs) const
- {
- if (__isset.UNCOMPRESSED != rhs.__isset.UNCOMPRESSED)
- return false;
- else if (__isset.UNCOMPRESSED && !(UNCOMPRESSED == rhs.UNCOMPRESSED))
- return false;
- return true;
- }
- bool operator != (const BloomFilterCompression &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterCompression & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterCompression &a, BloomFilterCompression &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj);
-
-
-class BloomFilterHeader : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterHeader(const BloomFilterHeader&);
- BloomFilterHeader& operator=(const BloomFilterHeader&);
- BloomFilterHeader() : numBytes(0) {
- }
-
- virtual ~BloomFilterHeader() noexcept;
- int32_t numBytes;
- BloomFilterAlgorithm algorithm;
- BloomFilterHash hash;
- BloomFilterCompression compression;
-
- void __set_numBytes(const int32_t val);
-
- void __set_algorithm(const BloomFilterAlgorithm& val);
-
- void __set_hash(const BloomFilterHash& val);
-
- void __set_compression(const BloomFilterCompression& val);
-
- bool operator == (const BloomFilterHeader & rhs) const
- {
- if (!(numBytes == rhs.numBytes))
- return false;
- if (!(algorithm == rhs.algorithm))
- return false;
- if (!(hash == rhs.hash))
- return false;
- if (!(compression == rhs.compression))
- return false;
- return true;
- }
- bool operator != (const BloomFilterHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterHeader &a, BloomFilterHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj);
-
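-// Editor's note: the single-member unions above (BloomFilterAlgorithm,
-// BloomFilterHash, BloomFilterCompression) exist for forward compatibility;
-// at this schema revision the only defined options are the split-block
-// algorithm, xxHash, and uncompressed bitsets.
-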
-typedef struct _PageHeader__isset {
- _PageHeader__isset() : crc(false), data_page_header(false), index_page_header(false), dictionary_page_header(false), data_page_header_v2(false) {}
- bool crc :1;
- bool data_page_header :1;
- bool index_page_header :1;
- bool dictionary_page_header :1;
- bool data_page_header_v2 :1;
-} _PageHeader__isset;
-
-class PageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- PageHeader(const PageHeader&);
- PageHeader& operator=(const PageHeader&);
- PageHeader() : type((PageType::type)0), uncompressed_page_size(0), compressed_page_size(0), crc(0) {
- }
-
- virtual ~PageHeader() noexcept;
- PageType::type type;
- int32_t uncompressed_page_size;
- int32_t compressed_page_size;
- int32_t crc;
- DataPageHeader data_page_header;
- IndexPageHeader index_page_header;
- DictionaryPageHeader dictionary_page_header;
- DataPageHeaderV2 data_page_header_v2;
-
- _PageHeader__isset __isset;
-
- void __set_type(const PageType::type val);
-
- void __set_uncompressed_page_size(const int32_t val);
-
- void __set_compressed_page_size(const int32_t val);
-
- void __set_crc(const int32_t val);
-
- void __set_data_page_header(const DataPageHeader& val);
-
- void __set_index_page_header(const IndexPageHeader& val);
-
- void __set_dictionary_page_header(const DictionaryPageHeader& val);
-
- void __set_data_page_header_v2(const DataPageHeaderV2& val);
-
- bool operator == (const PageHeader & rhs) const
- {
- if (!(type == rhs.type))
- return false;
- if (!(uncompressed_page_size == rhs.uncompressed_page_size))
- return false;
- if (!(compressed_page_size == rhs.compressed_page_size))
- return false;
- if (__isset.crc != rhs.__isset.crc)
- return false;
- else if (__isset.crc && !(crc == rhs.crc))
- return false;
- if (__isset.data_page_header != rhs.__isset.data_page_header)
- return false;
- else if (__isset.data_page_header && !(data_page_header == rhs.data_page_header))
- return false;
- if (__isset.index_page_header != rhs.__isset.index_page_header)
- return false;
- else if (__isset.index_page_header && !(index_page_header == rhs.index_page_header))
- return false;
- if (__isset.dictionary_page_header != rhs.__isset.dictionary_page_header)
- return false;
- else if (__isset.dictionary_page_header && !(dictionary_page_header == rhs.dictionary_page_header))
- return false;
- if (__isset.data_page_header_v2 != rhs.__isset.data_page_header_v2)
- return false;
- else if (__isset.data_page_header_v2 && !(data_page_header_v2 == rhs.data_page_header_v2))
- return false;
- return true;
- }
- bool operator != (const PageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const PageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(PageHeader &a, PageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const PageHeader& obj);
-
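-// Editor's sketch (assumes Thrift's compact protocol, which Parquet uses on
-// the wire; `bytes` and `len` are hypothetical). Deserializing a PageHeader
-// from a raw buffer through the generated read() method:
-//
-//   #include <thrift/protocol/TCompactProtocol.h>
-//   #include <thrift/transport/TBufferTransports.h>
-//
-//   auto mem = std::make_shared<apache::thrift::transport::TMemoryBuffer>(
-//       const_cast<uint8_t*>(bytes), static_cast<uint32_t>(len));
-//   apache::thrift::protocol::TCompactProtocol proto(mem);
-//   PageHeader header;
-//   header.read(&proto);                 // consumes the serialized struct
-//   if (header.type == PageType::DATA_PAGE_V2) { /* dispatch on page kind */ }
-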
-typedef struct _KeyValue__isset {
- _KeyValue__isset() : value(false) {}
- bool value :1;
-} _KeyValue__isset;
-
-class KeyValue : public virtual ::apache::thrift::TBase {
- public:
-
- KeyValue(const KeyValue&);
- KeyValue& operator=(const KeyValue&);
- KeyValue() : key(), value() {
- }
-
- virtual ~KeyValue() noexcept;
- std::string key;
- std::string value;
-
- _KeyValue__isset __isset;
-
- void __set_key(const std::string& val);
-
- void __set_value(const std::string& val);
-
- bool operator == (const KeyValue & rhs) const
- {
- if (!(key == rhs.key))
- return false;
- if (__isset.value != rhs.__isset.value)
- return false;
- else if (__isset.value && !(value == rhs.value))
- return false;
- return true;
- }
- bool operator != (const KeyValue &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const KeyValue & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(KeyValue &a, KeyValue &b);
-
-std::ostream& operator<<(std::ostream& out, const KeyValue& obj);
-
-
-class SortingColumn : public virtual ::apache::thrift::TBase {
- public:
-
- SortingColumn(const SortingColumn&);
- SortingColumn& operator=(const SortingColumn&);
- SortingColumn() : column_idx(0), descending(0), nulls_first(0) {
- }
-
- virtual ~SortingColumn() noexcept;
- int32_t column_idx;
- bool descending;
- bool nulls_first;
-
- void __set_column_idx(const int32_t val);
-
- void __set_descending(const bool val);
-
- void __set_nulls_first(const bool val);
-
- bool operator == (const SortingColumn & rhs) const
- {
- if (!(column_idx == rhs.column_idx))
- return false;
- if (!(descending == rhs.descending))
- return false;
- if (!(nulls_first == rhs.nulls_first))
- return false;
- return true;
- }
- bool operator != (const SortingColumn &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const SortingColumn & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(SortingColumn &a, SortingColumn &b);
-
-std::ostream& operator<<(std::ostream& out, const SortingColumn& obj);
-
-
-class PageEncodingStats : public virtual ::apache::thrift::TBase {
- public:
-
- PageEncodingStats(const PageEncodingStats&);
- PageEncodingStats& operator=(const PageEncodingStats&);
- PageEncodingStats() : page_type((PageType::type)0), encoding((Encoding::type)0), count(0) {
- }
-
- virtual ~PageEncodingStats() noexcept;
- PageType::type page_type;
- Encoding::type encoding;
- int32_t count;
-
- void __set_page_type(const PageType::type val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_count(const int32_t val);
-
- bool operator == (const PageEncodingStats & rhs) const
- {
- if (!(page_type == rhs.page_type))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (!(count == rhs.count))
- return false;
- return true;
- }
- bool operator != (const PageEncodingStats &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const PageEncodingStats & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(PageEncodingStats &a, PageEncodingStats &b);
-
-std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj);
-
-typedef struct _ColumnMetaData__isset {
- _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false) {}
- bool key_value_metadata :1;
- bool index_page_offset :1;
- bool dictionary_page_offset :1;
- bool statistics :1;
- bool encoding_stats :1;
- bool bloom_filter_offset :1;
-} _ColumnMetaData__isset;
-
-class ColumnMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnMetaData(const ColumnMetaData&);
- ColumnMetaData& operator=(const ColumnMetaData&);
- ColumnMetaData() : type((Type::type)0), codec((CompressionCodec::type)0), num_values(0), total_uncompressed_size(0), total_compressed_size(0), data_page_offset(0), index_page_offset(0), dictionary_page_offset(0), bloom_filter_offset(0) {
- }
-
- virtual ~ColumnMetaData() noexcept;
- Type::type type;
- std::vector<Encoding::type> encodings;
- std::vector<std::string> path_in_schema;
- CompressionCodec::type codec;
- int64_t num_values;
- int64_t total_uncompressed_size;
- int64_t total_compressed_size;
- std::vector<KeyValue> key_value_metadata;
- int64_t data_page_offset;
- int64_t index_page_offset;
- int64_t dictionary_page_offset;
- Statistics statistics;
- std::vector<PageEncodingStats> encoding_stats;
- int64_t bloom_filter_offset;
-
- _ColumnMetaData__isset __isset;
-
- void __set_type(const Type::type val);
-
- void __set_encodings(const std::vector<Encoding::type> & val);
-
- void __set_path_in_schema(const std::vector<std::string> & val);
-
- void __set_codec(const CompressionCodec::type val);
-
- void __set_num_values(const int64_t val);
-
- void __set_total_uncompressed_size(const int64_t val);
-
- void __set_total_compressed_size(const int64_t val);
-
- void __set_key_value_metadata(const std::vector<KeyValue> & val);
-
- void __set_data_page_offset(const int64_t val);
-
- void __set_index_page_offset(const int64_t val);
-
- void __set_dictionary_page_offset(const int64_t val);
-
- void __set_statistics(const Statistics& val);
-
- void __set_encoding_stats(const std::vector<PageEncodingStats> & val);
-
- void __set_bloom_filter_offset(const int64_t val);
-
- bool operator == (const ColumnMetaData & rhs) const
- {
- if (!(type == rhs.type))
- return false;
- if (!(encodings == rhs.encodings))
- return false;
- if (!(path_in_schema == rhs.path_in_schema))
- return false;
- if (!(codec == rhs.codec))
- return false;
- if (!(num_values == rhs.num_values))
- return false;
- if (!(total_uncompressed_size == rhs.total_uncompressed_size))
- return false;
- if (!(total_compressed_size == rhs.total_compressed_size))
- return false;
- if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
- return false;
- else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
- return false;
- if (!(data_page_offset == rhs.data_page_offset))
- return false;
- if (__isset.index_page_offset != rhs.__isset.index_page_offset)
- return false;
- else if (__isset.index_page_offset && !(index_page_offset == rhs.index_page_offset))
- return false;
- if (__isset.dictionary_page_offset != rhs.__isset.dictionary_page_offset)
- return false;
- else if (__isset.dictionary_page_offset && !(dictionary_page_offset == rhs.dictionary_page_offset))
- return false;
- if (__isset.statistics != rhs.__isset.statistics)
- return false;
- else if (__isset.statistics && !(statistics == rhs.statistics))
- return false;
- if (__isset.encoding_stats != rhs.__isset.encoding_stats)
- return false;
- else if (__isset.encoding_stats && !(encoding_stats == rhs.encoding_stats))
- return false;
- if (__isset.bloom_filter_offset != rhs.__isset.bloom_filter_offset)
- return false;
- else if (__isset.bloom_filter_offset && !(bloom_filter_offset == rhs.bloom_filter_offset))
- return false;
- return true;
- }
- bool operator != (const ColumnMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnMetaData &a, ColumnMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj);
-
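-// Editor's sketch (hypothetical helper): when a dictionary page is present
-// its offset normally precedes data_page_offset, so the chunk's pages start
-// at the smaller of the two and span total_compressed_size bytes:
-//
-//   inline int64_t ChunkStartOffset(const ColumnMetaData& md) {
-//     return md.__isset.dictionary_page_offset
-//         ? std::min(md.dictionary_page_offset, md.data_page_offset)
-//         : md.data_page_offset;
-//   }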
-
-class EncryptionWithFooterKey : public virtual ::apache::thrift::TBase {
- public:
-
- EncryptionWithFooterKey(const EncryptionWithFooterKey&);
- EncryptionWithFooterKey& operator=(const EncryptionWithFooterKey&);
- EncryptionWithFooterKey() {
- }
-
- virtual ~EncryptionWithFooterKey() noexcept;
-
- bool operator == (const EncryptionWithFooterKey & /* rhs */) const
- {
- return true;
- }
- bool operator != (const EncryptionWithFooterKey &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EncryptionWithFooterKey & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b);
-
-std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj);
-
-typedef struct _EncryptionWithColumnKey__isset {
- _EncryptionWithColumnKey__isset() : key_metadata(false) {}
- bool key_metadata :1;
-} _EncryptionWithColumnKey__isset;
-
-class EncryptionWithColumnKey : public virtual ::apache::thrift::TBase {
- public:
-
- EncryptionWithColumnKey(const EncryptionWithColumnKey&);
- EncryptionWithColumnKey& operator=(const EncryptionWithColumnKey&);
- EncryptionWithColumnKey() : key_metadata() {
- }
-
- virtual ~EncryptionWithColumnKey() noexcept;
- std::vector<std::string> path_in_schema;
- std::string key_metadata;
-
- _EncryptionWithColumnKey__isset __isset;
-
- void __set_path_in_schema(const std::vector<std::string> & val);
-
- void __set_key_metadata(const std::string& val);
-
- bool operator == (const EncryptionWithColumnKey & rhs) const
- {
- if (!(path_in_schema == rhs.path_in_schema))
- return false;
- if (__isset.key_metadata != rhs.__isset.key_metadata)
- return false;
- else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
- return false;
- return true;
- }
- bool operator != (const EncryptionWithColumnKey &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EncryptionWithColumnKey & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b);
-
-std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj);
-
-typedef struct _ColumnCryptoMetaData__isset {
- _ColumnCryptoMetaData__isset() : ENCRYPTION_WITH_FOOTER_KEY(false), ENCRYPTION_WITH_COLUMN_KEY(false) {}
- bool ENCRYPTION_WITH_FOOTER_KEY :1;
- bool ENCRYPTION_WITH_COLUMN_KEY :1;
-} _ColumnCryptoMetaData__isset;
-
-class ColumnCryptoMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnCryptoMetaData(const ColumnCryptoMetaData&);
- ColumnCryptoMetaData& operator=(const ColumnCryptoMetaData&);
- ColumnCryptoMetaData() {
- }
-
- virtual ~ColumnCryptoMetaData() noexcept;
- EncryptionWithFooterKey ENCRYPTION_WITH_FOOTER_KEY;
- EncryptionWithColumnKey ENCRYPTION_WITH_COLUMN_KEY;
-
- _ColumnCryptoMetaData__isset __isset;
-
- void __set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val);
-
- void __set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val);
-
- bool operator == (const ColumnCryptoMetaData & rhs) const
- {
- if (__isset.ENCRYPTION_WITH_FOOTER_KEY != rhs.__isset.ENCRYPTION_WITH_FOOTER_KEY)
- return false;
- else if (__isset.ENCRYPTION_WITH_FOOTER_KEY && !(ENCRYPTION_WITH_FOOTER_KEY == rhs.ENCRYPTION_WITH_FOOTER_KEY))
- return false;
- if (__isset.ENCRYPTION_WITH_COLUMN_KEY != rhs.__isset.ENCRYPTION_WITH_COLUMN_KEY)
- return false;
- else if (__isset.ENCRYPTION_WITH_COLUMN_KEY && !(ENCRYPTION_WITH_COLUMN_KEY == rhs.ENCRYPTION_WITH_COLUMN_KEY))
- return false;
- return true;
- }
- bool operator != (const ColumnCryptoMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnCryptoMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj);
-
-typedef struct _ColumnChunk__isset {
- _ColumnChunk__isset() : file_path(false), meta_data(false), offset_index_offset(false), offset_index_length(false), column_index_offset(false), column_index_length(false), crypto_metadata(false), encrypted_column_metadata(false) {}
- bool file_path :1;
- bool meta_data :1;
- bool offset_index_offset :1;
- bool offset_index_length :1;
- bool column_index_offset :1;
- bool column_index_length :1;
- bool crypto_metadata :1;
- bool encrypted_column_metadata :1;
-} _ColumnChunk__isset;
-
-class ColumnChunk : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnChunk(const ColumnChunk&);
- ColumnChunk& operator=(const ColumnChunk&);
- ColumnChunk() : file_path(), file_offset(0), offset_index_offset(0), offset_index_length(0), column_index_offset(0), column_index_length(0), encrypted_column_metadata() {
- }
-
- virtual ~ColumnChunk() noexcept;
- std::string file_path;
- int64_t file_offset;
- ColumnMetaData meta_data;
- int64_t offset_index_offset;
- int32_t offset_index_length;
- int64_t column_index_offset;
- int32_t column_index_length;
- ColumnCryptoMetaData crypto_metadata;
- std::string encrypted_column_metadata;
-
- _ColumnChunk__isset __isset;
-
- void __set_file_path(const std::string& val);
-
- void __set_file_offset(const int64_t val);
-
- void __set_meta_data(const ColumnMetaData& val);
-
- void __set_offset_index_offset(const int64_t val);
-
- void __set_offset_index_length(const int32_t val);
-
- void __set_column_index_offset(const int64_t val);
-
- void __set_column_index_length(const int32_t val);
-
- void __set_crypto_metadata(const ColumnCryptoMetaData& val);
-
- void __set_encrypted_column_metadata(const std::string& val);
-
- bool operator == (const ColumnChunk & rhs) const
- {
- if (__isset.file_path != rhs.__isset.file_path)
- return false;
- else if (__isset.file_path && !(file_path == rhs.file_path))
- return false;
- if (!(file_offset == rhs.file_offset))
- return false;
- if (__isset.meta_data != rhs.__isset.meta_data)
- return false;
- else if (__isset.meta_data && !(meta_data == rhs.meta_data))
- return false;
- if (__isset.offset_index_offset != rhs.__isset.offset_index_offset)
- return false;
- else if (__isset.offset_index_offset && !(offset_index_offset == rhs.offset_index_offset))
- return false;
- if (__isset.offset_index_length != rhs.__isset.offset_index_length)
- return false;
- else if (__isset.offset_index_length && !(offset_index_length == rhs.offset_index_length))
- return false;
- if (__isset.column_index_offset != rhs.__isset.column_index_offset)
- return false;
- else if (__isset.column_index_offset && !(column_index_offset == rhs.column_index_offset))
- return false;
- if (__isset.column_index_length != rhs.__isset.column_index_length)
- return false;
- else if (__isset.column_index_length && !(column_index_length == rhs.column_index_length))
- return false;
- if (__isset.crypto_metadata != rhs.__isset.crypto_metadata)
- return false;
- else if (__isset.crypto_metadata && !(crypto_metadata == rhs.crypto_metadata))
- return false;
- if (__isset.encrypted_column_metadata != rhs.__isset.encrypted_column_metadata)
- return false;
- else if (__isset.encrypted_column_metadata && !(encrypted_column_metadata == rhs.encrypted_column_metadata))
- return false;
- return true;
- }
- bool operator != (const ColumnChunk &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnChunk & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnChunk &a, ColumnChunk &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj);
-
-typedef struct _RowGroup__isset {
- _RowGroup__isset() : sorting_columns(false), file_offset(false), total_compressed_size(false), ordinal(false) {}
- bool sorting_columns :1;
- bool file_offset :1;
- bool total_compressed_size :1;
- bool ordinal :1;
-} _RowGroup__isset;
-
-class RowGroup : public virtual ::apache::thrift::TBase {
- public:
-
- RowGroup(const RowGroup&);
- RowGroup& operator=(const RowGroup&);
- RowGroup() : total_byte_size(0), num_rows(0), file_offset(0), total_compressed_size(0), ordinal(0) {
- }
-
- virtual ~RowGroup() noexcept;
- std::vector<ColumnChunk> columns;
- int64_t total_byte_size;
- int64_t num_rows;
- std::vector<SortingColumn> sorting_columns;
- int64_t file_offset;
- int64_t total_compressed_size;
- int16_t ordinal;
-
- _RowGroup__isset __isset;
-
- void __set_columns(const std::vector<ColumnChunk> & val);
-
- void __set_total_byte_size(const int64_t val);
-
- void __set_num_rows(const int64_t val);
-
- void __set_sorting_columns(const std::vector<SortingColumn> & val);
-
- void __set_file_offset(const int64_t val);
-
- void __set_total_compressed_size(const int64_t val);
-
- void __set_ordinal(const int16_t val);
-
- bool operator == (const RowGroup & rhs) const
- {
- if (!(columns == rhs.columns))
- return false;
- if (!(total_byte_size == rhs.total_byte_size))
- return false;
- if (!(num_rows == rhs.num_rows))
- return false;
- if (__isset.sorting_columns != rhs.__isset.sorting_columns)
- return false;
- else if (__isset.sorting_columns && !(sorting_columns == rhs.sorting_columns))
- return false;
- if (__isset.file_offset != rhs.__isset.file_offset)
- return false;
- else if (__isset.file_offset && !(file_offset == rhs.file_offset))
- return false;
- if (__isset.total_compressed_size != rhs.__isset.total_compressed_size)
- return false;
- else if (__isset.total_compressed_size && !(total_compressed_size == rhs.total_compressed_size))
- return false;
- if (__isset.ordinal != rhs.__isset.ordinal)
- return false;
- else if (__isset.ordinal && !(ordinal == rhs.ordinal))
- return false;
- return true;
- }
- bool operator != (const RowGroup &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const RowGroup & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(RowGroup &a, RowGroup &b);
-
-std::ostream& operator<<(std::ostream& out, const RowGroup& obj);
-
-
-class TypeDefinedOrder : public virtual ::apache::thrift::TBase {
- public:
-
- TypeDefinedOrder(const TypeDefinedOrder&);
- TypeDefinedOrder& operator=(const TypeDefinedOrder&);
- TypeDefinedOrder() {
- }
-
- virtual ~TypeDefinedOrder() noexcept;
-
- bool operator == (const TypeDefinedOrder & /* rhs */) const
- {
- return true;
- }
- bool operator != (const TypeDefinedOrder &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TypeDefinedOrder & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TypeDefinedOrder &a, TypeDefinedOrder &b);
-
-std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj);
-
-typedef struct _ColumnOrder__isset {
- _ColumnOrder__isset() : TYPE_ORDER(false) {}
- bool TYPE_ORDER :1;
-} _ColumnOrder__isset;
-
-class ColumnOrder : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnOrder(const ColumnOrder&);
- ColumnOrder& operator=(const ColumnOrder&);
- ColumnOrder() {
- }
-
- virtual ~ColumnOrder() noexcept;
- TypeDefinedOrder TYPE_ORDER;
-
- _ColumnOrder__isset __isset;
-
- void __set_TYPE_ORDER(const TypeDefinedOrder& val);
-
- bool operator == (const ColumnOrder & rhs) const
- {
- if (__isset.TYPE_ORDER != rhs.__isset.TYPE_ORDER)
- return false;
- else if (__isset.TYPE_ORDER && !(TYPE_ORDER == rhs.TYPE_ORDER))
- return false;
- return true;
- }
- bool operator != (const ColumnOrder &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnOrder & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnOrder &a, ColumnOrder &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj);
-
-
-class PageLocation : public virtual ::apache::thrift::TBase {
- public:
-
- PageLocation(const PageLocation&);
- PageLocation& operator=(const PageLocation&);
- PageLocation() : offset(0), compressed_page_size(0), first_row_index(0) {
- }
-
- virtual ~PageLocation() noexcept;
- int64_t offset;
- int32_t compressed_page_size;
- int64_t first_row_index;
-
- void __set_offset(const int64_t val);
-
- void __set_compressed_page_size(const int32_t val);
-
- void __set_first_row_index(const int64_t val);
-
- bool operator == (const PageLocation & rhs) const
- {
- if (!(offset == rhs.offset))
- return false;
- if (!(compressed_page_size == rhs.compressed_page_size))
- return false;
- if (!(first_row_index == rhs.first_row_index))
- return false;
- return true;
- }
- bool operator != (const PageLocation &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const PageLocation & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(PageLocation &a, PageLocation &b);
-
-std::ostream& operator<<(std::ostream& out, const PageLocation& obj);
-
-
-class OffsetIndex : public virtual ::apache::thrift::TBase {
- public:
-
- OffsetIndex(const OffsetIndex&);
- OffsetIndex& operator=(const OffsetIndex&);
- OffsetIndex() {
- }
-
- virtual ~OffsetIndex() noexcept;
- std::vector<PageLocation> page_locations;
-
- void __set_page_locations(const std::vector<PageLocation> & val);
-
- bool operator == (const OffsetIndex & rhs) const
- {
- if (!(page_locations == rhs.page_locations))
- return false;
- return true;
- }
- bool operator != (const OffsetIndex &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const OffsetIndex & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(OffsetIndex &a, OffsetIndex &b);
-
-std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj);
-
-typedef struct _ColumnIndex__isset {
- _ColumnIndex__isset() : null_counts(false) {}
- bool null_counts :1;
-} _ColumnIndex__isset;
-
-class ColumnIndex : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnIndex(const ColumnIndex&);
- ColumnIndex& operator=(const ColumnIndex&);
- ColumnIndex() : boundary_order((BoundaryOrder::type)0) {
- }
-
- virtual ~ColumnIndex() noexcept;
- std::vector<bool> null_pages;
- std::vector<std::string> min_values;
- std::vector<std::string> max_values;
- BoundaryOrder::type boundary_order;
- std::vector<int64_t> null_counts;
-
- _ColumnIndex__isset __isset;
-
- void __set_null_pages(const std::vector<bool> & val);
-
- void __set_min_values(const std::vector<std::string> & val);
-
- void __set_max_values(const std::vector<std::string> & val);
-
- void __set_boundary_order(const BoundaryOrder::type val);
-
- void __set_null_counts(const std::vector<int64_t> & val);
-
- bool operator == (const ColumnIndex & rhs) const
- {
- if (!(null_pages == rhs.null_pages))
- return false;
- if (!(min_values == rhs.min_values))
- return false;
- if (!(max_values == rhs.max_values))
- return false;
- if (!(boundary_order == rhs.boundary_order))
- return false;
- if (__isset.null_counts != rhs.__isset.null_counts)
- return false;
- else if (__isset.null_counts && !(null_counts == rhs.null_counts))
- return false;
- return true;
- }
- bool operator != (const ColumnIndex &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnIndex & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnIndex &a, ColumnIndex &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj);
-
-typedef struct _AesGcmV1__isset {
- _AesGcmV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
- bool aad_prefix :1;
- bool aad_file_unique :1;
- bool supply_aad_prefix :1;
-} _AesGcmV1__isset;
-
-class AesGcmV1 : public virtual ::apache::thrift::TBase {
- public:
-
- AesGcmV1(const AesGcmV1&);
- AesGcmV1& operator=(const AesGcmV1&);
- AesGcmV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
- }
-
- virtual ~AesGcmV1() noexcept;
- std::string aad_prefix;
- std::string aad_file_unique;
- bool supply_aad_prefix;
-
- _AesGcmV1__isset __isset;
-
- void __set_aad_prefix(const std::string& val);
-
- void __set_aad_file_unique(const std::string& val);
-
- void __set_supply_aad_prefix(const bool val);
-
- bool operator == (const AesGcmV1 & rhs) const
- {
- if (__isset.aad_prefix != rhs.__isset.aad_prefix)
- return false;
- else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
- return false;
- if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
- return false;
- else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
- return false;
- if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
- return false;
- else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
- return false;
- return true;
- }
- bool operator != (const AesGcmV1 &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const AesGcmV1 & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(AesGcmV1 &a, AesGcmV1 &b);
-
-std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj);
-
-typedef struct _AesGcmCtrV1__isset {
- _AesGcmCtrV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
- bool aad_prefix :1;
- bool aad_file_unique :1;
- bool supply_aad_prefix :1;
-} _AesGcmCtrV1__isset;
-
-class AesGcmCtrV1 : public virtual ::apache::thrift::TBase {
- public:
-
- AesGcmCtrV1(const AesGcmCtrV1&);
- AesGcmCtrV1& operator=(const AesGcmCtrV1&);
- AesGcmCtrV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
- }
-
- virtual ~AesGcmCtrV1() noexcept;
- std::string aad_prefix;
- std::string aad_file_unique;
- bool supply_aad_prefix;
-
- _AesGcmCtrV1__isset __isset;
-
- void __set_aad_prefix(const std::string& val);
-
- void __set_aad_file_unique(const std::string& val);
-
- void __set_supply_aad_prefix(const bool val);
-
- bool operator == (const AesGcmCtrV1 & rhs) const
- {
- if (__isset.aad_prefix != rhs.__isset.aad_prefix)
- return false;
- else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
- return false;
- if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
- return false;
- else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
- return false;
- if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
- return false;
- else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
- return false;
- return true;
- }
- bool operator != (const AesGcmCtrV1 &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const AesGcmCtrV1 & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b);
-
-std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj);
-
-typedef struct _EncryptionAlgorithm__isset {
- _EncryptionAlgorithm__isset() : AES_GCM_V1(false), AES_GCM_CTR_V1(false) {}
- bool AES_GCM_V1 :1;
- bool AES_GCM_CTR_V1 :1;
-} _EncryptionAlgorithm__isset;
-
-class EncryptionAlgorithm : public virtual ::apache::thrift::TBase {
- public:
-
- EncryptionAlgorithm(const EncryptionAlgorithm&);
- EncryptionAlgorithm& operator=(const EncryptionAlgorithm&);
- EncryptionAlgorithm() {
- }
-
- virtual ~EncryptionAlgorithm() noexcept;
- AesGcmV1 AES_GCM_V1;
- AesGcmCtrV1 AES_GCM_CTR_V1;
-
- _EncryptionAlgorithm__isset __isset;
-
- void __set_AES_GCM_V1(const AesGcmV1& val);
-
- void __set_AES_GCM_CTR_V1(const AesGcmCtrV1& val);
-
- bool operator == (const EncryptionAlgorithm & rhs) const
- {
- if (__isset.AES_GCM_V1 != rhs.__isset.AES_GCM_V1)
- return false;
- else if (__isset.AES_GCM_V1 && !(AES_GCM_V1 == rhs.AES_GCM_V1))
- return false;
- if (__isset.AES_GCM_CTR_V1 != rhs.__isset.AES_GCM_CTR_V1)
- return false;
- else if (__isset.AES_GCM_CTR_V1 && !(AES_GCM_CTR_V1 == rhs.AES_GCM_CTR_V1))
- return false;
- return true;
- }
- bool operator != (const EncryptionAlgorithm &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EncryptionAlgorithm & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b);
-
-std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj);
-
-typedef struct _FileMetaData__isset {
- _FileMetaData__isset() : key_value_metadata(false), created_by(false), column_orders(false), encryption_algorithm(false), footer_signing_key_metadata(false) {}
- bool key_value_metadata :1;
- bool created_by :1;
- bool column_orders :1;
- bool encryption_algorithm :1;
- bool footer_signing_key_metadata :1;
-} _FileMetaData__isset;
-
-class FileMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- FileMetaData(const FileMetaData&);
- FileMetaData& operator=(const FileMetaData&);
- FileMetaData() : version(0), num_rows(0), created_by(), footer_signing_key_metadata() {
- }
-
- virtual ~FileMetaData() noexcept;
- int32_t version;
- std::vector<SchemaElement> schema;
- int64_t num_rows;
- std::vector<RowGroup> row_groups;
- std::vector<KeyValue> key_value_metadata;
- std::string created_by;
- std::vector<ColumnOrder> column_orders;
- EncryptionAlgorithm encryption_algorithm;
- std::string footer_signing_key_metadata;
-
- _FileMetaData__isset __isset;
-
- void __set_version(const int32_t val);
-
- void __set_schema(const std::vector<SchemaElement> & val);
-
- void __set_num_rows(const int64_t val);
-
- void __set_row_groups(const std::vector<RowGroup> & val);
-
- void __set_key_value_metadata(const std::vector<KeyValue> & val);
-
- void __set_created_by(const std::string& val);
-
- void __set_column_orders(const std::vector<ColumnOrder> & val);
-
- void __set_encryption_algorithm(const EncryptionAlgorithm& val);
-
- void __set_footer_signing_key_metadata(const std::string& val);
-
- bool operator == (const FileMetaData & rhs) const
- {
- if (!(version == rhs.version))
- return false;
- if (!(schema == rhs.schema))
- return false;
- if (!(num_rows == rhs.num_rows))
- return false;
- if (!(row_groups == rhs.row_groups))
- return false;
- if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
- return false;
- else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
- return false;
- if (__isset.created_by != rhs.__isset.created_by)
- return false;
- else if (__isset.created_by && !(created_by == rhs.created_by))
- return false;
- if (__isset.column_orders != rhs.__isset.column_orders)
- return false;
- else if (__isset.column_orders && !(column_orders == rhs.column_orders))
- return false;
- if (__isset.encryption_algorithm != rhs.__isset.encryption_algorithm)
- return false;
- else if (__isset.encryption_algorithm && !(encryption_algorithm == rhs.encryption_algorithm))
- return false;
- if (__isset.footer_signing_key_metadata != rhs.__isset.footer_signing_key_metadata)
- return false;
- else if (__isset.footer_signing_key_metadata && !(footer_signing_key_metadata == rhs.footer_signing_key_metadata))
- return false;
- return true;
- }
- bool operator != (const FileMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const FileMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(FileMetaData &a, FileMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const FileMetaData& obj);
-
-typedef struct _FileCryptoMetaData__isset {
- _FileCryptoMetaData__isset() : key_metadata(false) {}
- bool key_metadata :1;
-} _FileCryptoMetaData__isset;
-
-class FileCryptoMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- FileCryptoMetaData(const FileCryptoMetaData&);
- FileCryptoMetaData& operator=(const FileCryptoMetaData&);
- FileCryptoMetaData() : key_metadata() {
- }
-
- virtual ~FileCryptoMetaData() noexcept;
- EncryptionAlgorithm encryption_algorithm;
- std::string key_metadata;
-
- _FileCryptoMetaData__isset __isset;
-
- void __set_encryption_algorithm(const EncryptionAlgorithm& val);
-
- void __set_key_metadata(const std::string& val);
-
- bool operator == (const FileCryptoMetaData & rhs) const
- {
- if (!(encryption_algorithm == rhs.encryption_algorithm))
- return false;
- if (__isset.key_metadata != rhs.__isset.key_metadata)
- return false;
- else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
- return false;
- return true;
- }
- bool operator != (const FileCryptoMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const FileCryptoMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(FileCryptoMetaData &a, FileCryptoMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
-
-}} // namespace
-
-#endif
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#ifndef parquet_TYPES_H
+#define parquet_TYPES_H
+
+#include <iosfwd>
+
+#include <thrift/Thrift.h>
+#include <thrift/TApplicationException.h>
+#include <thrift/TBase.h>
+#include <thrift/protocol/TProtocol.h>
+#include <thrift/transport/TTransport.h>
+
+#include <functional>
+#include <memory>
+
+#include "parquet/windows_compatibility.h"
+
+namespace parquet { namespace format {
+
+struct Type {
+ enum type {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7
+ };
+};
+
+extern const std::map<int, const char*> _Type_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const Type::type& val);
+
+std::string to_string(const Type::type& val);
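
Aside (not part of the generated header): each enum ships with a _*_VALUES_TO_NAMES map for readable diagnostics. A minimal sketch of using it, assuming parquet_types.h is on the include path:

    #include <iostream>
    #include "parquet_types.h"  // the header this diff regenerates (assumed path)

    // Print a readable name for a physical type, with a fallback for unknown codes.
    void print_type_name(parquet::format::Type::type t) {
      const auto& names = parquet::format::_Type_VALUES_TO_NAMES;
      auto it = names.find(static_cast<int>(t));
      if (it != names.end()) {
        std::cout << it->second << "\n";               // e.g. "INT64"
      } else {
        std::cout << "UNKNOWN(" << static_cast<int>(t) << ")\n";
      }
    }

The same pattern applies to ConvertedType, Encoding, CompressionCodec, PageType and BoundaryOrder below.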
+
+struct ConvertedType {
+ enum type {
+ UTF8 = 0,
+ MAP = 1,
+ MAP_KEY_VALUE = 2,
+ LIST = 3,
+ ENUM = 4,
+ DECIMAL = 5,
+ DATE = 6,
+ TIME_MILLIS = 7,
+ TIME_MICROS = 8,
+ TIMESTAMP_MILLIS = 9,
+ TIMESTAMP_MICROS = 10,
+ UINT_8 = 11,
+ UINT_16 = 12,
+ UINT_32 = 13,
+ UINT_64 = 14,
+ INT_8 = 15,
+ INT_16 = 16,
+ INT_32 = 17,
+ INT_64 = 18,
+ JSON = 19,
+ BSON = 20,
+ INTERVAL = 21
+ };
+};
+
+extern const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val);
+
+std::string to_string(const ConvertedType::type& val);
+
+struct FieldRepetitionType {
+ enum type {
+ REQUIRED = 0,
+ OPTIONAL = 1,
+ REPEATED = 2
+ };
+};
+
+extern const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val);
+
+std::string to_string(const FieldRepetitionType::type& val);
+
+struct Encoding {
+ enum type {
+ PLAIN = 0,
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8,
+ BYTE_STREAM_SPLIT = 9
+ };
+};
+
+extern const std::map<int, const char*> _Encoding_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const Encoding::type& val);
+
+std::string to_string(const Encoding::type& val);
+
+struct CompressionCodec {
+ enum type {
+ UNCOMPRESSED = 0,
+ SNAPPY = 1,
+ GZIP = 2,
+ LZO = 3,
+ BROTLI = 4,
+ LZ4 = 5,
+ ZSTD = 6,
+ LZ4_RAW = 7
+ };
+};
+
+extern const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val);
+
+std::string to_string(const CompressionCodec::type& val);
+
+struct PageType {
+ enum type {
+ DATA_PAGE = 0,
+ INDEX_PAGE = 1,
+ DICTIONARY_PAGE = 2,
+ DATA_PAGE_V2 = 3
+ };
+};
+
+extern const std::map<int, const char*> _PageType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const PageType::type& val);
+
+std::string to_string(const PageType::type& val);
+
+struct BoundaryOrder {
+ enum type {
+ UNORDERED = 0,
+ ASCENDING = 1,
+ DESCENDING = 2
+ };
+};
+
+extern const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val);
+
+std::string to_string(const BoundaryOrder::type& val);
+
+class Statistics;
+
+class StringType;
+
+class UUIDType;
+
+class MapType;
+
+class ListType;
+
+class EnumType;
+
+class DateType;
+
+class NullType;
+
+class DecimalType;
+
+class MilliSeconds;
+
+class MicroSeconds;
+
+class NanoSeconds;
+
+class TimeUnit;
+
+class TimestampType;
+
+class TimeType;
+
+class IntType;
+
+class JsonType;
+
+class BsonType;
+
+class LogicalType;
+
+class SchemaElement;
+
+class DataPageHeader;
+
+class IndexPageHeader;
+
+class DictionaryPageHeader;
+
+class DataPageHeaderV2;
+
+class SplitBlockAlgorithm;
+
+class BloomFilterAlgorithm;
+
+class XxHash;
+
+class BloomFilterHash;
+
+class Uncompressed;
+
+class BloomFilterCompression;
+
+class BloomFilterHeader;
+
+class PageHeader;
+
+class KeyValue;
+
+class SortingColumn;
+
+class PageEncodingStats;
+
+class ColumnMetaData;
+
+class EncryptionWithFooterKey;
+
+class EncryptionWithColumnKey;
+
+class ColumnCryptoMetaData;
+
+class ColumnChunk;
+
+class RowGroup;
+
+class TypeDefinedOrder;
+
+class ColumnOrder;
+
+class PageLocation;
+
+class OffsetIndex;
+
+class ColumnIndex;
+
+class AesGcmV1;
+
+class AesGcmCtrV1;
+
+class EncryptionAlgorithm;
+
+class FileMetaData;
+
+class FileCryptoMetaData;
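
Aside (not part of the generated header): every class declared above derives from ::apache::thrift::TBase, so each can serialize itself through any TProtocol; Parquet footers use the compact protocol. A minimal decoding sketch, assuming Thrift 0.13's TMemoryBuffer and TCompactProtocol and a raw footer buffer already in hand:

    #include <cstdint>
    #include <memory>
    #include <thrift/protocol/TCompactProtocol.h>
    #include <thrift/transport/TBufferTransports.h>
    #include "parquet_types.h"

    parquet::format::FileMetaData decode_footer(const uint8_t* data, uint32_t len) {
      using apache::thrift::protocol::TCompactProtocol;
      using apache::thrift::transport::TMemoryBuffer;

      // Wrap the footer bytes; with this constructor TMemoryBuffer does not take ownership.
      auto buf = std::make_shared<TMemoryBuffer>(const_cast<uint8_t*>(data), len);
      TCompactProtocol proto(buf);

      parquet::format::FileMetaData md;
      md.read(&proto);  // throws apache::thrift::TException on malformed input
      return md;
    }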
+
+typedef struct _Statistics__isset {
+ _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false) {}
+ bool max :1;
+ bool min :1;
+ bool null_count :1;
+ bool distinct_count :1;
+ bool max_value :1;
+ bool min_value :1;
+} _Statistics__isset;
+
+class Statistics : public virtual ::apache::thrift::TBase {
+ public:
+
+ Statistics(const Statistics&);
+ Statistics& operator=(const Statistics&);
+ Statistics() : max(), min(), null_count(0), distinct_count(0), max_value(), min_value() {
+ }
+
+ virtual ~Statistics() noexcept;
+ std::string max;
+ std::string min;
+ int64_t null_count;
+ int64_t distinct_count;
+ std::string max_value;
+ std::string min_value;
+
+ _Statistics__isset __isset;
+
+ void __set_max(const std::string& val);
+
+ void __set_min(const std::string& val);
+
+ void __set_null_count(const int64_t val);
+
+ void __set_distinct_count(const int64_t val);
+
+ void __set_max_value(const std::string& val);
+
+ void __set_min_value(const std::string& val);
+
+ bool operator == (const Statistics & rhs) const
+ {
+ if (__isset.max != rhs.__isset.max)
+ return false;
+ else if (__isset.max && !(max == rhs.max))
+ return false;
+ if (__isset.min != rhs.__isset.min)
+ return false;
+ else if (__isset.min && !(min == rhs.min))
+ return false;
+ if (__isset.null_count != rhs.__isset.null_count)
+ return false;
+ else if (__isset.null_count && !(null_count == rhs.null_count))
+ return false;
+ if (__isset.distinct_count != rhs.__isset.distinct_count)
+ return false;
+ else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count))
+ return false;
+ if (__isset.max_value != rhs.__isset.max_value)
+ return false;
+ else if (__isset.max_value && !(max_value == rhs.max_value))
+ return false;
+ if (__isset.min_value != rhs.__isset.min_value)
+ return false;
+ else if (__isset.min_value && !(min_value == rhs.min_value))
+ return false;
+ return true;
+ }
+ bool operator != (const Statistics &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const Statistics & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(Statistics &a, Statistics &b);
+
+std::ostream& operator<<(std::ostream& out, const Statistics& obj);
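
Aside (not part of the generated header): for optional fields, operator== compares the __isset flags before the values, and only the __set_* setters flip those flags. A minimal sketch of the distinction:

    #include "parquet_types.h"

    void isset_demo() {
      parquet::format::Statistics a, b;
      a.__set_null_count(0);  // assigns the field and sets a.__isset.null_count
      b.null_count = 0;       // assigns the field only; b.__isset.null_count stays false
      bool equal = (a == b);  // false: presence is part of equality
      (void)equal;
    }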
+
+
+class StringType : public virtual ::apache::thrift::TBase {
+ public:
+
+ StringType(const StringType&);
+ StringType& operator=(const StringType&);
+ StringType() {
+ }
+
+ virtual ~StringType() noexcept;
+
+ bool operator == (const StringType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const StringType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const StringType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(StringType &a, StringType &b);
+
+std::ostream& operator<<(std::ostream& out, const StringType& obj);
+
+
+class UUIDType : public virtual ::apache::thrift::TBase {
+ public:
+
+ UUIDType(const UUIDType&);
+ UUIDType& operator=(const UUIDType&);
+ UUIDType() {
+ }
+
+ virtual ~UUIDType() noexcept;
+
+ bool operator == (const UUIDType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const UUIDType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const UUIDType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(UUIDType &a, UUIDType &b);
+
+std::ostream& operator<<(std::ostream& out, const UUIDType& obj);
+
+
+class MapType : public virtual ::apache::thrift::TBase {
+ public:
+
+ MapType(const MapType&);
+ MapType& operator=(const MapType&);
+ MapType() {
+ }
+
+ virtual ~MapType() noexcept;
+
+ bool operator == (const MapType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MapType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MapType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MapType &a, MapType &b);
+
+std::ostream& operator<<(std::ostream& out, const MapType& obj);
+
+
+class ListType : public virtual ::apache::thrift::TBase {
+ public:
+
+ ListType(const ListType&);
+ ListType& operator=(const ListType&);
+ ListType() {
+ }
+
+ virtual ~ListType() noexcept;
+
+ bool operator == (const ListType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const ListType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ListType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ListType &a, ListType &b);
+
+std::ostream& operator<<(std::ostream& out, const ListType& obj);
+
+
+class EnumType : public virtual ::apache::thrift::TBase {
+ public:
+
+ EnumType(const EnumType&);
+ EnumType& operator=(const EnumType&);
+ EnumType() {
+ }
+
+ virtual ~EnumType() noexcept;
+
+ bool operator == (const EnumType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const EnumType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EnumType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EnumType &a, EnumType &b);
+
+std::ostream& operator<<(std::ostream& out, const EnumType& obj);
+
+
+class DateType : public virtual ::apache::thrift::TBase {
+ public:
+
+ DateType(const DateType&);
+ DateType& operator=(const DateType&);
+ DateType() {
+ }
+
+ virtual ~DateType() noexcept;
+
+ bool operator == (const DateType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const DateType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DateType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DateType &a, DateType &b);
+
+std::ostream& operator<<(std::ostream& out, const DateType& obj);
+
+
+class NullType : public virtual ::apache::thrift::TBase {
+ public:
+
+ NullType(const NullType&);
+ NullType& operator=(const NullType&);
+ NullType() {
+ }
+
+ virtual ~NullType() noexcept;
+
+ bool operator == (const NullType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const NullType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const NullType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(NullType &a, NullType &b);
+
+std::ostream& operator<<(std::ostream& out, const NullType& obj);
+
+
+class DecimalType : public virtual ::apache::thrift::TBase {
+ public:
+
+ DecimalType(const DecimalType&);
+ DecimalType& operator=(const DecimalType&);
+ DecimalType() : scale(0), precision(0) {
+ }
+
+ virtual ~DecimalType() noexcept;
+ int32_t scale;
+ int32_t precision;
+
+ void __set_scale(const int32_t val);
+
+ void __set_precision(const int32_t val);
+
+ bool operator == (const DecimalType & rhs) const
+ {
+ if (!(scale == rhs.scale))
+ return false;
+ if (!(precision == rhs.precision))
+ return false;
+ return true;
+ }
+ bool operator != (const DecimalType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DecimalType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DecimalType &a, DecimalType &b);
+
+std::ostream& operator<<(std::ostream& out, const DecimalType& obj);
+
+
+class MilliSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ MilliSeconds(const MilliSeconds&);
+ MilliSeconds& operator=(const MilliSeconds&);
+ MilliSeconds() {
+ }
+
+ virtual ~MilliSeconds() noexcept;
+
+ bool operator == (const MilliSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MilliSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MilliSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MilliSeconds &a, MilliSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj);
+
+
+class MicroSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ MicroSeconds(const MicroSeconds&);
+ MicroSeconds& operator=(const MicroSeconds&);
+ MicroSeconds() {
+ }
+
+ virtual ~MicroSeconds() noexcept;
+
+ bool operator == (const MicroSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MicroSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MicroSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MicroSeconds &a, MicroSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj);
+
+
+class NanoSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ NanoSeconds(const NanoSeconds&);
+ NanoSeconds& operator=(const NanoSeconds&);
+ NanoSeconds() {
+ }
+
+ virtual ~NanoSeconds() noexcept;
+
+ bool operator == (const NanoSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const NanoSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const NanoSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(NanoSeconds &a, NanoSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj);
+
+typedef struct _TimeUnit__isset {
+ _TimeUnit__isset() : MILLIS(false), MICROS(false), NANOS(false) {}
+ bool MILLIS :1;
+ bool MICROS :1;
+ bool NANOS :1;
+} _TimeUnit__isset;
+
+class TimeUnit : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimeUnit(const TimeUnit&);
+ TimeUnit& operator=(const TimeUnit&);
+ TimeUnit() {
+ }
+
+ virtual ~TimeUnit() noexcept;
+ MilliSeconds MILLIS;
+ MicroSeconds MICROS;
+ NanoSeconds NANOS;
+
+ _TimeUnit__isset __isset;
+
+ void __set_MILLIS(const MilliSeconds& val);
+
+ void __set_MICROS(const MicroSeconds& val);
+
+ void __set_NANOS(const NanoSeconds& val);
+
+ bool operator == (const TimeUnit & rhs) const
+ {
+ if (__isset.MILLIS != rhs.__isset.MILLIS)
+ return false;
+ else if (__isset.MILLIS && !(MILLIS == rhs.MILLIS))
+ return false;
+ if (__isset.MICROS != rhs.__isset.MICROS)
+ return false;
+ else if (__isset.MICROS && !(MICROS == rhs.MICROS))
+ return false;
+ if (__isset.NANOS != rhs.__isset.NANOS)
+ return false;
+ else if (__isset.NANOS && !(NANOS == rhs.NANOS))
+ return false;
+ return true;
+ }
+ bool operator != (const TimeUnit &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimeUnit & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimeUnit &a, TimeUnit &b);
+
+std::ostream& operator<<(std::ostream& out, const TimeUnit& obj);
+
+
+class TimestampType : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimestampType(const TimestampType&);
+ TimestampType& operator=(const TimestampType&);
+ TimestampType() : isAdjustedToUTC(0) {
+ }
+
+ virtual ~TimestampType() noexcept;
+ bool isAdjustedToUTC;
+ TimeUnit unit;
+
+ void __set_isAdjustedToUTC(const bool val);
+
+ void __set_unit(const TimeUnit& val);
+
+ bool operator == (const TimestampType & rhs) const
+ {
+ if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
+ return false;
+ if (!(unit == rhs.unit))
+ return false;
+ return true;
+ }
+ bool operator != (const TimestampType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimestampType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimestampType &a, TimestampType &b);
+
+std::ostream& operator<<(std::ostream& out, const TimestampType& obj);
+
+
+class TimeType : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimeType(const TimeType&);
+ TimeType& operator=(const TimeType&);
+ TimeType() : isAdjustedToUTC(0) {
+ }
+
+ virtual ~TimeType() noexcept;
+ bool isAdjustedToUTC;
+ TimeUnit unit;
+
+ void __set_isAdjustedToUTC(const bool val);
+
+ void __set_unit(const TimeUnit& val);
+
+ bool operator == (const TimeType & rhs) const
+ {
+ if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
+ return false;
+ if (!(unit == rhs.unit))
+ return false;
+ return true;
+ }
+ bool operator != (const TimeType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimeType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimeType &a, TimeType &b);
+
+std::ostream& operator<<(std::ostream& out, const TimeType& obj);
+
+
+class IntType : public virtual ::apache::thrift::TBase {
+ public:
+
+ IntType(const IntType&);
+ IntType& operator=(const IntType&);
+ IntType() : bitWidth(0), isSigned(0) {
+ }
+
+ virtual ~IntType() noexcept;
+ int8_t bitWidth;
+ bool isSigned;
+
+ void __set_bitWidth(const int8_t val);
+
+ void __set_isSigned(const bool val);
+
+ bool operator == (const IntType & rhs) const
+ {
+ if (!(bitWidth == rhs.bitWidth))
+ return false;
+ if (!(isSigned == rhs.isSigned))
+ return false;
+ return true;
+ }
+ bool operator != (const IntType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const IntType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(IntType &a, IntType &b);
+
+std::ostream& operator<<(std::ostream& out, const IntType& obj);
+
+
+class JsonType : public virtual ::apache::thrift::TBase {
+ public:
+
+ JsonType(const JsonType&);
+ JsonType& operator=(const JsonType&);
+ JsonType() {
+ }
+
+ virtual ~JsonType() noexcept;
+
+ bool operator == (const JsonType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const JsonType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const JsonType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(JsonType &a, JsonType &b);
+
+std::ostream& operator<<(std::ostream& out, const JsonType& obj);
+
+
+class BsonType : public virtual ::apache::thrift::TBase {
+ public:
+
+ BsonType(const BsonType&);
+ BsonType& operator=(const BsonType&);
+ BsonType() {
+ }
+
+ virtual ~BsonType() noexcept;
+
+ bool operator == (const BsonType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const BsonType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BsonType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BsonType &a, BsonType &b);
+
+std::ostream& operator<<(std::ostream& out, const BsonType& obj);
+
+typedef struct _LogicalType__isset {
+ _LogicalType__isset() : STRING(false), MAP(false), LIST(false), ENUM(false), DECIMAL(false), DATE(false), TIME(false), TIMESTAMP(false), INTEGER(false), UNKNOWN(false), JSON(false), BSON(false), UUID(false) {}
+ bool STRING :1;
+ bool MAP :1;
+ bool LIST :1;
+ bool ENUM :1;
+ bool DECIMAL :1;
+ bool DATE :1;
+ bool TIME :1;
+ bool TIMESTAMP :1;
+ bool INTEGER :1;
+ bool UNKNOWN :1;
+ bool JSON :1;
+ bool BSON :1;
+ bool UUID :1;
+} _LogicalType__isset;
+
+class LogicalType : public virtual ::apache::thrift::TBase {
+ public:
+
+ LogicalType(const LogicalType&);
+ LogicalType& operator=(const LogicalType&);
+ LogicalType() {
+ }
+
+ virtual ~LogicalType() noexcept;
+ StringType STRING;
+ MapType MAP;
+ ListType LIST;
+ EnumType ENUM;
+ DecimalType DECIMAL;
+ DateType DATE;
+ TimeType TIME;
+ TimestampType TIMESTAMP;
+ IntType INTEGER;
+ NullType UNKNOWN;
+ JsonType JSON;
+ BsonType BSON;
+ UUIDType UUID;
+
+ _LogicalType__isset __isset;
+
+ void __set_STRING(const StringType& val);
+
+ void __set_MAP(const MapType& val);
+
+ void __set_LIST(const ListType& val);
+
+ void __set_ENUM(const EnumType& val);
+
+ void __set_DECIMAL(const DecimalType& val);
+
+ void __set_DATE(const DateType& val);
+
+ void __set_TIME(const TimeType& val);
+
+ void __set_TIMESTAMP(const TimestampType& val);
+
+ void __set_INTEGER(const IntType& val);
+
+ void __set_UNKNOWN(const NullType& val);
+
+ void __set_JSON(const JsonType& val);
+
+ void __set_BSON(const BsonType& val);
+
+ void __set_UUID(const UUIDType& val);
+
+ bool operator == (const LogicalType & rhs) const
+ {
+ if (__isset.STRING != rhs.__isset.STRING)
+ return false;
+ else if (__isset.STRING && !(STRING == rhs.STRING))
+ return false;
+ if (__isset.MAP != rhs.__isset.MAP)
+ return false;
+ else if (__isset.MAP && !(MAP == rhs.MAP))
+ return false;
+ if (__isset.LIST != rhs.__isset.LIST)
+ return false;
+ else if (__isset.LIST && !(LIST == rhs.LIST))
+ return false;
+ if (__isset.ENUM != rhs.__isset.ENUM)
+ return false;
+ else if (__isset.ENUM && !(ENUM == rhs.ENUM))
+ return false;
+ if (__isset.DECIMAL != rhs.__isset.DECIMAL)
+ return false;
+ else if (__isset.DECIMAL && !(DECIMAL == rhs.DECIMAL))
+ return false;
+ if (__isset.DATE != rhs.__isset.DATE)
+ return false;
+ else if (__isset.DATE && !(DATE == rhs.DATE))
+ return false;
+ if (__isset.TIME != rhs.__isset.TIME)
+ return false;
+ else if (__isset.TIME && !(TIME == rhs.TIME))
+ return false;
+ if (__isset.TIMESTAMP != rhs.__isset.TIMESTAMP)
+ return false;
+ else if (__isset.TIMESTAMP && !(TIMESTAMP == rhs.TIMESTAMP))
+ return false;
+ if (__isset.INTEGER != rhs.__isset.INTEGER)
+ return false;
+ else if (__isset.INTEGER && !(INTEGER == rhs.INTEGER))
+ return false;
+ if (__isset.UNKNOWN != rhs.__isset.UNKNOWN)
+ return false;
+ else if (__isset.UNKNOWN && !(UNKNOWN == rhs.UNKNOWN))
+ return false;
+ if (__isset.JSON != rhs.__isset.JSON)
+ return false;
+ else if (__isset.JSON && !(JSON == rhs.JSON))
+ return false;
+ if (__isset.BSON != rhs.__isset.BSON)
+ return false;
+ else if (__isset.BSON && !(BSON == rhs.BSON))
+ return false;
+ if (__isset.UUID != rhs.__isset.UUID)
+ return false;
+ else if (__isset.UUID && !(UUID == rhs.UUID))
+ return false;
+ return true;
+ }
+ bool operator != (const LogicalType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const LogicalType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(LogicalType &a, LogicalType &b);
+
+std::ostream& operator<<(std::ostream& out, const LogicalType& obj);
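
Aside (not part of the generated header): LogicalType is a Thrift union, emulated as one member per alternative plus an __isset flag apiece; exactly one flag should be set. A minimal sketch tagging a UTC millisecond timestamp:

    #include "parquet_types.h"

    parquet::format::LogicalType make_timestamp_millis() {
      using namespace parquet::format;

      TimeUnit unit;
      unit.__set_MILLIS(MilliSeconds());  // choose the MILLIS alternative of TimeUnit

      TimestampType ts;
      ts.__set_isAdjustedToUTC(true);
      ts.__set_unit(unit);

      LogicalType lt;
      lt.__set_TIMESTAMP(ts);             // sets lt.__isset.TIMESTAMP and nothing else
      return lt;
    }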
+
+typedef struct _SchemaElement__isset {
+ _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false), field_id(false), logicalType(false) {}
+ bool type :1;
+ bool type_length :1;
+ bool repetition_type :1;
+ bool num_children :1;
+ bool converted_type :1;
+ bool scale :1;
+ bool precision :1;
+ bool field_id :1;
+ bool logicalType :1;
+} _SchemaElement__isset;
+
+class SchemaElement : public virtual ::apache::thrift::TBase {
+ public:
+
+ SchemaElement(const SchemaElement&);
+ SchemaElement& operator=(const SchemaElement&);
+ SchemaElement() : type((Type::type)0), type_length(0), repetition_type((FieldRepetitionType::type)0), name(), num_children(0), converted_type((ConvertedType::type)0), scale(0), precision(0), field_id(0) {
+ }
+
+ virtual ~SchemaElement() noexcept;
+ Type::type type;
+ int32_t type_length;
+ FieldRepetitionType::type repetition_type;
+ std::string name;
+ int32_t num_children;
+ ConvertedType::type converted_type;
+ int32_t scale;
+ int32_t precision;
+ int32_t field_id;
+ LogicalType logicalType;
+
+ _SchemaElement__isset __isset;
+
+ void __set_type(const Type::type val);
+
+ void __set_type_length(const int32_t val);
+
+ void __set_repetition_type(const FieldRepetitionType::type val);
+
+ void __set_name(const std::string& val);
+
+ void __set_num_children(const int32_t val);
+
+ void __set_converted_type(const ConvertedType::type val);
+
+ void __set_scale(const int32_t val);
+
+ void __set_precision(const int32_t val);
+
+ void __set_field_id(const int32_t val);
+
+ void __set_logicalType(const LogicalType& val);
+
+ bool operator == (const SchemaElement & rhs) const
+ {
+ if (__isset.type != rhs.__isset.type)
+ return false;
+ else if (__isset.type && !(type == rhs.type))
+ return false;
+ if (__isset.type_length != rhs.__isset.type_length)
+ return false;
+ else if (__isset.type_length && !(type_length == rhs.type_length))
+ return false;
+ if (__isset.repetition_type != rhs.__isset.repetition_type)
+ return false;
+ else if (__isset.repetition_type && !(repetition_type == rhs.repetition_type))
+ return false;
+ if (!(name == rhs.name))
+ return false;
+ if (__isset.num_children != rhs.__isset.num_children)
+ return false;
+ else if (__isset.num_children && !(num_children == rhs.num_children))
+ return false;
+ if (__isset.converted_type != rhs.__isset.converted_type)
+ return false;
+ else if (__isset.converted_type && !(converted_type == rhs.converted_type))
+ return false;
+ if (__isset.scale != rhs.__isset.scale)
+ return false;
+ else if (__isset.scale && !(scale == rhs.scale))
+ return false;
+ if (__isset.precision != rhs.__isset.precision)
+ return false;
+ else if (__isset.precision && !(precision == rhs.precision))
+ return false;
+ if (__isset.field_id != rhs.__isset.field_id)
+ return false;
+ else if (__isset.field_id && !(field_id == rhs.field_id))
+ return false;
+ if (__isset.logicalType != rhs.__isset.logicalType)
+ return false;
+ else if (__isset.logicalType && !(logicalType == rhs.logicalType))
+ return false;
+ return true;
+ }
+ bool operator != (const SchemaElement &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SchemaElement & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SchemaElement &a, SchemaElement &b);
+
+std::ostream& operator<<(std::ostream& out, const SchemaElement& obj);
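
Aside (not part of the generated header): a Parquet schema is flattened into a vector of SchemaElement nodes; leaves carry a physical type, groups carry num_children. A minimal sketch of a required INT64 leaf (tree layout and field ids are the writer's concern and are omitted):

    #include <string>
    #include "parquet_types.h"

    parquet::format::SchemaElement make_int64_leaf(const std::string& name) {
      using namespace parquet::format;

      SchemaElement e;
      e.__set_name(name);
      e.__set_type(Type::INT64);  // presence of a physical type marks a leaf
      e.__set_repetition_type(FieldRepetitionType::REQUIRED);
      return e;
    }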
+
+typedef struct _DataPageHeader__isset {
+ _DataPageHeader__isset() : statistics(false) {}
+ bool statistics :1;
+} _DataPageHeader__isset;
+
+class DataPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ DataPageHeader(const DataPageHeader&);
+ DataPageHeader& operator=(const DataPageHeader&);
+ DataPageHeader() : num_values(0), encoding((Encoding::type)0), definition_level_encoding((Encoding::type)0), repetition_level_encoding((Encoding::type)0) {
+ }
+
+ virtual ~DataPageHeader() noexcept;
+ int32_t num_values;
+ Encoding::type encoding;
+ Encoding::type definition_level_encoding;
+ Encoding::type repetition_level_encoding;
+ Statistics statistics;
+
+ _DataPageHeader__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_definition_level_encoding(const Encoding::type val);
+
+ void __set_repetition_level_encoding(const Encoding::type val);
+
+ void __set_statistics(const Statistics& val);
+
+ bool operator == (const DataPageHeader & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(definition_level_encoding == rhs.definition_level_encoding))
+ return false;
+ if (!(repetition_level_encoding == rhs.repetition_level_encoding))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ return true;
+ }
+ bool operator != (const DataPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DataPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DataPageHeader &a, DataPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj);
+
+
+class IndexPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ IndexPageHeader(const IndexPageHeader&);
+ IndexPageHeader& operator=(const IndexPageHeader&);
+ IndexPageHeader() {
+ }
+
+ virtual ~IndexPageHeader() noexcept;
+
+ bool operator == (const IndexPageHeader & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const IndexPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const IndexPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(IndexPageHeader &a, IndexPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj);
+
+typedef struct _DictionaryPageHeader__isset {
+ _DictionaryPageHeader__isset() : is_sorted(false) {}
+ bool is_sorted :1;
+} _DictionaryPageHeader__isset;
+
+class DictionaryPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ DictionaryPageHeader(const DictionaryPageHeader&);
+ DictionaryPageHeader& operator=(const DictionaryPageHeader&);
+ DictionaryPageHeader() : num_values(0), encoding((Encoding::type)0), is_sorted(0) {
+ }
+
+ virtual ~DictionaryPageHeader() noexcept;
+ int32_t num_values;
+ Encoding::type encoding;
+ bool is_sorted;
+
+ _DictionaryPageHeader__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_is_sorted(const bool val);
+
+ bool operator == (const DictionaryPageHeader & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (__isset.is_sorted != rhs.__isset.is_sorted)
+ return false;
+ else if (__isset.is_sorted && !(is_sorted == rhs.is_sorted))
+ return false;
+ return true;
+ }
+ bool operator != (const DictionaryPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DictionaryPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DictionaryPageHeader &a, DictionaryPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj);
+
+typedef struct _DataPageHeaderV2__isset {
+ _DataPageHeaderV2__isset() : is_compressed(true), statistics(false) {}
+ bool is_compressed :1;
+ bool statistics :1;
+} _DataPageHeaderV2__isset;
+
+class DataPageHeaderV2 : public virtual ::apache::thrift::TBase {
+ public:
+
+ DataPageHeaderV2(const DataPageHeaderV2&);
+ DataPageHeaderV2& operator=(const DataPageHeaderV2&);
+ DataPageHeaderV2() : num_values(0), num_nulls(0), num_rows(0), encoding((Encoding::type)0), definition_levels_byte_length(0), repetition_levels_byte_length(0), is_compressed(true) {
+ }
+
+ virtual ~DataPageHeaderV2() noexcept;
+ int32_t num_values;
+ int32_t num_nulls;
+ int32_t num_rows;
+ Encoding::type encoding;
+ int32_t definition_levels_byte_length;
+ int32_t repetition_levels_byte_length;
+ bool is_compressed;
+ Statistics statistics;
+
+ _DataPageHeaderV2__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_num_nulls(const int32_t val);
+
+ void __set_num_rows(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_definition_levels_byte_length(const int32_t val);
+
+ void __set_repetition_levels_byte_length(const int32_t val);
+
+ void __set_is_compressed(const bool val);
+
+ void __set_statistics(const Statistics& val);
+
+ bool operator == (const DataPageHeaderV2 & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(num_nulls == rhs.num_nulls))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(definition_levels_byte_length == rhs.definition_levels_byte_length))
+ return false;
+ if (!(repetition_levels_byte_length == rhs.repetition_levels_byte_length))
+ return false;
+ if (__isset.is_compressed != rhs.__isset.is_compressed)
+ return false;
+ else if (__isset.is_compressed && !(is_compressed == rhs.is_compressed))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ return true;
+ }
+ bool operator != (const DataPageHeaderV2 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DataPageHeaderV2 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b);
+
+std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj);
+
+
+class SplitBlockAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ SplitBlockAlgorithm(const SplitBlockAlgorithm&);
+ SplitBlockAlgorithm& operator=(const SplitBlockAlgorithm&);
+ SplitBlockAlgorithm() {
+ }
+
+ virtual ~SplitBlockAlgorithm() noexcept;
+
+ bool operator == (const SplitBlockAlgorithm & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const SplitBlockAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SplitBlockAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj);
+
+typedef struct _BloomFilterAlgorithm__isset {
+ _BloomFilterAlgorithm__isset() : BLOCK(false) {}
+ bool BLOCK :1;
+} _BloomFilterAlgorithm__isset;
+
+class BloomFilterAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterAlgorithm(const BloomFilterAlgorithm&);
+ BloomFilterAlgorithm& operator=(const BloomFilterAlgorithm&);
+ BloomFilterAlgorithm() {
+ }
+
+ virtual ~BloomFilterAlgorithm() noexcept;
+ SplitBlockAlgorithm BLOCK;
+
+ _BloomFilterAlgorithm__isset __isset;
+
+ void __set_BLOCK(const SplitBlockAlgorithm& val);
+
+ bool operator == (const BloomFilterAlgorithm & rhs) const
+ {
+ if (__isset.BLOCK != rhs.__isset.BLOCK)
+ return false;
+ else if (__isset.BLOCK && !(BLOCK == rhs.BLOCK))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj);
+
+
+class XxHash : public virtual ::apache::thrift::TBase {
+ public:
+
+ XxHash(const XxHash&);
+ XxHash& operator=(const XxHash&);
+ XxHash() {
+ }
+
+ virtual ~XxHash() noexcept;
+
+ bool operator == (const XxHash & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const XxHash &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const XxHash & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(XxHash &a, XxHash &b);
+
+std::ostream& operator<<(std::ostream& out, const XxHash& obj);
+
+typedef struct _BloomFilterHash__isset {
+ _BloomFilterHash__isset() : XXHASH(false) {}
+ bool XXHASH :1;
+} _BloomFilterHash__isset;
+
+class BloomFilterHash : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterHash(const BloomFilterHash&);
+ BloomFilterHash& operator=(const BloomFilterHash&);
+ BloomFilterHash() {
+ }
+
+ virtual ~BloomFilterHash() noexcept;
+ XxHash XXHASH;
+
+ _BloomFilterHash__isset __isset;
+
+ void __set_XXHASH(const XxHash& val);
+
+ bool operator == (const BloomFilterHash & rhs) const
+ {
+ if (__isset.XXHASH != rhs.__isset.XXHASH)
+ return false;
+ else if (__isset.XXHASH && !(XXHASH == rhs.XXHASH))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterHash &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterHash & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterHash &a, BloomFilterHash &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj);
+
+
+class Uncompressed : public virtual ::apache::thrift::TBase {
+ public:
+
+ Uncompressed(const Uncompressed&);
+ Uncompressed& operator=(const Uncompressed&);
+ Uncompressed() {
+ }
+
+ virtual ~Uncompressed() noexcept;
+
+ bool operator == (const Uncompressed & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const Uncompressed &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const Uncompressed & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(Uncompressed &a, Uncompressed &b);
+
+std::ostream& operator<<(std::ostream& out, const Uncompressed& obj);
+
+typedef struct _BloomFilterCompression__isset {
+ _BloomFilterCompression__isset() : UNCOMPRESSED(false) {}
+ bool UNCOMPRESSED :1;
+} _BloomFilterCompression__isset;
+
+class BloomFilterCompression : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterCompression(const BloomFilterCompression&);
+ BloomFilterCompression& operator=(const BloomFilterCompression&);
+ BloomFilterCompression() {
+ }
+
+ virtual ~BloomFilterCompression() noexcept;
+ Uncompressed UNCOMPRESSED;
+
+ _BloomFilterCompression__isset __isset;
+
+ void __set_UNCOMPRESSED(const Uncompressed& val);
+
+ bool operator == (const BloomFilterCompression & rhs) const
+ {
+ if (__isset.UNCOMPRESSED != rhs.__isset.UNCOMPRESSED)
+ return false;
+ else if (__isset.UNCOMPRESSED && !(UNCOMPRESSED == rhs.UNCOMPRESSED))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterCompression &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterCompression & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterCompression &a, BloomFilterCompression &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj);
+
+
+class BloomFilterHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterHeader(const BloomFilterHeader&);
+ BloomFilterHeader& operator=(const BloomFilterHeader&);
+ BloomFilterHeader() : numBytes(0) {
+ }
+
+ virtual ~BloomFilterHeader() noexcept;
+ int32_t numBytes;
+ BloomFilterAlgorithm algorithm;
+ BloomFilterHash hash;
+ BloomFilterCompression compression;
+
+ void __set_numBytes(const int32_t val);
+
+ void __set_algorithm(const BloomFilterAlgorithm& val);
+
+ void __set_hash(const BloomFilterHash& val);
+
+ void __set_compression(const BloomFilterCompression& val);
+
+ bool operator == (const BloomFilterHeader & rhs) const
+ {
+ if (!(numBytes == rhs.numBytes))
+ return false;
+ if (!(algorithm == rhs.algorithm))
+ return false;
+ if (!(hash == rhs.hash))
+ return false;
+ if (!(compression == rhs.compression))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterHeader &a, BloomFilterHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj);
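// Editorial sketch, not generated code: the Bloom filter structs above are
// Thrift unions rendered as plain structs plus an __isset bitfield, so a
// header is assembled by populating exactly one member of each union through
// the generated __set_* helpers. SplitBlockAlgorithm and its __set_BLOCK
// setter are assumed from the union's BLOCK member referenced earlier in
// this header; the field values are hypothetical.
inline BloomFilterHeader MakeExampleBloomFilterHeader() {
  BloomFilterHeader header;
  header.__set_numBytes(1024);  // size of the filter bitset in bytes
  BloomFilterAlgorithm algorithm;
  algorithm.__set_BLOCK(SplitBlockAlgorithm());
  header.__set_algorithm(algorithm);
  BloomFilterHash hash;
  hash.__set_XXHASH(XxHash());  // XXHASH is the union's only member above
  header.__set_hash(hash);
  BloomFilterCompression compression;
  compression.__set_UNCOMPRESSED(Uncompressed());
  header.__set_compression(compression);
  return header;
}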
+
+typedef struct _PageHeader__isset {
+ _PageHeader__isset() : crc(false), data_page_header(false), index_page_header(false), dictionary_page_header(false), data_page_header_v2(false) {}
+ bool crc :1;
+ bool data_page_header :1;
+ bool index_page_header :1;
+ bool dictionary_page_header :1;
+ bool data_page_header_v2 :1;
+} _PageHeader__isset;
+
+class PageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageHeader(const PageHeader&);
+ PageHeader& operator=(const PageHeader&);
+ PageHeader() : type((PageType::type)0), uncompressed_page_size(0), compressed_page_size(0), crc(0) {
+ }
+
+ virtual ~PageHeader() noexcept;
+ PageType::type type;
+ int32_t uncompressed_page_size;
+ int32_t compressed_page_size;
+ int32_t crc;
+ DataPageHeader data_page_header;
+ IndexPageHeader index_page_header;
+ DictionaryPageHeader dictionary_page_header;
+ DataPageHeaderV2 data_page_header_v2;
+
+ _PageHeader__isset __isset;
+
+ void __set_type(const PageType::type val);
+
+ void __set_uncompressed_page_size(const int32_t val);
+
+ void __set_compressed_page_size(const int32_t val);
+
+ void __set_crc(const int32_t val);
+
+ void __set_data_page_header(const DataPageHeader& val);
+
+ void __set_index_page_header(const IndexPageHeader& val);
+
+ void __set_dictionary_page_header(const DictionaryPageHeader& val);
+
+ void __set_data_page_header_v2(const DataPageHeaderV2& val);
+
+ bool operator == (const PageHeader & rhs) const
+ {
+ if (!(type == rhs.type))
+ return false;
+ if (!(uncompressed_page_size == rhs.uncompressed_page_size))
+ return false;
+ if (!(compressed_page_size == rhs.compressed_page_size))
+ return false;
+ if (__isset.crc != rhs.__isset.crc)
+ return false;
+ else if (__isset.crc && !(crc == rhs.crc))
+ return false;
+ if (__isset.data_page_header != rhs.__isset.data_page_header)
+ return false;
+ else if (__isset.data_page_header && !(data_page_header == rhs.data_page_header))
+ return false;
+ if (__isset.index_page_header != rhs.__isset.index_page_header)
+ return false;
+ else if (__isset.index_page_header && !(index_page_header == rhs.index_page_header))
+ return false;
+ if (__isset.dictionary_page_header != rhs.__isset.dictionary_page_header)
+ return false;
+ else if (__isset.dictionary_page_header && !(dictionary_page_header == rhs.dictionary_page_header))
+ return false;
+ if (__isset.data_page_header_v2 != rhs.__isset.data_page_header_v2)
+ return false;
+ else if (__isset.data_page_header_v2 && !(data_page_header_v2 == rhs.data_page_header_v2))
+ return false;
+ return true;
+ }
+ bool operator != (const PageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageHeader &a, PageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const PageHeader& obj);
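// Editorial sketch: PageHeader carries at most one of several optional
// sub-headers, and a reader must consult __isset before touching any of
// them. A hypothetical dispatch over a freshly parsed header:
inline void DispatchPage(const PageHeader& hdr) {
  if (hdr.__isset.data_page_header) {
    // v1 data page: consume hdr.data_page_header fields (num_values, ...).
  } else if (hdr.__isset.data_page_header_v2) {
    // v2 data page: rep/def levels are stored uncompressed up front.
  } else if (hdr.__isset.dictionary_page_header) {
    // dictionary page preceding the data pages of this column chunk.
  }
}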
+
+typedef struct _KeyValue__isset {
+ _KeyValue__isset() : value(false) {}
+ bool value :1;
+} _KeyValue__isset;
+
+class KeyValue : public virtual ::apache::thrift::TBase {
+ public:
+
+ KeyValue(const KeyValue&);
+ KeyValue& operator=(const KeyValue&);
+ KeyValue() : key(), value() {
+ }
+
+ virtual ~KeyValue() noexcept;
+ std::string key;
+ std::string value;
+
+ _KeyValue__isset __isset;
+
+ void __set_key(const std::string& val);
+
+ void __set_value(const std::string& val);
+
+ bool operator == (const KeyValue & rhs) const
+ {
+ if (!(key == rhs.key))
+ return false;
+ if (__isset.value != rhs.__isset.value)
+ return false;
+ else if (__isset.value && !(value == rhs.value))
+ return false;
+ return true;
+ }
+ bool operator != (const KeyValue &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const KeyValue & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(KeyValue &a, KeyValue &b);
+
+std::ostream& operator<<(std::ostream& out, const KeyValue& obj);
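// Editorial sketch: KeyValue pairs carry free-form file metadata (Arrow, for
// example, stores its serialized schema under a well-known key). A
// hypothetical lookup over a parsed list, returning null when the key is
// absent or its optional value is unset:
inline const std::string* FindMetadata(const std::vector<KeyValue>& kvs,
                                       const std::string& key) {
  for (const KeyValue& kv : kvs) {
    if (kv.key == key && kv.__isset.value) return &kv.value;
  }
  return nullptr;
}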
+
+
+class SortingColumn : public virtual ::apache::thrift::TBase {
+ public:
+
+ SortingColumn(const SortingColumn&);
+ SortingColumn& operator=(const SortingColumn&);
+ SortingColumn() : column_idx(0), descending(0), nulls_first(0) {
+ }
+
+ virtual ~SortingColumn() noexcept;
+ int32_t column_idx;
+ bool descending;
+ bool nulls_first;
+
+ void __set_column_idx(const int32_t val);
+
+ void __set_descending(const bool val);
+
+ void __set_nulls_first(const bool val);
+
+ bool operator == (const SortingColumn & rhs) const
+ {
+ if (!(column_idx == rhs.column_idx))
+ return false;
+ if (!(descending == rhs.descending))
+ return false;
+ if (!(nulls_first == rhs.nulls_first))
+ return false;
+ return true;
+ }
+ bool operator != (const SortingColumn &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SortingColumn & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SortingColumn &a, SortingColumn &b);
+
+std::ostream& operator<<(std::ostream& out, const SortingColumn& obj);
+
+
+class PageEncodingStats : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageEncodingStats(const PageEncodingStats&);
+ PageEncodingStats& operator=(const PageEncodingStats&);
+ PageEncodingStats() : page_type((PageType::type)0), encoding((Encoding::type)0), count(0) {
+ }
+
+ virtual ~PageEncodingStats() noexcept;
+ PageType::type page_type;
+ Encoding::type encoding;
+ int32_t count;
+
+ void __set_page_type(const PageType::type val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_count(const int32_t val);
+
+ bool operator == (const PageEncodingStats & rhs) const
+ {
+ if (!(page_type == rhs.page_type))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(count == rhs.count))
+ return false;
+ return true;
+ }
+ bool operator != (const PageEncodingStats &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageEncodingStats & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageEncodingStats &a, PageEncodingStats &b);
+
+std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj);
+
+typedef struct _ColumnMetaData__isset {
+ _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false) {}
+ bool key_value_metadata :1;
+ bool index_page_offset :1;
+ bool dictionary_page_offset :1;
+ bool statistics :1;
+ bool encoding_stats :1;
+ bool bloom_filter_offset :1;
+} _ColumnMetaData__isset;
+
+class ColumnMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnMetaData(const ColumnMetaData&);
+ ColumnMetaData& operator=(const ColumnMetaData&);
+ ColumnMetaData() : type((Type::type)0), codec((CompressionCodec::type)0), num_values(0), total_uncompressed_size(0), total_compressed_size(0), data_page_offset(0), index_page_offset(0), dictionary_page_offset(0), bloom_filter_offset(0) {
+ }
+
+ virtual ~ColumnMetaData() noexcept;
+ Type::type type;
+ std::vector<Encoding::type> encodings;
+ std::vector<std::string> path_in_schema;
+ CompressionCodec::type codec;
+ int64_t num_values;
+ int64_t total_uncompressed_size;
+ int64_t total_compressed_size;
+ std::vector<KeyValue> key_value_metadata;
+ int64_t data_page_offset;
+ int64_t index_page_offset;
+ int64_t dictionary_page_offset;
+ Statistics statistics;
+ std::vector<PageEncodingStats> encoding_stats;
+ int64_t bloom_filter_offset;
+
+ _ColumnMetaData__isset __isset;
+
+ void __set_type(const Type::type val);
+
+ void __set_encodings(const std::vector<Encoding::type> & val);
+
+ void __set_path_in_schema(const std::vector<std::string> & val);
+
+ void __set_codec(const CompressionCodec::type val);
+
+ void __set_num_values(const int64_t val);
+
+ void __set_total_uncompressed_size(const int64_t val);
+
+ void __set_total_compressed_size(const int64_t val);
+
+ void __set_key_value_metadata(const std::vector<KeyValue> & val);
+
+ void __set_data_page_offset(const int64_t val);
+
+ void __set_index_page_offset(const int64_t val);
+
+ void __set_dictionary_page_offset(const int64_t val);
+
+ void __set_statistics(const Statistics& val);
+
+ void __set_encoding_stats(const std::vector<PageEncodingStats> & val);
+
+ void __set_bloom_filter_offset(const int64_t val);
+
+ bool operator == (const ColumnMetaData & rhs) const
+ {
+ if (!(type == rhs.type))
+ return false;
+ if (!(encodings == rhs.encodings))
+ return false;
+ if (!(path_in_schema == rhs.path_in_schema))
+ return false;
+ if (!(codec == rhs.codec))
+ return false;
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(total_uncompressed_size == rhs.total_uncompressed_size))
+ return false;
+ if (!(total_compressed_size == rhs.total_compressed_size))
+ return false;
+ if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+ return false;
+ else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+ return false;
+ if (!(data_page_offset == rhs.data_page_offset))
+ return false;
+ if (__isset.index_page_offset != rhs.__isset.index_page_offset)
+ return false;
+ else if (__isset.index_page_offset && !(index_page_offset == rhs.index_page_offset))
+ return false;
+ if (__isset.dictionary_page_offset != rhs.__isset.dictionary_page_offset)
+ return false;
+ else if (__isset.dictionary_page_offset && !(dictionary_page_offset == rhs.dictionary_page_offset))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ if (__isset.encoding_stats != rhs.__isset.encoding_stats)
+ return false;
+ else if (__isset.encoding_stats && !(encoding_stats == rhs.encoding_stats))
+ return false;
+ if (__isset.bloom_filter_offset != rhs.__isset.bloom_filter_offset)
+ return false;
+ else if (__isset.bloom_filter_offset && !(bloom_filter_offset == rhs.bloom_filter_offset))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnMetaData &a, ColumnMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj);
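// Editorial sketch: when a chunk has a dictionary page it precedes the first
// data page, so readers commonly derive the chunk's start offset as below.
// The extra guards are a defensive convention for files that write a bogus
// zero dictionary_page_offset; this helper is illustrative, not part of the
// generated API.
inline int64_t ChunkStartOffset(const ColumnMetaData& md) {
  if (md.__isset.dictionary_page_offset && md.dictionary_page_offset > 0 &&
      md.dictionary_page_offset < md.data_page_offset) {
    return md.dictionary_page_offset;
  }
  return md.data_page_offset;
}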
+
+
+class EncryptionWithFooterKey : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionWithFooterKey(const EncryptionWithFooterKey&);
+ EncryptionWithFooterKey& operator=(const EncryptionWithFooterKey&);
+ EncryptionWithFooterKey() {
+ }
+
+ virtual ~EncryptionWithFooterKey() noexcept;
+
+ bool operator == (const EncryptionWithFooterKey & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const EncryptionWithFooterKey &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionWithFooterKey & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj);
+
+typedef struct _EncryptionWithColumnKey__isset {
+ _EncryptionWithColumnKey__isset() : key_metadata(false) {}
+ bool key_metadata :1;
+} _EncryptionWithColumnKey__isset;
+
+class EncryptionWithColumnKey : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionWithColumnKey(const EncryptionWithColumnKey&);
+ EncryptionWithColumnKey& operator=(const EncryptionWithColumnKey&);
+ EncryptionWithColumnKey() : key_metadata() {
+ }
+
+ virtual ~EncryptionWithColumnKey() noexcept;
+ std::vector<std::string> path_in_schema;
+ std::string key_metadata;
+
+ _EncryptionWithColumnKey__isset __isset;
+
+ void __set_path_in_schema(const std::vector<std::string> & val);
+
+ void __set_key_metadata(const std::string& val);
+
+ bool operator == (const EncryptionWithColumnKey & rhs) const
+ {
+ if (!(path_in_schema == rhs.path_in_schema))
+ return false;
+ if (__isset.key_metadata != rhs.__isset.key_metadata)
+ return false;
+ else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const EncryptionWithColumnKey &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionWithColumnKey & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj);
+
+typedef struct _ColumnCryptoMetaData__isset {
+ _ColumnCryptoMetaData__isset() : ENCRYPTION_WITH_FOOTER_KEY(false), ENCRYPTION_WITH_COLUMN_KEY(false) {}
+ bool ENCRYPTION_WITH_FOOTER_KEY :1;
+ bool ENCRYPTION_WITH_COLUMN_KEY :1;
+} _ColumnCryptoMetaData__isset;
+
+class ColumnCryptoMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnCryptoMetaData(const ColumnCryptoMetaData&);
+ ColumnCryptoMetaData& operator=(const ColumnCryptoMetaData&);
+ ColumnCryptoMetaData() {
+ }
+
+ virtual ~ColumnCryptoMetaData() noexcept;
+ EncryptionWithFooterKey ENCRYPTION_WITH_FOOTER_KEY;
+ EncryptionWithColumnKey ENCRYPTION_WITH_COLUMN_KEY;
+
+ _ColumnCryptoMetaData__isset __isset;
+
+ void __set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val);
+
+ void __set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val);
+
+ bool operator == (const ColumnCryptoMetaData & rhs) const
+ {
+ if (__isset.ENCRYPTION_WITH_FOOTER_KEY != rhs.__isset.ENCRYPTION_WITH_FOOTER_KEY)
+ return false;
+ else if (__isset.ENCRYPTION_WITH_FOOTER_KEY && !(ENCRYPTION_WITH_FOOTER_KEY == rhs.ENCRYPTION_WITH_FOOTER_KEY))
+ return false;
+ if (__isset.ENCRYPTION_WITH_COLUMN_KEY != rhs.__isset.ENCRYPTION_WITH_COLUMN_KEY)
+ return false;
+ else if (__isset.ENCRYPTION_WITH_COLUMN_KEY && !(ENCRYPTION_WITH_COLUMN_KEY == rhs.ENCRYPTION_WITH_COLUMN_KEY))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnCryptoMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnCryptoMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj);
+
+typedef struct _ColumnChunk__isset {
+ _ColumnChunk__isset() : file_path(false), meta_data(false), offset_index_offset(false), offset_index_length(false), column_index_offset(false), column_index_length(false), crypto_metadata(false), encrypted_column_metadata(false) {}
+ bool file_path :1;
+ bool meta_data :1;
+ bool offset_index_offset :1;
+ bool offset_index_length :1;
+ bool column_index_offset :1;
+ bool column_index_length :1;
+ bool crypto_metadata :1;
+ bool encrypted_column_metadata :1;
+} _ColumnChunk__isset;
+
+class ColumnChunk : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnChunk(const ColumnChunk&);
+ ColumnChunk& operator=(const ColumnChunk&);
+ ColumnChunk() : file_path(), file_offset(0), offset_index_offset(0), offset_index_length(0), column_index_offset(0), column_index_length(0), encrypted_column_metadata() {
+ }
+
+ virtual ~ColumnChunk() noexcept;
+ std::string file_path;
+ int64_t file_offset;
+ ColumnMetaData meta_data;
+ int64_t offset_index_offset;
+ int32_t offset_index_length;
+ int64_t column_index_offset;
+ int32_t column_index_length;
+ ColumnCryptoMetaData crypto_metadata;
+ std::string encrypted_column_metadata;
+
+ _ColumnChunk__isset __isset;
+
+ void __set_file_path(const std::string& val);
+
+ void __set_file_offset(const int64_t val);
+
+ void __set_meta_data(const ColumnMetaData& val);
+
+ void __set_offset_index_offset(const int64_t val);
+
+ void __set_offset_index_length(const int32_t val);
+
+ void __set_column_index_offset(const int64_t val);
+
+ void __set_column_index_length(const int32_t val);
+
+ void __set_crypto_metadata(const ColumnCryptoMetaData& val);
+
+ void __set_encrypted_column_metadata(const std::string& val);
+
+ bool operator == (const ColumnChunk & rhs) const
+ {
+ if (__isset.file_path != rhs.__isset.file_path)
+ return false;
+ else if (__isset.file_path && !(file_path == rhs.file_path))
+ return false;
+ if (!(file_offset == rhs.file_offset))
+ return false;
+ if (__isset.meta_data != rhs.__isset.meta_data)
+ return false;
+ else if (__isset.meta_data && !(meta_data == rhs.meta_data))
+ return false;
+ if (__isset.offset_index_offset != rhs.__isset.offset_index_offset)
+ return false;
+ else if (__isset.offset_index_offset && !(offset_index_offset == rhs.offset_index_offset))
+ return false;
+ if (__isset.offset_index_length != rhs.__isset.offset_index_length)
+ return false;
+ else if (__isset.offset_index_length && !(offset_index_length == rhs.offset_index_length))
+ return false;
+ if (__isset.column_index_offset != rhs.__isset.column_index_offset)
+ return false;
+ else if (__isset.column_index_offset && !(column_index_offset == rhs.column_index_offset))
+ return false;
+ if (__isset.column_index_length != rhs.__isset.column_index_length)
+ return false;
+ else if (__isset.column_index_length && !(column_index_length == rhs.column_index_length))
+ return false;
+ if (__isset.crypto_metadata != rhs.__isset.crypto_metadata)
+ return false;
+ else if (__isset.crypto_metadata && !(crypto_metadata == rhs.crypto_metadata))
+ return false;
+ if (__isset.encrypted_column_metadata != rhs.__isset.encrypted_column_metadata)
+ return false;
+ else if (__isset.encrypted_column_metadata && !(encrypted_column_metadata == rhs.encrypted_column_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnChunk &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnChunk & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnChunk &a, ColumnChunk &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj);
+
+typedef struct _RowGroup__isset {
+ _RowGroup__isset() : sorting_columns(false), file_offset(false), total_compressed_size(false), ordinal(false) {}
+ bool sorting_columns :1;
+ bool file_offset :1;
+ bool total_compressed_size :1;
+ bool ordinal :1;
+} _RowGroup__isset;
+
+class RowGroup : public virtual ::apache::thrift::TBase {
+ public:
+
+ RowGroup(const RowGroup&);
+ RowGroup& operator=(const RowGroup&);
+ RowGroup() : total_byte_size(0), num_rows(0), file_offset(0), total_compressed_size(0), ordinal(0) {
+ }
+
+ virtual ~RowGroup() noexcept;
+ std::vector<ColumnChunk> columns;
+ int64_t total_byte_size;
+ int64_t num_rows;
+ std::vector<SortingColumn> sorting_columns;
+ int64_t file_offset;
+ int64_t total_compressed_size;
+ int16_t ordinal;
+
+ _RowGroup__isset __isset;
+
+ void __set_columns(const std::vector<ColumnChunk> & val);
+
+ void __set_total_byte_size(const int64_t val);
+
+ void __set_num_rows(const int64_t val);
+
+ void __set_sorting_columns(const std::vector<SortingColumn> & val);
+
+ void __set_file_offset(const int64_t val);
+
+ void __set_total_compressed_size(const int64_t val);
+
+ void __set_ordinal(const int16_t val);
+
+ bool operator == (const RowGroup & rhs) const
+ {
+ if (!(columns == rhs.columns))
+ return false;
+ if (!(total_byte_size == rhs.total_byte_size))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (__isset.sorting_columns != rhs.__isset.sorting_columns)
+ return false;
+ else if (__isset.sorting_columns && !(sorting_columns == rhs.sorting_columns))
+ return false;
+ if (__isset.file_offset != rhs.__isset.file_offset)
+ return false;
+ else if (__isset.file_offset && !(file_offset == rhs.file_offset))
+ return false;
+ if (__isset.total_compressed_size != rhs.__isset.total_compressed_size)
+ return false;
+ else if (__isset.total_compressed_size && !(total_compressed_size == rhs.total_compressed_size))
+ return false;
+ if (__isset.ordinal != rhs.__isset.ordinal)
+ return false;
+ else if (__isset.ordinal && !(ordinal == rhs.ordinal))
+ return false;
+ return true;
+ }
+ bool operator != (const RowGroup &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const RowGroup & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(RowGroup &a, RowGroup &b);
+
+std::ostream& operator<<(std::ostream& out, const RowGroup& obj);
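// Editorial sketch: RowGroup.total_compressed_size is optional, so a robust
// reader falls back to summing the per-chunk totals. Hypothetical helper:
inline int64_t RowGroupCompressedSize(const RowGroup& rg) {
  if (rg.__isset.total_compressed_size) return rg.total_compressed_size;
  int64_t total = 0;
  for (const ColumnChunk& cc : rg.columns) {
    if (cc.__isset.meta_data) total += cc.meta_data.total_compressed_size;
  }
  return total;
}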
+
+
+class TypeDefinedOrder : public virtual ::apache::thrift::TBase {
+ public:
+
+ TypeDefinedOrder(const TypeDefinedOrder&);
+ TypeDefinedOrder& operator=(const TypeDefinedOrder&);
+ TypeDefinedOrder() {
+ }
+
+ virtual ~TypeDefinedOrder() noexcept;
+
+ bool operator == (const TypeDefinedOrder & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const TypeDefinedOrder &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TypeDefinedOrder & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TypeDefinedOrder &a, TypeDefinedOrder &b);
+
+std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj);
+
+typedef struct _ColumnOrder__isset {
+ _ColumnOrder__isset() : TYPE_ORDER(false) {}
+ bool TYPE_ORDER :1;
+} _ColumnOrder__isset;
+
+class ColumnOrder : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnOrder(const ColumnOrder&);
+ ColumnOrder& operator=(const ColumnOrder&);
+ ColumnOrder() {
+ }
+
+ virtual ~ColumnOrder() noexcept;
+ TypeDefinedOrder TYPE_ORDER;
+
+ _ColumnOrder__isset __isset;
+
+ void __set_TYPE_ORDER(const TypeDefinedOrder& val);
+
+ bool operator == (const ColumnOrder & rhs) const
+ {
+ if (__isset.TYPE_ORDER != rhs.__isset.TYPE_ORDER)
+ return false;
+ else if (__isset.TYPE_ORDER && !(TYPE_ORDER == rhs.TYPE_ORDER))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnOrder &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnOrder & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnOrder &a, ColumnOrder &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj);
+
+
+class PageLocation : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageLocation(const PageLocation&);
+ PageLocation& operator=(const PageLocation&);
+ PageLocation() : offset(0), compressed_page_size(0), first_row_index(0) {
+ }
+
+ virtual ~PageLocation() noexcept;
+ int64_t offset;
+ int32_t compressed_page_size;
+ int64_t first_row_index;
+
+ void __set_offset(const int64_t val);
+
+ void __set_compressed_page_size(const int32_t val);
+
+ void __set_first_row_index(const int64_t val);
+
+ bool operator == (const PageLocation & rhs) const
+ {
+ if (!(offset == rhs.offset))
+ return false;
+ if (!(compressed_page_size == rhs.compressed_page_size))
+ return false;
+ if (!(first_row_index == rhs.first_row_index))
+ return false;
+ return true;
+ }
+ bool operator != (const PageLocation &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageLocation & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageLocation &a, PageLocation &b);
+
+std::ostream& operator<<(std::ostream& out, const PageLocation& obj);
+
+
+class OffsetIndex : public virtual ::apache::thrift::TBase {
+ public:
+
+ OffsetIndex(const OffsetIndex&);
+ OffsetIndex& operator=(const OffsetIndex&);
+ OffsetIndex() {
+ }
+
+ virtual ~OffsetIndex() noexcept;
+ std::vector<PageLocation> page_locations;
+
+ void __set_page_locations(const std::vector<PageLocation> & val);
+
+ bool operator == (const OffsetIndex & rhs) const
+ {
+ if (!(page_locations == rhs.page_locations))
+ return false;
+ return true;
+ }
+ bool operator != (const OffsetIndex &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const OffsetIndex & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(OffsetIndex &a, OffsetIndex &b);
+
+std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj);
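// Editorial sketch (requires <algorithm>): page_locations is sorted by
// first_row_index with the first entry starting at row 0, so the page
// holding a given row is found with one binary search. Hypothetical helper;
// assumes a non-empty index and a row that falls within the chunk:
inline const PageLocation& FindPageForRow(const OffsetIndex& idx, int64_t row) {
  auto it = std::upper_bound(
      idx.page_locations.begin(), idx.page_locations.end(), row,
      [](int64_t r, const PageLocation& p) { return r < p.first_row_index; });
  return *(it - 1);  // last page whose first_row_index <= row
}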
+
+typedef struct _ColumnIndex__isset {
+ _ColumnIndex__isset() : null_counts(false) {}
+ bool null_counts :1;
+} _ColumnIndex__isset;
+
+class ColumnIndex : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnIndex(const ColumnIndex&);
+ ColumnIndex& operator=(const ColumnIndex&);
+ ColumnIndex() : boundary_order((BoundaryOrder::type)0) {
+ }
+
+ virtual ~ColumnIndex() noexcept;
+ std::vector<bool> null_pages;
+ std::vector<std::string> min_values;
+ std::vector<std::string> max_values;
+ BoundaryOrder::type boundary_order;
+ std::vector<int64_t> null_counts;
+
+ _ColumnIndex__isset __isset;
+
+ void __set_null_pages(const std::vector<bool> & val);
+
+ void __set_min_values(const std::vector<std::string> & val);
+
+ void __set_max_values(const std::vector<std::string> & val);
+
+ void __set_boundary_order(const BoundaryOrder::type val);
+
+ void __set_null_counts(const std::vector<int64_t> & val);
+
+ bool operator == (const ColumnIndex & rhs) const
+ {
+ if (!(null_pages == rhs.null_pages))
+ return false;
+ if (!(min_values == rhs.min_values))
+ return false;
+ if (!(max_values == rhs.max_values))
+ return false;
+ if (!(boundary_order == rhs.boundary_order))
+ return false;
+ if (__isset.null_counts != rhs.__isset.null_counts)
+ return false;
+ else if (__isset.null_counts && !(null_counts == rhs.null_counts))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnIndex &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnIndex & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnIndex &a, ColumnIndex &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj);
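// Editorial sketch: a point predicate can skip page i when the page holds
// only nulls or the probe falls outside [min_values[i], max_values[i]].
// The min/max entries are type-encoded bytes, so plain string comparison is
// only meaningful for columns whose order matches unsigned byte order (e.g.
// UTF8 data); this helper is illustrative only:
inline bool PageMayContain(const ColumnIndex& idx, size_t i,
                           const std::string& probe) {
  if (idx.null_pages[i]) return false;  // page holds only nulls
  return idx.min_values[i] <= probe && probe <= idx.max_values[i];
}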
+
+typedef struct _AesGcmV1__isset {
+ _AesGcmV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
+ bool aad_prefix :1;
+ bool aad_file_unique :1;
+ bool supply_aad_prefix :1;
+} _AesGcmV1__isset;
+
+class AesGcmV1 : public virtual ::apache::thrift::TBase {
+ public:
+
+ AesGcmV1(const AesGcmV1&);
+ AesGcmV1& operator=(const AesGcmV1&);
+ AesGcmV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
+ }
+
+ virtual ~AesGcmV1() noexcept;
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+
+ _AesGcmV1__isset __isset;
+
+ void __set_aad_prefix(const std::string& val);
+
+ void __set_aad_file_unique(const std::string& val);
+
+ void __set_supply_aad_prefix(const bool val);
+
+ bool operator == (const AesGcmV1 & rhs) const
+ {
+ if (__isset.aad_prefix != rhs.__isset.aad_prefix)
+ return false;
+ else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
+ return false;
+ if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
+ return false;
+ else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
+ return false;
+ if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
+ return false;
+ else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
+ return false;
+ return true;
+ }
+ bool operator != (const AesGcmV1 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const AesGcmV1 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(AesGcmV1 &a, AesGcmV1 &b);
+
+std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj);
+
+typedef struct _AesGcmCtrV1__isset {
+ _AesGcmCtrV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
+ bool aad_prefix :1;
+ bool aad_file_unique :1;
+ bool supply_aad_prefix :1;
+} _AesGcmCtrV1__isset;
+
+class AesGcmCtrV1 : public virtual ::apache::thrift::TBase {
+ public:
+
+ AesGcmCtrV1(const AesGcmCtrV1&);
+ AesGcmCtrV1& operator=(const AesGcmCtrV1&);
+ AesGcmCtrV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
+ }
+
+ virtual ~AesGcmCtrV1() noexcept;
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+
+ _AesGcmCtrV1__isset __isset;
+
+ void __set_aad_prefix(const std::string& val);
+
+ void __set_aad_file_unique(const std::string& val);
+
+ void __set_supply_aad_prefix(const bool val);
+
+ bool operator == (const AesGcmCtrV1 & rhs) const
+ {
+ if (__isset.aad_prefix != rhs.__isset.aad_prefix)
+ return false;
+ else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
+ return false;
+ if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
+ return false;
+ else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
+ return false;
+ if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
+ return false;
+ else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
+ return false;
+ return true;
+ }
+ bool operator != (const AesGcmCtrV1 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const AesGcmCtrV1 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b);
+
+std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj);
+
+typedef struct _EncryptionAlgorithm__isset {
+ _EncryptionAlgorithm__isset() : AES_GCM_V1(false), AES_GCM_CTR_V1(false) {}
+ bool AES_GCM_V1 :1;
+ bool AES_GCM_CTR_V1 :1;
+} _EncryptionAlgorithm__isset;
+
+class EncryptionAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionAlgorithm(const EncryptionAlgorithm&);
+ EncryptionAlgorithm& operator=(const EncryptionAlgorithm&);
+ EncryptionAlgorithm() {
+ }
+
+ virtual ~EncryptionAlgorithm() noexcept;
+ AesGcmV1 AES_GCM_V1;
+ AesGcmCtrV1 AES_GCM_CTR_V1;
+
+ _EncryptionAlgorithm__isset __isset;
+
+ void __set_AES_GCM_V1(const AesGcmV1& val);
+
+ void __set_AES_GCM_CTR_V1(const AesGcmCtrV1& val);
+
+ bool operator == (const EncryptionAlgorithm & rhs) const
+ {
+ if (__isset.AES_GCM_V1 != rhs.__isset.AES_GCM_V1)
+ return false;
+ else if (__isset.AES_GCM_V1 && !(AES_GCM_V1 == rhs.AES_GCM_V1))
+ return false;
+ if (__isset.AES_GCM_CTR_V1 != rhs.__isset.AES_GCM_CTR_V1)
+ return false;
+ else if (__isset.AES_GCM_CTR_V1 && !(AES_GCM_CTR_V1 == rhs.AES_GCM_CTR_V1))
+ return false;
+ return true;
+ }
+ bool operator != (const EncryptionAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj);
+
+typedef struct _FileMetaData__isset {
+ _FileMetaData__isset() : key_value_metadata(false), created_by(false), column_orders(false), encryption_algorithm(false), footer_signing_key_metadata(false) {}
+ bool key_value_metadata :1;
+ bool created_by :1;
+ bool column_orders :1;
+ bool encryption_algorithm :1;
+ bool footer_signing_key_metadata :1;
+} _FileMetaData__isset;
+
+class FileMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ FileMetaData(const FileMetaData&);
+ FileMetaData& operator=(const FileMetaData&);
+ FileMetaData() : version(0), num_rows(0), created_by(), footer_signing_key_metadata() {
+ }
+
+ virtual ~FileMetaData() noexcept;
+ int32_t version;
+ std::vector<SchemaElement> schema;
+ int64_t num_rows;
+ std::vector<RowGroup> row_groups;
+ std::vector<KeyValue> key_value_metadata;
+ std::string created_by;
+ std::vector<ColumnOrder> column_orders;
+ EncryptionAlgorithm encryption_algorithm;
+ std::string footer_signing_key_metadata;
+
+ _FileMetaData__isset __isset;
+
+ void __set_version(const int32_t val);
+
+ void __set_schema(const std::vector<SchemaElement> & val);
+
+ void __set_num_rows(const int64_t val);
+
+ void __set_row_groups(const std::vector<RowGroup> & val);
+
+ void __set_key_value_metadata(const std::vector<KeyValue> & val);
+
+ void __set_created_by(const std::string& val);
+
+ void __set_column_orders(const std::vector<ColumnOrder> & val);
+
+ void __set_encryption_algorithm(const EncryptionAlgorithm& val);
+
+ void __set_footer_signing_key_metadata(const std::string& val);
+
+ bool operator == (const FileMetaData & rhs) const
+ {
+ if (!(version == rhs.version))
+ return false;
+ if (!(schema == rhs.schema))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (!(row_groups == rhs.row_groups))
+ return false;
+ if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+ return false;
+ else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+ return false;
+ if (__isset.created_by != rhs.__isset.created_by)
+ return false;
+ else if (__isset.created_by && !(created_by == rhs.created_by))
+ return false;
+ if (__isset.column_orders != rhs.__isset.column_orders)
+ return false;
+ else if (__isset.column_orders && !(column_orders == rhs.column_orders))
+ return false;
+ if (__isset.encryption_algorithm != rhs.__isset.encryption_algorithm)
+ return false;
+ else if (__isset.encryption_algorithm && !(encryption_algorithm == rhs.encryption_algorithm))
+ return false;
+ if (__isset.footer_signing_key_metadata != rhs.__isset.footer_signing_key_metadata)
+ return false;
+ else if (__isset.footer_signing_key_metadata && !(footer_signing_key_metadata == rhs.footer_signing_key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const FileMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const FileMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(FileMetaData &a, FileMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const FileMetaData& obj);
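// Editorial sketch: the generated read/write methods plug into stock Thrift
// transports, and Parquet serializes its footer with the compact protocol.
// A hypothetical in-memory round trip (requires
// <thrift/protocol/TCompactProtocol.h> and
// <thrift/transport/TBufferTransports.h>); field values are illustrative:
inline FileMetaData RoundTripFileMetaData() {
  auto mem = std::make_shared<apache::thrift::transport::TMemoryBuffer>();
  apache::thrift::protocol::TCompactProtocol proto(mem);
  FileMetaData md;
  md.__set_version(1);
  md.__set_num_rows(0);
  md.write(&proto);      // serialize into the in-memory buffer
  FileMetaData decoded;
  decoded.read(&proto);  // parse it back out of the same buffer
  return decoded;
}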
+
+typedef struct _FileCryptoMetaData__isset {
+ _FileCryptoMetaData__isset() : key_metadata(false) {}
+ bool key_metadata :1;
+} _FileCryptoMetaData__isset;
+
+class FileCryptoMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ FileCryptoMetaData(const FileCryptoMetaData&);
+ FileCryptoMetaData& operator=(const FileCryptoMetaData&);
+ FileCryptoMetaData() : key_metadata() {
+ }
+
+ virtual ~FileCryptoMetaData() noexcept;
+ EncryptionAlgorithm encryption_algorithm;
+ std::string key_metadata;
+
+ _FileCryptoMetaData__isset __isset;
+
+ void __set_encryption_algorithm(const EncryptionAlgorithm& val);
+
+ void __set_key_metadata(const std::string& val);
+
+ bool operator == (const FileCryptoMetaData & rhs) const
+ {
+ if (!(encryption_algorithm == rhs.encryption_algorithm))
+ return false;
+ if (__isset.key_metadata != rhs.__isset.key_metadata)
+ return false;
+ else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const FileCryptoMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const FileCryptoMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(FileCryptoMetaData &a, FileCryptoMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
+
+}} // namespace
+
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/README b/contrib/libs/apache/arrow/cpp/src/parquet/README
index fc16a46ca08..326bd7253f4 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/README
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/README
@@ -1,10 +1,10 @@
-The CompatibilityTest of bloom_filter-test.cc is used to test cross compatibility of
-Bloom filters between parquet-mr and parquet-cpp. It reads the Bloom filter binary
-generated by the Bloom filter class in the parquet-mr project and tests whether the
-values inserted before could be filtered or not.
-
-The Bloom filter binary is generated by three steps from Parquet-mr:
-Step 1: Construct a Bloom filter with 1024 bytes of bitset.
-Step 2: Insert hashes of "hello", "parquet", "bloom", "filter" strings to Bloom filter
-by calling hash and insert APIs.
-Step 3: Call writeTo API to write to File.
+The CompatibilityTest of bloom_filter-test.cc is used to test cross compatibility of
+Bloom filters between parquet-mr and parquet-cpp. It reads the Bloom filter binary
+generated by the Bloom filter class in the parquet-mr project and tests whether the
+values inserted before could be filtered or not.
+
+The Bloom filter binary is generated by three steps from Parquet-mr:
+Step 1: Construct a Bloom filter with 1024 bytes of bitset.
+Step 2: Insert hashes of "hello", "parquet", "bloom", "filter" strings to Bloom filter
+by calling hash and insert APIs.
+Step 3: Call writeTo API to write to File.
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
index a51773c44d3..62cbee22a18 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
@@ -1,900 +1,900 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Overview.
-//
-// The strategy used for this code for repetition/definition
-// is to dissect the top level array into a list of paths
-// from the top level array to the final primitive (possibly
-// dictionary encoded array). It then evaluates each one of
-// those paths to produce results for the callback iteratively.
-//
-// This approach was taken to reduce the aggregate memory required if we were
-// to build all def/rep levels in parallel as a part of a tree traversal. It
-// also allows for straightforward parallelization at the path level if that is
-// desired in the future.
-//
-// The main downside to this approach is it duplicates effort for nodes
-// that share common ancestors. This can be mitigated to some degree
-// by adding in optimizations that detect leaf arrays that share
-// the same common list ancestor and reuse the repetition levels
-// from the first leaf encountered (only definition levels greater than
-// the list ancestor need to be re-evaluated). This is left for future
-// work.
-//
-// Algorithm.
-//
-// As mentioned above this code dissects arrays into constituent parts:
-// nullability data, and list offset data. It tries to optimize for
-// some special cases, where it is known ahead of time that a step
-// can be skipped (e.g. a nullable array happens to have all of its
-// values) or batch filled (a nullable array has all null values).
-// One further optimization that is not implemented but could be done
-// in the future is special handling for nested list arrays that
-// have some intermediate data which indicates the final array contains only
-// nulls.
-//
-// In general, the algorithm attempts to batch work at each node as much
-// as possible. For nullability nodes this means finding runs of null
-// values and batch filling those interspersed with finding runs of non-null values
-// to process in batch at the next column.
-//
-// Similarly, list runs of empty lists are all processed in one batch
-// followed by either:
-// - A single list entry for non-terminal lists (i.e. the upper part of a nested list)
-// - Runs of non-empty lists for the terminal list (i.e. the lowest part of a nested
-// list).
-//
-// This makes use of the following observations.
-// 1. Null values at any node on the path are terminal (repetition and definition
-// level can be set directly when a Null value is encountered).
-// 2. Empty lists share this eager termination property with Null values.
-// 3. In order to keep repetition/definition levels populated, the algorithm is lazy
-//    in assigning repetition levels. The algorithm tracks whether it is currently
-//    in the middle of a list by comparing the lengths of repetition/definition levels.
-//    If it is currently in the middle of a list, the number of repetition levels
-// populated will be greater than definition levels (the start of a List requires
-// adding the first element). If there are equal numbers of definition and repetition
-// levels populated this indicates a list is waiting to be started and the next list
-// encountered will have its repetition level signify the beginning of the list.
-//
-// Other implementation notes.
-//
-// This code hasn't been benchmarked (or assembly analyzed) but did the following
-// as optimizations (yes premature optimization is the root of all evil).
-// - This code does not use recursion, instead it constructs its own stack and manages
-// updating elements accordingly.
-// - It tries to avoid using Status for common return states.
-// - Avoids virtual dispatch in favor of if/else statements on a set of well known
-// classes.
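// Editorial illustration (not from the original source): the rep/def output
// this machinery produces is the classic Dremel encoding. For a nullable
// list<nullable int32> column holding the rows [[1, 2], [], null, [3]]
// (max repetition level 1, max definition level 3), the emitted levels are:
//
//   value  rep  def   meaning
//   1      0    3     first element of a new row's list
//   2      1    3     continuation within the same list
//   -      0    1     empty list (terminates eagerly, observation 2)
//   -      0    0     null list (terminates eagerly, observation 1)
//   3      0    3     one-element list starting a new row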
-
-#include "parquet/arrow/path_internal.h"
-
-#include <atomic>
-#include <cstddef>
-#include <memory>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/buffer.h"
-#include "arrow/buffer_builder.h"
-#include "arrow/extension_type.h"
-#include "arrow/memory_pool.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_visit.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/util/variant.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/properties.h"
-
-namespace parquet {
-namespace arrow {
-
-namespace {
-
-using ::arrow::Array;
-using ::arrow::Status;
-using ::arrow::TypedBufferBuilder;
-
-constexpr static int16_t kLevelNotSet = -1;
-
-/// \brief Simple result of iterating over a column to determine values.
-enum IterationResult {
- /// Processing is done at this node. Move back up the path
- /// to continue processing.
- kDone = -1,
- /// Move down towards the leaf for processing.
- kNext = 1,
- /// An error occurred while processing.
- kError = 2
-};
-
-#define RETURN_IF_ERROR(iteration_result) \
- do { \
- if (ARROW_PREDICT_FALSE(iteration_result == kError)) { \
- return iteration_result; \
- } \
- } while (false)
-
-int64_t LazyNullCount(const Array& array) { return array.data()->null_count.load(); }
-
-bool LazyNoNulls(const Array& array) {
- int64_t null_count = LazyNullCount(array);
- return null_count == 0 ||
-         // kUnknownNullCount comparison is needed to account
- // for null arrays.
- (null_count == ::arrow::kUnknownNullCount &&
- array.null_bitmap_data() == nullptr);
-}
-
-struct PathWriteContext {
- PathWriteContext(::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::ResizableBuffer> def_levels_buffer)
- : rep_levels(pool), def_levels(std::move(def_levels_buffer), pool) {}
- IterationResult ReserveDefLevels(int64_t elements) {
- last_status = def_levels.Reserve(elements);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- IterationResult AppendDefLevel(int16_t def_level) {
- last_status = def_levels.Append(def_level);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- IterationResult AppendDefLevels(int64_t count, int16_t def_level) {
- last_status = def_levels.Append(count, def_level);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- void UnsafeAppendDefLevel(int16_t def_level) { def_levels.UnsafeAppend(def_level); }
-
- IterationResult AppendRepLevel(int16_t rep_level) {
- last_status = rep_levels.Append(rep_level);
-
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- IterationResult AppendRepLevels(int64_t count, int16_t rep_level) {
- last_status = rep_levels.Append(count, rep_level);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- bool EqualRepDefLevelsLengths() const {
- return rep_levels.length() == def_levels.length();
- }
-
- // Incorporates |range| into visited elements. If the |range| is contiguous
- // with the last range, extend the last range, otherwise add |range| separately
-  // to the list.
- void RecordPostListVisit(const ElementRange& range) {
- if (!visited_elements.empty() && range.start == visited_elements.back().end) {
- visited_elements.back().end = range.end;
- return;
- }
- visited_elements.push_back(range);
- }
-
- Status last_status;
- TypedBufferBuilder<int16_t> rep_levels;
- TypedBufferBuilder<int16_t> def_levels;
- std::vector<ElementRange> visited_elements;
-};
-
-IterationResult FillRepLevels(int64_t count, int16_t rep_level,
- PathWriteContext* context) {
- if (rep_level == kLevelNotSet) {
- return kDone;
- }
- int64_t fill_count = count;
-  // This condition (rep and def levels being equal) occurs in one of
-  // a few cases:
- // 1. Before any list is encountered.
- // 2. After rep-level has been filled in due to null/empty
- // values above it.
- // 3. After finishing a list.
- if (!context->EqualRepDefLevelsLengths()) {
- fill_count--;
- }
- return context->AppendRepLevels(fill_count, rep_level);
-}
-
-// A node for handling an array that is discovered to have all
-// null elements. It is referred to as a TerminalNode because
-// traversal of nodes will not continue past it when generating
-// rep/def levels. However, there could be many nested child
-// elements beyond it in the Array that is being processed.
-class AllNullsTerminalNode {
- public:
- explicit AllNullsTerminalNode(int16_t def_level, int16_t rep_level = kLevelNotSet)
- : def_level_(def_level), rep_level_(rep_level) {}
- void SetRepLevelIfNull(int16_t rep_level) { rep_level_ = rep_level; }
- IterationResult Run(const ElementRange& range, PathWriteContext* context) {
- int64_t size = range.Size();
- RETURN_IF_ERROR(FillRepLevels(size, rep_level_, context));
- return context->AppendDefLevels(size, def_level_);
- }
-
- private:
- int16_t def_level_;
- int16_t rep_level_;
-};
-
-// Handles the case where all remaining arrays until the leaf have no nulls
-// (and are not interrupted by lists). Unlike AllNullsTerminalNode this is
-// always the last node in a path. We don't need an analogue to the AllNullsTerminalNode
-// because if all values are present at an intermediate array no node is added for it
-// (the def-level for the next nullable node is incremented).
-struct AllPresentTerminalNode {
- IterationResult Run(const ElementRange& range, PathWriteContext* context) {
- return context->AppendDefLevels(range.end - range.start, def_level);
- // No need to worry about rep levels, because this state should
- // only be applicable for after all list/repeated values
- // have been evaluated in the path.
- }
- int16_t def_level;
-};
-
-/// Node for handling the case when the leaf-array is nullable
-/// and contains null elements.
-struct NullableTerminalNode {
- NullableTerminalNode() = default;
-
- NullableTerminalNode(const uint8_t* bitmap, int64_t element_offset,
- int16_t def_level_if_present)
- : bitmap_(bitmap),
- element_offset_(element_offset),
- def_level_if_present_(def_level_if_present),
- def_level_if_null_(def_level_if_present - 1) {}
-
- IterationResult Run(const ElementRange& range, PathWriteContext* context) {
- int64_t elements = range.Size();
- RETURN_IF_ERROR(context->ReserveDefLevels(elements));
-
- DCHECK_GT(elements, 0);
-
- auto bit_visitor = [&](bool is_set) {
- context->UnsafeAppendDefLevel(is_set ? def_level_if_present_ : def_level_if_null_);
- };
-
- if (elements > 16) { // 16 guarantees at least one unrolled loop.
- ::arrow::internal::VisitBitsUnrolled(bitmap_, range.start + element_offset_,
- elements, bit_visitor);
- } else {
- ::arrow::internal::VisitBits(bitmap_, range.start + element_offset_, elements,
- bit_visitor);
- }
- return kDone;
- }
- const uint8_t* bitmap_;
- int64_t element_offset_;
- int16_t def_level_if_present_;
- int16_t def_level_if_null_;
-};
-
-// List nodes handle populating rep_level for Arrow Lists and def-level for empty lists.
-// Nullability (both list and children) is handled by other Nodes. By
-// construction all list nodes will be intermediate nodes (they will always be followed by
-// at least one other node).
-//
-// Type parameters:
-// |RangeSelector| - A strategy for determining the range of the child node
-// to process. This varies depending on the type of list (int32_t* offsets,
-// int64_t* offsets, or fixed size).
-template <typename RangeSelector>
-class ListPathNode {
- public:
- ListPathNode(RangeSelector selector, int16_t rep_lev, int16_t def_level_if_empty)
- : selector_(std::move(selector)),
- prev_rep_level_(rep_lev - 1),
- rep_level_(rep_lev),
- def_level_if_empty_(def_level_if_empty) {}
-
- int16_t rep_level() const { return rep_level_; }
-
- IterationResult Run(ElementRange* range, ElementRange* child_range,
- PathWriteContext* context) {
- if (range->Empty()) {
- return kDone;
- }
-
- // Find the first non-empty list (skipping a run of empties).
- int64_t start = range->start;
- // Retrieves the range of elements that this list contains.
- // Uses the strategy pattern to distinguish between the different
- // lists that are supported in Arrow (fixed size, normal and "large").
- *child_range = selector_.GetRange(range->start);
- while (child_range->Empty() && !range->Empty()) {
- ++range->start;
- *child_range = selector_.GetRange(range->start);
- }
-    // Loop post-conditions:
-    // * range is either empty (we are done processing at this node)
-    //   or start corresponds to a non-empty list.
- // * If range is non-empty child_range contains
- // the bounds of non-empty list.
-
- // Handle any skipped over empty lists.
- int64_t empty_elements = range->start - start;
- if (empty_elements > 0) {
- RETURN_IF_ERROR(FillRepLevels(empty_elements, prev_rep_level_, context));
- RETURN_IF_ERROR(context->AppendDefLevels(empty_elements, def_level_if_empty_));
- }
- // Start of a new list. Note that for nested lists adding the element
- // here effectively suppresses this code until we either encounter null
- // elements or empty lists between here and the innermost list (since
-    // we make the repetition and definition levels unequal).
- // Similarly when we are backtracking up the stack the repetition and
- // definition levels are again equal so if we encounter an intermediate list
- // with more elements this will detect it as a new list.
- if (context->EqualRepDefLevelsLengths() && !range->Empty()) {
- RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
- }
-
- if (range->Empty()) {
- return kDone;
- }
-
- ++range->start;
- if (is_last_) {
-      // If this is the last repeated node, we can try to extend
-      // the child range as wide as possible before
-      // continuing to the next node.
- return FillForLast(range, child_range, context);
- }
- return kNext;
- }
-
- void SetLast() { is_last_ = true; }
-
- private:
- IterationResult FillForLast(ElementRange* range, ElementRange* child_range,
- PathWriteContext* context) {
-    // First fill in the remainder of the list.
- RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context));
- // Once we've reached this point the following preconditions should hold:
- // 1. There are no more repeated path nodes to deal with.
- // 2. All elements in |range| represent contiguous elements in the
- // child array (Null values would have shortened the range to ensure
- // all remaining list elements are present (though they may be empty lists)).
- // 3. No element of range spans a parent list (intermediate
- // list nodes only handle one list entry at a time).
- //
- // Given these preconditions it should be safe to fill runs on non-empty
- // lists here and expand the range in the child node accordingly.
-
- while (!range->Empty()) {
- ElementRange size_check = selector_.GetRange(range->start);
- if (size_check.Empty()) {
- // The empty range will need to be handled after we pass down the accumulated
- // range because it affects def_level placement and we need to get the children
- // def_levels entered first.
- break;
- }
- // This is the start of a new list. We can be sure it only applies
- // to the previous list (and doesn't jump to the start of any list
- // further up in nesting due to the constraints mentioned at the start
- // of the function).
- RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
- RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_));
- DCHECK_EQ(size_check.start, child_range->end);
- child_range->end = size_check.end;
- ++range->start;
- }
-
- // Do book-keeping to track the elements of the arrays that are actually visited
- // beyond this point. This is necessary to identify "gaps" in values that should
- // not be processed (written out to parquet).
- context->RecordPostListVisit(*child_range);
- return kNext;
- }
-
- RangeSelector selector_;
- int16_t prev_rep_level_;
- int16_t rep_level_;
- int16_t def_level_if_empty_;
- bool is_last_ = false;
-};
-
-template <typename OffsetType>
-struct VarRangeSelector {
- ElementRange GetRange(int64_t index) const {
- return ElementRange{offsets[index], offsets[index + 1]};
- }
-
- // Either int32_t* or int64_t*.
- const OffsetType* offsets;
-};
-
-struct FixedSizedRangeSelector {
- ElementRange GetRange(int64_t index) const {
- int64_t start = index * list_size;
- return ElementRange{start, start + list_size};
- }
- int list_size;
-};
-
-// An intermediate node that handles null values.
-class NullableNode {
- public:
- NullableNode(const uint8_t* null_bitmap, int64_t entry_offset,
- int16_t def_level_if_null, int16_t rep_level_if_null = kLevelNotSet)
- : null_bitmap_(null_bitmap),
- entry_offset_(entry_offset),
- valid_bits_reader_(MakeReader(ElementRange{0, 0})),
- def_level_if_null_(def_level_if_null),
- rep_level_if_null_(rep_level_if_null),
- new_range_(true) {}
-
- void SetRepLevelIfNull(int16_t rep_level) { rep_level_if_null_ = rep_level; }
-
- ::arrow::internal::BitRunReader MakeReader(const ElementRange& range) {
- return ::arrow::internal::BitRunReader(null_bitmap_, entry_offset_ + range.start,
- range.Size());
- }
-
- IterationResult Run(ElementRange* range, ElementRange* child_range,
- PathWriteContext* context) {
- if (new_range_) {
- // Reset the reader each time we are starting fresh on a range.
- // We can't rely on continuity because nulls above can
- // cause discontinuities.
- valid_bits_reader_ = MakeReader(*range);
- }
- child_range->start = range->start;
- ::arrow::internal::BitRun run = valid_bits_reader_.NextRun();
- if (!run.set) {
- range->start += run.length;
- RETURN_IF_ERROR(FillRepLevels(run.length, rep_level_if_null_, context));
- RETURN_IF_ERROR(context->AppendDefLevels(run.length, def_level_if_null_));
- run = valid_bits_reader_.NextRun();
- }
- if (range->Empty()) {
- new_range_ = true;
- return kDone;
- }
- child_range->end = child_range->start = range->start;
- child_range->end += run.length;
-
- DCHECK(!child_range->Empty());
- range->start += child_range->Size();
- new_range_ = false;
- return kNext;
- }
-
- const uint8_t* null_bitmap_;
- int64_t entry_offset_;
- ::arrow::internal::BitRunReader valid_bits_reader_;
- int16_t def_level_if_null_;
- int16_t rep_level_if_null_;
-
- // Whether the next invocation will be a new range.
- bool new_range_ = true;
-};
-
-using ListNode = ListPathNode<VarRangeSelector<int32_t>>;
-using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>;
-using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>;
-
-// Contains static information derived from traversing the schema.
-struct PathInfo {
-  // The vectors below are expected to be the same length.
-
- // Note index order matters here.
- using Node = ::arrow::util::Variant<NullableTerminalNode, ListNode, LargeListNode,
- FixedSizeListNode, NullableNode,
- AllPresentTerminalNode, AllNullsTerminalNode>;
-
- std::vector<Node> path;
- std::shared_ptr<Array> primitive_array;
- int16_t max_def_level = 0;
- int16_t max_rep_level = 0;
- bool has_dictionary = false;
- bool leaf_is_nullable = false;
-};
-
-/// Contains logic for writing a single leaf node to parquet.
-/// This tracks the path from root to leaf.
-///
-/// |writer| will be called with the calculated values after all of the
-/// definition/repetition levels have been computed for root_range.
-/// It is intended to abstract the complexity of writing
-/// the levels and values to parquet.
-Status WritePath(ElementRange root_range, PathInfo* path_info,
- ArrowWriteContext* arrow_context,
- MultipathLevelBuilder::CallbackFunction writer) {
- std::vector<ElementRange> stack(path_info->path.size());
- MultipathLevelBuilderResult builder_result;
- builder_result.leaf_array = path_info->primitive_array;
- builder_result.leaf_is_nullable = path_info->leaf_is_nullable;
-
- if (path_info->max_def_level == 0) {
- // This case only occurs when there are no nullable or repeated
- // columns in the path from the root to leaf.
- int64_t leaf_length = builder_result.leaf_array->length();
- builder_result.def_rep_level_count = leaf_length;
- builder_result.post_list_visited_elements.push_back({0, leaf_length});
- return writer(builder_result);
- }
- stack[0] = root_range;
- RETURN_NOT_OK(
- arrow_context->def_levels_buffer->Resize(/*new_size=*/0, /*shrink_to_fit*/ false));
- PathWriteContext context(arrow_context->memory_pool, arrow_context->def_levels_buffer);
-  // We will need at least this many entries, so reserve the space ahead of time.
- RETURN_NOT_OK(context.def_levels.Reserve(root_range.Size()));
- if (path_info->max_rep_level > 0) {
- RETURN_NOT_OK(context.rep_levels.Reserve(root_range.Size()));
- }
-
- auto stack_base = &stack[0];
- auto stack_position = stack_base;
-  // This is the main loop for calculating rep/def levels. The nodes
- // in the path implement a chain-of-responsibility like pattern
- // where each node can add some number of repetition/definition
- // levels to PathWriteContext and also delegate to the next node
- // in the path to add values. The values are added through each Run(...)
- // call and the choice to delegate to the next node (or return to the
- // previous node) is communicated by the return value of Run(...).
- // The loop terminates after the first node indicates all values in
- // |root_range| are processed.
- while (stack_position >= stack_base) {
- PathInfo::Node& node = path_info->path[stack_position - stack_base];
- struct {
- IterationResult operator()(NullableNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- IterationResult operator()(ListNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- IterationResult operator()(NullableTerminalNode* node) {
- return node->Run(*stack_position, context);
- }
- IterationResult operator()(FixedSizeListNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- IterationResult operator()(AllPresentTerminalNode* node) {
- return node->Run(*stack_position, context);
- }
- IterationResult operator()(AllNullsTerminalNode* node) {
- return node->Run(*stack_position, context);
- }
- IterationResult operator()(LargeListNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- ElementRange* stack_position;
- PathWriteContext* context;
- } visitor = {stack_position, &context};
-
- IterationResult result = ::arrow::util::visit(visitor, &node);
-
- if (ARROW_PREDICT_FALSE(result == kError)) {
- DCHECK(!context.last_status.ok());
- return context.last_status;
- }
- stack_position += static_cast<int>(result);
- }
- RETURN_NOT_OK(context.last_status);
- builder_result.def_rep_level_count = context.def_levels.length();
-
- if (context.rep_levels.length() > 0) {
- // This case only occurs when there was a repeated element that needs to be
- // processed.
- builder_result.rep_levels = context.rep_levels.data();
- std::swap(builder_result.post_list_visited_elements, context.visited_elements);
-    // It is possible when processing lists that all lists were empty. In this
-    // case no elements would have been added to post_list_visited_elements. By
-    // adding an empty element we avoid special casing in downstream consumers.
- if (builder_result.post_list_visited_elements.empty()) {
- builder_result.post_list_visited_elements.push_back({0, 0});
- }
- } else {
- builder_result.post_list_visited_elements.push_back(
- {0, builder_result.leaf_array->length()});
- builder_result.rep_levels = nullptr;
- }
-
- builder_result.def_levels = context.def_levels.data();
- return writer(builder_result);
-}
-
-struct FixupVisitor {
- int max_rep_level = -1;
- int16_t rep_level_if_null = kLevelNotSet;
-
- template <typename T>
- void HandleListNode(T* arg) {
- if (arg->rep_level() == max_rep_level) {
- arg->SetLast();
- // after the last list node we don't need to fill
- // rep levels on null.
- rep_level_if_null = kLevelNotSet;
- } else {
- rep_level_if_null = arg->rep_level();
- }
- }
- void operator()(ListNode* node) { HandleListNode(node); }
- void operator()(LargeListNode* node) { HandleListNode(node); }
- void operator()(FixedSizeListNode* node) { HandleListNode(node); }
-
- // For non-list intermediate nodes.
- template <typename T>
- void HandleIntermediateNode(T* arg) {
- if (rep_level_if_null != kLevelNotSet) {
- arg->SetRepLevelIfNull(rep_level_if_null);
- }
- }
-
- void operator()(NullableNode* arg) { HandleIntermediateNode(arg); }
-
- void operator()(AllNullsTerminalNode* arg) {
- // Even though no processing happens past this point we
- // still need to adjust it if a list occurred after an
- // all null array.
- HandleIntermediateNode(arg);
- }
-
- void operator()(NullableTerminalNode*) {}
- void operator()(AllPresentTerminalNode*) {}
-};
-
-PathInfo Fixup(PathInfo info) {
- // We only need to fixup the path if there were repeated
- // elements on it.
- if (info.max_rep_level == 0) {
- return info;
- }
- FixupVisitor visitor;
- visitor.max_rep_level = info.max_rep_level;
- if (visitor.max_rep_level > 0) {
- visitor.rep_level_if_null = 0;
- }
- for (size_t x = 0; x < info.path.size(); x++) {
- ::arrow::util::visit(visitor, &info.path[x]);
- }
- return info;
-}
-
-class PathBuilder {
- public:
- explicit PathBuilder(bool start_nullable) : nullable_in_parent_(start_nullable) {}
- template <typename T>
- void AddTerminalInfo(const T& array) {
- info_.leaf_is_nullable = nullable_in_parent_;
- if (nullable_in_parent_) {
- info_.max_def_level++;
- }
- // We don't use null_count() because if the null_count isn't known
- // and the array does in fact contain nulls, we will end up
- // traversing the null bitmap twice (once here and once when calculating
- // rep/def levels).
- if (LazyNoNulls(array)) {
- info_.path.emplace_back(AllPresentTerminalNode{info_.max_def_level});
- } else if (LazyNullCount(array) == array.length()) {
- info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
- } else {
- info_.path.emplace_back(NullableTerminalNode(array.null_bitmap_data(),
- array.offset(), info_.max_def_level));
- }
- info_.primitive_array = std::make_shared<T>(array.data());
- paths_.push_back(Fixup(info_));
- }
-
- template <typename T>
- ::arrow::enable_if_t<std::is_base_of<::arrow::FlatArray, T>::value, Status> Visit(
- const T& array) {
- AddTerminalInfo(array);
- return Status::OK();
- }
-
- template <typename T>
- ::arrow::enable_if_t<std::is_same<::arrow::ListArray, T>::value ||
- std::is_same<::arrow::LargeListArray, T>::value,
- Status>
- Visit(const T& array) {
- MaybeAddNullable(array);
- // Increment necessary due to empty lists.
- info_.max_def_level++;
- info_.max_rep_level++;
- // raw_value_offsets() accounts for any slice offset.
- ListPathNode<VarRangeSelector<typename T::offset_type>> node(
- VarRangeSelector<typename T::offset_type>{array.raw_value_offsets()},
- info_.max_rep_level, info_.max_def_level - 1);
- info_.path.emplace_back(std::move(node));
- nullable_in_parent_ = array.list_type()->value_field()->nullable();
- return VisitInline(*array.values());
- }
-
- Status Visit(const ::arrow::DictionaryArray& array) {
-    // We currently only handle DictionaryArray where the dictionary is a
-    // primitive type.
- if (array.dict_type()->value_type()->num_fields() > 0) {
- return Status::NotImplemented(
- "Writing DictionaryArray with nested dictionary "
- "type not yet supported");
- }
- if (array.dictionary()->null_count() > 0) {
- return Status::NotImplemented(
- "Writing DictionaryArray with null encoded in dictionary "
- "type not yet supported");
- }
- AddTerminalInfo(array);
- return Status::OK();
- }
-
- void MaybeAddNullable(const Array& array) {
- if (!nullable_in_parent_) {
- return;
- }
- info_.max_def_level++;
- // We don't use null_count() because if the null_count isn't known
- // and the array does in fact contain nulls, we will end up
- // traversing the null bitmap twice (once here and once when calculating
- // rep/def levels). Because this isn't terminal this might not be
- // the right decision for structs that share the same nullable
- // parents.
- if (LazyNoNulls(array)) {
- // Don't add anything because there won't be any point checking
- // null values for the array. There will always be at least
- // one more array to handle nullability.
- return;
- }
- if (LazyNullCount(array) == array.length()) {
- info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
- return;
- }
- info_.path.emplace_back(
- NullableNode(array.null_bitmap_data(), array.offset(),
- /* def_level_if_null = */ info_.max_def_level - 1));
- }
-
- Status VisitInline(const Array& array);
-
- Status Visit(const ::arrow::MapArray& array) {
- return Visit(static_cast<const ::arrow::ListArray&>(array));
- }
-
- Status Visit(const ::arrow::StructArray& array) {
- MaybeAddNullable(array);
- PathInfo info_backup = info_;
- for (int x = 0; x < array.num_fields(); x++) {
- nullable_in_parent_ = array.type()->field(x)->nullable();
- RETURN_NOT_OK(VisitInline(*array.field(x)));
- info_ = info_backup;
- }
- return Status::OK();
- }
-
- Status Visit(const ::arrow::FixedSizeListArray& array) {
- MaybeAddNullable(array);
- int32_t list_size = array.list_type()->list_size();
-    // Technically we could encode fixed-size lists with a two-level encoding,
-    // but since we always use three-level encoding we increment the def
-    // levels as well.
- info_.max_def_level++;
- info_.max_rep_level++;
- info_.path.emplace_back(FixedSizeListNode(FixedSizedRangeSelector{list_size},
- info_.max_rep_level, info_.max_def_level));
- nullable_in_parent_ = array.list_type()->value_field()->nullable();
- if (array.offset() > 0) {
- return VisitInline(*array.values()->Slice(array.value_offset(0)));
- }
- return VisitInline(*array.values());
- }
-
- Status Visit(const ::arrow::ExtensionArray& array) {
- return VisitInline(*array.storage());
- }
-
-#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
- Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
- return Status::NotImplemented("Level generation for " #ArrowTypePrefix \
- " not supported yet"); \
- }
-
- // Union types aren't supported in Parquet.
- NOT_IMPLEMENTED_VISIT(Union)
-
-#undef NOT_IMPLEMENTED_VISIT
- std::vector<PathInfo>& paths() { return paths_; }
-
- private:
- PathInfo info_;
- std::vector<PathInfo> paths_;
- bool nullable_in_parent_;
-};
-
-Status PathBuilder::VisitInline(const Array& array) {
- return ::arrow::VisitArrayInline(array, this);
-}
-
-#undef RETURN_IF_ERROR
-} // namespace
-
-class MultipathLevelBuilderImpl : public MultipathLevelBuilder {
- public:
- MultipathLevelBuilderImpl(std::shared_ptr<::arrow::ArrayData> data,
- std::unique_ptr<PathBuilder> path_builder)
- : root_range_{0, data->length},
- data_(std::move(data)),
- path_builder_(std::move(path_builder)) {}
-
- int GetLeafCount() const override {
- return static_cast<int>(path_builder_->paths().size());
- }
-
- ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
- CallbackFunction write_leaf_callback) override {
- DCHECK_GE(leaf_index, 0);
- DCHECK_LT(leaf_index, GetLeafCount());
- return WritePath(root_range_, &path_builder_->paths()[leaf_index], context,
- std::move(write_leaf_callback));
- }
-
- private:
- ElementRange root_range_;
- // Reference holder to ensure the data stays valid.
- std::shared_ptr<::arrow::ArrayData> data_;
- std::unique_ptr<PathBuilder> path_builder_;
-};
-
-// static
-::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> MultipathLevelBuilder::Make(
- const ::arrow::Array& array, bool array_field_nullable) {
- auto constructor = ::arrow::internal::make_unique<PathBuilder>(array_field_nullable);
- RETURN_NOT_OK(VisitArrayInline(array, constructor.get()));
- return ::arrow::internal::make_unique<MultipathLevelBuilderImpl>(
- array.data(), std::move(constructor));
-}
-
-// static
-Status MultipathLevelBuilder::Write(const Array& array, bool array_field_nullable,
- ArrowWriteContext* context,
- MultipathLevelBuilder::CallbackFunction callback) {
- ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
- MultipathLevelBuilder::Make(array, array_field_nullable));
- for (int leaf_idx = 0; leaf_idx < builder->GetLeafCount(); leaf_idx++) {
- RETURN_NOT_OK(builder->Write(leaf_idx, context, callback));
- }
- return Status::OK();
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Overview.
+//
+// The strategy this code uses for repetition/definition levels
+// is to dissect the top level array into a list of paths
+// from the top level array to the final primitive (possibly
+// dictionary-encoded) array. It then evaluates each one of
+// those paths to produce results for the callback iteratively.
+//
+// This approach was taken to reduce the aggregate memory required if we were
+// to build all def/rep levels in parallel as part of a tree traversal. It
+// also allows for straightforward parallelization at the path level if that is
+// desired in the future.
+//
+// The main downside to this approach is that it duplicates effort for nodes
+// that share common ancestors. This can be mitigated to some degree
+// by adding optimizations that detect leaf arrays that share
+// the same common list ancestor and reuse the repetition levels
+// from the first leaf encountered (only definition levels greater than
+// the list ancestor's need to be re-evaluated). This is left for future
+// work.
+//
+// Algorithm.
+//
+// As mentioned above, this code dissects arrays into constituent parts:
+// nullability data and list offset data. It tries to optimize for
+// some special cases, where it is known ahead of time that a step
+// can be skipped (e.g. a nullable array happens to have all of its
+// values present) or batch filled (a nullable array has all null values).
+// One further optimization that is not implemented but could be done
+// in the future is special handling for nested list arrays that
+// have some intermediate data which indicates the final array contains only
+// nulls.
+//
+// In general, the algorithm attempts to batch work at each node as much
+// as possible. For nullability nodes this means finding runs of null
+// values and batch filling those interspersed with finding runs of non-null values
+// to process in batch at the next column.
+//
+// Similarly, runs of empty lists are all processed in one batch
+// followed by either:
+// - A single list entry for non-terminal lists (i.e. the upper part of a nested list)
+// - Runs of non-empty lists for the terminal list (i.e. the lowest part of a nested
+// list).
+//
+// This makes use of the following observations.
+// 1. Null values at any node on the path are terminal (repetition and definition
+// level can be set directly when a Null value is encountered).
+// 2. Empty lists share this eager termination property with Null values.
+// 3. In order to keep repetition/definition levels correctly populated the
+//    algorithm is lazy in assigning repetition levels. The algorithm tracks
+//    whether it is currently in the middle of a list by comparing the lengths of
+//    the repetition and definition levels. If it is currently in the middle of a
+//    list the number of repetition levels populated will be greater than the
+//    number of definition levels (the start of a List requires adding the first
+//    element). If there are equal numbers of definition and repetition levels
+//    populated this indicates a list is waiting to be started and the next list
+//    encountered will have its repetition level signify the beginning of the list.
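+//
+// As a concrete (illustrative) example of the level semantics used here,
+// consider a nullable list<nullable int32> column, i.e. max_rep_level = 1 and
+// max_def_level = 3 (0 = null list, 1 = empty list, 2 = null element,
+// 3 = present element). Under standard Parquet level rules the four rows
+// [[1, null], [], null, [4]] produce:
+//   rep_levels: 0 1 0 0 0
+//   def_levels: 3 2 1 0 3
+// The non-empty list contributes two entries (its second element is null);
+// the empty list, the null list and the single-element list each contribute
+// one entry that starts a new row (rep_level 0).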
+//
+// Other implementation notes.
+//
+// This code hasn't been benchmarked (or assembly analyzed) but makes the
+// following optimizations (yes, premature optimization is the root of all evil).
+// - This code does not use recursion, instead it constructs its own stack and manages
+// updating elements accordingly.
+// - It tries to avoid using Status for common return states.
+// - Avoids virtual dispatch in favor of if/else statements on a set of well known
+// classes.
+
+#include "parquet/arrow/path_internal.h"
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/extension_type.h"
+#include "arrow/memory_pool.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_visit.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/variant.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+namespace arrow {
+
+namespace {
+
+using ::arrow::Array;
+using ::arrow::Status;
+using ::arrow::TypedBufferBuilder;
+
+constexpr static int16_t kLevelNotSet = -1;
+
+/// \brief Simple result of iterating over a column to determine values.
+enum IterationResult {
+ /// Processing is done at this node. Move back up the path
+ /// to continue processing.
+ kDone = -1,
+ /// Move down towards the leaf for processing.
+ kNext = 1,
+ /// An error occurred while processing.
+ kError = 2
+};
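+
+// Note: the values of kDone and kNext are chosen so that, barring an error,
+// a Run() result can be added directly to the stack position in WritePath
+// below (-1 moves back up the path, +1 moves down towards the leaf).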
+
+#define RETURN_IF_ERROR(iteration_result) \
+ do { \
+ if (ARROW_PREDICT_FALSE(iteration_result == kError)) { \
+ return iteration_result; \
+ } \
+ } while (false)
+
+int64_t LazyNullCount(const Array& array) { return array.data()->null_count.load(); }
+
+bool LazyNoNulls(const Array& array) {
+ int64_t null_count = LazyNullCount(array);
+ return null_count == 0 ||
+         // kUnknownNullCount comparison is needed to account
+         // for null arrays.
+ (null_count == ::arrow::kUnknownNullCount &&
+ array.null_bitmap_data() == nullptr);
+}
+
+struct PathWriteContext {
+ PathWriteContext(::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::ResizableBuffer> def_levels_buffer)
+ : rep_levels(pool), def_levels(std::move(def_levels_buffer), pool) {}
+ IterationResult ReserveDefLevels(int64_t elements) {
+ last_status = def_levels.Reserve(elements);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendDefLevel(int16_t def_level) {
+ last_status = def_levels.Append(def_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendDefLevels(int64_t count, int16_t def_level) {
+ last_status = def_levels.Append(count, def_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ void UnsafeAppendDefLevel(int16_t def_level) { def_levels.UnsafeAppend(def_level); }
+
+ IterationResult AppendRepLevel(int16_t rep_level) {
+ last_status = rep_levels.Append(rep_level);
+
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendRepLevels(int64_t count, int16_t rep_level) {
+ last_status = rep_levels.Append(count, rep_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ bool EqualRepDefLevelsLengths() const {
+ return rep_levels.length() == def_levels.length();
+ }
+
+  // Incorporates |range| into visited elements. If |range| is contiguous
+  // with the last range, extend the last range; otherwise add |range|
+  // separately to the list.
+ void RecordPostListVisit(const ElementRange& range) {
+ if (!visited_elements.empty() && range.start == visited_elements.back().end) {
+ visited_elements.back().end = range.end;
+ return;
+ }
+ visited_elements.push_back(range);
+ }
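+
+  // For example (illustrative): recording {0, 3} and then {3, 5} results in
+  // a single visited range {0, 5}, while {0, 3} followed by {4, 5} is kept
+  // as two separate ranges.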
+
+ Status last_status;
+ TypedBufferBuilder<int16_t> rep_levels;
+ TypedBufferBuilder<int16_t> def_levels;
+ std::vector<ElementRange> visited_elements;
+};
+
+IterationResult FillRepLevels(int64_t count, int16_t rep_level,
+ PathWriteContext* context) {
+ if (rep_level == kLevelNotSet) {
+ return kDone;
+ }
+ int64_t fill_count = count;
+  // This condition (the rep and def level counts being equal) occurs
+  // in one of a few cases:
+ // 1. Before any list is encountered.
+ // 2. After rep-level has been filled in due to null/empty
+ // values above it.
+ // 3. After finishing a list.
+ if (!context->EqualRepDefLevelsLengths()) {
+ fill_count--;
+ }
+ return context->AppendRepLevels(fill_count, rep_level);
+}
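+
+// Illustrative note: when a list has already emitted its opening rep level,
+// rep_levels is one entry longer than def_levels, so FillRepLevels(3, ...)
+// appends only 2 entries; the caller then appends the 3 matching def levels,
+// restoring equal lengths.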
+
+// A node for handling an array that is discovered to have all
+// null elements. It is referred to as a TerminalNode because
+// traversal of nodes will not continue past it when generating
+// rep/def levels. However, there could be many nested children
+// elements beyond it in the Array that is being processed.
+class AllNullsTerminalNode {
+ public:
+ explicit AllNullsTerminalNode(int16_t def_level, int16_t rep_level = kLevelNotSet)
+ : def_level_(def_level), rep_level_(rep_level) {}
+ void SetRepLevelIfNull(int16_t rep_level) { rep_level_ = rep_level; }
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ int64_t size = range.Size();
+ RETURN_IF_ERROR(FillRepLevels(size, rep_level_, context));
+ return context->AppendDefLevels(size, def_level_);
+ }
+
+ private:
+ int16_t def_level_;
+ int16_t rep_level_;
+};
+
+// Handles the case where all remaining arrays until the leaf have no nulls
+// (and are not interrupted by lists). Unlike AllNullsTerminalNode this is
+// always the last node in a path. We don't need an analogue to the AllNullsTerminalNode
+// because if all values are present at an intermediate array no node is added for it
+// (the def-level for the next nullable node is incremented).
+struct AllPresentTerminalNode {
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ return context->AppendDefLevels(range.end - range.start, def_level);
+ // No need to worry about rep levels, because this state should
+ // only be applicable for after all list/repeated values
+ // have been evaluated in the path.
+ }
+ int16_t def_level;
+};
+
+/// Node for handling the case when the leaf-array is nullable
+/// and contains null elements.
+struct NullableTerminalNode {
+ NullableTerminalNode() = default;
+
+ NullableTerminalNode(const uint8_t* bitmap, int64_t element_offset,
+ int16_t def_level_if_present)
+ : bitmap_(bitmap),
+ element_offset_(element_offset),
+ def_level_if_present_(def_level_if_present),
+ def_level_if_null_(def_level_if_present - 1) {}
+
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ int64_t elements = range.Size();
+ RETURN_IF_ERROR(context->ReserveDefLevels(elements));
+
+ DCHECK_GT(elements, 0);
+
+ auto bit_visitor = [&](bool is_set) {
+ context->UnsafeAppendDefLevel(is_set ? def_level_if_present_ : def_level_if_null_);
+ };
+
+ if (elements > 16) { // 16 guarantees at least one unrolled loop.
+ ::arrow::internal::VisitBitsUnrolled(bitmap_, range.start + element_offset_,
+ elements, bit_visitor);
+ } else {
+ ::arrow::internal::VisitBits(bitmap_, range.start + element_offset_, elements,
+ bit_visitor);
+ }
+ return kDone;
+ }
+ const uint8_t* bitmap_;
+ int64_t element_offset_;
+ int16_t def_level_if_present_;
+ int16_t def_level_if_null_;
+};
+
+// List nodes handle populating rep_level for Arrow Lists and def-level for empty lists.
+// Nullability (both list and children) is handled by other Nodes. By
+// construction all list nodes will be intermediate nodes (they will always be followed by
+// at least one other node).
+//
+// Type parameters:
+// |RangeSelector| - A strategy for determining the range of the child node to
+// process. This varies depending on the type of list (int32_t* offsets,
+// int64_t* offsets or fixed-size).
+template <typename RangeSelector>
+class ListPathNode {
+ public:
+ ListPathNode(RangeSelector selector, int16_t rep_lev, int16_t def_level_if_empty)
+ : selector_(std::move(selector)),
+ prev_rep_level_(rep_lev - 1),
+ rep_level_(rep_lev),
+ def_level_if_empty_(def_level_if_empty) {}
+
+ int16_t rep_level() const { return rep_level_; }
+
+ IterationResult Run(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+ if (range->Empty()) {
+ return kDone;
+ }
+
+ // Find the first non-empty list (skipping a run of empties).
+ int64_t start = range->start;
+ // Retrieves the range of elements that this list contains.
+ // Uses the strategy pattern to distinguish between the different
+ // lists that are supported in Arrow (fixed size, normal and "large").
+ *child_range = selector_.GetRange(range->start);
+ while (child_range->Empty() && !range->Empty()) {
+ ++range->start;
+ *child_range = selector_.GetRange(range->start);
+ }
+    // Loop post-conditions:
+    // * range is either empty (we are done processing at this node)
+    //   or start corresponds to a non-empty list.
+    // * If range is non-empty, child_range contains
+    //   the bounds of a non-empty list.
+
+ // Handle any skipped over empty lists.
+ int64_t empty_elements = range->start - start;
+ if (empty_elements > 0) {
+ RETURN_IF_ERROR(FillRepLevels(empty_elements, prev_rep_level_, context));
+ RETURN_IF_ERROR(context->AppendDefLevels(empty_elements, def_level_if_empty_));
+ }
+    // Start of a new list. Note that for nested lists adding the element
+    // here effectively suppresses this code until we either encounter null
+    // elements or empty lists between here and the innermost list (since
+    // it makes the repetition and definition level counts unequal).
+    // Similarly, when we are backtracking up the stack the repetition and
+    // definition level counts are again equal, so if we encounter an
+    // intermediate list with more elements this will detect it as a new list.
+ if (context->EqualRepDefLevelsLengths() && !range->Empty()) {
+ RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
+ }
+
+ if (range->Empty()) {
+ return kDone;
+ }
+
+ ++range->start;
+ if (is_last_) {
+ // If this is the last repeated node, we can extend try
+ // to extend the child range as wide as possible before
+ // continuing to the next node.
+ return FillForLast(range, child_range, context);
+ }
+ return kNext;
+ }
+
+ void SetLast() { is_last_ = true; }
+
+ private:
+ IterationResult FillForLast(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+    // First fill in the remainder of the list.
+ RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context));
+ // Once we've reached this point the following preconditions should hold:
+ // 1. There are no more repeated path nodes to deal with.
+ // 2. All elements in |range| represent contiguous elements in the
+ // child array (Null values would have shortened the range to ensure
+ // all remaining list elements are present (though they may be empty lists)).
+ // 3. No element of range spans a parent list (intermediate
+ // list nodes only handle one list entry at a time).
+ //
+ // Given these preconditions it should be safe to fill runs on non-empty
+ // lists here and expand the range in the child node accordingly.
+
+ while (!range->Empty()) {
+ ElementRange size_check = selector_.GetRange(range->start);
+ if (size_check.Empty()) {
+ // The empty range will need to be handled after we pass down the accumulated
+ // range because it affects def_level placement and we need to get the children
+ // def_levels entered first.
+ break;
+ }
+ // This is the start of a new list. We can be sure it only applies
+ // to the previous list (and doesn't jump to the start of any list
+ // further up in nesting due to the constraints mentioned at the start
+ // of the function).
+ RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
+ RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_));
+ DCHECK_EQ(size_check.start, child_range->end);
+ child_range->end = size_check.end;
+ ++range->start;
+ }
+
+ // Do book-keeping to track the elements of the arrays that are actually visited
+ // beyond this point. This is necessary to identify "gaps" in values that should
+ // not be processed (written out to parquet).
+ context->RecordPostListVisit(*child_range);
+ return kNext;
+ }
+
+ RangeSelector selector_;
+ int16_t prev_rep_level_;
+ int16_t rep_level_;
+ int16_t def_level_if_empty_;
+ bool is_last_ = false;
+};
+
+template <typename OffsetType>
+struct VarRangeSelector {
+ ElementRange GetRange(int64_t index) const {
+ return ElementRange{offsets[index], offsets[index + 1]};
+ }
+
+ // Either int32_t* or int64_t*.
+ const OffsetType* offsets;
+};
+
+struct FixedSizedRangeSelector {
+ ElementRange GetRange(int64_t index) const {
+ int64_t start = index * list_size;
+ return ElementRange{start, start + list_size};
+ }
+ int list_size;
+};
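+
+// For example (illustrative): with offsets [0, 2, 2, 5], VarRangeSelector
+// yields {0, 2}, {2, 2} (an empty list) and {2, 5} for indices 0, 1 and 2,
+// while FixedSizedRangeSelector with list_size == 3 yields {0, 3}, {3, 6},
+// and so on, never producing an empty range (assuming list_size > 0).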
+
+// An intermediate node that handles null values.
+class NullableNode {
+ public:
+ NullableNode(const uint8_t* null_bitmap, int64_t entry_offset,
+ int16_t def_level_if_null, int16_t rep_level_if_null = kLevelNotSet)
+ : null_bitmap_(null_bitmap),
+ entry_offset_(entry_offset),
+ valid_bits_reader_(MakeReader(ElementRange{0, 0})),
+ def_level_if_null_(def_level_if_null),
+ rep_level_if_null_(rep_level_if_null),
+ new_range_(true) {}
+
+ void SetRepLevelIfNull(int16_t rep_level) { rep_level_if_null_ = rep_level; }
+
+ ::arrow::internal::BitRunReader MakeReader(const ElementRange& range) {
+ return ::arrow::internal::BitRunReader(null_bitmap_, entry_offset_ + range.start,
+ range.Size());
+ }
+
+ IterationResult Run(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+ if (new_range_) {
+ // Reset the reader each time we are starting fresh on a range.
+ // We can't rely on continuity because nulls above can
+ // cause discontinuities.
+ valid_bits_reader_ = MakeReader(*range);
+ }
+ child_range->start = range->start;
+ ::arrow::internal::BitRun run = valid_bits_reader_.NextRun();
+ if (!run.set) {
+ range->start += run.length;
+ RETURN_IF_ERROR(FillRepLevels(run.length, rep_level_if_null_, context));
+ RETURN_IF_ERROR(context->AppendDefLevels(run.length, def_level_if_null_));
+ run = valid_bits_reader_.NextRun();
+ }
+ if (range->Empty()) {
+ new_range_ = true;
+ return kDone;
+ }
+ child_range->end = child_range->start = range->start;
+ child_range->end += run.length;
+
+ DCHECK(!child_range->Empty());
+ range->start += child_range->Size();
+ new_range_ = false;
+ return kNext;
+ }
+
+ const uint8_t* null_bitmap_;
+ int64_t entry_offset_;
+ ::arrow::internal::BitRunReader valid_bits_reader_;
+ int16_t def_level_if_null_;
+ int16_t rep_level_if_null_;
+
+ // Whether the next invocation will be a new range.
+ bool new_range_ = true;
+};
+
+using ListNode = ListPathNode<VarRangeSelector<int32_t>>;
+using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>;
+using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>;
+
+// Contains static information derived from traversing the schema.
+struct PathInfo {
+  // The vectors below are expected to be the same length.
+
+ // Note index order matters here.
+ using Node = ::arrow::util::Variant<NullableTerminalNode, ListNode, LargeListNode,
+ FixedSizeListNode, NullableNode,
+ AllPresentTerminalNode, AllNullsTerminalNode>;
+
+ std::vector<Node> path;
+ std::shared_ptr<Array> primitive_array;
+ int16_t max_def_level = 0;
+ int16_t max_rep_level = 0;
+ bool has_dictionary = false;
+ bool leaf_is_nullable = false;
+};
+
+/// Contains logic for writing a single leaf node to parquet.
+/// This tracks the path from root to leaf.
+///
+/// |writer| will be called with the calculated values after all of the
+/// definition/repetition levels have been computed for root_range.
+/// It is intended to abstract the complexity of writing
+/// the levels and values to parquet.
+Status WritePath(ElementRange root_range, PathInfo* path_info,
+ ArrowWriteContext* arrow_context,
+ MultipathLevelBuilder::CallbackFunction writer) {
+ std::vector<ElementRange> stack(path_info->path.size());
+ MultipathLevelBuilderResult builder_result;
+ builder_result.leaf_array = path_info->primitive_array;
+ builder_result.leaf_is_nullable = path_info->leaf_is_nullable;
+
+ if (path_info->max_def_level == 0) {
+ // This case only occurs when there are no nullable or repeated
+ // columns in the path from the root to leaf.
+ int64_t leaf_length = builder_result.leaf_array->length();
+ builder_result.def_rep_level_count = leaf_length;
+ builder_result.post_list_visited_elements.push_back({0, leaf_length});
+ return writer(builder_result);
+ }
+ stack[0] = root_range;
+ RETURN_NOT_OK(
+ arrow_context->def_levels_buffer->Resize(/*new_size=*/0, /*shrink_to_fit*/ false));
+ PathWriteContext context(arrow_context->memory_pool, arrow_context->def_levels_buffer);
+  // We will need at least this many entries, so reserve the space ahead of time.
+ RETURN_NOT_OK(context.def_levels.Reserve(root_range.Size()));
+ if (path_info->max_rep_level > 0) {
+ RETURN_NOT_OK(context.rep_levels.Reserve(root_range.Size()));
+ }
+
+ auto stack_base = &stack[0];
+ auto stack_position = stack_base;
+  // This is the main loop for calculating rep/def levels. The nodes
+ // in the path implement a chain-of-responsibility like pattern
+ // where each node can add some number of repetition/definition
+ // levels to PathWriteContext and also delegate to the next node
+ // in the path to add values. The values are added through each Run(...)
+ // call and the choice to delegate to the next node (or return to the
+ // previous node) is communicated by the return value of Run(...).
+ // The loop terminates after the first node indicates all values in
+ // |root_range| are processed.
+ while (stack_position >= stack_base) {
+ PathInfo::Node& node = path_info->path[stack_position - stack_base];
+ struct {
+ IterationResult operator()(NullableNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(ListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(NullableTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(FixedSizeListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(AllPresentTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(AllNullsTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(LargeListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ ElementRange* stack_position;
+ PathWriteContext* context;
+ } visitor = {stack_position, &context};
+
+ IterationResult result = ::arrow::util::visit(visitor, &node);
+
+ if (ARROW_PREDICT_FALSE(result == kError)) {
+ DCHECK(!context.last_status.ok());
+ return context.last_status;
+ }
+ stack_position += static_cast<int>(result);
+ }
+ RETURN_NOT_OK(context.last_status);
+ builder_result.def_rep_level_count = context.def_levels.length();
+
+ if (context.rep_levels.length() > 0) {
+ // This case only occurs when there was a repeated element that needs to be
+ // processed.
+ builder_result.rep_levels = context.rep_levels.data();
+ std::swap(builder_result.post_list_visited_elements, context.visited_elements);
+    // It is possible when processing lists that all lists were empty. In this
+    // case no elements would have been added to post_list_visited_elements. By
+    // adding an empty element we avoid special casing in downstream consumers.
+ if (builder_result.post_list_visited_elements.empty()) {
+ builder_result.post_list_visited_elements.push_back({0, 0});
+ }
+ } else {
+ builder_result.post_list_visited_elements.push_back(
+ {0, builder_result.leaf_array->length()});
+ builder_result.rep_levels = nullptr;
+ }
+
+ builder_result.def_levels = context.def_levels.data();
+ return writer(builder_result);
+}
+
+struct FixupVisitor {
+ int max_rep_level = -1;
+ int16_t rep_level_if_null = kLevelNotSet;
+
+ template <typename T>
+ void HandleListNode(T* arg) {
+ if (arg->rep_level() == max_rep_level) {
+ arg->SetLast();
+ // after the last list node we don't need to fill
+ // rep levels on null.
+ rep_level_if_null = kLevelNotSet;
+ } else {
+ rep_level_if_null = arg->rep_level();
+ }
+ }
+ void operator()(ListNode* node) { HandleListNode(node); }
+ void operator()(LargeListNode* node) { HandleListNode(node); }
+ void operator()(FixedSizeListNode* node) { HandleListNode(node); }
+
+ // For non-list intermediate nodes.
+ template <typename T>
+ void HandleIntermediateNode(T* arg) {
+ if (rep_level_if_null != kLevelNotSet) {
+ arg->SetRepLevelIfNull(rep_level_if_null);
+ }
+ }
+
+ void operator()(NullableNode* arg) { HandleIntermediateNode(arg); }
+
+ void operator()(AllNullsTerminalNode* arg) {
+ // Even though no processing happens past this point we
+ // still need to adjust it if a list occurred after an
+ // all null array.
+ HandleIntermediateNode(arg);
+ }
+
+ void operator()(NullableTerminalNode*) {}
+ void operator()(AllPresentTerminalNode*) {}
+};
+
+PathInfo Fixup(PathInfo info) {
+ // We only need to fixup the path if there were repeated
+ // elements on it.
+ if (info.max_rep_level == 0) {
+ return info;
+ }
+ FixupVisitor visitor;
+ visitor.max_rep_level = info.max_rep_level;
+ if (visitor.max_rep_level > 0) {
+ visitor.rep_level_if_null = 0;
+ }
+ for (size_t x = 0; x < info.path.size(); x++) {
+ ::arrow::util::visit(visitor, &info.path[x]);
+ }
+ return info;
+}
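+
+// To summarize the fixup pass above (a reading of the code, not additional
+// behavior): the deepest list node (rep_level == max_rep_level) is marked
+// "last" so it can eagerly fill repetition levels for its children, while
+// each node before it learns which repetition level to emit for nulls (that
+// of the enclosing list, or 0 when no list encloses it).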
+
+class PathBuilder {
+ public:
+ explicit PathBuilder(bool start_nullable) : nullable_in_parent_(start_nullable) {}
+ template <typename T>
+ void AddTerminalInfo(const T& array) {
+ info_.leaf_is_nullable = nullable_in_parent_;
+ if (nullable_in_parent_) {
+ info_.max_def_level++;
+ }
+ // We don't use null_count() because if the null_count isn't known
+ // and the array does in fact contain nulls, we will end up
+ // traversing the null bitmap twice (once here and once when calculating
+ // rep/def levels).
+ if (LazyNoNulls(array)) {
+ info_.path.emplace_back(AllPresentTerminalNode{info_.max_def_level});
+ } else if (LazyNullCount(array) == array.length()) {
+ info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
+ } else {
+ info_.path.emplace_back(NullableTerminalNode(array.null_bitmap_data(),
+ array.offset(), info_.max_def_level));
+ }
+ info_.primitive_array = std::make_shared<T>(array.data());
+ paths_.push_back(Fixup(info_));
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<std::is_base_of<::arrow::FlatArray, T>::value, Status> Visit(
+ const T& array) {
+ AddTerminalInfo(array);
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<std::is_same<::arrow::ListArray, T>::value ||
+ std::is_same<::arrow::LargeListArray, T>::value,
+ Status>
+ Visit(const T& array) {
+ MaybeAddNullable(array);
+ // Increment necessary due to empty lists.
+ info_.max_def_level++;
+ info_.max_rep_level++;
+ // raw_value_offsets() accounts for any slice offset.
+ ListPathNode<VarRangeSelector<typename T::offset_type>> node(
+ VarRangeSelector<typename T::offset_type>{array.raw_value_offsets()},
+ info_.max_rep_level, info_.max_def_level - 1);
+ info_.path.emplace_back(std::move(node));
+ nullable_in_parent_ = array.list_type()->value_field()->nullable();
+ return VisitInline(*array.values());
+ }
+
+ Status Visit(const ::arrow::DictionaryArray& array) {
+    // We currently only handle DictionaryArray where the dictionary is a
+    // primitive type.
+ if (array.dict_type()->value_type()->num_fields() > 0) {
+ return Status::NotImplemented(
+ "Writing DictionaryArray with nested dictionary "
+ "type not yet supported");
+ }
+ if (array.dictionary()->null_count() > 0) {
+ return Status::NotImplemented(
+ "Writing DictionaryArray with null encoded in dictionary "
+ "type not yet supported");
+ }
+ AddTerminalInfo(array);
+ return Status::OK();
+ }
+
+ void MaybeAddNullable(const Array& array) {
+ if (!nullable_in_parent_) {
+ return;
+ }
+ info_.max_def_level++;
+ // We don't use null_count() because if the null_count isn't known
+ // and the array does in fact contain nulls, we will end up
+ // traversing the null bitmap twice (once here and once when calculating
+ // rep/def levels). Because this isn't terminal this might not be
+ // the right decision for structs that share the same nullable
+ // parents.
+ if (LazyNoNulls(array)) {
+ // Don't add anything because there won't be any point checking
+ // null values for the array. There will always be at least
+ // one more array to handle nullability.
+ return;
+ }
+ if (LazyNullCount(array) == array.length()) {
+ info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
+ return;
+ }
+ info_.path.emplace_back(
+ NullableNode(array.null_bitmap_data(), array.offset(),
+ /* def_level_if_null = */ info_.max_def_level - 1));
+ }
+
+ Status VisitInline(const Array& array);
+
+ Status Visit(const ::arrow::MapArray& array) {
+ return Visit(static_cast<const ::arrow::ListArray&>(array));
+ }
+
+ Status Visit(const ::arrow::StructArray& array) {
+ MaybeAddNullable(array);
+ PathInfo info_backup = info_;
+ for (int x = 0; x < array.num_fields(); x++) {
+ nullable_in_parent_ = array.type()->field(x)->nullable();
+ RETURN_NOT_OK(VisitInline(*array.field(x)));
+ info_ = info_backup;
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const ::arrow::FixedSizeListArray& array) {
+ MaybeAddNullable(array);
+ int32_t list_size = array.list_type()->list_size();
+    // Technically we could encode fixed-size lists with a two-level encoding,
+    // but since we always use three-level encoding we increment the def
+    // levels as well.
+ info_.max_def_level++;
+ info_.max_rep_level++;
+ info_.path.emplace_back(FixedSizeListNode(FixedSizedRangeSelector{list_size},
+ info_.max_rep_level, info_.max_def_level));
+ nullable_in_parent_ = array.list_type()->value_field()->nullable();
+ if (array.offset() > 0) {
+ return VisitInline(*array.values()->Slice(array.value_offset(0)));
+ }
+ return VisitInline(*array.values());
+ }
+
+ Status Visit(const ::arrow::ExtensionArray& array) {
+ return VisitInline(*array.storage());
+ }
+
+#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
+ Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
+ return Status::NotImplemented("Level generation for " #ArrowTypePrefix \
+ " not supported yet"); \
+ }
+
+ // Union types aren't supported in Parquet.
+ NOT_IMPLEMENTED_VISIT(Union)
+
+#undef NOT_IMPLEMENTED_VISIT
+ std::vector<PathInfo>& paths() { return paths_; }
+
+ private:
+ PathInfo info_;
+ std::vector<PathInfo> paths_;
+ bool nullable_in_parent_;
+};
+
+Status PathBuilder::VisitInline(const Array& array) {
+ return ::arrow::VisitArrayInline(array, this);
+}
+
+#undef RETURN_IF_ERROR
+} // namespace
+
+class MultipathLevelBuilderImpl : public MultipathLevelBuilder {
+ public:
+ MultipathLevelBuilderImpl(std::shared_ptr<::arrow::ArrayData> data,
+ std::unique_ptr<PathBuilder> path_builder)
+ : root_range_{0, data->length},
+ data_(std::move(data)),
+ path_builder_(std::move(path_builder)) {}
+
+ int GetLeafCount() const override {
+ return static_cast<int>(path_builder_->paths().size());
+ }
+
+ ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback) override {
+ DCHECK_GE(leaf_index, 0);
+ DCHECK_LT(leaf_index, GetLeafCount());
+ return WritePath(root_range_, &path_builder_->paths()[leaf_index], context,
+ std::move(write_leaf_callback));
+ }
+
+ private:
+ ElementRange root_range_;
+ // Reference holder to ensure the data stays valid.
+ std::shared_ptr<::arrow::ArrayData> data_;
+ std::unique_ptr<PathBuilder> path_builder_;
+};
+
+// static
+::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> MultipathLevelBuilder::Make(
+ const ::arrow::Array& array, bool array_field_nullable) {
+ auto constructor = ::arrow::internal::make_unique<PathBuilder>(array_field_nullable);
+ RETURN_NOT_OK(VisitArrayInline(array, constructor.get()));
+ return ::arrow::internal::make_unique<MultipathLevelBuilderImpl>(
+ array.data(), std::move(constructor));
+}
+
+// static
+Status MultipathLevelBuilder::Write(const Array& array, bool array_field_nullable,
+ ArrowWriteContext* context,
+ MultipathLevelBuilder::CallbackFunction callback) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
+ MultipathLevelBuilder::Make(array, array_field_nullable));
+ for (int leaf_idx = 0; leaf_idx < builder->GetLeafCount(); leaf_idx++) {
+ RETURN_NOT_OK(builder->Write(leaf_idx, context, callback));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
index c5b7fdfdac3..e5af186dc4f 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
@@ -1,155 +1,155 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <functional>
-#include <memory>
-#include <vector>
-
-#include "arrow/result.h"
-#include "arrow/status.h"
-
-#include "parquet/platform.h"
-
-namespace arrow {
-
-class Array;
-
-} // namespace arrow
-
-namespace parquet {
-
-struct ArrowWriteContext;
-
-namespace arrow {
-
-// This file contains internal implementation details and should not be considered
-// part of the public API.
-
-// The MultipathLevelBuilder is intended to fully support all Arrow nested types that
-// map to parquet types (i.e. everything but unions).
-//
-
-/// \brief Half open range of elements in an array.
-struct ElementRange {
-  /// Lower bound of range (inclusive)
- int64_t start;
- /// Upper bound of range (exclusive)
- int64_t end;
-
- bool Empty() const { return start == end; }
-
- int64_t Size() const { return end - start; }
-};
-
-/// \brief Result for a single leaf array when running the builder on
-/// its root.
-struct MultipathLevelBuilderResult {
-  /// \brief The Array containing only the values to write (after all nesting has
-  /// been processed).
- ///
- /// No additional processing is done on this array (it is copied as is when
- /// visited via a DFS).
- std::shared_ptr<::arrow::Array> leaf_array;
-
- /// \brief Might be null.
- const int16_t* def_levels = nullptr;
-
- /// \brief Might be null.
- const int16_t* rep_levels = nullptr;
-
- /// \brief Number of items (int16_t) contained in def/rep_levels when present.
- int64_t def_rep_level_count = 0;
-
-  /// \brief Contains the element ranges that must be visited on the
-  /// descendants of the final list ancestor for any leaf node.
- ///
- /// The algorithm will attempt to consolidate visited ranges into
- /// the smallest number possible.
- ///
- /// This data is necessary to pass along because after producing
- /// def-rep levels for each leaf array it is impossible to determine
- /// which values have to be sent to parquet when a null list value
- /// in a nullable ListArray is non-empty.
- ///
-  /// This allows the parquet writer to determine which values ultimately
-  /// need to be written.
- std::vector<ElementRange> post_list_visited_elements;
-
- /// Whether the leaf array is nullable.
- bool leaf_is_nullable;
-};
-
-/// \brief Logic for being able to write out nesting (rep/def level) data that is
-/// needed for writing to parquet.
-class PARQUET_EXPORT MultipathLevelBuilder {
- public:
- /// \brief A callback function that will receive results from the call to
- /// Write(...) below. The MultipathLevelBuilderResult passed in will
-  /// only remain valid for the function call (i.e. storing it and relying
-  /// on its data to be consistent afterwards will result in undefined
-  /// behavior).
- using CallbackFunction =
- std::function<::arrow::Status(const MultipathLevelBuilderResult&)>;
-
- /// \brief Determine rep/def level information for the array.
- ///
- /// The callback will be invoked for each leaf Array that is a
- /// descendant of array. Each leaf array is processed in a depth
- /// first traversal-order.
- ///
- /// \param[in] array The array to process.
- /// \param[in] array_field_nullable Whether the algorithm should consider
-  /// the array column as nullable (as determined by its type's parent
- /// field).
- /// \param[in, out] context for use when allocating memory, etc.
- /// \param[out] write_leaf_callback Callback to receive results.
- /// There will be one call to the write_leaf_callback for each leaf node.
- static ::arrow::Status Write(const ::arrow::Array& array, bool array_field_nullable,
- ArrowWriteContext* context,
- CallbackFunction write_leaf_callback);
-
- /// \brief Construct a new instance of the builder.
- ///
- /// \param[in] array The array to process.
- /// \param[in] array_field_nullable Whether the algorithm should consider
-  /// the array column as nullable (as determined by its type's parent
- /// field).
- static ::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> Make(
- const ::arrow::Array& array, bool array_field_nullable);
-
- virtual ~MultipathLevelBuilder() = default;
-
- /// \brief Returns the number of leaf columns that need to be written
- /// to Parquet.
- virtual int GetLeafCount() const = 0;
-
- /// \brief Calls write_leaf_callback with the MultipathLevelBuilderResult corresponding
- /// to |leaf_index|.
- ///
- /// \param[in] leaf_index The index of the leaf column to write. Must be in the range
-  /// [0, GetLeafCount()).
- /// \param[in, out] context for use when allocating memory, etc.
- /// \param[out] write_leaf_callback Callback to receive the result.
- virtual ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
- CallbackFunction write_leaf_callback) = 0;
-};
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "parquet/platform.h"
+
+namespace arrow {
+
+class Array;
+
+} // namespace arrow
+
+namespace parquet {
+
+struct ArrowWriteContext;
+
+namespace arrow {
+
+// This file contains internal implementation details and should not be considered
+// part of the public API.
+
+// The MultipathLevelBuilder is intended to fully support all Arrow nested types that
+// map to Parquet types (i.e. everything except unions).
+//
+
+/// \brief Half open range of elements in an array.
+struct ElementRange {
+ /// Lower bound of range (inclusive)
+ int64_t start;
+ /// Upper bound of range (exclusive)
+ int64_t end;
+
+ bool Empty() const { return start == end; }
+
+ int64_t Size() const { return end - start; }
+};
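
A quick sketch of the half-open convention, assuming only the ElementRange definition above is in scope:

    #include <cassert>

    void ElementRangeExample() {
      ElementRange r{/*start=*/2, /*end=*/5};  // covers indices 2, 3 and 4
      assert(!r.Empty() && r.Size() == 3);
      ElementRange e{/*start=*/7, /*end=*/7};  // start == end: no elements
      assert(e.Empty() && e.Size() == 0);
    }
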
+
+/// \brief Result for a single leaf array when running the builder on
+/// its root.
+struct MultipathLevelBuilderResult {
+ /// \brief The Array containing only the values to write (after all nesting
+ /// has been processed).
+ ///
+ /// No additional processing is done on this array (it is copied as is when
+ /// visited via a DFS).
+ std::shared_ptr<::arrow::Array> leaf_array;
+
+ /// \brief Might be null.
+ const int16_t* def_levels = nullptr;
+
+ /// \brief Might be null.
+ const int16_t* rep_levels = nullptr;
+
+ /// \brief Number of items (int16_t) contained in def/rep_levels when present.
+ int64_t def_rep_level_count = 0;
+
+ /// \brief Contains element ranges of the required visiting on the
+ /// descendants of the final list ancestor for any leaf node.
+ ///
+ /// The algorithm will attempt to consolidate visited ranges into
+ /// the smallest number possible.
+ ///
+ /// This data is necessary to pass along because after producing
+ /// def-rep levels for each leaf array it is impossible to determine
+ /// which values have to be sent to Parquet when a null list value
+ /// in a nullable ListArray is non-empty.
+ ///
+ /// This allows the Parquet writer to determine which values ultimately
+ /// need to be written.
+ std::vector<ElementRange> post_list_visited_elements;
+
+ /// Whether the leaf array is nullable.
+ bool leaf_is_nullable;
+};
+
+/// \brief Logic for writing out the nesting (rep/def level) data needed
+/// when writing to Parquet.
+class PARQUET_EXPORT MultipathLevelBuilder {
+ public:
+ /// \brief A callback function that will receive results from the call to
+ /// Write(...) below. The MultipathLevelBuilderResult passed in will
+ /// only remain valid for the duration of the call (i.e. storing it and
+ /// relying on its data staying consistent afterwards results in undefined
+ /// behavior).
+ using CallbackFunction =
+ std::function<::arrow::Status(const MultipathLevelBuilderResult&)>;
+
+ /// \brief Determine rep/def level information for the array.
+ ///
+ /// The callback will be invoked for each leaf Array that is a
+ /// descendant of array. Each leaf array is processed in
+ /// depth-first traversal order.
+ ///
+ /// \param[in] array The array to process.
+ /// \param[in] array_field_nullable Whether the algorithm should consider
+ /// the array column as nullable (as determined by its type's parent
+ /// field).
+ /// \param[in, out] context Context for use when allocating memory, etc.
+ /// \param[out] write_leaf_callback Callback to receive results.
+ /// There will be one call to the write_leaf_callback for each leaf node.
+ static ::arrow::Status Write(const ::arrow::Array& array, bool array_field_nullable,
+ ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback);
+
+ /// \brief Construct a new instance of the builder.
+ ///
+ /// \param[in] array The array to process.
+ /// \param[in] array_field_nullable Whether the algorithm should consider
+ /// the array column as nullable (as determined by its type's parent
+ /// field).
+ static ::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> Make(
+ const ::arrow::Array& array, bool array_field_nullable);
+
+ virtual ~MultipathLevelBuilder() = default;
+
+ /// \brief Returns the number of leaf columns that need to be written
+ /// to Parquet.
+ virtual int GetLeafCount() const = 0;
+
+ /// \brief Calls write_leaf_callback with the MultipathLevelBuilderResult corresponding
+ /// to |leaf_index|.
+ ///
+ /// \param[in] leaf_index The index of the leaf column to write. Must be in the range
+ /// [0, GetLeafCount()).
+ /// \param[in, out] context Context for use when allocating memory, etc.
+ /// \param[out] write_leaf_callback Callback to receive the result.
+ virtual ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback) = 0;
+};
+
+} // namespace arrow
+} // namespace parquet
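
A minimal sketch of how a caller might drive the builder, assuming this header is included and an ArrowWriteContext* is available from the writer; the wrapper name WriteAllLeaves is illustrative, not part of the API:

    ::arrow::Status WriteAllLeaves(const ::arrow::Array& array, bool nullable,
                                   parquet::ArrowWriteContext* ctx) {
      return parquet::arrow::MultipathLevelBuilder::Write(
          array, nullable, ctx,
          [](const parquet::arrow::MultipathLevelBuilderResult& result) {
            // def_levels/rep_levels are only valid inside this callback;
            // copy anything that must outlive it.
            // post_list_visited_elements narrows which leaf values to emit.
            return ::arrow::Status::OK();
          });
    }
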
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
index 4f5f79c964a..7f284abdee0 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
@@ -1,1248 +1,1248 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/reader.h"
-
-#include <algorithm>
-#include <cstring>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/buffer.h"
-#include "arrow/extension_type.h"
-#include "arrow/io/memory.h"
-#include "arrow/record_batch.h"
-#include "arrow/table.h"
-#include "arrow/type.h"
-#include "arrow/util/async_generator.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/future.h"
-#include "arrow/util/iterator.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/util/parallel.h"
-#include "arrow/util/range.h"
-#include "parquet/arrow/reader_internal.h"
-#include "parquet/column_reader.h"
-#include "parquet/exception.h"
-#include "parquet/file_reader.h"
-#include "parquet/metadata.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-
-using arrow::Array;
-using arrow::ArrayData;
-using arrow::BooleanArray;
-using arrow::ChunkedArray;
-using arrow::DataType;
-using arrow::ExtensionType;
-using arrow::Field;
-using arrow::Future;
-using arrow::Int32Array;
-using arrow::ListArray;
-using arrow::MemoryPool;
-using arrow::RecordBatchReader;
-using arrow::ResizableBuffer;
-using arrow::Status;
-using arrow::StructArray;
-using arrow::Table;
-using arrow::TimestampArray;
-
-using arrow::internal::checked_cast;
-using arrow::internal::Iota;
-
-// Help reduce verbosity
-using ParquetReader = parquet::ParquetFileReader;
-
-using parquet::internal::RecordReader;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-namespace arrow {
-namespace {
-
-::arrow::Result<std::shared_ptr<ArrayData>> ChunksToSingle(const ChunkedArray& chunked) {
- switch (chunked.num_chunks()) {
- case 0: {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> array,
- ::arrow::MakeArrayOfNull(chunked.type(), 0));
- return array->data();
- }
- case 1:
- return chunked.chunk(0)->data();
- default:
- // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
- // this is not yet implemented
- return Status::NotImplemented(
- "Nested data conversions not implemented for chunked array outputs");
- }
-}
-
-} // namespace
-
-class ColumnReaderImpl : public ColumnReader {
- public:
- virtual Status GetDefLevels(const int16_t** data, int64_t* length) = 0;
- virtual Status GetRepLevels(const int16_t** data, int64_t* length) = 0;
- virtual const std::shared_ptr<Field> field() = 0;
-
- ::arrow::Status NextBatch(int64_t batch_size,
- std::shared_ptr<::arrow::ChunkedArray>* out) final {
- RETURN_NOT_OK(LoadBatch(batch_size));
- RETURN_NOT_OK(BuildArray(batch_size, out));
- for (int x = 0; x < (*out)->num_chunks(); x++) {
- RETURN_NOT_OK((*out)->chunk(x)->Validate());
- }
- return Status::OK();
- }
-
- virtual ::arrow::Status LoadBatch(int64_t num_records) = 0;
-
- virtual ::arrow::Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
- virtual bool IsOrHasRepeatedChild() const = 0;
-};
-
-namespace {
-
-std::shared_ptr<std::unordered_set<int>> VectorToSharedSet(
- const std::vector<int>& values) {
- std::shared_ptr<std::unordered_set<int>> result(new std::unordered_set<int>());
- result->insert(values.begin(), values.end());
- return result;
-}
-
-// Forward declaration
-Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& context,
- std::unique_ptr<ColumnReaderImpl>* out);
-
-// ----------------------------------------------------------------------
-// FileReaderImpl forward declaration
-
-class FileReaderImpl : public FileReader {
- public:
- FileReaderImpl(MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader,
- ArrowReaderProperties properties)
- : pool_(pool),
- reader_(std::move(reader)),
- reader_properties_(std::move(properties)) {}
-
- Status Init() {
- return SchemaManifest::Make(reader_->metadata()->schema(),
- reader_->metadata()->key_value_metadata(),
- reader_properties_, &manifest_);
- }
-
- FileColumnIteratorFactory SomeRowGroupsFactory(std::vector<int> row_groups) {
- return [row_groups](int i, ParquetFileReader* reader) {
- return new FileColumnIterator(i, reader, row_groups);
- };
- }
-
- FileColumnIteratorFactory AllRowGroupsFactory() {
- return SomeRowGroupsFactory(Iota(reader_->metadata()->num_row_groups()));
- }
-
- Status BoundsCheckColumn(int column) {
- if (column < 0 || column >= this->num_columns()) {
- return Status::Invalid("Column index out of bounds (got ", column,
- ", should be "
- "between 0 and ",
- this->num_columns() - 1, ")");
- }
- return Status::OK();
- }
-
- Status BoundsCheckRowGroup(int row_group) {
- // row group indices check
- if (row_group < 0 || row_group >= num_row_groups()) {
- return Status::Invalid("Some index in row_group_indices is ", row_group,
- ", which is either < 0 or >= num_row_groups(",
- num_row_groups(), ")");
- }
- return Status::OK();
- }
-
- Status BoundsCheck(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices) {
- for (int i : row_groups) {
- RETURN_NOT_OK(BoundsCheckRowGroup(i));
- }
- for (int i : column_indices) {
- RETURN_NOT_OK(BoundsCheckColumn(i));
- }
- return Status::OK();
- }
-
- std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) override;
-
- Status ReadTable(const std::vector<int>& indices,
- std::shared_ptr<Table>* out) override {
- return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), indices, out);
- }
-
- Status GetFieldReader(int i,
- const std::shared_ptr<std::unordered_set<int>>& included_leaves,
- const std::vector<int>& row_groups,
- std::unique_ptr<ColumnReaderImpl>* out) {
- auto ctx = std::make_shared<ReaderContext>();
- ctx->reader = reader_.get();
- ctx->pool = pool_;
- ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
- ctx->filter_leaves = true;
- ctx->included_leaves = included_leaves;
- return GetReader(manifest_.schema_fields[i], ctx, out);
- }
-
- Status GetFieldReaders(const std::vector<int>& column_indices,
- const std::vector<int>& row_groups,
- std::vector<std::shared_ptr<ColumnReaderImpl>>* out,
- std::shared_ptr<::arrow::Schema>* out_schema) {
- // We only need to read schema fields which have columns indicated
- // in the indices vector
- ARROW_ASSIGN_OR_RAISE(std::vector<int> field_indices,
- manifest_.GetFieldIndices(column_indices));
-
- auto included_leaves = VectorToSharedSet(column_indices);
-
- out->resize(field_indices.size());
- ::arrow::FieldVector out_fields(field_indices.size());
- for (size_t i = 0; i < out->size(); ++i) {
- std::unique_ptr<ColumnReaderImpl> reader;
- RETURN_NOT_OK(
- GetFieldReader(field_indices[i], included_leaves, row_groups, &reader));
-
- out_fields[i] = reader->field();
- out->at(i) = std::move(reader);
- }
-
- *out_schema = ::arrow::schema(std::move(out_fields), manifest_.schema_metadata);
- return Status::OK();
- }
-
- Status GetColumn(int i, FileColumnIteratorFactory iterator_factory,
- std::unique_ptr<ColumnReader>* out);
-
- Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) override {
- return GetColumn(i, AllRowGroupsFactory(), out);
- }
-
- Status GetSchema(std::shared_ptr<::arrow::Schema>* out) override {
- return FromParquetSchema(reader_->metadata()->schema(), reader_properties_,
- reader_->metadata()->key_value_metadata(), out);
- }
-
- Status ReadSchemaField(int i, std::shared_ptr<ChunkedArray>* out) override {
- auto included_leaves = VectorToSharedSet(Iota(reader_->metadata()->num_columns()));
- std::vector<int> row_groups = Iota(reader_->metadata()->num_row_groups());
-
- std::unique_ptr<ColumnReaderImpl> reader;
- RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader));
-
- return ReadColumn(i, row_groups, reader.get(), out);
- }
-
- Status ReadColumn(int i, const std::vector<int>& row_groups, ColumnReader* reader,
- std::shared_ptr<ChunkedArray>* out) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- // TODO(wesm): This calculation doesn't make much sense when we have repeated
- // schema nodes
- int64_t records_to_read = 0;
- for (auto row_group : row_groups) {
- // Can throw exception
- records_to_read +=
- reader_->metadata()->RowGroup(row_group)->ColumnChunk(i)->num_values();
- }
- return reader->NextBatch(records_to_read, out);
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- Status ReadColumn(int i, const std::vector<int>& row_groups,
- std::shared_ptr<ChunkedArray>* out) {
- std::unique_ptr<ColumnReader> flat_column_reader;
- RETURN_NOT_OK(GetColumn(i, SomeRowGroupsFactory(row_groups), &flat_column_reader));
- return ReadColumn(i, row_groups, flat_column_reader.get(), out);
- }
-
- Status ReadColumn(int i, std::shared_ptr<ChunkedArray>* out) override {
- return ReadColumn(i, Iota(reader_->metadata()->num_row_groups()), out);
- }
-
- Status ReadTable(std::shared_ptr<Table>* table) override {
- return ReadTable(Iota(reader_->metadata()->num_columns()), table);
- }
-
- Status ReadRowGroups(const std::vector<int>& row_groups,
- const std::vector<int>& indices,
- std::shared_ptr<Table>* table) override;
-
- // Helper method used by ReadRowGroups - read the given row groups/columns, skipping
- // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader
- // alive in async contexts.
- Future<std::shared_ptr<Table>> DecodeRowGroups(
- std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
- const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor);
-
- Status ReadRowGroups(const std::vector<int>& row_groups,
- std::shared_ptr<Table>* table) override {
- return ReadRowGroups(row_groups, Iota(reader_->metadata()->num_columns()), table);
- }
-
- Status ReadRowGroup(int row_group_index, const std::vector<int>& column_indices,
- std::shared_ptr<Table>* out) override {
- return ReadRowGroups({row_group_index}, column_indices, out);
- }
-
- Status ReadRowGroup(int i, std::shared_ptr<Table>* table) override {
- return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table);
- }
-
- Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- const std::vector<int>& column_indices,
- std::unique_ptr<RecordBatchReader>* out) override;
-
- Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- std::unique_ptr<RecordBatchReader>* out) override {
- return GetRecordBatchReader(row_group_indices,
- Iota(reader_->metadata()->num_columns()), out);
- }
-
- ::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
- GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
- const std::vector<int> row_group_indices,
- const std::vector<int> column_indices,
- ::arrow::internal::Executor* cpu_executor) override;
-
- int num_columns() const { return reader_->metadata()->num_columns(); }
-
- ParquetFileReader* parquet_reader() const override { return reader_.get(); }
-
- int num_row_groups() const override { return reader_->metadata()->num_row_groups(); }
-
- void set_use_threads(bool use_threads) override {
- reader_properties_.set_use_threads(use_threads);
- }
-
- void set_batch_size(int64_t batch_size) override {
- reader_properties_.set_batch_size(batch_size);
- }
-
- const ArrowReaderProperties& properties() const override { return reader_properties_; }
-
- const SchemaManifest& manifest() const override { return manifest_; }
-
- Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
- int64_t* num_rows) override {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- *num_rows = ScanFileContents(columns, column_batch_size, reader_.get());
- return Status::OK();
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- MemoryPool* pool_;
- std::unique_ptr<ParquetFileReader> reader_;
- ArrowReaderProperties reader_properties_;
-
- SchemaManifest manifest_;
-};
-
-class RowGroupRecordBatchReader : public ::arrow::RecordBatchReader {
- public:
- RowGroupRecordBatchReader(::arrow::RecordBatchIterator batches,
- std::shared_ptr<::arrow::Schema> schema)
- : batches_(std::move(batches)), schema_(std::move(schema)) {}
-
- ~RowGroupRecordBatchReader() override {}
-
- Status ReadNext(std::shared_ptr<::arrow::RecordBatch>* out) override {
- return batches_.Next().Value(out);
- }
-
- std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
-
- private:
- ::arrow::Iterator<std::shared_ptr<::arrow::RecordBatch>> batches_;
- std::shared_ptr<::arrow::Schema> schema_;
-};
-
-class ColumnChunkReaderImpl : public ColumnChunkReader {
- public:
- ColumnChunkReaderImpl(FileReaderImpl* impl, int row_group_index, int column_index)
- : impl_(impl), column_index_(column_index), row_group_index_(row_group_index) {}
-
- Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) override {
- return impl_->ReadColumn(column_index_, {row_group_index_}, out);
- }
-
- private:
- FileReaderImpl* impl_;
- int column_index_;
- int row_group_index_;
-};
-
-class RowGroupReaderImpl : public RowGroupReader {
- public:
- RowGroupReaderImpl(FileReaderImpl* impl, int row_group_index)
- : impl_(impl), row_group_index_(row_group_index) {}
-
- std::shared_ptr<ColumnChunkReader> Column(int column_index) override {
- return std::shared_ptr<ColumnChunkReader>(
- new ColumnChunkReaderImpl(impl_, row_group_index_, column_index));
- }
-
- Status ReadTable(const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) override {
- return impl_->ReadRowGroup(row_group_index_, column_indices, out);
- }
-
- Status ReadTable(std::shared_ptr<::arrow::Table>* out) override {
- return impl_->ReadRowGroup(row_group_index_, out);
- }
-
- private:
- FileReaderImpl* impl_;
- int row_group_index_;
-};
-
-// ----------------------------------------------------------------------
-// Column reader implementations
-
-// Leaf reader is for primitive arrays and primitive children of nested arrays
-class LeafReader : public ColumnReaderImpl {
- public:
- LeafReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
- std::unique_ptr<FileColumnIterator> input,
- ::parquet::internal::LevelInfo leaf_info)
- : ctx_(std::move(ctx)),
- field_(std::move(field)),
- input_(std::move(input)),
- descr_(input_->descr()) {
- record_reader_ = RecordReader::Make(
- descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY);
- NextRowGroup();
- }
-
- Status GetDefLevels(const int16_t** data, int64_t* length) final {
- *data = record_reader_->def_levels();
- *length = record_reader_->levels_position();
- return Status::OK();
- }
-
- Status GetRepLevels(const int16_t** data, int64_t* length) final {
- *data = record_reader_->rep_levels();
- *length = record_reader_->levels_position();
- return Status::OK();
- }
-
- bool IsOrHasRepeatedChild() const final { return false; }
-
- Status LoadBatch(int64_t records_to_read) final {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- out_ = nullptr;
- record_reader_->Reset();
- // Pre-allocation gives much better performance for flat columns
- record_reader_->Reserve(records_to_read);
- while (records_to_read > 0) {
- if (!record_reader_->HasMoreData()) {
- break;
- }
- int64_t records_read = record_reader_->ReadRecords(records_to_read);
- records_to_read -= records_read;
- if (records_read == 0) {
- NextRowGroup();
- }
- }
- RETURN_NOT_OK(TransferColumnData(record_reader_.get(), field_->type(), descr_,
- ctx_->pool, &out_));
- return Status::OK();
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- ::arrow::Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<::arrow::ChunkedArray>* out) final {
- *out = out_;
- return Status::OK();
- }
-
- const std::shared_ptr<Field> field() override { return field_; }
-
- private:
- std::shared_ptr<ChunkedArray> out_;
- void NextRowGroup() {
- std::unique_ptr<PageReader> page_reader = input_->NextChunk();
- record_reader_->SetPageReader(std::move(page_reader));
- }
-
- std::shared_ptr<ReaderContext> ctx_;
- std::shared_ptr<Field> field_;
- std::unique_ptr<FileColumnIterator> input_;
- const ColumnDescriptor* descr_;
- std::shared_ptr<RecordReader> record_reader_;
-};
-
-// Column reader for extension arrays
-class ExtensionReader : public ColumnReaderImpl {
- public:
- ExtensionReader(std::shared_ptr<Field> field,
- std::unique_ptr<ColumnReaderImpl> storage_reader)
- : field_(std::move(field)), storage_reader_(std::move(storage_reader)) {}
-
- Status GetDefLevels(const int16_t** data, int64_t* length) override {
- return storage_reader_->GetDefLevels(data, length);
- }
-
- Status GetRepLevels(const int16_t** data, int64_t* length) override {
- return storage_reader_->GetRepLevels(data, length);
- }
-
- Status LoadBatch(int64_t number_of_records) final {
- return storage_reader_->LoadBatch(number_of_records);
- }
-
- Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) override {
- std::shared_ptr<ChunkedArray> storage;
- RETURN_NOT_OK(storage_reader_->BuildArray(length_upper_bound, &storage));
- *out = ExtensionType::WrapArray(field_->type(), storage);
- return Status::OK();
- }
-
- bool IsOrHasRepeatedChild() const final {
- return storage_reader_->IsOrHasRepeatedChild();
- }
-
- const std::shared_ptr<Field> field() override { return field_; }
-
- private:
- std::shared_ptr<Field> field_;
- std::unique_ptr<ColumnReaderImpl> storage_reader_;
-};
-
-template <typename IndexType>
-class ListReader : public ColumnReaderImpl {
- public:
- ListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
- ::parquet::internal::LevelInfo level_info,
- std::unique_ptr<ColumnReaderImpl> child_reader)
- : ctx_(std::move(ctx)),
- field_(std::move(field)),
- level_info_(level_info),
- item_reader_(std::move(child_reader)) {}
-
- Status GetDefLevels(const int16_t** data, int64_t* length) override {
- return item_reader_->GetDefLevels(data, length);
- }
-
- Status GetRepLevels(const int16_t** data, int64_t* length) override {
- return item_reader_->GetRepLevels(data, length);
- }
-
- bool IsOrHasRepeatedChild() const final { return true; }
-
- Status LoadBatch(int64_t number_of_records) final {
- return item_reader_->LoadBatch(number_of_records);
- }
-
- virtual ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
- std::shared_ptr<ArrayData> data) {
- if (field_->type()->id() == ::arrow::Type::MAP) {
- // Error out if data is not map-compliant instead of aborting in MakeArray below
- RETURN_NOT_OK(::arrow::MapArray::ValidateChildData(data->child_data));
- }
- std::shared_ptr<Array> result = ::arrow::MakeArray(data);
- return std::make_shared<ChunkedArray>(result);
- }
-
- Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) override {
- const int16_t* def_levels;
- const int16_t* rep_levels;
- int64_t num_levels;
- RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels));
- RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_levels));
-
- std::shared_ptr<ResizableBuffer> validity_buffer;
- ::parquet::internal::ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = length_upper_bound;
- if (field_->nullable()) {
- ARROW_ASSIGN_OR_RAISE(
- validity_buffer,
- AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
- validity_io.valid_bits = validity_buffer->mutable_data();
- }
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<ResizableBuffer> offsets_buffer,
- AllocateResizableBuffer(
- sizeof(IndexType) * std::max(int64_t{1}, length_upper_bound + 1),
- ctx_->pool));
- // Ensure zero initialization in case we have reached a zero-length list (and
- // because the first entry is always zero).
- IndexType* offset_data = reinterpret_cast<IndexType*>(offsets_buffer->mutable_data());
- offset_data[0] = 0;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- ::parquet::internal::DefRepLevelsToList(def_levels, rep_levels, num_levels,
- level_info_, &validity_io, offset_data);
- END_PARQUET_CATCH_EXCEPTIONS
-
- RETURN_NOT_OK(item_reader_->BuildArray(offset_data[validity_io.values_read], out));
-
- // Resize to actual number of elements returned.
- RETURN_NOT_OK(
- offsets_buffer->Resize((validity_io.values_read + 1) * sizeof(IndexType)));
- if (validity_buffer != nullptr) {
- RETURN_NOT_OK(
- validity_buffer->Resize(BitUtil::BytesForBits(validity_io.values_read)));
- validity_buffer->ZeroPadding();
- }
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> item_chunk, ChunksToSingle(**out));
-
- std::vector<std::shared_ptr<Buffer>> buffers{
- validity_io.null_count > 0 ? validity_buffer : nullptr, offsets_buffer};
- auto data = std::make_shared<ArrayData>(
- field_->type(),
- /*length=*/validity_io.values_read, std::move(buffers),
- std::vector<std::shared_ptr<ArrayData>>{item_chunk}, validity_io.null_count);
-
- ARROW_ASSIGN_OR_RAISE(*out, AssembleArray(std::move(data)));
- return Status::OK();
- }
-
- const std::shared_ptr<Field> field() override { return field_; }
-
- private:
- std::shared_ptr<ReaderContext> ctx_;
- std::shared_ptr<Field> field_;
- ::parquet::internal::LevelInfo level_info_;
- std::unique_ptr<ColumnReaderImpl> item_reader_;
-};
-
-class PARQUET_NO_EXPORT FixedSizeListReader : public ListReader<int32_t> {
- public:
- FixedSizeListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
- ::parquet::internal::LevelInfo level_info,
- std::unique_ptr<ColumnReaderImpl> child_reader)
- : ListReader(std::move(ctx), std::move(field), level_info,
- std::move(child_reader)) {}
- ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
- std::shared_ptr<ArrayData> data) final {
- DCHECK_EQ(data->buffers.size(), 2);
- DCHECK_EQ(field()->type()->id(), ::arrow::Type::FIXED_SIZE_LIST);
- const auto& type = checked_cast<::arrow::FixedSizeListType&>(*field()->type());
- const int32_t* offsets = reinterpret_cast<const int32_t*>(data->buffers[1]->data());
- for (int x = 1; x <= data->length; x++) {
- int32_t size = offsets[x] - offsets[x - 1];
- if (size != type.list_size()) {
- return Status::Invalid("Expected all lists to be of size=", type.list_size(),
- " but index ", x, " had size=", size);
- }
- }
- data->buffers.resize(1);
- std::shared_ptr<Array> result = ::arrow::MakeArray(data);
- return std::make_shared<ChunkedArray>(result);
- }
-};
-
-class PARQUET_NO_EXPORT StructReader : public ColumnReaderImpl {
- public:
- explicit StructReader(std::shared_ptr<ReaderContext> ctx,
- std::shared_ptr<Field> filtered_field,
- ::parquet::internal::LevelInfo level_info,
- std::vector<std::unique_ptr<ColumnReaderImpl>> children)
- : ctx_(std::move(ctx)),
- filtered_field_(std::move(filtered_field)),
- level_info_(level_info),
- children_(std::move(children)) {
- // There could be a mix of children: some might be repeated, some might not.
- // If possible, use one that isn't repeated, since it is guaranteed to have
- // the fewest levels needed to reconstruct a nullability bitmap.
- auto result = std::find_if(children_.begin(), children_.end(),
- [](const std::unique_ptr<ColumnReaderImpl>& child) {
- return !child->IsOrHasRepeatedChild();
- });
- if (result != children_.end()) {
- def_rep_level_child_ = result->get();
- has_repeated_child_ = false;
- } else if (!children_.empty()) {
- def_rep_level_child_ = children_.front().get();
- has_repeated_child_ = true;
- }
- }
-
- bool IsOrHasRepeatedChild() const final { return has_repeated_child_; }
-
- Status LoadBatch(int64_t records_to_read) override {
- for (const std::unique_ptr<ColumnReaderImpl>& reader : children_) {
- RETURN_NOT_OK(reader->LoadBatch(records_to_read));
- }
- return Status::OK();
- }
- Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) override;
- Status GetDefLevels(const int16_t** data, int64_t* length) override;
- Status GetRepLevels(const int16_t** data, int64_t* length) override;
- const std::shared_ptr<Field> field() override { return filtered_field_; }
-
- private:
- const std::shared_ptr<ReaderContext> ctx_;
- const std::shared_ptr<Field> filtered_field_;
- const ::parquet::internal::LevelInfo level_info_;
- const std::vector<std::unique_ptr<ColumnReaderImpl>> children_;
- ColumnReaderImpl* def_rep_level_child_ = nullptr;
- bool has_repeated_child_;
-};
-
-Status StructReader::GetDefLevels(const int16_t** data, int64_t* length) {
- *data = nullptr;
- if (children_.size() == 0) {
- *length = 0;
- return Status::Invalid("StructReader had no children");
- }
-
- // This method should only be called when this struct or one of its parents
- // is optional/repeated or it has a repeated child, which means all children
- // must have rep/def levels associated with them.
- RETURN_NOT_OK(def_rep_level_child_->GetDefLevels(data, length));
- return Status::OK();
-}
-
-Status StructReader::GetRepLevels(const int16_t** data, int64_t* length) {
- *data = nullptr;
- if (children_.size() == 0) {
- *length = 0;
- return Status::Invalid("StructReader had no childre");
- }
-
- // This method should only be called when this struct or one of its parents
- // is optional/repeated or it has a repeated child, which means all children
- // must have rep/def levels associated with them.
- RETURN_NOT_OK(def_rep_level_child_->GetRepLevels(data, length));
- return Status::OK();
-}
-
-Status StructReader::BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) {
- std::vector<std::shared_ptr<ArrayData>> children_array_data;
- std::shared_ptr<ResizableBuffer> null_bitmap;
-
- ::parquet::internal::ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = length_upper_bound;
- // This simplifies accounting below.
- validity_io.values_read = length_upper_bound;
-
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- const int16_t* def_levels;
- const int16_t* rep_levels;
- int64_t num_levels;
-
- if (has_repeated_child_) {
- ARROW_ASSIGN_OR_RAISE(
- null_bitmap,
- AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
- validity_io.valid_bits = null_bitmap->mutable_data();
- RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
- RETURN_NOT_OK(GetRepLevels(&rep_levels, &num_levels));
- DefRepLevelsToBitmap(def_levels, rep_levels, num_levels, level_info_, &validity_io);
- } else if (filtered_field_->nullable()) {
- ARROW_ASSIGN_OR_RAISE(
- null_bitmap,
- AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
- validity_io.valid_bits = null_bitmap->mutable_data();
- RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
- DefLevelsToBitmap(def_levels, num_levels, level_info_, &validity_io);
- }
-
- // Ensure all values are initialized.
- if (null_bitmap) {
- RETURN_NOT_OK(null_bitmap->Resize(BitUtil::BytesForBits(validity_io.values_read)));
- null_bitmap->ZeroPadding();
- }
-
- END_PARQUET_CATCH_EXCEPTIONS
- // Gather children arrays and def levels
- for (auto& child : children_) {
- std::shared_ptr<ChunkedArray> field;
- RETURN_NOT_OK(child->BuildArray(validity_io.values_read, &field));
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array_data, ChunksToSingle(*field));
- children_array_data.push_back(std::move(array_data));
- }
-
- if (!filtered_field_->nullable() && !has_repeated_child_) {
- validity_io.values_read = children_array_data.front()->length;
- }
-
- std::vector<std::shared_ptr<Buffer>> buffers{validity_io.null_count > 0 ? null_bitmap
- : nullptr};
- auto data =
- std::make_shared<ArrayData>(filtered_field_->type(),
- /*length=*/validity_io.values_read, std::move(buffers),
- std::move(children_array_data));
- std::shared_ptr<Array> result = ::arrow::MakeArray(data);
-
- *out = std::make_shared<ChunkedArray>(result);
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// File reader implementation
-
-Status GetReader(const SchemaField& field, const std::shared_ptr<Field>& arrow_field,
- const std::shared_ptr<ReaderContext>& ctx,
- std::unique_ptr<ColumnReaderImpl>* out) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
-
- auto type_id = arrow_field->type()->id();
-
- if (type_id == ::arrow::Type::EXTENSION) {
- auto storage_field = arrow_field->WithType(
- checked_cast<const ExtensionType&>(*arrow_field->type()).storage_type());
- RETURN_NOT_OK(GetReader(field, storage_field, ctx, out));
- out->reset(new ExtensionReader(arrow_field, std::move(*out)));
- return Status::OK();
- }
-
- if (field.children.size() == 0) {
- if (!field.is_leaf()) {
- return Status::Invalid("Parquet non-leaf node has no children");
- }
- if (!ctx->IncludesLeaf(field.column_index)) {
- *out = nullptr;
- return Status::OK();
- }
- std::unique_ptr<FileColumnIterator> input(
- ctx->iterator_factory(field.column_index, ctx->reader));
- out->reset(new LeafReader(ctx, arrow_field, std::move(input), field.level_info));
- } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP ||
- type_id == ::arrow::Type::FIXED_SIZE_LIST ||
- type_id == ::arrow::Type::LARGE_LIST) {
- auto list_field = arrow_field;
- auto child = &field.children[0];
- std::unique_ptr<ColumnReaderImpl> child_reader;
- RETURN_NOT_OK(GetReader(*child, ctx, &child_reader));
- if (child_reader == nullptr) {
- *out = nullptr;
- return Status::OK();
- }
- if (type_id == ::arrow::Type::LIST ||
- type_id == ::arrow::Type::MAP) { // Map can be reconstructed as list of structs.
- if (type_id == ::arrow::Type::MAP &&
- child_reader->field()->type()->num_fields() != 2) {
- // This case applies if either key or value is filtered.
- list_field = list_field->WithType(::arrow::list(child_reader->field()));
- }
- out->reset(new ListReader<int32_t>(ctx, list_field, field.level_info,
- std::move(child_reader)));
- } else if (type_id == ::arrow::Type::LARGE_LIST) {
- out->reset(new ListReader<int64_t>(ctx, list_field, field.level_info,
- std::move(child_reader)));
-
- } else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) {
- out->reset(new FixedSizeListReader(ctx, list_field, field.level_info,
- std::move(child_reader)));
- } else {
- return Status::UnknownError("Unknown list type: ", field.field->ToString());
- }
- } else if (type_id == ::arrow::Type::STRUCT) {
- std::vector<std::shared_ptr<Field>> child_fields;
- std::vector<std::unique_ptr<ColumnReaderImpl>> child_readers;
- for (const auto& child : field.children) {
- std::unique_ptr<ColumnReaderImpl> child_reader;
- RETURN_NOT_OK(GetReader(child, ctx, &child_reader));
- if (!child_reader) {
- // If all children were pruned, then we do not try to read this field
- continue;
- }
- child_fields.push_back(child.field);
- child_readers.emplace_back(std::move(child_reader));
- }
- if (child_fields.size() == 0) {
- *out = nullptr;
- return Status::OK();
- }
- auto filtered_field =
- ::arrow::field(arrow_field->name(), ::arrow::struct_(child_fields),
- arrow_field->nullable(), arrow_field->metadata());
- out->reset(new StructReader(ctx, filtered_field, field.level_info,
- std::move(child_readers)));
- } else {
- return Status::Invalid("Unsupported nested type: ", arrow_field->ToString());
- }
- return Status::OK();
-
- END_PARQUET_CATCH_EXCEPTIONS
-}
-
-Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& ctx,
- std::unique_ptr<ColumnReaderImpl>* out) {
- return GetReader(field, field.field, ctx, out);
-}
-
-} // namespace
-
-Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- std::unique_ptr<RecordBatchReader>* out) {
- RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
-
- if (reader_properties_.pre_buffer()) {
- // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- reader_->PreBuffer(row_groups, column_indices, reader_properties_.io_context(),
- reader_properties_.cache_options());
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
- std::shared_ptr<::arrow::Schema> batch_schema;
- RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema));
-
- if (readers.empty()) {
- // Just generate all batches right now; they're cheap since they have no columns.
- int64_t batch_size = properties().batch_size();
- auto max_sized_batch =
- ::arrow::RecordBatch::Make(batch_schema, batch_size, ::arrow::ArrayVector{});
-
- ::arrow::RecordBatchVector batches;
-
- for (int row_group : row_groups) {
- int64_t num_rows = parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
-
- batches.insert(batches.end(), num_rows / batch_size, max_sized_batch);
-
- if (int64_t trailing_rows = num_rows % batch_size) {
- batches.push_back(max_sized_batch->Slice(0, trailing_rows));
- }
- }
-
- *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
- ::arrow::MakeVectorIterator(std::move(batches)), std::move(batch_schema));
-
- return Status::OK();
- }
-
- int64_t num_rows = 0;
- for (int row_group : row_groups) {
- num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
- }
-
- using ::arrow::RecordBatchIterator;
-
- // NB: This lambda will be invoked outside the scope of this call to
- // `GetRecordBatchReader()`, so it must capture `readers` and `batch_schema` by value.
- // `this` is a non-owning pointer so we are relying on the parent FileReader outliving
- // this RecordBatchReader.
- ::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
- [readers, batch_schema, num_rows,
- this]() mutable -> ::arrow::Result<RecordBatchIterator> {
- ::arrow::ChunkedArrayVector columns(readers.size());
-
- // don't reserve more rows than necessary
- int64_t batch_size = std::min(properties().batch_size(), num_rows);
- num_rows -= batch_size;
-
- RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
- reader_properties_.use_threads(), static_cast<int>(readers.size()),
- [&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));
-
- for (const auto& column : columns) {
- if (column == nullptr || column->length() == 0) {
- return ::arrow::IterationTraits<RecordBatchIterator>::End();
- }
- }
-
- auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
- auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);
-
- // NB: explicitly preserve table so that table_reader doesn't outlive it
- return ::arrow::MakeFunctionIterator(
- [table, table_reader] { return table_reader->Next(); });
- });
-
- *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
- ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
-
- return Status::OK();
-}
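
For orientation, a minimal sketch (inside a function returning ::arrow::Status) of consuming the reader built above; `file_reader`, the row-group choice {0, 1}, and the processing step are assumptions:

    std::unique_ptr<::arrow::RecordBatchReader> rb_reader;
    RETURN_NOT_OK(file_reader->GetRecordBatchReader({0, 1}, &rb_reader));
    std::shared_ptr<::arrow::RecordBatch> batch;
    while (true) {
      RETURN_NOT_OK(rb_reader->ReadNext(&batch));
      if (batch == nullptr) break;  // end of stream
      // ... process batch ...
    }
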
-
-/// Given a file reader and a list of row groups, this is a generator of record
-/// batch generators (where each sub-generator is the contents of a single row group).
-class RowGroupGenerator {
- public:
- using RecordBatchGenerator =
- ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>;
-
- explicit RowGroupGenerator(std::shared_ptr<FileReaderImpl> arrow_reader,
- ::arrow::internal::Executor* cpu_executor,
- std::vector<int> row_groups, std::vector<int> column_indices)
- : arrow_reader_(std::move(arrow_reader)),
- cpu_executor_(cpu_executor),
- row_groups_(std::move(row_groups)),
- column_indices_(std::move(column_indices)),
- index_(0) {}
-
- ::arrow::Future<RecordBatchGenerator> operator()() {
- if (index_ >= row_groups_.size()) {
- return ::arrow::AsyncGeneratorEnd<RecordBatchGenerator>();
- }
- int row_group = row_groups_[index_++];
- std::vector<int> column_indices = column_indices_;
- auto reader = arrow_reader_;
- if (!reader->properties().pre_buffer()) {
- return SubmitRead(cpu_executor_, reader, row_group, column_indices);
- }
- auto ready = reader->parquet_reader()->WhenBuffered({row_group}, column_indices);
- if (cpu_executor_) ready = cpu_executor_->TransferAlways(ready);
- return ready.Then([=]() -> ::arrow::Future<RecordBatchGenerator> {
- return ReadOneRowGroup(cpu_executor_, reader, row_group, column_indices);
- });
- }
-
- private:
- // Synchronous fallback for when pre-buffer isn't enabled.
- //
- // Making the Parquet reader truly asynchronous requires heavy refactoring, so the
- // generator piggybacks on ReadRangeCache. The lazy ReadRangeCache can be used for
- // async I/O without forcing readahead.
- static ::arrow::Future<RecordBatchGenerator> SubmitRead(
- ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
- const int row_group, const std::vector<int>& column_indices) {
- if (!cpu_executor) {
- return ReadOneRowGroup(cpu_executor, self, row_group, column_indices);
- }
- // If we have an executor, then force transfer (even if I/O was complete)
- return ::arrow::DeferNotOk(cpu_executor->Submit(ReadOneRowGroup, cpu_executor, self,
- row_group, column_indices));
- }
-
- static ::arrow::Future<RecordBatchGenerator> ReadOneRowGroup(
- ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
- const int row_group, const std::vector<int>& column_indices) {
- // Skips bounds checks/pre-buffering, since we've done that already
- return self->DecodeRowGroups(self, {row_group}, column_indices, cpu_executor)
- .Then([](const std::shared_ptr<Table>& table)
- -> ::arrow::Result<RecordBatchGenerator> {
- ::arrow::TableBatchReader table_reader(*table);
- ::arrow::RecordBatchVector batches;
- RETURN_NOT_OK(table_reader.ReadAll(&batches));
- return ::arrow::MakeVectorGenerator(std::move(batches));
- });
- }
-
- std::shared_ptr<FileReaderImpl> arrow_reader_;
- ::arrow::internal::Executor* cpu_executor_;
- std::vector<int> row_groups_;
- std::vector<int> column_indices_;
- size_t index_;
-};
-
-::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
-FileReaderImpl::GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
- const std::vector<int> row_group_indices,
- const std::vector<int> column_indices,
- ::arrow::internal::Executor* cpu_executor) {
- RETURN_NOT_OK(BoundsCheck(row_group_indices, column_indices));
- if (reader_properties_.pre_buffer()) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- reader_->PreBuffer(row_group_indices, column_indices, reader_properties_.io_context(),
- reader_properties_.cache_options());
- END_PARQUET_CATCH_EXCEPTIONS
- }
- ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> row_group_generator =
- RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
- cpu_executor, row_group_indices, column_indices);
- return ::arrow::MakeConcatenatedGenerator(std::move(row_group_generator));
-}
-
-Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_factory,
- std::unique_ptr<ColumnReader>* out) {
- RETURN_NOT_OK(BoundsCheckColumn(i));
- auto ctx = std::make_shared<ReaderContext>();
- ctx->reader = reader_.get();
- ctx->pool = pool_;
- ctx->iterator_factory = iterator_factory;
- ctx->filter_leaves = false;
- std::unique_ptr<ColumnReaderImpl> result;
- RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result));
- out->reset(result.release());
- return Status::OK();
-}
-
-Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- std::shared_ptr<Table>* out) {
- RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
-
- // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
- if (reader_properties_.pre_buffer()) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- parquet_reader()->PreBuffer(row_groups, column_indices,
- reader_properties_.io_context(),
- reader_properties_.cache_options());
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
- /*cpu_executor=*/nullptr);
- ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult());
- return Status::OK();
-}
-
-Future<std::shared_ptr<Table>> FileReaderImpl::DecodeRowGroups(
- std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
- const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor) {
- // `self` is used solely to keep `this` alive in an async context - but we use this
- // in a sync context too so use `this` over `self`
- std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
- std::shared_ptr<::arrow::Schema> result_schema;
- RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema));
- // OptionalParallelForAsync requires an executor
- if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool();
-
- auto read_column = [row_groups, self, this](size_t i,
- std::shared_ptr<ColumnReaderImpl> reader)
- -> ::arrow::Result<std::shared_ptr<::arrow::ChunkedArray>> {
- std::shared_ptr<::arrow::ChunkedArray> column;
- RETURN_NOT_OK(ReadColumn(static_cast<int>(i), row_groups, reader.get(), &column));
- return column;
- };
- auto make_table = [result_schema, row_groups, self,
- this](const ::arrow::ChunkedArrayVector& columns)
- -> ::arrow::Result<std::shared_ptr<Table>> {
- int64_t num_rows = 0;
- if (!columns.empty()) {
- num_rows = columns[0]->length();
- } else {
- for (int i : row_groups) {
- num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows();
- }
- }
- auto table = Table::Make(std::move(result_schema), columns, num_rows);
- RETURN_NOT_OK(table->Validate());
- return table;
- };
- return ::arrow::internal::OptionalParallelForAsync(reader_properties_.use_threads(),
- std::move(readers), read_column,
- cpu_executor)
- .Then(std::move(make_table));
-}
-
-std::shared_ptr<RowGroupReader> FileReaderImpl::RowGroup(int row_group_index) {
- return std::make_shared<RowGroupReaderImpl>(this, row_group_index);
-}
-
-// ----------------------------------------------------------------------
-// Public factory functions
-
-Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- std::unique_ptr<RecordBatchReader> tmp;
- ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, &tmp));
- out->reset(tmp.release());
- return Status::OK();
-}
-
-Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
- const std::vector<int>& column_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- std::unique_ptr<RecordBatchReader> tmp;
- ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, &tmp));
- out->reset(tmp.release());
- return Status::OK();
-}
-
-Status FileReader::Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- const ArrowReaderProperties& properties,
- std::unique_ptr<FileReader>* out) {
- out->reset(new FileReaderImpl(pool, std::move(reader), properties));
- return static_cast<FileReaderImpl*>(out->get())->Init();
-}
-
-Status FileReader::Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- std::unique_ptr<FileReader>* out) {
- return Make(pool, std::move(reader), default_arrow_reader_properties(), out);
-}
-
-FileReaderBuilder::FileReaderBuilder()
- : pool_(::arrow::default_memory_pool()),
- properties_(default_arrow_reader_properties()) {}
-
-Status FileReaderBuilder::Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
- const ReaderProperties& properties,
- std::shared_ptr<FileMetaData> metadata) {
- PARQUET_CATCH_NOT_OK(raw_reader_ = ParquetReader::Open(std::move(file), properties,
- std::move(metadata)));
- return Status::OK();
-}
-
-FileReaderBuilder* FileReaderBuilder::memory_pool(::arrow::MemoryPool* pool) {
- pool_ = pool;
- return this;
-}
-
-FileReaderBuilder* FileReaderBuilder::properties(
- const ArrowReaderProperties& arg_properties) {
- properties_ = arg_properties;
- return this;
-}
-
-Status FileReaderBuilder::Build(std::unique_ptr<FileReader>* out) {
- return FileReader::Make(pool_, std::move(raw_reader_), properties_, out);
-}
-
-Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool,
- std::unique_ptr<FileReader>* reader) {
- FileReaderBuilder builder;
- RETURN_NOT_OK(builder.Open(std::move(file)));
- return builder.memory_pool(pool)->Build(reader);
-}
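
A hedged end-to-end sketch of the public API above, assumed to run inside a function returning ::arrow::Status with arrow/io/file.h and parquet/arrow/reader.h included; the file name is illustrative:

    ARROW_ASSIGN_OR_RAISE(auto input,
                          ::arrow::io::ReadableFile::Open("example.parquet"));
    std::unique_ptr<parquet::arrow::FileReader> reader;
    RETURN_NOT_OK(parquet::arrow::OpenFile(input, ::arrow::default_memory_pool(),
                                           &reader));
    std::shared_ptr<::arrow::Table> table;
    RETURN_NOT_OK(reader->ReadTable(&table));
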
-
-namespace internal {
-
-Status FuzzReader(std::unique_ptr<FileReader> reader) {
- auto st = Status::OK();
- for (int i = 0; i < reader->num_row_groups(); ++i) {
- std::shared_ptr<Table> table;
- auto row_group_status = reader->ReadRowGroup(i, &table);
- if (row_group_status.ok()) {
- row_group_status &= table->ValidateFull();
- }
- st &= row_group_status;
- }
- return st;
-}
-
-Status FuzzReader(const uint8_t* data, int64_t size) {
- auto buffer = std::make_shared<::arrow::Buffer>(data, size);
- auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
- FileReaderBuilder builder;
- RETURN_NOT_OK(builder.Open(std::move(file)));
-
- std::unique_ptr<FileReader> reader;
- RETURN_NOT_OK(builder.Build(&reader));
- return FuzzReader(std::move(reader));
-}
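
A sketch of a libFuzzer harness over this entry point, assuming the standard LLVMFuzzerTestOneInput hook; the harness itself is not part of this file:

    extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
      // Malformed input is expected here; any error Status is acceptable.
      auto st =
          parquet::arrow::internal::FuzzReader(data, static_cast<int64_t>(size));
      (void)st;
      return 0;
    }
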
-
-} // namespace internal
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/reader.h"
+
+#include <algorithm>
+#include <cstring>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/extension_type.h"
+#include "arrow/io/memory.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/future.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/parallel.h"
+#include "arrow/util/range.h"
+#include "parquet/arrow/reader_internal.h"
+#include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+
+using arrow::Array;
+using arrow::ArrayData;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::ExtensionType;
+using arrow::Field;
+using arrow::Future;
+using arrow::Int32Array;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::RecordBatchReader;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::StructArray;
+using arrow::Table;
+using arrow::TimestampArray;
+
+using arrow::internal::checked_cast;
+using arrow::internal::Iota;
+
+// Help reduce verbosity
+using ParquetReader = parquet::ParquetFileReader;
+
+using parquet::internal::RecordReader;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace arrow {
+namespace {
+
+::arrow::Result<std::shared_ptr<ArrayData>> ChunksToSingle(const ChunkedArray& chunked) {
+ switch (chunked.num_chunks()) {
+ case 0: {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> array,
+ ::arrow::MakeArrayOfNull(chunked.type(), 0));
+ return array->data();
+ }
+ case 1:
+ return chunked.chunk(0)->data();
+ default:
+ // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
+ // this is not yet implemented
+ return Status::NotImplemented(
+ "Nested data conversions not implemented for chunked array outputs");
+ }
+}
+
+} // namespace
+
+class ColumnReaderImpl : public ColumnReader {
+ public:
+ virtual Status GetDefLevels(const int16_t** data, int64_t* length) = 0;
+ virtual Status GetRepLevels(const int16_t** data, int64_t* length) = 0;
+ virtual const std::shared_ptr<Field> field() = 0;
+
+ ::arrow::Status NextBatch(int64_t batch_size,
+ std::shared_ptr<::arrow::ChunkedArray>* out) final {
+ RETURN_NOT_OK(LoadBatch(batch_size));
+ RETURN_NOT_OK(BuildArray(batch_size, out));
+ for (int x = 0; x < (*out)->num_chunks(); x++) {
+ RETURN_NOT_OK((*out)->chunk(x)->Validate());
+ }
+ return Status::OK();
+ }
+
+ virtual ::arrow::Status LoadBatch(int64_t num_records) = 0;
+
+ virtual ::arrow::Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+ virtual bool IsOrHasRepeatedChild() const = 0;
+};
+
+namespace {
+
+std::shared_ptr<std::unordered_set<int>> VectorToSharedSet(
+ const std::vector<int>& values) {
+ std::shared_ptr<std::unordered_set<int>> result(new std::unordered_set<int>());
+ result->insert(values.begin(), values.end());
+ return result;
+}
+
+// Forward declaration
+Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& context,
+ std::unique_ptr<ColumnReaderImpl>* out);
+
+// ----------------------------------------------------------------------
+// FileReaderImpl forward declaration
+
+class FileReaderImpl : public FileReader {
+ public:
+ FileReaderImpl(MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader,
+ ArrowReaderProperties properties)
+ : pool_(pool),
+ reader_(std::move(reader)),
+ reader_properties_(std::move(properties)) {}
+
+ Status Init() {
+ return SchemaManifest::Make(reader_->metadata()->schema(),
+ reader_->metadata()->key_value_metadata(),
+ reader_properties_, &manifest_);
+ }
+
+ FileColumnIteratorFactory SomeRowGroupsFactory(std::vector<int> row_groups) {
+ return [row_groups](int i, ParquetFileReader* reader) {
+ return new FileColumnIterator(i, reader, row_groups);
+ };
+ }
+
+ FileColumnIteratorFactory AllRowGroupsFactory() {
+ return SomeRowGroupsFactory(Iota(reader_->metadata()->num_row_groups()));
+ }
+
+ Status BoundsCheckColumn(int column) {
+ if (column < 0 || column >= this->num_columns()) {
+ return Status::Invalid("Column index out of bounds (got ", column,
+ ", should be "
+ "between 0 and ",
+ this->num_columns() - 1, ")");
+ }
+ return Status::OK();
+ }
+
+ Status BoundsCheckRowGroup(int row_group) {
+ // row group indices check
+ if (row_group < 0 || row_group >= num_row_groups()) {
+ return Status::Invalid("Some index in row_group_indices is ", row_group,
+ ", which is either < 0 or >= num_row_groups(",
+ num_row_groups(), ")");
+ }
+ return Status::OK();
+ }
+
+ Status BoundsCheck(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) {
+ for (int i : row_groups) {
+ RETURN_NOT_OK(BoundsCheckRowGroup(i));
+ }
+ for (int i : column_indices) {
+ RETURN_NOT_OK(BoundsCheckColumn(i));
+ }
+ return Status::OK();
+ }
+
+ std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) override;
+
+ Status ReadTable(const std::vector<int>& indices,
+ std::shared_ptr<Table>* out) override {
+ return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), indices, out);
+ }
+
+ Status GetFieldReader(int i,
+ const std::shared_ptr<std::unordered_set<int>>& included_leaves,
+ const std::vector<int>& row_groups,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ auto ctx = std::make_shared<ReaderContext>();
+ ctx->reader = reader_.get();
+ ctx->pool = pool_;
+ ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
+ ctx->filter_leaves = true;
+ ctx->included_leaves = included_leaves;
+ return GetReader(manifest_.schema_fields[i], ctx, out);
+ }
+
+ Status GetFieldReaders(const std::vector<int>& column_indices,
+ const std::vector<int>& row_groups,
+ std::vector<std::shared_ptr<ColumnReaderImpl>>* out,
+ std::shared_ptr<::arrow::Schema>* out_schema) {
+ // We only need to read schema fields which have columns indicated
+ // in the indices vector
+ ARROW_ASSIGN_OR_RAISE(std::vector<int> field_indices,
+ manifest_.GetFieldIndices(column_indices));
+
+ auto included_leaves = VectorToSharedSet(column_indices);
+
+ out->resize(field_indices.size());
+ ::arrow::FieldVector out_fields(field_indices.size());
+ for (size_t i = 0; i < out->size(); ++i) {
+ std::unique_ptr<ColumnReaderImpl> reader;
+ RETURN_NOT_OK(
+ GetFieldReader(field_indices[i], included_leaves, row_groups, &reader));
+
+ out_fields[i] = reader->field();
+ out->at(i) = std::move(reader);
+ }
+
+ *out_schema = ::arrow::schema(std::move(out_fields), manifest_.schema_metadata);
+ return Status::OK();
+ }
+
+ Status GetColumn(int i, FileColumnIteratorFactory iterator_factory,
+ std::unique_ptr<ColumnReader>* out);
+
+ Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) override {
+ return GetColumn(i, AllRowGroupsFactory(), out);
+ }
+
+ Status GetSchema(std::shared_ptr<::arrow::Schema>* out) override {
+ return FromParquetSchema(reader_->metadata()->schema(), reader_properties_,
+ reader_->metadata()->key_value_metadata(), out);
+ }
+
+ Status ReadSchemaField(int i, std::shared_ptr<ChunkedArray>* out) override {
+ auto included_leaves = VectorToSharedSet(Iota(reader_->metadata()->num_columns()));
+ std::vector<int> row_groups = Iota(reader_->metadata()->num_row_groups());
+
+ std::unique_ptr<ColumnReaderImpl> reader;
+ RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader));
+
+ return ReadColumn(i, row_groups, reader.get(), out);
+ }
+
+ Status ReadColumn(int i, const std::vector<int>& row_groups, ColumnReader* reader,
+ std::shared_ptr<ChunkedArray>* out) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ // TODO(wesm): This calculation doesn't make much sense when we have repeated
+ // schema nodes
+ int64_t records_to_read = 0;
+ for (auto row_group : row_groups) {
+ // Can throw exception
+ records_to_read +=
+ reader_->metadata()->RowGroup(row_group)->ColumnChunk(i)->num_values();
+ }
+ return reader->NextBatch(records_to_read, out);
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ Status ReadColumn(int i, const std::vector<int>& row_groups,
+ std::shared_ptr<ChunkedArray>* out) {
+ std::unique_ptr<ColumnReader> flat_column_reader;
+ RETURN_NOT_OK(GetColumn(i, SomeRowGroupsFactory(row_groups), &flat_column_reader));
+ return ReadColumn(i, row_groups, flat_column_reader.get(), out);
+ }
+
+ Status ReadColumn(int i, std::shared_ptr<ChunkedArray>* out) override {
+ return ReadColumn(i, Iota(reader_->metadata()->num_row_groups()), out);
+ }
+
+ Status ReadTable(std::shared_ptr<Table>* table) override {
+ return ReadTable(Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& indices,
+ std::shared_ptr<Table>* table) override;
+
+ // Helper method used by ReadRowGroups - read the given row groups/columns, skipping
+ // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader
+ // alive in async contexts.
+ Future<std::shared_ptr<Table>> DecodeRowGroups(
+ std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor);
+
+ Status ReadRowGroups(const std::vector<int>& row_groups,
+ std::shared_ptr<Table>* table) override {
+ return ReadRowGroups(row_groups, Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status ReadRowGroup(int row_group_index, const std::vector<int>& column_indices,
+ std::shared_ptr<Table>* out) override {
+ return ReadRowGroups({row_group_index}, column_indices, out);
+ }
+
+ Status ReadRowGroup(int i, std::shared_ptr<Table>* table) override {
+ return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::unique_ptr<RecordBatchReader>* out) override;
+
+ Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::unique_ptr<RecordBatchReader>* out) override {
+ return GetRecordBatchReader(row_group_indices,
+ Iota(reader_->metadata()->num_columns()), out);
+ }
+
+ ::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
+ GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor) override;
+
+ int num_columns() const { return reader_->metadata()->num_columns(); }
+
+ ParquetFileReader* parquet_reader() const override { return reader_.get(); }
+
+ int num_row_groups() const override { return reader_->metadata()->num_row_groups(); }
+
+ void set_use_threads(bool use_threads) override {
+ reader_properties_.set_use_threads(use_threads);
+ }
+
+ void set_batch_size(int64_t batch_size) override {
+ reader_properties_.set_batch_size(batch_size);
+ }
+
+ const ArrowReaderProperties& properties() const override { return reader_properties_; }
+
+ const SchemaManifest& manifest() const override { return manifest_; }
+
+ Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
+ int64_t* num_rows) override {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ *num_rows = ScanFileContents(columns, column_batch_size, reader_.get());
+ return Status::OK();
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ MemoryPool* pool_;
+ std::unique_ptr<ParquetFileReader> reader_;
+ ArrowReaderProperties reader_properties_;
+
+ SchemaManifest manifest_;
+};
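+
+// A minimal sketch of the selective-read entry points implemented above,
+// assuming `reader` is a FileReaderImpl-backed FileReader (the row group and
+// column indices are illustrative only):
+//
+//   std::shared_ptr<Table> table;
+//   // Read row groups 0 and 2, restricted to leaf columns 0 and 1; both
+//   // index vectors are bounds-checked before any I/O is issued.
+//   RETURN_NOT_OK(reader->ReadRowGroups({0, 2}, {0, 1}, &table));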
+
+class RowGroupRecordBatchReader : public ::arrow::RecordBatchReader {
+ public:
+ RowGroupRecordBatchReader(::arrow::RecordBatchIterator batches,
+ std::shared_ptr<::arrow::Schema> schema)
+ : batches_(std::move(batches)), schema_(std::move(schema)) {}
+
+ ~RowGroupRecordBatchReader() override {}
+
+ Status ReadNext(std::shared_ptr<::arrow::RecordBatch>* out) override {
+ return batches_.Next().Value(out);
+ }
+
+ std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
+
+ private:
+ ::arrow::Iterator<std::shared_ptr<::arrow::RecordBatch>> batches_;
+ std::shared_ptr<::arrow::Schema> schema_;
+};
+
+class ColumnChunkReaderImpl : public ColumnChunkReader {
+ public:
+ ColumnChunkReaderImpl(FileReaderImpl* impl, int row_group_index, int column_index)
+ : impl_(impl), column_index_(column_index), row_group_index_(row_group_index) {}
+
+ Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) override {
+ return impl_->ReadColumn(column_index_, {row_group_index_}, out);
+ }
+
+ private:
+ FileReaderImpl* impl_;
+ int column_index_;
+ int row_group_index_;
+};
+
+class RowGroupReaderImpl : public RowGroupReader {
+ public:
+ RowGroupReaderImpl(FileReaderImpl* impl, int row_group_index)
+ : impl_(impl), row_group_index_(row_group_index) {}
+
+ std::shared_ptr<ColumnChunkReader> Column(int column_index) override {
+ return std::shared_ptr<ColumnChunkReader>(
+ new ColumnChunkReaderImpl(impl_, row_group_index_, column_index));
+ }
+
+ Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) override {
+ return impl_->ReadRowGroup(row_group_index_, column_indices, out);
+ }
+
+ Status ReadTable(std::shared_ptr<::arrow::Table>* out) override {
+ return impl_->ReadRowGroup(row_group_index_, out);
+ }
+
+ private:
+ FileReaderImpl* impl_;
+ int row_group_index_;
+};
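+
+// A minimal sketch of the per-row-group, per-column access path provided by
+// the two thin wrappers above (indices are illustrative; the FileReader must
+// outlive the returned readers):
+//
+//   std::shared_ptr<::arrow::ChunkedArray> column;
+//   RETURN_NOT_OK(reader->RowGroup(/*i=*/0)->Column(/*j=*/1)->Read(&column));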
+
+// ----------------------------------------------------------------------
+// Column reader implementations
+
+// Leaf reader is for primitive arrays and primitive children of nested arrays
+class LeafReader : public ColumnReaderImpl {
+ public:
+ LeafReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ std::unique_ptr<FileColumnIterator> input,
+ ::parquet::internal::LevelInfo leaf_info)
+ : ctx_(std::move(ctx)),
+ field_(std::move(field)),
+ input_(std::move(input)),
+ descr_(input_->descr()) {
+ record_reader_ = RecordReader::Make(
+ descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY);
+ NextRowGroup();
+ }
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) final {
+ *data = record_reader_->def_levels();
+ *length = record_reader_->levels_position();
+ return Status::OK();
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) final {
+ *data = record_reader_->rep_levels();
+ *length = record_reader_->levels_position();
+ return Status::OK();
+ }
+
+ bool IsOrHasRepeatedChild() const final { return false; }
+
+ Status LoadBatch(int64_t records_to_read) final {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ out_ = nullptr;
+ record_reader_->Reset();
+ // Pre-allocation gives much better performance for flat columns
+ record_reader_->Reserve(records_to_read);
+ while (records_to_read > 0) {
+ if (!record_reader_->HasMoreData()) {
+ break;
+ }
+ int64_t records_read = record_reader_->ReadRecords(records_to_read);
+ records_to_read -= records_read;
+ if (records_read == 0) {
+ NextRowGroup();
+ }
+ }
+ RETURN_NOT_OK(TransferColumnData(record_reader_.get(), field_->type(), descr_,
+ ctx_->pool, &out_));
+ return Status::OK();
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ ::arrow::Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<::arrow::ChunkedArray>* out) final {
+ *out = out_;
+ return Status::OK();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<ChunkedArray> out_;
+ void NextRowGroup() {
+ std::unique_ptr<PageReader> page_reader = input_->NextChunk();
+ record_reader_->SetPageReader(std::move(page_reader));
+ }
+
+ std::shared_ptr<ReaderContext> ctx_;
+ std::shared_ptr<Field> field_;
+ std::unique_ptr<FileColumnIterator> input_;
+ const ColumnDescriptor* descr_;
+ std::shared_ptr<RecordReader> record_reader_;
+};
+
+// Column reader for extension arrays
+class ExtensionReader : public ColumnReaderImpl {
+ public:
+ ExtensionReader(std::shared_ptr<Field> field,
+ std::unique_ptr<ColumnReaderImpl> storage_reader)
+ : field_(std::move(field)), storage_reader_(std::move(storage_reader)) {}
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) override {
+ return storage_reader_->GetDefLevels(data, length);
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) override {
+ return storage_reader_->GetRepLevels(data, length);
+ }
+
+ Status LoadBatch(int64_t number_of_records) final {
+ return storage_reader_->LoadBatch(number_of_records);
+ }
+
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override {
+ std::shared_ptr<ChunkedArray> storage;
+ RETURN_NOT_OK(storage_reader_->BuildArray(length_upper_bound, &storage));
+ *out = ExtensionType::WrapArray(field_->type(), storage);
+ return Status::OK();
+ }
+
+ bool IsOrHasRepeatedChild() const final {
+ return storage_reader_->IsOrHasRepeatedChild();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<Field> field_;
+ std::unique_ptr<ColumnReaderImpl> storage_reader_;
+};
+
+template <typename IndexType>
+class ListReader : public ColumnReaderImpl {
+ public:
+ ListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ ::parquet::internal::LevelInfo level_info,
+ std::unique_ptr<ColumnReaderImpl> child_reader)
+ : ctx_(std::move(ctx)),
+ field_(std::move(field)),
+ level_info_(level_info),
+ item_reader_(std::move(child_reader)) {}
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) override {
+ return item_reader_->GetDefLevels(data, length);
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) override {
+ return item_reader_->GetRepLevels(data, length);
+ }
+
+ bool IsOrHasRepeatedChild() const final { return true; }
+
+ Status LoadBatch(int64_t number_of_records) final {
+ return item_reader_->LoadBatch(number_of_records);
+ }
+
+ virtual ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+ std::shared_ptr<ArrayData> data) {
+ if (field_->type()->id() == ::arrow::Type::MAP) {
+ // Error out if data is not map-compliant instead of aborting in MakeArray below
+ RETURN_NOT_OK(::arrow::MapArray::ValidateChildData(data->child_data));
+ }
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+ return std::make_shared<ChunkedArray>(result);
+ }
+
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override {
+ const int16_t* def_levels;
+ const int16_t* rep_levels;
+ int64_t num_levels;
+ RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels));
+ RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_levels));
+
+ std::shared_ptr<ResizableBuffer> validity_buffer;
+ ::parquet::internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = length_upper_bound;
+ if (field_->nullable()) {
+ ARROW_ASSIGN_OR_RAISE(
+ validity_buffer,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = validity_buffer->mutable_data();
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<ResizableBuffer> offsets_buffer,
+ AllocateResizableBuffer(
+ sizeof(IndexType) * std::max(int64_t{1}, length_upper_bound + 1),
+ ctx_->pool));
+    // Ensure zero-initialization in case we have reached a zero-length list (and
+    // because the first entry is always zero).
+ IndexType* offset_data = reinterpret_cast<IndexType*>(offsets_buffer->mutable_data());
+ offset_data[0] = 0;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ ::parquet::internal::DefRepLevelsToList(def_levels, rep_levels, num_levels,
+ level_info_, &validity_io, offset_data);
+ END_PARQUET_CATCH_EXCEPTIONS
+
+ RETURN_NOT_OK(item_reader_->BuildArray(offset_data[validity_io.values_read], out));
+
+ // Resize to actual number of elements returned.
+ RETURN_NOT_OK(
+ offsets_buffer->Resize((validity_io.values_read + 1) * sizeof(IndexType)));
+ if (validity_buffer != nullptr) {
+ RETURN_NOT_OK(
+ validity_buffer->Resize(BitUtil::BytesForBits(validity_io.values_read)));
+ validity_buffer->ZeroPadding();
+ }
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> item_chunk, ChunksToSingle(**out));
+
+ std::vector<std::shared_ptr<Buffer>> buffers{
+ validity_io.null_count > 0 ? validity_buffer : nullptr, offsets_buffer};
+ auto data = std::make_shared<ArrayData>(
+ field_->type(),
+ /*length=*/validity_io.values_read, std::move(buffers),
+ std::vector<std::shared_ptr<ArrayData>>{item_chunk}, validity_io.null_count);
+
+ ARROW_ASSIGN_OR_RAISE(*out, AssembleArray(std::move(data)));
+ return Status::OK();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<ReaderContext> ctx_;
+ std::shared_ptr<Field> field_;
+ ::parquet::internal::LevelInfo level_info_;
+ std::unique_ptr<ColumnReaderImpl> item_reader_;
+};
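+
+// A worked example of the level decoding performed in BuildArray() above,
+// assuming an optional list<optional int32> column holding
+// [[1, 2], [], null, [3]] (values are illustrative):
+//
+//   rep_levels: 0 1 0 0 0   (0 starts a new list, 1 continues the current one)
+//   def_levels: 3 3 1 0 3   (3 = present value, 1 = empty list, 0 = null list)
+//
+// DefRepLevelsToList() turns these into offsets [0, 2, 2, 2, 3] and the list
+// validity bitmap 1 1 0 1; the child reader then materializes the 3 leaf
+// values referenced by the final offset.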
+
+class PARQUET_NO_EXPORT FixedSizeListReader : public ListReader<int32_t> {
+ public:
+ FixedSizeListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ ::parquet::internal::LevelInfo level_info,
+ std::unique_ptr<ColumnReaderImpl> child_reader)
+ : ListReader(std::move(ctx), std::move(field), level_info,
+ std::move(child_reader)) {}
+ ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+ std::shared_ptr<ArrayData> data) final {
+ DCHECK_EQ(data->buffers.size(), 2);
+ DCHECK_EQ(field()->type()->id(), ::arrow::Type::FIXED_SIZE_LIST);
+ const auto& type = checked_cast<::arrow::FixedSizeListType&>(*field()->type());
+ const int32_t* offsets = reinterpret_cast<const int32_t*>(data->buffers[1]->data());
+ for (int x = 1; x <= data->length; x++) {
+ int32_t size = offsets[x] - offsets[x - 1];
+ if (size != type.list_size()) {
+ return Status::Invalid("Expected all lists to be of size=", type.list_size(),
+ " but index ", x, " had size=", size);
+ }
+ }
+ data->buffers.resize(1);
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+ return std::make_shared<ChunkedArray>(result);
+ }
+};
+
+class PARQUET_NO_EXPORT StructReader : public ColumnReaderImpl {
+ public:
+ explicit StructReader(std::shared_ptr<ReaderContext> ctx,
+ std::shared_ptr<Field> filtered_field,
+ ::parquet::internal::LevelInfo level_info,
+ std::vector<std::unique_ptr<ColumnReaderImpl>> children)
+ : ctx_(std::move(ctx)),
+ filtered_field_(std::move(filtered_field)),
+ level_info_(level_info),
+ children_(std::move(children)) {
+    // There could be a mix of children: some might be repeated and some might not.
+    // If possible, use one that isn't, since it is guaranteed to have the fewest
+    // levels needed to reconstruct a nullable bitmap.
+ auto result = std::find_if(children_.begin(), children_.end(),
+ [](const std::unique_ptr<ColumnReaderImpl>& child) {
+ return !child->IsOrHasRepeatedChild();
+ });
+ if (result != children_.end()) {
+ def_rep_level_child_ = result->get();
+ has_repeated_child_ = false;
+ } else if (!children_.empty()) {
+ def_rep_level_child_ = children_.front().get();
+ has_repeated_child_ = true;
+ }
+ }
+
+ bool IsOrHasRepeatedChild() const final { return has_repeated_child_; }
+
+ Status LoadBatch(int64_t records_to_read) override {
+ for (const std::unique_ptr<ColumnReaderImpl>& reader : children_) {
+ RETURN_NOT_OK(reader->LoadBatch(records_to_read));
+ }
+ return Status::OK();
+ }
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override;
+ Status GetDefLevels(const int16_t** data, int64_t* length) override;
+ Status GetRepLevels(const int16_t** data, int64_t* length) override;
+ const std::shared_ptr<Field> field() override { return filtered_field_; }
+
+ private:
+ const std::shared_ptr<ReaderContext> ctx_;
+ const std::shared_ptr<Field> filtered_field_;
+ const ::parquet::internal::LevelInfo level_info_;
+ const std::vector<std::unique_ptr<ColumnReaderImpl>> children_;
+ ColumnReaderImpl* def_rep_level_child_ = nullptr;
+ bool has_repeated_child_;
+};
+
+Status StructReader::GetDefLevels(const int16_t** data, int64_t* length) {
+ *data = nullptr;
+ if (children_.size() == 0) {
+ *length = 0;
+ return Status::Invalid("StructReader had no children");
+ }
+
+  // This method should only be called when this struct or one of its parents
+  // is optional/repeated, or when it has a repeated child, meaning all
+  // children must have rep/def levels associated with them.
+ RETURN_NOT_OK(def_rep_level_child_->GetDefLevels(data, length));
+ return Status::OK();
+}
+
+Status StructReader::GetRepLevels(const int16_t** data, int64_t* length) {
+ *data = nullptr;
+ if (children_.size() == 0) {
+ *length = 0;
+ return Status::Invalid("StructReader had no childre");
+ }
+
+  // This method should only be called when this struct or one of its parents
+  // is optional/repeated, or when it has a repeated child, meaning all
+  // children must have rep/def levels associated with them.
+ RETURN_NOT_OK(def_rep_level_child_->GetRepLevels(data, length));
+ return Status::OK();
+}
+
+Status StructReader::BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) {
+ std::vector<std::shared_ptr<ArrayData>> children_array_data;
+ std::shared_ptr<ResizableBuffer> null_bitmap;
+
+ ::parquet::internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = length_upper_bound;
+ // This simplifies accounting below.
+ validity_io.values_read = length_upper_bound;
+
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ const int16_t* def_levels;
+ const int16_t* rep_levels;
+ int64_t num_levels;
+
+ if (has_repeated_child_) {
+ ARROW_ASSIGN_OR_RAISE(
+ null_bitmap,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = null_bitmap->mutable_data();
+ RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
+ RETURN_NOT_OK(GetRepLevels(&rep_levels, &num_levels));
+ DefRepLevelsToBitmap(def_levels, rep_levels, num_levels, level_info_, &validity_io);
+ } else if (filtered_field_->nullable()) {
+ ARROW_ASSIGN_OR_RAISE(
+ null_bitmap,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = null_bitmap->mutable_data();
+ RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
+ DefLevelsToBitmap(def_levels, num_levels, level_info_, &validity_io);
+ }
+
+ // Ensure all values are initialized.
+ if (null_bitmap) {
+ RETURN_NOT_OK(null_bitmap->Resize(BitUtil::BytesForBits(validity_io.values_read)));
+ null_bitmap->ZeroPadding();
+ }
+
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Gather children arrays and def levels
+ for (auto& child : children_) {
+ std::shared_ptr<ChunkedArray> field;
+ RETURN_NOT_OK(child->BuildArray(validity_io.values_read, &field));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array_data, ChunksToSingle(*field));
+ children_array_data.push_back(std::move(array_data));
+ }
+
+ if (!filtered_field_->nullable() && !has_repeated_child_) {
+ validity_io.values_read = children_array_data.front()->length;
+ }
+
+ std::vector<std::shared_ptr<Buffer>> buffers{validity_io.null_count > 0 ? null_bitmap
+ : nullptr};
+ auto data =
+ std::make_shared<ArrayData>(filtered_field_->type(),
+ /*length=*/validity_io.values_read, std::move(buffers),
+ std::move(children_array_data));
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+
+ *out = std::make_shared<ChunkedArray>(result);
+ return Status::OK();
+}
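+
+// A worked example of the struct validity reconstruction above, assuming an
+// optional struct<a: optional int32> column holding
+// [{a: 1}, {a: null}, null] (values are illustrative):
+//
+//   def_levels: 2 1 0   (2 = a present, 1 = a null, 0 = struct null)
+//
+// DefLevelsToBitmap() yields the struct validity bitmap 1 1 0, while the
+// LeafReader for `a` independently reconstructs its own bitmap 1 0 0 from the
+// same levels.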
+
+// ----------------------------------------------------------------------
+// File reader implementation
+
+Status GetReader(const SchemaField& field, const std::shared_ptr<Field>& arrow_field,
+ const std::shared_ptr<ReaderContext>& ctx,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+
+ auto type_id = arrow_field->type()->id();
+
+ if (type_id == ::arrow::Type::EXTENSION) {
+ auto storage_field = arrow_field->WithType(
+ checked_cast<const ExtensionType&>(*arrow_field->type()).storage_type());
+ RETURN_NOT_OK(GetReader(field, storage_field, ctx, out));
+ out->reset(new ExtensionReader(arrow_field, std::move(*out)));
+ return Status::OK();
+ }
+
+ if (field.children.size() == 0) {
+ if (!field.is_leaf()) {
+ return Status::Invalid("Parquet non-leaf node has no children");
+ }
+ if (!ctx->IncludesLeaf(field.column_index)) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ std::unique_ptr<FileColumnIterator> input(
+ ctx->iterator_factory(field.column_index, ctx->reader));
+ out->reset(new LeafReader(ctx, arrow_field, std::move(input), field.level_info));
+ } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP ||
+ type_id == ::arrow::Type::FIXED_SIZE_LIST ||
+ type_id == ::arrow::Type::LARGE_LIST) {
+ auto list_field = arrow_field;
+ auto child = &field.children[0];
+ std::unique_ptr<ColumnReaderImpl> child_reader;
+ RETURN_NOT_OK(GetReader(*child, ctx, &child_reader));
+ if (child_reader == nullptr) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ if (type_id == ::arrow::Type::LIST ||
+ type_id == ::arrow::Type::MAP) { // Map can be reconstructed as list of structs.
+ if (type_id == ::arrow::Type::MAP &&
+ child_reader->field()->type()->num_fields() != 2) {
+ // This case applies if either key or value is filtered.
+ list_field = list_field->WithType(::arrow::list(child_reader->field()));
+ }
+ out->reset(new ListReader<int32_t>(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+ } else if (type_id == ::arrow::Type::LARGE_LIST) {
+ out->reset(new ListReader<int64_t>(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+
+ } else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) {
+ out->reset(new FixedSizeListReader(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+ } else {
+ return Status::UnknownError("Unknown list type: ", field.field->ToString());
+ }
+ } else if (type_id == ::arrow::Type::STRUCT) {
+ std::vector<std::shared_ptr<Field>> child_fields;
+ std::vector<std::unique_ptr<ColumnReaderImpl>> child_readers;
+ for (const auto& child : field.children) {
+ std::unique_ptr<ColumnReaderImpl> child_reader;
+ RETURN_NOT_OK(GetReader(child, ctx, &child_reader));
+ if (!child_reader) {
+ // If all children were pruned, then we do not try to read this field
+ continue;
+ }
+ child_fields.push_back(child.field);
+ child_readers.emplace_back(std::move(child_reader));
+ }
+ if (child_fields.size() == 0) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ auto filtered_field =
+ ::arrow::field(arrow_field->name(), ::arrow::struct_(child_fields),
+ arrow_field->nullable(), arrow_field->metadata());
+ out->reset(new StructReader(ctx, filtered_field, field.level_info,
+ std::move(child_readers)));
+ } else {
+ return Status::Invalid("Unsupported nested type: ", arrow_field->ToString());
+ }
+ return Status::OK();
+
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& ctx,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ return GetReader(field, field.field, ctx, out);
+}
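+
+// To summarize the dispatch in GetReader() above: EXTENSION types unwrap to a
+// reader for their storage type and are re-wrapped in an ExtensionReader;
+// leaves become LeafReaders; LIST and MAP become ListReader<int32_t>;
+// LARGE_LIST becomes ListReader<int64_t>; FIXED_SIZE_LIST becomes
+// FixedSizeListReader; and STRUCT becomes a StructReader over whichever child
+// readers survive column pruning.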
+
+} // namespace
+
+Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::unique_ptr<RecordBatchReader>* out) {
+ RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
+
+ if (reader_properties_.pre_buffer()) {
+ // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ reader_->PreBuffer(row_groups, column_indices, reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
+ std::shared_ptr<::arrow::Schema> batch_schema;
+ RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema));
+
+ if (readers.empty()) {
+ // Just generate all batches right now; they're cheap since they have no columns.
+ int64_t batch_size = properties().batch_size();
+ auto max_sized_batch =
+ ::arrow::RecordBatch::Make(batch_schema, batch_size, ::arrow::ArrayVector{});
+
+ ::arrow::RecordBatchVector batches;
+
+ for (int row_group : row_groups) {
+ int64_t num_rows = parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
+
+ batches.insert(batches.end(), num_rows / batch_size, max_sized_batch);
+
+ if (int64_t trailing_rows = num_rows % batch_size) {
+ batches.push_back(max_sized_batch->Slice(0, trailing_rows));
+ }
+ }
+
+ *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
+ ::arrow::MakeVectorIterator(std::move(batches)), std::move(batch_schema));
+
+ return Status::OK();
+ }
+
+ int64_t num_rows = 0;
+ for (int row_group : row_groups) {
+ num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
+ }
+
+ using ::arrow::RecordBatchIterator;
+
+ // NB: This lambda will be invoked outside the scope of this call to
+ // `GetRecordBatchReader()`, so it must capture `readers` and `batch_schema` by value.
+ // `this` is a non-owning pointer so we are relying on the parent FileReader outliving
+ // this RecordBatchReader.
+ ::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
+ [readers, batch_schema, num_rows,
+ this]() mutable -> ::arrow::Result<RecordBatchIterator> {
+ ::arrow::ChunkedArrayVector columns(readers.size());
+
+ // don't reserve more rows than necessary
+ int64_t batch_size = std::min(properties().batch_size(), num_rows);
+ num_rows -= batch_size;
+
+ RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
+ reader_properties_.use_threads(), static_cast<int>(readers.size()),
+ [&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));
+
+ for (const auto& column : columns) {
+ if (column == nullptr || column->length() == 0) {
+ return ::arrow::IterationTraits<RecordBatchIterator>::End();
+ }
+ }
+
+ auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
+ auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);
+
+ // NB: explicitly preserve table so that table_reader doesn't outlive it
+ return ::arrow::MakeFunctionIterator(
+ [table, table_reader] { return table_reader->Next(); });
+ });
+
+ *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
+ ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
+
+ return Status::OK();
+}
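+
+// A minimal sketch of consuming the RecordBatchReader produced above, assuming
+// `file_reader` outlives it as required (indices are illustrative only):
+//
+//   std::unique_ptr<RecordBatchReader> rb_reader;
+//   RETURN_NOT_OK(file_reader->GetRecordBatchReader({0}, {0, 1}, &rb_reader));
+//   std::shared_ptr<::arrow::RecordBatch> batch;
+//   while (true) {
+//     RETURN_NOT_OK(rb_reader->ReadNext(&batch));
+//     if (batch == nullptr) break;  // stream exhausted
+//     // ... process batch ...
+//   }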
+
+/// Given a file reader and a list of row groups, this is a generator of record
+/// batch generators (where each sub-generator is the contents of a single row group).
+class RowGroupGenerator {
+ public:
+ using RecordBatchGenerator =
+ ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>;
+
+ explicit RowGroupGenerator(std::shared_ptr<FileReaderImpl> arrow_reader,
+ ::arrow::internal::Executor* cpu_executor,
+ std::vector<int> row_groups, std::vector<int> column_indices)
+ : arrow_reader_(std::move(arrow_reader)),
+ cpu_executor_(cpu_executor),
+ row_groups_(std::move(row_groups)),
+ column_indices_(std::move(column_indices)),
+ index_(0) {}
+
+ ::arrow::Future<RecordBatchGenerator> operator()() {
+ if (index_ >= row_groups_.size()) {
+ return ::arrow::AsyncGeneratorEnd<RecordBatchGenerator>();
+ }
+ int row_group = row_groups_[index_++];
+ std::vector<int> column_indices = column_indices_;
+ auto reader = arrow_reader_;
+ if (!reader->properties().pre_buffer()) {
+ return SubmitRead(cpu_executor_, reader, row_group, column_indices);
+ }
+ auto ready = reader->parquet_reader()->WhenBuffered({row_group}, column_indices);
+ if (cpu_executor_) ready = cpu_executor_->TransferAlways(ready);
+ return ready.Then([=]() -> ::arrow::Future<RecordBatchGenerator> {
+ return ReadOneRowGroup(cpu_executor_, reader, row_group, column_indices);
+ });
+ }
+
+ private:
+ // Synchronous fallback for when pre-buffer isn't enabled.
+ //
+ // Making the Parquet reader truly asynchronous requires heavy refactoring, so the
+ // generator piggybacks on ReadRangeCache. The lazy ReadRangeCache can be used for
+ // async I/O without forcing readahead.
+ static ::arrow::Future<RecordBatchGenerator> SubmitRead(
+ ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
+ const int row_group, const std::vector<int>& column_indices) {
+ if (!cpu_executor) {
+ return ReadOneRowGroup(cpu_executor, self, row_group, column_indices);
+ }
+ // If we have an executor, then force transfer (even if I/O was complete)
+ return ::arrow::DeferNotOk(cpu_executor->Submit(ReadOneRowGroup, cpu_executor, self,
+ row_group, column_indices));
+ }
+
+ static ::arrow::Future<RecordBatchGenerator> ReadOneRowGroup(
+ ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
+ const int row_group, const std::vector<int>& column_indices) {
+    // Skips bounds checks/pre-buffering, since we've done that already
+ return self->DecodeRowGroups(self, {row_group}, column_indices, cpu_executor)
+ .Then([](const std::shared_ptr<Table>& table)
+ -> ::arrow::Result<RecordBatchGenerator> {
+ ::arrow::TableBatchReader table_reader(*table);
+ ::arrow::RecordBatchVector batches;
+ RETURN_NOT_OK(table_reader.ReadAll(&batches));
+ return ::arrow::MakeVectorGenerator(std::move(batches));
+ });
+ }
+
+ std::shared_ptr<FileReaderImpl> arrow_reader_;
+ ::arrow::internal::Executor* cpu_executor_;
+ std::vector<int> row_groups_;
+ std::vector<int> column_indices_;
+ size_t index_;
+};
+
+::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
+FileReaderImpl::GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor) {
+ RETURN_NOT_OK(BoundsCheck(row_group_indices, column_indices));
+ if (reader_properties_.pre_buffer()) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ reader_->PreBuffer(row_group_indices, column_indices, reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+ ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> row_group_generator =
+ RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
+ cpu_executor, row_group_indices, column_indices);
+ return ::arrow::MakeConcatenatedGenerator(std::move(row_group_generator));
+}
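+
+// A minimal sketch of pulling from the async generator above, assuming the
+// caller keeps `reader` alive via the shared_ptr it passes in (illustrative):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto generator,
+//                         reader->GetRecordBatchGenerator(reader, {0}, {0, 1}));
+//   // Each call yields a Future that resolves to the next RecordBatch, or to
+//   // nullptr once all selected row groups are exhausted.
+//   auto future = generator();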
+
+Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_factory,
+ std::unique_ptr<ColumnReader>* out) {
+ RETURN_NOT_OK(BoundsCheckColumn(i));
+ auto ctx = std::make_shared<ReaderContext>();
+ ctx->reader = reader_.get();
+ ctx->pool = pool_;
+ ctx->iterator_factory = iterator_factory;
+ ctx->filter_leaves = false;
+ std::unique_ptr<ColumnReaderImpl> result;
+ RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result));
+ out->reset(result.release());
+ return Status::OK();
+}
+
+Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<Table>* out) {
+ RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
+
+ // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
+ if (reader_properties_.pre_buffer()) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ parquet_reader()->PreBuffer(row_groups, column_indices,
+ reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
+ /*cpu_executor=*/nullptr);
+ ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult());
+ return Status::OK();
+}
+
+Future<std::shared_ptr<Table>> FileReaderImpl::DecodeRowGroups(
+ std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor) {
+  // `self` is used solely to keep `this` alive in an async context; we also use
+  // this function in a sync context, so prefer `this` over `self`.
+ std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
+ std::shared_ptr<::arrow::Schema> result_schema;
+ RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema));
+ // OptionalParallelForAsync requires an executor
+ if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool();
+
+ auto read_column = [row_groups, self, this](size_t i,
+ std::shared_ptr<ColumnReaderImpl> reader)
+ -> ::arrow::Result<std::shared_ptr<::arrow::ChunkedArray>> {
+ std::shared_ptr<::arrow::ChunkedArray> column;
+ RETURN_NOT_OK(ReadColumn(static_cast<int>(i), row_groups, reader.get(), &column));
+ return column;
+ };
+ auto make_table = [result_schema, row_groups, self,
+ this](const ::arrow::ChunkedArrayVector& columns)
+ -> ::arrow::Result<std::shared_ptr<Table>> {
+ int64_t num_rows = 0;
+ if (!columns.empty()) {
+ num_rows = columns[0]->length();
+ } else {
+ for (int i : row_groups) {
+ num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows();
+ }
+ }
+ auto table = Table::Make(std::move(result_schema), columns, num_rows);
+ RETURN_NOT_OK(table->Validate());
+ return table;
+ };
+ return ::arrow::internal::OptionalParallelForAsync(reader_properties_.use_threads(),
+ std::move(readers), read_column,
+ cpu_executor)
+ .Then(std::move(make_table));
+}
+
+std::shared_ptr<RowGroupReader> FileReaderImpl::RowGroup(int row_group_index) {
+ return std::make_shared<RowGroupReaderImpl>(this, row_group_index);
+}
+
+// ----------------------------------------------------------------------
+// Public factory functions
+
+Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ std::unique_ptr<RecordBatchReader> tmp;
+ ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, &tmp));
+ out->reset(tmp.release());
+ return Status::OK();
+}
+
+Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ std::unique_ptr<RecordBatchReader> tmp;
+ ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, &tmp));
+ out->reset(tmp.release());
+ return Status::OK();
+}
+
+Status FileReader::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ const ArrowReaderProperties& properties,
+ std::unique_ptr<FileReader>* out) {
+ out->reset(new FileReaderImpl(pool, std::move(reader), properties));
+ return static_cast<FileReaderImpl*>(out->get())->Init();
+}
+
+Status FileReader::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ std::unique_ptr<FileReader>* out) {
+ return Make(pool, std::move(reader), default_arrow_reader_properties(), out);
+}
+
+FileReaderBuilder::FileReaderBuilder()
+ : pool_(::arrow::default_memory_pool()),
+ properties_(default_arrow_reader_properties()) {}
+
+Status FileReaderBuilder::Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+ const ReaderProperties& properties,
+ std::shared_ptr<FileMetaData> metadata) {
+ PARQUET_CATCH_NOT_OK(raw_reader_ = ParquetReader::Open(std::move(file), properties,
+ std::move(metadata)));
+ return Status::OK();
+}
+
+FileReaderBuilder* FileReaderBuilder::memory_pool(::arrow::MemoryPool* pool) {
+ pool_ = pool;
+ return this;
+}
+
+FileReaderBuilder* FileReaderBuilder::properties(
+ const ArrowReaderProperties& arg_properties) {
+ properties_ = arg_properties;
+ return this;
+}
+
+Status FileReaderBuilder::Build(std::unique_ptr<FileReader>* out) {
+ return FileReader::Make(pool_, std::move(raw_reader_), properties_, out);
+}
+
+Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool,
+ std::unique_ptr<FileReader>* reader) {
+ FileReaderBuilder builder;
+ RETURN_NOT_OK(builder.Open(std::move(file)));
+ return builder.memory_pool(pool)->Build(reader);
+}
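+
+// A minimal sketch of the one-call open path above, assuming `infile` is an
+// already-opened ::arrow::io::RandomAccessFile (illustrative only):
+//
+//   std::unique_ptr<FileReader> reader;
+//   RETURN_NOT_OK(OpenFile(infile, ::arrow::default_memory_pool(), &reader));
+//   std::shared_ptr<::arrow::Table> table;
+//   RETURN_NOT_OK(reader->ReadTable(&table));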
+
+namespace internal {
+
+Status FuzzReader(std::unique_ptr<FileReader> reader) {
+ auto st = Status::OK();
+ for (int i = 0; i < reader->num_row_groups(); ++i) {
+ std::shared_ptr<Table> table;
+ auto row_group_status = reader->ReadRowGroup(i, &table);
+ if (row_group_status.ok()) {
+ row_group_status &= table->ValidateFull();
+ }
+ st &= row_group_status;
+ }
+ return st;
+}
+
+Status FuzzReader(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<::arrow::Buffer>(data, size);
+ auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
+ FileReaderBuilder builder;
+ RETURN_NOT_OK(builder.Open(std::move(file)));
+
+ std::unique_ptr<FileReader> reader;
+ RETURN_NOT_OK(builder.Build(&reader));
+ return FuzzReader(std::move(reader));
+}
+
+} // namespace internal
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
index 2d6a5ef2c3e..e8a2dd889da 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
@@ -1,343 +1,343 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-// N.B. we don't include async_generator.h as it's relatively heavy
-#include <functional>
-#include <memory>
-#include <vector>
-
-#include "parquet/file_reader.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-
-namespace arrow {
-
-class ChunkedArray;
-class KeyValueMetadata;
-class RecordBatchReader;
-struct Scalar;
-class Schema;
-class Table;
-class RecordBatch;
-
-} // namespace arrow
-
-namespace parquet {
-
-class FileMetaData;
-class SchemaDescriptor;
-
-namespace arrow {
-
-class ColumnChunkReader;
-class ColumnReader;
-struct SchemaManifest;
-class RowGroupReader;
-
-/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
-///
-/// This interface caters to different use cases and thus provides different
-/// entry points. In its simplest form, it caters to a user who wants to
-/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
-///
-/// More advanced users who also want to implement parallelism on top of each
-/// single Parquet file should do this on the RowGroup level. For this, they can
-/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
-/// RowGroup as a table.
-///
-/// In the most advanced situation, where a consumer wants to independently read
-/// RowGroups in parallel and consume each column individually, they can call
-/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
-/// instance.
-///
-/// The parquet format supports an optional integer field_id which can be assigned
-/// to a field. Arrow will convert these field IDs to a metadata key named
-/// PARQUET:field_id on the appropriate field.
-// TODO(wesm): nested data does not always make sense with this user
-// interface unless you are only reading a single leaf node from a branch of
-// a table. For example:
-//
-// repeated group data {
-// optional group record {
-// optional int32 val1;
-// optional byte_array val2;
-// optional bool val3;
-// }
-// optional int32 val4;
-// }
-//
-// In the Parquet file, there are 4 leaf nodes:
-//
-// * data.record.val1
-// * data.record.val2
-// * data.record.val3
-// * data.val4
-//
-// When materializing this data in an Arrow array, we would have:
-//
-// data: list<struct<
-// record: struct<
-// val1: int32,
-// val2: string (= list<uint8>),
-// val3: bool,
-// >,
-// val4: int32
-// >>
-//
-// However, in the Parquet format, each leaf node has its own repetition and
-// definition levels describing the structure of the intermediate nodes in
-// this array structure. Thus, we will need to scan the leaf data for a group
-// of leaf nodes part of the same type tree to create a single result Arrow
-// nested array structure.
-//
-// This is additionally complicated by "chunky" repeated fields or very large
-// byte arrays
-class PARQUET_EXPORT FileReader {
- public:
- /// Factory function to create a FileReader from a ParquetFileReader and properties
- static ::arrow::Status Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- const ArrowReaderProperties& properties,
- std::unique_ptr<FileReader>* out);
-
- /// Factory function to create a FileReader from a ParquetFileReader
- static ::arrow::Status Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- std::unique_ptr<FileReader>* out);
-
- // Since the distribution of columns amongst a Parquet file's row groups may
- // be uneven (the number of values in each column chunk can be different), we
- // provide a column-oriented read interface. The ColumnReader hides the
- // details of paging through the file's row groups and yielding
- // fully-materialized arrow::Array instances
- //
- // Returns error status if the column of interest is not flat.
- virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
-
- /// \brief Return arrow schema for all the columns.
- virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
-
- /// \brief Read column as a whole into a chunked array.
- ///
- /// The indicated column index is relative to the schema
- virtual ::arrow::Status ReadColumn(int i,
- std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-
- // NOTE: Experimental API
- // Reads a specific top level schema field into an Array
-// The index i refers to the index of the top level schema field, which may
- // be nested or flat - e.g.
- //
- // 0 foo.bar
- // foo.bar.baz
- // foo.qux
- // 1 foo2
- // 2 foo3
- //
- // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
- virtual ::arrow::Status ReadSchemaField(
- int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-
- /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
- ///
- /// Note that the ordering in row_group_indices matters. FileReaders must outlive
- /// their RecordBatchReaders.
- ///
- /// \returns error Status if row_group_indices contains an invalid index
- virtual ::arrow::Status GetRecordBatchReader(
- const std::vector<int>& row_group_indices,
- std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
-
- ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- std::shared_ptr<::arrow::RecordBatchReader>* out);
-
- /// \brief Return a RecordBatchReader of row groups selected from
- /// row_group_indices, whose columns are selected by column_indices.
- ///
- /// Note that the ordering in row_group_indices and column_indices
- /// matter. FileReaders must outlive their RecordBatchReaders.
- ///
- /// \returns error Status if either row_group_indices or column_indices
- /// contains an invalid index
- virtual ::arrow::Status GetRecordBatchReader(
- const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
- std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
-
- /// \brief Return a generator of record batches.
- ///
- /// The FileReader must outlive the generator, so this requires that you pass in a
- /// shared_ptr.
- ///
- /// \returns error Result if either row_group_indices or column_indices contains an
- /// invalid index
- virtual ::arrow::Result<
- std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
- GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
- const std::vector<int> row_group_indices,
- const std::vector<int> column_indices,
- ::arrow::internal::Executor* cpu_executor = NULLPTR) = 0;
-
- ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::RecordBatchReader>* out);
-
- /// Read all columns into a Table
- virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
-
- /// \brief Read the given columns into a Table
- ///
- /// The indicated column indices are relative to the schema
- virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- /// \brief Scan file contents with one thread, return number of rows
- virtual ::arrow::Status ScanContents(std::vector<int> columns,
- const int32_t column_batch_size,
- int64_t* num_rows) = 0;
-
- /// \brief Return a reader for the RowGroup, this object must not outlive the
- /// FileReader.
- virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
-
- /// \brief The number of row groups in the file
- virtual int num_row_groups() const = 0;
-
- virtual ParquetFileReader* parquet_reader() const = 0;
-
- /// Set whether to use multiple threads during reads of multiple columns.
- /// By default only one thread is used.
- virtual void set_use_threads(bool use_threads) = 0;
-
- /// Set number of records to read per batch for the RecordBatchReader.
- virtual void set_batch_size(int64_t batch_size) = 0;
-
- virtual const ArrowReaderProperties& properties() const = 0;
-
- virtual const SchemaManifest& manifest() const = 0;
-
- virtual ~FileReader() = default;
-};
-
-class RowGroupReader {
- public:
- virtual ~RowGroupReader() = default;
- virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
- virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
- virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
-
- private:
- struct Iterator;
-};
-
-class ColumnChunkReader {
- public:
- virtual ~ColumnChunkReader() = default;
- virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-};
-
-// At this point, the column reader is a stream iterator. It only knows how to
-// read the next batch of values for a particular column from the file until it
-// runs out.
-//
-// We also do not expose any internal Parquet details, such as row groups. This
-// might change in the future.
-class PARQUET_EXPORT ColumnReader {
- public:
- virtual ~ColumnReader() = default;
-
- // Scan the next array of the indicated size. The actual size of the
- // returned array may be less than the passed size depending how much data is
- // available in the file.
- //
- // When all the data in the file has been exhausted, the result is set to
- // nullptr.
- //
- // Returns Status::OK on a successful read, including if you have exhausted
- // the data available in the file.
- virtual ::arrow::Status NextBatch(int64_t batch_size,
- std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-};
-
-/// \brief Experimental helper class for bindings (like Python) that struggle
-/// either with std::move or C++ exceptions
-class PARQUET_EXPORT FileReaderBuilder {
- public:
- FileReaderBuilder();
-
- /// Create FileReaderBuilder from Arrow file and optional properties / metadata
- ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
- const ReaderProperties& properties = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- ParquetFileReader* raw_reader() { return raw_reader_.get(); }
-
- /// Set Arrow MemoryPool for memory allocation
- FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
- /// Set Arrow reader properties
- FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
- /// Build FileReader instance
- ::arrow::Status Build(std::unique_ptr<FileReader>* out);
-
- private:
- ::arrow::MemoryPool* pool_;
- ArrowReaderProperties properties_;
- std::unique_ptr<ParquetFileReader> raw_reader_;
-};
-
-/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
-///
-/// @{
-
-/// \brief Build FileReader from Arrow file and MemoryPool
-///
-/// Advanced settings are supported through the FileReaderBuilder class.
-PARQUET_EXPORT
-::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
- ::arrow::MemoryPool* allocator,
- std::unique_ptr<FileReader>* reader);
-
-/// @}
-
-PARQUET_EXPORT
-::arrow::Status StatisticsAsScalars(const Statistics& Statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max);
-
-namespace internal {
-
-PARQUET_EXPORT
-::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
-
-} // namespace internal
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+// N.B. we don't include async_generator.h as it's relatively heavy
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "parquet/file_reader.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class ChunkedArray;
+class KeyValueMetadata;
+class RecordBatchReader;
+struct Scalar;
+class Schema;
+class Table;
+class RecordBatch;
+
+} // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class SchemaDescriptor;
+
+namespace arrow {
+
+class ColumnChunkReader;
+class ColumnReader;
+struct SchemaManifest;
+class RowGroupReader;
+
+/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
+///
+/// This interface caters to different use cases and thus provides different
+/// entry points. In its simplest form, it caters to a user who wants to
+/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
+///
+/// More advanced users who also want to implement parallelism on top of each
+/// single Parquet file should do this on the RowGroup level. For this, they can
+/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
+/// RowGroup as a table.
+///
+/// In the most advanced situation, where a consumer wants to independently read
+/// RowGroups in parallel and consume each column individually, they can call
+/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
+/// instance.
+///
+/// The parquet format supports an optional integer field_id which can be assigned
+/// to a field. Arrow will convert these field IDs to a metadata key named
+/// PARQUET:field_id on the appropriate field.
+// TODO(wesm): nested data does not always make sense with this user
+// interface unless you are only reading a single leaf node from a branch of
+// a table. For example:
+//
+// repeated group data {
+// optional group record {
+// optional int32 val1;
+// optional byte_array val2;
+// optional bool val3;
+// }
+// optional int32 val4;
+// }
+//
+// In the Parquet file, there are 4 leaf nodes:
+//
+// * data.record.val1
+// * data.record.val2
+// * data.record.val3
+// * data.val4
+//
+// When materializing this data in an Arrow array, we would have:
+//
+// data: list<struct<
+// record: struct<
+// val1: int32,
+// val2: string (= list<uint8>),
+// val3: bool,
+// >,
+// val4: int32
+// >>
+//
+// However, in the Parquet format, each leaf node has its own repetition and
+// definition levels describing the structure of the intermediate nodes in
+// this array structure. Thus, we will need to scan the leaf data for a group
+// of leaf nodes part of the same type tree to create a single result Arrow
+// nested array structure.
+//
+// This is additionally complicated by "chunky" repeated fields or very large
+// byte arrays
+class PARQUET_EXPORT FileReader {
+ public:
+ /// Factory function to create a FileReader from a ParquetFileReader and properties
+ static ::arrow::Status Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ const ArrowReaderProperties& properties,
+ std::unique_ptr<FileReader>* out);
+
+ /// Factory function to create a FileReader from a ParquetFileReader
+ static ::arrow::Status Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ std::unique_ptr<FileReader>* out);
+
+ // Since the distribution of columns amongst a Parquet file's row groups may
+ // be uneven (the number of values in each column chunk can be different), we
+ // provide a column-oriented read interface. The ColumnReader hides the
+ // details of paging through the file's row groups and yielding
+ // fully-materialized arrow::Array instances
+ //
+ // Returns error status if the column of interest is not flat.
+ virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
+
+ /// \brief Return arrow schema for all the columns.
+ virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
+
+ /// \brief Read column as a whole into a chunked array.
+ ///
+ /// The indicated column index is relative to the schema
+ virtual ::arrow::Status ReadColumn(int i,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+
+ // NOTE: Experimental API
+ // Reads a specific top level schema field into an Array
+// The index i refers to the index of the top level schema field, which may
+ // be nested or flat - e.g.
+ //
+  // 0 foo.bar
+  //       foo.bar.baz
+  //       foo.qux
+  // 1 foo2
+  // 2 foo3
+ //
+  // i=0 will read the entire foo struct, i=1 the foo2 primitive column, etc.
+ virtual ::arrow::Status ReadSchemaField(
+ int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+
+ /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
+ ///
+ /// Note that the ordering in row_group_indices matters. FileReaders must outlive
+ /// their RecordBatchReaders.
+ ///
+ /// \returns error Status if row_group_indices contains an invalid index
+ virtual ::arrow::Status GetRecordBatchReader(
+ const std::vector<int>& row_group_indices,
+ std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
+
+ ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+ /// \brief Return a RecordBatchReader of row groups selected from
+ /// row_group_indices, whose columns are selected by column_indices.
+ ///
+ /// Note that the ordering in row_group_indices and column_indices
+ /// matter. FileReaders must outlive their RecordBatchReaders.
+ ///
+ /// \returns error Status if either row_group_indices or column_indices
+ /// contains an invalid index
+ virtual ::arrow::Status GetRecordBatchReader(
+ const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
+ std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
+
+ /// \brief Return a generator of record batches.
+ ///
+ /// The FileReader must outlive the generator, so this requires that you pass in a
+ /// shared_ptr.
+ ///
+ /// \returns error Result if either row_group_indices or column_indices contains an
+ /// invalid index
+ virtual ::arrow::Result<
+ std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
+ GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor = NULLPTR) = 0;
+
+ ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::RecordBatchReader>* out);
+
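+  // A hedged usage sketch of batch-wise consumption (error handling elided;
+  // the FileReader must stay alive while the RecordBatchReader is used, as
+  // noted above):
+  //
+  //   std::shared_ptr<::arrow::RecordBatchReader> batch_reader;
+  //   ARROW_RETURN_NOT_OK(reader->GetRecordBatchReader({0, 1}, &batch_reader));
+  //   std::shared_ptr<::arrow::RecordBatch> batch;
+  //   while (true) {
+  //     ARROW_RETURN_NOT_OK(batch_reader->ReadNext(&batch));
+  //     if (batch == nullptr) break;  // stream exhausted
+  //     // ... process batch ...
+  //   }
+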
+ /// Read all columns into a Table
+ virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+ /// \brief Read the given columns into a Table
+ ///
+ /// The indicated column indices are relative to the schema
+ virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ /// \brief Scan file contents with one thread, return number of rows
+ virtual ::arrow::Status ScanContents(std::vector<int> columns,
+ const int32_t column_batch_size,
+ int64_t* num_rows) = 0;
+
+  /// \brief Return a reader for the RowGroup; this object must not outlive the
+ /// FileReader.
+ virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
+
+ /// \brief The number of row groups in the file
+ virtual int num_row_groups() const = 0;
+
+ virtual ParquetFileReader* parquet_reader() const = 0;
+
+ /// Set whether to use multiple threads during reads of multiple columns.
+ /// By default only one thread is used.
+ virtual void set_use_threads(bool use_threads) = 0;
+
+ /// Set number of records to read per batch for the RecordBatchReader.
+ virtual void set_batch_size(int64_t batch_size) = 0;
+
+ virtual const ArrowReaderProperties& properties() const = 0;
+
+ virtual const SchemaManifest& manifest() const = 0;
+
+ virtual ~FileReader() = default;
+};
+
+class RowGroupReader {
+ public:
+ virtual ~RowGroupReader() = default;
+ virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
+ virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+ virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+ private:
+ struct Iterator;
+};
+
+class ColumnChunkReader {
+ public:
+ virtual ~ColumnChunkReader() = default;
+ virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
+
+// At this point, the column reader is a stream iterator. It only knows how to
+// read the next batch of values for a particular column from the file until it
+// runs out.
+//
+// We also do not expose any internal Parquet details, such as row groups. This
+// might change in the future.
+class PARQUET_EXPORT ColumnReader {
+ public:
+ virtual ~ColumnReader() = default;
+
+ // Scan the next array of the indicated size. The actual size of the
+  // returned array may be less than the passed size depending on how much data is
+ // available in the file.
+ //
+ // When all the data in the file has been exhausted, the result is set to
+ // nullptr.
+ //
+ // Returns Status::OK on a successful read, including if you have exhausted
+ // the data available in the file.
+ virtual ::arrow::Status NextBatch(int64_t batch_size,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
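+
+// A minimal sketch of streaming a single (flat) column through ColumnReader
+// (assuming `reader` is a FileReader; error handling elided):
+//
+//   std::unique_ptr<ColumnReader> col;
+//   ARROW_RETURN_NOT_OK(reader->GetColumn(0, &col));
+//   std::shared_ptr<::arrow::ChunkedArray> batch;
+//   while (true) {
+//     ARROW_RETURN_NOT_OK(col->NextBatch(/*batch_size=*/4096, &batch));
+//     if (batch == nullptr) break;  // data exhausted
+//     // ... process batch ...
+//   }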
+
+/// \brief Experimental helper class for bindings (like Python) that struggle
+/// either with std::move or C++ exceptions
+class PARQUET_EXPORT FileReaderBuilder {
+ public:
+ FileReaderBuilder();
+
+ /// Create FileReaderBuilder from Arrow file and optional properties / metadata
+ ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+ const ReaderProperties& properties = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ ParquetFileReader* raw_reader() { return raw_reader_.get(); }
+
+ /// Set Arrow MemoryPool for memory allocation
+ FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
+ /// Set Arrow reader properties
+ FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
+ /// Build FileReader instance
+ ::arrow::Status Build(std::unique_ptr<FileReader>* out);
+
+ private:
+ ::arrow::MemoryPool* pool_;
+ ArrowReaderProperties properties_;
+ std::unique_ptr<ParquetFileReader> raw_reader_;
+};
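+
+// A hedged usage sketch for FileReaderBuilder (assuming `file` is an open
+// ::arrow::io::RandomAccessFile; error handling elided):
+//
+//   FileReaderBuilder builder;
+//   ARROW_RETURN_NOT_OK(builder.Open(file));
+//   std::unique_ptr<FileReader> reader;
+//   ARROW_RETURN_NOT_OK(
+//       builder.memory_pool(::arrow::default_memory_pool())->Build(&reader));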
+
+/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
+///
+/// @{
+
+/// \brief Build FileReader from Arrow file and MemoryPool
+///
+/// Advanced settings are supported through the FileReaderBuilder class.
+PARQUET_EXPORT
+::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
+ ::arrow::MemoryPool* allocator,
+ std::unique_ptr<FileReader>* reader);
+
+/// @}
+
+PARQUET_EXPORT
+::arrow::Status StatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max);
+
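+// A hedged sketch of feeding column-chunk statistics from the file metadata
+// into StatisticsAsScalars (assuming the chunk actually carries statistics;
+// error handling elided):
+//
+//   auto stats = reader->parquet_reader()->metadata()
+//                    ->RowGroup(0)->ColumnChunk(0)->statistics();
+//   std::shared_ptr<::arrow::Scalar> min, max;
+//   ARROW_RETURN_NOT_OK(StatisticsAsScalars(*stats, &min, &max));
+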
+namespace internal {
+
+PARQUET_EXPORT
+::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
+
+} // namespace internal
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
index f13687079d4..3fbbfa8da26 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
@@ -1,791 +1,791 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/reader_internal.h"
-
-#include <algorithm>
-#include <climits>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/compute/api.h"
-#include "arrow/datum.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/reader.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/scalar.h"
-#include "arrow/status.h"
-#include "arrow/table.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/base64.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/endian.h"
-#include "arrow/util/int_util_internal.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/string_view.h"
-#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/arrow/reader.h"
-#include "parquet/arrow/schema.h"
-#include "parquet/arrow/schema_internal.h"
-#include "parquet/column_reader.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
-#include "parquet/windows_compatibility.h"
-
-using arrow::Array;
-using arrow::BooleanArray;
-using arrow::ChunkedArray;
-using arrow::DataType;
-using arrow::Datum;
-using arrow::Decimal128;
-using arrow::Decimal128Array;
-using arrow::Decimal128Type;
-using arrow::Decimal256;
-using arrow::Decimal256Array;
-using arrow::Decimal256Type;
-using arrow::Field;
-using arrow::Int32Array;
-using arrow::ListArray;
-using arrow::MemoryPool;
-using arrow::ResizableBuffer;
-using arrow::Status;
-using arrow::StructArray;
-using arrow::Table;
-using arrow::TimestampArray;
-
-using ::arrow::BitUtil::FromBigEndian;
-using ::arrow::internal::checked_cast;
-using ::arrow::internal::checked_pointer_cast;
-using ::arrow::internal::SafeLeftShift;
-using ::arrow::util::SafeLoadAs;
-
-using parquet::internal::BinaryRecordReader;
-using parquet::internal::DictionaryRecordReader;
-using parquet::internal::RecordReader;
-using parquet::schema::GroupNode;
-using parquet::schema::Node;
-using parquet::schema::PrimitiveNode;
-using ParquetType = parquet::Type;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-namespace arrow {
-namespace {
-
-template <typename ArrowType>
-using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
-
-template <typename CType, typename StatisticsType>
-Status MakeMinMaxScalar(const StatisticsType& statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- *min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
- *max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
- return Status::OK();
-}
-
-template <typename CType, typename StatisticsType>
-Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
- std::shared_ptr<DataType> type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
- ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
- return Status::OK();
-}
-
-template <typename StatisticsType>
-Status MakeMinMaxIntegralScalar(const StatisticsType& statistics,
- const ::arrow::DataType& arrow_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- const auto column_desc = statistics.descr();
- const auto& logical_type = column_desc->logical_type();
- const auto& integer = checked_pointer_cast<const IntLogicalType>(logical_type);
- const bool is_signed = integer->is_signed();
-
- switch (integer->bit_width()) {
- case 8:
- return is_signed ? MakeMinMaxScalar<int8_t>(statistics, min, max)
- : MakeMinMaxScalar<uint8_t>(statistics, min, max);
- case 16:
- return is_signed ? MakeMinMaxScalar<int16_t>(statistics, min, max)
- : MakeMinMaxScalar<uint16_t>(statistics, min, max);
- case 32:
- return is_signed ? MakeMinMaxScalar<int32_t>(statistics, min, max)
- : MakeMinMaxScalar<uint32_t>(statistics, min, max);
- case 64:
- return is_signed ? MakeMinMaxScalar<int64_t>(statistics, min, max)
- : MakeMinMaxScalar<uint64_t>(statistics, min, max);
- }
-
- return Status::OK();
-}
-
-static Status FromInt32Statistics(const Int32Statistics& statistics,
- const LogicalType& logical_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- ARROW_ASSIGN_OR_RAISE(auto type, FromInt32(logical_type));
-
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeMinMaxIntegralScalar(statistics, *type, min, max);
- break;
- case LogicalType::Type::DATE:
- case LogicalType::Type::TIME:
- case LogicalType::Type::NONE:
- return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
- break;
- default:
- break;
- }
-
- return Status::NotImplemented("Cannot extract statistics for type ");
-}
-
-static Status FromInt64Statistics(const Int64Statistics& statistics,
- const LogicalType& logical_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- ARROW_ASSIGN_OR_RAISE(auto type, FromInt64(logical_type));
-
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeMinMaxIntegralScalar(statistics, *type, min, max);
- break;
- case LogicalType::Type::TIME:
- case LogicalType::Type::TIMESTAMP:
- case LogicalType::Type::NONE:
- return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
- break;
- default:
- break;
- }
-
- return Status::NotImplemented("Cannot extract statistics for type ");
-}
-
-template <typename DecimalType>
-Result<std::shared_ptr<::arrow::Scalar>> FromBigEndianString(
- const std::string& data, std::shared_ptr<DataType> arrow_type) {
- ARROW_ASSIGN_OR_RAISE(
- DecimalType decimal,
- DecimalType::FromBigEndian(reinterpret_cast<const uint8_t*>(data.data()),
- static_cast<int32_t>(data.size())));
- return ::arrow::MakeScalar(std::move(arrow_type), decimal);
-}
-
-// Extracts Min and Max scalars from byte-like types (i.e. types where the
-// decimal is encoded as big-endian bytes).
-Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics,
- const LogicalType& logical_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- const DecimalLogicalType& decimal_type =
- checked_cast<const DecimalLogicalType&>(logical_type);
-
- Result<std::shared_ptr<DataType>> maybe_type =
- Decimal128Type::Make(decimal_type.precision(), decimal_type.scale());
- std::shared_ptr<DataType> arrow_type;
- if (maybe_type.ok()) {
- arrow_type = maybe_type.ValueOrDie();
- ARROW_ASSIGN_OR_RAISE(
- *min, FromBigEndianString<Decimal128>(statistics.EncodeMin(), arrow_type));
- ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal128>(statistics.EncodeMax(),
- std::move(arrow_type)));
- return Status::OK();
- }
- // Fallback to see if Decimal256 can represent the type.
- ARROW_ASSIGN_OR_RAISE(
- arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale()));
- ARROW_ASSIGN_OR_RAISE(
- *min, FromBigEndianString<Decimal256>(statistics.EncodeMin(), arrow_type));
- ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal256>(statistics.EncodeMax(),
- std::move(arrow_type)));
-
- return Status::OK();
-}
-
-Status ByteArrayStatisticsAsScalars(const Statistics& statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- auto logical_type = statistics.descr()->logical_type();
- if (logical_type->type() == LogicalType::Type::DECIMAL) {
- return ExtractDecimalMinMaxFromBytesType(statistics, *logical_type, min, max);
- }
- std::shared_ptr<::arrow::DataType> type;
- if (statistics.descr()->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
- type = ::arrow::fixed_size_binary(statistics.descr()->type_length());
- } else {
- type = logical_type->type() == LogicalType::Type::STRING ? ::arrow::utf8()
- : ::arrow::binary();
- }
- ARROW_ASSIGN_OR_RAISE(
- *min, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMin())));
- ARROW_ASSIGN_OR_RAISE(
- *max, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMax())));
-
- return Status::OK();
-}
-
-} // namespace
-
-Status StatisticsAsScalars(const Statistics& statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- if (!statistics.HasMinMax()) {
- return Status::Invalid("Statistics has no min max.");
- }
-
- auto column_desc = statistics.descr();
- if (column_desc == nullptr) {
- return Status::Invalid("Statistics carries no descriptor, can't infer arrow type.");
- }
-
- auto physical_type = column_desc->physical_type();
- auto logical_type = column_desc->logical_type();
- switch (physical_type) {
- case Type::BOOLEAN:
- return MakeMinMaxScalar<bool, BoolStatistics>(
- checked_cast<const BoolStatistics&>(statistics), min, max);
- case Type::FLOAT:
- return MakeMinMaxScalar<float, FloatStatistics>(
- checked_cast<const FloatStatistics&>(statistics), min, max);
- case Type::DOUBLE:
- return MakeMinMaxScalar<double, DoubleStatistics>(
- checked_cast<const DoubleStatistics&>(statistics), min, max);
- case Type::INT32:
- return FromInt32Statistics(checked_cast<const Int32Statistics&>(statistics),
- *logical_type, min, max);
- case Type::INT64:
- return FromInt64Statistics(checked_cast<const Int64Statistics&>(statistics),
- *logical_type, min, max);
- case Type::BYTE_ARRAY:
- case Type::FIXED_LEN_BYTE_ARRAY:
- return ByteArrayStatisticsAsScalars(statistics, min, max);
- default:
- return Status::NotImplemented("Extract statistics unsupported for physical_type ",
- physical_type, " unsupported.");
- }
-
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Primitive types
-
-namespace {
-
-template <typename ArrowType, typename ParquetType>
-Status TransferInt(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- using ArrowCType = typename ArrowType::c_type;
- using ParquetCType = typename ParquetType::c_type;
- int64_t length = reader->values_written();
- ARROW_ASSIGN_OR_RAISE(auto data,
- ::arrow::AllocateBuffer(length * sizeof(ArrowCType), pool));
-
- auto values = reinterpret_cast<const ParquetCType*>(reader->values());
- auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data());
- std::copy(values, values + length, out_ptr);
- *out = std::make_shared<ArrayType<ArrowType>>(
- type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
- return Status::OK();
-}
-
-std::shared_ptr<Array> TransferZeroCopy(RecordReader* reader,
- const std::shared_ptr<DataType>& type) {
- std::vector<std::shared_ptr<Buffer>> buffers = {reader->ReleaseIsValid(),
- reader->ReleaseValues()};
- auto data = std::make_shared<::arrow::ArrayData>(type, reader->values_written(),
- buffers, reader->null_count());
- return ::arrow::MakeArray(data);
-}
-
-Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) {
- int64_t length = reader->values_written();
-
- const int64_t buffer_size = BitUtil::BytesForBits(length);
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, pool));
-
- // Transfer boolean values to packed bitmap
- auto values = reinterpret_cast<const bool*>(reader->values());
- uint8_t* data_ptr = data->mutable_data();
- memset(data_ptr, 0, buffer_size);
-
- for (int64_t i = 0; i < length; i++) {
- if (values[i]) {
- ::arrow::BitUtil::SetBit(data_ptr, i);
- }
- }
-
- *out = std::make_shared<BooleanArray>(length, std::move(data), reader->ReleaseIsValid(),
- reader->null_count());
- return Status::OK();
-}
-
-Status TransferInt96(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out,
- const ::arrow::TimeUnit::type int96_arrow_time_unit) {
- int64_t length = reader->values_written();
- auto values = reinterpret_cast<const Int96*>(reader->values());
- ARROW_ASSIGN_OR_RAISE(auto data,
- ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
- auto data_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
- for (int64_t i = 0; i < length; i++) {
- if (values[i].value[2] == 0) {
- // Happens for null entries: avoid triggering UBSAN as that Int96 timestamp
- // isn't representable as a 64-bit Unix timestamp.
- *data_ptr++ = 0;
- } else {
- switch (int96_arrow_time_unit) {
- case ::arrow::TimeUnit::NANO:
- *data_ptr++ = Int96GetNanoSeconds(values[i]);
- break;
- case ::arrow::TimeUnit::MICRO:
- *data_ptr++ = Int96GetMicroSeconds(values[i]);
- break;
- case ::arrow::TimeUnit::MILLI:
- *data_ptr++ = Int96GetMilliSeconds(values[i]);
- break;
- case ::arrow::TimeUnit::SECOND:
- *data_ptr++ = Int96GetSeconds(values[i]);
- break;
- }
- }
- }
- *out = std::make_shared<TimestampArray>(type, length, std::move(data),
- reader->ReleaseIsValid(), reader->null_count());
- return Status::OK();
-}
-
-Status TransferDate64(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- int64_t length = reader->values_written();
- auto values = reinterpret_cast<const int32_t*>(reader->values());
-
- ARROW_ASSIGN_OR_RAISE(auto data,
- ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
- auto out_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
-
- for (int64_t i = 0; i < length; i++) {
- *out_ptr++ = static_cast<int64_t>(values[i]) * kMillisecondsPerDay;
- }
-
- *out = std::make_shared<::arrow::Date64Array>(
- type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Binary, direct to dictionary-encoded
-
-Status TransferDictionary(RecordReader* reader,
- const std::shared_ptr<DataType>& logical_value_type,
- std::shared_ptr<ChunkedArray>* out) {
- auto dict_reader = dynamic_cast<DictionaryRecordReader*>(reader);
- DCHECK(dict_reader);
- *out = dict_reader->GetResult();
- if (!logical_value_type->Equals(*(*out)->type())) {
- ARROW_ASSIGN_OR_RAISE(*out, (*out)->View(logical_value_type));
- }
- return Status::OK();
-}
-
-Status TransferBinary(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& logical_value_type,
- std::shared_ptr<ChunkedArray>* out) {
- if (reader->read_dictionary()) {
- return TransferDictionary(
- reader, ::arrow::dictionary(::arrow::int32(), logical_value_type), out);
- }
- ::arrow::compute::ExecContext ctx(pool);
- ::arrow::compute::CastOptions cast_options;
- cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data
-
- auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
- DCHECK(binary_reader);
- auto chunks = binary_reader->GetBuilderChunks();
- for (auto& chunk : chunks) {
- if (!chunk->type()->Equals(*logical_value_type)) {
- // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets
- // will be lost because they are first created as int32 and then cast to int64.
- ARROW_ASSIGN_OR_RAISE(
- chunk, ::arrow::compute::Cast(*chunk, logical_value_type, cast_options, &ctx));
- }
- }
- *out = std::make_shared<ChunkedArray>(chunks, logical_value_type);
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 || Decimal256
-
-template <typename DecimalType>
-Status RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width,
- uint8_t* out_buf) {
- ARROW_ASSIGN_OR_RAISE(DecimalType t, DecimalType::FromBigEndian(value, byte_width));
- t.ToBytes(out_buf);
- return ::arrow::Status::OK();
-}
-
-template <typename DecimalArrayType>
-struct DecimalTypeTrait;
-
-template <>
-struct DecimalTypeTrait<::arrow::Decimal128Array> {
- using value = ::arrow::Decimal128;
-};
-
-template <>
-struct DecimalTypeTrait<::arrow::Decimal256Array> {
- using value = ::arrow::Decimal256;
-};
-
-template <typename DecimalArrayType, typename ParquetType>
-struct DecimalConverter {
- static inline Status ConvertToDecimal(const Array& array,
- const std::shared_ptr<DataType>&,
- MemoryPool* pool, std::shared_ptr<Array>*) {
- return Status::NotImplemented("not implemented");
- }
-};
-
-template <typename DecimalArrayType>
-struct DecimalConverter<DecimalArrayType, FLBAType> {
- static inline Status ConvertToDecimal(const Array& array,
- const std::shared_ptr<DataType>& type,
- MemoryPool* pool, std::shared_ptr<Array>* out) {
- const auto& fixed_size_binary_array =
- checked_cast<const ::arrow::FixedSizeBinaryArray&>(array);
-
- // The byte width of each decimal value
- const int32_t type_length =
- checked_cast<const ::arrow::DecimalType&>(*type).byte_width();
-
- // number of elements in the entire array
- const int64_t length = fixed_size_binary_array.length();
-
- // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time
- // this will be different from the decimal array width because we write the minimum
- // number of bytes necessary to represent a given precision
- const int32_t byte_width =
- checked_cast<const ::arrow::FixedSizeBinaryType&>(*fixed_size_binary_array.type())
- .byte_width();
- // allocate memory for the decimal array
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
-
- // raw bytes that we can write to
- uint8_t* out_ptr = data->mutable_data();
-
- // convert each FixedSizeBinary value to valid decimal bytes
- const int64_t null_count = fixed_size_binary_array.null_count();
-
- using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
- if (null_count > 0) {
- for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
- if (!fixed_size_binary_array.IsNull(i)) {
- RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
- fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
- } else {
- std::memset(out_ptr, 0, type_length);
- }
- }
- } else {
- for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
- RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
- fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
- }
- }
-
- *out = std::make_shared<DecimalArrayType>(
- type, length, std::move(data), fixed_size_binary_array.null_bitmap(), null_count);
-
- return Status::OK();
- }
-};
-
-template <typename DecimalArrayType>
-struct DecimalConverter<DecimalArrayType, ByteArrayType> {
- static inline Status ConvertToDecimal(const Array& array,
- const std::shared_ptr<DataType>& type,
- MemoryPool* pool, std::shared_ptr<Array>* out) {
- const auto& binary_array = checked_cast<const ::arrow::BinaryArray&>(array);
- const int64_t length = binary_array.length();
-
- const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
- const int64_t type_length = decimal_type.byte_width();
-
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
-
- // raw bytes that we can write to
- uint8_t* out_ptr = data->mutable_data();
-
- const int64_t null_count = binary_array.null_count();
-
- // convert each BinaryArray value to valid decimal bytes
- for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
- int32_t record_len = 0;
- const uint8_t* record_loc = binary_array.GetValue(i, &record_len);
-
- if (record_len < 0 || record_len > type_length) {
- return Status::Invalid("Invalid BYTE_ARRAY length for ", type->ToString());
- }
-
- auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
- out_ptr_view[0] = 0;
- out_ptr_view[1] = 0;
-
-      // Convert only the rows that are not null when nulls are present;
-      // otherwise convert all rows.
- if ((null_count > 0 && !binary_array.IsNull(i)) || null_count <= 0) {
- using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
- RETURN_NOT_OK(
- RawBytesToDecimalBytes<DecimalType>(record_loc, record_len, out_ptr));
- }
- }
- *out = std::make_shared<DecimalArrayType>(type, length, std::move(data),
- binary_array.null_bitmap(), null_count);
- return Status::OK();
- }
-};
-
-/// \brief Convert an Int32 or Int64 array into a Decimal128Array
-/// The Parquet spec allows systems to write decimals as int32 or int64 if the
-/// values are small enough to fit in 4 bytes or 8 bytes, respectively.
-/// This function implements the conversion from int32 and int64 arrays to decimal arrays.
-template <
- typename ParquetIntegerType,
- typename = ::arrow::enable_if_t<std::is_same<ParquetIntegerType, Int32Type>::value ||
- std::is_same<ParquetIntegerType, Int64Type>::value>>
-static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- // Decimal128 and Decimal256 are only Arrow constructs. Parquet does not
- // specifically distinguish between decimal byte widths.
- // Decimal256 isn't relevant here because the Arrow-Parquet C++ bindings never
- // write Decimal values as integers and if the decimal value can fit in an
- // integer it is wasteful to use Decimal256. Put another way, the only
- // way an integer column could be construed as Decimal256 is if an arrow
- // schema was stored as metadata in the file indicating the column was
- // Decimal256. The current Arrow-Parquet C++ bindings will never do this.
- DCHECK(type->id() == ::arrow::Type::DECIMAL128);
-
- const int64_t length = reader->values_written();
-
- using ElementType = typename ParquetIntegerType::c_type;
- static_assert(std::is_same<ElementType, int32_t>::value ||
- std::is_same<ElementType, int64_t>::value,
- "ElementType must be int32_t or int64_t");
-
- const auto values = reinterpret_cast<const ElementType*>(reader->values());
-
- const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
- const int64_t type_length = decimal_type.byte_width();
-
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
- uint8_t* out_ptr = data->mutable_data();
-
- using ::arrow::BitUtil::FromLittleEndian;
-
- for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
- // sign/zero extend int32_t values, otherwise a no-op
- const auto value = static_cast<int64_t>(values[i]);
-
- ::arrow::Decimal128 decimal(value);
- decimal.ToBytes(out_ptr);
- }
-
- if (reader->nullable_values()) {
- std::shared_ptr<ResizableBuffer> is_valid = reader->ReleaseIsValid();
- *out = std::make_shared<Decimal128Array>(type, length, std::move(data), is_valid,
- reader->null_count());
- } else {
- *out = std::make_shared<Decimal128Array>(type, length, std::move(data));
- }
- return Status::OK();
-}
-
-/// \brief Convert an arrow::BinaryArray to an arrow::Decimal{128,256}Array
-/// We do this by:
-/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
-/// 2. Allocating a buffer for the arrow::Decimal{128,256}Array
-/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
-/// representing the high and low bits of each decimal value.
-template <typename DecimalArrayType, typename ParquetType>
-Status TransferDecimal(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
- DCHECK(binary_reader);
- ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();
- for (size_t i = 0; i < chunks.size(); ++i) {
- std::shared_ptr<Array> chunk_as_decimal;
- auto fn = &DecimalConverter<DecimalArrayType, ParquetType>::ConvertToDecimal;
- RETURN_NOT_OK(fn(*chunks[i], type, pool, &chunk_as_decimal));
- // Replace the chunk, which will hopefully also free memory as we go
- chunks[i] = chunk_as_decimal;
- }
- *out = std::make_shared<ChunkedArray>(chunks, type);
- return Status::OK();
-}
-
-} // namespace
-
-#define TRANSFER_INT32(ENUM, ArrowType) \
- case ::arrow::Type::ENUM: { \
- Status s = TransferInt<ArrowType, Int32Type>(reader, pool, value_type, &result); \
- RETURN_NOT_OK(s); \
- } break;
-
-#define TRANSFER_INT64(ENUM, ArrowType) \
- case ::arrow::Type::ENUM: { \
- Status s = TransferInt<ArrowType, Int64Type>(reader, pool, value_type, &result); \
- RETURN_NOT_OK(s); \
- } break;
-
-Status TransferColumnData(RecordReader* reader, std::shared_ptr<DataType> value_type,
- const ColumnDescriptor* descr, MemoryPool* pool,
- std::shared_ptr<ChunkedArray>* out) {
- Datum result;
- std::shared_ptr<ChunkedArray> chunked_result;
- switch (value_type->id()) {
- case ::arrow::Type::DICTIONARY: {
- RETURN_NOT_OK(TransferDictionary(reader, value_type, &chunked_result));
- result = chunked_result;
- } break;
- case ::arrow::Type::NA: {
- result = std::make_shared<::arrow::NullArray>(reader->values_written());
- break;
- }
- case ::arrow::Type::INT32:
- case ::arrow::Type::INT64:
- case ::arrow::Type::FLOAT:
- case ::arrow::Type::DOUBLE:
- result = TransferZeroCopy(reader, value_type);
- break;
- case ::arrow::Type::BOOL:
- RETURN_NOT_OK(TransferBool(reader, pool, &result));
- break;
- TRANSFER_INT32(UINT8, ::arrow::UInt8Type);
- TRANSFER_INT32(INT8, ::arrow::Int8Type);
- TRANSFER_INT32(UINT16, ::arrow::UInt16Type);
- TRANSFER_INT32(INT16, ::arrow::Int16Type);
- TRANSFER_INT32(UINT32, ::arrow::UInt32Type);
- TRANSFER_INT64(UINT64, ::arrow::UInt64Type);
- TRANSFER_INT32(DATE32, ::arrow::Date32Type);
- TRANSFER_INT32(TIME32, ::arrow::Time32Type);
- TRANSFER_INT64(TIME64, ::arrow::Time64Type);
- case ::arrow::Type::DATE64:
- RETURN_NOT_OK(TransferDate64(reader, pool, value_type, &result));
- break;
- case ::arrow::Type::FIXED_SIZE_BINARY:
- case ::arrow::Type::BINARY:
- case ::arrow::Type::STRING:
- case ::arrow::Type::LARGE_BINARY:
- case ::arrow::Type::LARGE_STRING: {
- RETURN_NOT_OK(TransferBinary(reader, pool, value_type, &chunked_result));
- result = chunked_result;
- } break;
- case ::arrow::Type::DECIMAL128: {
- switch (descr->physical_type()) {
- case ::parquet::Type::INT32: {
- auto fn = DecimalIntegerTransfer<Int32Type>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::INT64: {
- auto fn = &DecimalIntegerTransfer<Int64Type>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal128Array, ByteArrayType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal128Array, FLBAType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- default:
- return Status::Invalid(
- "Physical type for decimal128 must be int32, int64, byte array, or fixed "
- "length binary");
- }
- } break;
- case ::arrow::Type::DECIMAL256:
- switch (descr->physical_type()) {
- case ::parquet::Type::BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal256Array, ByteArrayType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal256Array, FLBAType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- default:
- return Status::Invalid(
- "Physical type for decimal256 must be fixed length binary");
- }
- break;
-
- case ::arrow::Type::TIMESTAMP: {
- const ::arrow::TimestampType& timestamp_type =
- checked_cast<::arrow::TimestampType&>(*value_type);
- if (descr->physical_type() == ::parquet::Type::INT96) {
- RETURN_NOT_OK(
- TransferInt96(reader, pool, value_type, &result, timestamp_type.unit()));
- } else {
- switch (timestamp_type.unit()) {
- case ::arrow::TimeUnit::MILLI:
- case ::arrow::TimeUnit::MICRO:
- case ::arrow::TimeUnit::NANO:
- result = TransferZeroCopy(reader, value_type);
- break;
- default:
- return Status::NotImplemented("TimeUnit not supported");
- }
- }
- } break;
- default:
- return Status::NotImplemented("No support for reading columns of type ",
- value_type->ToString());
- }
-
- if (result.kind() == Datum::ARRAY) {
- *out = std::make_shared<ChunkedArray>(result.make_array());
- } else if (result.kind() == Datum::CHUNKED_ARRAY) {
- *out = result.chunked_array();
- } else {
- DCHECK(false) << "Should be impossible, result was " << result.ToString();
- }
-
- return Status::OK();
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/reader_internal.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/compute/api.h"
+#include "arrow/datum.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/arrow/reader.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/arrow/schema_internal.h"
+#include "parquet/column_reader.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
+#include "parquet/windows_compatibility.h"
+
+using arrow::Array;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::Datum;
+using arrow::Decimal128;
+using arrow::Decimal128Array;
+using arrow::Decimal128Type;
+using arrow::Decimal256;
+using arrow::Decimal256Array;
+using arrow::Decimal256Type;
+using arrow::Field;
+using arrow::Int32Array;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::StructArray;
+using arrow::Table;
+using arrow::TimestampArray;
+
+using ::arrow::BitUtil::FromBigEndian;
+using ::arrow::internal::checked_cast;
+using ::arrow::internal::checked_pointer_cast;
+using ::arrow::internal::SafeLeftShift;
+using ::arrow::util::SafeLoadAs;
+
+using parquet::internal::BinaryRecordReader;
+using parquet::internal::DictionaryRecordReader;
+using parquet::internal::RecordReader;
+using parquet::schema::GroupNode;
+using parquet::schema::Node;
+using parquet::schema::PrimitiveNode;
+using ParquetType = parquet::Type;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace arrow {
+namespace {
+
+template <typename ArrowType>
+using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+
+template <typename CType, typename StatisticsType>
+Status MakeMinMaxScalar(const StatisticsType& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ *min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
+ *max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
+ return Status::OK();
+}
+
+template <typename CType, typename StatisticsType>
+Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
+ std::shared_ptr<DataType> type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
+ ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
+ return Status::OK();
+}
+
+template <typename StatisticsType>
+Status MakeMinMaxIntegralScalar(const StatisticsType& statistics,
+ const ::arrow::DataType& arrow_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ const auto column_desc = statistics.descr();
+ const auto& logical_type = column_desc->logical_type();
+ const auto& integer = checked_pointer_cast<const IntLogicalType>(logical_type);
+ const bool is_signed = integer->is_signed();
+
+ switch (integer->bit_width()) {
+ case 8:
+ return is_signed ? MakeMinMaxScalar<int8_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint8_t>(statistics, min, max);
+ case 16:
+ return is_signed ? MakeMinMaxScalar<int16_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint16_t>(statistics, min, max);
+ case 32:
+ return is_signed ? MakeMinMaxScalar<int32_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint32_t>(statistics, min, max);
+ case 64:
+ return is_signed ? MakeMinMaxScalar<int64_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint64_t>(statistics, min, max);
+ }
+
+ return Status::OK();
+}
+
+static Status FromInt32Statistics(const Int32Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(auto type, FromInt32(logical_type));
+
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeMinMaxIntegralScalar(statistics, *type, min, max);
+ break;
+ case LogicalType::Type::DATE:
+ case LogicalType::Type::TIME:
+ case LogicalType::Type::NONE:
+ return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
+ break;
+ default:
+ break;
+ }
+
+ return Status::NotImplemented("Cannot extract statistics for type ");
+}
+
+static Status FromInt64Statistics(const Int64Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(auto type, FromInt64(logical_type));
+
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeMinMaxIntegralScalar(statistics, *type, min, max);
+ break;
+ case LogicalType::Type::TIME:
+ case LogicalType::Type::TIMESTAMP:
+ case LogicalType::Type::NONE:
+ return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
+ break;
+ default:
+ break;
+ }
+
+ return Status::NotImplemented("Cannot extract statistics for type ");
+}
+
+template <typename DecimalType>
+Result<std::shared_ptr<::arrow::Scalar>> FromBigEndianString(
+ const std::string& data, std::shared_ptr<DataType> arrow_type) {
+ ARROW_ASSIGN_OR_RAISE(
+ DecimalType decimal,
+ DecimalType::FromBigEndian(reinterpret_cast<const uint8_t*>(data.data()),
+ static_cast<int32_t>(data.size())));
+ return ::arrow::MakeScalar(std::move(arrow_type), decimal);
+}
+
+// Extracts Min and Max scalars from byte-like types (i.e. types where the
+// decimal is encoded as big-endian bytes).
+Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ const DecimalLogicalType& decimal_type =
+ checked_cast<const DecimalLogicalType&>(logical_type);
+
+ Result<std::shared_ptr<DataType>> maybe_type =
+ Decimal128Type::Make(decimal_type.precision(), decimal_type.scale());
+ std::shared_ptr<DataType> arrow_type;
+ if (maybe_type.ok()) {
+ arrow_type = maybe_type.ValueOrDie();
+ ARROW_ASSIGN_OR_RAISE(
+ *min, FromBigEndianString<Decimal128>(statistics.EncodeMin(), arrow_type));
+ ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal128>(statistics.EncodeMax(),
+ std::move(arrow_type)));
+ return Status::OK();
+ }
+ // Fallback to see if Decimal256 can represent the type.
+ ARROW_ASSIGN_OR_RAISE(
+ arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale()));
+ ARROW_ASSIGN_OR_RAISE(
+ *min, FromBigEndianString<Decimal256>(statistics.EncodeMin(), arrow_type));
+ ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal256>(statistics.EncodeMax(),
+ std::move(arrow_type)));
+
+ return Status::OK();
+}
+
+Status ByteArrayStatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ auto logical_type = statistics.descr()->logical_type();
+ if (logical_type->type() == LogicalType::Type::DECIMAL) {
+ return ExtractDecimalMinMaxFromBytesType(statistics, *logical_type, min, max);
+ }
+ std::shared_ptr<::arrow::DataType> type;
+ if (statistics.descr()->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
+ type = ::arrow::fixed_size_binary(statistics.descr()->type_length());
+ } else {
+ type = logical_type->type() == LogicalType::Type::STRING ? ::arrow::utf8()
+ : ::arrow::binary();
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ *min, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMin())));
+ ARROW_ASSIGN_OR_RAISE(
+ *max, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMax())));
+
+ return Status::OK();
+}
+
+} // namespace
+
+Status StatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ if (!statistics.HasMinMax()) {
+ return Status::Invalid("Statistics has no min max.");
+ }
+
+ auto column_desc = statistics.descr();
+ if (column_desc == nullptr) {
+ return Status::Invalid("Statistics carries no descriptor, can't infer arrow type.");
+ }
+
+ auto physical_type = column_desc->physical_type();
+ auto logical_type = column_desc->logical_type();
+ switch (physical_type) {
+ case Type::BOOLEAN:
+ return MakeMinMaxScalar<bool, BoolStatistics>(
+ checked_cast<const BoolStatistics&>(statistics), min, max);
+ case Type::FLOAT:
+ return MakeMinMaxScalar<float, FloatStatistics>(
+ checked_cast<const FloatStatistics&>(statistics), min, max);
+ case Type::DOUBLE:
+ return MakeMinMaxScalar<double, DoubleStatistics>(
+ checked_cast<const DoubleStatistics&>(statistics), min, max);
+ case Type::INT32:
+ return FromInt32Statistics(checked_cast<const Int32Statistics&>(statistics),
+ *logical_type, min, max);
+ case Type::INT64:
+ return FromInt64Statistics(checked_cast<const Int64Statistics&>(statistics),
+ *logical_type, min, max);
+ case Type::BYTE_ARRAY:
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return ByteArrayStatisticsAsScalars(statistics, min, max);
+ default:
+ return Status::NotImplemented("Extract statistics unsupported for physical_type ",
+ physical_type, " unsupported.");
+ }
+
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Primitive types
+
+namespace {
+
+template <typename ArrowType, typename ParquetType>
+Status TransferInt(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ using ArrowCType = typename ArrowType::c_type;
+ using ParquetCType = typename ParquetType::c_type;
+ int64_t length = reader->values_written();
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(ArrowCType), pool));
+
+ auto values = reinterpret_cast<const ParquetCType*>(reader->values());
+ auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data());
+ std::copy(values, values + length, out_ptr);
+ *out = std::make_shared<ArrayType<ArrowType>>(
+ type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+std::shared_ptr<Array> TransferZeroCopy(RecordReader* reader,
+ const std::shared_ptr<DataType>& type) {
+ std::vector<std::shared_ptr<Buffer>> buffers = {reader->ReleaseIsValid(),
+ reader->ReleaseValues()};
+ auto data = std::make_shared<::arrow::ArrayData>(type, reader->values_written(),
+ buffers, reader->null_count());
+ return ::arrow::MakeArray(data);
+}
+
+Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) {
+ int64_t length = reader->values_written();
+
+ const int64_t buffer_size = BitUtil::BytesForBits(length);
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, pool));
+
+ // Transfer boolean values to packed bitmap
+ auto values = reinterpret_cast<const bool*>(reader->values());
+ uint8_t* data_ptr = data->mutable_data();
+ memset(data_ptr, 0, buffer_size);
+
+ for (int64_t i = 0; i < length; i++) {
+ if (values[i]) {
+ ::arrow::BitUtil::SetBit(data_ptr, i);
+ }
+ }
+
+ *out = std::make_shared<BooleanArray>(length, std::move(data), reader->ReleaseIsValid(),
+ reader->null_count());
+ return Status::OK();
+}
+
+Status TransferInt96(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ int64_t length = reader->values_written();
+ auto values = reinterpret_cast<const Int96*>(reader->values());
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
+ auto data_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
+ for (int64_t i = 0; i < length; i++) {
+ if (values[i].value[2] == 0) {
+ // Happens for null entries: avoid triggering UBSAN as that Int96 timestamp
+ // isn't representable as a 64-bit Unix timestamp.
+ *data_ptr++ = 0;
+ } else {
+ switch (int96_arrow_time_unit) {
+ case ::arrow::TimeUnit::NANO:
+ *data_ptr++ = Int96GetNanoSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::MICRO:
+ *data_ptr++ = Int96GetMicroSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::MILLI:
+ *data_ptr++ = Int96GetMilliSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ *data_ptr++ = Int96GetSeconds(values[i]);
+ break;
+ }
+ }
+ }
+ *out = std::make_shared<TimestampArray>(type, length, std::move(data),
+ reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+Status TransferDate64(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ int64_t length = reader->values_written();
+ auto values = reinterpret_cast<const int32_t*>(reader->values());
+
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
+ auto out_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
+
+ for (int64_t i = 0; i < length; i++) {
+ *out_ptr++ = static_cast<int64_t>(values[i]) * kMillisecondsPerDay;
+ }
+
+ *out = std::make_shared<::arrow::Date64Array>(
+ type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Binary, direct to dictionary-encoded
+
+Status TransferDictionary(RecordReader* reader,
+ const std::shared_ptr<DataType>& logical_value_type,
+ std::shared_ptr<ChunkedArray>* out) {
+ auto dict_reader = dynamic_cast<DictionaryRecordReader*>(reader);
+ DCHECK(dict_reader);
+ *out = dict_reader->GetResult();
+ if (!logical_value_type->Equals(*(*out)->type())) {
+ ARROW_ASSIGN_OR_RAISE(*out, (*out)->View(logical_value_type));
+ }
+ return Status::OK();
+}
+
+Status TransferBinary(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& logical_value_type,
+ std::shared_ptr<ChunkedArray>* out) {
+ if (reader->read_dictionary()) {
+ return TransferDictionary(
+ reader, ::arrow::dictionary(::arrow::int32(), logical_value_type), out);
+ }
+ ::arrow::compute::ExecContext ctx(pool);
+ ::arrow::compute::CastOptions cast_options;
+ cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data
+
+ auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+ DCHECK(binary_reader);
+ auto chunks = binary_reader->GetBuilderChunks();
+ for (auto& chunk : chunks) {
+ if (!chunk->type()->Equals(*logical_value_type)) {
+ // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets
+ // will be lost because they are first created as int32 and then cast to int64.
+ ARROW_ASSIGN_OR_RAISE(
+ chunk, ::arrow::compute::Cast(*chunk, logical_value_type, cast_options, &ctx));
+ }
+ }
+ *out = std::make_shared<ChunkedArray>(chunks, logical_value_type);
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 || Decimal256
+
+template <typename DecimalType>
+Status RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width,
+ uint8_t* out_buf) {
+ ARROW_ASSIGN_OR_RAISE(DecimalType t, DecimalType::FromBigEndian(value, byte_width));
+ t.ToBytes(out_buf);
+ return ::arrow::Status::OK();
+}
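+
+// For example (a hedged illustration): the 2-byte big-endian input
+// {0x04, 0xD2} decodes to the decimal value 1234, which ToBytes() then writes
+// into out_buf as a 16-byte little-endian two's-complement integer.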
+
+template <typename DecimalArrayType>
+struct DecimalTypeTrait;
+
+template <>
+struct DecimalTypeTrait<::arrow::Decimal128Array> {
+ using value = ::arrow::Decimal128;
+};
+
+template <>
+struct DecimalTypeTrait<::arrow::Decimal256Array> {
+ using value = ::arrow::Decimal256;
+};
+
+template <typename DecimalArrayType, typename ParquetType>
+struct DecimalConverter {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>&,
+ MemoryPool* pool, std::shared_ptr<Array>*) {
+ return Status::NotImplemented("not implemented");
+ }
+};
+
+template <typename DecimalArrayType>
+struct DecimalConverter<DecimalArrayType, FLBAType> {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool, std::shared_ptr<Array>* out) {
+ const auto& fixed_size_binary_array =
+ checked_cast<const ::arrow::FixedSizeBinaryArray&>(array);
+
+ // The byte width of each decimal value
+ const int32_t type_length =
+ checked_cast<const ::arrow::DecimalType&>(*type).byte_width();
+
+ // number of elements in the entire array
+ const int64_t length = fixed_size_binary_array.length();
+
+ // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time
+ // this will be different from the decimal array width because we write the minimum
+ // number of bytes necessary to represent a given precision
+ const int32_t byte_width =
+ checked_cast<const ::arrow::FixedSizeBinaryType&>(*fixed_size_binary_array.type())
+ .byte_width();
+ // allocate memory for the decimal array
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+
+ // raw bytes that we can write to
+ uint8_t* out_ptr = data->mutable_data();
+
+ // convert each FixedSizeBinary value to valid decimal bytes
+ const int64_t null_count = fixed_size_binary_array.null_count();
+
+ using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
+ if (null_count > 0) {
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ if (!fixed_size_binary_array.IsNull(i)) {
+ RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
+ fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
+ } else {
+ std::memset(out_ptr, 0, type_length);
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
+ fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
+ }
+ }
+
+ *out = std::make_shared<DecimalArrayType>(
+ type, length, std::move(data), fixed_size_binary_array.null_bitmap(), null_count);
+
+ return Status::OK();
+ }
+};
+
+template <typename DecimalArrayType>
+struct DecimalConverter<DecimalArrayType, ByteArrayType> {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool, std::shared_ptr<Array>* out) {
+ const auto& binary_array = checked_cast<const ::arrow::BinaryArray&>(array);
+ const int64_t length = binary_array.length();
+
+ const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+
+ // raw bytes that we can write to
+ uint8_t* out_ptr = data->mutable_data();
+
+ const int64_t null_count = binary_array.null_count();
+
+ // convert each BinaryArray value to valid decimal bytes
+ for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
+ int32_t record_len = 0;
+ const uint8_t* record_loc = binary_array.GetValue(i, &record_len);
+
+ if (record_len < 0 || record_len > type_length) {
+ return Status::Invalid("Invalid BYTE_ARRAY length for ", type->ToString());
+ }
+
+ auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
+ out_ptr_view[0] = 0;
+ out_ptr_view[1] = 0;
+
+      // Convert only the rows that are not null when nulls are present;
+      // otherwise convert all rows.
+ if ((null_count > 0 && !binary_array.IsNull(i)) || null_count <= 0) {
+ using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
+ RETURN_NOT_OK(
+ RawBytesToDecimalBytes<DecimalType>(record_loc, record_len, out_ptr));
+ }
+ }
+ *out = std::make_shared<DecimalArrayType>(type, length, std::move(data),
+ binary_array.null_bitmap(), null_count);
+ return Status::OK();
+ }
+};
+
+/// \brief Convert an Int32 or Int64 array into a Decimal128Array
+/// The Parquet spec allows systems to write decimals as int32 or int64 when the
+/// values are small enough to fit in 4 bytes or 8 bytes, respectively.
+/// This function implements the conversion from int32 and int64 arrays to decimal arrays.
+template <
+ typename ParquetIntegerType,
+ typename = ::arrow::enable_if_t<std::is_same<ParquetIntegerType, Int32Type>::value ||
+ std::is_same<ParquetIntegerType, Int64Type>::value>>
+static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ // Decimal128 and Decimal256 are only Arrow constructs. Parquet does not
+ // specifically distinguish between decimal byte widths.
+ // Decimal256 isn't relevant here because the Arrow-Parquet C++ bindings never
+ // write Decimal values as integers, and if the decimal value can fit in an
+ // integer it is wasteful to use Decimal256. Put another way, the only
+ // way an integer column could be construed as Decimal256 is if an Arrow
+ // schema was stored as metadata in the file indicating the column was
+ // Decimal256. The current Arrow-Parquet C++ bindings will never do this.
+ DCHECK(type->id() == ::arrow::Type::DECIMAL128);
+
+ const int64_t length = reader->values_written();
+
+ using ElementType = typename ParquetIntegerType::c_type;
+ static_assert(std::is_same<ElementType, int32_t>::value ||
+ std::is_same<ElementType, int64_t>::value,
+ "ElementType must be int32_t or int64_t");
+
+ const auto values = reinterpret_cast<const ElementType*>(reader->values());
+
+ const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+ uint8_t* out_ptr = data->mutable_data();
+
+ using ::arrow::BitUtil::FromLittleEndian;
+
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ // sign/zero extend int32_t values, otherwise a no-op
+ const auto value = static_cast<int64_t>(values[i]);
+
+ ::arrow::Decimal128 decimal(value);
+ decimal.ToBytes(out_ptr);
+ }
+
+ if (reader->nullable_values()) {
+ std::shared_ptr<ResizableBuffer> is_valid = reader->ReleaseIsValid();
+ *out = std::make_shared<Decimal128Array>(type, length, std::move(data), is_valid,
+ reader->null_count());
+ } else {
+ *out = std::make_shared<Decimal128Array>(type, length, std::move(data));
+ }
+ return Status::OK();
+}
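+
+// A minimal sketch of the widening performed by the loop above (illustration
+// only): a raw physical value from a hypothetical INT32-backed decimal(9, 2)
+// column is sign-extended into the 16-byte Decimal128 representation.
+//
+//   int32_t raw = -12345;  // physical Parquet value, logically -123.45
+//   ::arrow::Decimal128 decimal(static_cast<int64_t>(raw));
+//   uint8_t bytes[16];
+//   decimal.ToBytes(bytes);  // the 16-byte payload written per element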
+
+/// \brief Convert an arrow::BinaryArray to an arrow::Decimal{128,256}Array
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::Decimal{128,256}Array
+/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
+/// representing the high and low bits of each decimal value.
+template <typename DecimalArrayType, typename ParquetType>
+Status TransferDecimal(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+ DCHECK(binary_reader);
+ ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();
+ for (size_t i = 0; i < chunks.size(); ++i) {
+ std::shared_ptr<Array> chunk_as_decimal;
+ auto fn = &DecimalConverter<DecimalArrayType, ParquetType>::ConvertToDecimal;
+ RETURN_NOT_OK(fn(*chunks[i], type, pool, &chunk_as_decimal));
+ // Replace the chunk, which will hopefully also free memory as we go
+ chunks[i] = chunk_as_decimal;
+ }
+ *out = std::make_shared<ChunkedArray>(chunks, type);
+ return Status::OK();
+}
+
+} // namespace
+
+#define TRANSFER_INT32(ENUM, ArrowType) \
+ case ::arrow::Type::ENUM: { \
+ Status s = TransferInt<ArrowType, Int32Type>(reader, pool, value_type, &result); \
+ RETURN_NOT_OK(s); \
+ } break;
+
+#define TRANSFER_INT64(ENUM, ArrowType) \
+ case ::arrow::Type::ENUM: { \
+ Status s = TransferInt<ArrowType, Int64Type>(reader, pool, value_type, &result); \
+ RETURN_NOT_OK(s); \
+ } break;
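+
+// For reference, TRANSFER_INT32(UINT8, ::arrow::UInt8Type) expands to:
+//
+//   case ::arrow::Type::UINT8: {
+//     Status s =
+//         TransferInt<::arrow::UInt8Type, Int32Type>(reader, pool, value_type, &result);
+//     RETURN_NOT_OK(s);
+//   } break;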
+
+Status TransferColumnData(RecordReader* reader, std::shared_ptr<DataType> value_type,
+ const ColumnDescriptor* descr, MemoryPool* pool,
+ std::shared_ptr<ChunkedArray>* out) {
+ Datum result;
+ std::shared_ptr<ChunkedArray> chunked_result;
+ switch (value_type->id()) {
+ case ::arrow::Type::DICTIONARY: {
+ RETURN_NOT_OK(TransferDictionary(reader, value_type, &chunked_result));
+ result = chunked_result;
+ } break;
+ case ::arrow::Type::NA: {
+ result = std::make_shared<::arrow::NullArray>(reader->values_written());
+ break;
+ }
+ case ::arrow::Type::INT32:
+ case ::arrow::Type::INT64:
+ case ::arrow::Type::FLOAT:
+ case ::arrow::Type::DOUBLE:
+ result = TransferZeroCopy(reader, value_type);
+ break;
+ case ::arrow::Type::BOOL:
+ RETURN_NOT_OK(TransferBool(reader, pool, &result));
+ break;
+ TRANSFER_INT32(UINT8, ::arrow::UInt8Type);
+ TRANSFER_INT32(INT8, ::arrow::Int8Type);
+ TRANSFER_INT32(UINT16, ::arrow::UInt16Type);
+ TRANSFER_INT32(INT16, ::arrow::Int16Type);
+ TRANSFER_INT32(UINT32, ::arrow::UInt32Type);
+ TRANSFER_INT64(UINT64, ::arrow::UInt64Type);
+ TRANSFER_INT32(DATE32, ::arrow::Date32Type);
+ TRANSFER_INT32(TIME32, ::arrow::Time32Type);
+ TRANSFER_INT64(TIME64, ::arrow::Time64Type);
+ case ::arrow::Type::DATE64:
+ RETURN_NOT_OK(TransferDate64(reader, pool, value_type, &result));
+ break;
+ case ::arrow::Type::FIXED_SIZE_BINARY:
+ case ::arrow::Type::BINARY:
+ case ::arrow::Type::STRING:
+ case ::arrow::Type::LARGE_BINARY:
+ case ::arrow::Type::LARGE_STRING: {
+ RETURN_NOT_OK(TransferBinary(reader, pool, value_type, &chunked_result));
+ result = chunked_result;
+ } break;
+ case ::arrow::Type::DECIMAL128: {
+ switch (descr->physical_type()) {
+ case ::parquet::Type::INT32: {
+ auto fn = &DecimalIntegerTransfer<Int32Type>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::INT64: {
+ auto fn = &DecimalIntegerTransfer<Int64Type>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal128Array, ByteArrayType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal128Array, FLBAType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ default:
+ return Status::Invalid(
+ "Physical type for decimal128 must be int32, int64, byte array, or fixed "
+ "length binary");
+ }
+ } break;
+ case ::arrow::Type::DECIMAL256:
+ switch (descr->physical_type()) {
+ case ::parquet::Type::BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal256Array, ByteArrayType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal256Array, FLBAType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ default:
+ return Status::Invalid(
+ "Physical type for decimal256 must be fixed length binary");
+ }
+ break;
+
+ case ::arrow::Type::TIMESTAMP: {
+ const ::arrow::TimestampType& timestamp_type =
+ checked_cast<::arrow::TimestampType&>(*value_type);
+ if (descr->physical_type() == ::parquet::Type::INT96) {
+ RETURN_NOT_OK(
+ TransferInt96(reader, pool, value_type, &result, timestamp_type.unit()));
+ } else {
+ switch (timestamp_type.unit()) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ case ::arrow::TimeUnit::NANO:
+ result = TransferZeroCopy(reader, value_type);
+ break;
+ default:
+ return Status::NotImplemented("TimeUnit not supported");
+ }
+ }
+ } break;
+ default:
+ return Status::NotImplemented("No support for reading columns of type ",
+ value_type->ToString());
+ }
+
+ if (result.kind() == Datum::ARRAY) {
+ *out = std::make_shared<ChunkedArray>(result.make_array());
+ } else if (result.kind() == Datum::CHUNKED_ARRAY) {
+ *out = result.chunked_array();
+ } else {
+ DCHECK(false) << "Should be impossible, result was " << result.ToString();
+ }
+
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
index ad0b781576f..cd54e499aa5 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
@@ -1,122 +1,122 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <deque>
-#include <functional>
-#include <memory>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "parquet/arrow/schema.h"
-#include "parquet/column_reader.h"
-#include "parquet/file_reader.h"
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-namespace arrow {
-
-class Array;
-class ChunkedArray;
-class DataType;
-class Field;
-class KeyValueMetadata;
-class Schema;
-
-} // namespace arrow
-
-using arrow::Status;
-
-namespace parquet {
-
-class ArrowReaderProperties;
-
-namespace arrow {
-
-class ColumnReaderImpl;
-
-// ----------------------------------------------------------------------
-// Iteration utilities
-
-// Abstraction to decouple row group iteration details from the ColumnReader,
-// so we can read only a single row group if we want
-class FileColumnIterator {
- public:
- explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
- std::vector<int> row_groups)
- : column_index_(column_index),
- reader_(reader),
- schema_(reader->metadata()->schema()),
- row_groups_(row_groups.begin(), row_groups.end()) {}
-
- virtual ~FileColumnIterator() {}
-
- std::unique_ptr<::parquet::PageReader> NextChunk() {
- if (row_groups_.empty()) {
- return nullptr;
- }
-
- auto row_group_reader = reader_->RowGroup(row_groups_.front());
- row_groups_.pop_front();
- return row_group_reader->GetColumnPageReader(column_index_);
- }
-
- const SchemaDescriptor* schema() const { return schema_; }
-
- const ColumnDescriptor* descr() const { return schema_->Column(column_index_); }
-
- std::shared_ptr<FileMetaData> metadata() const { return reader_->metadata(); }
-
- int column_index() const { return column_index_; }
-
- protected:
- int column_index_;
- ParquetFileReader* reader_;
- const SchemaDescriptor* schema_;
- std::deque<int> row_groups_;
-};
-
-using FileColumnIteratorFactory =
- std::function<FileColumnIterator*(int, ParquetFileReader*)>;
-
-Status TransferColumnData(::parquet::internal::RecordReader* reader,
- std::shared_ptr<::arrow::DataType> value_type,
- const ColumnDescriptor* descr, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::ChunkedArray>* out);
-
-struct ReaderContext {
- ParquetFileReader* reader;
- ::arrow::MemoryPool* pool;
- FileColumnIteratorFactory iterator_factory;
- bool filter_leaves;
- std::shared_ptr<std::unordered_set<int>> included_leaves;
-
- bool IncludesLeaf(int leaf_index) const {
- if (this->filter_leaves) {
- return this->included_leaves->find(leaf_index) != this->included_leaves->end();
- }
- return true;
- }
-};
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "parquet/arrow/schema.h"
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class DataType;
+class Field;
+class KeyValueMetadata;
+class Schema;
+
+} // namespace arrow
+
+using arrow::Status;
+
+namespace parquet {
+
+class ArrowReaderProperties;
+
+namespace arrow {
+
+class ColumnReaderImpl;
+
+// ----------------------------------------------------------------------
+// Iteration utilities
+
+// Abstraction to decouple row group iteration details from the ColumnReader,
+// so we can read only a single row group if we want
+class FileColumnIterator {
+ public:
+ explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
+ std::vector<int> row_groups)
+ : column_index_(column_index),
+ reader_(reader),
+ schema_(reader->metadata()->schema()),
+ row_groups_(row_groups.begin(), row_groups.end()) {}
+
+ virtual ~FileColumnIterator() {}
+
+ std::unique_ptr<::parquet::PageReader> NextChunk() {
+ if (row_groups_.empty()) {
+ return nullptr;
+ }
+
+ auto row_group_reader = reader_->RowGroup(row_groups_.front());
+ row_groups_.pop_front();
+ return row_group_reader->GetColumnPageReader(column_index_);
+ }
+
+ const SchemaDescriptor* schema() const { return schema_; }
+
+ const ColumnDescriptor* descr() const { return schema_->Column(column_index_); }
+
+ std::shared_ptr<FileMetaData> metadata() const { return reader_->metadata(); }
+
+ int column_index() const { return column_index_; }
+
+ protected:
+ int column_index_;
+ ParquetFileReader* reader_;
+ const SchemaDescriptor* schema_;
+ std::deque<int> row_groups_;
+};
+
+using FileColumnIteratorFactory =
+ std::function<FileColumnIterator*(int, ParquetFileReader*)>;
+
+Status TransferColumnData(::parquet::internal::RecordReader* reader,
+ std::shared_ptr<::arrow::DataType> value_type,
+ const ColumnDescriptor* descr, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::ChunkedArray>* out);
+
+struct ReaderContext {
+ ParquetFileReader* reader;
+ ::arrow::MemoryPool* pool;
+ FileColumnIteratorFactory iterator_factory;
+ bool filter_leaves;
+ std::shared_ptr<std::unordered_set<int>> included_leaves;
+
+ bool IncludesLeaf(int leaf_index) const {
+ if (this->filter_leaves) {
+ return this->included_leaves->find(leaf_index) != this->included_leaves->end();
+ }
+ return true;
+ }
+};
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
index eb7fd628dfc..454b0e2289a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
@@ -1,1087 +1,1087 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/schema.h"
-
-#include <functional>
-#include <string>
-#include <vector>
-
-#include "arrow/extension_type.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/api.h"
-#include "arrow/result_internal.h"
-#include "arrow/type.h"
-#include "arrow/util/base64.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/value_parsing.h"
-
-#include "parquet/arrow/schema_internal.h"
-#include "parquet/exception.h"
-#include "parquet/properties.h"
-#include "parquet/types.h"
-
-using arrow::DecimalType;
-using arrow::Field;
-using arrow::FieldVector;
-using arrow::KeyValueMetadata;
-using arrow::Status;
-using arrow::internal::checked_cast;
-
-using ArrowType = arrow::DataType;
-using ArrowTypeId = arrow::Type;
-
-using parquet::Repetition;
-using parquet::schema::GroupNode;
-using parquet::schema::Node;
-using parquet::schema::NodePtr;
-using parquet::schema::PrimitiveNode;
-
-using ParquetType = parquet::Type;
-using parquet::ConvertedType;
-using parquet::LogicalType;
-
-using parquet::internal::LevelInfo;
-
-namespace parquet {
-
-namespace arrow {
-
-// ----------------------------------------------------------------------
-// Parquet to Arrow schema conversion
-
-namespace {
-
-Repetition::type RepetitionFromNullable(bool is_nullable) {
- return is_nullable ? Repetition::OPTIONAL : Repetition::REQUIRED;
-}
-
-Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out);
-
-Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type,
- const std::string& name, bool nullable,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- NodePtr element;
- std::string value_name =
- arrow_properties.compliant_nested_types() ? "element" : type->value_field()->name();
- RETURN_NOT_OK(FieldToNode(value_name, type->value_field(), properties, arrow_properties,
- &element));
-
- NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element});
- *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {list},
- LogicalType::List());
- return Status::OK();
-}
-
-Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::string& name,
- bool nullable, const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- // TODO: Should we offer a non-compliant mode that forwards the type names?
- NodePtr key_node;
- RETURN_NOT_OK(
- FieldToNode("key", type->key_field(), properties, arrow_properties, &key_node));
-
- NodePtr value_node;
- RETURN_NOT_OK(FieldToNode("value", type->item_field(), properties, arrow_properties,
- &value_node));
-
- NodePtr key_value =
- GroupNode::Make("key_value", Repetition::REPEATED, {key_node, value_node});
- *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {key_value},
- LogicalType::Map());
- return Status::OK();
-}
-
-Status StructToNode(const std::shared_ptr<::arrow::StructType>& type,
- const std::string& name, bool nullable,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- std::vector<NodePtr> children(type->num_fields());
- if (type->num_fields() != 0) {
- for (int i = 0; i < type->num_fields(); i++) {
- RETURN_NOT_OK(FieldToNode(type->field(i)->name(), type->field(i), properties,
- arrow_properties, &children[i]));
- }
- } else {
- // XXX (ARROW-10928) We could add a dummy primitive node but that would
- // require special handling when writing and reading, to avoid column index
- // mismatches.
- return Status::NotImplemented("Cannot write struct type '", name,
- "' with no child field to Parquet. "
- "Consider adding a dummy child field.");
- }
-
- *out = GroupNode::Make(name, RepetitionFromNullable(nullable), std::move(children));
- return Status::OK();
-}
-
-static std::shared_ptr<const LogicalType> TimestampLogicalTypeFromArrowTimestamp(
- const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) {
- const bool utc = !(timestamp_type.timezone().empty());
- // ARROW-5878(wesm): for forward compatibility reasons, and because
- // there's no other way to signal to old readers that values are
- // timestamps, we force the ConvertedType field to be set to the
- // corresponding TIMESTAMP_* value. This does cause some ambiguity
- // as Parquet readers have not been consistent about the
- // interpretation of TIMESTAMP_* values as being UTC-normalized.
- switch (time_unit) {
- case ::arrow::TimeUnit::MILLI:
- return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MILLIS,
- /*is_from_converted_type=*/false,
- /*force_set_converted_type=*/true);
- case ::arrow::TimeUnit::MICRO:
- return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MICROS,
- /*is_from_converted_type=*/false,
- /*force_set_converted_type=*/true);
- case ::arrow::TimeUnit::NANO:
- return LogicalType::Timestamp(utc, LogicalType::TimeUnit::NANOS);
- case ::arrow::TimeUnit::SECOND:
- // No equivalent parquet logical type.
- break;
- }
- return LogicalType::None();
-}
-
-static Status GetTimestampMetadata(const ::arrow::TimestampType& type,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- ParquetType::type* physical_type,
- std::shared_ptr<const LogicalType>* logical_type) {
- const bool coerce = arrow_properties.coerce_timestamps_enabled();
- const auto target_unit =
- coerce ? arrow_properties.coerce_timestamps_unit() : type.unit();
-
- // The user is explicitly asking for Impala int96 encoding; there is no
- // logical type.
- if (arrow_properties.support_deprecated_int96_timestamps()) {
- *physical_type = ParquetType::INT96;
- return Status::OK();
- }
-
- *physical_type = ParquetType::INT64;
- *logical_type = TimestampLogicalTypeFromArrowTimestamp(type, target_unit);
-
- // The user is explicitly asking for timestamp data to be converted to the
- // specified units (target_unit).
- if (coerce) {
- if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
- switch (target_unit) {
- case ::arrow::TimeUnit::MILLI:
- case ::arrow::TimeUnit::MICRO:
- break;
- case ::arrow::TimeUnit::NANO:
- case ::arrow::TimeUnit::SECOND:
- return Status::NotImplemented(
- "For Parquet version 1.0 files, can only coerce Arrow timestamps to "
- "milliseconds or microseconds");
- }
- } else {
- switch (target_unit) {
- case ::arrow::TimeUnit::MILLI:
- case ::arrow::TimeUnit::MICRO:
- case ::arrow::TimeUnit::NANO:
- break;
- case ::arrow::TimeUnit::SECOND:
- return Status::NotImplemented(
- "For Parquet files, can only coerce Arrow timestamps to milliseconds, "
- "microseconds, or nanoseconds");
- }
- }
- return Status::OK();
- }
-
- // The user implicitly wants timestamp data to retain its original time units;
- // however, the ConvertedType field used to indicate logical types for Parquet
- // version 1.0 fields does not allow for nanosecond time units, so nanoseconds
- // must be coerced to microseconds.
- if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0 &&
- type.unit() == ::arrow::TimeUnit::NANO) {
- *logical_type =
- TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MICRO);
- return Status::OK();
- }
-
- // The user implicitly wants timestamp data to retain its original time units;
- // however, the Arrow seconds time unit cannot be represented (annotated) in
- // any version of Parquet and so must be coerced to milliseconds.
- if (type.unit() == ::arrow::TimeUnit::SECOND) {
- *logical_type =
- TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MILLI);
- return Status::OK();
- }
-
- return Status::OK();
-}
-
-static constexpr char FIELD_ID_KEY[] = "PARQUET:field_id";
-
-std::shared_ptr<::arrow::KeyValueMetadata> FieldIdMetadata(int field_id) {
- if (field_id >= 0) {
- return ::arrow::key_value_metadata({FIELD_ID_KEY}, {std::to_string(field_id)});
- } else {
- return nullptr;
- }
-}
-
-int FieldIdFromMetadata(
- const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
- if (!metadata) {
- return -1;
- }
- int key = metadata->FindKey(FIELD_ID_KEY);
- if (key < 0) {
- return -1;
- }
- std::string field_id_str = metadata->value(key);
- int field_id;
- if (::arrow::internal::ParseValue<::arrow::Int32Type>(
- field_id_str.c_str(), field_id_str.length(), &field_id)) {
- if (field_id < 0) {
- // Thrift should convert any negative value to null, but normalize to -1 here
- // in case later logic checks for it.
- return -1;
- }
- return field_id;
- } else {
- return -1;
- }
-}
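-
-// For example, field metadata of {"PARQUET:field_id": "7"} yields 7, while a
-// missing key, an unparsable value, or a negative value all yield -1.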
-
-Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- std::shared_ptr<const LogicalType> logical_type = LogicalType::None();
- ParquetType::type type;
- Repetition::type repetition = RepetitionFromNullable(field->nullable());
-
- int length = -1;
- int precision = -1;
- int scale = -1;
-
- switch (field->type()->id()) {
- case ArrowTypeId::NA: {
- type = ParquetType::INT32;
- logical_type = LogicalType::Null();
- if (repetition != Repetition::OPTIONAL) {
- return Status::Invalid("NullType Arrow field must be nullable");
- }
- } break;
- case ArrowTypeId::BOOL:
- type = ParquetType::BOOLEAN;
- break;
- case ArrowTypeId::UINT8:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(8, false);
- break;
- case ArrowTypeId::INT8:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(8, true);
- break;
- case ArrowTypeId::UINT16:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(16, false);
- break;
- case ArrowTypeId::INT16:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(16, true);
- break;
- case ArrowTypeId::UINT32:
- if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
- type = ParquetType::INT64;
- } else {
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(32, false);
- }
- break;
- case ArrowTypeId::INT32:
- type = ParquetType::INT32;
- break;
- case ArrowTypeId::UINT64:
- type = ParquetType::INT64;
- logical_type = LogicalType::Int(64, false);
- break;
- case ArrowTypeId::INT64:
- type = ParquetType::INT64;
- break;
- case ArrowTypeId::FLOAT:
- type = ParquetType::FLOAT;
- break;
- case ArrowTypeId::DOUBLE:
- type = ParquetType::DOUBLE;
- break;
- case ArrowTypeId::LARGE_STRING:
- case ArrowTypeId::STRING:
- type = ParquetType::BYTE_ARRAY;
- logical_type = LogicalType::String();
- break;
- case ArrowTypeId::LARGE_BINARY:
- case ArrowTypeId::BINARY:
- type = ParquetType::BYTE_ARRAY;
- break;
- case ArrowTypeId::FIXED_SIZE_BINARY: {
- type = ParquetType::FIXED_LEN_BYTE_ARRAY;
- const auto& fixed_size_binary_type =
- static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
- length = fixed_size_binary_type.byte_width();
- } break;
- case ArrowTypeId::DECIMAL128:
- case ArrowTypeId::DECIMAL256: {
- type = ParquetType::FIXED_LEN_BYTE_ARRAY;
- const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
- precision = decimal_type.precision();
- scale = decimal_type.scale();
- length = DecimalType::DecimalSize(precision);
- PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
- } break;
- case ArrowTypeId::DATE32:
- type = ParquetType::INT32;
- logical_type = LogicalType::Date();
- break;
- case ArrowTypeId::DATE64:
- type = ParquetType::INT32;
- logical_type = LogicalType::Date();
- break;
- case ArrowTypeId::TIMESTAMP:
- RETURN_NOT_OK(
- GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
- properties, arrow_properties, &type, &logical_type));
- break;
- case ArrowTypeId::TIME32:
- type = ParquetType::INT32;
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MILLIS);
- break;
- case ArrowTypeId::TIME64: {
- type = ParquetType::INT64;
- auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
- if (time_type->unit() == ::arrow::TimeUnit::NANO) {
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::NANOS);
- } else {
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MICROS);
- }
- } break;
- case ArrowTypeId::STRUCT: {
- auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
- return StructToNode(struct_type, name, field->nullable(), properties,
- arrow_properties, out);
- }
- case ArrowTypeId::FIXED_SIZE_LIST:
- case ArrowTypeId::LARGE_LIST:
- case ArrowTypeId::LIST: {
- auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type());
- return ListToNode(list_type, name, field->nullable(), properties, arrow_properties,
- out);
- }
- case ArrowTypeId::DICTIONARY: {
- // Parquet has no Dictionary type, dictionary-encoded is handled on
- // the encoding, not the schema level.
- const ::arrow::DictionaryType& dict_type =
- static_cast<const ::arrow::DictionaryType&>(*field->type());
- std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field(
- name, dict_type.value_type(), field->nullable(), field->metadata());
- return FieldToNode(name, unpacked_field, properties, arrow_properties, out);
- }
- case ArrowTypeId::EXTENSION: {
- auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
- std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
- name, ext_type->storage_type(), field->nullable(), field->metadata());
- return FieldToNode(name, storage_field, properties, arrow_properties, out);
- }
- case ArrowTypeId::MAP: {
- auto map_type = std::static_pointer_cast<::arrow::MapType>(field->type());
- return MapToNode(map_type, name, field->nullable(), properties, arrow_properties,
- out);
- }
-
- default: {
- // TODO: DENSE_UNION, SPARSE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR
- return Status::NotImplemented(
- "Unhandled type for Arrow to Parquet schema conversion: ",
- field->type()->ToString());
- }
- }
-
- int field_id = FieldIdFromMetadata(field->metadata());
- PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(name, repetition, logical_type, type,
- length, field_id));
-
- return Status::OK();
-}
-
-struct SchemaTreeContext {
- SchemaManifest* manifest;
- ArrowReaderProperties properties;
- const SchemaDescriptor* schema;
-
- void LinkParent(const SchemaField* child, const SchemaField* parent) {
- manifest->child_to_parent[child] = parent;
- }
-
- void RecordLeaf(const SchemaField* leaf) {
- manifest->column_index_to_field[leaf->column_index] = leaf;
- }
-};
-
-bool IsDictionaryReadSupported(const ArrowType& type) {
- // Only supported currently for BYTE_ARRAY types
- return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING;
-}
-
-// ----------------------------------------------------------------------
-// Schema logic
-
-::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
- int column_index, const schema::PrimitiveNode& primitive_node,
- SchemaTreeContext* ctx) {
- ASSIGN_OR_RAISE(
- std::shared_ptr<ArrowType> storage_type,
- GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit()));
- if (ctx->properties.read_dictionary(column_index) &&
- IsDictionaryReadSupported(*storage_type)) {
- return ::arrow::dictionary(::arrow::int32(), storage_type);
- }
- return storage_type;
-}
-
-Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out);
-
-Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out);
-
-Status PopulateLeaf(int column_index, const std::shared_ptr<Field>& field,
- LevelInfo current_levels, SchemaTreeContext* ctx,
- const SchemaField* parent, SchemaField* out) {
- out->field = field;
- out->column_index = column_index;
- out->level_info = current_levels;
- ctx->RecordLeaf(out);
- ctx->LinkParent(out, parent);
- return Status::OK();
-}
-
-// Special case mentioned in the format spec:
-// If the name is array or ends in _tuple, this should be a list of struct
-// even for single child elements.
-bool HasStructListName(const GroupNode& node) {
- ::arrow::util::string_view name{node.name()};
- return name == "array" || name.ends_with("_tuple");
-}
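-
-// For example, "array" and "point_tuple" are treated as lists of structs,
-// while a name like "points" is not.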
-
-Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- std::vector<std::shared_ptr<Field>> arrow_fields;
- out->children.resize(node.field_count());
- // All level increments for the node are expected to happen by callers.
- // This is required because repeated elements need to have their own
- // SchemaField.
-
- for (int i = 0; i < node.field_count(); i++) {
- RETURN_NOT_OK(
- NodeToSchemaField(*node.field(i), current_levels, ctx, out, &out->children[i]));
- arrow_fields.push_back(out->children[i].field);
- }
- auto struct_type = ::arrow::struct_(arrow_fields);
- out->field = ::arrow::field(node.name(), struct_type, node.is_optional(),
- FieldIdMetadata(node.field_id()));
- out->level_info = current_levels;
- return Status::OK();
-}
-
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out);
-
-Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- if (group.field_count() != 1) {
- return Status::Invalid("MAP-annotated groups must have a single child.");
- }
- if (group.is_repeated()) {
- return Status::Invalid("MAP-annotated groups must not be repeated.");
- }
-
- const Node& key_value_node = *group.field(0);
-
- if (!key_value_node.is_repeated()) {
- return Status::Invalid(
- "Non-repeated key value in a MAP-annotated group are not supported.");
- }
-
- if (!key_value_node.is_group()) {
- return Status::Invalid("Key-value node must be a group.");
- }
-
- const GroupNode& key_value = checked_cast<const GroupNode&>(key_value_node);
- if (key_value.field_count() != 1 && key_value.field_count() != 2) {
- return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
- key_value.field_count());
- }
- const Node& key_node = *key_value.field(0);
- if (!key_node.is_required()) {
- return Status::Invalid("Map keys must be annotated as required.");
- }
- // Arrow doesn't support one-column maps (i.e. sets). The options are to either
- // make the values column nullable, or process the map as a list. We choose the latter
- // as it is simpler.
- if (key_value.field_count() == 1) {
- return ListToSchemaField(group, current_levels, ctx, parent, out);
- }
-
- current_levels.Increment(group);
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
-
- out->children.resize(1);
- SchemaField* key_value_field = &out->children[0];
-
- key_value_field->children.resize(2);
- SchemaField* key_field = &key_value_field->children[0];
- SchemaField* value_field = &key_value_field->children[1];
-
- ctx->LinkParent(out, parent);
- ctx->LinkParent(key_value_field, out);
- ctx->LinkParent(key_field, key_value_field);
- ctx->LinkParent(value_field, key_value_field);
-
- // required/optional group name=whatever {
- // repeated group name=key_values{
- // required TYPE key;
- // required/optional TYPE value;
- // }
- // }
- //
-
- RETURN_NOT_OK(NodeToSchemaField(*key_value.field(0), current_levels, ctx,
- key_value_field, key_field));
- RETURN_NOT_OK(NodeToSchemaField(*key_value.field(1), current_levels, ctx,
- key_value_field, value_field));
-
- key_value_field->field = ::arrow::field(
- group.name(), ::arrow::struct_({key_field->field, value_field->field}),
- /*nullable=*/false, FieldIdMetadata(key_value.field_id()));
- key_value_field->level_info = current_levels;
-
- out->field = ::arrow::field(group.name(),
- ::arrow::map(key_field->field->type(), value_field->field),
- group.is_optional(), FieldIdMetadata(group.field_id()));
- out->level_info = current_levels;
- // At this point current levels contains the def level for this list,
- // we need to reset to the prior parent.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
-}
-
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- if (group.field_count() != 1) {
- return Status::Invalid("LIST-annotated groups must have a single child.");
- }
- if (group.is_repeated()) {
- return Status::Invalid("LIST-annotated groups must not be repeated.");
- }
- current_levels.Increment(group);
-
- out->children.resize(group.field_count());
- SchemaField* child_field = &out->children[0];
-
- ctx->LinkParent(out, parent);
- ctx->LinkParent(child_field, out);
-
- const Node& list_node = *group.field(0);
-
- if (!list_node.is_repeated()) {
- return Status::Invalid(
- "Non-repeated nodes in a LIST-annotated group are not supported.");
- }
-
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
- if (list_node.is_group()) {
- // Resolve 3-level encoding
- //
- // required/optional group name=whatever {
- // repeated group name=list {
- // required/optional TYPE item;
- // }
- // }
- //
- // yields list<item: TYPE ?nullable> ?nullable
- //
- // We distinguish the special case that we have
- //
- // required/optional group name=whatever {
- // repeated group name=array or $SOMETHING_tuple {
- // required/optional TYPE item;
- // }
- // }
- //
- // In this latter case, the inner type of the list should be a struct
- // rather than a primitive value
- //
- // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
- const auto& list_group = static_cast<const GroupNode&>(list_node);
- // Special case mentioned in the format spec:
- // If the name is array or ends in _tuple, this should be a list of struct
- // even for single child elements.
- if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
- // List of primitive type
- RETURN_NOT_OK(
- NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field));
- } else {
- RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
- }
- } else {
- // Two-level list encoding
- //
- // required/optional group LIST {
- // repeated TYPE;
- // }
- const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
- int column_index = ctx->schema->GetColumnIndex(primitive_node);
- ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
- GetTypeForNode(column_index, primitive_node, ctx));
- auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
- FieldIdMetadata(list_node.field_id()));
- RETURN_NOT_OK(
- PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
- }
- out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
- group.is_optional(), FieldIdMetadata(group.field_id()));
- out->level_info = current_levels;
- // At this point current levels contains the def level for this list,
- // we need to reset to the prior parent.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
-}
-
-Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- if (node.logical_type()->is_list()) {
- return ListToSchemaField(node, current_levels, ctx, parent, out);
- } else if (node.logical_type()->is_map()) {
- return MapToSchemaField(node, current_levels, ctx, parent, out);
- }
- std::shared_ptr<ArrowType> type;
- if (node.is_repeated()) {
- // Simple repeated struct
- //
- // repeated group $NAME {
- // r/o TYPE[0] f0
- // r/o TYPE[1] f1
- // }
- out->children.resize(1);
-
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
- RETURN_NOT_OK(GroupToStruct(node, current_levels, ctx, out, &out->children[0]));
- out->field = ::arrow::field(node.name(), ::arrow::list(out->children[0].field),
- /*nullable=*/false, FieldIdMetadata(node.field_id()));
-
- ctx->LinkParent(&out->children[0], out);
- out->level_info = current_levels;
- // At this point current_levels contains this list's def level, so we need to
- // use the previous ancestor of this list.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
- } else {
- current_levels.Increment(node);
- return GroupToStruct(node, current_levels, ctx, parent, out);
- }
-}
-
-Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- // Workhorse function for converting a Parquet schema node to an Arrow
- // type. Handles different conventions for nested data.
-
- ctx->LinkParent(out, parent);
-
- // Now, walk the schema and create a ColumnDescriptor for each leaf node
- if (node.is_group()) {
- // A nested field, but we don't know what kind yet
- return GroupToSchemaField(static_cast<const GroupNode&>(node), current_levels, ctx,
- parent, out);
- } else {
- // Either a normal flat primitive type, or a list type encoded with 1-level
- // list encoding. Note that the 3-level encoding is the form recommended by
- // the parquet specification, but technically we can have either
- //
- // required/optional $TYPE $FIELD_NAME
- //
- // or
- //
- // repeated $TYPE $FIELD_NAME
- const auto& primitive_node = static_cast<const PrimitiveNode&>(node);
- int column_index = ctx->schema->GetColumnIndex(primitive_node);
- ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
- GetTypeForNode(column_index, primitive_node, ctx));
- if (node.is_repeated()) {
- // One-level list encoding, e.g.
- // a: repeated int32;
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
- out->children.resize(1);
- auto child_field = ::arrow::field(node.name(), type, /*nullable=*/false);
- RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels, ctx, out,
- &out->children[0]));
-
- out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
- /*nullable=*/false, FieldIdMetadata(node.field_id()));
- out->level_info = current_levels;
- // At this point current_levels has considered this list the ancestor, so
- // restore the actual ancestor.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
- } else {
- current_levels.Increment(node);
- // A normal (required/optional) primitive node
- return PopulateLeaf(column_index,
- ::arrow::field(node.name(), type, node.is_optional(),
- FieldIdMetadata(node.field_id())),
- current_levels, ctx, parent, out);
- }
- }
-}
-
-// Get the original Arrow schema, as serialized in the Parquet metadata
-Status GetOriginSchema(const std::shared_ptr<const KeyValueMetadata>& metadata,
- std::shared_ptr<const KeyValueMetadata>* clean_metadata,
- std::shared_ptr<::arrow::Schema>* out) {
- if (metadata == nullptr) {
- *out = nullptr;
- *clean_metadata = nullptr;
- return Status::OK();
- }
-
- static const std::string kArrowSchemaKey = "ARROW:schema";
- int schema_index = metadata->FindKey(kArrowSchemaKey);
- if (schema_index == -1) {
- *out = nullptr;
- *clean_metadata = metadata;
- return Status::OK();
- }
-
- // The original Arrow schema was serialized using the store_schema option.
- // We deserialize it here and use it to inform read options such as
- // dictionary-encoded fields.
- auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index));
- auto schema_buf = std::make_shared<Buffer>(decoded);
-
- ::arrow::ipc::DictionaryMemo dict_memo;
- ::arrow::io::BufferReader input(schema_buf);
-
- ARROW_ASSIGN_OR_RAISE(*out, ::arrow::ipc::ReadSchema(&input, &dict_memo));
-
- if (metadata->size() > 1) {
- // Copy the metadata without the schema key
- auto new_metadata = ::arrow::key_value_metadata({}, {});
- new_metadata->reserve(metadata->size() - 1);
- for (int64_t i = 0; i < metadata->size(); ++i) {
- if (i == schema_index) continue;
- new_metadata->Append(metadata->key(i), metadata->value(i));
- }
- *clean_metadata = new_metadata;
- } else {
- // No other keys, let metadata be null
- *clean_metadata = nullptr;
- }
- return Status::OK();
-}
-
-// Restore original Arrow field information that was serialized as Parquet metadata
-// but that is not necessarily present in the field reconstituted from Parquet data
-// (for example, Parquet timestamp types don't carry timezone information).
-
-Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred);
-
-std::function<std::shared_ptr<::arrow::DataType>(FieldVector)> GetNestedFactory(
- const ArrowType& origin_type, const ArrowType& inferred_type) {
- switch (inferred_type.id()) {
- case ::arrow::Type::STRUCT:
- if (origin_type.id() == ::arrow::Type::STRUCT) {
- return ::arrow::struct_;
- }
- break;
- case ::arrow::Type::LIST:
- if (origin_type.id() == ::arrow::Type::LIST) {
- return [](FieldVector fields) {
- DCHECK_EQ(fields.size(), 1);
- return ::arrow::list(std::move(fields[0]));
- };
- }
- if (origin_type.id() == ::arrow::Type::LARGE_LIST) {
- return [](FieldVector fields) {
- DCHECK_EQ(fields.size(), 1);
- return ::arrow::large_list(std::move(fields[0]));
- };
- }
- if (origin_type.id() == ::arrow::Type::FIXED_SIZE_LIST) {
- const auto list_size =
- checked_cast<const ::arrow::FixedSizeListType&>(origin_type).list_size();
- return [list_size](FieldVector fields) {
- DCHECK_EQ(fields.size(), 1);
- return ::arrow::fixed_size_list(std::move(fields[0]), list_size);
- };
- }
- break;
- default:
- break;
- }
- return {};
-}
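-
-// For example, if the stored Arrow schema says large_list<int64> but the type
-// inferred from Parquet is list<int64>, the returned factory rebuilds the
-// field as a large_list over the (possibly updated) child field.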
-
-Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
- SchemaField* inferred) {
- bool modified = false;
-
- auto origin_type = origin_field.type();
- auto inferred_type = inferred->field->type();
-
- const int num_children = inferred_type->num_fields();
-
- if (num_children > 0 && origin_type->num_fields() == num_children) {
- DCHECK_EQ(static_cast<int>(inferred->children.size()), num_children);
- const auto factory = GetNestedFactory(*origin_type, *inferred_type);
- if (factory) {
- // The type may be modified (e.g. LargeList) while the children stay the same
- modified |= origin_type->id() != inferred_type->id();
-
- // Apply original metadata recursively to children
- for (int i = 0; i < inferred_type->num_fields(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- const bool child_modified,
- ApplyOriginalMetadata(*origin_type->field(i), &inferred->children[i]));
- modified |= child_modified;
- }
- if (modified) {
- // Recreate this field using the modified child fields
- ::arrow::FieldVector modified_children(inferred_type->num_fields());
- for (int i = 0; i < inferred_type->num_fields(); ++i) {
- modified_children[i] = inferred->children[i].field;
- }
- inferred->field =
- inferred->field->WithType(factory(std::move(modified_children)));
- }
- }
- }
-
- if (origin_type->id() == ::arrow::Type::TIMESTAMP &&
- inferred_type->id() == ::arrow::Type::TIMESTAMP) {
- // Restore time zone, if any
- const auto& ts_type = checked_cast<const ::arrow::TimestampType&>(*inferred_type);
- const auto& ts_origin_type =
- checked_cast<const ::arrow::TimestampType&>(*origin_type);
-
- // If the data is tz-aware, then set the original time zone, since Parquet
- // has no native storage for timezones
- if (ts_type.timezone() == "UTC" && ts_origin_type.timezone() != "") {
- if (ts_type.unit() == ts_origin_type.unit()) {
- inferred->field = inferred->field->WithType(origin_type);
- } else {
- auto ts_type_new = ::arrow::timestamp(ts_type.unit(), ts_origin_type.timezone());
- inferred->field = inferred->field->WithType(ts_type_new);
- }
- }
- modified = true;
- }
-
- if (origin_type->id() == ::arrow::Type::DICTIONARY &&
- inferred_type->id() != ::arrow::Type::DICTIONARY &&
- IsDictionaryReadSupported(*inferred_type)) {
- // Direct dictionary reads are only supported for a couple of primitive types,
- // so there is no need to recurse on value types.
- const auto& dict_origin_type =
- checked_cast<const ::arrow::DictionaryType&>(*origin_type);
- inferred->field = inferred->field->WithType(
- ::arrow::dictionary(::arrow::int32(), inferred_type, dict_origin_type.ordered()));
- modified = true;
- }
-
- if ((origin_type->id() == ::arrow::Type::LARGE_BINARY &&
- inferred_type->id() == ::arrow::Type::BINARY) ||
- (origin_type->id() == ::arrow::Type::LARGE_STRING &&
- inferred_type->id() == ::arrow::Type::STRING)) {
- // Read back binary-like arrays with the intended offset width.
- inferred->field = inferred->field->WithType(origin_type);
- modified = true;
- }
-
- if (origin_type->id() == ::arrow::Type::DECIMAL256 &&
- inferred_type->id() == ::arrow::Type::DECIMAL128) {
- inferred->field = inferred->field->WithType(origin_type);
- modified = true;
- }
-
- // Restore field metadata
- std::shared_ptr<const KeyValueMetadata> field_metadata = origin_field.metadata();
- if (field_metadata != nullptr) {
- if (inferred->field->metadata()) {
- // Prefer the metadata keys (like field_id) from the current metadata
- field_metadata = field_metadata->Merge(*inferred->field->metadata());
- }
- inferred->field = inferred->field->WithMetadata(field_metadata);
- modified = true;
- }
-
- return modified;
-}
-
-Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred) {
- bool modified = false;
-
- auto origin_type = origin_field.type();
- auto inferred_type = inferred->field->type();
-
- if (origin_type->id() == ::arrow::Type::EXTENSION) {
- const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type);
- auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
-
- // Apply metadata recursively to storage type
- RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
-
- // Restore extension type, if the storage type is the same as inferred
- // from the Parquet type
- if (ex_type.storage_type()->Equals(*inferred->field->type())) {
- inferred->field = inferred->field->WithType(origin_type);
- }
- modified = true;
- } else {
- ARROW_ASSIGN_OR_RAISE(modified, ApplyOriginalStorageMetadata(origin_field, inferred));
- }
-
- return modified;
-}
-
-} // namespace
-
-Status FieldToNode(const std::shared_ptr<Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- return FieldToNode(field->name(), field, properties, arrow_properties, out);
-}
-
-Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- std::shared_ptr<SchemaDescriptor>* out) {
- std::vector<NodePtr> nodes(arrow_schema->num_fields());
- for (int i = 0; i < arrow_schema->num_fields(); i++) {
- RETURN_NOT_OK(
- FieldToNode(arrow_schema->field(i), properties, arrow_properties, &nodes[i]));
- }
-
- NodePtr schema = GroupNode::Make("schema", Repetition::REQUIRED, nodes);
- *out = std::make_shared<::parquet::SchemaDescriptor>();
- PARQUET_CATCH_NOT_OK((*out)->Init(schema));
-
- return Status::OK();
-}
-
-Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- std::shared_ptr<SchemaDescriptor>* out) {
- return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(),
- out);
-}
-
-Status FromParquetSchema(
- const SchemaDescriptor* schema, const ArrowReaderProperties& properties,
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata,
- std::shared_ptr<::arrow::Schema>* out) {
- SchemaManifest manifest;
- RETURN_NOT_OK(SchemaManifest::Make(schema, key_value_metadata, properties, &manifest));
- std::vector<std::shared_ptr<Field>> fields(manifest.schema_fields.size());
-
- for (int i = 0; i < static_cast<int>(fields.size()); i++) {
- const auto& schema_field = manifest.schema_fields[i];
- fields[i] = schema_field.field;
- }
- if (manifest.origin_schema) {
- // ARROW-8980: If the ARROW:schema was in the input metadata, then
- // manifest.origin_schema will have it scrubbed out
- *out = ::arrow::schema(fields, manifest.origin_schema->metadata());
- } else {
- *out = ::arrow::schema(fields, key_value_metadata);
- }
- return Status::OK();
-}
-
-Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const ArrowReaderProperties& properties,
- std::shared_ptr<::arrow::Schema>* out) {
- return FromParquetSchema(parquet_schema, properties, nullptr, out);
-}
-
-Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out) {
- ArrowReaderProperties properties;
- return FromParquetSchema(parquet_schema, properties, nullptr, out);
-}
-
-Status SchemaManifest::Make(const SchemaDescriptor* schema,
- const std::shared_ptr<const KeyValueMetadata>& metadata,
- const ArrowReaderProperties& properties,
- SchemaManifest* manifest) {
- SchemaTreeContext ctx;
- ctx.manifest = manifest;
- ctx.properties = properties;
- ctx.schema = schema;
- const GroupNode& schema_node = *schema->group_node();
- manifest->descr = schema;
- manifest->schema_fields.resize(schema_node.field_count());
-
- // Try to deserialize original Arrow schema
- RETURN_NOT_OK(
- GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema));
- // Ignore original schema if it's not compatible with the Parquet schema
- if (manifest->origin_schema != nullptr &&
- manifest->origin_schema->num_fields() != schema_node.field_count()) {
- manifest->origin_schema = nullptr;
- }
-
- for (int i = 0; i < static_cast<int>(schema_node.field_count()); ++i) {
- SchemaField* out_field = &manifest->schema_fields[i];
- RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), LevelInfo(), &ctx,
- /*parent=*/nullptr, out_field));
-
- // TODO(wesm): as follow up to ARROW-3246, we should really pass the origin
- // schema (if any) through all functions in the schema reconstruction, but
- // I'm being lazy and just setting dictionary fields at the top level for
- // now
- if (manifest->origin_schema == nullptr) {
- continue;
- }
-
- auto origin_field = manifest->origin_schema->field(i);
- RETURN_NOT_OK(ApplyOriginalMetadata(*origin_field, out_field));
- }
- return Status::OK();
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/schema.h"
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "arrow/extension_type.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/api.h"
+#include "arrow/result_internal.h"
+#include "arrow/type.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
+
+#include "parquet/arrow/schema_internal.h"
+#include "parquet/exception.h"
+#include "parquet/properties.h"
+#include "parquet/types.h"
+
+using arrow::DecimalType;
+using arrow::Field;
+using arrow::FieldVector;
+using arrow::KeyValueMetadata;
+using arrow::Status;
+using arrow::internal::checked_cast;
+
+using ArrowType = arrow::DataType;
+using ArrowTypeId = arrow::Type;
+
+using parquet::Repetition;
+using parquet::schema::GroupNode;
+using parquet::schema::Node;
+using parquet::schema::NodePtr;
+using parquet::schema::PrimitiveNode;
+
+using ParquetType = parquet::Type;
+using parquet::ConvertedType;
+using parquet::LogicalType;
+
+using parquet::internal::LevelInfo;
+
+namespace parquet {
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Parquet to Arrow schema conversion
+
+namespace {
+
+Repetition::type RepetitionFromNullable(bool is_nullable) {
+ return is_nullable ? Repetition::OPTIONAL : Repetition::REQUIRED;
+}
+
+Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out);
+
+Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type,
+ const std::string& name, bool nullable,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ NodePtr element;
+ std::string value_name =
+ arrow_properties.compliant_nested_types() ? "element" : type->value_field()->name();
+ RETURN_NOT_OK(FieldToNode(value_name, type->value_field(), properties, arrow_properties,
+ &element));
+
+ NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {list},
+ LogicalType::List());
+ return Status::OK();
+}
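+
+// For example (an illustrative sketch, not normative): with
+// compliant_nested_types enabled, a nullable Arrow field
+// "ints: list<item: int32>" with nullable values maps to the standard
+// three-level encoding:
+//
+//   optional group ints (LIST) {
+//     repeated group list {
+//       optional int32 element;
+//     }
+//   }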
+
+Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::string& name,
+ bool nullable, const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ // TODO: Should we offer a non-compliant mode that forwards the type names?
+ NodePtr key_node;
+ RETURN_NOT_OK(
+ FieldToNode("key", type->key_field(), properties, arrow_properties, &key_node));
+
+ NodePtr value_node;
+ RETURN_NOT_OK(FieldToNode("value", type->item_field(), properties, arrow_properties,
+ &value_node));
+
+ NodePtr key_value =
+ GroupNode::Make("key_value", Repetition::REPEATED, {key_node, value_node});
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {key_value},
+ LogicalType::Map());
+ return Status::OK();
+}
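+
+// For example (an illustrative sketch): a nullable Arrow field
+// "scores: map<string, int32>" with nullable values maps to:
+//
+//   optional group scores (MAP) {
+//     repeated group key_value {
+//       required binary key (STRING);
+//       optional int32 value;
+//     }
+//   }
+//
+// Note that the child names are always "key" and "value" here, regardless
+// of the Arrow field names (see the TODO above).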
+
+Status StructToNode(const std::shared_ptr<::arrow::StructType>& type,
+ const std::string& name, bool nullable,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ std::vector<NodePtr> children(type->num_fields());
+ if (type->num_fields() != 0) {
+ for (int i = 0; i < type->num_fields(); i++) {
+ RETURN_NOT_OK(FieldToNode(type->field(i)->name(), type->field(i), properties,
+ arrow_properties, &children[i]));
+ }
+ } else {
+ // XXX (ARROW-10928) We could add a dummy primitive node but that would
+ // require special handling when writing and reading, to avoid column index
+ // mismatches.
+ return Status::NotImplemented("Cannot write struct type '", name,
+ "' with no child field to Parquet. "
+ "Consider adding a dummy child field.");
+ }
+
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), std::move(children));
+ return Status::OK();
+}
+
+static std::shared_ptr<const LogicalType> TimestampLogicalTypeFromArrowTimestamp(
+ const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) {
+ const bool utc = !(timestamp_type.timezone().empty());
+ // ARROW-5878(wesm): for forward compatibility reasons, and because
+ // there's no other way to signal to old readers that values are
+ // timestamps, we force the ConvertedType field to be set to the
+ // corresponding TIMESTAMP_* value. This does cause some ambiguity
+ // as Parquet readers have not been consistent about the
+ // interpretation of TIMESTAMP_* values as being UTC-normalized.
+ switch (time_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MILLIS,
+ /*is_from_converted_type=*/false,
+ /*force_set_converted_type=*/true);
+ case ::arrow::TimeUnit::MICRO:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MICROS,
+ /*is_from_converted_type=*/false,
+ /*force_set_converted_type=*/true);
+ case ::arrow::TimeUnit::NANO:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::NANOS);
+ case ::arrow::TimeUnit::SECOND:
+ // No equivalent parquet logical type.
+ break;
+ }
+ return LogicalType::None();
+}
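+
+// For example (a sketch of the mapping above): Arrow timestamp[ms, tz=UTC]
+// maps to Timestamp(isAdjustedToUTC=true, MILLIS) with the legacy
+// ConvertedType TIMESTAMP_MILLIS force-set for old readers, while a
+// timezone-less timestamp[ns] maps to Timestamp(isAdjustedToUTC=false,
+// NANOS), for which no ConvertedType equivalent exists.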
+
+static Status GetTimestampMetadata(const ::arrow::TimestampType& type,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ ParquetType::type* physical_type,
+ std::shared_ptr<const LogicalType>* logical_type) {
+ const bool coerce = arrow_properties.coerce_timestamps_enabled();
+ const auto target_unit =
+ coerce ? arrow_properties.coerce_timestamps_unit() : type.unit();
+
+ // The user is explicitly asking for Impala int96 encoding; in that case
+ // there is no logical type.
+ if (arrow_properties.support_deprecated_int96_timestamps()) {
+ *physical_type = ParquetType::INT96;
+ return Status::OK();
+ }
+
+ *physical_type = ParquetType::INT64;
+ *logical_type = TimestampLogicalTypeFromArrowTimestamp(type, target_unit);
+
+ // The user is explicitly asking for timestamp data to be converted to the
+ // specified units (target_unit).
+ if (coerce) {
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
+ switch (target_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ break;
+ case ::arrow::TimeUnit::NANO:
+ case ::arrow::TimeUnit::SECOND:
+ return Status::NotImplemented(
+ "For Parquet version 1.0 files, can only coerce Arrow timestamps to "
+ "milliseconds or microseconds");
+ }
+ } else {
+ switch (target_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ case ::arrow::TimeUnit::NANO:
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ return Status::NotImplemented(
+ "For Parquet files, can only coerce Arrow timestamps to milliseconds, "
+ "microseconds, or nanoseconds");
+ }
+ }
+ return Status::OK();
+ }
+
+ // The user implicitly wants timestamp data to retain its original time
+ // units; however, the ConvertedType field used to indicate logical types
+ // for Parquet version 1.0 fields does not allow for nanosecond time units,
+ // so nanoseconds must be coerced to microseconds.
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0 &&
+ type.unit() == ::arrow::TimeUnit::NANO) {
+ *logical_type =
+ TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MICRO);
+ return Status::OK();
+ }
+
+ // The user implicitly wants timestamp data to retain its original time
+ // units; however, the Arrow seconds time unit cannot be represented
+ // (annotated) in any version of Parquet, so it must be coerced to
+ // milliseconds.
+ if (type.unit() == ::arrow::TimeUnit::SECOND) {
+ *logical_type =
+ TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MILLI);
+ return Status::OK();
+ }
+
+ return Status::OK();
+}
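+
+// In short, when neither int96 nor coercion is requested: Arrow seconds are
+// always annotated (and written) as milliseconds, Arrow nanoseconds are
+// downgraded to microseconds only for Parquet 1.0 files, and every other
+// unit is kept as-is.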
+
+static constexpr char FIELD_ID_KEY[] = "PARQUET:field_id";
+
+std::shared_ptr<::arrow::KeyValueMetadata> FieldIdMetadata(int field_id) {
+ if (field_id >= 0) {
+ return ::arrow::key_value_metadata({FIELD_ID_KEY}, {std::to_string(field_id)});
+ } else {
+ return nullptr;
+ }
+}
+
+int FieldIdFromMetadata(
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ if (!metadata) {
+ return -1;
+ }
+ int key = metadata->FindKey(FIELD_ID_KEY);
+ if (key < 0) {
+ return -1;
+ }
+ std::string field_id_str = metadata->value(key);
+ int field_id;
+ if (::arrow::internal::ParseValue<::arrow::Int32Type>(
+ field_id_str.c_str(), field_id_str.length(), &field_id)) {
+ if (field_id < 0) {
+ // Thrift should convert any negative value to null, but normalize to -1
+ // here in case later logic checks for it.
+ return -1;
+ }
+ return field_id;
+ } else {
+ return -1;
+ }
+}
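+
+// Round-trip sketch: FieldIdMetadata(42) produces the field metadata
+// {"PARQUET:field_id": "42"}, from which FieldIdFromMetadata() recovers 42;
+// absent, unparsable, or negative values all normalize to -1.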
+
+Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ std::shared_ptr<const LogicalType> logical_type = LogicalType::None();
+ ParquetType::type type;
+ Repetition::type repetition = RepetitionFromNullable(field->nullable());
+
+ int length = -1;
+ int precision = -1;
+ int scale = -1;
+
+ switch (field->type()->id()) {
+ case ArrowTypeId::NA: {
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Null();
+ if (repetition != Repetition::OPTIONAL) {
+ return Status::Invalid("NullType Arrow field must be nullable");
+ }
+ } break;
+ case ArrowTypeId::BOOL:
+ type = ParquetType::BOOLEAN;
+ break;
+ case ArrowTypeId::UINT8:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(8, false);
+ break;
+ case ArrowTypeId::INT8:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(8, true);
+ break;
+ case ArrowTypeId::UINT16:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(16, false);
+ break;
+ case ArrowTypeId::INT16:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(16, true);
+ break;
+ case ArrowTypeId::UINT32:
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
+ type = ParquetType::INT64;
+ } else {
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(32, false);
+ }
+ break;
+ case ArrowTypeId::INT32:
+ type = ParquetType::INT32;
+ break;
+ case ArrowTypeId::UINT64:
+ type = ParquetType::INT64;
+ logical_type = LogicalType::Int(64, false);
+ break;
+ case ArrowTypeId::INT64:
+ type = ParquetType::INT64;
+ break;
+ case ArrowTypeId::FLOAT:
+ type = ParquetType::FLOAT;
+ break;
+ case ArrowTypeId::DOUBLE:
+ type = ParquetType::DOUBLE;
+ break;
+ case ArrowTypeId::LARGE_STRING:
+ case ArrowTypeId::STRING:
+ type = ParquetType::BYTE_ARRAY;
+ logical_type = LogicalType::String();
+ break;
+ case ArrowTypeId::LARGE_BINARY:
+ case ArrowTypeId::BINARY:
+ type = ParquetType::BYTE_ARRAY;
+ break;
+ case ArrowTypeId::FIXED_SIZE_BINARY: {
+ type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+ const auto& fixed_size_binary_type =
+ static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
+ length = fixed_size_binary_type.byte_width();
+ } break;
+ case ArrowTypeId::DECIMAL128:
+ case ArrowTypeId::DECIMAL256: {
+ type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+ const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
+ precision = decimal_type.precision();
+ scale = decimal_type.scale();
+ length = DecimalType::DecimalSize(precision);
+ PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
+ } break;
+ case ArrowTypeId::DATE32:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Date();
+ break;
+ case ArrowTypeId::DATE64:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Date();
+ break;
+ case ArrowTypeId::TIMESTAMP:
+ RETURN_NOT_OK(
+ GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
+ properties, arrow_properties, &type, &logical_type));
+ break;
+ case ArrowTypeId::TIME32:
+ type = ParquetType::INT32;
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MILLIS);
+ break;
+ case ArrowTypeId::TIME64: {
+ type = ParquetType::INT64;
+ auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
+ if (time_type->unit() == ::arrow::TimeUnit::NANO) {
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::NANOS);
+ } else {
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MICROS);
+ }
+ } break;
+ case ArrowTypeId::STRUCT: {
+ auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
+ return StructToNode(struct_type, name, field->nullable(), properties,
+ arrow_properties, out);
+ }
+ case ArrowTypeId::FIXED_SIZE_LIST:
+ case ArrowTypeId::LARGE_LIST:
+ case ArrowTypeId::LIST: {
+ auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type());
+ return ListToNode(list_type, name, field->nullable(), properties, arrow_properties,
+ out);
+ }
+ case ArrowTypeId::DICTIONARY: {
+ // Parquet has no Dictionary type; dictionary encoding is handled at the
+ // encoding level, not the schema level.
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*field->type());
+ std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field(
+ name, dict_type.value_type(), field->nullable(), field->metadata());
+ return FieldToNode(name, unpacked_field, properties, arrow_properties, out);
+ }
+ case ArrowTypeId::EXTENSION: {
+ auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
+ std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
+ name, ext_type->storage_type(), field->nullable(), field->metadata());
+ return FieldToNode(name, storage_field, properties, arrow_properties, out);
+ }
+ case ArrowTypeId::MAP: {
+ auto map_type = std::static_pointer_cast<::arrow::MapType>(field->type());
+ return MapToNode(map_type, name, field->nullable(), properties, arrow_properties,
+ out);
+ }
+
+ default: {
+ // TODO: DENSE_UNION, SPARSE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR
+ return Status::NotImplemented(
+ "Unhandled type for Arrow to Parquet schema conversion: ",
+ field->type()->ToString());
+ }
+ }
+
+ int field_id = FieldIdFromMetadata(field->metadata());
+ PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(name, repetition, logical_type, type,
+ length, field_id));
+
+ return Status::OK();
+}
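+
+// A few representative mappings from the switch above (illustrative only):
+//   uint8  -> INT32 annotated Int(8, false)
+//   int64  -> INT64 with no logical type
+//   string -> BYTE_ARRAY annotated String
+//   date64 -> INT32 annotated Date (i.e. days, despite Arrow's
+//             millisecond-based storage)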
+
+struct SchemaTreeContext {
+ SchemaManifest* manifest;
+ ArrowReaderProperties properties;
+ const SchemaDescriptor* schema;
+
+ void LinkParent(const SchemaField* child, const SchemaField* parent) {
+ manifest->child_to_parent[child] = parent;
+ }
+
+ void RecordLeaf(const SchemaField* leaf) {
+ manifest->column_index_to_field[leaf->column_index] = leaf;
+ }
+};
+
+bool IsDictionaryReadSupported(const ArrowType& type) {
+ // Only supported currently for BYTE_ARRAY types
+ return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING;
+}
+
+// ----------------------------------------------------------------------
+// Schema logic
+
+::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
+ int column_index, const schema::PrimitiveNode& primitive_node,
+ SchemaTreeContext* ctx) {
+ ASSIGN_OR_RAISE(
+ std::shared_ptr<ArrowType> storage_type,
+ GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit()));
+ if (ctx->properties.read_dictionary(column_index) &&
+ IsDictionaryReadSupported(*storage_type)) {
+ return ::arrow::dictionary(::arrow::int32(), storage_type);
+ }
+ return storage_type;
+}
+
+Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status PopulateLeaf(int column_index, const std::shared_ptr<Field>& field,
+ LevelInfo current_levels, SchemaTreeContext* ctx,
+ const SchemaField* parent, SchemaField* out) {
+ out->field = field;
+ out->column_index = column_index;
+ out->level_info = current_levels;
+ ctx->RecordLeaf(out);
+ ctx->LinkParent(out, parent);
+ return Status::OK();
+}
+
+// Special case mentioned in the format spec:
+// If the name is array or ends in _tuple, this should be a list of struct
+// even for single child elements.
+bool HasStructListName(const GroupNode& node) {
+ ::arrow::util::string_view name{node.name()};
+ return name == "array" || name.ends_with("_tuple");
+}
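+
+// For example, inside a LIST-annotated group,
+//
+//   repeated group array {
+//     required int32 item;
+//   }
+//
+// is read back as a list of struct<item: int32> even though it has a single
+// child, whereas a repeated group with any other single-child name would be
+// read as a list of the child's type directly.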
+
+Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ std::vector<std::shared_ptr<Field>> arrow_fields;
+ out->children.resize(node.field_count());
+ // All level increments for the node are expected to be done by callers.
+ // This is required because repeated elements need to have their own
+ // SchemaField.
+
+ for (int i = 0; i < node.field_count(); i++) {
+ RETURN_NOT_OK(
+ NodeToSchemaField(*node.field(i), current_levels, ctx, out, &out->children[i]));
+ arrow_fields.push_back(out->children[i].field);
+ }
+ auto struct_type = ::arrow::struct_(arrow_fields);
+ out->field = ::arrow::field(node.name(), struct_type, node.is_optional(),
+ FieldIdMetadata(node.field_id()));
+ out->level_info = current_levels;
+ return Status::OK();
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (group.field_count() != 1) {
+ return Status::Invalid("MAP-annotated groups must have a single child.");
+ }
+ if (group.is_repeated()) {
+ return Status::Invalid("MAP-annotated groups must not be repeated.");
+ }
+
+ const Node& key_value_node = *group.field(0);
+
+ if (!key_value_node.is_repeated()) {
+ return Status::Invalid(
+ "Non-repeated key value in a MAP-annotated group are not supported.");
+ }
+
+ if (!key_value_node.is_group()) {
+ return Status::Invalid("Key-value node must be a group.");
+ }
+
+ const GroupNode& key_value = checked_cast<const GroupNode&>(key_value_node);
+ if (key_value.field_count() != 1 && key_value.field_count() != 2) {
+ return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
+ key_value.field_count());
+ }
+ const Node& key_node = *key_value.field(0);
+ if (!key_node.is_required()) {
+ return Status::Invalid("Map keys must be annotated as required.");
+ }
+ // Arrow doesn't support one-column maps (i.e. sets). The options are either
+ // to make the values column nullable or to process the map as a list. We
+ // choose the latter as it is simpler.
+ if (key_value.field_count() == 1) {
+ return ListToSchemaField(group, current_levels, ctx, parent, out);
+ }
+
+ current_levels.Increment(group);
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+
+ out->children.resize(1);
+ SchemaField* key_value_field = &out->children[0];
+
+ key_value_field->children.resize(2);
+ SchemaField* key_field = &key_value_field->children[0];
+ SchemaField* value_field = &key_value_field->children[1];
+
+ ctx->LinkParent(out, parent);
+ ctx->LinkParent(key_value_field, out);
+ ctx->LinkParent(key_field, key_value_field);
+ ctx->LinkParent(value_field, key_value_field);
+
+ // required/optional group name=whatever {
+ // repeated group name=key_value {
+ // required TYPE key;
+ // required/optional TYPE value;
+ // }
+ // }
+ //
+
+ RETURN_NOT_OK(NodeToSchemaField(*key_value.field(0), current_levels, ctx,
+ key_value_field, key_field));
+ RETURN_NOT_OK(NodeToSchemaField(*key_value.field(1), current_levels, ctx,
+ key_value_field, value_field));
+
+ key_value_field->field = ::arrow::field(
+ group.name(), ::arrow::struct_({key_field->field, value_field->field}),
+ /*nullable=*/false, FieldIdMetadata(key_value.field_id()));
+ key_value_field->level_info = current_levels;
+
+ out->field = ::arrow::field(group.name(),
+ ::arrow::map(key_field->field->type(), value_field->field),
+ group.is_optional(), FieldIdMetadata(group.field_id()));
+ out->level_info = current_levels;
+ // At this point current_levels contains the def level for this list, so
+ // we need to reset to the prior parent.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (group.field_count() != 1) {
+ return Status::Invalid("LIST-annotated groups must have a single child.");
+ }
+ if (group.is_repeated()) {
+ return Status::Invalid("LIST-annotated groups must not be repeated.");
+ }
+ current_levels.Increment(group);
+
+ out->children.resize(group.field_count());
+ SchemaField* child_field = &out->children[0];
+
+ ctx->LinkParent(out, parent);
+ ctx->LinkParent(child_field, out);
+
+ const Node& list_node = *group.field(0);
+
+ if (!list_node.is_repeated()) {
+ return Status::Invalid(
+ "Non-repeated nodes in a LIST-annotated group are not supported.");
+ }
+
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ if (list_node.is_group()) {
+ // Resolve 3-level encoding
+ //
+ // required/optional group name=whatever {
+ // repeated group name=list {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // yields list<item: TYPE ?nullable> ?nullable
+ //
+ // We distinguish the special case that we have
+ //
+ // required/optional group name=whatever {
+ // repeated group name=array or $SOMETHING_tuple {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // In this latter case, the inner type of the list should be a struct
+ // rather than a primitive value
+ //
+ // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
+ const auto& list_group = static_cast<const GroupNode&>(list_node);
+ // Special case mentioned in the format spec:
+ // If the name is array or ends in _tuple, this should be a list of struct
+ // even for single child elements.
+ if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
+ // List of primitive type
+ RETURN_NOT_OK(
+ NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field));
+ } else {
+ RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
+ }
+ } else {
+ // Two-level list encoding
+ //
+ // required/optional group LIST {
+ // repeated TYPE;
+ // }
+ const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
+ int column_index = ctx->schema->GetColumnIndex(primitive_node);
+ ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+ GetTypeForNode(column_index, primitive_node, ctx));
+ auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
+ FieldIdMetadata(list_node.field_id()));
+ RETURN_NOT_OK(
+ PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
+ }
+ out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
+ group.is_optional(), FieldIdMetadata(group.field_id()));
+ out->level_info = current_levels;
+ // At this point current_levels contains the def level for this list, so
+ // we need to reset to the prior parent.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+}
+
+Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (node.logical_type()->is_list()) {
+ return ListToSchemaField(node, current_levels, ctx, parent, out);
+ } else if (node.logical_type()->is_map()) {
+ return MapToSchemaField(node, current_levels, ctx, parent, out);
+ }
+ std::shared_ptr<ArrowType> type;
+ if (node.is_repeated()) {
+ // Simple repeated struct
+ //
+ // repeated group $NAME {
+ // r/o TYPE[0] f0
+ // r/o TYPE[1] f1
+ // }
+ out->children.resize(1);
+
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ RETURN_NOT_OK(GroupToStruct(node, current_levels, ctx, out, &out->children[0]));
+ out->field = ::arrow::field(node.name(), ::arrow::list(out->children[0].field),
+ /*nullable=*/false, FieldIdMetadata(node.field_id()));
+
+ ctx->LinkParent(&out->children[0], out);
+ out->level_info = current_levels;
+ // At this point current_levels contains this list as the def level, so we
+ // need to use the previous ancestor of this list.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+ } else {
+ current_levels.Increment(node);
+ return GroupToStruct(node, current_levels, ctx, parent, out);
+ }
+}
+
+Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ // Workhorse function for converting a Parquet schema node to an Arrow
+ // type. Handles different conventions for nested data.
+
+ ctx->LinkParent(out, parent);
+
+ // Now, walk the schema and create a ColumnDescriptor for each leaf node
+ if (node.is_group()) {
+ // A nested field, but we don't know what kind yet
+ return GroupToSchemaField(static_cast<const GroupNode&>(node), current_levels, ctx,
+ parent, out);
+ } else {
+ // Either a normal flat primitive type, or a list type encoded with 1-level
+ // list encoding. Note that the 3-level encoding is the form recommended by
+ // the parquet specification, but technically we can have either
+ //
+ // required/optional $TYPE $FIELD_NAME
+ //
+ // or
+ //
+ // repeated $TYPE $FIELD_NAME
+ const auto& primitive_node = static_cast<const PrimitiveNode&>(node);
+ int column_index = ctx->schema->GetColumnIndex(primitive_node);
+ ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+ GetTypeForNode(column_index, primitive_node, ctx));
+ if (node.is_repeated()) {
+ // One-level list encoding, e.g.
+ // a: repeated int32;
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ out->children.resize(1);
+ auto child_field = ::arrow::field(node.name(), type, /*nullable=*/false);
+ RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels, ctx, out,
+ &out->children[0]));
+
+ out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
+ /*nullable=*/false, FieldIdMetadata(node.field_id()));
+ out->level_info = current_levels;
+ // At this point current_levels has considered this list the ancestor, so
+ // restore the actual ancestor.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+ } else {
+ current_levels.Increment(node);
+ // A normal (required/optional) primitive node
+ return PopulateLeaf(column_index,
+ ::arrow::field(node.name(), type, node.is_optional(),
+ FieldIdMetadata(node.field_id())),
+ current_levels, ctx, parent, out);
+ }
+ }
+}
+
+// Get the original Arrow schema, as serialized in the Parquet metadata
+Status GetOriginSchema(const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<const KeyValueMetadata>* clean_metadata,
+ std::shared_ptr<::arrow::Schema>* out) {
+ if (metadata == nullptr) {
+ *out = nullptr;
+ *clean_metadata = nullptr;
+ return Status::OK();
+ }
+
+ static const std::string kArrowSchemaKey = "ARROW:schema";
+ int schema_index = metadata->FindKey(kArrowSchemaKey);
+ if (schema_index == -1) {
+ *out = nullptr;
+ *clean_metadata = metadata;
+ return Status::OK();
+ }
+
+ // The original Arrow schema was serialized using the store_schema option.
+ // We deserialize it here and use it to inform read options such as
+ // dictionary-encoded fields.
+ auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index));
+ auto schema_buf = std::make_shared<Buffer>(decoded);
+
+ ::arrow::ipc::DictionaryMemo dict_memo;
+ ::arrow::io::BufferReader input(schema_buf);
+
+ ARROW_ASSIGN_OR_RAISE(*out, ::arrow::ipc::ReadSchema(&input, &dict_memo));
+
+ if (metadata->size() > 1) {
+ // Copy the metadata without the schema key
+ auto new_metadata = ::arrow::key_value_metadata({}, {});
+ new_metadata->reserve(metadata->size() - 1);
+ for (int64_t i = 0; i < metadata->size(); ++i) {
+ if (i == schema_index) continue;
+ new_metadata->Append(metadata->key(i), metadata->value(i));
+ }
+ *clean_metadata = new_metadata;
+ } else {
+ // No other keys, let metadata be null
+ *clean_metadata = nullptr;
+ }
+ return Status::OK();
+}
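+
+// Sketch of the behavior above: given key-value metadata such as
+// {"ARROW:schema": <base64 IPC schema>, "writer": "x"}, *out receives the
+// deserialized Arrow schema and *clean_metadata only {"writer": "x"}; if
+// "ARROW:schema" was the sole entry, *clean_metadata is set to null.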
+
+// Restore original Arrow field information that was serialized as Parquet
+// metadata but is not necessarily present in the field reconstituted from
+// Parquet data (for example, Parquet timestamp types don't carry timezone
+// information).
+
+Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred);
+
+std::function<std::shared_ptr<::arrow::DataType>(FieldVector)> GetNestedFactory(
+ const ArrowType& origin_type, const ArrowType& inferred_type) {
+ switch (inferred_type.id()) {
+ case ::arrow::Type::STRUCT:
+ if (origin_type.id() == ::arrow::Type::STRUCT) {
+ return ::arrow::struct_;
+ }
+ break;
+ case ::arrow::Type::LIST:
+ if (origin_type.id() == ::arrow::Type::LIST) {
+ return [](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::list(std::move(fields[0]));
+ };
+ }
+ if (origin_type.id() == ::arrow::Type::LARGE_LIST) {
+ return [](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::large_list(std::move(fields[0]));
+ };
+ }
+ if (origin_type.id() == ::arrow::Type::FIXED_SIZE_LIST) {
+ const auto list_size =
+ checked_cast<const ::arrow::FixedSizeListType&>(origin_type).list_size();
+ return [list_size](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::fixed_size_list(std::move(fields[0]), list_size);
+ };
+ }
+ break;
+ default:
+ break;
+ }
+ return {};
+}
+
+Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
+ SchemaField* inferred) {
+ bool modified = false;
+
+ auto origin_type = origin_field.type();
+ auto inferred_type = inferred->field->type();
+
+ const int num_children = inferred_type->num_fields();
+
+ if (num_children > 0 && origin_type->num_fields() == num_children) {
+ DCHECK_EQ(static_cast<int>(inferred->children.size()), num_children);
+ const auto factory = GetNestedFactory(*origin_type, *inferred_type);
+ if (factory) {
+ // The type may be modified (e.g. LargeList) while the children stay the same
+ modified |= origin_type->id() != inferred_type->id();
+
+ // Apply original metadata recursively to children
+ for (int i = 0; i < inferred_type->num_fields(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ const bool child_modified,
+ ApplyOriginalMetadata(*origin_type->field(i), &inferred->children[i]));
+ modified |= child_modified;
+ }
+ if (modified) {
+ // Recreate this field using the modified child fields
+ ::arrow::FieldVector modified_children(inferred_type->num_fields());
+ for (int i = 0; i < inferred_type->num_fields(); ++i) {
+ modified_children[i] = inferred->children[i].field;
+ }
+ inferred->field =
+ inferred->field->WithType(factory(std::move(modified_children)));
+ }
+ }
+ }
+
+ if (origin_type->id() == ::arrow::Type::TIMESTAMP &&
+ inferred_type->id() == ::arrow::Type::TIMESTAMP) {
+ // Restore time zone, if any
+ const auto& ts_type = checked_cast<const ::arrow::TimestampType&>(*inferred_type);
+ const auto& ts_origin_type =
+ checked_cast<const ::arrow::TimestampType&>(*origin_type);
+
+ // If the data is tz-aware, then set the original time zone, since Parquet
+ // has no native storage for timezones
+ if (ts_type.timezone() == "UTC" && ts_origin_type.timezone() != "") {
+ if (ts_type.unit() == ts_origin_type.unit()) {
+ inferred->field = inferred->field->WithType(origin_type);
+ } else {
+ auto ts_type_new = ::arrow::timestamp(ts_type.unit(), ts_origin_type.timezone());
+ inferred->field = inferred->field->WithType(ts_type_new);
+ }
+ }
+ modified = true;
+ }
+
+ if (origin_type->id() == ::arrow::Type::DICTIONARY &&
+ inferred_type->id() != ::arrow::Type::DICTIONARY &&
+ IsDictionaryReadSupported(*inferred_type)) {
+ // Direct dictionary reads are only supported for a couple of primitive
+ // types, so there is no need to recurse on value types.
+ const auto& dict_origin_type =
+ checked_cast<const ::arrow::DictionaryType&>(*origin_type);
+ inferred->field = inferred->field->WithType(
+ ::arrow::dictionary(::arrow::int32(), inferred_type, dict_origin_type.ordered()));
+ modified = true;
+ }
+
+ if ((origin_type->id() == ::arrow::Type::LARGE_BINARY &&
+ inferred_type->id() == ::arrow::Type::BINARY) ||
+ (origin_type->id() == ::arrow::Type::LARGE_STRING &&
+ inferred_type->id() == ::arrow::Type::STRING)) {
+ // Read back binary-like arrays with the intended offset width.
+ inferred->field = inferred->field->WithType(origin_type);
+ modified = true;
+ }
+
+ if (origin_type->id() == ::arrow::Type::DECIMAL256 &&
+ inferred_type->id() == ::arrow::Type::DECIMAL128) {
+ inferred->field = inferred->field->WithType(origin_type);
+ modified = true;
+ }
+
+ // Restore field metadata
+ std::shared_ptr<const KeyValueMetadata> field_metadata = origin_field.metadata();
+ if (field_metadata != nullptr) {
+ if (inferred->field->metadata()) {
+ // Prefer the metadata keys (like field_id) from the current metadata
+ field_metadata = field_metadata->Merge(*inferred->field->metadata());
+ }
+ inferred->field = inferred->field->WithMetadata(field_metadata);
+ modified = true;
+ }
+
+ return modified;
+}
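+
+// For example (a sketch): if reconstruction inferred timestamp[us, tz=UTC]
+// but the origin field was timestamp[us, tz=America/New_York], the inferred
+// field is rewritten with the original time zone; likewise binary is widened
+// back to large_binary and decimal128 back to decimal256 when the origin
+// type says so.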
+
+Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred) {
+ bool modified = false;
+
+ auto origin_type = origin_field.type();
+ auto inferred_type = inferred->field->type();
+
+ if (origin_type->id() == ::arrow::Type::EXTENSION) {
+ const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+ auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
+
+ // Apply metadata recursively to storage type
+ RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
+
+ // Restore extension type, if the storage type is the same as inferred
+ // from the Parquet type
+ if (ex_type.storage_type()->Equals(*inferred->field->type())) {
+ inferred->field = inferred->field->WithType(origin_type);
+ }
+ modified = true;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(modified, ApplyOriginalStorageMetadata(origin_field, inferred));
+ }
+
+ return modified;
+}
+
+} // namespace
+
+Status FieldToNode(const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ return FieldToNode(field->name(), field, properties, arrow_properties, out);
+}
+
+Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ std::shared_ptr<SchemaDescriptor>* out) {
+ std::vector<NodePtr> nodes(arrow_schema->num_fields());
+ for (int i = 0; i < arrow_schema->num_fields(); i++) {
+ RETURN_NOT_OK(
+ FieldToNode(arrow_schema->field(i), properties, arrow_properties, &nodes[i]));
+ }
+
+ NodePtr schema = GroupNode::Make("schema", Repetition::REQUIRED, nodes);
+ *out = std::make_shared<::parquet::SchemaDescriptor>();
+ PARQUET_CATCH_NOT_OK((*out)->Init(schema));
+
+ return Status::OK();
+}
+
+Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ std::shared_ptr<SchemaDescriptor>* out) {
+ return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(),
+ out);
+}
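+
+// Caller-side sketch (hypothetical variable names, using the
+// default_writer_properties() helper from parquet/properties.h):
+//
+//   std::shared_ptr<SchemaDescriptor> descr;
+//   RETURN_NOT_OK(ToParquetSchema(arrow_schema.get(),
+//                                 *default_writer_properties(), &descr));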
+
+Status FromParquetSchema(
+ const SchemaDescriptor* schema, const ArrowReaderProperties& properties,
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata,
+ std::shared_ptr<::arrow::Schema>* out) {
+ SchemaManifest manifest;
+ RETURN_NOT_OK(SchemaManifest::Make(schema, key_value_metadata, properties, &manifest));
+ std::vector<std::shared_ptr<Field>> fields(manifest.schema_fields.size());
+
+ for (int i = 0; i < static_cast<int>(fields.size()); i++) {
+ const auto& schema_field = manifest.schema_fields[i];
+ fields[i] = schema_field.field;
+ }
+ if (manifest.origin_schema) {
+ // ARROW-8980: If the ARROW:schema was in the input metadata, then
+ // manifest.origin_schema will have it scrubbed out
+ *out = ::arrow::schema(fields, manifest.origin_schema->metadata());
+ } else {
+ *out = ::arrow::schema(fields, key_value_metadata);
+ }
+ return Status::OK();
+}
+
+Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ const ArrowReaderProperties& properties,
+ std::shared_ptr<::arrow::Schema>* out) {
+ return FromParquetSchema(parquet_schema, properties, nullptr, out);
+}
+
+Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ std::shared_ptr<::arrow::Schema>* out) {
+ ArrowReaderProperties properties;
+ return FromParquetSchema(parquet_schema, properties, nullptr, out);
+}
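+
+// Caller-side sketch for the reverse direction (hypothetical names):
+//
+//   std::shared_ptr<::arrow::Schema> arrow_schema;
+//   RETURN_NOT_OK(FromParquetSchema(descr.get(), &arrow_schema));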
+
+Status SchemaManifest::Make(const SchemaDescriptor* schema,
+ const std::shared_ptr<const KeyValueMetadata>& metadata,
+ const ArrowReaderProperties& properties,
+ SchemaManifest* manifest) {
+ SchemaTreeContext ctx;
+ ctx.manifest = manifest;
+ ctx.properties = properties;
+ ctx.schema = schema;
+ const GroupNode& schema_node = *schema->group_node();
+ manifest->descr = schema;
+ manifest->schema_fields.resize(schema_node.field_count());
+
+ // Try to deserialize original Arrow schema
+ RETURN_NOT_OK(
+ GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema));
+ // Ignore original schema if it's not compatible with the Parquet schema
+ if (manifest->origin_schema != nullptr &&
+ manifest->origin_schema->num_fields() != schema_node.field_count()) {
+ manifest->origin_schema = nullptr;
+ }
+
+ for (int i = 0; i < static_cast<int>(schema_node.field_count()); ++i) {
+ SchemaField* out_field = &manifest->schema_fields[i];
+ RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), LevelInfo(), &ctx,
+ /*parent=*/nullptr, out_field));
+
+ // TODO(wesm): as follow up to ARROW-3246, we should really pass the origin
+ // schema (if any) through all functions in the schema reconstruction, but
+ // I'm being lazy and just setting dictionary fields at the top level for
+ // now
+ if (manifest->origin_schema == nullptr) {
+ continue;
+ }
+
+ auto origin_field = manifest->origin_schema->field(i);
+ RETURN_NOT_OK(ApplyOriginalMetadata(*origin_field, out_field));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
index dd60fde4342..a5c3a58176d 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
@@ -1,184 +1,184 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cassert>
-#include <memory>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_fwd.h"
-
-#include "parquet/level_conversion.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-
-class ArrowReaderProperties;
-class ArrowWriterProperties;
-class WriterProperties;
-
-namespace arrow {
-
-/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
-/// schema into a Parquet schema.
-///
-/// @{
-
-PARQUET_EXPORT
-::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- schema::NodePtr* out);
-
-PARQUET_EXPORT
-::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- std::shared_ptr<SchemaDescriptor>* out);
-
-PARQUET_EXPORT
-::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- std::shared_ptr<SchemaDescriptor>* out);
-
-/// @}
-
-/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
-/// schema into an Arrow schema.
-///
-/// @{
-
-PARQUET_EXPORT
-::arrow::Status FromParquetSchema(
- const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
- const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
- std::shared_ptr<::arrow::Schema>* out);
-
-PARQUET_EXPORT
-::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const ArrowReaderProperties& properties,
- std::shared_ptr<::arrow::Schema>* out);
-
-PARQUET_EXPORT
-::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out);
-
-/// @}
-
-/// \brief Bridge between an arrow::Field and parquet column indices.
-struct PARQUET_EXPORT SchemaField {
- std::shared_ptr<::arrow::Field> field;
- std::vector<SchemaField> children;
-
- // Only set for leaf nodes
- int column_index = -1;
-
- parquet::internal::LevelInfo level_info;
-
- bool is_leaf() const { return column_index != -1; }
-};
-
-/// \brief Bridge between a parquet Schema and an arrow Schema.
-///
-/// Expose parquet columns as a tree structure. Useful to traverse and link
-/// between arrow's Schema and parquet's Schema.
-struct PARQUET_EXPORT SchemaManifest {
- static ::arrow::Status Make(
- const SchemaDescriptor* schema,
- const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
- const ArrowReaderProperties& properties, SchemaManifest* manifest);
-
- const SchemaDescriptor* descr;
- std::shared_ptr<::arrow::Schema> origin_schema;
- std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
- std::vector<SchemaField> schema_fields;
-
- std::unordered_map<int, const SchemaField*> column_index_to_field;
- std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
-
- ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
- auto it = column_index_to_field.find(column_index);
- if (it == column_index_to_field.end()) {
- return ::arrow::Status::KeyError("Column index ", column_index,
- " not found in schema manifest, may be malformed");
- }
- *out = it->second;
- return ::arrow::Status::OK();
- }
-
- const SchemaField* GetParent(const SchemaField* field) const {
- // Also returns nullptr if not found
- auto it = child_to_parent.find(field);
- if (it == child_to_parent.end()) {
- return NULLPTR;
- }
- return it->second;
- }
-
- /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
- /// correspond to the column root (first node below the parquet schema's root group) of
- /// each leaf referenced in column_indices.
- ///
- /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
- /// the roots are `a` and `i` (return=[0,2]).
- ///
- /// root
- /// -- a <------
- /// -- -- b | |
- /// -- -- -- c |
- /// -- -- -- d |
- /// -- -- -- -- e
- /// -- f
- /// -- -- g
- /// -- -- -- h
- /// -- i <---
- /// -- -- j |
- /// -- -- -- k
- ::arrow::Result<std::vector<int>> GetFieldIndices(
- const std::vector<int>& column_indices) const {
- const schema::GroupNode* group = descr->group_node();
- std::unordered_set<int> already_added;
-
- std::vector<int> out;
- for (int column_idx : column_indices) {
- if (column_idx < 0 || column_idx >= descr->num_columns()) {
- return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
- }
-
- auto field_node = descr->GetColumnRoot(column_idx);
- auto field_idx = group->FieldIndex(*field_node);
- if (field_idx == -1) {
- return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
- }
-
- if (already_added.insert(field_idx).second) {
- out.push_back(field_idx);
- }
- }
- return out;
- }
-};
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+
+#include "parquet/level_conversion.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ArrowReaderProperties;
+class ArrowWriterProperties;
+class WriterProperties;
+
+namespace arrow {
+
+/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
+/// schema into a Parquet schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ schema::NodePtr* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ std::shared_ptr<SchemaDescriptor>* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ std::shared_ptr<SchemaDescriptor>* out);
+
+/// @}
+
+/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
+/// schema into an Arrow schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(
+ const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
+ std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ const ArrowReaderProperties& properties,
+ std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ std::shared_ptr<::arrow::Schema>* out);
+
+/// @}
+
+/// \brief Bridge between an arrow::Field and parquet column indices.
+struct PARQUET_EXPORT SchemaField {
+ std::shared_ptr<::arrow::Field> field;
+ std::vector<SchemaField> children;
+
+ // Only set for leaf nodes
+ int column_index = -1;
+
+ parquet::internal::LevelInfo level_info;
+
+ bool is_leaf() const { return column_index != -1; }
+};
+
+/// \brief Bridge between a parquet Schema and an arrow Schema.
+///
+/// Expose parquet columns as a tree structure. Useful to traverse and link
+/// between arrow's Schema and parquet's Schema.
+struct PARQUET_EXPORT SchemaManifest {
+ static ::arrow::Status Make(
+ const SchemaDescriptor* schema,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
+ const ArrowReaderProperties& properties, SchemaManifest* manifest);
+
+ const SchemaDescriptor* descr;
+ std::shared_ptr<::arrow::Schema> origin_schema;
+ std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
+ std::vector<SchemaField> schema_fields;
+
+ std::unordered_map<int, const SchemaField*> column_index_to_field;
+ std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
+
+ ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
+ auto it = column_index_to_field.find(column_index);
+ if (it == column_index_to_field.end()) {
+ return ::arrow::Status::KeyError("Column index ", column_index,
+ " not found in schema manifest, may be malformed");
+ }
+ *out = it->second;
+ return ::arrow::Status::OK();
+ }
+
+ const SchemaField* GetParent(const SchemaField* field) const {
+ // Also returns nullptr if not found
+ auto it = child_to_parent.find(field);
+ if (it == child_to_parent.end()) {
+ return NULLPTR;
+ }
+ return it->second;
+ }
+
+ /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
+ /// correspond to the column root (first node below the parquet schema's root group) of
+ /// each leaf referenced in column_indices.
+ ///
+ /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
+ /// the roots are `a` and `i` (return=[0,2]).
+ ///
+ /// root
+ /// -- a <------
+ /// -- -- b | |
+ /// -- -- -- c |
+ /// -- -- -- d |
+ /// -- -- -- -- e
+ /// -- f
+ /// -- -- g
+ /// -- -- -- h
+ /// -- i <---
+ /// -- -- j |
+ /// -- -- -- k
+ ::arrow::Result<std::vector<int>> GetFieldIndices(
+ const std::vector<int>& column_indices) const {
+ const schema::GroupNode* group = descr->group_node();
+ std::unordered_set<int> already_added;
+
+ std::vector<int> out;
+ for (int column_idx : column_indices) {
+ if (column_idx < 0 || column_idx >= descr->num_columns()) {
+ return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+ }
+
+ auto field_node = descr->GetColumnRoot(column_idx);
+ auto field_idx = group->FieldIndex(*field_node);
+ if (field_idx == -1) {
+ return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+ }
+
+ if (already_added.insert(field_idx).second) {
+ out.push_back(field_idx);
+ }
+ }
+ return out;
+ }
+};
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
index 064bf4f55cc..13acbb3d555 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
@@ -1,222 +1,222 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/schema_internal.h"
-
-#include "arrow/type.h"
-
-using ArrowType = ::arrow::DataType;
-using ArrowTypeId = ::arrow::Type;
-using ParquetType = parquet::Type;
-
-namespace parquet {
-
-namespace arrow {
-
-using ::arrow::Result;
-using ::arrow::Status;
-using ::arrow::internal::checked_cast;
-
-Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
- const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
- if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
- return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
- }
- return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
- const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
- switch (integer.bit_width()) {
- case 8:
- return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
- case 16:
- return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
- case 32:
- return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Int32");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
- const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
- switch (integer.bit_width()) {
- case 64:
- return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Int64");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
- const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
- switch (time.time_unit()) {
- case LogicalType::TimeUnit::MILLIS:
- return ::arrow::time32(::arrow::TimeUnit::MILLI);
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Time32");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
- const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
- switch (time.time_unit()) {
- case LogicalType::TimeUnit::MICROS:
- return ::arrow::time64(::arrow::TimeUnit::MICRO);
- case LogicalType::TimeUnit::NANOS:
- return ::arrow::time64(::arrow::TimeUnit::NANO);
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Time64");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
- const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
- const bool utc_normalized =
- timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
- static const char* utc_timezone = "UTC";
- switch (timestamp.time_unit()) {
- case LogicalType::TimeUnit::MILLIS:
- return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
- : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
- case LogicalType::TimeUnit::MICROS:
- return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
- : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
- case LogicalType::TimeUnit::NANOS:
- return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
- : ::arrow::timestamp(::arrow::TimeUnit::NANO));
- default:
- return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
- logical_type.ToString());
- }
-}
-
-Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
- switch (logical_type.type()) {
- case LogicalType::Type::STRING:
- return ::arrow::utf8();
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::NONE:
- case LogicalType::Type::ENUM:
- case LogicalType::Type::JSON:
- case LogicalType::Type::BSON:
- return ::arrow::binary();
- default:
- return Status::NotImplemented("Unhandled logical logical_type ",
- logical_type.ToString(), " for binary array");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
- int32_t physical_length) {
- switch (logical_type.type()) {
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::NONE:
- case LogicalType::Type::INTERVAL:
- case LogicalType::Type::UUID:
- return ::arrow::fixed_size_binary(physical_length);
- default:
- return Status::NotImplemented("Unhandled logical logical_type ",
- logical_type.ToString(),
- " for fixed-length binary array");
- }
-}
-
-::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeArrowInt(logical_type);
- case LogicalType::Type::DATE:
- return ::arrow::date32();
- case LogicalType::Type::TIME:
- return MakeArrowTime32(logical_type);
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::NONE:
- return ::arrow::int32();
- default:
- return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
- " for INT32");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeArrowInt64(logical_type);
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::TIMESTAMP:
- return MakeArrowTimestamp(logical_type);
- case LogicalType::Type::TIME:
- return MakeArrowTime64(logical_type);
- case LogicalType::Type::NONE:
- return ::arrow::int64();
- default:
- return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
- " for INT64");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> GetArrowType(
- Type::type physical_type, const LogicalType& logical_type, int type_length,
- const ::arrow::TimeUnit::type int96_arrow_time_unit) {
- if (logical_type.is_invalid() || logical_type.is_null()) {
- return ::arrow::null();
- }
-
- switch (physical_type) {
- case ParquetType::BOOLEAN:
- return ::arrow::boolean();
- case ParquetType::INT32:
- return FromInt32(logical_type);
- case ParquetType::INT64:
- return FromInt64(logical_type);
- case ParquetType::INT96:
- return ::arrow::timestamp(int96_arrow_time_unit);
- case ParquetType::FLOAT:
- return ::arrow::float32();
- case ParquetType::DOUBLE:
- return ::arrow::float64();
- case ParquetType::BYTE_ARRAY:
- return FromByteArray(logical_type);
- case ParquetType::FIXED_LEN_BYTE_ARRAY:
- return FromFLBA(logical_type, type_length);
- default: {
- // PARQUET-1565: This can occur if the file is corrupt
- return Status::IOError("Invalid physical column type: ",
- TypeToString(physical_type));
- }
- }
-}
-
-Result<std::shared_ptr<ArrowType>> GetArrowType(
- const schema::PrimitiveNode& primitive,
- const ::arrow::TimeUnit::type int96_arrow_time_unit) {
- return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
- primitive.type_length(), int96_arrow_time_unit);
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/schema_internal.h"
+
+#include "arrow/type.h"
+
+using ArrowType = ::arrow::DataType;
+using ArrowTypeId = ::arrow::Type;
+using ParquetType = parquet::Type;
+
+namespace parquet {
+
+namespace arrow {
+
+using ::arrow::Result;
+using ::arrow::Status;
+using ::arrow::internal::checked_cast;
+
+Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
+ const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
+ if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
+ return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
+ }
+ return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
+ const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
+ switch (integer.bit_width()) {
+ case 8:
+ return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
+ case 16:
+ return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
+ case 32:
+ return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Int32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
+ const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
+ switch (integer.bit_width()) {
+ case 64:
+ return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Int64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
+ const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
+ switch (time.time_unit()) {
+ case LogicalType::TimeUnit::MILLIS:
+ return ::arrow::time32(::arrow::TimeUnit::MILLI);
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Time32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
+ const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
+ switch (time.time_unit()) {
+ case LogicalType::TimeUnit::MICROS:
+ return ::arrow::time64(::arrow::TimeUnit::MICRO);
+ case LogicalType::TimeUnit::NANOS:
+ return ::arrow::time64(::arrow::TimeUnit::NANO);
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Time64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
+ const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
+ const bool utc_normalized =
+ timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
+ static const char* utc_timezone = "UTC";
+ switch (timestamp.time_unit()) {
+ case LogicalType::TimeUnit::MILLIS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
+ case LogicalType::TimeUnit::MICROS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
+ case LogicalType::TimeUnit::NANOS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::NANO));
+ default:
+ return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
+ logical_type.ToString());
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::STRING:
+ return ::arrow::utf8();
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ case LogicalType::Type::ENUM:
+ case LogicalType::Type::JSON:
+ case LogicalType::Type::BSON:
+ return ::arrow::binary();
+ default:
+ return Status::NotImplemented("Unhandled logical logical_type ",
+ logical_type.ToString(), " for binary array");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
+ int32_t physical_length) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ case LogicalType::Type::INTERVAL:
+ case LogicalType::Type::UUID:
+ return ::arrow::fixed_size_binary(physical_length);
+ default:
+ return Status::NotImplemented("Unhandled logical logical_type ",
+ logical_type.ToString(),
+ " for fixed-length binary array");
+ }
+}
+
+::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeArrowInt(logical_type);
+ case LogicalType::Type::DATE:
+ return ::arrow::date32();
+ case LogicalType::Type::TIME:
+ return MakeArrowTime32(logical_type);
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ return ::arrow::int32();
+ default:
+ return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
+ " for INT32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeArrowInt64(logical_type);
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::TIMESTAMP:
+ return MakeArrowTimestamp(logical_type);
+ case LogicalType::Type::TIME:
+ return MakeArrowTime64(logical_type);
+ case LogicalType::Type::NONE:
+ return ::arrow::int64();
+ default:
+ return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
+ " for INT64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> GetArrowType(
+ Type::type physical_type, const LogicalType& logical_type, int type_length,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ if (logical_type.is_invalid() || logical_type.is_null()) {
+ return ::arrow::null();
+ }
+
+ switch (physical_type) {
+ case ParquetType::BOOLEAN:
+ return ::arrow::boolean();
+ case ParquetType::INT32:
+ return FromInt32(logical_type);
+ case ParquetType::INT64:
+ return FromInt64(logical_type);
+ case ParquetType::INT96:
+ return ::arrow::timestamp(int96_arrow_time_unit);
+ case ParquetType::FLOAT:
+ return ::arrow::float32();
+ case ParquetType::DOUBLE:
+ return ::arrow::float64();
+ case ParquetType::BYTE_ARRAY:
+ return FromByteArray(logical_type);
+ case ParquetType::FIXED_LEN_BYTE_ARRAY:
+ return FromFLBA(logical_type, type_length);
+ default: {
+ // PARQUET-1565: This can occur if the file is corrupt
+ return Status::IOError("Invalid physical column type: ",
+ TypeToString(physical_type));
+ }
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> GetArrowType(
+ const schema::PrimitiveNode& primitive,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
+ primitive.type_length(), int96_arrow_time_unit);
+}
+
+} // namespace arrow
+} // namespace parquet
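
A minimal usage sketch of the mapping implemented above (hedged: |node| is a
hypothetical parquet::schema::PrimitiveNode obtained from a file's schema
descriptor; only calls defined in this translation unit are used):

    // Map a Parquet primitive column to its Arrow type, decoding INT96
    // timestamps at microsecond instead of the default nanosecond resolution.
    ::arrow::Result<std::shared_ptr<::arrow::DataType>> result =
        parquet::arrow::GetArrowType(*node, ::arrow::TimeUnit::MICRO);
    if (result.ok()) {
      std::shared_ptr<::arrow::DataType> type = *result;
    }
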
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
index fb837c3ee6c..c48fd7c938a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
@@ -1,51 +1,51 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/result.h"
-#include "parquet/schema.h"
-
-namespace arrow {
-class DataType;
-}
-
-namespace parquet {
-namespace arrow {
-
-using ::arrow::Result;
-
-Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type);
-Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& logical_type,
- int32_t physical_length);
-Result<std::shared_ptr<::arrow::DataType>> FromInt32(const LogicalType& logical_type);
-Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_type);
-
-Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
- const LogicalType& logical_type,
- int type_length);
-
-Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
- Type::type physical_type, const LogicalType& logical_type, int type_length,
- ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
-
-Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
- const schema::PrimitiveNode& primitive,
- ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+#include "parquet/schema.h"
+
+namespace arrow {
+class DataType;
+}
+
+namespace parquet {
+namespace arrow {
+
+using ::arrow::Result;
+
+Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type);
+Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& logical_type,
+ int32_t physical_length);
+Result<std::shared_ptr<::arrow::DataType>> FromInt32(const LogicalType& logical_type);
+Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_type);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
+ const LogicalType& logical_type,
+ int type_length);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
+ Type::type physical_type, const LogicalType& logical_type, int type_length,
+ ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
+ const schema::PrimitiveNode& primitive,
+ ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
index 2fbebf27fce..797069eb327 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
@@ -1,482 +1,482 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/writer.h"
-
-#include <algorithm>
-#include <deque>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/extension_type.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/table.h"
-#include "arrow/type.h"
-#include "arrow/util/base64.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/visitor_inline.h"
-
-#include "parquet/arrow/path_internal.h"
-#include "parquet/arrow/reader_internal.h"
-#include "parquet/arrow/schema.h"
-#include "parquet/column_writer.h"
-#include "parquet/exception.h"
-#include "parquet/file_writer.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-using arrow::Array;
-using arrow::BinaryArray;
-using arrow::BooleanArray;
-using arrow::ChunkedArray;
-using arrow::DataType;
-using arrow::DictionaryArray;
-using arrow::ExtensionArray;
-using arrow::ExtensionType;
-using arrow::Field;
-using arrow::FixedSizeBinaryArray;
-using arrow::ListArray;
-using arrow::MemoryPool;
-using arrow::NumericArray;
-using arrow::PrimitiveArray;
-using arrow::ResizableBuffer;
-using arrow::Status;
-using arrow::Table;
-using arrow::TimeUnit;
-
-using arrow::internal::checked_cast;
-
-using parquet::ParquetFileWriter;
-using parquet::ParquetVersion;
-using parquet::schema::GroupNode;
-
-namespace parquet {
-namespace arrow {
-
-namespace {
-
-int CalculateLeafCount(const DataType* type) {
- if (type->id() == ::arrow::Type::EXTENSION) {
- type = checked_cast<const ExtensionType&>(*type).storage_type().get();
- }
- // Note num_fields() can be 0 for an empty struct type
- if (!::arrow::is_nested(type->id())) {
- // Primitive type.
- return 1;
- }
-
- int num_leaves = 0;
- for (const auto& field : type->fields()) {
- num_leaves += CalculateLeafCount(field->type().get());
- }
- return num_leaves;
-}
-
-// Determines if the |schema_field|'s root ancestor is nullable.
-bool HasNullableRoot(const SchemaManifest& schema_manifest,
- const SchemaField* schema_field) {
- DCHECK(schema_field != nullptr);
- const SchemaField* current_field = schema_field;
- bool nullable = schema_field->field->nullable();
- while (current_field != nullptr) {
- nullable = current_field->field->nullable();
- current_field = schema_manifest.GetParent(current_field);
- }
- return nullable;
-}
-
-// Manages writing nested parquet columns with support for all nested types
-// supported by parquet.
-class ArrowColumnWriterV2 {
- public:
-  // Constructs a new object (use the Make() method below to construct from
-  // a ChunkedArray).
- // level_builders should contain one MultipathLevelBuilder per chunk of the
- // Arrow-column to write.
- ArrowColumnWriterV2(std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders,
- int leaf_count, RowGroupWriter* row_group_writer)
- : level_builders_(std::move(level_builders)),
- leaf_count_(leaf_count),
- row_group_writer_(row_group_writer) {}
-
- // Writes out all leaf parquet columns to the RowGroupWriter that this
- // object was constructed with. Each leaf column is written fully before
- // the next column is written (i.e. no buffering is assumed).
- //
- // Columns are written in DFS order.
- Status Write(ArrowWriteContext* ctx) {
- for (int leaf_idx = 0; leaf_idx < leaf_count_; leaf_idx++) {
- ColumnWriter* column_writer;
- PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
- for (auto& level_builder : level_builders_) {
- RETURN_NOT_OK(level_builder->Write(
- leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
- size_t visited_component_size = result.post_list_visited_elements.size();
- DCHECK_GT(visited_component_size, 0);
- if (visited_component_size != 1) {
- return Status::NotImplemented(
- "Lists with non-zero length null components are not supported");
- }
- const ElementRange& range = result.post_list_visited_elements[0];
- std::shared_ptr<Array> values_array =
- result.leaf_array->Slice(range.start, range.Size());
-
- return column_writer->WriteArrow(result.def_levels, result.rep_levels,
- result.def_rep_level_count, *values_array,
- ctx, result.leaf_is_nullable);
- }));
- }
-
- PARQUET_CATCH_NOT_OK(column_writer->Close());
- }
- return Status::OK();
- }
-
- // Make a new object by converting each chunk in |data| to a MultipathLevelBuilder.
- //
-  // It is necessary to create a new builder per array because the MultipathLevelBuilder
-  // extracts the data necessary for writing each leaf column at construction time
-  // (it optimizes based on null count), and slicing via |offset| creates ephemeral
-  // chunks that need to be tracked across each leaf column-write.
- // This decision could potentially be revisited if we wanted to use "buffered"
- // RowGroupWriters (we could construct each builder on demand in that case).
- static ::arrow::Result<std::unique_ptr<ArrowColumnWriterV2>> Make(
- const ChunkedArray& data, int64_t offset, const int64_t size,
- const SchemaManifest& schema_manifest, RowGroupWriter* row_group_writer) {
- int64_t absolute_position = 0;
- int chunk_index = 0;
- int64_t chunk_offset = 0;
- if (data.length() == 0) {
- return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
- std::vector<std::unique_ptr<MultipathLevelBuilder>>{},
- CalculateLeafCount(data.type().get()), row_group_writer);
- }
- while (chunk_index < data.num_chunks() && absolute_position < offset) {
- const int64_t chunk_length = data.chunk(chunk_index)->length();
- if (absolute_position + chunk_length > offset) {
- // Relative offset into the chunk to reach the desired start offset for
- // writing
- chunk_offset = offset - absolute_position;
- break;
- } else {
- ++chunk_index;
- absolute_position += chunk_length;
- }
- }
-
- if (absolute_position >= data.length()) {
- return Status::Invalid("Cannot write data at offset past end of chunked array");
- }
-
- int64_t values_written = 0;
- std::vector<std::unique_ptr<MultipathLevelBuilder>> builders;
- const int leaf_count = CalculateLeafCount(data.type().get());
- bool is_nullable = false;
-    // The row_group_writer hasn't been advanced yet, so add 1 to the current
-    // column to get the index of the first column this instance will write.
- int column_index = row_group_writer->current_column() + 1;
- for (int leaf_offset = 0; leaf_offset < leaf_count; ++leaf_offset) {
- const SchemaField* schema_field = nullptr;
- RETURN_NOT_OK(
- schema_manifest.GetColumnField(column_index + leaf_offset, &schema_field));
- bool nullable_root = HasNullableRoot(schema_manifest, schema_field);
- if (leaf_offset == 0) {
- is_nullable = nullable_root;
- }
-
-// Don't validate common ancestry for all leaves outside of debug builds.
-#ifdef NDEBUG
- break;
-#else
- if (is_nullable != nullable_root) {
- return Status::UnknownError(
- "Unexpected mismatched nullability between column index",
- column_index + leaf_offset, " and ", column_index);
- }
-#endif
- }
- while (values_written < size) {
- const Array& chunk = *data.chunk(chunk_index);
- const int64_t available_values = chunk.length() - chunk_offset;
- const int64_t chunk_write_size = std::min(size - values_written, available_values);
-
- // The chunk offset here will be 0 except for possibly the first chunk
- // because of the advancing logic above
- std::shared_ptr<Array> array_to_write = chunk.Slice(chunk_offset, chunk_write_size);
-
- if (array_to_write->length() > 0) {
- ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
- MultipathLevelBuilder::Make(*array_to_write, is_nullable));
- if (leaf_count != builder->GetLeafCount()) {
- return Status::UnknownError("data type leaf_count != builder_leaf_count",
- leaf_count, " ", builder->GetLeafCount());
- }
- builders.emplace_back(std::move(builder));
- }
-
- if (chunk_write_size == available_values) {
- chunk_offset = 0;
- ++chunk_index;
- }
- values_written += chunk_write_size;
- }
- return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
- std::move(builders), leaf_count, row_group_writer);
- }
-
- private:
- // One builder per column-chunk.
- std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders_;
- int leaf_count_;
- RowGroupWriter* row_group_writer_;
-};
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// FileWriter implementation
-
-class FileWriterImpl : public FileWriter {
- public:
- FileWriterImpl(std::shared_ptr<::arrow::Schema> schema, MemoryPool* pool,
- std::unique_ptr<ParquetFileWriter> writer,
- std::shared_ptr<ArrowWriterProperties> arrow_properties)
- : schema_(std::move(schema)),
- writer_(std::move(writer)),
- row_group_writer_(nullptr),
- column_write_context_(pool, arrow_properties.get()),
- arrow_properties_(std::move(arrow_properties)),
- closed_(false) {}
-
- Status Init() {
- return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr,
- default_arrow_reader_properties(), &schema_manifest_);
- }
-
- Status NewRowGroup(int64_t chunk_size) override {
- if (row_group_writer_ != nullptr) {
- PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
- }
- PARQUET_CATCH_NOT_OK(row_group_writer_ = writer_->AppendRowGroup());
- return Status::OK();
- }
-
- Status Close() override {
- if (!closed_) {
- // Make idempotent
- closed_ = true;
- if (row_group_writer_ != nullptr) {
- PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
- }
- PARQUET_CATCH_NOT_OK(writer_->Close());
- }
- return Status::OK();
- }
-
- Status WriteColumnChunk(const Array& data) override {
-    // A bit awkward here since we cannot instantiate a ChunkedArray from a const Array&
- auto chunk = ::arrow::MakeArray(data.data());
- auto chunked_array = std::make_shared<::arrow::ChunkedArray>(chunk);
- return WriteColumnChunk(chunked_array, 0, data.length());
- }
-
- Status WriteColumnChunk(const std::shared_ptr<ChunkedArray>& data, int64_t offset,
- int64_t size) override {
- if (arrow_properties_->engine_version() == ArrowWriterProperties::V2 ||
- arrow_properties_->engine_version() == ArrowWriterProperties::V1) {
- ARROW_ASSIGN_OR_RAISE(
- std::unique_ptr<ArrowColumnWriterV2> writer,
- ArrowColumnWriterV2::Make(*data, offset, size, schema_manifest_,
- row_group_writer_));
- return writer->Write(&column_write_context_);
- }
- return Status::NotImplemented("Unknown engine version.");
- }
-
- Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data) override {
- return WriteColumnChunk(data, 0, data->length());
- }
-
- std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
-
- Status WriteTable(const Table& table, int64_t chunk_size) override {
- RETURN_NOT_OK(table.Validate());
-
- if (chunk_size <= 0 && table.num_rows() > 0) {
- return Status::Invalid("chunk size per row_group must be greater than 0");
- } else if (!table.schema()->Equals(*schema_, false)) {
- return Status::Invalid("table schema does not match this writer's. table:'",
- table.schema()->ToString(), "' this:'", schema_->ToString(),
- "'");
- } else if (chunk_size > this->properties().max_row_group_length()) {
- chunk_size = this->properties().max_row_group_length();
- }
-
- auto WriteRowGroup = [&](int64_t offset, int64_t size) {
- RETURN_NOT_OK(NewRowGroup(size));
- for (int i = 0; i < table.num_columns(); i++) {
- RETURN_NOT_OK(WriteColumnChunk(table.column(i), offset, size));
- }
- return Status::OK();
- };
-
- if (table.num_rows() == 0) {
- // Append a row group with 0 rows
- RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close()));
- return Status::OK();
- }
-
- for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) {
- int64_t offset = chunk * chunk_size;
- RETURN_NOT_OK_ELSE(
- WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)),
- PARQUET_IGNORE_NOT_OK(Close()));
- }
- return Status::OK();
- }
-
- const WriterProperties& properties() const { return *writer_->properties(); }
-
- ::arrow::MemoryPool* memory_pool() const override {
- return column_write_context_.memory_pool;
- }
-
- const std::shared_ptr<FileMetaData> metadata() const override {
- return writer_->metadata();
- }
-
- private:
- friend class FileWriter;
-
- std::shared_ptr<::arrow::Schema> schema_;
-
- SchemaManifest schema_manifest_;
-
- std::unique_ptr<ParquetFileWriter> writer_;
- RowGroupWriter* row_group_writer_;
- ArrowWriteContext column_write_context_;
- std::shared_ptr<ArrowWriterProperties> arrow_properties_;
- bool closed_;
-};
-
-FileWriter::~FileWriter() {}
-
-Status FileWriter::Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileWriter> writer,
- std::shared_ptr<::arrow::Schema> schema,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* out) {
- std::unique_ptr<FileWriterImpl> impl(new FileWriterImpl(
- std::move(schema), pool, std::move(writer), std::move(arrow_properties)));
- RETURN_NOT_OK(impl->Init());
- *out = std::move(impl);
- return Status::OK();
-}
-
-Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::unique_ptr<FileWriter>* writer) {
- return Open(std::move(schema), pool, std::move(sink), std::move(properties),
- default_arrow_writer_properties(), writer);
-}
-
-Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
- const ArrowWriterProperties& properties,
- std::shared_ptr<const KeyValueMetadata>* out) {
- if (!properties.store_schema()) {
- *out = nullptr;
- return Status::OK();
- }
-
- static const std::string kArrowSchemaKey = "ARROW:schema";
- std::shared_ptr<KeyValueMetadata> result;
- if (schema.metadata()) {
- result = schema.metadata()->Copy();
- } else {
- result = ::arrow::key_value_metadata({}, {});
- }
-
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> serialized,
- ::arrow::ipc::SerializeSchema(schema, pool));
-
- // The serialized schema is not UTF-8, which is required for Thrift
- std::string schema_as_string = serialized->ToString();
- std::string schema_base64 = ::arrow::util::base64_encode(
- reinterpret_cast<const unsigned char*>(schema_as_string.data()),
- static_cast<unsigned int>(schema_as_string.size()));
- result->Append(kArrowSchemaKey, schema_base64);
- *out = result;
- return Status::OK();
-}
-
-Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* writer) {
- std::shared_ptr<SchemaDescriptor> parquet_schema;
- RETURN_NOT_OK(
- ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema));
-
- auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());
-
- std::shared_ptr<const KeyValueMetadata> metadata;
- RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata));
-
- std::unique_ptr<ParquetFileWriter> base_writer;
- PARQUET_CATCH_NOT_OK(base_writer = ParquetFileWriter::Open(std::move(sink), schema_node,
- std::move(properties),
- std::move(metadata)));
-
- auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
- return Make(pool, std::move(base_writer), std::move(schema_ptr),
- std::move(arrow_properties), writer);
-}
-
-Status WriteFileMetaData(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink) {
- PARQUET_CATCH_NOT_OK(::parquet::WriteFileMetaData(file_metadata, sink));
- return Status::OK();
-}
-
-Status WriteMetaDataFile(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink) {
- PARQUET_CATCH_NOT_OK(::parquet::WriteMetaDataFile(file_metadata, sink));
- return Status::OK();
-}
-
-Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<ArrowWriterProperties> arrow_properties) {
- std::unique_ptr<FileWriter> writer;
- RETURN_NOT_OK(FileWriter::Open(*table.schema(), pool, std::move(sink),
- std::move(properties), std::move(arrow_properties),
- &writer));
- RETURN_NOT_OK(writer->WriteTable(table, chunk_size));
- return writer->Close();
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/writer.h"
+
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/extension_type.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+#include "parquet/arrow/path_internal.h"
+#include "parquet/arrow/reader_internal.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/column_writer.h"
+#include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+using arrow::Array;
+using arrow::BinaryArray;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::DictionaryArray;
+using arrow::ExtensionArray;
+using arrow::ExtensionType;
+using arrow::Field;
+using arrow::FixedSizeBinaryArray;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::NumericArray;
+using arrow::PrimitiveArray;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::Table;
+using arrow::TimeUnit;
+
+using arrow::internal::checked_cast;
+
+using parquet::ParquetFileWriter;
+using parquet::ParquetVersion;
+using parquet::schema::GroupNode;
+
+namespace parquet {
+namespace arrow {
+
+namespace {
+
+int CalculateLeafCount(const DataType* type) {
+ if (type->id() == ::arrow::Type::EXTENSION) {
+ type = checked_cast<const ExtensionType&>(*type).storage_type().get();
+ }
+ // Note num_fields() can be 0 for an empty struct type
+ if (!::arrow::is_nested(type->id())) {
+ // Primitive type.
+ return 1;
+ }
+
+ int num_leaves = 0;
+ for (const auto& field : type->fields()) {
+ num_leaves += CalculateLeafCount(field->type().get());
+ }
+ return num_leaves;
+}
+
+// Determines if the |schema_field|'s root ancestor is nullable.
+bool HasNullableRoot(const SchemaManifest& schema_manifest,
+ const SchemaField* schema_field) {
+ DCHECK(schema_field != nullptr);
+ const SchemaField* current_field = schema_field;
+ bool nullable = schema_field->field->nullable();
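+  // Walk up the ancestry; the last assignment before GetParent() returns null
+  // records the nullability of the root ancestor.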
+ while (current_field != nullptr) {
+ nullable = current_field->field->nullable();
+ current_field = schema_manifest.GetParent(current_field);
+ }
+ return nullable;
+}
+
+// Manages writing nested parquet columns with support for all nested types
+// supported by parquet.
+class ArrowColumnWriterV2 {
+ public:
+  // Constructs a new object (use the Make() method below to construct from
+  // a ChunkedArray).
+ // level_builders should contain one MultipathLevelBuilder per chunk of the
+ // Arrow-column to write.
+ ArrowColumnWriterV2(std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders,
+ int leaf_count, RowGroupWriter* row_group_writer)
+ : level_builders_(std::move(level_builders)),
+ leaf_count_(leaf_count),
+ row_group_writer_(row_group_writer) {}
+
+ // Writes out all leaf parquet columns to the RowGroupWriter that this
+ // object was constructed with. Each leaf column is written fully before
+ // the next column is written (i.e. no buffering is assumed).
+ //
+ // Columns are written in DFS order.
+ Status Write(ArrowWriteContext* ctx) {
+ for (int leaf_idx = 0; leaf_idx < leaf_count_; leaf_idx++) {
+ ColumnWriter* column_writer;
+ PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
+ for (auto& level_builder : level_builders_) {
+ RETURN_NOT_OK(level_builder->Write(
+ leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
+ size_t visited_component_size = result.post_list_visited_elements.size();
+ DCHECK_GT(visited_component_size, 0);
+ if (visited_component_size != 1) {
+ return Status::NotImplemented(
+ "Lists with non-zero length null components are not supported");
+ }
+ const ElementRange& range = result.post_list_visited_elements[0];
+ std::shared_ptr<Array> values_array =
+ result.leaf_array->Slice(range.start, range.Size());
+
+ return column_writer->WriteArrow(result.def_levels, result.rep_levels,
+ result.def_rep_level_count, *values_array,
+ ctx, result.leaf_is_nullable);
+ }));
+ }
+
+ PARQUET_CATCH_NOT_OK(column_writer->Close());
+ }
+ return Status::OK();
+ }
+
+ // Make a new object by converting each chunk in |data| to a MultipathLevelBuilder.
+ //
+  // It is necessary to create a new builder per array because the MultipathLevelBuilder
+  // extracts the data necessary for writing each leaf column at construction time
+  // (it optimizes based on null count), and slicing via |offset| creates ephemeral
+  // chunks that need to be tracked across each leaf column-write.
+ // This decision could potentially be revisited if we wanted to use "buffered"
+ // RowGroupWriters (we could construct each builder on demand in that case).
+ static ::arrow::Result<std::unique_ptr<ArrowColumnWriterV2>> Make(
+ const ChunkedArray& data, int64_t offset, const int64_t size,
+ const SchemaManifest& schema_manifest, RowGroupWriter* row_group_writer) {
+ int64_t absolute_position = 0;
+ int chunk_index = 0;
+ int64_t chunk_offset = 0;
+ if (data.length() == 0) {
+ return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
+ std::vector<std::unique_ptr<MultipathLevelBuilder>>{},
+ CalculateLeafCount(data.type().get()), row_group_writer);
+ }
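+    // Advance to the chunk containing |offset|, remembering how far into that
+    // chunk writing should start.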
+ while (chunk_index < data.num_chunks() && absolute_position < offset) {
+ const int64_t chunk_length = data.chunk(chunk_index)->length();
+ if (absolute_position + chunk_length > offset) {
+ // Relative offset into the chunk to reach the desired start offset for
+ // writing
+ chunk_offset = offset - absolute_position;
+ break;
+ } else {
+ ++chunk_index;
+ absolute_position += chunk_length;
+ }
+ }
+
+ if (absolute_position >= data.length()) {
+ return Status::Invalid("Cannot write data at offset past end of chunked array");
+ }
+
+ int64_t values_written = 0;
+ std::vector<std::unique_ptr<MultipathLevelBuilder>> builders;
+ const int leaf_count = CalculateLeafCount(data.type().get());
+ bool is_nullable = false;
+    // The row_group_writer hasn't been advanced yet, so add 1 to the current
+    // column to get the index of the first column this instance will write.
+ int column_index = row_group_writer->current_column() + 1;
+ for (int leaf_offset = 0; leaf_offset < leaf_count; ++leaf_offset) {
+ const SchemaField* schema_field = nullptr;
+ RETURN_NOT_OK(
+ schema_manifest.GetColumnField(column_index + leaf_offset, &schema_field));
+ bool nullable_root = HasNullableRoot(schema_manifest, schema_field);
+ if (leaf_offset == 0) {
+ is_nullable = nullable_root;
+ }
+
+// Don't validate common ancestry for all leaves outside of debug builds.
+#ifdef NDEBUG
+ break;
+#else
+ if (is_nullable != nullable_root) {
+ return Status::UnknownError(
+ "Unexpected mismatched nullability between column index",
+ column_index + leaf_offset, " and ", column_index);
+ }
+#endif
+ }
+ while (values_written < size) {
+ const Array& chunk = *data.chunk(chunk_index);
+ const int64_t available_values = chunk.length() - chunk_offset;
+ const int64_t chunk_write_size = std::min(size - values_written, available_values);
+
+ // The chunk offset here will be 0 except for possibly the first chunk
+ // because of the advancing logic above
+ std::shared_ptr<Array> array_to_write = chunk.Slice(chunk_offset, chunk_write_size);
+
+ if (array_to_write->length() > 0) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
+ MultipathLevelBuilder::Make(*array_to_write, is_nullable));
+ if (leaf_count != builder->GetLeafCount()) {
+ return Status::UnknownError("data type leaf_count != builder_leaf_count",
+ leaf_count, " ", builder->GetLeafCount());
+ }
+ builders.emplace_back(std::move(builder));
+ }
+
+ if (chunk_write_size == available_values) {
+ chunk_offset = 0;
+ ++chunk_index;
+ }
+ values_written += chunk_write_size;
+ }
+ return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
+ std::move(builders), leaf_count, row_group_writer);
+ }
+
+ private:
+ // One builder per column-chunk.
+ std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders_;
+ int leaf_count_;
+ RowGroupWriter* row_group_writer_;
+};
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// FileWriter implementation
+
+class FileWriterImpl : public FileWriter {
+ public:
+ FileWriterImpl(std::shared_ptr<::arrow::Schema> schema, MemoryPool* pool,
+ std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties)
+ : schema_(std::move(schema)),
+ writer_(std::move(writer)),
+ row_group_writer_(nullptr),
+ column_write_context_(pool, arrow_properties.get()),
+ arrow_properties_(std::move(arrow_properties)),
+ closed_(false) {}
+
+ Status Init() {
+ return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr,
+ default_arrow_reader_properties(), &schema_manifest_);
+ }
+
+ Status NewRowGroup(int64_t chunk_size) override {
+ if (row_group_writer_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
+ }
+ PARQUET_CATCH_NOT_OK(row_group_writer_ = writer_->AppendRowGroup());
+ return Status::OK();
+ }
+
+ Status Close() override {
+ if (!closed_) {
+ // Make idempotent
+ closed_ = true;
+ if (row_group_writer_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
+ }
+ PARQUET_CATCH_NOT_OK(writer_->Close());
+ }
+ return Status::OK();
+ }
+
+ Status WriteColumnChunk(const Array& data) override {
+    // A bit awkward here since we cannot instantiate a ChunkedArray from a const Array&
+ auto chunk = ::arrow::MakeArray(data.data());
+ auto chunked_array = std::make_shared<::arrow::ChunkedArray>(chunk);
+ return WriteColumnChunk(chunked_array, 0, data.length());
+ }
+
+ Status WriteColumnChunk(const std::shared_ptr<ChunkedArray>& data, int64_t offset,
+ int64_t size) override {
+ if (arrow_properties_->engine_version() == ArrowWriterProperties::V2 ||
+ arrow_properties_->engine_version() == ArrowWriterProperties::V1) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::unique_ptr<ArrowColumnWriterV2> writer,
+ ArrowColumnWriterV2::Make(*data, offset, size, schema_manifest_,
+ row_group_writer_));
+ return writer->Write(&column_write_context_);
+ }
+ return Status::NotImplemented("Unknown engine version.");
+ }
+
+ Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data) override {
+ return WriteColumnChunk(data, 0, data->length());
+ }
+
+ std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
+
+ Status WriteTable(const Table& table, int64_t chunk_size) override {
+ RETURN_NOT_OK(table.Validate());
+
+ if (chunk_size <= 0 && table.num_rows() > 0) {
+ return Status::Invalid("chunk size per row_group must be greater than 0");
+ } else if (!table.schema()->Equals(*schema_, false)) {
+ return Status::Invalid("table schema does not match this writer's. table:'",
+ table.schema()->ToString(), "' this:'", schema_->ToString(),
+ "'");
+ } else if (chunk_size > this->properties().max_row_group_length()) {
+ chunk_size = this->properties().max_row_group_length();
+ }
+
+ auto WriteRowGroup = [&](int64_t offset, int64_t size) {
+ RETURN_NOT_OK(NewRowGroup(size));
+ for (int i = 0; i < table.num_columns(); i++) {
+ RETURN_NOT_OK(WriteColumnChunk(table.column(i), offset, size));
+ }
+ return Status::OK();
+ };
+
+ if (table.num_rows() == 0) {
+ // Append a row group with 0 rows
+ RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close()));
+ return Status::OK();
+ }
+
+ for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) {
+ int64_t offset = chunk * chunk_size;
+ RETURN_NOT_OK_ELSE(
+ WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)),
+ PARQUET_IGNORE_NOT_OK(Close()));
+ }
+ return Status::OK();
+ }
+
+ const WriterProperties& properties() const { return *writer_->properties(); }
+
+ ::arrow::MemoryPool* memory_pool() const override {
+ return column_write_context_.memory_pool;
+ }
+
+ const std::shared_ptr<FileMetaData> metadata() const override {
+ return writer_->metadata();
+ }
+
+ private:
+ friend class FileWriter;
+
+ std::shared_ptr<::arrow::Schema> schema_;
+
+ SchemaManifest schema_manifest_;
+
+ std::unique_ptr<ParquetFileWriter> writer_;
+ RowGroupWriter* row_group_writer_;
+ ArrowWriteContext column_write_context_;
+ std::shared_ptr<ArrowWriterProperties> arrow_properties_;
+ bool closed_;
+};
+
+FileWriter::~FileWriter() {}
+
+Status FileWriter::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<::arrow::Schema> schema,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* out) {
+ std::unique_ptr<FileWriterImpl> impl(new FileWriterImpl(
+ std::move(schema), pool, std::move(writer), std::move(arrow_properties)));
+ RETURN_NOT_OK(impl->Init());
+ *out = std::move(impl);
+ return Status::OK();
+}
+
+Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::unique_ptr<FileWriter>* writer) {
+ return Open(std::move(schema), pool, std::move(sink), std::move(properties),
+ default_arrow_writer_properties(), writer);
+}
+
+Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ const ArrowWriterProperties& properties,
+ std::shared_ptr<const KeyValueMetadata>* out) {
+ if (!properties.store_schema()) {
+ *out = nullptr;
+ return Status::OK();
+ }
+
+ static const std::string kArrowSchemaKey = "ARROW:schema";
+ std::shared_ptr<KeyValueMetadata> result;
+ if (schema.metadata()) {
+ result = schema.metadata()->Copy();
+ } else {
+ result = ::arrow::key_value_metadata({}, {});
+ }
+
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> serialized,
+ ::arrow::ipc::SerializeSchema(schema, pool));
+
+ // The serialized schema is not UTF-8, which is required for Thrift
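+  // Base64-encoding the IPC payload yields a UTF-8-safe value for the
+  // "ARROW:schema" key/value metadata entry.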
+ std::string schema_as_string = serialized->ToString();
+ std::string schema_base64 = ::arrow::util::base64_encode(
+ reinterpret_cast<const unsigned char*>(schema_as_string.data()),
+ static_cast<unsigned int>(schema_as_string.size()));
+ result->Append(kArrowSchemaKey, schema_base64);
+ *out = result;
+ return Status::OK();
+}
+
+Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* writer) {
+ std::shared_ptr<SchemaDescriptor> parquet_schema;
+ RETURN_NOT_OK(
+ ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema));
+
+ auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());
+
+ std::shared_ptr<const KeyValueMetadata> metadata;
+ RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata));
+
+ std::unique_ptr<ParquetFileWriter> base_writer;
+ PARQUET_CATCH_NOT_OK(base_writer = ParquetFileWriter::Open(std::move(sink), schema_node,
+ std::move(properties),
+ std::move(metadata)));
+
+ auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
+ return Make(pool, std::move(base_writer), std::move(schema_ptr),
+ std::move(arrow_properties), writer);
+}
+
+Status WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink) {
+ PARQUET_CATCH_NOT_OK(::parquet::WriteFileMetaData(file_metadata, sink));
+ return Status::OK();
+}
+
+Status WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink) {
+ PARQUET_CATCH_NOT_OK(::parquet::WriteMetaDataFile(file_metadata, sink));
+ return Status::OK();
+}
+
+Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties) {
+ std::unique_ptr<FileWriter> writer;
+ RETURN_NOT_OK(FileWriter::Open(*table.schema(), pool, std::move(sink),
+ std::move(properties), std::move(arrow_properties),
+ &writer));
+ RETURN_NOT_OK(writer->WriteTable(table, chunk_size));
+ return writer->Close();
+}
+
+} // namespace arrow
+} // namespace parquet
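
A minimal sketch of the convenience path implemented above (hedged: the output
path and the WriteExample wrapper are illustrative, not part of this library):

    #include "arrow/io/file.h"
    #include "arrow/result.h"
    #include "arrow/table.h"
    #include "parquet/arrow/writer.h"

    ::arrow::Status WriteExample(const std::shared_ptr<::arrow::Table>& table) {
      // Open a destination stream and write the whole table, cutting a new
      // row group every 64K rows.
      ARROW_ASSIGN_OR_RAISE(auto sink,
                            ::arrow::io::FileOutputStream::Open("example.parquet"));
      return ::parquet::arrow::WriteTable(*table, ::arrow::default_memory_pool(),
                                          sink, /*chunk_size=*/64 * 1024);
    }
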
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
index f31f3d03def..43c5ede1ab5 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
@@ -1,109 +1,109 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-
-namespace arrow {
-
-class Array;
-class ChunkedArray;
-class Schema;
-class Table;
-
-} // namespace arrow
-
-namespace parquet {
-
-class FileMetaData;
-class ParquetFileWriter;
-
-namespace arrow {
-
-/// \brief Iterative FileWriter class
-///
-/// Start a new RowGroup or Chunk with NewRowGroup, then write each whole
-/// column chunk, column by column.
-///
-/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
-/// value is a nonnegative integer, then it will be used as the field_id in the parquet
-/// file.
-class PARQUET_EXPORT FileWriter {
- public:
- static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
- std::shared_ptr<::arrow::Schema> schema,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* out);
-
- static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::unique_ptr<FileWriter>* writer);
-
- static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* writer);
-
- virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
-
- /// \brief Write a Table to Parquet.
- virtual ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size) = 0;
-
- virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0;
- virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
-
- /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
- virtual ::arrow::Status WriteColumnChunk(
- const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
- int64_t size) = 0;
-
- virtual ::arrow::Status WriteColumnChunk(
- const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
- virtual ::arrow::Status Close() = 0;
- virtual ~FileWriter();
-
- virtual MemoryPool* memory_pool() const = 0;
- virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
-};
-
-/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
-PARQUET_EXPORT
-::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
-PARQUET_EXPORT
-::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-/// \brief Write a Table to Parquet.
-::arrow::Status PARQUET_EXPORT
-WriteTable(const ::arrow::Table& table, MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
- std::shared_ptr<WriterProperties> properties = default_writer_properties(),
- std::shared_ptr<ArrowWriterProperties> arrow_properties =
- default_arrow_writer_properties());
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class Schema;
+class Table;
+
+} // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class ParquetFileWriter;
+
+namespace arrow {
+
+/// \brief Iterative FileWriter class
+///
+/// Start a new RowGroup or Chunk with NewRowGroup, then write each whole
+/// column chunk, column by column.
+///
+/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
+/// value is a nonnegative integer, then it will be used as the field_id in the parquet
+/// file.
+class PARQUET_EXPORT FileWriter {
+ public:
+ static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<::arrow::Schema> schema,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* out);
+
+ static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::unique_ptr<FileWriter>* writer);
+
+ static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* writer);
+
+ virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
+
+ /// \brief Write a Table to Parquet.
+ virtual ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size) = 0;
+
+ virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0;
+ virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
+
+ /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
+ virtual ::arrow::Status WriteColumnChunk(
+ const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
+ int64_t size) = 0;
+
+ virtual ::arrow::Status WriteColumnChunk(
+ const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
+ virtual ::arrow::Status Close() = 0;
+ virtual ~FileWriter();
+
+ virtual MemoryPool* memory_pool() const = 0;
+ virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
+};
+
+/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+/// \brief Write a Table to Parquet.
+::arrow::Status PARQUET_EXPORT
+WriteTable(const ::arrow::Table& table, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
+ std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+ std::shared_ptr<ArrowWriterProperties> arrow_properties =
+ default_arrow_writer_properties());
+
+} // namespace arrow
+} // namespace parquet
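
A short sketch of the iterative protocol described in the FileWriter comment
above (hedged: |table| and |sink| are assumed to already exist; errors are
surfaced with PARQUET_THROW_NOT_OK as elsewhere in this library):

    std::unique_ptr<parquet::arrow::FileWriter> writer;
    PARQUET_THROW_NOT_OK(parquet::arrow::FileWriter::Open(
        *table->schema(), ::arrow::default_memory_pool(), sink,
        parquet::default_writer_properties(), &writer));
    // One row group holding the whole table, written column by column.
    PARQUET_THROW_NOT_OK(writer->NewRowGroup(table->num_rows()));
    for (int i = 0; i < table->num_columns(); ++i) {
      PARQUET_THROW_NOT_OK(writer->WriteColumnChunk(table->column(i)));
    }
    PARQUET_THROW_NOT_OK(writer->Close());
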
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
index f6f6d327d06..e56449060ef 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
@@ -1,162 +1,162 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <cstdint>
-#include <cstring>
-
-#include "arrow/result.h"
-#include "arrow/util/logging.h"
-#include "parquet/bloom_filter.h"
-#include "parquet/exception.h"
-#include "parquet/murmur3.h"
-
-namespace parquet {
-constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
-
-BlockSplitBloomFilter::BlockSplitBloomFilter()
- : pool_(::arrow::default_memory_pool()),
- hash_strategy_(HashStrategy::MURMUR3_X64_128),
- algorithm_(Algorithm::BLOCK) {}
-
-void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
- if (num_bytes < kMinimumBloomFilterBytes) {
- num_bytes = kMinimumBloomFilterBytes;
- }
-
- // Round up to the next power of 2 if num_bytes is not a power of 2.
- if ((num_bytes & (num_bytes - 1)) != 0) {
- num_bytes = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bytes));
- }
-
- if (num_bytes > kMaximumBloomFilterBytes) {
- num_bytes = kMaximumBloomFilterBytes;
- }
-
- num_bytes_ = num_bytes;
- PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
- memset(data_->mutable_data(), 0, num_bytes_);
-
- this->hasher_.reset(new MurmurHash3());
-}
-
-void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
- DCHECK(bitset != nullptr);
-
- if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
- (num_bytes & (num_bytes - 1)) != 0) {
- throw ParquetException("Given length of bitset is illegal");
- }
-
- num_bytes_ = num_bytes;
- PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
- memcpy(data_->mutable_data(), bitset, num_bytes_);
-
- this->hasher_.reset(new MurmurHash3());
-}
-
-BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(ArrowInputStream* input) {
- uint32_t len, hash, algorithm;
- int64_t bytes_available;
-
- PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &len));
- if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
- throw ParquetException("Failed to deserialize from input stream");
- }
-
- PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &hash));
- if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
- throw ParquetException("Failed to deserialize from input stream");
- }
- if (static_cast<HashStrategy>(hash) != HashStrategy::MURMUR3_X64_128) {
- throw ParquetException("Unsupported hash strategy");
- }
-
- PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &algorithm));
- if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
- throw ParquetException("Failed to deserialize from input stream");
- }
- if (static_cast<Algorithm>(algorithm) != BloomFilter::Algorithm::BLOCK) {
- throw ParquetException("Unsupported Bloom filter algorithm");
- }
-
- BlockSplitBloomFilter bloom_filter;
-
- PARQUET_ASSIGN_OR_THROW(auto buffer, input->Read(len));
- bloom_filter.Init(buffer->data(), len);
- return bloom_filter;
-}
-
-void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* sink) const {
- DCHECK(sink != nullptr);
-
- PARQUET_THROW_NOT_OK(
- sink->Write(reinterpret_cast<const uint8_t*>(&num_bytes_), sizeof(num_bytes_)));
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<const uint8_t*>(&hash_strategy_),
- sizeof(hash_strategy_)));
- PARQUET_THROW_NOT_OK(
- sink->Write(reinterpret_cast<const uint8_t*>(&algorithm_), sizeof(algorithm_)));
- PARQUET_THROW_NOT_OK(sink->Write(data_->mutable_data(), num_bytes_));
-}
-
-void BlockSplitBloomFilter::SetMask(uint32_t key, BlockMask& block_mask) const {
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- block_mask.item[i] = key * SALT[i];
- }
-
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- block_mask.item[i] = block_mask.item[i] >> 27;
- }
-
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- block_mask.item[i] = UINT32_C(0x1) << block_mask.item[i];
- }
-}
-
-bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
- const uint32_t bucket_index =
- static_cast<uint32_t>((hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1));
- uint32_t key = static_cast<uint32_t>(hash);
- uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
-
- // Calculate mask for bucket.
- BlockMask block_mask;
- SetMask(key, block_mask);
-
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & block_mask.item[i])) {
- return false;
- }
- }
- return true;
-}
-
-void BlockSplitBloomFilter::InsertHash(uint64_t hash) {
- const uint32_t bucket_index =
- static_cast<uint32_t>(hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1);
- uint32_t key = static_cast<uint32_t>(hash);
- uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
-
- // Calculate mask for bucket.
- BlockMask block_mask;
- SetMask(key, block_mask);
-
- for (int i = 0; i < kBitsSetPerBlock; i++) {
- bitset32[bucket_index * kBitsSetPerBlock + i] |= block_mask.item[i];
- }
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/result.h"
+#include "arrow/util/logging.h"
+#include "parquet/bloom_filter.h"
+#include "parquet/exception.h"
+#include "parquet/murmur3.h"
+
+namespace parquet {
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+ : pool_(::arrow::default_memory_pool()),
+ hash_strategy_(HashStrategy::MURMUR3_X64_128),
+ algorithm_(Algorithm::BLOCK) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+ if (num_bytes < kMinimumBloomFilterBytes) {
+ num_bytes = kMinimumBloomFilterBytes;
+ }
+
+ // Round up to the next power of 2 if num_bytes is not a power of 2.
+ if ((num_bytes & (num_bytes - 1)) != 0) {
+ num_bytes = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bytes));
+ }
+
+ if (num_bytes > kMaximumBloomFilterBytes) {
+ num_bytes = kMaximumBloomFilterBytes;
+ }
+
+ num_bytes_ = num_bytes;
+ PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
+ memset(data_->mutable_data(), 0, num_bytes_);
+
+ this->hasher_.reset(new MurmurHash3());
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+ DCHECK(bitset != nullptr);
+
+ if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+ (num_bytes & (num_bytes - 1)) != 0) {
+ throw ParquetException("Given length of bitset is illegal");
+ }
+
+ num_bytes_ = num_bytes;
+ PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
+ memcpy(data_->mutable_data(), bitset, num_bytes_);
+
+ this->hasher_.reset(new MurmurHash3());
+}
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(ArrowInputStream* input) {
+ uint32_t len, hash, algorithm;
+ int64_t bytes_available;
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &len));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &hash));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+ if (static_cast<HashStrategy>(hash) != HashStrategy::MURMUR3_X64_128) {
+ throw ParquetException("Unsupported hash strategy");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &algorithm));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+ if (static_cast<Algorithm>(algorithm) != BloomFilter::Algorithm::BLOCK) {
+ throw ParquetException("Unsupported Bloom filter algorithm");
+ }
+
+ BlockSplitBloomFilter bloom_filter;
+
+ PARQUET_ASSIGN_OR_THROW(auto buffer, input->Read(len));
+ bloom_filter.Init(buffer->data(), len);
+ return bloom_filter;
+}
+
+void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* sink) const {
+ DCHECK(sink != nullptr);
+
+ PARQUET_THROW_NOT_OK(
+ sink->Write(reinterpret_cast<const uint8_t*>(&num_bytes_), sizeof(num_bytes_)));
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<const uint8_t*>(&hash_strategy_),
+ sizeof(hash_strategy_)));
+ PARQUET_THROW_NOT_OK(
+ sink->Write(reinterpret_cast<const uint8_t*>(&algorithm_), sizeof(algorithm_)));
+ PARQUET_THROW_NOT_OK(sink->Write(data_->mutable_data(), num_bytes_));
+}
+
+void BlockSplitBloomFilter::SetMask(uint32_t key, BlockMask& block_mask) const {
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = key * SALT[i];
+ }
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = block_mask.item[i] >> 27;
+ }
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = UINT32_C(0x1) << block_mask.item[i];
+ }
+}
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+ const uint32_t bucket_index =
+ static_cast<uint32_t>((hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1));
+ uint32_t key = static_cast<uint32_t>(hash);
+ uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
+
+ // Calculate mask for bucket.
+ BlockMask block_mask;
+ SetMask(key, block_mask);
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & block_mask.item[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void BlockSplitBloomFilter::InsertHash(uint64_t hash) {
+ const uint32_t bucket_index =
+ static_cast<uint32_t>(hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1);
+ uint32_t key = static_cast<uint32_t>(hash);
+ uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
+
+ // Calculate mask for bucket.
+ BlockMask block_mask;
+ SetMask(key, block_mask);
+
+ for (int i = 0; i < kBitsSetPerBlock; i++) {
+ bitset32[bucket_index * kBitsSetPerBlock + i] |= block_mask.item[i];
+ }
+}
+
+} // namespace parquet
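To make the mask computation above concrete: each 32-byte block holds eight 32-bit words, and one bit is set per word. Multiplying the key by an odd salt scrambles it (the multiply wraps modulo 2^32), ">> 27" keeps the top five bits as a bit index in [0, 31], and the final shift turns that index into a single-bit mask. A self-contained sketch of the same arithmetic; kSalt mirrors the SALT table above and ComputeBlockMask is an illustrative name:

    #include <cstdint>

    constexpr uint32_t kSalt[8] = {0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
                                   0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};

    // Derive the eight single-bit masks for one key, one mask per 32-bit word.
    void ComputeBlockMask(uint32_t key, uint32_t mask[8]) {
      for (int i = 0; i < 8; ++i) {
        mask[i] = UINT32_C(1) << ((key * kSalt[i]) >> 27);
      }
    }

FindHash then requires all eight bits to be present in the probed block, while InsertHash ORs them in; the bucket itself is chosen from the upper 32 bits of the 64-bit hash.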
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
index 39f9561ae5b..218a1162674 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
@@ -1,247 +1,247 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cmath>
-#include <cstdint>
-#include <memory>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/logging.h"
-#include "parquet/hasher.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-// A Bloom filter is a compact structure to indicate whether an item is not in a set or
-// probably in a set. The Bloom filter usually consists of a bit set that represents a
-// set of elements, a hash strategy and a Bloom filter algorithm.
-class PARQUET_EXPORT BloomFilter {
- public:
- // Maximum Bloom filter size; it is set to the HDFS default block size of 128 MB.
- // This value will be reconsidered when implementing the Bloom filter producer.
- static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
-
- /// Determine whether an element exists in the set.
- ///
- /// @param hash the hash of the element to look up.
- /// @return false if the value is definitely not in the set; true means the
- /// value is PROBABLY in the set.
- virtual bool FindHash(uint64_t hash) const = 0;
-
- /// Insert an element into the set represented by the Bloom filter bitset.
- /// @param hash the hash of the value to insert into the Bloom filter.
- virtual void InsertHash(uint64_t hash) = 0;
-
- /// Write this Bloom filter to an output stream. A serialized Bloom filter
- /// structure includes the bitset length, hash strategy, algorithm, and bitset.
- ///
- /// @param sink the output stream to write to.
- virtual void WriteTo(ArrowOutputStream* sink) const = 0;
-
- /// Get the number of bytes in the bitset.
- virtual uint32_t GetBitsetSize() const = 0;
-
- /// Compute hash for a 32-bit value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int32_t value) const = 0;
-
- /// Compute hash for a 64-bit value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int64_t value) const = 0;
-
- /// Compute hash for float value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(float value) const = 0;
-
- /// Compute hash for double value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(double value) const = 0;
-
- /// Compute hash for Int96 value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const Int96* value) const = 0;
-
- /// Compute hash for ByteArray value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const ByteArray* value) const = 0;
-
- /// Compute hash for fixed byte array value by using its plain encoding result.
- ///
- /// @param value the value address.
- /// @param len the value length.
- /// @return hash result.
- virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
-
- virtual ~BloomFilter() {}
-
- protected:
- // Hash strategy available for Bloom filter.
- enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
-
- // Bloom filter algorithm.
- enum class Algorithm : uint32_t { BLOCK = 0 };
-};
-
-// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
-// Putze et al.'s "Cache-, Hash- and Space-Efficient Bloom Filters". The basic idea
-// is to hash the item to a tiny Bloom filter whose size fits in a single cache
-// line or smaller.
-//
-// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
-// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
-class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
- public:
- /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as its hash function.
- BlockSplitBloomFilter();
-
- /// Initialize the BlockSplitBloomFilter. num_bytes should be within
- /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes]; it is clamped to the
- /// nearest bound if out of range, and rounded up to a power of 2.
- ///
- /// @param num_bytes The number of bytes to store Bloom filter bitset.
- void Init(uint32_t num_bytes);
-
- /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
- /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
- /// which may lead to segfault when performing SIMD instructions. It is the caller's
- /// responsibility to free the bitset passed in. This is used when reconstructing
- /// a Bloom filter from a parquet file.
- ///
- /// @param bitset The given bitset to initialize the Bloom filter.
- /// @param num_bytes The number of bytes of given bitset.
- void Init(const uint8_t* bitset, uint32_t num_bytes);
-
- // Minimum Bloom filter size; it is set to 32 bytes to fit a single tiny Bloom filter.
- static constexpr uint32_t kMinimumBloomFilterBytes = 32;
-
- /// Calculate the optimal bitset size, in bits, for the given number of distinct
- /// values and false positive probability.
- ///
- /// @param ndv The number of distinct values.
- /// @param fpp The false positive probability.
- /// @return a number of bits between kMinimumBloomFilterBytes << 3 and
- /// kMaximumBloomFilterBytes << 3; the return value is always a power of 2.
- static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
- DCHECK(fpp > 0.0 && fpp < 1.0);
- const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
- uint32_t num_bits;
-
- // Handle overflow.
- if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
- num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
- } else {
- num_bits = static_cast<uint32_t>(m);
- }
-
- // Clamp to the lower bound.
- if (num_bits < kMinimumBloomFilterBytes << 3) {
- num_bits = kMinimumBloomFilterBytes << 3;
- }
-
- // Round up to the next power of 2 if num_bits is not a power of 2.
- if ((num_bits & (num_bits - 1)) != 0) {
- num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
- }
-
- // Clamp to the upper bound.
- if (num_bits > kMaximumBloomFilterBytes << 3) {
- num_bits = kMaximumBloomFilterBytes << 3;
- }
-
- return num_bits;
- }
-
- bool FindHash(uint64_t hash) const override;
- void InsertHash(uint64_t hash) override;
- void WriteTo(ArrowOutputStream* sink) const override;
- uint32_t GetBitsetSize() const override { return num_bytes_; }
-
- uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
- uint64_t Hash(float value) const override { return hasher_->Hash(value); }
- uint64_t Hash(double value) const override { return hasher_->Hash(value); }
- uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
- uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
- uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
- uint64_t Hash(const FLBA* value, uint32_t len) const override {
- return hasher_->Hash(value, len);
- }
-
- /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
- /// a Bloom filter from a parquet file.
- ///
- /// @param input_stream The input stream from which to construct the Bloom filter
- /// @return The BlockSplitBloomFilter.
- static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
-
- private:
- // Bytes in a tiny Bloom filter block.
- static constexpr int kBytesPerFilterBlock = 32;
-
- // The number of bits to be set in each tiny Bloom filter
- static constexpr int kBitsSetPerBlock = 8;
-
- // A mask structure used to set bits in each tiny Bloom filter.
- struct BlockMask {
- uint32_t item[kBitsSetPerBlock];
- };
-
- // The block-based algorithm needs eight odd SALT values to calculate the eight
- // bit indexes to set, one bit in each 32-bit word.
- static constexpr uint32_t SALT[kBitsSetPerBlock] = {
- 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
- 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
-
- /// Set bits in the mask array according to the input key.
- /// @param key the value from which the mask values are calculated.
- /// @param mask the mask array used to set bits inside a block.
- void SetMask(uint32_t key, BlockMask& mask) const;
-
- // Memory pool to allocate aligned buffer for bitset
- ::arrow::MemoryPool* pool_;
-
- // The underlying buffer of bitset.
- std::shared_ptr<Buffer> data_;
-
- // The number of bytes of Bloom filter bitset.
- uint32_t num_bytes_;
-
- // Hash strategy used in this Bloom filter.
- HashStrategy hash_strategy_;
-
- // Algorithm used in this Bloom filter.
- Algorithm algorithm_;
-
- // Pointer to the actual hasher implementation used.
- std::unique_ptr<Hasher> hasher_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/logging.h"
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// A Bloom filter is a compact structure to indicate whether an item is not in a set or
+// probably in a set. The Bloom filter usually consists of a bit set that represents a
+// set of elements, a hash strategy and a Bloom filter algorithm.
+class PARQUET_EXPORT BloomFilter {
+ public:
+ // Maximum Bloom filter size; it is set to the HDFS default block size of 128 MB.
+ // This value will be reconsidered when implementing the Bloom filter producer.
+ static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
+
+ /// Determine whether an element exists in the set.
+ ///
+ /// @param hash the hash of the element to look up.
+ /// @return false if the value is definitely not in the set; true means the
+ /// value is PROBABLY in the set.
+ virtual bool FindHash(uint64_t hash) const = 0;
+
+ /// Insert an element into the set represented by the Bloom filter bitset.
+ /// @param hash the hash of the value to insert into the Bloom filter.
+ virtual void InsertHash(uint64_t hash) = 0;
+
+ /// Write this Bloom filter to an output stream. A serialized Bloom filter
+ /// structure includes the bitset length, hash strategy, algorithm, and bitset.
+ ///
+ /// @param sink the output stream to write to.
+ virtual void WriteTo(ArrowOutputStream* sink) const = 0;
+
+ /// Get the number of bytes in the bitset.
+ virtual uint32_t GetBitsetSize() const = 0;
+
+ /// Compute hash for a 32-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int32_t value) const = 0;
+
+ /// Compute hash for a 64-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int64_t value) const = 0;
+
+ /// Compute hash for float value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(float value) const = 0;
+
+ /// Compute hash for double value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(double value) const = 0;
+
+ /// Compute hash for Int96 value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const Int96* value) const = 0;
+
+ /// Compute hash for ByteArray value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+ /// Compute hash for fixed byte array value by using its plain encoding result.
+ ///
+ /// @param value the value address.
+ /// @param len the value length.
+ /// @return hash result.
+ virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+ virtual ~BloomFilter() {}
+
+ protected:
+ // Hash strategy available for Bloom filter.
+ enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
+
+ // Bloom filter algorithm.
+ enum class Algorithm : uint32_t { BLOCK = 0 };
+};
+
+// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
+// Putze et al.'s "Cache-, Hash- and Space-Efficient Bloom Filters". The basic idea
+// is to hash the item to a tiny Bloom filter whose size fits in a single cache
+// line or smaller.
+//
+// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
+// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
+class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
+ public:
+ /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as its hash function.
+ BlockSplitBloomFilter();
+
+ /// Initialize the BlockSplitBloomFilter. num_bytes should be within
+ /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes]; it is clamped to the
+ /// nearest bound if out of range, and rounded up to a power of 2.
+ ///
+ /// @param num_bytes The number of bytes to store Bloom filter bitset.
+ void Init(uint32_t num_bytes);
+
+ /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
+ /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
+ /// which may lead to segfault when performing SIMD instructions. It is the caller's
+ /// responsibility to free the bitset passed in. This is used when reconstructing
+ /// a Bloom filter from a parquet file.
+ ///
+ /// @param bitset The given bitset to initialize the Bloom filter.
+ /// @param num_bytes The number of bytes of given bitset.
+ void Init(const uint8_t* bitset, uint32_t num_bytes);
+
+ // Minimum Bloom filter size; it is set to 32 bytes to fit a single tiny Bloom filter.
+ static constexpr uint32_t kMinimumBloomFilterBytes = 32;
+
+ /// Calculate the optimal bitset size, in bits, for the given number of distinct
+ /// values and false positive probability.
+ ///
+ /// @param ndv The number of distinct values.
+ /// @param fpp The false positive probability.
+ /// @return a number of bits between kMinimumBloomFilterBytes << 3 and
+ /// kMaximumBloomFilterBytes << 3; the return value is always a power of 2.
+ static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
+ DCHECK(fpp > 0.0 && fpp < 1.0);
+ const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
+ uint32_t num_bits;
+
+ // Handle overflow.
+ if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
+ num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
+ } else {
+ num_bits = static_cast<uint32_t>(m);
+ }
+
+ // Clamp to the lower bound.
+ if (num_bits < kMinimumBloomFilterBytes << 3) {
+ num_bits = kMinimumBloomFilterBytes << 3;
+ }
+
+ // Round up to the next power of 2 if num_bits is not a power of 2.
+ if ((num_bits & (num_bits - 1)) != 0) {
+ num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
+ }
+
+ // Clamp to the upper bound.
+ if (num_bits > kMaximumBloomFilterBytes << 3) {
+ num_bits = kMaximumBloomFilterBytes << 3;
+ }
+
+ return num_bits;
+ }
+
+ bool FindHash(uint64_t hash) const override;
+ void InsertHash(uint64_t hash) override;
+ void WriteTo(ArrowOutputStream* sink) const override;
+ uint32_t GetBitsetSize() const override { return num_bytes_; }
+
+ uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(float value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(double value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const FLBA* value, uint32_t len) const override {
+ return hasher_->Hash(value, len);
+ }
+
+ /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
+ /// a Bloom filter from a parquet file.
+ ///
+ /// @param input_stream The input stream from which to construct the Bloom filter
+ /// @return The BlockSplitBloomFilter.
+ static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
+
+ private:
+ // Bytes in a tiny Bloom filter block.
+ static constexpr int kBytesPerFilterBlock = 32;
+
+ // The number of bits to be set in each tiny Bloom filter
+ static constexpr int kBitsSetPerBlock = 8;
+
+ // A mask structure used to set bits in each tiny Bloom filter.
+ struct BlockMask {
+ uint32_t item[kBitsSetPerBlock];
+ };
+
+ // The block-based algorithm needs eight odd SALT values to calculate the eight
+ // bit indexes to set, one bit in each 32-bit word.
+ static constexpr uint32_t SALT[kBitsSetPerBlock] = {
+ 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
+ 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
+
+ /// Set bits in the mask array according to the input key.
+ /// @param key the value from which the mask values are calculated.
+ /// @param mask the mask array used to set bits inside a block.
+ void SetMask(uint32_t key, BlockMask& mask) const;
+
+ // Memory pool to allocate aligned buffer for bitset
+ ::arrow::MemoryPool* pool_;
+
+ // The underlying buffer of bitset.
+ std::shared_ptr<Buffer> data_;
+
+ // The number of bytes of Bloom filter bitset.
+ uint32_t num_bytes_;
+
+ // Hash strategy used in this Bloom filter.
+ HashStrategy hash_strategy_;
+
+ // Algorithm used in this Bloom filter.
+ Algorithm algorithm_;
+
+ // Pointer to the actual hasher implementation used.
+ std::unique_ptr<Hasher> hasher_;
+};
+
+} // namespace parquet
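Taken together, the interface supports a simple build-then-probe workflow. A minimal usage sketch; ProbablyContains, the 1% false positive target, and the value set are illustrative:

    #include <cstdint>
    #include <vector>

    #include "parquet/bloom_filter.h"

    bool ProbablyContains(const std::vector<int64_t>& values, int64_t probe) {
      parquet::BlockSplitBloomFilter filter;
      // OptimalNumOfBits returns a size in bits; Init takes bytes.
      uint32_t num_bits = parquet::BlockSplitBloomFilter::OptimalNumOfBits(
          static_cast<uint32_t>(values.size()), /*fpp=*/0.01);
      filter.Init(num_bits / 8);
      for (int64_t v : values) {
        filter.InsertHash(filter.Hash(v));
      }
      // No false negatives; false positives occur at roughly the fpp rate.
      return filter.FindHash(filter.Hash(probe));
    }

Because Hash() works on the plain-encoded value, the same value always maps to the same eight bits, so a filter serialized with WriteTo and restored with Deserialize answers queries identically.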
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
index 2fab77ed01a..242f16b2e67 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
@@ -1,160 +1,160 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// This module defines an abstract interface for iterating through pages in a
-// Parquet column chunk within a row group. It could be extended in the future
-// to iterate through all data pages in all chunks in a file.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <string>
-
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-// TODO: Parallel processing is not yet safe because of memory-ownership
-// semantics (the PageReader may or may not own the memory referenced by a
-// page)
-//
-// TODO(wesm): In the future Parquet implementations may store the crc code
-// in format::PageHeader. parquet-mr currently does not, so we also skip it
-// here, both on the read and write path
-class Page {
- public:
- Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
- : buffer_(buffer), type_(type) {}
-
- PageType::type type() const { return type_; }
-
- std::shared_ptr<Buffer> buffer() const { return buffer_; }
-
- // @returns: a pointer to the page's data
- const uint8_t* data() const { return buffer_->data(); }
-
- // @returns: the total size in bytes of the page's data buffer
- int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
-
- private:
- std::shared_ptr<Buffer> buffer_;
- PageType::type type_;
-};
-
-/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
-class DataPage : public Page {
- public:
- int32_t num_values() const { return num_values_; }
- Encoding::type encoding() const { return encoding_; }
- int64_t uncompressed_size() const { return uncompressed_size_; }
- const EncodedStatistics& statistics() const { return statistics_; }
-
- virtual ~DataPage() = default;
-
- protected:
- DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
- Encoding::type encoding, int64_t uncompressed_size,
- const EncodedStatistics& statistics = EncodedStatistics())
- : Page(buffer, type),
- num_values_(num_values),
- encoding_(encoding),
- uncompressed_size_(uncompressed_size),
- statistics_(statistics) {}
-
- int32_t num_values_;
- Encoding::type encoding_;
- int64_t uncompressed_size_;
- EncodedStatistics statistics_;
-};
-
-class DataPageV1 : public DataPage {
- public:
- DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
- Encoding::type encoding, Encoding::type definition_level_encoding,
- Encoding::type repetition_level_encoding, int64_t uncompressed_size,
- const EncodedStatistics& statistics = EncodedStatistics())
- : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
- statistics),
- definition_level_encoding_(definition_level_encoding),
- repetition_level_encoding_(repetition_level_encoding) {}
-
- Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
-
- Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
-
- private:
- Encoding::type definition_level_encoding_;
- Encoding::type repetition_level_encoding_;
-};
-
-class DataPageV2 : public DataPage {
- public:
- DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
- int32_t num_rows, Encoding::type encoding,
- int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
- int64_t uncompressed_size, bool is_compressed = false,
- const EncodedStatistics& statistics = EncodedStatistics())
- : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
- statistics),
- num_nulls_(num_nulls),
- num_rows_(num_rows),
- definition_levels_byte_length_(definition_levels_byte_length),
- repetition_levels_byte_length_(repetition_levels_byte_length),
- is_compressed_(is_compressed) {}
-
- int32_t num_nulls() const { return num_nulls_; }
-
- int32_t num_rows() const { return num_rows_; }
-
- int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
-
- int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
-
- bool is_compressed() const { return is_compressed_; }
-
- private:
- int32_t num_nulls_;
- int32_t num_rows_;
- int32_t definition_levels_byte_length_;
- int32_t repetition_levels_byte_length_;
- bool is_compressed_;
-};
-
-class DictionaryPage : public Page {
- public:
- DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
- Encoding::type encoding, bool is_sorted = false)
- : Page(buffer, PageType::DICTIONARY_PAGE),
- num_values_(num_values),
- encoding_(encoding),
- is_sorted_(is_sorted) {}
-
- int32_t num_values() const { return num_values_; }
-
- Encoding::type encoding() const { return encoding_; }
-
- bool is_sorted() const { return is_sorted_; }
-
- private:
- int32_t num_values_;
- Encoding::type encoding_;
- bool is_sorted_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// TODO: Parallel processing is not yet safe because of memory-ownership
+// semantics (the PageReader may or may not own the memory referenced by a
+// page)
+//
+// TODO(wesm): In the future Parquet implementations may store the crc code
+// in format::PageHeader. parquet-mr currently does not, so we also skip it
+// here, both on the read and write path
+class Page {
+ public:
+ Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
+ : buffer_(buffer), type_(type) {}
+
+ PageType::type type() const { return type_; }
+
+ std::shared_ptr<Buffer> buffer() const { return buffer_; }
+
+ // @returns: a pointer to the page's data
+ const uint8_t* data() const { return buffer_->data(); }
+
+ // @returns: the total size in bytes of the page's data buffer
+ int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
+
+ private:
+ std::shared_ptr<Buffer> buffer_;
+ PageType::type type_;
+};
+
+/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
+class DataPage : public Page {
+ public:
+ int32_t num_values() const { return num_values_; }
+ Encoding::type encoding() const { return encoding_; }
+ int64_t uncompressed_size() const { return uncompressed_size_; }
+ const EncodedStatistics& statistics() const { return statistics_; }
+
+ virtual ~DataPage() = default;
+
+ protected:
+ DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, int64_t uncompressed_size,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : Page(buffer, type),
+ num_values_(num_values),
+ encoding_(encoding),
+ uncompressed_size_(uncompressed_size),
+ statistics_(statistics) {}
+
+ int32_t num_values_;
+ Encoding::type encoding_;
+ int64_t uncompressed_size_;
+ EncodedStatistics statistics_;
+};
+
+class DataPageV1 : public DataPage {
+ public:
+ DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, Encoding::type definition_level_encoding,
+ Encoding::type repetition_level_encoding, int64_t uncompressed_size,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
+ statistics),
+ definition_level_encoding_(definition_level_encoding),
+ repetition_level_encoding_(repetition_level_encoding) {}
+
+ Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
+
+ Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
+
+ private:
+ Encoding::type definition_level_encoding_;
+ Encoding::type repetition_level_encoding_;
+};
+
+class DataPageV2 : public DataPage {
+ public:
+ DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
+ int32_t num_rows, Encoding::type encoding,
+ int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
+ int64_t uncompressed_size, bool is_compressed = false,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
+ statistics),
+ num_nulls_(num_nulls),
+ num_rows_(num_rows),
+ definition_levels_byte_length_(definition_levels_byte_length),
+ repetition_levels_byte_length_(repetition_levels_byte_length),
+ is_compressed_(is_compressed) {}
+
+ int32_t num_nulls() const { return num_nulls_; }
+
+ int32_t num_rows() const { return num_rows_; }
+
+ int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
+
+ int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
+
+ bool is_compressed() const { return is_compressed_; }
+
+ private:
+ int32_t num_nulls_;
+ int32_t num_rows_;
+ int32_t definition_levels_byte_length_;
+ int32_t repetition_levels_byte_length_;
+ bool is_compressed_;
+};
+
+class DictionaryPage : public Page {
+ public:
+ DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, bool is_sorted = false)
+ : Page(buffer, PageType::DICTIONARY_PAGE),
+ num_values_(num_values),
+ encoding_(encoding),
+ is_sorted_(is_sorted) {}
+
+ int32_t num_values() const { return num_values_; }
+
+ Encoding::type encoding() const { return encoding_; }
+
+ bool is_sorted() const { return is_sorted_; }
+
+ private:
+ int32_t num_values_;
+ Encoding::type encoding_;
+ bool is_sorted_;
+};
+
+} // namespace parquet
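Pages of these types are produced through the PageReader interface, whose serialized implementation follows in column_reader.cc. A sketch of a typical consumption loop, assuming `stream`, `num_rows`, and `codec` come from the column-chunk metadata; CountDataPageValues is an illustrative name:

    #include <cstdint>
    #include <memory>

    #include "arrow/memory_pool.h"
    #include "parquet/column_page.h"
    #include "parquet/column_reader.h"

    int64_t CountDataPageValues(std::shared_ptr<parquet::ArrowInputStream> stream,
                                int64_t num_rows, parquet::Compression::type codec) {
      auto pager = parquet::PageReader::Open(std::move(stream), num_rows, codec,
                                             ::arrow::default_memory_pool(),
                                             /*ctx=*/nullptr);
      int64_t total = 0;
      std::shared_ptr<parquet::Page> page;
      // NextPage() returns nullptr once all rows have been seen; pages of
      // unknown type are skipped internally.
      while ((page = pager->NextPage()) != nullptr) {
        if (page->type() == parquet::PageType::DATA_PAGE ||
            page->type() == parquet::PageType::DATA_PAGE_V2) {
          total += static_cast<const parquet::DataPage&>(*page).num_values();
        }
      }
      return total;
    }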
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
index 047d99fed9a..713205e98dd 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
@@ -1,1802 +1,1802 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/column_reader.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <exception>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/array/builder_binary.h"
-#include "arrow/array/builder_dict.h"
-#include "arrow/array/builder_primitive.h"
-#include "arrow/chunked_array.h"
-#include "arrow/type.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/compression.h"
-#include "arrow/util/int_util_internal.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/rle_encoding.h"
-#include "parquet/column_page.h"
-#include "parquet/encoding.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/level_comparison.h"
-#include "parquet/level_conversion.h"
-#include "parquet/properties.h"
-#include "parquet/statistics.h"
-#include "parquet/thrift_internal.h" // IWYU pragma: keep
-// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
-#include "parquet/windows_compatibility.h"
-
-using arrow::MemoryPool;
-using arrow::internal::AddWithOverflow;
-using arrow::internal::checked_cast;
-using arrow::internal::MultiplyWithOverflow;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-namespace {
-inline bool HasSpacedValues(const ColumnDescriptor* descr) {
- if (descr->max_repetition_level() > 0) {
- // repeated+flat case
- return !descr->schema_node()->is_required();
- } else {
- // non-repeated+nested case
- // Find if a node forces nulls in the lowest level along the hierarchy
- const schema::Node* node = descr->schema_node().get();
- while (node) {
- if (node->is_optional()) {
- return true;
- }
- node = node->parent();
- }
- return false;
- }
-}
-} // namespace
-
-LevelDecoder::LevelDecoder() : num_values_remaining_(0) {}
-
-LevelDecoder::~LevelDecoder() {}
-
-int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
- int num_buffered_values, const uint8_t* data,
- int32_t data_size) {
- max_level_ = max_level;
- int32_t num_bytes = 0;
- encoding_ = encoding;
- num_values_remaining_ = num_buffered_values;
- bit_width_ = BitUtil::Log2(max_level + 1);
- switch (encoding) {
- case Encoding::RLE: {
- if (data_size < 4) {
- throw ParquetException("Received invalid levels (corrupt data page?)");
- }
- num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
- if (num_bytes < 0 || num_bytes > data_size - 4) {
- throw ParquetException("Received invalid number of bytes (corrupt data page?)");
- }
- const uint8_t* decoder_data = data + 4;
- if (!rle_decoder_) {
- rle_decoder_.reset(
- new ::arrow::util::RleDecoder(decoder_data, num_bytes, bit_width_));
- } else {
- rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
- }
- return 4 + num_bytes;
- }
- case Encoding::BIT_PACKED: {
- int num_bits = 0;
- if (MultiplyWithOverflow(num_buffered_values, bit_width_, &num_bits)) {
- throw ParquetException(
- "Number of buffered values too large (corrupt data page?)");
- }
- num_bytes = static_cast<int32_t>(BitUtil::BytesForBits(num_bits));
- if (num_bytes < 0 || num_bytes > data_size - 4) {
- throw ParquetException("Received invalid number of bytes (corrupt data page?)");
- }
- if (!bit_packed_decoder_) {
- bit_packed_decoder_.reset(new ::arrow::BitUtil::BitReader(data, num_bytes));
- } else {
- bit_packed_decoder_->Reset(data, num_bytes);
- }
- return num_bytes;
- }
- default:
- throw ParquetException("Unknown encoding type for levels.");
- }
- return -1;
-}
-
-void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level,
- int num_buffered_values, const uint8_t* data) {
- max_level_ = max_level;
- // Repetition and definition levels always uses RLE encoding
- // in the DataPageV2 format.
- if (num_bytes < 0) {
- throw ParquetException("Invalid page header (corrupt data page?)");
- }
- encoding_ = Encoding::RLE;
- num_values_remaining_ = num_buffered_values;
- bit_width_ = BitUtil::Log2(max_level + 1);
-
- if (!rle_decoder_) {
- rle_decoder_.reset(new ::arrow::util::RleDecoder(data, num_bytes, bit_width_));
- } else {
- rle_decoder_->Reset(data, num_bytes, bit_width_);
- }
-}
-
-int LevelDecoder::Decode(int batch_size, int16_t* levels) {
- int num_decoded = 0;
-
- int num_values = std::min(num_values_remaining_, batch_size);
- if (encoding_ == Encoding::RLE) {
- num_decoded = rle_decoder_->GetBatch(levels, num_values);
- } else {
- num_decoded = bit_packed_decoder_->GetBatch(bit_width_, levels, num_values);
- }
- if (num_decoded > 0) {
- internal::MinMax min_max = internal::FindMinMax(levels, num_decoded);
- if (ARROW_PREDICT_FALSE(min_max.min < 0 || min_max.max > max_level_)) {
- std::stringstream ss;
- ss << "Malformed levels. min: " << min_max.min << " max: " << min_max.max
- << " out of range. Max Level: " << max_level_;
- throw ParquetException(ss.str());
- }
- }
- num_values_remaining_ -= num_decoded;
- return num_decoded;
-}
-
-ReaderProperties default_reader_properties() {
- static ReaderProperties default_reader_properties;
- return default_reader_properties;
-}
-
-namespace {
-
-// Extracts encoded statistics from V1 and V2 data page headers
-template <typename H>
-EncodedStatistics ExtractStatsFromHeader(const H& header) {
- EncodedStatistics page_statistics;
- if (!header.__isset.statistics) {
- return page_statistics;
- }
- const format::Statistics& stats = header.statistics;
- if (stats.__isset.max) {
- page_statistics.set_max(stats.max);
- }
- if (stats.__isset.min) {
- page_statistics.set_min(stats.min);
- }
- if (stats.__isset.null_count) {
- page_statistics.set_null_count(stats.null_count);
- }
- if (stats.__isset.distinct_count) {
- page_statistics.set_distinct_count(stats.distinct_count);
- }
- return page_statistics;
-}
-
-// ----------------------------------------------------------------------
-// SerializedPageReader deserializes Thrift metadata and pages that have been
-// assembled in a serialized stream for storage in a Parquet file
-
-// This subclass delimits pages appearing in a serialized stream, each preceded
-// by a serialized Thrift format::PageHeader indicating the type of each page
-// and the page metadata.
-class SerializedPageReader : public PageReader {
- public:
- SerializedPageReader(std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
- Compression::type codec, ::arrow::MemoryPool* pool,
- const CryptoContext* crypto_ctx)
- : stream_(std::move(stream)),
- decompression_buffer_(AllocateBuffer(pool, 0)),
- page_ordinal_(0),
- seen_num_rows_(0),
- total_num_rows_(total_num_rows),
- decryption_buffer_(AllocateBuffer(pool, 0)) {
- if (crypto_ctx != nullptr) {
- crypto_ctx_ = *crypto_ctx;
- InitDecryption();
- }
- max_page_header_size_ = kDefaultMaxPageHeaderSize;
- decompressor_ = GetCodec(codec);
- }
-
- // Implement the PageReader interface
- std::shared_ptr<Page> NextPage() override;
-
- void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; }
-
- private:
- void UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor, int8_t module_type,
- const std::string& page_aad);
-
- void InitDecryption();
-
- std::shared_ptr<Buffer> DecompressIfNeeded(std::shared_ptr<Buffer> page_buffer,
- int compressed_len, int uncompressed_len,
- int levels_byte_len = 0);
-
- std::shared_ptr<ArrowInputStream> stream_;
-
- format::PageHeader current_page_header_;
- std::shared_ptr<Page> current_page_;
-
- // Compression codec to use.
- std::unique_ptr<::arrow::util::Codec> decompressor_;
- std::shared_ptr<ResizableBuffer> decompression_buffer_;
-
- // The fields below are used for calculation of AAD (additional authenticated data)
- // suffix which is part of the Parquet Modular Encryption.
- // The AAD suffix for a parquet module is built internally by
- // concatenating different parts some of which include
- // the row group ordinal, column ordinal and page ordinal.
- // Please refer to the encryption specification for more details:
- // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data
-
- // The ordinal fields in the context below are used for AAD suffix calculation.
- CryptoContext crypto_ctx_;
- int16_t page_ordinal_; // page ordinal does not count the dictionary page
-
- // Maximum allowed page size
- uint32_t max_page_header_size_;
-
- // Number of rows read in data pages so far
- int64_t seen_num_rows_;
-
- // Number of rows in all the data pages
- int64_t total_num_rows_;
-
- // data_page_aad_ and data_page_header_aad_ contain the AAD for data page and data page
- // header in a single column respectively.
- // While calculating AAD for different pages in a single column the pages AAD is
- // updated by only the page ordinal.
- std::string data_page_aad_;
- std::string data_page_header_aad_;
- // Encryption
- std::shared_ptr<ResizableBuffer> decryption_buffer_;
-};
-
-void SerializedPageReader::InitDecryption() {
- // Prepare the AAD for quick update later.
- if (crypto_ctx_.data_decryptor != nullptr) {
- DCHECK(!crypto_ctx_.data_decryptor->file_aad().empty());
- data_page_aad_ = encryption::CreateModuleAad(
- crypto_ctx_.data_decryptor->file_aad(), encryption::kDataPage,
- crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
- }
- if (crypto_ctx_.meta_decryptor != nullptr) {
- DCHECK(!crypto_ctx_.meta_decryptor->file_aad().empty());
- data_page_header_aad_ = encryption::CreateModuleAad(
- crypto_ctx_.meta_decryptor->file_aad(), encryption::kDataPageHeader,
- crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
- }
-}
-
-void SerializedPageReader::UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor,
- int8_t module_type,
- const std::string& page_aad) {
- DCHECK(decryptor != nullptr);
- if (crypto_ctx_.start_decrypt_with_dictionary_page) {
- std::string aad = encryption::CreateModuleAad(
- decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal,
- crypto_ctx_.column_ordinal, kNonPageOrdinal);
- decryptor->UpdateAad(aad);
- } else {
- encryption::QuickUpdatePageAad(page_aad, page_ordinal_);
- decryptor->UpdateAad(page_aad);
- }
-}
-
-std::shared_ptr<Page> SerializedPageReader::NextPage() {
- // Loop here because there may be unhandled page types that we skip until
- // finding a page that we do know what to do with
-
- while (seen_num_rows_ < total_num_rows_) {
- uint32_t header_size = 0;
- uint32_t allowed_page_size = kDefaultPageHeaderSize;
-
- // Page headers can be very large because of page statistics
- // We try to deserialize a larger buffer progressively
- // until a maximum allowed header limit
- while (true) {
- PARQUET_ASSIGN_OR_THROW(auto view, stream_->Peek(allowed_page_size));
- if (view.size() == 0) {
- return std::shared_ptr<Page>(nullptr);
- }
-
- // This gets used, then set by DeserializeThriftMsg
- header_size = static_cast<uint32_t>(view.size());
- try {
- if (crypto_ctx_.meta_decryptor != nullptr) {
- UpdateDecryption(crypto_ctx_.meta_decryptor, encryption::kDictionaryPageHeader,
- data_page_header_aad_);
- }
- DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(view.data()), &header_size,
- &current_page_header_, crypto_ctx_.meta_decryptor);
- break;
- } catch (std::exception& e) {
- // Failed to deserialize. Double the allowed page header size and try again
- std::stringstream ss;
- ss << e.what();
- allowed_page_size *= 2;
- if (allowed_page_size > max_page_header_size_) {
- ss << "Deserializing page header failed.\n";
- throw ParquetException(ss.str());
- }
- }
- }
- // Advance the stream offset
- PARQUET_THROW_NOT_OK(stream_->Advance(header_size));
-
- int compressed_len = current_page_header_.compressed_page_size;
- int uncompressed_len = current_page_header_.uncompressed_page_size;
- if (compressed_len < 0 || uncompressed_len < 0) {
- throw ParquetException("Invalid page header");
- }
-
- if (crypto_ctx_.data_decryptor != nullptr) {
- UpdateDecryption(crypto_ctx_.data_decryptor, encryption::kDictionaryPage,
- data_page_aad_);
- }
-
- // Read the compressed data page.
- PARQUET_ASSIGN_OR_THROW(auto page_buffer, stream_->Read(compressed_len));
- if (page_buffer->size() != compressed_len) {
- std::stringstream ss;
- ss << "Page was smaller (" << page_buffer->size() << ") than expected ("
- << compressed_len << ")";
- ParquetException::EofException(ss.str());
- }
-
- // Decrypt it if we need to
- if (crypto_ctx_.data_decryptor != nullptr) {
- PARQUET_THROW_NOT_OK(decryption_buffer_->Resize(
- compressed_len - crypto_ctx_.data_decryptor->CiphertextSizeDelta(), false));
- compressed_len = crypto_ctx_.data_decryptor->Decrypt(
- page_buffer->data(), compressed_len, decryption_buffer_->mutable_data());
-
- page_buffer = decryption_buffer_;
- }
-
- const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
-
- if (page_type == PageType::DICTIONARY_PAGE) {
- crypto_ctx_.start_decrypt_with_dictionary_page = false;
- const format::DictionaryPageHeader& dict_header =
- current_page_header_.dictionary_page_header;
-
- bool is_sorted = dict_header.__isset.is_sorted ? dict_header.is_sorted : false;
- if (dict_header.num_values < 0) {
- throw ParquetException("Invalid page header (negative number of values)");
- }
-
- // Uncompress if needed
- page_buffer =
- DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
-
- return std::make_shared<DictionaryPage>(page_buffer, dict_header.num_values,
- LoadEnumSafe(&dict_header.encoding),
- is_sorted);
- } else if (page_type == PageType::DATA_PAGE) {
- ++page_ordinal_;
- const format::DataPageHeader& header = current_page_header_.data_page_header;
-
- if (header.num_values < 0) {
- throw ParquetException("Invalid page header (negative number of values)");
- }
- EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
- seen_num_rows_ += header.num_values;
-
- // Uncompress if needed
- page_buffer =
- DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
-
- return std::make_shared<DataPageV1>(page_buffer, header.num_values,
- LoadEnumSafe(&header.encoding),
- LoadEnumSafe(&header.definition_level_encoding),
- LoadEnumSafe(&header.repetition_level_encoding),
- uncompressed_len, page_statistics);
- } else if (page_type == PageType::DATA_PAGE_V2) {
- ++page_ordinal_;
- const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
-
- if (header.num_values < 0) {
- throw ParquetException("Invalid page header (negative number of values)");
- }
- if (header.definition_levels_byte_length < 0 ||
- header.repetition_levels_byte_length < 0) {
- throw ParquetException("Invalid page header (negative levels byte length)");
- }
- bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false;
- EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
- seen_num_rows_ += header.num_values;
-
- // Uncompress if needed
- int levels_byte_len;
- if (AddWithOverflow(header.definition_levels_byte_length,
- header.repetition_levels_byte_length, &levels_byte_len)) {
- throw ParquetException("Levels size too large (corrupt file?)");
- }
- // DecompressIfNeeded doesn't take `is_compressed` into account as
- // it's page type-agnostic.
- if (is_compressed) {
- page_buffer = DecompressIfNeeded(std::move(page_buffer), compressed_len,
- uncompressed_len, levels_byte_len);
- }
-
- return std::make_shared<DataPageV2>(
- page_buffer, header.num_values, header.num_nulls, header.num_rows,
- LoadEnumSafe(&header.encoding), header.definition_levels_byte_length,
- header.repetition_levels_byte_length, uncompressed_len, is_compressed,
- page_statistics);
- } else {
- // We don't know what this page type is. We're allowed to skip non-data
- // pages.
- continue;
- }
- }
- return std::shared_ptr<Page>(nullptr);
-}
-
-std::shared_ptr<Buffer> SerializedPageReader::DecompressIfNeeded(
- std::shared_ptr<Buffer> page_buffer, int compressed_len, int uncompressed_len,
- int levels_byte_len) {
- if (decompressor_ == nullptr) {
- return page_buffer;
- }
- if (compressed_len < levels_byte_len || uncompressed_len < levels_byte_len) {
- throw ParquetException("Invalid page header");
- }
-
- // Grow the uncompressed buffer if we need to.
- if (uncompressed_len > static_cast<int>(decompression_buffer_->size())) {
- PARQUET_THROW_NOT_OK(decompression_buffer_->Resize(uncompressed_len, false));
- }
-
- if (levels_byte_len > 0) {
- // First copy the levels as-is
- uint8_t* decompressed = decompression_buffer_->mutable_data();
- memcpy(decompressed, page_buffer->data(), levels_byte_len);
- }
-
- // Decompress the values
- PARQUET_THROW_NOT_OK(decompressor_->Decompress(
- compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
- uncompressed_len - levels_byte_len,
- decompression_buffer_->mutable_data() + levels_byte_len));
-
- return decompression_buffer_;
-}
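-
-// A standalone sketch of the split performed above (codec, in, out and the
-// length variables are assumptions): DataPageV2 level bytes are stored
-// uncompressed in front of the values, so they are copied verbatim and only
-// the trailing values segment is decompressed:
-//
-//   std::memcpy(out, in, levels_byte_len);
-//   PARQUET_THROW_NOT_OK(codec->Decompress(
-//       compressed_len - levels_byte_len, in + levels_byte_len,
-//       uncompressed_len - levels_byte_len, out + levels_byte_len));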
-
-} // namespace
-
-std::unique_ptr<PageReader> PageReader::Open(std::shared_ptr<ArrowInputStream> stream,
- int64_t total_num_rows,
- Compression::type codec,
- ::arrow::MemoryPool* pool,
- const CryptoContext* ctx) {
- return std::unique_ptr<PageReader>(
- new SerializedPageReader(std::move(stream), total_num_rows, codec, pool, ctx));
-}
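-
-// A minimal usage sketch for the factory above (stream, num_rows and the
-// codec are assumptions; error handling elided):
-//
-//   auto pager = PageReader::Open(stream, /*total_num_rows=*/num_rows,
-//                                 Compression::SNAPPY,
-//                                 ::arrow::default_memory_pool());
-//   while (std::shared_ptr<Page> page = pager->NextPage()) {
-//     // Dictionary pages, if any, arrive before the data pages that use them.
-//   }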
-
-namespace {
-
-// ----------------------------------------------------------------------
-// Impl base class for TypedColumnReader and RecordReader
-
-// PLAIN_DICTIONARY is deprecated, but it was historically used as a
-// dictionary index encoding.
-static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
- return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
-}
-
-template <typename DType>
-class ColumnReaderImplBase {
- public:
- using T = typename DType::c_type;
-
- ColumnReaderImplBase(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
- : descr_(descr),
- max_def_level_(descr->max_definition_level()),
- max_rep_level_(descr->max_repetition_level()),
- num_buffered_values_(0),
- num_decoded_values_(0),
- pool_(pool),
- current_decoder_(nullptr),
- current_encoding_(Encoding::UNKNOWN) {}
-
- virtual ~ColumnReaderImplBase() = default;
-
- protected:
- // Read up to batch_size values from the current data page into the
- // pre-allocated memory T*
- //
- // @returns: the number of values read into the out buffer
- int64_t ReadValues(int64_t batch_size, T* out) {
- int64_t num_decoded = current_decoder_->Decode(out, static_cast<int>(batch_size));
- return num_decoded;
- }
-
- // Read up to batch_size values from the current data page into the
- // pre-allocated memory T*, leaving spaces for null entries according
- // to the def_levels.
- //
- // @returns: the number of values read into the out buffer
- int64_t ReadValuesSpaced(int64_t batch_size, T* out, int64_t null_count,
- uint8_t* valid_bits, int64_t valid_bits_offset) {
- return current_decoder_->DecodeSpaced(out, static_cast<int>(batch_size),
- static_cast<int>(null_count), valid_bits,
- valid_bits_offset);
- }
-
- // Read multiple definition levels into preallocated memory
- //
- // Returns the number of decoded definition levels
- int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels) {
- if (max_def_level_ == 0) {
- return 0;
- }
- return definition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
- }
-
- bool HasNextInternal() {
- // Either there is no data page available yet, or the data page has been
- // exhausted
- if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) {
- if (!ReadNewPage() || num_buffered_values_ == 0) {
- return false;
- }
- }
- return true;
- }
-
- // Read multiple repetition levels into preallocated memory
- // Returns the number of decoded repetition levels
- int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels) {
- if (max_rep_level_ == 0) {
- return 0;
- }
- return repetition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
- }
-
- // Advance to the next data page
- bool ReadNewPage() {
- // Loop until we find the next data page.
- while (true) {
- current_page_ = pager_->NextPage();
- if (!current_page_) {
- // EOS
- return false;
- }
-
- if (current_page_->type() == PageType::DICTIONARY_PAGE) {
- ConfigureDictionary(static_cast<const DictionaryPage*>(current_page_.get()));
- continue;
- } else if (current_page_->type() == PageType::DATA_PAGE) {
- const auto page = std::static_pointer_cast<DataPageV1>(current_page_);
- const int64_t levels_byte_size = InitializeLevelDecoders(
- *page, page->repetition_level_encoding(), page->definition_level_encoding());
- InitializeDataDecoder(*page, levels_byte_size);
- return true;
- } else if (current_page_->type() == PageType::DATA_PAGE_V2) {
- const auto page = std::static_pointer_cast<DataPageV2>(current_page_);
- int64_t levels_byte_size = InitializeLevelDecodersV2(*page);
- InitializeDataDecoder(*page, levels_byte_size);
- return true;
- } else {
- // We don't know what this page type is. We're allowed to skip non-data
- // pages.
- continue;
- }
- }
- return true;
- }
-
- void ConfigureDictionary(const DictionaryPage* page) {
- int encoding = static_cast<int>(page->encoding());
- if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
- page->encoding() == Encoding::PLAIN) {
- encoding = static_cast<int>(Encoding::RLE_DICTIONARY);
- }
-
- auto it = decoders_.find(encoding);
- if (it != decoders_.end()) {
- throw ParquetException("Column cannot have more than one dictionary.");
- }
-
- if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
- page->encoding() == Encoding::PLAIN) {
- auto dictionary = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
- dictionary->SetData(page->num_values(), page->data(), page->size());
-
- // The dictionary is fully decoded during DictionaryDecoder::Init, so the
- // DictionaryPage buffer is no longer required after this step
- //
- // TODO(wesm): investigate whether this all-or-nothing decoding of the
- // dictionary makes sense and whether performance can be improved
-
- std::unique_ptr<DictDecoder<DType>> decoder = MakeDictDecoder<DType>(descr_, pool_);
- decoder->SetDict(dictionary.get());
- decoders_[encoding] =
- std::unique_ptr<DecoderType>(dynamic_cast<DecoderType*>(decoder.release()));
- } else {
- ParquetException::NYI("only plain dictionary encoding has been implemented");
- }
-
- new_dictionary_ = true;
- current_decoder_ = decoders_[encoding].get();
- DCHECK(current_decoder_);
- }
-
- // Initialize repetition and definition level decoders on the next data page.
- // If the data page includes repetition and definition levels, we initialize
- // the level decoders and return the number of encoded level bytes; this is
- // later used to compute the size of the encoded values that follow.
- int64_t InitializeLevelDecoders(const DataPage& page,
- Encoding::type repetition_level_encoding,
- Encoding::type definition_level_encoding) {
- // Read a data page.
- num_buffered_values_ = page.num_values();
-
- // Have not decoded any values from the data page yet
- num_decoded_values_ = 0;
-
- const uint8_t* buffer = page.data();
- int32_t levels_byte_size = 0;
- int32_t max_size = page.size();
-
- // Data page layout: repetition levels - definition levels - encoded values.
- // Levels are encoded as RLE or bit-packed.
- // Init repetition levels
- if (max_rep_level_ > 0) {
- int32_t rep_levels_bytes = repetition_level_decoder_.SetData(
- repetition_level_encoding, max_rep_level_,
- static_cast<int>(num_buffered_values_), buffer, max_size);
- buffer += rep_levels_bytes;
- levels_byte_size += rep_levels_bytes;
- max_size -= rep_levels_bytes;
- }
- // TODO: figure out a way to set max_def_level_ to 0
- // if the initial value is invalid
-
- // Init definition levels
- if (max_def_level_ > 0) {
- int32_t def_levels_bytes = definition_level_decoder_.SetData(
- definition_level_encoding, max_def_level_,
- static_cast<int>(num_buffered_values_), buffer, max_size);
- levels_byte_size += def_levels_bytes;
- max_size -= def_levels_bytes;
- }
-
- return levels_byte_size;
- }
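-
- // Worked example of the V1 layout consumed above (segment sizes are
- // illustrative):
- //
- //   page.data(): [ RLE rep levels ][ RLE def levels ][ encoded values ]
- //
- // The returned levels_byte_size covers both level segments, so the value
- // decoder later starts reading at page.data() + levels_byte_size.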
-
- int64_t InitializeLevelDecodersV2(const DataPageV2& page) {
- // Read a data page.
- num_buffered_values_ = page.num_values();
-
- // Have not decoded any values from the data page yet
- num_decoded_values_ = 0;
- const uint8_t* buffer = page.data();
-
- const int64_t total_levels_length =
- static_cast<int64_t>(page.repetition_levels_byte_length()) +
- page.definition_levels_byte_length();
-
- if (total_levels_length > page.size()) {
- throw ParquetException("Data page too small for levels (corrupt header?)");
- }
-
- if (max_rep_level_ > 0) {
- repetition_level_decoder_.SetDataV2(page.repetition_levels_byte_length(),
- max_rep_level_,
- static_cast<int>(num_buffered_values_), buffer);
- buffer += page.repetition_levels_byte_length();
- }
-
- if (max_def_level_ > 0) {
- definition_level_decoder_.SetDataV2(page.definition_levels_byte_length(),
- max_def_level_,
- static_cast<int>(num_buffered_values_), buffer);
- }
-
- return total_levels_length;
- }
-
- // Get a decoder object for this page or create a new decoder if this is the
- // first page with this encoding.
- void InitializeDataDecoder(const DataPage& page, int64_t levels_byte_size) {
- const uint8_t* buffer = page.data() + levels_byte_size;
- const int64_t data_size = page.size() - levels_byte_size;
-
- if (data_size < 0) {
- throw ParquetException("Page smaller than size of encoded levels");
- }
-
- Encoding::type encoding = page.encoding();
-
- if (IsDictionaryIndexEncoding(encoding)) {
- encoding = Encoding::RLE_DICTIONARY;
- }
-
- auto it = decoders_.find(static_cast<int>(encoding));
- if (it != decoders_.end()) {
- DCHECK(it->second.get() != nullptr);
- if (encoding == Encoding::RLE_DICTIONARY) {
- DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY);
- }
- current_decoder_ = it->second.get();
- } else {
- switch (encoding) {
- case Encoding::PLAIN: {
- auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
- current_decoder_ = decoder.get();
- decoders_[static_cast<int>(encoding)] = std::move(decoder);
- break;
- }
- case Encoding::BYTE_STREAM_SPLIT: {
- auto decoder = MakeTypedDecoder<DType>(Encoding::BYTE_STREAM_SPLIT, descr_);
- current_decoder_ = decoder.get();
- decoders_[static_cast<int>(encoding)] = std::move(decoder);
- break;
- }
- case Encoding::RLE_DICTIONARY:
- throw ParquetException("Dictionary page must be before data page.");
-
- case Encoding::DELTA_BINARY_PACKED:
- case Encoding::DELTA_LENGTH_BYTE_ARRAY:
- case Encoding::DELTA_BYTE_ARRAY:
- ParquetException::NYI("Unsupported encoding");
-
- default:
- throw ParquetException("Unknown encoding type.");
- }
- }
- current_encoding_ = encoding;
- current_decoder_->SetData(static_cast<int>(num_buffered_values_), buffer,
- static_cast<int>(data_size));
- }
-
- const ColumnDescriptor* descr_;
- const int16_t max_def_level_;
- const int16_t max_rep_level_;
-
- std::unique_ptr<PageReader> pager_;
- std::shared_ptr<Page> current_page_;
-
- // Not set if the full schema for this field has no optional or repeated
- // elements.
- LevelDecoder definition_level_decoder_;
-
- // Not set for flat schemas.
- LevelDecoder repetition_level_decoder_;
-
- // The total number of values stored in the data page. This is the maximum of
- // the number of encoded definition levels or encoded values. For
- // non-repeated, required columns, this is equal to the number of encoded
- // values. For repeated or optional values, there may be fewer data values
- // than levels, and this tells you how many encoded levels there are in that
- // case.
- int64_t num_buffered_values_;
-
- // The number of values from the current data page that have been decoded
- // into memory
- int64_t num_decoded_values_;
-
- ::arrow::MemoryPool* pool_;
-
- using DecoderType = TypedDecoder<DType>;
- DecoderType* current_decoder_;
- Encoding::type current_encoding_;
-
- /// Flag to signal when a new dictionary has been set, for the benefit of
- /// DictionaryRecordReader
- bool new_dictionary_;
-
- // The exposed encoding
- ExposedEncoding exposed_encoding_ = ExposedEncoding::NO_ENCODING;
-
- // Map of encoding type to the respective decoder object. For example, a
- // column chunk's data pages may include both dictionary-encoded and
- // plain-encoded data.
- std::unordered_map<int, std::unique_ptr<DecoderType>> decoders_;
-
- void ConsumeBufferedValues(int64_t num_values) { num_decoded_values_ += num_values; }
-};
-
-// ----------------------------------------------------------------------
-// TypedColumnReader implementations
-
-template <typename DType>
-class TypedColumnReaderImpl : public TypedColumnReader<DType>,
- public ColumnReaderImplBase<DType> {
- public:
- using T = typename DType::c_type;
-
- TypedColumnReaderImpl(const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
- ::arrow::MemoryPool* pool)
- : ColumnReaderImplBase<DType>(descr, pool) {
- this->pager_ = std::move(pager);
- }
-
- bool HasNext() override { return this->HasNextInternal(); }
-
- int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- T* values, int64_t* values_read) override;
-
- int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- T* values, uint8_t* valid_bits, int64_t valid_bits_offset,
- int64_t* levels_read, int64_t* values_read,
- int64_t* null_count) override;
-
- int64_t Skip(int64_t num_rows_to_skip) override;
-
- Type::type type() const override { return this->descr_->physical_type(); }
-
- const ColumnDescriptor* descr() const override { return this->descr_; }
-
- ExposedEncoding GetExposedEncoding() override { return this->exposed_encoding_; };
-
- int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, int32_t* indices,
- int64_t* indices_read, const T** dict,
- int32_t* dict_len) override;
-
- protected:
- void SetExposedEncoding(ExposedEncoding encoding) override {
- this->exposed_encoding_ = encoding;
- }
-
- private:
- // Read dictionary indices. Similar to ReadValues, but decodes the data to dictionary indices.
- // This function is called only by ReadBatchWithDictionary().
- int64_t ReadDictionaryIndices(int64_t indices_to_read, int32_t* indices) {
- auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
- return decoder->DecodeIndices(static_cast<int>(indices_to_read), indices);
- }
-
- // Get dictionary. The dictionary should have been set by SetDict(). The dictionary is
- // owned by the internal decoder and is destroyed when the reader is destroyed. This
- // function is called only by ReadBatchWithDictionary() after the dictionary is configured.
- void GetDictionary(const T** dictionary, int32_t* dictionary_length) {
- auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
- decoder->GetDictionary(dictionary, dictionary_length);
- }
-
- // Read definition and repetition levels. Also return the number of definition levels
- // and number of values to read. This function is called before reading values.
- void ReadLevels(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- int64_t* num_def_levels, int64_t* values_to_read) {
- batch_size =
- std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
-
- // If the field is required and non-repeated, there are no definition levels
- if (this->max_def_level_ > 0 && def_levels != nullptr) {
- *num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
- // TODO(wesm): this tallying of values-to-decode can be performed with better
- // cache-efficiency if fused with the level decoding.
- for (int64_t i = 0; i < *num_def_levels; ++i) {
- if (def_levels[i] == this->max_def_level_) {
- ++(*values_to_read);
- }
- }
- } else {
- // Required field, read all values
- *values_to_read = batch_size;
- }
-
- // Not present for non-repeated fields
- if (this->max_rep_level_ > 0 && rep_levels != nullptr) {
- int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
- if (def_levels != nullptr && *num_def_levels != num_rep_levels) {
- throw ParquetException("Number of decoded rep / def levels did not match");
- }
- }
- }
-};
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::ReadBatchWithDictionary(
- int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, int32_t* indices,
- int64_t* indices_read, const T** dict, int32_t* dict_len) {
- bool has_dict_output = dict != nullptr && dict_len != nullptr;
- // Similar logic to ReadValues for obtaining pages.
- if (!HasNext()) {
- *indices_read = 0;
- if (has_dict_output) {
- *dict = nullptr;
- *dict_len = 0;
- }
- return 0;
- }
-
- // Verify the current data page is dictionary encoded.
- if (this->current_encoding_ != Encoding::RLE_DICTIONARY) {
- std::stringstream ss;
- ss << "Data page is not dictionary encoded. Encoding: "
- << EncodingToString(this->current_encoding_);
- throw ParquetException(ss.str());
- }
-
- // Get dictionary pointer and length.
- if (has_dict_output) {
- GetDictionary(dict, dict_len);
- }
-
- // Similar logic to ReadValues for reading def levels and rep levels.
- int64_t num_def_levels = 0;
- int64_t indices_to_read = 0;
- ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &indices_to_read);
-
- // Read dictionary indices.
- *indices_read = ReadDictionaryIndices(indices_to_read, indices);
- int64_t total_indices = std::max(num_def_levels, *indices_read);
- this->ConsumeBufferedValues(total_indices);
-
- return total_indices;
-}
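-
-// A caller-side sketch for ReadBatchWithDictionary (buffer sizes and the
-// ByteArray column are assumptions); the returned dictionary pointer stays
-// owned by the reader's internal decoder:
-//
-//   std::vector<int16_t> defs(1024), reps(1024);
-//   std::vector<int32_t> indices(1024);
-//   int64_t indices_read = 0;
-//   const ByteArray* dict = nullptr;
-//   int32_t dict_len = 0;
-//   reader->ReadBatchWithDictionary(1024, defs.data(), reps.data(),
-//                                   indices.data(), &indices_read, &dict,
-//                                   &dict_len);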
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::ReadBatch(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, T* values,
- int64_t* values_read) {
- // HasNext invokes ReadNewPage
- if (!HasNext()) {
- *values_read = 0;
- return 0;
- }
-
- // TODO(wesm): keep reading data pages until batch_size is reached, or the
- // row group is finished
- int64_t num_def_levels = 0;
- int64_t values_to_read = 0;
- ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &values_to_read);
-
- *values_read = this->ReadValues(values_to_read, values);
- int64_t total_values = std::max(num_def_levels, *values_read);
- this->ConsumeBufferedValues(total_values);
-
- return total_values;
-}
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::ReadBatchSpaced(
- int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, T* values,
- uint8_t* valid_bits, int64_t valid_bits_offset, int64_t* levels_read,
- int64_t* values_read, int64_t* null_count_out) {
- // HasNext invokes ReadNewPage
- if (!HasNext()) {
- *levels_read = 0;
- *values_read = 0;
- *null_count_out = 0;
- return 0;
- }
-
- int64_t total_values;
- // TODO(wesm): keep reading data pages until batch_size is reached, or the
- // row group is finished
- batch_size =
- std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
-
- // If the field is required and non-repeated, there are no definition levels
- if (this->max_def_level_ > 0) {
- int64_t num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
-
- // Not present for non-repeated fields
- if (this->max_rep_level_ > 0) {
- int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
- if (num_def_levels != num_rep_levels) {
- throw ParquetException("Number of decoded rep / def levels did not match");
- }
- }
-
- const bool has_spaced_values = HasSpacedValues(this->descr_);
- int64_t null_count = 0;
- if (!has_spaced_values) {
- int values_to_read = 0;
- for (int64_t i = 0; i < num_def_levels; ++i) {
- if (def_levels[i] == this->max_def_level_) {
- ++values_to_read;
- }
- }
- total_values = this->ReadValues(values_to_read, values);
- ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
- /*length=*/total_values,
- /*bits_are_set=*/true);
- *values_read = total_values;
- } else {
- internal::LevelInfo info;
- info.repeated_ancestor_def_level = this->max_def_level_ - 1;
- info.def_level = this->max_def_level_;
- info.rep_level = this->max_rep_level_;
- internal::ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = num_def_levels;
- validity_io.valid_bits = valid_bits;
- validity_io.valid_bits_offset = valid_bits_offset;
- validity_io.null_count = null_count;
- validity_io.values_read = *values_read;
-
- internal::DefLevelsToBitmap(def_levels, num_def_levels, info, &validity_io);
- null_count = validity_io.null_count;
- *values_read = validity_io.values_read;
-
- total_values =
- this->ReadValuesSpaced(*values_read, values, static_cast<int>(null_count),
- valid_bits, valid_bits_offset);
- }
- *levels_read = num_def_levels;
- *null_count_out = null_count;
-
- } else {
- // Required field, read all values
- total_values = this->ReadValues(batch_size, values);
- ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
- /*length=*/total_values,
- /*bits_are_set=*/true);
- *null_count_out = 0;
- *values_read = total_values;
- *levels_read = total_values;
- }
-
- this->ConsumeBufferedValues(*levels_read);
- return total_values;
-}
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::Skip(int64_t num_rows_to_skip) {
- int64_t rows_to_skip = num_rows_to_skip;
- while (HasNext() && rows_to_skip > 0) {
- // If the number of rows to skip is more than the number of undecoded values, skip the
- // Page.
- if (rows_to_skip > (this->num_buffered_values_ - this->num_decoded_values_)) {
- rows_to_skip -= this->num_buffered_values_ - this->num_decoded_values_;
- this->num_decoded_values_ = this->num_buffered_values_;
- } else {
- // We need to read this Page
- // Jump to the right offset in the Page
- int64_t batch_size = 1024; // ReadBatch with a smaller memory footprint
- int64_t values_read = 0;
-
- // This will be enough scratch space to accommodate 16-bit levels or any
- // value type
- std::shared_ptr<ResizableBuffer> scratch = AllocateBuffer(
- this->pool_, batch_size * type_traits<DType::type_num>::value_byte_size);
-
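- // Note that the same scratch buffer is passed below for the def levels,
- // rep levels and values: the skipped data is discarded, so the three
- // outputs may harmlessly overwrite one another.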
- do {
- batch_size = std::min(batch_size, rows_to_skip);
- values_read =
- ReadBatch(static_cast<int>(batch_size),
- reinterpret_cast<int16_t*>(scratch->mutable_data()),
- reinterpret_cast<int16_t*>(scratch->mutable_data()),
- reinterpret_cast<T*>(scratch->mutable_data()), &values_read);
- rows_to_skip -= values_read;
- } while (values_read > 0 && rows_to_skip > 0);
- }
- }
- return num_rows_to_skip - rows_to_skip;
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// Dynamic column reader constructor
-
-std::shared_ptr<ColumnReader> ColumnReader::Make(const ColumnDescriptor* descr,
- std::unique_ptr<PageReader> pager,
- MemoryPool* pool) {
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedColumnReaderImpl<BooleanType>>(descr, std::move(pager),
- pool);
- case Type::INT32:
- return std::make_shared<TypedColumnReaderImpl<Int32Type>>(descr, std::move(pager),
- pool);
- case Type::INT64:
- return std::make_shared<TypedColumnReaderImpl<Int64Type>>(descr, std::move(pager),
- pool);
- case Type::INT96:
- return std::make_shared<TypedColumnReaderImpl<Int96Type>>(descr, std::move(pager),
- pool);
- case Type::FLOAT:
- return std::make_shared<TypedColumnReaderImpl<FloatType>>(descr, std::move(pager),
- pool);
- case Type::DOUBLE:
- return std::make_shared<TypedColumnReaderImpl<DoubleType>>(descr, std::move(pager),
- pool);
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedColumnReaderImpl<ByteArrayType>>(
- descr, std::move(pager), pool);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedColumnReaderImpl<FLBAType>>(descr, std::move(pager),
- pool);
- default:
- ParquetException::NYI("type reader not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return std::shared_ptr<ColumnReader>(nullptr);
-}
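-
-// A minimal read loop built on the factory above (the Int64 column and batch
-// size are assumptions; error handling elided):
-//
-//   auto reader = ColumnReader::Make(descr, std::move(pager));
-//   auto typed = std::static_pointer_cast<Int64Reader>(reader);
-//   std::vector<int16_t> defs(1024), reps(1024);
-//   std::vector<int64_t> values(1024);
-//   while (typed->HasNext()) {
-//     int64_t values_read = 0;
-//     typed->ReadBatch(1024, defs.data(), reps.data(), values.data(),
-//                      &values_read);
-//   }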
-
-// ----------------------------------------------------------------------
-// RecordReader
-
-namespace internal {
-namespace {
-
-// The minimum number of repetition/definition levels to decode at a time, for
-// better vectorized performance when doing many smaller record reads
-constexpr int64_t kMinLevelBatchSize = 1024;
-
-template <typename DType>
-class TypedRecordReader : public ColumnReaderImplBase<DType>,
- virtual public RecordReader {
- public:
- using T = typename DType::c_type;
- using BASE = ColumnReaderImplBase<DType>;
- TypedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool)
- : BASE(descr, pool) {
- leaf_info_ = leaf_info;
- nullable_values_ = leaf_info.HasNullableValues();
- at_record_start_ = true;
- records_read_ = 0;
- values_written_ = 0;
- values_capacity_ = 0;
- null_count_ = 0;
- levels_written_ = 0;
- levels_position_ = 0;
- levels_capacity_ = 0;
- uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY);
-
- if (uses_values_) {
- values_ = AllocateBuffer(pool);
- }
- valid_bits_ = AllocateBuffer(pool);
- def_levels_ = AllocateBuffer(pool);
- rep_levels_ = AllocateBuffer(pool);
- Reset();
- }
-
- int64_t available_values_current_page() const {
- return this->num_buffered_values_ - this->num_decoded_values_;
- }
-
- // Compute the values capacity in bytes for the given number of elements
- int64_t bytes_for_values(int64_t nitems) const {
- int64_t type_size = GetTypeByteSize(this->descr_->physical_type());
- int64_t bytes_for_values = -1;
- if (MultiplyWithOverflow(nitems, type_size, &bytes_for_values)) {
- throw ParquetException("Total size of items too large");
- }
- return bytes_for_values;
- }
-
- int64_t ReadRecords(int64_t num_records) override {
- // Delimit records, then read values at the end
- int64_t records_read = 0;
-
- if (levels_position_ < levels_written_) {
- records_read += ReadRecordData(num_records);
- }
-
- int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records);
-
- // Keep reading until we reach the desired number of records; if we are in
- // the middle of a record when that happens, continue to the end of the
- // current record
- while (!at_record_start_ || records_read < num_records) {
- // Is there more data to read in this row group?
- if (!this->HasNextInternal()) {
- if (!at_record_start_) {
- // We ended the row group while inside a record that we haven't seen
- // the end of yet. So increment the record count for the last record in
- // the row group
- ++records_read;
- at_record_start_ = true;
- }
- break;
- }
-
- /// We perform multiple batch reads until we either exhaust the row group
- /// or observe the desired number of records
- int64_t batch_size = std::min(level_batch_size, available_values_current_page());
-
- // No more data in column
- if (batch_size == 0) {
- break;
- }
-
- if (this->max_def_level_ > 0) {
- ReserveLevels(batch_size);
-
- int16_t* def_levels = this->def_levels() + levels_written_;
- int16_t* rep_levels = this->rep_levels() + levels_written_;
-
- // Not present for non-repeated fields
- int64_t levels_read = 0;
- if (this->max_rep_level_ > 0) {
- levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
- if (this->ReadRepetitionLevels(batch_size, rep_levels) != levels_read) {
- throw ParquetException("Number of decoded rep / def levels did not match");
- }
- } else if (this->max_def_level_ > 0) {
- levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
- }
-
- // Exhausted column chunk
- if (levels_read == 0) {
- break;
- }
-
- levels_written_ += levels_read;
- records_read += ReadRecordData(num_records - records_read);
- } else {
- // No repetition or definition levels
- batch_size = std::min(num_records - records_read, batch_size);
- records_read += ReadRecordData(batch_size);
- }
- }
-
- return records_read;
- }
-
- // We may appear to have exhausted a column chunk when in fact we are
- // still in the middle of processing the last batch
- bool has_values_to_process() const { return levels_position_ < levels_written_; }
-
- std::shared_ptr<ResizableBuffer> ReleaseValues() override {
- if (uses_values_) {
- auto result = values_;
- PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
- values_ = AllocateBuffer(this->pool_);
- values_capacity_ = 0;
- return result;
- } else {
- return nullptr;
- }
- }
-
- std::shared_ptr<ResizableBuffer> ReleaseIsValid() override {
- if (leaf_info_.HasNullableValues()) {
- auto result = valid_bits_;
- PARQUET_THROW_NOT_OK(result->Resize(BitUtil::BytesForBits(values_written_), true));
- valid_bits_ = AllocateBuffer(this->pool_);
- return result;
- } else {
- return nullptr;
- }
- }
-
- // Process written repetition/definition levels to reach the end of
- // records. Process no more levels than necessary to delimit the indicated
- // number of logical records. Updates internal state of RecordReader
- //
- // \return Number of records delimited
- int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
- int64_t values_to_read = 0;
- int64_t records_read = 0;
-
- const int16_t* def_levels = this->def_levels() + levels_position_;
- const int16_t* rep_levels = this->rep_levels() + levels_position_;
-
- DCHECK_GT(this->max_rep_level_, 0);
-
- // Count logical records and number of values to read
- while (levels_position_ < levels_written_) {
- const int16_t rep_level = *rep_levels++;
- if (rep_level == 0) {
- // If at_record_start_ is true, we are seeing the start of a record
- // for the second time, such as after repeated calls to
- // DelimitRecords. In this case we must continue until we find
- // another record start or exhaust the ColumnChunk
- if (!at_record_start_) {
- // We've reached the end of a record; increment the record count.
- ++records_read;
- if (records_read == num_records) {
- // We've found the number of records we were looking for. Set
- // at_record_start_ to true and break
- at_record_start_ = true;
- break;
- }
- }
- }
- // We have decided to consume the level at this position; therefore we
- // must advance until we find another record boundary
- at_record_start_ = false;
-
- const int16_t def_level = *def_levels++;
- if (def_level == this->max_def_level_) {
- ++values_to_read;
- }
- ++levels_position_;
- }
- *values_seen = values_to_read;
- return records_read;
- }
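-
- // Worked example for the loop above (levels are illustrative): with
- // rep_levels = [0 1 1 0 0 1], def_levels = [2 2 1 0 2 2] and
- // max_def_level_ = 2, a call with num_records = 2 consumes the first four
- // level positions (a record ends where the next rep_level == 0), yields
- // values_to_read = 2 (def_level == 2 at positions 0 and 1), and leaves
- // levels_position_ at the first level of the third record.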
-
- void Reserve(int64_t capacity) override {
- ReserveLevels(capacity);
- ReserveValues(capacity);
- }
-
- int64_t UpdateCapacity(int64_t capacity, int64_t size, int64_t extra_size) {
- if (extra_size < 0) {
- throw ParquetException("Negative size (corrupt file?)");
- }
- int64_t target_size = -1;
- if (AddWithOverflow(size, extra_size, &target_size)) {
- throw ParquetException("Allocation size too large (corrupt file?)");
- }
- if (target_size >= (1LL << 62)) {
- throw ParquetException("Allocation size too large (corrupt file?)");
- }
- if (capacity >= target_size) {
- return capacity;
- }
- return BitUtil::NextPower2(target_size);
- }
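-
- // For example (numbers are illustrative): size = 1000 and extra_size = 600
- // give target_size = 1600, which is rounded up to the next power of two,
- // 2048, so buffer capacity grows geometrically rather than per batch.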
-
- void ReserveLevels(int64_t extra_levels) {
- if (this->max_def_level_ > 0) {
- const int64_t new_levels_capacity =
- UpdateCapacity(levels_capacity_, levels_written_, extra_levels);
- if (new_levels_capacity > levels_capacity_) {
- constexpr auto kItemSize = static_cast<int64_t>(sizeof(int16_t));
- int64_t capacity_in_bytes = -1;
- if (MultiplyWithOverflow(new_levels_capacity, kItemSize, &capacity_in_bytes)) {
- throw ParquetException("Allocation size too large (corrupt file?)");
- }
- PARQUET_THROW_NOT_OK(def_levels_->Resize(capacity_in_bytes, false));
- if (this->max_rep_level_ > 0) {
- PARQUET_THROW_NOT_OK(rep_levels_->Resize(capacity_in_bytes, false));
- }
- levels_capacity_ = new_levels_capacity;
- }
- }
- }
-
- void ReserveValues(int64_t extra_values) {
- const int64_t new_values_capacity =
- UpdateCapacity(values_capacity_, values_written_, extra_values);
- if (new_values_capacity > values_capacity_) {
- // XXX(wesm): A hack to avoid memory allocation when reading directly
- // into builder classes
- if (uses_values_) {
- PARQUET_THROW_NOT_OK(
- values_->Resize(bytes_for_values(new_values_capacity), false));
- }
- values_capacity_ = new_values_capacity;
- }
- if (leaf_info_.HasNullableValues()) {
- int64_t valid_bytes_new = BitUtil::BytesForBits(values_capacity_);
- if (valid_bits_->size() < valid_bytes_new) {
- int64_t valid_bytes_old = BitUtil::BytesForBits(values_written_);
- PARQUET_THROW_NOT_OK(valid_bits_->Resize(valid_bytes_new, false));
-
- // Avoid valgrind warnings
- memset(valid_bits_->mutable_data() + valid_bytes_old, 0,
- valid_bytes_new - valid_bytes_old);
- }
- }
- }
-
- void Reset() override {
- ResetValues();
-
- if (levels_written_ > 0) {
- const int64_t levels_remaining = levels_written_ - levels_position_;
- // Shift remaining levels to beginning of buffer and trim to only the number
- // of decoded levels remaining
- int16_t* def_data = def_levels();
- int16_t* rep_data = rep_levels();
-
- std::copy(def_data + levels_position_, def_data + levels_written_, def_data);
- PARQUET_THROW_NOT_OK(
- def_levels_->Resize(levels_remaining * sizeof(int16_t), false));
-
- if (this->max_rep_level_ > 0) {
- std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
- PARQUET_THROW_NOT_OK(
- rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
- }
-
- levels_written_ -= levels_position_;
- levels_position_ = 0;
- levels_capacity_ = levels_remaining;
- }
-
- records_read_ = 0;
-
- // Call Finish on the binary builders to reset them
- }
-
- void SetPageReader(std::unique_ptr<PageReader> reader) override {
- at_record_start_ = true;
- this->pager_ = std::move(reader);
- ResetDecoders();
- }
-
- bool HasMoreData() const override { return this->pager_ != nullptr; }
-
- // Dictionary decoders must be reset when advancing row groups
- void ResetDecoders() { this->decoders_.clear(); }
-
- virtual void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) {
- uint8_t* valid_bits = valid_bits_->mutable_data();
- const int64_t valid_bits_offset = values_written_;
-
- int64_t num_decoded = this->current_decoder_->DecodeSpaced(
- ValuesHead<T>(), static_cast<int>(values_with_nulls),
- static_cast<int>(null_count), valid_bits, valid_bits_offset);
- DCHECK_EQ(num_decoded, values_with_nulls);
- }
-
- virtual void ReadValuesDense(int64_t values_to_read) {
- int64_t num_decoded =
- this->current_decoder_->Decode(ValuesHead<T>(), static_cast<int>(values_to_read));
- DCHECK_EQ(num_decoded, values_to_read);
- }
-
- // Return number of logical records read
- int64_t ReadRecordData(int64_t num_records) {
- // Conservative upper bound
- const int64_t possible_num_values =
- std::max(num_records, levels_written_ - levels_position_);
- ReserveValues(possible_num_values);
-
- const int64_t start_levels_position = levels_position_;
-
- int64_t values_to_read = 0;
- int64_t records_read = 0;
- if (this->max_rep_level_ > 0) {
- records_read = DelimitRecords(num_records, &values_to_read);
- } else if (this->max_def_level_ > 0) {
- // No repetition levels, skip delimiting logic. Each level represents a
- // null or not null entry
- records_read = std::min(levels_written_ - levels_position_, num_records);
-
- // This is advanced by DelimitRecords, which we skipped
- levels_position_ += records_read;
- } else {
- records_read = values_to_read = num_records;
- }
-
- int64_t null_count = 0;
- if (leaf_info_.HasNullableValues()) {
- ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = levels_position_ - start_levels_position;
- validity_io.valid_bits = valid_bits_->mutable_data();
- validity_io.valid_bits_offset = values_written_;
-
- DefLevelsToBitmap(def_levels() + start_levels_position,
- levels_position_ - start_levels_position, leaf_info_,
- &validity_io);
- values_to_read = validity_io.values_read - validity_io.null_count;
- null_count = validity_io.null_count;
- DCHECK_GE(values_to_read, 0);
- ReadValuesSpaced(validity_io.values_read, null_count);
- } else {
- DCHECK_GE(values_to_read, 0);
- ReadValuesDense(values_to_read);
- }
- if (this->leaf_info_.def_level > 0) {
- // Optional, repeated, or some mix thereof
- this->ConsumeBufferedValues(levels_position_ - start_levels_position);
- } else {
- // Flat, non-repeated
- this->ConsumeBufferedValues(values_to_read);
- }
- // Total values, including null spaces, if any
- values_written_ += values_to_read + null_count;
- null_count_ += null_count;
-
- return records_read;
- }
-
- void DebugPrintState() override {
- const int16_t* def_levels = this->def_levels();
- const int16_t* rep_levels = this->rep_levels();
- const int64_t total_levels_read = levels_position_;
-
- const T* vals = reinterpret_cast<const T*>(this->values());
-
- std::cout << "def levels: ";
- for (int64_t i = 0; i < total_levels_read; ++i) {
- std::cout << def_levels[i] << " ";
- }
- std::cout << std::endl;
-
- std::cout << "rep levels: ";
- for (int64_t i = 0; i < total_levels_read; ++i) {
- std::cout << rep_levels[i] << " ";
- }
- std::cout << std::endl;
-
- std::cout << "values: ";
- for (int64_t i = 0; i < this->values_written(); ++i) {
- std::cout << vals[i] << " ";
- }
- std::cout << std::endl;
- }
-
- void ResetValues() {
- if (values_written_ > 0) {
- // Resize to 0, but do not shrink to fit
- if (uses_values_) {
- PARQUET_THROW_NOT_OK(values_->Resize(0, false));
- }
- PARQUET_THROW_NOT_OK(valid_bits_->Resize(0, false));
- values_written_ = 0;
- values_capacity_ = 0;
- null_count_ = 0;
- }
- }
-
- protected:
- template <typename T>
- T* ValuesHead() {
- return reinterpret_cast<T*>(values_->mutable_data()) + values_written_;
- }
- LevelInfo leaf_info_;
-};
-
-class FLBARecordReader : public TypedRecordReader<FLBAType>,
- virtual public BinaryRecordReader {
- public:
- FLBARecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool)
- : TypedRecordReader<FLBAType>(descr, leaf_info, pool), builder_(nullptr) {
- DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY);
- int byte_width = descr_->type_length();
- std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width);
- builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, this->pool_));
- }
-
- ::arrow::ArrayVector GetBuilderChunks() override {
- std::shared_ptr<::arrow::Array> chunk;
- PARQUET_THROW_NOT_OK(builder_->Finish(&chunk));
- return ::arrow::ArrayVector({chunk});
- }
-
- void ReadValuesDense(int64_t values_to_read) override {
- auto values = ValuesHead<FLBA>();
- int64_t num_decoded =
- this->current_decoder_->Decode(values, static_cast<int>(values_to_read));
- DCHECK_EQ(num_decoded, values_to_read);
-
- for (int64_t i = 0; i < num_decoded; i++) {
- PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
- }
- ResetValues();
- }
-
- void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
- uint8_t* valid_bits = valid_bits_->mutable_data();
- const int64_t valid_bits_offset = values_written_;
- auto values = ValuesHead<FLBA>();
-
- int64_t num_decoded = this->current_decoder_->DecodeSpaced(
- values, static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits, valid_bits_offset);
- DCHECK_EQ(num_decoded, values_to_read);
-
- for (int64_t i = 0; i < num_decoded; i++) {
- if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
- PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
- } else {
- PARQUET_THROW_NOT_OK(builder_->AppendNull());
- }
- }
- ResetValues();
- }
-
- private:
- std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
-};
-
-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
- virtual public BinaryRecordReader {
- public:
- ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool)
- : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool) {
- DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
- accumulator_.builder.reset(new ::arrow::BinaryBuilder(pool));
- }
-
- ::arrow::ArrayVector GetBuilderChunks() override {
- ::arrow::ArrayVector result = accumulator_.chunks;
- if (result.size() == 0 || accumulator_.builder->length() > 0) {
- std::shared_ptr<::arrow::Array> last_chunk;
- PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
- result.push_back(std::move(last_chunk));
- }
- accumulator_.chunks = {};
- return result;
- }
-
- void ReadValuesDense(int64_t values_to_read) override {
- int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
- static_cast<int>(values_to_read), &accumulator_);
- DCHECK_EQ(num_decoded, values_to_read);
- ResetValues();
- }
-
- void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
- int64_t num_decoded = this->current_decoder_->DecodeArrow(
- static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits_->mutable_data(), values_written_, &accumulator_);
- DCHECK_EQ(num_decoded, values_to_read - null_count);
- ResetValues();
- }
-
- private:
- // Helper data structure for accumulating builder chunks
- typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
-};
-
-class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
- virtual public DictionaryRecordReader {
- public:
- ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool)
- : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool), builder_(pool) {
- this->read_dictionary_ = true;
- }
-
- std::shared_ptr<::arrow::ChunkedArray> GetResult() override {
- FlushBuilder();
- std::vector<std::shared_ptr<::arrow::Array>> result;
- std::swap(result, result_chunks_);
- return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type());
- }
-
- void FlushBuilder() {
- if (builder_.length() > 0) {
- std::shared_ptr<::arrow::Array> chunk;
- PARQUET_THROW_NOT_OK(builder_.Finish(&chunk));
- result_chunks_.emplace_back(std::move(chunk));
-
- // Also clears the dictionary memo table
- builder_.Reset();
- }
- }
-
- void MaybeWriteNewDictionary() {
- if (this->new_dictionary_) {
- /// If there is a new dictionary, we may need to flush the builder, then
- /// insert the new dictionary values
- FlushBuilder();
- builder_.ResetFull();
- auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
- decoder->InsertDictionary(&builder_);
- this->new_dictionary_ = false;
- }
- }
-
- void ReadValuesDense(int64_t values_to_read) override {
- int64_t num_decoded = 0;
- if (current_encoding_ == Encoding::RLE_DICTIONARY) {
- MaybeWriteNewDictionary();
- auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
- num_decoded = decoder->DecodeIndices(static_cast<int>(values_to_read), &builder_);
- } else {
- num_decoded = this->current_decoder_->DecodeArrowNonNull(
- static_cast<int>(values_to_read), &builder_);
-
- /// Flush values since they have been copied into the builder
- ResetValues();
- }
- DCHECK_EQ(num_decoded, values_to_read);
- }
-
- void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
- int64_t num_decoded = 0;
- if (current_encoding_ == Encoding::RLE_DICTIONARY) {
- MaybeWriteNewDictionary();
- auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
- num_decoded = decoder->DecodeIndicesSpaced(
- static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits_->mutable_data(), values_written_, &builder_);
- } else {
- num_decoded = this->current_decoder_->DecodeArrow(
- static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits_->mutable_data(), values_written_, &builder_);
-
- /// Flush values since they have been copied into the builder
- ResetValues();
- }
- DCHECK_EQ(num_decoded, values_to_read - null_count);
- }
-
- private:
- using BinaryDictDecoder = DictDecoder<ByteArrayType>;
-
- ::arrow::BinaryDictionary32Builder builder_;
- std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
-};
-
-// TODO(wesm): Implement these to some satisfaction
-template <>
-void TypedRecordReader<Int96Type>::DebugPrintState() {}
-
-template <>
-void TypedRecordReader<ByteArrayType>::DebugPrintState() {}
-
-template <>
-void TypedRecordReader<FLBAType>::DebugPrintState() {}
-
-std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor* descr,
- LevelInfo leaf_info,
- ::arrow::MemoryPool* pool,
- bool read_dictionary) {
- if (read_dictionary) {
- return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info, pool);
- } else {
- return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info, pool);
- }
-}
-
-} // namespace
-
-std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
- LevelInfo leaf_info, MemoryPool* pool,
- const bool read_dictionary) {
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedRecordReader<BooleanType>>(descr, leaf_info, pool);
- case Type::INT32:
- return std::make_shared<TypedRecordReader<Int32Type>>(descr, leaf_info, pool);
- case Type::INT64:
- return std::make_shared<TypedRecordReader<Int64Type>>(descr, leaf_info, pool);
- case Type::INT96:
- return std::make_shared<TypedRecordReader<Int96Type>>(descr, leaf_info, pool);
- case Type::FLOAT:
- return std::make_shared<TypedRecordReader<FloatType>>(descr, leaf_info, pool);
- case Type::DOUBLE:
- return std::make_shared<TypedRecordReader<DoubleType>>(descr, leaf_info, pool);
- case Type::BYTE_ARRAY:
- return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<FLBARecordReader>(descr, leaf_info, pool);
- default: {
- // PARQUET-1481: This can occur if the file is corrupt
- std::stringstream ss;
- ss << "Invalid physical column type: " << static_cast<int>(descr->physical_type());
- throw ParquetException(ss.str());
- }
- }
- // Unreachable code, but suppress compiler warning
- return nullptr;
-}
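-
-// A minimal driver sketch for RecordReader (descr, leaf_info and the page
-// reader are assumptions; error handling elided):
-//
-//   auto rr = RecordReader::Make(descr, leaf_info, pool);
-//   rr->SetPageReader(std::move(pager));
-//   while (rr->ReadRecords(/*num_records=*/1024) > 0) {
-//     auto values = rr->ReleaseValues();    // decoded values, if applicable
-//     auto is_valid = rr->ReleaseIsValid(); // validity bitmap, if nullable
-//     rr->Reset();
-//   }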
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/chunked_array.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "parquet/column_page.h"
+#include "parquet/encoding.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/level_comparison.h"
+#include "parquet/level_conversion.h"
+#include "parquet/properties.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h" // IWYU pragma: keep
+// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
+#include "parquet/windows_compatibility.h"
+
+using arrow::MemoryPool;
+using arrow::internal::AddWithOverflow;
+using arrow::internal::checked_cast;
+using arrow::internal::MultiplyWithOverflow;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace {
+inline bool HasSpacedValues(const ColumnDescriptor* descr) {
+ if (descr->max_repetition_level() > 0) {
+ // repeated+flat case
+ return !descr->schema_node()->is_required();
+ } else {
+ // non-repeated+nested case
+ // Find if a node forces nulls in the lowest level along the hierarchy
+ const schema::Node* node = descr->schema_node().get();
+ while (node) {
+ if (node->is_optional()) {
+ return true;
+ }
+ node = node->parent();
+ }
+ return false;
+ }
+}
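+
+// For example (schemas are illustrative): a required leaf under an optional
+// group, e.g. `optional group g { required int32 leaf; }`, takes the
+// non-repeated branch and returns true because an optional ancestor can
+// introduce nulls, while a top-level `required int32 leaf` returns false.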
+} // namespace
+
+LevelDecoder::LevelDecoder() : num_values_remaining_(0) {}
+
+LevelDecoder::~LevelDecoder() {}
+
+int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values, const uint8_t* data,
+ int32_t data_size) {
+ max_level_ = max_level;
+ int32_t num_bytes = 0;
+ encoding_ = encoding;
+ num_values_remaining_ = num_buffered_values;
+ bit_width_ = BitUtil::Log2(max_level + 1);
+ switch (encoding) {
+ case Encoding::RLE: {
+ if (data_size < 4) {
+ throw ParquetException("Received invalid levels (corrupt data page?)");
+ }
+ num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
+ if (num_bytes < 0 || num_bytes > data_size - 4) {
+ throw ParquetException("Received invalid number of bytes (corrupt data page?)");
+ }
+ const uint8_t* decoder_data = data + 4;
+ if (!rle_decoder_) {
+ rle_decoder_.reset(
+ new ::arrow::util::RleDecoder(decoder_data, num_bytes, bit_width_));
+ } else {
+ rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
+ }
+ return 4 + num_bytes;
+ }
+ case Encoding::BIT_PACKED: {
+ int num_bits = 0;
+ if (MultiplyWithOverflow(num_buffered_values, bit_width_, &num_bits)) {
+ throw ParquetException(
+ "Number of buffered values too large (corrupt data page?)");
+ }
+ num_bytes = static_cast<int32_t>(BitUtil::BytesForBits(num_bits));
+ if (num_bytes < 0 || num_bytes > data_size - 4) {
+ throw ParquetException("Received invalid number of bytes (corrupt data page?)");
+ }
+ if (!bit_packed_decoder_) {
+ bit_packed_decoder_.reset(new ::arrow::BitUtil::BitReader(data, num_bytes));
+ } else {
+ bit_packed_decoder_->Reset(data, num_bytes);
+ }
+ return num_bytes;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+ return -1;
+}
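+
+// Byte layout consumed above for Encoding::RLE (a sketch; widths are
+// illustrative):
+//
+//   [ int32 length prefix ][ RLE/bit-packed hybrid runs ... ]
+//         4 bytes                    num_bytes
+//
+// hence the returned consumed size is 4 + num_bytes. BIT_PACKED level data
+// has no length prefix; its byte count is computed from the buffered value
+// count and the level bit width.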
+
+void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level,
+ int num_buffered_values, const uint8_t* data) {
+ max_level_ = max_level;
+ // Repetition and definition levels always use RLE encoding
+ // in the DataPageV2 format.
+ if (num_bytes < 0) {
+ throw ParquetException("Invalid page header (corrupt data page?)");
+ }
+ encoding_ = Encoding::RLE;
+ num_values_remaining_ = num_buffered_values;
+ bit_width_ = BitUtil::Log2(max_level + 1);
+
+ if (!rle_decoder_) {
+ rle_decoder_.reset(new ::arrow::util::RleDecoder(data, num_bytes, bit_width_));
+ } else {
+ rle_decoder_->Reset(data, num_bytes, bit_width_);
+ }
+}
+
+int LevelDecoder::Decode(int batch_size, int16_t* levels) {
+ int num_decoded = 0;
+
+ int num_values = std::min(num_values_remaining_, batch_size);
+ if (encoding_ == Encoding::RLE) {
+ num_decoded = rle_decoder_->GetBatch(levels, num_values);
+ } else {
+ num_decoded = bit_packed_decoder_->GetBatch(bit_width_, levels, num_values);
+ }
+ if (num_decoded > 0) {
+ internal::MinMax min_max = internal::FindMinMax(levels, num_decoded);
+ if (ARROW_PREDICT_FALSE(min_max.min < 0 || min_max.max > max_level_)) {
+ std::stringstream ss;
+ ss << "Malformed levels. min: " << min_max.min << " max: " << min_max.max
+ << " out of range. Max Level: " << max_level_;
+ throw ParquetException(ss.str());
+ }
+ }
+ num_values_remaining_ -= num_decoded;
+ return num_decoded;
+}
+
+ReaderProperties default_reader_properties() {
+ static ReaderProperties default_reader_properties;
+ return default_reader_properties;
+}
+
+namespace {
+
+// Extracts encoded statistics from V1 and V2 data page headers
+template <typename H>
+EncodedStatistics ExtractStatsFromHeader(const H& header) {
+ EncodedStatistics page_statistics;
+ if (!header.__isset.statistics) {
+ return page_statistics;
+ }
+ const format::Statistics& stats = header.statistics;
+ if (stats.__isset.max) {
+ page_statistics.set_max(stats.max);
+ }
+ if (stats.__isset.min) {
+ page_statistics.set_min(stats.min);
+ }
+ if (stats.__isset.null_count) {
+ page_statistics.set_null_count(stats.null_count);
+ }
+ if (stats.__isset.distinct_count) {
+ page_statistics.set_distinct_count(stats.distinct_count);
+ }
+ return page_statistics;
+}
+
+// ----------------------------------------------------------------------
+// SerializedPageReader deserializes Thrift metadata and pages that have been
+// assembled in a serialized stream for storage in a Parquet file.
+
+// This subclass delimits pages appearing in a serialized stream, each preceded
+// by a serialized Thrift format::PageHeader indicating the type of each page
+// and the page metadata.
+class SerializedPageReader : public PageReader {
+ public:
+ SerializedPageReader(std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
+ Compression::type codec, ::arrow::MemoryPool* pool,
+ const CryptoContext* crypto_ctx)
+ : stream_(std::move(stream)),
+ decompression_buffer_(AllocateBuffer(pool, 0)),
+ page_ordinal_(0),
+ seen_num_rows_(0),
+ total_num_rows_(total_num_rows),
+ decryption_buffer_(AllocateBuffer(pool, 0)) {
+ if (crypto_ctx != nullptr) {
+ crypto_ctx_ = *crypto_ctx;
+ InitDecryption();
+ }
+ max_page_header_size_ = kDefaultMaxPageHeaderSize;
+ decompressor_ = GetCodec(codec);
+ }
+
+ // Implement the PageReader interface
+ std::shared_ptr<Page> NextPage() override;
+
+ void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; }
+
+ private:
+ void UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor, int8_t module_type,
+ const std::string& page_aad);
+
+ void InitDecryption();
+
+ std::shared_ptr<Buffer> DecompressIfNeeded(std::shared_ptr<Buffer> page_buffer,
+ int compressed_len, int uncompressed_len,
+ int levels_byte_len = 0);
+
+ std::shared_ptr<ArrowInputStream> stream_;
+
+ format::PageHeader current_page_header_;
+ std::shared_ptr<Page> current_page_;
+
+ // Compression codec to use.
+ std::unique_ptr<::arrow::util::Codec> decompressor_;
+ std::shared_ptr<ResizableBuffer> decompression_buffer_;
+
+ // The fields below are used for calculation of AAD (additional authenticated data)
+ // suffix which is part of the Parquet Modular Encryption.
+ // The AAD suffix for a parquet module is built internally by
+ // concatenating several parts, including
+ // the row group ordinal, column ordinal and page ordinal.
+ // Please refer to the encryption specification for more details:
+ // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data
+
+ // The ordinal fields in the context below are used for AAD suffix calculation.
+ CryptoContext crypto_ctx_;
+ int16_t page_ordinal_; // page ordinal does not count the dictionary page
+
+ // Maximum allowed page header size
+ uint32_t max_page_header_size_;
+
+ // Number of rows read in data pages so far
+ int64_t seen_num_rows_;
+
+ // Number of rows in all the data pages
+ int64_t total_num_rows_;
+
+ // data_page_aad_ and data_page_header_aad_ contain the AAD for the data pages
+ // and data page headers of a single column, respectively.
+ // When calculating the AAD for successive pages in a single column, only the
+ // page ordinal part of the AAD is updated.
+ std::string data_page_aad_;
+ std::string data_page_header_aad_;
+ // Encryption
+ std::shared_ptr<ResizableBuffer> decryption_buffer_;
+};
+
+void SerializedPageReader::InitDecryption() {
+ // Prepare the AAD for quick update later.
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ DCHECK(!crypto_ctx_.data_decryptor->file_aad().empty());
+ data_page_aad_ = encryption::CreateModuleAad(
+ crypto_ctx_.data_decryptor->file_aad(), encryption::kDataPage,
+ crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ }
+ if (crypto_ctx_.meta_decryptor != nullptr) {
+ DCHECK(!crypto_ctx_.meta_decryptor->file_aad().empty());
+ data_page_header_aad_ = encryption::CreateModuleAad(
+ crypto_ctx_.meta_decryptor->file_aad(), encryption::kDataPageHeader,
+ crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ }
+}
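+
+// Illustrative sketch, not part of the upstream sources: per the Parquet
+// encryption spec, the module AAD created above is conceptually the
+// concatenation
+//
+//   file_aad || module_type || row_group_ordinal || column_ordinal || page_ordinal
+//
+// with the ordinals encoded as 2-byte little-endian integers and
+// kNonPageOrdinal standing in when no page ordinal applies yet.
+// QuickUpdatePageAad() later patches only the trailing page-ordinal bytes:
+//
+//   std::string aad = encryption::CreateModuleAad(
+//       file_aad, encryption::kDataPage, /*row_group_ordinal=*/0,
+//       /*column_ordinal=*/2, kNonPageOrdinal);
+//   encryption::QuickUpdatePageAad(aad, /*page_ordinal=*/5);  // cheap in-place update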
+
+void SerializedPageReader::UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor,
+ int8_t module_type,
+ const std::string& page_aad) {
+ DCHECK(decryptor != nullptr);
+ if (crypto_ctx_.start_decrypt_with_dictionary_page) {
+ std::string aad = encryption::CreateModuleAad(
+ decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal,
+ crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ decryptor->UpdateAad(aad);
+ } else {
+ encryption::QuickUpdatePageAad(page_aad, page_ordinal_);
+ decryptor->UpdateAad(page_aad);
+ }
+}
+
+std::shared_ptr<Page> SerializedPageReader::NextPage() {
+  // Loop here because there may be unhandled page types that we skip until
+  // we find a page that we know how to handle
+
+ while (seen_num_rows_ < total_num_rows_) {
+ uint32_t header_size = 0;
+ uint32_t allowed_page_size = kDefaultPageHeaderSize;
+
+    // Page headers can be very large because of page statistics.
+    // We try progressively larger peek buffers until deserialization succeeds
+    // or the maximum allowed header size is exceeded (see the sketch after
+    // NextPage() below).
+ while (true) {
+ PARQUET_ASSIGN_OR_THROW(auto view, stream_->Peek(allowed_page_size));
+ if (view.size() == 0) {
+ return std::shared_ptr<Page>(nullptr);
+ }
+
+      // header_size is passed in as the number of available bytes and is
+      // updated to the actual header size by DeserializeThriftMsg
+ header_size = static_cast<uint32_t>(view.size());
+ try {
+ if (crypto_ctx_.meta_decryptor != nullptr) {
+ UpdateDecryption(crypto_ctx_.meta_decryptor, encryption::kDictionaryPageHeader,
+ data_page_header_aad_);
+ }
+ DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(view.data()), &header_size,
+ &current_page_header_, crypto_ctx_.meta_decryptor);
+ break;
+ } catch (std::exception& e) {
+ // Failed to deserialize. Double the allowed page header size and try again
+ std::stringstream ss;
+ ss << e.what();
+ allowed_page_size *= 2;
+ if (allowed_page_size > max_page_header_size_) {
+ ss << "Deserializing page header failed.\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ }
+ // Advance the stream offset
+ PARQUET_THROW_NOT_OK(stream_->Advance(header_size));
+
+ int compressed_len = current_page_header_.compressed_page_size;
+ int uncompressed_len = current_page_header_.uncompressed_page_size;
+ if (compressed_len < 0 || uncompressed_len < 0) {
+ throw ParquetException("Invalid page header");
+ }
+
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ UpdateDecryption(crypto_ctx_.data_decryptor, encryption::kDictionaryPage,
+ data_page_aad_);
+ }
+
+ // Read the compressed data page.
+ PARQUET_ASSIGN_OR_THROW(auto page_buffer, stream_->Read(compressed_len));
+ if (page_buffer->size() != compressed_len) {
+ std::stringstream ss;
+ ss << "Page was smaller (" << page_buffer->size() << ") than expected ("
+ << compressed_len << ")";
+ ParquetException::EofException(ss.str());
+ }
+
+ // Decrypt it if we need to
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ PARQUET_THROW_NOT_OK(decryption_buffer_->Resize(
+ compressed_len - crypto_ctx_.data_decryptor->CiphertextSizeDelta(), false));
+ compressed_len = crypto_ctx_.data_decryptor->Decrypt(
+ page_buffer->data(), compressed_len, decryption_buffer_->mutable_data());
+
+ page_buffer = decryption_buffer_;
+ }
+
+ const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
+
+ if (page_type == PageType::DICTIONARY_PAGE) {
+ crypto_ctx_.start_decrypt_with_dictionary_page = false;
+ const format::DictionaryPageHeader& dict_header =
+ current_page_header_.dictionary_page_header;
+
+ bool is_sorted = dict_header.__isset.is_sorted ? dict_header.is_sorted : false;
+ if (dict_header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+
+ // Uncompress if needed
+ page_buffer =
+ DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
+
+ return std::make_shared<DictionaryPage>(page_buffer, dict_header.num_values,
+ LoadEnumSafe(&dict_header.encoding),
+ is_sorted);
+ } else if (page_type == PageType::DATA_PAGE) {
+ ++page_ordinal_;
+ const format::DataPageHeader& header = current_page_header_.data_page_header;
+
+ if (header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+ EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
+ seen_num_rows_ += header.num_values;
+
+ // Uncompress if needed
+ page_buffer =
+ DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
+
+ return std::make_shared<DataPageV1>(page_buffer, header.num_values,
+ LoadEnumSafe(&header.encoding),
+ LoadEnumSafe(&header.definition_level_encoding),
+ LoadEnumSafe(&header.repetition_level_encoding),
+ uncompressed_len, page_statistics);
+ } else if (page_type == PageType::DATA_PAGE_V2) {
+ ++page_ordinal_;
+ const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
+
+ if (header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+ if (header.definition_levels_byte_length < 0 ||
+ header.repetition_levels_byte_length < 0) {
+ throw ParquetException("Invalid page header (negative levels byte length)");
+ }
+ bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false;
+ EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
+ seen_num_rows_ += header.num_values;
+
+ // Uncompress if needed
+ int levels_byte_len;
+ if (AddWithOverflow(header.definition_levels_byte_length,
+ header.repetition_levels_byte_length, &levels_byte_len)) {
+ throw ParquetException("Levels size too large (corrupt file?)");
+ }
+ // DecompressIfNeeded doesn't take `is_compressed` into account as
+ // it's page type-agnostic.
+ if (is_compressed) {
+ page_buffer = DecompressIfNeeded(std::move(page_buffer), compressed_len,
+ uncompressed_len, levels_byte_len);
+ }
+
+ return std::make_shared<DataPageV2>(
+ page_buffer, header.num_values, header.num_nulls, header.num_rows,
+ LoadEnumSafe(&header.encoding), header.definition_levels_byte_length,
+ header.repetition_levels_byte_length, uncompressed_len, is_compressed,
+ page_statistics);
+ } else {
+ // We don't know what this page type is. We're allowed to skip non-data
+ // pages.
+ continue;
+ }
+ }
+ return std::shared_ptr<Page>(nullptr);
+}
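+
+// Illustrative sketch, not part of the upstream sources: the retry loop in
+// NextPage() grows the peek window geometrically, so a header of S bytes is
+// found after O(log2(S / kDefaultPageHeaderSize)) Peek() calls. With the
+// default limits this is 16 KiB -> 32 KiB -> ... -> 16 MiB. TryDeserialize
+// below is a hypothetical stand-in for the DeserializeThriftMsg call:
+//
+//   uint32_t allowed = kDefaultPageHeaderSize;   // 16 KiB
+//   while (!TryDeserialize(allowed)) {           // Thrift parse failed
+//     allowed *= 2;                              // double and retry
+//     if (allowed > max_page_header_size_) {     // 16 MiB by default
+//       throw ParquetException("Deserializing page header failed.");
+//     }
+//   }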
+
+std::shared_ptr<Buffer> SerializedPageReader::DecompressIfNeeded(
+ std::shared_ptr<Buffer> page_buffer, int compressed_len, int uncompressed_len,
+ int levels_byte_len) {
+ if (decompressor_ == nullptr) {
+ return page_buffer;
+ }
+ if (compressed_len < levels_byte_len || uncompressed_len < levels_byte_len) {
+ throw ParquetException("Invalid page header");
+ }
+
+ // Grow the uncompressed buffer if we need to.
+ if (uncompressed_len > static_cast<int>(decompression_buffer_->size())) {
+ PARQUET_THROW_NOT_OK(decompression_buffer_->Resize(uncompressed_len, false));
+ }
+
+ if (levels_byte_len > 0) {
+ // First copy the levels as-is
+ uint8_t* decompressed = decompression_buffer_->mutable_data();
+ memcpy(decompressed, page_buffer->data(), levels_byte_len);
+ }
+
+ // Decompress the values
+ PARQUET_THROW_NOT_OK(decompressor_->Decompress(
+ compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
+ uncompressed_len - levels_byte_len,
+ decompression_buffer_->mutable_data() + levels_byte_len));
+
+ return decompression_buffer_;
+}
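+
+// Buffer layout handled above, shown as a sketch (not upstream documentation).
+// For DATA_PAGE_V2, levels are stored uncompressed in front of the values:
+//
+//   page_buffer:           [ rep levels | def levels | compressed values   ]
+//                            <-- levels_byte_len -->
+//   decompression_buffer_: [ rep levels | def levels | decompressed values ]
+//                            (copied via memcpy)       (codec output)
+//
+// For V1 pages levels_byte_len == 0, so the whole page goes through the codec.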
+
+} // namespace
+
+std::unique_ptr<PageReader> PageReader::Open(std::shared_ptr<ArrowInputStream> stream,
+ int64_t total_num_rows,
+ Compression::type codec,
+ ::arrow::MemoryPool* pool,
+ const CryptoContext* ctx) {
+ return std::unique_ptr<PageReader>(
+ new SerializedPageReader(std::move(stream), total_num_rows, codec, pool, ctx));
+}
+
+namespace {
+
+// ----------------------------------------------------------------------
+// Impl base class for TypedColumnReader and RecordReader
+
+// PLAIN_DICTIONARY is deprecated but was formerly used as a dictionary index
+// encoding.
+static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
+ return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
+}
+
+template <typename DType>
+class ColumnReaderImplBase {
+ public:
+ using T = typename DType::c_type;
+
+ ColumnReaderImplBase(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
+ : descr_(descr),
+ max_def_level_(descr->max_definition_level()),
+ max_rep_level_(descr->max_repetition_level()),
+ num_buffered_values_(0),
+ num_decoded_values_(0),
+ pool_(pool),
+ current_decoder_(nullptr),
+ current_encoding_(Encoding::UNKNOWN) {}
+
+ virtual ~ColumnReaderImplBase() = default;
+
+ protected:
+ // Read up to batch_size values from the current data page into the
+ // pre-allocated memory T*
+ //
+ // @returns: the number of values read into the out buffer
+ int64_t ReadValues(int64_t batch_size, T* out) {
+ int64_t num_decoded = current_decoder_->Decode(out, static_cast<int>(batch_size));
+ return num_decoded;
+ }
+
+ // Read up to batch_size values from the current data page into the
+ // pre-allocated memory T*, leaving spaces for null entries according
+ // to the def_levels.
+ //
+ // @returns: the number of values read into the out buffer
+ int64_t ReadValuesSpaced(int64_t batch_size, T* out, int64_t null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset) {
+ return current_decoder_->DecodeSpaced(out, static_cast<int>(batch_size),
+ static_cast<int>(null_count), valid_bits,
+ valid_bits_offset);
+ }
+
+ // Read multiple definition levels into preallocated memory
+ //
+ // Returns the number of decoded definition levels
+ int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels) {
+ if (max_def_level_ == 0) {
+ return 0;
+ }
+ return definition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
+ }
+
+ bool HasNextInternal() {
+ // Either there is no data page available yet, or the data page has been
+ // exhausted
+ if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) {
+ if (!ReadNewPage() || num_buffered_values_ == 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Read multiple repetition levels into preallocated memory
+ // Returns the number of decoded repetition levels
+ int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels) {
+ if (max_rep_level_ == 0) {
+ return 0;
+ }
+ return repetition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
+ }
+
+ // Advance to the next data page
+ bool ReadNewPage() {
+ // Loop until we find the next data page.
+ while (true) {
+ current_page_ = pager_->NextPage();
+ if (!current_page_) {
+ // EOS
+ return false;
+ }
+
+ if (current_page_->type() == PageType::DICTIONARY_PAGE) {
+ ConfigureDictionary(static_cast<const DictionaryPage*>(current_page_.get()));
+ continue;
+ } else if (current_page_->type() == PageType::DATA_PAGE) {
+ const auto page = std::static_pointer_cast<DataPageV1>(current_page_);
+ const int64_t levels_byte_size = InitializeLevelDecoders(
+ *page, page->repetition_level_encoding(), page->definition_level_encoding());
+ InitializeDataDecoder(*page, levels_byte_size);
+ return true;
+ } else if (current_page_->type() == PageType::DATA_PAGE_V2) {
+ const auto page = std::static_pointer_cast<DataPageV2>(current_page_);
+ int64_t levels_byte_size = InitializeLevelDecodersV2(*page);
+ InitializeDataDecoder(*page, levels_byte_size);
+ return true;
+ } else {
+ // We don't know what this page type is. We're allowed to skip non-data
+ // pages.
+ continue;
+ }
+ }
+ return true;
+ }
+
+ void ConfigureDictionary(const DictionaryPage* page) {
+ int encoding = static_cast<int>(page->encoding());
+ if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
+ page->encoding() == Encoding::PLAIN) {
+ encoding = static_cast<int>(Encoding::RLE_DICTIONARY);
+ }
+
+ auto it = decoders_.find(encoding);
+ if (it != decoders_.end()) {
+ throw ParquetException("Column cannot have more than one dictionary.");
+ }
+
+ if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
+ page->encoding() == Encoding::PLAIN) {
+ auto dictionary = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ dictionary->SetData(page->num_values(), page->data(), page->size());
+
+ // The dictionary is fully decoded during DictionaryDecoder::Init, so the
+ // DictionaryPage buffer is no longer required after this step
+ //
+ // TODO(wesm): investigate whether this all-or-nothing decoding of the
+ // dictionary makes sense and whether performance can be improved
+
+ std::unique_ptr<DictDecoder<DType>> decoder = MakeDictDecoder<DType>(descr_, pool_);
+ decoder->SetDict(dictionary.get());
+ decoders_[encoding] =
+ std::unique_ptr<DecoderType>(dynamic_cast<DecoderType*>(decoder.release()));
+ } else {
+ ParquetException::NYI("only plain dictionary encoding has been implemented");
+ }
+
+ new_dictionary_ = true;
+ current_decoder_ = decoders_[encoding].get();
+ DCHECK(current_decoder_);
+ }
+
+ // Initialize repetition and definition level decoders on the next data page.
+
+ // If the data page includes repetition and definition levels, we
+ // initialize the level decoders and return the number of encoded level bytes.
+ // The return value helps determine the number of bytes in the encoded data.
+ int64_t InitializeLevelDecoders(const DataPage& page,
+ Encoding::type repetition_level_encoding,
+ Encoding::type definition_level_encoding) {
+ // Read a data page.
+ num_buffered_values_ = page.num_values();
+
+ // Have not decoded any values from the data page yet
+ num_decoded_values_ = 0;
+
+ const uint8_t* buffer = page.data();
+ int32_t levels_byte_size = 0;
+ int32_t max_size = page.size();
+
+ // Data page Layout: Repetition Levels - Definition Levels - encoded values.
+ // Levels are encoded as rle or bit-packed.
+ // Init repetition levels
+ if (max_rep_level_ > 0) {
+ int32_t rep_levels_bytes = repetition_level_decoder_.SetData(
+ repetition_level_encoding, max_rep_level_,
+ static_cast<int>(num_buffered_values_), buffer, max_size);
+ buffer += rep_levels_bytes;
+ levels_byte_size += rep_levels_bytes;
+ max_size -= rep_levels_bytes;
+ }
+    // TODO: figure out a way to set max_def_level_ to 0
+    // if the initial value is invalid
+
+ // Init definition levels
+ if (max_def_level_ > 0) {
+ int32_t def_levels_bytes = definition_level_decoder_.SetData(
+ definition_level_encoding, max_def_level_,
+ static_cast<int>(num_buffered_values_), buffer, max_size);
+ levels_byte_size += def_levels_bytes;
+ max_size -= def_levels_bytes;
+ }
+
+ return levels_byte_size;
+ }
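+
+  // V1 data page layout consumed above, as a sketch (not upstream docs):
+  //
+  //   [ RLE/bit-packed rep levels | RLE/bit-packed def levels | encoded values ]
+  //
+  // Unlike V2, the level byte lengths are not recorded in the page header;
+  // each LevelDecoder::SetData() call returns the bytes it consumed, and the
+  // accumulated levels_byte_size tells InitializeDataDecoder() where the
+  // encoded values begin.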
+
+ int64_t InitializeLevelDecodersV2(const DataPageV2& page) {
+ // Read a data page.
+ num_buffered_values_ = page.num_values();
+
+ // Have not decoded any values from the data page yet
+ num_decoded_values_ = 0;
+ const uint8_t* buffer = page.data();
+
+ const int64_t total_levels_length =
+ static_cast<int64_t>(page.repetition_levels_byte_length()) +
+ page.definition_levels_byte_length();
+
+ if (total_levels_length > page.size()) {
+ throw ParquetException("Data page too small for levels (corrupt header?)");
+ }
+
+ if (max_rep_level_ > 0) {
+ repetition_level_decoder_.SetDataV2(page.repetition_levels_byte_length(),
+ max_rep_level_,
+ static_cast<int>(num_buffered_values_), buffer);
+ buffer += page.repetition_levels_byte_length();
+ }
+
+ if (max_def_level_ > 0) {
+ definition_level_decoder_.SetDataV2(page.definition_levels_byte_length(),
+ max_def_level_,
+ static_cast<int>(num_buffered_values_), buffer);
+ }
+
+ return total_levels_length;
+ }
+
+ // Get a decoder object for this page or create a new decoder if this is the
+ // first page with this encoding.
+ void InitializeDataDecoder(const DataPage& page, int64_t levels_byte_size) {
+ const uint8_t* buffer = page.data() + levels_byte_size;
+ const int64_t data_size = page.size() - levels_byte_size;
+
+ if (data_size < 0) {
+ throw ParquetException("Page smaller than size of encoded levels");
+ }
+
+ Encoding::type encoding = page.encoding();
+
+ if (IsDictionaryIndexEncoding(encoding)) {
+ encoding = Encoding::RLE_DICTIONARY;
+ }
+
+ auto it = decoders_.find(static_cast<int>(encoding));
+ if (it != decoders_.end()) {
+ DCHECK(it->second.get() != nullptr);
+ if (encoding == Encoding::RLE_DICTIONARY) {
+ DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY);
+ }
+ current_decoder_ = it->second.get();
+ } else {
+ switch (encoding) {
+ case Encoding::PLAIN: {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ current_decoder_ = decoder.get();
+ decoders_[static_cast<int>(encoding)] = std::move(decoder);
+ break;
+ }
+ case Encoding::BYTE_STREAM_SPLIT: {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::BYTE_STREAM_SPLIT, descr_);
+ current_decoder_ = decoder.get();
+ decoders_[static_cast<int>(encoding)] = std::move(decoder);
+ break;
+ }
+ case Encoding::RLE_DICTIONARY:
+ throw ParquetException("Dictionary page must be before data page.");
+
+ case Encoding::DELTA_BINARY_PACKED:
+ case Encoding::DELTA_LENGTH_BYTE_ARRAY:
+ case Encoding::DELTA_BYTE_ARRAY:
+ ParquetException::NYI("Unsupported encoding");
+
+ default:
+ throw ParquetException("Unknown encoding type.");
+ }
+ }
+ current_encoding_ = encoding;
+ current_decoder_->SetData(static_cast<int>(num_buffered_values_), buffer,
+ static_cast<int>(data_size));
+ }
+
+ const ColumnDescriptor* descr_;
+ const int16_t max_def_level_;
+ const int16_t max_rep_level_;
+
+ std::unique_ptr<PageReader> pager_;
+ std::shared_ptr<Page> current_page_;
+
+  // Not set if the schema path for this field has no optional or repeated
+  // elements
+ LevelDecoder definition_level_decoder_;
+
+ // Not set for flat schemas.
+ LevelDecoder repetition_level_decoder_;
+
+ // The total number of values stored in the data page. This is the maximum of
+ // the number of encoded definition levels or encoded values. For
+ // non-repeated, required columns, this is equal to the number of encoded
+ // values. For repeated or optional values, there may be fewer data values
+ // than levels, and this tells you how many encoded levels there are in that
+ // case.
+ int64_t num_buffered_values_;
+
+ // The number of values from the current data page that have been decoded
+ // into memory
+ int64_t num_decoded_values_;
+
+ ::arrow::MemoryPool* pool_;
+
+ using DecoderType = TypedDecoder<DType>;
+ DecoderType* current_decoder_;
+ Encoding::type current_encoding_;
+
+ /// Flag to signal when a new dictionary has been set, for the benefit of
+ /// DictionaryRecordReader
+ bool new_dictionary_;
+
+ // The exposed encoding
+ ExposedEncoding exposed_encoding_ = ExposedEncoding::NO_ENCODING;
+
+ // Map of encoding type to the respective decoder object. For example, a
+ // column chunk's data pages may include both dictionary-encoded and
+ // plain-encoded data.
+ std::unordered_map<int, std::unique_ptr<DecoderType>> decoders_;
+
+ void ConsumeBufferedValues(int64_t num_values) { num_decoded_values_ += num_values; }
+};
+
+// ----------------------------------------------------------------------
+// TypedColumnReader implementations
+
+template <typename DType>
+class TypedColumnReaderImpl : public TypedColumnReader<DType>,
+ public ColumnReaderImplBase<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedColumnReaderImpl(const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
+ ::arrow::MemoryPool* pool)
+ : ColumnReaderImplBase<DType>(descr, pool) {
+ this->pager_ = std::move(pager);
+ }
+
+ bool HasNext() override { return this->HasNextInternal(); }
+
+ int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, int64_t* values_read) override;
+
+ int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, uint8_t* valid_bits, int64_t valid_bits_offset,
+ int64_t* levels_read, int64_t* values_read,
+ int64_t* null_count) override;
+
+ int64_t Skip(int64_t num_rows_to_skip) override;
+
+ Type::type type() const override { return this->descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return this->descr_; }
+
+  ExposedEncoding GetExposedEncoding() override { return this->exposed_encoding_; }
+
+ int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict,
+ int32_t* dict_len) override;
+
+ protected:
+ void SetExposedEncoding(ExposedEncoding encoding) override {
+ this->exposed_encoding_ = encoding;
+ }
+
+ private:
+  // Read dictionary indices. Similar to ReadValues, but decodes data to
+  // dictionary indices. This function is called only by
+  // ReadBatchWithDictionary().
+ int64_t ReadDictionaryIndices(int64_t indices_to_read, int32_t* indices) {
+ auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
+ return decoder->DecodeIndices(static_cast<int>(indices_to_read), indices);
+ }
+
+  // Get the dictionary. The dictionary should have been set by SetDict(). It
+  // is owned by the internal decoder and is destroyed when the reader is
+  // destroyed. This function is called only by ReadBatchWithDictionary()
+  // after the dictionary is configured.
+ void GetDictionary(const T** dictionary, int32_t* dictionary_length) {
+ auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
+ decoder->GetDictionary(dictionary, dictionary_length);
+ }
+
+ // Read definition and repetition levels. Also return the number of definition levels
+ // and number of values to read. This function is called before reading values.
+ void ReadLevels(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ int64_t* num_def_levels, int64_t* values_to_read) {
+ batch_size =
+ std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
+
+ // If the field is required and non-repeated, there are no definition levels
+ if (this->max_def_level_ > 0 && def_levels != nullptr) {
+ *num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
+ // TODO(wesm): this tallying of values-to-decode can be performed with better
+ // cache-efficiency if fused with the level decoding.
+ for (int64_t i = 0; i < *num_def_levels; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++(*values_to_read);
+ }
+ }
+ } else {
+ // Required field, read all values
+ *values_to_read = batch_size;
+ }
+
+ // Not present for non-repeated fields
+ if (this->max_rep_level_ > 0 && rep_levels != nullptr) {
+ int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
+ if (def_levels != nullptr && *num_def_levels != num_rep_levels) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ }
+ }
+};
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatchWithDictionary(
+ int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict, int32_t* dict_len) {
+ bool has_dict_output = dict != nullptr && dict_len != nullptr;
+ // Similar logic as ReadValues to get pages.
+ if (!HasNext()) {
+ *indices_read = 0;
+ if (has_dict_output) {
+ *dict = nullptr;
+ *dict_len = 0;
+ }
+ return 0;
+ }
+
+ // Verify the current data page is dictionary encoded.
+ if (this->current_encoding_ != Encoding::RLE_DICTIONARY) {
+ std::stringstream ss;
+ ss << "Data page is not dictionary encoded. Encoding: "
+ << EncodingToString(this->current_encoding_);
+ throw ParquetException(ss.str());
+ }
+
+ // Get dictionary pointer and length.
+ if (has_dict_output) {
+ GetDictionary(dict, dict_len);
+ }
+
+ // Similar logic as ReadValues to get def levels and rep levels.
+ int64_t num_def_levels = 0;
+ int64_t indices_to_read = 0;
+ ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &indices_to_read);
+
+ // Read dictionary indices.
+ *indices_read = ReadDictionaryIndices(indices_to_read, indices);
+ int64_t total_indices = std::max(num_def_levels, *indices_read);
+ this->ConsumeBufferedValues(total_indices);
+
+ return total_indices;
+}
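+
+// Caller-side sketch for ReadBatchWithDictionary(), assuming `reader` is a
+// TypedColumnReader<Int32Type>* whose exposed encoding is DICTIONARY
+// (illustrative, not upstream code):
+//
+//   std::vector<int16_t> defs(1024), reps(1024);
+//   std::vector<int32_t> indices(1024);
+//   const int32_t* dict = nullptr;
+//   int32_t dict_len = 0;
+//   int64_t indices_read = 0;
+//   int64_t levels = reader->ReadBatchWithDictionary(
+//       1024, defs.data(), reps.data(), indices.data(), &indices_read, &dict,
+//       &dict_len);
+//   // Each index i in indices[0, indices_read) refers to dict[indices[i]].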
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatch(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, T* values,
+ int64_t* values_read) {
+ // HasNext invokes ReadNewPage
+ if (!HasNext()) {
+ *values_read = 0;
+ return 0;
+ }
+
+ // TODO(wesm): keep reading data pages until batch_size is reached, or the
+ // row group is finished
+ int64_t num_def_levels = 0;
+ int64_t values_to_read = 0;
+ ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &values_to_read);
+
+ *values_read = this->ReadValues(values_to_read, values);
+ int64_t total_values = std::max(num_def_levels, *values_read);
+ this->ConsumeBufferedValues(total_values);
+
+ return total_values;
+}
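+
+// Caller-side sketch for ReadBatch() on a TypedColumnReader<Int64Type>
+// (illustrative, not upstream code; `reader` is assumed):
+//
+//   std::vector<int16_t> defs(1024), reps(1024);
+//   std::vector<int64_t> values(1024);
+//   while (reader->HasNext()) {
+//     int64_t values_read = 0;
+//     int64_t levels = reader->ReadBatch(1024, defs.data(), reps.data(),
+//                                        values.data(), &values_read);
+//     // values_read <= levels: nulls occupy levels but not value slots.
+//   }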
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatchSpaced(
+ int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, T* values,
+ uint8_t* valid_bits, int64_t valid_bits_offset, int64_t* levels_read,
+ int64_t* values_read, int64_t* null_count_out) {
+ // HasNext invokes ReadNewPage
+ if (!HasNext()) {
+ *levels_read = 0;
+ *values_read = 0;
+ *null_count_out = 0;
+ return 0;
+ }
+
+ int64_t total_values;
+ // TODO(wesm): keep reading data pages until batch_size is reached, or the
+ // row group is finished
+ batch_size =
+ std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
+
+ // If the field is required and non-repeated, there are no definition levels
+ if (this->max_def_level_ > 0) {
+ int64_t num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
+
+ // Not present for non-repeated fields
+ if (this->max_rep_level_ > 0) {
+ int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
+ if (num_def_levels != num_rep_levels) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ }
+
+ const bool has_spaced_values = HasSpacedValues(this->descr_);
+ int64_t null_count = 0;
+ if (!has_spaced_values) {
+ int values_to_read = 0;
+ for (int64_t i = 0; i < num_def_levels; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++values_to_read;
+ }
+ }
+ total_values = this->ReadValues(values_to_read, values);
+ ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
+ /*length=*/total_values,
+ /*bits_are_set=*/true);
+ *values_read = total_values;
+ } else {
+ internal::LevelInfo info;
+ info.repeated_ancestor_def_level = this->max_def_level_ - 1;
+ info.def_level = this->max_def_level_;
+ info.rep_level = this->max_rep_level_;
+ internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = num_def_levels;
+ validity_io.valid_bits = valid_bits;
+ validity_io.valid_bits_offset = valid_bits_offset;
+ validity_io.null_count = null_count;
+ validity_io.values_read = *values_read;
+
+ internal::DefLevelsToBitmap(def_levels, num_def_levels, info, &validity_io);
+ null_count = validity_io.null_count;
+ *values_read = validity_io.values_read;
+
+ total_values =
+ this->ReadValuesSpaced(*values_read, values, static_cast<int>(null_count),
+ valid_bits, valid_bits_offset);
+ }
+ *levels_read = num_def_levels;
+ *null_count_out = null_count;
+
+ } else {
+ // Required field, read all values
+ total_values = this->ReadValues(batch_size, values);
+ ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
+ /*length=*/total_values,
+ /*bits_are_set=*/true);
+ *null_count_out = 0;
+ *values_read = total_values;
+ *levels_read = total_values;
+ }
+
+ this->ConsumeBufferedValues(*levels_read);
+ return total_values;
+}
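+
+// Worked example for the spaced path above (a sketch under the assumption of
+// a flat optional column with max_def_level_ == 1): for decoded
+// def_levels == {1, 0, 1},
+//
+//   values        == { v0, <hole>, v1 }   // a slot is left for the null
+//   valid_bits    == 1 0 1                // LSB-first from valid_bits_offset
+//   *levels_read  == 3, *values_read == 3, *null_count_out == 1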
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::Skip(int64_t num_rows_to_skip) {
+ int64_t rows_to_skip = num_rows_to_skip;
+ while (HasNext() && rows_to_skip > 0) {
+ // If the number of rows to skip is more than the number of undecoded values, skip the
+ // Page.
+ if (rows_to_skip > (this->num_buffered_values_ - this->num_decoded_values_)) {
+ rows_to_skip -= this->num_buffered_values_ - this->num_decoded_values_;
+ this->num_decoded_values_ = this->num_buffered_values_;
+ } else {
+ // We need to read this Page
+ // Jump to the right offset in the Page
+      int64_t batch_size = 1024;  // read in small batches to limit scratch memory
+ int64_t values_read = 0;
+
+ // This will be enough scratch space to accommodate 16-bit levels or any
+ // value type
+ std::shared_ptr<ResizableBuffer> scratch = AllocateBuffer(
+ this->pool_, batch_size * type_traits<DType::type_num>::value_byte_size);
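+
+      // Note: the def level, rep level, and value pointers in the loop below
+      // all alias this single scratch buffer; that is safe only because the
+      // decoded results are immediately discarded while skipping.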
+
+ do {
+ batch_size = std::min(batch_size, rows_to_skip);
+ values_read =
+ ReadBatch(static_cast<int>(batch_size),
+ reinterpret_cast<int16_t*>(scratch->mutable_data()),
+ reinterpret_cast<int16_t*>(scratch->mutable_data()),
+ reinterpret_cast<T*>(scratch->mutable_data()), &values_read);
+ rows_to_skip -= values_read;
+ } while (values_read > 0 && rows_to_skip > 0);
+ }
+ }
+ return num_rows_to_skip - rows_to_skip;
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Dynamic column reader constructor
+
+std::shared_ptr<ColumnReader> ColumnReader::Make(const ColumnDescriptor* descr,
+ std::unique_ptr<PageReader> pager,
+ MemoryPool* pool) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedColumnReaderImpl<BooleanType>>(descr, std::move(pager),
+ pool);
+ case Type::INT32:
+ return std::make_shared<TypedColumnReaderImpl<Int32Type>>(descr, std::move(pager),
+ pool);
+ case Type::INT64:
+ return std::make_shared<TypedColumnReaderImpl<Int64Type>>(descr, std::move(pager),
+ pool);
+ case Type::INT96:
+ return std::make_shared<TypedColumnReaderImpl<Int96Type>>(descr, std::move(pager),
+ pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedColumnReaderImpl<FloatType>>(descr, std::move(pager),
+ pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedColumnReaderImpl<DoubleType>>(descr, std::move(pager),
+ pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedColumnReaderImpl<ByteArrayType>>(
+ descr, std::move(pager), pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedColumnReaderImpl<FLBAType>>(descr, std::move(pager),
+ pool);
+ default:
+ ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<ColumnReader>(nullptr);
+}
+
+// ----------------------------------------------------------------------
+// RecordReader
+
+namespace internal {
+namespace {
+
+// The minimum number of repetition/definition levels to decode at a time, for
+// better vectorized performance when doing many smaller record reads
+constexpr int64_t kMinLevelBatchSize = 1024;
+
+template <typename DType>
+class TypedRecordReader : public ColumnReaderImplBase<DType>,
+ virtual public RecordReader {
+ public:
+ using T = typename DType::c_type;
+ using BASE = ColumnReaderImplBase<DType>;
+ TypedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool)
+ : BASE(descr, pool) {
+ leaf_info_ = leaf_info;
+ nullable_values_ = leaf_info.HasNullableValues();
+ at_record_start_ = true;
+ records_read_ = 0;
+ values_written_ = 0;
+ values_capacity_ = 0;
+ null_count_ = 0;
+ levels_written_ = 0;
+ levels_position_ = 0;
+ levels_capacity_ = 0;
+ uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY);
+
+ if (uses_values_) {
+ values_ = AllocateBuffer(pool);
+ }
+ valid_bits_ = AllocateBuffer(pool);
+ def_levels_ = AllocateBuffer(pool);
+ rep_levels_ = AllocateBuffer(pool);
+ Reset();
+ }
+
+ int64_t available_values_current_page() const {
+ return this->num_buffered_values_ - this->num_decoded_values_;
+ }
+
+ // Compute the values capacity in bytes for the given number of elements
+ int64_t bytes_for_values(int64_t nitems) const {
+ int64_t type_size = GetTypeByteSize(this->descr_->physical_type());
+ int64_t bytes_for_values = -1;
+ if (MultiplyWithOverflow(nitems, type_size, &bytes_for_values)) {
+ throw ParquetException("Total size of items too large");
+ }
+ return bytes_for_values;
+ }
+
+ int64_t ReadRecords(int64_t num_records) override {
+ // Delimit records, then read values at the end
+ int64_t records_read = 0;
+
+ if (levels_position_ < levels_written_) {
+ records_read += ReadRecordData(num_records);
+ }
+
+ int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records);
+
+    // Keep reading while we are in the middle of a record or until we have
+    // read the desired number of records; once enough records have been
+    // found, we still read through to the end of the current record
+ while (!at_record_start_ || records_read < num_records) {
+ // Is there more data to read in this row group?
+ if (!this->HasNextInternal()) {
+ if (!at_record_start_) {
+ // We ended the row group while inside a record that we haven't seen
+ // the end of yet. So increment the record count for the last record in
+ // the row group
+ ++records_read;
+ at_record_start_ = true;
+ }
+ break;
+ }
+
+ /// We perform multiple batch reads until we either exhaust the row group
+ /// or observe the desired number of records
+ int64_t batch_size = std::min(level_batch_size, available_values_current_page());
+
+ // No more data in column
+ if (batch_size == 0) {
+ break;
+ }
+
+ if (this->max_def_level_ > 0) {
+ ReserveLevels(batch_size);
+
+ int16_t* def_levels = this->def_levels() + levels_written_;
+ int16_t* rep_levels = this->rep_levels() + levels_written_;
+
+ // Not present for non-repeated fields
+ int64_t levels_read = 0;
+ if (this->max_rep_level_ > 0) {
+ levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
+ if (this->ReadRepetitionLevels(batch_size, rep_levels) != levels_read) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ } else if (this->max_def_level_ > 0) {
+ levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
+ }
+
+ // Exhausted column chunk
+ if (levels_read == 0) {
+ break;
+ }
+
+ levels_written_ += levels_read;
+ records_read += ReadRecordData(num_records - records_read);
+ } else {
+ // No repetition or definition levels
+ batch_size = std::min(num_records - records_read, batch_size);
+ records_read += ReadRecordData(batch_size);
+ }
+ }
+
+ return records_read;
+ }
+
+  // A column chunk may outwardly appear exhausted while we are in fact still
+  // in the middle of processing the last batch of decoded levels
+ bool has_values_to_process() const { return levels_position_ < levels_written_; }
+
+ std::shared_ptr<ResizableBuffer> ReleaseValues() override {
+ if (uses_values_) {
+ auto result = values_;
+ PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
+ values_ = AllocateBuffer(this->pool_);
+ values_capacity_ = 0;
+ return result;
+ } else {
+ return nullptr;
+ }
+ }
+
+ std::shared_ptr<ResizableBuffer> ReleaseIsValid() override {
+ if (leaf_info_.HasNullableValues()) {
+ auto result = valid_bits_;
+ PARQUET_THROW_NOT_OK(result->Resize(BitUtil::BytesForBits(values_written_), true));
+ valid_bits_ = AllocateBuffer(this->pool_);
+ return result;
+ } else {
+ return nullptr;
+ }
+ }
+
+ // Process written repetition/definition levels to reach the end of
+ // records. Process no more levels than necessary to delimit the indicated
+ // number of logical records. Updates internal state of RecordReader
+ //
+ // \return Number of records delimited
+ int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+ int64_t values_to_read = 0;
+ int64_t records_read = 0;
+
+ const int16_t* def_levels = this->def_levels() + levels_position_;
+ const int16_t* rep_levels = this->rep_levels() + levels_position_;
+
+ DCHECK_GT(this->max_rep_level_, 0);
+
+ // Count logical records and number of values to read
+ while (levels_position_ < levels_written_) {
+ const int16_t rep_level = *rep_levels++;
+ if (rep_level == 0) {
+ // If at_record_start_ is true, we are seeing the start of a record
+ // for the second time, such as after repeated calls to
+ // DelimitRecords. In this case we must continue until we find
+        // another record start or exhaust the ColumnChunk
+ if (!at_record_start_) {
+ // We've reached the end of a record; increment the record count.
+ ++records_read;
+ if (records_read == num_records) {
+ // We've found the number of records we were looking for. Set
+ // at_record_start_ to true and break
+ at_record_start_ = true;
+ break;
+ }
+ }
+ }
+ // We have decided to consume the level at this position; therefore we
+ // must advance until we find another record boundary
+ at_record_start_ = false;
+
+ const int16_t def_level = *def_levels++;
+ if (def_level == this->max_def_level_) {
+ ++values_to_read;
+ }
+ ++levels_position_;
+ }
+ *values_seen = values_to_read;
+ return records_read;
+ }
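+
+  // Worked example, as a sketch: with buffered rep_levels == {0, 1, 1, 0, 1}
+  // (each 0 starts a new record), DelimitRecords(2, &seen) consumes all five
+  // levels but returns 1: only the first record {0, 1, 1} is complete, while
+  // the second stays open until the next rep_level == 0 arrives or the
+  // column chunk ends (the latter is handled by the caller in ReadRecords()).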
+
+ void Reserve(int64_t capacity) override {
+ ReserveLevels(capacity);
+ ReserveValues(capacity);
+ }
+
+ int64_t UpdateCapacity(int64_t capacity, int64_t size, int64_t extra_size) {
+ if (extra_size < 0) {
+ throw ParquetException("Negative size (corrupt file?)");
+ }
+ int64_t target_size = -1;
+ if (AddWithOverflow(size, extra_size, &target_size)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ if (target_size >= (1LL << 62)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ if (capacity >= target_size) {
+ return capacity;
+ }
+ return BitUtil::NextPower2(target_size);
+ }
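+
+  // Worked example, as a sketch: UpdateCapacity(/*capacity=*/1024,
+  // /*size=*/1000, /*extra_size=*/100) computes target_size == 1100 > 1024
+  // and returns BitUtil::NextPower2(1100) == 2048, i.e. geometric growth,
+  // with overflow checks and a 2^62 hard cap to reject corrupt sizes.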
+
+ void ReserveLevels(int64_t extra_levels) {
+ if (this->max_def_level_ > 0) {
+ const int64_t new_levels_capacity =
+ UpdateCapacity(levels_capacity_, levels_written_, extra_levels);
+ if (new_levels_capacity > levels_capacity_) {
+ constexpr auto kItemSize = static_cast<int64_t>(sizeof(int16_t));
+ int64_t capacity_in_bytes = -1;
+ if (MultiplyWithOverflow(new_levels_capacity, kItemSize, &capacity_in_bytes)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ PARQUET_THROW_NOT_OK(def_levels_->Resize(capacity_in_bytes, false));
+ if (this->max_rep_level_ > 0) {
+ PARQUET_THROW_NOT_OK(rep_levels_->Resize(capacity_in_bytes, false));
+ }
+ levels_capacity_ = new_levels_capacity;
+ }
+ }
+ }
+
+ void ReserveValues(int64_t extra_values) {
+ const int64_t new_values_capacity =
+ UpdateCapacity(values_capacity_, values_written_, extra_values);
+ if (new_values_capacity > values_capacity_) {
+ // XXX(wesm): A hack to avoid memory allocation when reading directly
+ // into builder classes
+ if (uses_values_) {
+ PARQUET_THROW_NOT_OK(
+ values_->Resize(bytes_for_values(new_values_capacity), false));
+ }
+ values_capacity_ = new_values_capacity;
+ }
+ if (leaf_info_.HasNullableValues()) {
+ int64_t valid_bytes_new = BitUtil::BytesForBits(values_capacity_);
+ if (valid_bits_->size() < valid_bytes_new) {
+ int64_t valid_bytes_old = BitUtil::BytesForBits(values_written_);
+ PARQUET_THROW_NOT_OK(valid_bits_->Resize(valid_bytes_new, false));
+
+ // Avoid valgrind warnings
+ memset(valid_bits_->mutable_data() + valid_bytes_old, 0,
+ valid_bytes_new - valid_bytes_old);
+ }
+ }
+ }
+
+ void Reset() override {
+ ResetValues();
+
+ if (levels_written_ > 0) {
+ const int64_t levels_remaining = levels_written_ - levels_position_;
+ // Shift remaining levels to beginning of buffer and trim to only the number
+ // of decoded levels remaining
+ int16_t* def_data = def_levels();
+ int16_t* rep_data = rep_levels();
+
+ std::copy(def_data + levels_position_, def_data + levels_written_, def_data);
+ PARQUET_THROW_NOT_OK(
+ def_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+
+ if (this->max_rep_level_ > 0) {
+ std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
+ PARQUET_THROW_NOT_OK(
+ rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+ }
+
+ levels_written_ -= levels_position_;
+ levels_position_ = 0;
+ levels_capacity_ = levels_remaining;
+ }
+
+ records_read_ = 0;
+
+ // Call Finish on the binary builders to reset them
+ }
+
+ void SetPageReader(std::unique_ptr<PageReader> reader) override {
+ at_record_start_ = true;
+ this->pager_ = std::move(reader);
+ ResetDecoders();
+ }
+
+ bool HasMoreData() const override { return this->pager_ != nullptr; }
+
+ // Dictionary decoders must be reset when advancing row groups
+ void ResetDecoders() { this->decoders_.clear(); }
+
+ virtual void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) {
+ uint8_t* valid_bits = valid_bits_->mutable_data();
+ const int64_t valid_bits_offset = values_written_;
+
+ int64_t num_decoded = this->current_decoder_->DecodeSpaced(
+ ValuesHead<T>(), static_cast<int>(values_with_nulls),
+ static_cast<int>(null_count), valid_bits, valid_bits_offset);
+ DCHECK_EQ(num_decoded, values_with_nulls);
+ }
+
+ virtual void ReadValuesDense(int64_t values_to_read) {
+ int64_t num_decoded =
+ this->current_decoder_->Decode(ValuesHead<T>(), static_cast<int>(values_to_read));
+ DCHECK_EQ(num_decoded, values_to_read);
+ }
+
+ // Return number of logical records read
+ int64_t ReadRecordData(int64_t num_records) {
+ // Conservative upper bound
+ const int64_t possible_num_values =
+ std::max(num_records, levels_written_ - levels_position_);
+ ReserveValues(possible_num_values);
+
+ const int64_t start_levels_position = levels_position_;
+
+ int64_t values_to_read = 0;
+ int64_t records_read = 0;
+ if (this->max_rep_level_ > 0) {
+ records_read = DelimitRecords(num_records, &values_to_read);
+ } else if (this->max_def_level_ > 0) {
+ // No repetition levels, skip delimiting logic. Each level represents a
+ // null or not null entry
+ records_read = std::min(levels_written_ - levels_position_, num_records);
+
+ // This is advanced by DelimitRecords, which we skipped
+ levels_position_ += records_read;
+ } else {
+ records_read = values_to_read = num_records;
+ }
+
+ int64_t null_count = 0;
+ if (leaf_info_.HasNullableValues()) {
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = levels_position_ - start_levels_position;
+ validity_io.valid_bits = valid_bits_->mutable_data();
+ validity_io.valid_bits_offset = values_written_;
+
+ DefLevelsToBitmap(def_levels() + start_levels_position,
+ levels_position_ - start_levels_position, leaf_info_,
+ &validity_io);
+ values_to_read = validity_io.values_read - validity_io.null_count;
+ null_count = validity_io.null_count;
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesSpaced(validity_io.values_read, null_count);
+ } else {
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesDense(values_to_read);
+ }
+ if (this->leaf_info_.def_level > 0) {
+ // Optional, repeated, or some mix thereof
+ this->ConsumeBufferedValues(levels_position_ - start_levels_position);
+ } else {
+ // Flat, non-repeated
+ this->ConsumeBufferedValues(values_to_read);
+ }
+ // Total values, including null spaces, if any
+ values_written_ += values_to_read + null_count;
+ null_count_ += null_count;
+
+ return records_read;
+ }
+
+ void DebugPrintState() override {
+ const int16_t* def_levels = this->def_levels();
+ const int16_t* rep_levels = this->rep_levels();
+ const int64_t total_levels_read = levels_position_;
+
+ const T* vals = reinterpret_cast<const T*>(this->values());
+
+ std::cout << "def levels: ";
+ for (int64_t i = 0; i < total_levels_read; ++i) {
+ std::cout << def_levels[i] << " ";
+ }
+ std::cout << std::endl;
+
+ std::cout << "rep levels: ";
+ for (int64_t i = 0; i < total_levels_read; ++i) {
+ std::cout << rep_levels[i] << " ";
+ }
+ std::cout << std::endl;
+
+ std::cout << "values: ";
+ for (int64_t i = 0; i < this->values_written(); ++i) {
+ std::cout << vals[i] << " ";
+ }
+ std::cout << std::endl;
+ }
+
+ void ResetValues() {
+ if (values_written_ > 0) {
+ // Resize to 0, but do not shrink to fit
+ if (uses_values_) {
+ PARQUET_THROW_NOT_OK(values_->Resize(0, false));
+ }
+ PARQUET_THROW_NOT_OK(valid_bits_->Resize(0, false));
+ values_written_ = 0;
+ values_capacity_ = 0;
+ null_count_ = 0;
+ }
+ }
+
+ protected:
+ template <typename T>
+ T* ValuesHead() {
+ return reinterpret_cast<T*>(values_->mutable_data()) + values_written_;
+ }
+ LevelInfo leaf_info_;
+};
+
+class FLBARecordReader : public TypedRecordReader<FLBAType>,
+ virtual public BinaryRecordReader {
+ public:
+ FLBARecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<FLBAType>(descr, leaf_info, pool), builder_(nullptr) {
+ DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY);
+ int byte_width = descr_->type_length();
+ std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width);
+ builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, this->pool_));
+ }
+
+ ::arrow::ArrayVector GetBuilderChunks() override {
+ std::shared_ptr<::arrow::Array> chunk;
+ PARQUET_THROW_NOT_OK(builder_->Finish(&chunk));
+ return ::arrow::ArrayVector({chunk});
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ auto values = ValuesHead<FLBA>();
+ int64_t num_decoded =
+ this->current_decoder_->Decode(values, static_cast<int>(values_to_read));
+ DCHECK_EQ(num_decoded, values_to_read);
+
+ for (int64_t i = 0; i < num_decoded; i++) {
+ PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
+ }
+ ResetValues();
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ uint8_t* valid_bits = valid_bits_->mutable_data();
+ const int64_t valid_bits_offset = values_written_;
+ auto values = ValuesHead<FLBA>();
+
+ int64_t num_decoded = this->current_decoder_->DecodeSpaced(
+ values, static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits, valid_bits_offset);
+ DCHECK_EQ(num_decoded, values_to_read);
+
+ for (int64_t i = 0; i < num_decoded; i++) {
+ if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
+ PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
+ } else {
+ PARQUET_THROW_NOT_OK(builder_->AppendNull());
+ }
+ }
+ ResetValues();
+ }
+
+ private:
+ std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
+};
+
+class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
+ virtual public BinaryRecordReader {
+ public:
+ ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool) {
+ DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
+ accumulator_.builder.reset(new ::arrow::BinaryBuilder(pool));
+ }
+
+ ::arrow::ArrayVector GetBuilderChunks() override {
+ ::arrow::ArrayVector result = accumulator_.chunks;
+ if (result.size() == 0 || accumulator_.builder->length() > 0) {
+ std::shared_ptr<::arrow::Array> last_chunk;
+ PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
+ result.push_back(std::move(last_chunk));
+ }
+ accumulator_.chunks = {};
+ return result;
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
+ static_cast<int>(values_to_read), &accumulator_);
+ DCHECK_EQ(num_decoded, values_to_read);
+ ResetValues();
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ int64_t num_decoded = this->current_decoder_->DecodeArrow(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &accumulator_);
+ DCHECK_EQ(num_decoded, values_to_read - null_count);
+ ResetValues();
+ }
+
+ private:
+ // Helper data structure for accumulating builder chunks
+ typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
+};
+
+class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
+ virtual public DictionaryRecordReader {
+ public:
+ ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool), builder_(pool) {
+ this->read_dictionary_ = true;
+ }
+
+ std::shared_ptr<::arrow::ChunkedArray> GetResult() override {
+ FlushBuilder();
+ std::vector<std::shared_ptr<::arrow::Array>> result;
+ std::swap(result, result_chunks_);
+ return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type());
+ }
+
+ void FlushBuilder() {
+ if (builder_.length() > 0) {
+ std::shared_ptr<::arrow::Array> chunk;
+ PARQUET_THROW_NOT_OK(builder_.Finish(&chunk));
+ result_chunks_.emplace_back(std::move(chunk));
+
+ // Also clears the dictionary memo table
+ builder_.Reset();
+ }
+ }
+
+ void MaybeWriteNewDictionary() {
+ if (this->new_dictionary_) {
+ /// If there is a new dictionary, we may need to flush the builder, then
+ /// insert the new dictionary values
+ FlushBuilder();
+ builder_.ResetFull();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ decoder->InsertDictionary(&builder_);
+ this->new_dictionary_ = false;
+ }
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ int64_t num_decoded = 0;
+ if (current_encoding_ == Encoding::RLE_DICTIONARY) {
+ MaybeWriteNewDictionary();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ num_decoded = decoder->DecodeIndices(static_cast<int>(values_to_read), &builder_);
+ } else {
+ num_decoded = this->current_decoder_->DecodeArrowNonNull(
+ static_cast<int>(values_to_read), &builder_);
+
+ /// Flush values since they have been copied into the builder
+ ResetValues();
+ }
+ DCHECK_EQ(num_decoded, values_to_read);
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ int64_t num_decoded = 0;
+ if (current_encoding_ == Encoding::RLE_DICTIONARY) {
+ MaybeWriteNewDictionary();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ num_decoded = decoder->DecodeIndicesSpaced(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &builder_);
+ } else {
+ num_decoded = this->current_decoder_->DecodeArrow(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &builder_);
+
+ /// Flush values since they have been copied into the builder
+ ResetValues();
+ }
+ DCHECK_EQ(num_decoded, values_to_read - null_count);
+ }
+
+ private:
+ using BinaryDictDecoder = DictDecoder<ByteArrayType>;
+
+ ::arrow::BinaryDictionary32Builder builder_;
+ std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
+};
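+
+// Note, as a sketch of the behavior above (not upstream documentation): a
+// writer may fall back from dictionary to plain encoding partway through a
+// column chunk, e.g. when the dictionary grows too large. Both kinds of pages
+// are accumulated into the same chunked result: RLE_DICTIONARY pages append
+// indices into the shared dictionary via DecodeIndices()/DecodeIndicesSpaced(),
+// while fallback pages append raw byte-array values through
+// DecodeArrow()/DecodeArrowNonNull().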
+
+// TODO(wesm): Implement these to some satisfaction
+template <>
+void TypedRecordReader<Int96Type>::DebugPrintState() {}
+
+template <>
+void TypedRecordReader<ByteArrayType>::DebugPrintState() {}
+
+template <>
+void TypedRecordReader<FLBAType>::DebugPrintState() {}
+
+std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor* descr,
+ LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool,
+ bool read_dictionary) {
+ if (read_dictionary) {
+ return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info, pool);
+ } else {
+ return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info, pool);
+ }
+}
+
+} // namespace
+
+std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
+ LevelInfo leaf_info, MemoryPool* pool,
+ const bool read_dictionary) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedRecordReader<BooleanType>>(descr, leaf_info, pool);
+ case Type::INT32:
+ return std::make_shared<TypedRecordReader<Int32Type>>(descr, leaf_info, pool);
+ case Type::INT64:
+ return std::make_shared<TypedRecordReader<Int64Type>>(descr, leaf_info, pool);
+ case Type::INT96:
+ return std::make_shared<TypedRecordReader<Int96Type>>(descr, leaf_info, pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedRecordReader<FloatType>>(descr, leaf_info, pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedRecordReader<DoubleType>>(descr, leaf_info, pool);
+ case Type::BYTE_ARRAY:
+ return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<FLBARecordReader>(descr, leaf_info, pool);
+ default: {
+ // PARQUET-1481: This can occur if the file is corrupt
+ std::stringstream ss;
+ ss << "Invalid physical column type: " << static_cast<int>(descr->physical_type());
+ throw ParquetException(ss.str());
+ }
+ }
+ // Unreachable code, but suppress compiler warning
+ return nullptr;
+}
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
index 8c48e4d7843..7f51cff2e97 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
@@ -1,376 +1,376 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "parquet/exception.h"
-#include "parquet/level_conversion.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-class ChunkedArray;
-
-namespace BitUtil {
-class BitReader;
-} // namespace BitUtil
-
-namespace util {
-class RleDecoder;
-} // namespace util
-
-} // namespace arrow
-
-namespace parquet {
-
-class Decryptor;
-class Page;
-
-// 16 MB is the default maximum page header size
-static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
-
-// 16 KB is the default expected page header size
-static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
-
-class PARQUET_EXPORT LevelDecoder {
- public:
- LevelDecoder();
- ~LevelDecoder();
-
- // Initialize the LevelDecoder state with new data
- // and return the number of bytes consumed
- int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
- const uint8_t* data, int32_t data_size);
-
- void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
- const uint8_t* data);
-
- // Decodes a batch of levels into an array and returns the number of levels decoded
- int Decode(int batch_size, int16_t* levels);
-
- private:
- int bit_width_;
- int num_values_remaining_;
- Encoding::type encoding_;
- std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
- std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_;
- int16_t max_level_;
-};
-
-struct CryptoContext {
- CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal,
- std::shared_ptr<Decryptor> meta, std::shared_ptr<Decryptor> data)
- : start_decrypt_with_dictionary_page(start_with_dictionary_page),
- row_group_ordinal(rg_ordinal),
- column_ordinal(col_ordinal),
- meta_decryptor(std::move(meta)),
- data_decryptor(std::move(data)) {}
- CryptoContext() {}
-
- bool start_decrypt_with_dictionary_page = false;
- int16_t row_group_ordinal = -1;
- int16_t column_ordinal = -1;
- std::shared_ptr<Decryptor> meta_decryptor;
- std::shared_ptr<Decryptor> data_decryptor;
-};
-
-// Abstract page iterator interface. This way, we can feed column pages to the
-// ColumnReader through whatever mechanism we choose
-class PARQUET_EXPORT PageReader {
- public:
- virtual ~PageReader() = default;
-
- static std::unique_ptr<PageReader> Open(
- std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
- Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
- const CryptoContext* ctx = NULLPTR);
-
- // @returns: shared_ptr<Page>(nullptr) on EOS, std::shared_ptr<Page>
- // containing new Page otherwise
- virtual std::shared_ptr<Page> NextPage() = 0;
-
- virtual void set_max_page_header_size(uint32_t size) = 0;
-};
-
-class PARQUET_EXPORT ColumnReader {
- public:
- virtual ~ColumnReader() = default;
-
- static std::shared_ptr<ColumnReader> Make(
- const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- // Returns true if there are still values in this column.
- virtual bool HasNext() = 0;
-
- virtual Type::type type() const = 0;
-
- virtual const ColumnDescriptor* descr() const = 0;
-
- // Get the encoding that can be exposed by this reader. If it returns
- // dictionary encoding, then ReadBatchWithDictionary can be used to read data.
- //
- // \note API EXPERIMENTAL
- virtual ExposedEncoding GetExposedEncoding() = 0;
-
- protected:
- friend class RowGroupReader;
- // Set the encoding that can be exposed by this reader.
- //
- // \note API EXPERIMENTAL
- virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
-};
-
-// API to read values from a single column. This is the main client-facing API.
-template <typename DType>
-class TypedColumnReader : public ColumnReader {
- public:
- typedef typename DType::c_type T;
-
- // Read a batch of repetition levels, definition levels, and values from the
- // column.
- //
-  // Since null values are not stored in the values buffer, the number of values read
- // may be less than the number of repetition and definition levels. With
- // nested data this is almost certainly true.
- //
- // Set def_levels or rep_levels to nullptr if you want to skip reading them.
- // This is only safe if you know through some other source that there are no
- // undefined values.
- //
- // To fully exhaust a row group, you must read batches until the number of
- // values read reaches the number of stored values according to the metadata.
- //
- // This API is the same for both V1 and V2 of the DataPage
- //
- // @returns: actual number of levels read (see values_read for number of values read)
- virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- T* values, int64_t* values_read) = 0;
-
- /// Read a batch of repetition levels, definition levels, and values from the
- /// column and leave spaces for null entries on the lowest level in the values
- /// buffer.
- ///
-  /// In comparison to ReadBatch, the length of the repetition and definition
-  /// levels is the same as the number of values read when max_definition_level == 1.
- /// In the case of max_definition_level > 1, the repetition and definition
- /// levels are larger than the values but the values include the null entries
- /// with definition_level == (max_definition_level - 1).
- ///
- /// To fully exhaust a row group, you must read batches until the number of
- /// values read reaches the number of stored values according to the metadata.
- ///
- /// @param batch_size the number of levels to read
- /// @param[out] def_levels The Parquet definition levels, output has
- /// the length levels_read.
- /// @param[out] rep_levels The Parquet repetition levels, output has
- /// the length levels_read.
- /// @param[out] values The values in the lowest nested level including
- /// spacing for nulls on the lowest levels; output has the length
- /// values_read.
- /// @param[out] valid_bits Memory allocated for a bitmap that indicates if
- /// the row is null or on the maximum definition level. For performance
- /// reasons the underlying buffer should be able to store 1 bit more than
- /// required. If this requires an additional byte, this byte is only read
- /// but never written to.
- /// @param valid_bits_offset The offset in bits of the valid_bits where the
- /// first relevant bit resides.
- /// @param[out] levels_read The number of repetition/definition levels that were read.
- /// @param[out] values_read The number of values read, this includes all
- /// non-null entries as well as all null-entries on the lowest level
- /// (i.e. definition_level == max_definition_level - 1)
- /// @param[out] null_count The number of nulls on the lowest levels.
- /// (i.e. (values_read - null_count) is total number of non-null entries)
- ///
- /// \deprecated Since 4.0.0
- ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.")
- virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, T* values, uint8_t* valid_bits,
- int64_t valid_bits_offset, int64_t* levels_read,
- int64_t* values_read, int64_t* null_count) = 0;
-
- // Skip reading levels
- // Returns the number of levels skipped
- virtual int64_t Skip(int64_t num_rows_to_skip) = 0;
-
-  // Read a batch of repetition levels, definition levels, and indices from the
-  // column, and read the dictionary if a dictionary page is encountered while
-  // reading pages. This API is similar to ReadBatch(), with the added ability
-  // to read the dictionary and indices. It is only valid to call this method
-  // when the reader can
- // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns
- // DICTIONARY).
- //
- // The dictionary is read along with the data page. When there's no data page,
- // the dictionary won't be returned.
- //
- // @param batch_size The batch size to read
- // @param[out] def_levels The Parquet definition levels.
- // @param[out] rep_levels The Parquet repetition levels.
- // @param[out] indices The dictionary indices.
- // @param[out] indices_read The number of indices read.
- // @param[out] dict The pointer to dictionary values. It will return nullptr if
- // there's no data page. Each column chunk only has one dictionary page. The dictionary
- // is owned by the reader, so the caller is responsible for copying the dictionary
- // values before the reader gets destroyed.
- // @param[out] dict_len The dictionary length. It will return 0 if there's no data
- // page.
-  // @returns: actual number of levels read (see indices_read for the number of
-  // indices read)
- //
- // \note API EXPERIMENTAL
- virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, int32_t* indices,
- int64_t* indices_read, const T** dict,
- int32_t* dict_len) = 0;
-};
-
-namespace internal {
-
-/// \brief Stateful column reader that delimits semantic records for both flat
-/// and nested columns
-///
-/// \note API EXPERIMENTAL
-/// \since 1.3.0
-class RecordReader {
- public:
- static std::shared_ptr<RecordReader> Make(
- const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
- const bool read_dictionary = false);
-
- virtual ~RecordReader() = default;
-
- /// \brief Attempt to read indicated number of records from column chunk
- /// \return number of records read
- virtual int64_t ReadRecords(int64_t num_records) = 0;
-
- /// \brief Pre-allocate space for data. Results in better flat read performance
- virtual void Reserve(int64_t num_values) = 0;
-
- /// \brief Clear consumed values and repetition/definition levels as the
- /// result of calling ReadRecords
- virtual void Reset() = 0;
-
- /// \brief Transfer filled values buffer to caller. A new one will be
- /// allocated in subsequent ReadRecords calls
- virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
-
- /// \brief Transfer filled validity bitmap buffer to caller. A new one will
- /// be allocated in subsequent ReadRecords calls
- virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
-
- /// \brief Return true if the record reader has more internal data yet to
- /// process
- virtual bool HasMoreData() const = 0;
-
- /// \brief Advance record reader to the next row group
- /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
- virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
-
- virtual void DebugPrintState() = 0;
-
- /// \brief Decoded definition levels
- int16_t* def_levels() const {
- return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
- }
-
- /// \brief Decoded repetition levels
- int16_t* rep_levels() const {
- return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
- }
-
- /// \brief Decoded values, including nulls, if any
- uint8_t* values() const { return values_->mutable_data(); }
-
- /// \brief Number of values written including nulls (if any)
- int64_t values_written() const { return values_written_; }
-
- /// \brief Number of definition / repetition levels (from those that have
- /// been decoded) that have been consumed inside the reader.
- int64_t levels_position() const { return levels_position_; }
-
- /// \brief Number of definition / repetition levels that have been written
- /// internally in the reader
- int64_t levels_written() const { return levels_written_; }
-
- /// \brief Number of nulls in the leaf
- int64_t null_count() const { return null_count_; }
-
- /// \brief True if the leaf values are nullable
- bool nullable_values() const { return nullable_values_; }
-
- /// \brief True if reading directly as Arrow dictionary-encoded
- bool read_dictionary() const { return read_dictionary_; }
-
- protected:
- bool nullable_values_;
-
- bool at_record_start_;
- int64_t records_read_;
-
- int64_t values_written_;
- int64_t values_capacity_;
- int64_t null_count_;
-
- int64_t levels_written_;
- int64_t levels_position_;
- int64_t levels_capacity_;
-
- std::shared_ptr<::arrow::ResizableBuffer> values_;
-  // If false, the values buffer is not allocated; values are read directly into
-  // builder classes.
- bool uses_values_;
-
- std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
- std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
- std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
-
- bool read_dictionary_ = false;
-};
-
-class BinaryRecordReader : virtual public RecordReader {
- public:
- virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
-};
-
-/// \brief Read records directly to dictionary-encoded Arrow form (int32
-/// indices). Only valid for BYTE_ARRAY columns
-class DictionaryRecordReader : virtual public RecordReader {
- public:
- virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
-};
-
-} // namespace internal
-
-using BoolReader = TypedColumnReader<BooleanType>;
-using Int32Reader = TypedColumnReader<Int32Type>;
-using Int64Reader = TypedColumnReader<Int64Type>;
-using Int96Reader = TypedColumnReader<Int96Type>;
-using FloatReader = TypedColumnReader<FloatType>;
-using DoubleReader = TypedColumnReader<DoubleType>;
-using ByteArrayReader = TypedColumnReader<ByteArrayType>;
-using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "parquet/exception.h"
+#include "parquet/level_conversion.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+
+namespace BitUtil {
+class BitReader;
+} // namespace BitUtil
+
+namespace util {
+class RleDecoder;
+} // namespace util
+
+} // namespace arrow
+
+namespace parquet {
+
+class Decryptor;
+class Page;
+
+// 16 MB is the default maximum page header size
+static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
+
+// 16 KB is the default expected page header size
+static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
+
+class PARQUET_EXPORT LevelDecoder {
+ public:
+ LevelDecoder();
+ ~LevelDecoder();
+
+ // Initialize the LevelDecoder state with new data
+ // and return the number of bytes consumed
+ int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+ const uint8_t* data, int32_t data_size);
+
+ void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
+ const uint8_t* data);
+
+ // Decodes a batch of levels into an array and returns the number of levels decoded
+ int Decode(int batch_size, int16_t* levels);
+
+ private:
+ int bit_width_;
+ int num_values_remaining_;
+ Encoding::type encoding_;
+ std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
+ std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_;
+ int16_t max_level_;
+};
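+
+// Illustrative usage sketch (comments only, not part of the API): decoding a
+// run of definition levels from a data page. `encoded`, `encoded_len` and
+// `num_buffered_values` are assumed to come from an already-parsed page header.
+//
+//   LevelDecoder decoder;
+//   int consumed = decoder.SetData(Encoding::RLE, /*max_level=*/1,
+//                                  num_buffered_values, encoded, encoded_len);
+//   std::vector<int16_t> def_levels(num_buffered_values);
+//   int decoded = decoder.Decode(static_cast<int>(def_levels.size()),
+//                                def_levels.data());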
+
+struct CryptoContext {
+ CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal,
+ std::shared_ptr<Decryptor> meta, std::shared_ptr<Decryptor> data)
+ : start_decrypt_with_dictionary_page(start_with_dictionary_page),
+ row_group_ordinal(rg_ordinal),
+ column_ordinal(col_ordinal),
+ meta_decryptor(std::move(meta)),
+ data_decryptor(std::move(data)) {}
+ CryptoContext() {}
+
+ bool start_decrypt_with_dictionary_page = false;
+ int16_t row_group_ordinal = -1;
+ int16_t column_ordinal = -1;
+ std::shared_ptr<Decryptor> meta_decryptor;
+ std::shared_ptr<Decryptor> data_decryptor;
+};
+
+// Abstract page iterator interface. This way, we can feed column pages to the
+// ColumnReader through whatever mechanism we choose.
+class PARQUET_EXPORT PageReader {
+ public:
+ virtual ~PageReader() = default;
+
+ static std::unique_ptr<PageReader> Open(
+ std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
+ Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ const CryptoContext* ctx = NULLPTR);
+
+  // @returns: shared_ptr<Page>(nullptr) on EOS; otherwise a std::shared_ptr<Page>
+  // containing the new Page
+ virtual std::shared_ptr<Page> NextPage() = 0;
+
+ virtual void set_max_page_header_size(uint32_t size) = 0;
+};
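+
+// Usage sketch, assuming `stream` (an ArrowInputStream) and `num_rows` are
+// obtained from the caller: NextPage() yields pages until it returns nullptr
+// at end-of-stream.
+//
+//   auto pager = PageReader::Open(stream, num_rows, Compression::UNCOMPRESSED);
+//   while (std::shared_ptr<Page> page = pager->NextPage()) {
+//     // Inspect the page type and contents here.
+//   }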
+
+class PARQUET_EXPORT ColumnReader {
+ public:
+ virtual ~ColumnReader() = default;
+
+ static std::shared_ptr<ColumnReader> Make(
+ const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ // Returns true if there are still values in this column.
+ virtual bool HasNext() = 0;
+
+ virtual Type::type type() const = 0;
+
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ // Get the encoding that can be exposed by this reader. If it returns
+ // dictionary encoding, then ReadBatchWithDictionary can be used to read data.
+ //
+ // \note API EXPERIMENTAL
+ virtual ExposedEncoding GetExposedEncoding() = 0;
+
+ protected:
+ friend class RowGroupReader;
+ // Set the encoding that can be exposed by this reader.
+ //
+ // \note API EXPERIMENTAL
+ virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
+};
+
+// API to read values from a single column. This is the main client-facing API.
+template <typename DType>
+class TypedColumnReader : public ColumnReader {
+ public:
+ typedef typename DType::c_type T;
+
+ // Read a batch of repetition levels, definition levels, and values from the
+ // column.
+ //
+  // Since null values are not stored in the values buffer, the number of values read
+ // may be less than the number of repetition and definition levels. With
+ // nested data this is almost certainly true.
+ //
+ // Set def_levels or rep_levels to nullptr if you want to skip reading them.
+ // This is only safe if you know through some other source that there are no
+ // undefined values.
+ //
+ // To fully exhaust a row group, you must read batches until the number of
+ // values read reaches the number of stored values according to the metadata.
+ //
+ // This API is the same for both V1 and V2 of the DataPage
+ //
+ // @returns: actual number of levels read (see values_read for number of values read)
+ virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, int64_t* values_read) = 0;
+
+ /// Read a batch of repetition levels, definition levels, and values from the
+ /// column and leave spaces for null entries on the lowest level in the values
+ /// buffer.
+ ///
+  /// In comparison to ReadBatch, the length of the repetition and definition
+  /// levels is the same as the number of values read when max_definition_level == 1.
+ /// In the case of max_definition_level > 1, the repetition and definition
+ /// levels are larger than the values but the values include the null entries
+ /// with definition_level == (max_definition_level - 1).
+ ///
+ /// To fully exhaust a row group, you must read batches until the number of
+ /// values read reaches the number of stored values according to the metadata.
+ ///
+ /// @param batch_size the number of levels to read
+ /// @param[out] def_levels The Parquet definition levels, output has
+ /// the length levels_read.
+ /// @param[out] rep_levels The Parquet repetition levels, output has
+ /// the length levels_read.
+ /// @param[out] values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; output has the length
+ /// values_read.
+ /// @param[out] valid_bits Memory allocated for a bitmap that indicates if
+ /// the row is null or on the maximum definition level. For performance
+ /// reasons the underlying buffer should be able to store 1 bit more than
+ /// required. If this requires an additional byte, this byte is only read
+ /// but never written to.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param[out] levels_read The number of repetition/definition levels that were read.
+ /// @param[out] values_read The number of values read, this includes all
+ /// non-null entries as well as all null-entries on the lowest level
+ /// (i.e. definition_level == max_definition_level - 1)
+ /// @param[out] null_count The number of nulls on the lowest levels.
+ /// (i.e. (values_read - null_count) is total number of non-null entries)
+ ///
+ /// \deprecated Since 4.0.0
+ ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.")
+ virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, T* values, uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t* levels_read,
+ int64_t* values_read, int64_t* null_count) = 0;
+
+ // Skip reading levels
+ // Returns the number of levels skipped
+ virtual int64_t Skip(int64_t num_rows_to_skip) = 0;
+
+  // Read a batch of repetition levels, definition levels, and indices from the
+  // column, and read the dictionary if a dictionary page is encountered while
+  // reading pages. This API is similar to ReadBatch(), with the added ability
+  // to read the dictionary and indices. It is only valid to call this method
+  // when the reader can
+ // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns
+ // DICTIONARY).
+ //
+ // The dictionary is read along with the data page. When there's no data page,
+ // the dictionary won't be returned.
+ //
+ // @param batch_size The batch size to read
+ // @param[out] def_levels The Parquet definition levels.
+ // @param[out] rep_levels The Parquet repetition levels.
+ // @param[out] indices The dictionary indices.
+ // @param[out] indices_read The number of indices read.
+ // @param[out] dict The pointer to dictionary values. It will return nullptr if
+ // there's no data page. Each column chunk only has one dictionary page. The dictionary
+ // is owned by the reader, so the caller is responsible for copying the dictionary
+ // values before the reader gets destroyed.
+ // @param[out] dict_len The dictionary length. It will return 0 if there's no data
+ // page.
+  // @returns: actual number of levels read (see indices_read for the number of
+  // indices read)
+ //
+ // \note API EXPERIMENTAL
+ virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict,
+ int32_t* dict_len) = 0;
+};
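+
+// A minimal sketch of draining one column chunk with ReadBatch, following the
+// contract documented above. `reader` is assumed to be a
+// std::shared_ptr<ColumnReader> for an INT32 column; null handling is omitted.
+//
+//   auto* typed = static_cast<Int32Reader*>(reader.get());
+//   std::vector<int16_t> defs(1024), reps(1024);
+//   std::vector<int32_t> vals(1024);
+//   while (typed->HasNext()) {
+//     int64_t values_read = 0;
+//     typed->ReadBatch(1024, defs.data(), reps.data(), vals.data(),
+//                      &values_read);
+//     // values_read may be smaller than the number of levels read when nulls
+//     // are present.
+//   }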
+
+namespace internal {
+
+/// \brief Stateful column reader that delimits semantic records for both flat
+/// and nested columns
+///
+/// \note API EXPERIMENTAL
+/// \since 1.3.0
+class RecordReader {
+ public:
+ static std::shared_ptr<RecordReader> Make(
+ const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ const bool read_dictionary = false);
+
+ virtual ~RecordReader() = default;
+
+ /// \brief Attempt to read indicated number of records from column chunk
+ /// \return number of records read
+ virtual int64_t ReadRecords(int64_t num_records) = 0;
+
+ /// \brief Pre-allocate space for data. Results in better flat read performance
+ virtual void Reserve(int64_t num_values) = 0;
+
+ /// \brief Clear consumed values and repetition/definition levels as the
+ /// result of calling ReadRecords
+ virtual void Reset() = 0;
+
+ /// \brief Transfer filled values buffer to caller. A new one will be
+ /// allocated in subsequent ReadRecords calls
+ virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
+
+ /// \brief Transfer filled validity bitmap buffer to caller. A new one will
+ /// be allocated in subsequent ReadRecords calls
+ virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
+
+ /// \brief Return true if the record reader has more internal data yet to
+ /// process
+ virtual bool HasMoreData() const = 0;
+
+ /// \brief Advance record reader to the next row group
+ /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
+ virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
+
+ virtual void DebugPrintState() = 0;
+
+ /// \brief Decoded definition levels
+ int16_t* def_levels() const {
+ return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
+ }
+
+ /// \brief Decoded repetition levels
+ int16_t* rep_levels() const {
+ return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
+ }
+
+ /// \brief Decoded values, including nulls, if any
+ uint8_t* values() const { return values_->mutable_data(); }
+
+ /// \brief Number of values written including nulls (if any)
+ int64_t values_written() const { return values_written_; }
+
+ /// \brief Number of definition / repetition levels (from those that have
+ /// been decoded) that have been consumed inside the reader.
+ int64_t levels_position() const { return levels_position_; }
+
+ /// \brief Number of definition / repetition levels that have been written
+ /// internally in the reader
+ int64_t levels_written() const { return levels_written_; }
+
+ /// \brief Number of nulls in the leaf
+ int64_t null_count() const { return null_count_; }
+
+ /// \brief True if the leaf values are nullable
+ bool nullable_values() const { return nullable_values_; }
+
+ /// \brief True if reading directly as Arrow dictionary-encoded
+ bool read_dictionary() const { return read_dictionary_; }
+
+ protected:
+ bool nullable_values_;
+
+ bool at_record_start_;
+ int64_t records_read_;
+
+ int64_t values_written_;
+ int64_t values_capacity_;
+ int64_t null_count_;
+
+ int64_t levels_written_;
+ int64_t levels_position_;
+ int64_t levels_capacity_;
+
+ std::shared_ptr<::arrow::ResizableBuffer> values_;
+  // If false, the values buffer is not allocated; values are read directly into
+  // builder classes.
+ bool uses_values_;
+
+ std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
+ std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
+ std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
+
+ bool read_dictionary_ = false;
+};
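+
+// Record-oriented reading sketch (EXPERIMENTAL, see above). `descr`,
+// `leaf_info` and `pager` are assumed to come from the file metadata and a
+// RowGroupReader; the released buffers are owned by the caller afterwards.
+//
+//   auto record_reader = RecordReader::Make(descr, leaf_info);
+//   record_reader->SetPageReader(std::move(pager));
+//   while (record_reader->HasMoreData()) {
+//     record_reader->ReadRecords(/*num_records=*/1000);
+//   }
+//   auto values = record_reader->ReleaseValues();
+//   auto validity = record_reader->ReleaseIsValid();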
+
+class BinaryRecordReader : virtual public RecordReader {
+ public:
+ virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
+};
+
+/// \brief Read records directly to dictionary-encoded Arrow form (int32
+/// indices). Only valid for BYTE_ARRAY columns
+class DictionaryRecordReader : virtual public RecordReader {
+ public:
+ virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
+};
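+
+// Sketch of the dictionary path, assuming `record_reader` was created with
+// read_dictionary = true over a BYTE_ARRAY column: after reading records, the
+// accumulated result is exposed as a dictionary-encoded ChunkedArray.
+//
+//   auto* dict_reader =
+//       dynamic_cast<DictionaryRecordReader*>(record_reader.get());
+//   std::shared_ptr<::arrow::ChunkedArray> chunks = dict_reader->GetResult();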
+
+} // namespace internal
+
+using BoolReader = TypedColumnReader<BooleanType>;
+using Int32Reader = TypedColumnReader<Int32Type>;
+using Int64Reader = TypedColumnReader<Int64Type>;
+using Int96Reader = TypedColumnReader<Int96Type>;
+using FloatReader = TypedColumnReader<FloatType>;
+using DoubleReader = TypedColumnReader<DoubleType>;
+using ByteArrayReader = TypedColumnReader<ByteArrayType>;
+using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
index 9ab1663ccd7..0ef83568e3e 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
@@ -1,91 +1,91 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/column_scanner.h"
-
-#include <cstdint>
-#include <memory>
-
-#include "parquet/column_reader.h"
-
-using arrow::MemoryPool;
-
-namespace parquet {
-
-std::shared_ptr<Scanner> Scanner::Make(std::shared_ptr<ColumnReader> col_reader,
- int64_t batch_size, MemoryPool* pool) {
- switch (col_reader->type()) {
- case Type::BOOLEAN:
- return std::make_shared<BoolScanner>(std::move(col_reader), batch_size, pool);
- case Type::INT32:
- return std::make_shared<Int32Scanner>(std::move(col_reader), batch_size, pool);
- case Type::INT64:
- return std::make_shared<Int64Scanner>(std::move(col_reader), batch_size, pool);
- case Type::INT96:
- return std::make_shared<Int96Scanner>(std::move(col_reader), batch_size, pool);
- case Type::FLOAT:
- return std::make_shared<FloatScanner>(std::move(col_reader), batch_size, pool);
- case Type::DOUBLE:
- return std::make_shared<DoubleScanner>(std::move(col_reader), batch_size, pool);
- case Type::BYTE_ARRAY:
- return std::make_shared<ByteArrayScanner>(std::move(col_reader), batch_size, pool);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<FixedLenByteArrayScanner>(std::move(col_reader), batch_size,
- pool);
- default:
- ParquetException::NYI("type reader not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return std::shared_ptr<Scanner>(nullptr);
-}
-
-int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- uint8_t* values, int64_t* values_buffered,
- parquet::ColumnReader* reader) {
- switch (reader->type()) {
- case parquet::Type::BOOLEAN:
- return ScanAll<parquet::BoolReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::INT32:
- return ScanAll<parquet::Int32Reader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::INT64:
- return ScanAll<parquet::Int64Reader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::INT96:
- return ScanAll<parquet::Int96Reader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::FLOAT:
- return ScanAll<parquet::FloatReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::DOUBLE:
- return ScanAll<parquet::DoubleReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::BYTE_ARRAY:
- return ScanAll<parquet::ByteArrayReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::FIXED_LEN_BYTE_ARRAY:
- return ScanAll<parquet::FixedLenByteArrayReader>(batch_size, def_levels, rep_levels,
- values, values_buffered, reader);
- default:
- parquet::ParquetException::NYI("type reader not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return 0;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_scanner.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/column_reader.h"
+
+using arrow::MemoryPool;
+
+namespace parquet {
+
+std::shared_ptr<Scanner> Scanner::Make(std::shared_ptr<ColumnReader> col_reader,
+ int64_t batch_size, MemoryPool* pool) {
+ switch (col_reader->type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<BoolScanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT32:
+ return std::make_shared<Int32Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT64:
+ return std::make_shared<Int64Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT96:
+ return std::make_shared<Int96Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::FLOAT:
+ return std::make_shared<FloatScanner>(std::move(col_reader), batch_size, pool);
+ case Type::DOUBLE:
+ return std::make_shared<DoubleScanner>(std::move(col_reader), batch_size, pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<ByteArrayScanner>(std::move(col_reader), batch_size, pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<FixedLenByteArrayScanner>(std::move(col_reader), batch_size,
+ pool);
+ default:
+ ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<Scanner>(nullptr);
+}
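+
+// Usage sketch for the factory above, assuming `col_reader` comes from a
+// RowGroupReader: Make dispatches on the physical type, so a caller can print
+// a column without naming the concrete scanner type.
+//
+//   std::shared_ptr<Scanner> scanner = Scanner::Make(col_reader);
+//   while (scanner->HasNext()) {
+//     scanner->PrintNext(std::cout, /*width=*/17);
+//   }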
+
+int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ uint8_t* values, int64_t* values_buffered,
+ parquet::ColumnReader* reader) {
+ switch (reader->type()) {
+ case parquet::Type::BOOLEAN:
+ return ScanAll<parquet::BoolReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT32:
+ return ScanAll<parquet::Int32Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT64:
+ return ScanAll<parquet::Int64Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT96:
+ return ScanAll<parquet::Int96Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::FLOAT:
+ return ScanAll<parquet::FloatReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::DOUBLE:
+ return ScanAll<parquet::DoubleReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::BYTE_ARRAY:
+ return ScanAll<parquet::ByteArrayReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::FIXED_LEN_BYTE_ARRAY:
+ return ScanAll<parquet::FixedLenByteArrayReader>(batch_size, def_levels, rep_levels,
+ values, values_buffered, reader);
+ default:
+ parquet::ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return 0;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
index d53435f03cd..61d08841409 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
@@ -1,262 +1,262 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <stdio.h>
-
-#include <cstdint>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "parquet/column_reader.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
-
-class PARQUET_EXPORT Scanner {
- public:
- explicit Scanner(std::shared_ptr<ColumnReader> reader,
- int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
- : batch_size_(batch_size),
- level_offset_(0),
- levels_buffered_(0),
- value_buffer_(AllocateBuffer(pool)),
- value_offset_(0),
- values_buffered_(0),
- reader_(std::move(reader)) {
- def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
- rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
- }
-
- virtual ~Scanner() {}
-
- static std::shared_ptr<Scanner> Make(
- std::shared_ptr<ColumnReader> col_reader,
- int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
-
- bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
-
- const ColumnDescriptor* descr() const { return reader_->descr(); }
-
- int64_t batch_size() const { return batch_size_; }
-
- void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
-
- protected:
- int64_t batch_size_;
-
- std::vector<int16_t> def_levels_;
- std::vector<int16_t> rep_levels_;
- int level_offset_;
- int levels_buffered_;
-
- std::shared_ptr<ResizableBuffer> value_buffer_;
- int value_offset_;
- int64_t values_buffered_;
- std::shared_ptr<ColumnReader> reader_;
-};
-
-template <typename DType>
-class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
- public:
- typedef typename DType::c_type T;
-
- explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
- int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
- : Scanner(std::move(reader), batch_size, pool) {
- typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
- int value_byte_size = type_traits<DType::type_num>::value_byte_size;
- PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
- values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
- }
-
- virtual ~TypedScanner() {}
-
- bool NextLevels(int16_t* def_level, int16_t* rep_level) {
- if (level_offset_ == levels_buffered_) {
- levels_buffered_ = static_cast<int>(
- typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
- rep_levels_.data(), values_, &values_buffered_));
-
- value_offset_ = 0;
- level_offset_ = 0;
- if (!levels_buffered_) {
- return false;
- }
- }
- *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
- *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
- level_offset_++;
- return true;
- }
-
- bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
- if (level_offset_ == levels_buffered_) {
- if (!HasNext()) {
- // Out of data pages
- return false;
- }
- }
-
- NextLevels(def_level, rep_level);
- *is_null = *def_level < descr()->max_definition_level();
-
- if (*is_null) {
- return true;
- }
-
- if (value_offset_ == values_buffered_) {
- throw ParquetException("Value was non-null, but has not been buffered");
- }
- *val = values_[value_offset_++];
- return true;
- }
-
- // Returns true if there is a next value
- bool NextValue(T* val, bool* is_null) {
- if (level_offset_ == levels_buffered_) {
- if (!HasNext()) {
- // Out of data pages
- return false;
- }
- }
-
- // Out of values
- int16_t def_level = -1;
- int16_t rep_level = -1;
- NextLevels(&def_level, &rep_level);
- *is_null = def_level < descr()->max_definition_level();
-
- if (*is_null) {
- return true;
- }
-
- if (value_offset_ == values_buffered_) {
- throw ParquetException("Value was non-null, but has not been buffered");
- }
- *val = values_[value_offset_++];
- return true;
- }
-
- virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
- T val{};
- int16_t def_level = -1;
- int16_t rep_level = -1;
- bool is_null = false;
- char buffer[80];
-
- if (!Next(&val, &def_level, &rep_level, &is_null)) {
- throw ParquetException("No more values buffered");
- }
-
- if (with_levels) {
- out << " D:" << def_level << " R:" << rep_level << " ";
- if (!is_null) {
- out << "V:";
- }
- }
-
- if (is_null) {
- std::string null_fmt = format_fwf<ByteArrayType>(width);
- snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
- } else {
- FormatValue(&val, buffer, sizeof(buffer), width);
- }
- out << buffer;
- }
-
- private:
-  // The ownership of this object is expressed through the reader_ variable in
-  // the base class.
- TypedColumnReader<DType>* typed_reader_;
-
- inline void FormatValue(void* val, char* buffer, int bufsize, int width);
-
- T* values_;
-};
-
-template <typename DType>
-inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<DType>(width);
- snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
-}
-
-template <>
-inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<Int96Type>(width);
- std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
- snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
-}
-
-template <>
-inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<ByteArrayType>(width);
- std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
- snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
-}
-
-template <>
-inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<FLBAType>(width);
- std::string result = FixedLenByteArrayToString(
- *reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
- snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
-}
-
-typedef TypedScanner<BooleanType> BoolScanner;
-typedef TypedScanner<Int32Type> Int32Scanner;
-typedef TypedScanner<Int64Type> Int64Scanner;
-typedef TypedScanner<Int96Type> Int96Scanner;
-typedef TypedScanner<FloatType> FloatScanner;
-typedef TypedScanner<DoubleType> DoubleScanner;
-typedef TypedScanner<ByteArrayType> ByteArrayScanner;
-typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;
-
-template <typename RType>
-int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- uint8_t* values, int64_t* values_buffered,
- parquet::ColumnReader* reader) {
- typedef typename RType::T Type;
- auto typed_reader = static_cast<RType*>(reader);
- auto vals = reinterpret_cast<Type*>(&values[0]);
- return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
- values_buffered);
-}
-
-int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, uint8_t* values,
- int64_t* values_buffered,
- parquet::ColumnReader* reader);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdio.h>
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
+
+class PARQUET_EXPORT Scanner {
+ public:
+ explicit Scanner(std::shared_ptr<ColumnReader> reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+ : batch_size_(batch_size),
+ level_offset_(0),
+ levels_buffered_(0),
+ value_buffer_(AllocateBuffer(pool)),
+ value_offset_(0),
+ values_buffered_(0),
+ reader_(std::move(reader)) {
+ def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
+ rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
+ }
+
+ virtual ~Scanner() {}
+
+ static std::shared_ptr<Scanner> Make(
+ std::shared_ptr<ColumnReader> col_reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
+
+ bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
+
+ const ColumnDescriptor* descr() const { return reader_->descr(); }
+
+ int64_t batch_size() const { return batch_size_; }
+
+ void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
+
+ protected:
+ int64_t batch_size_;
+
+ std::vector<int16_t> def_levels_;
+ std::vector<int16_t> rep_levels_;
+ int level_offset_;
+ int levels_buffered_;
+
+ std::shared_ptr<ResizableBuffer> value_buffer_;
+ int value_offset_;
+ int64_t values_buffered_;
+ std::shared_ptr<ColumnReader> reader_;
+};
+
+template <typename DType>
+class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
+ public:
+ typedef typename DType::c_type T;
+
+ explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+ : Scanner(std::move(reader), batch_size, pool) {
+ typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
+ int value_byte_size = type_traits<DType::type_num>::value_byte_size;
+ PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
+ values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
+ }
+
+ virtual ~TypedScanner() {}
+
+ bool NextLevels(int16_t* def_level, int16_t* rep_level) {
+ if (level_offset_ == levels_buffered_) {
+ levels_buffered_ = static_cast<int>(
+ typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
+ rep_levels_.data(), values_, &values_buffered_));
+
+ value_offset_ = 0;
+ level_offset_ = 0;
+ if (!levels_buffered_) {
+ return false;
+ }
+ }
+ *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
+ *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
+ level_offset_++;
+ return true;
+ }
+
+ bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
+ if (level_offset_ == levels_buffered_) {
+ if (!HasNext()) {
+ // Out of data pages
+ return false;
+ }
+ }
+
+ NextLevels(def_level, rep_level);
+ *is_null = *def_level < descr()->max_definition_level();
+
+ if (*is_null) {
+ return true;
+ }
+
+ if (value_offset_ == values_buffered_) {
+ throw ParquetException("Value was non-null, but has not been buffered");
+ }
+ *val = values_[value_offset_++];
+ return true;
+ }
+
+ // Returns true if there is a next value
+ bool NextValue(T* val, bool* is_null) {
+ if (level_offset_ == levels_buffered_) {
+ if (!HasNext()) {
+ // Out of data pages
+ return false;
+ }
+ }
+
+ // Out of values
+ int16_t def_level = -1;
+ int16_t rep_level = -1;
+ NextLevels(&def_level, &rep_level);
+ *is_null = def_level < descr()->max_definition_level();
+
+ if (*is_null) {
+ return true;
+ }
+
+ if (value_offset_ == values_buffered_) {
+ throw ParquetException("Value was non-null, but has not been buffered");
+ }
+ *val = values_[value_offset_++];
+ return true;
+ }
+
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
+ T val{};
+ int16_t def_level = -1;
+ int16_t rep_level = -1;
+ bool is_null = false;
+ char buffer[80];
+
+ if (!Next(&val, &def_level, &rep_level, &is_null)) {
+ throw ParquetException("No more values buffered");
+ }
+
+ if (with_levels) {
+ out << " D:" << def_level << " R:" << rep_level << " ";
+ if (!is_null) {
+ out << "V:";
+ }
+ }
+
+ if (is_null) {
+ std::string null_fmt = format_fwf<ByteArrayType>(width);
+ snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
+ } else {
+ FormatValue(&val, buffer, sizeof(buffer), width);
+ }
+ out << buffer;
+ }
+
+ private:
+  // The ownership of this object is expressed through the reader_ variable in
+  // the base class.
+ TypedColumnReader<DType>* typed_reader_;
+
+ inline void FormatValue(void* val, char* buffer, int bufsize, int width);
+
+ T* values_;
+};
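+
+// Value-at-a-time sketch, assuming `scanner` is an Int32Scanner
+// (TypedScanner<Int32Type>): NextValue() returns false once the column is
+// exhausted and reports nulls through `is_null`.
+//
+//   int32_t value = 0;
+//   bool is_null = false;
+//   while (scanner.NextValue(&value, &is_null)) {
+//     if (!is_null) {
+//       // consume value
+//     }
+//   }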
+
+template <typename DType>
+inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<DType>(width);
+ snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
+}
+
+template <>
+inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<Int96Type>(width);
+ std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<ByteArrayType>(width);
+ std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<FLBAType>(width);
+ std::string result = FixedLenByteArrayToString(
+ *reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+typedef TypedScanner<BooleanType> BoolScanner;
+typedef TypedScanner<Int32Type> Int32Scanner;
+typedef TypedScanner<Int64Type> Int64Scanner;
+typedef TypedScanner<Int96Type> Int96Scanner;
+typedef TypedScanner<FloatType> FloatScanner;
+typedef TypedScanner<DoubleType> DoubleScanner;
+typedef TypedScanner<ByteArrayType> ByteArrayScanner;
+typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;
+
+template <typename RType>
+int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ uint8_t* values, int64_t* values_buffered,
+ parquet::ColumnReader* reader) {
+ typedef typename RType::T Type;
+ auto typed_reader = static_cast<RType*>(reader);
+ auto vals = reinterpret_cast<Type*>(&values[0]);
+ return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
+ values_buffered);
+}
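+
+// Sketch of a type-erased scan, assuming `reader` points at an INT64 column
+// and `batch_size` values of that type fit in the byte buffer; ScanAllValues
+// (declared below) dispatches to the matching ScanAll instantiation.
+//
+//   std::vector<int16_t> defs(batch_size), reps(batch_size);
+//   std::vector<uint8_t> values(batch_size * sizeof(int64_t));
+//   int64_t values_buffered = 0;
+//   int64_t levels_read = ScanAllValues(batch_size, defs.data(), reps.data(),
+//                                       values.data(), &values_buffered,
+//                                       reader);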
+
+int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, uint8_t* values,
+ int64_t* values_buffered,
+ parquet::ColumnReader* reader);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
index 446fe25e644..75df6f0c683 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
@@ -1,2067 +1,2067 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/column_writer.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <map>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/buffer_builder.h"
-#include "arrow/compute/api.h"
-#include "arrow/io/memory.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/compression.h"
-#include "arrow/util/endian.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/rle_encoding.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/column_page.h"
-#include "parquet/encoding.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/level_conversion.h"
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/statistics.h"
-#include "parquet/thrift_internal.h"
-#include "parquet/types.h"
-
-using arrow::Array;
-using arrow::ArrayData;
-using arrow::Datum;
-using arrow::Result;
-using arrow::Status;
-using arrow::BitUtil::BitWriter;
-using arrow::internal::checked_cast;
-using arrow::internal::checked_pointer_cast;
-using arrow::util::RleEncoder;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-
-namespace {
-
-// Visitor that extracts the value buffer from a FlatArray at a given offset.
-struct ValueBufferSlicer {
- template <typename T>
- ::arrow::enable_if_base_binary<typename T::TypeClass, Status> Visit(const T& array) {
- auto data = array.data();
- buffer_ =
- SliceBuffer(data->buffers[1], data->offset * sizeof(typename T::offset_type),
- data->length * sizeof(typename T::offset_type));
- return Status::OK();
- }
-
- template <typename T>
- ::arrow::enable_if_fixed_size_binary<typename T::TypeClass, Status> Visit(
- const T& array) {
- auto data = array.data();
- buffer_ = SliceBuffer(data->buffers[1], data->offset * array.byte_width(),
- data->length * array.byte_width());
- return Status::OK();
- }
-
- template <typename T>
- ::arrow::enable_if_t<::arrow::has_c_type<typename T::TypeClass>::value &&
- !std::is_same<BooleanType, typename T::TypeClass>::value,
- Status>
- Visit(const T& array) {
- auto data = array.data();
- buffer_ = SliceBuffer(
- data->buffers[1],
- ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->offset),
- ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->length));
- return Status::OK();
- }
-
- Status Visit(const ::arrow::BooleanArray& array) {
- auto data = array.data();
- if (BitUtil::IsMultipleOf8(data->offset)) {
- buffer_ = SliceBuffer(data->buffers[1], BitUtil::BytesForBits(data->offset),
- BitUtil::BytesForBits(data->length));
- return Status::OK();
- }
- PARQUET_ASSIGN_OR_THROW(buffer_,
- ::arrow::internal::CopyBitmap(pool_, data->buffers[1]->data(),
- data->offset, data->length));
- return Status::OK();
- }
-#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
- Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
- return Status::NotImplemented("Slicing not implemented for " #ArrowTypePrefix); \
- }
-
- NOT_IMPLEMENTED_VISIT(Null);
- NOT_IMPLEMENTED_VISIT(Union);
- NOT_IMPLEMENTED_VISIT(List);
- NOT_IMPLEMENTED_VISIT(LargeList);
- NOT_IMPLEMENTED_VISIT(Struct);
- NOT_IMPLEMENTED_VISIT(FixedSizeList);
- NOT_IMPLEMENTED_VISIT(Dictionary);
- NOT_IMPLEMENTED_VISIT(Extension);
-
-#undef NOT_IMPLEMENTED_VISIT
-
- MemoryPool* pool_;
- std::shared_ptr<Buffer> buffer_;
-};
-
-internal::LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
- internal::LevelInfo level_info;
- level_info.def_level = descr->max_definition_level();
- level_info.rep_level = descr->max_repetition_level();
-
- int16_t min_spaced_def_level = descr->max_definition_level();
- const ::parquet::schema::Node* node = descr->schema_node().get();
- while (node != nullptr && !node->is_repeated()) {
- if (node->is_optional()) {
- min_spaced_def_level--;
- }
- node = node->parent();
- }
- level_info.repeated_ancestor_def_level = min_spaced_def_level;
- return level_info;
-}
-
-template <class T>
-inline const T* AddIfNotNull(const T* base, int64_t offset) {
- if (base != nullptr) {
- return base + offset;
- }
- return nullptr;
-}
-
-} // namespace
-
-LevelEncoder::LevelEncoder() {}
-LevelEncoder::~LevelEncoder() {}
-
-void LevelEncoder::Init(Encoding::type encoding, int16_t max_level,
- int num_buffered_values, uint8_t* data, int data_size) {
- bit_width_ = BitUtil::Log2(max_level + 1);
- encoding_ = encoding;
- switch (encoding) {
- case Encoding::RLE: {
- rle_encoder_.reset(new RleEncoder(data, data_size, bit_width_));
- break;
- }
- case Encoding::BIT_PACKED: {
- int num_bytes =
- static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
- bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
- break;
- }
- default:
- throw ParquetException("Unknown encoding type for levels.");
- }
-}
-
-int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level,
- int num_buffered_values) {
- int bit_width = BitUtil::Log2(max_level + 1);
- int num_bytes = 0;
- switch (encoding) {
- case Encoding::RLE: {
- // TODO: Due to the way we currently check if the buffer is full enough,
-      // we need to have MinBufferSize as headroom.
- num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
- RleEncoder::MinBufferSize(bit_width);
- break;
- }
- case Encoding::BIT_PACKED: {
- num_bytes =
- static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width));
- break;
- }
- default:
- throw ParquetException("Unknown encoding type for levels.");
- }
- return num_bytes;
-}
-
-int LevelEncoder::Encode(int batch_size, const int16_t* levels) {
- int num_encoded = 0;
- if (!rle_encoder_ && !bit_packed_encoder_) {
- throw ParquetException("Level encoders are not initialized.");
- }
-
- if (encoding_ == Encoding::RLE) {
- for (int i = 0; i < batch_size; ++i) {
- if (!rle_encoder_->Put(*(levels + i))) {
- break;
- }
- ++num_encoded;
- }
- rle_encoder_->Flush();
- rle_length_ = rle_encoder_->len();
- } else {
- for (int i = 0; i < batch_size; ++i) {
- if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) {
- break;
- }
- ++num_encoded;
- }
- bit_packed_encoder_->Flush();
- }
- return num_encoded;
-}
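-
-// Encoding-side sketch mirroring the level decoder, assuming `num_values`
-// definition levels with max_level 1 sit in a std::vector<int16_t> `levels`:
-// MaxBufferSize bounds the scratch buffer, Init binds it, Encode fills it.
-//
-//   int capacity = LevelEncoder::MaxBufferSize(Encoding::RLE, /*max_level=*/1,
-//                                              num_values);
-//   std::vector<uint8_t> buffer(capacity);
-//   LevelEncoder encoder;
-//   encoder.Init(Encoding::RLE, /*max_level=*/1, num_values, buffer.data(),
-//                capacity);
-//   int encoded = encoder.Encode(num_values, levels.data());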
-
-// ----------------------------------------------------------------------
-// PageWriter implementation
-
-// This subclass delimits pages appearing in a serialized stream, each preceded
-// by a serialized Thrift format::PageHeader indicating the type of each page
-// and the page metadata.
-class SerializedPageWriter : public PageWriter {
- public:
- SerializedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal, int16_t column_chunk_ordinal,
- MemoryPool* pool = ::arrow::default_memory_pool(),
- std::shared_ptr<Encryptor> meta_encryptor = nullptr,
- std::shared_ptr<Encryptor> data_encryptor = nullptr)
- : sink_(std::move(sink)),
- metadata_(metadata),
- pool_(pool),
- num_values_(0),
- dictionary_page_offset_(0),
- data_page_offset_(0),
- total_uncompressed_size_(0),
- total_compressed_size_(0),
- page_ordinal_(0),
- row_group_ordinal_(row_group_ordinal),
- column_ordinal_(column_chunk_ordinal),
- meta_encryptor_(std::move(meta_encryptor)),
- data_encryptor_(std::move(data_encryptor)),
- encryption_buffer_(AllocateBuffer(pool, 0)) {
- if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) {
- InitEncryption();
- }
- compressor_ = GetCodec(codec, compression_level);
- thrift_serializer_.reset(new ThriftSerializer);
- }
-
- int64_t WriteDictionaryPage(const DictionaryPage& page) override {
- int64_t uncompressed_size = page.size();
- std::shared_ptr<Buffer> compressed_data;
- if (has_compressor()) {
- auto buffer = std::static_pointer_cast<ResizableBuffer>(
- AllocateBuffer(pool_, uncompressed_size));
- Compress(*(page.buffer().get()), buffer.get());
- compressed_data = std::static_pointer_cast<Buffer>(buffer);
- } else {
- compressed_data = page.buffer();
- }
-
- format::DictionaryPageHeader dict_page_header;
- dict_page_header.__set_num_values(page.num_values());
- dict_page_header.__set_encoding(ToThrift(page.encoding()));
- dict_page_header.__set_is_sorted(page.is_sorted());
-
- const uint8_t* output_data_buffer = compressed_data->data();
- int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
-
- if (data_encryptor_.get()) {
- UpdateEncryption(encryption::kDictionaryPage);
- PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
- data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
- output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
- encryption_buffer_->mutable_data());
- output_data_buffer = encryption_buffer_->data();
- }
-
- format::PageHeader page_header;
- page_header.__set_type(format::PageType::DICTIONARY_PAGE);
- page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
- page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
- page_header.__set_dictionary_page_header(dict_page_header);
- // TODO(PARQUET-594) crc checksum
-
- PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
- if (dictionary_page_offset_ == 0) {
- dictionary_page_offset_ = start_pos;
- }
-
- if (meta_encryptor_) {
- UpdateEncryption(encryption::kDictionaryPageHeader);
- }
- const int64_t header_size =
- thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
-
- PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
-
- total_uncompressed_size_ += uncompressed_size + header_size;
- total_compressed_size_ += output_data_len + header_size;
- ++dict_encoding_stats_[page.encoding()];
- return uncompressed_size + header_size;
- }
-
- void Close(bool has_dictionary, bool fallback) override {
- if (meta_encryptor_ != nullptr) {
- UpdateEncryption(encryption::kColumnMetaData);
- }
- // index_page_offset = -1 since index pages are not supported
- metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_,
- total_compressed_size_, total_uncompressed_size_, has_dictionary,
- fallback, dict_encoding_stats_, data_encoding_stats_,
- meta_encryptor_);
- // Write metadata at end of column chunk
- metadata_->WriteTo(sink_.get());
- }
-
- /**
- * Compress a buffer.
- */
- void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
- DCHECK(compressor_ != nullptr);
-
- // Compress the data
- int64_t max_compressed_size =
- compressor_->MaxCompressedLen(src_buffer.size(), src_buffer.data());
-
- // Pass shrink_to_fit = false: the underlying buffer only ever grows, and
- // resizing to a smaller size does not reallocate.
- PARQUET_THROW_NOT_OK(dest_buffer->Resize(max_compressed_size, false));
-
- PARQUET_ASSIGN_OR_THROW(
- int64_t compressed_size,
- compressor_->Compress(src_buffer.size(), src_buffer.data(), max_compressed_size,
- dest_buffer->mutable_data()));
- PARQUET_THROW_NOT_OK(dest_buffer->Resize(compressed_size, false));
- }
-
- int64_t WriteDataPage(const DataPage& page) override {
- const int64_t uncompressed_size = page.uncompressed_size();
- std::shared_ptr<Buffer> compressed_data = page.buffer();
- const uint8_t* output_data_buffer = compressed_data->data();
- int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
-
- if (data_encryptor_.get()) {
- PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
- data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
- UpdateEncryption(encryption::kDataPage);
- output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
- encryption_buffer_->mutable_data());
- output_data_buffer = encryption_buffer_->data();
- }
-
- format::PageHeader page_header;
- page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
- page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
- // TODO(PARQUET-594) crc checksum
-
- if (page.type() == PageType::DATA_PAGE) {
- const DataPageV1& v1_page = checked_cast<const DataPageV1&>(page);
- SetDataPageHeader(page_header, v1_page);
- } else if (page.type() == PageType::DATA_PAGE_V2) {
- const DataPageV2& v2_page = checked_cast<const DataPageV2&>(page);
- SetDataPageV2Header(page_header, v2_page);
- } else {
- throw ParquetException("Unexpected page type");
- }
-
- PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
- if (page_ordinal_ == 0) {
- data_page_offset_ = start_pos;
- }
-
- if (meta_encryptor_) {
- UpdateEncryption(encryption::kDataPageHeader);
- }
- const int64_t header_size =
- thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
- PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
-
- total_uncompressed_size_ += uncompressed_size + header_size;
- total_compressed_size_ += output_data_len + header_size;
- num_values_ += page.num_values();
- ++data_encoding_stats_[page.encoding()];
- ++page_ordinal_;
- return uncompressed_size + header_size;
- }
-
- void SetDataPageHeader(format::PageHeader& page_header, const DataPageV1& page) {
- format::DataPageHeader data_page_header;
- data_page_header.__set_num_values(page.num_values());
- data_page_header.__set_encoding(ToThrift(page.encoding()));
- data_page_header.__set_definition_level_encoding(
- ToThrift(page.definition_level_encoding()));
- data_page_header.__set_repetition_level_encoding(
- ToThrift(page.repetition_level_encoding()));
- data_page_header.__set_statistics(ToThrift(page.statistics()));
-
- page_header.__set_type(format::PageType::DATA_PAGE);
- page_header.__set_data_page_header(data_page_header);
- }
-
- void SetDataPageV2Header(format::PageHeader& page_header, const DataPageV2& page) {
- format::DataPageHeaderV2 data_page_header;
- data_page_header.__set_num_values(page.num_values());
- data_page_header.__set_num_nulls(page.num_nulls());
- data_page_header.__set_num_rows(page.num_rows());
- data_page_header.__set_encoding(ToThrift(page.encoding()));
-
- data_page_header.__set_definition_levels_byte_length(
- page.definition_levels_byte_length());
- data_page_header.__set_repetition_levels_byte_length(
- page.repetition_levels_byte_length());
-
- data_page_header.__set_is_compressed(page.is_compressed());
- data_page_header.__set_statistics(ToThrift(page.statistics()));
-
- page_header.__set_type(format::PageType::DATA_PAGE_V2);
- page_header.__set_data_page_header_v2(data_page_header);
- }
-
- bool has_compressor() override { return (compressor_ != nullptr); }
-
- int64_t num_values() { return num_values_; }
-
- int64_t dictionary_page_offset() { return dictionary_page_offset_; }
-
- int64_t data_page_offset() { return data_page_offset_; }
-
- int64_t total_compressed_size() { return total_compressed_size_; }
-
- int64_t total_uncompressed_size() { return total_uncompressed_size_; }
-
- private:
- // To allow UpdateEncryption on Close
- friend class BufferedPageWriter;
-
- void InitEncryption() {
- // Prepare the AAD for quick update later.
- if (data_encryptor_ != nullptr) {
- data_page_aad_ = encryption::CreateModuleAad(
- data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_,
- column_ordinal_, kNonPageOrdinal);
- }
- if (meta_encryptor_ != nullptr) {
- data_page_header_aad_ = encryption::CreateModuleAad(
- meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_,
- column_ordinal_, kNonPageOrdinal);
- }
- }
-
- void UpdateEncryption(int8_t module_type) {
- switch (module_type) {
- case encryption::kColumnMetaData: {
- meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
- meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
- kNonPageOrdinal));
- break;
- }
- case encryption::kDataPage: {
- encryption::QuickUpdatePageAad(data_page_aad_, page_ordinal_);
- data_encryptor_->UpdateAad(data_page_aad_);
- break;
- }
- case encryption::kDataPageHeader: {
- encryption::QuickUpdatePageAad(data_page_header_aad_, page_ordinal_);
- meta_encryptor_->UpdateAad(data_page_header_aad_);
- break;
- }
- case encryption::kDictionaryPageHeader: {
- meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
- meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
- kNonPageOrdinal));
- break;
- }
- case encryption::kDictionaryPage: {
- data_encryptor_->UpdateAad(encryption::CreateModuleAad(
- data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
- kNonPageOrdinal));
- break;
- }
- default:
- throw ParquetException("Unknown module type in UpdateEncryption");
- }
- }
-
- std::shared_ptr<ArrowOutputStream> sink_;
- ColumnChunkMetaDataBuilder* metadata_;
- MemoryPool* pool_;
- int64_t num_values_;
- int64_t dictionary_page_offset_;
- int64_t data_page_offset_;
- int64_t total_uncompressed_size_;
- int64_t total_compressed_size_;
- int16_t page_ordinal_;
- int16_t row_group_ordinal_;
- int16_t column_ordinal_;
-
- std::unique_ptr<ThriftSerializer> thrift_serializer_;
-
- // Compression codec to use.
- std::unique_ptr<::arrow::util::Codec> compressor_;
-
- std::string data_page_aad_;
- std::string data_page_header_aad_;
-
- std::shared_ptr<Encryptor> meta_encryptor_;
- std::shared_ptr<Encryptor> data_encryptor_;
-
- std::shared_ptr<ResizableBuffer> encryption_buffer_;
-
- std::map<Encoding::type, int32_t> dict_encoding_stats_;
- std::map<Encoding::type, int32_t> data_encoding_stats_;
-};
-
-// This PageWriter implementation buffers all pages in memory and writes them to
-// the final sink on Close().
-class BufferedPageWriter : public PageWriter {
- public:
- BufferedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal, int16_t current_column_ordinal,
- MemoryPool* pool = ::arrow::default_memory_pool(),
- std::shared_ptr<Encryptor> meta_encryptor = nullptr,
- std::shared_ptr<Encryptor> data_encryptor = nullptr)
- : final_sink_(std::move(sink)), metadata_(metadata), has_dictionary_pages_(false) {
- in_memory_sink_ = CreateOutputStream(pool);
- pager_ = std::unique_ptr<SerializedPageWriter>(
- new SerializedPageWriter(in_memory_sink_, codec, compression_level, metadata,
- row_group_ordinal, current_column_ordinal, pool,
- std::move(meta_encryptor), std::move(data_encryptor)));
- }
-
- int64_t WriteDictionaryPage(const DictionaryPage& page) override {
- has_dictionary_pages_ = true;
- return pager_->WriteDictionaryPage(page);
- }
-
- void Close(bool has_dictionary, bool fallback) override {
- if (pager_->meta_encryptor_ != nullptr) {
- pager_->UpdateEncryption(encryption::kColumnMetaData);
- }
- // index_page_offset = -1 since index pages are not supported
- PARQUET_ASSIGN_OR_THROW(int64_t final_position, final_sink_->Tell());
- // dictionary page offset should be 0 iff there are no dictionary pages
- auto dictionary_page_offset =
- has_dictionary_pages_ ? pager_->dictionary_page_offset() + final_position : 0;
- metadata_->Finish(pager_->num_values(), dictionary_page_offset, -1,
- pager_->data_page_offset() + final_position,
- pager_->total_compressed_size(), pager_->total_uncompressed_size(),
- has_dictionary, fallback, pager_->dict_encoding_stats_,
- pager_->data_encoding_stats_, pager_->meta_encryptor_);
-
- // Write metadata at end of column chunk
- metadata_->WriteTo(in_memory_sink_.get());
-
- // flush everything to the serialized sink
- PARQUET_ASSIGN_OR_THROW(auto buffer, in_memory_sink_->Finish());
- PARQUET_THROW_NOT_OK(final_sink_->Write(buffer));
- }
-
- int64_t WriteDataPage(const DataPage& page) override {
- return pager_->WriteDataPage(page);
- }
-
- void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
- pager_->Compress(src_buffer, dest_buffer);
- }
-
- bool has_compressor() override { return pager_->has_compressor(); }
-
- private:
- std::shared_ptr<ArrowOutputStream> final_sink_;
- ColumnChunkMetaDataBuilder* metadata_;
- std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_;
- std::unique_ptr<SerializedPageWriter> pager_;
- bool has_dictionary_pages_;
-};
-
-std::unique_ptr<PageWriter> PageWriter::Open(
- std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool,
- bool buffered_row_group, std::shared_ptr<Encryptor> meta_encryptor,
- std::shared_ptr<Encryptor> data_encryptor) {
- if (buffered_row_group) {
- return std::unique_ptr<PageWriter>(
- new BufferedPageWriter(std::move(sink), codec, compression_level, metadata,
- row_group_ordinal, column_chunk_ordinal, pool,
- std::move(meta_encryptor), std::move(data_encryptor)));
- } else {
- return std::unique_ptr<PageWriter>(
- new SerializedPageWriter(std::move(sink), codec, compression_level, metadata,
- row_group_ordinal, column_chunk_ordinal, pool,
- std::move(meta_encryptor), std::move(data_encryptor)));
- }
-}
-
-// ----------------------------------------------------------------------
-// ColumnWriter
-
-const std::shared_ptr<WriterProperties>& default_writer_properties() {
- static std::shared_ptr<WriterProperties> default_writer_properties =
- WriterProperties::Builder().build();
- return default_writer_properties;
-}
-
-class ColumnWriterImpl {
- public:
- ColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
- std::unique_ptr<PageWriter> pager, const bool use_dictionary,
- Encoding::type encoding, const WriterProperties* properties)
- : metadata_(metadata),
- descr_(metadata->descr()),
- level_info_(ComputeLevelInfo(metadata->descr())),
- pager_(std::move(pager)),
- has_dictionary_(use_dictionary),
- encoding_(encoding),
- properties_(properties),
- allocator_(properties->memory_pool()),
- num_buffered_values_(0),
- num_buffered_encoded_values_(0),
- rows_written_(0),
- total_bytes_written_(0),
- total_compressed_bytes_(0),
- closed_(false),
- fallback_(false),
- definition_levels_sink_(allocator_),
- repetition_levels_sink_(allocator_) {
- definition_levels_rle_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
- repetition_levels_rle_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
- uncompressed_data_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
-
- if (pager_->has_compressor()) {
- compressor_temp_buffer_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
- }
- }
-
- virtual ~ColumnWriterImpl() = default;
-
- int64_t Close();
-
- protected:
- virtual std::shared_ptr<Buffer> GetValuesBuffer() = 0;
-
- // Serializes Dictionary Page if enabled
- virtual void WriteDictionaryPage() = 0;
-
- // Plain-encoded statistics of the current page
- virtual EncodedStatistics GetPageStatistics() = 0;
-
- // Plain-encoded statistics of the whole chunk
- virtual EncodedStatistics GetChunkStatistics() = 0;
-
- // Merges page statistics into chunk statistics, then resets the values
- virtual void ResetPageStatistics() = 0;
-
- // Adds Data Pages to an in-memory buffer in dictionary encoding mode;
- // serializes the Data Pages directly in other encoding modes
- void AddDataPage();
-
- void BuildDataPageV1(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size, int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values);
- void BuildDataPageV2(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size, int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values);
-
- // Serializes Data Pages
- void WriteDataPage(const DataPage& page) {
- total_bytes_written_ += pager_->WriteDataPage(page);
- }
-
- // Write multiple definition levels
- void WriteDefinitionLevels(int64_t num_levels, const int16_t* levels) {
- DCHECK(!closed_);
- PARQUET_THROW_NOT_OK(
- definition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
- }
-
- // Write multiple repetition levels
- void WriteRepetitionLevels(int64_t num_levels, const int16_t* levels) {
- DCHECK(!closed_);
- PARQUET_THROW_NOT_OK(
- repetition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
- }
-
- // RLE encode the src_buffer into dest_buffer and return the encoded size
- int64_t RleEncodeLevels(const void* src_buffer, ResizableBuffer* dest_buffer,
- int16_t max_level, bool include_length_prefix = true);
-
- // Serialize the buffered Data Pages
- void FlushBufferedDataPages();
-
- ColumnChunkMetaDataBuilder* metadata_;
- const ColumnDescriptor* descr_;
- // scratch buffer if validity bits need to be recalculated.
- std::shared_ptr<ResizableBuffer> bits_buffer_;
- const internal::LevelInfo level_info_;
-
- std::unique_ptr<PageWriter> pager_;
-
- bool has_dictionary_;
- Encoding::type encoding_;
- const WriterProperties* properties_;
-
- LevelEncoder level_encoder_;
-
- MemoryPool* allocator_;
-
- // The total number of values stored in the data page. This is the maximum of
- // the number of encoded definition levels and the number of encoded values.
- // For non-repeated, required columns, this is equal to the number of encoded
- // values. For repeated or optional columns, there may be fewer data values
- // than levels, and this tells you how many encoded levels there are in that
- // case.
- int64_t num_buffered_values_;
-
- // The total number of stored values. For repeated or optional values, this
- // number may be lower than num_buffered_values_.
- int64_t num_buffered_encoded_values_;
-
- // Total number of rows written with this ColumnWriter
- int rows_written_;
-
- // Records the total number of uncompressed bytes written by the serializer
- int64_t total_bytes_written_;
-
- // Records the current number of compressed bytes in a column
- int64_t total_compressed_bytes_;
-
- // Flag to check if the Writer has been closed
- bool closed_;
-
- // Flag to infer if dictionary encoding has fallen back to PLAIN
- bool fallback_;
-
- ::arrow::BufferBuilder definition_levels_sink_;
- ::arrow::BufferBuilder repetition_levels_sink_;
-
- std::shared_ptr<ResizableBuffer> definition_levels_rle_;
- std::shared_ptr<ResizableBuffer> repetition_levels_rle_;
-
- std::shared_ptr<ResizableBuffer> uncompressed_data_;
- std::shared_ptr<ResizableBuffer> compressor_temp_buffer_;
-
- std::vector<std::unique_ptr<DataPage>> data_pages_;
-
- private:
- void InitSinks() {
- definition_levels_sink_.Rewind(0);
- repetition_levels_sink_.Rewind(0);
- }
-
- // Concatenate the encoded levels and values into one buffer
- void ConcatenateBuffers(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size,
- const std::shared_ptr<Buffer>& values, uint8_t* combined) {
- memcpy(combined, repetition_levels_rle_->data(), repetition_levels_rle_size);
- combined += repetition_levels_rle_size;
- memcpy(combined, definition_levels_rle_->data(), definition_levels_rle_size);
- combined += definition_levels_rle_size;
- memcpy(combined, values->data(), values->size());
- }
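-
- // The combined page body produced above is laid out back-to-back as
- // [repetition levels][definition levels][values]; for example, 10 bytes of
- // rep levels, 12 bytes of def levels and N value bytes occupy offsets
- // [0, 10), [10, 22) and [22, 22 + N) of the destination buffer.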
-};
-
-// return the size of the encoded buffer
-int64_t ColumnWriterImpl::RleEncodeLevels(const void* src_buffer,
- ResizableBuffer* dest_buffer, int16_t max_level,
- bool include_length_prefix) {
- // V1 DataPage includes the length of the RLE level as a prefix.
- int32_t prefix_size = include_length_prefix ? sizeof(int32_t) : 0;
-
- // TODO: This only works due to some RLE specifics
- int64_t rle_size = LevelEncoder::MaxBufferSize(Encoding::RLE, max_level,
- static_cast<int>(num_buffered_values_)) +
- prefix_size;
-
- // Pass shrink_to_fit = false: the underlying buffer only ever grows, and
- // resizing to a smaller size does not reallocate.
- PARQUET_THROW_NOT_OK(dest_buffer->Resize(rle_size, false));
-
- level_encoder_.Init(Encoding::RLE, max_level, static_cast<int>(num_buffered_values_),
- dest_buffer->mutable_data() + prefix_size,
- static_cast<int>(dest_buffer->size() - prefix_size));
- int encoded = level_encoder_.Encode(static_cast<int>(num_buffered_values_),
- reinterpret_cast<const int16_t*>(src_buffer));
- DCHECK_EQ(encoded, num_buffered_values_);
-
- if (include_length_prefix) {
- reinterpret_cast<int32_t*>(dest_buffer->mutable_data())[0] = level_encoder_.len();
- }
-
- return level_encoder_.len() + prefix_size;
-}
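-
-// Sketch of the resulting V1 level layout (V2 omits the length prefix):
-//   [int32 length][RLE / bit-packed hybrid runs]
-// For example, levels {1, 1, 1, 0} with max_level = 1 would typically encode as
-// a run of three ones followed by the lone zero, with the prefix counting only
-// the run bytes, not itself.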
-
-void ColumnWriterImpl::AddDataPage() {
- int64_t definition_levels_rle_size = 0;
- int64_t repetition_levels_rle_size = 0;
-
- std::shared_ptr<Buffer> values = GetValuesBuffer();
- bool is_v1_data_page = properties_->data_page_version() == ParquetDataPageVersion::V1;
-
- if (descr_->max_definition_level() > 0) {
- definition_levels_rle_size = RleEncodeLevels(
- definition_levels_sink_.data(), definition_levels_rle_.get(),
- descr_->max_definition_level(), /*include_length_prefix=*/is_v1_data_page);
- }
-
- if (descr_->max_repetition_level() > 0) {
- repetition_levels_rle_size = RleEncodeLevels(
- repetition_levels_sink_.data(), repetition_levels_rle_.get(),
- descr_->max_repetition_level(), /*include_length_prefix=*/is_v1_data_page);
- }
-
- int64_t uncompressed_size =
- definition_levels_rle_size + repetition_levels_rle_size + values->size();
-
- if (is_v1_data_page) {
- BuildDataPageV1(definition_levels_rle_size, repetition_levels_rle_size,
- uncompressed_size, values);
- } else {
- BuildDataPageV2(definition_levels_rle_size, repetition_levels_rle_size,
- uncompressed_size, values);
- }
-
- // Re-initialize the sinks for next Page.
- InitSinks();
- num_buffered_values_ = 0;
- num_buffered_encoded_values_ = 0;
-}
-
-void ColumnWriterImpl::BuildDataPageV1(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size,
- int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values) {
- // Pass shrink_to_fit = false: the underlying buffer only ever grows, and
- // resizing to a smaller size does not reallocate.
- PARQUET_THROW_NOT_OK(uncompressed_data_->Resize(uncompressed_size, false));
- ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size, values,
- uncompressed_data_->mutable_data());
-
- EncodedStatistics page_stats = GetPageStatistics();
- page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
- page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
- ResetPageStatistics();
-
- std::shared_ptr<Buffer> compressed_data;
- if (pager_->has_compressor()) {
- pager_->Compress(*(uncompressed_data_.get()), compressor_temp_buffer_.get());
- compressed_data = compressor_temp_buffer_;
- } else {
- compressed_data = uncompressed_data_;
- }
-
- // Write the page to OutputStream eagerly if there is no dictionary or
- // if dictionary encoding has fallen back to PLAIN
- if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
- PARQUET_ASSIGN_OR_THROW(
- auto compressed_data_copy,
- compressed_data->CopySlice(0, compressed_data->size(), allocator_));
- std::unique_ptr<DataPage> page_ptr(new DataPageV1(
- compressed_data_copy, static_cast<int32_t>(num_buffered_values_), encoding_,
- Encoding::RLE, Encoding::RLE, uncompressed_size, page_stats));
- total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
-
- data_pages_.push_back(std::move(page_ptr));
- } else { // Eagerly write pages
- DataPageV1 page(compressed_data, static_cast<int32_t>(num_buffered_values_),
- encoding_, Encoding::RLE, Encoding::RLE, uncompressed_size,
- page_stats);
- WriteDataPage(page);
- }
-}
-
-void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size,
- int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values) {
- // Compress the values if needed. Repetition and definition levels are uncompressed in
- // V2.
- std::shared_ptr<Buffer> compressed_values;
- if (pager_->has_compressor()) {
- pager_->Compress(*values, compressor_temp_buffer_.get());
- compressed_values = compressor_temp_buffer_;
- } else {
- compressed_values = values;
- }
-
- // Concatenate uncompressed levels and the possibly compressed values
- int64_t combined_size =
- definition_levels_rle_size + repetition_levels_rle_size + compressed_values->size();
- std::shared_ptr<ResizableBuffer> combined = AllocateBuffer(allocator_, combined_size);
-
- ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size,
- compressed_values, combined->mutable_data());
-
- EncodedStatistics page_stats = GetPageStatistics();
- page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
- page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
- ResetPageStatistics();
-
- int32_t num_values = static_cast<int32_t>(num_buffered_values_);
- int32_t null_count = static_cast<int32_t>(page_stats.null_count);
- int32_t def_levels_byte_length = static_cast<int32_t>(definition_levels_rle_size);
- int32_t rep_levels_byte_length = static_cast<int32_t>(repetition_levels_rle_size);
-
- // Write the page to OutputStream eagerly if there is no dictionary or
- // if dictionary encoding has fallen back to PLAIN
- if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
- PARQUET_ASSIGN_OR_THROW(auto data_copy,
- combined->CopySlice(0, combined->size(), allocator_));
- // Use the copy, mirroring the V1 path, so the buffered page owns its own data.
- std::unique_ptr<DataPage> page_ptr(new DataPageV2(
- data_copy, num_values, null_count, num_values, encoding_, def_levels_byte_length,
- rep_levels_byte_length, uncompressed_size, pager_->has_compressor()));
- total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
- data_pages_.push_back(std::move(page_ptr));
- } else {
- DataPageV2 page(combined, num_values, null_count, num_values, encoding_,
- def_levels_byte_length, rep_levels_byte_length, uncompressed_size,
- pager_->has_compressor());
- WriteDataPage(page);
- }
-}
-
-int64_t ColumnWriterImpl::Close() {
- if (!closed_) {
- closed_ = true;
- if (has_dictionary_ && !fallback_) {
- WriteDictionaryPage();
- }
-
- FlushBufferedDataPages();
-
- EncodedStatistics chunk_statistics = GetChunkStatistics();
- chunk_statistics.ApplyStatSizeLimits(
- properties_->max_statistics_size(descr_->path()));
- chunk_statistics.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
-
- // Write stats only if the column has at least one row written
- if (rows_written_ > 0 && chunk_statistics.is_set()) {
- metadata_->SetStatistics(chunk_statistics);
- }
- pager_->Close(has_dictionary_, fallback_);
- }
-
- return total_bytes_written_;
-}
-
-void ColumnWriterImpl::FlushBufferedDataPages() {
- // Write all outstanding data to a new page
- if (num_buffered_values_ > 0) {
- AddDataPage();
- }
- for (const auto& page_ptr : data_pages_) {
- WriteDataPage(*page_ptr);
- }
- data_pages_.clear();
- total_compressed_bytes_ = 0;
-}
-
-// ----------------------------------------------------------------------
-// TypedColumnWriter
-
-template <typename Action>
-inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
- int64_t num_batches = total / batch_size;
- for (int64_t round = 0; round < num_batches; round++) {
- action(round * batch_size, batch_size);
- }
- // Write the remaining values
- if (total % batch_size > 0) {
- action(num_batches * batch_size, total % batch_size);
- }
-}
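-
-// For instance, DoInBatches(10, /*batch_size=*/4, action) invokes action(0, 4),
-// then action(4, 4), and finally action(8, 2) for the remainder.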
-
-bool DictionaryDirectWriteSupported(const ::arrow::Array& array) {
- DCHECK_EQ(array.type_id(), ::arrow::Type::DICTIONARY);
- const ::arrow::DictionaryType& dict_type =
- static_cast<const ::arrow::DictionaryType&>(*array.type());
- return ::arrow::is_base_binary_like(dict_type.value_type()->id());
-}
-
-Status ConvertDictionaryToDense(const ::arrow::Array& array, MemoryPool* pool,
- std::shared_ptr<::arrow::Array>* out) {
- const ::arrow::DictionaryType& dict_type =
- static_cast<const ::arrow::DictionaryType&>(*array.type());
-
- ::arrow::compute::ExecContext ctx(pool);
- ARROW_ASSIGN_OR_RAISE(Datum cast_output,
- ::arrow::compute::Cast(array.data(), dict_type.value_type(),
- ::arrow::compute::CastOptions(), &ctx));
- *out = cast_output.make_array();
- return Status::OK();
-}
-
-static inline bool IsDictionaryEncoding(Encoding::type encoding) {
- return encoding == Encoding::PLAIN_DICTIONARY;
-}
-
-template <typename DType>
-class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<DType> {
- public:
- using T = typename DType::c_type;
-
- TypedColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
- std::unique_ptr<PageWriter> pager, const bool use_dictionary,
- Encoding::type encoding, const WriterProperties* properties)
- : ColumnWriterImpl(metadata, std::move(pager), use_dictionary, encoding,
- properties) {
- current_encoder_ = MakeEncoder(DType::type_num, encoding, use_dictionary, descr_,
- properties->memory_pool());
-
- if (properties->statistics_enabled(descr_->path()) &&
- (SortOrder::UNKNOWN != descr_->sort_order())) {
- page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
- chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
- }
- }
-
- int64_t Close() override { return ColumnWriterImpl::Close(); }
-
- int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const T* values) override {
- // We check for DataPage limits only after we have inserted the values. If a user
- // writes a large number of values, the DataPage size can end up well above the
- // limit. The purpose of this chunking is to bound that overshoot: it ensures
- // AddDataPage() is called often enough to keep pages close to the configured
- // page size limit.
- int64_t value_offset = 0;
-
- auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t values_to_write = WriteLevels(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
-
- // PARQUET-780
- if (values_to_write > 0) {
- DCHECK_NE(nullptr, values);
- }
- WriteValues(AddIfNotNull(values, value_offset), values_to_write,
- batch_size - values_to_write);
- CommitWriteAndCheckPageLimit(batch_size, values_to_write);
- value_offset += values_to_write;
-
- // Dictionary size checked separately from data page size since we
- // circumvent this check when writing ::arrow::DictionaryArray directly
- CheckDictionarySizeLimit();
- };
- DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
- return value_offset;
- }
-
- void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const uint8_t* valid_bits,
- int64_t valid_bits_offset, const T* values) override {
- // Like WriteBatch, but for spaced values
- int64_t value_offset = 0;
- auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t batch_num_values = 0;
- int64_t batch_num_spaced_values = 0;
- int64_t null_count;
- MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
- &batch_num_values, &batch_num_spaced_values,
- &null_count);
-
- WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
- if (bits_buffer_ != nullptr) {
- WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
- batch_num_spaced_values, bits_buffer_->data(), /*offset=*/0);
- } else {
- WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
- batch_num_spaced_values, valid_bits,
- valid_bits_offset + value_offset);
- }
- CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values);
- value_offset += batch_num_spaced_values;
-
- // Dictionary size checked separately from data page size since we
- // circumvent this check when writing ::arrow::DictionaryArray directly
- CheckDictionarySizeLimit();
- };
- DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
- }
-
- Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& leaf_array,
- ArrowWriteContext* ctx, bool leaf_field_nullable) override {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- // Leaf nulls are canonical when there is only a single null element after a list
- // and it is at the leaf.
- bool single_nullable_element =
- (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
- leaf_field_nullable;
- bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
- if (maybe_parent_nulls) {
- ARROW_ASSIGN_OR_RAISE(
- bits_buffer_,
- ::arrow::AllocateResizableBuffer(
- BitUtil::BytesForBits(properties_->write_batch_size()), ctx->memory_pool));
- bits_buffer_->ZeroPadding();
- }
-
- if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) {
- return WriteArrowDictionary(def_levels, rep_levels, num_levels, leaf_array, ctx,
- maybe_parent_nulls);
- } else {
- return WriteArrowDense(def_levels, rep_levels, num_levels, leaf_array, ctx,
- maybe_parent_nulls);
- }
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- int64_t EstimatedBufferedValueBytes() const override {
- return current_encoder_->EstimatedDataEncodedSize();
- }
-
- protected:
- std::shared_ptr<Buffer> GetValuesBuffer() override {
- return current_encoder_->FlushValues();
- }
-
- // Internal function to handle direct writing of ::arrow::DictionaryArray,
- // since the standard logic concerning dictionary size limits and fallback to
- // plain encoding is circumvented
- Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& array,
- ArrowWriteContext* context, bool maybe_parent_nulls);
-
- Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& array,
- ArrowWriteContext* context, bool maybe_parent_nulls);
-
- void WriteDictionaryPage() override {
- // We have to dynamic_cast here because of TypedEncoder<Type>, as
- // some compilers don't want to cast through virtual inheritance
- auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
- DCHECK(dict_encoder);
- std::shared_ptr<ResizableBuffer> buffer =
- AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size());
- dict_encoder->WriteDict(buffer->mutable_data());
-
- DictionaryPage page(buffer, dict_encoder->num_entries(),
- properties_->dictionary_page_encoding());
- total_bytes_written_ += pager_->WriteDictionaryPage(page);
- }
-
- EncodedStatistics GetPageStatistics() override {
- EncodedStatistics result;
- if (page_statistics_) result = page_statistics_->Encode();
- return result;
- }
-
- EncodedStatistics GetChunkStatistics() override {
- EncodedStatistics result;
- if (chunk_statistics_) result = chunk_statistics_->Encode();
- return result;
- }
-
- void ResetPageStatistics() override {
- if (chunk_statistics_ != nullptr) {
- chunk_statistics_->Merge(*page_statistics_);
- page_statistics_->Reset();
- }
- }
-
- Type::type type() const override { return descr_->physical_type(); }
-
- const ColumnDescriptor* descr() const override { return descr_; }
-
- int64_t rows_written() const override { return rows_written_; }
-
- int64_t total_compressed_bytes() const override { return total_compressed_bytes_; }
-
- int64_t total_bytes_written() const override { return total_bytes_written_; }
-
- const WriterProperties* properties() override { return properties_; }
-
- private:
- using ValueEncoderType = typename EncodingTraits<DType>::Encoder;
- using TypedStats = TypedStatistics<DType>;
- std::unique_ptr<Encoder> current_encoder_;
- std::shared_ptr<TypedStats> page_statistics_;
- std::shared_ptr<TypedStats> chunk_statistics_;
-
- // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep the
- // dictionary passed to DictEncoder<T>::PutDictionary so we can check
- // subsequent array chunks to see whether materialization is required (in
- // which case we fall back to the dense write path)
- std::shared_ptr<::arrow::Array> preserved_dictionary_;
-
- int64_t WriteLevels(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels) {
- int64_t values_to_write = 0;
- // If the field is required and non-repeated, there are no definition levels
- if (descr_->max_definition_level() > 0) {
- for (int64_t i = 0; i < num_values; ++i) {
- if (def_levels[i] == descr_->max_definition_level()) {
- ++values_to_write;
- }
- }
-
- WriteDefinitionLevels(num_values, def_levels);
- } else {
- // Required field, write all values
- values_to_write = num_values;
- }
-
- // Not present for non-repeated fields
- if (descr_->max_repetition_level() > 0) {
- // A row could include more than one value
- // Count the occasions where we start a new row
- for (int64_t i = 0; i < num_values; ++i) {
- if (rep_levels[i] == 0) {
- rows_written_++;
- }
- }
-
- WriteRepetitionLevels(num_values, rep_levels);
- } else {
- // Each value is exactly one row
- rows_written_ += static_cast<int>(num_values);
- }
- return values_to_write;
- }
-
- // This method will always update the three output parameters,
- // out_values_to_write, out_spaced_values_to_write and null_count. Additionally
- // it will update the validity bitmap if required (i.e. if at least one level
- // of nullable structs directly precedes the leaf node).
- void MaybeCalculateValidityBits(const int16_t* def_levels, int64_t batch_size,
- int64_t* out_values_to_write,
- int64_t* out_spaced_values_to_write,
- int64_t* null_count) {
- if (bits_buffer_ == nullptr) {
- if (level_info_.def_level == 0) {
- // In this case def levels should be null and we only
- // need to output counts which will always be equal to
- // the batch size passed in (max def_level == 0 indicates
- // there cannot be repeated or null fields).
- DCHECK_EQ(def_levels, nullptr);
- *out_values_to_write = batch_size;
- *out_spaced_values_to_write = batch_size;
- *null_count = 0;
- } else {
- for (int x = 0; x < batch_size; x++) {
- *out_values_to_write += def_levels[x] == level_info_.def_level ? 1 : 0;
- *out_spaced_values_to_write +=
- def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
- }
- *null_count = *out_spaced_values_to_write - *out_values_to_write;
- }
- return;
- }
- // Shrinking to fit could cause another allocation, and would only be necessary
- // on the last batch.
- int64_t new_bitmap_size = BitUtil::BytesForBits(batch_size);
- if (new_bitmap_size != bits_buffer_->size()) {
- PARQUET_THROW_NOT_OK(
- bits_buffer_->Resize(new_bitmap_size, /*shrink_to_fit=*/false));
- bits_buffer_->ZeroPadding();
- }
- internal::ValidityBitmapInputOutput io;
- io.valid_bits = bits_buffer_->mutable_data();
- io.values_read_upper_bound = batch_size;
- internal::DefLevelsToBitmap(def_levels, batch_size, level_info_, &io);
- *out_values_to_write = io.values_read - io.null_count;
- *out_spaced_values_to_write = io.values_read;
- *null_count = io.null_count;
- }
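-
- // Worked example for the counting branch above: with def_level = 3,
- // repeated_ancestor_def_level = 2 and def_levels = {3, 2, 0}, exactly one
- // level equals the max (a real value) and two levels are >= 2 (leaf slots),
- // so *out_values_to_write = 1, *out_spaced_values_to_write = 2 and
- // *null_count = 1.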
-
- Result<std::shared_ptr<Array>> MaybeReplaceValidity(std::shared_ptr<Array> array,
- int64_t new_null_count,
- ::arrow::MemoryPool* memory_pool) {
- if (bits_buffer_ == nullptr) {
- return array;
- }
- std::vector<std::shared_ptr<Buffer>> buffers = array->data()->buffers;
- if (buffers.empty()) {
- return array;
- }
- buffers[0] = bits_buffer_;
- // Should be a leaf array.
- DCHECK_GT(buffers.size(), 1);
- ValueBufferSlicer slicer{memory_pool, /*buffer=*/nullptr};
- if (array->data()->offset > 0) {
- RETURN_NOT_OK(::arrow::VisitArrayInline(*array, &slicer));
- buffers[1] = slicer.buffer_;
- }
- return ::arrow::MakeArray(std::make_shared<ArrayData>(
- array->type(), array->length(), std::move(buffers), new_null_count));
- }
-
- void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels,
- const int16_t* rep_levels) {
- // If the field is required and non-repeated, there are no definition levels
- if (descr_->max_definition_level() > 0) {
- WriteDefinitionLevels(num_levels, def_levels);
- }
- // Not present for non-repeated fields
- if (descr_->max_repetition_level() > 0) {
- // A row could include more than one value
- // Count the occasions where we start a new row
- for (int64_t i = 0; i < num_levels; ++i) {
- if (rep_levels[i] == 0) {
- rows_written_++;
- }
- }
- WriteRepetitionLevels(num_levels, rep_levels);
- } else {
- // Each value is exactly one row
- rows_written_ += static_cast<int>(num_levels);
- }
- }
-
- void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) {
- num_buffered_values_ += num_levels;
- num_buffered_encoded_values_ += num_values;
-
- if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
- AddDataPage();
- }
- }
-
- void FallbackToPlainEncoding() {
- if (IsDictionaryEncoding(current_encoder_->encoding())) {
- WriteDictionaryPage();
- // Serialize the buffered Dictionary Indices
- FlushBufferedDataPages();
- fallback_ = true;
- // Only PLAIN encoding is supported for fallback in V1
- current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_,
- properties_->memory_pool());
- encoding_ = Encoding::PLAIN;
- }
- }
-
- // Checks if the Dictionary Page size limit is reached
- // If the limit is reached, the Dictionary and Data Pages are serialized
- // The encoding is switched to PLAIN
- //
- // Only one Dictionary Page is written.
- // Fallback to PLAIN if dictionary page limit is reached.
- void CheckDictionarySizeLimit() {
- if (!has_dictionary_ || fallback_) {
- // Either not using dictionary encoding, or we have already fallen back
- // to PLAIN encoding because the size threshold was reached
- return;
- }
-
- // We have to dynamic_cast here because of TypedEncoder<Type>, as some compilers
- // don't want to cast through virtual inheritance
- auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
- if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) {
- FallbackToPlainEncoding();
- }
- }
-
- void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) {
- dynamic_cast<ValueEncoderType*>(current_encoder_.get())
- ->Put(values, static_cast<int>(num_values));
- if (page_statistics_ != nullptr) {
- page_statistics_->Update(values, num_values, num_nulls);
- }
- }
-
- void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values,
- const uint8_t* valid_bits, int64_t valid_bits_offset) {
- if (num_values != num_spaced_values) {
- dynamic_cast<ValueEncoderType*>(current_encoder_.get())
- ->PutSpaced(values, static_cast<int>(num_spaced_values), valid_bits,
- valid_bits_offset);
- } else {
- dynamic_cast<ValueEncoderType*>(current_encoder_.get())
- ->Put(values, static_cast<int>(num_values));
- }
- if (page_statistics_ != nullptr) {
- const int64_t num_nulls = num_spaced_values - num_values;
- page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values,
- num_nulls);
- }
- }
-};
-
-template <typename DType>
-Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- // If this is the first time writing a DictionaryArray, then there are
- // a few possible paths to take:
- //
- // - If dictionary encoding is not enabled, convert to densely
- // encoded and call WriteArrow
- // - Dictionary encoding enabled
- // - If this is the first time this is called, then we call
- // PutDictionary into the encoder and then PutIndices on each
- // chunk. We store the dictionary that was written in
- // preserved_dictionary_ so that subsequent calls to this method
- // can make sure the dictionary has not changed
- // - On subsequent calls, we have to check whether the dictionary
- // has changed. If it has, then we trigger the varying
- // dictionary path and materialize each chunk and then call
- // WriteArrow with that
- auto WriteDense = [&] {
- std::shared_ptr<::arrow::Array> dense_array;
- RETURN_NOT_OK(
- ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array));
- return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx,
- maybe_parent_nulls);
- };
-
- if (!IsDictionaryEncoding(current_encoder_->encoding()) ||
- !DictionaryDirectWriteSupported(array)) {
- // No longer dictionary-encoding for whatever reason, maybe we never were
- // or we decided to stop. Note that WriteArrow can be invoked multiple
- // times with both dense and dictionary-encoded versions of the same data
- // without a problem. Any dense data will be hashed to indices until the
- // dictionary page limit is reached, at which point everything (dictionary and
- // dense) will fall back to plain encoding.
- return WriteDense();
- }
-
- auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
- const auto& data = checked_cast<const ::arrow::DictionaryArray&>(array);
- std::shared_ptr<::arrow::Array> dictionary = data.dictionary();
- std::shared_ptr<::arrow::Array> indices = data.indices();
-
- int64_t value_offset = 0;
- auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t batch_num_values = 0;
- int64_t batch_num_spaced_values = 0;
- int64_t null_count = ::arrow::kUnknownNullCount;
- // The bits buffer is non-null when values may be nullable. At this point in the
- // code we can't determine whether the leaf array has the same null values as any
- // of its parents, so we need to recompute the bitmap from the def levels.
- MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
- &batch_num_values, &batch_num_spaced_values, &null_count);
- WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
- std::shared_ptr<Array> writeable_indices =
- indices->Slice(value_offset, batch_num_spaced_values);
- PARQUET_ASSIGN_OR_THROW(
- writeable_indices,
- MaybeReplaceValidity(writeable_indices, null_count, ctx->memory_pool));
- dict_encoder->PutIndices(*writeable_indices);
- CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
- value_offset += batch_num_spaced_values;
- };
-
- // Handle seeing dictionary for the first time
- if (!preserved_dictionary_) {
- // It's a new dictionary. Call PutDictionary and keep track of it
- PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary));
-
- // If there were duplicate values in the dictionary, the encoder's memo table
- // will be out of sync with the indices in the Arrow array.
- // The easiest solution for this uncommon case is to fall back to plain encoding.
- if (dict_encoder->num_entries() != dictionary->length()) {
- PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
- return WriteDense();
- }
-
- // TODO(wesm): If some dictionary values are unobserved, then the
- // statistics will be inaccurate. Do we care enough to fix it?
- if (page_statistics_ != nullptr) {
- PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary));
- }
- preserved_dictionary_ = dictionary;
- } else if (!dictionary->Equals(*preserved_dictionary_)) {
- // Dictionary has changed
- PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
- return WriteDense();
- }
-
- PARQUET_CATCH_NOT_OK(
- DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk));
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Direct Arrow write path
-
-template <typename ParquetType, typename ArrowType, typename Enable = void>
-struct SerializeFunctor {
- using ArrowCType = typename ArrowType::c_type;
- using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
- using ParquetCType = typename ParquetType::c_type;
- Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) {
- const ArrowCType* input = array.raw_values();
- if (array.null_count() > 0) {
- for (int i = 0; i < array.length(); i++) {
- out[i] = static_cast<ParquetCType>(input[i]);
- }
- } else {
- std::copy(input, input + array.length(), out);
- }
- return Status::OK();
- }
-};
-
-template <typename ParquetType, typename ArrowType>
-Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels,
- const int16_t* def_levels, const int16_t* rep_levels,
- ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
- bool maybe_parent_nulls) {
- using ParquetCType = typename ParquetType::c_type;
- using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
-
- ParquetCType* buffer = nullptr;
- PARQUET_THROW_NOT_OK(ctx->GetScratchData<ParquetCType>(array.length(), &buffer));
-
- SerializeFunctor<ParquetType, ArrowType> functor;
- RETURN_NOT_OK(functor.Serialize(checked_cast<const ArrayType&>(array), ctx, buffer));
- bool no_nulls =
- writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
- if (!maybe_parent_nulls && no_nulls) {
- PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, buffer));
- } else {
- PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
- array.null_bitmap_data(),
- array.offset(), buffer));
- }
- return Status::OK();
-}
-
-template <typename ParquetType>
-Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels,
- const int16_t* def_levels, const int16_t* rep_levels,
- ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
- bool maybe_parent_nulls) {
- using T = typename ParquetType::c_type;
- const auto& data = static_cast<const ::arrow::PrimitiveArray&>(array);
- const T* values = nullptr;
- // The values buffer may be null if the array is empty (ARROW-2744)
- if (data.values() != nullptr) {
- values = reinterpret_cast<const T*>(data.values()->data()) + data.offset();
- } else {
- DCHECK_EQ(data.length(), 0);
- }
- bool no_nulls =
- writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
-
- if (!maybe_parent_nulls && no_nulls) {
- PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, values));
- } else {
- PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
- data.null_bitmap_data(), data.offset(),
- values));
- }
- return Status::OK();
-}
-
-#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \
- case ::arrow::Type::ArrowEnum: \
- return WriteArrowSerialize<ParquetType, ::arrow::ArrowType>( \
- array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
-
-#define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \
- case ::arrow::Type::ArrowEnum: \
- return WriteArrowZeroCopy<ParquetType>(array, num_levels, def_levels, rep_levels, \
- ctx, this, maybe_parent_nulls);
-
-#define ARROW_UNSUPPORTED() \
- std::stringstream ss; \
- ss << "Arrow type " << array.type()->ToString() \
- << " cannot be written to Parquet type " << descr_->ToString(); \
- return Status::Invalid(ss.str());
-
-// ----------------------------------------------------------------------
-// Write Arrow to BooleanType
-
-template <>
-struct SerializeFunctor<BooleanType, ::arrow::BooleanType> {
- Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*, bool* out) {
- for (int i = 0; i < data.length(); i++) {
- *out++ = data.Value(i);
- }
- return Status::OK();
- }
-};
-
-template <>
-Status TypedColumnWriterImpl<BooleanType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::BOOL) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowSerialize<BooleanType, ::arrow::BooleanType>(
- array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow types to INT32
-
-template <>
-struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
- Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) {
- const int64_t* input = array.raw_values();
- for (int i = 0; i < array.length(); i++) {
- *out++ = static_cast<int32_t>(*input++ / 86400000);
- }
- return Status::OK();
- }
-};
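-
-// Example of the conversion above: a Date64 value of 172800000 (milliseconds
-// since the UNIX epoch) divided by 86400000 ms/day yields the Date32 value 2,
-// i.e. 1970-01-03.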
-
-template <>
-struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
- Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) {
- const int32_t* input = array.raw_values();
- const auto& type = static_cast<const ::arrow::Time32Type&>(*array.type());
- if (type.unit() == ::arrow::TimeUnit::SECOND) {
- for (int i = 0; i < array.length(); i++) {
- out[i] = input[i] * 1000;
- }
- } else {
- std::copy(input, input + array.length(), out);
- }
- return Status::OK();
- }
-};
-
-template <>
-Status TypedColumnWriterImpl<Int32Type>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- switch (array.type()->id()) {
- case ::arrow::Type::NA: {
- PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr));
- } break;
- WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type)
- WRITE_SERIALIZE_CASE(UINT8, UInt8Type, Int32Type)
- WRITE_SERIALIZE_CASE(INT16, Int16Type, Int32Type)
- WRITE_SERIALIZE_CASE(UINT16, UInt16Type, Int32Type)
- WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int32Type)
- WRITE_ZERO_COPY_CASE(INT32, Int32Type, Int32Type)
- WRITE_ZERO_COPY_CASE(DATE32, Date32Type, Int32Type)
- WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type)
- WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type)
- default:
- ARROW_UNSUPPORTED();
- }
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow to Int64 and Int96
-
-#define INT96_CONVERT_LOOP(ConversionFunction) \
- for (int64_t i = 0; i < array.length(); i++) ConversionFunction(input[i], &out[i]);
-
-template <>
-struct SerializeFunctor<Int96Type, ::arrow::TimestampType> {
- Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) {
- const int64_t* input = array.raw_values();
- const auto& type = static_cast<const ::arrow::TimestampType&>(*array.type());
- switch (type.unit()) {
- case ::arrow::TimeUnit::NANO:
- INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp);
- break;
- case ::arrow::TimeUnit::MICRO:
- INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp);
- break;
- case ::arrow::TimeUnit::MILLI:
- INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp);
- break;
- case ::arrow::TimeUnit::SECOND:
- INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp);
- break;
- }
- return Status::OK();
- }
-};
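-
-// The Impala-style Int96 produced above packs the nanoseconds elapsed within
-// the day into its first 8 bytes and the Julian day number into its last
-// 4 bytes; the four helpers differ only in the factor used to scale the source
-// unit to nanoseconds.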
-
-#define COERCE_DIVIDE -1
-#define COERCE_INVALID 0
-#define COERCE_MULTIPLY +1
-
-static std::pair<int, int64_t> kTimestampCoercionFactors[4][4] = {
- // from seconds ...
- {{COERCE_INVALID, 0}, // ... to seconds
- {COERCE_MULTIPLY, 1000}, // ... to millis
- {COERCE_MULTIPLY, 1000000}, // ... to micros
- {COERCE_MULTIPLY, INT64_C(1000000000)}}, // ... to nanos
- // from millis ...
- {{COERCE_INVALID, 0},
- {COERCE_MULTIPLY, 1},
- {COERCE_MULTIPLY, 1000},
- {COERCE_MULTIPLY, 1000000}},
- // from micros ...
- {{COERCE_INVALID, 0},
- {COERCE_DIVIDE, 1000},
- {COERCE_MULTIPLY, 1},
- {COERCE_MULTIPLY, 1000}},
- // from nanos ...
- {{COERCE_INVALID, 0},
- {COERCE_DIVIDE, 1000000},
- {COERCE_DIVIDE, 1000},
- {COERCE_MULTIPLY, 1}}};
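-
-// The table is indexed as [source unit][target unit]. For example, coercing
-// seconds to micros multiplies by 1000000, while coercing nanos to millis
-// divides by 1000000 (and may therefore truncate, which is validated in
-// DivideBy below).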
-
-template <>
-struct SerializeFunctor<Int64Type, ::arrow::TimestampType> {
- Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx,
- int64_t* out) {
- const auto& source_type = static_cast<const ::arrow::TimestampType&>(*array.type());
- auto source_unit = source_type.unit();
- const int64_t* values = array.raw_values();
-
- ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit();
- auto target_type = ::arrow::timestamp(target_unit);
- bool truncation_allowed = ctx->properties->truncated_timestamps_allowed();
-
- auto DivideBy = [&](const int64_t factor) {
- for (int64_t i = 0; i < array.length(); i++) {
- if (!truncation_allowed && array.IsValid(i) && (values[i] % factor != 0)) {
- return Status::Invalid("Casting from ", source_type.ToString(), " to ",
- target_type->ToString(),
- " would lose data: ", values[i]);
- }
- out[i] = values[i] / factor;
- }
- return Status::OK();
- };
-
- auto MultiplyBy = [&](const int64_t factor) {
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = values[i] * factor;
- }
- return Status::OK();
- };
-
- const auto& coercion = kTimestampCoercionFactors[static_cast<int>(source_unit)]
- [static_cast<int>(target_unit)];
-
- // .first -> coercion operation; .second -> scale factor
- DCHECK_NE(coercion.first, COERCE_INVALID);
- return coercion.first == COERCE_DIVIDE ? DivideBy(coercion.second)
- : MultiplyBy(coercion.second);
- }
-};
-
-#undef COERCE_DIVIDE
-#undef COERCE_INVALID
-#undef COERCE_MULTIPLY
-
-Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels,
- const int16_t* def_levels, const int16_t* rep_levels,
- ArrowWriteContext* ctx, TypedColumnWriter<Int64Type>* writer,
- bool maybe_parent_nulls) {
- const auto& source_type = static_cast<const ::arrow::TimestampType&>(*values.type());
-
- auto WriteCoerce = [&](const ArrowWriterProperties* properties) {
- ArrowWriteContext temp_ctx = *ctx;
- temp_ctx.properties = properties;
- return WriteArrowSerialize<Int64Type, ::arrow::TimestampType>(
- values, num_levels, def_levels, rep_levels, &temp_ctx, writer,
- maybe_parent_nulls);
- };
-
- if (ctx->properties->coerce_timestamps_enabled()) {
- // User explicitly requested coercion to specific unit
- if (source_type.unit() == ctx->properties->coerce_timestamps_unit()) {
- // No data conversion necessary
- return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels,
- ctx, writer, maybe_parent_nulls);
- } else {
- return WriteCoerce(ctx->properties);
- }
- } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 &&
- source_type.unit() == ::arrow::TimeUnit::NANO) {
- // Absent superseding user instructions, when writing Parquet version 1.0 files,
- // timestamps in nanoseconds are coerced to microseconds
- std::shared_ptr<ArrowWriterProperties> properties =
- (ArrowWriterProperties::Builder())
- .coerce_timestamps(::arrow::TimeUnit::MICRO)
- ->disallow_truncated_timestamps()
- ->build();
- return WriteCoerce(properties.get());
- } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) {
- // Absent superseding user instructions, timestamps in seconds are coerced to
- // milliseconds
- std::shared_ptr<ArrowWriterProperties> properties =
- (ArrowWriterProperties::Builder())
- .coerce_timestamps(::arrow::TimeUnit::MILLI)
- ->build();
- return WriteCoerce(properties.get());
- } else {
- // No data conversion necessary
- return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels, ctx,
- writer, maybe_parent_nulls);
- }
-}
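-
-// A hedged usage sketch (comment only): a caller wanting microsecond output
-// regardless of file version can request the coercion explicitly, mirroring
-// the builder pattern used above:
-//
-//   std::shared_ptr<ArrowWriterProperties> props =
-//       (ArrowWriterProperties::Builder())
-//           .coerce_timestamps(::arrow::TimeUnit::MICRO)
-//           ->allow_truncated_timestamps()
-//           ->build();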
-
-template <>
-Status TypedColumnWriterImpl<Int64Type>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- switch (array.type()->id()) {
- case ::arrow::Type::TIMESTAMP:
- return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this,
- maybe_parent_nulls);
- WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type)
- WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type)
- WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type)
- WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type)
- default:
- ARROW_UNSUPPORTED();
- }
-}
-
-template <>
-Status TypedColumnWriterImpl<Int96Type>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::TIMESTAMP) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowSerialize<Int96Type, ::arrow::TimestampType>(
- array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
-}
-
-// ----------------------------------------------------------------------
-// Floating point types
-
-template <>
-Status TypedColumnWriterImpl<FloatType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::FLOAT) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowZeroCopy<FloatType>(array, num_levels, def_levels, rep_levels, ctx,
- this, maybe_parent_nulls);
-}
-
-template <>
-Status TypedColumnWriterImpl<DoubleType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::DOUBLE) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowZeroCopy<DoubleType>(array, num_levels, def_levels, rep_levels, ctx,
- this, maybe_parent_nulls);
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow to BYTE_ARRAY
-
-template <>
-Status TypedColumnWriterImpl<ByteArrayType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (!::arrow::is_base_binary_like(array.type()->id())) {
- ARROW_UNSUPPORTED();
- }
-
- int64_t value_offset = 0;
- auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t batch_num_values = 0;
- int64_t batch_num_spaced_values = 0;
- int64_t null_count = 0;
-
- MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
- &batch_num_values, &batch_num_spaced_values, &null_count);
- WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
- std::shared_ptr<Array> data_slice =
- array.Slice(value_offset, batch_num_spaced_values);
- PARQUET_ASSIGN_OR_THROW(
- data_slice, MaybeReplaceValidity(data_slice, null_count, ctx->memory_pool));
-
- current_encoder_->Put(*data_slice);
- if (page_statistics_ != nullptr) {
- page_statistics_->Update(*data_slice);
- }
- CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
- CheckDictionarySizeLimit();
- value_offset += batch_num_spaced_values;
- };
-
- PARQUET_CATCH_NOT_OK(
- DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk));
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow to FIXED_LEN_BYTE_ARRAY
-
-template <typename ParquetType, typename ArrowType>
-struct SerializeFunctor<
- ParquetType, ArrowType,
- ::arrow::enable_if_t<::arrow::is_fixed_size_binary_type<ArrowType>::value &&
- !::arrow::is_decimal_type<ArrowType>::value>> {
- Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*,
- FLBA* out) {
- if (array.null_count() == 0) {
-      // No nulls, just dump the data.
-      // TODO(advancedxy): use a WriteBatch to avoid this step.
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = FixedLenByteArray(array.GetValue(i));
- }
- } else {
- for (int64_t i = 0; i < array.length(); i++) {
- if (array.IsValid(i)) {
- out[i] = FixedLenByteArray(array.GetValue(i));
- }
- }
- }
- return Status::OK();
- }
-};
-
-// ----------------------------------------------------------------------
-// Write Arrow to Decimal128
-
-// Requires a custom serializer because decimals in Parquet are stored in
-// big-endian format. Thus, a temporary local buffer is required.
-template <typename ParquetType, typename ArrowType>
-struct SerializeFunctor<ParquetType, ArrowType, ::arrow::enable_if_decimal<ArrowType>> {
- Status Serialize(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
- ArrowWriteContext* ctx, FLBA* out) {
- AllocateScratch(array, ctx);
- auto offset = Offset(array);
-
- if (array.null_count() == 0) {
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = FixDecimalEndianess<ArrowType::kByteWidth>(array.GetValue(i), offset);
- }
- } else {
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = array.IsValid(i) ? FixDecimalEndianess<ArrowType::kByteWidth>(
- array.GetValue(i), offset)
- : FixedLenByteArray();
- }
- }
-
- return Status::OK();
- }
-
-  // Parquet's Decimals are stored as fixed-length (FLBA) values whose length
-  // is proportional to the precision, while Arrow's Decimals are always
-  // stored in 16/32 bytes. Thus the internal FLBA pointer must be adjusted
-  // by the offset calculated here.
- int32_t Offset(const Array& array) {
- auto decimal_type = checked_pointer_cast<::arrow::DecimalType>(array.type());
- return decimal_type->byte_width() -
- ::arrow::DecimalType::DecimalSize(decimal_type->precision());
- }
-
- void AllocateScratch(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
- ArrowWriteContext* ctx) {
- int64_t non_null_count = array.length() - array.null_count();
- int64_t size = non_null_count * ArrowType::kByteWidth;
- scratch_buffer = AllocateBuffer(ctx->memory_pool, size);
- scratch = reinterpret_cast<int64_t*>(scratch_buffer->mutable_data());
- }
-
- template <int byte_width>
- FixedLenByteArray FixDecimalEndianess(const uint8_t* in, int64_t offset) {
- const auto* u64_in = reinterpret_cast<const int64_t*>(in);
- auto out = reinterpret_cast<const uint8_t*>(scratch) + offset;
- static_assert(byte_width == 16 || byte_width == 32,
- "only 16 and 32 byte Decimals supported");
- if (byte_width == 32) {
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[3]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[2]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
- } else {
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
- }
- return FixedLenByteArray(out);
- }
-
- std::shared_ptr<ResizableBuffer> scratch_buffer;
- int64_t* scratch;
-};
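-
-// Illustrative sizing example (comment only): a Decimal128 column with
-// precision 9 is stored in Parquet as a 4-byte FLBA
-// (DecimalType::DecimalSize(9) == 4), while Arrow always materializes
-// 16 bytes, so Offset() returns 16 - 4 == 12 and only the trailing four
-// big-endian bytes of each converted value are handed out as the FLBA.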
-
-template <>
-Status TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- switch (array.type()->id()) {
- WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
- WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
- WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
-    default:
-      ARROW_UNSUPPORTED();
-  }
-  return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Dynamic column writer constructor
-
-std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* metadata,
- std::unique_ptr<PageWriter> pager,
- const WriterProperties* properties) {
- const ColumnDescriptor* descr = metadata->descr();
- const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
- descr->physical_type() != Type::BOOLEAN;
- Encoding::type encoding = properties->encoding(descr->path());
- if (use_dictionary) {
- encoding = properties->dictionary_index_encoding();
- }
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedColumnWriterImpl<BooleanType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::INT32:
- return std::make_shared<TypedColumnWriterImpl<Int32Type>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::INT64:
- return std::make_shared<TypedColumnWriterImpl<Int64Type>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::INT96:
- return std::make_shared<TypedColumnWriterImpl<Int96Type>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::FLOAT:
- return std::make_shared<TypedColumnWriterImpl<FloatType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::DOUBLE:
- return std::make_shared<TypedColumnWriterImpl<DoubleType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedColumnWriterImpl<ByteArrayType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedColumnWriterImpl<FLBAType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- default:
- ParquetException::NYI("type reader not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return std::shared_ptr<ColumnWriter>(nullptr);
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_writer.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api.h"
+#include "arrow/io/memory.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/column_page.h"
+#include "parquet/encoding.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/level_conversion.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h"
+#include "parquet/types.h"
+
+using arrow::Array;
+using arrow::ArrayData;
+using arrow::Datum;
+using arrow::Result;
+using arrow::Status;
+using arrow::BitUtil::BitWriter;
+using arrow::internal::checked_cast;
+using arrow::internal::checked_pointer_cast;
+using arrow::util::RleEncoder;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+
+namespace {
+
+// Visitor that extracts the value buffer from a flat array at a given offset.
+struct ValueBufferSlicer {
+ template <typename T>
+ ::arrow::enable_if_base_binary<typename T::TypeClass, Status> Visit(const T& array) {
+ auto data = array.data();
+ buffer_ =
+ SliceBuffer(data->buffers[1], data->offset * sizeof(typename T::offset_type),
+ data->length * sizeof(typename T::offset_type));
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_fixed_size_binary<typename T::TypeClass, Status> Visit(
+ const T& array) {
+ auto data = array.data();
+ buffer_ = SliceBuffer(data->buffers[1], data->offset * array.byte_width(),
+ data->length * array.byte_width());
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<::arrow::has_c_type<typename T::TypeClass>::value &&
+ !std::is_same<BooleanType, typename T::TypeClass>::value,
+ Status>
+ Visit(const T& array) {
+ auto data = array.data();
+ buffer_ = SliceBuffer(
+ data->buffers[1],
+ ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->offset),
+ ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->length));
+ return Status::OK();
+ }
+
+ Status Visit(const ::arrow::BooleanArray& array) {
+ auto data = array.data();
+ if (BitUtil::IsMultipleOf8(data->offset)) {
+ buffer_ = SliceBuffer(data->buffers[1], BitUtil::BytesForBits(data->offset),
+ BitUtil::BytesForBits(data->length));
+ return Status::OK();
+ }
+ PARQUET_ASSIGN_OR_THROW(buffer_,
+ ::arrow::internal::CopyBitmap(pool_, data->buffers[1]->data(),
+ data->offset, data->length));
+ return Status::OK();
+ }
+#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
+ Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
+ return Status::NotImplemented("Slicing not implemented for " #ArrowTypePrefix); \
+ }
+
+ NOT_IMPLEMENTED_VISIT(Null);
+ NOT_IMPLEMENTED_VISIT(Union);
+ NOT_IMPLEMENTED_VISIT(List);
+ NOT_IMPLEMENTED_VISIT(LargeList);
+ NOT_IMPLEMENTED_VISIT(Struct);
+ NOT_IMPLEMENTED_VISIT(FixedSizeList);
+ NOT_IMPLEMENTED_VISIT(Dictionary);
+ NOT_IMPLEMENTED_VISIT(Extension);
+
+#undef NOT_IMPLEMENTED_VISIT
+
+ MemoryPool* pool_;
+ std::shared_ptr<Buffer> buffer_;
+};
+
+internal::LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
+ internal::LevelInfo level_info;
+ level_info.def_level = descr->max_definition_level();
+ level_info.rep_level = descr->max_repetition_level();
+
+ int16_t min_spaced_def_level = descr->max_definition_level();
+ const ::parquet::schema::Node* node = descr->schema_node().get();
+ while (node != nullptr && !node->is_repeated()) {
+ if (node->is_optional()) {
+ min_spaced_def_level--;
+ }
+ node = node->parent();
+ }
+ level_info.repeated_ancestor_def_level = min_spaced_def_level;
+ return level_info;
+}
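+
+// Illustrative example (comment only): for a standard three-level list
+// schema, optional group (LIST) -> repeated group -> optional int32 leaf,
+// the leaf descriptor has max_definition_level == 3 and
+// max_repetition_level == 1. Walking up from the leaf, only the leaf itself
+// is optional below the repeated node, so repeated_ancestor_def_level == 2.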
+
+template <class T>
+inline const T* AddIfNotNull(const T* base, int64_t offset) {
+ if (base != nullptr) {
+ return base + offset;
+ }
+ return nullptr;
+}
+
+} // namespace
+
+LevelEncoder::LevelEncoder() {}
+LevelEncoder::~LevelEncoder() {}
+
+void LevelEncoder::Init(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values, uint8_t* data, int data_size) {
+ bit_width_ = BitUtil::Log2(max_level + 1);
+ encoding_ = encoding;
+ switch (encoding) {
+ case Encoding::RLE: {
+ rle_encoder_.reset(new RleEncoder(data, data_size, bit_width_));
+ break;
+ }
+ case Encoding::BIT_PACKED: {
+ int num_bytes =
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
+ bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+}
+
+int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values) {
+ int bit_width = BitUtil::Log2(max_level + 1);
+ int num_bytes = 0;
+ switch (encoding) {
+ case Encoding::RLE: {
+      // TODO: Due to the way we currently check whether the buffer is full
+      // enough, we need to have MinBufferSize as headroom.
+ num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
+ RleEncoder::MinBufferSize(bit_width);
+ break;
+ }
+ case Encoding::BIT_PACKED: {
+ num_bytes =
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+ return num_bytes;
+}
+
+int LevelEncoder::Encode(int batch_size, const int16_t* levels) {
+ int num_encoded = 0;
+ if (!rle_encoder_ && !bit_packed_encoder_) {
+ throw ParquetException("Level encoders are not initialized.");
+ }
+
+ if (encoding_ == Encoding::RLE) {
+ for (int i = 0; i < batch_size; ++i) {
+ if (!rle_encoder_->Put(*(levels + i))) {
+ break;
+ }
+ ++num_encoded;
+ }
+ rle_encoder_->Flush();
+ rle_length_ = rle_encoder_->len();
+ } else {
+ for (int i = 0; i < batch_size; ++i) {
+ if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) {
+ break;
+ }
+ ++num_encoded;
+ }
+ bit_packed_encoder_->Flush();
+ }
+ return num_encoded;
+}
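+
+// A minimal usage sketch (comment only; the buffer size is hypothetical):
+//
+//   int16_t levels[] = {0, 1, 1, 0};
+//   uint8_t buf[64];
+//   LevelEncoder encoder;
+//   // max_level == 1, so bit_width_ == BitUtil::Log2(2) == 1
+//   encoder.Init(Encoding::RLE, /*max_level=*/1, /*num_buffered_values=*/4,
+//                buf, /*data_size=*/64);
+//   int num_encoded = encoder.Encode(4, levels);  // 4 on success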
+
+// ----------------------------------------------------------------------
+// PageWriter implementation
+
+// This subclass delimits pages appearing in a serialized stream, each preceded
+// by a serialized Thrift format::PageHeader indicating the type of each page
+// and the page metadata.
+class SerializedPageWriter : public PageWriter {
+ public:
+ SerializedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t column_chunk_ordinal,
+ MemoryPool* pool = ::arrow::default_memory_pool(),
+ std::shared_ptr<Encryptor> meta_encryptor = nullptr,
+ std::shared_ptr<Encryptor> data_encryptor = nullptr)
+ : sink_(std::move(sink)),
+ metadata_(metadata),
+ pool_(pool),
+ num_values_(0),
+ dictionary_page_offset_(0),
+ data_page_offset_(0),
+ total_uncompressed_size_(0),
+ total_compressed_size_(0),
+ page_ordinal_(0),
+ row_group_ordinal_(row_group_ordinal),
+ column_ordinal_(column_chunk_ordinal),
+ meta_encryptor_(std::move(meta_encryptor)),
+ data_encryptor_(std::move(data_encryptor)),
+ encryption_buffer_(AllocateBuffer(pool, 0)) {
+ if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) {
+ InitEncryption();
+ }
+ compressor_ = GetCodec(codec, compression_level);
+ thrift_serializer_.reset(new ThriftSerializer);
+ }
+
+ int64_t WriteDictionaryPage(const DictionaryPage& page) override {
+ int64_t uncompressed_size = page.size();
+ std::shared_ptr<Buffer> compressed_data;
+ if (has_compressor()) {
+ auto buffer = std::static_pointer_cast<ResizableBuffer>(
+ AllocateBuffer(pool_, uncompressed_size));
+ Compress(*(page.buffer().get()), buffer.get());
+ compressed_data = std::static_pointer_cast<Buffer>(buffer);
+ } else {
+ compressed_data = page.buffer();
+ }
+
+ format::DictionaryPageHeader dict_page_header;
+ dict_page_header.__set_num_values(page.num_values());
+ dict_page_header.__set_encoding(ToThrift(page.encoding()));
+ dict_page_header.__set_is_sorted(page.is_sorted());
+
+ const uint8_t* output_data_buffer = compressed_data->data();
+ int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+
+ if (data_encryptor_.get()) {
+ UpdateEncryption(encryption::kDictionaryPage);
+ PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
+ data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
+ output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+ encryption_buffer_->mutable_data());
+ output_data_buffer = encryption_buffer_->data();
+ }
+
+ format::PageHeader page_header;
+ page_header.__set_type(format::PageType::DICTIONARY_PAGE);
+ page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
+ page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
+ page_header.__set_dictionary_page_header(dict_page_header);
+ // TODO(PARQUET-594) crc checksum
+
+ PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
+ if (dictionary_page_offset_ == 0) {
+ dictionary_page_offset_ = start_pos;
+ }
+
+ if (meta_encryptor_) {
+ UpdateEncryption(encryption::kDictionaryPageHeader);
+ }
+ const int64_t header_size =
+ thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
+
+ PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
+
+ total_uncompressed_size_ += uncompressed_size + header_size;
+ total_compressed_size_ += output_data_len + header_size;
+ ++dict_encoding_stats_[page.encoding()];
+ return uncompressed_size + header_size;
+ }
+
+ void Close(bool has_dictionary, bool fallback) override {
+ if (meta_encryptor_ != nullptr) {
+ UpdateEncryption(encryption::kColumnMetaData);
+ }
+    // index_page_offset = -1 since index pages are not supported
+ metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_,
+ total_compressed_size_, total_uncompressed_size_, has_dictionary,
+ fallback, dict_encoding_stats_, data_encoding_stats_,
+ meta_encryptor_);
+ // Write metadata at end of column chunk
+ metadata_->WriteTo(sink_.get());
+ }
+
+  // Compress src_buffer into dest_buffer using the configured codec.
+ void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
+ DCHECK(compressor_ != nullptr);
+
+ // Compress the data
+ int64_t max_compressed_size =
+ compressor_->MaxCompressedLen(src_buffer.size(), src_buffer.data());
+
+    // Use shrink_to_fit = false: the underlying buffer only keeps growing,
+    // and resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(max_compressed_size, false));
+
+ PARQUET_ASSIGN_OR_THROW(
+ int64_t compressed_size,
+ compressor_->Compress(src_buffer.size(), src_buffer.data(), max_compressed_size,
+ dest_buffer->mutable_data()));
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(compressed_size, false));
+ }
+
+ int64_t WriteDataPage(const DataPage& page) override {
+ const int64_t uncompressed_size = page.uncompressed_size();
+ std::shared_ptr<Buffer> compressed_data = page.buffer();
+ const uint8_t* output_data_buffer = compressed_data->data();
+ int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+
+ if (data_encryptor_.get()) {
+ PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
+ data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
+ UpdateEncryption(encryption::kDataPage);
+ output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+ encryption_buffer_->mutable_data());
+ output_data_buffer = encryption_buffer_->data();
+ }
+
+ format::PageHeader page_header;
+ page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
+ page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
+ // TODO(PARQUET-594) crc checksum
+
+ if (page.type() == PageType::DATA_PAGE) {
+ const DataPageV1& v1_page = checked_cast<const DataPageV1&>(page);
+ SetDataPageHeader(page_header, v1_page);
+ } else if (page.type() == PageType::DATA_PAGE_V2) {
+ const DataPageV2& v2_page = checked_cast<const DataPageV2&>(page);
+ SetDataPageV2Header(page_header, v2_page);
+ } else {
+ throw ParquetException("Unexpected page type");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
+ if (page_ordinal_ == 0) {
+ data_page_offset_ = start_pos;
+ }
+
+ if (meta_encryptor_) {
+ UpdateEncryption(encryption::kDataPageHeader);
+ }
+ const int64_t header_size =
+ thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
+ PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
+
+ total_uncompressed_size_ += uncompressed_size + header_size;
+ total_compressed_size_ += output_data_len + header_size;
+ num_values_ += page.num_values();
+ ++data_encoding_stats_[page.encoding()];
+ ++page_ordinal_;
+ return uncompressed_size + header_size;
+ }
+
+ void SetDataPageHeader(format::PageHeader& page_header, const DataPageV1& page) {
+ format::DataPageHeader data_page_header;
+ data_page_header.__set_num_values(page.num_values());
+ data_page_header.__set_encoding(ToThrift(page.encoding()));
+ data_page_header.__set_definition_level_encoding(
+ ToThrift(page.definition_level_encoding()));
+ data_page_header.__set_repetition_level_encoding(
+ ToThrift(page.repetition_level_encoding()));
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ page_header.__set_type(format::PageType::DATA_PAGE);
+ page_header.__set_data_page_header(data_page_header);
+ }
+
+  void SetDataPageV2Header(format::PageHeader& page_header, const DataPageV2& page) {
+ format::DataPageHeaderV2 data_page_header;
+ data_page_header.__set_num_values(page.num_values());
+ data_page_header.__set_num_nulls(page.num_nulls());
+ data_page_header.__set_num_rows(page.num_rows());
+ data_page_header.__set_encoding(ToThrift(page.encoding()));
+
+ data_page_header.__set_definition_levels_byte_length(
+ page.definition_levels_byte_length());
+ data_page_header.__set_repetition_levels_byte_length(
+ page.repetition_levels_byte_length());
+
+ data_page_header.__set_is_compressed(page.is_compressed());
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ page_header.__set_type(format::PageType::DATA_PAGE_V2);
+ page_header.__set_data_page_header_v2(data_page_header);
+ }
+
+ bool has_compressor() override { return (compressor_ != nullptr); }
+
+ int64_t num_values() { return num_values_; }
+
+ int64_t dictionary_page_offset() { return dictionary_page_offset_; }
+
+ int64_t data_page_offset() { return data_page_offset_; }
+
+ int64_t total_compressed_size() { return total_compressed_size_; }
+
+ int64_t total_uncompressed_size() { return total_uncompressed_size_; }
+
+ private:
+ // To allow UpdateEncryption on Close
+ friend class BufferedPageWriter;
+
+ void InitEncryption() {
+ // Prepare the AAD for quick update later.
+ if (data_encryptor_ != nullptr) {
+ data_page_aad_ = encryption::CreateModuleAad(
+ data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_,
+ column_ordinal_, kNonPageOrdinal);
+ }
+ if (meta_encryptor_ != nullptr) {
+ data_page_header_aad_ = encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_,
+ column_ordinal_, kNonPageOrdinal);
+ }
+ }
+
+ void UpdateEncryption(int8_t module_type) {
+ switch (module_type) {
+ case encryption::kColumnMetaData: {
+ meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ case encryption::kDataPage: {
+ encryption::QuickUpdatePageAad(data_page_aad_, page_ordinal_);
+ data_encryptor_->UpdateAad(data_page_aad_);
+ break;
+ }
+ case encryption::kDataPageHeader: {
+ encryption::QuickUpdatePageAad(data_page_header_aad_, page_ordinal_);
+ meta_encryptor_->UpdateAad(data_page_header_aad_);
+ break;
+ }
+ case encryption::kDictionaryPageHeader: {
+ meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ case encryption::kDictionaryPage: {
+ data_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown module type in UpdateEncryption");
+ }
+ }
+
+ std::shared_ptr<ArrowOutputStream> sink_;
+ ColumnChunkMetaDataBuilder* metadata_;
+ MemoryPool* pool_;
+ int64_t num_values_;
+ int64_t dictionary_page_offset_;
+ int64_t data_page_offset_;
+ int64_t total_uncompressed_size_;
+ int64_t total_compressed_size_;
+ int16_t page_ordinal_;
+ int16_t row_group_ordinal_;
+ int16_t column_ordinal_;
+
+ std::unique_ptr<ThriftSerializer> thrift_serializer_;
+
+ // Compression codec to use.
+ std::unique_ptr<::arrow::util::Codec> compressor_;
+
+ std::string data_page_aad_;
+ std::string data_page_header_aad_;
+
+ std::shared_ptr<Encryptor> meta_encryptor_;
+ std::shared_ptr<Encryptor> data_encryptor_;
+
+ std::shared_ptr<ResizableBuffer> encryption_buffer_;
+
+ std::map<Encoding::type, int32_t> dict_encoding_stats_;
+ std::map<Encoding::type, int32_t> data_encoding_stats_;
+};
+
+// This implementation of the PageWriter buffers the serialized pages in
+// memory and writes them to the final sink on Close().
+class BufferedPageWriter : public PageWriter {
+ public:
+ BufferedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t current_column_ordinal,
+ MemoryPool* pool = ::arrow::default_memory_pool(),
+ std::shared_ptr<Encryptor> meta_encryptor = nullptr,
+ std::shared_ptr<Encryptor> data_encryptor = nullptr)
+ : final_sink_(std::move(sink)), metadata_(metadata), has_dictionary_pages_(false) {
+ in_memory_sink_ = CreateOutputStream(pool);
+ pager_ = std::unique_ptr<SerializedPageWriter>(
+ new SerializedPageWriter(in_memory_sink_, codec, compression_level, metadata,
+ row_group_ordinal, current_column_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ }
+
+ int64_t WriteDictionaryPage(const DictionaryPage& page) override {
+ has_dictionary_pages_ = true;
+ return pager_->WriteDictionaryPage(page);
+ }
+
+ void Close(bool has_dictionary, bool fallback) override {
+ if (pager_->meta_encryptor_ != nullptr) {
+ pager_->UpdateEncryption(encryption::kColumnMetaData);
+ }
+    // index_page_offset = -1 since index pages are not supported
+ PARQUET_ASSIGN_OR_THROW(int64_t final_position, final_sink_->Tell());
+ // dictionary page offset should be 0 iff there are no dictionary pages
+ auto dictionary_page_offset =
+ has_dictionary_pages_ ? pager_->dictionary_page_offset() + final_position : 0;
+ metadata_->Finish(pager_->num_values(), dictionary_page_offset, -1,
+ pager_->data_page_offset() + final_position,
+ pager_->total_compressed_size(), pager_->total_uncompressed_size(),
+ has_dictionary, fallback, pager_->dict_encoding_stats_,
+ pager_->data_encoding_stats_, pager_->meta_encryptor_);
+
+ // Write metadata at end of column chunk
+ metadata_->WriteTo(in_memory_sink_.get());
+
+ // flush everything to the serialized sink
+ PARQUET_ASSIGN_OR_THROW(auto buffer, in_memory_sink_->Finish());
+ PARQUET_THROW_NOT_OK(final_sink_->Write(buffer));
+ }
+
+ int64_t WriteDataPage(const DataPage& page) override {
+ return pager_->WriteDataPage(page);
+ }
+
+ void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
+ pager_->Compress(src_buffer, dest_buffer);
+ }
+
+ bool has_compressor() override { return pager_->has_compressor(); }
+
+ private:
+ std::shared_ptr<ArrowOutputStream> final_sink_;
+ ColumnChunkMetaDataBuilder* metadata_;
+ std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_;
+ std::unique_ptr<SerializedPageWriter> pager_;
+ bool has_dictionary_pages_;
+};
+
+std::unique_ptr<PageWriter> PageWriter::Open(
+ std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool,
+ bool buffered_row_group, std::shared_ptr<Encryptor> meta_encryptor,
+ std::shared_ptr<Encryptor> data_encryptor) {
+ if (buffered_row_group) {
+ return std::unique_ptr<PageWriter>(
+ new BufferedPageWriter(std::move(sink), codec, compression_level, metadata,
+ row_group_ordinal, column_chunk_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ } else {
+ return std::unique_ptr<PageWriter>(
+ new SerializedPageWriter(std::move(sink), codec, compression_level, metadata,
+ row_group_ordinal, column_chunk_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ }
+}
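+
+// A hedged usage sketch (comment only; `sink` and `metadata` are assumed to
+// come from the surrounding file-writer machinery):
+//
+//   std::unique_ptr<PageWriter> pager = PageWriter::Open(
+//       sink, Compression::UNCOMPRESSED,
+//       ::arrow::util::Codec::UseDefaultCompressionLevel(), metadata,
+//       /*row_group_ordinal=*/0, /*column_chunk_ordinal=*/0,
+//       ::arrow::default_memory_pool(), /*buffered_row_group=*/false);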
+
+// ----------------------------------------------------------------------
+// ColumnWriter
+
+const std::shared_ptr<WriterProperties>& default_writer_properties() {
+ static std::shared_ptr<WriterProperties> default_writer_properties =
+ WriterProperties::Builder().build();
+ return default_writer_properties;
+}
+
+class ColumnWriterImpl {
+ public:
+ ColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager, const bool use_dictionary,
+ Encoding::type encoding, const WriterProperties* properties)
+ : metadata_(metadata),
+ descr_(metadata->descr()),
+ level_info_(ComputeLevelInfo(metadata->descr())),
+ pager_(std::move(pager)),
+ has_dictionary_(use_dictionary),
+ encoding_(encoding),
+ properties_(properties),
+ allocator_(properties->memory_pool()),
+ num_buffered_values_(0),
+ num_buffered_encoded_values_(0),
+ rows_written_(0),
+ total_bytes_written_(0),
+ total_compressed_bytes_(0),
+ closed_(false),
+ fallback_(false),
+ definition_levels_sink_(allocator_),
+ repetition_levels_sink_(allocator_) {
+ definition_levels_rle_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ repetition_levels_rle_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ uncompressed_data_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+
+ if (pager_->has_compressor()) {
+ compressor_temp_buffer_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ }
+ }
+
+ virtual ~ColumnWriterImpl() = default;
+
+ int64_t Close();
+
+ protected:
+ virtual std::shared_ptr<Buffer> GetValuesBuffer() = 0;
+
+ // Serializes Dictionary Page if enabled
+ virtual void WriteDictionaryPage() = 0;
+
+ // Plain-encoded statistics of the current page
+ virtual EncodedStatistics GetPageStatistics() = 0;
+
+ // Plain-encoded statistics of the whole chunk
+ virtual EncodedStatistics GetChunkStatistics() = 0;
+
+ // Merges page statistics into chunk statistics, then resets the values
+ virtual void ResetPageStatistics() = 0;
+
+  // Adds Data Pages to an in-memory buffer in dictionary encoding mode;
+  // serializes the Data Pages directly in other encoding modes.
+ void AddDataPage();
+
+ void BuildDataPageV1(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size, int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values);
+ void BuildDataPageV2(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size, int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values);
+
+ // Serializes Data Pages
+ void WriteDataPage(const DataPage& page) {
+ total_bytes_written_ += pager_->WriteDataPage(page);
+ }
+
+ // Write multiple definition levels
+ void WriteDefinitionLevels(int64_t num_levels, const int16_t* levels) {
+ DCHECK(!closed_);
+ PARQUET_THROW_NOT_OK(
+ definition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
+ }
+
+ // Write multiple repetition levels
+ void WriteRepetitionLevels(int64_t num_levels, const int16_t* levels) {
+ DCHECK(!closed_);
+ PARQUET_THROW_NOT_OK(
+ repetition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
+ }
+
+ // RLE encode the src_buffer into dest_buffer and return the encoded size
+ int64_t RleEncodeLevels(const void* src_buffer, ResizableBuffer* dest_buffer,
+ int16_t max_level, bool include_length_prefix = true);
+
+ // Serialize the buffered Data Pages
+ void FlushBufferedDataPages();
+
+ ColumnChunkMetaDataBuilder* metadata_;
+ const ColumnDescriptor* descr_;
+ // scratch buffer if validity bits need to be recalculated.
+ std::shared_ptr<ResizableBuffer> bits_buffer_;
+ const internal::LevelInfo level_info_;
+
+ std::unique_ptr<PageWriter> pager_;
+
+ bool has_dictionary_;
+ Encoding::type encoding_;
+ const WriterProperties* properties_;
+
+ LevelEncoder level_encoder_;
+
+ MemoryPool* allocator_;
+
+ // The total number of values stored in the data page. This is the maximum of
+ // the number of encoded definition levels or encoded values. For
+ // non-repeated, required columns, this is equal to the number of encoded
+ // values. For repeated or optional values, there may be fewer data values
+ // than levels, and this tells you how many encoded levels there are in that
+ // case.
+ int64_t num_buffered_values_;
+
+ // The total number of stored values. For repeated or optional values, this
+ // number may be lower than num_buffered_values_.
+ int64_t num_buffered_encoded_values_;
+
+ // Total number of rows written with this ColumnWriter
+ int rows_written_;
+
+ // Records the total number of uncompressed bytes written by the serializer
+ int64_t total_bytes_written_;
+
+ // Records the current number of compressed bytes in a column
+ int64_t total_compressed_bytes_;
+
+ // Flag to check if the Writer has been closed
+ bool closed_;
+
+ // Flag to infer if dictionary encoding has fallen back to PLAIN
+ bool fallback_;
+
+ ::arrow::BufferBuilder definition_levels_sink_;
+ ::arrow::BufferBuilder repetition_levels_sink_;
+
+ std::shared_ptr<ResizableBuffer> definition_levels_rle_;
+ std::shared_ptr<ResizableBuffer> repetition_levels_rle_;
+
+ std::shared_ptr<ResizableBuffer> uncompressed_data_;
+ std::shared_ptr<ResizableBuffer> compressor_temp_buffer_;
+
+ std::vector<std::unique_ptr<DataPage>> data_pages_;
+
+ private:
+ void InitSinks() {
+ definition_levels_sink_.Rewind(0);
+ repetition_levels_sink_.Rewind(0);
+ }
+
+ // Concatenate the encoded levels and values into one buffer
+ void ConcatenateBuffers(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ const std::shared_ptr<Buffer>& values, uint8_t* combined) {
+ memcpy(combined, repetition_levels_rle_->data(), repetition_levels_rle_size);
+ combined += repetition_levels_rle_size;
+ memcpy(combined, definition_levels_rle_->data(), definition_levels_rle_size);
+ combined += definition_levels_rle_size;
+ memcpy(combined, values->data(), values->size());
+ }
+};
+
+// Returns the size of the encoded buffer.
+int64_t ColumnWriterImpl::RleEncodeLevels(const void* src_buffer,
+ ResizableBuffer* dest_buffer, int16_t max_level,
+ bool include_length_prefix) {
+ // V1 DataPage includes the length of the RLE level as a prefix.
+ int32_t prefix_size = include_length_prefix ? sizeof(int32_t) : 0;
+
+  // TODO: This only works due to some specifics of the RLE encoder.
+ int64_t rle_size = LevelEncoder::MaxBufferSize(Encoding::RLE, max_level,
+ static_cast<int>(num_buffered_values_)) +
+ prefix_size;
+
+  // Use shrink_to_fit = false: the underlying buffer only keeps growing,
+  // and resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(rle_size, false));
+
+ level_encoder_.Init(Encoding::RLE, max_level, static_cast<int>(num_buffered_values_),
+ dest_buffer->mutable_data() + prefix_size,
+ static_cast<int>(dest_buffer->size() - prefix_size));
+ int encoded = level_encoder_.Encode(static_cast<int>(num_buffered_values_),
+ reinterpret_cast<const int16_t*>(src_buffer));
+ DCHECK_EQ(encoded, num_buffered_values_);
+
+ if (include_length_prefix) {
+ reinterpret_cast<int32_t*>(dest_buffer->mutable_data())[0] = level_encoder_.len();
+ }
+
+ return level_encoder_.len() + prefix_size;
+}
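+
+// Resulting buffer layout for a V1 data page (illustrative):
+//
+//   [ int32 length of the RLE data ][ RLE-encoded levels ... ]
+//
+// For V2 data pages include_length_prefix is false; the levels byte length
+// is recorded in the page header instead.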
+
+void ColumnWriterImpl::AddDataPage() {
+ int64_t definition_levels_rle_size = 0;
+ int64_t repetition_levels_rle_size = 0;
+
+ std::shared_ptr<Buffer> values = GetValuesBuffer();
+ bool is_v1_data_page = properties_->data_page_version() == ParquetDataPageVersion::V1;
+
+ if (descr_->max_definition_level() > 0) {
+ definition_levels_rle_size = RleEncodeLevels(
+ definition_levels_sink_.data(), definition_levels_rle_.get(),
+ descr_->max_definition_level(), /*include_length_prefix=*/is_v1_data_page);
+ }
+
+ if (descr_->max_repetition_level() > 0) {
+ repetition_levels_rle_size = RleEncodeLevels(
+ repetition_levels_sink_.data(), repetition_levels_rle_.get(),
+ descr_->max_repetition_level(), /*include_length_prefix=*/is_v1_data_page);
+ }
+
+ int64_t uncompressed_size =
+ definition_levels_rle_size + repetition_levels_rle_size + values->size();
+
+ if (is_v1_data_page) {
+ BuildDataPageV1(definition_levels_rle_size, repetition_levels_rle_size,
+ uncompressed_size, values);
+ } else {
+ BuildDataPageV2(definition_levels_rle_size, repetition_levels_rle_size,
+ uncompressed_size, values);
+ }
+
+ // Re-initialize the sinks for next Page.
+ InitSinks();
+ num_buffered_values_ = 0;
+ num_buffered_encoded_values_ = 0;
+}
+
+void ColumnWriterImpl::BuildDataPageV1(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values) {
+  // Use shrink_to_fit = false: the underlying buffer only keeps growing,
+  // and resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(uncompressed_data_->Resize(uncompressed_size, false));
+ ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size, values,
+ uncompressed_data_->mutable_data());
+
+ EncodedStatistics page_stats = GetPageStatistics();
+ page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
+ page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+ ResetPageStatistics();
+
+ std::shared_ptr<Buffer> compressed_data;
+ if (pager_->has_compressor()) {
+ pager_->Compress(*(uncompressed_data_.get()), compressor_temp_buffer_.get());
+ compressed_data = compressor_temp_buffer_;
+ } else {
+ compressed_data = uncompressed_data_;
+ }
+
+ // Write the page to OutputStream eagerly if there is no dictionary or
+ // if dictionary encoding has fallen back to PLAIN
+ if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
+ PARQUET_ASSIGN_OR_THROW(
+ auto compressed_data_copy,
+ compressed_data->CopySlice(0, compressed_data->size(), allocator_));
+ std::unique_ptr<DataPage> page_ptr(new DataPageV1(
+ compressed_data_copy, static_cast<int32_t>(num_buffered_values_), encoding_,
+ Encoding::RLE, Encoding::RLE, uncompressed_size, page_stats));
+ total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
+
+ data_pages_.push_back(std::move(page_ptr));
+ } else { // Eagerly write pages
+ DataPageV1 page(compressed_data, static_cast<int32_t>(num_buffered_values_),
+ encoding_, Encoding::RLE, Encoding::RLE, uncompressed_size,
+ page_stats);
+ WriteDataPage(page);
+ }
+}
+
+void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values) {
+ // Compress the values if needed. Repetition and definition levels are uncompressed in
+ // V2.
+ std::shared_ptr<Buffer> compressed_values;
+ if (pager_->has_compressor()) {
+ pager_->Compress(*values, compressor_temp_buffer_.get());
+ compressed_values = compressor_temp_buffer_;
+ } else {
+ compressed_values = values;
+ }
+
+ // Concatenate uncompressed levels and the possibly compressed values
+ int64_t combined_size =
+ definition_levels_rle_size + repetition_levels_rle_size + compressed_values->size();
+ std::shared_ptr<ResizableBuffer> combined = AllocateBuffer(allocator_, combined_size);
+
+ ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size,
+ compressed_values, combined->mutable_data());
+
+ EncodedStatistics page_stats = GetPageStatistics();
+ page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
+ page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+ ResetPageStatistics();
+
+ int32_t num_values = static_cast<int32_t>(num_buffered_values_);
+ int32_t null_count = static_cast<int32_t>(page_stats.null_count);
+ int32_t def_levels_byte_length = static_cast<int32_t>(definition_levels_rle_size);
+ int32_t rep_levels_byte_length = static_cast<int32_t>(repetition_levels_rle_size);
+
+ // Write the page to OutputStream eagerly if there is no dictionary or
+ // if dictionary encoding has fallen back to PLAIN
+ if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
+ PARQUET_ASSIGN_OR_THROW(auto data_copy,
+ combined->CopySlice(0, combined->size(), allocator_));
+ std::unique_ptr<DataPage> page_ptr(new DataPageV2(
+ combined, num_values, null_count, num_values, encoding_, def_levels_byte_length,
+ rep_levels_byte_length, uncompressed_size, pager_->has_compressor()));
+ total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
+ data_pages_.push_back(std::move(page_ptr));
+ } else {
+ DataPageV2 page(combined, num_values, null_count, num_values, encoding_,
+ def_levels_byte_length, rep_levels_byte_length, uncompressed_size,
+ pager_->has_compressor());
+ WriteDataPage(page);
+ }
+}
+
+int64_t ColumnWriterImpl::Close() {
+ if (!closed_) {
+ closed_ = true;
+ if (has_dictionary_ && !fallback_) {
+ WriteDictionaryPage();
+ }
+
+ FlushBufferedDataPages();
+
+ EncodedStatistics chunk_statistics = GetChunkStatistics();
+ chunk_statistics.ApplyStatSizeLimits(
+ properties_->max_statistics_size(descr_->path()));
+ chunk_statistics.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+
+ // Write stats only if the column has at least one row written
+ if (rows_written_ > 0 && chunk_statistics.is_set()) {
+ metadata_->SetStatistics(chunk_statistics);
+ }
+ pager_->Close(has_dictionary_, fallback_);
+ }
+
+ return total_bytes_written_;
+}
+
+void ColumnWriterImpl::FlushBufferedDataPages() {
+ // Write all outstanding data to a new page
+ if (num_buffered_values_ > 0) {
+ AddDataPage();
+ }
+ for (const auto& page_ptr : data_pages_) {
+ WriteDataPage(*page_ptr);
+ }
+ data_pages_.clear();
+ total_compressed_bytes_ = 0;
+}
+
+// ----------------------------------------------------------------------
+// TypedColumnWriter
+
+template <typename Action>
+inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
+  int64_t num_batches = total / batch_size;
+  for (int64_t round = 0; round < num_batches; round++) {
+ action(round * batch_size, batch_size);
+ }
+ // Write the remaining values
+ if (total % batch_size > 0) {
+ action(num_batches * batch_size, total % batch_size);
+ }
+}
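+
+// Illustrative trace (comment only): DoInBatches(10, 4, action) invokes
+// action(0, 4), action(4, 4), and then action(8, 2) for the remainder.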
+
+bool DictionaryDirectWriteSupported(const ::arrow::Array& array) {
+ DCHECK_EQ(array.type_id(), ::arrow::Type::DICTIONARY);
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*array.type());
+ return ::arrow::is_base_binary_like(dict_type.value_type()->id());
+}
+
+Status ConvertDictionaryToDense(const ::arrow::Array& array, MemoryPool* pool,
+ std::shared_ptr<::arrow::Array>* out) {
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*array.type());
+
+ ::arrow::compute::ExecContext ctx(pool);
+ ARROW_ASSIGN_OR_RAISE(Datum cast_output,
+ ::arrow::compute::Cast(array.data(), dict_type.value_type(),
+ ::arrow::compute::CastOptions(), &ctx));
+ *out = cast_output.make_array();
+ return Status::OK();
+}
+
+static inline bool IsDictionaryEncoding(Encoding::type encoding) {
+ return encoding == Encoding::PLAIN_DICTIONARY;
+}
+
+template <typename DType>
+class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager, const bool use_dictionary,
+ Encoding::type encoding, const WriterProperties* properties)
+ : ColumnWriterImpl(metadata, std::move(pager), use_dictionary, encoding,
+ properties) {
+ current_encoder_ = MakeEncoder(DType::type_num, encoding, use_dictionary, descr_,
+ properties->memory_pool());
+
+ if (properties->statistics_enabled(descr_->path()) &&
+ (SortOrder::UNKNOWN != descr_->sort_order())) {
+ page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
+ chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
+ }
+ }
+
+ int64_t Close() override { return ColumnWriterImpl::Close(); }
+
+ int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const T* values) override {
+    // We check for DataPage limits only after we have inserted the values. If
+    // a user writes a large number of values, the DataPage size can be much
+    // above the limit. The purpose of this chunking is to bound that: even if
+    // a user writes a large number of values, the chunking ensures that
+    // AddDataPage() is called at a reasonable page size limit.
+ int64_t value_offset = 0;
+
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t values_to_write = WriteLevels(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+
+ // PARQUET-780
+ if (values_to_write > 0) {
+ DCHECK_NE(nullptr, values);
+ }
+ WriteValues(AddIfNotNull(values, value_offset), values_to_write,
+ batch_size - values_to_write);
+ CommitWriteAndCheckPageLimit(batch_size, values_to_write);
+ value_offset += values_to_write;
+
+ // Dictionary size checked separately from data page size since we
+ // circumvent this check when writing ::arrow::DictionaryArray directly
+ CheckDictionarySizeLimit();
+ };
+ DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
+ return value_offset;
+ }
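+
+  // A hedged usage sketch (comment only; `writer` is assumed to be a
+  // TypedColumnWriter<Int64Type>* for an optional, non-repeated column):
+  //
+  //   int16_t def_levels[] = {1, 0, 1};  // the middle slot is null
+  //   int64_t values[] = {42, 7};        // only non-null values are passed
+  //   writer->WriteBatch(/*num_values=*/3, def_levels,
+  //                      /*rep_levels=*/nullptr, values);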
+
+ void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, const T* values) override {
+ // Like WriteBatch, but for spaced values
+ int64_t value_offset = 0;
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count;
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values,
+ &null_count);
+
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ if (bits_buffer_ != nullptr) {
+ WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
+ batch_num_spaced_values, bits_buffer_->data(), /*offset=*/0);
+ } else {
+ WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
+ batch_num_spaced_values, valid_bits,
+ valid_bits_offset + value_offset);
+ }
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values);
+ value_offset += batch_num_spaced_values;
+
+ // Dictionary size checked separately from data page size since we
+ // circumvent this check when writing ::arrow::DictionaryArray directly
+ CheckDictionarySizeLimit();
+ };
+ DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
+ }
+
+ Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& leaf_array,
+ ArrowWriteContext* ctx, bool leaf_field_nullable) override {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ // Leaf nulls are canonical when there is only a single null element after a list
+ // and it is at the leaf.
+ bool single_nullable_element =
+ (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
+ leaf_field_nullable;
+ bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
+ if (maybe_parent_nulls) {
+ ARROW_ASSIGN_OR_RAISE(
+ bits_buffer_,
+ ::arrow::AllocateResizableBuffer(
+ BitUtil::BytesForBits(properties_->write_batch_size()), ctx->memory_pool));
+ bits_buffer_->ZeroPadding();
+ }
+
+ if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) {
+ return WriteArrowDictionary(def_levels, rep_levels, num_levels, leaf_array, ctx,
+ maybe_parent_nulls);
+ } else {
+ return WriteArrowDense(def_levels, rep_levels, num_levels, leaf_array, ctx,
+ maybe_parent_nulls);
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ int64_t EstimatedBufferedValueBytes() const override {
+ return current_encoder_->EstimatedDataEncodedSize();
+ }
+
+ protected:
+ std::shared_ptr<Buffer> GetValuesBuffer() override {
+ return current_encoder_->FlushValues();
+ }
+
+ // Internal function to handle direct writing of ::arrow::DictionaryArray,
+ // since the standard logic concerning dictionary size limits and fallback to
+ // plain encoding is circumvented
+ Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& array,
+ ArrowWriteContext* context, bool maybe_parent_nulls);
+
+ Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& array,
+ ArrowWriteContext* context, bool maybe_parent_nulls);
+
+ void WriteDictionaryPage() override {
+ // We have to dynamic cast here because of TypedEncoder<Type> as
+ // some compilers don't want to cast through virtual inheritance
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ DCHECK(dict_encoder);
+ std::shared_ptr<ResizableBuffer> buffer =
+ AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size());
+ dict_encoder->WriteDict(buffer->mutable_data());
+
+ DictionaryPage page(buffer, dict_encoder->num_entries(),
+ properties_->dictionary_page_encoding());
+ total_bytes_written_ += pager_->WriteDictionaryPage(page);
+ }
+
+ EncodedStatistics GetPageStatistics() override {
+ EncodedStatistics result;
+ if (page_statistics_) result = page_statistics_->Encode();
+ return result;
+ }
+
+ EncodedStatistics GetChunkStatistics() override {
+ EncodedStatistics result;
+ if (chunk_statistics_) result = chunk_statistics_->Encode();
+ return result;
+ }
+
+ void ResetPageStatistics() override {
+ if (chunk_statistics_ != nullptr) {
+ chunk_statistics_->Merge(*page_statistics_);
+ page_statistics_->Reset();
+ }
+ }
+
+ Type::type type() const override { return descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return descr_; }
+
+ int64_t rows_written() const override { return rows_written_; }
+
+ int64_t total_compressed_bytes() const override { return total_compressed_bytes_; }
+
+ int64_t total_bytes_written() const override { return total_bytes_written_; }
+
+ const WriterProperties* properties() override { return properties_; }
+
+ private:
+ using ValueEncoderType = typename EncodingTraits<DType>::Encoder;
+ using TypedStats = TypedStatistics<DType>;
+ std::unique_ptr<Encoder> current_encoder_;
+ std::shared_ptr<TypedStats> page_statistics_;
+ std::shared_ptr<TypedStats> chunk_statistics_;
+
+  // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep
+  // the dictionary passed to DictEncoder<T>::PutDictionary so we can check
+  // subsequent array chunks to see whether materialization is required (in
+  // which case we fall back to the dense write path).
+ std::shared_ptr<::arrow::Array> preserved_dictionary_;
+
+ int64_t WriteLevels(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels) {
+ int64_t values_to_write = 0;
+ // If the field is required and non-repeated, there are no definition levels
+ if (descr_->max_definition_level() > 0) {
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (def_levels[i] == descr_->max_definition_level()) {
+ ++values_to_write;
+ }
+ }
+
+ WriteDefinitionLevels(num_values, def_levels);
+ } else {
+ // Required field, write all values
+ values_to_write = num_values;
+ }
+
+ // Not present for non-repeated fields
+ if (descr_->max_repetition_level() > 0) {
+ // A row could include more than one value
+ // Count the occasions where we start a new row
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (rep_levels[i] == 0) {
+ rows_written_++;
+ }
+ }
+
+ WriteRepetitionLevels(num_values, rep_levels);
+ } else {
+ // Each value is exactly one row
+ rows_written_ += static_cast<int>(num_values);
+ }
+ return values_to_write;
+ }
+
+  // This method will always update the three output parameters,
+  // out_values_to_write, out_spaced_values_to_write and null_count. Additionally
+  // it will update the validity bitmap if required (i.e. if at least one level
+  // of nullable structs directly precedes the leaf node).
+ void MaybeCalculateValidityBits(const int16_t* def_levels, int64_t batch_size,
+ int64_t* out_values_to_write,
+ int64_t* out_spaced_values_to_write,
+ int64_t* null_count) {
+ if (bits_buffer_ == nullptr) {
+ if (level_info_.def_level == 0) {
+        // In this case def_levels should be null and we only
+        // need to output counts, which will always equal the
+        // batch size passed in (max def_level == 0 indicates
+        // there cannot be repeated or null fields).
+ DCHECK_EQ(def_levels, nullptr);
+ *out_values_to_write = batch_size;
+ *out_spaced_values_to_write = batch_size;
+ *null_count = 0;
+ } else {
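+        // def_level == level_info_.def_level marks a defined leaf value, while
+        // anything >= repeated_ancestor_def_level still occupies a (possibly
+        // null) slot in the spaced output.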
+ for (int x = 0; x < batch_size; x++) {
+ *out_values_to_write += def_levels[x] == level_info_.def_level ? 1 : 0;
+ *out_spaced_values_to_write +=
+ def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
+ }
+ *null_count = *out_values_to_write - *out_spaced_values_to_write;
+ }
+ return;
+ }
+    // Shrinking to fit could cause another allocation, and would only be
+    // necessary on the last batch.
+ int64_t new_bitmap_size = BitUtil::BytesForBits(batch_size);
+ if (new_bitmap_size != bits_buffer_->size()) {
+ PARQUET_THROW_NOT_OK(
+ bits_buffer_->Resize(new_bitmap_size, /*shrink_to_fit=*/false));
+ bits_buffer_->ZeroPadding();
+ }
+ internal::ValidityBitmapInputOutput io;
+ io.valid_bits = bits_buffer_->mutable_data();
+ io.values_read_upper_bound = batch_size;
+ internal::DefLevelsToBitmap(def_levels, batch_size, level_info_, &io);
+ *out_values_to_write = io.values_read - io.null_count;
+ *out_spaced_values_to_write = io.values_read;
+ *null_count = io.null_count;
+ }
+
+ Result<std::shared_ptr<Array>> MaybeReplaceValidity(std::shared_ptr<Array> array,
+ int64_t new_null_count,
+ ::arrow::MemoryPool* memory_pool) {
+ if (bits_buffer_ == nullptr) {
+ return array;
+ }
+ std::vector<std::shared_ptr<Buffer>> buffers = array->data()->buffers;
+ if (buffers.empty()) {
+ return array;
+ }
+ buffers[0] = bits_buffer_;
+ // Should be a leaf array.
+ DCHECK_GT(buffers.size(), 1);
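+    // bits_buffer_ starts at bit offset zero, so if the array itself is
+    // offset we re-slice the value buffer to realign the values with the
+    // replacement validity bitmap.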
+ ValueBufferSlicer slicer{memory_pool, /*buffer=*/nullptr};
+ if (array->data()->offset > 0) {
+ RETURN_NOT_OK(::arrow::VisitArrayInline(*array, &slicer));
+ buffers[1] = slicer.buffer_;
+ }
+ return ::arrow::MakeArray(std::make_shared<ArrayData>(
+ array->type(), array->length(), std::move(buffers), new_null_count));
+ }
+
+ void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels,
+ const int16_t* rep_levels) {
+ // If the field is required and non-repeated, there are no definition levels
+ if (descr_->max_definition_level() > 0) {
+ WriteDefinitionLevels(num_levels, def_levels);
+ }
+ // Not present for non-repeated fields
+ if (descr_->max_repetition_level() > 0) {
+ // A row could include more than one value
+ // Count the occasions where we start a new row
+ for (int64_t i = 0; i < num_levels; ++i) {
+ if (rep_levels[i] == 0) {
+ rows_written_++;
+ }
+ }
+ WriteRepetitionLevels(num_levels, rep_levels);
+ } else {
+ // Each value is exactly one row
+ rows_written_ += static_cast<int>(num_levels);
+ }
+ }
+
+ void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) {
+ num_buffered_values_ += num_levels;
+ num_buffered_encoded_values_ += num_values;
+
+ if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
+ AddDataPage();
+ }
+ }
+
+ void FallbackToPlainEncoding() {
+ if (IsDictionaryEncoding(current_encoder_->encoding())) {
+ WriteDictionaryPage();
+ // Serialize the buffered Dictionary Indices
+ FlushBufferedDataPages();
+ fallback_ = true;
+ // Only PLAIN encoding is supported for fallback in V1
+ current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_,
+ properties_->memory_pool());
+ encoding_ = Encoding::PLAIN;
+ }
+ }
+
+  // Checks if the dictionary page size limit is reached.
+  // If the limit is reached, the dictionary and buffered data pages are
+  // serialized and the encoding switches to PLAIN.
+  //
+  // Only one dictionary page is written per column chunk.
+ void CheckDictionarySizeLimit() {
+ if (!has_dictionary_ || fallback_) {
+ // Either not using dictionary encoding, or we have already fallen back
+ // to PLAIN encoding because the size threshold was reached
+ return;
+ }
+
+    // We have to dynamic_cast here because some compilers don't want to
+    // cast TypedEncoder<Type> through virtual inheritance
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) {
+ FallbackToPlainEncoding();
+ }
+ }
+
+ void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->Put(values, static_cast<int>(num_values));
+ if (page_statistics_ != nullptr) {
+ page_statistics_->Update(values, num_values, num_nulls);
+ }
+ }
+
+ void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
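+    // Take the spaced path only when nulls are actually present
+    // (num_values < num_spaced_values); otherwise a contiguous Put()
+    // avoids scanning the validity bitmap.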
+ if (num_values != num_spaced_values) {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->PutSpaced(values, static_cast<int>(num_spaced_values), valid_bits,
+ valid_bits_offset);
+ } else {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->Put(values, static_cast<int>(num_values));
+ }
+ if (page_statistics_ != nullptr) {
+ const int64_t num_nulls = num_spaced_values - num_values;
+ page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values,
+ num_nulls);
+ }
+ }
+};
+
+template <typename DType>
+Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+  // If this is the first time writing a DictionaryArray, then there are
+  // a few possible paths to take:
+ //
+ // - If dictionary encoding is not enabled, convert to densely
+ // encoded and call WriteArrow
+ // - Dictionary encoding enabled
+ // - If this is the first time this is called, then we call
+ // PutDictionary into the encoder and then PutIndices on each
+ // chunk. We store the dictionary that was written in
+ // preserved_dictionary_ so that subsequent calls to this method
+ // can make sure the dictionary has not changed
+ // - On subsequent calls, we have to check whether the dictionary
+ // has changed. If it has, then we trigger the varying
+ // dictionary path and materialize each chunk and then call
+ // WriteArrow with that
+ auto WriteDense = [&] {
+ std::shared_ptr<::arrow::Array> dense_array;
+ RETURN_NOT_OK(
+ ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array));
+ return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx,
+ maybe_parent_nulls);
+ };
+
+ if (!IsDictionaryEncoding(current_encoder_->encoding()) ||
+ !DictionaryDirectWriteSupported(array)) {
+ // No longer dictionary-encoding for whatever reason, maybe we never were
+ // or we decided to stop. Note that WriteArrow can be invoked multiple
+ // times with both dense and dictionary-encoded versions of the same data
+ // without a problem. Any dense data will be hashed to indices until the
+    // dictionary page limit is reached, at which point everything (dictionary
+    // and dense) will fall back to plain encoding
+ return WriteDense();
+ }
+
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ const auto& data = checked_cast<const ::arrow::DictionaryArray&>(array);
+ std::shared_ptr<::arrow::Array> dictionary = data.dictionary();
+ std::shared_ptr<::arrow::Array> indices = data.indices();
+
+ int64_t value_offset = 0;
+ auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count = ::arrow::kUnknownNullCount;
+    // The bits buffer is non-null for nullable values. At this point in the code we
+    // can't determine if the leaf array has the same null values as any parents it
+    // might have had, so we need to recompute the validity from the def levels.
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values, &null_count);
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ std::shared_ptr<Array> writeable_indices =
+ indices->Slice(value_offset, batch_num_spaced_values);
+ PARQUET_ASSIGN_OR_THROW(
+ writeable_indices,
+ MaybeReplaceValidity(writeable_indices, null_count, ctx->memory_pool));
+ dict_encoder->PutIndices(*writeable_indices);
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
+ value_offset += batch_num_spaced_values;
+ };
+
+ // Handle seeing dictionary for the first time
+ if (!preserved_dictionary_) {
+ // It's a new dictionary. Call PutDictionary and keep track of it
+ PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary));
+
+    // If there were duplicate values in the dictionary, the encoder's memo table
+    // will be out of sync with the indices in the Arrow array.
+    // The easiest solution for this uncommon case is to fall back to plain encoding.
+ if (dict_encoder->num_entries() != dictionary->length()) {
+ PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
+ return WriteDense();
+ }
+
+ // TODO(wesm): If some dictionary values are unobserved, then the
+ // statistics will be inaccurate. Do we care enough to fix it?
+ if (page_statistics_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary));
+ }
+ preserved_dictionary_ = dictionary;
+ } else if (!dictionary->Equals(*preserved_dictionary_)) {
+ // Dictionary has changed
+ PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
+ return WriteDense();
+ }
+
+ PARQUET_CATCH_NOT_OK(
+ DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk));
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Direct Arrow write path
+
+template <typename ParquetType, typename ArrowType, typename Enable = void>
+struct SerializeFunctor {
+ using ArrowCType = typename ArrowType::c_type;
+ using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+ using ParquetCType = typename ParquetType::c_type;
+ Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) {
+ const ArrowCType* input = array.raw_values();
+ if (array.null_count() > 0) {
+ for (int i = 0; i < array.length(); i++) {
+ out[i] = static_cast<ParquetCType>(input[i]);
+ }
+ } else {
+ std::copy(input, input + array.length(), out);
+ }
+ return Status::OK();
+ }
+};
+
+template <typename ParquetType, typename ArrowType>
+Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
+ bool maybe_parent_nulls) {
+ using ParquetCType = typename ParquetType::c_type;
+ using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+
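+  // Convert the values into scratch memory obtained from the write context,
+  // then hand the converted buffer to the typed writer below.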
+ ParquetCType* buffer = nullptr;
+ PARQUET_THROW_NOT_OK(ctx->GetScratchData<ParquetCType>(array.length(), &buffer));
+
+ SerializeFunctor<ParquetType, ArrowType> functor;
+ RETURN_NOT_OK(functor.Serialize(checked_cast<const ArrayType&>(array), ctx, buffer));
+ bool no_nulls =
+ writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
+ if (!maybe_parent_nulls && no_nulls) {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, buffer));
+ } else {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
+ array.null_bitmap_data(),
+ array.offset(), buffer));
+ }
+ return Status::OK();
+}
+
+template <typename ParquetType>
+Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
+ bool maybe_parent_nulls) {
+ using T = typename ParquetType::c_type;
+ const auto& data = static_cast<const ::arrow::PrimitiveArray&>(array);
+ const T* values = nullptr;
+ // The values buffer may be null if the array is empty (ARROW-2744)
+ if (data.values() != nullptr) {
+ values = reinterpret_cast<const T*>(data.values()->data()) + data.offset();
+ } else {
+ DCHECK_EQ(data.length(), 0);
+ }
+ bool no_nulls =
+ writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
+
+ if (!maybe_parent_nulls && no_nulls) {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, values));
+ } else {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
+ data.null_bitmap_data(), data.offset(),
+ values));
+ }
+ return Status::OK();
+}
+
+#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \
+ case ::arrow::Type::ArrowEnum: \
+ return WriteArrowSerialize<ParquetType, ::arrow::ArrowType>( \
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+
+#define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \
+ case ::arrow::Type::ArrowEnum: \
+ return WriteArrowZeroCopy<ParquetType>(array, num_levels, def_levels, rep_levels, \
+ ctx, this, maybe_parent_nulls);
+
+#define ARROW_UNSUPPORTED() \
+ std::stringstream ss; \
+ ss << "Arrow type " << array.type()->ToString() \
+ << " cannot be written to Parquet type " << descr_->ToString(); \
+ return Status::Invalid(ss.str());
+
+// ----------------------------------------------------------------------
+// Write Arrow to BooleanType
+
+template <>
+struct SerializeFunctor<BooleanType, ::arrow::BooleanType> {
+ Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*, bool* out) {
+ for (int i = 0; i < data.length(); i++) {
+ *out++ = data.Value(i);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+Status TypedColumnWriterImpl<BooleanType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::BOOL) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowSerialize<BooleanType, ::arrow::BooleanType>(
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow types to INT32
+
+template <>
+struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
+ Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) {
+ const int64_t* input = array.raw_values();
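+    // Date64 stores milliseconds since the UNIX epoch, while Parquet's DATE
+    // logical type stores days since the epoch, hence the division by
+    // 86400000 (milliseconds per day).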
+ for (int i = 0; i < array.length(); i++) {
+ *out++ = static_cast<int32_t>(*input++ / 86400000);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
+ Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) {
+ const int32_t* input = array.raw_values();
+ const auto& type = static_cast<const ::arrow::Time32Type&>(*array.type());
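+    // Parquet's 32-bit TIME has millisecond resolution, so second-resolution
+    // input is scaled up by 1000 and millisecond input is copied through
+    // unchanged.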
+ if (type.unit() == ::arrow::TimeUnit::SECOND) {
+ for (int i = 0; i < array.length(); i++) {
+ out[i] = input[i] * 1000;
+ }
+ } else {
+ std::copy(input, input + array.length(), out);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+Status TypedColumnWriterImpl<Int32Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ case ::arrow::Type::NA: {
+ PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr));
+ } break;
+ WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT8, UInt8Type, Int32Type)
+ WRITE_SERIALIZE_CASE(INT16, Int16Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT16, UInt16Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int32Type)
+ WRITE_ZERO_COPY_CASE(INT32, Int32Type, Int32Type)
+ WRITE_ZERO_COPY_CASE(DATE32, Date32Type, Int32Type)
+ WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type)
+ WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type)
+ default:
+ ARROW_UNSUPPORTED()
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to Int64 and Int96
+
+#define INT96_CONVERT_LOOP(ConversionFunction) \
+ for (int64_t i = 0; i < array.length(); i++) ConversionFunction(input[i], &out[i]);
+
+template <>
+struct SerializeFunctor<Int96Type, ::arrow::TimestampType> {
+ Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) {
+ const int64_t* input = array.raw_values();
+ const auto& type = static_cast<const ::arrow::TimestampType&>(*array.type());
+ switch (type.unit()) {
+ case ::arrow::TimeUnit::NANO:
+ INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::MICRO:
+ INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::MILLI:
+ INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp);
+ break;
+ }
+ return Status::OK();
+ }
+};
+
+#define COERCE_DIVIDE -1
+#define COERCE_INVALID 0
+#define COERCE_MULTIPLY +1
+
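+// Coercion matrix indexed as [source unit][target unit], following the
+// ::arrow::TimeUnit enum order: SECOND, MILLI, MICRO, NANO.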
+static std::pair<int, int64_t> kTimestampCoercionFactors[4][4] = {
+ // from seconds ...
+ {{COERCE_INVALID, 0}, // ... to seconds
+ {COERCE_MULTIPLY, 1000}, // ... to millis
+ {COERCE_MULTIPLY, 1000000}, // ... to micros
+ {COERCE_MULTIPLY, INT64_C(1000000000)}}, // ... to nanos
+ // from millis ...
+ {{COERCE_INVALID, 0},
+ {COERCE_MULTIPLY, 1},
+ {COERCE_MULTIPLY, 1000},
+ {COERCE_MULTIPLY, 1000000}},
+ // from micros ...
+ {{COERCE_INVALID, 0},
+ {COERCE_DIVIDE, 1000},
+ {COERCE_MULTIPLY, 1},
+ {COERCE_MULTIPLY, 1000}},
+ // from nanos ...
+ {{COERCE_INVALID, 0},
+ {COERCE_DIVIDE, 1000000},
+ {COERCE_DIVIDE, 1000},
+ {COERCE_MULTIPLY, 1}}};
+
+template <>
+struct SerializeFunctor<Int64Type, ::arrow::TimestampType> {
+ Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx,
+ int64_t* out) {
+ const auto& source_type = static_cast<const ::arrow::TimestampType&>(*array.type());
+ auto source_unit = source_type.unit();
+ const int64_t* values = array.raw_values();
+
+ ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit();
+ auto target_type = ::arrow::timestamp(target_unit);
+ bool truncation_allowed = ctx->properties->truncated_timestamps_allowed();
+
+ auto DivideBy = [&](const int64_t factor) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (!truncation_allowed && array.IsValid(i) && (values[i] % factor != 0)) {
+ return Status::Invalid("Casting from ", source_type.ToString(), " to ",
+ target_type->ToString(),
+ " would lose data: ", values[i]);
+ }
+ out[i] = values[i] / factor;
+ }
+ return Status::OK();
+ };
+
+ auto MultiplyBy = [&](const int64_t factor) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = values[i] * factor;
+ }
+ return Status::OK();
+ };
+
+ const auto& coercion = kTimestampCoercionFactors[static_cast<int>(source_unit)]
+ [static_cast<int>(target_unit)];
+
+ // .first -> coercion operation; .second -> scale factor
+ DCHECK_NE(coercion.first, COERCE_INVALID);
+ return coercion.first == COERCE_DIVIDE ? DivideBy(coercion.second)
+ : MultiplyBy(coercion.second);
+ }
+};
+
+#undef COERCE_DIVIDE
+#undef COERCE_INVALID
+#undef COERCE_MULTIPLY
+
+Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<Int64Type>* writer,
+ bool maybe_parent_nulls) {
+ const auto& source_type = static_cast<const ::arrow::TimestampType&>(*values.type());
+
+ auto WriteCoerce = [&](const ArrowWriterProperties* properties) {
+ ArrowWriteContext temp_ctx = *ctx;
+ temp_ctx.properties = properties;
+ return WriteArrowSerialize<Int64Type, ::arrow::TimestampType>(
+ values, num_levels, def_levels, rep_levels, &temp_ctx, writer,
+ maybe_parent_nulls);
+ };
+
+ if (ctx->properties->coerce_timestamps_enabled()) {
+ // User explicitly requested coercion to specific unit
+ if (source_type.unit() == ctx->properties->coerce_timestamps_unit()) {
+ // No data conversion necessary
+ return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels,
+ ctx, writer, maybe_parent_nulls);
+ } else {
+ return WriteCoerce(ctx->properties);
+ }
+ } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 &&
+ source_type.unit() == ::arrow::TimeUnit::NANO) {
+ // Absent superseding user instructions, when writing Parquet version 1.0 files,
+ // timestamps in nanoseconds are coerced to microseconds
+ std::shared_ptr<ArrowWriterProperties> properties =
+ (ArrowWriterProperties::Builder())
+ .coerce_timestamps(::arrow::TimeUnit::MICRO)
+ ->disallow_truncated_timestamps()
+ ->build();
+ return WriteCoerce(properties.get());
+ } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) {
+ // Absent superseding user instructions, timestamps in seconds are coerced to
+ // milliseconds
+ std::shared_ptr<ArrowWriterProperties> properties =
+ (ArrowWriterProperties::Builder())
+ .coerce_timestamps(::arrow::TimeUnit::MILLI)
+ ->build();
+ return WriteCoerce(properties.get());
+ } else {
+ // No data conversion necessary
+ return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels, ctx,
+ writer, maybe_parent_nulls);
+ }
+}
+
+template <>
+Status TypedColumnWriterImpl<Int64Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ case ::arrow::Type::TIMESTAMP:
+ return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this,
+ maybe_parent_nulls);
+ WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type)
+ WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type)
+ WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type)
+ WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type)
+ default:
+ ARROW_UNSUPPORTED();
+ }
+}
+
+template <>
+Status TypedColumnWriterImpl<Int96Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::TIMESTAMP) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowSerialize<Int96Type, ::arrow::TimestampType>(
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Floating point types
+
+template <>
+Status TypedColumnWriterImpl<FloatType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::FLOAT) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowZeroCopy<FloatType>(array, num_levels, def_levels, rep_levels, ctx,
+ this, maybe_parent_nulls);
+}
+
+template <>
+Status TypedColumnWriterImpl<DoubleType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::DOUBLE) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowZeroCopy<DoubleType>(array, num_levels, def_levels, rep_levels, ctx,
+ this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to BYTE_ARRAY
+
+template <>
+Status TypedColumnWriterImpl<ByteArrayType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (!::arrow::is_base_binary_like(array.type()->id())) {
+ ARROW_UNSUPPORTED();
+ }
+
+ int64_t value_offset = 0;
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count = 0;
+
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values, &null_count);
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ std::shared_ptr<Array> data_slice =
+ array.Slice(value_offset, batch_num_spaced_values);
+ PARQUET_ASSIGN_OR_THROW(
+ data_slice, MaybeReplaceValidity(data_slice, null_count, ctx->memory_pool));
+
+ current_encoder_->Put(*data_slice);
+ if (page_statistics_ != nullptr) {
+ page_statistics_->Update(*data_slice);
+ }
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
+ CheckDictionarySizeLimit();
+ value_offset += batch_num_spaced_values;
+ };
+
+ PARQUET_CATCH_NOT_OK(
+ DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk));
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to FIXED_LEN_BYTE_ARRAY
+
+template <typename ParquetType, typename ArrowType>
+struct SerializeFunctor<
+ ParquetType, ArrowType,
+ ::arrow::enable_if_t<::arrow::is_fixed_size_binary_type<ArrowType>::value &&
+ !::arrow::is_decimal_type<ArrowType>::value>> {
+ Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*,
+ FLBA* out) {
+ if (array.null_count() == 0) {
+      // No nulls, just dump the data
+      // TODO(advancedxy): use a WriteBatch to avoid this step
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = FixedLenByteArray(array.GetValue(i));
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (array.IsValid(i)) {
+ out[i] = FixedLenByteArray(array.GetValue(i));
+ }
+ }
+ }
+ return Status::OK();
+ }
+};
+
+// ----------------------------------------------------------------------
+// Write Arrow to Decimal128
+
+// Requires a custom serializer because decimals in Parquet are stored in
+// big-endian format. Thus, a temporary local buffer is required.
+template <typename ParquetType, typename ArrowType>
+struct SerializeFunctor<ParquetType, ArrowType, ::arrow::enable_if_decimal<ArrowType>> {
+ Status Serialize(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
+ ArrowWriteContext* ctx, FLBA* out) {
+ AllocateScratch(array, ctx);
+ auto offset = Offset(array);
+
+ if (array.null_count() == 0) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = FixDecimalEndianess<ArrowType::kByteWidth>(array.GetValue(i), offset);
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = array.IsValid(i) ? FixDecimalEndianess<ArrowType::kByteWidth>(
+ array.GetValue(i), offset)
+ : FixedLenByteArray();
+ }
+ }
+
+ return Status::OK();
+ }
+
+  // Parquet decimals are stored as fixed-length byte arrays whose length is
+  // proportional to the precision, whereas Arrow decimals always occupy 16 or
+  // 32 bytes. Thus the internal FLBA pointer must be adjusted by the offset
+  // calculated here.
+ int32_t Offset(const Array& array) {
+ auto decimal_type = checked_pointer_cast<::arrow::DecimalType>(array.type());
+ return decimal_type->byte_width() -
+ ::arrow::DecimalType::DecimalSize(decimal_type->precision());
+ }
+
+ void AllocateScratch(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
+ ArrowWriteContext* ctx) {
+ int64_t non_null_count = array.length() - array.null_count();
+ int64_t size = non_null_count * ArrowType::kByteWidth;
+ scratch_buffer = AllocateBuffer(ctx->memory_pool, size);
+ scratch = reinterpret_cast<int64_t*>(scratch_buffer->mutable_data());
+ }
+
+ template <int byte_width>
+ FixedLenByteArray FixDecimalEndianess(const uint8_t* in, int64_t offset) {
+ const auto* u64_in = reinterpret_cast<const int64_t*>(in);
+ auto out = reinterpret_cast<const uint8_t*>(scratch) + offset;
+ static_assert(byte_width == 16 || byte_width == 32,
+ "only 16 and 32 byte Decimals supported");
+ if (byte_width == 32) {
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[3]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[2]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
+ } else {
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
+ }
+ return FixedLenByteArray(out);
+ }
+
+ std::shared_ptr<ResizableBuffer> scratch_buffer;
+ int64_t* scratch;
+};
+
+template <>
+Status TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
+ WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
+ WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
+ default:
+ break;
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Dynamic column writer constructor
+
+std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager,
+ const WriterProperties* properties) {
+ const ColumnDescriptor* descr = metadata->descr();
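+  // The Parquet format does not support dictionary encoding for BOOLEAN,
+  // so it is disabled here regardless of the writer properties.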
+ const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
+ descr->physical_type() != Type::BOOLEAN;
+ Encoding::type encoding = properties->encoding(descr->path());
+ if (use_dictionary) {
+ encoding = properties->dictionary_index_encoding();
+ }
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedColumnWriterImpl<BooleanType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT32:
+ return std::make_shared<TypedColumnWriterImpl<Int32Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT64:
+ return std::make_shared<TypedColumnWriterImpl<Int64Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT96:
+ return std::make_shared<TypedColumnWriterImpl<Int96Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::FLOAT:
+ return std::make_shared<TypedColumnWriterImpl<FloatType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::DOUBLE:
+ return std::make_shared<TypedColumnWriterImpl<DoubleType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedColumnWriterImpl<ByteArrayType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedColumnWriterImpl<FLBAType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ default:
+      ParquetException::NYI("type writer not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<ColumnWriter>(nullptr);
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
index 0a609021739..6661385abdb 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
@@ -1,270 +1,270 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <cstring>
-#include <memory>
-
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-
-namespace BitUtil {
-class BitWriter;
-} // namespace BitUtil
-
-namespace util {
-class RleEncoder;
-} // namespace util
-
-} // namespace arrow
-
-namespace parquet {
-
-struct ArrowWriteContext;
-class ColumnDescriptor;
-class DataPage;
-class DictionaryPage;
-class ColumnChunkMetaDataBuilder;
-class Encryptor;
-class WriterProperties;
-
-class PARQUET_EXPORT LevelEncoder {
- public:
- LevelEncoder();
- ~LevelEncoder();
-
- static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
- int num_buffered_values);
-
- // Initialize the LevelEncoder.
- void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
- uint8_t* data, int data_size);
-
- // Encodes a batch of levels from an array and returns the number of levels encoded
- int Encode(int batch_size, const int16_t* levels);
-
- int32_t len() {
- if (encoding_ != Encoding::RLE) {
- throw ParquetException("Only implemented for RLE encoding");
- }
- return rle_length_;
- }
-
- private:
- int bit_width_;
- int rle_length_;
- Encoding::type encoding_;
- std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
- std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_;
-};
-
-class PARQUET_EXPORT PageWriter {
- public:
- virtual ~PageWriter() {}
-
- static std::unique_ptr<PageWriter> Open(
- std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
- bool buffered_row_group = false,
- std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
- std::shared_ptr<Encryptor> data_encryptor = NULLPTR);
-
- // On Close, the ColumnWriter reports whether dictionary encoding was used
- // and whether it has fallen back to the default encoding on reaching the
- // dictionary page limit
- virtual void Close(bool has_dictionary, bool fallback) = 0;
-
- // Return the number of uncompressed bytes written (including header size)
- virtual int64_t WriteDataPage(const DataPage& page) = 0;
-
- // Return the number of uncompressed bytes written (including header size)
- virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
-
- virtual bool has_compressor() = 0;
-
- virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
-};
-
-static constexpr int WRITE_BATCH_SIZE = 1000;
-class PARQUET_EXPORT ColumnWriter {
- public:
- virtual ~ColumnWriter() = default;
-
- static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
- std::unique_ptr<PageWriter>,
- const WriterProperties* properties);
-
- /// \brief Closes the ColumnWriter, commits any buffered values to pages.
- /// \return Total size of the column in bytes
- virtual int64_t Close() = 0;
-
- /// \brief The physical Parquet type of the column
- virtual Type::type type() const = 0;
-
- /// \brief The schema for the column
- virtual const ColumnDescriptor* descr() const = 0;
-
- /// \brief The number of rows written so far
- virtual int64_t rows_written() const = 0;
-
- /// \brief The total size of the compressed pages + page headers. Some values
- /// might still be buffered and not yet written to a page
- virtual int64_t total_compressed_bytes() const = 0;
-
- /// \brief The total number of bytes written as serialized data and
- /// dictionary pages to the ColumnChunk so far
- virtual int64_t total_bytes_written() const = 0;
-
- /// \brief The file-level writer properties
- virtual const WriterProperties* properties() = 0;
-
- /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
- /// error status if the array data type is not compatible with the concrete
- /// writer type.
- ///
- /// leaf_array is always a primitive (possibly dictionary-encoded) array.
- /// leaf_field_nullable indicates whether the leaf array is considered nullable
- /// according to its schema in a Table or its parent array.
- virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& leaf_array,
- ArrowWriteContext* ctx,
- bool leaf_field_nullable) = 0;
-};
-
-// API to write values to a single column. This is the main client facing API.
-template <typename DType>
-class TypedColumnWriter : public ColumnWriter {
- public:
- using T = typename DType::c_type;
-
- // Write a batch of repetition levels, definition levels, and values to the
- // column.
- // `num_values` is the number of logical leaf values.
- // `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
- // (resp. max repetition level) is 0.
- // If not null, each of `def_levels` and `rep_levels` must have at least
- // `num_values`.
- //
- // The number of physical values written (taken from `values`) is returned.
- // It can be smaller than `num_values` if there are some undefined values.
- virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const T* values) = 0;
-
- /// Write a batch of repetition levels, definition levels, and values to the
- /// column.
- ///
- /// In comparison to WriteBatch, for max_definition_level == 1 the length of
- /// the repetition and definition levels equals the number of values read.
- /// In the case of max_definition_level > 1, the repetition and definition
- /// levels are longer than the values, but the values include the null entries
- /// with definition_level == (max_definition_level - 1). Thus the parameters of
- /// this function must distinguish whether the input has the length of num_values
- /// or the _number of rows in the lowest nesting level_.
- ///
- /// If the innermost Parquet schema node is required, the _number of rows in
- /// the lowest nesting level_ is equal to the number of non-null values. If the
- /// innermost schema node is optional, the _number of rows in the lowest nesting
- /// level_ also includes all values with definition_level == (max_definition_level - 1).
- ///
- /// @param num_values number of levels to write.
- /// @param def_levels The Parquet definition levels, length is num_values
- /// @param rep_levels The Parquet repetition levels, length is num_values
- /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
- /// level. The length is number of rows in the lowest nesting level.
- /// @param valid_bits_offset The offset in bits of the valid_bits where the
- /// first relevant bit resides.
- /// @param values The values in the lowest nested level including
- /// spacing for nulls on the lowest levels; input has the length
- /// of the number of rows on the lowest nesting level.
- virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const uint8_t* valid_bits,
- int64_t valid_bits_offset, const T* values) = 0;
-
- // Estimated size of the values that are not written to a page yet
- virtual int64_t EstimatedBufferedValueBytes() const = 0;
-};
-
-using BoolWriter = TypedColumnWriter<BooleanType>;
-using Int32Writer = TypedColumnWriter<Int32Type>;
-using Int64Writer = TypedColumnWriter<Int64Type>;
-using Int96Writer = TypedColumnWriter<Int96Type>;
-using FloatWriter = TypedColumnWriter<FloatType>;
-using DoubleWriter = TypedColumnWriter<DoubleType>;
-using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
-using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
-
-namespace internal {
-
-/**
- * Timestamp conversion constants
- */
-constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
-
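-// Impala-style INT96 timestamp layout: the first 8 bytes hold the
-// nanoseconds elapsed within the day, and value[2] holds the Julian day.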
-template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
-inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
- int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
- (*impala_timestamp).value[2] = (uint32_t)julian_days;
-
- int64_t last_day_units = time % UnitPerDay;
- auto last_day_nanos = last_day_units * NanosecondsPerUnit;
- // impala_timestamp will be unaligned every other entry so do memcpy instead
- // of assign and reinterpret cast to avoid undefined behavior.
- std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
-}
-
-constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
-
-inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
- impala_timestamp);
-}
-
-constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
-
-inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
- Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
- milliseconds, impala_timestamp);
-}
-
-constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
-
-inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
- Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
- microseconds, impala_timestamp);
-}
-
-constexpr int64_t kNanosecondsInNanos = INT64_C(1);
-
-inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
- Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
- nanoseconds, impala_timestamp);
-}
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+
+namespace BitUtil {
+class BitWriter;
+} // namespace BitUtil
+
+namespace util {
+class RleEncoder;
+} // namespace util
+
+} // namespace arrow
+
+namespace parquet {
+
+struct ArrowWriteContext;
+class ColumnDescriptor;
+class DataPage;
+class DictionaryPage;
+class ColumnChunkMetaDataBuilder;
+class Encryptor;
+class WriterProperties;
+
+class PARQUET_EXPORT LevelEncoder {
+ public:
+ LevelEncoder();
+ ~LevelEncoder();
+
+ static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values);
+
+ // Initialize the LevelEncoder.
+ void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+ uint8_t* data, int data_size);
+
+ // Encodes a batch of levels from an array and returns the number of levels encoded
+ int Encode(int batch_size, const int16_t* levels);
+
+ int32_t len() {
+ if (encoding_ != Encoding::RLE) {
+ throw ParquetException("Only implemented for RLE encoding");
+ }
+ return rle_length_;
+ }
+
+ private:
+ int bit_width_;
+ int rle_length_;
+ Encoding::type encoding_;
+ std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
+ std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_;
+};
+
+class PARQUET_EXPORT PageWriter {
+ public:
+ virtual ~PageWriter() {}
+
+ static std::unique_ptr<PageWriter> Open(
+ std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ bool buffered_row_group = false,
+ std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
+ std::shared_ptr<Encryptor> data_encryptor = NULLPTR);
+
+ // On Close, the ColumnWriter reports whether dictionary encoding was used
+ // and whether it has fallen back to the default encoding on reaching the
+ // dictionary page limit
+ virtual void Close(bool has_dictionary, bool fallback) = 0;
+
+ // Return the number of uncompressed bytes written (including header size)
+ virtual int64_t WriteDataPage(const DataPage& page) = 0;
+
+ // Return the number of uncompressed bytes written (including header size)
+ virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
+
+ virtual bool has_compressor() = 0;
+
+ virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
+};
+
+static constexpr int WRITE_BATCH_SIZE = 1000;
+class PARQUET_EXPORT ColumnWriter {
+ public:
+ virtual ~ColumnWriter() = default;
+
+ static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
+ std::unique_ptr<PageWriter>,
+ const WriterProperties* properties);
+
+ /// \brief Closes the ColumnWriter, commits any buffered values to pages.
+ /// \return Total size of the column in bytes
+ virtual int64_t Close() = 0;
+
+ /// \brief The physical Parquet type of the column
+ virtual Type::type type() const = 0;
+
+ /// \brief The schema for the column
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ /// \brief The number of rows written so far
+ virtual int64_t rows_written() const = 0;
+
+ /// \brief The total size of the compressed pages + page headers. Some values
+ /// might still be buffered and not yet written to a page
+ virtual int64_t total_compressed_bytes() const = 0;
+
+ /// \brief The total number of bytes written as serialized data and
+ /// dictionary pages to the ColumnChunk so far
+ virtual int64_t total_bytes_written() const = 0;
+
+ /// \brief The file-level writer properties
+ virtual const WriterProperties* properties() = 0;
+
+ /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
+ /// error status if the array data type is not compatible with the concrete
+ /// writer type.
+ ///
+ /// leaf_array is always a primitive (possibly dictionary-encoded) array.
+ /// leaf_field_nullable indicates whether the leaf array is considered nullable
+ /// according to its schema in a Table or its parent array.
+ virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& leaf_array,
+ ArrowWriteContext* ctx,
+ bool leaf_field_nullable) = 0;
+};
+
+// API to write values to a single column. This is the main client facing API.
+template <typename DType>
+class TypedColumnWriter : public ColumnWriter {
+ public:
+ using T = typename DType::c_type;
+
+ // Write a batch of repetition levels, definition levels, and values to the
+ // column.
+ // `num_values` is the number of logical leaf values.
+ // `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
+ // (resp. max repetition level) is 0.
+ // If not null, each of `def_levels` and `rep_levels` must have at least
+ // `num_values`.
+ //
+ // The number of physical values written (taken from `values`) is returned.
+ // It can be smaller than `num_values` if there are some undefined values.
+ virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const T* values) = 0;
+
+ /// Write a batch of repetition levels, definition levels, and values to the
+ /// column.
+ ///
+ /// In comparison to WriteBatch, for max_definition_level == 1 the length of
+ /// the repetition and definition levels equals the number of values read.
+ /// In the case of max_definition_level > 1, the repetition and definition
+ /// levels are longer than the values, but the values include the null entries
+ /// with definition_level == (max_definition_level - 1). Thus the parameters of
+ /// this function must distinguish whether the input has the length of num_values
+ /// or the _number of rows in the lowest nesting level_.
+ ///
+ /// If the innermost Parquet schema node is required, the _number of rows in
+ /// the lowest nesting level_ is equal to the number of non-null values. If the
+ /// innermost schema node is optional, the _number of rows in the lowest nesting
+ /// level_ also includes all values with definition_level == (max_definition_level - 1).
+ ///
+ /// @param num_values number of levels to write.
+ /// @param def_levels The Parquet definition levels, length is num_values
+ /// @param rep_levels The Parquet repetition levels, length is num_values
+ /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
+ /// level. The length is number of rows in the lowest nesting level.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; input has the length
+ /// of the number of rows on the lowest nesting level.
+ virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, const T* values) = 0;
+
+ // Estimated size of the values that are not written to a page yet
+ virtual int64_t EstimatedBufferedValueBytes() const = 0;
+};
+
+using BoolWriter = TypedColumnWriter<BooleanType>;
+using Int32Writer = TypedColumnWriter<Int32Type>;
+using Int64Writer = TypedColumnWriter<Int64Type>;
+using Int96Writer = TypedColumnWriter<Int96Type>;
+using FloatWriter = TypedColumnWriter<FloatType>;
+using DoubleWriter = TypedColumnWriter<DoubleType>;
+using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
+using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
+
+namespace internal {
+
+/**
+ * Timestamp conversion constants
+ */
+constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
+
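+// Impala-style INT96 timestamp layout: the first 8 bytes hold the
+// nanoseconds elapsed within the day, and value[2] holds the Julian day.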
+template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
+inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
+ int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
+ (*impala_timestamp).value[2] = (uint32_t)julian_days;
+
+ int64_t last_day_units = time % UnitPerDay;
+ auto last_day_nanos = last_day_units * NanosecondsPerUnit;
+ // impala_timestamp will be unaligned every other entry so do memcpy instead
+ // of assign and reinterpret cast to avoid undefined behavior.
+ std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
+}
+
+constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
+
+inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
+ impala_timestamp);
+}
+
+constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
+
+inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
+ milliseconds, impala_timestamp);
+}
+
+constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
+
+inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
+ microseconds, impala_timestamp);
+}
+
+constexpr int64_t kNanosecondsInNanos = INT64_C(1);
+
+inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
+ nanoseconds, impala_timestamp);
+}
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
index 6e8f7ee5491..3b615af706d 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
@@ -1,2547 +1,2547 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encoding.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstdlib>
-#include <limits>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/array/builder_dict.h"
-#include "arrow/stl_allocator.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_writer.h"
-#include "arrow/util/byte_stream_split.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/hashing.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/rle_encoding.h"
-#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
-
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace BitUtil = arrow::BitUtil;
-
-using arrow::Status;
-using arrow::VisitNullBitmapInline;
-using arrow::internal::checked_cast;
-
-template <typename T>
-using ArrowPoolVector = std::vector<T, ::arrow::stl::allocator<T>>;
-
-namespace parquet {
-namespace {
-
-constexpr int64_t kInMemoryDefaultCapacity = 1024;
-// The Parquet spec isn't very clear whether ByteArray lengths are signed or
-// unsigned, but the Java implementation uses signed ints.
-constexpr size_t kMaxByteArraySize = std::numeric_limits<int32_t>::max();
-
-class EncoderImpl : virtual public Encoder {
- public:
- EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, MemoryPool* pool)
- : descr_(descr),
- encoding_(encoding),
- pool_(pool),
- type_length_(descr ? descr->type_length() : -1) {}
-
- Encoding::type encoding() const override { return encoding_; }
-
- MemoryPool* memory_pool() const override { return pool_; }
-
- protected:
- // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
- const ColumnDescriptor* descr_;
- const Encoding::type encoding_;
- MemoryPool* pool_;
-
- /// Type length from descr
- int type_length_;
-};
-
-// ----------------------------------------------------------------------
-// Plain encoder implementation
-
-template <typename DType>
-class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
- public:
- using T = typename DType::c_type;
-
- explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
- : EncoderImpl(descr, Encoding::PLAIN, pool), sink_(pool) {}
-
- int64_t EstimatedDataEncodedSize() override { return sink_.length(); }
-
- std::shared_ptr<Buffer> FlushValues() override {
- std::shared_ptr<Buffer> buffer;
- PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
- return buffer;
- }
-
- using TypedEncoder<DType>::Put;
-
- void Put(const T* buffer, int num_values) override;
-
- void Put(const ::arrow::Array& values) override;
-
- void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- if (valid_bits != NULLPTR) {
- PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
- this->memory_pool()));
- T* data = reinterpret_cast<T*>(buffer->mutable_data());
- int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
- src, num_values, valid_bits, valid_bits_offset, data);
- Put(data, num_valid_values);
- } else {
- Put(src, num_values);
- }
- }
-
- void UnsafePutByteArray(const void* data, uint32_t length) {
- DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL";
- sink_.UnsafeAppend(&length, sizeof(uint32_t));
- sink_.UnsafeAppend(data, static_cast<int64_t>(length));
- }
-
- void Put(const ByteArray& val) {
- // Write the result to the output stream
- const int64_t increment = static_cast<int64_t>(val.len + sizeof(uint32_t));
- if (ARROW_PREDICT_FALSE(sink_.length() + increment > sink_.capacity())) {
- PARQUET_THROW_NOT_OK(sink_.Reserve(increment));
- }
- UnsafePutByteArray(val.ptr, val.len);
- }
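-
-  // Informative sketch (not part of the original source): PLAIN-encoded
-  // BYTE_ARRAY values are framed as a 4-byte length in native byte order
-  // (little-endian on the platforms Parquet targets) followed by the raw
-  // bytes, which is exactly what UnsafePutByteArray() appends. Encoding
-  // "ab" and then "c" therefore produces:
-  //
-  //   02 00 00 00 61 62 01 00 00 00 63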
-
- protected:
- template <typename ArrayType>
- void PutBinaryArray(const ArrayType& array) {
- const int64_t total_bytes =
- array.value_offset(array.length()) - array.value_offset(0);
- PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes + array.length() * sizeof(uint32_t)));
-
- PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
- *array.data(),
- [&](::arrow::util::string_view view) {
- if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
- }
- UnsafePutByteArray(view.data(), static_cast<uint32_t>(view.size()));
- return Status::OK();
- },
- []() { return Status::OK(); }));
- }
-
- ::arrow::BufferBuilder sink_;
-};
-
-template <typename DType>
-void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
- if (num_values > 0) {
- PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
- }
-}
-
-template <>
-inline void PlainEncoder<ByteArrayType>::Put(const ByteArray* src, int num_values) {
- for (int i = 0; i < num_values; ++i) {
- Put(src[i]);
- }
-}
-
-template <typename ArrayType>
-void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) {
- if (values.type_id() != ArrayType::TypeClass::type_id) {
- std::string type_name = ArrayType::TypeClass::type_name();
- throw ParquetException("direct put to " + type_name + " from " +
- values.type()->ToString() + " not supported");
- }
-
- using value_type = typename ArrayType::value_type;
- constexpr auto value_size = sizeof(value_type);
- auto raw_values = checked_cast<const ArrayType&>(values).raw_values();
-
- if (values.null_count() == 0) {
- // no nulls, just dump the data
- PARQUET_THROW_NOT_OK(sink->Append(raw_values, values.length() * value_size));
- } else {
- PARQUET_THROW_NOT_OK(
- sink->Reserve((values.length() - values.null_count()) * value_size));
-
- for (int64_t i = 0; i < values.length(); i++) {
- if (values.IsValid(i)) {
- sink->UnsafeAppend(&raw_values[i], value_size);
- }
- }
- }
-}
-
-template <>
-void PlainEncoder<Int32Type>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::Int32Array>(values, &sink_);
-}
-
-template <>
-void PlainEncoder<Int64Type>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::Int64Array>(values, &sink_);
-}
-
-template <>
-void PlainEncoder<Int96Type>::Put(const ::arrow::Array& values) {
- ParquetException::NYI("direct put to Int96");
-}
-
-template <>
-void PlainEncoder<FloatType>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::FloatArray>(values, &sink_);
-}
-
-template <>
-void PlainEncoder<DoubleType>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::DoubleArray>(values, &sink_);
-}
-
-template <typename DType>
-void PlainEncoder<DType>::Put(const ::arrow::Array& values) {
- ParquetException::NYI("direct put of " + values.type()->ToString());
-}
-
-void AssertBaseBinary(const ::arrow::Array& values) {
- if (!::arrow::is_base_binary_like(values.type_id())) {
- throw ParquetException("Only BaseBinaryArray and subclasses supported");
- }
-}
-
-template <>
-inline void PlainEncoder<ByteArrayType>::Put(const ::arrow::Array& values) {
- AssertBaseBinary(values);
-
- if (::arrow::is_binary_like(values.type_id())) {
- PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
- }
-}
-
-void AssertFixedSizeBinary(const ::arrow::Array& values, int type_length) {
- if (values.type_id() != ::arrow::Type::FIXED_SIZE_BINARY &&
- values.type_id() != ::arrow::Type::DECIMAL) {
- throw ParquetException("Only FixedSizeBinaryArray and subclasses supported");
- }
- if (checked_cast<const ::arrow::FixedSizeBinaryType&>(*values.type()).byte_width() !=
- type_length) {
- throw ParquetException("Size mismatch: " + values.type()->ToString() +
- " should have been " + std::to_string(type_length) + " wide");
- }
-}
-
-template <>
-inline void PlainEncoder<FLBAType>::Put(const ::arrow::Array& values) {
- AssertFixedSizeBinary(values, descr_->type_length());
- const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
-
- if (data.null_count() == 0) {
- // no nulls, just dump the data
- PARQUET_THROW_NOT_OK(
- sink_.Append(data.raw_values(), data.length() * data.byte_width()));
- } else {
- const int64_t total_bytes =
- data.length() * data.byte_width() - data.null_count() * data.byte_width();
- PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes));
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- sink_.UnsafeAppend(data.Value(i), data.byte_width());
- }
- }
- }
-}
-
-template <>
-inline void PlainEncoder<FLBAType>::Put(const FixedLenByteArray* src, int num_values) {
- if (descr_->type_length() == 0) {
- return;
- }
- for (int i = 0; i < num_values; ++i) {
- // Write the result to the output stream
- DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL";
- PARQUET_THROW_NOT_OK(sink_.Append(src[i].ptr, descr_->type_length()));
- }
-}
-
-template <>
-class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEncoder {
- public:
- explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
- : EncoderImpl(descr, Encoding::PLAIN, pool),
- bits_available_(kInMemoryDefaultCapacity * 8),
- bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
- sink_(pool),
- bit_writer_(bits_buffer_->mutable_data(),
- static_cast<int>(bits_buffer_->size())) {}
-
- int64_t EstimatedDataEncodedSize() override;
- std::shared_ptr<Buffer> FlushValues() override;
-
- void Put(const bool* src, int num_values) override;
-
- void Put(const std::vector<bool>& src, int num_values) override;
-
- void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- if (valid_bits != NULLPTR) {
- PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
- this->memory_pool()));
- T* data = reinterpret_cast<T*>(buffer->mutable_data());
- int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
- src, num_values, valid_bits, valid_bits_offset, data);
- Put(data, num_valid_values);
- } else {
- Put(src, num_values);
- }
- }
-
- void Put(const ::arrow::Array& values) override {
- if (values.type_id() != ::arrow::Type::BOOL) {
- throw ParquetException("direct put to boolean from " + values.type()->ToString() +
- " not supported");
- }
-
- const auto& data = checked_cast<const ::arrow::BooleanArray&>(values);
- if (data.null_count() == 0) {
- PARQUET_THROW_NOT_OK(sink_.Reserve(BitUtil::BytesForBits(data.length())));
- // no nulls, just dump the data
- ::arrow::internal::CopyBitmap(data.data()->GetValues<uint8_t>(1), data.offset(),
- data.length(), sink_.mutable_data(), sink_.length());
- } else {
- auto n_valid = BitUtil::BytesForBits(data.length() - data.null_count());
- PARQUET_THROW_NOT_OK(sink_.Reserve(n_valid));
- ::arrow::internal::FirstTimeBitmapWriter writer(sink_.mutable_data(),
- sink_.length(), n_valid);
-
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- if (data.Value(i)) {
- writer.Set();
- } else {
- writer.Clear();
- }
- writer.Next();
- }
- }
- writer.Finish();
- }
- sink_.UnsafeAdvance(data.length());
- }
-
- private:
- int bits_available_;
- std::shared_ptr<ResizableBuffer> bits_buffer_;
- ::arrow::BufferBuilder sink_;
- ::arrow::BitUtil::BitWriter bit_writer_;
-
- template <typename SequenceType>
- void PutImpl(const SequenceType& src, int num_values);
-};
-
-template <typename SequenceType>
-void PlainEncoder<BooleanType>::PutImpl(const SequenceType& src, int num_values) {
- int bit_offset = 0;
- if (bits_available_ > 0) {
- int bits_to_write = std::min(bits_available_, num_values);
- for (int i = 0; i < bits_to_write; i++) {
- bit_writer_.PutValue(src[i], 1);
- }
- bits_available_ -= bits_to_write;
- bit_offset = bits_to_write;
-
- if (bits_available_ == 0) {
- bit_writer_.Flush();
- PARQUET_THROW_NOT_OK(
- sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
- bit_writer_.Clear();
- }
- }
-
- int bits_remaining = num_values - bit_offset;
- while (bit_offset < num_values) {
- bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
-
- int bits_to_write = std::min(bits_available_, bits_remaining);
- for (int i = bit_offset; i < bit_offset + bits_to_write; i++) {
- bit_writer_.PutValue(src[i], 1);
- }
- bit_offset += bits_to_write;
- bits_available_ -= bits_to_write;
- bits_remaining -= bits_to_write;
-
- if (bits_available_ == 0) {
- bit_writer_.Flush();
- PARQUET_THROW_NOT_OK(
- sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
- bit_writer_.Clear();
- }
- }
-}
-
-int64_t PlainEncoder<BooleanType>::EstimatedDataEncodedSize() {
- int64_t position = sink_.length();
- return position + bit_writer_.bytes_written();
-}
-
-std::shared_ptr<Buffer> PlainEncoder<BooleanType>::FlushValues() {
- if (bits_available_ > 0) {
- bit_writer_.Flush();
- PARQUET_THROW_NOT_OK(sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
- bit_writer_.Clear();
- bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
- }
-
- std::shared_ptr<Buffer> buffer;
- PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
- return buffer;
-}
-
-void PlainEncoder<BooleanType>::Put(const bool* src, int num_values) {
- PutImpl(src, num_values);
-}
-
-void PlainEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
- PutImpl(src, num_values);
-}
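-
-// Informative sketch (not part of the original source): PLAIN booleans are
-// bit-packed LSB-first by BitWriter::PutValue(), one bit per value, so the
-// four values {true, false, true, true} occupy a single byte:
-//
-//   bit 0 = 1, bit 1 = 0, bit 2 = 1, bit 3 = 1  ->  0b00001101 = 0x0D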
-
-// ----------------------------------------------------------------------
-// DictEncoder<T> implementations
-
-template <typename DType>
-struct DictEncoderTraits {
- using c_type = typename DType::c_type;
- using MemoTableType = ::arrow::internal::ScalarMemoTable<c_type>;
-};
-
-template <>
-struct DictEncoderTraits<ByteArrayType> {
- using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
-};
-
-template <>
-struct DictEncoderTraits<FLBAType> {
- using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
-};
-
-// Initially 1024 elements
-static constexpr int32_t kInitialHashTableSize = 1 << 10;
-
-/// See the dictionary encoding section of
-/// https://github.com/Parquet/parquet-format. The encoding supports
-/// streaming encoding. Values are encoded as they are added while the
-/// dictionary is being constructed. At any time, the buffered values
-/// can be written out with the current dictionary size. More values
-/// can then be added to the encoder, including new dictionary
-/// entries.
-template <typename DType>
-class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
- using MemoTableType = typename DictEncoderTraits<DType>::MemoTableType;
-
- public:
- typedef typename DType::c_type T;
-
- explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool)
- : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool),
- buffered_indices_(::arrow::stl::allocator<int32_t>(pool)),
- dict_encoded_size_(0),
- memo_table_(pool, kInitialHashTableSize) {}
-
- ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); }
-
- int dict_encoded_size() override { return dict_encoded_size_; }
-
- int WriteIndices(uint8_t* buffer, int buffer_len) override {
- // Write bit width in first byte
- *buffer = static_cast<uint8_t>(bit_width());
- ++buffer;
- --buffer_len;
-
- ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
-
- for (int32_t index : buffered_indices_) {
- if (!encoder.Put(index)) return -1;
- }
- encoder.Flush();
-
- ClearIndices();
- return 1 + encoder.len();
- }
-
- void set_type_length(int type_length) { this->type_length_ = type_length; }
-
- /// Returns a conservative estimate of the number of bytes needed to encode the buffered
- /// indices. Used to size the buffer passed to WriteIndices().
- int64_t EstimatedDataEncodedSize() override {
-    // Note: because of the way RleEncoder::CheckBufferFull() is called, we
-    // have to reserve an extra "RleEncoder::MinBufferSize" bytes. These
-    // extra bytes won't be used but not reserving them would cause the
-    // encoder to fail.
- return 1 +
- ::arrow::util::RleEncoder::MaxBufferSize(
- bit_width(), static_cast<int>(buffered_indices_.size())) +
- ::arrow::util::RleEncoder::MinBufferSize(bit_width());
- }
-
- /// The minimum bit width required to encode the currently buffered indices.
- int bit_width() const override {
- if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0;
- if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1;
- return BitUtil::Log2(num_entries());
- }
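-
-  // For example (informative, not in the original source): a dictionary with
-  // five entries uses indices 0..4, so bit_width() returns
-  // BitUtil::Log2(5) == 3 bits per buffered index.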
-
- /// Encode value. Note that this does not actually write any data, just
- /// buffers the value's index to be written later.
- inline void Put(const T& value);
-
- // Not implemented for other data types
- inline void PutByteArray(const void* ptr, int32_t length);
-
- void Put(const T* src, int num_values) override {
- for (int32_t i = 0; i < num_values; i++) {
- Put(src[i]);
- }
- }
-
- void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- ::arrow::internal::VisitSetBitRunsVoid(valid_bits, valid_bits_offset, num_values,
- [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; i++) {
- Put(src[i + position]);
- }
- });
- }
-
- using TypedEncoder<DType>::Put;
-
- void Put(const ::arrow::Array& values) override;
- void PutDictionary(const ::arrow::Array& values) override;
-
- template <typename ArrowType, typename T = typename ArrowType::c_type>
- void PutIndicesTyped(const ::arrow::Array& data) {
- auto values = data.data()->GetValues<T>(1);
- size_t buffer_position = buffered_indices_.size();
- buffered_indices_.resize(buffer_position +
- static_cast<size_t>(data.length() - data.null_count()));
- ::arrow::internal::VisitSetBitRunsVoid(
- data.null_bitmap_data(), data.offset(), data.length(),
- [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; ++i) {
- buffered_indices_[buffer_position++] =
- static_cast<int32_t>(values[i + position]);
- }
- });
- }
-
- void PutIndices(const ::arrow::Array& data) override {
- switch (data.type()->id()) {
- case ::arrow::Type::UINT8:
- case ::arrow::Type::INT8:
- return PutIndicesTyped<::arrow::UInt8Type>(data);
- case ::arrow::Type::UINT16:
- case ::arrow::Type::INT16:
- return PutIndicesTyped<::arrow::UInt16Type>(data);
- case ::arrow::Type::UINT32:
- case ::arrow::Type::INT32:
- return PutIndicesTyped<::arrow::UInt32Type>(data);
- case ::arrow::Type::UINT64:
- case ::arrow::Type::INT64:
- return PutIndicesTyped<::arrow::UInt64Type>(data);
- default:
- throw ParquetException("Passed non-integer array to PutIndices");
- }
- }
-
- std::shared_ptr<Buffer> FlushValues() override {
- std::shared_ptr<ResizableBuffer> buffer =
- AllocateBuffer(this->pool_, EstimatedDataEncodedSize());
- int result_size = WriteIndices(buffer->mutable_data(),
- static_cast<int>(EstimatedDataEncodedSize()));
- PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false));
- return std::move(buffer);
- }
-
- /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
- /// dict_encoded_size() bytes.
- void WriteDict(uint8_t* buffer) override;
-
- /// The number of entries in the dictionary.
- int num_entries() const override { return memo_table_.size(); }
-
- private:
- /// Clears all the indices (but leaves the dictionary).
- void ClearIndices() { buffered_indices_.clear(); }
-
-  /// Indices that have not yet been written out by WriteIndices().
- ArrowPoolVector<int32_t> buffered_indices_;
-
- template <typename ArrayType>
- void PutBinaryArray(const ArrayType& array) {
- PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
- *array.data(),
- [&](::arrow::util::string_view view) {
- if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
- }
- PutByteArray(view.data(), static_cast<uint32_t>(view.size()));
- return Status::OK();
- },
- []() { return Status::OK(); }));
- }
-
- template <typename ArrayType>
- void PutBinaryDictionaryArray(const ArrayType& array) {
- DCHECK_EQ(array.null_count(), 0);
- for (int64_t i = 0; i < array.length(); i++) {
- auto v = array.GetView(i);
- if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) {
- throw ParquetException("Parquet cannot store strings with size 2GB or more");
- }
- dict_encoded_size_ += static_cast<int>(v.size() + sizeof(uint32_t));
- int32_t unused_memo_index;
- PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(
- v.data(), static_cast<int32_t>(v.size()), &unused_memo_index));
- }
- }
-
- /// The number of bytes needed to encode the dictionary.
- int dict_encoded_size_;
-
- MemoTableType memo_table_;
-};
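-
-// Minimal usage sketch (illustrative only; `descr` and `pool` stand for a
-// valid INT32 column descriptor and memory pool, which are assumptions here):
-//
-//   DictEncoderImpl<Int32Type> encoder(descr, pool);
-//   int32_t values[] = {7, 7, 8};
-//   encoder.Put(values, 3);                  // memoizes dictionary {7, 8}
-//   std::vector<uint8_t> dict(encoder.dict_encoded_size());
-//   encoder.WriteDict(dict.data());          // dictionary page payload
-//   auto indices = encoder.FlushValues();    // bit-width byte + RLE indices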
-
-template <typename DType>
-void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) {
- // For primitive types, only a memcpy
- DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
- memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
-}
-
-// ByteArray and FLBA already have the dictionary encoded in their data heaps
-template <>
-void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) {
- memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) {
- uint32_t len = static_cast<uint32_t>(v.length());
- memcpy(buffer, &len, sizeof(len));
- buffer += sizeof(len);
- memcpy(buffer, v.data(), len);
- buffer += len;
- });
-}
-
-template <>
-void DictEncoderImpl<FLBAType>::WriteDict(uint8_t* buffer) {
- memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
- DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
- memcpy(buffer, v.data(), type_length_);
- buffer += type_length_;
- });
-}
-
-template <typename DType>
-inline void DictEncoderImpl<DType>::Put(const T& v) {
- // Put() implementation for primitive types
- auto on_found = [](int32_t memo_index) {};
- auto on_not_found = [this](int32_t memo_index) {
- dict_encoded_size_ += static_cast<int>(sizeof(T));
- };
-
- int32_t memo_index;
- PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(v, on_found, on_not_found, &memo_index));
- buffered_indices_.push_back(memo_index);
-}
-
-template <typename DType>
-inline void DictEncoderImpl<DType>::PutByteArray(const void* ptr, int32_t length) {
- DCHECK(false);
-}
-
-template <>
-inline void DictEncoderImpl<ByteArrayType>::PutByteArray(const void* ptr,
- int32_t length) {
- static const uint8_t empty[] = {0};
-
- auto on_found = [](int32_t memo_index) {};
- auto on_not_found = [&](int32_t memo_index) {
- dict_encoded_size_ += static_cast<int>(length + sizeof(uint32_t));
- };
-
- DCHECK(ptr != nullptr || length == 0);
- ptr = (ptr != nullptr) ? ptr : empty;
- int32_t memo_index;
- PARQUET_THROW_NOT_OK(
- memo_table_.GetOrInsert(ptr, length, on_found, on_not_found, &memo_index));
- buffered_indices_.push_back(memo_index);
-}
-
-template <>
-inline void DictEncoderImpl<ByteArrayType>::Put(const ByteArray& val) {
- return PutByteArray(val.ptr, static_cast<int32_t>(val.len));
-}
-
-template <>
-inline void DictEncoderImpl<FLBAType>::Put(const FixedLenByteArray& v) {
- static const uint8_t empty[] = {0};
-
- auto on_found = [](int32_t memo_index) {};
- auto on_not_found = [this](int32_t memo_index) { dict_encoded_size_ += type_length_; };
-
- DCHECK(v.ptr != nullptr || type_length_ == 0);
- const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
- int32_t memo_index;
- PARQUET_THROW_NOT_OK(
- memo_table_.GetOrInsert(ptr, type_length_, on_found, on_not_found, &memo_index));
- buffered_indices_.push_back(memo_index);
-}
-
-template <>
-void DictEncoderImpl<Int96Type>::Put(const ::arrow::Array& values) {
- ParquetException::NYI("Direct put to Int96");
-}
-
-template <>
-void DictEncoderImpl<Int96Type>::PutDictionary(const ::arrow::Array& values) {
- ParquetException::NYI("Direct put to Int96");
-}
-
-template <typename DType>
-void DictEncoderImpl<DType>::Put(const ::arrow::Array& values) {
- using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
- const auto& data = checked_cast<const ArrayType&>(values);
- if (data.null_count() == 0) {
- // no nulls, just dump the data
- for (int64_t i = 0; i < data.length(); i++) {
- Put(data.Value(i));
- }
- } else {
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- Put(data.Value(i));
- }
- }
- }
-}
-
-template <>
-void DictEncoderImpl<FLBAType>::Put(const ::arrow::Array& values) {
- AssertFixedSizeBinary(values, type_length_);
- const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
- if (data.null_count() == 0) {
- // no nulls, just dump the data
- for (int64_t i = 0; i < data.length(); i++) {
- Put(FixedLenByteArray(data.Value(i)));
- }
- } else {
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- Put(FixedLenByteArray(data.Value(i)));
- }
- }
- }
-}
-
-template <>
-void DictEncoderImpl<ByteArrayType>::Put(const ::arrow::Array& values) {
- AssertBaseBinary(values);
- if (::arrow::is_binary_like(values.type_id())) {
- PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
- }
-}
-
-template <typename DType>
-void AssertCanPutDictionary(DictEncoderImpl<DType>* encoder, const ::arrow::Array& dict) {
- if (dict.null_count() > 0) {
-    throw ParquetException("Inserted dictionary cannot contain nulls");
- }
-
- if (encoder->num_entries() > 0) {
- throw ParquetException("Can only call PutDictionary on an empty DictEncoder");
- }
-}
-
-template <typename DType>
-void DictEncoderImpl<DType>::PutDictionary(const ::arrow::Array& values) {
- AssertCanPutDictionary(this, values);
-
- using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
- const auto& data = checked_cast<const ArrayType&>(values);
-
- dict_encoded_size_ += static_cast<int>(sizeof(typename DType::c_type) * data.length());
- for (int64_t i = 0; i < data.length(); i++) {
- int32_t unused_memo_index;
- PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(data.Value(i), &unused_memo_index));
- }
-}
-
-template <>
-void DictEncoderImpl<FLBAType>::PutDictionary(const ::arrow::Array& values) {
- AssertFixedSizeBinary(values, type_length_);
- AssertCanPutDictionary(this, values);
-
- const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
-
- dict_encoded_size_ += static_cast<int>(type_length_ * data.length());
- for (int64_t i = 0; i < data.length(); i++) {
- int32_t unused_memo_index;
- PARQUET_THROW_NOT_OK(
- memo_table_.GetOrInsert(data.Value(i), type_length_, &unused_memo_index));
- }
-}
-
-template <>
-void DictEncoderImpl<ByteArrayType>::PutDictionary(const ::arrow::Array& values) {
- AssertBaseBinary(values);
- AssertCanPutDictionary(this, values);
-
- if (::arrow::is_binary_like(values.type_id())) {
- PutBinaryDictionaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- PutBinaryDictionaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
- }
-}
-
-// ----------------------------------------------------------------------
-// ByteStreamSplitEncoder<T> implementations
-
-template <typename DType>
-class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
- public:
- using T = typename DType::c_type;
- using TypedEncoder<DType>::Put;
-
- explicit ByteStreamSplitEncoder(
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- int64_t EstimatedDataEncodedSize() override;
- std::shared_ptr<Buffer> FlushValues() override;
-
- void Put(const T* buffer, int num_values) override;
- void Put(const ::arrow::Array& values) override;
- void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override;
-
- protected:
- template <typename ArrowType>
- void PutImpl(const ::arrow::Array& values) {
- if (values.type_id() != ArrowType::type_id) {
- throw ParquetException(std::string() + "direct put to " + ArrowType::type_name() +
- " from " + values.type()->ToString() + " not supported");
- }
- const auto& data = *values.data();
- PutSpaced(data.GetValues<typename ArrowType::c_type>(1),
- static_cast<int>(data.length), data.GetValues<uint8_t>(0, 0), data.offset);
- }
-
- ::arrow::BufferBuilder sink_;
- int64_t num_values_in_buffer_;
-};
-
-template <typename DType>
-ByteStreamSplitEncoder<DType>::ByteStreamSplitEncoder(const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool)
- : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool),
- sink_{pool},
- num_values_in_buffer_{0} {}
-
-template <typename DType>
-int64_t ByteStreamSplitEncoder<DType>::EstimatedDataEncodedSize() {
- return sink_.length();
-}
-
-template <typename DType>
-std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
- std::shared_ptr<ResizableBuffer> output_buffer =
- AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize());
- uint8_t* output_buffer_raw = output_buffer->mutable_data();
- const uint8_t* raw_values = sink_.data();
- ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values_in_buffer_,
- output_buffer_raw);
- sink_.Reset();
- num_values_in_buffer_ = 0;
- return std::move(output_buffer);
-}
-
-template <typename DType>
-void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
- if (num_values > 0) {
- PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
- num_values_in_buffer_ += num_values;
- }
-}
-
-template <>
-void ByteStreamSplitEncoder<FloatType>::Put(const ::arrow::Array& values) {
- PutImpl<::arrow::FloatType>(values);
-}
-
-template <>
-void ByteStreamSplitEncoder<DoubleType>::Put(const ::arrow::Array& values) {
- PutImpl<::arrow::DoubleType>(values);
-}
-
-template <typename DType>
-void ByteStreamSplitEncoder<DType>::PutSpaced(const T* src, int num_values,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) {
- if (valid_bits != NULLPTR) {
- PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
- this->memory_pool()));
- T* data = reinterpret_cast<T*>(buffer->mutable_data());
- int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
- src, num_values, valid_bits, valid_bits_offset, data);
- Put(data, num_valid_values);
- } else {
- Put(src, num_values);
- }
-}
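-
-// Informative sketch (not part of the original source): BYTE_STREAM_SPLIT
-// scatters the K bytes of each value into K parallel streams, so two floats
-// with bytes [a0 a1 a2 a3] and [b0 b1 b2 b3] are flushed as
-//
-//   a0 b0 | a1 b1 | a2 b2 | a3 b3
-//
-// which tends to compress better than PLAIN for floating-point columns.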
-
-class DecoderImpl : virtual public Decoder {
- public:
- void SetData(int num_values, const uint8_t* data, int len) override {
- num_values_ = num_values;
- data_ = data;
- len_ = len;
- }
-
- int values_left() const override { return num_values_; }
- Encoding::type encoding() const override { return encoding_; }
-
- protected:
- explicit DecoderImpl(const ColumnDescriptor* descr, Encoding::type encoding)
- : descr_(descr), encoding_(encoding), num_values_(0), data_(NULLPTR), len_(0) {}
-
- // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
- const ColumnDescriptor* descr_;
-
- const Encoding::type encoding_;
- int num_values_;
- const uint8_t* data_;
- int len_;
- int type_length_;
-};
-
-template <typename DType>
-class PlainDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
- public:
- using T = typename DType::c_type;
- explicit PlainDecoder(const ColumnDescriptor* descr);
-
- int Decode(T* buffer, int max_values) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) override;
-};
-
-template <>
-inline int PlainDecoder<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::Accumulator* builder) {
- ParquetException::NYI("DecodeArrow not supported for Int96");
-}
-
-template <>
-inline int PlainDecoder<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow not supported for Int96");
-}
-
-template <>
-inline int PlainDecoder<BooleanType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
- ParquetException::NYI("dictionaries of BooleanType");
-}
-
-template <typename DType>
-int PlainDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) {
- using value_type = typename DType::c_type;
-
- constexpr int value_size = static_cast<int>(sizeof(value_type));
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- builder->UnsafeAppend(::arrow::util::SafeLoadAs<value_type>(data_));
- data_ += sizeof(value_type);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- num_values_ -= values_decoded;
- len_ -= sizeof(value_type) * values_decoded;
- return values_decoded;
-}
-
-template <typename DType>
-int PlainDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- using value_type = typename DType::c_type;
-
- constexpr int value_size = static_cast<int>(sizeof(value_type));
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- PARQUET_THROW_NOT_OK(
- builder->Append(::arrow::util::SafeLoadAs<value_type>(data_)));
- data_ += sizeof(value_type);
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- num_values_ -= values_decoded;
- len_ -= sizeof(value_type) * values_decoded;
- return values_decoded;
-}
-
-// Decode routine templated on C++ type rather than type enum
-template <typename T>
-inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
- int type_length, T* out) {
- int64_t bytes_to_decode = num_values * static_cast<int64_t>(sizeof(T));
- if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
- ParquetException::EofException();
- }
- // If bytes_to_decode == 0, data could be null
- if (bytes_to_decode > 0) {
- memcpy(out, data, bytes_to_decode);
- }
- return static_cast<int>(bytes_to_decode);
-}
-
-template <typename DType>
-PlainDecoder<DType>::PlainDecoder(const ColumnDescriptor* descr)
- : DecoderImpl(descr, Encoding::PLAIN) {
- if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
- type_length_ = descr_->type_length();
- } else {
- type_length_ = -1;
- }
-}
-
-// Template specialization for BYTE_ARRAY. The written values do not own their
-// own data.
-
-static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size,
- ByteArray* out) {
- if (ARROW_PREDICT_FALSE(data_size < 4)) {
- ParquetException::EofException();
- }
- const int32_t len = ::arrow::util::SafeLoadAs<int32_t>(data);
- if (len < 0) {
- throw ParquetException("Invalid BYTE_ARRAY value");
- }
- const int64_t consumed_length = static_cast<int64_t>(len) + 4;
- if (ARROW_PREDICT_FALSE(data_size < consumed_length)) {
- ParquetException::EofException();
- }
- *out = ByteArray{static_cast<uint32_t>(len), data + 4};
- return consumed_length;
-}
-
-template <>
-inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int num_values,
- int type_length, ByteArray* out) {
- int bytes_decoded = 0;
- for (int i = 0; i < num_values; ++i) {
- const auto increment = ReadByteArray(data, data_size, out + i);
- if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) {
- throw ParquetException("BYTE_ARRAY chunk too large");
- }
- data += increment;
- data_size -= increment;
- bytes_decoded += static_cast<int>(increment);
- }
- return bytes_decoded;
-}
-
-// Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not
-// own their own data.
-template <>
-inline int DecodePlain<FixedLenByteArray>(const uint8_t* data, int64_t data_size,
- int num_values, int type_length,
- FixedLenByteArray* out) {
- int64_t bytes_to_decode = static_cast<int64_t>(type_length) * num_values;
- if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
- ParquetException::EofException();
- }
- for (int i = 0; i < num_values; ++i) {
- out[i].ptr = data;
- data += type_length;
- data_size -= type_length;
- }
- return static_cast<int>(bytes_to_decode);
-}
-
-template <typename DType>
-int PlainDecoder<DType>::Decode(T* buffer, int max_values) {
- max_values = std::min(max_values, num_values_);
- int bytes_consumed = DecodePlain<T>(data_, len_, max_values, type_length_, buffer);
- data_ += bytes_consumed;
- len_ -= bytes_consumed;
- num_values_ -= max_values;
- return max_values;
-}
-
-class PlainBooleanDecoder : public DecoderImpl,
- virtual public TypedDecoder<BooleanType>,
- virtual public BooleanDecoder {
- public:
- explicit PlainBooleanDecoder(const ColumnDescriptor* descr);
- void SetData(int num_values, const uint8_t* data, int len) override;
-
- // Two flavors of bool decoding
- int Decode(uint8_t* buffer, int max_values) override;
- int Decode(bool* buffer, int max_values) override;
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::Accumulator* out) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* out) override;
-
- private:
- std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_;
-};
-
-PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
- : DecoderImpl(descr, Encoding::PLAIN) {}
-
-void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
- num_values_ = num_values;
- bit_reader_.reset(new BitUtil::BitReader(data, len));
-}
-
-int PlainBooleanDecoder::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::Accumulator* builder) {
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- bool value;
- ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
- builder->UnsafeAppend(value);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- num_values_ -= values_decoded;
- return values_decoded;
-}
-
-inline int PlainBooleanDecoder::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
- ParquetException::NYI("dictionaries of BooleanType");
-}
-
-int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
- max_values = std::min(max_values, num_values_);
- bool val;
- ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
- for (int i = 0; i < max_values; ++i) {
- if (!bit_reader_->GetValue(1, &val)) {
- ParquetException::EofException();
- }
- if (val) {
- bit_writer.Set();
- }
- bit_writer.Next();
- }
- bit_writer.Finish();
- num_values_ -= max_values;
- return max_values;
-}
-
-int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
- max_values = std::min(max_values, num_values_);
- if (bit_reader_->GetBatch(1, buffer, max_values) != max_values) {
- ParquetException::EofException();
- }
- num_values_ -= max_values;
- return max_values;
-}
-
-struct ArrowBinaryHelper {
- explicit ArrowBinaryHelper(typename EncodingTraits<ByteArrayType>::Accumulator* out) {
- this->out = out;
- this->builder = out->builder.get();
- this->chunk_space_remaining =
- ::arrow::kBinaryMemoryLimit - this->builder->value_data_length();
- }
-
- Status PushChunk() {
- std::shared_ptr<::arrow::Array> result;
- RETURN_NOT_OK(builder->Finish(&result));
- out->chunks.push_back(result);
- chunk_space_remaining = ::arrow::kBinaryMemoryLimit;
- return Status::OK();
- }
-
- bool CanFit(int64_t length) const { return length <= chunk_space_remaining; }
-
- void UnsafeAppend(const uint8_t* data, int32_t length) {
- chunk_space_remaining -= length;
- builder->UnsafeAppend(data, length);
- }
-
- void UnsafeAppendNull() { builder->UnsafeAppendNull(); }
-
- Status Append(const uint8_t* data, int32_t length) {
- chunk_space_remaining -= length;
- return builder->Append(data, length);
- }
-
- Status AppendNull() { return builder->AppendNull(); }
-
- typename EncodingTraits<ByteArrayType>::Accumulator* out;
- ::arrow::BinaryBuilder* builder;
- int64_t chunk_space_remaining;
-};
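-
-// Informative sketch (not part of the original source): the helper exists
-// because one Arrow BinaryArray can hold at most ::arrow::kBinaryMemoryLimit
-// (about 2 GiB) bytes of value data, so decoders chunk their output roughly
-// as:
-//
-//   if (!helper.CanFit(value_len)) {
-//     RETURN_NOT_OK(helper.PushChunk());    // finish chunk, start a new one
-//   }
-//   helper.UnsafeAppend(data, value_len);   // append into the current chunk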
-
-template <>
-inline int PlainDecoder<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
- ParquetException::NYI();
-}
-
-template <>
-inline int PlainDecoder<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
- ParquetException::NYI();
-}
-
-template <>
-inline int PlainDecoder<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::Accumulator* builder) {
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- builder->UnsafeAppend(data_);
- data_ += descr_->type_length();
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- num_values_ -= values_decoded;
- len_ -= descr_->type_length() * values_decoded;
- return values_decoded;
-}
-
-template <>
-inline int PlainDecoder<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- PARQUET_THROW_NOT_OK(builder->Append(data_));
- data_ += descr_->type_length();
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- num_values_ -= values_decoded;
- len_ -= descr_->type_length() * values_decoded;
- return values_decoded;
-}
-
-class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
- virtual public ByteArrayDecoder {
- public:
- using Base = PlainDecoder<ByteArrayType>;
- using Base::DecodeSpaced;
- using Base::PlainDecoder;
-
- // ----------------------------------------------------------------------
- // Dictionary read paths
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- ::arrow::BinaryDictionary32Builder* builder) override {
- int result = 0;
- PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
- valid_bits_offset, builder, &result));
- return result;
- }
-
- // ----------------------------------------------------------------------
- // Optimized dense binary read paths
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
- int result = 0;
- PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
- valid_bits_offset, out, &result));
- return result;
- }
-
- private:
- Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out,
- int* out_values_decoded) {
- ArrowBinaryHelper helper(out);
- int values_decoded = 0;
-
- RETURN_NOT_OK(helper.builder->Reserve(num_values));
- RETURN_NOT_OK(helper.builder->ReserveData(
- std::min<int64_t>(len_, helper.chunk_space_remaining)));
-
- int i = 0;
- RETURN_NOT_OK(VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- if (ARROW_PREDICT_FALSE(len_ < 4)) {
- ParquetException::EofException();
- }
- auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
- if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
- return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
- }
- auto increment = value_len + 4;
- if (ARROW_PREDICT_FALSE(len_ < increment)) {
- ParquetException::EofException();
- }
- if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) {
- // This element would exceed the capacity of a chunk
- RETURN_NOT_OK(helper.PushChunk());
- RETURN_NOT_OK(helper.builder->Reserve(num_values - i));
- RETURN_NOT_OK(helper.builder->ReserveData(
- std::min<int64_t>(len_, helper.chunk_space_remaining)));
- }
- helper.UnsafeAppend(data_ + 4, value_len);
- data_ += increment;
- len_ -= increment;
- ++values_decoded;
- ++i;
- return Status::OK();
- },
- [&]() {
- helper.UnsafeAppendNull();
- ++i;
- return Status::OK();
- }));
-
- num_values_ -= values_decoded;
- *out_values_decoded = values_decoded;
- return Status::OK();
- }
-
- template <typename BuilderType>
- Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset, BuilderType* builder,
- int* out_values_decoded) {
- RETURN_NOT_OK(builder->Reserve(num_values));
- int values_decoded = 0;
-
- RETURN_NOT_OK(VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- if (ARROW_PREDICT_FALSE(len_ < 4)) {
- ParquetException::EofException();
- }
- auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
- if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
- return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
- }
- auto increment = value_len + 4;
- if (ARROW_PREDICT_FALSE(len_ < increment)) {
- ParquetException::EofException();
- }
- RETURN_NOT_OK(builder->Append(data_ + 4, value_len));
- data_ += increment;
- len_ -= increment;
- ++values_decoded;
- return Status::OK();
- },
- [&]() { return builder->AppendNull(); }));
-
- num_values_ -= values_decoded;
- *out_values_decoded = values_decoded;
- return Status::OK();
- }
-};
-
-class PlainFLBADecoder : public PlainDecoder<FLBAType>, virtual public FLBADecoder {
- public:
- using Base = PlainDecoder<FLBAType>;
- using Base::PlainDecoder;
-};
-
-// ----------------------------------------------------------------------
-// Dictionary encoding and decoding
-
-template <typename Type>
-class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
- public:
- typedef typename Type::c_type T;
-
- // Initializes the dictionary with values from 'dictionary'. The data in
- // dictionary is not guaranteed to persist in memory after this call so the
- // dictionary decoder needs to copy the data out if necessary.
- explicit DictDecoderImpl(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::RLE_DICTIONARY),
- dictionary_(AllocateBuffer(pool, 0)),
- dictionary_length_(0),
- byte_array_data_(AllocateBuffer(pool, 0)),
- byte_array_offsets_(AllocateBuffer(pool, 0)),
- indices_scratch_space_(AllocateBuffer(pool, 0)) {}
-
-  // Perform type-specific initialization
- void SetDict(TypedDecoder<Type>* dictionary) override;
-
- void SetData(int num_values, const uint8_t* data, int len) override {
- num_values_ = num_values;
- if (len == 0) {
- // Initialize dummy decoder to avoid crashes later on
- idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1);
- return;
- }
- uint8_t bit_width = *data;
- if (ARROW_PREDICT_FALSE(bit_width >= 64)) {
- throw ParquetException("Invalid or corrupted bit_width");
- }
- idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width);
- }
-
- int Decode(T* buffer, int num_values) override {
- num_values = std::min(num_values, num_values_);
- int decoded_values =
- idx_decoder_.GetBatchWithDict(reinterpret_cast<const T*>(dictionary_->data()),
- dictionary_length_, buffer, num_values);
- if (decoded_values != num_values) {
- ParquetException::EofException();
- }
- num_values_ -= num_values;
- return num_values;
- }
-
- int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- num_values = std::min(num_values, num_values_);
- if (num_values != idx_decoder_.GetBatchWithDictSpaced(
- reinterpret_cast<const T*>(dictionary_->data()),
- dictionary_length_, buffer, num_values, null_count, valid_bits,
- valid_bits_offset)) {
- ParquetException::EofException();
- }
- num_values_ -= num_values;
- return num_values;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<Type>::Accumulator* out) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<Type>::DictAccumulator* out) override;
-
- void InsertDictionary(::arrow::ArrayBuilder* builder) override;
-
- int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- ::arrow::ArrayBuilder* builder) override {
- if (num_values > 0) {
- // TODO(wesm): Refactor to batch reads for improved memory use. It is not
- // trivial because the null_count is relative to the entire bitmap
- PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
- num_values, /*shrink_to_fit=*/false));
- }
-
- auto indices_buffer =
- reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
-
- if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits,
- valid_bits_offset, indices_buffer)) {
- ParquetException::EofException();
- }
-
- /// XXX(wesm): Cannot append "valid bits" directly to the builder
- std::vector<uint8_t> valid_bytes(num_values);
- ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
- for (int64_t i = 0; i < num_values; ++i) {
- valid_bytes[i] = static_cast<uint8_t>(bit_reader.IsSet());
- bit_reader.Next();
- }
-
- auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
- PARQUET_THROW_NOT_OK(
- binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data()));
- num_values_ -= num_values - null_count;
- return num_values - null_count;
- }
-
- int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override {
- num_values = std::min(num_values, num_values_);
- if (num_values > 0) {
- // TODO(wesm): Refactor to batch reads for improved memory use. This is
- // relatively simple here because we don't have to do any bookkeeping of
- // nulls
- PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
- num_values, /*shrink_to_fit=*/false));
- }
- auto indices_buffer =
- reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
- if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) {
- ParquetException::EofException();
- }
- auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
- PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values));
- num_values_ -= num_values;
- return num_values;
- }
-
- int DecodeIndices(int num_values, int32_t* indices) override {
- if (num_values != idx_decoder_.GetBatch(indices, num_values)) {
- ParquetException::EofException();
- }
- num_values_ -= num_values;
- return num_values;
- }
-
- void GetDictionary(const T** dictionary, int32_t* dictionary_length) override {
- *dictionary_length = dictionary_length_;
- *dictionary = reinterpret_cast<T*>(dictionary_->mutable_data());
- }
-
- protected:
- Status IndexInBounds(int32_t index) {
- if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) {
- return Status::OK();
- }
- return Status::Invalid("Index not in dictionary bounds");
- }
-
- inline void DecodeDict(TypedDecoder<Type>* dictionary) {
- dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
- PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T),
- /*shrink_to_fit=*/false));
- dictionary->Decode(reinterpret_cast<T*>(dictionary_->mutable_data()),
- dictionary_length_);
- }
-
-  // The decoded dictionary values. For primitive types the values are stored
-  // here directly; for binary types each ByteArray entry points into
-  // byte_array_data_ below.
- std::shared_ptr<ResizableBuffer> dictionary_;
-
- int32_t dictionary_length_;
-
- // Data that contains the byte array data (byte_array_dictionary_ just has the
- // pointers).
- std::shared_ptr<ResizableBuffer> byte_array_data_;
-
- // Arrow-style byte offsets for each dictionary value. We maintain two
- // representations of the dictionary, one as ByteArray* for non-Arrow
- // consumers and this one for Arrow consumers. Since dictionaries are
-  // generally pretty small to begin with, this doesn't add much extra
-  // memory use in most cases.
- std::shared_ptr<ResizableBuffer> byte_array_offsets_;
-
- // Reusable buffer for decoding dictionary indices to be appended to a
- // BinaryDictionary32Builder
- std::shared_ptr<ResizableBuffer> indices_scratch_space_;
-
- ::arrow::util::RleDecoder idx_decoder_;
-};
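-
-// Minimal usage sketch (illustrative only; `descr` and the data pointers are
-// assumptions): the dictionary page is decoded with a PLAIN decoder first,
-// then handed to the RLE_DICTIONARY decoder:
-//
-//   PlainDecoder<Int32Type> dict_page(descr);
-//   dict_page.SetData(num_dict_values, dict_page_data, dict_page_len);
-//   DictDecoderImpl<Int32Type> decoder(descr);
-//   decoder.SetDict(&dict_page);             // copies the dictionary out
-//   decoder.SetData(num_values, page_data, page_len);
-//   int n = decoder.Decode(out, num_values);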
-
-template <typename Type>
-void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {
- DecodeDict(dictionary);
-}
-
-template <>
-void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
- ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
-}
-
-template <>
-void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictionary) {
- DecodeDict(dictionary);
-
- auto dict_values = reinterpret_cast<ByteArray*>(dictionary_->mutable_data());
-
- int total_size = 0;
- for (int i = 0; i < dictionary_length_; ++i) {
- total_size += dict_values[i].len;
- }
- PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
- /*shrink_to_fit=*/false));
- PARQUET_THROW_NOT_OK(
- byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
- /*shrink_to_fit=*/false));
-
- int32_t offset = 0;
- uint8_t* bytes_data = byte_array_data_->mutable_data();
- int32_t* bytes_offsets =
- reinterpret_cast<int32_t*>(byte_array_offsets_->mutable_data());
- for (int i = 0; i < dictionary_length_; ++i) {
- memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
- bytes_offsets[i] = offset;
- dict_values[i].ptr = bytes_data + offset;
- offset += dict_values[i].len;
- }
- bytes_offsets[dictionary_length_] = offset;
-}
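-
-// For example (informative, not in the original source): a dictionary of
-// {"ab", "c"} is flattened into byte_array_data_ = "abc" with
-// byte_array_offsets_ = [0, 2, 3], and each ByteArray in dictionary_ is
-// repointed into that single buffer.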
-
-template <>
-inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionary) {
- DecodeDict(dictionary);
-
- auto dict_values = reinterpret_cast<FLBA*>(dictionary_->mutable_data());
-
- int fixed_len = descr_->type_length();
- int total_size = dictionary_length_ * fixed_len;
-
- PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
- /*shrink_to_fit=*/false));
- uint8_t* bytes_data = byte_array_data_->mutable_data();
- for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) {
- memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
- dict_values[i].ptr = bytes_data + offset;
- }
-}
-
-template <>
-inline int DictDecoderImpl<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::Accumulator* builder) {
- ParquetException::NYI("DecodeArrow to Int96Type");
-}
-
-template <>
-inline int DictDecoderImpl<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow to Int96Type");
-}
-
-template <>
-inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
- ParquetException::NYI("DecodeArrow implemented elsewhere");
-}
-
-template <>
-inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow implemented elsewhere");
-}
-
-template <typename DType>
-int DictDecoderImpl<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const typename DType::c_type*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
-          throw ParquetException("Ran out of dictionary indices");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- PARQUET_THROW_NOT_OK(builder->Append(dict_values[index]));
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- return num_values - null_count;
-}
-
-template <>
-int DictDecoderImpl<BooleanType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
- ParquetException::NYI("No dictionary encoding for BooleanType");
-}
-
-template <>
-inline int DictDecoderImpl<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::Accumulator* builder) {
- if (builder->byte_width() != descr_->type_length()) {
- throw ParquetException("Byte width mismatch: builder was " +
- std::to_string(builder->byte_width()) + " but decoder was " +
- std::to_string(descr_->type_length()));
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
-          throw ParquetException("Ran out of dictionary indices");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- builder->UnsafeAppend(dict_values[index].ptr);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- return num_values - null_count;
-}
-
-template <>
-int DictDecoderImpl<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
- auto value_type =
- checked_cast<const ::arrow::DictionaryType&>(*builder->type()).value_type();
- auto byte_width =
- checked_cast<const ::arrow::FixedSizeBinaryType&>(*value_type).byte_width();
- if (byte_width != descr_->type_length()) {
- throw ParquetException("Byte width mismatch: builder was " +
- std::to_string(byte_width) + " but decoder was " +
- std::to_string(descr_->type_length()));
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
-          throw ParquetException("Dictionary index stream too short (corrupt data page?)");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- PARQUET_THROW_NOT_OK(builder->Append(dict_values[index].ptr));
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- return num_values - null_count;
-}
-
-template <typename Type>
-int DictDecoderImpl<Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Type>::Accumulator* builder) {
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- using value_type = typename Type::c_type;
- auto dict_values = reinterpret_cast<const value_type*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
-          throw ParquetException("Dictionary index stream too short (corrupt data page?)");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- builder->UnsafeAppend(dict_values[index]);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- return num_values - null_count;
-}
-
-template <typename Type>
-void DictDecoderImpl<Type>::InsertDictionary(::arrow::ArrayBuilder* builder) {
- ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types");
-}
-
-template <>
-void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
- auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
-
- // Make a BinaryArray referencing the internal dictionary data
- auto arr = std::make_shared<::arrow::BinaryArray>(
- dictionary_length_, byte_array_offsets_, byte_array_data_);
- PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
-}
-
-class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
- virtual public ByteArrayDecoder {
- public:
- using BASE = DictDecoderImpl<ByteArrayType>;
- using BASE::DictDecoderImpl;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- ::arrow::BinaryDictionary32Builder* builder) override {
- int result = 0;
- if (null_count == 0) {
- PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
- valid_bits_offset, builder, &result));
- }
- return result;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
- int result = 0;
- if (null_count == 0) {
- PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
- valid_bits_offset, out, &result));
- }
- return result;
- }
-
- private:
- Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out,
- int* out_num_values) {
- constexpr int32_t kBufferSize = 1024;
- int32_t indices[kBufferSize];
-
- ArrowBinaryHelper helper(out);
-
- ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
-
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
- int values_decoded = 0;
- int num_appended = 0;
- while (num_appended < num_values) {
- bool is_valid = bit_reader.IsSet();
- bit_reader.Next();
-
- if (is_valid) {
- int32_t batch_size =
- std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
-
- if (ARROW_PREDICT_FALSE(num_indices < 1)) {
- return Status::Invalid("Invalid number of indices '", num_indices, "'");
- }
-
- int i = 0;
- while (true) {
- // Consume all indices
- if (is_valid) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
- RETURN_NOT_OK(helper.PushChunk());
- }
- RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
- ++i;
- ++values_decoded;
- } else {
- RETURN_NOT_OK(helper.AppendNull());
- --null_count;
- }
- ++num_appended;
- if (i == num_indices) {
- // Do not advance the bit_reader if we have fulfilled the decode
- // request
- break;
- }
- is_valid = bit_reader.IsSet();
- bit_reader.Next();
- }
- } else {
- RETURN_NOT_OK(helper.AppendNull());
- --null_count;
- ++num_appended;
- }
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-
- Status DecodeArrowDenseNonNull(int num_values,
- typename EncodingTraits<ByteArrayType>::Accumulator* out,
- int* out_num_values) {
- constexpr int32_t kBufferSize = 2048;
- int32_t indices[kBufferSize];
- int values_decoded = 0;
-
- ArrowBinaryHelper helper(out);
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
-
- while (values_decoded < num_values) {
- int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
- if (num_indices == 0) ParquetException::EofException();
- for (int i = 0; i < num_indices; ++i) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
- RETURN_NOT_OK(helper.PushChunk());
- }
- RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
- }
- values_decoded += num_indices;
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-
- template <typename BuilderType>
- Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset, BuilderType* builder,
- int* out_num_values) {
- constexpr int32_t kBufferSize = 1024;
- int32_t indices[kBufferSize];
-
- RETURN_NOT_OK(builder->Reserve(num_values));
- ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
-
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
-
- int values_decoded = 0;
- int num_appended = 0;
- while (num_appended < num_values) {
- bool is_valid = bit_reader.IsSet();
- bit_reader.Next();
-
- if (is_valid) {
- int32_t batch_size =
- std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
-
- int i = 0;
- while (true) {
- // Consume all indices
- if (is_valid) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- RETURN_NOT_OK(builder->Append(val.ptr, val.len));
- ++i;
- ++values_decoded;
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- --null_count;
- }
- ++num_appended;
- if (i == num_indices) {
- // Do not advance the bit_reader if we have fulfilled the decode
- // request
- break;
- }
- is_valid = bit_reader.IsSet();
- bit_reader.Next();
- }
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- --null_count;
- ++num_appended;
- }
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-
- template <typename BuilderType>
- Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) {
- constexpr int32_t kBufferSize = 2048;
- int32_t indices[kBufferSize];
-
- RETURN_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
-
- int values_decoded = 0;
- while (values_decoded < num_values) {
- int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
- if (num_indices == 0) ParquetException::EofException();
- for (int i = 0; i < num_indices; ++i) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- RETURN_NOT_OK(builder->Append(val.ptr, val.len));
- }
- values_decoded += num_indices;
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-};
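-
-// Note on the strategy above (a summary of existing behavior, not a change):
-// indices are pulled from the RLE index decoder in batches of up to
-// kBufferSize and zipped with the validity bitmap, so null slots consume no
-// index and each batch is capped at the number of non-null slots still
-// outstanding (num_values - num_appended - null_count).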
-
-// ----------------------------------------------------------------------
-// DeltaBitPackDecoder
-
-template <typename DType>
-class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
- public:
- typedef typename DType::c_type T;
-
- explicit DeltaBitPackDecoder(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::DELTA_BINARY_PACKED), pool_(pool) {
- if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) {
- throw ParquetException("Delta bit pack encoding should only be for integer data.");
- }
- }
-
- void SetData(int num_values, const uint8_t* data, int len) override {
- this->num_values_ = num_values;
- decoder_ = ::arrow::BitUtil::BitReader(data, len);
- values_current_block_ = 0;
- values_current_mini_block_ = 0;
- }
-
- int Decode(T* buffer, int max_values) override {
- return GetInternal(buffer, max_values);
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* out) override {
- if (null_count != 0) {
- ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
- }
- std::vector<T> values(num_values);
- GetInternal(values.data(), num_values);
- PARQUET_THROW_NOT_OK(out->AppendValues(values));
- return num_values;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* out) override {
- if (null_count != 0) {
- ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
- }
- std::vector<T> values(num_values);
- GetInternal(values.data(), num_values);
- PARQUET_THROW_NOT_OK(out->Reserve(num_values));
- for (T value : values) {
- PARQUET_THROW_NOT_OK(out->Append(value));
- }
- return num_values;
- }
-
- private:
- void InitBlock() {
- // The number of values per block.
- uint32_t block_size;
- if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
- if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
- if (!decoder_.GetVlqInt(&values_current_block_)) {
- ParquetException::EofException();
- }
- if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
-
- delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
- uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
-
- if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
- for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
- if (!decoder_.GetAligned<uint8_t>(1, bit_width_data + i)) {
- ParquetException::EofException();
- }
- }
-    if (ARROW_PREDICT_FALSE(num_mini_blocks_ == 0)) {
-      throw ParquetException("Invalid DELTA_BINARY_PACKED data: zero miniblocks");
-    }
-    values_per_mini_block_ = block_size / num_mini_blocks_;
- mini_block_idx_ = 0;
- delta_bit_width_ = bit_width_data[0];
- values_current_mini_block_ = values_per_mini_block_;
- }
-
- template <typename T>
- int GetInternal(T* buffer, int max_values) {
- max_values = std::min(max_values, this->num_values_);
- const uint8_t* bit_width_data = delta_bit_widths_->data();
- for (int i = 0; i < max_values; ++i) {
- if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
- ++mini_block_idx_;
- if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
- delta_bit_width_ = bit_width_data[mini_block_idx_];
- values_current_mini_block_ = values_per_mini_block_;
- } else {
- InitBlock();
- buffer[i] = last_value_;
- continue;
- }
- }
-
-      // TODO: the key optimization for this algorithm is to decode the entire
-      // miniblock at once rather than one value at a time.
- int64_t delta;
- if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
- delta += min_delta_;
- last_value_ += static_cast<int32_t>(delta);
- buffer[i] = last_value_;
- --values_current_mini_block_;
- }
- this->num_values_ -= max_values;
- return max_values;
- }
-
- MemoryPool* pool_;
- ::arrow::BitUtil::BitReader decoder_;
- uint32_t values_current_block_;
- uint32_t num_mini_blocks_;
- uint64_t values_per_mini_block_;
- uint64_t values_current_mini_block_;
-
- int32_t min_delta_;
- size_t mini_block_idx_;
- std::shared_ptr<ResizableBuffer> delta_bit_widths_;
- int delta_bit_width_;
-
- int32_t last_value_;
-};
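-
-// A sketch of the stream that InitBlock() consumes (illustrative only; see
-// the DELTA_BINARY_PACKED section of the Parquet format spec):
-//
-//   <block size : VLQ> <miniblocks per block : VLQ> <value count : VLQ>
-//   <first value : zigzag VLQ> <min delta : zigzag VLQ>
-//   <miniblock bit widths : 1 byte each>
-//
-// GetInternal() then reconstructs each value as
-//   last_value_ += min_delta_ + <delta_bit_width_-bit packed delta>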
-
-// ----------------------------------------------------------------------
-// DELTA_LENGTH_BYTE_ARRAY
-
-class DeltaLengthByteArrayDecoder : public DecoderImpl,
- virtual public TypedDecoder<ByteArrayType> {
- public:
- explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
- len_decoder_(nullptr, pool),
- pool_(pool) {}
-
- void SetData(int num_values, const uint8_t* data, int len) override {
- num_values_ = num_values;
- if (len == 0) return;
- int total_lengths_len = ::arrow::util::SafeLoadAs<int32_t>(data);
- data += 4;
- this->len_decoder_.SetData(num_values, data, total_lengths_len);
- data_ = data + total_lengths_len;
- this->len_ = len - 4 - total_lengths_len;
- }
-
- int Decode(ByteArray* buffer, int max_values) override {
- using VectorT = ArrowPoolVector<int>;
- max_values = std::min(max_values, num_values_);
- VectorT lengths(max_values, 0, ::arrow::stl::allocator<int>(pool_));
- len_decoder_.Decode(lengths.data(), max_values);
- for (int i = 0; i < max_values; ++i) {
- buffer[i].len = lengths[i];
- buffer[i].ptr = data_;
- this->data_ += lengths[i];
- this->len_ -= lengths[i];
- }
- this->num_values_ -= max_values;
- return max_values;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
- ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override {
- ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
- }
-
- private:
- DeltaBitPackDecoder<Int32Type> len_decoder_;
- ::arrow::MemoryPool* pool_;
-};
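-
-// Illustrative layout of a DELTA_LENGTH_BYTE_ARRAY page as parsed by
-// SetData() above (a sketch of this implementation's framing):
-//
-//   <lengths size : int32> <lengths : DELTA_BINARY_PACKED int32> <value bytes>
-//
-// e.g. the values "a", "bc", "def" decode from the lengths {1, 2, 3} followed
-// by the concatenated bytes "abcdef".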
-
-// ----------------------------------------------------------------------
-// DELTA_BYTE_ARRAY
-
-class DeltaByteArrayDecoder : public DecoderImpl,
- virtual public TypedDecoder<ByteArrayType> {
- public:
- explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
- prefix_len_decoder_(nullptr, pool),
- suffix_decoder_(nullptr, pool),
- last_value_(0, nullptr) {}
-
-  void SetData(int num_values, const uint8_t* data, int len) override {
- num_values_ = num_values;
- if (len == 0) return;
- int prefix_len_length = ::arrow::util::SafeLoadAs<int32_t>(data);
- data += 4;
- len -= 4;
- prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
- data += prefix_len_length;
- len -= prefix_len_length;
- suffix_decoder_.SetData(num_values, data, len);
- }
-
-  // TODO: this does not work as-is and needs real memory management: the
-  // buffers malloc'ed below are never freed. We need to allocate new strings
-  // to store the results.
-  int Decode(ByteArray* buffer, int max_values) override {
- max_values = std::min(max_values, this->num_values_);
- for (int i = 0; i < max_values; ++i) {
- int prefix_len = 0;
- prefix_len_decoder_.Decode(&prefix_len, 1);
- ByteArray suffix = {0, nullptr};
- suffix_decoder_.Decode(&suffix, 1);
- buffer[i].len = prefix_len + suffix.len;
-
- uint8_t* result = reinterpret_cast<uint8_t*>(malloc(buffer[i].len));
- memcpy(result, last_value_.ptr, prefix_len);
- memcpy(result + prefix_len, suffix.ptr, suffix.len);
-
- buffer[i].ptr = result;
- last_value_ = buffer[i];
- }
- this->num_values_ -= max_values;
- return max_values;
- }
-
- private:
- DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
- DeltaLengthByteArrayDecoder suffix_decoder_;
- ByteArray last_value_;
-};
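-
-// A small worked example of the prefix/suffix scheme handled above
-// (illustrative): encoding {"axis", "axle"} stores the second value as
-// prefix_len = 2 (the shared "ax") plus the suffix "le"; Decode() rebuilds it
-// by copying prefix_len bytes from last_value_ and appending the suffix.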
-
-// ----------------------------------------------------------------------
-// BYTE_STREAM_SPLIT
-
-template <typename DType>
-class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
- public:
- using T = typename DType::c_type;
- explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr);
-
- int Decode(T* buffer, int max_values) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) override;
-
- void SetData(int num_values, const uint8_t* data, int len) override;
-
- T* EnsureDecodeBuffer(int64_t min_values) {
- const int64_t size = sizeof(T) * min_values;
- if (!decode_buffer_ || decode_buffer_->size() < size) {
- PARQUET_ASSIGN_OR_THROW(decode_buffer_, ::arrow::AllocateBuffer(size));
- }
- return reinterpret_cast<T*>(decode_buffer_->mutable_data());
- }
-
- private:
- int num_values_in_buffer_{0};
- std::shared_ptr<Buffer> decode_buffer_;
-
- static constexpr size_t kNumStreams = sizeof(T);
-};
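-
-// BYTE_STREAM_SPLIT stores the k-th byte of every value contiguously, one
-// stream per byte position (kNumStreams == sizeof(T)). An illustrative sketch
-// for two floats A and B with bytes A0..A3 and B0..B3:
-//
-//   encoded: A0 B0 | A1 B1 | A2 B2 | A3 B3
-//
-// Decode() undoes this transposition; the scalar fallback in DecodeArrow()
-// below gathers byte b of value j from position b * num_values_in_buffer_ + j.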
-
-template <typename DType>
-ByteStreamSplitDecoder<DType>::ByteStreamSplitDecoder(const ColumnDescriptor* descr)
- : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {}
-
-template <typename DType>
-void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t* data,
- int len) {
- DecoderImpl::SetData(num_values, data, len);
- if (num_values * static_cast<int64_t>(sizeof(T)) > len) {
- throw ParquetException("Data size too small for number of values (corrupted file?)");
- }
- num_values_in_buffer_ = num_values;
-}
-
-template <typename DType>
-int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
- const int values_to_decode = std::min(num_values_, max_values);
- const int num_decoded_previously = num_values_in_buffer_ - num_values_;
- const uint8_t* data = data_ + num_decoded_previously;
-
- ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_to_decode,
- num_values_in_buffer_, buffer);
- num_values_ -= values_to_decode;
- len_ -= sizeof(T) * values_to_decode;
- return values_to_decode;
-}
-
-template <typename DType>
-int ByteStreamSplitDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) {
- constexpr int value_size = static_cast<int>(kNumStreams);
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- const int num_decoded_previously = num_values_in_buffer_ - num_values_;
- const uint8_t* data = data_ + num_decoded_previously;
- int offset = 0;
-
-#if defined(ARROW_HAVE_SIMD_SPLIT)
- // Use fast decoding into intermediate buffer. This will also decode
- // some null values, but it's fast enough that we don't care.
- T* decode_out = EnsureDecodeBuffer(values_decoded);
- ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_decoded,
- num_values_in_buffer_, decode_out);
-
- // XXX If null_count is 0, we could even append in bulk or decode directly into
- // builder
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- builder->UnsafeAppend(decode_out[offset]);
- ++offset;
- },
- [&]() { builder->UnsafeAppendNull(); });
-
-#else
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * num_values_in_buffer_ + offset;
- gathered_byte_data[b] = data[byte_index];
- }
- builder->UnsafeAppend(::arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]));
- ++offset;
- },
- [&]() { builder->UnsafeAppendNull(); });
-#endif
-
- num_values_ -= values_decoded;
- len_ -= sizeof(T) * values_decoded;
- return values_decoded;
-}
-
-template <typename DType>
-int ByteStreamSplitDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder");
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// Encoder and decoder factory functions
-
-std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encoding,
- bool use_dictionary, const ColumnDescriptor* descr,
- MemoryPool* pool) {
- if (use_dictionary) {
- switch (type_num) {
- case Type::INT32:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<Int32Type>(descr, pool));
- case Type::INT64:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<Int64Type>(descr, pool));
- case Type::INT96:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<Int96Type>(descr, pool));
- case Type::FLOAT:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<DoubleType>(descr, pool));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<ByteArrayType>(descr, pool));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<FLBAType>(descr, pool));
- default:
- DCHECK(false) << "Encoder not implemented";
- break;
- }
- } else if (encoding == Encoding::PLAIN) {
- switch (type_num) {
- case Type::BOOLEAN:
- return std::unique_ptr<Encoder>(new PlainEncoder<BooleanType>(descr, pool));
- case Type::INT32:
- return std::unique_ptr<Encoder>(new PlainEncoder<Int32Type>(descr, pool));
- case Type::INT64:
- return std::unique_ptr<Encoder>(new PlainEncoder<Int64Type>(descr, pool));
- case Type::INT96:
- return std::unique_ptr<Encoder>(new PlainEncoder<Int96Type>(descr, pool));
- case Type::FLOAT:
- return std::unique_ptr<Encoder>(new PlainEncoder<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Encoder>(new PlainEncoder<DoubleType>(descr, pool));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new PlainEncoder<ByteArrayType>(descr, pool));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new PlainEncoder<FLBAType>(descr, pool));
- default:
- DCHECK(false) << "Encoder not implemented";
- break;
- }
- } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
- switch (type_num) {
- case Type::FLOAT:
- return std::unique_ptr<Encoder>(
- new ByteStreamSplitEncoder<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Encoder>(
- new ByteStreamSplitEncoder<DoubleType>(descr, pool));
- default:
- throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
- break;
- }
- } else {
- ParquetException::NYI("Selected encoding is not supported");
- }
- DCHECK(false) << "Should not be able to reach this code";
- return nullptr;
-}
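-
-// Minimal usage sketch for the factory above (illustrative; `descr`, `pool`,
-// `values` and `num_values` are assumed to describe a valid INT32 column):
-//
-//   std::unique_ptr<Encoder> base = MakeEncoder(
-//       Type::INT32, Encoding::PLAIN, /*use_dictionary=*/false, descr, pool);
-//   auto* typed = dynamic_cast<TypedEncoder<Int32Type>*>(base.get());
-//   typed->Put(values, num_values);
-//   std::shared_ptr<Buffer> page = typed->FlushValues();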
-
-std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
- const ColumnDescriptor* descr) {
- if (encoding == Encoding::PLAIN) {
- switch (type_num) {
- case Type::BOOLEAN:
- return std::unique_ptr<Decoder>(new PlainBooleanDecoder(descr));
- case Type::INT32:
- return std::unique_ptr<Decoder>(new PlainDecoder<Int32Type>(descr));
- case Type::INT64:
- return std::unique_ptr<Decoder>(new PlainDecoder<Int64Type>(descr));
- case Type::INT96:
- return std::unique_ptr<Decoder>(new PlainDecoder<Int96Type>(descr));
- case Type::FLOAT:
- return std::unique_ptr<Decoder>(new PlainDecoder<FloatType>(descr));
- case Type::DOUBLE:
- return std::unique_ptr<Decoder>(new PlainDecoder<DoubleType>(descr));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new PlainByteArrayDecoder(descr));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new PlainFLBADecoder(descr));
- default:
- break;
- }
- } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
- switch (type_num) {
- case Type::FLOAT:
- return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<FloatType>(descr));
- case Type::DOUBLE:
- return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<DoubleType>(descr));
- default:
- throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
- break;
- }
- } else {
- ParquetException::NYI("Selected encoding is not supported");
- }
- DCHECK(false) << "Should not be able to reach this code";
- return nullptr;
-}
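-
-// Mirror-image sketch for decoding (illustrative; `page_data` and `page_len`
-// are assumed to hold a PLAIN-encoded INT32 data page):
-//
-//   std::unique_ptr<Decoder> base = MakeDecoder(Type::INT32, Encoding::PLAIN, descr);
-//   auto* typed = dynamic_cast<TypedDecoder<Int32Type>*>(base.get());
-//   typed->SetData(num_values, page_data, page_len);
-//   int decoded = typed->Decode(out, num_values);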
-
-namespace detail {
-std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
- const ColumnDescriptor* descr,
- MemoryPool* pool) {
- switch (type_num) {
- case Type::BOOLEAN:
- ParquetException::NYI("Dictionary encoding not implemented for boolean type");
- case Type::INT32:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<Int32Type>(descr, pool));
- case Type::INT64:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<Int64Type>(descr, pool));
- case Type::INT96:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<Int96Type>(descr, pool));
- case Type::FLOAT:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<DoubleType>(descr, pool));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new DictByteArrayDecoderImpl(descr, pool));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<FLBAType>(descr, pool));
- default:
- break;
- }
- DCHECK(false) << "Should not be able to reach this code";
- return nullptr;
-}
-
-} // namespace detail
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encoding.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/byte_stream_split.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace BitUtil = arrow::BitUtil;
+
+using arrow::Status;
+using arrow::VisitNullBitmapInline;
+using arrow::internal::checked_cast;
+
+template <typename T>
+using ArrowPoolVector = std::vector<T, ::arrow::stl::allocator<T>>;
+
+namespace parquet {
+namespace {
+
+constexpr int64_t kInMemoryDefaultCapacity = 1024;
+// The Parquet spec isn't very clear whether ByteArray lengths are signed or
+// unsigned, but the Java implementation uses signed ints.
+constexpr size_t kMaxByteArraySize = std::numeric_limits<int32_t>::max();
+
+class EncoderImpl : virtual public Encoder {
+ public:
+ EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, MemoryPool* pool)
+ : descr_(descr),
+ encoding_(encoding),
+ pool_(pool),
+ type_length_(descr ? descr->type_length() : -1) {}
+
+ Encoding::type encoding() const override { return encoding_; }
+
+ MemoryPool* memory_pool() const override { return pool_; }
+
+ protected:
+ // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+ const ColumnDescriptor* descr_;
+ const Encoding::type encoding_;
+ MemoryPool* pool_;
+
+ /// Type length from descr
+ int type_length_;
+};
+
+// ----------------------------------------------------------------------
+// Plain encoder implementation
+
+template <typename DType>
+class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::PLAIN, pool), sink_(pool) {}
+
+ int64_t EstimatedDataEncodedSize() override { return sink_.length(); }
+
+ std::shared_ptr<Buffer> FlushValues() override {
+ std::shared_ptr<Buffer> buffer;
+ PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
+ return buffer;
+ }
+
+ using TypedEncoder<DType>::Put;
+
+ void Put(const T* buffer, int num_values) override;
+
+ void Put(const ::arrow::Array& values) override;
+
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+ }
+
+ void UnsafePutByteArray(const void* data, uint32_t length) {
+ DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL";
+ sink_.UnsafeAppend(&length, sizeof(uint32_t));
+ sink_.UnsafeAppend(data, static_cast<int64_t>(length));
+ }
+
+ void Put(const ByteArray& val) {
+ // Write the result to the output stream
+ const int64_t increment = static_cast<int64_t>(val.len + sizeof(uint32_t));
+ if (ARROW_PREDICT_FALSE(sink_.length() + increment > sink_.capacity())) {
+ PARQUET_THROW_NOT_OK(sink_.Reserve(increment));
+ }
+ UnsafePutByteArray(val.ptr, val.len);
+ }
+
+ protected:
+ template <typename ArrayType>
+ void PutBinaryArray(const ArrayType& array) {
+ const int64_t total_bytes =
+ array.value_offset(array.length()) - array.value_offset(0);
+ PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes + array.length() * sizeof(uint32_t)));
+
+ PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
+ *array.data(),
+ [&](::arrow::util::string_view view) {
+ if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
+ return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ }
+ UnsafePutByteArray(view.data(), static_cast<uint32_t>(view.size()));
+ return Status::OK();
+ },
+ []() { return Status::OK(); }));
+ }
+
+ ::arrow::BufferBuilder sink_;
+};
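+
+// For BYTE_ARRAY, UnsafePutByteArray() above emits the standard PLAIN layout:
+// a 4-byte length prefix followed by the raw bytes. Illustrative bytes
+// (little-endian lengths) for the two values {"hi", ""}:
+//
+//   02 00 00 00 'h' 'i'  00 00 00 00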
+
+template <typename DType>
+void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
+ if (num_values > 0) {
+ PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+ }
+}
+
+template <>
+inline void PlainEncoder<ByteArrayType>::Put(const ByteArray* src, int num_values) {
+ for (int i = 0; i < num_values; ++i) {
+ Put(src[i]);
+ }
+}
+
+template <typename ArrayType>
+void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) {
+ if (values.type_id() != ArrayType::TypeClass::type_id) {
+ std::string type_name = ArrayType::TypeClass::type_name();
+ throw ParquetException("direct put to " + type_name + " from " +
+ values.type()->ToString() + " not supported");
+ }
+
+ using value_type = typename ArrayType::value_type;
+ constexpr auto value_size = sizeof(value_type);
+ auto raw_values = checked_cast<const ArrayType&>(values).raw_values();
+
+ if (values.null_count() == 0) {
+ // no nulls, just dump the data
+ PARQUET_THROW_NOT_OK(sink->Append(raw_values, values.length() * value_size));
+ } else {
+ PARQUET_THROW_NOT_OK(
+ sink->Reserve((values.length() - values.null_count()) * value_size));
+
+ for (int64_t i = 0; i < values.length(); i++) {
+ if (values.IsValid(i)) {
+ sink->UnsafeAppend(&raw_values[i], value_size);
+ }
+ }
+ }
+}
+
+template <>
+void PlainEncoder<Int32Type>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::Int32Array>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<Int64Type>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::Int64Array>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<Int96Type>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("direct put to Int96");
+}
+
+template <>
+void PlainEncoder<FloatType>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::FloatArray>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::DoubleArray>(values, &sink_);
+}
+
+template <typename DType>
+void PlainEncoder<DType>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("direct put of " + values.type()->ToString());
+}
+
+void AssertBaseBinary(const ::arrow::Array& values) {
+ if (!::arrow::is_base_binary_like(values.type_id())) {
+ throw ParquetException("Only BaseBinaryArray and subclasses supported");
+ }
+}
+
+template <>
+inline void PlainEncoder<ByteArrayType>::Put(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+void AssertFixedSizeBinary(const ::arrow::Array& values, int type_length) {
+ if (values.type_id() != ::arrow::Type::FIXED_SIZE_BINARY &&
+ values.type_id() != ::arrow::Type::DECIMAL) {
+ throw ParquetException("Only FixedSizeBinaryArray and subclasses supported");
+ }
+ if (checked_cast<const ::arrow::FixedSizeBinaryType&>(*values.type()).byte_width() !=
+ type_length) {
+ throw ParquetException("Size mismatch: " + values.type()->ToString() +
+ " should have been " + std::to_string(type_length) + " wide");
+ }
+}
+
+template <>
+inline void PlainEncoder<FLBAType>::Put(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, descr_->type_length());
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(data.raw_values(), data.length() * data.byte_width()));
+ } else {
+ const int64_t total_bytes =
+ data.length() * data.byte_width() - data.null_count() * data.byte_width();
+ PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes));
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ sink_.UnsafeAppend(data.Value(i), data.byte_width());
+ }
+ }
+ }
+}
+
+template <>
+inline void PlainEncoder<FLBAType>::Put(const FixedLenByteArray* src, int num_values) {
+ if (descr_->type_length() == 0) {
+ return;
+ }
+ for (int i = 0; i < num_values; ++i) {
+ // Write the result to the output stream
+ DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL";
+ PARQUET_THROW_NOT_OK(sink_.Append(src[i].ptr, descr_->type_length()));
+ }
+}
+
+template <>
+class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEncoder {
+ public:
+ explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::PLAIN, pool),
+ bits_available_(kInMemoryDefaultCapacity * 8),
+ bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
+ sink_(pool),
+ bit_writer_(bits_buffer_->mutable_data(),
+ static_cast<int>(bits_buffer_->size())) {}
+
+ int64_t EstimatedDataEncodedSize() override;
+ std::shared_ptr<Buffer> FlushValues() override;
+
+ void Put(const bool* src, int num_values) override;
+
+ void Put(const std::vector<bool>& src, int num_values) override;
+
+ void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+ }
+
+ void Put(const ::arrow::Array& values) override {
+ if (values.type_id() != ::arrow::Type::BOOL) {
+ throw ParquetException("direct put to boolean from " + values.type()->ToString() +
+ " not supported");
+ }
+
+ const auto& data = checked_cast<const ::arrow::BooleanArray&>(values);
+ if (data.null_count() == 0) {
+ PARQUET_THROW_NOT_OK(sink_.Reserve(BitUtil::BytesForBits(data.length())));
+ // no nulls, just dump the data
+ ::arrow::internal::CopyBitmap(data.data()->GetValues<uint8_t>(1), data.offset(),
+ data.length(), sink_.mutable_data(), sink_.length());
+ } else {
+ auto n_valid = BitUtil::BytesForBits(data.length() - data.null_count());
+ PARQUET_THROW_NOT_OK(sink_.Reserve(n_valid));
+ ::arrow::internal::FirstTimeBitmapWriter writer(sink_.mutable_data(),
+ sink_.length(), n_valid);
+
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ if (data.Value(i)) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ }
+ }
+ writer.Finish();
+ }
+ sink_.UnsafeAdvance(data.length());
+ }
+
+ private:
+ int bits_available_;
+ std::shared_ptr<ResizableBuffer> bits_buffer_;
+ ::arrow::BufferBuilder sink_;
+ ::arrow::BitUtil::BitWriter bit_writer_;
+
+ template <typename SequenceType>
+ void PutImpl(const SequenceType& src, int num_values);
+};
+
+template <typename SequenceType>
+void PlainEncoder<BooleanType>::PutImpl(const SequenceType& src, int num_values) {
+ int bit_offset = 0;
+ if (bits_available_ > 0) {
+ int bits_to_write = std::min(bits_available_, num_values);
+ for (int i = 0; i < bits_to_write; i++) {
+ bit_writer_.PutValue(src[i], 1);
+ }
+ bits_available_ -= bits_to_write;
+ bit_offset = bits_to_write;
+
+ if (bits_available_ == 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ }
+ }
+
+ int bits_remaining = num_values - bit_offset;
+ while (bit_offset < num_values) {
+ bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+
+ int bits_to_write = std::min(bits_available_, bits_remaining);
+ for (int i = bit_offset; i < bit_offset + bits_to_write; i++) {
+ bit_writer_.PutValue(src[i], 1);
+ }
+ bit_offset += bits_to_write;
+ bits_available_ -= bits_to_write;
+ bits_remaining -= bits_to_write;
+
+ if (bits_available_ == 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ }
+ }
+}
+
+int64_t PlainEncoder<BooleanType>::EstimatedDataEncodedSize() {
+ int64_t position = sink_.length();
+ return position + bit_writer_.bytes_written();
+}
+
+std::shared_ptr<Buffer> PlainEncoder<BooleanType>::FlushValues() {
+ if (bits_available_ > 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+ }
+
+ std::shared_ptr<Buffer> buffer;
+ PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
+ return buffer;
+}
+
+void PlainEncoder<BooleanType>::Put(const bool* src, int num_values) {
+ PutImpl(src, num_values);
+}
+
+void PlainEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
+ PutImpl(src, num_values);
+}
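+
+// PutImpl() above bit-packs one bit per value, LSB-first, flushing
+// bits_buffer_ into sink_ whenever it fills. Illustrative: the eight values
+// {1,0,1,1,0,0,0,0} pack into the single byte 0b00001101 (0x0D).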
+
+// ----------------------------------------------------------------------
+// DictEncoder<T> implementations
+
+template <typename DType>
+struct DictEncoderTraits {
+ using c_type = typename DType::c_type;
+ using MemoTableType = ::arrow::internal::ScalarMemoTable<c_type>;
+};
+
+template <>
+struct DictEncoderTraits<ByteArrayType> {
+ using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
+};
+
+template <>
+struct DictEncoderTraits<FLBAType> {
+ using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
+};
+
+// Initially 1024 elements
+static constexpr int32_t kInitialHashTableSize = 1 << 10;
+
+/// See the dictionary encoding section of
+/// https://github.com/Parquet/parquet-format. The encoding is streamable:
+/// values are encoded as they are added while the
+/// dictionary is being constructed. At any time, the buffered values
+/// can be written out with the current dictionary size. More values
+/// can then be added to the encoder, including new dictionary
+/// entries.
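+///
+/// An illustrative streaming sketch (`descr`, `pool`, `values` and
+/// `num_values` are placeholders, not part of this API):
+///
+///   DictEncoderImpl<Int32Type> enc(descr, pool);
+///   enc.Put(values, num_values);            // memoize values, buffer indices
+///   auto indices_page = enc.FlushValues();  // RLE-encoded index page
+///   auto dict_page = AllocateBuffer(pool, enc.dict_encoded_size());
+///   enc.WriteDict(dict_page->mutable_data());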
+template <typename DType>
+class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
+ using MemoTableType = typename DictEncoderTraits<DType>::MemoTableType;
+
+ public:
+ typedef typename DType::c_type T;
+
+ explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool)
+ : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool),
+ buffered_indices_(::arrow::stl::allocator<int32_t>(pool)),
+ dict_encoded_size_(0),
+ memo_table_(pool, kInitialHashTableSize) {}
+
+ ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); }
+
+ int dict_encoded_size() override { return dict_encoded_size_; }
+
+ int WriteIndices(uint8_t* buffer, int buffer_len) override {
+ // Write bit width in first byte
+ *buffer = static_cast<uint8_t>(bit_width());
+ ++buffer;
+ --buffer_len;
+
+ ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
+
+ for (int32_t index : buffered_indices_) {
+ if (!encoder.Put(index)) return -1;
+ }
+ encoder.Flush();
+
+ ClearIndices();
+ return 1 + encoder.len();
+ }
+
+ void set_type_length(int type_length) { this->type_length_ = type_length; }
+
+ /// Returns a conservative estimate of the number of bytes needed to encode the buffered
+ /// indices. Used to size the buffer passed to WriteIndices().
+ int64_t EstimatedDataEncodedSize() override {
+    // Note: because of the way RleEncoder::CheckBufferFull() is called, we
+    // have to reserve an extra "RleEncoder::MinBufferSize" bytes. These extra
+    // bytes won't be used but not reserving them would cause the encoder to
+    // fail.
+ return 1 +
+ ::arrow::util::RleEncoder::MaxBufferSize(
+ bit_width(), static_cast<int>(buffered_indices_.size())) +
+ ::arrow::util::RleEncoder::MinBufferSize(bit_width());
+ }
+
+ /// The minimum bit width required to encode the currently buffered indices.
+ int bit_width() const override {
+ if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0;
+ if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1;
+ return BitUtil::Log2(num_entries());
+ }
+
+ /// Encode value. Note that this does not actually write any data, just
+ /// buffers the value's index to be written later.
+ inline void Put(const T& value);
+
+ // Not implemented for other data types
+ inline void PutByteArray(const void* ptr, int32_t length);
+
+ void Put(const T* src, int num_values) override {
+ for (int32_t i = 0; i < num_values; i++) {
+ Put(src[i]);
+ }
+ }
+
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ ::arrow::internal::VisitSetBitRunsVoid(valid_bits, valid_bits_offset, num_values,
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ Put(src[i + position]);
+ }
+ });
+ }
+
+ using TypedEncoder<DType>::Put;
+
+ void Put(const ::arrow::Array& values) override;
+ void PutDictionary(const ::arrow::Array& values) override;
+
+ template <typename ArrowType, typename T = typename ArrowType::c_type>
+ void PutIndicesTyped(const ::arrow::Array& data) {
+ auto values = data.data()->GetValues<T>(1);
+ size_t buffer_position = buffered_indices_.size();
+ buffered_indices_.resize(buffer_position +
+ static_cast<size_t>(data.length() - data.null_count()));
+ ::arrow::internal::VisitSetBitRunsVoid(
+ data.null_bitmap_data(), data.offset(), data.length(),
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; ++i) {
+ buffered_indices_[buffer_position++] =
+ static_cast<int32_t>(values[i + position]);
+ }
+ });
+ }
+
+ void PutIndices(const ::arrow::Array& data) override {
+ switch (data.type()->id()) {
+ case ::arrow::Type::UINT8:
+ case ::arrow::Type::INT8:
+ return PutIndicesTyped<::arrow::UInt8Type>(data);
+ case ::arrow::Type::UINT16:
+ case ::arrow::Type::INT16:
+ return PutIndicesTyped<::arrow::UInt16Type>(data);
+ case ::arrow::Type::UINT32:
+ case ::arrow::Type::INT32:
+ return PutIndicesTyped<::arrow::UInt32Type>(data);
+ case ::arrow::Type::UINT64:
+ case ::arrow::Type::INT64:
+ return PutIndicesTyped<::arrow::UInt64Type>(data);
+ default:
+ throw ParquetException("Passed non-integer array to PutIndices");
+ }
+ }
+
+ std::shared_ptr<Buffer> FlushValues() override {
+ std::shared_ptr<ResizableBuffer> buffer =
+ AllocateBuffer(this->pool_, EstimatedDataEncodedSize());
+ int result_size = WriteIndices(buffer->mutable_data(),
+ static_cast<int>(EstimatedDataEncodedSize()));
+ PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false));
+ return std::move(buffer);
+ }
+
+ /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+ /// dict_encoded_size() bytes.
+ void WriteDict(uint8_t* buffer) override;
+
+ /// The number of entries in the dictionary.
+ int num_entries() const override { return memo_table_.size(); }
+
+ private:
+ /// Clears all the indices (but leaves the dictionary).
+ void ClearIndices() { buffered_indices_.clear(); }
+
+  /// Indices that have not yet been written out by WriteIndices().
+ ArrowPoolVector<int32_t> buffered_indices_;
+
+ template <typename ArrayType>
+ void PutBinaryArray(const ArrayType& array) {
+ PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
+ *array.data(),
+ [&](::arrow::util::string_view view) {
+ if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
+ return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ }
+ PutByteArray(view.data(), static_cast<uint32_t>(view.size()));
+ return Status::OK();
+ },
+ []() { return Status::OK(); }));
+ }
+
+ template <typename ArrayType>
+ void PutBinaryDictionaryArray(const ArrayType& array) {
+ DCHECK_EQ(array.null_count(), 0);
+ for (int64_t i = 0; i < array.length(); i++) {
+ auto v = array.GetView(i);
+ if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) {
+ throw ParquetException("Parquet cannot store strings with size 2GB or more");
+ }
+ dict_encoded_size_ += static_cast<int>(v.size() + sizeof(uint32_t));
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(
+ v.data(), static_cast<int32_t>(v.size()), &unused_memo_index));
+ }
+ }
+
+ /// The number of bytes needed to encode the dictionary.
+ int dict_encoded_size_;
+
+ MemoTableType memo_table_;
+};
+
+template <typename DType>
+void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) {
+ // For primitive types, only a memcpy
+ DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
+ memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
+}
+
+// ByteArray and FLBA already have the dictionary encoded in their data heaps
+template <>
+void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) {
+ memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) {
+ uint32_t len = static_cast<uint32_t>(v.length());
+ memcpy(buffer, &len, sizeof(len));
+ buffer += sizeof(len);
+ memcpy(buffer, v.data(), len);
+ buffer += len;
+ });
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::WriteDict(uint8_t* buffer) {
+ memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
+ DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
+ memcpy(buffer, v.data(), type_length_);
+ buffer += type_length_;
+ });
+}
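+
+// Illustrative dictionary-page payloads produced by the WriteDict() overloads
+// above: primitive types are the raw values back to back, BYTE_ARRAY entries
+// are <uint32 length><bytes> pairs, and FLBA entries are runs of exactly
+// type_length_ bytes each.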
+
+template <typename DType>
+inline void DictEncoderImpl<DType>::Put(const T& v) {
+ // Put() implementation for primitive types
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [this](int32_t memo_index) {
+ dict_encoded_size_ += static_cast<int>(sizeof(T));
+ };
+
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(v, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <typename DType>
+inline void DictEncoderImpl<DType>::PutByteArray(const void* ptr, int32_t length) {
+ DCHECK(false);
+}
+
+template <>
+inline void DictEncoderImpl<ByteArrayType>::PutByteArray(const void* ptr,
+ int32_t length) {
+ static const uint8_t empty[] = {0};
+
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [&](int32_t memo_index) {
+ dict_encoded_size_ += static_cast<int>(length + sizeof(uint32_t));
+ };
+
+ DCHECK(ptr != nullptr || length == 0);
+ ptr = (ptr != nullptr) ? ptr : empty;
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(ptr, length, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <>
+inline void DictEncoderImpl<ByteArrayType>::Put(const ByteArray& val) {
+ return PutByteArray(val.ptr, static_cast<int32_t>(val.len));
+}
+
+template <>
+inline void DictEncoderImpl<FLBAType>::Put(const FixedLenByteArray& v) {
+ static const uint8_t empty[] = {0};
+
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [this](int32_t memo_index) { dict_encoded_size_ += type_length_; };
+
+ DCHECK(v.ptr != nullptr || type_length_ == 0);
+ const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(ptr, type_length_, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <>
+void DictEncoderImpl<Int96Type>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("Direct put to Int96");
+}
+
+template <>
+void DictEncoderImpl<Int96Type>::PutDictionary(const ::arrow::Array& values) {
+ ParquetException::NYI("Direct put to Int96");
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::Put(const ::arrow::Array& values) {
+ using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
+ const auto& data = checked_cast<const ArrayType&>(values);
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ for (int64_t i = 0; i < data.length(); i++) {
+ Put(data.Value(i));
+ }
+ } else {
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ Put(data.Value(i));
+ }
+ }
+ }
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::Put(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, type_length_);
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ for (int64_t i = 0; i < data.length(); i++) {
+ Put(FixedLenByteArray(data.Value(i)));
+ }
+ } else {
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ Put(FixedLenByteArray(data.Value(i)));
+ }
+ }
+ }
+}
+
+template <>
+void DictEncoderImpl<ByteArrayType>::Put(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+template <typename DType>
+void AssertCanPutDictionary(DictEncoderImpl<DType>* encoder, const ::arrow::Array& dict) {
+ if (dict.null_count() > 0) {
+ throw ParquetException("Inserted dictionary cannot cannot contain nulls");
+ }
+
+ if (encoder->num_entries() > 0) {
+ throw ParquetException("Can only call PutDictionary on an empty DictEncoder");
+ }
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::PutDictionary(const ::arrow::Array& values) {
+ AssertCanPutDictionary(this, values);
+
+ using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
+ const auto& data = checked_cast<const ArrayType&>(values);
+
+ dict_encoded_size_ += static_cast<int>(sizeof(typename DType::c_type) * data.length());
+ for (int64_t i = 0; i < data.length(); i++) {
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(data.Value(i), &unused_memo_index));
+ }
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::PutDictionary(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, type_length_);
+ AssertCanPutDictionary(this, values);
+
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+
+ dict_encoded_size_ += static_cast<int>(type_length_ * data.length());
+ for (int64_t i = 0; i < data.length(); i++) {
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(data.Value(i), type_length_, &unused_memo_index));
+ }
+}
+
+template <>
+void DictEncoderImpl<ByteArrayType>::PutDictionary(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+ AssertCanPutDictionary(this, values);
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryDictionaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryDictionaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+// ----------------------------------------------------------------------
+// ByteStreamSplitEncoder<T> implementations
+
+template <typename DType>
+class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ using TypedEncoder<DType>::Put;
+
+ explicit ByteStreamSplitEncoder(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ int64_t EstimatedDataEncodedSize() override;
+ std::shared_ptr<Buffer> FlushValues() override;
+
+ void Put(const T* buffer, int num_values) override;
+ void Put(const ::arrow::Array& values) override;
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override;
+
+ protected:
+ template <typename ArrowType>
+ void PutImpl(const ::arrow::Array& values) {
+ if (values.type_id() != ArrowType::type_id) {
+ throw ParquetException(std::string() + "direct put to " + ArrowType::type_name() +
+ " from " + values.type()->ToString() + " not supported");
+ }
+ const auto& data = *values.data();
+ PutSpaced(data.GetValues<typename ArrowType::c_type>(1),
+ static_cast<int>(data.length), data.GetValues<uint8_t>(0, 0), data.offset);
+ }
+
+ ::arrow::BufferBuilder sink_;
+ int64_t num_values_in_buffer_;
+};
+
+template <typename DType>
+ByteStreamSplitEncoder<DType>::ByteStreamSplitEncoder(const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool),
+ sink_{pool},
+ num_values_in_buffer_{0} {}
+
+template <typename DType>
+int64_t ByteStreamSplitEncoder<DType>::EstimatedDataEncodedSize() {
+ return sink_.length();
+}
+
+template <typename DType>
+std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
+ std::shared_ptr<ResizableBuffer> output_buffer =
+ AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize());
+ uint8_t* output_buffer_raw = output_buffer->mutable_data();
+ const uint8_t* raw_values = sink_.data();
+ ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values_in_buffer_,
+ output_buffer_raw);
+ sink_.Reset();
+ num_values_in_buffer_ = 0;
+ return std::move(output_buffer);
+}
+
+template <typename DType>
+void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
+ if (num_values > 0) {
+ PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+ num_values_in_buffer_ += num_values;
+ }
+}
+
+template <>
+void ByteStreamSplitEncoder<FloatType>::Put(const ::arrow::Array& values) {
+ PutImpl<::arrow::FloatType>(values);
+}
+
+template <>
+void ByteStreamSplitEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+ PutImpl<::arrow::DoubleType>(values);
+}
+
+template <typename DType>
+void ByteStreamSplitEncoder<DType>::PutSpaced(const T* src, int num_values,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+}
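+
+// Illustrative sketch, not part of the original file: BYTE_STREAM_SPLIT
+// scatters byte b of value i to output position b * num_values + i, so all
+// first bytes are stored contiguously, then all second bytes, and so on.
+// A scalar reference version of the transform performed by FlushValues():
+//
+//   template <typename T>
+//   void ByteStreamSplitReference(const uint8_t* in, int64_t num_values,
+//                                 uint8_t* out) {
+//     for (int64_t i = 0; i < num_values; ++i) {
+//       for (size_t b = 0; b < sizeof(T); ++b) {
+//         out[b * num_values + i] = in[i * sizeof(T) + b];
+//       }
+//     }
+//   }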
+
+class DecoderImpl : virtual public Decoder {
+ public:
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ data_ = data;
+ len_ = len;
+ }
+
+ int values_left() const override { return num_values_; }
+ Encoding::type encoding() const override { return encoding_; }
+
+ protected:
+ explicit DecoderImpl(const ColumnDescriptor* descr, Encoding::type encoding)
+ : descr_(descr), encoding_(encoding), num_values_(0), data_(NULLPTR), len_(0) {}
+
+ // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+ const ColumnDescriptor* descr_;
+
+ const Encoding::type encoding_;
+ int num_values_;
+ const uint8_t* data_;
+ int len_;
+ int type_length_;
+};
+
+template <typename DType>
+class PlainDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ explicit PlainDecoder(const ColumnDescriptor* descr);
+
+ int Decode(T* buffer, int max_values) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) override;
+};
+
+template <>
+inline int PlainDecoder<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow not supported for Int96");
+}
+
+template <>
+inline int PlainDecoder<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow not supported for Int96");
+}
+
+template <>
+inline int PlainDecoder<BooleanType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("dictionaries of BooleanType");
+}
+
+template <typename DType>
+int PlainDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) {
+ using value_type = typename DType::c_type;
+
+ constexpr int value_size = static_cast<int>(sizeof(value_type));
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(::arrow::util::SafeLoadAs<value_type>(data_));
+ data_ += sizeof(value_type);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(value_type) * values_decoded;
+ return values_decoded;
+}
+
+template <typename DType>
+int PlainDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ using value_type = typename DType::c_type;
+
+ constexpr int value_size = static_cast<int>(sizeof(value_type));
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ PARQUET_THROW_NOT_OK(
+ builder->Append(::arrow::util::SafeLoadAs<value_type>(data_)));
+ data_ += sizeof(value_type);
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(value_type) * values_decoded;
+ return values_decoded;
+}
+
+// Decode routine templated on C++ type rather than type enum
+template <typename T>
+inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
+ int type_length, T* out) {
+ int64_t bytes_to_decode = num_values * static_cast<int64_t>(sizeof(T));
+ if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
+ ParquetException::EofException();
+ }
+ // If bytes_to_decode == 0, data could be null
+ if (bytes_to_decode > 0) {
+ memcpy(out, data, bytes_to_decode);
+ }
+ return static_cast<int>(bytes_to_decode);
+}
+
+template <typename DType>
+PlainDecoder<DType>::PlainDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::PLAIN) {
+ if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
+ type_length_ = descr_->type_length();
+ } else {
+ type_length_ = -1;
+ }
+}
+
+// Template specialization for BYTE_ARRAY. The written values do not own their
+// own data.
+
+static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size,
+ ByteArray* out) {
+ if (ARROW_PREDICT_FALSE(data_size < 4)) {
+ ParquetException::EofException();
+ }
+ const int32_t len = ::arrow::util::SafeLoadAs<int32_t>(data);
+ if (len < 0) {
+ throw ParquetException("Invalid BYTE_ARRAY value");
+ }
+ const int64_t consumed_length = static_cast<int64_t>(len) + 4;
+ if (ARROW_PREDICT_FALSE(data_size < consumed_length)) {
+ ParquetException::EofException();
+ }
+ *out = ByteArray{static_cast<uint32_t>(len), data + 4};
+ return consumed_length;
+}
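+
+// For reference (illustrative, not part of the original file): PLAIN-encoded
+// BYTE_ARRAY data is a sequence of <4-byte little-endian length><raw bytes>
+// records. For example, the strings "ab" and "c" encode as
+//
+//   02 00 00 00 61 62 01 00 00 00 63
+//
+// so ReadByteArray() consumes 6 bytes for the first value and 5 for the
+// second, returning ByteArrays that point into the input buffer.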
+
+template <>
+inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int num_values,
+ int type_length, ByteArray* out) {
+ int bytes_decoded = 0;
+ for (int i = 0; i < num_values; ++i) {
+ const auto increment = ReadByteArray(data, data_size, out + i);
+ if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) {
+ throw ParquetException("BYTE_ARRAY chunk too large");
+ }
+ data += increment;
+ data_size -= increment;
+ bytes_decoded += static_cast<int>(increment);
+ }
+ return bytes_decoded;
+}
+
+// Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not
+// own their own data.
+template <>
+inline int DecodePlain<FixedLenByteArray>(const uint8_t* data, int64_t data_size,
+ int num_values, int type_length,
+ FixedLenByteArray* out) {
+ int64_t bytes_to_decode = static_cast<int64_t>(type_length) * num_values;
+ if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
+ ParquetException::EofException();
+ }
+ for (int i = 0; i < num_values; ++i) {
+ out[i].ptr = data;
+ data += type_length;
+ data_size -= type_length;
+ }
+ return static_cast<int>(bytes_to_decode);
+}
+
+template <typename DType>
+int PlainDecoder<DType>::Decode(T* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ int bytes_consumed = DecodePlain<T>(data_, len_, max_values, type_length_, buffer);
+ data_ += bytes_consumed;
+ len_ -= bytes_consumed;
+ num_values_ -= max_values;
+ return max_values;
+}
+
+class PlainBooleanDecoder : public DecoderImpl,
+ virtual public TypedDecoder<BooleanType>,
+ virtual public BooleanDecoder {
+ public:
+ explicit PlainBooleanDecoder(const ColumnDescriptor* descr);
+ void SetData(int num_values, const uint8_t* data, int len) override;
+
+ // Two flavors of bool decoding
+ int Decode(uint8_t* buffer, int max_values) override;
+ int Decode(bool* buffer, int max_values) override;
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::Accumulator* out) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* out) override;
+
+ private:
+ std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_;
+};
+
+PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::PLAIN) {}
+
+void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
+ num_values_ = num_values;
+ bit_reader_.reset(new BitUtil::BitReader(data, len));
+}
+
+int PlainBooleanDecoder::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::Accumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ bool value;
+ ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
+ builder->UnsafeAppend(value);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ return values_decoded;
+}
+
+inline int PlainBooleanDecoder::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("dictionaries of BooleanType");
+}
+
+int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ bool val;
+ ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
+ for (int i = 0; i < max_values; ++i) {
+ if (!bit_reader_->GetValue(1, &val)) {
+ ParquetException::EofException();
+ }
+ if (val) {
+ bit_writer.Set();
+ }
+ bit_writer.Next();
+ }
+ bit_writer.Finish();
+ num_values_ -= max_values;
+ return max_values;
+}
+
+int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ if (bit_reader_->GetBatch(1, buffer, max_values) != max_values) {
+ ParquetException::EofException();
+ }
+ num_values_ -= max_values;
+ return max_values;
+}
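+
+// Worked example (illustrative, not part of the original file): PLAIN
+// booleans are bit-packed LSB-first, so the eight values
+// {true, false, true, true, false, false, false, true} occupy one byte,
+// 0b10001101 = 0x8D, and Decode(bool*, 8) above reads them back with a
+// single GetBatch(1, ...) call.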
+
+struct ArrowBinaryHelper {
+ explicit ArrowBinaryHelper(typename EncodingTraits<ByteArrayType>::Accumulator* out) {
+ this->out = out;
+ this->builder = out->builder.get();
+ this->chunk_space_remaining =
+ ::arrow::kBinaryMemoryLimit - this->builder->value_data_length();
+ }
+
+ Status PushChunk() {
+ std::shared_ptr<::arrow::Array> result;
+ RETURN_NOT_OK(builder->Finish(&result));
+ out->chunks.push_back(result);
+ chunk_space_remaining = ::arrow::kBinaryMemoryLimit;
+ return Status::OK();
+ }
+
+ bool CanFit(int64_t length) const { return length <= chunk_space_remaining; }
+
+ void UnsafeAppend(const uint8_t* data, int32_t length) {
+ chunk_space_remaining -= length;
+ builder->UnsafeAppend(data, length);
+ }
+
+ void UnsafeAppendNull() { builder->UnsafeAppendNull(); }
+
+ Status Append(const uint8_t* data, int32_t length) {
+ chunk_space_remaining -= length;
+ return builder->Append(data, length);
+ }
+
+ Status AppendNull() { return builder->AppendNull(); }
+
+ typename EncodingTraits<ByteArrayType>::Accumulator* out;
+ ::arrow::BinaryBuilder* builder;
+ int64_t chunk_space_remaining;
+};
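+
+// Usage sketch (hypothetical, not in the original file; 'decoder' stands in
+// for any ByteArrayDecoder): after a DecodeArrow call fills an Accumulator,
+// a consumer finishes the trailing builder and treats the overflow chunks
+// plus the final array as one logical column:
+//
+//   typename EncodingTraits<ByteArrayType>::Accumulator acc;
+//   acc.builder.reset(new ::arrow::BinaryBuilder);
+//   decoder->DecodeArrow(num_values, null_count, valid_bits, 0, &acc);
+//   std::shared_ptr<::arrow::Array> last;
+//   PARQUET_THROW_NOT_OK(acc.builder->Finish(&last));
+//   acc.chunks.push_back(std::move(last));
+//   ::arrow::ChunkedArray column(acc.chunks);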
+
+template <>
+inline int PlainDecoder<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
+ ParquetException::NYI();
+}
+
+template <>
+inline int PlainDecoder<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
+ ParquetException::NYI();
+}
+
+template <>
+inline int PlainDecoder<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::Accumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(data_);
+ data_ += descr_->type_length();
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ len_ -= descr_->type_length() * values_decoded;
+ return values_decoded;
+}
+
+template <>
+inline int PlainDecoder<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ PARQUET_THROW_NOT_OK(builder->Append(data_));
+ data_ += descr_->type_length();
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ num_values_ -= values_decoded;
+ len_ -= descr_->type_length() * values_decoded;
+ return values_decoded;
+}
+
+class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
+ virtual public ByteArrayDecoder {
+ public:
+ using Base = PlainDecoder<ByteArrayType>;
+ using Base::DecodeSpaced;
+ using Base::PlainDecoder;
+
+ // ----------------------------------------------------------------------
+ // Dictionary read paths
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::BinaryDictionary32Builder* builder) override {
+ int result = 0;
+ PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
+ valid_bits_offset, builder, &result));
+ return result;
+ }
+
+ // ----------------------------------------------------------------------
+ // Optimized dense binary read paths
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ int result = 0;
+ PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
+ valid_bits_offset, out, &result));
+ return result;
+ }
+
+ private:
+ Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_values_decoded) {
+ ArrowBinaryHelper helper(out);
+ int values_decoded = 0;
+
+ RETURN_NOT_OK(helper.builder->Reserve(num_values));
+ RETURN_NOT_OK(helper.builder->ReserveData(
+ std::min<int64_t>(len_, helper.chunk_space_remaining)));
+
+ int i = 0;
+ RETURN_NOT_OK(VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ if (ARROW_PREDICT_FALSE(len_ < 4)) {
+ ParquetException::EofException();
+ }
+ auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
+ return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
+ }
+ auto increment = value_len + 4;
+ if (ARROW_PREDICT_FALSE(len_ < increment)) {
+ ParquetException::EofException();
+ }
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) {
+ // This element would exceed the capacity of a chunk
+ RETURN_NOT_OK(helper.PushChunk());
+ RETURN_NOT_OK(helper.builder->Reserve(num_values - i));
+ RETURN_NOT_OK(helper.builder->ReserveData(
+ std::min<int64_t>(len_, helper.chunk_space_remaining)));
+ }
+ helper.UnsafeAppend(data_ + 4, value_len);
+ data_ += increment;
+ len_ -= increment;
+ ++values_decoded;
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ helper.UnsafeAppendNull();
+ ++i;
+ return Status::OK();
+ }));
+
+ num_values_ -= values_decoded;
+ *out_values_decoded = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, BuilderType* builder,
+ int* out_values_decoded) {
+ RETURN_NOT_OK(builder->Reserve(num_values));
+ int values_decoded = 0;
+
+ RETURN_NOT_OK(VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ if (ARROW_PREDICT_FALSE(len_ < 4)) {
+ ParquetException::EofException();
+ }
+ auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
+ return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
+ }
+ auto increment = value_len + 4;
+ if (ARROW_PREDICT_FALSE(len_ < increment)) {
+ ParquetException::EofException();
+ }
+ RETURN_NOT_OK(builder->Append(data_ + 4, value_len));
+ data_ += increment;
+ len_ -= increment;
+ ++values_decoded;
+ return Status::OK();
+ },
+ [&]() { return builder->AppendNull(); }));
+
+ num_values_ -= values_decoded;
+ *out_values_decoded = values_decoded;
+ return Status::OK();
+ }
+};
+
+class PlainFLBADecoder : public PlainDecoder<FLBAType>, virtual public FLBADecoder {
+ public:
+ using Base = PlainDecoder<FLBAType>;
+ using Base::PlainDecoder;
+};
+
+// ----------------------------------------------------------------------
+// Dictionary encoding and decoding
+
+template <typename Type>
+class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
+ public:
+ typedef typename Type::c_type T;
+
+  // Initializes the dictionary with values from 'dictionary'. The data in
+  // 'dictionary' is not guaranteed to persist in memory after this call, so
+  // the dictionary decoder needs to copy the data out if necessary.
+ explicit DictDecoderImpl(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::RLE_DICTIONARY),
+ dictionary_(AllocateBuffer(pool, 0)),
+ dictionary_length_(0),
+ byte_array_data_(AllocateBuffer(pool, 0)),
+ byte_array_offsets_(AllocateBuffer(pool, 0)),
+ indices_scratch_space_(AllocateBuffer(pool, 0)) {}
+
+  // Perform type-specific initialization
+ void SetDict(TypedDecoder<Type>* dictionary) override;
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ if (len == 0) {
+ // Initialize dummy decoder to avoid crashes later on
+ idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1);
+ return;
+ }
+ uint8_t bit_width = *data;
+ if (ARROW_PREDICT_FALSE(bit_width >= 64)) {
+ throw ParquetException("Invalid or corrupted bit_width");
+ }
+ idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width);
+ }
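+
+  // Data layout accepted by SetData() (illustrative, not part of the original
+  // file): the first byte of an RLE_DICTIONARY data page is the index bit
+  // width, followed by RLE/bit-packed hybrid runs. For example, the eight
+  // indices {0,0,0,0,1,1,1,1} with bit_width 1 can be encoded as
+  //
+  //   01 08 00 08 01
+  //
+  // i.e. bit_width=1, then an RLE run "4 x 0" (header 0x08 = 4 << 1, value
+  // 0x00) and an RLE run "4 x 1".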
+
+ int Decode(T* buffer, int num_values) override {
+ num_values = std::min(num_values, num_values_);
+ int decoded_values =
+ idx_decoder_.GetBatchWithDict(reinterpret_cast<const T*>(dictionary_->data()),
+ dictionary_length_, buffer, num_values);
+ if (decoded_values != num_values) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ num_values = std::min(num_values, num_values_);
+ if (num_values != idx_decoder_.GetBatchWithDictSpaced(
+ reinterpret_cast<const T*>(dictionary_->data()),
+ dictionary_length_, buffer, num_values, null_count, valid_bits,
+ valid_bits_offset)) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::Accumulator* out) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::DictAccumulator* out) override;
+
+ void InsertDictionary(::arrow::ArrayBuilder* builder) override;
+
+ int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::ArrayBuilder* builder) override {
+ if (num_values > 0) {
+ // TODO(wesm): Refactor to batch reads for improved memory use. It is not
+ // trivial because the null_count is relative to the entire bitmap
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
+ num_values, /*shrink_to_fit=*/false));
+ }
+
+ auto indices_buffer =
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
+
+ if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits,
+ valid_bits_offset, indices_buffer)) {
+ ParquetException::EofException();
+ }
+
+ /// XXX(wesm): Cannot append "valid bits" directly to the builder
+ std::vector<uint8_t> valid_bytes(num_values);
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+ for (int64_t i = 0; i < num_values; ++i) {
+ valid_bytes[i] = static_cast<uint8_t>(bit_reader.IsSet());
+ bit_reader.Next();
+ }
+
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+ PARQUET_THROW_NOT_OK(
+ binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data()));
+ num_values_ -= num_values - null_count;
+ return num_values - null_count;
+ }
+
+ int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override {
+ num_values = std::min(num_values, num_values_);
+ if (num_values > 0) {
+ // TODO(wesm): Refactor to batch reads for improved memory use. This is
+ // relatively simple here because we don't have to do any bookkeeping of
+ // nulls
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
+ num_values, /*shrink_to_fit=*/false));
+ }
+ auto indices_buffer =
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
+ if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) {
+ ParquetException::EofException();
+ }
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+ PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values));
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeIndices(int num_values, int32_t* indices) override {
+ if (num_values != idx_decoder_.GetBatch(indices, num_values)) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ void GetDictionary(const T** dictionary, int32_t* dictionary_length) override {
+ *dictionary_length = dictionary_length_;
+ *dictionary = reinterpret_cast<T*>(dictionary_->mutable_data());
+ }
+
+ protected:
+ Status IndexInBounds(int32_t index) {
+ if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) {
+ return Status::OK();
+ }
+ return Status::Invalid("Index not in dictionary bounds");
+ }
+
+ inline void DecodeDict(TypedDecoder<Type>* dictionary) {
+ dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
+ PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T),
+ /*shrink_to_fit=*/false));
+ dictionary->Decode(reinterpret_cast<T*>(dictionary_->mutable_data()),
+ dictionary_length_);
+ }
+
+  // Decoded dictionary values. For BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY
+  // columns these are structs whose pointers reference byte_array_data_
+  // below; for other types they are the raw values.
+ std::shared_ptr<ResizableBuffer> dictionary_;
+
+ int32_t dictionary_length_;
+
+  // Data that contains the byte array data (dictionary_ above just holds the
+  // pointers into it).
+ std::shared_ptr<ResizableBuffer> byte_array_data_;
+
+  // Arrow-style byte offsets for each dictionary value. We maintain two
+  // representations of the dictionary, one as ByteArray* for non-Arrow
+  // consumers and this one for Arrow consumers. Since dictionaries are
+  // generally small to begin with, this does not add much extra memory use
+  // in most cases.
+ std::shared_ptr<ResizableBuffer> byte_array_offsets_;
+
+ // Reusable buffer for decoding dictionary indices to be appended to a
+ // BinaryDictionary32Builder
+ std::shared_ptr<ResizableBuffer> indices_scratch_space_;
+
+ ::arrow::util::RleDecoder idx_decoder_;
+};
+
+template <typename Type>
+void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {
+ DecodeDict(dictionary);
+}
+
+template <>
+void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
+ ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
+}
+
+template <>
+void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictionary) {
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<ByteArray*>(dictionary_->mutable_data());
+
+ int total_size = 0;
+ for (int i = 0; i < dictionary_length_; ++i) {
+ total_size += dict_values[i].len;
+ }
+ PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
+ /*shrink_to_fit=*/false));
+ PARQUET_THROW_NOT_OK(
+ byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
+ /*shrink_to_fit=*/false));
+
+ int32_t offset = 0;
+ uint8_t* bytes_data = byte_array_data_->mutable_data();
+ int32_t* bytes_offsets =
+ reinterpret_cast<int32_t*>(byte_array_offsets_->mutable_data());
+ for (int i = 0; i < dictionary_length_; ++i) {
+ memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
+ bytes_offsets[i] = offset;
+ dict_values[i].ptr = bytes_data + offset;
+ offset += dict_values[i].len;
+ }
+ bytes_offsets[dictionary_length_] = offset;
+}
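+
+// Worked example (illustrative, not part of the original file): for the
+// dictionary {"ab", "c"}, the loop above produces byte_array_data_ = "abc"
+// and byte_array_offsets_ = {0, 2, 3}, and rewrites the ByteArray pointers
+// in dictionary_ to reference the owned copy instead of the decoder input.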
+
+template <>
+inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionary) {
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<FLBA*>(dictionary_->mutable_data());
+
+ int fixed_len = descr_->type_length();
+ int total_size = dictionary_length_ * fixed_len;
+
+ PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
+ /*shrink_to_fit=*/false));
+ uint8_t* bytes_data = byte_array_data_->mutable_data();
+ for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) {
+ memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
+ dict_values[i].ptr = bytes_data + offset;
+ }
+}
+
+template <>
+inline int DictDecoderImpl<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow to Int96Type");
+}
+
+template <>
+inline int DictDecoderImpl<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow to Int96Type");
+}
+
+template <>
+inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow implemented elsewhere");
+}
+
+template <>
+inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow implemented elsewhere");
+}
+
+template <typename DType>
+int DictDecoderImpl<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const typename DType::c_type*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+          throw ParquetException("Dictionary index decoding failed: unexpected end of data");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ PARQUET_THROW_NOT_OK(builder->Append(dict_values[index]));
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ return num_values - null_count;
+}
+
+template <>
+int DictDecoderImpl<BooleanType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("No dictionary encoding for BooleanType");
+}
+
+template <>
+inline int DictDecoderImpl<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::Accumulator* builder) {
+ if (builder->byte_width() != descr_->type_length()) {
+ throw ParquetException("Byte width mismatch: builder was " +
+ std::to_string(builder->byte_width()) + " but decoder was " +
+ std::to_string(descr_->type_length()));
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+          throw ParquetException("Dictionary index decoding failed: unexpected end of data");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ builder->UnsafeAppend(dict_values[index].ptr);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ return num_values - null_count;
+}
+
+template <>
+int DictDecoderImpl<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
+ auto value_type =
+ checked_cast<const ::arrow::DictionaryType&>(*builder->type()).value_type();
+ auto byte_width =
+ checked_cast<const ::arrow::FixedSizeBinaryType&>(*value_type).byte_width();
+ if (byte_width != descr_->type_length()) {
+ throw ParquetException("Byte width mismatch: builder was " +
+ std::to_string(byte_width) + " but decoder was " +
+ std::to_string(descr_->type_length()));
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+          throw ParquetException("Dictionary index decoding failed: unexpected end of data");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ PARQUET_THROW_NOT_OK(builder->Append(dict_values[index].ptr));
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ return num_values - null_count;
+}
+
+template <typename Type>
+int DictDecoderImpl<Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::Accumulator* builder) {
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ using value_type = typename Type::c_type;
+ auto dict_values = reinterpret_cast<const value_type*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+          throw ParquetException("Dictionary index decoding failed: unexpected end of data");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ builder->UnsafeAppend(dict_values[index]);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ return num_values - null_count;
+}
+
+template <typename Type>
+void DictDecoderImpl<Type>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+ ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types");
+}
+
+template <>
+void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+
+ // Make a BinaryArray referencing the internal dictionary data
+ auto arr = std::make_shared<::arrow::BinaryArray>(
+ dictionary_length_, byte_array_offsets_, byte_array_data_);
+ PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
+}
+
+class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
+ virtual public ByteArrayDecoder {
+ public:
+ using BASE = DictDecoderImpl<ByteArrayType>;
+ using BASE::DictDecoderImpl;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::BinaryDictionary32Builder* builder) override {
+ int result = 0;
+ if (null_count == 0) {
+ PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
+ } else {
+ PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
+ valid_bits_offset, builder, &result));
+ }
+ return result;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ int result = 0;
+ if (null_count == 0) {
+ PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result));
+ } else {
+ PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
+ valid_bits_offset, out, &result));
+ }
+ return result;
+ }
+
+ private:
+ Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 1024;
+ int32_t indices[kBufferSize];
+
+ ArrowBinaryHelper helper(out);
+
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+ int values_decoded = 0;
+ int num_appended = 0;
+ while (num_appended < num_values) {
+ bool is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+
+ if (is_valid) {
+ int32_t batch_size =
+ std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+
+ if (ARROW_PREDICT_FALSE(num_indices < 1)) {
+ return Status::Invalid("Invalid number of indices '", num_indices, "'");
+ }
+
+ int i = 0;
+ while (true) {
+ // Consume all indices
+ if (is_valid) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
+ RETURN_NOT_OK(helper.PushChunk());
+ }
+ RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
+ ++i;
+ ++values_decoded;
+ } else {
+ RETURN_NOT_OK(helper.AppendNull());
+ --null_count;
+ }
+ ++num_appended;
+ if (i == num_indices) {
+ // Do not advance the bit_reader if we have fulfilled the decode
+ // request
+ break;
+ }
+ is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+ }
+ } else {
+ RETURN_NOT_OK(helper.AppendNull());
+ --null_count;
+ ++num_appended;
+ }
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ Status DecodeArrowDenseNonNull(int num_values,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 2048;
+ int32_t indices[kBufferSize];
+ int values_decoded = 0;
+
+ ArrowBinaryHelper helper(out);
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ while (values_decoded < num_values) {
+ int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+ if (num_indices == 0) ParquetException::EofException();
+ for (int i = 0; i < num_indices; ++i) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
+ RETURN_NOT_OK(helper.PushChunk());
+ }
+ RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
+ }
+ values_decoded += num_indices;
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, BuilderType* builder,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 1024;
+ int32_t indices[kBufferSize];
+
+ RETURN_NOT_OK(builder->Reserve(num_values));
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ int values_decoded = 0;
+ int num_appended = 0;
+ while (num_appended < num_values) {
+ bool is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+
+ if (is_valid) {
+ int32_t batch_size =
+ std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+
+ int i = 0;
+ while (true) {
+ // Consume all indices
+ if (is_valid) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+ ++i;
+ ++values_decoded;
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ --null_count;
+ }
+ ++num_appended;
+ if (i == num_indices) {
+ // Do not advance the bit_reader if we have fulfilled the decode
+ // request
+ break;
+ }
+ is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+ }
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ --null_count;
+ ++num_appended;
+ }
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) {
+ constexpr int32_t kBufferSize = 2048;
+ int32_t indices[kBufferSize];
+
+ RETURN_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ int values_decoded = 0;
+ while (values_decoded < num_values) {
+ int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+ if (num_indices == 0) ParquetException::EofException();
+ for (int i = 0; i < num_indices; ++i) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+ }
+ values_decoded += num_indices;
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+};
+
+// ----------------------------------------------------------------------
+// DeltaBitPackDecoder
+
+template <typename DType>
+class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ typedef typename DType::c_type T;
+
+ explicit DeltaBitPackDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_BINARY_PACKED), pool_(pool) {
+ if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) {
+ throw ParquetException("Delta bit pack encoding should only be for integer data.");
+ }
+ }
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ this->num_values_ = num_values;
+ decoder_ = ::arrow::BitUtil::BitReader(data, len);
+ values_current_block_ = 0;
+ values_current_mini_block_ = 0;
+ }
+
+ int Decode(T* buffer, int max_values) override {
+ return GetInternal(buffer, max_values);
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* out) override {
+ if (null_count != 0) {
+ ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
+ }
+ std::vector<T> values(num_values);
+ GetInternal(values.data(), num_values);
+ PARQUET_THROW_NOT_OK(out->AppendValues(values));
+ return num_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* out) override {
+ if (null_count != 0) {
+ ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
+ }
+ std::vector<T> values(num_values);
+ GetInternal(values.data(), num_values);
+ PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+ for (T value : values) {
+ PARQUET_THROW_NOT_OK(out->Append(value));
+ }
+ return num_values;
+ }
+
+ private:
+ void InitBlock() {
+ // The number of values per block.
+ uint32_t block_size;
+ if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
+ if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
+ if (!decoder_.GetVlqInt(&values_current_block_)) {
+ ParquetException::EofException();
+ }
+ if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
+
+ delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
+ uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+
+ if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
+ for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
+ if (!decoder_.GetAligned<uint8_t>(1, bit_width_data + i)) {
+ ParquetException::EofException();
+ }
+ }
+ values_per_mini_block_ = block_size / num_mini_blocks_;
+ mini_block_idx_ = 0;
+ delta_bit_width_ = bit_width_data[0];
+ values_current_mini_block_ = values_per_mini_block_;
+ }
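+
+  // Stream layout consumed by InitBlock() (illustrative, not part of the
+  // original file):
+  //
+  //   <block size in values : ULEB128 varint>
+  //   <number of miniblocks : ULEB128 varint>
+  //   <values in this block : ULEB128 varint>
+  //   <first value : zigzag varint>
+  //   <min delta : zigzag varint>
+  //   <bit width of each miniblock : 1 byte each>
+  //
+  // followed by the bit-packed miniblock deltas that GetInternal() reads
+  // with GetValue(delta_bit_width_, ...).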
+
+ template <typename T>
+ int GetInternal(T* buffer, int max_values) {
+ max_values = std::min(max_values, this->num_values_);
+ const uint8_t* bit_width_data = delta_bit_widths_->data();
+ for (int i = 0; i < max_values; ++i) {
+ if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
+ ++mini_block_idx_;
+ if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
+ delta_bit_width_ = bit_width_data[mini_block_idx_];
+ values_current_mini_block_ = values_per_mini_block_;
+ } else {
+ InitBlock();
+ buffer[i] = last_value_;
+ continue;
+ }
+ }
+
+ // TODO: the key to this algorithm is to decode the entire miniblock at once.
+ int64_t delta;
+ if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
+ delta += min_delta_;
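+      // Note: last_value_ is an int32_t and the accumulated delta is
+      // truncated to 32 bits here, so INT64 columns whose running values
+      // exceed the int32 range are not decoded correctly by this
+      // implementation.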
+ last_value_ += static_cast<int32_t>(delta);
+ buffer[i] = last_value_;
+ --values_current_mini_block_;
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ MemoryPool* pool_;
+ ::arrow::BitUtil::BitReader decoder_;
+ uint32_t values_current_block_;
+ uint32_t num_mini_blocks_;
+ uint64_t values_per_mini_block_;
+ uint64_t values_current_mini_block_;
+
+ int32_t min_delta_;
+ size_t mini_block_idx_;
+ std::shared_ptr<ResizableBuffer> delta_bit_widths_;
+ int delta_bit_width_;
+
+ int32_t last_value_;
+};
+
+// ----------------------------------------------------------------------
+// DELTA_LENGTH_BYTE_ARRAY
+
+class DeltaLengthByteArrayDecoder : public DecoderImpl,
+ virtual public TypedDecoder<ByteArrayType> {
+ public:
+ explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
+ len_decoder_(nullptr, pool),
+ pool_(pool) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ if (len == 0) return;
+ int total_lengths_len = ::arrow::util::SafeLoadAs<int32_t>(data);
+ data += 4;
+ this->len_decoder_.SetData(num_values, data, total_lengths_len);
+ data_ = data + total_lengths_len;
+ this->len_ = len - 4 - total_lengths_len;
+ }
+
+ int Decode(ByteArray* buffer, int max_values) override {
+ using VectorT = ArrowPoolVector<int>;
+ max_values = std::min(max_values, num_values_);
+ VectorT lengths(max_values, 0, ::arrow::stl::allocator<int>(pool_));
+ len_decoder_.Decode(lengths.data(), max_values);
+ for (int i = 0; i < max_values; ++i) {
+ buffer[i].len = lengths[i];
+ buffer[i].ptr = data_;
+ this->data_ += lengths[i];
+ this->len_ -= lengths[i];
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override {
+ ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
+ }
+
+ private:
+ DeltaBitPackDecoder<Int32Type> len_decoder_;
+ ::arrow::MemoryPool* pool_;
+};
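+
+// Stream layout consumed by this implementation's SetData() (illustrative,
+// not part of the original file):
+//
+//   <total_lengths_len : 4-byte little-endian int32>
+//   <DELTA_BINARY_PACKED lengths block of total_lengths_len bytes>
+//   <concatenated value bytes>
+//
+// Decode() first pulls the lengths through len_decoder_, then slices the
+// concatenated bytes, so the returned ByteArrays point into the input buffer.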
+
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY
+
+class DeltaByteArrayDecoder : public DecoderImpl,
+ virtual public TypedDecoder<ByteArrayType> {
+ public:
+ explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
+ prefix_len_decoder_(nullptr, pool),
+ suffix_decoder_(nullptr, pool),
+ last_value_(0, nullptr) {}
+
+ virtual void SetData(int num_values, const uint8_t* data, int len) {
+ num_values_ = num_values;
+ if (len == 0) return;
+ int prefix_len_length = ::arrow::util::SafeLoadAs<int32_t>(data);
+ data += 4;
+ len -= 4;
+ prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
+ data += prefix_len_length;
+ len -= prefix_len_length;
+ suffix_decoder_.SetData(num_values, data, len);
+ }
+
+ // TODO: this doesn't work and requires memory management. We need to allocate
+ // new strings to store the results.
+ virtual int Decode(ByteArray* buffer, int max_values) {
+ max_values = std::min(max_values, this->num_values_);
+ for (int i = 0; i < max_values; ++i) {
+ int prefix_len = 0;
+ prefix_len_decoder_.Decode(&prefix_len, 1);
+ ByteArray suffix = {0, nullptr};
+ suffix_decoder_.Decode(&suffix, 1);
+ buffer[i].len = prefix_len + suffix.len;
+
+ uint8_t* result = reinterpret_cast<uint8_t*>(malloc(buffer[i].len));
+ memcpy(result, last_value_.ptr, prefix_len);
+ memcpy(result + prefix_len, suffix.ptr, suffix.len);
+
+ buffer[i].ptr = result;
+ last_value_ = buffer[i];
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ private:
+ DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
+ DeltaLengthByteArrayDecoder suffix_decoder_;
+ ByteArray last_value_;
+};
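+
+// Worked example (illustrative, not part of the original file): the values
+// {"Hello", "Help"} decode from prefix lengths {0, 3} plus suffixes
+// {"Hello", "p"}: the second value reuses the first 3 bytes ("Hel") of the
+// previous value and appends its suffix.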
+
+// ----------------------------------------------------------------------
+// BYTE_STREAM_SPLIT
+
+template <typename DType>
+class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr);
+
+ int Decode(T* buffer, int max_values) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) override;
+
+ void SetData(int num_values, const uint8_t* data, int len) override;
+
+ T* EnsureDecodeBuffer(int64_t min_values) {
+ const int64_t size = sizeof(T) * min_values;
+ if (!decode_buffer_ || decode_buffer_->size() < size) {
+ PARQUET_ASSIGN_OR_THROW(decode_buffer_, ::arrow::AllocateBuffer(size));
+ }
+ return reinterpret_cast<T*>(decode_buffer_->mutable_data());
+ }
+
+ private:
+ int num_values_in_buffer_{0};
+ std::shared_ptr<Buffer> decode_buffer_;
+
+ static constexpr size_t kNumStreams = sizeof(T);
+};
+
+template <typename DType>
+ByteStreamSplitDecoder<DType>::ByteStreamSplitDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {}
+
+template <typename DType>
+void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t* data,
+ int len) {
+ DecoderImpl::SetData(num_values, data, len);
+ if (num_values * static_cast<int64_t>(sizeof(T)) > len) {
+ throw ParquetException("Data size too small for number of values (corrupted file?)");
+ }
+ num_values_in_buffer_ = num_values;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
+ const int values_to_decode = std::min(num_values_, max_values);
+ const int num_decoded_previously = num_values_in_buffer_ - num_values_;
+ const uint8_t* data = data_ + num_decoded_previously;
+
+ ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_to_decode,
+ num_values_in_buffer_, buffer);
+ num_values_ -= values_to_decode;
+ len_ -= sizeof(T) * values_to_decode;
+ return values_to_decode;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) {
+ constexpr int value_size = static_cast<int>(kNumStreams);
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ const int num_decoded_previously = num_values_in_buffer_ - num_values_;
+ const uint8_t* data = data_ + num_decoded_previously;
+ int offset = 0;
+
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ // Use fast decoding into intermediate buffer. This will also decode
+ // some null values, but it's fast enough that we don't care.
+ T* decode_out = EnsureDecodeBuffer(values_decoded);
+ ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_decoded,
+ num_values_in_buffer_, decode_out);
+
+ // XXX If null_count is 0, we could even append in bulk or decode directly into
+ // builder
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(decode_out[offset]);
+ ++offset;
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+#else
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * num_values_in_buffer_ + offset;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ builder->UnsafeAppend(::arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]));
+ ++offset;
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+#endif
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(T) * values_decoded;
+ return values_decoded;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder");
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Encoder and decoder factory functions
+
+std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encoding,
+ bool use_dictionary, const ColumnDescriptor* descr,
+ MemoryPool* pool) {
+ if (use_dictionary) {
+ switch (type_num) {
+ case Type::INT32:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<ByteArrayType>(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<FLBAType>(descr, pool));
+ default:
+ DCHECK(false) << "Encoder not implemented";
+ break;
+ }
+ } else if (encoding == Encoding::PLAIN) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ return std::unique_ptr<Encoder>(new PlainEncoder<BooleanType>(descr, pool));
+ case Type::INT32:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(new PlainEncoder<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(new PlainEncoder<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new PlainEncoder<ByteArrayType>(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new PlainEncoder<FLBAType>(descr, pool));
+ default:
+ DCHECK(false) << "Encoder not implemented";
+ break;
+ }
+ } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
+ switch (type_num) {
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(
+ new ByteStreamSplitEncoder<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(
+ new ByteStreamSplitEncoder<DoubleType>(descr, pool));
+ default:
+ throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
+ break;
+ }
+ } else {
+ ParquetException::NYI("Selected encoding is not supported");
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
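+
+// Usage sketch (hypothetical, not in the original file; 'values' stands in
+// for a std::vector<float>): callers normally go through the typed helpers
+// declared in encoding.h, which wrap this factory:
+//
+//   auto encoder = MakeTypedEncoder<FloatType>(
+//       Encoding::BYTE_STREAM_SPLIT, /*use_dictionary=*/false, descr);
+//   encoder->Put(values.data(), static_cast<int>(values.size()));
+//   std::shared_ptr<Buffer> encoded = encoder->FlushValues();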
+
+std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
+ const ColumnDescriptor* descr) {
+ if (encoding == Encoding::PLAIN) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ return std::unique_ptr<Decoder>(new PlainBooleanDecoder(descr));
+ case Type::INT32:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int32Type>(descr));
+ case Type::INT64:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int64Type>(descr));
+ case Type::INT96:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int96Type>(descr));
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new PlainDecoder<FloatType>(descr));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new PlainDecoder<DoubleType>(descr));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new PlainByteArrayDecoder(descr));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new PlainFLBADecoder(descr));
+ default:
+ break;
+ }
+ } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
+ switch (type_num) {
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<FloatType>(descr));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<DoubleType>(descr));
+ default:
+ throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
+ break;
+ }
+ } else {
+ ParquetException::NYI("Selected encoding is not supported");
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
+namespace detail {
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+ const ColumnDescriptor* descr,
+ MemoryPool* pool) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ ParquetException::NYI("Dictionary encoding not implemented for boolean type");
+ case Type::INT32:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new DictByteArrayDecoderImpl(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<FLBAType>(descr, pool));
+ default:
+ break;
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
+} // namespace detail
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
index b9ca7a7ee68..bf5446e0174 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
@@ -1,460 +1,460 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <vector>
-
-#include "arrow/util/spaced.h"
-
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-class ArrayBuilder;
-class BinaryArray;
-class BinaryBuilder;
-class BooleanBuilder;
-class Int32Type;
-class Int64Type;
-class FloatType;
-class DoubleType;
-class FixedSizeBinaryType;
-template <typename T>
-class NumericBuilder;
-class FixedSizeBinaryBuilder;
-template <typename T>
-class Dictionary32Builder;
-
-} // namespace arrow
-
-namespace parquet {
-
-template <typename DType>
-class TypedEncoder;
-
-using BooleanEncoder = TypedEncoder<BooleanType>;
-using Int32Encoder = TypedEncoder<Int32Type>;
-using Int64Encoder = TypedEncoder<Int64Type>;
-using Int96Encoder = TypedEncoder<Int96Type>;
-using FloatEncoder = TypedEncoder<FloatType>;
-using DoubleEncoder = TypedEncoder<DoubleType>;
-using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
-using FLBAEncoder = TypedEncoder<FLBAType>;
-
-template <typename DType>
-class TypedDecoder;
-
-class BooleanDecoder;
-using Int32Decoder = TypedDecoder<Int32Type>;
-using Int64Decoder = TypedDecoder<Int64Type>;
-using Int96Decoder = TypedDecoder<Int96Type>;
-using FloatDecoder = TypedDecoder<FloatType>;
-using DoubleDecoder = TypedDecoder<DoubleType>;
-using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
-class FLBADecoder;
-
-template <typename T>
-struct EncodingTraits;
-
-template <>
-struct EncodingTraits<BooleanType> {
- using Encoder = BooleanEncoder;
- using Decoder = BooleanDecoder;
-
- using ArrowType = ::arrow::BooleanType;
- using Accumulator = ::arrow::BooleanBuilder;
- struct DictAccumulator {};
-};
-
-template <>
-struct EncodingTraits<Int32Type> {
- using Encoder = Int32Encoder;
- using Decoder = Int32Decoder;
-
- using ArrowType = ::arrow::Int32Type;
- using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
-};
-
-template <>
-struct EncodingTraits<Int64Type> {
- using Encoder = Int64Encoder;
- using Decoder = Int64Decoder;
-
- using ArrowType = ::arrow::Int64Type;
- using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
-};
-
-template <>
-struct EncodingTraits<Int96Type> {
- using Encoder = Int96Encoder;
- using Decoder = Int96Decoder;
-
- struct Accumulator {};
- struct DictAccumulator {};
-};
-
-template <>
-struct EncodingTraits<FloatType> {
- using Encoder = FloatEncoder;
- using Decoder = FloatDecoder;
-
- using ArrowType = ::arrow::FloatType;
- using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
-};
-
-template <>
-struct EncodingTraits<DoubleType> {
- using Encoder = DoubleEncoder;
- using Decoder = DoubleDecoder;
-
- using ArrowType = ::arrow::DoubleType;
- using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
-};
-
-template <>
-struct EncodingTraits<ByteArrayType> {
- using Encoder = ByteArrayEncoder;
- using Decoder = ByteArrayDecoder;
-
- /// \brief Internal helper class for decoding BYTE_ARRAY data whose decoded
- /// values can overflow the capacity of a single arrow::BinaryArray
- struct Accumulator {
- std::unique_ptr<::arrow::BinaryBuilder> builder;
- std::vector<std::shared_ptr<::arrow::Array>> chunks;
- };
- using ArrowType = ::arrow::BinaryType;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
-};
-
-template <>
-struct EncodingTraits<FLBAType> {
- using Encoder = FLBAEncoder;
- using Decoder = FLBADecoder;
-
- using ArrowType = ::arrow::FixedSizeBinaryType;
- using Accumulator = ::arrow::FixedSizeBinaryBuilder;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
-};
-
-class ColumnDescriptor;
-
-// Untyped base for all encoders
-class Encoder {
- public:
- virtual ~Encoder() = default;
-
- virtual int64_t EstimatedDataEncodedSize() = 0;
- virtual std::shared_ptr<Buffer> FlushValues() = 0;
- virtual Encoding::type encoding() const = 0;
-
- virtual void Put(const ::arrow::Array& values) = 0;
-
- virtual MemoryPool* memory_pool() const = 0;
-};
-
-// Base class for value encoders. Since encoders may or may not have state
-// (e.g., dictionary encoding) we use a class instance to maintain any state.
-//
-// Encode interfaces are internal, subject to change without deprecation.
-template <typename DType>
-class TypedEncoder : virtual public Encoder {
- public:
- typedef typename DType::c_type T;
-
- using Encoder::Put;
-
- virtual void Put(const T* src, int num_values) = 0;
-
- virtual void Put(const std::vector<T>& src, int num_values = -1);
-
- virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) = 0;
-};
-
-template <typename DType>
-void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
- if (num_values == -1) {
- num_values = static_cast<int>(src.size());
- }
- Put(src.data(), num_values);
-}
-
-template <>
-inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
- // NOTE(wesm): This stub is here only to satisfy the compiler; it is
- // overridden later with the actual implementation
-}
-
-// Base class for dictionary encoders
-template <typename DType>
-class DictEncoder : virtual public TypedEncoder<DType> {
- public:
- /// Writes out any buffered indices to buffer preceded by the bit width of this data.
- /// Returns the number of bytes written.
- /// If the supplied buffer is not big enough, returns -1.
- /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
- /// to size buffer.
- virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
-
- virtual int dict_encoded_size() = 0;
- // virtual int dict_encoded_size() { return dict_encoded_size_; }
-
- virtual int bit_width() const = 0;
-
- /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
- /// dict_encoded_size() bytes.
- virtual void WriteDict(uint8_t* buffer) = 0;
-
- virtual int num_entries() const = 0;
-
- /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
- /// assumed (without any bounds checking) that the indices reference
- /// pre-existing dictionary values
- /// \param[in] indices the dictionary index values. Only Int32Array currently
- /// supported
- virtual void PutIndices(const ::arrow::Array& indices) = 0;
-
- /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
- /// separately. Currently throws an exception if the current dictionary
- /// memo is non-empty
- /// \param[in] values the dictionary values. Only valid for certain
- /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
- virtual void PutDictionary(const ::arrow::Array& values) = 0;
-};
-
-// ----------------------------------------------------------------------
-// Value decoding
-
-class Decoder {
- public:
- virtual ~Decoder() = default;
-
- // Sets the data for a new page. This will be called multiple times on the same
- // decoder and should reset all internal state.
- virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
-
- // Returns the number of values left (for the last call to SetData()). This is
- // the number of values left in this page.
- virtual int values_left() const = 0;
- virtual Encoding::type encoding() const = 0;
-};
-
-template <typename DType>
-class TypedDecoder : virtual public Decoder {
- public:
- using T = typename DType::c_type;
-
- /// \brief Decode values into a buffer
- ///
- /// Subclasses may override the more specialized Decode methods below.
- ///
- /// \param[in] buffer destination for decoded values
- /// \param[in] max_values maximum number of values to decode
- /// \return The number of values decoded. Should be identical to max_values except
- /// at the end of the current data page.
- virtual int Decode(T* buffer, int max_values) = 0;
-
- /// \brief Decode the values in this data page but leave spaces for null entries.
- ///
- /// \param[in] buffer destination for decoded values
- /// \param[in] num_values size of the def_levels and buffer arrays including the number
- /// of null slots
- /// \param[in] null_count number of null slots
- /// \param[in] valid_bits bitmap data indicating position of valid slots
- /// \param[in] valid_bits_offset offset into valid_bits
- /// \return The number of values decoded, including nulls.
- virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset) {
- if (null_count > 0) {
- int values_to_read = num_values - null_count;
- int values_read = Decode(buffer, values_to_read);
- if (values_read != values_to_read) {
- throw ParquetException("Number of values / definition_levels read did not match");
- }
-
- return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
- valid_bits, valid_bits_offset);
- } else {
- return Decode(buffer, num_values);
- }
- }
-
- /// \brief Decode into an ArrayBuilder or other accumulator
- ///
- /// This function assumes the definition levels were already decoded
- /// as a validity bitmap in the given `valid_bits`. `null_count`
- /// is the number of 0s in `valid_bits`.
- /// As a space optimization, it is allowed for `valid_bits` to be null
- /// if `null_count` is zero.
- ///
- /// \return number of values decoded
- virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* out) = 0;
-
- /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
- ///
- /// \return number of values decoded
- int DecodeArrowNonNull(int num_values,
- typename EncodingTraits<DType>::Accumulator* out) {
- return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
- }
-
- /// \brief Decode into a DictionaryBuilder
- ///
- /// This function assumes the definition levels were already decoded
- /// as a validity bitmap in the given `valid_bits`. `null_count`
- /// is the number of 0s in `valid_bits`.
- /// As a space optimization, it is allowed for `valid_bits` to be null
- /// if `null_count` is zero.
- ///
- /// \return number of values decoded
- virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
-
- /// \brief Decode into a DictionaryBuilder ignoring nulls
- ///
- /// \return number of values decoded
- int DecodeArrowNonNull(int num_values,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
- }
-};
-
-template <typename DType>
-class DictDecoder : virtual public TypedDecoder<DType> {
- public:
- using T = typename DType::c_type;
-
- virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
-
- /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
- /// but do not append any indices
- virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
-
- /// \brief Decode only dictionary indices and append to dictionary
- /// builder. The builder must have had the dictionary from this decoder
- /// inserted already.
- ///
- /// \warning Remember to reset the builder each time the dict decoder is initialized
- /// with a new dictionary page
- virtual int DecodeIndicesSpaced(int num_values, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset,
- ::arrow::ArrayBuilder* builder) = 0;
-
- /// \brief Decode only dictionary indices (no nulls)
- ///
- /// \warning Remember to reset the builder each time the dict decoder is initialized
- /// with a new dictionary page
- virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
-
- /// \brief Decode only dictionary indices (no nulls). Same as the above
- /// DecodeIndices, but the target is an array instead of a builder.
- ///
- /// \note API EXPERIMENTAL
- virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
-
- /// \brief Get dictionary. The reader will call this API when it encounters a
- /// new dictionary.
- ///
- /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
- /// the decoder and is destroyed when the decoder is destroyed.
- /// @param[out] dictionary_length The dictionary length.
- ///
- /// \note API EXPERIMENTAL
- virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
-};
-
-// ----------------------------------------------------------------------
-// TypedEncoder specializations, traits, and factory functions
-
-class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
- public:
- using TypedDecoder<BooleanType>::Decode;
- virtual int Decode(uint8_t* buffer, int max_values) = 0;
-};
-
-class FLBADecoder : virtual public TypedDecoder<FLBAType> {
- public:
- using TypedDecoder<FLBAType>::DecodeSpaced;
-
- // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
- // there is value in adding specialized read methods for
- // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
- // then perhaps not
-};
-
-PARQUET_EXPORT
-std::unique_ptr<Encoder> MakeEncoder(
- Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
- const ColumnDescriptor* descr = NULLPTR,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
-template <typename DType>
-std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
- Encoding::type encoding, bool use_dictionary = false,
- const ColumnDescriptor* descr = NULLPTR,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- using OutType = typename EncodingTraits<DType>::Encoder;
- std::unique_ptr<Encoder> base =
- MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
- return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
-}
-
-PARQUET_EXPORT
-std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
- const ColumnDescriptor* descr = NULLPTR);
-
-namespace detail {
-
-PARQUET_EXPORT
-std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool);
-
-} // namespace detail
-
-template <typename DType>
-std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
- const ColumnDescriptor* descr = NULLPTR,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- using OutType = DictDecoder<DType>;
- auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
- return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
-}
-
-template <typename DType>
-std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
- Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
- using OutType = typename EncodingTraits<DType>::Decoder;
- std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
- return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "arrow/util/spaced.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class ArrayBuilder;
+class BinaryArray;
+class BinaryBuilder;
+class BooleanBuilder;
+class Int32Type;
+class Int64Type;
+class FloatType;
+class DoubleType;
+class FixedSizeBinaryType;
+template <typename T>
+class NumericBuilder;
+class FixedSizeBinaryBuilder;
+template <typename T>
+class Dictionary32Builder;
+
+} // namespace arrow
+
+namespace parquet {
+
+template <typename DType>
+class TypedEncoder;
+
+using BooleanEncoder = TypedEncoder<BooleanType>;
+using Int32Encoder = TypedEncoder<Int32Type>;
+using Int64Encoder = TypedEncoder<Int64Type>;
+using Int96Encoder = TypedEncoder<Int96Type>;
+using FloatEncoder = TypedEncoder<FloatType>;
+using DoubleEncoder = TypedEncoder<DoubleType>;
+using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
+using FLBAEncoder = TypedEncoder<FLBAType>;
+
+template <typename DType>
+class TypedDecoder;
+
+class BooleanDecoder;
+using Int32Decoder = TypedDecoder<Int32Type>;
+using Int64Decoder = TypedDecoder<Int64Type>;
+using Int96Decoder = TypedDecoder<Int96Type>;
+using FloatDecoder = TypedDecoder<FloatType>;
+using DoubleDecoder = TypedDecoder<DoubleType>;
+using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
+class FLBADecoder;
+
+template <typename T>
+struct EncodingTraits;
+
+template <>
+struct EncodingTraits<BooleanType> {
+ using Encoder = BooleanEncoder;
+ using Decoder = BooleanDecoder;
+
+ using ArrowType = ::arrow::BooleanType;
+ using Accumulator = ::arrow::BooleanBuilder;
+ struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<Int32Type> {
+ using Encoder = Int32Encoder;
+ using Decoder = Int32Decoder;
+
+ using ArrowType = ::arrow::Int32Type;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
+};
+
+template <>
+struct EncodingTraits<Int64Type> {
+ using Encoder = Int64Encoder;
+ using Decoder = Int64Decoder;
+
+ using ArrowType = ::arrow::Int64Type;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
+};
+
+template <>
+struct EncodingTraits<Int96Type> {
+ using Encoder = Int96Encoder;
+ using Decoder = Int96Decoder;
+
+ struct Accumulator {};
+ struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<FloatType> {
+ using Encoder = FloatEncoder;
+ using Decoder = FloatDecoder;
+
+ using ArrowType = ::arrow::FloatType;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
+};
+
+template <>
+struct EncodingTraits<DoubleType> {
+ using Encoder = DoubleEncoder;
+ using Decoder = DoubleDecoder;
+
+ using ArrowType = ::arrow::DoubleType;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
+};
+
+template <>
+struct EncodingTraits<ByteArrayType> {
+ using Encoder = ByteArrayEncoder;
+ using Decoder = ByteArrayDecoder;
+
+ /// \brief Internal helper class for decoding BYTE_ARRAY data whose decoded
+ /// values can overflow the capacity of a single arrow::BinaryArray
+ struct Accumulator {
+ std::unique_ptr<::arrow::BinaryBuilder> builder;
+ std::vector<std::shared_ptr<::arrow::Array>> chunks;
+ };
+ using ArrowType = ::arrow::BinaryType;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
+};
+
+template <>
+struct EncodingTraits<FLBAType> {
+ using Encoder = FLBAEncoder;
+ using Decoder = FLBADecoder;
+
+ using ArrowType = ::arrow::FixedSizeBinaryType;
+ using Accumulator = ::arrow::FixedSizeBinaryBuilder;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
+};
+
+class ColumnDescriptor;
+
+// Untyped base for all encoders
+class Encoder {
+ public:
+ virtual ~Encoder() = default;
+
+ virtual int64_t EstimatedDataEncodedSize() = 0;
+ virtual std::shared_ptr<Buffer> FlushValues() = 0;
+ virtual Encoding::type encoding() const = 0;
+
+ virtual void Put(const ::arrow::Array& values) = 0;
+
+ virtual MemoryPool* memory_pool() const = 0;
+};
+
+// Base class for value encoders. Since encoders may or may not have state
+// (e.g., dictionary encoding) we use a class instance to maintain any state.
+//
+// Encode interfaces are internal, subject to change without deprecation.
+template <typename DType>
+class TypedEncoder : virtual public Encoder {
+ public:
+ typedef typename DType::c_type T;
+
+ using Encoder::Put;
+
+ virtual void Put(const T* src, int num_values) = 0;
+
+ virtual void Put(const std::vector<T>& src, int num_values = -1);
+
+ virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) = 0;
+};
+
+template <typename DType>
+void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
+ if (num_values == -1) {
+ num_values = static_cast<int>(src.size());
+ }
+ Put(src.data(), num_values);
+}
+
+template <>
+inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
+ // NOTE(wesm): This stub is here only to satisfy the compiler; it is
+ // overridden later with the actual implementation
+}
+
+// Base class for dictionary encoders
+template <typename DType>
+class DictEncoder : virtual public TypedEncoder<DType> {
+ public:
+ /// Writes out any buffered indices to buffer preceded by the bit width of this data.
+ /// Returns the number of bytes written.
+ /// If the supplied buffer is not big enough, returns -1.
+ /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
+ /// to size buffer.
+ virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
+
+ virtual int dict_encoded_size() = 0;
+ // virtual int dict_encoded_size() { return dict_encoded_size_; }
+
+ virtual int bit_width() const = 0;
+
+ /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+ /// dict_encoded_size() bytes.
+ virtual void WriteDict(uint8_t* buffer) = 0;
+
+ virtual int num_entries() const = 0;
+
+ /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
+ /// assumed (without any bounds checking) that the indices reference
+ /// pre-existing dictionary values
+ /// \param[in] indices the dictionary index values. Only Int32Array currently
+ /// supported
+ virtual void PutIndices(const ::arrow::Array& indices) = 0;
+
+ /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
+ /// separately. Currently throws an exception if the current dictionary
+ /// memo is non-empty
+ /// \param[in] values the dictionary values. Only valid for certain
+ /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
+ virtual void PutDictionary(const ::arrow::Array& values) = 0;
+};
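+
+// A minimal write-path sketch (illustrative, not library code). It assumes a
+// valid ColumnDescriptor* descr and hypothetical `values`/`num_values`; with
+// use_dictionary=true the factory returns a dictionary encoder under the hood.
+//
+//   auto base = MakeTypedEncoder<Int32Type>(Encoding::PLAIN,
+//                                           /*use_dictionary=*/true, descr);
+//   auto* encoder = dynamic_cast<DictEncoder<Int32Type>*>(base.get());
+//   encoder->Put(values, num_values);
+//   std::vector<uint8_t> dict_page(encoder->dict_encoded_size());
+//   encoder->WriteDict(dict_page.data());
+//   std::vector<uint8_t> indices(encoder->EstimatedDataEncodedSize());
+//   int written = encoder->WriteIndices(indices.data(),
+//                                       static_cast<int>(indices.size()));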
+
+// ----------------------------------------------------------------------
+// Value decoding
+
+class Decoder {
+ public:
+ virtual ~Decoder() = default;
+
+ // Sets the data for a new page. This will be called multiple times on the same
+ // decoder and should reset all internal state.
+ virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
+
+ // Returns the number of values left (for the last call to SetData()). This is
+ // the number of values left in this page.
+ virtual int values_left() const = 0;
+ virtual Encoding::type encoding() const = 0;
+};
+
+template <typename DType>
+class TypedDecoder : virtual public Decoder {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief Decode values into a buffer
+ ///
+ /// Subclasses may override the more specialized Decode methods below.
+ ///
+ /// \param[in] buffer destination for decoded values
+ /// \param[in] max_values maximum number of values to decode
+ /// \return The number of values decoded. Should be identical to max_values except
+ /// at the end of the current data page.
+ virtual int Decode(T* buffer, int max_values) = 0;
+
+ /// \brief Decode the values in this data page but leave spaces for null entries.
+ ///
+ /// \param[in] buffer destination for decoded values
+ /// \param[in] num_values size of the def_levels and buffer arrays including the number
+ /// of null slots
+ /// \param[in] null_count number of null slots
+ /// \param[in] valid_bits bitmap data indicating position of valid slots
+ /// \param[in] valid_bits_offset offset into valid_bits
+ /// \return The number of values decoded, including nulls.
+ virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ if (null_count > 0) {
+ int values_to_read = num_values - null_count;
+ int values_read = Decode(buffer, values_to_read);
+ if (values_read != values_to_read) {
+ throw ParquetException("Number of values / definition_levels read did not match");
+ }
+
+ return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
+ valid_bits, valid_bits_offset);
+ } else {
+ return Decode(buffer, num_values);
+ }
+ }
+
+ /// \brief Decode into an ArrayBuilder or other accumulator
+ ///
+ /// This function assumes the definition levels were already decoded
+ /// as a validity bitmap in the given `valid_bits`. `null_count`
+ /// is the number of 0s in `valid_bits`.
+ /// As a space optimization, it is allowed for `valid_bits` to be null
+ /// if `null_count` is zero.
+ ///
+ /// \return number of values decoded
+ virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* out) = 0;
+
+ /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
+ ///
+ /// \return number of values decoded
+ int DecodeArrowNonNull(int num_values,
+ typename EncodingTraits<DType>::Accumulator* out) {
+ return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
+ }
+
+ /// \brief Decode into a DictionaryBuilder
+ ///
+ /// This function assumes the definition levels were already decoded
+ /// as a validity bitmap in the given `valid_bits`. `null_count`
+ /// is the number of 0s in `valid_bits`.
+ /// As a space optimization, it is allowed for `valid_bits` to be null
+ /// if `null_count` is zero.
+ ///
+ /// \return number of values decoded
+ virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
+
+ /// \brief Decode into a DictionaryBuilder ignoring nulls
+ ///
+ /// \return number of values decoded
+ int DecodeArrowNonNull(int num_values,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
+ }
+};
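+
+// Sketch of spaced decoding (illustrative): six output slots, two of them
+// null, with a caller-provided validity bitmap. `decoder` is a hypothetical
+// TypedDecoder<Int32Type>* that already has page data set.
+//
+//   int32_t out[6];
+//   uint8_t valid_bits[] = {0x2B};  // bits 0..5 = 1,1,0,1,0,1
+//   int decoded = decoder->DecodeSpaced(out, /*num_values=*/6,
+//                                       /*null_count=*/2, valid_bits,
+//                                       /*valid_bits_offset=*/0);
+//   // decoded == 6; out[2] and out[4] are unspecified null slots.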
+
+template <typename DType>
+class DictDecoder : virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
+
+ /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
+ /// but do not append any indices
+ virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices and append to dictionary
+ /// builder. The builder must have had the dictionary from this decoder
+ /// inserted already.
+ ///
+ /// \warning Remember to reset the builder each time the dict decoder is initialized
+ /// with a new dictionary page
+ virtual int DecodeIndicesSpaced(int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset,
+ ::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices (no nulls)
+ ///
+ /// \warning Remember to reset the builder each time the dict decoder is initialized
+ /// with a new dictionary page
+ virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices (no nulls). Same as the above
+ /// DecodeIndices, but the target is an array instead of a builder.
+ ///
+ /// \note API EXPERIMENTAL
+ virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
+
+ /// \brief Get dictionary. The reader will call this API when it encounters a
+ /// new dictionary.
+ ///
+ /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
+ /// the decoder and is destroyed when the decoder is destroyed.
+ /// @param[out] dictionary_length The dictionary length.
+ ///
+ /// \note API EXPERIMENTAL
+ virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
+};
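+
+// Typical read-path flow for a dictionary-encoded column (sketch; the page
+// buffers and counts are hypothetical). The dictionary page is decoded with a
+// PLAIN decoder, handed to the dictionary decoder, and the data page indices
+// are then resolved against it.
+//
+//   auto plain = MakeTypedDecoder<ByteArrayType>(Encoding::PLAIN, descr);
+//   plain->SetData(dict_len, dict_page_data, dict_page_size);
+//   auto dict = MakeDictDecoder<ByteArrayType>(descr, pool);
+//   dict->SetDict(plain.get());
+//   dict->SetData(num_values, data_page_data, data_page_size);
+//   int n = dict->Decode(values, num_values);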
+
+// ----------------------------------------------------------------------
+// TypedEncoder specializations, traits, and factory functions
+
+class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
+ public:
+ using TypedDecoder<BooleanType>::Decode;
+ virtual int Decode(uint8_t* buffer, int max_values) = 0;
+};
+
+class FLBADecoder : virtual public TypedDecoder<FLBAType> {
+ public:
+ using TypedDecoder<FLBAType>::DecodeSpaced;
+
+ // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
+ // there is value in adding specialized read methods for
+ // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
+ // then perhaps not
+};
+
+PARQUET_EXPORT
+std::unique_ptr<Encoder> MakeEncoder(
+ Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
+ Encoding::type encoding, bool use_dictionary = false,
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ using OutType = typename EncodingTraits<DType>::Encoder;
+ std::unique_ptr<Encoder> base =
+ MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
+ const ColumnDescriptor* descr = NULLPTR);
+
+namespace detail {
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool);
+
+} // namespace detail
+
+template <typename DType>
+std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ using OutType = DictDecoder<DType>;
+ auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
+}
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
+ Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
+ using OutType = typename EncodingTraits<DType>::Decoder;
+ std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
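+
+// Round-trip sketch for the non-dictionary factories (illustrative): encode a
+// float batch with PLAIN and decode it back. `descr`, `values`, `out` and
+// `num_values` are assumed to exist and describe a FLOAT column.
+//
+//   auto encoder = MakeTypedEncoder<FloatType>(Encoding::PLAIN,
+//                                              /*use_dictionary=*/false, descr);
+//   encoder->Put(values, num_values);
+//   std::shared_ptr<Buffer> buf = encoder->FlushValues();
+//   auto decoder = MakeTypedDecoder<FloatType>(Encoding::PLAIN, descr);
+//   decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size()));
+//   int read = decoder->Decode(out, num_values);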
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
index 5927503aba3..829b0e778f1 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
@@ -1,412 +1,412 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/encryption.h"
-
-#include <string.h>
-
-#include <map>
-#include <utility>
-
-#include "arrow/util/logging.h"
-#include "arrow/util/utf8.h"
-#include "parquet/encryption/encryption_internal.h"
-
-namespace parquet {
-
-// integer key retriever
-void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) {
- key_map_.insert({key_id, key});
-}
-
-std::string IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) {
- uint32_t key_id;
- memcpy(reinterpret_cast<uint8_t*>(&key_id), key_metadata.c_str(), 4);
-
- return key_map_.at(key_id);
-}
-
-// string key retriever
-void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) {
- key_map_.insert({key_id, key});
-}
-
-std::string StringKeyIdRetriever::GetKey(const std::string& key_id) {
- return key_map_.at(key_id);
-}
-
-ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key(
- std::string column_key) {
- if (column_key.empty()) return this;
-
- DCHECK(key_.empty());
- key_ = column_key;
- return this;
-}
-
-ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata(
- const std::string& key_metadata) {
- DCHECK(!key_metadata.empty());
- DCHECK(key_metadata_.empty());
- key_metadata_ = key_metadata;
- return this;
-}
-
-ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id(
- const std::string& key_id) {
- // key_id is expected to be in UTF8 encoding
- ::arrow::util::InitializeUTF8();
- const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
- if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
- throw ParquetException("key id should be in UTF8 encoding");
- }
-
- DCHECK(!key_id.empty());
- this->key_metadata(key_id);
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_keys(
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties) {
- if (column_decryption_properties.size() == 0) return this;
-
- if (column_decryption_properties_.size() != 0)
- throw ParquetException("Column properties already set");
-
- for (const auto& element : column_decryption_properties) {
- if (element.second->is_utilized()) {
- throw ParquetException("Column properties utilized in another file");
- }
- element.second->set_utilized();
- }
-
- column_decryption_properties_ = column_decryption_properties;
- return this;
-}
-
-void FileDecryptionProperties::WipeOutDecryptionKeys() {
- footer_key_.clear();
-
- for (const auto& element : column_decryption_properties_) {
- element.second->WipeOutDecryptionKey();
- }
-}
-
-bool FileDecryptionProperties::is_utilized() {
- if (footer_key_.empty() && column_decryption_properties_.size() == 0 &&
- aad_prefix_.empty())
- return false;
-
- return utilized_;
-}
-
-std::shared_ptr<FileDecryptionProperties> FileDecryptionProperties::DeepClone(
- std::string new_aad_prefix) {
- std::string footer_key_copy = footer_key_;
- ColumnPathToDecryptionPropertiesMap column_decryption_properties_map_copy;
-
- for (const auto& element : column_decryption_properties_) {
- column_decryption_properties_map_copy.insert(
- {element.second->column_path(), element.second->DeepClone()});
- }
-
- if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
- return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
- footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix,
- aad_prefix_verifier_, column_decryption_properties_map_copy,
- plaintext_files_allowed_));
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key(
- const std::string footer_key) {
- if (footer_key.empty()) {
- return this;
- }
- DCHECK(footer_key_.empty());
- footer_key_ = footer_key;
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever(
- const std::shared_ptr<DecryptionKeyRetriever>& key_retriever) {
- if (key_retriever == nullptr) return this;
-
- DCHECK(key_retriever_ == nullptr);
- key_retriever_ = key_retriever;
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix(
- const std::string& aad_prefix) {
- if (aad_prefix.empty()) {
- return this;
- }
- DCHECK(aad_prefix_.empty());
- aad_prefix_ = aad_prefix;
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier(
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier) {
- if (aad_prefix_verifier == nullptr) return this;
-
- DCHECK(aad_prefix_verifier_ == nullptr);
- aad_prefix_verifier_ = std::move(aad_prefix_verifier);
- return this;
-}
-
-ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key(
- const std::string& key) {
- if (key.empty()) return this;
-
- DCHECK(!key.empty());
- key_ = key;
- return this;
-}
-
-std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::Builder::build() {
- return std::shared_ptr<ColumnDecryptionProperties>(
- new ColumnDecryptionProperties(column_path_, key_));
-}
-
-void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); }
-
-std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::DeepClone() {
- std::string key_copy = key_;
- return std::shared_ptr<ColumnDecryptionProperties>(
- new ColumnDecryptionProperties(column_path_, key_copy));
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata(
- const std::string& footer_key_metadata) {
- if (footer_key_metadata.empty()) return this;
-
- DCHECK(footer_key_metadata_.empty());
- footer_key_metadata_ = footer_key_metadata;
- return this;
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_columns(
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns) {
- if (encrypted_columns.size() == 0) return this;
-
- if (encrypted_columns_.size() != 0)
- throw ParquetException("Column properties already set");
-
- for (const auto& element : encrypted_columns) {
- if (element.second->is_utilized()) {
- throw ParquetException("Column properties utilized in another file");
- }
- element.second->set_utilized();
- }
- encrypted_columns_ = encrypted_columns;
- return this;
-}
-
-void FileEncryptionProperties::WipeOutEncryptionKeys() {
- footer_key_.clear();
- for (const auto& element : encrypted_columns_) {
- element.second->WipeOutEncryptionKey();
- }
-}
-
-std::shared_ptr<FileEncryptionProperties> FileEncryptionProperties::DeepClone(
- std::string new_aad_prefix) {
- std::string footer_key_copy = footer_key_;
- ColumnPathToEncryptionPropertiesMap encrypted_columns_map_copy;
-
- for (const auto& element : encrypted_columns_) {
- encrypted_columns_map_copy.insert(
- {element.second->column_path(), element.second->DeepClone()});
- }
-
- if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
- return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
- algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_,
- new_aad_prefix, store_aad_prefix_in_file_, encrypted_columns_map_copy));
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix(
- const std::string& aad_prefix) {
- if (aad_prefix.empty()) return this;
-
- DCHECK(aad_prefix_.empty());
- aad_prefix_ = aad_prefix;
- store_aad_prefix_in_file_ = true;
- return this;
-}
-
-FileEncryptionProperties::Builder*
-FileEncryptionProperties::Builder::disable_aad_prefix_storage() {
- DCHECK(!aad_prefix_.empty());
-
- store_aad_prefix_in_file_ = false;
- return this;
-}
-
-ColumnEncryptionProperties::ColumnEncryptionProperties(bool encrypted,
- const std::string& column_path,
- const std::string& key,
- const std::string& key_metadata)
- : column_path_(column_path) {
- // column encryption properties object (with a column key) can be used for writing only
- // one file.
- // Upon completion of file writing, the encryption keys in the properties will be wiped
- // out (set to 0 in memory).
- utilized_ = false;
-
- DCHECK(!column_path.empty());
- if (!encrypted) {
- DCHECK(key.empty() && key_metadata.empty());
- }
-
- if (!key.empty()) {
- DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
- }
-
- encrypted_with_footer_key_ = (encrypted && key.empty());
- if (encrypted_with_footer_key_) {
- DCHECK(key_metadata.empty());
- }
-
- encrypted_ = encrypted;
- key_metadata_ = key_metadata;
- key_ = key;
-}
-
-ColumnDecryptionProperties::ColumnDecryptionProperties(const std::string& column_path,
- const std::string& key)
- : column_path_(column_path) {
- utilized_ = false;
- DCHECK(!column_path.empty());
-
- if (!key.empty()) {
- DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
- }
-
- key_ = key;
-}
-
-std::string FileDecryptionProperties::column_key(const std::string& column_path) const {
- if (column_decryption_properties_.find(column_path) !=
- column_decryption_properties_.end()) {
- auto column_prop = column_decryption_properties_.at(column_path);
- if (column_prop != nullptr) {
- return column_prop->key();
- }
- }
- return empty_string_;
-}
-
-FileDecryptionProperties::FileDecryptionProperties(
- const std::string& footer_key, std::shared_ptr<DecryptionKeyRetriever> key_retriever,
- bool check_plaintext_footer_integrity, const std::string& aad_prefix,
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
- bool plaintext_files_allowed) {
- DCHECK(!footer_key.empty() || nullptr != key_retriever ||
- 0 != column_decryption_properties.size());
-
- if (!footer_key.empty()) {
- DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
- footer_key.length() == 32);
- }
- if (footer_key.empty() && check_plaintext_footer_integrity) {
- DCHECK(nullptr != key_retriever);
- }
- aad_prefix_verifier_ = std::move(aad_prefix_verifier);
- footer_key_ = footer_key;
- check_plaintext_footer_integrity_ = check_plaintext_footer_integrity;
- key_retriever_ = std::move(key_retriever);
- aad_prefix_ = aad_prefix;
- column_decryption_properties_ = column_decryption_properties;
- plaintext_files_allowed_ = plaintext_files_allowed;
- utilized_ = false;
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id(
- const std::string& key_id) {
- // key_id is expected to be in UTF8 encoding
- ::arrow::util::InitializeUTF8();
- const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
- if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
- throw ParquetException("footer key id should be in UTF8 encoding");
- }
-
- if (key_id.empty()) {
- return this;
- }
-
- return footer_key_metadata(key_id);
-}
-
-std::shared_ptr<ColumnEncryptionProperties>
-FileEncryptionProperties::column_encryption_properties(const std::string& column_path) {
- if (encrypted_columns_.size() == 0) {
- auto builder = std::make_shared<ColumnEncryptionProperties::Builder>(column_path);
- return builder->build();
- }
- if (encrypted_columns_.find(column_path) != encrypted_columns_.end()) {
- return encrypted_columns_[column_path];
- }
-
- return nullptr;
-}
-
-FileEncryptionProperties::FileEncryptionProperties(
- ParquetCipher::type cipher, const std::string& footer_key,
- const std::string& footer_key_metadata, bool encrypted_footer,
- const std::string& aad_prefix, bool store_aad_prefix_in_file,
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns)
- : footer_key_(footer_key),
- footer_key_metadata_(footer_key_metadata),
- encrypted_footer_(encrypted_footer),
- aad_prefix_(aad_prefix),
- store_aad_prefix_in_file_(store_aad_prefix_in_file),
- encrypted_columns_(encrypted_columns) {
- // file encryption properties object can be used for writing only one file.
- // Upon completion of file writing, the encryption keys in the properties will be wiped
- // out (set to 0 in memory).
- utilized_ = false;
-
- DCHECK(!footer_key.empty());
- // footer_key must be either 16, 24 or 32 bytes.
- DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
- footer_key.length() == 32);
-
- uint8_t aad_file_unique[kAadFileUniqueLength];
- memset(aad_file_unique, 0, kAadFileUniqueLength);
- encryption::RandBytes(aad_file_unique, kAadFileUniqueLength);
- std::string aad_file_unique_str(reinterpret_cast<char const*>(aad_file_unique),
- kAadFileUniqueLength);
-
- bool supply_aad_prefix = false;
- if (aad_prefix.empty()) {
- file_aad_ = aad_file_unique_str;
- } else {
- file_aad_ = aad_prefix + aad_file_unique_str;
- if (!store_aad_prefix_in_file) supply_aad_prefix = true;
- }
- algorithm_.algorithm = cipher;
- algorithm_.aad.aad_file_unique = aad_file_unique_str;
- algorithm_.aad.supply_aad_prefix = supply_aad_prefix;
- if (!aad_prefix.empty() && store_aad_prefix_in_file) {
- algorithm_.aad.aad_prefix = aad_prefix;
- }
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/encryption.h"
+
+#include <string.h>
+
+#include <map>
+#include <utility>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/utf8.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// integer key retriever
+void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) {
+ key_map_.insert({key_id, key});
+}
+
+std::string IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) {
+ uint32_t key_id;
+ memcpy(reinterpret_cast<uint8_t*>(&key_id), key_metadata.c_str(), 4);
+
+ return key_map_.at(key_id);
+}
+
+// string key retriever
+void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) {
+ key_map_.insert({key_id, key});
+}
+
+std::string StringKeyIdRetriever::GetKey(const std::string& key_id) {
+ return key_map_.at(key_id);
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key(
+ std::string column_key) {
+ if (column_key.empty()) return this;
+
+ DCHECK(key_.empty());
+ key_ = column_key;
+ return this;
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata(
+ const std::string& key_metadata) {
+ DCHECK(!key_metadata.empty());
+ DCHECK(key_metadata_.empty());
+ key_metadata_ = key_metadata;
+ return this;
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id(
+ const std::string& key_id) {
+ // key_id is expected to be in UTF8 encoding
+ ::arrow::util::InitializeUTF8();
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+ if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
+ throw ParquetException("key id should be in UTF8 encoding");
+ }
+
+ DCHECK(!key_id.empty());
+ this->key_metadata(key_id);
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_keys(
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties) {
+ if (column_decryption_properties.size() == 0) return this;
+
+ if (column_decryption_properties_.size() != 0)
+ throw ParquetException("Column properties already set");
+
+ for (const auto& element : column_decryption_properties) {
+ if (element.second->is_utilized()) {
+ throw ParquetException("Column properties utilized in another file");
+ }
+ element.second->set_utilized();
+ }
+
+ column_decryption_properties_ = column_decryption_properties;
+ return this;
+}
+
+void FileDecryptionProperties::WipeOutDecryptionKeys() {
+ footer_key_.clear();
+
+ for (const auto& element : column_decryption_properties_) {
+ element.second->WipeOutDecryptionKey();
+ }
+}
+
+bool FileDecryptionProperties::is_utilized() {
+ if (footer_key_.empty() && column_decryption_properties_.size() == 0 &&
+ aad_prefix_.empty())
+ return false;
+
+ return utilized_;
+}
+
+std::shared_ptr<FileDecryptionProperties> FileDecryptionProperties::DeepClone(
+ std::string new_aad_prefix) {
+ std::string footer_key_copy = footer_key_;
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_map_copy;
+
+ for (const auto& element : column_decryption_properties_) {
+ column_decryption_properties_map_copy.insert(
+ {element.second->column_path(), element.second->DeepClone()});
+ }
+
+ if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
+ return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
+ footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix,
+ aad_prefix_verifier_, column_decryption_properties_map_copy,
+ plaintext_files_allowed_));
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key(
+ const std::string footer_key) {
+ if (footer_key.empty()) {
+ return this;
+ }
+ DCHECK(footer_key_.empty());
+ footer_key_ = footer_key;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever(
+ const std::shared_ptr<DecryptionKeyRetriever>& key_retriever) {
+ if (key_retriever == nullptr) return this;
+
+ DCHECK(key_retriever_ == nullptr);
+ key_retriever_ = key_retriever;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix(
+ const std::string& aad_prefix) {
+ if (aad_prefix.empty()) {
+ return this;
+ }
+ DCHECK(aad_prefix_.empty());
+ aad_prefix_ = aad_prefix;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier(
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier) {
+ if (aad_prefix_verifier == nullptr) return this;
+
+ DCHECK(aad_prefix_verifier_ == nullptr);
+ aad_prefix_verifier_ = std::move(aad_prefix_verifier);
+ return this;
+}
+
+ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key(
+ const std::string& key) {
+ if (key.empty()) return this;
+
+ DCHECK(!key.empty());
+ key_ = key;
+ return this;
+}
+
+std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::Builder::build() {
+ return std::shared_ptr<ColumnDecryptionProperties>(
+ new ColumnDecryptionProperties(column_path_, key_));
+}
+
+void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); }
+
+std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::DeepClone() {
+ std::string key_copy = key_;
+ return std::shared_ptr<ColumnDecryptionProperties>(
+ new ColumnDecryptionProperties(column_path_, key_copy));
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata(
+ const std::string& footer_key_metadata) {
+ if (footer_key_metadata.empty()) return this;
+
+ DCHECK(footer_key_metadata_.empty());
+ footer_key_metadata_ = footer_key_metadata;
+ return this;
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_columns(
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns) {
+ if (encrypted_columns.size() == 0) return this;
+
+ if (encrypted_columns_.size() != 0)
+ throw ParquetException("Column properties already set");
+
+ for (const auto& element : encrypted_columns) {
+ if (element.second->is_utilized()) {
+ throw ParquetException("Column properties utilized in another file");
+ }
+ element.second->set_utilized();
+ }
+ encrypted_columns_ = encrypted_columns;
+ return this;
+}
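+
+// Sketch of assembling the per-column map consumed by this builder
+// (illustrative; the column path and 16-byte key below are hypothetical):
+//
+//   ColumnEncryptionProperties::Builder col_builder("a.b.c");
+//   ColumnPathToEncryptionPropertiesMap cols;
+//   cols["a.b.c"] =
+//       col_builder.key("1234567890123450")->key_id("kc1")->build();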
+
+void FileEncryptionProperties::WipeOutEncryptionKeys() {
+ footer_key_.clear();
+ for (const auto& element : encrypted_columns_) {
+ element.second->WipeOutEncryptionKey();
+ }
+}
+
+std::shared_ptr<FileEncryptionProperties> FileEncryptionProperties::DeepClone(
+ std::string new_aad_prefix) {
+ std::string footer_key_copy = footer_key_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_map_copy;
+
+ for (const auto& element : encrypted_columns_) {
+ encrypted_columns_map_copy.insert(
+ {element.second->column_path(), element.second->DeepClone()});
+ }
+
+ if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
+ return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
+ algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_,
+ new_aad_prefix, store_aad_prefix_in_file_, encrypted_columns_map_copy));
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix(
+ const std::string& aad_prefix) {
+ if (aad_prefix.empty()) return this;
+
+ DCHECK(aad_prefix_.empty());
+ aad_prefix_ = aad_prefix;
+ store_aad_prefix_in_file_ = true;
+ return this;
+}
+
+FileEncryptionProperties::Builder*
+FileEncryptionProperties::Builder::disable_aad_prefix_storage() {
+ DCHECK(!aad_prefix_.empty());
+
+ store_aad_prefix_in_file_ = false;
+ return this;
+}
+
+ColumnEncryptionProperties::ColumnEncryptionProperties(bool encrypted,
+ const std::string& column_path,
+ const std::string& key,
+ const std::string& key_metadata)
+ : column_path_(column_path) {
+ // column encryption properties object (with a column key) can be used for writing only
+ // one file.
+ // Upon completion of file writing, the encryption keys in the properties will be wiped
+ // out (set to 0 in memory).
+ utilized_ = false;
+
+ DCHECK(!column_path.empty());
+ if (!encrypted) {
+ DCHECK(key.empty() && key_metadata.empty());
+ }
+
+ if (!key.empty()) {
+ DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+ }
+
+ encrypted_with_footer_key_ = (encrypted && key.empty());
+ if (encrypted_with_footer_key_) {
+ DCHECK(key_metadata.empty());
+ }
+
+ encrypted_ = encrypted;
+ key_metadata_ = key_metadata;
+ key_ = key;
+}
+
+ColumnDecryptionProperties::ColumnDecryptionProperties(const std::string& column_path,
+ const std::string& key)
+ : column_path_(column_path) {
+ utilized_ = false;
+ DCHECK(!column_path.empty());
+
+ if (!key.empty()) {
+ DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+ }
+
+ key_ = key;
+}
+
+std::string FileDecryptionProperties::column_key(const std::string& column_path) const {
+ if (column_decryption_properties_.find(column_path) !=
+ column_decryption_properties_.end()) {
+ auto column_prop = column_decryption_properties_.at(column_path);
+ if (column_prop != nullptr) {
+ return column_prop->key();
+ }
+ }
+ return empty_string_;
+}
+
+FileDecryptionProperties::FileDecryptionProperties(
+ const std::string& footer_key, std::shared_ptr<DecryptionKeyRetriever> key_retriever,
+ bool check_plaintext_footer_integrity, const std::string& aad_prefix,
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
+ bool plaintext_files_allowed) {
+ DCHECK(!footer_key.empty() || nullptr != key_retriever ||
+ 0 != column_decryption_properties.size());
+
+ if (!footer_key.empty()) {
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+ footer_key.length() == 32);
+ }
+ if (footer_key.empty() && check_plaintext_footer_integrity) {
+ DCHECK(nullptr != key_retriever);
+ }
+ aad_prefix_verifier_ = std::move(aad_prefix_verifier);
+ footer_key_ = footer_key;
+ check_plaintext_footer_integrity_ = check_plaintext_footer_integrity;
+ key_retriever_ = std::move(key_retriever);
+ aad_prefix_ = aad_prefix;
+ column_decryption_properties_ = column_decryption_properties;
+ plaintext_files_allowed_ = plaintext_files_allowed;
+ utilized_ = false;
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id(
+ const std::string& key_id) {
+ // key_id is expected to be in UTF8 encoding
+ ::arrow::util::InitializeUTF8();
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+ if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
+ throw ParquetException("footer key id should be in UTF8 encoding");
+ }
+
+ if (key_id.empty()) {
+ return this;
+ }
+
+ return footer_key_metadata(key_id);
+}
+
+std::shared_ptr<ColumnEncryptionProperties>
+FileEncryptionProperties::column_encryption_properties(const std::string& column_path) {
+ if (encrypted_columns_.size() == 0) {
+ auto builder = std::make_shared<ColumnEncryptionProperties::Builder>(column_path);
+ return builder->build();
+ }
+ if (encrypted_columns_.find(column_path) != encrypted_columns_.end()) {
+ return encrypted_columns_[column_path];
+ }
+
+ return nullptr;
+}
+
+FileEncryptionProperties::FileEncryptionProperties(
+ ParquetCipher::type cipher, const std::string& footer_key,
+ const std::string& footer_key_metadata, bool encrypted_footer,
+ const std::string& aad_prefix, bool store_aad_prefix_in_file,
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns)
+ : footer_key_(footer_key),
+ footer_key_metadata_(footer_key_metadata),
+ encrypted_footer_(encrypted_footer),
+ aad_prefix_(aad_prefix),
+ store_aad_prefix_in_file_(store_aad_prefix_in_file),
+ encrypted_columns_(encrypted_columns) {
+ // A file encryption properties object can be used for writing only one file.
+ // Upon completion of file writing, the encryption keys in the properties will be wiped
+ // out (set to 0 in memory).
+ utilized_ = false;
+
+ DCHECK(!footer_key.empty());
+ // footer_key must be either 16, 24 or 32 bytes.
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+ footer_key.length() == 32);
+
+ uint8_t aad_file_unique[kAadFileUniqueLength];
+ memset(aad_file_unique, 0, kAadFileUniqueLength);
+ encryption::RandBytes(aad_file_unique, kAadFileUniqueLength);
+ std::string aad_file_unique_str(reinterpret_cast<char const*>(aad_file_unique),
+ kAadFileUniqueLength);
+
+ bool supply_aad_prefix = false;
+ if (aad_prefix.empty()) {
+ file_aad_ = aad_file_unique_str;
+ } else {
+ file_aad_ = aad_prefix + aad_file_unique_str;
+ if (!store_aad_prefix_in_file) supply_aad_prefix = true;
+ }
+ algorithm_.algorithm = cipher;
+ algorithm_.aad.aad_file_unique = aad_file_unique_str;
+ algorithm_.aad.supply_aad_prefix = supply_aad_prefix;
+ if (!aad_prefix.empty() && store_aad_prefix_in_file) {
+ algorithm_.aad.aad_prefix = aad_prefix;
+ }
+}
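The constructor above derives file_aad as aad_prefix plus eight fresh random bytes, so properties cannot be shared across files. A sketch of preparing a second file, assuming file_props is an existing std::shared_ptr<FileEncryptionProperties> and the new prefix is a placeholder:

    std::shared_ptr<FileEncryptionProperties> next_file_props =
        file_props->DeepClone("dataset/file2");
    // next_file_props gets its own 8-byte aad_file_unique on construction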
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
index 8fd7ec8d3d0..840c669e6bd 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
@@ -1,510 +1,510 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "parquet/exception.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
- ParquetCipher::AES_GCM_V1;
-static constexpr int32_t kMaximalAadMetadataLength = 256;
-static constexpr bool kDefaultEncryptedFooter = true;
-static constexpr bool kDefaultCheckSignature = true;
-static constexpr bool kDefaultAllowPlaintextFiles = false;
-static constexpr int32_t kAadFileUniqueLength = 8;
-
-class ColumnDecryptionProperties;
-using ColumnPathToDecryptionPropertiesMap =
- std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
-
-class ColumnEncryptionProperties;
-using ColumnPathToEncryptionPropertiesMap =
- std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
-
-class PARQUET_EXPORT DecryptionKeyRetriever {
- public:
- virtual std::string GetKey(const std::string& key_metadata) = 0;
- virtual ~DecryptionKeyRetriever() {}
-};
-
-/// Simple integer key retriever
-class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
- public:
- void PutKey(uint32_t key_id, const std::string& key);
- std::string GetKey(const std::string& key_metadata) override;
-
- private:
- std::map<uint32_t, std::string> key_map_;
-};
-
-// Simple string key retriever
-class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
- public:
- void PutKey(const std::string& key_id, const std::string& key);
- std::string GetKey(const std::string& key_metadata) override;
-
- private:
- std::map<std::string, std::string> key_map_;
-};
-
-class PARQUET_EXPORT HiddenColumnException : public ParquetException {
- public:
- explicit HiddenColumnException(const std::string& columnPath)
- : ParquetException(columnPath.c_str()) {}
-};
-
-class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
- public:
- explicit KeyAccessDeniedException(const std::string& columnPath)
- : ParquetException(columnPath.c_str()) {}
-};
-
-inline const uint8_t* str2bytes(const std::string& str) {
- if (str.empty()) return NULLPTR;
-
- char* cbytes = const_cast<char*>(str.c_str());
- return reinterpret_cast<const uint8_t*>(cbytes);
-}
-
-class PARQUET_EXPORT ColumnEncryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- /// Convenience builder for encrypted columns.
- explicit Builder(const std::string& name) : Builder(name, true) {}
-
- /// Convenience builder for encrypted columns.
- explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
- : Builder(path->ToDotString(), true) {}
-
- /// Set a column-specific key.
- /// If key is not set on an encrypted column, the column will
- /// be encrypted with the footer key.
- /// keyBytes Key length must be either 16, 24 or 32 bytes.
- /// The key is cloned, and will be wiped out (array values set to 0) upon completion
- /// of file writing.
- /// Caller is responsible for wiping out the input key array.
- Builder* key(std::string column_key);
-
- /// Set a key retrieval metadata.
- /// use either key_metadata() or key_id(), not both
- Builder* key_metadata(const std::string& key_metadata);
-
- /// A convenience function to set key metadata using a string id.
- /// Set a key retrieval metadata (converted from String).
- /// use either key_metadata() or key_id(), not both
- /// key_id will be converted to metadata (UTF-8 array).
- Builder* key_id(const std::string& key_id);
-
- std::shared_ptr<ColumnEncryptionProperties> build() {
- return std::shared_ptr<ColumnEncryptionProperties>(
- new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
- }
-
- private:
- const std::string column_path_;
- bool encrypted_;
- std::string key_;
- std::string key_metadata_;
-
- Builder(const std::string path, bool encrypted)
- : column_path_(path), encrypted_(encrypted) {}
- };
-
- std::string column_path() const { return column_path_; }
- bool is_encrypted() const { return encrypted_; }
- bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
- std::string key() const { return key_; }
- std::string key_metadata() const { return key_metadata_; }
-
- /// Upon completion of file writing, the encryption key
- /// will be wiped out.
- void WipeOutEncryptionKey() { key_.clear(); }
-
- bool is_utilized() {
- if (key_.empty())
- return false; // can re-use column properties without encryption keys
- return utilized_;
- }
-
- /// ColumnEncryptionProperties object can be used for writing one file only.
- /// Mark ColumnEncryptionProperties as utilized once it is used in
- /// FileEncryptionProperties as the encryption key will be wiped out upon
- /// completion of file writing.
- void set_utilized() { utilized_ = true; }
-
- std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
- std::string key_copy = key_;
- return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
- encrypted_, column_path_, key_copy, key_metadata_));
- }
-
- ColumnEncryptionProperties() = default;
- ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
- ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
-
- private:
- const std::string column_path_;
- bool encrypted_;
- bool encrypted_with_footer_key_;
- std::string key_;
- std::string key_metadata_;
- bool utilized_;
- explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
- const std::string& key,
- const std::string& key_metadata);
-};
-
-class PARQUET_EXPORT ColumnDecryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- explicit Builder(const std::string& name) : column_path_(name) {}
-
- explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
- : Builder(path->ToDotString()) {}
-
- /// Set an explicit column key. If applied on a file that contains
- /// key metadata for this column the metadata will be ignored,
- /// the column will be decrypted with this key.
- /// key length must be either 16, 24 or 32 bytes.
- Builder* key(const std::string& key);
-
- std::shared_ptr<ColumnDecryptionProperties> build();
-
- private:
- const std::string column_path_;
- std::string key_;
- };
-
- ColumnDecryptionProperties() = default;
- ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
- ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
-
- std::string column_path() const { return column_path_; }
- std::string key() const { return key_; }
- bool is_utilized() { return utilized_; }
-
- /// ColumnDecryptionProperties object can be used for reading one file only.
- /// Mark ColumnDecryptionProperties as utilized once it is used in
- /// FileDecryptionProperties as the encryption key will be wiped out upon
- /// completion of file reading.
- void set_utilized() { utilized_ = true; }
-
- /// Upon completion of file reading, the encryption key
- /// will be wiped out.
- void WipeOutDecryptionKey();
-
- std::shared_ptr<ColumnDecryptionProperties> DeepClone();
-
- private:
- const std::string column_path_;
- std::string key_;
- bool utilized_;
-
- /// This class is only required for setting explicit column decryption keys -
- /// to override key retriever (or to provide keys when key metadata and/or
- /// key retriever are not available)
- explicit ColumnDecryptionProperties(const std::string& column_path,
- const std::string& key);
-};
-
-class PARQUET_EXPORT AADPrefixVerifier {
- public:
- /// Verifies identity (AAD Prefix) of individual file,
- /// or of file collection in a data set.
- /// Throws exception if an AAD prefix is wrong.
- /// In a data set, AAD Prefixes should be collected,
- /// and then checked for missing files.
- virtual void Verify(const std::string& aad_prefix) = 0;
- virtual ~AADPrefixVerifier() {}
-};
-
-class PARQUET_EXPORT FileDecryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- Builder() {
- check_plaintext_footer_integrity_ = kDefaultCheckSignature;
- plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
- }
-
- /// Set an explicit footer key. If applied on a file that contains
- /// footer key metadata the metadata will be ignored, the footer
- /// will be decrypted/verified with this key.
- /// If explicit key is not set, footer key will be fetched from
- /// key retriever.
- /// With explicit keys or AAD prefix, new encryption properties object must be
- /// created for each encrypted file.
- /// Explicit encryption keys (footer and column) are cloned.
- /// Upon completion of file reading, the cloned encryption keys in the properties
- /// will be wiped out (array values set to 0).
- /// Caller is responsible for wiping out the input key array.
- /// param footerKey Key length must be either 16, 24 or 32 bytes.
- Builder* footer_key(const std::string footer_key);
-
- /// Set explicit column keys (decryption properties).
- /// Its also possible to set a key retriever on this property object.
- /// Upon file decryption, availability of explicit keys is checked before
- /// invocation of the retriever callback.
- /// If an explicit key is available for a footer or a column,
- /// its key metadata will be ignored.
- Builder* column_keys(
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);
-
- /// Set a key retriever callback. Its also possible to
- /// set explicit footer or column keys on this file property object.
- /// Upon file decryption, availability of explicit keys is checked before
- /// invocation of the retriever callback.
- /// If an explicit key is available for a footer or a column,
- /// its key metadata will be ignored.
- Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);
-
- /// Skip integrity verification of plaintext footers.
- /// If not called, integrity of plaintext footers will be checked in runtime,
- /// and an exception will be thrown in the following situations:
- /// - footer signing key is not available
- /// (not passed, or not found by key retriever)
- /// - footer content and signature don't match
- Builder* disable_footer_signature_verification() {
- check_plaintext_footer_integrity_ = false;
- return this;
- }
-
- /// Explicitly supply the file AAD prefix.
- /// A must when a prefix is used for file encryption, but not stored in file.
- /// If AAD prefix is stored in file, it will be compared to the explicitly
- /// supplied value and an exception will be thrown if they differ.
- Builder* aad_prefix(const std::string& aad_prefix);
-
- /// Set callback for verification of AAD Prefixes stored in file.
- Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
-
- /// By default, reading plaintext (unencrypted) files is not
- /// allowed when using a decryptor
- /// - in order to detect files that were not encrypted by mistake.
- /// However, the default behavior can be overridden by calling this method.
- /// The caller should use then a different method to ensure encryption
- /// of files with sensitive data.
- Builder* plaintext_files_allowed() {
- plaintext_files_allowed_ = true;
- return this;
- }
-
- std::shared_ptr<FileDecryptionProperties> build() {
- return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
- footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
- aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
- }
-
- private:
- std::string footer_key_;
- std::string aad_prefix_;
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
- ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
-
- std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
- bool check_plaintext_footer_integrity_;
- bool plaintext_files_allowed_;
- };
-
- std::string column_key(const std::string& column_path) const;
-
- std::string footer_key() const { return footer_key_; }
-
- std::string aad_prefix() const { return aad_prefix_; }
-
- const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
- return key_retriever_;
- }
-
- bool check_plaintext_footer_integrity() const {
- return check_plaintext_footer_integrity_;
- }
-
- bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
-
- const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
- return aad_prefix_verifier_;
- }
-
- /// Upon completion of file reading, the encryption keys in the properties
- /// will be wiped out (array values set to 0).
- void WipeOutDecryptionKeys();
-
- bool is_utilized();
-
- /// FileDecryptionProperties object can be used for reading one file only.
- /// Mark FileDecryptionProperties as utilized once it is used to read a file as the
- /// encryption keys will be wiped out upon completion of file reading.
- void set_utilized() { utilized_ = true; }
-
- /// FileDecryptionProperties object can be used for reading one file only.
- /// (unless this object keeps the keyRetrieval callback only, and no explicit
- /// keys or aadPrefix).
- /// At the end, keys are wiped out in the memory.
- /// This method allows to clone identical properties for another file,
- /// with an option to update the aadPrefix (if newAadPrefix is null,
- /// aadPrefix will be cloned too)
- std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");
-
- private:
- std::string footer_key_;
- std::string aad_prefix_;
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
-
- const std::string empty_string_ = "";
- ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
-
- std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
- bool check_plaintext_footer_integrity_;
- bool plaintext_files_allowed_;
- bool utilized_;
-
- FileDecryptionProperties(
- const std::string& footer_key,
- std::shared_ptr<DecryptionKeyRetriever> key_retriever,
- bool check_plaintext_footer_integrity, const std::string& aad_prefix,
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
- bool plaintext_files_allowed);
-};
-
-class PARQUET_EXPORT FileEncryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- explicit Builder(const std::string& footer_key)
- : parquet_cipher_(kDefaultEncryptionAlgorithm),
- encrypted_footer_(kDefaultEncryptedFooter) {
- footer_key_ = footer_key;
- store_aad_prefix_in_file_ = false;
- }
-
- /// Create files with plaintext footer.
- /// If not called, the files will be created with encrypted footer (default).
- Builder* set_plaintext_footer() {
- encrypted_footer_ = false;
- return this;
- }
-
- /// Set encryption algorithm.
- /// If not called, files will be encrypted with AES_GCM_V1 (default).
- Builder* algorithm(ParquetCipher::type parquet_cipher) {
- parquet_cipher_ = parquet_cipher;
- return this;
- }
-
- /// Set a key retrieval metadata (converted from String).
- /// use either footer_key_metadata or footer_key_id, not both.
- Builder* footer_key_id(const std::string& key_id);
-
- /// Set a key retrieval metadata.
- /// use either footer_key_metadata or footer_key_id, not both.
- Builder* footer_key_metadata(const std::string& footer_key_metadata);
-
- /// Set the file AAD Prefix.
- Builder* aad_prefix(const std::string& aad_prefix);
-
- /// Skip storing AAD Prefix in file.
- /// If not called, and if AAD Prefix is set, it will be stored.
- Builder* disable_aad_prefix_storage();
-
- /// Set the list of encrypted columns and their properties (keys etc).
- /// If not called, all columns will be encrypted with the footer key.
- /// If called, the file columns not in the list will be left unencrypted.
- Builder* encrypted_columns(
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
-
- std::shared_ptr<FileEncryptionProperties> build() {
- return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
- parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
- aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
- }
-
- private:
- ParquetCipher::type parquet_cipher_;
- bool encrypted_footer_;
- std::string footer_key_;
- std::string footer_key_metadata_;
-
- std::string aad_prefix_;
- bool store_aad_prefix_in_file_;
- ColumnPathToEncryptionPropertiesMap encrypted_columns_;
- };
- bool encrypted_footer() const { return encrypted_footer_; }
-
- EncryptionAlgorithm algorithm() const { return algorithm_; }
-
- std::string footer_key() const { return footer_key_; }
-
- std::string footer_key_metadata() const { return footer_key_metadata_; }
-
- std::string file_aad() const { return file_aad_; }
-
- std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
- const std::string& column_path);
-
- bool is_utilized() const { return utilized_; }
-
- /// FileEncryptionProperties object can be used for writing one file only.
- /// Mark FileEncryptionProperties as utilized once it is used to write a file as the
- /// encryption keys will be wiped out upon completion of file writing.
- void set_utilized() { utilized_ = true; }
-
- /// Upon completion of file writing, the encryption keys
- /// will be wiped out (array values set to 0).
- void WipeOutEncryptionKeys();
-
- /// FileEncryptionProperties object can be used for writing one file only.
- /// (at the end, keys are wiped out in the memory).
- /// This method allows to clone identical properties for another file,
- /// with an option to update the aadPrefix (if newAadPrefix is null,
- /// aadPrefix will be cloned too)
- std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");
-
- ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
- return encrypted_columns_;
- }
-
- private:
- EncryptionAlgorithm algorithm_;
- std::string footer_key_;
- std::string footer_key_metadata_;
- bool encrypted_footer_;
- std::string file_aad_;
- std::string aad_prefix_;
- bool utilized_;
- bool store_aad_prefix_in_file_;
- ColumnPathToEncryptionPropertiesMap encrypted_columns_;
-
- FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
- const std::string& footer_key_metadata, bool encrypted_footer,
- const std::string& aad_prefix, bool store_aad_prefix_in_file,
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
+ ParquetCipher::AES_GCM_V1;
+static constexpr int32_t kMaximalAadMetadataLength = 256;
+static constexpr bool kDefaultEncryptedFooter = true;
+static constexpr bool kDefaultCheckSignature = true;
+static constexpr bool kDefaultAllowPlaintextFiles = false;
+static constexpr int32_t kAadFileUniqueLength = 8;
+
+class ColumnDecryptionProperties;
+using ColumnPathToDecryptionPropertiesMap =
+ std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
+
+class ColumnEncryptionProperties;
+using ColumnPathToEncryptionPropertiesMap =
+ std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
+
+class PARQUET_EXPORT DecryptionKeyRetriever {
+ public:
+ virtual std::string GetKey(const std::string& key_metadata) = 0;
+ virtual ~DecryptionKeyRetriever() {}
+};
+
+/// Simple integer key retriever
+class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+ void PutKey(uint32_t key_id, const std::string& key);
+ std::string GetKey(const std::string& key_metadata) override;
+
+ private:
+ std::map<uint32_t, std::string> key_map_;
+};
+
+// Simple string key retriever
+class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+ void PutKey(const std::string& key_id, const std::string& key);
+ std::string GetKey(const std::string& key_metadata) override;
+
+ private:
+ std::map<std::string, std::string> key_map_;
+};
+
+class PARQUET_EXPORT HiddenColumnException : public ParquetException {
+ public:
+ explicit HiddenColumnException(const std::string& columnPath)
+ : ParquetException(columnPath.c_str()) {}
+};
+
+class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
+ public:
+ explicit KeyAccessDeniedException(const std::string& columnPath)
+ : ParquetException(columnPath.c_str()) {}
+};
+
+inline const uint8_t* str2bytes(const std::string& str) {
+ if (str.empty()) return NULLPTR;
+
+ char* cbytes = const_cast<char*>(str.c_str());
+ return reinterpret_cast<const uint8_t*>(cbytes);
+}
+
+class PARQUET_EXPORT ColumnEncryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ /// Convenience builder for encrypted columns.
+ explicit Builder(const std::string& name) : Builder(name, true) {}
+
+ /// Convenience builder for encrypted columns, from a parsed column path.
+ explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
+ : Builder(path->ToDotString(), true) {}
+
+ /// Set a column-specific key.
+ /// If key is not set on an encrypted column, the column will
+ /// be encrypted with the footer key.
+ /// The key length must be either 16, 24 or 32 bytes.
+ /// The key is cloned, and will be wiped out (array values set to 0) upon completion
+ /// of file writing.
+ /// Caller is responsible for wiping out the input key array.
+ Builder* key(std::string column_key);
+
+ /// Set key retrieval metadata.
+ /// Use either key_metadata() or key_id(), not both.
+ Builder* key_metadata(const std::string& key_metadata);
+
+ /// A convenience function to set key retrieval metadata using a string id;
+ /// the key_id will be converted to UTF-8 metadata.
+ /// Use either key_metadata() or key_id(), not both.
+ Builder* key_id(const std::string& key_id);
+
+ std::shared_ptr<ColumnEncryptionProperties> build() {
+ return std::shared_ptr<ColumnEncryptionProperties>(
+ new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
+ }
+
+ private:
+ const std::string column_path_;
+ bool encrypted_;
+ std::string key_;
+ std::string key_metadata_;
+
+ Builder(const std::string path, bool encrypted)
+ : column_path_(path), encrypted_(encrypted) {}
+ };
+
+ std::string column_path() const { return column_path_; }
+ bool is_encrypted() const { return encrypted_; }
+ bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
+ std::string key() const { return key_; }
+ std::string key_metadata() const { return key_metadata_; }
+
+ /// Upon completion of file writing, the encryption key
+ /// will be wiped out.
+ void WipeOutEncryptionKey() { key_.clear(); }
+
+ bool is_utilized() {
+ if (key_.empty())
+ return false; // can re-use column properties without encryption keys
+ return utilized_;
+ }
+
+ /// ColumnEncryptionProperties object can be used for writing one file only.
+ /// Mark ColumnEncryptionProperties as utilized once it is used in
+ /// FileEncryptionProperties as the encryption key will be wiped out upon
+ /// completion of file writing.
+ void set_utilized() { utilized_ = true; }
+
+ std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
+ std::string key_copy = key_;
+ return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
+ encrypted_, column_path_, key_copy, key_metadata_));
+ }
+
+ ColumnEncryptionProperties() = default;
+ ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
+ ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
+
+ private:
+ const std::string column_path_;
+ bool encrypted_;
+ bool encrypted_with_footer_key_;
+ std::string key_;
+ std::string key_metadata_;
+ bool utilized_;
+ explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
+ const std::string& key,
+ const std::string& key_metadata);
+};
+
+class PARQUET_EXPORT ColumnDecryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ explicit Builder(const std::string& name) : column_path_(name) {}
+
+ explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
+ : Builder(path->ToDotString()) {}
+
+ /// Set an explicit column key. If applied on a file that contains
+ /// key metadata for this column the metadata will be ignored,
+ /// the column will be decrypted with this key.
+ /// key length must be either 16, 24 or 32 bytes.
+ Builder* key(const std::string& key);
+
+ std::shared_ptr<ColumnDecryptionProperties> build();
+
+ private:
+ const std::string column_path_;
+ std::string key_;
+ };
+
+ ColumnDecryptionProperties() = default;
+ ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
+ ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
+
+ std::string column_path() const { return column_path_; }
+ std::string key() const { return key_; }
+ bool is_utilized() { return utilized_; }
+
+ /// ColumnDecryptionProperties object can be used for reading one file only.
+ /// Mark ColumnDecryptionProperties as utilized once it is used in
+ /// FileDecryptionProperties as the encryption key will be wiped out upon
+ /// completion of file reading.
+ void set_utilized() { utilized_ = true; }
+
+ /// Upon completion of file reading, the encryption key
+ /// will be wiped out.
+ void WipeOutDecryptionKey();
+
+ std::shared_ptr<ColumnDecryptionProperties> DeepClone();
+
+ private:
+ const std::string column_path_;
+ std::string key_;
+ bool utilized_;
+
+ /// This class is only required for setting explicit column decryption keys -
+ /// to override the key retriever (or to provide keys when key metadata and/or
+ /// a key retriever are not available).
+ explicit ColumnDecryptionProperties(const std::string& column_path,
+ const std::string& key);
+};
+
+class PARQUET_EXPORT AADPrefixVerifier {
+ public:
+ /// Verifies identity (AAD Prefix) of individual file,
+ /// or of file collection in a data set.
+ /// Throws exception if an AAD prefix is wrong.
+ /// In a data set, AAD Prefixes should be collected,
+ /// and then checked for missing files.
+ virtual void Verify(const std::string& aad_prefix) = 0;
+ virtual ~AADPrefixVerifier() {}
+};
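A sketch of a concrete verifier for the interface above; the expected prefix string is a placeholder:

    class ExactPrefixVerifier : public AADPrefixVerifier {
     public:
      void Verify(const std::string& aad_prefix) override {
        if (aad_prefix != "expected-dataset") {
          throw ParquetException("Unexpected AAD prefix: " + aad_prefix);
        }
      }
    };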
+
+class PARQUET_EXPORT FileDecryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ Builder() {
+ check_plaintext_footer_integrity_ = kDefaultCheckSignature;
+ plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
+ }
+
+ /// Set an explicit footer key. If applied on a file that contains
+ /// footer key metadata the metadata will be ignored, the footer
+ /// will be decrypted/verified with this key.
+ /// If explicit key is not set, footer key will be fetched from
+ /// key retriever.
+ /// With explicit keys or AAD prefix, new encryption properties object must be
+ /// created for each encrypted file.
+ /// Explicit encryption keys (footer and column) are cloned.
+ /// Upon completion of file reading, the cloned encryption keys in the properties
+ /// will be wiped out (array values set to 0).
+ /// Caller is responsible for wiping out the input key array.
+ /// The footer key length must be either 16, 24 or 32 bytes.
+ Builder* footer_key(const std::string footer_key);
+
+ /// Set explicit column keys (decryption properties).
+ /// It's also possible to set a key retriever on this property object.
+ /// Upon file decryption, availability of explicit keys is checked before
+ /// invocation of the retriever callback.
+ /// If an explicit key is available for a footer or a column,
+ /// its key metadata will be ignored.
+ Builder* column_keys(
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);
+
+ /// Set a key retriever callback. It's also possible to
+ /// set explicit footer or column keys on this file property object.
+ /// Upon file decryption, availability of explicit keys is checked before
+ /// invocation of the retriever callback.
+ /// If an explicit key is available for a footer or a column,
+ /// its key metadata will be ignored.
+ Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);
+
+ /// Skip integrity verification of plaintext footers.
+ /// If not called, integrity of plaintext footers will be checked at runtime,
+ /// and an exception will be thrown in the following situations:
+ /// - footer signing key is not available
+ /// (not passed, or not found by key retriever)
+ /// - footer content and signature don't match
+ Builder* disable_footer_signature_verification() {
+ check_plaintext_footer_integrity_ = false;
+ return this;
+ }
+
+ /// Explicitly supply the file AAD prefix.
+ /// Required when a prefix is used for file encryption but not stored in the file.
+ /// If AAD prefix is stored in file, it will be compared to the explicitly
+ /// supplied value and an exception will be thrown if they differ.
+ Builder* aad_prefix(const std::string& aad_prefix);
+
+ /// Set callback for verification of AAD Prefixes stored in file.
+ Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
+
+ /// By default, reading plaintext (unencrypted) files is not allowed when using
+ /// a decryptor, in order to detect files that were not encrypted by mistake.
+ /// However, the default behavior can be overridden by calling this method.
+ /// The caller should then use a different mechanism to ensure encryption
+ /// of files with sensitive data.
+ Builder* plaintext_files_allowed() {
+ plaintext_files_allowed_ = true;
+ return this;
+ }
+
+ std::shared_ptr<FileDecryptionProperties> build() {
+ return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
+ footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
+ aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
+ }
+
+ private:
+ std::string footer_key_;
+ std::string aad_prefix_;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+ bool check_plaintext_footer_integrity_;
+ bool plaintext_files_allowed_;
+ };
+
+ std::string column_key(const std::string& column_path) const;
+
+ std::string footer_key() const { return footer_key_; }
+
+ std::string aad_prefix() const { return aad_prefix_; }
+
+ const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
+ return key_retriever_;
+ }
+
+ bool check_plaintext_footer_integrity() const {
+ return check_plaintext_footer_integrity_;
+ }
+
+ bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
+
+ const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
+ return aad_prefix_verifier_;
+ }
+
+ /// Upon completion of file reading, the encryption keys in the properties
+ /// will be wiped out (array values set to 0).
+ void WipeOutDecryptionKeys();
+
+ bool is_utilized();
+
+ /// FileDecryptionProperties object can be used for reading one file only.
+ /// Mark FileDecryptionProperties as utilized once it is used to read a file as the
+ /// encryption keys will be wiped out upon completion of file reading.
+ void set_utilized() { utilized_ = true; }
+
+ /// A FileDecryptionProperties object can be used for reading one file only
+ /// (unless it holds only a key retriever callback, and no explicit keys
+ /// or AAD prefix). At the end, keys are wiped out in memory.
+ /// This method allows cloning identical properties for another file,
+ /// with an option to update the AAD prefix (if new_aad_prefix is empty,
+ /// the AAD prefix will be cloned too).
+ std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");
+
+ private:
+ std::string footer_key_;
+ std::string aad_prefix_;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+
+ const std::string empty_string_ = "";
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+ bool check_plaintext_footer_integrity_;
+ bool plaintext_files_allowed_;
+ bool utilized_;
+
+ FileDecryptionProperties(
+ const std::string& footer_key,
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever,
+ bool check_plaintext_footer_integrity, const std::string& aad_prefix,
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
+ bool plaintext_files_allowed);
+};
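Because explicit keys are wiped after one read, reading a second file calls for a clone rather than reuse; a one-line sketch, assuming dec_props is an existing std::shared_ptr<FileDecryptionProperties>:

    auto dec_props_2 = dec_props->DeepClone();  // keys and aad_prefix copied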
+
+class PARQUET_EXPORT FileEncryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ explicit Builder(const std::string& footer_key)
+ : parquet_cipher_(kDefaultEncryptionAlgorithm),
+ encrypted_footer_(kDefaultEncryptedFooter) {
+ footer_key_ = footer_key;
+ store_aad_prefix_in_file_ = false;
+ }
+
+ /// Create files with plaintext footer.
+ /// If not called, the files will be created with encrypted footer (default).
+ Builder* set_plaintext_footer() {
+ encrypted_footer_ = false;
+ return this;
+ }
+
+ /// Set encryption algorithm.
+ /// If not called, files will be encrypted with AES_GCM_V1 (default).
+ Builder* algorithm(ParquetCipher::type parquet_cipher) {
+ parquet_cipher_ = parquet_cipher;
+ return this;
+ }
+
+ /// Set a key retrieval metadata (converted from String).
+ /// use either footer_key_metadata or footer_key_id, not both.
+ Builder* footer_key_id(const std::string& key_id);
+
+ /// Set a key retrieval metadata.
+ /// use either footer_key_metadata or footer_key_id, not both.
+ Builder* footer_key_metadata(const std::string& footer_key_metadata);
+
+ /// Set the file AAD Prefix.
+ Builder* aad_prefix(const std::string& aad_prefix);
+
+ /// Skip storing AAD Prefix in file.
+ /// If not called, and if AAD Prefix is set, it will be stored.
+ Builder* disable_aad_prefix_storage();
+
+ /// Set the list of encrypted columns and their properties (keys etc).
+ /// If not called, all columns will be encrypted with the footer key.
+ /// If called, the file columns not in the list will be left unencrypted.
+ Builder* encrypted_columns(
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
+
+ std::shared_ptr<FileEncryptionProperties> build() {
+ return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
+ parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
+ aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
+ }
+
+ private:
+ ParquetCipher::type parquet_cipher_;
+ bool encrypted_footer_;
+ std::string footer_key_;
+ std::string footer_key_metadata_;
+
+ std::string aad_prefix_;
+ bool store_aad_prefix_in_file_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+ };
+ bool encrypted_footer() const { return encrypted_footer_; }
+
+ EncryptionAlgorithm algorithm() const { return algorithm_; }
+
+ std::string footer_key() const { return footer_key_; }
+
+ std::string footer_key_metadata() const { return footer_key_metadata_; }
+
+ std::string file_aad() const { return file_aad_; }
+
+ std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+ const std::string& column_path);
+
+ bool is_utilized() const { return utilized_; }
+
+ /// FileEncryptionProperties object can be used for writing one file only.
+ /// Mark FileEncryptionProperties as utilized once it is used to write a file as the
+ /// encryption keys will be wiped out upon completion of file writing.
+ void set_utilized() { utilized_ = true; }
+
+ /// Upon completion of file writing, the encryption keys
+ /// will be wiped out (array values set to 0).
+ void WipeOutEncryptionKeys();
+
+ /// A FileEncryptionProperties object can be used for writing one file only
+ /// (at the end, keys are wiped out in memory).
+ /// This method allows cloning identical properties for another file,
+ /// with an option to update the AAD prefix (if new_aad_prefix is empty,
+ /// the AAD prefix will be cloned too).
+ std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");
+
+ ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
+ return encrypted_columns_;
+ }
+
+ private:
+ EncryptionAlgorithm algorithm_;
+ std::string footer_key_;
+ std::string footer_key_metadata_;
+ bool encrypted_footer_;
+ std::string file_aad_;
+ std::string aad_prefix_;
+ bool utilized_;
+ bool store_aad_prefix_in_file_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+
+ FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
+ const std::string& footer_key_metadata, bool encrypted_footer,
+ const std::string& aad_prefix, bool store_aad_prefix_in_file,
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
+};
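Putting the pieces together, a writer-side sketch under placeholder keys, ids and column path:

    ColumnPathToEncryptionPropertiesMap enc_cols;
    ColumnEncryptionProperties::Builder cb("df.col1");
    enc_cols["df.col1"] = cb.key(std::string(16, '\0'))->key_id("kc1")->build();
    FileEncryptionProperties::Builder fb(std::string(16, '\0'));
    std::shared_ptr<FileEncryptionProperties> file_props =
        fb.footer_key_metadata("kf")->encrypted_columns(enc_cols)->build();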
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
index e50fb9d0b8a..48e64574b43 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
@@ -1,116 +1,116 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/properties.h"
-#include "parquet/types.h"
-
-using parquet::ParquetCipher;
-
-namespace parquet {
-namespace encryption {
-
-constexpr int kGcmTagLength = 16;
-constexpr int kNonceLength = 12;
-
-// Module types
-constexpr int8_t kFooter = 0;
-constexpr int8_t kColumnMetaData = 1;
-constexpr int8_t kDataPage = 2;
-constexpr int8_t kDictionaryPage = 3;
-constexpr int8_t kDataPageHeader = 4;
-constexpr int8_t kDictionaryPageHeader = 5;
-constexpr int8_t kColumnIndex = 6;
-constexpr int8_t kOffsetIndex = 7;
-
-/// Performs AES encryption operations with GCM or CTR ciphers.
-class AesEncryptor {
- public:
- /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
- explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
-
- static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesEncryptor*>* all_encryptors);
-
- ~AesEncryptor();
-
- /// Size difference between plaintext and ciphertext, for this cipher.
- int CiphertextSizeDelta();
-
- /// Encrypts plaintext with the key and aad. Key length is passed only for validation.
- /// If different from value in constructor, exception will be thrown.
- int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext);
-
- /// Encrypts plaintext footer, in order to compute footer signature (tag).
- int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len,
- const uint8_t* nonce, uint8_t* encrypted_footer);
-
- void WipeOut();
-
- private:
- // PIMPL Idiom
- class AesEncryptorImpl;
- std::unique_ptr<AesEncryptorImpl> impl_;
-};
-
-/// Performs AES decryption operations with GCM or CTR ciphers.
-class AesDecryptor {
- public:
- /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
- explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
-
- static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesDecryptor*>* all_decryptors);
-
- ~AesDecryptor();
- void WipeOut();
-
- /// Size difference between plaintext and ciphertext, for this cipher.
- int CiphertextSizeDelta();
-
- /// Decrypts ciphertext with the key and aad. Key length is passed only for
- /// validation. If different from value in constructor, exception will be thrown.
- int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext);
-
- private:
- // PIMPL Idiom
- class AesDecryptorImpl;
- std::unique_ptr<AesDecryptorImpl> impl_;
-};
-
-std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
- int16_t row_group_ordinal, int16_t column_ordinal,
- int16_t page_ordinal);
-
-std::string CreateFooterAad(const std::string& aad_prefix_bytes);
-
-// Update last two bytes of page (or page header) module AAD
-void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal);
-
-// Wraps OpenSSL RAND_bytes function
-void RandBytes(unsigned char* buf, int num);
-
-} // namespace encryption
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/properties.h"
+#include "parquet/types.h"
+
+using parquet::ParquetCipher;
+
+namespace parquet {
+namespace encryption {
+
+constexpr int kGcmTagLength = 16;
+constexpr int kNonceLength = 12;
+
+// Module types
+constexpr int8_t kFooter = 0;
+constexpr int8_t kColumnMetaData = 1;
+constexpr int8_t kDataPage = 2;
+constexpr int8_t kDictionaryPage = 3;
+constexpr int8_t kDataPageHeader = 4;
+constexpr int8_t kDictionaryPageHeader = 5;
+constexpr int8_t kColumnIndex = 6;
+constexpr int8_t kOffsetIndex = 7;
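A sketch of how these module types feed the AAD helpers declared below; file_aad is assumed to come from FileEncryptionProperties::file_aad(), and the ordinals are placeholders:

    std::string page_aad = CreateModuleAad(
        file_aad, kDataPage, /*row_group_ordinal=*/0,
        /*column_ordinal=*/2, /*page_ordinal=*/0);
    std::string footer_aad = CreateFooterAad(file_aad);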
+
+/// Performs AES encryption operations with GCM or CTR ciphers.
+class AesEncryptor {
+ public:
+ /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
+ explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
+
+ static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesEncryptor*>* all_encryptors);
+
+ ~AesEncryptor();
+
+ /// Size difference between plaintext and ciphertext, for this cipher.
+ int CiphertextSizeDelta();
+
+ /// Encrypts plaintext with the key and aad. Key length is passed only for
+ /// validation; if it differs from the value given in the constructor, an
+ /// exception will be thrown.
+ int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext);
+
+ /// Encrypts plaintext footer, in order to compute footer signature (tag).
+ int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len,
+ const uint8_t* nonce, uint8_t* encrypted_footer);
+
+ void WipeOut();
+
+ private:
+ // PIMPL Idiom
+ class AesEncryptorImpl;
+ std::unique_ptr<AesEncryptorImpl> impl_;
+};
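A buffer-sizing sketch for Encrypt(); the encryptor pointer and the plaintext, key and aad buffers with their lengths are assumed to exist already:

    std::vector<uint8_t> ciphertext(plaintext_len + encryptor->CiphertextSizeDelta());
    int ciphertext_len = encryptor->Encrypt(plaintext, plaintext_len, key, key_len,
                                            aad, aad_len, ciphertext.data());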
+
+/// Performs AES decryption operations with GCM or CTR ciphers.
+class AesDecryptor {
+ public:
+ /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
+ explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
+
+ static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesDecryptor*>* all_decryptors);
+
+ ~AesDecryptor();
+ void WipeOut();
+
+ /// Size difference between plaintext and ciphertext, for this cipher.
+ int CiphertextSizeDelta();
+
+ /// Decrypts ciphertext with the key and aad. Key length is passed only for
+ /// validation; if it differs from the value given in the constructor, an
+ /// exception will be thrown.
+ int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext);
+
+ private:
+ // PIMPL Idiom
+ class AesDecryptorImpl;
+ std::unique_ptr<AesDecryptorImpl> impl_;
+};
+
+std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ int16_t page_ordinal);
+
+std::string CreateFooterAad(const std::string& aad_prefix_bytes);
+
+// Update last two bytes of page (or page header) module AAD
+void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal);
+
+// Wraps OpenSSL RAND_bytes function
+void RandBytes(unsigned char* buf, int num);
+
+} // namespace encryption
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
index 7f2edfa1d78..fd3c1775d25 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
@@ -1,110 +1,110 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/exception.h"
-
-namespace parquet {
-namespace encryption {
-
-void ThrowOpenSSLRequiredException() {
- throw ParquetException(
- "Calling encryption method in Arrow/Parquet built without OpenSSL");
-}
-
-class AesEncryptor::AesEncryptorImpl {};
-
-AesEncryptor::~AesEncryptor() {}
-
-int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len,
- const uint8_t* key, int key_len, const uint8_t* aad,
- int aad_len, const uint8_t* nonce,
- uint8_t* encrypted_footer) {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-void AesEncryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
-
-int AesEncryptor::CiphertextSizeDelta() {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len,
- uint8_t* ciphertext) {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
- ThrowOpenSSLRequiredException();
-}
-
-class AesDecryptor::AesDecryptorImpl {};
-
-int AesDecryptor::Decrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len,
- uint8_t* ciphertext) {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
-
-AesDecryptor::~AesDecryptor() {}
-
-AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesEncryptor*>* all_encryptors) {
- return NULLPTR;
-}
-
-AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
- ThrowOpenSSLRequiredException();
-}
-
-AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesDecryptor*>* all_decryptors) {
- return NULLPTR;
-}
-
-int AesDecryptor::CiphertextSizeDelta() {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
- int16_t row_group_ordinal, int16_t column_ordinal,
- int16_t page_ordinal) {
- ThrowOpenSSLRequiredException();
- return "";
-}
-
-std::string CreateFooterAad(const std::string& aad_prefix_bytes) {
- ThrowOpenSSLRequiredException();
- return "";
-}
-
-void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) {
- ThrowOpenSSLRequiredException();
-}
-
-void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); }
-
-} // namespace encryption
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/exception.h"
+
+namespace parquet {
+namespace encryption {
+
+void ThrowOpenSSLRequiredException() {
+ throw ParquetException(
+ "Calling encryption method in Arrow/Parquet built without OpenSSL");
+}
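So in a build without OpenSSL every encryption entry point throws; a sketch of what a caller observes, with buf a placeholder 8-byte buffer:

    try {
      RandBytes(buf, 8);
    } catch (const ParquetException& e) {
      // "Calling encryption method in Arrow/Parquet built without OpenSSL"
    }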
+
+class AesEncryptor::AesEncryptorImpl {};
+
+AesEncryptor::~AesEncryptor() {}
+
+int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len,
+ const uint8_t* key, int key_len, const uint8_t* aad,
+ int aad_len, const uint8_t* nonce,
+ uint8_t* encrypted_footer) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+void AesEncryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
+
+int AesEncryptor::CiphertextSizeDelta() {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len,
+ uint8_t* ciphertext) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
+ ThrowOpenSSLRequiredException();
+}
+
+class AesDecryptor::AesDecryptorImpl {};
+
+int AesDecryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
+ const uint8_t* key, int key_len, const uint8_t* aad,
+ int aad_len, uint8_t* plaintext) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
+
+AesDecryptor::~AesDecryptor() {}
+
+AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesEncryptor*>* all_encryptors) {
+ return NULLPTR;
+}
+
+AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
+ ThrowOpenSSLRequiredException();
+}
+
+AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesDecryptor*>* all_decryptors) {
+ return NULLPTR;
+}
+
+int AesDecryptor::CiphertextSizeDelta() {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ int16_t page_ordinal) {
+ ThrowOpenSSLRequiredException();
+ return "";
+}
+
+std::string CreateFooterAad(const std::string& aad_prefix_bytes) {
+ ThrowOpenSSLRequiredException();
+ return "";
+}
+
+void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) {
+ ThrowOpenSSLRequiredException();
+}
+
+void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); }
+
+} // namespace encryption
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
index 6381e4f37f7..5aa7d010d3f 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
@@ -1,240 +1,240 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/encryption/encryption.h"
-#include "parquet/encryption/encryption_internal.h"
-
-namespace parquet {
-
-// Decryptor
-Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool)
- : aes_decryptor_(aes_decryptor),
- key_(key),
- file_aad_(file_aad),
- aad_(aad),
- pool_(pool) {}
-
-int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); }
-
-int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
- uint8_t* plaintext) {
- return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_),
- static_cast<int>(key_.size()), str2bytes(aad_),
- static_cast<int>(aad_.size()), plaintext);
-}
-
-// InternalFileDecryptor
-InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties,
- const std::string& file_aad,
- ParquetCipher::type algorithm,
- const std::string& footer_key_metadata,
- ::arrow::MemoryPool* pool)
- : properties_(properties),
- file_aad_(file_aad),
- algorithm_(algorithm),
- footer_key_metadata_(footer_key_metadata),
- pool_(pool) {
- if (properties_->is_utilized()) {
- throw ParquetException(
- "Re-using decryption properties with explicit keys for another file");
- }
- properties_->set_utilized();
-}
-
-void InternalFileDecryptor::WipeOutDecryptionKeys() {
- properties_->WipeOutDecryptionKeys();
- for (auto const& i : all_decryptors_) {
- i->WipeOut();
- }
-}
-
-std::string InternalFileDecryptor::GetFooterKey() {
- std::string footer_key = properties_->footer_key();
- // ignore footer key metadata if footer key is explicitly set via API
- if (footer_key.empty()) {
- if (footer_key_metadata_.empty())
- throw ParquetException("No footer key or key metadata");
- if (properties_->key_retriever() == nullptr)
- throw ParquetException("No footer key or key retriever");
- try {
- footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
- } catch (KeyAccessDeniedException& e) {
- std::stringstream ss;
- ss << "Footer key: access denied " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- }
- if (footer_key.empty()) {
- throw ParquetException(
- "Footer key unavailable. Could not verify "
- "plaintext footer metadata");
- }
- return footer_key;
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor() {
- std::string aad = encryption::CreateFooterAad(file_aad_);
- return GetFooterDecryptor(aad, true);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnMeta(
- const std::string& aad) {
- return GetFooterDecryptor(aad, true);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnData(
- const std::string& aad) {
- return GetFooterDecryptor(aad, false);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor(
- const std::string& aad, bool metadata) {
- if (metadata) {
- if (footer_metadata_decryptor_ != nullptr) return footer_metadata_decryptor_;
- } else {
- if (footer_data_decryptor_ != nullptr) return footer_data_decryptor_;
- }
-
- std::string footer_key = properties_->footer_key();
- if (footer_key.empty()) {
- if (footer_key_metadata_.empty())
- throw ParquetException("No footer key or key metadata");
- if (properties_->key_retriever() == nullptr)
- throw ParquetException("No footer key or key retriever");
- try {
- footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
- } catch (KeyAccessDeniedException& e) {
- std::stringstream ss;
- ss << "Footer key: access denied " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- }
- if (footer_key.empty()) {
- throw ParquetException(
- "Invalid footer encryption key. "
- "Could not parse footer metadata");
- }
-
- // Create both data and metadata decryptors to avoid redundant retrieval of key
- // from the key_retriever.
- auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size());
- auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size());
-
- footer_metadata_decryptor_ = std::make_shared<Decryptor>(
- aes_metadata_decryptor, footer_key, file_aad_, aad, pool_);
- footer_data_decryptor_ =
- std::make_shared<Decryptor>(aes_data_decryptor, footer_key, file_aad_, aad, pool_);
-
- if (metadata) return footer_metadata_decryptor_;
- return footer_data_decryptor_;
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnMetaDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad) {
- return GetColumnDecryptor(column_path, column_key_metadata, aad, true);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDataDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad) {
- return GetColumnDecryptor(column_path, column_key_metadata, aad, false);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad, bool metadata) {
- std::string column_key;
- // First check whether a decryptor for this column was already created.
- if (metadata) {
- if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
- auto res(column_metadata_map_.at(column_path));
- res->UpdateAad(aad);
- return res;
- }
- } else {
- if (column_data_map_.find(column_path) != column_data_map_.end()) {
- auto res(column_data_map_.at(column_path));
- res->UpdateAad(aad);
- return res;
- }
- }
-
- column_key = properties_->column_key(column_path);
- // No explicit column key given via API. Retrieve via key metadata.
- if (column_key.empty() && !column_key_metadata.empty() &&
- properties_->key_retriever() != nullptr) {
- try {
- column_key = properties_->key_retriever()->GetKey(column_key_metadata);
- } catch (KeyAccessDeniedException& e) {
- std::stringstream ss;
- ss << "HiddenColumnException, path=" + column_path + " " << e.what() << "\n";
- throw HiddenColumnException(ss.str());
- }
- }
- if (column_key.empty()) {
- throw HiddenColumnException("HiddenColumnException, path=" + column_path);
- }
-
- // Create both data and metadata decryptors to avoid redundant retrieval of key
- // using the key_retriever.
- auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size());
- auto aes_data_decryptor = GetDataAesDecryptor(column_key.size());
-
- column_metadata_map_[column_path] = std::make_shared<Decryptor>(
- aes_metadata_decryptor, column_key, file_aad_, aad, pool_);
- column_data_map_[column_path] =
- std::make_shared<Decryptor>(aes_data_decryptor, column_key, file_aad_, aad, pool_);
-
- if (metadata) return column_metadata_map_[column_path];
- return column_data_map_[column_path];
-}
-
-int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) {
- if (key_len == 16)
- return 0;
- else if (key_len == 24)
- return 1;
- else if (key_len == 32)
- return 2;
- throw ParquetException("decryption key must be 16, 24 or 32 bytes in length");
-}
-
-encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToDecryptorArrayIndex(key_len);
- if (meta_decryptor_[index] == nullptr) {
- meta_decryptor_[index].reset(
- encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_));
- }
- return meta_decryptor_[index].get();
-}
-
-encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToDecryptorArrayIndex(key_len);
- if (data_decryptor_[index] == nullptr) {
- data_decryptor_[index].reset(
- encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_));
- }
- return data_decryptor_[index].get();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// Decryptor
+Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool)
+ : aes_decryptor_(aes_decryptor),
+ key_(key),
+ file_aad_(file_aad),
+ aad_(aad),
+ pool_(pool) {}
+
+int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); }
+
+int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
+ uint8_t* plaintext) {
+ return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_),
+ static_cast<int>(key_.size()), str2bytes(aad_),
+ static_cast<int>(aad_.size()), plaintext);
+}
+
+// InternalFileDecryptor
+InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties,
+ const std::string& file_aad,
+ ParquetCipher::type algorithm,
+ const std::string& footer_key_metadata,
+ ::arrow::MemoryPool* pool)
+ : properties_(properties),
+ file_aad_(file_aad),
+ algorithm_(algorithm),
+ footer_key_metadata_(footer_key_metadata),
+ pool_(pool) {
+ if (properties_->is_utilized()) {
+ throw ParquetException(
+ "Re-using decryption properties with explicit keys for another file");
+ }
+ properties_->set_utilized();
+}
+
+void InternalFileDecryptor::WipeOutDecryptionKeys() {
+ properties_->WipeOutDecryptionKeys();
+ for (auto const& i : all_decryptors_) {
+ i->WipeOut();
+ }
+}
+
+std::string InternalFileDecryptor::GetFooterKey() {
+ std::string footer_key = properties_->footer_key();
+ // ignore footer key metadata if footer key is explicitly set via API
+ if (footer_key.empty()) {
+ if (footer_key_metadata_.empty())
+ throw ParquetException("No footer key or key metadata");
+ if (properties_->key_retriever() == nullptr)
+ throw ParquetException("No footer key or key retriever");
+ try {
+ footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "Footer key: access denied " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ if (footer_key.empty()) {
+ throw ParquetException(
+ "Footer key unavailable. Could not verify "
+ "plaintext footer metadata");
+ }
+ return footer_key;
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor() {
+ std::string aad = encryption::CreateFooterAad(file_aad_);
+ return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnMeta(
+ const std::string& aad) {
+ return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnData(
+ const std::string& aad) {
+ return GetFooterDecryptor(aad, false);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor(
+ const std::string& aad, bool metadata) {
+ if (metadata) {
+ if (footer_metadata_decryptor_ != nullptr) return footer_metadata_decryptor_;
+ } else {
+ if (footer_data_decryptor_ != nullptr) return footer_data_decryptor_;
+ }
+
+ std::string footer_key = properties_->footer_key();
+ if (footer_key.empty()) {
+ if (footer_key_metadata_.empty())
+ throw ParquetException("No footer key or key metadata");
+ if (properties_->key_retriever() == nullptr)
+ throw ParquetException("No footer key or key retriever");
+ try {
+ footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "Footer key: access denied " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ if (footer_key.empty()) {
+ throw ParquetException(
+ "Invalid footer encryption key. "
+ "Could not parse footer metadata");
+ }
+
+ // Create both data and metadata decryptors to avoid redundant retrieval of key
+ // from the key_retriever.
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size());
+ auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size());
+
+ footer_metadata_decryptor_ = std::make_shared<Decryptor>(
+ aes_metadata_decryptor, footer_key, file_aad_, aad, pool_);
+ footer_data_decryptor_ =
+ std::make_shared<Decryptor>(aes_data_decryptor, footer_key, file_aad_, aad, pool_);
+
+ if (metadata) return footer_metadata_decryptor_;
+ return footer_data_decryptor_;
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnMetaDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad) {
+ return GetColumnDecryptor(column_path, column_key_metadata, aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDataDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad) {
+ return GetColumnDecryptor(column_path, column_key_metadata, aad, false);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad, bool metadata) {
+ std::string column_key;
+ // First check whether a decryptor for this column was already created.
+ if (metadata) {
+ if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
+ auto res(column_metadata_map_.at(column_path));
+ res->UpdateAad(aad);
+ return res;
+ }
+ } else {
+ if (column_data_map_.find(column_path) != column_data_map_.end()) {
+ auto res(column_data_map_.at(column_path));
+ res->UpdateAad(aad);
+ return res;
+ }
+ }
+
+ column_key = properties_->column_key(column_path);
+ // No explicit column key given via API. Retrieve via key metadata.
+ if (column_key.empty() && !column_key_metadata.empty() &&
+ properties_->key_retriever() != nullptr) {
+ try {
+ column_key = properties_->key_retriever()->GetKey(column_key_metadata);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "HiddenColumnException, path=" + column_path + " " << e.what() << "\n";
+ throw HiddenColumnException(ss.str());
+ }
+ }
+ if (column_key.empty()) {
+ throw HiddenColumnException("HiddenColumnException, path=" + column_path);
+ }
+
+ // Create both data and metadata decryptors to avoid redundant retrieval of key
+ // using the key_retriever.
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size());
+ auto aes_data_decryptor = GetDataAesDecryptor(column_key.size());
+
+ column_metadata_map_[column_path] = std::make_shared<Decryptor>(
+ aes_metadata_decryptor, column_key, file_aad_, aad, pool_);
+ column_data_map_[column_path] =
+ std::make_shared<Decryptor>(aes_data_decryptor, column_key, file_aad_, aad, pool_);
+
+ if (metadata) return column_metadata_map_[column_path];
+ return column_data_map_[column_path];
+}
+
+int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) {
+ if (key_len == 16)
+ return 0;
+ else if (key_len == 24)
+ return 1;
+ else if (key_len == 32)
+ return 2;
+ throw ParquetException("decryption key must be 16, 24 or 32 bytes in length");
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToDecryptorArrayIndex(key_len);
+ if (meta_decryptor_[index] == nullptr) {
+ meta_decryptor_[index].reset(
+ encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_));
+ }
+ return meta_decryptor_[index].get();
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToDecryptorArrayIndex(key_len);
+ if (data_decryptor_[index] == nullptr) {
+ data_decryptor_[index].reset(
+ encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_));
+ }
+ return data_decryptor_[index].get();
+}
+
+} // namespace parquet
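
InternalFileDecryptor above caches at most one AES decryptor per key length for metadata and one for data, so each cache is a fixed three-slot array indexed by MapKeyLenToDecryptorArrayIndex. A standalone sketch of the same mapping, written as a switch:

    // Sketch of the key-length-to-slot mapping behind meta_decryptor_[3] and
    // data_decryptor_[3]: 16/24/32-byte AES keys map to slots 0/1/2.
    #include <stdexcept>

    int MapKeyLenToIndex(int key_len) {
      switch (key_len) {
        case 16: return 0;  // AES-128
        case 24: return 1;  // AES-192
        case 32: return 2;  // AES-256
        default: throw std::invalid_argument("key must be 16, 24 or 32 bytes");
      }
    }
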
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
index 011c4acbeb6..fc2bc433d92 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
@@ -1,121 +1,121 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/schema.h"
-
-namespace parquet {
-
-namespace encryption {
-class AesDecryptor;
-class AesEncryptor;
-} // namespace encryption
-
-class FileDecryptionProperties;
-
-class PARQUET_EXPORT Decryptor {
- public:
- Decryptor(encryption::AesDecryptor* decryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool);
-
- const std::string& file_aad() const { return file_aad_; }
- void UpdateAad(const std::string& aad) { aad_ = aad; }
- ::arrow::MemoryPool* pool() { return pool_; }
-
- int CiphertextSizeDelta();
- int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext);
-
- private:
- encryption::AesDecryptor* aes_decryptor_;
- std::string key_;
- std::string file_aad_;
- std::string aad_;
- ::arrow::MemoryPool* pool_;
-};
-
-class InternalFileDecryptor {
- public:
- explicit InternalFileDecryptor(FileDecryptionProperties* properties,
- const std::string& file_aad,
- ParquetCipher::type algorithm,
- const std::string& footer_key_metadata,
- ::arrow::MemoryPool* pool);
-
- std::string& file_aad() { return file_aad_; }
-
- std::string GetFooterKey();
-
- ParquetCipher::type algorithm() { return algorithm_; }
-
- std::string& footer_key_metadata() { return footer_key_metadata_; }
-
- FileDecryptionProperties* properties() { return properties_; }
-
- void WipeOutDecryptionKeys();
-
- ::arrow::MemoryPool* pool() { return pool_; }
-
- std::shared_ptr<Decryptor> GetFooterDecryptor();
- std::shared_ptr<Decryptor> GetFooterDecryptorForColumnMeta(const std::string& aad = "");
- std::shared_ptr<Decryptor> GetFooterDecryptorForColumnData(const std::string& aad = "");
- std::shared_ptr<Decryptor> GetColumnMetaDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad = "");
- std::shared_ptr<Decryptor> GetColumnDataDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad = "");
-
- private:
- FileDecryptionProperties* properties_;
- // Concatenation of aad_prefix (if present) and aad_file_unique
- std::string file_aad_;
- std::map<std::string, std::shared_ptr<Decryptor>> column_data_map_;
- std::map<std::string, std::shared_ptr<Decryptor>> column_metadata_map_;
-
- std::shared_ptr<Decryptor> footer_metadata_decryptor_;
- std::shared_ptr<Decryptor> footer_data_decryptor_;
- ParquetCipher::type algorithm_;
- std::string footer_key_metadata_;
- std::vector<encryption::AesDecryptor*> all_decryptors_;
-
- // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
- // types of meta_decryptors and data_decryptors.
- std::unique_ptr<encryption::AesDecryptor> meta_decryptor_[3];
- std::unique_ptr<encryption::AesDecryptor> data_decryptor_[3];
-
- ::arrow::MemoryPool* pool_;
-
- std::shared_ptr<Decryptor> GetFooterDecryptor(const std::string& aad, bool metadata);
- std::shared_ptr<Decryptor> GetColumnDecryptor(const std::string& column_path,
- const std::string& column_key_metadata,
- const std::string& aad,
- bool metadata = false);
-
- encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size);
- encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size);
-
- int MapKeyLenToDecryptorArrayIndex(int key_len);
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/schema.h"
+
+namespace parquet {
+
+namespace encryption {
+class AesDecryptor;
+class AesEncryptor;
+} // namespace encryption
+
+class FileDecryptionProperties;
+
+class PARQUET_EXPORT Decryptor {
+ public:
+ Decryptor(encryption::AesDecryptor* decryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool);
+
+ const std::string& file_aad() const { return file_aad_; }
+ void UpdateAad(const std::string& aad) { aad_ = aad; }
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ int CiphertextSizeDelta();
+ int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext);
+
+ private:
+ encryption::AesDecryptor* aes_decryptor_;
+ std::string key_;
+ std::string file_aad_;
+ std::string aad_;
+ ::arrow::MemoryPool* pool_;
+};
+
+class InternalFileDecryptor {
+ public:
+ explicit InternalFileDecryptor(FileDecryptionProperties* properties,
+ const std::string& file_aad,
+ ParquetCipher::type algorithm,
+ const std::string& footer_key_metadata,
+ ::arrow::MemoryPool* pool);
+
+ std::string& file_aad() { return file_aad_; }
+
+ std::string GetFooterKey();
+
+ ParquetCipher::type algorithm() { return algorithm_; }
+
+ std::string& footer_key_metadata() { return footer_key_metadata_; }
+
+ FileDecryptionProperties* properties() { return properties_; }
+
+ void WipeOutDecryptionKeys();
+
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ std::shared_ptr<Decryptor> GetFooterDecryptor();
+ std::shared_ptr<Decryptor> GetFooterDecryptorForColumnMeta(const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetFooterDecryptorForColumnData(const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetColumnMetaDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetColumnDataDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad = "");
+
+ private:
+ FileDecryptionProperties* properties_;
+ // Concatenation of aad_prefix (if present) and aad_file_unique
+ std::string file_aad_;
+ std::map<std::string, std::shared_ptr<Decryptor>> column_data_map_;
+ std::map<std::string, std::shared_ptr<Decryptor>> column_metadata_map_;
+
+ std::shared_ptr<Decryptor> footer_metadata_decryptor_;
+ std::shared_ptr<Decryptor> footer_data_decryptor_;
+ ParquetCipher::type algorithm_;
+ std::string footer_key_metadata_;
+ std::vector<encryption::AesDecryptor*> all_decryptors_;
+
+ // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
+ // types of meta_decryptors and data_decryptors.
+ std::unique_ptr<encryption::AesDecryptor> meta_decryptor_[3];
+ std::unique_ptr<encryption::AesDecryptor> data_decryptor_[3];
+
+ ::arrow::MemoryPool* pool_;
+
+ std::shared_ptr<Decryptor> GetFooterDecryptor(const std::string& aad, bool metadata);
+ std::shared_ptr<Decryptor> GetColumnDecryptor(const std::string& column_path,
+ const std::string& column_key_metadata,
+ const std::string& aad,
+ bool metadata = false);
+
+ encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size);
+ encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size);
+
+ int MapKeyLenToDecryptorArrayIndex(int key_len);
+};
+
+} // namespace parquet
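
When no explicit key is set, InternalFileDecryptor resolves footer and column keys through the DecryptionKeyRetriever interface declared in parquet/encryption/encryption.h. A sketch of a trivial retriever, assuming that interface and the KeyAccessDeniedException type from the same header; the class name and map contents are made up:

    // Resolves key metadata (e.g. a key id) to key bytes from an in-memory map.
    #include <map>
    #include <string>
    #include <utility>
    #include "parquet/encryption/encryption.h"

    class MapKeyRetriever : public parquet::DecryptionKeyRetriever {
     public:
      explicit MapKeyRetriever(std::map<std::string, std::string> keys)
          : keys_(std::move(keys)) {}

      std::string GetKey(const std::string& key_metadata) override {
        auto it = keys_.find(key_metadata);
        if (it == keys_.end()) {
          // Surfaces as "Footer key: access denied" or HiddenColumnException
          // in the decryptor code above.
          throw parquet::KeyAccessDeniedException("unknown key id: " + key_metadata);
        }
        return it->second;
      }

     private:
      std::map<std::string, std::string> keys_;
    };
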
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
index 15bf52b84dd..c9f265cf7f1 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
@@ -1,170 +1,170 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/encryption/encryption.h"
-#include "parquet/encryption/encryption_internal.h"
-
-namespace parquet {
-
-// Encryptor
-Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool)
- : aes_encryptor_(aes_encryptor),
- key_(key),
- file_aad_(file_aad),
- aad_(aad),
- pool_(pool) {}
-
-int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); }
-
-int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) {
- return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_),
- static_cast<int>(key_.size()), str2bytes(aad_),
- static_cast<int>(aad_.size()), ciphertext);
-}
-
-// InternalFileEncryptor
-InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties,
- ::arrow::MemoryPool* pool)
- : properties_(properties), pool_(pool) {
- if (properties_->is_utilized()) {
- throw ParquetException("Re-using encryption properties for another file");
- }
- properties_->set_utilized();
-}
-
-void InternalFileEncryptor::WipeOutEncryptionKeys() {
- properties_->WipeOutEncryptionKeys();
-
- for (auto const& i : all_encryptors_) {
- i->WipeOut();
- }
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterEncryptor() {
- if (footer_encryptor_ != nullptr) {
- return footer_encryptor_;
- }
-
- ParquetCipher::type algorithm = properties_->algorithm().algorithm;
- std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
- std::string footer_key = properties_->footer_key();
- auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size());
- footer_encryptor_ = std::make_shared<Encryptor>(
- aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_);
- return footer_encryptor_;
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterSigningEncryptor() {
- if (footer_signing_encryptor_ != nullptr) {
- return footer_signing_encryptor_;
- }
-
- ParquetCipher::type algorithm = properties_->algorithm().algorithm;
- std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
- std::string footer_signing_key = properties_->footer_key();
- auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size());
- footer_signing_encryptor_ = std::make_shared<Encryptor>(
- aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_);
- return footer_signing_encryptor_;
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnMetaEncryptor(
- const std::string& column_path) {
- return GetColumnEncryptor(column_path, true);
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnDataEncryptor(
- const std::string& column_path) {
- return GetColumnEncryptor(column_path, false);
-}
-
-std::shared_ptr<Encryptor>
-InternalFileEncryptor::GetColumnEncryptor(
- const std::string& column_path, bool metadata) {
- // First check whether an encryptor for this column was already created.
- if (metadata) {
- if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
- return column_metadata_map_.at(column_path);
- }
- } else {
- if (column_data_map_.find(column_path) != column_data_map_.end()) {
- return column_data_map_.at(column_path);
- }
- }
- auto column_prop = properties_->column_encryption_properties(column_path);
- if (column_prop == nullptr) {
- return nullptr;
- }
-
- std::string key;
- if (column_prop->is_encrypted_with_footer_key()) {
- key = properties_->footer_key();
- } else {
- key = column_prop->key();
- }
-
- ParquetCipher::type algorithm = properties_->algorithm().algorithm;
- auto aes_encryptor = metadata ? GetMetaAesEncryptor(algorithm, key.size())
- : GetDataAesEncryptor(algorithm, key.size());
-
- std::string file_aad = properties_->file_aad();
- std::shared_ptr<Encryptor> encryptor =
- std::make_shared<Encryptor>(aes_encryptor, key, file_aad, "", pool_);
- if (metadata)
- column_metadata_map_[column_path] = encryptor;
- else
- column_data_map_[column_path] = encryptor;
-
- return encryptor;
-}
-
-int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) {
- if (key_len == 16)
- return 0;
- else if (key_len == 24)
- return 1;
- else if (key_len == 32)
- return 2;
- throw ParquetException("encryption key must be 16, 24 or 32 bytes in length");
-}
-
-encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor(
- ParquetCipher::type algorithm, size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToEncryptorArrayIndex(key_len);
- if (meta_encryptor_[index] == nullptr) {
- meta_encryptor_[index].reset(
- encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_));
- }
- return meta_encryptor_[index].get();
-}
-
-encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor(
- ParquetCipher::type algorithm, size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToEncryptorArrayIndex(key_len);
- if (data_encryptor_[index] == nullptr) {
- data_encryptor_[index].reset(
- encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_));
- }
- return data_encryptor_[index].get();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// Encryptor
+Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool)
+ : aes_encryptor_(aes_encryptor),
+ key_(key),
+ file_aad_(file_aad),
+ aad_(aad),
+ pool_(pool) {}
+
+int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); }
+
+int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) {
+ return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_),
+ static_cast<int>(key_.size()), str2bytes(aad_),
+ static_cast<int>(aad_.size()), ciphertext);
+}
+
+// InternalFileEncryptor
+InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties,
+ ::arrow::MemoryPool* pool)
+ : properties_(properties), pool_(pool) {
+ if (properties_->is_utilized()) {
+ throw ParquetException("Re-using encryption properties for another file");
+ }
+ properties_->set_utilized();
+}
+
+void InternalFileEncryptor::WipeOutEncryptionKeys() {
+ properties_->WipeOutEncryptionKeys();
+
+ for (auto const& i : all_encryptors_) {
+ i->WipeOut();
+ }
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterEncryptor() {
+ if (footer_encryptor_ != nullptr) {
+ return footer_encryptor_;
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
+ std::string footer_key = properties_->footer_key();
+ auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size());
+ footer_encryptor_ = std::make_shared<Encryptor>(
+ aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_);
+ return footer_encryptor_;
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterSigningEncryptor() {
+ if (footer_signing_encryptor_ != nullptr) {
+ return footer_signing_encryptor_;
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
+ std::string footer_signing_key = properties_->footer_key();
+ auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size());
+ footer_signing_encryptor_ = std::make_shared<Encryptor>(
+ aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_);
+ return footer_signing_encryptor_;
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnMetaEncryptor(
+ const std::string& column_path) {
+ return GetColumnEncryptor(column_path, true);
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnDataEncryptor(
+ const std::string& column_path) {
+ return GetColumnEncryptor(column_path, false);
+}
+
+std::shared_ptr<Encryptor>
+InternalFileEncryptor::GetColumnEncryptor(
+ const std::string& column_path, bool metadata) {
+ // First check whether an encryptor for this column was already created.
+ if (metadata) {
+ if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
+ return column_metadata_map_.at(column_path);
+ }
+ } else {
+ if (column_data_map_.find(column_path) != column_data_map_.end()) {
+ return column_data_map_.at(column_path);
+ }
+ }
+ auto column_prop = properties_->column_encryption_properties(column_path);
+ if (column_prop == nullptr) {
+ return nullptr;
+ }
+
+ std::string key;
+ if (column_prop->is_encrypted_with_footer_key()) {
+ key = properties_->footer_key();
+ } else {
+ key = column_prop->key();
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ auto aes_encryptor = metadata ? GetMetaAesEncryptor(algorithm, key.size())
+ : GetDataAesEncryptor(algorithm, key.size());
+
+ std::string file_aad = properties_->file_aad();
+ std::shared_ptr<Encryptor> encryptor =
+ std::make_shared<Encryptor>(aes_encryptor, key, file_aad, "", pool_);
+ if (metadata)
+ column_metadata_map_[column_path] = encryptor;
+ else
+ column_data_map_[column_path] = encryptor;
+
+ return encryptor;
+}
+
+int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) {
+ if (key_len == 16)
+ return 0;
+ else if (key_len == 24)
+ return 1;
+ else if (key_len == 32)
+ return 2;
+ throw ParquetException("encryption key must be 16, 24 or 32 bytes in length");
+}
+
+encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor(
+ ParquetCipher::type algorithm, size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToEncryptorArrayIndex(key_len);
+ if (meta_encryptor_[index] == nullptr) {
+ meta_encryptor_[index].reset(
+ encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_));
+ }
+ return meta_encryptor_[index].get();
+}
+
+encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor(
+ ParquetCipher::type algorithm, size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToEncryptorArrayIndex(key_len);
+ if (data_encryptor_[index] == nullptr) {
+ data_encryptor_[index].reset(
+ encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_));
+ }
+ return data_encryptor_[index].get();
+}
+
+} // namespace parquet
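
The header that follows defines Encryptor::EncryptColumnMetaData, which decides whether a column's metadata gets its own encryption. A standalone truth-table sketch of that decision (the function name is hypothetical):

    // Column metadata is encrypted separately only when the column itself is
    // encrypted and the footer does not already cover it.
    #include <cassert>

    bool EncryptMeta(bool column_encrypted, bool encrypted_footer,
                     bool uses_footer_key) {
      if (!column_encrypted) return false;  // plaintext column: nothing to hide
      if (!encrypted_footer) return true;   // plaintext footer: always encrypt meta
      return !uses_footer_key;              // footer key already covers the rest
    }

    int main() {
      assert(!EncryptMeta(false, true, true));  // unencrypted column
      assert(EncryptMeta(true, false, false));  // plaintext footer
      assert(!EncryptMeta(true, true, true));   // footer-key column, encrypted footer
      assert(EncryptMeta(true, true, false));   // own-key column, encrypted footer
      return 0;
    }
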
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
index 3cbe53500c2..7cf513ca810 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
@@ -1,109 +1,109 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/encryption/encryption.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-
-namespace encryption {
-class AesEncryptor;
-} // namespace encryption
-
-class FileEncryptionProperties;
-class ColumnEncryptionProperties;
-
-class PARQUET_EXPORT Encryptor {
- public:
- Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool);
- const std::string& file_aad() { return file_aad_; }
- void UpdateAad(const std::string& aad) { aad_ = aad; }
- ::arrow::MemoryPool* pool() { return pool_; }
-
- int CiphertextSizeDelta();
- int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext);
-
- bool EncryptColumnMetaData(
- bool encrypted_footer,
- const std::shared_ptr<ColumnEncryptionProperties>& column_encryption_properties) {
- // if column is not encrypted then do not encrypt the column metadata
- if (!column_encryption_properties || !column_encryption_properties->is_encrypted())
- return false;
- // if plaintext footer then encrypt the column metadata
- if (!encrypted_footer) return true;
- // if column is not encrypted with footer key then encrypt the column metadata
- return !column_encryption_properties->is_encrypted_with_footer_key();
- }
-
- private:
- encryption::AesEncryptor* aes_encryptor_;
- std::string key_;
- std::string file_aad_;
- std::string aad_;
- ::arrow::MemoryPool* pool_;
-};
-
-class InternalFileEncryptor {
- public:
- explicit InternalFileEncryptor(FileEncryptionProperties* properties,
- ::arrow::MemoryPool* pool);
-
- std::shared_ptr<Encryptor> GetFooterEncryptor();
- std::shared_ptr<Encryptor> GetFooterSigningEncryptor();
- std::shared_ptr<Encryptor> GetColumnMetaEncryptor(const std::string& column_path);
- std::shared_ptr<Encryptor> GetColumnDataEncryptor(const std::string& column_path);
- void WipeOutEncryptionKeys();
-
- private:
- FileEncryptionProperties* properties_;
-
- std::map<std::string, std::shared_ptr<Encryptor>> column_data_map_;
- std::map<std::string, std::shared_ptr<Encryptor>> column_metadata_map_;
-
- std::shared_ptr<Encryptor> footer_signing_encryptor_;
- std::shared_ptr<Encryptor> footer_encryptor_;
-
- std::vector<encryption::AesEncryptor*> all_encryptors_;
-
- // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
- // types of meta_encryptors and data_encryptors.
- std::unique_ptr<encryption::AesEncryptor> meta_encryptor_[3];
- std::unique_ptr<encryption::AesEncryptor> data_encryptor_[3];
-
- ::arrow::MemoryPool* pool_;
-
- std::shared_ptr<Encryptor> GetColumnEncryptor(const std::string& column_path,
- bool metadata);
-
- encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm,
- size_t key_len);
- encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm,
- size_t key_len);
-
- int MapKeyLenToEncryptorArrayIndex(int key_len);
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/encryption/encryption.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+namespace encryption {
+class AesEncryptor;
+} // namespace encryption
+
+class FileEncryptionProperties;
+class ColumnEncryptionProperties;
+
+class PARQUET_EXPORT Encryptor {
+ public:
+ Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool);
+ const std::string& file_aad() { return file_aad_; }
+ void UpdateAad(const std::string& aad) { aad_ = aad; }
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ int CiphertextSizeDelta();
+ int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext);
+
+ bool EncryptColumnMetaData(
+ bool encrypted_footer,
+ const std::shared_ptr<ColumnEncryptionProperties>& column_encryption_properties) {
+ // if column is not encrypted then do not encrypt the column metadata
+ if (!column_encryption_properties || !column_encryption_properties->is_encrypted())
+ return false;
+ // if plaintext footer then encrypt the column metadata
+ if (!encrypted_footer) return true;
+ // if column is not encrypted with footer key then encrypt the column metadata
+ return !column_encryption_properties->is_encrypted_with_footer_key();
+ }
+
+ private:
+ encryption::AesEncryptor* aes_encryptor_;
+ std::string key_;
+ std::string file_aad_;
+ std::string aad_;
+ ::arrow::MemoryPool* pool_;
+};
+
+class InternalFileEncryptor {
+ public:
+ explicit InternalFileEncryptor(FileEncryptionProperties* properties,
+ ::arrow::MemoryPool* pool);
+
+ std::shared_ptr<Encryptor> GetFooterEncryptor();
+ std::shared_ptr<Encryptor> GetFooterSigningEncryptor();
+ std::shared_ptr<Encryptor> GetColumnMetaEncryptor(const std::string& column_path);
+ std::shared_ptr<Encryptor> GetColumnDataEncryptor(const std::string& column_path);
+ void WipeOutEncryptionKeys();
+
+ private:
+ FileEncryptionProperties* properties_;
+
+ std::map<std::string, std::shared_ptr<Encryptor>> column_data_map_;
+ std::map<std::string, std::shared_ptr<Encryptor>> column_metadata_map_;
+
+ std::shared_ptr<Encryptor> footer_signing_encryptor_;
+ std::shared_ptr<Encryptor> footer_encryptor_;
+
+ std::vector<encryption::AesEncryptor*> all_encryptors_;
+
+ // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
+ // types of meta_encryptors and data_encryptors.
+ std::unique_ptr<encryption::AesEncryptor> meta_encryptor_[3];
+ std::unique_ptr<encryption::AesEncryptor> data_encryptor_[3];
+
+ ::arrow::MemoryPool* pool_;
+
+ std::shared_ptr<Encryptor> GetColumnEncryptor(const std::string& column_path,
+ bool metadata);
+
+ encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm,
+ size_t key_len);
+ encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm,
+ size_t key_len);
+
+ int MapKeyLenToEncryptorArrayIndex(int key_len);
+};
+
+} // namespace parquet
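
InternalFileEncryptor consumes a FileEncryptionProperties object. A sketch of building one with a per-column key, assuming the public builders in parquet/encryption/encryption.h (the keys, key ids, and column name are made up):

    #include <memory>
    #include <string>
    #include "parquet/encryption/encryption.h"

    std::shared_ptr<parquet::FileEncryptionProperties> MakeProps() {
      const std::string footer_key = "0123456789012345";  // 16 bytes -> AES-128
      const std::string column_key = "1234567890123450";

      // One column encrypted with its own key; others follow the footer key.
      parquet::ColumnEncryptionProperties::Builder col_builder("some.column");
      parquet::ColumnPathToEncryptionPropertiesMap cols;
      cols["some.column"] =
          col_builder.key(column_key)->key_metadata("kc1")->build();

      parquet::FileEncryptionProperties::Builder builder(footer_key);
      return builder.footer_key_metadata("kf")->encrypted_columns(cols)->build();
    }
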
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc b/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
index c333957dd1d..909a5079c76 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
@@ -1,27 +1,27 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/exception.h"
-
-namespace parquet {
-
-std::ostream& operator<<(std::ostream& os, const ParquetException& exception) {
- os << exception.what();
- return os;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+std::ostream& operator<<(std::ostream& os, const ParquetException& exception) {
+ os << exception.what();
+ return os;
+}
+
+} // namespace parquet
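
A minimal sketch of the streaming this operator enables (main() is illustrative only):

    #include <iostream>
    #include "parquet/exception.h"

    int main() {
      parquet::ParquetException e("example message");
      std::cout << e << std::endl;  // prints: example message
      return 0;
    }
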
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/exception.h b/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
index 826f5bdc8bf..a76761c63c3 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
@@ -1,158 +1,158 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <exception>
-#include <sstream>
-#include <string>
-#include <utility>
-
-#include "arrow/type_fwd.h"
-#include "arrow/util/string_builder.h"
-#include "parquet/platform.h"
-
-// PARQUET-1085
-#if !defined(ARROW_UNUSED)
-#define ARROW_UNUSED(x) UNUSED(x)
-#endif
-
-// Parquet exception to Arrow Status
-
-#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
-#define END_PARQUET_CATCH_EXCEPTIONS \
- } \
- catch (const ::parquet::ParquetStatusException& e) { \
- return e.status(); \
- } \
- catch (const ::parquet::ParquetException& e) { \
- return ::arrow::Status::IOError(e.what()); \
- }
-
-// clang-format off
-
-#define PARQUET_CATCH_NOT_OK(s) \
- BEGIN_PARQUET_CATCH_EXCEPTIONS \
- (s); \
- END_PARQUET_CATCH_EXCEPTIONS
-
-// clang-format on
-
-#define PARQUET_CATCH_AND_RETURN(s) \
- BEGIN_PARQUET_CATCH_EXCEPTIONS \
- return (s); \
- END_PARQUET_CATCH_EXCEPTIONS
-
-// Arrow Status to Parquet exception
-
-#define PARQUET_IGNORE_NOT_OK(s) \
- do { \
- ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
- ARROW_UNUSED(_s); \
- } while (0)
-
-#define PARQUET_THROW_NOT_OK(s) \
- do { \
- ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
- if (!_s.ok()) { \
- throw ::parquet::ParquetStatusException(std::move(_s)); \
- } \
- } while (0)
-
-#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
- auto status_name = (rexpr); \
- PARQUET_THROW_NOT_OK(status_name.status()); \
- lhs = std::move(status_name).ValueOrDie();
-
-#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr) \
- PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
- lhs, rexpr);
-
-namespace parquet {
-
-class ParquetException : public std::exception {
- public:
- PARQUET_NORETURN static void EofException(const std::string& msg = "") {
- static std::string prefix = "Unexpected end of stream";
- if (msg.empty()) {
- throw ParquetException(prefix);
- }
- throw ParquetException(prefix, ": ", msg);
- }
-
- PARQUET_NORETURN static void NYI(const std::string& msg = "") {
- throw ParquetException("Not yet implemented: ", msg, ".");
- }
-
- template <typename... Args>
- explicit ParquetException(Args&&... args)
- : msg_(::arrow::util::StringBuilder(std::forward<Args>(args)...)) {}
-
- explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
-
- explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
-
- ParquetException(const ParquetException&) = default;
- ParquetException& operator=(const ParquetException&) = default;
- ParquetException(ParquetException&&) = default;
- ParquetException& operator=(ParquetException&&) = default;
-
- const char* what() const noexcept override { return msg_.c_str(); }
-
- private:
- std::string msg_;
-};
-
-// Support printing a ParquetException.
-// This is needed for clang-on-MSVC, where operator<< is not defined for
-// std::exception.
-PARQUET_EXPORT
-std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
-
-class ParquetStatusException : public ParquetException {
- public:
- explicit ParquetStatusException(::arrow::Status status)
- : ParquetException(status.ToString()), status_(std::move(status)) {}
-
- const ::arrow::Status& status() const { return status_; }
-
- private:
- ::arrow::Status status_;
-};
-
-// This class signals an invalid or corrupted Parquet file.
-class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
- public:
- ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
- default;
-
- template <typename Arg,
- typename std::enable_if<
- !std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
- int>::type = 0,
- typename... Args>
- explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
- : ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
- std::forward<Args>(args)...)) {}
-};
-
-template <typename StatusReturnBlock>
-void ThrowNotOk(StatusReturnBlock&& b) {
- PARQUET_THROW_NOT_OK(b());
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <exception>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/string_builder.h"
+#include "parquet/platform.h"
+
+// PARQUET-1085
+#if !defined(ARROW_UNUSED)
+#define ARROW_UNUSED(x) UNUSED(x)
+#endif
+
+// Parquet exception to Arrow Status
+
+#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
+#define END_PARQUET_CATCH_EXCEPTIONS \
+ } \
+ catch (const ::parquet::ParquetStatusException& e) { \
+ return e.status(); \
+ } \
+ catch (const ::parquet::ParquetException& e) { \
+ return ::arrow::Status::IOError(e.what()); \
+ }
+
+// clang-format off
+
+#define PARQUET_CATCH_NOT_OK(s) \
+ BEGIN_PARQUET_CATCH_EXCEPTIONS \
+ (s); \
+ END_PARQUET_CATCH_EXCEPTIONS
+
+// clang-format on
+
+#define PARQUET_CATCH_AND_RETURN(s) \
+ BEGIN_PARQUET_CATCH_EXCEPTIONS \
+ return (s); \
+ END_PARQUET_CATCH_EXCEPTIONS
+
+// Arrow Status to Parquet exception
+
+#define PARQUET_IGNORE_NOT_OK(s) \
+ do { \
+ ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
+ ARROW_UNUSED(_s); \
+ } while (0)
+
+#define PARQUET_THROW_NOT_OK(s) \
+ do { \
+ ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
+ if (!_s.ok()) { \
+ throw ::parquet::ParquetStatusException(std::move(_s)); \
+ } \
+ } while (0)
+
+#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
+ auto status_name = (rexpr); \
+ PARQUET_THROW_NOT_OK(status_name.status()); \
+ lhs = std::move(status_name).ValueOrDie();
+
+#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr) \
+ PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+ lhs, rexpr);
+
+namespace parquet {
+
+class ParquetException : public std::exception {
+ public:
+ PARQUET_NORETURN static void EofException(const std::string& msg = "") {
+ static std::string prefix = "Unexpected end of stream";
+ if (msg.empty()) {
+ throw ParquetException(prefix);
+ }
+ throw ParquetException(prefix, ": ", msg);
+ }
+
+ PARQUET_NORETURN static void NYI(const std::string& msg = "") {
+ throw ParquetException("Not yet implemented: ", msg, ".");
+ }
+
+ template <typename... Args>
+ explicit ParquetException(Args&&... args)
+ : msg_(::arrow::util::StringBuilder(std::forward<Args>(args)...)) {}
+
+ explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
+
+ explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
+
+ ParquetException(const ParquetException&) = default;
+ ParquetException& operator=(const ParquetException&) = default;
+ ParquetException(ParquetException&&) = default;
+ ParquetException& operator=(ParquetException&&) = default;
+
+ const char* what() const noexcept override { return msg_.c_str(); }
+
+ private:
+ std::string msg_;
+};
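+
+// Illustrative usage (a sketch, not part of this header): the variadic
+// constructor stringifies its arguments via ::arrow::util::StringBuilder,
+// e.g. (with a hypothetical int `column_index`):
+//
+//   throw ParquetException("Invalid column index: ", column_index);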
+
+// Support printing a ParquetException.
+// This is needed for clang-on-MSVC, where operator<< is not defined for
+// std::exception.
+PARQUET_EXPORT
+std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
+
+class ParquetStatusException : public ParquetException {
+ public:
+ explicit ParquetStatusException(::arrow::Status status)
+ : ParquetException(status.ToString()), status_(std::move(status)) {}
+
+ const ::arrow::Status& status() const { return status_; }
+
+ private:
+ ::arrow::Status status_;
+};
+
+// This exception signals an invalid or corrupted Parquet file.
+class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
+ public:
+ ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
+ default;
+
+ template <typename Arg,
+ typename std::enable_if<
+ !std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
+ int>::type = 0,
+ typename... Args>
+ explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
+ : ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
+ std::forward<Args>(args)...)) {}
+};
+
+template <typename StatusReturnBlock>
+void ThrowNotOk(StatusReturnBlock&& b) {
+ PARQUET_THROW_NOT_OK(b());
+}
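+
+// Illustrative usage (a sketch, not part of this header), with a hypothetical
+// `writer` whose Close() returns ::arrow::Status:
+//
+//   ThrowNotOk([&] { return writer->Close(); });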
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
index 4e38901aa0d..3c3c124987e 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
@@ -1,868 +1,868 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/file_reader.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <utility>
-
-#include "arrow/io/caching.h"
-#include "arrow/io/file.h"
-#include "arrow/io/memory.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/future.h"
-#include "arrow/util/int_util_internal.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/ubsan.h"
-#include "parquet/column_reader.h"
-#include "parquet/column_scanner.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/exception.h"
-#include "parquet/file_writer.h"
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-using arrow::internal::AddWithOverflow;
-
-namespace parquet {
-
-// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
-static constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
-static constexpr uint32_t kFooterSize = 8;
-
-// For PARQUET-816
-static constexpr int64_t kMaxDictHeaderSize = 100;
-
-// ----------------------------------------------------------------------
-// RowGroupReader public API
-
-RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
- : contents_(std::move(contents)) {}
-
-std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
- if (i >= metadata()->num_columns()) {
- std::stringstream ss;
- ss << "Trying to read column index " << i << " but row group metadata has only "
- << metadata()->num_columns() << " columns";
- throw ParquetException(ss.str());
- }
- const ColumnDescriptor* descr = metadata()->schema()->Column(i);
-
- std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
- return ColumnReader::Make(
- descr, std::move(page_reader),
- const_cast<ReaderProperties*>(contents_->properties())->memory_pool());
-}
-
-std::shared_ptr<ColumnReader> RowGroupReader::ColumnWithExposeEncoding(
- int i, ExposedEncoding encoding_to_expose) {
- std::shared_ptr<ColumnReader> reader = Column(i);
-
- if (encoding_to_expose == ExposedEncoding::DICTIONARY) {
- // Check the encoding_stats to see if all data pages are dictionary encoded.
- std::unique_ptr<ColumnChunkMetaData> col = metadata()->ColumnChunk(i);
- const std::vector<PageEncodingStats>& encoding_stats = col->encoding_stats();
- if (encoding_stats.empty()) {
-      // Some Parquet files have empty encoding_stats; in that case we cannot
-      // tell whether all data pages are dictionary encoded, so we do not
-      // expose the dictionary.
- return reader;
- }
- // The 1st page should be the dictionary page.
- if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE ||
- (encoding_stats[0].encoding != Encoding::PLAIN &&
- encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) {
- return reader;
- }
- // The following pages should be dictionary encoded data pages.
- for (size_t idx = 1; idx < encoding_stats.size(); ++idx) {
- if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY &&
- encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) ||
- (encoding_stats[idx].page_type != PageType::DATA_PAGE &&
- encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) {
- return reader;
- }
- }
- } else {
-    // Exposing other encodings is not supported for now.
- return reader;
- }
-
- // Set exposed encoding.
- reader->SetExposedEncoding(encoding_to_expose);
- return reader;
-}
-
-std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
- if (i >= metadata()->num_columns()) {
- std::stringstream ss;
- ss << "Trying to read column index " << i << " but row group metadata has only "
- << metadata()->num_columns() << " columns";
- throw ParquetException(ss.str());
- }
- return contents_->GetColumnPageReader(i);
-}
-
-// Returns the row group metadata
-const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->metadata(); }
-
-/// Compute the section of the file that should be read for the given
-/// row group and column chunk.
-::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata,
- int64_t source_size, int row_group_index,
- int column_index) {
- auto row_group_metadata = file_metadata->RowGroup(row_group_index);
- auto column_metadata = row_group_metadata->ColumnChunk(column_index);
-
- int64_t col_start = column_metadata->data_page_offset();
- if (column_metadata->has_dictionary_page() &&
- column_metadata->dictionary_page_offset() > 0 &&
- col_start > column_metadata->dictionary_page_offset()) {
- col_start = column_metadata->dictionary_page_offset();
- }
-
- int64_t col_length = column_metadata->total_compressed_size();
- int64_t col_end;
- if (AddWithOverflow(col_start, col_length, &col_end) || col_end > source_size) {
- throw ParquetException("Invalid column metadata (corrupt file?)");
- }
-
- // PARQUET-816 workaround for old files created by older parquet-mr
- const ApplicationVersion& version = file_metadata->writer_version();
- if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION())) {
- // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
- // dictionary page header size in total_compressed_size and total_uncompressed_size
- // (see IMPALA-694). We add padding to compensate.
- int64_t bytes_remaining = source_size - col_end;
- int64_t padding = std::min<int64_t>(kMaxDictHeaderSize, bytes_remaining);
- col_length += padding;
- }
-
- return {col_start, col_length};
-}
-
-// RowGroupReader::Contents implementation for the Parquet file specification
-class SerializedRowGroup : public RowGroupReader::Contents {
- public:
- SerializedRowGroup(std::shared_ptr<ArrowInputFile> source,
- std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source,
- int64_t source_size, FileMetaData* file_metadata,
- int row_group_number, const ReaderProperties& props,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
- : source_(std::move(source)),
- cached_source_(std::move(cached_source)),
- source_size_(source_size),
- file_metadata_(file_metadata),
- properties_(props),
- row_group_ordinal_(row_group_number),
- file_decryptor_(file_decryptor) {
- row_group_metadata_ = file_metadata->RowGroup(row_group_number);
- }
-
- const RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); }
-
- const ReaderProperties* properties() const override { return &properties_; }
-
- std::unique_ptr<PageReader> GetColumnPageReader(int i) override {
- // Read column chunk from the file
- auto col = row_group_metadata_->ColumnChunk(i);
-
- ::arrow::io::ReadRange col_range =
- ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i);
- std::shared_ptr<ArrowInputStream> stream;
- if (cached_source_) {
- // PARQUET-1698: if read coalescing is enabled, read from pre-buffered
- // segments.
- PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range));
- stream = std::make_shared<::arrow::io::BufferReader>(buffer);
- } else {
- stream = properties_.GetStream(source_, col_range.offset, col_range.length);
- }
-
- std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = col->crypto_metadata();
-
- // Column is encrypted only if crypto_metadata exists.
- if (!crypto_metadata) {
- return PageReader::Open(stream, col->num_values(), col->compression(),
- properties_.memory_pool());
- }
-
- if (file_decryptor_ == nullptr) {
-      throw ParquetException(
-          "RowGroup is marked as encrypted but no file decryptor was provided");
- }
-
- constexpr auto kEncryptedRowGroupsLimit = 32767;
- if (i > kEncryptedRowGroupsLimit) {
- throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
- }
-
- // The column is encrypted
- std::shared_ptr<Decryptor> meta_decryptor;
- std::shared_ptr<Decryptor> data_decryptor;
- // The column is encrypted with footer key
- if (crypto_metadata->encrypted_with_footer_key()) {
- meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta();
- data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData();
- CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
- static_cast<int16_t>(i), meta_decryptor, data_decryptor);
- return PageReader::Open(stream, col->num_values(), col->compression(),
- properties_.memory_pool(), &ctx);
- }
-
- // The column is encrypted with its own key
- std::string column_key_metadata = crypto_metadata->key_metadata();
- const std::string column_path = crypto_metadata->path_in_schema()->ToDotString();
-
- meta_decryptor =
- file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata);
- data_decryptor =
- file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata);
-
- CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
- static_cast<int16_t>(i), meta_decryptor, data_decryptor);
- return PageReader::Open(stream, col->num_values(), col->compression(),
- properties_.memory_pool(), &ctx);
- }
-
- private:
- std::shared_ptr<ArrowInputFile> source_;
- // Will be nullptr if PreBuffer() is not called.
- std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
- int64_t source_size_;
- FileMetaData* file_metadata_;
- std::unique_ptr<RowGroupMetaData> row_group_metadata_;
- ReaderProperties properties_;
- int row_group_ordinal_;
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-};
-
-// ----------------------------------------------------------------------
-// SerializedFile: An implementation of ParquetFileReader::Contents that deals
-// with the Parquet file structure, Thrift deserialization, and other internal
-// matters
-
-// This class takes ownership of the provided data source
-class SerializedFile : public ParquetFileReader::Contents {
- public:
- SerializedFile(std::shared_ptr<ArrowInputFile> source,
- const ReaderProperties& props = default_reader_properties())
- : source_(std::move(source)), properties_(props) {
- PARQUET_ASSIGN_OR_THROW(source_size_, source_->GetSize());
- }
-
- ~SerializedFile() override {
- try {
- Close();
- } catch (...) {
- }
- }
-
- void Close() override {
- if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys();
- }
-
- std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
- std::unique_ptr<SerializedRowGroup> contents(
- new SerializedRowGroup(source_, cached_source_, source_size_,
- file_metadata_.get(), i, properties_, file_decryptor_));
- return std::make_shared<RowGroupReader>(std::move(contents));
- }
-
- std::shared_ptr<FileMetaData> metadata() const override { return file_metadata_; }
-
- void set_metadata(std::shared_ptr<FileMetaData> metadata) {
- file_metadata_ = std::move(metadata);
- }
-
- void PreBuffer(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- const ::arrow::io::IOContext& ctx,
- const ::arrow::io::CacheOptions& options) {
- cached_source_ =
- std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options);
- std::vector<::arrow::io::ReadRange> ranges;
- for (int row : row_groups) {
- for (int col : column_indices) {
- ranges.push_back(
- ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
- }
- }
- PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
- }
-
- ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices) const {
- if (!cached_source_) {
- return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
- }
- std::vector<::arrow::io::ReadRange> ranges;
- for (int row : row_groups) {
- for (int col : column_indices) {
- ranges.push_back(
- ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
- }
- }
- return cached_source_->WaitFor(ranges);
- }
-
- // Metadata/footer parsing. Divided up to separate sync/async paths, and to use
- // exceptions for error handling (with the async path converting to Future/Status).
-
- void ParseMetaData() {
- int64_t footer_read_size = GetFooterReadSize();
- PARQUET_ASSIGN_OR_THROW(
- auto footer_buffer,
- source_->ReadAt(source_size_ - footer_read_size, footer_read_size));
- uint32_t metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
- int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
-
- std::shared_ptr<::arrow::Buffer> metadata_buffer;
- if (footer_read_size >= (metadata_len + kFooterSize)) {
- metadata_buffer = SliceBuffer(
- footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len);
- } else {
- PARQUET_ASSIGN_OR_THROW(metadata_buffer,
- source_->ReadAt(metadata_start, metadata_len));
- }
-
- // Parse the footer depending on encryption type
- const bool is_encrypted_footer =
- memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
- if (is_encrypted_footer) {
- // Encrypted file with Encrypted footer.
- const std::pair<int64_t, uint32_t> read_size =
- ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
- // Read the actual footer
- metadata_start = read_size.first;
- metadata_len = read_size.second;
- PARQUET_ASSIGN_OR_THROW(metadata_buffer,
- source_->ReadAt(metadata_start, metadata_len));
- // Fall through
- }
-
- const uint32_t read_metadata_len =
- ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
- auto file_decryption_properties = properties_.file_decryption_properties().get();
- if (is_encrypted_footer) {
- // Nothing else to do here.
- return;
-    } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non-encrypted file.
- if (file_decryption_properties != nullptr) {
- if (!file_decryption_properties->plaintext_files_allowed()) {
- throw ParquetException("Applying decryption properties on plaintext file");
- }
- }
- } else {
- // Encrypted file with plaintext footer mode.
- ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
- }
- }
-
- // Validate the source size and get the initial read size.
- int64_t GetFooterReadSize() {
- if (source_size_ == 0) {
- throw ParquetInvalidOrCorruptedFileException("Parquet file size is 0 bytes");
- } else if (source_size_ < kFooterSize) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet file size is ", source_size_,
- " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)");
- }
- return std::min(source_size_, kDefaultFooterReadSize);
- }
-
- // Validate the magic bytes and get the length of the full footer.
- uint32_t ParseFooterLength(const std::shared_ptr<::arrow::Buffer>& footer_buffer,
- const int64_t footer_read_size) {
-    // Check that all bytes were read and that the last 4 bytes contain the magic.
- if (footer_buffer->size() != footer_read_size ||
- (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 &&
- memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet magic bytes not found in footer. Either the file is corrupted or this "
- "is not a parquet file.");
- }
- // Both encrypted/unencrypted footers have the same footer length check.
- uint32_t metadata_len = ::arrow::util::SafeLoadAs<uint32_t>(
- reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
- kFooterSize);
- if (metadata_len > source_size_ - kFooterSize) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet file size is ", source_size_,
-          " bytes, smaller than the size reported by the footer (", metadata_len,
-          " bytes)");
- }
- return metadata_len;
- }
-
- // Does not throw.
- ::arrow::Future<> ParseMetaDataAsync() {
- int64_t footer_read_size;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- footer_read_size = GetFooterReadSize();
- END_PARQUET_CATCH_EXCEPTIONS
- // Assumes this is kept alive externally
- return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
- .Then([=](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
- -> ::arrow::Future<> {
- uint32_t metadata_len;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
- END_PARQUET_CATCH_EXCEPTIONS
- int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
-
- std::shared_ptr<::arrow::Buffer> metadata_buffer;
- if (footer_read_size >= (metadata_len + kFooterSize)) {
- metadata_buffer =
- SliceBuffer(footer_buffer, footer_read_size - metadata_len - kFooterSize,
- metadata_len);
- return ParseMaybeEncryptedMetaDataAsync(footer_buffer,
- std::move(metadata_buffer),
- footer_read_size, metadata_len);
- }
- return source_->ReadAsync(metadata_start, metadata_len)
- .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
- return ParseMaybeEncryptedMetaDataAsync(footer_buffer, metadata_buffer,
- footer_read_size, metadata_len);
- });
- });
- }
-
- // Continuation
- ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
- std::shared_ptr<::arrow::Buffer> footer_buffer,
- std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
- uint32_t metadata_len) {
- // Parse the footer depending on encryption type
- const bool is_encrypted_footer =
- memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
- if (is_encrypted_footer) {
- // Encrypted file with Encrypted footer.
- std::pair<int64_t, uint32_t> read_size;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- read_size =
- ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
- END_PARQUET_CATCH_EXCEPTIONS
- // Read the actual footer
- int64_t metadata_start = read_size.first;
- metadata_len = read_size.second;
- return source_->ReadAsync(metadata_start, metadata_len)
- .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
- // Continue and read the file footer
- return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer);
- });
- }
- return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
- is_encrypted_footer);
- }
-
- // Continuation
- ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
- uint32_t metadata_len,
- const bool is_encrypted_footer) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- const uint32_t read_metadata_len =
- ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
- auto file_decryption_properties = properties_.file_decryption_properties().get();
- if (is_encrypted_footer) {
- // Nothing else to do here.
- return ::arrow::Status::OK();
-    } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non-encrypted file.
- if (file_decryption_properties != nullptr) {
- if (!file_decryption_properties->plaintext_files_allowed()) {
- throw ParquetException("Applying decryption properties on plaintext file");
- }
- }
- } else {
- // Encrypted file with plaintext footer mode.
- ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
- }
- END_PARQUET_CATCH_EXCEPTIONS
- return ::arrow::Status::OK();
- }
-
- private:
- std::shared_ptr<ArrowInputFile> source_;
- std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
- int64_t source_size_;
- std::shared_ptr<FileMetaData> file_metadata_;
- ReaderProperties properties_;
-
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-
- // \return The true length of the metadata in bytes
- uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr<Buffer>& footer_buffer,
- const uint32_t metadata_len);
-
- std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties,
- EncryptionAlgorithm& algo);
-
- void ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- FileDecryptionProperties* file_decryption_properties,
- const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
- uint32_t read_metadata_len);
-
- // \return The position and size of the actual footer
- std::pair<int64_t, uint32_t> ParseMetaDataOfEncryptedFileWithEncryptedFooter(
- const std::shared_ptr<Buffer>& crypto_metadata_buffer, uint32_t footer_len);
-};
-
-uint32_t SerializedFile::ParseUnencryptedFileMetadata(
- const std::shared_ptr<Buffer>& metadata_buffer, const uint32_t metadata_len) {
- if (metadata_buffer->size() != metadata_len) {
- throw ParquetException("Failed reading metadata buffer (requested " +
- std::to_string(metadata_len) + " bytes but got " +
- std::to_string(metadata_buffer->size()) + " bytes)");
- }
- uint32_t read_metadata_len = metadata_len;
- // The encrypted read path falls through to here, so pass in the decryptor
- file_metadata_ =
- FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, file_decryptor_);
- return read_metadata_len;
-}
-
-std::pair<int64_t, uint32_t>
-SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter(
- const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer,
- // both metadata & crypto metadata length
- const uint32_t footer_len) {
- // encryption with encrypted footer
- // Check if the footer_buffer contains the entire metadata
- if (crypto_metadata_buffer->size() != footer_len) {
- throw ParquetException("Failed reading encrypted metadata buffer (requested " +
- std::to_string(footer_len) + " bytes but got " +
- std::to_string(crypto_metadata_buffer->size()) + " bytes)");
- }
- auto file_decryption_properties = properties_.file_decryption_properties().get();
- if (file_decryption_properties == nullptr) {
- throw ParquetException(
-        "Could not read encrypted metadata, no decryption properties set on the reader");
- }
- uint32_t crypto_metadata_len = footer_len;
- std::shared_ptr<FileCryptoMetaData> file_crypto_metadata =
- FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len);
- // Handle AAD prefix
- EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm();
- std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
- file_decryptor_ = std::make_shared<InternalFileDecryptor>(
- file_decryption_properties, file_aad, algo.algorithm,
- file_crypto_metadata->key_metadata(), properties_.memory_pool());
-
- int64_t metadata_offset = source_size_ - kFooterSize - footer_len + crypto_metadata_len;
- uint32_t metadata_len = footer_len - crypto_metadata_len;
- return std::make_pair(metadata_offset, metadata_len);
-}
-
-void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- FileDecryptionProperties* file_decryption_properties,
- const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
- uint32_t read_metadata_len) {
-  // Providing decryption properties in plaintext footer mode is not mandatory,
-  // e.g. when the file is being read by a legacy reader.
- if (file_decryption_properties != nullptr) {
- EncryptionAlgorithm algo = file_metadata_->encryption_algorithm();
- // Handle AAD prefix
- std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
- file_decryptor_ = std::make_shared<InternalFileDecryptor>(
- file_decryption_properties, file_aad, algo.algorithm,
- file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
- // set the InternalFileDecryptor in the metadata as well, as it's used
- // for signature verification and for ColumnChunkMetaData creation.
- file_metadata_->set_file_decryptor(file_decryptor_);
-
- if (file_decryption_properties->check_plaintext_footer_integrity()) {
- if (metadata_len - read_metadata_len !=
- (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
- throw ParquetInvalidOrCorruptedFileException(
- "Failed reading metadata for encryption signature (requested ",
- parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
- " bytes but have ", metadata_len - read_metadata_len, " bytes)");
- }
-
- if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet crypto signature verification failed");
- }
- }
- }
-}
-
-std::string SerializedFile::HandleAadPrefix(
- FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) {
- std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix();
- std::string aad_prefix = aad_prefix_in_properties;
-  bool file_has_aad_prefix = !algo.aad.aad_prefix.empty();
- std::string aad_prefix_in_file = algo.aad.aad_prefix;
-
- if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) {
- throw ParquetException(
- "AAD prefix used for file encryption, "
- "but not stored in file and not supplied "
- "in decryption properties");
- }
-
- if (file_has_aad_prefix) {
- if (!aad_prefix_in_properties.empty()) {
- if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) {
- throw ParquetException(
- "AAD Prefix in file and in properties "
- "is not the same");
- }
- }
- aad_prefix = aad_prefix_in_file;
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
- file_decryption_properties->aad_prefix_verifier();
- if (aad_prefix_verifier != nullptr) aad_prefix_verifier->Verify(aad_prefix);
- } else {
- if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) {
- throw ParquetException(
- "AAD Prefix set in decryption properties, but was not used "
- "for file encryption");
- }
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
- file_decryption_properties->aad_prefix_verifier();
- if (aad_prefix_verifier != nullptr) {
- throw ParquetException(
- "AAD Prefix Verifier is set, but AAD Prefix not found in file");
- }
- }
- return aad_prefix + algo.aad.aad_file_unique;
-}
-
-// ----------------------------------------------------------------------
-// ParquetFileReader public API
-
-ParquetFileReader::ParquetFileReader() {}
-
-ParquetFileReader::~ParquetFileReader() {
- try {
- Close();
- } catch (...) {
- }
-}
-
-// Open the file. If no metadata is passed, it is parsed from the footer of
-// the file
-std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
- std::shared_ptr<ArrowInputFile> source, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- std::unique_ptr<ParquetFileReader::Contents> result(
- new SerializedFile(std::move(source), props));
-
- // Access private methods here, but otherwise unavailable
- SerializedFile* file = static_cast<SerializedFile*>(result.get());
-
- if (metadata == nullptr) {
- // Validates magic bytes, parses metadata, and initializes the SchemaDescriptor
- file->ParseMetaData();
- } else {
- file->set_metadata(std::move(metadata));
- }
-
- return result;
-}
-
-::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
-ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
- const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- std::unique_ptr<ParquetFileReader::Contents> result(
- new SerializedFile(std::move(source), props));
- SerializedFile* file = static_cast<SerializedFile*>(result.get());
- if (metadata == nullptr) {
- // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
- struct {
- ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
- return std::move(result);
- }
-
- std::unique_ptr<ParquetFileReader::Contents> result;
- } Continuation;
- Continuation.result = std::move(result);
- return file->ParseMetaDataAsync().Then(std::move(Continuation));
- } else {
- file->set_metadata(std::move(metadata));
- return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
- std::move(result));
- }
- END_PARQUET_CATCH_EXCEPTIONS
-}
-
-std::unique_ptr<ParquetFileReader> ParquetFileReader::Open(
- std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- auto contents = SerializedFile::Open(std::move(source), props, std::move(metadata));
- std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
- result->Open(std::move(contents));
- return result;
-}
-
-std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
- const std::string& path, bool memory_map, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- std::shared_ptr<::arrow::io::RandomAccessFile> source;
- if (memory_map) {
- PARQUET_ASSIGN_OR_THROW(
- source, ::arrow::io::MemoryMappedFile::Open(path, ::arrow::io::FileMode::READ));
- } else {
- PARQUET_ASSIGN_OR_THROW(source,
- ::arrow::io::ReadableFile::Open(path, props.memory_pool()));
- }
-
- return Open(std::move(source), props, std::move(metadata));
-}
-
-::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
- std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
- // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
- auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
- fut.AddCallback([fut, completed](
- const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
- contents) mutable {
- if (!contents.ok()) {
- completed.MarkFinished(contents.status());
- return;
- }
- std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
- result->Open(fut.MoveResult().MoveValueUnsafe());
- completed.MarkFinished(std::move(result));
- });
- return completed;
- END_PARQUET_CATCH_EXCEPTIONS
-}
-
-void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) {
- contents_ = std::move(contents);
-}
-
-void ParquetFileReader::Close() {
- if (contents_) {
- contents_->Close();
- }
-}
-
-std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
- return contents_->metadata();
-}
-
-std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
- if (i >= metadata()->num_row_groups()) {
- std::stringstream ss;
- ss << "Trying to read row group " << i << " but file only has "
- << metadata()->num_row_groups() << " row groups";
- throw ParquetException(ss.str());
- }
- return contents_->GetRowGroup(i);
-}
-
-void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- const ::arrow::io::IOContext& ctx,
- const ::arrow::io::CacheOptions& options) {
- // Access private methods here
- SerializedFile* file =
- ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
- file->PreBuffer(row_groups, column_indices, ctx, options);
-}
-
-::arrow::Future<> ParquetFileReader::WhenBuffered(
- const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
- // Access private methods here
- SerializedFile* file =
- ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
- return file->WhenBuffered(row_groups, column_indices);
-}
-
-// ----------------------------------------------------------------------
-// File metadata helpers
-
-std::shared_ptr<FileMetaData> ReadMetaData(
- const std::shared_ptr<::arrow::io::RandomAccessFile>& source) {
- return ParquetFileReader::Open(source)->metadata();
-}
-
-// ----------------------------------------------------------------------
-// File scanner for performance testing
-
-int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
- ParquetFileReader* reader) {
- std::vector<int16_t> rep_levels(column_batch_size);
- std::vector<int16_t> def_levels(column_batch_size);
-
- int num_columns = static_cast<int>(columns.size());
-
- // columns are not specified explicitly. Add all columns
- if (columns.size() == 0) {
- num_columns = reader->metadata()->num_columns();
- columns.resize(num_columns);
- for (int i = 0; i < num_columns; i++) {
- columns[i] = i;
- }
- }
-
- std::vector<int64_t> total_rows(num_columns, 0);
-
- for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
- auto group_reader = reader->RowGroup(r);
- int col = 0;
- for (auto i : columns) {
- std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
- size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
- std::vector<uint8_t> values(column_batch_size * value_byte_size);
-
- int64_t values_read = 0;
- while (col_reader->HasNext()) {
- int64_t levels_read =
- ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(),
- values.data(), &values_read, col_reader.get());
- if (col_reader->descr()->max_repetition_level() > 0) {
- for (int64_t i = 0; i < levels_read; i++) {
- if (rep_levels[i] == 0) {
- total_rows[col]++;
- }
- }
- } else {
- total_rows[col] += levels_read;
- }
- }
- col++;
- }
- }
-
- for (int i = 1; i < num_columns; ++i) {
- if (total_rows[0] != total_rows[i]) {
- throw ParquetException("Parquet error: Total rows among columns do not match");
- }
- }
-
- return total_rows[0];
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "arrow/io/caching.h"
+#include "arrow/io/file.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/future.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/column_reader.h"
+#include "parquet/column_scanner.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+using arrow::internal::AddWithOverflow;
+
+namespace parquet {
+
+// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
+static constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
+static constexpr uint32_t kFooterSize = 8;
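+// The file tail is laid out as [FileMetaData][4-byte little-endian metadata
+// length]["PAR1" magic, or "PARE" for an encrypted footer]; kFooterSize
+// covers the trailing length + magic.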
+
+// For PARQUET-816
+static constexpr int64_t kMaxDictHeaderSize = 100;
+
+// ----------------------------------------------------------------------
+// RowGroupReader public API
+
+RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
+ : contents_(std::move(contents)) {}
+
+std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
+ const ColumnDescriptor* descr = metadata()->schema()->Column(i);
+
+ std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
+ return ColumnReader::Make(
+ descr, std::move(page_reader),
+ const_cast<ReaderProperties*>(contents_->properties())->memory_pool());
+}
+
+std::shared_ptr<ColumnReader> RowGroupReader::ColumnWithExposeEncoding(
+ int i, ExposedEncoding encoding_to_expose) {
+ std::shared_ptr<ColumnReader> reader = Column(i);
+
+ if (encoding_to_expose == ExposedEncoding::DICTIONARY) {
+ // Check the encoding_stats to see if all data pages are dictionary encoded.
+ std::unique_ptr<ColumnChunkMetaData> col = metadata()->ColumnChunk(i);
+ const std::vector<PageEncodingStats>& encoding_stats = col->encoding_stats();
+ if (encoding_stats.empty()) {
+      // Some Parquet files have empty encoding_stats; in that case we cannot
+      // tell whether all data pages are dictionary encoded, so we do not
+      // expose the dictionary.
+ return reader;
+ }
+ // The 1st page should be the dictionary page.
+ if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE ||
+ (encoding_stats[0].encoding != Encoding::PLAIN &&
+ encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) {
+ return reader;
+ }
+ // The following pages should be dictionary encoded data pages.
+ for (size_t idx = 1; idx < encoding_stats.size(); ++idx) {
+ if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY &&
+ encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) ||
+ (encoding_stats[idx].page_type != PageType::DATA_PAGE &&
+ encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) {
+ return reader;
+ }
+ }
+ } else {
+    // Exposing other encodings is not supported for now.
+ return reader;
+ }
+
+ // Set exposed encoding.
+ reader->SetExposedEncoding(encoding_to_expose);
+ return reader;
+}
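+
+// Illustrative usage (a sketch, not from this file), assuming the matching
+// ColumnReader::GetExposedEncoding() accessor and a hypothetical `row_group`:
+//
+//   auto reader = row_group->ColumnWithExposeEncoding(
+//       /*i=*/0, ExposedEncoding::DICTIONARY);
+//   if (reader->GetExposedEncoding() == ExposedEncoding::DICTIONARY) {
+//     // Safe to read raw dictionary indices plus the dictionary itself.
+//   }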
+
+std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
+ return contents_->GetColumnPageReader(i);
+}
+
+// Returns the row group metadata
+const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->metadata(); }
+
+/// Compute the section of the file that should be read for the given
+/// row group and column chunk.
+::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata,
+ int64_t source_size, int row_group_index,
+ int column_index) {
+ auto row_group_metadata = file_metadata->RowGroup(row_group_index);
+ auto column_metadata = row_group_metadata->ColumnChunk(column_index);
+
+ int64_t col_start = column_metadata->data_page_offset();
+ if (column_metadata->has_dictionary_page() &&
+ column_metadata->dictionary_page_offset() > 0 &&
+ col_start > column_metadata->dictionary_page_offset()) {
+ col_start = column_metadata->dictionary_page_offset();
+ }
+
+ int64_t col_length = column_metadata->total_compressed_size();
+ int64_t col_end;
+ if (AddWithOverflow(col_start, col_length, &col_end) || col_end > source_size) {
+ throw ParquetException("Invalid column metadata (corrupt file?)");
+ }
+
+ // PARQUET-816 workaround for old files created by older parquet-mr
+ const ApplicationVersion& version = file_metadata->writer_version();
+ if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION())) {
+ // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
+ // dictionary page header size in total_compressed_size and total_uncompressed_size
+ // (see IMPALA-694). We add padding to compensate.
+ int64_t bytes_remaining = source_size - col_end;
+ int64_t padding = std::min<int64_t>(kMaxDictHeaderSize, bytes_remaining);
+ col_length += padding;
+ }
+
+ return {col_start, col_length};
+}
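+
+// Worked example (illustrative): with dictionary_page_offset = 100,
+// data_page_offset = 160 and total_compressed_size = 1000, the chunk is read
+// as {offset = 100, length = 1000}, plus up to kMaxDictHeaderSize bytes of
+// PARQUET-816 padding for old writers.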
+
+// RowGroupReader::Contents implementation for the Parquet file specification
+class SerializedRowGroup : public RowGroupReader::Contents {
+ public:
+ SerializedRowGroup(std::shared_ptr<ArrowInputFile> source,
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source,
+ int64_t source_size, FileMetaData* file_metadata,
+ int row_group_number, const ReaderProperties& props,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
+ : source_(std::move(source)),
+ cached_source_(std::move(cached_source)),
+ source_size_(source_size),
+ file_metadata_(file_metadata),
+ properties_(props),
+ row_group_ordinal_(row_group_number),
+ file_decryptor_(file_decryptor) {
+ row_group_metadata_ = file_metadata->RowGroup(row_group_number);
+ }
+
+ const RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); }
+
+ const ReaderProperties* properties() const override { return &properties_; }
+
+ std::unique_ptr<PageReader> GetColumnPageReader(int i) override {
+ // Read column chunk from the file
+ auto col = row_group_metadata_->ColumnChunk(i);
+
+ ::arrow::io::ReadRange col_range =
+ ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i);
+ std::shared_ptr<ArrowInputStream> stream;
+ if (cached_source_) {
+ // PARQUET-1698: if read coalescing is enabled, read from pre-buffered
+ // segments.
+ PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range));
+ stream = std::make_shared<::arrow::io::BufferReader>(buffer);
+ } else {
+ stream = properties_.GetStream(source_, col_range.offset, col_range.length);
+ }
+
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = col->crypto_metadata();
+
+ // Column is encrypted only if crypto_metadata exists.
+ if (!crypto_metadata) {
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool());
+ }
+
+ if (file_decryptor_ == nullptr) {
+      throw ParquetException(
+          "RowGroup is marked as encrypted but no file decryptor was provided");
+ }
+
+ constexpr auto kEncryptedRowGroupsLimit = 32767;
+ if (i > kEncryptedRowGroupsLimit) {
+ throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
+ }
+
+ // The column is encrypted
+ std::shared_ptr<Decryptor> meta_decryptor;
+ std::shared_ptr<Decryptor> data_decryptor;
+ // The column is encrypted with footer key
+ if (crypto_metadata->encrypted_with_footer_key()) {
+ meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta();
+ data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData();
+ CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+ static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool(), &ctx);
+ }
+
+ // The column is encrypted with its own key
+ std::string column_key_metadata = crypto_metadata->key_metadata();
+ const std::string column_path = crypto_metadata->path_in_schema()->ToDotString();
+
+ meta_decryptor =
+ file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata);
+ data_decryptor =
+ file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata);
+
+ CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+ static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool(), &ctx);
+ }
+
+ private:
+ std::shared_ptr<ArrowInputFile> source_;
+ // Will be nullptr if PreBuffer() is not called.
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
+ int64_t source_size_;
+ FileMetaData* file_metadata_;
+ std::unique_ptr<RowGroupMetaData> row_group_metadata_;
+ ReaderProperties properties_;
+ int row_group_ordinal_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+};
+
+// ----------------------------------------------------------------------
+// SerializedFile: An implementation of ParquetFileReader::Contents that deals
+// with the Parquet file structure, Thrift deserialization, and other internal
+// matters
+
+// This class takes ownership of the provided data source
+class SerializedFile : public ParquetFileReader::Contents {
+ public:
+ SerializedFile(std::shared_ptr<ArrowInputFile> source,
+ const ReaderProperties& props = default_reader_properties())
+ : source_(std::move(source)), properties_(props) {
+ PARQUET_ASSIGN_OR_THROW(source_size_, source_->GetSize());
+ }
+
+ ~SerializedFile() override {
+ try {
+ Close();
+ } catch (...) {
+ }
+ }
+
+ void Close() override {
+ if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys();
+ }
+
+ std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
+ std::unique_ptr<SerializedRowGroup> contents(
+ new SerializedRowGroup(source_, cached_source_, source_size_,
+ file_metadata_.get(), i, properties_, file_decryptor_));
+ return std::make_shared<RowGroupReader>(std::move(contents));
+ }
+
+ std::shared_ptr<FileMetaData> metadata() const override { return file_metadata_; }
+
+ void set_metadata(std::shared_ptr<FileMetaData> metadata) {
+ file_metadata_ = std::move(metadata);
+ }
+
+ void PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options) {
+ cached_source_ =
+ std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options);
+ std::vector<::arrow::io::ReadRange> ranges;
+ for (int row : row_groups) {
+ for (int col : column_indices) {
+ ranges.push_back(
+ ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
+ }
+ }
+ PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
+ }
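+
+  // Illustrative usage (a sketch, not from this file), typically driven via
+  // ParquetFileReader::PreBuffer with a hypothetical `reader`:
+  //
+  //   reader->PreBuffer(/*row_groups=*/{0}, /*column_indices=*/{0, 1},
+  //                     ::arrow::io::IOContext(),
+  //                     ::arrow::io::CacheOptions::Defaults());
+  //   reader->WhenBuffered({0}, {0, 1}).Wait();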
+
+ ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) const {
+ if (!cached_source_) {
+ return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
+ }
+ std::vector<::arrow::io::ReadRange> ranges;
+ for (int row : row_groups) {
+ for (int col : column_indices) {
+ ranges.push_back(
+ ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
+ }
+ }
+ return cached_source_->WaitFor(ranges);
+ }
+
+ // Metadata/footer parsing. Divided up to separate sync/async paths, and to use
+ // exceptions for error handling (with the async path converting to Future/Status).
+
+ void ParseMetaData() {
+ int64_t footer_read_size = GetFooterReadSize();
+ PARQUET_ASSIGN_OR_THROW(
+ auto footer_buffer,
+ source_->ReadAt(source_size_ - footer_read_size, footer_read_size));
+ uint32_t metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
+ int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
+
+ std::shared_ptr<::arrow::Buffer> metadata_buffer;
+ if (footer_read_size >= (metadata_len + kFooterSize)) {
+ metadata_buffer = SliceBuffer(
+ footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len);
+ } else {
+ PARQUET_ASSIGN_OR_THROW(metadata_buffer,
+ source_->ReadAt(metadata_start, metadata_len));
+ }
+
+ // Parse the footer depending on encryption type
+ const bool is_encrypted_footer =
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
+ if (is_encrypted_footer) {
+ // Encrypted file with Encrypted footer.
+ const std::pair<int64_t, uint32_t> read_size =
+ ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
+ // Read the actual footer
+ metadata_start = read_size.first;
+ metadata_len = read_size.second;
+ PARQUET_ASSIGN_OR_THROW(metadata_buffer,
+ source_->ReadAt(metadata_start, metadata_len));
+ // Fall through
+ }
+
+ const uint32_t read_metadata_len =
+ ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (is_encrypted_footer) {
+ // Nothing else to do here.
+ return;
+    } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non-encrypted file.
+ if (file_decryption_properties != nullptr) {
+ if (!file_decryption_properties->plaintext_files_allowed()) {
+ throw ParquetException("Applying decryption properties on plaintext file");
+ }
+ }
+ } else {
+ // Encrypted file with plaintext footer mode.
+ ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
+ }
+ }
+
+ // Validate the source size and get the initial read size.
+ int64_t GetFooterReadSize() {
+ if (source_size_ == 0) {
+ throw ParquetInvalidOrCorruptedFileException("Parquet file size is 0 bytes");
+ } else if (source_size_ < kFooterSize) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet file size is ", source_size_,
+ " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)");
+ }
+ return std::min(source_size_, kDefaultFooterReadSize);
+ }
+
+ // Validate the magic bytes and get the length of the full footer.
+ uint32_t ParseFooterLength(const std::shared_ptr<::arrow::Buffer>& footer_buffer,
+ const int64_t footer_read_size) {
+    // Check that all bytes were read and that the last 4 bytes contain the magic.
+ if (footer_buffer->size() != footer_read_size ||
+ (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 &&
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet magic bytes not found in footer. Either the file is corrupted or this "
+ "is not a parquet file.");
+ }
+ // Both encrypted/unencrypted footers have the same footer length check.
+ uint32_t metadata_len = ::arrow::util::SafeLoadAs<uint32_t>(
+ reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
+ kFooterSize);
+ if (metadata_len > source_size_ - kFooterSize) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet file size is ", source_size_,
+          " bytes, smaller than the size reported by the footer (", metadata_len,
+          " bytes)");
+ }
+ return metadata_len;
+ }
+
+ // Does not throw.
+ ::arrow::Future<> ParseMetaDataAsync() {
+ int64_t footer_read_size;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ footer_read_size = GetFooterReadSize();
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Assumes this is kept alive externally
+ return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
+ -> ::arrow::Future<> {
+ uint32_t metadata_len;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
+ END_PARQUET_CATCH_EXCEPTIONS
+ int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
+
+ std::shared_ptr<::arrow::Buffer> metadata_buffer;
+ if (footer_read_size >= (metadata_len + kFooterSize)) {
+ metadata_buffer =
+ SliceBuffer(footer_buffer, footer_read_size - metadata_len - kFooterSize,
+ metadata_len);
+ return ParseMaybeEncryptedMetaDataAsync(footer_buffer,
+ std::move(metadata_buffer),
+ footer_read_size, metadata_len);
+ }
+ return source_->ReadAsync(metadata_start, metadata_len)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
+ return ParseMaybeEncryptedMetaDataAsync(footer_buffer, metadata_buffer,
+ footer_read_size, metadata_len);
+ });
+ });
+ }
+
+ // Continuation
+ ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
+ std::shared_ptr<::arrow::Buffer> footer_buffer,
+ std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
+ uint32_t metadata_len) {
+ // Parse the footer depending on encryption type
+ const bool is_encrypted_footer =
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
+ if (is_encrypted_footer) {
+ // Encrypted file with Encrypted footer.
+ std::pair<int64_t, uint32_t> read_size;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ read_size =
+ ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Read the actual footer
+ int64_t metadata_start = read_size.first;
+ metadata_len = read_size.second;
+ return source_->ReadAsync(metadata_start, metadata_len)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
+ // Continue and read the file footer
+ return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer);
+ });
+ }
+ return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
+ is_encrypted_footer);
+ }
+
+ // Continuation
+ ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
+ uint32_t metadata_len,
+ const bool is_encrypted_footer) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ const uint32_t read_metadata_len =
+ ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (is_encrypted_footer) {
+ // Nothing else to do here.
+ return ::arrow::Status::OK();
+    } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non-encrypted file.
+ if (file_decryption_properties != nullptr) {
+ if (!file_decryption_properties->plaintext_files_allowed()) {
+ throw ParquetException("Applying decryption properties on plaintext file");
+ }
+ }
+ } else {
+ // Encrypted file with plaintext footer mode.
+ ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+ return ::arrow::Status::OK();
+ }
+
+ private:
+ std::shared_ptr<ArrowInputFile> source_;
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
+ int64_t source_size_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+ ReaderProperties properties_;
+
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+
+ // \return The true length of the metadata in bytes
+ uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr<Buffer>& footer_buffer,
+ const uint32_t metadata_len);
+
+ std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties,
+ EncryptionAlgorithm& algo);
+
+ void ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ FileDecryptionProperties* file_decryption_properties,
+ const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
+ uint32_t read_metadata_len);
+
+ // \return The position and size of the actual footer
+ std::pair<int64_t, uint32_t> ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+ const std::shared_ptr<Buffer>& crypto_metadata_buffer, uint32_t footer_len);
+};
+
+uint32_t SerializedFile::ParseUnencryptedFileMetadata(
+ const std::shared_ptr<Buffer>& metadata_buffer, const uint32_t metadata_len) {
+ if (metadata_buffer->size() != metadata_len) {
+ throw ParquetException("Failed reading metadata buffer (requested " +
+ std::to_string(metadata_len) + " bytes but got " +
+ std::to_string(metadata_buffer->size()) + " bytes)");
+ }
+ uint32_t read_metadata_len = metadata_len;
+ // The encrypted read path falls through to here, so pass in the decryptor
+ file_metadata_ =
+ FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, file_decryptor_);
+ return read_metadata_len;
+}
+
+std::pair<int64_t, uint32_t>
+SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+ const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer,
+ // footer_len is the combined length of the footer metadata and the crypto metadata
+ const uint32_t footer_len) {
+ // Encrypted file with encrypted footer.
+ // Check that crypto_metadata_buffer contains the entire footer.
+ if (crypto_metadata_buffer->size() != footer_len) {
+ throw ParquetException("Failed reading encrypted metadata buffer (requested " +
+ std::to_string(footer_len) + " bytes but got " +
+ std::to_string(crypto_metadata_buffer->size()) + " bytes)");
+ }
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (file_decryption_properties == nullptr) {
+ throw ParquetException(
+ "Could not read encrypted metadata, no decryption properties found in reader's properties");
+ }
+ uint32_t crypto_metadata_len = footer_len;
+ std::shared_ptr<FileCryptoMetaData> file_crypto_metadata =
+ FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len);
+ // Handle AAD prefix
+ EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm();
+ std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+ file_decryptor_ = std::make_shared<InternalFileDecryptor>(
+ file_decryption_properties, file_aad, algo.algorithm,
+ file_crypto_metadata->key_metadata(), properties_.memory_pool());
+
+ int64_t metadata_offset = source_size_ - kFooterSize - footer_len + crypto_metadata_len;
+ uint32_t metadata_len = footer_len - crypto_metadata_len;
+ return std::make_pair(metadata_offset, metadata_len);
+}
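+
+// Footer layout that the offset arithmetic above assumes, for a file with an
+// encrypted footer ("PARE" magic); this matches what the writer emits in
+// CloseEncryptedFile() further below:
+//
+//   [column chunks ...]
+//   [file crypto metadata]      crypto_metadata_len bytes
+//   [encrypted file metadata]   footer_len - crypto_metadata_len bytes
+//   [combined footer length]    4-byte little-endian uint32
+//   ["PARE" magic]              4 bytes
+//
+// kFooterSize covers the length field plus the magic, so the encrypted
+// metadata starts at source_size_ - kFooterSize - footer_len + crypto_metadata_len.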
+
+void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ FileDecryptionProperties* file_decryption_properties,
+ const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
+ uint32_t read_metadata_len) {
+ // Providing decryption properties in plaintext-footer mode is not mandatory;
+ // for example, a legacy reader can read the plaintext footer without them.
+ if (file_decryption_properties != nullptr) {
+ EncryptionAlgorithm algo = file_metadata_->encryption_algorithm();
+ // Handle AAD prefix
+ std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+ file_decryptor_ = std::make_shared<InternalFileDecryptor>(
+ file_decryption_properties, file_aad, algo.algorithm,
+ file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
+ // set the InternalFileDecryptor in the metadata as well, as it's used
+ // for signature verification and for ColumnChunkMetaData creation.
+ file_metadata_->set_file_decryptor(file_decryptor_);
+
+ if (file_decryption_properties->check_plaintext_footer_integrity()) {
+ if (metadata_len - read_metadata_len !=
+ (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Failed reading metadata for encryption signature (requested ",
+ parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
+ " bytes but have ", metadata_len - read_metadata_len, " bytes)");
+ }
+
+ if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet crypto signature verification failed");
+ }
+ }
+ }
+}
+
+std::string SerializedFile::HandleAadPrefix(
+ FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) {
+ std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix();
+ std::string aad_prefix = aad_prefix_in_properties;
+ bool file_has_aad_prefix = !algo.aad.aad_prefix.empty();
+ std::string aad_prefix_in_file = algo.aad.aad_prefix;
+
+ if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) {
+ throw ParquetException(
+ "AAD prefix used for file encryption, "
+ "but not stored in file and not supplied "
+ "in decryption properties");
+ }
+
+ if (file_has_aad_prefix) {
+ if (!aad_prefix_in_properties.empty()) {
+ if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) {
+ throw ParquetException(
+ "AAD Prefix in file and in properties "
+ "is not the same");
+ }
+ }
+ aad_prefix = aad_prefix_in_file;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
+ file_decryption_properties->aad_prefix_verifier();
+ if (aad_prefix_verifier != nullptr) aad_prefix_verifier->Verify(aad_prefix);
+ } else {
+ if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) {
+ throw ParquetException(
+ "AAD Prefix set in decryption properties, but was not used "
+ "for file encryption");
+ }
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
+ file_decryption_properties->aad_prefix_verifier();
+ if (aad_prefix_verifier != nullptr) {
+ throw ParquetException(
+ "AAD Prefix Verifier is set, but AAD Prefix not found in file");
+ }
+ }
+ return aad_prefix + algo.aad.aad_file_unique;
+}
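+
+// Decision table implemented by HandleAadPrefix above:
+//
+//   prefix in file | prefix in properties | outcome
+//   ---------------+----------------------+--------------------------------------
+//   yes            | yes, equal           | file prefix used
+//   yes            | yes, different       | throws (prefix mismatch)
+//   yes            | no                   | file prefix used
+//   no             | yes                  | supplied prefix used only if the file
+//                  |                      | requested one; throws otherwise
+//   no             | no                   | throws if the file requested a
+//                  |                      | supplied prefix (supply_aad_prefix)
+//
+// A configured AADPrefixVerifier is invoked on the file's prefix, and throws
+// if the file stores no prefix at all. The returned file AAD is the resolved
+// prefix concatenated with algo.aad.aad_file_unique.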
+
+// ----------------------------------------------------------------------
+// ParquetFileReader public API
+
+ParquetFileReader::ParquetFileReader() {}
+
+ParquetFileReader::~ParquetFileReader() {
+ try {
+ Close();
+ } catch (...) {
+ }
+}
+
+// Open the file. If no metadata is passed, it is parsed from the footer of
+// the file
+std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
+ std::shared_ptr<ArrowInputFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ std::unique_ptr<ParquetFileReader::Contents> result(
+ new SerializedFile(std::move(source), props));
+
+ // Cast to SerializedFile to access private methods not exposed on Contents
+ SerializedFile* file = static_cast<SerializedFile*>(result.get());
+
+ if (metadata == nullptr) {
+ // Validates magic bytes, parses metadata, and initializes the SchemaDescriptor
+ file->ParseMetaData();
+ } else {
+ file->set_metadata(std::move(metadata));
+ }
+
+ return result;
+}
+
+::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
+ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
+ const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ std::unique_ptr<ParquetFileReader::Contents> result(
+ new SerializedFile(std::move(source), props));
+ SerializedFile* file = static_cast<SerializedFile*>(result.get());
+ if (metadata == nullptr) {
+ // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
+ struct {
+ ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
+ return std::move(result);
+ }
+
+ std::unique_ptr<ParquetFileReader::Contents> result;
+ } continuation;
+ continuation.result = std::move(result);
+ return file->ParseMetaDataAsync().Then(std::move(continuation));
+ } else {
+ file->set_metadata(std::move(metadata));
+ return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
+ std::move(result));
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+std::unique_ptr<ParquetFileReader> ParquetFileReader::Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ auto contents = SerializedFile::Open(std::move(source), props, std::move(metadata));
+ std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
+ result->Open(std::move(contents));
+ return result;
+}
+
+std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
+ const std::string& path, bool memory_map, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ std::shared_ptr<::arrow::io::RandomAccessFile> source;
+ if (memory_map) {
+ PARQUET_ASSIGN_OR_THROW(
+ source, ::arrow::io::MemoryMappedFile::Open(path, ::arrow::io::FileMode::READ));
+ } else {
+ PARQUET_ASSIGN_OR_THROW(source,
+ ::arrow::io::ReadableFile::Open(path, props.memory_pool()));
+ }
+
+ return Open(std::move(source), props, std::move(metadata));
+}
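+
+// Minimal usage sketch (illustrative only; "example.parquet" is a placeholder
+// path, not part of this file):
+//
+//   std::unique_ptr<ParquetFileReader> reader =
+//       ParquetFileReader::OpenFile("example.parquet", /*memory_map=*/false);
+//   std::shared_ptr<FileMetaData> file_metadata = reader->metadata();
+//   for (int i = 0; i < file_metadata->num_row_groups(); ++i) {
+//     std::shared_ptr<RowGroupReader> row_group = reader->RowGroup(i);
+//     // Per-column readers come from row_group->Column(j).
+//   }
+//   reader->Close();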
+
+::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
+ // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
+ auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
+ fut.AddCallback([fut, completed](
+ const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
+ contents) mutable {
+ if (!contents.ok()) {
+ completed.MarkFinished(contents.status());
+ return;
+ }
+ std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
+ result->Open(fut.MoveResult().MoveValueUnsafe());
+ completed.MarkFinished(std::move(result));
+ });
+ return completed;
+ END_PARQUET_CATCH_EXCEPTIONS
+}
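+
+// Async usage sketch (illustrative): errors surface through the Result passed
+// to the callback rather than as exceptions, as documented above.
+//
+//   ParquetFileReader::OpenAsync(source).AddCallback(
+//       [](const ::arrow::Result<std::unique_ptr<ParquetFileReader>>& maybe) {
+//         if (!maybe.ok()) return;  // inspect maybe.status()
+//         const std::unique_ptr<ParquetFileReader>& reader = maybe.ValueOrDie();
+//         // ... reader->metadata(), reader->RowGroup(i), ...
+//       });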
+
+void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) {
+ contents_ = std::move(contents);
+}
+
+void ParquetFileReader::Close() {
+ if (contents_) {
+ contents_->Close();
+ }
+}
+
+std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
+ return contents_->metadata();
+}
+
+std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
+ if (i >= metadata()->num_row_groups()) {
+ std::stringstream ss;
+ ss << "Trying to read row group " << i << " but file only has "
+ << metadata()->num_row_groups() << " row groups";
+ throw ParquetException(ss.str());
+ }
+ return contents_->GetRowGroup(i);
+}
+
+void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options) {
+ // Access private methods here
+ SerializedFile* file =
+ ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
+ file->PreBuffer(row_groups, column_indices, ctx, options);
+}
+
+::arrow::Future<> ParquetFileReader::WhenBuffered(
+ const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
+ // Access private methods here
+ SerializedFile* file =
+ ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
+ return file->WhenBuffered(row_groups, column_indices);
+}
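+
+// Usage sketch (illustrative): cache row group 0, columns 0 and 1, then block
+// until the cache is populated. CacheOptions::Defaults() is Arrow's stock
+// configuration; the IOContext can carry a custom memory pool or executor.
+//
+//   reader->PreBuffer({0}, {0, 1}, ::arrow::io::IOContext(),
+//                     ::arrow::io::CacheOptions::Defaults());
+//   reader->WhenBuffered({0}, {0, 1}).Wait();
+//   // Subsequent reads of those column chunks are served from memory.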
+
+// ----------------------------------------------------------------------
+// File metadata helpers
+
+std::shared_ptr<FileMetaData> ReadMetaData(
+ const std::shared_ptr<::arrow::io::RandomAccessFile>& source) {
+ return ParquetFileReader::Open(source)->metadata();
+}
+
+// ----------------------------------------------------------------------
+// File scanner for performance testing
+
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+ ParquetFileReader* reader) {
+ std::vector<int16_t> rep_levels(column_batch_size);
+ std::vector<int16_t> def_levels(column_batch_size);
+
+ int num_columns = static_cast<int>(columns.size());
+
+ // If columns are not specified explicitly, scan all columns
+ if (columns.empty()) {
+ num_columns = reader->metadata()->num_columns();
+ columns.resize(num_columns);
+ for (int i = 0; i < num_columns; i++) {
+ columns[i] = i;
+ }
+ }
+
+ std::vector<int64_t> total_rows(num_columns, 0);
+
+ for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+ auto group_reader = reader->RowGroup(r);
+ int col = 0;
+ for (auto i : columns) {
+ std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+ size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+ std::vector<uint8_t> values(column_batch_size * value_byte_size);
+
+ int64_t values_read = 0;
+ while (col_reader->HasNext()) {
+ int64_t levels_read =
+ ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(),
+ values.data(), &values_read, col_reader.get());
+ if (col_reader->descr()->max_repetition_level() > 0) {
+ for (int64_t i = 0; i < levels_read; i++) {
+ if (rep_levels[i] == 0) {
+ total_rows[col]++;
+ }
+ }
+ } else {
+ total_rows[col] += levels_read;
+ }
+ }
+ col++;
+ }
+ }
+
+ for (int i = 1; i < num_columns; ++i) {
+ if (total_rows[0] != total_rows[i]) {
+ throw ParquetException("Parquet error: Total rows among columns do not match");
+ }
+ }
+
+ return total_rows[0];
+}
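+
+// Usage sketch (illustrative): scan every column, 256 values at a time, and
+// report the number of semantic rows ("example.parquet" is a placeholder):
+//
+//   std::unique_ptr<ParquetFileReader> reader =
+//       ParquetFileReader::OpenFile("example.parquet");
+//   int64_t num_rows = ScanFileContents(/*columns=*/{},
+//                                       /*column_batch_size=*/256, reader.get());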
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
index 0fc84054939..a6358684250 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
@@ -1,188 +1,188 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/io/caching.h"
-#include "arrow/util/type_fwd.h"
-#include "parquet/metadata.h" // IWYU pragma: keep
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-
-namespace parquet {
-
-class ColumnReader;
-class FileMetaData;
-class PageReader;
-class RowGroupMetaData;
-
-class PARQUET_EXPORT RowGroupReader {
- public:
- // Forward declare a virtual class 'Contents' to aid dependency injection and more
- // easily create test fixtures
- // An implementation of the Contents class is defined in the .cc file
- struct Contents {
- virtual ~Contents() {}
- virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
- virtual const RowGroupMetaData* metadata() const = 0;
- virtual const ReaderProperties* properties() const = 0;
- };
-
- explicit RowGroupReader(std::unique_ptr<Contents> contents);
-
- // Returns the row group metadata
- const RowGroupMetaData* metadata() const;
-
- // Construct a ColumnReader for the indicated row group-relative
- // column. Ownership is shared with the RowGroupReader.
- std::shared_ptr<ColumnReader> Column(int i);
-
- // Construct a ColumnReader, trying to enable exposed encoding.
- //
- // For dictionary encoding, currently we only support column chunks that are fully
- // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
- // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
- // encoding will not be exposed.
- //
- // The returned column reader provides an API GetExposedEncoding() for the
- // users to check the exposed encoding and determine how to read the batches.
- //
- // \note API EXPERIMENTAL
- std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
- int i, ExposedEncoding encoding_to_expose);
-
- std::unique_ptr<PageReader> GetColumnPageReader(int i);
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
-};
-
-class PARQUET_EXPORT ParquetFileReader {
- public:
- // Declare a virtual class 'Contents' to aid dependency injection and more
- // easily create test fixtures
- // An implementation of the Contents class is defined in the .cc file
- struct PARQUET_EXPORT Contents {
- static std::unique_ptr<Contents> Open(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- virtual ~Contents() = default;
- // Perform any cleanup associated with the file contents
- virtual void Close() = 0;
- virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
- virtual std::shared_ptr<FileMetaData> metadata() const = 0;
- };
-
- ParquetFileReader();
- ~ParquetFileReader();
-
- // Create a file reader instance from an Arrow file object. Thread-safety is
- // the responsibility of the file implementation
- static std::unique_ptr<ParquetFileReader> Open(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- // API Convenience to open a serialized Parquet file on disk, using Arrow IO
- // interfaces.
- static std::unique_ptr<ParquetFileReader> OpenFile(
- const std::string& path, bool memory_map = true,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- // Asynchronously open a file reader from an Arrow file object.
- // Does not throw - all errors are reported through the Future.
- static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- void Open(std::unique_ptr<Contents> contents);
- void Close();
-
- // The RowGroupReader is owned by the FileReader
- std::shared_ptr<RowGroupReader> RowGroup(int i);
-
- // Returns the file metadata. Only one instance is ever created
- std::shared_ptr<FileMetaData> metadata() const;
-
- /// Pre-buffer the specified column indices in all row groups.
- ///
- /// Readers can optionally call this to cache the necessary slices
- /// of the file in-memory before deserialization. Arrow readers can
- /// automatically do this via an option. This is intended to
- /// increase performance when reading from high-latency filesystems
- /// (e.g. Amazon S3).
- ///
- /// After calling this, creating readers for row groups/column
- /// indices that were not buffered may fail. Creating multiple
- /// readers for a subset of the buffered regions is
- /// acceptable. This may be called again to buffer a different set
- /// of row groups/columns.
- ///
- /// If memory usage is a concern, note that data will remain
- /// buffered in memory until either \a PreBuffer() is called again,
- /// or the reader itself is destructed. Reading - and buffering -
- /// only one row group at a time may be useful.
- ///
- /// This method may throw.
- void PreBuffer(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- const ::arrow::io::IOContext& ctx,
- const ::arrow::io::CacheOptions& options);
-
- /// Wait for the specified row groups and column indices to be pre-buffered.
- ///
- /// After the returned Future completes, reading the specified row
- /// groups/columns will not block.
- ///
- /// PreBuffer must be called first. This method does not throw.
- ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices) const;
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
-};
-
-// Read only Parquet file metadata
-std::shared_ptr<FileMetaData> PARQUET_EXPORT
-ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
-
-/// \brief Scan all values in file. Useful for performance testing
- /// \param[in] columns the column numbers to scan. If empty, scans all columns
-/// \param[in] column_batch_size number of values to read at a time when scanning column
-/// \param[in] reader a ParquetFileReader instance
-/// \return number of semantic rows in file
-PARQUET_EXPORT
-int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
- ParquetFileReader* reader);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/caching.h"
+#include "arrow/util/type_fwd.h"
+#include "parquet/metadata.h" // IWYU pragma: keep
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+
+class ColumnReader;
+class FileMetaData;
+class PageReader;
+class RowGroupMetaData;
+
+class PARQUET_EXPORT RowGroupReader {
+ public:
+ // Forward declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct Contents {
+ virtual ~Contents() {}
+ virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
+ virtual const RowGroupMetaData* metadata() const = 0;
+ virtual const ReaderProperties* properties() const = 0;
+ };
+
+ explicit RowGroupReader(std::unique_ptr<Contents> contents);
+
+ // Returns the row group metadata
+ const RowGroupMetaData* metadata() const;
+
+ // Construct a ColumnReader for the indicated row group-relative
+ // column. Ownership is shared with the RowGroupReader.
+ std::shared_ptr<ColumnReader> Column(int i);
+
+ // Construct a ColumnReader, trying to enable exposed encoding.
+ //
+ // For dictionary encoding, currently we only support column chunks that are fully
+ // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
+ // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
+ // encoding will not be exposed.
+ //
+ // The returned column reader provides an API GetExposedEncoding() for the
+ // users to check the exposed encoding and determine how to read the batches.
+ //
+ // \note API EXPERIMENTAL
+ std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
+ int i, ExposedEncoding encoding_to_expose);
+
+ std::unique_ptr<PageReader> GetColumnPageReader(int i);
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+class PARQUET_EXPORT ParquetFileReader {
+ public:
+ // Declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct PARQUET_EXPORT Contents {
+ static std::unique_ptr<Contents> Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ virtual ~Contents() = default;
+ // Perform any cleanup associated with the file contents
+ virtual void Close() = 0;
+ virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
+ virtual std::shared_ptr<FileMetaData> metadata() const = 0;
+ };
+
+ ParquetFileReader();
+ ~ParquetFileReader();
+
+ // Create a file reader instance from an Arrow file object. Thread-safety is
+ // the responsibility of the file implementation
+ static std::unique_ptr<ParquetFileReader> Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ // API Convenience to open a serialized Parquet file on disk, using Arrow IO
+ // interfaces.
+ static std::unique_ptr<ParquetFileReader> OpenFile(
+ const std::string& path, bool memory_map = true,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ // Asynchronously open a file reader from an Arrow file object.
+ // Does not throw - all errors are reported through the Future.
+ static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ void Open(std::unique_ptr<Contents> contents);
+ void Close();
+
+ // The RowGroupReader is owned by the FileReader
+ std::shared_ptr<RowGroupReader> RowGroup(int i);
+
+ // Returns the file metadata. Only one instance is ever created
+ std::shared_ptr<FileMetaData> metadata() const;
+
+ /// Pre-buffer the specified column indices in all row groups.
+ ///
+ /// Readers can optionally call this to cache the necessary slices
+ /// of the file in-memory before deserialization. Arrow readers can
+ /// automatically do this via an option. This is intended to
+ /// increase performance when reading from high-latency filesystems
+ /// (e.g. Amazon S3).
+ ///
+ /// After calling this, creating readers for row groups/column
+ /// indices that were not buffered may fail. Creating multiple
+ /// readers for a subset of the buffered regions is
+ /// acceptable. This may be called again to buffer a different set
+ /// of row groups/columns.
+ ///
+ /// If memory usage is a concern, note that data will remain
+ /// buffered in memory until either \a PreBuffer() is called again,
+ /// or the reader itself is destructed. Reading - and buffering -
+ /// only one row group at a time may be useful.
+ ///
+ /// This method may throw.
+ void PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options);
+
+ /// Wait for the specified row groups and column indices to be pre-buffered.
+ ///
+ /// After the returned Future completes, reading the specified row
+ /// groups/columns will not block.
+ ///
+ /// PreBuffer must be called first. This method does not throw.
+ ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+// Read only Parquet file metadata
+std::shared_ptr<FileMetaData> PARQUET_EXPORT
+ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
+
+/// \brief Scan all values in file. Useful for performance testing
+ /// \param[in] columns the column numbers to scan. If empty, scans all columns
+/// \param[in] column_batch_size number of values to read at a time when scanning column
+/// \param[in] reader a ParquetFileReader instance
+/// \return number of semantic rows in file
+PARQUET_EXPORT
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+ ParquetFileReader* reader);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
index deac9586e5a..a4c824c423b 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
@@ -1,547 +1,547 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/file_writer.h"
-
-#include <cstddef>
-#include <ostream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "parquet/column_writer.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-using arrow::MemoryPool;
-
-using parquet::schema::GroupNode;
-
-namespace parquet {
-
-// ----------------------------------------------------------------------
-// RowGroupWriter public API
-
-RowGroupWriter::RowGroupWriter(std::unique_ptr<Contents> contents)
- : contents_(std::move(contents)) {}
-
-void RowGroupWriter::Close() {
- if (contents_) {
- contents_->Close();
- }
-}
-
-ColumnWriter* RowGroupWriter::NextColumn() { return contents_->NextColumn(); }
-
-ColumnWriter* RowGroupWriter::column(int i) { return contents_->column(i); }
-
-int64_t RowGroupWriter::total_compressed_bytes() const {
- return contents_->total_compressed_bytes();
-}
-
-int64_t RowGroupWriter::total_bytes_written() const {
- return contents_->total_bytes_written();
-}
-
-int RowGroupWriter::current_column() { return contents_->current_column(); }
-
-int RowGroupWriter::num_columns() const { return contents_->num_columns(); }
-
-int64_t RowGroupWriter::num_rows() const { return contents_->num_rows(); }
-
-inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) {
- std::stringstream ss;
- ss << "Column " << col << " had " << curr << " while previous column had " << prev;
- throw ParquetException(ss.str());
-}
-
-// ----------------------------------------------------------------------
-// RowGroupSerializer
-
-// RowGroupWriter::Contents implementation for the Parquet file specification
-class RowGroupSerializer : public RowGroupWriter::Contents {
- public:
- RowGroupSerializer(std::shared_ptr<ArrowOutputStream> sink,
- RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal,
- const WriterProperties* properties, bool buffered_row_group = false,
- InternalFileEncryptor* file_encryptor = nullptr)
- : sink_(std::move(sink)),
- metadata_(metadata),
- properties_(properties),
- total_bytes_written_(0),
- closed_(false),
- row_group_ordinal_(row_group_ordinal),
- next_column_index_(0),
- num_rows_(0),
- buffered_row_group_(buffered_row_group),
- file_encryptor_(file_encryptor) {
- if (buffered_row_group) {
- InitColumns();
- } else {
- column_writers_.push_back(nullptr);
- }
- }
-
- int num_columns() const override { return metadata_->num_columns(); }
-
- int64_t num_rows() const override {
- CheckRowsWritten();
- // CheckRowsWritten ensures num_rows_ is set correctly
- return num_rows_;
- }
-
- ColumnWriter* NextColumn() override {
- if (buffered_row_group_) {
- throw ParquetException(
- "NextColumn() is not supported when a RowGroup is written by size");
- }
-
- if (column_writers_[0]) {
- CheckRowsWritten();
- }
-
- // Throws an error if more columns are being written
- auto col_meta = metadata_->NextColumnChunk();
-
- if (column_writers_[0]) {
- total_bytes_written_ += column_writers_[0]->Close();
- }
-
- ++next_column_index_;
-
- const auto& path = col_meta->descr()->path();
- auto meta_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
- : nullptr;
- auto data_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
- : nullptr;
- std::unique_ptr<PageWriter> pager = PageWriter::Open(
- sink_, properties_->compression(path), properties_->compression_level(path),
- col_meta, row_group_ordinal_, static_cast<int16_t>(next_column_index_ - 1),
- properties_->memory_pool(), false, meta_encryptor, data_encryptor);
- column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_);
- return column_writers_[0].get();
- }
-
- ColumnWriter* column(int i) override {
- if (!buffered_row_group_) {
- throw ParquetException(
- "column() is only supported when a BufferedRowGroup is being written");
- }
-
- if (i >= 0 && i < static_cast<int>(column_writers_.size())) {
- return column_writers_[i].get();
- }
- return nullptr;
- }
-
- int current_column() const override { return metadata_->current_column(); }
-
- int64_t total_compressed_bytes() const override {
- int64_t total_compressed_bytes = 0;
- for (size_t i = 0; i < column_writers_.size(); i++) {
- if (column_writers_[i]) {
- total_compressed_bytes += column_writers_[i]->total_compressed_bytes();
- }
- }
- return total_compressed_bytes;
- }
-
- int64_t total_bytes_written() const override {
- int64_t total_bytes_written = 0;
- for (size_t i = 0; i < column_writers_.size(); i++) {
- if (column_writers_[i]) {
- total_bytes_written += column_writers_[i]->total_bytes_written();
- }
- }
- return total_bytes_written;
- }
-
- void Close() override {
- if (!closed_) {
- closed_ = true;
- CheckRowsWritten();
-
- for (size_t i = 0; i < column_writers_.size(); i++) {
- if (column_writers_[i]) {
- total_bytes_written_ += column_writers_[i]->Close();
- column_writers_[i].reset();
- }
- }
-
- column_writers_.clear();
-
- // Ensures all columns have been written
- metadata_->set_num_rows(num_rows_);
- metadata_->Finish(total_bytes_written_, row_group_ordinal_);
- }
- }
-
- private:
- std::shared_ptr<ArrowOutputStream> sink_;
- mutable RowGroupMetaDataBuilder* metadata_;
- const WriterProperties* properties_;
- int64_t total_bytes_written_;
- bool closed_;
- int16_t row_group_ordinal_;
- int next_column_index_;
- mutable int64_t num_rows_;
- bool buffered_row_group_;
- InternalFileEncryptor* file_encryptor_;
-
- void CheckRowsWritten() const {
- // Verify the row count when columns are written one at a time
- if (!buffered_row_group_ && column_writers_.size() > 0 && column_writers_[0]) {
- int64_t current_col_rows = column_writers_[0]->rows_written();
- if (num_rows_ == 0) {
- num_rows_ = current_col_rows;
- } else if (num_rows_ != current_col_rows) {
- ThrowRowsMisMatchError(next_column_index_, current_col_rows, num_rows_);
- }
- } else if (buffered_row_group_ &&
- column_writers_.size() > 0) { // when buffered_row_group = true
- int64_t current_col_rows = column_writers_[0]->rows_written();
- for (int i = 1; i < static_cast<int>(column_writers_.size()); i++) {
- int64_t current_col_rows_i = column_writers_[i]->rows_written();
- if (current_col_rows != current_col_rows_i) {
- ThrowRowsMisMatchError(i, current_col_rows_i, current_col_rows);
- }
- }
- num_rows_ = current_col_rows;
- }
- }
-
- void InitColumns() {
- for (int i = 0; i < num_columns(); i++) {
- auto col_meta = metadata_->NextColumnChunk();
- const auto& path = col_meta->descr()->path();
- auto meta_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
- : nullptr;
- auto data_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
- : nullptr;
- std::unique_ptr<PageWriter> pager = PageWriter::Open(
- sink_, properties_->compression(path), properties_->compression_level(path),
- col_meta, static_cast<int16_t>(row_group_ordinal_),
- static_cast<int16_t>(next_column_index_++), properties_->memory_pool(),
- buffered_row_group_, meta_encryptor, data_encryptor);
- column_writers_.push_back(
- ColumnWriter::Make(col_meta, std::move(pager), properties_));
- }
- }
-
- std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
-};
-
-// ----------------------------------------------------------------------
-// FileSerializer
-
-// An implementation of ParquetFileWriter::Contents that deals with the Parquet
-// file structure, Thrift serialization, and other internal matters
-
-class FileSerializer : public ParquetFileWriter::Contents {
- public:
- static std::unique_ptr<ParquetFileWriter::Contents> Open(
- std::shared_ptr<ArrowOutputStream> sink, std::shared_ptr<GroupNode> schema,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
- std::unique_ptr<ParquetFileWriter::Contents> result(
- new FileSerializer(std::move(sink), std::move(schema), std::move(properties),
- std::move(key_value_metadata)));
-
- return result;
- }
-
- void Close() override {
- if (is_open_) {
- // If any functions here raise an exception, we set is_open_ to false
- // so that this does not get called again (possibly causing segfault)
- is_open_ = false;
- if (row_group_writer_) {
- num_rows_ += row_group_writer_->num_rows();
- row_group_writer_->Close();
- }
- row_group_writer_.reset();
-
- // Write magic bytes and metadata
- auto file_encryption_properties = properties_->file_encryption_properties();
-
- if (file_encryption_properties == nullptr) { // Non-encrypted file.
- file_metadata_ = metadata_->Finish();
- WriteFileMetaData(*file_metadata_, sink_.get());
- } else { // Encrypted file
- CloseEncryptedFile(file_encryption_properties);
- }
- }
- }
-
- int num_columns() const override { return schema_.num_columns(); }
-
- int num_row_groups() const override { return num_row_groups_; }
-
- int64_t num_rows() const override { return num_rows_; }
-
- const std::shared_ptr<WriterProperties>& properties() const override {
- return properties_;
- }
-
- RowGroupWriter* AppendRowGroup(bool buffered_row_group) {
- if (row_group_writer_) {
- row_group_writer_->Close();
- }
- num_row_groups_++;
- auto rg_metadata = metadata_->AppendRowGroup();
- std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
- sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1), properties_.get(),
- buffered_row_group, file_encryptor_.get()));
- row_group_writer_.reset(new RowGroupWriter(std::move(contents)));
- return row_group_writer_.get();
- }
-
- RowGroupWriter* AppendRowGroup() override { return AppendRowGroup(false); }
-
- RowGroupWriter* AppendBufferedRowGroup() override { return AppendRowGroup(true); }
-
- ~FileSerializer() override {
- try {
- Close();
- } catch (...) {
- }
- }
-
- private:
- FileSerializer(std::shared_ptr<ArrowOutputStream> sink,
- std::shared_ptr<GroupNode> schema,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : ParquetFileWriter::Contents(std::move(schema), std::move(key_value_metadata)),
- sink_(std::move(sink)),
- is_open_(true),
- properties_(std::move(properties)),
- num_row_groups_(0),
- num_rows_(0),
- metadata_(FileMetaDataBuilder::Make(&schema_, properties_, key_value_metadata_)) {
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
- if (position == 0) {
- StartFile();
- } else {
- throw ParquetException("Appending to file not implemented.");
- }
- }
-
- void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) {
- // Encrypted file with encrypted footer
- if (file_encryption_properties->encrypted_footer()) {
- // encrypted footer
- file_metadata_ = metadata_->Finish();
-
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
- uint64_t metadata_start = static_cast<uint64_t>(position);
- auto crypto_metadata = metadata_->GetCryptoMetaData();
- WriteFileCryptoMetaData(*crypto_metadata, sink_.get());
-
- auto footer_encryptor = file_encryptor_->GetFooterEncryptor();
- WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true);
- PARQUET_ASSIGN_OR_THROW(position, sink_->Tell());
- uint32_t footer_and_crypto_len = static_cast<uint32_t>(position - metadata_start);
- PARQUET_THROW_NOT_OK(
- sink_->Write(reinterpret_cast<uint8_t*>(&footer_and_crypto_len), 4));
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
- } else { // Encrypted file with plaintext footer
- file_metadata_ = metadata_->Finish();
- auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor();
- WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor,
- false);
- }
- if (file_encryptor_) {
- file_encryptor_->WipeOutEncryptionKeys();
- }
- }
-
- std::shared_ptr<ArrowOutputStream> sink_;
- bool is_open_;
- const std::shared_ptr<WriterProperties> properties_;
- int num_row_groups_;
- int64_t num_rows_;
- std::unique_ptr<FileMetaDataBuilder> metadata_;
- // Only one of the row group writers is active at a time
- std::unique_ptr<RowGroupWriter> row_group_writer_;
-
- std::unique_ptr<InternalFileEncryptor> file_encryptor_;
-
- void StartFile() {
- auto file_encryption_properties = properties_->file_encryption_properties();
- if (file_encryption_properties == nullptr) {
- // Unencrypted parquet files always start with PAR1
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
- } else {
- // Check that all columns in columnEncryptionProperties exist in the schema.
- auto encrypted_columns = file_encryption_properties->encrypted_columns();
- // If columnEncryptionProperties is empty, every column in the file schema
- // will be encrypted with the footer key.
- if (!encrypted_columns.empty()) {
- std::vector<std::string> column_path_vec;
- // First, save all column paths in schema.
- for (int i = 0; i < num_columns(); i++) {
- column_path_vec.push_back(schema_.Column(i)->path()->ToDotString());
- }
- // Check if column exists in schema.
- for (const auto& elem : encrypted_columns) {
- auto it = std::find(column_path_vec.begin(), column_path_vec.end(), elem.first);
- if (it == column_path_vec.end()) {
- std::stringstream ss;
- ss << "Encrypted column " + elem.first + " not in file schema";
- throw ParquetException(ss.str());
- }
- }
- }
-
- file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties,
- properties_->memory_pool()));
- if (file_encryption_properties->encrypted_footer()) {
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
- } else {
- // Encrypted file with plaintext footer mode.
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
- }
- }
- }
-};
-
-// ----------------------------------------------------------------------
-// ParquetFileWriter public API
-
-ParquetFileWriter::ParquetFileWriter() {}
-
-ParquetFileWriter::~ParquetFileWriter() {
- try {
- Close();
- } catch (...) {
- }
-}
-
-std::unique_ptr<ParquetFileWriter> ParquetFileWriter::Open(
- std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr<GroupNode> schema,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
- auto contents =
- FileSerializer::Open(std::move(sink), std::move(schema), std::move(properties),
- std::move(key_value_metadata));
- std::unique_ptr<ParquetFileWriter> result(new ParquetFileWriter());
- result->Open(std::move(contents));
- return result;
-}
-
-void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
- // Write MetaData
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
- uint32_t metadata_len = static_cast<uint32_t>(position);
-
- file_metadata.WriteTo(sink);
- PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
- metadata_len = static_cast<uint32_t>(position) - metadata_len;
-
- // Write Footer
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
- PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
-}
-
-void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
- PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
- return WriteFileMetaData(file_metadata, sink);
-}
-
-void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
- ArrowOutputStream* sink,
- const std::shared_ptr<Encryptor>& encryptor,
- bool encrypt_footer) {
- if (encrypt_footer) { // Encrypted file with encrypted footer
- // encrypt and write to sink
- file_metadata.WriteTo(sink, encryptor);
- } else { // Encrypted file with plaintext footer mode.
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
- uint32_t metadata_len = static_cast<uint32_t>(position);
- file_metadata.WriteTo(sink, encryptor);
- PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
- metadata_len = static_cast<uint32_t>(position) - metadata_len;
-
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
- PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
- }
-}
-
-void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
- ArrowOutputStream* sink) {
- crypto_metadata.WriteTo(sink);
-}
-
-const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
-
-const ColumnDescriptor* ParquetFileWriter::descr(int i) const {
- return contents_->schema()->Column(i);
-}
-
-int ParquetFileWriter::num_columns() const { return contents_->num_columns(); }
-
-int64_t ParquetFileWriter::num_rows() const { return contents_->num_rows(); }
-
-int ParquetFileWriter::num_row_groups() const { return contents_->num_row_groups(); }
-
-const std::shared_ptr<const KeyValueMetadata>& ParquetFileWriter::key_value_metadata()
- const {
- return contents_->key_value_metadata();
-}
-
-const std::shared_ptr<FileMetaData> ParquetFileWriter::metadata() const {
- return file_metadata_;
-}
-
-void ParquetFileWriter::Open(std::unique_ptr<ParquetFileWriter::Contents> contents) {
- contents_ = std::move(contents);
-}
-
-void ParquetFileWriter::Close() {
- if (contents_) {
- contents_->Close();
- file_metadata_ = contents_->metadata();
- contents_.reset();
- }
-}
-
-RowGroupWriter* ParquetFileWriter::AppendRowGroup() {
- return contents_->AppendRowGroup();
-}
-
-RowGroupWriter* ParquetFileWriter::AppendBufferedRowGroup() {
- return contents_->AppendBufferedRowGroup();
-}
-
-RowGroupWriter* ParquetFileWriter::AppendRowGroup(int64_t num_rows) {
- return AppendRowGroup();
-}
-
-const std::shared_ptr<WriterProperties>& ParquetFileWriter::properties() const {
- return contents_->properties();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_writer.h"
+
+#include <cstddef>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/column_writer.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+using arrow::MemoryPool;
+
+using parquet::schema::GroupNode;
+
+namespace parquet {
+
+// ----------------------------------------------------------------------
+// RowGroupWriter public API
+
+RowGroupWriter::RowGroupWriter(std::unique_ptr<Contents> contents)
+ : contents_(std::move(contents)) {}
+
+void RowGroupWriter::Close() {
+ if (contents_) {
+ contents_->Close();
+ }
+}
+
+ColumnWriter* RowGroupWriter::NextColumn() { return contents_->NextColumn(); }
+
+ColumnWriter* RowGroupWriter::column(int i) { return contents_->column(i); }
+
+int64_t RowGroupWriter::total_compressed_bytes() const {
+ return contents_->total_compressed_bytes();
+}
+
+int64_t RowGroupWriter::total_bytes_written() const {
+ return contents_->total_bytes_written();
+}
+
+int RowGroupWriter::current_column() { return contents_->current_column(); }
+
+int RowGroupWriter::num_columns() const { return contents_->num_columns(); }
+
+int64_t RowGroupWriter::num_rows() const { return contents_->num_rows(); }
+
+inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) {
+ std::stringstream ss;
+ ss << "Column " << col << " had " << curr << " while previous column had " << prev;
+ throw ParquetException(ss.str());
+}
+
+// ----------------------------------------------------------------------
+// RowGroupSerializer
+
+// RowGroupWriter::Contents implementation for the Parquet file specification
+class RowGroupSerializer : public RowGroupWriter::Contents {
+ public:
+ RowGroupSerializer(std::shared_ptr<ArrowOutputStream> sink,
+ RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal,
+ const WriterProperties* properties, bool buffered_row_group = false,
+ InternalFileEncryptor* file_encryptor = nullptr)
+ : sink_(std::move(sink)),
+ metadata_(metadata),
+ properties_(properties),
+ total_bytes_written_(0),
+ closed_(false),
+ row_group_ordinal_(row_group_ordinal),
+ next_column_index_(0),
+ num_rows_(0),
+ buffered_row_group_(buffered_row_group),
+ file_encryptor_(file_encryptor) {
+ if (buffered_row_group) {
+ InitColumns();
+ } else {
+ column_writers_.push_back(nullptr);
+ }
+ }
+
+ int num_columns() const override { return metadata_->num_columns(); }
+
+ int64_t num_rows() const override {
+ CheckRowsWritten();
+ // CheckRowsWritten ensures num_rows_ is set correctly
+ return num_rows_;
+ }
+
+ ColumnWriter* NextColumn() override {
+ if (buffered_row_group_) {
+ throw ParquetException(
+ "NextColumn() is not supported when a RowGroup is written by size");
+ }
+
+ if (column_writers_[0]) {
+ CheckRowsWritten();
+ }
+
+ // Throws an error if more columns are being written
+ auto col_meta = metadata_->NextColumnChunk();
+
+ if (column_writers_[0]) {
+ total_bytes_written_ += column_writers_[0]->Close();
+ }
+
+ ++next_column_index_;
+
+ const auto& path = col_meta->descr()->path();
+ auto meta_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ : nullptr;
+ auto data_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ : nullptr;
+ std::unique_ptr<PageWriter> pager = PageWriter::Open(
+ sink_, properties_->compression(path), properties_->compression_level(path),
+ col_meta, row_group_ordinal_, static_cast<int16_t>(next_column_index_ - 1),
+ properties_->memory_pool(), false, meta_encryptor, data_encryptor);
+ column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_);
+ return column_writers_[0].get();
+ }
+
+ ColumnWriter* column(int i) override {
+ if (!buffered_row_group_) {
+ throw ParquetException(
+ "column() is only supported when a BufferedRowGroup is being written");
+ }
+
+ if (i >= 0 && i < static_cast<int>(column_writers_.size())) {
+ return column_writers_[i].get();
+ }
+ return nullptr;
+ }
+
+ int current_column() const override { return metadata_->current_column(); }
+
+ int64_t total_compressed_bytes() const override {
+ int64_t total_compressed_bytes = 0;
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_compressed_bytes += column_writers_[i]->total_compressed_bytes();
+ }
+ }
+ return total_compressed_bytes;
+ }
+
+ int64_t total_bytes_written() const override {
+ int64_t total_bytes_written = 0;
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_bytes_written += column_writers_[i]->total_bytes_written();
+ }
+ }
+ return total_bytes_written;
+ }
+
+ void Close() override {
+ if (!closed_) {
+ closed_ = true;
+ CheckRowsWritten();
+
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_bytes_written_ += column_writers_[i]->Close();
+ column_writers_[i].reset();
+ }
+ }
+
+ column_writers_.clear();
+
+ // Ensures all columns have been written
+ metadata_->set_num_rows(num_rows_);
+ metadata_->Finish(total_bytes_written_, row_group_ordinal_);
+ }
+ }
+
+ private:
+ std::shared_ptr<ArrowOutputStream> sink_;
+ mutable RowGroupMetaDataBuilder* metadata_;
+ const WriterProperties* properties_;
+ int64_t total_bytes_written_;
+ bool closed_;
+ int16_t row_group_ordinal_;
+ int next_column_index_;
+ mutable int64_t num_rows_;
+ bool buffered_row_group_;
+ InternalFileEncryptor* file_encryptor_;
+
+ void CheckRowsWritten() const {
+ // Verify the row count when columns are written one at a time
+ if (!buffered_row_group_ && column_writers_.size() > 0 && column_writers_[0]) {
+ int64_t current_col_rows = column_writers_[0]->rows_written();
+ if (num_rows_ == 0) {
+ num_rows_ = current_col_rows;
+ } else if (num_rows_ != current_col_rows) {
+ ThrowRowsMisMatchError(next_column_index_, current_col_rows, num_rows_);
+ }
+ } else if (buffered_row_group_ &&
+ column_writers_.size() > 0) { // when buffered_row_group = true
+ int64_t current_col_rows = column_writers_[0]->rows_written();
+ for (int i = 1; i < static_cast<int>(column_writers_.size()); i++) {
+ int64_t current_col_rows_i = column_writers_[i]->rows_written();
+ if (current_col_rows != current_col_rows_i) {
+ ThrowRowsMisMatchError(i, current_col_rows_i, current_col_rows);
+ }
+ }
+ num_rows_ = current_col_rows;
+ }
+ }
+
+ void InitColumns() {
+ for (int i = 0; i < num_columns(); i++) {
+ auto col_meta = metadata_->NextColumnChunk();
+ const auto& path = col_meta->descr()->path();
+ auto meta_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ : nullptr;
+ auto data_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ : nullptr;
+ std::unique_ptr<PageWriter> pager = PageWriter::Open(
+ sink_, properties_->compression(path), properties_->compression_level(path),
+ col_meta, static_cast<int16_t>(row_group_ordinal_),
+ static_cast<int16_t>(next_column_index_++), properties_->memory_pool(),
+ buffered_row_group_, meta_encryptor, data_encryptor);
+ column_writers_.push_back(
+ ColumnWriter::Make(col_meta, std::move(pager), properties_));
+ }
+ }
+
+ std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
+};
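+
+// Usage sketch (illustrative) of the column-at-a-time protocol the serializer
+// above enforces; `sink` and `schema` are assumed to exist, and
+// WriterProperties::Builder is Arrow's standard way to obtain properties:
+//
+//   std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(
+//       sink, schema, WriterProperties::Builder().build());
+//   RowGroupWriter* row_group = writer->AppendRowGroup();
+//   for (int i = 0; i < writer->num_columns(); ++i) {
+//     ColumnWriter* column = row_group->NextColumn();
+//     // Write the same number of rows to every column, or Close() throws.
+//   }
+//   writer->Close();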
+
+// ----------------------------------------------------------------------
+// FileSerializer
+
+// An implementation of ParquetFileWriter::Contents that deals with the Parquet
+// file structure, Thrift serialization, and other internal matters
+
+class FileSerializer : public ParquetFileWriter::Contents {
+ public:
+ static std::unique_ptr<ParquetFileWriter::Contents> Open(
+ std::shared_ptr<ArrowOutputStream> sink, std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ std::unique_ptr<ParquetFileWriter::Contents> result(
+ new FileSerializer(std::move(sink), std::move(schema), std::move(properties),
+ std::move(key_value_metadata)));
+
+ return result;
+ }
+
+ void Close() override {
+ if (is_open_) {
+ // If any functions here raise an exception, we set is_open_ to false
+ // so that this does not get called again (possibly causing segfault)
+ is_open_ = false;
+ if (row_group_writer_) {
+ num_rows_ += row_group_writer_->num_rows();
+ row_group_writer_->Close();
+ }
+ row_group_writer_.reset();
+
+ // Write magic bytes and metadata
+ auto file_encryption_properties = properties_->file_encryption_properties();
+
+ if (file_encryption_properties == nullptr) { // Non-encrypted file.
+ file_metadata_ = metadata_->Finish();
+ WriteFileMetaData(*file_metadata_, sink_.get());
+ } else { // Encrypted file
+ CloseEncryptedFile(file_encryption_properties);
+ }
+ }
+ }
+
+ int num_columns() const override { return schema_.num_columns(); }
+
+ int num_row_groups() const override { return num_row_groups_; }
+
+ int64_t num_rows() const override { return num_rows_; }
+
+ const std::shared_ptr<WriterProperties>& properties() const override {
+ return properties_;
+ }
+
+ RowGroupWriter* AppendRowGroup(bool buffered_row_group) {
+ if (row_group_writer_) {
+ row_group_writer_->Close();
+ }
+ num_row_groups_++;
+ auto rg_metadata = metadata_->AppendRowGroup();
+ std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
+ sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1), properties_.get(),
+ buffered_row_group, file_encryptor_.get()));
+ row_group_writer_.reset(new RowGroupWriter(std::move(contents)));
+ return row_group_writer_.get();
+ }
+
+ RowGroupWriter* AppendRowGroup() override { return AppendRowGroup(false); }
+
+ RowGroupWriter* AppendBufferedRowGroup() override { return AppendRowGroup(true); }
+
+ ~FileSerializer() override {
+ try {
+ Close();
+ } catch (...) {
+ }
+ }
+
+ private:
+ FileSerializer(std::shared_ptr<ArrowOutputStream> sink,
+ std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : ParquetFileWriter::Contents(std::move(schema), std::move(key_value_metadata)),
+ sink_(std::move(sink)),
+ is_open_(true),
+ properties_(std::move(properties)),
+ num_row_groups_(0),
+ num_rows_(0),
+ metadata_(FileMetaDataBuilder::Make(&schema_, properties_, key_value_metadata_)) {
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
+ if (position == 0) {
+ StartFile();
+ } else {
+ throw ParquetException("Appending to file not implemented.");
+ }
+ }
+
+ void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) {
+ // Encrypted file with encrypted footer
+ if (file_encryption_properties->encrypted_footer()) {
+ // encrypted footer
+ file_metadata_ = metadata_->Finish();
+
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
+ uint64_t metadata_start = static_cast<uint64_t>(position);
+ auto crypto_metadata = metadata_->GetCryptoMetaData();
+ WriteFileCryptoMetaData(*crypto_metadata, sink_.get());
+
+ auto footer_encryptor = file_encryptor_->GetFooterEncryptor();
+ WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true);
+ PARQUET_ASSIGN_OR_THROW(position, sink_->Tell());
+ uint32_t footer_and_crypto_len = static_cast<uint32_t>(position - metadata_start);
+ PARQUET_THROW_NOT_OK(
+ sink_->Write(reinterpret_cast<uint8_t*>(&footer_and_crypto_len), 4));
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+ } else { // Encrypted file with plaintext footer
+ file_metadata_ = metadata_->Finish();
+ auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor();
+ WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor,
+ false);
+ }
+ if (file_encryptor_) {
+ file_encryptor_->WipeOutEncryptionKeys();
+ }
+ }
+
+ std::shared_ptr<ArrowOutputStream> sink_;
+ bool is_open_;
+ const std::shared_ptr<WriterProperties> properties_;
+ int num_row_groups_;
+ int64_t num_rows_;
+ std::unique_ptr<FileMetaDataBuilder> metadata_;
+ // Only one of the row group writers is active at a time
+ std::unique_ptr<RowGroupWriter> row_group_writer_;
+
+ std::unique_ptr<InternalFileEncryptor> file_encryptor_;
+
+ void StartFile() {
+ auto file_encryption_properties = properties_->file_encryption_properties();
+ if (file_encryption_properties == nullptr) {
+ // Unencrypted parquet files always start with PAR1
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+ } else {
+ // Check that all columns in columnEncryptionProperties exist in the schema.
+ auto encrypted_columns = file_encryption_properties->encrypted_columns();
+ // If columnEncryptionProperties is empty, every column in the file schema will
+ // be encrypted with the footer key.
+ if (encrypted_columns.size() != 0) {
+ std::vector<std::string> column_path_vec;
+ // First, save all column paths in schema.
+ for (int i = 0; i < num_columns(); i++) {
+ column_path_vec.push_back(schema_.Column(i)->path()->ToDotString());
+ }
+ // Check if column exists in schema.
+ for (const auto& elem : encrypted_columns) {
+ auto it = std::find(column_path_vec.begin(), column_path_vec.end(), elem.first);
+ if (it == column_path_vec.end()) {
+ std::stringstream ss;
+ ss << "Encrypted column " + elem.first + " not in file schema";
+ throw ParquetException(ss.str());
+ }
+ }
+ }
+
+ file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties,
+ properties_->memory_pool()));
+ if (file_encryption_properties->encrypted_footer()) {
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+ } else {
+ // Encrypted file with plaintext footer mode.
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+ }
+ }
+ }
+};
+
+// ----------------------------------------------------------------------
+// ParquetFileWriter public API
+
+ParquetFileWriter::ParquetFileWriter() {}
+
+ParquetFileWriter::~ParquetFileWriter() {
+ try {
+ Close();
+ } catch (...) {
+ }
+}
+
+std::unique_ptr<ParquetFileWriter> ParquetFileWriter::Open(
+ std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ auto contents =
+ FileSerializer::Open(std::move(sink), std::move(schema), std::move(properties),
+ std::move(key_value_metadata));
+ std::unique_ptr<ParquetFileWriter> result(new ParquetFileWriter());
+ result->Open(std::move(contents));
+ return result;
+}
+
+void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+ // Write MetaData
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
+ uint32_t metadata_len = static_cast<uint32_t>(position);
+
+ file_metadata.WriteTo(sink);
+ PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
+ metadata_len = static_cast<uint32_t>(position) - metadata_len;
+
+ // Write Footer
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+}
+
+void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+ return WriteFileMetaData(file_metadata, sink);
+}
+
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ArrowOutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor,
+ bool encrypt_footer) {
+ if (encrypt_footer) { // Encrypted file with encrypted footer
+ // encrypt and write to sink
+ file_metadata.WriteTo(sink, encryptor);
+ } else { // Encrypted file with plaintext footer mode.
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
+ uint32_t metadata_len = static_cast<uint32_t>(position);
+ file_metadata.WriteTo(sink, encryptor);
+ PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
+ metadata_len = static_cast<uint32_t>(position) - metadata_len;
+
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+ }
+}
+
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+ ArrowOutputStream* sink) {
+ crypto_metadata.WriteTo(sink);
+}
+
+const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
+
+const ColumnDescriptor* ParquetFileWriter::descr(int i) const {
+ return contents_->schema()->Column(i);
+}
+
+int ParquetFileWriter::num_columns() const { return contents_->num_columns(); }
+
+int64_t ParquetFileWriter::num_rows() const { return contents_->num_rows(); }
+
+int ParquetFileWriter::num_row_groups() const { return contents_->num_row_groups(); }
+
+const std::shared_ptr<const KeyValueMetadata>& ParquetFileWriter::key_value_metadata()
+ const {
+ return contents_->key_value_metadata();
+}
+
+const std::shared_ptr<FileMetaData> ParquetFileWriter::metadata() const {
+ return file_metadata_;
+}
+
+void ParquetFileWriter::Open(std::unique_ptr<ParquetFileWriter::Contents> contents) {
+ contents_ = std::move(contents);
+}
+
+void ParquetFileWriter::Close() {
+ if (contents_) {
+ contents_->Close();
+ file_metadata_ = contents_->metadata();
+ contents_.reset();
+ }
+}
+
+RowGroupWriter* ParquetFileWriter::AppendRowGroup() {
+ return contents_->AppendRowGroup();
+}
+
+RowGroupWriter* ParquetFileWriter::AppendBufferedRowGroup() {
+ return contents_->AppendBufferedRowGroup();
+}
+
+RowGroupWriter* ParquetFileWriter::AppendRowGroup(int64_t num_rows) {
+ return AppendRowGroup();
+}
+
+const std::shared_ptr<WriterProperties>& ParquetFileWriter::properties() const {
+ return contents_->properties();
+}
+
+} // namespace parquet
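
For orientation, a minimal usage sketch of the writer API implemented above; the schema, output path, and values are illustrative assumptions, not taken from this diff:

    #include <cstdint>
    #include <memory>
    #include "arrow/io/file.h"
    #include "parquet/column_writer.h"
    #include "parquet/file_writer.h"
    #include "parquet/schema.h"

    int main() {
      using parquet::schema::GroupNode;
      using parquet::schema::PrimitiveNode;
      // Assumed schema: a single required INT32 column named "x".
      parquet::schema::NodeVector fields;
      fields.push_back(PrimitiveNode::Make("x", parquet::Repetition::REQUIRED,
                                           parquet::Type::INT32));
      auto schema = std::static_pointer_cast<GroupNode>(
          GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));

      std::shared_ptr<arrow::io::FileOutputStream> sink;
      PARQUET_ASSIGN_OR_THROW(
          sink, arrow::io::FileOutputStream::Open("/tmp/example.parquet"));

      // Open() triggers StartFile(), which writes the leading PAR1 magic.
      auto writer = parquet::ParquetFileWriter::Open(sink, schema);
      parquet::RowGroupWriter* rg = writer->AppendRowGroup();
      auto* col = static_cast<parquet::Int32Writer*>(rg->NextColumn());
      int32_t values[3] = {1, 2, 3};
      col->WriteBatch(3, /*def_levels=*/nullptr, /*rep_levels=*/nullptr, values);
      writer->Close();  // drives FileSerializer::Close(): footer + trailing magic
      return 0;
    }

Close() is the step that serializes the Thrift metadata and writes the 4-byte footer length plus the trailing PAR1 (or PARE) magic, matching WriteFileMetaData and CloseEncryptedFile above.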
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
index 4cfc24719a3..dafb2573b2c 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
@@ -1,234 +1,234 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <utility>
-
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-
-class ColumnWriter;
-
-// FIXME: copied from reader-internal.cc
-static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
-static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
-
-class PARQUET_EXPORT RowGroupWriter {
- public:
- // Forward declare a virtual class 'Contents' to aid dependency injection and more
- // easily create test fixtures
- // An implementation of the Contents class is defined in the .cc file
- struct Contents {
- virtual ~Contents() = default;
- virtual int num_columns() const = 0;
- virtual int64_t num_rows() const = 0;
-
- // to be used only with ParquetFileWriter::AppendRowGroup
- virtual ColumnWriter* NextColumn() = 0;
- // to be used only with ParquetFileWriter::AppendBufferedRowGroup
- virtual ColumnWriter* column(int i) = 0;
-
- virtual int current_column() const = 0;
- virtual void Close() = 0;
-
- // total bytes written by the page writer
- virtual int64_t total_bytes_written() const = 0;
- // total bytes still compressed but not written
- virtual int64_t total_compressed_bytes() const = 0;
- };
-
- explicit RowGroupWriter(std::unique_ptr<Contents> contents);
-
- /// Construct a ColumnWriter for the indicated row group-relative column.
- ///
- /// To be used only with ParquetFileWriter::AppendRowGroup
- /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
- /// valid until the next call to NextColumn or Close. As the contents are
- /// directly written to the sink, once a new column is started, the contents
- /// of the previous one cannot be modified anymore.
- ColumnWriter* NextColumn();
- /// Index of currently written column. Equal to -1 if NextColumn()
- /// has not been called yet.
- int current_column();
- void Close();
-
- int num_columns() const;
-
- /// Construct a ColumnWriter for the indicated row group column.
- ///
- /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
- /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
- /// valid until Close. The contents are buffered in memory and written to sink
- /// on Close
- ColumnWriter* column(int i);
-
- /**
- * Number of rows that shall be written as part of this RowGroup.
- */
- int64_t num_rows() const;
-
- int64_t total_bytes_written() const;
- int64_t total_compressed_bytes() const;
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
-};
-
-PARQUET_EXPORT
-void WriteFileMetaData(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-PARQUET_EXPORT
-void WriteMetaDataFile(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-PARQUET_EXPORT
-void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
- ArrowOutputStream* sink,
- const std::shared_ptr<Encryptor>& encryptor,
- bool encrypt_footer);
-
-PARQUET_EXPORT
-void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
- bool encrypt_footer = false);
-PARQUET_EXPORT
-void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
- ::arrow::io::OutputStream* sink);
-
-class PARQUET_EXPORT ParquetFileWriter {
- public:
- // Forward declare a virtual class 'Contents' to aid dependency injection and more
- // easily create test fixtures
- // An implementation of the Contents class is defined in the .cc file
- struct Contents {
- Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
- schema_.Init(std::move(schema));
- }
- virtual ~Contents() {}
- // Perform any cleanup associated with the file contents
- virtual void Close() = 0;
-
- /// \note Deprecated since 1.3.0
- RowGroupWriter* AppendRowGroup(int64_t num_rows);
-
- virtual RowGroupWriter* AppendRowGroup() = 0;
- virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
-
- virtual int64_t num_rows() const = 0;
- virtual int num_columns() const = 0;
- virtual int num_row_groups() const = 0;
-
- virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
-
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
- return key_value_metadata_;
- }
-
- // Return const-pointer to make it clear that this object is not to be copied
- const SchemaDescriptor* schema() const { return &schema_; }
-
- SchemaDescriptor schema_;
-
- /// This should be the only place this is stored. Everything else is a const reference
- std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
-
- const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
- std::shared_ptr<FileMetaData> file_metadata_;
- };
-
- ParquetFileWriter();
- ~ParquetFileWriter();
-
- static std::unique_ptr<ParquetFileWriter> Open(
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<schema::GroupNode> schema,
- std::shared_ptr<WriterProperties> properties = default_writer_properties(),
- std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
-
- void Open(std::unique_ptr<Contents> contents);
- void Close();
-
- // Construct a RowGroupWriter for the indicated number of rows.
- //
- // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
- // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
- // @param num_rows The number of rows that are stored in the new RowGroup
- //
- // \deprecated Since 1.3.0
- RowGroupWriter* AppendRowGroup(int64_t num_rows);
-
- /// Construct a RowGroupWriter with an arbitrary number of rows.
- ///
- /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
- /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
- RowGroupWriter* AppendRowGroup();
-
- /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
- /// Use this if you want to write a RowGroup based on a certain size
- ///
- /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
- /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
- RowGroupWriter* AppendBufferedRowGroup();
-
- /// Number of columns.
- ///
- /// This number is fixed during the lifetime of the writer as it is determined via
- /// the schema.
- int num_columns() const;
-
- /// Number of rows in the yet started RowGroups.
- ///
- /// Changes on the addition of a new RowGroup.
- int64_t num_rows() const;
-
- /// Number of started RowGroups.
- int num_row_groups() const;
-
- /// Configuration passed to the writer, e.g. the used Parquet format version.
- const std::shared_ptr<WriterProperties>& properties() const;
-
- /// Returns the file schema descriptor
- const SchemaDescriptor* schema() const;
-
- /// Returns a column descriptor in schema
- const ColumnDescriptor* descr(int i) const;
-
- /// Returns the file custom metadata
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
-
- /// Returns the file metadata, only available after calling Close().
- const std::shared_ptr<FileMetaData> metadata() const;
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
- std::shared_ptr<FileMetaData> file_metadata_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ColumnWriter;
+
+// FIXME: copied from reader-internal.cc
+static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
+static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
+
+class PARQUET_EXPORT RowGroupWriter {
+ public:
+ // Forward declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct Contents {
+ virtual ~Contents() = default;
+ virtual int num_columns() const = 0;
+ virtual int64_t num_rows() const = 0;
+
+ // to be used only with ParquetFileWriter::AppendRowGroup
+ virtual ColumnWriter* NextColumn() = 0;
+ // to be used only with ParquetFileWriter::AppendBufferedRowGroup
+ virtual ColumnWriter* column(int i) = 0;
+
+ virtual int current_column() const = 0;
+ virtual void Close() = 0;
+
+ // total bytes written by the page writer
+ virtual int64_t total_bytes_written() const = 0;
+ // total bytes still compressed but not written
+ virtual int64_t total_compressed_bytes() const = 0;
+ };
+
+ explicit RowGroupWriter(std::unique_ptr<Contents> contents);
+
+ /// Construct a ColumnWriter for the indicated row group-relative column.
+ ///
+ /// To be used only with ParquetFileWriter::AppendRowGroup
+ /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
+ /// valid until the next call to NextColumn or Close. As the contents are
+ /// directly written to the sink, once a new column is started, the contents
+ /// of the previous one cannot be modified anymore.
+ ColumnWriter* NextColumn();
+ /// Index of currently written column. Equal to -1 if NextColumn()
+ /// has not been called yet.
+ int current_column();
+ void Close();
+
+ int num_columns() const;
+
+ /// Construct a ColumnWriter for the indicated row group column.
+ ///
+ /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
+ /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
+ /// valid until Close. The contents are buffered in memory and written to sink
+ /// on Close
+ ColumnWriter* column(int i);
+
+ /**
+ * Number of rows that shall be written as part of this RowGroup.
+ */
+ int64_t num_rows() const;
+
+ int64_t total_bytes_written() const;
+ int64_t total_compressed_bytes() const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+PARQUET_EXPORT
+void WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ArrowOutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor,
+ bool encrypt_footer);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
+ bool encrypt_footer = false);
+PARQUET_EXPORT
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+ ::arrow::io::OutputStream* sink);
+
+class PARQUET_EXPORT ParquetFileWriter {
+ public:
+ // Forward declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct Contents {
+ Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
+ schema_.Init(std::move(schema));
+ }
+ virtual ~Contents() {}
+ // Perform any cleanup associated with the file contents
+ virtual void Close() = 0;
+
+ /// \note Deprecated since 1.3.0
+ RowGroupWriter* AppendRowGroup(int64_t num_rows);
+
+ virtual RowGroupWriter* AppendRowGroup() = 0;
+ virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
+
+ virtual int64_t num_rows() const = 0;
+ virtual int num_columns() const = 0;
+ virtual int num_row_groups() const = 0;
+
+ virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+ return key_value_metadata_;
+ }
+
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const { return &schema_; }
+
+ SchemaDescriptor schema_;
+
+ /// This should be the only place this is stored. Everything else is a const reference
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+
+ const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
+ std::shared_ptr<FileMetaData> file_metadata_;
+ };
+
+ ParquetFileWriter();
+ ~ParquetFileWriter();
+
+ static std::unique_ptr<ParquetFileWriter> Open(
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<schema::GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+
+ void Open(std::unique_ptr<Contents> contents);
+ void Close();
+
+ // Construct a RowGroupWriter for the indicated number of rows.
+ //
+ // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ // @param num_rows The number of rows that are stored in the new RowGroup
+ //
+ // \deprecated Since 1.3.0
+ RowGroupWriter* AppendRowGroup(int64_t num_rows);
+
+ /// Construct a RowGroupWriter with an arbitrary number of rows.
+ ///
+ /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ RowGroupWriter* AppendRowGroup();
+
+ /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
+ /// Use this if you want to write a RowGroup based on a certain size.
+ ///
+ /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ RowGroupWriter* AppendBufferedRowGroup();
+
+ /// Number of columns.
+ ///
+ /// This number is fixed during the lifetime of the writer as it is determined via
+ /// the schema.
+ int num_columns() const;
+
+ /// Number of rows in the RowGroups started so far.
+ ///
+ /// Updated whenever a new RowGroup is appended.
+ int64_t num_rows() const;
+
+ /// Number of started RowGroups.
+ int num_row_groups() const;
+
+ /// Configuration passed to the writer, e.g. the used Parquet format version.
+ const std::shared_ptr<WriterProperties>& properties() const;
+
+ /// Returns the file schema descriptor
+ const SchemaDescriptor* schema() const;
+
+ /// Returns a column descriptor in schema
+ const ColumnDescriptor* descr(int i) const;
+
+ /// Returns the file custom metadata
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+ /// Returns the file metadata, only available after calling Close().
+ const std::shared_ptr<FileMetaData> metadata() const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+};
+
+} // namespace parquet
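
The buffered variant declared above trades memory for ordering flexibility. A hedged sketch, assuming a writer whose schema has at least two INT32 columns:

    #include <cstdint>
    #include "parquet/column_writer.h"
    #include "parquet/file_writer.h"

    // Assumption: `writer` was opened with a schema of two (or more) INT32 columns.
    void WriteBufferedGroup(parquet::ParquetFileWriter* writer) {
      parquet::RowGroupWriter* rg = writer->AppendBufferedRowGroup();
      // Buffered mode: column(i) can be fetched in any order; nothing reaches
      // the sink until rg->Close().
      auto* c1 = static_cast<parquet::Int32Writer*>(rg->column(1));
      int32_t ys[2] = {10, 20};
      c1->WriteBatch(2, nullptr, nullptr, ys);
      auto* c0 = static_cast<parquet::Int32Writer*>(rg->column(0));
      int32_t xs[2] = {1, 2};
      c0->WriteBatch(2, nullptr, nullptr, xs);
      rg->Close();  // buffered pages are serialized to the sink here
    }

By contrast, NextColumn() streams each column straight to the sink, so columns must be written strictly in schema order and a column can no longer be modified once the next one is started.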
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h b/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
index d699356a6c4..7452e39190f 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
@@ -1,72 +1,72 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include "parquet/types.h"
-
-namespace parquet {
-// Abstract class for hash
-class Hasher {
- public:
- /// Compute hash for 32 bits value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int32_t value) const = 0;
-
- /// Compute hash for 64 bits value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int64_t value) const = 0;
-
- /// Compute hash for float value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(float value) const = 0;
-
- /// Compute hash for double value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(double value) const = 0;
-
- /// Compute hash for Int96 value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const Int96* value) const = 0;
-
- /// Compute hash for ByteArray value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const ByteArray* value) const = 0;
-
- /// Compute hash for fixed byte array value by using its plain encoding result.
- ///
- /// @param value the value address.
- /// @param len the value length.
- virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
-
- virtual ~Hasher() = default;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include "parquet/types.h"
+
+namespace parquet {
+// Abstract base class for hashers
+class Hasher {
+ public:
+ /// Compute hash for a 32-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int32_t value) const = 0;
+
+ /// Compute hash for a 64-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int64_t value) const = 0;
+
+ /// Compute hash for float value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(float value) const = 0;
+
+ /// Compute hash for double value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(double value) const = 0;
+
+ /// Compute hash for Int96 value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const Int96* value) const = 0;
+
+ /// Compute hash for ByteArray value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+ /// Compute hash for fixed byte array value by using its plain encoding result.
+ ///
+ /// @param value the value address.
+ /// @param len the value length.
+ virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+ virtual ~Hasher() = default;
+};
+
+} // namespace parquet
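
To illustrate the interface, a toy (non-production) implementation sketch; the polynomial hash below is an assumption for demonstration, not the library's real hasher, and it folds raw value bytes, which only matches the plain encoding on a little-endian host:

    #include <cstddef>
    #include <cstdint>
    #include "parquet/hasher.h"

    namespace {

    // Simple polynomial hash over a byte range (illustrative only).
    uint64_t FoldBytes(const void* p, size_t n) {
      uint64_t h = 0;
      const auto* b = static_cast<const uint8_t*>(p);
      for (size_t i = 0; i < n; ++i) h = h * 131 + b[i];
      return h;
    }

    class ToyHasher : public parquet::Hasher {
     public:
      uint64_t Hash(int32_t v) const override { return FoldBytes(&v, sizeof(v)); }
      uint64_t Hash(int64_t v) const override { return FoldBytes(&v, sizeof(v)); }
      uint64_t Hash(float v) const override { return FoldBytes(&v, sizeof(v)); }
      uint64_t Hash(double v) const override { return FoldBytes(&v, sizeof(v)); }
      uint64_t Hash(const parquet::Int96* v) const override {
        return FoldBytes(v->value, sizeof(v->value));
      }
      uint64_t Hash(const parquet::ByteArray* v) const override {
        return FoldBytes(v->ptr, v->len);
      }
      uint64_t Hash(const parquet::FLBA* v, uint32_t len) const override {
        return FoldBytes(v->ptr, len);
      }
    };

    }  // namespace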
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
index 30614ae61fb..b0851f5cf1f 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
@@ -1,82 +1,82 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/level_comparison.h"
-
-#define PARQUET_IMPL_NAMESPACE standard
-#include "parquet/level_comparison_inc.h"
-#undef PARQUET_IMPL_NAMESPACE
-
-#include <vector>
-
-#include "arrow/util/dispatch.h"
-
-namespace parquet {
-namespace internal {
-
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
-MinMax FindMinMaxAvx2(const int16_t* levels, int64_t num_levels);
-uint64_t GreaterThanBitmapAvx2(const int16_t* levels, int64_t num_levels, int16_t rhs);
-#endif
-
-namespace {
-
-using ::arrow::internal::DispatchLevel;
-using ::arrow::internal::DynamicDispatch;
-
-// defined in level_comparison_avx2.cc
-
-struct GreaterThanDynamicFunction {
- using FunctionType = decltype(&GreaterThanBitmap);
-
- static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
- return {
- { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
- , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
-#endif
- };
- }
-};
-
-struct MinMaxDynamicFunction {
- using FunctionType = decltype(&FindMinMax);
-
- static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
- return {
- { DispatchLevel::NONE, standard::FindMinMaxImpl }
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
- , { DispatchLevel::AVX2, FindMinMaxAvx2 }
-#endif
- };
- }
-};
-
-} // namespace
-
-uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, int16_t rhs) {
- static DynamicDispatch<GreaterThanDynamicFunction> dispatch;
- return dispatch.func(levels, num_levels, rhs);
-}
-
-MinMax FindMinMax(const int16_t* levels, int64_t num_levels) {
- static DynamicDispatch<MinMaxDynamicFunction> dispatch;
- return dispatch.func(levels, num_levels);
-}
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/level_comparison.h"
+
+#define PARQUET_IMPL_NAMESPACE standard
+#include "parquet/level_comparison_inc.h"
+#undef PARQUET_IMPL_NAMESPACE
+
+#include <vector>
+
+#include "arrow/util/dispatch.h"
+
+namespace parquet {
+namespace internal {
+
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+MinMax FindMinMaxAvx2(const int16_t* levels, int64_t num_levels);
+uint64_t GreaterThanBitmapAvx2(const int16_t* levels, int64_t num_levels, int16_t rhs);
+#endif
+
+namespace {
+
+using ::arrow::internal::DispatchLevel;
+using ::arrow::internal::DynamicDispatch;
+
+// defined in level_comparison_avx2.cc
+
+struct GreaterThanDynamicFunction {
+ using FunctionType = decltype(&GreaterThanBitmap);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
+#endif
+ };
+ }
+};
+
+struct MinMaxDynamicFunction {
+ using FunctionType = decltype(&FindMinMax);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, standard::FindMinMaxImpl }
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, FindMinMaxAvx2 }
+#endif
+ };
+ }
+};
+
+} // namespace
+
+uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, int16_t rhs) {
+ static DynamicDispatch<GreaterThanDynamicFunction> dispatch;
+ return dispatch.func(levels, num_levels, rhs);
+}
+
+MinMax FindMinMax(const int16_t* levels, int64_t num_levels) {
+ static DynamicDispatch<MinMaxDynamicFunction> dispatch;
+ return dispatch.func(levels, num_levels);
+}
+
+} // namespace internal
+} // namespace parquet
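
A hedged sketch of the exported entry points' semantics: bit x of GreaterThanBitmap's result corresponds to levels[x], packed LSB-first, and the asserted constant assumes a little-endian host.

    #include <cassert>
    #include <cstdint>
    #include "parquet/level_comparison.h"

    int main() {
      const int16_t levels[4] = {0, 2, 1, 3};
      // levels[1] and levels[3] exceed rhs = 1, so bits 1 and 3 are set.
      uint64_t bits = parquet::internal::GreaterThanBitmap(levels, 4, 1);
      assert(bits == 0b1010);
      parquet::internal::MinMax mm = parquet::internal::FindMinMax(levels, 4);
      assert(mm.min == 0 && mm.max == 3);
      return 0;
    }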
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
index 38e7ef8e2ec..2097e4db8a0 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
@@ -1,40 +1,40 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-
-#include "parquet/platform.h"
-
-namespace parquet {
-namespace internal {
-
-/// Builds a bitmap where each set bit indicates the corresponding level is greater
-/// than rhs.
-uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
- int16_t rhs);
-
-struct MinMax {
- int16_t min;
- int16_t max;
-};
-
-MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+namespace internal {
+
+/// Builds a bitmap where each set bit indicates the corresponding level is greater
+/// than rhs.
+uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
+ int16_t rhs);
+
+struct MinMax {
+ int16_t min;
+ int16_t max;
+};
+
+MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
index e21c3e5824d..cc6bf382a50 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
@@ -1,65 +1,65 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
-#include "parquet/level_comparison.h"
-
-// Used to make sure ODR rule isn't violated.
-#ifndef PARQUET_IMPL_NAMESPACE
-#error "PARQUET_IMPL_NAMESPACE must be defined"
-#endif
-namespace parquet {
-namespace internal {
-namespace PARQUET_IMPL_NAMESPACE {
-/// Builds a bitmap by applying predicate to the level vector provided.
-///
-/// \param[in] levels Rep or def level array.
-/// \param[in] num_levels The number of levels to process (must be [0, 64])
-/// \param[in] predicate The predicate to apply (must have the signature `bool
-/// predicate(int16_t)`.
-/// \returns The bitmap using least significant "bit" ordering.
-///
-template <typename Predicate>
-inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
- Predicate predicate) {
- // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
- uint64_t mask = 0;
- for (int x = 0; x < num_levels; x++) {
- mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
- }
- return ::arrow::BitUtil::ToLittleEndian(mask);
-}
-
-inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
- MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
- for (int x = 0; x < num_levels; x++) {
- out.min = std::min(levels[x], out.min);
- out.max = std::max(levels[x], out.max);
- }
- return out;
-}
-
-inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
- int16_t rhs) {
- return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
-}
-
-} // namespace PARQUET_IMPL_NAMESPACE
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "parquet/level_comparison.h"
+
+// Used to make sure the ODR (One Definition Rule) isn't violated.
+#ifndef PARQUET_IMPL_NAMESPACE
+#error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+namespace parquet {
+namespace internal {
+namespace PARQUET_IMPL_NAMESPACE {
+/// Builds a bitmap by applying predicate to the level vector provided.
+///
+/// \param[in] levels Rep or def level array.
+/// \param[in] num_levels The number of levels to process (must be in [0, 64])
+/// \param[in] predicate The predicate to apply (must have the signature `bool
+/// predicate(int16_t)`).
+/// \returns The bitmap using least significant "bit" ordering.
+///
+template <typename Predicate>
+inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
+ Predicate predicate) {
+ // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
+ uint64_t mask = 0;
+ for (int x = 0; x < num_levels; x++) {
+ mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
+ }
+ return ::arrow::BitUtil::ToLittleEndian(mask);
+}
+
+inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
+ MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
+ for (int x = 0; x < num_levels; x++) {
+ out.min = std::min(levels[x], out.min);
+ out.max = std::max(levels[x], out.max);
+ }
+ return out;
+}
+
+inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
+ int16_t rhs) {
+ return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
+}
+
+} // namespace PARQUET_IMPL_NAMESPACE
+} // namespace internal
+} // namespace parquet
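
A hedged sketch of how the guard above is meant to be used: an architecture-specific translation unit (modeled on level_comparison_avx2.cc) selects its own namespace before including this header, so the inline implementations get distinct qualified names per instruction set.

    // Hypothetical avx2 translation unit, compiled with -mavx2 so the
    // autovectorizer can emit AVX2 instructions for the same source.
    #define PARQUET_IMPL_NAMESPACE avx2
    #include "parquet/level_comparison_inc.h"
    #undef PARQUET_IMPL_NAMESPACE

    namespace parquet {
    namespace internal {

    uint64_t GreaterThanBitmapAvx2(const int16_t* levels, int64_t num_levels,
                                   int16_t rhs) {
      // Identical logic to the `standard` variant; only the codegen differs.
      return avx2::GreaterThanBitmapImpl(levels, num_levels, rhs);
    }

    }  // namespace internal
    }  // namespace parquet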
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
index ffdca476ddd..998fd982fd7 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
@@ -1,183 +1,183 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#include "parquet/level_conversion.h"
-
-#include <algorithm>
-#include <limits>
-
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/cpu_info.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-#include "parquet/exception.h"
-
-#include "parquet/level_comparison.h"
-#define PARQUET_IMPL_NAMESPACE standard
-#include "parquet/level_conversion_inc.h"
-#undef PARQUET_IMPL_NAMESPACE
-
-namespace parquet {
-namespace internal {
-namespace {
-
-using ::arrow::internal::CpuInfo;
-using ::arrow::util::optional;
-
-template <typename OffsetType>
-void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output, OffsetType* offsets) {
- OffsetType* orig_pos = offsets;
- optional<::arrow::internal::FirstTimeBitmapWriter> valid_bits_writer;
- if (output->valid_bits) {
- valid_bits_writer.emplace(output->valid_bits, output->valid_bits_offset,
- output->values_read_upper_bound);
- }
- for (int x = 0; x < num_def_levels; x++) {
- // Skip items that belong to empty or null ancestor lists and further nested lists.
- if (def_levels[x] < level_info.repeated_ancestor_def_level ||
- rep_levels[x] > level_info.rep_level) {
- continue;
- }
-
- if (rep_levels[x] == level_info.rep_level) {
- // A continuation of an existing list.
- // offsets can be null for structs with repeated children (we don't need to know
- // offsets until we get to the children).
- if (offsets != nullptr) {
- if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
- throw ParquetException("List index overflow.");
- }
- *offsets += 1;
- }
- } else {
- if (ARROW_PREDICT_FALSE(
- (valid_bits_writer.has_value() &&
- valid_bits_writer->position() >= output->values_read_upper_bound) ||
- (offsets - orig_pos) >= output->values_read_upper_bound)) {
- std::stringstream ss;
- ss << "Definition levels exceeded upper bound: "
- << output->values_read_upper_bound;
- throw ParquetException(ss.str());
- }
-
- // current_rep < list rep_level i.e. start of a list (ancestor empty lists are
- // filtered out above).
- // offsets can be null for structs with repeated children (we don't need to know
- // offsets until we get to the children).
- if (offsets != nullptr) {
- ++offsets;
- // Use cumulative offsets because variable size lists are more common then
- // fixed size lists so it should be cheaper to make these cumulative and
- // subtract when validating fixed size lists.
- *offsets = *(offsets - 1);
- if (def_levels[x] >= level_info.def_level) {
- if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
- throw ParquetException("List index overflow.");
- }
- *offsets += 1;
- }
- }
-
- if (valid_bits_writer.has_value()) {
- // the level_info def level for lists reflects element present level.
- // the prior level distinguishes between empty lists.
- if (def_levels[x] >= level_info.def_level - 1) {
- valid_bits_writer->Set();
- } else {
- output->null_count++;
- valid_bits_writer->Clear();
- }
- valid_bits_writer->Next();
- }
- }
- }
- if (valid_bits_writer.has_value()) {
- valid_bits_writer->Finish();
- }
- if (offsets != nullptr) {
- output->values_read = offsets - orig_pos;
- } else if (valid_bits_writer.has_value()) {
- output->values_read = valid_bits_writer->position();
- }
- if (output->null_count > 0 && level_info.null_slot_usage > 1) {
- throw ParquetException(
- "Null values with null_slot_usage > 1 not supported."
- "(i.e. FixedSizeLists with null values are not supported)");
- }
-}
-
-} // namespace
-
-#if defined(ARROW_HAVE_RUNTIME_BMI2)
-// defined in level_conversion_bmi2.cc for dynamic dispatch.
-void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output);
-#endif
-
-void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
- LevelInfo level_info, ValidityBitmapInputOutput* output) {
- // It is simpler to rely on rep_level here until PARQUET-1899 is done and the code
- // is deleted in a follow-up release.
- if (level_info.rep_level > 0) {
-#if defined(ARROW_HAVE_RUNTIME_BMI2)
- if (CpuInfo::GetInstance()->HasEfficientBmi2()) {
- return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels,
- level_info, output);
- }
-#endif
- standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
- def_levels, num_def_levels, level_info, output);
- } else {
- standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
- def_levels, num_def_levels, level_info, output);
- }
-}
-
-uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
- return standard::ExtractBitsSoftware(bitmap, select_bitmap);
-}
-
-void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output, int32_t* offsets) {
- DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
- output, offsets);
-}
-
-void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output, int64_t* offsets) {
- DefRepLevelsToListInfo<int64_t>(def_levels, rep_levels, num_def_levels, level_info,
- output, offsets);
-}
-
-void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output) {
- // DefReplevelsToListInfo assumes it for the actual list method and this
- // method is for parent structs, so we need to bump def and ref level.
- level_info.rep_level += 1;
- level_info.def_level += 1;
- DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
- output, /*offsets=*/nullptr);
-}
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "parquet/exception.h"
+
+#include "parquet/level_comparison.h"
+#define PARQUET_IMPL_NAMESPACE standard
+#include "parquet/level_conversion_inc.h"
+#undef PARQUET_IMPL_NAMESPACE
+
+namespace parquet {
+namespace internal {
+namespace {
+
+using ::arrow::internal::CpuInfo;
+using ::arrow::util::optional;
+
+template <typename OffsetType>
+void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, OffsetType* offsets) {
+ OffsetType* orig_pos = offsets;
+ optional<::arrow::internal::FirstTimeBitmapWriter> valid_bits_writer;
+ if (output->valid_bits) {
+ valid_bits_writer.emplace(output->valid_bits, output->valid_bits_offset,
+ output->values_read_upper_bound);
+ }
+ for (int64_t x = 0; x < num_def_levels; x++) {
+ // Skip items that belong to empty or null ancestor lists and further nested lists.
+ if (def_levels[x] < level_info.repeated_ancestor_def_level ||
+ rep_levels[x] > level_info.rep_level) {
+ continue;
+ }
+
+ if (rep_levels[x] == level_info.rep_level) {
+ // A continuation of an existing list.
+ // offsets can be null for structs with repeated children (we don't need to know
+ // offsets until we get to the children).
+ if (offsets != nullptr) {
+ if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
+ throw ParquetException("List index overflow.");
+ }
+ *offsets += 1;
+ }
+ } else {
+ if (ARROW_PREDICT_FALSE(
+ (valid_bits_writer.has_value() &&
+ valid_bits_writer->position() >= output->values_read_upper_bound) ||
+ (offsets - orig_pos) >= output->values_read_upper_bound)) {
+ std::stringstream ss;
+ ss << "Definition levels exceeded upper bound: "
+ << output->values_read_upper_bound;
+ throw ParquetException(ss.str());
+ }
+
+ // current_rep < list rep_level i.e. start of a list (ancestor empty lists are
+ // filtered out above).
+ // offsets can be null for structs with repeated children (we don't need to know
+ // offsets until we get to the children).
+ if (offsets != nullptr) {
+ ++offsets;
+ // Use cumulative offsets because variable size lists are more common than
+ // fixed size lists so it should be cheaper to make these cumulative and
+ // subtract when validating fixed size lists.
+ *offsets = *(offsets - 1);
+ if (def_levels[x] >= level_info.def_level) {
+ if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
+ throw ParquetException("List index overflow.");
+ }
+ *offsets += 1;
+ }
+ }
+
+ if (valid_bits_writer.has_value()) {
+ // The level_info def level for lists reflects the element-present level;
+ // the prior level distinguishes an empty list from a null one.
+ if (def_levels[x] >= level_info.def_level - 1) {
+ valid_bits_writer->Set();
+ } else {
+ output->null_count++;
+ valid_bits_writer->Clear();
+ }
+ valid_bits_writer->Next();
+ }
+ }
+ }
+ if (valid_bits_writer.has_value()) {
+ valid_bits_writer->Finish();
+ }
+ if (offsets != nullptr) {
+ output->values_read = offsets - orig_pos;
+ } else if (valid_bits_writer.has_value()) {
+ output->values_read = valid_bits_writer->position();
+ }
+ if (output->null_count > 0 && level_info.null_slot_usage > 1) {
+ throw ParquetException(
+ "Null values with null_slot_usage > 1 not supported."
+ "(i.e. FixedSizeLists with null values are not supported)");
+ }
+}
+
+} // namespace
+
+#if defined(ARROW_HAVE_RUNTIME_BMI2)
+// defined in level_conversion_bmi2.cc for dynamic dispatch.
+void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+#endif
+
+void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info, ValidityBitmapInputOutput* output) {
+ // It is simpler to rely on rep_level here until PARQUET-1899 is done and the code
+ // is deleted in a follow-up release.
+ if (level_info.rep_level > 0) {
+#if defined(ARROW_HAVE_RUNTIME_BMI2)
+ if (CpuInfo::GetInstance()->HasEfficientBmi2()) {
+ return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels,
+ level_info, output);
+ }
+#endif
+ standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
+ def_levels, num_def_levels, level_info, output);
+ } else {
+ standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
+ def_levels, num_def_levels, level_info, output);
+ }
+}
+
+uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
+ return standard::ExtractBitsSoftware(bitmap, select_bitmap);
+}
+
+void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, int32_t* offsets) {
+ DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, offsets);
+}
+
+void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, int64_t* offsets) {
+ DefRepLevelsToListInfo<int64_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, offsets);
+}
+
+void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output) {
+  // DefRepLevelsToListInfo assumes the levels describe the actual list; this
+  // method is for parent structs, so we need to bump the def and rep levels.
+ level_info.rep_level += 1;
+ level_info.def_level += 1;
+ DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, /*offsets=*/nullptr);
+}
+
+} // namespace internal
+} // namespace parquet
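
A minimal usage sketch for the entry points above (illustrative only, not part of
the vendored diff; the function name ExampleFlatColumn and the sample levels are
assumptions). It reconstructs the validity bitmap of a flat optional int32
column, where def_level 1 marks a present value and 0 a null, exercising the
non-repeated path of DefLevelsToBitmap:

#include <cstdint>
#include <vector>

#include "parquet/level_conversion.h"

void ExampleFlatColumn() {
  const std::vector<int16_t> def_levels = {1, 0, 1, 1, 0};

  parquet::internal::LevelInfo level_info;
  level_info.def_level = 1;  // >= 1 means a non-null value
  level_info.rep_level = 0;  // no repeated parent, so the non-BMI2 path is taken

  parquet::internal::ValidityBitmapInputOutput io;
  io.values_read_upper_bound = static_cast<int64_t>(def_levels.size());
  std::vector<uint8_t> bitmap((def_levels.size() + 7) / 8, 0);
  io.valid_bits = bitmap.data();

  parquet::internal::DefLevelsToBitmap(
      def_levels.data(), static_cast<int64_t>(def_levels.size()), level_info, &io);
  // Afterwards io.values_read == 5, io.null_count == 2, and bitmap[0] holds
  // 0b00001101 (valid, null, valid, valid, null, LSB first).
}
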
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
index e45a288e8c0..fa6a23c1d9b 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
@@ -1,199 +1,199 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/util/endian.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-namespace internal {
-
-struct PARQUET_EXPORT LevelInfo {
- LevelInfo()
- : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
- LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
- int32_t repeated_ancestor_definition_level)
- : null_slot_usage(null_slots),
- def_level(definition_level),
- rep_level(repetition_level),
- repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
-
- bool operator==(const LevelInfo& b) const {
- return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
- rep_level == b.rep_level &&
- repeated_ancestor_def_level == b.repeated_ancestor_def_level;
- }
-
- bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
-
- // How many slots an undefined but present (i.e. null) element in
- // parquet consumes when decoding to Arrow.
- // "Slot" is used in the same context as the Arrow specification
- // (i.e. a value holder).
-  // This is only ever >1 for descendants of FixedSizeList.
- int32_t null_slot_usage = 1;
-
- // The definition level at which the value for the field
- // is considered not null (definition levels greater than
- // or equal to this value indicate a not-null
- // value for the field). For list fields definition levels
- // greater than or equal to this field indicate a present,
- // possibly null, child value.
- int16_t def_level = 0;
-
- // The repetition level corresponding to this element
- // or the closest repeated ancestor. Any repetition
- // level less than this indicates either a new list OR
- // an empty list (which is determined in conjunction
- // with definition levels).
- int16_t rep_level = 0;
-
- // The definition level indicating the level at which the closest
- // repeated ancestor is not empty. This is used to discriminate
- // between a value less than |def_level| being null or excluded entirely.
- // For instance if we have an arrow schema like:
-  // list(struct(f0: int)). Then there are the following
- // definition levels:
- // 0 = null list
- // 1 = present but empty list.
- // 2 = a null value in the list
- // 3 = a non null struct but null integer.
- // 4 = a present integer.
- // When reconstructing, the struct and integer arrays'
- // repeated_ancestor_def_level would be 2. Any
- // def_level < 2 indicates that there isn't a corresponding
- // child value in the list.
- // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
- // has the def levels [0, 1, 2, 3, 4]. The actual
- // struct array is only of length 3: [not-set, set, set] and
- // the int array is also of length 3: [N/A, null, 1].
- //
- int16_t repeated_ancestor_def_level = 0;
-
-  /// Increments levels according to the cardinality of the node.
- void Increment(const schema::Node& node) {
- if (node.is_repeated()) {
- IncrementRepeated();
- return;
- }
- if (node.is_optional()) {
- IncrementOptional();
- return;
- }
- }
-
-  /// Increments the definition level for an optional node.
- void IncrementOptional() { def_level++; }
-
- /// Increments levels for the repeated node. Returns
-  /// the previous repeated_ancestor_def_level.
- int16_t IncrementRepeated() {
- int16_t last_repeated_ancestor = repeated_ancestor_def_level;
-
- // Repeated fields add both a repetition and definition level. This is used
- // to distinguish between an empty list and a list with an item in it.
- ++rep_level;
- ++def_level;
-    // Levels >= repeated_ancestor_def_level indicate the list was
-    // non-null and had at least one element. This is important
-    // for later decoding because we need to add a slot for these
-    // values. For levels < current_def_level no slots are added
- // to arrays.
- repeated_ancestor_def_level = def_level;
- return last_repeated_ancestor;
- }
-
- friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
- // This print method is to silence valgrind issues. What's printed
- // is not important because all asserts happen directly on
- // members.
- os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
- << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
- if (levels.null_slot_usage > 1) {
- os << ", null_slot_usage=" << levels.null_slot_usage;
- }
- os << "}";
- return os;
- }
-};
-
-// Input/Output structure for reconstructed validity bitmaps.
-struct PARQUET_EXPORT ValidityBitmapInputOutput {
- // Input only.
- // The maximum number of values_read expected (actual
- // values read must be less than or equal to this value).
-  // If this number is exceeded, methods will throw a
- // ParquetException. Exceeding this limit indicates
- // either a corrupt or incorrectly written file.
- int64_t values_read_upper_bound = 0;
-  // Output only. The number of values encountered
-  // (this is logically the number of elements
-  // for an Arrow array).
- int64_t values_read = 0;
- // Input/Output. The number of nulls encountered.
- int64_t null_count = 0;
-  // Output only. The validity bitmap to populate. May be null only
- // for DefRepLevelsToListInfo (if all that is needed is list offsets).
- uint8_t* valid_bits = NULLPTR;
- // Input only, offset into valid_bits to start at.
- int64_t valid_bits_offset = 0;
-};
-
-// Converts def_levels to validity bitmaps for non-list arrays and structs that have
-// at least one member that is not a list and has no list descendants.
-// For lists use DefRepLevelsToList; for structs where all descendants contain
-// a list use DefRepLevelsToBitmap.
-void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
- LevelInfo level_info,
- ValidityBitmapInputOutput* output);
-
-// Reconstructs a validity bitmap and list offsets for list arrays based on
-// def/rep levels. The first element of offsets will not be modified if rep_levels
-// starts with a new list. The first element of offsets will be used when calculating
-// the next offset. See documentation on DefLevelsToBitmap for when to use this
-// method vs the other ones in this file for reconstruction.
-//
-// Offsets must be sized to 1 + values_read_upper_bound.
-void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
- const int16_t* rep_levels, int64_t num_def_levels,
- LevelInfo level_info,
- ValidityBitmapInputOutput* output,
- int32_t* offsets);
-void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
- const int16_t* rep_levels, int64_t num_def_levels,
- LevelInfo level_info,
- ValidityBitmapInputOutput* output,
- int64_t* offsets);
-
-// Reconstructs a validity bitmap for a struct whose every member is a list or
-// has a list descendant. See documentation on DefLevelsToBitmap for more
-// details on when to use this method compared to the other ones defined above.
-void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
- const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output);
-
-// This is exposed to ensure we can properly test a software simulated pext function
-// (i.e. it isn't hidden by runtime dispatch).
-uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/endian.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+namespace internal {
+
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repetition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
+
+ // How many slots an undefined but present (i.e. null) element in
+ // parquet consumes when decoding to Arrow.
+ // "Slot" is used in the same context as the Arrow specification
+ // (i.e. a value holder).
+  // This is only ever >1 for descendants of FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to this value indicate a not-null
+ // value for the field). For list fields definition levels
+ // greater than or equal to this field indicate a present,
+ // possibly null, child value.
+ int16_t def_level = 0;
+
+ // The repetition level corresponding to this element
+ // or the closest repeated ancestor. Any repetition
+ // level less than this indicates either a new list OR
+ // an empty list (which is determined in conjunction
+ // with definition levels).
+ int16_t rep_level = 0;
+
+ // The definition level indicating the level at which the closest
+ // repeated ancestor is not empty. This is used to discriminate
+ // between a value less than |def_level| being null or excluded entirely.
+ // For instance if we have an arrow schema like:
+  // list(struct(f0: int)). Then there are the following
+ // definition levels:
+ // 0 = null list
+ // 1 = present but empty list.
+ // 2 = a null value in the list
+ // 3 = a non null struct but null integer.
+ // 4 = a present integer.
+ // When reconstructing, the struct and integer arrays'
+ // repeated_ancestor_def_level would be 2. Any
+ // def_level < 2 indicates that there isn't a corresponding
+ // child value in the list.
+ // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
+ // has the def levels [0, 1, 2, 3, 4]. The actual
+ // struct array is only of length 3: [not-set, set, set] and
+ // the int array is also of length 3: [N/A, null, 1].
+ //
+ int16_t repeated_ancestor_def_level = 0;
+
+  /// Increments levels according to the cardinality of the node.
+ void Increment(const schema::Node& node) {
+ if (node.is_repeated()) {
+ IncrementRepeated();
+ return;
+ }
+ if (node.is_optional()) {
+ IncrementOptional();
+ return;
+ }
+ }
+
+  /// Increments the definition level for an optional node.
+ void IncrementOptional() { def_level++; }
+
+ /// Increments levels for the repeated node. Returns
+  /// the previous repeated_ancestor_def_level.
+ int16_t IncrementRepeated() {
+ int16_t last_repeated_ancestor = repeated_ancestor_def_level;
+
+ // Repeated fields add both a repetition and definition level. This is used
+ // to distinguish between an empty list and a list with an item in it.
+ ++rep_level;
+ ++def_level;
+    // Levels >= repeated_ancestor_def_level indicate the list was
+    // non-null and had at least one element. This is important
+    // for later decoding because we need to add a slot for these
+    // values. For levels < current_def_level no slots are added
+ // to arrays.
+ repeated_ancestor_def_level = def_level;
+ return last_repeated_ancestor;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
+ if (levels.null_slot_usage > 1) {
+ os << ", null_slot_usage=" << levels.null_slot_usage;
+ }
+ os << "}";
+ return os;
+ }
+};
+
+// Input/Output structure for reconstructed validity bitmaps.
+struct PARQUET_EXPORT ValidityBitmapInputOutput {
+ // Input only.
+ // The maximum number of values_read expected (actual
+ // values read must be less than or equal to this value).
+  // If this number is exceeded, methods will throw a
+ // ParquetException. Exceeding this limit indicates
+ // either a corrupt or incorrectly written file.
+ int64_t values_read_upper_bound = 0;
+  // Output only. The number of values encountered
+  // (this is logically the number of elements
+  // for an Arrow array).
+ int64_t values_read = 0;
+ // Input/Output. The number of nulls encountered.
+ int64_t null_count = 0;
+  // Output only. The validity bitmap to populate. May be null only
+ // for DefRepLevelsToListInfo (if all that is needed is list offsets).
+ uint8_t* valid_bits = NULLPTR;
+ // Input only, offset into valid_bits to start at.
+ int64_t valid_bits_offset = 0;
+};
+
+// Converts def_levels to validity bitmaps for non-list arrays and structs that have
+// at least one member that is not a list and has no list descendants.
+// For lists use DefRepLevelsToList; for structs where all descendants contain
+// a list use DefRepLevelsToBitmap.
+void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+
+// Reconstructs a validity bitmap and list offsets for list arrays based on
+// def/rep levels. The first element of offsets will not be modified if rep_levels
+// starts with a new list. The first element of offsets will be used when calculating
+// the next offset. See documentation on DefLevelsToBitmap for when to use this
+// method vs the other ones in this file for reconstruction.
+//
+// Offsets must be sized to 1 + values_read_upper_bound.
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+ const int16_t* rep_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output,
+ int32_t* offsets);
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+ const int16_t* rep_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output,
+ int64_t* offsets);
+
+// Reconstructs a validity bitmap for a struct whose every member is a list or
+// has a list descendant. See documentation on DefLevelsToBitmap for more
+// details on when to use this method compared to the other ones defined above.
+void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
+ const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+
+// This is exposed to ensure we can properly test a software simulated pext function
+// (i.e. it isn't hidden by runtime dispatch).
+uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
+
+} // namespace internal
+} // namespace parquet
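
To make the list(struct(f0: int)) walkthrough in the comments above concrete,
here is a hedged sketch (ExampleLevelWalk is a hypothetical name, not from the
vendored sources) of how the Increment helpers accumulate the def levels 0..4
described there:

#include "parquet/level_conversion.h"

void ExampleLevelWalk() {
  parquet::internal::LevelInfo info;  // starts at {def=0, rep=0, repeated_ancestor_def=0}
  info.IncrementOptional();           // optional list field  -> def=1
  int16_t prev = info.IncrementRepeated();
  // repeated list node -> def=2, rep=1, repeated_ancestor_def=2; prev == 0
  info.IncrementOptional();           // optional struct      -> def=3
  info.IncrementOptional();           // optional int field   -> def=4
  (void)prev;
  // The struct and int32 children therefore see repeated_ancestor_def_level == 2:
  // any def_level below 2 means the list itself was null or empty, so no child
  // slot exists, matching the [null, [], [null], [{f0: null}], [{f0: 1}]] example.
}
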
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
index 75c7716c483..fd06b7334dd 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
@@ -1,357 +1,357 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-
-#include "parquet/level_conversion.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <limits>
-
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_writer.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/simd.h"
-#include "parquet/exception.h"
-#include "parquet/level_comparison.h"
-
-namespace parquet {
-namespace internal {
-#ifndef PARQUET_IMPL_NAMESPACE
-#error "PARQUET_IMPL_NAMESPACE must be defined"
-#endif
-namespace PARQUET_IMPL_NAMESPACE {
-
-// clang-format off
-/* Python code to generate lookup table:
-
-kLookupBits = 5
-count = 0
-print('constexpr int kLookupBits = {};'.format(kLookupBits))
-print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
-print(' ', end = '')
-for mask in range(1 << kLookupBits):
- for data in range(1 << kLookupBits):
- bit_value = 0
- bit_len = 0
- for i in range(kLookupBits):
- if mask & (1 << i):
- bit_value |= (((data >> i) & 1) << bit_len)
- bit_len += 1
- out = '0x{:02X},'.format(bit_value)
- count += 1
- if count % (1 << kLookupBits) == 1:
- print(' {')
- if count % 8 == 1:
- print(' ', end = '')
- if count % 8 == 0:
- print(out, end = '\n')
- else:
- print(out, end = ' ')
- if count % (1 << kLookupBits) == 0:
- print(' },', end = '')
-print('\n};')
-
-*/
-// clang-format on
-
-constexpr int kLookupBits = 5;
-constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
- 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
- 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
- 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
- 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
- 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
- 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
- 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
- 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
- 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
- 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
- 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
- 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
- 0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
- 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
- 0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
- 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
- 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
- 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
- 0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
- 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
- 0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
- 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
- 0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
- 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
- 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
- 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
- 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
- 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
- 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
- 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
- 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
- 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
- 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
- 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
- 0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
- 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
- 0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
- 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
- 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
- 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
- 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
- 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
- 0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
- 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
- 0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
- 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
- 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
- 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
- 0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
- 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
- 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
- 0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
- 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
- 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
- },
-};
-
-inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
- // A software emulation of _pext_u64
-
- // These checks should be inline and are likely to be common cases.
- if (select_bitmap == ~uint64_t{0}) {
- return bitmap;
- } else if (select_bitmap == 0) {
- return 0;
- }
-
-  // Fall back to the lookup table method
- uint64_t bit_value = 0;
- int bit_len = 0;
- constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
- while (select_bitmap != 0) {
- const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
- const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
- bit_value |= (value << bit_len);
- bit_len += mask_len;
- bitmap >>= kLookupBits;
- select_bitmap >>= kLookupBits;
- }
- return bit_value;
-}
-
-#ifdef ARROW_HAVE_BMI2
-
-// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds.
-#if UINTPTR_MAX == 0xFFFFFFFF
-
-using extract_bitmap_t = uint32_t;
-inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
- extract_bitmap_t select_bitmap) {
- return _pext_u32(bitmap, select_bitmap);
-}
-
-#else
-
-using extract_bitmap_t = uint64_t;
-inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
- extract_bitmap_t select_bitmap) {
- return _pext_u64(bitmap, select_bitmap);
-}
-
-#endif
-
-#else // !defined(ARROW_HAVE_BMI2)
-
-// Use 64-bit pext emulation when BMI2 isn't available.
-using extract_bitmap_t = uint64_t;
-inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
- extract_bitmap_t select_bitmap) {
- return ExtractBitsSoftware(bitmap, select_bitmap);
-}
-
-#endif
-
-static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
-
-template <bool has_repeated_parent>
-int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
- int64_t upper_bound_remaining, LevelInfo level_info,
- ::arrow::internal::FirstTimeBitmapWriter* writer) {
- DCHECK_LE(batch_size, kExtractBitsSize);
-
- // Greater than level_info.def_level - 1 implies >= the def_level
- auto defined_bitmap = static_cast<extract_bitmap_t>(
- internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
-
- if (has_repeated_parent) {
- // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
- // repeated_ancestor_def_level
- auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
- def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
- auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
- int64_t selected_count = ::arrow::BitUtil::PopCount(present_bitmap);
- if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
- throw ParquetException("Values read exceeded upper bound");
- }
- writer->AppendWord(selected_bits, selected_count);
- return ::arrow::BitUtil::PopCount(selected_bits);
- } else {
- if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
- std::stringstream ss;
- ss << "Values read exceeded upper bound";
- throw ParquetException(ss.str());
- }
-
- writer->AppendWord(defined_bitmap, batch_size);
- return ::arrow::BitUtil::PopCount(defined_bitmap);
- }
-}
-
-template <bool has_repeated_parent>
-void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
- LevelInfo level_info, ValidityBitmapInputOutput* output) {
- ::arrow::internal::FirstTimeBitmapWriter writer(
- output->valid_bits,
- /*start_offset=*/output->valid_bits_offset,
- /*length=*/num_def_levels);
- int64_t set_count = 0;
- output->values_read = 0;
- int64_t values_read_remaining = output->values_read_upper_bound;
- while (num_def_levels > kExtractBitsSize) {
- set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
- def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
- def_levels += kExtractBitsSize;
- num_def_levels -= kExtractBitsSize;
- values_read_remaining = output->values_read_upper_bound - writer.position();
- }
- set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
- def_levels, num_def_levels, values_read_remaining, level_info, &writer);
-
- output->values_read = writer.position();
- output->null_count += output->values_read - set_count;
- writer.Finish();
-}
-
-} // namespace PARQUET_IMPL_NAMESPACE
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/simd.h"
+#include "parquet/exception.h"
+#include "parquet/level_comparison.h"
+
+namespace parquet {
+namespace internal {
+#ifndef PARQUET_IMPL_NAMESPACE
+#error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+namespace PARQUET_IMPL_NAMESPACE {
+
+// clang-format off
+/* Python code to generate lookup table:
+
+kLookupBits = 5
+count = 0
+print('constexpr int kLookupBits = {};'.format(kLookupBits))
+print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
+print(' ', end = '')
+for mask in range(1 << kLookupBits):
+ for data in range(1 << kLookupBits):
+ bit_value = 0
+ bit_len = 0
+ for i in range(kLookupBits):
+ if mask & (1 << i):
+ bit_value |= (((data >> i) & 1) << bit_len)
+ bit_len += 1
+ out = '0x{:02X},'.format(bit_value)
+ count += 1
+ if count % (1 << kLookupBits) == 1:
+ print(' {')
+ if count % 8 == 1:
+ print(' ', end = '')
+ if count % 8 == 0:
+ print(out, end = '\n')
+ else:
+ print(out, end = ' ')
+ if count % (1 << kLookupBits) == 0:
+ print(' },', end = '')
+print('\n};')
+
+*/
+// clang-format on
+
+constexpr int kLookupBits = 5;
+constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+ 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+ 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+ 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+ 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+ 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+ 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+ 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+ 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+ 0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+ 0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+ 0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+ 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+ 0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+ 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+ 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
+ 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+ 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
+ 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
+ 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+ 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
+ 0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+ 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
+ 0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+ 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+ 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
+ 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
+ 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+ 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
+ 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+ 0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+ 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
+ 0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
+ 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+ },
+};
+
+inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
+ // A software emulation of _pext_u64
+
+ // These checks should be inline and are likely to be common cases.
+ if (select_bitmap == ~uint64_t{0}) {
+ return bitmap;
+ } else if (select_bitmap == 0) {
+ return 0;
+ }
+
+  // Fall back to the lookup table method
+ uint64_t bit_value = 0;
+ int bit_len = 0;
+ constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
+ while (select_bitmap != 0) {
+ const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
+ const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
+ bit_value |= (value << bit_len);
+ bit_len += mask_len;
+ bitmap >>= kLookupBits;
+ select_bitmap >>= kLookupBits;
+ }
+ return bit_value;
+}
+
+#ifdef ARROW_HAVE_BMI2
+
+// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds.
+#if UINTPTR_MAX == 0xFFFFFFFF
+
+using extract_bitmap_t = uint32_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return _pext_u32(bitmap, select_bitmap);
+}
+
+#else
+
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return _pext_u64(bitmap, select_bitmap);
+}
+
+#endif
+
+#else // !defined(ARROW_HAVE_BMI2)
+
+// Use 64-bit pext emulation when BMI2 isn't available.
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return ExtractBitsSoftware(bitmap, select_bitmap);
+}
+
+#endif
+
+static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
+
+template <bool has_repeated_parent>
+int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
+ int64_t upper_bound_remaining, LevelInfo level_info,
+ ::arrow::internal::FirstTimeBitmapWriter* writer) {
+ DCHECK_LE(batch_size, kExtractBitsSize);
+
+ // Greater than level_info.def_level - 1 implies >= the def_level
+ auto defined_bitmap = static_cast<extract_bitmap_t>(
+ internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
+
+ if (has_repeated_parent) {
+ // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
+ // repeated_ancestor_def_level
+ auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
+ def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
+ auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
+ int64_t selected_count = ::arrow::BitUtil::PopCount(present_bitmap);
+ if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
+ throw ParquetException("Values read exceeded upper bound");
+ }
+ writer->AppendWord(selected_bits, selected_count);
+ return ::arrow::BitUtil::PopCount(selected_bits);
+ } else {
+ if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
+ std::stringstream ss;
+ ss << "Values read exceeded upper bound";
+ throw ParquetException(ss.str());
+ }
+
+ writer->AppendWord(defined_bitmap, batch_size);
+ return ::arrow::BitUtil::PopCount(defined_bitmap);
+ }
+}
+
+template <bool has_repeated_parent>
+void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info, ValidityBitmapInputOutput* output) {
+ ::arrow::internal::FirstTimeBitmapWriter writer(
+ output->valid_bits,
+ /*start_offset=*/output->valid_bits_offset,
+ /*length=*/num_def_levels);
+ int64_t set_count = 0;
+ output->values_read = 0;
+ int64_t values_read_remaining = output->values_read_upper_bound;
+ while (num_def_levels > kExtractBitsSize) {
+ set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+ def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
+ def_levels += kExtractBitsSize;
+ num_def_levels -= kExtractBitsSize;
+ values_read_remaining = output->values_read_upper_bound - writer.position();
+ }
+ set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+ def_levels, num_def_levels, values_read_remaining, level_info, &writer);
+
+ output->values_read = writer.position();
+ output->null_count += output->values_read - set_count;
+ writer.Finish();
+}
+
+} // namespace PARQUET_IMPL_NAMESPACE
+} // namespace internal
+} // namespace parquet
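
A worked example of what the pext emulation above computes (illustrative; the
constants and the name ExamplePext are assumptions). TestOnlyExtractBitsSoftware,
exported from level_conversion.h, packs the bits of bitmap selected by
select_bitmap into the low bits of the result, matching _pext_u64:

#include <cassert>
#include <cstdint>

#include "parquet/level_conversion.h"

void ExamplePext() {
  const uint64_t bitmap = 0b10110010;  // source word
  const uint64_t select = 0b11001100;  // keep bit positions 2, 3, 6 and 7
  const uint64_t packed =
      parquet::internal::TestOnlyExtractBitsSoftware(bitmap, select);
  // The selected bits (positions 2, 3, 6, 7) are 0, 0, 0, 1, so packed == 0b1000.
  // DefLevelsBatchToBitmap uses the same operation to compress the "defined"
  // bitmap by the "present" bitmap for columns with repeated parents.
  assert(packed == 0b1000);
}
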
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
index bd9bf77c42d..1524333702f 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
@@ -1,1783 +1,1783 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/metadata.h"
-
-#include <algorithm>
-#include <cinttypes>
-#include <ostream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/io/memory.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/string_view.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/exception.h"
-#include "parquet/schema.h"
-#include "parquet/schema_internal.h"
-#include "parquet/statistics.h"
-#include "parquet/thrift_internal.h"
-
-namespace parquet {
-
-const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() {
- static ApplicationVersion version("parquet-mr", 1, 8, 0);
- return version;
-}
-
-const ApplicationVersion& ApplicationVersion::PARQUET_816_FIXED_VERSION() {
- static ApplicationVersion version("parquet-mr", 1, 2, 9);
- return version;
-}
-
-const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() {
- static ApplicationVersion version("parquet-cpp", 1, 3, 0);
- return version;
-}
-
-const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
- static ApplicationVersion version("parquet-mr", 1, 10, 0);
- return version;
-}
-
-std::string ParquetVersionToString(ParquetVersion::type ver) {
- switch (ver) {
- case ParquetVersion::PARQUET_1_0:
- return "1.0";
- case ParquetVersion::PARQUET_2_0:
- return "2.0";
- }
-
- // This should be unreachable
- return "UNKNOWN";
-}
-
-template <typename DType>
-static std::shared_ptr<Statistics> MakeTypedColumnStats(
- const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
- // If ColumnOrder is defined, return max_value and min_value
- if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
- return MakeStatistics<DType>(
- descr, metadata.statistics.min_value, metadata.statistics.max_value,
- metadata.num_values - metadata.statistics.null_count,
- metadata.statistics.null_count, metadata.statistics.distinct_count,
- metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value,
- metadata.statistics.__isset.null_count,
- metadata.statistics.__isset.distinct_count);
- }
- // Default behavior
- return MakeStatistics<DType>(
- descr, metadata.statistics.min, metadata.statistics.max,
- metadata.num_values - metadata.statistics.null_count,
- metadata.statistics.null_count, metadata.statistics.distinct_count,
- metadata.statistics.__isset.max || metadata.statistics.__isset.min,
- metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count);
-}
-
-std::shared_ptr<Statistics> MakeColumnStats(const format::ColumnMetaData& meta_data,
- const ColumnDescriptor* descr) {
- switch (static_cast<Type::type>(meta_data.type)) {
- case Type::BOOLEAN:
- return MakeTypedColumnStats<BooleanType>(meta_data, descr);
- case Type::INT32:
- return MakeTypedColumnStats<Int32Type>(meta_data, descr);
- case Type::INT64:
- return MakeTypedColumnStats<Int64Type>(meta_data, descr);
- case Type::INT96:
- return MakeTypedColumnStats<Int96Type>(meta_data, descr);
- case Type::DOUBLE:
- return MakeTypedColumnStats<DoubleType>(meta_data, descr);
- case Type::FLOAT:
- return MakeTypedColumnStats<FloatType>(meta_data, descr);
- case Type::BYTE_ARRAY:
- return MakeTypedColumnStats<ByteArrayType>(meta_data, descr);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return MakeTypedColumnStats<FLBAType>(meta_data, descr);
- case Type::UNDEFINED:
- break;
- }
- throw ParquetException("Can't decode page statistics for selected column type");
-}
-
-// MetaData Accessor
-
-// ColumnCryptoMetaData
-class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl {
- public:
- explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata)
- : crypto_metadata_(crypto_metadata) {}
-
- bool encrypted_with_footer_key() const {
- return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY;
- }
- bool encrypted_with_column_key() const {
- return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY;
- }
- std::shared_ptr<schema::ColumnPath> path_in_schema() const {
- return std::make_shared<schema::ColumnPath>(
- crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
- }
- const std::string& key_metadata() const {
- return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
- }
-
- private:
- const format::ColumnCryptoMetaData* crypto_metadata_;
-};
-
-std::unique_ptr<ColumnCryptoMetaData> ColumnCryptoMetaData::Make(
- const uint8_t* metadata) {
- return std::unique_ptr<ColumnCryptoMetaData>(new ColumnCryptoMetaData(metadata));
-}
-
-ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata)
- : impl_(new ColumnCryptoMetaDataImpl(
- reinterpret_cast<const format::ColumnCryptoMetaData*>(metadata))) {}
-
-ColumnCryptoMetaData::~ColumnCryptoMetaData() = default;
-
-std::shared_ptr<schema::ColumnPath> ColumnCryptoMetaData::path_in_schema() const {
- return impl_->path_in_schema();
-}
-bool ColumnCryptoMetaData::encrypted_with_footer_key() const {
- return impl_->encrypted_with_footer_key();
-}
-const std::string& ColumnCryptoMetaData::key_metadata() const {
- return impl_->key_metadata();
-}
-
-// ColumnChunk metadata
-class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
- public:
- explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
- const ColumnDescriptor* descr,
- int16_t row_group_ordinal, int16_t column_ordinal,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : column_(column), descr_(descr), writer_version_(writer_version) {
- column_metadata_ = &column->meta_data;
- if (column->__isset.crypto_metadata) { // column metadata is encrypted
- format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
-
- if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
- if (file_decryptor != nullptr && file_decryptor->properties() != nullptr) {
- // should decrypt metadata
- std::shared_ptr<schema::ColumnPath> path = std::make_shared<schema::ColumnPath>(
- ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
- std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
-
- std::string aad_column_metadata = encryption::CreateModuleAad(
- file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal,
- column_ordinal, static_cast<int16_t>(-1));
- auto decryptor = file_decryptor->GetColumnMetaDecryptor(
- path->ToDotString(), key_metadata, aad_column_metadata);
- auto len = static_cast<uint32_t>(column->encrypted_column_metadata.size());
- DeserializeThriftMsg(
- reinterpret_cast<const uint8_t*>(column->encrypted_column_metadata.c_str()),
- &len, &decrypted_metadata_, decryptor);
- column_metadata_ = &decrypted_metadata_;
- } else {
- throw ParquetException(
- "Cannot decrypt ColumnMetadata."
- " FileDecryption is not setup correctly");
- }
- }
- }
- for (const auto& encoding : column_metadata_->encodings) {
- encodings_.push_back(LoadEnumSafe(&encoding));
- }
- for (const auto& encoding_stats : column_metadata_->encoding_stats) {
- encoding_stats_.push_back({LoadEnumSafe(&encoding_stats.page_type),
- LoadEnumSafe(&encoding_stats.encoding),
- encoding_stats.count});
- }
- possible_stats_ = nullptr;
- }
-
- bool Equals(const ColumnChunkMetaDataImpl& other) const {
- return *column_metadata_ == *other.column_metadata_;
- }
-
- // column chunk
- inline int64_t file_offset() const { return column_->file_offset; }
- inline const std::string& file_path() const { return column_->file_path; }
-
- inline Type::type type() const { return LoadEnumSafe(&column_metadata_->type); }
-
- inline int64_t num_values() const { return column_metadata_->num_values; }
-
- std::shared_ptr<schema::ColumnPath> path_in_schema() {
- return std::make_shared<schema::ColumnPath>(column_metadata_->path_in_schema);
- }
-
- // Check if statistics are set and are valid
- // 1) Must be set in the metadata
- // 2) Statistics must not be corrupted
- inline bool is_stats_set() const {
- DCHECK(writer_version_ != nullptr);
-    // If the column statistics don't exist or the column sort order is unknown,
-    // we cannot use the column stats
- if (!column_metadata_->__isset.statistics ||
- descr_->sort_order() == SortOrder::UNKNOWN) {
- return false;
- }
- if (possible_stats_ == nullptr) {
- possible_stats_ = MakeColumnStats(*column_metadata_, descr_);
- }
- EncodedStatistics encodedStatistics = possible_stats_->Encode();
- return writer_version_->HasCorrectStatistics(type(), encodedStatistics,
- descr_->sort_order());
- }
-
- inline std::shared_ptr<Statistics> statistics() const {
- return is_stats_set() ? possible_stats_ : nullptr;
- }
-
- inline Compression::type compression() const {
- return LoadEnumSafe(&column_metadata_->codec);
- }
-
- const std::vector<Encoding::type>& encodings() const { return encodings_; }
-
- const std::vector<PageEncodingStats>& encoding_stats() const { return encoding_stats_; }
-
- inline bool has_dictionary_page() const {
- return column_metadata_->__isset.dictionary_page_offset;
- }
-
- inline int64_t dictionary_page_offset() const {
- return column_metadata_->dictionary_page_offset;
- }
-
- inline int64_t data_page_offset() const { return column_metadata_->data_page_offset; }
-
- inline bool has_index_page() const {
- return column_metadata_->__isset.index_page_offset;
- }
-
- inline int64_t index_page_offset() const { return column_metadata_->index_page_offset; }
-
- inline int64_t total_compressed_size() const {
- return column_metadata_->total_compressed_size;
- }
-
- inline int64_t total_uncompressed_size() const {
- return column_metadata_->total_uncompressed_size;
- }
-
- inline std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const {
- if (column_->__isset.crypto_metadata) {
- return ColumnCryptoMetaData::Make(
- reinterpret_cast<const uint8_t*>(&column_->crypto_metadata));
- } else {
- return nullptr;
- }
- }
-
- private:
- mutable std::shared_ptr<Statistics> possible_stats_;
- std::vector<Encoding::type> encodings_;
- std::vector<PageEncodingStats> encoding_stats_;
- const format::ColumnChunk* column_;
- const format::ColumnMetaData* column_metadata_;
- format::ColumnMetaData decrypted_metadata_;
- const ColumnDescriptor* descr_;
- const ApplicationVersion* writer_version_;
-};
-
-std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
- const void* metadata, const ColumnDescriptor* descr,
- const ApplicationVersion* writer_version, int16_t row_group_ordinal,
- int16_t column_ordinal, std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- return std::unique_ptr<ColumnChunkMetaData>(
- new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal,
- writer_version, std::move(file_decryptor)));
-}
-
-ColumnChunkMetaData::ColumnChunkMetaData(
- const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
- int16_t column_ordinal, const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : impl_{new ColumnChunkMetaDataImpl(
- reinterpret_cast<const format::ColumnChunk*>(metadata), descr,
- row_group_ordinal, column_ordinal, writer_version, std::move(file_decryptor))} {
-}
-
-ColumnChunkMetaData::~ColumnChunkMetaData() = default;
-
-// column chunk
-int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); }
-
-const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); }
-
-Type::type ColumnChunkMetaData::type() const { return impl_->type(); }
-
-int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); }
-
-std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const {
- return impl_->path_in_schema();
-}
-
-std::shared_ptr<Statistics> ColumnChunkMetaData::statistics() const {
- return impl_->statistics();
-}
-
-bool ColumnChunkMetaData::is_stats_set() const { return impl_->is_stats_set(); }
-
-bool ColumnChunkMetaData::has_dictionary_page() const {
- return impl_->has_dictionary_page();
-}
-
-int64_t ColumnChunkMetaData::dictionary_page_offset() const {
- return impl_->dictionary_page_offset();
-}
-
-int64_t ColumnChunkMetaData::data_page_offset() const {
- return impl_->data_page_offset();
-}
-
-bool ColumnChunkMetaData::has_index_page() const { return impl_->has_index_page(); }
-
-int64_t ColumnChunkMetaData::index_page_offset() const {
- return impl_->index_page_offset();
-}
-
-Compression::type ColumnChunkMetaData::compression() const {
- return impl_->compression();
-}
-
-bool ColumnChunkMetaData::can_decompress() const {
- return ::arrow::util::Codec::IsAvailable(compression());
-}
-
-const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
- return impl_->encodings();
-}
-
-const std::vector<PageEncodingStats>& ColumnChunkMetaData::encoding_stats() const {
- return impl_->encoding_stats();
-}
-
-int64_t ColumnChunkMetaData::total_uncompressed_size() const {
- return impl_->total_uncompressed_size();
-}
-
-int64_t ColumnChunkMetaData::total_compressed_size() const {
- return impl_->total_compressed_size();
-}
-
-std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() const {
- return impl_->crypto_metadata();
-}
-
-bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
- return impl_->Equals(*other.impl_);
-}
-
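-// A minimal reader-side sketch (illustrative only; `rg_md` is a hypothetical
-// RowGroupMetaData instance): the accessors above can be combined to
-// summarize a column chunk.
-//
-//   std::unique_ptr<ColumnChunkMetaData> cc_md = rg_md->ColumnChunk(0);
-//   std::cout << "num_values: " << cc_md->num_values()
-//             << " compressed: " << cc_md->total_compressed_size()
-//             << " codec available: " << cc_md->can_decompress() << "\n";
-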
-// row-group metadata
-class RowGroupMetaData::RowGroupMetaDataImpl {
- public:
- explicit RowGroupMetaDataImpl(const format::RowGroup* row_group,
- const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : row_group_(row_group),
- schema_(schema),
- writer_version_(writer_version),
- file_decryptor_(std::move(file_decryptor)) {}
-
- bool Equals(const RowGroupMetaDataImpl& other) const {
- return *row_group_ == *other.row_group_;
- }
-
- inline int num_columns() const { return static_cast<int>(row_group_->columns.size()); }
-
- inline int64_t num_rows() const { return row_group_->num_rows; }
-
- inline int64_t total_byte_size() const { return row_group_->total_byte_size; }
-
- inline int64_t total_compressed_size() const {
- return row_group_->total_compressed_size;
- }
-
- inline int64_t file_offset() const { return row_group_->file_offset; }
-
- inline const SchemaDescriptor* schema() const { return schema_; }
-
- std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
- if (i < num_columns()) {
- return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i),
- writer_version_, row_group_->ordinal,
- static_cast<int16_t>(i), file_decryptor_);
- }
- throw ParquetException("The file only has ", num_columns(),
- " columns, requested metadata for column: ", i);
- }
-
- private:
- const format::RowGroup* row_group_;
- const SchemaDescriptor* schema_;
- const ApplicationVersion* writer_version_;
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-};
-
-std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(
- const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- return std::unique_ptr<RowGroupMetaData>(
- new RowGroupMetaData(metadata, schema, writer_version, std::move(file_decryptor)));
-}
-
-RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : impl_{new RowGroupMetaDataImpl(reinterpret_cast<const format::RowGroup*>(metadata),
- schema, writer_version, std::move(file_decryptor))} {
-}
-
-RowGroupMetaData::~RowGroupMetaData() = default;
-
-bool RowGroupMetaData::Equals(const RowGroupMetaData& other) const {
- return impl_->Equals(*other.impl_);
-}
-
-int RowGroupMetaData::num_columns() const { return impl_->num_columns(); }
-
-int64_t RowGroupMetaData::num_rows() const { return impl_->num_rows(); }
-
-int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_size(); }
-
-int64_t RowGroupMetaData::total_compressed_size() const {
- return impl_->total_compressed_size();
-}
-
-int64_t RowGroupMetaData::file_offset() const { return impl_->file_offset(); }
-
-const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); }
-
-std::unique_ptr<ColumnChunkMetaData> RowGroupMetaData::ColumnChunk(int i) const {
- return impl_->ColumnChunk(i);
-}
-
-bool RowGroupMetaData::can_decompress() const {
- int n_columns = num_columns();
- for (int i = 0; i < n_columns; i++) {
- if (!ColumnChunk(i)->can_decompress()) {
- return false;
- }
- }
- return true;
-}
-
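-// Illustrative guard (a sketch, not part of the public examples): callers can
-// use can_decompress() to fail fast before reading a row group whose codec
-// support was not compiled in. `rg_md` is a hypothetical RowGroupMetaData.
-//
-//   if (!rg_md->can_decompress()) {
-//     throw ParquetException("A column codec is unavailable in this build");
-//   }
-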
-// file metadata
-class FileMetaData::FileMetaDataImpl {
- public:
- FileMetaDataImpl() = default;
-
- explicit FileMetaDataImpl(
- const void* metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
- : file_decryptor_(file_decryptor) {
- metadata_.reset(new format::FileMetaData);
-
- auto footer_decryptor =
- file_decryptor_ != nullptr ? file_decryptor->GetFooterDecryptor() : nullptr;
-
- DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(metadata), metadata_len,
- metadata_.get(), footer_decryptor);
- metadata_len_ = *metadata_len;
-
- if (metadata_->__isset.created_by) {
- writer_version_ = ApplicationVersion(metadata_->created_by);
- } else {
- writer_version_ = ApplicationVersion("unknown 0.0.0");
- }
-
- InitSchema();
- InitColumnOrders();
- InitKeyValueMetadata();
- }
-
- bool VerifySignature(const void* signature) {
- // verify decryption properties are set
- if (file_decryptor_ == nullptr) {
- throw ParquetException("Decryption not set properly. cannot verify signature");
- }
- // serialize the footer
- uint8_t* serialized_data;
- uint32_t serialized_len = metadata_len_;
- ThriftSerializer serializer;
- serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
-
- // encrypt with nonce
- auto nonce = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature));
- auto tag = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature)) +
- encryption::kNonceLength;
-
- std::string key = file_decryptor_->GetFooterKey();
- std::string aad = encryption::CreateFooterAad(file_decryptor_->file_aad());
-
- auto aes_encryptor = encryption::AesEncryptor::Make(
- file_decryptor_->algorithm(), static_cast<int>(key.size()), true, nullptr);
-
- std::shared_ptr<Buffer> encrypted_buffer = std::static_pointer_cast<ResizableBuffer>(
- AllocateBuffer(file_decryptor_->pool(),
- aes_encryptor->CiphertextSizeDelta() + serialized_len));
- uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
- serialized_data, serialized_len, str2bytes(key), static_cast<int>(key.size()),
- str2bytes(aad), static_cast<int>(aad.size()), nonce,
- encrypted_buffer->mutable_data());
- // Delete AES encryptor object. It was created only to verify the footer signature.
- aes_encryptor->WipeOut();
- delete aes_encryptor;
- return 0 ==
- memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
- tag, encryption::kGcmTagLength);
- }
-
- inline uint32_t size() const { return metadata_len_; }
- inline int num_columns() const { return schema_.num_columns(); }
- inline int64_t num_rows() const { return metadata_->num_rows; }
- inline int num_row_groups() const {
- return static_cast<int>(metadata_->row_groups.size());
- }
- inline int32_t version() const { return metadata_->version; }
- inline const std::string& created_by() const { return metadata_->created_by; }
- inline int num_schema_elements() const {
- return static_cast<int>(metadata_->schema.size());
- }
-
- inline bool is_encryption_algorithm_set() const {
- return metadata_->__isset.encryption_algorithm;
- }
- inline EncryptionAlgorithm encryption_algorithm() {
- return FromThrift(metadata_->encryption_algorithm);
- }
- inline const std::string& footer_signing_key_metadata() {
- return metadata_->footer_signing_key_metadata;
- }
-
- const ApplicationVersion& writer_version() const { return writer_version_; }
-
- void WriteTo(::arrow::io::OutputStream* dst,
- const std::shared_ptr<Encryptor>& encryptor) const {
- ThriftSerializer serializer;
-    // Only in encrypted files with plaintext footers is the
-    // encryption_algorithm set in the footer.
- if (is_encryption_algorithm_set()) {
- uint8_t* serialized_data;
- uint32_t serialized_len;
- serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
-
-      // encrypt the serialized footer
- std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
- serialized_len);
- unsigned encrypted_len =
- encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
-
- // write unencrypted footer
- PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len));
-      // Write signature (nonce and tag); the nonce follows the 4-byte
-      // ciphertext length prefix
- PARQUET_THROW_NOT_OK(
- dst->Write(encrypted_data.data() + 4, encryption::kNonceLength));
- PARQUET_THROW_NOT_OK(
- dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength,
- encryption::kGcmTagLength));
- } else { // either plaintext file (when encryptor is null)
- // or encrypted file with encrypted footer
- serializer.Serialize(metadata_.get(), dst, encryptor);
- }
- }
-
- std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
- if (!(i < num_row_groups())) {
- std::stringstream ss;
- ss << "The file only has " << num_row_groups()
- << " row groups, requested metadata for row group: " << i;
- throw ParquetException(ss.str());
- }
- return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_,
- file_decryptor_);
- }
-
- bool Equals(const FileMetaDataImpl& other) const {
- return *metadata_ == *other.metadata_;
- }
-
- const SchemaDescriptor* schema() const { return &schema_; }
-
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
- return key_value_metadata_;
- }
-
- void set_file_path(const std::string& path) {
- for (format::RowGroup& row_group : metadata_->row_groups) {
- for (format::ColumnChunk& chunk : row_group.columns) {
- chunk.__set_file_path(path);
- }
- }
- }
-
- format::RowGroup& row_group(int i) {
- DCHECK_LT(i, num_row_groups());
- return metadata_->row_groups[i];
- }
-
- void AppendRowGroups(const std::unique_ptr<FileMetaDataImpl>& other) {
- if (!schema()->Equals(*other->schema())) {
- throw ParquetException("AppendRowGroups requires equal schemas.");
- }
-
- format::RowGroup other_rg;
- for (int i = 0; i < other->num_row_groups(); i++) {
- other_rg = other->row_group(i);
- metadata_->row_groups.push_back(other_rg);
- metadata_->num_rows += other_rg.num_rows;
- }
- }
-
- std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) {
- for (int i : row_groups) {
- if (i < num_row_groups()) continue;
-
- throw ParquetException(
- "The file only has ", num_row_groups(),
- " row groups, but requested a subset including row group: ", i);
- }
-
- std::shared_ptr<FileMetaData> out(new FileMetaData());
- out->impl_.reset(new FileMetaDataImpl());
- out->impl_->metadata_.reset(new format::FileMetaData());
-
- auto metadata = out->impl_->metadata_.get();
- metadata->version = metadata_->version;
- metadata->schema = metadata_->schema;
-
- metadata->row_groups.resize(row_groups.size());
- int i = 0;
- for (int selected_index : row_groups) {
- metadata->num_rows += row_group(selected_index).num_rows;
- metadata->row_groups[i++] = row_group(selected_index);
- }
-
- metadata->key_value_metadata = metadata_->key_value_metadata;
- metadata->created_by = metadata_->created_by;
- metadata->column_orders = metadata_->column_orders;
- metadata->encryption_algorithm = metadata_->encryption_algorithm;
- metadata->footer_signing_key_metadata = metadata_->footer_signing_key_metadata;
- metadata->__isset = metadata_->__isset;
-
- out->impl_->schema_ = schema_;
- out->impl_->writer_version_ = writer_version_;
- out->impl_->key_value_metadata_ = key_value_metadata_;
- out->impl_->file_decryptor_ = file_decryptor_;
-
- return out;
- }
-
- void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- file_decryptor_ = file_decryptor;
- }
-
- private:
- friend FileMetaDataBuilder;
- uint32_t metadata_len_ = 0;
- std::unique_ptr<format::FileMetaData> metadata_;
- SchemaDescriptor schema_;
- ApplicationVersion writer_version_;
- std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-
- void InitSchema() {
- if (metadata_->schema.empty()) {
- throw ParquetException("Empty file schema (no root)");
- }
- schema_.Init(schema::Unflatten(&metadata_->schema[0],
- static_cast<int>(metadata_->schema.size())));
- }
-
- void InitColumnOrders() {
- // update ColumnOrder
- std::vector<parquet::ColumnOrder> column_orders;
- if (metadata_->__isset.column_orders) {
- for (auto column_order : metadata_->column_orders) {
- if (column_order.__isset.TYPE_ORDER) {
- column_orders.push_back(ColumnOrder::type_defined_);
- } else {
- column_orders.push_back(ColumnOrder::undefined_);
- }
- }
- } else {
- column_orders.resize(schema_.num_columns(), ColumnOrder::undefined_);
- }
-
- schema_.updateColumnOrders(column_orders);
- }
-
- void InitKeyValueMetadata() {
- std::shared_ptr<KeyValueMetadata> metadata = nullptr;
- if (metadata_->__isset.key_value_metadata) {
- metadata = std::make_shared<KeyValueMetadata>();
- for (const auto& it : metadata_->key_value_metadata) {
- metadata->Append(it.key, it.value);
- }
- }
- key_value_metadata_ = std::move(metadata);
- }
-};
-
-std::shared_ptr<FileMetaData> FileMetaData::Make(
- const void* metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- // This FileMetaData ctor is private, not compatible with std::make_shared
- return std::shared_ptr<FileMetaData>(
- new FileMetaData(metadata, metadata_len, file_decryptor));
-}
-
-FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : impl_{std::unique_ptr<FileMetaDataImpl>(
- new FileMetaDataImpl(metadata, metadata_len, file_decryptor))} {}
-
-FileMetaData::FileMetaData()
- : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {}
-
-FileMetaData::~FileMetaData() = default;
-
-bool FileMetaData::Equals(const FileMetaData& other) const {
- return impl_->Equals(*other.impl_);
-}
-
-std::unique_ptr<RowGroupMetaData> FileMetaData::RowGroup(int i) const {
- return impl_->RowGroup(i);
-}
-
-bool FileMetaData::VerifySignature(const void* signature) {
- return impl_->VerifySignature(signature);
-}
-
-uint32_t FileMetaData::size() const { return impl_->size(); }
-
-int FileMetaData::num_columns() const { return impl_->num_columns(); }
-
-int64_t FileMetaData::num_rows() const { return impl_->num_rows(); }
-
-int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); }
-
-bool FileMetaData::can_decompress() const {
- int n_row_groups = num_row_groups();
- for (int i = 0; i < n_row_groups; i++) {
- if (!RowGroup(i)->can_decompress()) {
- return false;
- }
- }
- return true;
-}
-
-bool FileMetaData::is_encryption_algorithm_set() const {
- return impl_->is_encryption_algorithm_set();
-}
-
-EncryptionAlgorithm FileMetaData::encryption_algorithm() const {
- return impl_->encryption_algorithm();
-}
-
-const std::string& FileMetaData::footer_signing_key_metadata() const {
- return impl_->footer_signing_key_metadata();
-}
-
-void FileMetaData::set_file_decryptor(
- std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- impl_->set_file_decryptor(file_decryptor);
-}
-
-ParquetVersion::type FileMetaData::version() const {
- switch (impl_->version()) {
- case 1:
- return ParquetVersion::PARQUET_1_0;
- case 2:
- return ParquetVersion::PARQUET_2_0;
- default:
- // Improperly set version, assuming Parquet 1.0
- break;
- }
- return ParquetVersion::PARQUET_1_0;
-}
-
-const ApplicationVersion& FileMetaData::writer_version() const {
- return impl_->writer_version();
-}
-
-const std::string& FileMetaData::created_by() const { return impl_->created_by(); }
-
-int FileMetaData::num_schema_elements() const { return impl_->num_schema_elements(); }
-
-const SchemaDescriptor* FileMetaData::schema() const { return impl_->schema(); }
-
-const std::shared_ptr<const KeyValueMetadata>& FileMetaData::key_value_metadata() const {
- return impl_->key_value_metadata();
-}
-
-void FileMetaData::set_file_path(const std::string& path) { impl_->set_file_path(path); }
-
-void FileMetaData::AppendRowGroups(const FileMetaData& other) {
- impl_->AppendRowGroups(other.impl_);
-}
-
-std::shared_ptr<FileMetaData> FileMetaData::Subset(
- const std::vector<int>& row_groups) const {
- return impl_->Subset(row_groups);
-}
-
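-// Illustrative use of Subset() (a sketch; `file_md` is a hypothetical
-// std::shared_ptr<FileMetaData>): build a metadata view restricted to
-// selected row groups, e.g. for parallel or partial reads.
-//
-//   std::shared_ptr<FileMetaData> subset = file_md->Subset({0, 2});
-//   assert(subset->num_row_groups() == 2);
-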
-void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
- const std::shared_ptr<Encryptor>& encryptor) const {
- return impl_->WriteTo(dst, encryptor);
-}
-
-class FileCryptoMetaData::FileCryptoMetaDataImpl {
- public:
- FileCryptoMetaDataImpl() = default;
-
- explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) {
- metadata_.reset(new format::FileCryptoMetaData);
- DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
- metadata_len_ = *metadata_len;
- }
-
- EncryptionAlgorithm encryption_algorithm() {
- return FromThrift(metadata_->encryption_algorithm);
- }
- const std::string& key_metadata() { return metadata_->key_metadata; }
- void WriteTo(::arrow::io::OutputStream* dst) const {
- ThriftSerializer serializer;
- serializer.Serialize(metadata_.get(), dst);
- }
-
- private:
- friend FileMetaDataBuilder;
- std::unique_ptr<format::FileCryptoMetaData> metadata_;
- uint32_t metadata_len_;
-};
-
-EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const {
- return impl_->encryption_algorithm();
-}
-
-const std::string& FileCryptoMetaData::key_metadata() const {
- return impl_->key_metadata();
-}
-
-std::shared_ptr<FileCryptoMetaData> FileCryptoMetaData::Make(
- const uint8_t* serialized_metadata, uint32_t* metadata_len) {
- return std::shared_ptr<FileCryptoMetaData>(
- new FileCryptoMetaData(serialized_metadata, metadata_len));
-}
-
-FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata,
- uint32_t* metadata_len)
- : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {}
-
-FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {}
-
-FileCryptoMetaData::~FileCryptoMetaData() = default;
-
-void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const {
- impl_->WriteTo(dst);
-}
-
-std::string FileMetaData::SerializeToString() const {
-  // We need to pass in an initial size; since the stream grows its buffer
-  // automatically to hold the metadata, we just leave it at 0.
- PARQUET_ASSIGN_OR_THROW(auto serializer, ::arrow::io::BufferOutputStream::Create(0));
- WriteTo(serializer.get());
- PARQUET_ASSIGN_OR_THROW(auto metadata_buffer, serializer->Finish());
- return metadata_buffer->ToString();
-}
-
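-// A round-trip sketch (illustrative only; `md` is a hypothetical, fully
-// populated FileMetaData): the Thrift bytes produced by SerializeToString()
-// can be fed back through FileMetaData::Make().
-//
-//   std::string footer = md->SerializeToString();
-//   uint32_t len = static_cast<uint32_t>(footer.size());
-//   std::shared_ptr<FileMetaData> copy =
-//       FileMetaData::Make(footer.data(), &len, nullptr);
-//   assert(copy->Equals(*md));
-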
-ApplicationVersion::ApplicationVersion(std::string application, int major, int minor,
- int patch)
- : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
-
-namespace {
-// Parse the application version format and set parsed values to
-// ApplicationVersion.
-//
-// The application version format must be compatible with
-// parquet-mr's. See also:
-// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
-// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
-//
-// The application version format:
-// "${APPLICATION_NAME}"
-// "${APPLICATION_NAME} version ${VERSION}"
-// "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
-//
-// Eg:
-// parquet-cpp
-// parquet-cpp version 1.5.0ab-xyz5.5.0+cd
-// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
-//
-// The VERSION format:
-// "${MAJOR}"
-// "${MAJOR}.${MINOR}"
-// "${MAJOR}.${MINOR}.${PATCH}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
-// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
-// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
-// "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
-//
-// Eg:
-// 1
-// 1.5
-// 1.5.0
-// 1.5.0ab
-// 1.5.0ab-cdh5.5.0
-// 1.5.0ab-cdh5.5.0+cd
-// 1.5.0ab+cd
-// 1.5.0-cdh5.5.0
-// 1.5.0-cdh5.5.0+cd
-// 1.5.0+cd
-class ApplicationVersionParser {
- public:
- ApplicationVersionParser(const std::string& created_by,
- ApplicationVersion& application_version)
- : created_by_(created_by),
- application_version_(application_version),
- spaces_(" \t\v\r\n\f"),
- digits_("0123456789") {}
-
- void Parse() {
- application_version_.application_ = "unknown";
- application_version_.version = {0, 0, 0, "", "", ""};
-
- if (!ParseApplicationName()) {
- return;
- }
- if (!ParseVersion()) {
- return;
- }
- if (!ParseBuildName()) {
- return;
- }
- }
-
- private:
- bool IsSpace(const std::string& string, const size_t& offset) {
- auto target = ::arrow::util::string_view(string).substr(offset, 1);
- return target.find_first_of(spaces_) != ::arrow::util::string_view::npos;
- }
-
- void RemovePrecedingSpaces(const std::string& string, size_t& start,
- const size_t& end) {
- while (start < end && IsSpace(string, start)) {
- ++start;
- }
- }
-
- void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
- while (start < (end - 1) && (end - 1) < string.size() && IsSpace(string, end - 1)) {
- --end;
- }
- }
-
- bool ParseApplicationName() {
- std::string version_mark(" version ");
- auto version_mark_position = created_by_.find(version_mark);
- size_t application_name_end;
-    // Neither VERSION nor BUILD_NAME.
- if (version_mark_position == std::string::npos) {
- version_start_ = std::string::npos;
- application_name_end = created_by_.size();
- } else {
- version_start_ = version_mark_position + version_mark.size();
- application_name_end = version_mark_position;
- }
-
- size_t application_name_start = 0;
- RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
- RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
- application_version_.application_ = created_by_.substr(
- application_name_start, application_name_end - application_name_start);
-
- return true;
- }
-
- bool ParseVersion() {
- // No VERSION.
- if (version_start_ == std::string::npos) {
- return false;
- }
-
- RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
- version_end_ = created_by_.find(" (", version_start_);
- // No BUILD_NAME.
- if (version_end_ == std::string::npos) {
- version_end_ = created_by_.size();
- }
- RemoveTrailingSpaces(created_by_, version_start_, version_end_);
- // No VERSION.
- if (version_start_ == version_end_) {
- return false;
- }
- version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
-
- if (!ParseVersionMajor()) {
- return false;
- }
- if (!ParseVersionMinor()) {
- return false;
- }
- if (!ParseVersionPatch()) {
- return false;
- }
- if (!ParseVersionUnknown()) {
- return false;
- }
- if (!ParseVersionPreRelease()) {
- return false;
- }
- if (!ParseVersionBuildInfo()) {
- return false;
- }
-
- return true;
- }
-
- bool ParseVersionMajor() {
- size_t version_major_start = 0;
- auto version_major_end = version_string_.find_first_not_of(digits_);
- // MAJOR only.
- if (version_major_end == std::string::npos) {
- version_major_end = version_string_.size();
- version_parsing_position_ = version_major_end;
- } else {
- // No ".".
- if (version_string_[version_major_end] != '.') {
- return false;
- }
- // No MAJOR.
- if (version_major_end == version_major_start) {
- return false;
- }
- version_parsing_position_ = version_major_end + 1; // +1 is for '.'.
- }
- auto version_major_string = version_string_.substr(
- version_major_start, version_major_end - version_major_start);
- application_version_.version.major = atoi(version_major_string.c_str());
- return true;
- }
-
- bool ParseVersionMinor() {
- auto version_minor_start = version_parsing_position_;
- auto version_minor_end =
- version_string_.find_first_not_of(digits_, version_minor_start);
- // MAJOR.MINOR only.
- if (version_minor_end == std::string::npos) {
- version_minor_end = version_string_.size();
- version_parsing_position_ = version_minor_end;
- } else {
- // No ".".
- if (version_string_[version_minor_end] != '.') {
- return false;
- }
- // No MINOR.
- if (version_minor_end == version_minor_start) {
- return false;
- }
- version_parsing_position_ = version_minor_end + 1; // +1 is for '.'.
- }
- auto version_minor_string = version_string_.substr(
- version_minor_start, version_minor_end - version_minor_start);
- application_version_.version.minor = atoi(version_minor_string.c_str());
- return true;
- }
-
- bool ParseVersionPatch() {
- auto version_patch_start = version_parsing_position_;
- auto version_patch_end =
- version_string_.find_first_not_of(digits_, version_patch_start);
-    // No UNKNOWN, PRE_RELEASE, or BUILD_INFO.
- if (version_patch_end == std::string::npos) {
- version_patch_end = version_string_.size();
- }
- // No PATCH.
- if (version_patch_end == version_patch_start) {
- return false;
- }
- auto version_patch_string = version_string_.substr(
- version_patch_start, version_patch_end - version_patch_start);
- application_version_.version.patch = atoi(version_patch_string.c_str());
- version_parsing_position_ = version_patch_end;
- return true;
- }
-
- bool ParseVersionUnknown() {
- // No UNKNOWN.
- if (version_parsing_position_ == version_string_.size()) {
- return true;
- }
- auto version_unknown_start = version_parsing_position_;
- auto version_unknown_end = version_string_.find_first_of("-+", version_unknown_start);
-    // No PRE_RELEASE or BUILD_INFO.
- if (version_unknown_end == std::string::npos) {
- version_unknown_end = version_string_.size();
- }
- application_version_.version.unknown = version_string_.substr(
- version_unknown_start, version_unknown_end - version_unknown_start);
- version_parsing_position_ = version_unknown_end;
- return true;
- }
-
- bool ParseVersionPreRelease() {
- // No PRE_RELEASE.
- if (version_parsing_position_ == version_string_.size() ||
- version_string_[version_parsing_position_] != '-') {
- return true;
- }
-
- auto version_pre_release_start = version_parsing_position_ + 1; // +1 is for '-'.
- auto version_pre_release_end =
- version_string_.find_first_of("+", version_pre_release_start);
- // No BUILD_INFO
- if (version_pre_release_end == std::string::npos) {
- version_pre_release_end = version_string_.size();
- }
- application_version_.version.pre_release = version_string_.substr(
- version_pre_release_start, version_pre_release_end - version_pre_release_start);
- version_parsing_position_ = version_pre_release_end;
- return true;
- }
-
- bool ParseVersionBuildInfo() {
- // No BUILD_INFO.
- if (version_parsing_position_ == version_string_.size() ||
- version_string_[version_parsing_position_] != '+') {
- return true;
- }
-
- auto version_build_info_start = version_parsing_position_ + 1; // +1 is for '+'.
- application_version_.version.build_info =
- version_string_.substr(version_build_info_start);
- return true;
- }
-
- bool ParseBuildName() {
- std::string build_mark(" (build ");
- auto build_mark_position = created_by_.find(build_mark, version_end_);
- // No BUILD_NAME.
- if (build_mark_position == std::string::npos) {
- return false;
- }
- auto build_name_start = build_mark_position + build_mark.size();
- RemovePrecedingSpaces(created_by_, build_name_start, created_by_.size());
- auto build_name_end = created_by_.find_first_of(")", build_name_start);
- // No end ")".
- if (build_name_end == std::string::npos) {
- return false;
- }
- RemoveTrailingSpaces(created_by_, build_name_start, build_name_end);
- application_version_.build_ =
- created_by_.substr(build_name_start, build_name_end - build_name_start);
-
- return true;
- }
-
- const std::string& created_by_;
- ApplicationVersion& application_version_;
-
- // For parsing.
- std::string spaces_;
- std::string digits_;
- size_t version_parsing_position_;
- size_t version_start_;
- size_t version_end_;
- std::string version_string_;
-};
-} // namespace
-
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
- ApplicationVersionParser parser(created_by, *this);
- parser.Parse();
-}
-
-bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
- if (application_ != other_version.application_) return false;
-
- if (version.major < other_version.version.major) return true;
- if (version.major > other_version.version.major) return false;
- DCHECK_EQ(version.major, other_version.version.major);
- if (version.minor < other_version.version.minor) return true;
- if (version.minor > other_version.version.minor) return false;
- DCHECK_EQ(version.minor, other_version.version.minor);
- return version.patch < other_version.version.patch;
-}
-
-bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const {
- return application_ == other_version.application_ &&
- version.major == other_version.version.major &&
- version.minor == other_version.version.minor &&
- version.patch == other_version.version.patch;
-}
-
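-// For example (illustrative): given a = ApplicationVersion("parquet-mr", 1, 8, 0)
-// and b = ApplicationVersion("parquet-mr", 1, 10, 0), a.VersionLt(b) is true and
-// a.VersionEq(b) is false; versions of *different* applications never compare
-// as less-than or equal, regardless of their numbers.
-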
-// Reference:
-// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
-// PARQUET-686 has more discussion on statistics
-bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
- EncodedStatistics& statistics,
- SortOrder::type sort_order) const {
- // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed
- // correctly for all types
- if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
- (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
- // Only SIGNED are valid unless max and min are the same
- // (in which case the sort order does not matter)
- bool max_equals_min = statistics.has_min && statistics.has_max
- ? statistics.min() == statistics.max()
- : false;
- if (SortOrder::SIGNED != sort_order && !max_equals_min) {
- return false;
- }
-
- // Statistics of other types are OK
- if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
- return true;
- }
- }
-  // created_by is not populated, which could have been written by
-  // parquet-mr in the same era as PARQUET-251; see PARQUET-297
- if (application_ == "unknown") {
- return true;
- }
-
- // Unknown sort order has incorrect stats
- if (SortOrder::UNKNOWN == sort_order) {
- return false;
- }
-
- // PARQUET-251
- if (VersionLt(PARQUET_251_FIXED_VERSION())) {
- return false;
- }
-
- return true;
-}
-
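-// Illustrative consequence of the rules above: a BYTE_ARRAY column written by
-// "parquet-mr version 1.8.0" with SortOrder::SIGNED is accepted (PARQUET-251
-// was fixed in 1.8.0), while the same column written by parquet-mr 1.7.x is
-// rejected by the final VersionLt(PARQUET_251_FIXED_VERSION()) check.
-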
-// MetaData Builders
-// row-group metadata
-class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
- public:
- explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column)
- : owned_column_chunk_(new format::ColumnChunk),
- properties_(std::move(props)),
- column_(column) {
- Init(owned_column_chunk_.get());
- }
-
- explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column,
- format::ColumnChunk* column_chunk)
- : properties_(std::move(props)), column_(column) {
- Init(column_chunk);
- }
-
- const void* contents() const { return column_chunk_; }
-
- // column chunk
- void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
-
- // column metadata
- void SetStatistics(const EncodedStatistics& val) {
- column_chunk_->meta_data.__set_statistics(ToThrift(val));
- }
-
- void Finish(int64_t num_values, int64_t dictionary_page_offset,
- int64_t index_page_offset, int64_t data_page_offset,
- int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
- bool dictionary_fallback,
- const std::map<Encoding::type, int32_t>& dict_encoding_stats,
- const std::map<Encoding::type, int32_t>& data_encoding_stats,
- const std::shared_ptr<Encryptor>& encryptor) {
- if (dictionary_page_offset > 0) {
- column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
- column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
- } else {
- column_chunk_->__set_file_offset(data_page_offset + compressed_size);
- }
- column_chunk_->__isset.meta_data = true;
- column_chunk_->meta_data.__set_num_values(num_values);
- if (index_page_offset >= 0) {
- column_chunk_->meta_data.__set_index_page_offset(index_page_offset);
- }
- column_chunk_->meta_data.__set_data_page_offset(data_page_offset);
- column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size);
- column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
-
- std::vector<format::Encoding::type> thrift_encodings;
- if (has_dictionary) {
- thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
- if (properties_->version() == ParquetVersion::PARQUET_1_0) {
- thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
- } else {
- thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding()));
- }
- } else { // Dictionary not enabled
- thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path())));
- }
- thrift_encodings.push_back(ToThrift(Encoding::RLE));
- // Only PLAIN encoding is supported for fallback in V1
- // TODO(majetideepak): Use user specified encoding for V2
- if (dictionary_fallback) {
- thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
- }
- column_chunk_->meta_data.__set_encodings(thrift_encodings);
- std::vector<format::PageEncodingStats> thrift_encoding_stats;
- // Add dictionary page encoding stats
- for (const auto& entry : dict_encoding_stats) {
- format::PageEncodingStats dict_enc_stat;
- dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
- dict_enc_stat.__set_encoding(ToThrift(entry.first));
- dict_enc_stat.__set_count(entry.second);
- thrift_encoding_stats.push_back(dict_enc_stat);
- }
- // Add data page encoding stats
- for (const auto& entry : data_encoding_stats) {
- format::PageEncodingStats data_enc_stat;
- data_enc_stat.__set_page_type(format::PageType::DATA_PAGE);
- data_enc_stat.__set_encoding(ToThrift(entry.first));
- data_enc_stat.__set_count(entry.second);
- thrift_encoding_stats.push_back(data_enc_stat);
- }
- column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
-
- const auto& encrypt_md =
- properties_->column_encryption_properties(column_->path()->ToDotString());
- // column is encrypted
- if (encrypt_md != nullptr && encrypt_md->is_encrypted()) {
- column_chunk_->__isset.crypto_metadata = true;
- format::ColumnCryptoMetaData ccmd;
- if (encrypt_md->is_encrypted_with_footer_key()) {
- // encrypted with footer key
- ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
- ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey());
- } else { // encrypted with column key
- format::EncryptionWithColumnKey eck;
- eck.__set_key_metadata(encrypt_md->key_metadata());
- eck.__set_path_in_schema(column_->path()->ToDotVector());
- ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
- ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
- }
- column_chunk_->__set_crypto_metadata(ccmd);
-
- bool encrypted_footer =
- properties_->file_encryption_properties()->encrypted_footer();
- bool encrypt_metadata =
- !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key();
- if (encrypt_metadata) {
- ThriftSerializer serializer;
-        // Serialize and encrypt the ColumnMetaData structure separately:
-        // Thrift-serialize it, encrypt it with the column key, and store the
-        // result in encrypted_column_metadata.
- uint8_t* serialized_data;
- uint32_t serialized_len;
-
- serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len,
- &serialized_data);
-
- std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
- serialized_len);
- unsigned encrypted_len =
- encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
-
- const char* temp =
- const_cast<const char*>(reinterpret_cast<char*>(encrypted_data.data()));
- std::string encrypted_column_metadata(temp, encrypted_len);
- column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata);
-
- if (encrypted_footer) {
- column_chunk_->__isset.meta_data = false;
- } else {
- // Keep redacted metadata version for old readers
- column_chunk_->__isset.meta_data = true;
- column_chunk_->meta_data.__isset.statistics = false;
- column_chunk_->meta_data.__isset.encoding_stats = false;
- }
- }
- }
- }
-
- void WriteTo(::arrow::io::OutputStream* sink) {
- ThriftSerializer serializer;
- serializer.Serialize(column_chunk_, sink);
- }
-
- const ColumnDescriptor* descr() const { return column_; }
- int64_t total_compressed_size() const {
- return column_chunk_->meta_data.total_compressed_size;
- }
-
- private:
- void Init(format::ColumnChunk* column_chunk) {
- column_chunk_ = column_chunk;
-
- column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type()));
- column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector());
- column_chunk_->meta_data.__set_codec(
- ToThrift(properties_->compression(column_->path())));
- }
-
- format::ColumnChunk* column_chunk_;
- std::unique_ptr<format::ColumnChunk> owned_column_chunk_;
- const std::shared_ptr<WriterProperties> properties_;
- const ColumnDescriptor* column_;
-};
-
-std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
- void* contents) {
- return std::unique_ptr<ColumnChunkMetaDataBuilder>(
- new ColumnChunkMetaDataBuilder(std::move(props), column, contents));
-}
-
-std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column) {
- return std::unique_ptr<ColumnChunkMetaDataBuilder>(
- new ColumnChunkMetaDataBuilder(std::move(props), column));
-}
-
-ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column)
- : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
- new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {}
-
-ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
- void* contents)
- : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
- new ColumnChunkMetaDataBuilderImpl(
- std::move(props), column,
- reinterpret_cast<format::ColumnChunk*>(contents)))} {}
-
-ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default;
-
-const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); }
-
-void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
- impl_->set_file_path(path);
-}
-
-void ColumnChunkMetaDataBuilder::Finish(
- int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset,
- int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size,
- bool has_dictionary, bool dictionary_fallback,
- const std::map<Encoding::type, int32_t>& dict_encoding_stats,
- const std::map<Encoding::type, int32_t>& data_encoding_stats,
- const std::shared_ptr<Encryptor>& encryptor) {
- impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset,
- compressed_size, uncompressed_size, has_dictionary, dictionary_fallback,
- dict_encoding_stats, data_encoding_stats, encryptor);
-}
-
-void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) {
- impl_->WriteTo(sink);
-}
-
-const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const {
- return impl_->descr();
-}
-
-void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) {
- impl_->SetStatistics(result);
-}
-
-int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const {
- return impl_->total_compressed_size();
-}
-
-class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
- public:
- explicit RowGroupMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
- const SchemaDescriptor* schema, void* contents)
- : properties_(std::move(props)), schema_(schema), next_column_(0) {
- row_group_ = reinterpret_cast<format::RowGroup*>(contents);
- InitializeColumns(schema->num_columns());
- }
-
- ColumnChunkMetaDataBuilder* NextColumnChunk() {
- if (!(next_column_ < num_columns())) {
- std::stringstream ss;
- ss << "The schema only has " << num_columns()
- << " columns, requested metadata for column: " << next_column_;
- throw ParquetException(ss.str());
- }
- auto column = schema_->Column(next_column_);
- auto column_builder = ColumnChunkMetaDataBuilder::Make(
- properties_, column, &row_group_->columns[next_column_++]);
- auto column_builder_ptr = column_builder.get();
- column_builders_.push_back(std::move(column_builder));
- return column_builder_ptr;
- }
-
- int current_column() { return next_column_ - 1; }
-
- void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) {
- if (!(next_column_ == schema_->num_columns())) {
- std::stringstream ss;
- ss << "Only " << next_column_ - 1 << " out of " << schema_->num_columns()
- << " columns are initialized";
- throw ParquetException(ss.str());
- }
-
- int64_t file_offset = 0;
- int64_t total_compressed_size = 0;
- for (int i = 0; i < schema_->num_columns(); i++) {
- if (!(row_group_->columns[i].file_offset >= 0)) {
- std::stringstream ss;
- ss << "Column " << i << " is not complete.";
- throw ParquetException(ss.str());
- }
- if (i == 0) {
- file_offset = row_group_->columns[0].file_offset;
- }
- // sometimes column metadata is encrypted and not available to read,
- // so we must get total_compressed_size from column builder
- total_compressed_size += column_builders_[i]->total_compressed_size();
- }
-
- row_group_->__set_file_offset(file_offset);
- row_group_->__set_total_compressed_size(total_compressed_size);
- row_group_->__set_total_byte_size(total_bytes_written);
- row_group_->__set_ordinal(row_group_ordinal);
- }
-
- void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }
-
- int num_columns() { return static_cast<int>(row_group_->columns.size()); }
-
- int64_t num_rows() { return row_group_->num_rows; }
-
- private:
- void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); }
-
- format::RowGroup* row_group_;
- const std::shared_ptr<WriterProperties> properties_;
- const SchemaDescriptor* schema_;
- std::vector<std::unique_ptr<ColumnChunkMetaDataBuilder>> column_builders_;
- int next_column_;
-};
-
-std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make(
- std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
- void* contents) {
- return std::unique_ptr<RowGroupMetaDataBuilder>(
- new RowGroupMetaDataBuilder(std::move(props), schema_, contents));
-}
-
-RowGroupMetaDataBuilder::RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const SchemaDescriptor* schema_,
- void* contents)
- : impl_{new RowGroupMetaDataBuilderImpl(std::move(props), schema_, contents)} {}
-
-RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() = default;
-
-ColumnChunkMetaDataBuilder* RowGroupMetaDataBuilder::NextColumnChunk() {
- return impl_->NextColumnChunk();
-}
-
-int RowGroupMetaDataBuilder::current_column() const { return impl_->current_column(); }
-
-int RowGroupMetaDataBuilder::num_columns() { return impl_->num_columns(); }
-
-int64_t RowGroupMetaDataBuilder::num_rows() { return impl_->num_rows(); }
-
-void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) {
- impl_->set_num_rows(num_rows);
-}
-
-void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written,
- int16_t row_group_ordinal) {
- impl_->Finish(total_bytes_written, row_group_ordinal);
-}
-
-// file metadata
-// TODO(PARQUET-595) Support key_value_metadata
-class FileMetaDataBuilder::FileMetaDataBuilderImpl {
- public:
- explicit FileMetaDataBuilderImpl(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : metadata_(new format::FileMetaData()),
- properties_(std::move(props)),
- schema_(schema),
- key_value_metadata_(std::move(key_value_metadata)) {
- if (properties_->file_encryption_properties() != nullptr &&
- properties_->file_encryption_properties()->encrypted_footer()) {
- crypto_metadata_.reset(new format::FileCryptoMetaData());
- }
- }
-
- RowGroupMetaDataBuilder* AppendRowGroup() {
- row_groups_.emplace_back();
- current_row_group_builder_ =
- RowGroupMetaDataBuilder::Make(properties_, schema_, &row_groups_.back());
- return current_row_group_builder_.get();
- }
-
- std::unique_ptr<FileMetaData> Finish() {
- int64_t total_rows = 0;
- for (auto row_group : row_groups_) {
- total_rows += row_group.num_rows;
- }
- metadata_->__set_num_rows(total_rows);
- metadata_->__set_row_groups(row_groups_);
-
- if (key_value_metadata_) {
- metadata_->key_value_metadata.clear();
- metadata_->key_value_metadata.reserve(key_value_metadata_->size());
- for (int64_t i = 0; i < key_value_metadata_->size(); ++i) {
- format::KeyValue kv_pair;
- kv_pair.__set_key(key_value_metadata_->key(i));
- kv_pair.__set_value(key_value_metadata_->value(i));
- metadata_->key_value_metadata.push_back(kv_pair);
- }
- metadata_->__isset.key_value_metadata = true;
- }
-
- int32_t file_version = 0;
- switch (properties_->version()) {
- case ParquetVersion::PARQUET_1_0:
- file_version = 1;
- break;
- case ParquetVersion::PARQUET_2_0:
- file_version = 2;
- break;
- default:
- break;
- }
- metadata_->__set_version(file_version);
- metadata_->__set_created_by(properties_->created_by());
-
-    // Users cannot set the `ColumnOrder` since we do not yet have user-defined
-    // sort orders in the spec.
-    // We always default to `TYPE_DEFINED_ORDER`; we can expose it in the API
-    // once the Parquet format gains user-defined sort orders.
-    // TypeDefinedOrder means the SortOrder is chosen based on
-    // ConvertedType/PhysicalType.
- format::TypeDefinedOrder type_defined_order;
- format::ColumnOrder column_order;
- column_order.__set_TYPE_ORDER(type_defined_order);
- column_order.__isset.TYPE_ORDER = true;
- metadata_->column_orders.resize(schema_->num_columns(), column_order);
- metadata_->__isset.column_orders = true;
-
- // if plaintext footer, set footer signing algorithm
- auto file_encryption_properties = properties_->file_encryption_properties();
- if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) {
- EncryptionAlgorithm signing_algorithm;
- EncryptionAlgorithm algo = file_encryption_properties->algorithm();
- signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique;
- signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix;
- if (!algo.aad.supply_aad_prefix) {
- signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix;
- }
- signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
-
- metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm));
- const std::string& footer_signing_key_metadata =
- file_encryption_properties->footer_key_metadata();
- if (footer_signing_key_metadata.size() > 0) {
- metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata);
- }
- }
-
- ToParquet(static_cast<parquet::schema::GroupNode*>(schema_->schema_root().get()),
- &metadata_->schema);
- auto file_meta_data = std::unique_ptr<FileMetaData>(new FileMetaData());
- file_meta_data->impl_->metadata_ = std::move(metadata_);
- file_meta_data->impl_->InitSchema();
- file_meta_data->impl_->InitKeyValueMetadata();
- return file_meta_data;
- }
-
- std::unique_ptr<FileCryptoMetaData> BuildFileCryptoMetaData() {
- if (crypto_metadata_ == nullptr) {
- return nullptr;
- }
-
- auto file_encryption_properties = properties_->file_encryption_properties();
-
- crypto_metadata_->__set_encryption_algorithm(
- ToThrift(file_encryption_properties->algorithm()));
- std::string key_metadata = file_encryption_properties->footer_key_metadata();
-
- if (!key_metadata.empty()) {
- crypto_metadata_->__set_key_metadata(key_metadata);
- }
-
- std::unique_ptr<FileCryptoMetaData> file_crypto_metadata =
- std::unique_ptr<FileCryptoMetaData>(new FileCryptoMetaData());
- file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_);
-
- return file_crypto_metadata;
- }
-
- protected:
- std::unique_ptr<format::FileMetaData> metadata_;
- std::unique_ptr<format::FileCryptoMetaData> crypto_metadata_;
-
- private:
- const std::shared_ptr<WriterProperties> properties_;
- std::vector<format::RowGroup> row_groups_;
-
- std::unique_ptr<RowGroupMetaDataBuilder> current_row_group_builder_;
- const SchemaDescriptor* schema_;
- std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
-};
-
-std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
- return std::unique_ptr<FileMetaDataBuilder>(
- new FileMetaDataBuilder(schema, std::move(props), std::move(key_value_metadata)));
-}
-
-FileMetaDataBuilder::FileMetaDataBuilder(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : impl_{std::unique_ptr<FileMetaDataBuilderImpl>(new FileMetaDataBuilderImpl(
- schema, std::move(props), std::move(key_value_metadata)))} {}
-
-FileMetaDataBuilder::~FileMetaDataBuilder() = default;
-
-RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() {
- return impl_->AppendRowGroup();
-}
-
-std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() { return impl_->Finish(); }
-
-std::unique_ptr<FileCryptoMetaData> FileMetaDataBuilder::GetCryptoMetaData() {
- return impl_->BuildFileCryptoMetaData();
-}
-
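-// A minimal writer-side flow sketch (illustrative; `schema`, `props`,
-// `num_rows`, and `total_bytes_written` are hypothetical, already-computed
-// values, and the per-column Finish() arguments are elided):
-//
-//   auto file_builder = FileMetaDataBuilder::Make(schema, props, nullptr);
-//   RowGroupMetaDataBuilder* rg_builder = file_builder->AppendRowGroup();
-//   ColumnChunkMetaDataBuilder* cc_builder = rg_builder->NextColumnChunk();
-//   // ... write column data, then call cc_builder->Finish(...) ...
-//   rg_builder->set_num_rows(num_rows);
-//   rg_builder->Finish(total_bytes_written, /*row_group_ordinal=*/0);
-//   std::unique_ptr<FileMetaData> file_md = file_builder->Finish();
-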
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/metadata.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/memory.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/schema_internal.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h"
+
+namespace parquet {
+
+const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 8, 0);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_816_FIXED_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 2, 9);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() {
+ static ApplicationVersion version("parquet-cpp", 1, 3, 0);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 10, 0);
+ return version;
+}
+
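+// These sentinel versions are consulted by
+// ApplicationVersion::HasCorrectStatistics() to decide whether min/max
+// statistics written by older parquet-mr / parquet-cpp releases can be
+// trusted for a given column type and sort order.
+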
+std::string ParquetVersionToString(ParquetVersion::type ver) {
+ switch (ver) {
+ case ParquetVersion::PARQUET_1_0:
+ return "1.0";
+ case ParquetVersion::PARQUET_2_0:
+ return "2.0";
+ }
+
+ // This should be unreachable
+ return "UNKNOWN";
+}
+
+template <typename DType>
+static std::shared_ptr<Statistics> MakeTypedColumnStats(
+ const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
+  // If ColumnOrder is defined, use the max_value and min_value fields
+ if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
+ return MakeStatistics<DType>(
+ descr, metadata.statistics.min_value, metadata.statistics.max_value,
+ metadata.num_values - metadata.statistics.null_count,
+ metadata.statistics.null_count, metadata.statistics.distinct_count,
+ metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value,
+ metadata.statistics.__isset.null_count,
+ metadata.statistics.__isset.distinct_count);
+ }
+ // Default behavior
+ return MakeStatistics<DType>(
+ descr, metadata.statistics.min, metadata.statistics.max,
+ metadata.num_values - metadata.statistics.null_count,
+ metadata.statistics.null_count, metadata.statistics.distinct_count,
+ metadata.statistics.__isset.max || metadata.statistics.__isset.min,
+ metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count);
+}
+
+std::shared_ptr<Statistics> MakeColumnStats(const format::ColumnMetaData& meta_data,
+ const ColumnDescriptor* descr) {
+ switch (static_cast<Type::type>(meta_data.type)) {
+ case Type::BOOLEAN:
+ return MakeTypedColumnStats<BooleanType>(meta_data, descr);
+ case Type::INT32:
+ return MakeTypedColumnStats<Int32Type>(meta_data, descr);
+ case Type::INT64:
+ return MakeTypedColumnStats<Int64Type>(meta_data, descr);
+ case Type::INT96:
+ return MakeTypedColumnStats<Int96Type>(meta_data, descr);
+ case Type::DOUBLE:
+ return MakeTypedColumnStats<DoubleType>(meta_data, descr);
+ case Type::FLOAT:
+ return MakeTypedColumnStats<FloatType>(meta_data, descr);
+ case Type::BYTE_ARRAY:
+ return MakeTypedColumnStats<ByteArrayType>(meta_data, descr);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return MakeTypedColumnStats<FLBAType>(meta_data, descr);
+ case Type::UNDEFINED:
+ break;
+ }
+ throw ParquetException("Can't decode page statistics for selected column type");
+}
+
+// MetaData Accessor
+
+// ColumnCryptoMetaData
+class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl {
+ public:
+ explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata)
+ : crypto_metadata_(crypto_metadata) {}
+
+ bool encrypted_with_footer_key() const {
+ return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY;
+ }
+ bool encrypted_with_column_key() const {
+ return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY;
+ }
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const {
+ return std::make_shared<schema::ColumnPath>(
+ crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
+ }
+ const std::string& key_metadata() const {
+ return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+ }
+
+ private:
+ const format::ColumnCryptoMetaData* crypto_metadata_;
+};
+
+std::unique_ptr<ColumnCryptoMetaData> ColumnCryptoMetaData::Make(
+ const uint8_t* metadata) {
+ return std::unique_ptr<ColumnCryptoMetaData>(new ColumnCryptoMetaData(metadata));
+}
+
+ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata)
+ : impl_(new ColumnCryptoMetaDataImpl(
+ reinterpret_cast<const format::ColumnCryptoMetaData*>(metadata))) {}
+
+ColumnCryptoMetaData::~ColumnCryptoMetaData() = default;
+
+std::shared_ptr<schema::ColumnPath> ColumnCryptoMetaData::path_in_schema() const {
+ return impl_->path_in_schema();
+}
+bool ColumnCryptoMetaData::encrypted_with_footer_key() const {
+ return impl_->encrypted_with_footer_key();
+}
+const std::string& ColumnCryptoMetaData::key_metadata() const {
+ return impl_->key_metadata();
+}
+
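+// Illustrative use (a sketch; `cc_md` is a hypothetical ColumnChunkMetaData
+// instance): inspect how a column chunk was encrypted.
+//
+//   std::unique_ptr<ColumnCryptoMetaData> ccmd = cc_md->crypto_metadata();
+//   if (ccmd != nullptr && !ccmd->encrypted_with_footer_key()) {
+//     const std::string& key_md = ccmd->key_metadata();  // column-key metadata
+//   }
+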
+// ColumnChunk metadata
+class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
+ public:
+ explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
+ const ColumnDescriptor* descr,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : column_(column), descr_(descr), writer_version_(writer_version) {
+ column_metadata_ = &column->meta_data;
+ if (column->__isset.crypto_metadata) { // column metadata is encrypted
+ format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
+
+ if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+ if (file_decryptor != nullptr && file_decryptor->properties() != nullptr) {
+ // should decrypt metadata
+ std::shared_ptr<schema::ColumnPath> path = std::make_shared<schema::ColumnPath>(
+ ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
+ std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+
+ std::string aad_column_metadata = encryption::CreateModuleAad(
+ file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal,
+ column_ordinal, static_cast<int16_t>(-1));
+ auto decryptor = file_decryptor->GetColumnMetaDecryptor(
+ path->ToDotString(), key_metadata, aad_column_metadata);
+ auto len = static_cast<uint32_t>(column->encrypted_column_metadata.size());
+ DeserializeThriftMsg(
+ reinterpret_cast<const uint8_t*>(column->encrypted_column_metadata.c_str()),
+ &len, &decrypted_metadata_, decryptor);
+ column_metadata_ = &decrypted_metadata_;
+ } else {
+        throw ParquetException(
+            "Cannot decrypt ColumnMetadata."
+            " FileDecryption is not set up correctly");
+ }
+ }
+ }
+ for (const auto& encoding : column_metadata_->encodings) {
+ encodings_.push_back(LoadEnumSafe(&encoding));
+ }
+ for (const auto& encoding_stats : column_metadata_->encoding_stats) {
+ encoding_stats_.push_back({LoadEnumSafe(&encoding_stats.page_type),
+ LoadEnumSafe(&encoding_stats.encoding),
+ encoding_stats.count});
+ }
+ possible_stats_ = nullptr;
+ }
+
+ bool Equals(const ColumnChunkMetaDataImpl& other) const {
+ return *column_metadata_ == *other.column_metadata_;
+ }
+
+ // column chunk
+ inline int64_t file_offset() const { return column_->file_offset; }
+ inline const std::string& file_path() const { return column_->file_path; }
+
+ inline Type::type type() const { return LoadEnumSafe(&column_metadata_->type); }
+
+ inline int64_t num_values() const { return column_metadata_->num_values; }
+
+ std::shared_ptr<schema::ColumnPath> path_in_schema() {
+ return std::make_shared<schema::ColumnPath>(column_metadata_->path_in_schema);
+ }
+
+ // Check if statistics are set and are valid
+ // 1) Must be set in the metadata
+ // 2) Statistics must not be corrupted
+ inline bool is_stats_set() const {
+ DCHECK(writer_version_ != nullptr);
+ // If the column statistics don't exist or column sort order is unknown
+ // we cannot use the column stats
+ if (!column_metadata_->__isset.statistics ||
+ descr_->sort_order() == SortOrder::UNKNOWN) {
+ return false;
+ }
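+    // Lazily decode and cache the statistics; possible_stats_ is mutable so
+    // this const accessor can populate it.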
+ if (possible_stats_ == nullptr) {
+ possible_stats_ = MakeColumnStats(*column_metadata_, descr_);
+ }
+ EncodedStatistics encodedStatistics = possible_stats_->Encode();
+ return writer_version_->HasCorrectStatistics(type(), encodedStatistics,
+ descr_->sort_order());
+ }
+
+ inline std::shared_ptr<Statistics> statistics() const {
+ return is_stats_set() ? possible_stats_ : nullptr;
+ }
+
+ inline Compression::type compression() const {
+ return LoadEnumSafe(&column_metadata_->codec);
+ }
+
+ const std::vector<Encoding::type>& encodings() const { return encodings_; }
+
+ const std::vector<PageEncodingStats>& encoding_stats() const { return encoding_stats_; }
+
+ inline bool has_dictionary_page() const {
+ return column_metadata_->__isset.dictionary_page_offset;
+ }
+
+ inline int64_t dictionary_page_offset() const {
+ return column_metadata_->dictionary_page_offset;
+ }
+
+ inline int64_t data_page_offset() const { return column_metadata_->data_page_offset; }
+
+ inline bool has_index_page() const {
+ return column_metadata_->__isset.index_page_offset;
+ }
+
+ inline int64_t index_page_offset() const { return column_metadata_->index_page_offset; }
+
+ inline int64_t total_compressed_size() const {
+ return column_metadata_->total_compressed_size;
+ }
+
+ inline int64_t total_uncompressed_size() const {
+ return column_metadata_->total_uncompressed_size;
+ }
+
+ inline std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const {
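+    // Note: this passes a pointer to the already-deserialized thrift struct
+    // rather than serialized bytes; ColumnCryptoMetaData::Make simply casts
+    // it back to format::ColumnCryptoMetaData.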
+ if (column_->__isset.crypto_metadata) {
+ return ColumnCryptoMetaData::Make(
+ reinterpret_cast<const uint8_t*>(&column_->crypto_metadata));
+ } else {
+ return nullptr;
+ }
+ }
+
+ private:
+ mutable std::shared_ptr<Statistics> possible_stats_;
+ std::vector<Encoding::type> encodings_;
+ std::vector<PageEncodingStats> encoding_stats_;
+ const format::ColumnChunk* column_;
+ const format::ColumnMetaData* column_metadata_;
+ format::ColumnMetaData decrypted_metadata_;
+ const ColumnDescriptor* descr_;
+ const ApplicationVersion* writer_version_;
+};
+
+std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
+ const void* metadata, const ColumnDescriptor* descr,
+ const ApplicationVersion* writer_version, int16_t row_group_ordinal,
+ int16_t column_ordinal, std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ return std::unique_ptr<ColumnChunkMetaData>(
+ new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal,
+ writer_version, std::move(file_decryptor)));
+}
+
+ColumnChunkMetaData::ColumnChunkMetaData(
+ const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
+ int16_t column_ordinal, const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{new ColumnChunkMetaDataImpl(
+ reinterpret_cast<const format::ColumnChunk*>(metadata), descr,
+ row_group_ordinal, column_ordinal, writer_version, std::move(file_decryptor))} {
+}
+
+ColumnChunkMetaData::~ColumnChunkMetaData() = default;
+
+// column chunk
+int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); }
+
+const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); }
+
+Type::type ColumnChunkMetaData::type() const { return impl_->type(); }
+
+int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); }
+
+std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const {
+ return impl_->path_in_schema();
+}
+
+std::shared_ptr<Statistics> ColumnChunkMetaData::statistics() const {
+ return impl_->statistics();
+}
+
+bool ColumnChunkMetaData::is_stats_set() const { return impl_->is_stats_set(); }
+
+bool ColumnChunkMetaData::has_dictionary_page() const {
+ return impl_->has_dictionary_page();
+}
+
+int64_t ColumnChunkMetaData::dictionary_page_offset() const {
+ return impl_->dictionary_page_offset();
+}
+
+int64_t ColumnChunkMetaData::data_page_offset() const {
+ return impl_->data_page_offset();
+}
+
+bool ColumnChunkMetaData::has_index_page() const { return impl_->has_index_page(); }
+
+int64_t ColumnChunkMetaData::index_page_offset() const {
+ return impl_->index_page_offset();
+}
+
+Compression::type ColumnChunkMetaData::compression() const {
+ return impl_->compression();
+}
+
+bool ColumnChunkMetaData::can_decompress() const {
+ return ::arrow::util::Codec::IsAvailable(compression());
+}
+
+const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
+ return impl_->encodings();
+}
+
+const std::vector<PageEncodingStats>& ColumnChunkMetaData::encoding_stats() const {
+ return impl_->encoding_stats();
+}
+
+int64_t ColumnChunkMetaData::total_uncompressed_size() const {
+ return impl_->total_uncompressed_size();
+}
+
+int64_t ColumnChunkMetaData::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() const {
+ return impl_->crypto_metadata();
+}
+
+bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+// row-group metadata
+class RowGroupMetaData::RowGroupMetaDataImpl {
+ public:
+ explicit RowGroupMetaDataImpl(const format::RowGroup* row_group,
+ const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : row_group_(row_group),
+ schema_(schema),
+ writer_version_(writer_version),
+ file_decryptor_(std::move(file_decryptor)) {}
+
+ bool Equals(const RowGroupMetaDataImpl& other) const {
+ return *row_group_ == *other.row_group_;
+ }
+
+ inline int num_columns() const { return static_cast<int>(row_group_->columns.size()); }
+
+ inline int64_t num_rows() const { return row_group_->num_rows; }
+
+ inline int64_t total_byte_size() const { return row_group_->total_byte_size; }
+
+ inline int64_t total_compressed_size() const {
+ return row_group_->total_compressed_size;
+ }
+
+ inline int64_t file_offset() const { return row_group_->file_offset; }
+
+ inline const SchemaDescriptor* schema() const { return schema_; }
+
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
+ if (i < num_columns()) {
+ return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i),
+ writer_version_, row_group_->ordinal,
+ static_cast<int16_t>(i), file_decryptor_);
+ }
+ throw ParquetException("The file only has ", num_columns(),
+ " columns, requested metadata for column: ", i);
+ }
+
+ private:
+ const format::RowGroup* row_group_;
+ const SchemaDescriptor* schema_;
+ const ApplicationVersion* writer_version_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+};
+
+std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ return std::unique_ptr<RowGroupMetaData>(
+ new RowGroupMetaData(metadata, schema, writer_version, std::move(file_decryptor)));
+}
+
+RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{new RowGroupMetaDataImpl(reinterpret_cast<const format::RowGroup*>(metadata),
+ schema, writer_version, std::move(file_decryptor))} {
+}
+
+RowGroupMetaData::~RowGroupMetaData() = default;
+
+bool RowGroupMetaData::Equals(const RowGroupMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+int RowGroupMetaData::num_columns() const { return impl_->num_columns(); }
+
+int64_t RowGroupMetaData::num_rows() const { return impl_->num_rows(); }
+
+int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_size(); }
+
+int64_t RowGroupMetaData::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+int64_t RowGroupMetaData::file_offset() const { return impl_->file_offset(); }
+
+const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); }
+
+std::unique_ptr<ColumnChunkMetaData> RowGroupMetaData::ColumnChunk(int i) const {
+ return impl_->ColumnChunk(i);
+}
+
+bool RowGroupMetaData::can_decompress() const {
+ int n_columns = num_columns();
+ for (int i = 0; i < n_columns; i++) {
+ if (!ColumnChunk(i)->can_decompress()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// file metadata
+class FileMetaData::FileMetaDataImpl {
+ public:
+ FileMetaDataImpl() = default;
+
+ explicit FileMetaDataImpl(
+ const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
+ : file_decryptor_(file_decryptor) {
+ metadata_.reset(new format::FileMetaData);
+
+ auto footer_decryptor =
+ file_decryptor_ != nullptr ? file_decryptor->GetFooterDecryptor() : nullptr;
+
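+    // A null footer decryptor means a plaintext footer; otherwise
+    // DeserializeThriftMsg decrypts the footer bytes before parsing them.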
+ DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(metadata), metadata_len,
+ metadata_.get(), footer_decryptor);
+ metadata_len_ = *metadata_len;
+
+ if (metadata_->__isset.created_by) {
+ writer_version_ = ApplicationVersion(metadata_->created_by);
+ } else {
+ writer_version_ = ApplicationVersion("unknown 0.0.0");
+ }
+
+ InitSchema();
+ InitColumnOrders();
+ InitKeyValueMetadata();
+ }
+
+ bool VerifySignature(const void* signature) {
+ // verify decryption properties are set
+ if (file_decryptor_ == nullptr) {
+ throw ParquetException("Decryption not set properly. cannot verify signature");
+ }
+ // serialize the footer
+ uint8_t* serialized_data;
+ uint32_t serialized_len = metadata_len_;
+ ThriftSerializer serializer;
+ serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
+
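+    // A plaintext-footer signature consists of the GCM nonce followed by the
+    // GCM tag. Verification re-encrypts the serialized footer with the same
+    // key and nonce and compares the resulting tag with the stored one.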
+ // encrypt with nonce
+ auto nonce = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature));
+ auto tag = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature)) +
+ encryption::kNonceLength;
+
+ std::string key = file_decryptor_->GetFooterKey();
+ std::string aad = encryption::CreateFooterAad(file_decryptor_->file_aad());
+
+ auto aes_encryptor = encryption::AesEncryptor::Make(
+ file_decryptor_->algorithm(), static_cast<int>(key.size()), true, nullptr);
+
+ std::shared_ptr<Buffer> encrypted_buffer = std::static_pointer_cast<ResizableBuffer>(
+ AllocateBuffer(file_decryptor_->pool(),
+ aes_encryptor->CiphertextSizeDelta() + serialized_len));
+ uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
+ serialized_data, serialized_len, str2bytes(key), static_cast<int>(key.size()),
+ str2bytes(aad), static_cast<int>(aad.size()), nonce,
+ encrypted_buffer->mutable_data());
+ // Delete AES encryptor object. It was created only to verify the footer signature.
+ aes_encryptor->WipeOut();
+ delete aes_encryptor;
+ return 0 ==
+ memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
+ tag, encryption::kGcmTagLength);
+ }
+
+ inline uint32_t size() const { return metadata_len_; }
+ inline int num_columns() const { return schema_.num_columns(); }
+ inline int64_t num_rows() const { return metadata_->num_rows; }
+ inline int num_row_groups() const {
+ return static_cast<int>(metadata_->row_groups.size());
+ }
+ inline int32_t version() const { return metadata_->version; }
+ inline const std::string& created_by() const { return metadata_->created_by; }
+ inline int num_schema_elements() const {
+ return static_cast<int>(metadata_->schema.size());
+ }
+
+ inline bool is_encryption_algorithm_set() const {
+ return metadata_->__isset.encryption_algorithm;
+ }
+ inline EncryptionAlgorithm encryption_algorithm() {
+ return FromThrift(metadata_->encryption_algorithm);
+ }
+ inline const std::string& footer_signing_key_metadata() {
+ return metadata_->footer_signing_key_metadata;
+ }
+
+ const ApplicationVersion& writer_version() const { return writer_version_; }
+
+ void WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor) const {
+ ThriftSerializer serializer;
+    // The encryption_algorithm field is only set in the footer of encrypted
+    // files that use plaintext footers.
+ if (is_encryption_algorithm_set()) {
+ uint8_t* serialized_data;
+ uint32_t serialized_len;
+ serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
+
+      // encrypt the serialized footer to produce the signature
+ std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
+ serialized_len);
+ unsigned encrypted_len =
+ encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
+
+ // write unencrypted footer
+ PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len));
+ // Write signature (nonce and tag)
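+      // The ciphertext layout is [4-byte length][nonce][body][tag], so the
+      // nonce begins at offset 4 and the tag occupies the final
+      // kGcmTagLength bytes.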
+ PARQUET_THROW_NOT_OK(
+ dst->Write(encrypted_data.data() + 4, encryption::kNonceLength));
+ PARQUET_THROW_NOT_OK(
+ dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength,
+ encryption::kGcmTagLength));
+ } else { // either plaintext file (when encryptor is null)
+ // or encrypted file with encrypted footer
+ serializer.Serialize(metadata_.get(), dst, encryptor);
+ }
+ }
+
+ std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
+ if (!(i < num_row_groups())) {
+ std::stringstream ss;
+ ss << "The file only has " << num_row_groups()
+ << " row groups, requested metadata for row group: " << i;
+ throw ParquetException(ss.str());
+ }
+ return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_,
+ file_decryptor_);
+ }
+
+ bool Equals(const FileMetaDataImpl& other) const {
+ return *metadata_ == *other.metadata_;
+ }
+
+ const SchemaDescriptor* schema() const { return &schema_; }
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+ return key_value_metadata_;
+ }
+
+ void set_file_path(const std::string& path) {
+ for (format::RowGroup& row_group : metadata_->row_groups) {
+ for (format::ColumnChunk& chunk : row_group.columns) {
+ chunk.__set_file_path(path);
+ }
+ }
+ }
+
+ format::RowGroup& row_group(int i) {
+ DCHECK_LT(i, num_row_groups());
+ return metadata_->row_groups[i];
+ }
+
+ void AppendRowGroups(const std::unique_ptr<FileMetaDataImpl>& other) {
+ if (!schema()->Equals(*other->schema())) {
+ throw ParquetException("AppendRowGroups requires equal schemas.");
+ }
+
+ format::RowGroup other_rg;
+ for (int i = 0; i < other->num_row_groups(); i++) {
+ other_rg = other->row_group(i);
+ metadata_->row_groups.push_back(other_rg);
+ metadata_->num_rows += other_rg.num_rows;
+ }
+ }
+
+ std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) {
+ for (int i : row_groups) {
+ if (i < num_row_groups()) continue;
+
+ throw ParquetException(
+ "The file only has ", num_row_groups(),
+ " row groups, but requested a subset including row group: ", i);
+ }
+
+ std::shared_ptr<FileMetaData> out(new FileMetaData());
+ out->impl_.reset(new FileMetaDataImpl());
+ out->impl_->metadata_.reset(new format::FileMetaData());
+
+ auto metadata = out->impl_->metadata_.get();
+ metadata->version = metadata_->version;
+ metadata->schema = metadata_->schema;
+
+ metadata->row_groups.resize(row_groups.size());
+ int i = 0;
+ for (int selected_index : row_groups) {
+ metadata->num_rows += row_group(selected_index).num_rows;
+ metadata->row_groups[i++] = row_group(selected_index);
+ }
+
+ metadata->key_value_metadata = metadata_->key_value_metadata;
+ metadata->created_by = metadata_->created_by;
+ metadata->column_orders = metadata_->column_orders;
+ metadata->encryption_algorithm = metadata_->encryption_algorithm;
+ metadata->footer_signing_key_metadata = metadata_->footer_signing_key_metadata;
+ metadata->__isset = metadata_->__isset;
+
+ out->impl_->schema_ = schema_;
+ out->impl_->writer_version_ = writer_version_;
+ out->impl_->key_value_metadata_ = key_value_metadata_;
+ out->impl_->file_decryptor_ = file_decryptor_;
+
+ return out;
+ }
+
+ void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ file_decryptor_ = file_decryptor;
+ }
+
+ private:
+ friend FileMetaDataBuilder;
+ uint32_t metadata_len_ = 0;
+ std::unique_ptr<format::FileMetaData> metadata_;
+ SchemaDescriptor schema_;
+ ApplicationVersion writer_version_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+
+ void InitSchema() {
+ if (metadata_->schema.empty()) {
+ throw ParquetException("Empty file schema (no root)");
+ }
+ schema_.Init(schema::Unflatten(&metadata_->schema[0],
+ static_cast<int>(metadata_->schema.size())));
+ }
+
+ void InitColumnOrders() {
+ // update ColumnOrder
+ std::vector<parquet::ColumnOrder> column_orders;
+ if (metadata_->__isset.column_orders) {
+ for (auto column_order : metadata_->column_orders) {
+ if (column_order.__isset.TYPE_ORDER) {
+ column_orders.push_back(ColumnOrder::type_defined_);
+ } else {
+ column_orders.push_back(ColumnOrder::undefined_);
+ }
+ }
+ } else {
+ column_orders.resize(schema_.num_columns(), ColumnOrder::undefined_);
+ }
+
+ schema_.updateColumnOrders(column_orders);
+ }
+
+ void InitKeyValueMetadata() {
+ std::shared_ptr<KeyValueMetadata> metadata = nullptr;
+ if (metadata_->__isset.key_value_metadata) {
+ metadata = std::make_shared<KeyValueMetadata>();
+ for (const auto& it : metadata_->key_value_metadata) {
+ metadata->Append(it.key, it.value);
+ }
+ }
+ key_value_metadata_ = std::move(metadata);
+ }
+};
+
+std::shared_ptr<FileMetaData> FileMetaData::Make(
+ const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ // This FileMetaData ctor is private, not compatible with std::make_shared
+ return std::shared_ptr<FileMetaData>(
+ new FileMetaData(metadata, metadata_len, file_decryptor));
+}
+
+FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{std::unique_ptr<FileMetaDataImpl>(
+ new FileMetaDataImpl(metadata, metadata_len, file_decryptor))} {}
+
+FileMetaData::FileMetaData()
+ : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {}
+
+FileMetaData::~FileMetaData() = default;
+
+bool FileMetaData::Equals(const FileMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+std::unique_ptr<RowGroupMetaData> FileMetaData::RowGroup(int i) const {
+ return impl_->RowGroup(i);
+}
+
+bool FileMetaData::VerifySignature(const void* signature) {
+ return impl_->VerifySignature(signature);
+}
+
+uint32_t FileMetaData::size() const { return impl_->size(); }
+
+int FileMetaData::num_columns() const { return impl_->num_columns(); }
+
+int64_t FileMetaData::num_rows() const { return impl_->num_rows(); }
+
+int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); }
+
+bool FileMetaData::can_decompress() const {
+ int n_row_groups = num_row_groups();
+ for (int i = 0; i < n_row_groups; i++) {
+ if (!RowGroup(i)->can_decompress()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool FileMetaData::is_encryption_algorithm_set() const {
+ return impl_->is_encryption_algorithm_set();
+}
+
+EncryptionAlgorithm FileMetaData::encryption_algorithm() const {
+ return impl_->encryption_algorithm();
+}
+
+const std::string& FileMetaData::footer_signing_key_metadata() const {
+ return impl_->footer_signing_key_metadata();
+}
+
+void FileMetaData::set_file_decryptor(
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ impl_->set_file_decryptor(file_decryptor);
+}
+
+ParquetVersion::type FileMetaData::version() const {
+ switch (impl_->version()) {
+ case 1:
+ return ParquetVersion::PARQUET_1_0;
+ case 2:
+ return ParquetVersion::PARQUET_2_0;
+ default:
+ // Improperly set version, assuming Parquet 1.0
+ break;
+ }
+ return ParquetVersion::PARQUET_1_0;
+}
+
+const ApplicationVersion& FileMetaData::writer_version() const {
+ return impl_->writer_version();
+}
+
+const std::string& FileMetaData::created_by() const { return impl_->created_by(); }
+
+int FileMetaData::num_schema_elements() const { return impl_->num_schema_elements(); }
+
+const SchemaDescriptor* FileMetaData::schema() const { return impl_->schema(); }
+
+const std::shared_ptr<const KeyValueMetadata>& FileMetaData::key_value_metadata() const {
+ return impl_->key_value_metadata();
+}
+
+void FileMetaData::set_file_path(const std::string& path) { impl_->set_file_path(path); }
+
+void FileMetaData::AppendRowGroups(const FileMetaData& other) {
+ impl_->AppendRowGroups(other.impl_);
+}
+
+std::shared_ptr<FileMetaData> FileMetaData::Subset(
+ const std::vector<int>& row_groups) const {
+ return impl_->Subset(row_groups);
+}
+
+void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor) const {
+ return impl_->WriteTo(dst, encryptor);
+}
+
+class FileCryptoMetaData::FileCryptoMetaDataImpl {
+ public:
+ FileCryptoMetaDataImpl() = default;
+
+ explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) {
+ metadata_.reset(new format::FileCryptoMetaData);
+ DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
+ metadata_len_ = *metadata_len;
+ }
+
+ EncryptionAlgorithm encryption_algorithm() {
+ return FromThrift(metadata_->encryption_algorithm);
+ }
+ const std::string& key_metadata() { return metadata_->key_metadata; }
+ void WriteTo(::arrow::io::OutputStream* dst) const {
+ ThriftSerializer serializer;
+ serializer.Serialize(metadata_.get(), dst);
+ }
+
+ private:
+ friend FileMetaDataBuilder;
+ std::unique_ptr<format::FileCryptoMetaData> metadata_;
+ uint32_t metadata_len_;
+};
+
+EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const {
+ return impl_->encryption_algorithm();
+}
+
+const std::string& FileCryptoMetaData::key_metadata() const {
+ return impl_->key_metadata();
+}
+
+std::shared_ptr<FileCryptoMetaData> FileCryptoMetaData::Make(
+ const uint8_t* serialized_metadata, uint32_t* metadata_len) {
+ return std::shared_ptr<FileCryptoMetaData>(
+ new FileCryptoMetaData(serialized_metadata, metadata_len));
+}
+
+FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata,
+ uint32_t* metadata_len)
+ : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {}
+
+FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {}
+
+FileCryptoMetaData::~FileCryptoMetaData() = default;
+
+void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const {
+ impl_->WriteTo(dst);
+}
+
+std::string FileMetaData::SerializeToString() const {
+ // We need to pass in an initial size. Since it will automatically
+ // increase the buffer size to hold the metadata, we just leave it 0.
+ PARQUET_ASSIGN_OR_THROW(auto serializer, ::arrow::io::BufferOutputStream::Create(0));
+ WriteTo(serializer.get());
+ PARQUET_ASSIGN_OR_THROW(auto metadata_buffer, serializer->Finish());
+ return metadata_buffer->ToString();
+}
+
+ApplicationVersion::ApplicationVersion(std::string application, int major, int minor,
+ int patch)
+ : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
+
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible parquet-mr's
+// one. See also:
+// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+// "${APPLICATION_NAME}"
+// "${APPLICATION_NAME} version ${VERSION}"
+// "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+// parquet-cpp
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+// "${MAJOR}"
+// "${MAJOR}.${MINOR}"
+// "${MAJOR}.${MINOR}.${PATCH}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+// 1
+// 1.5
+// 1.5.0
+// 1.5.0ab
+// 1.5.0ab-cdh5.5.0
+// 1.5.0ab-cdh5.5.0+cd
+// 1.5.0ab+cd
+// 1.5.0-cdh5.5.0
+// 1.5.0-cdh5.5.0+cd
+// 1.5.0+cd
+class ApplicationVersionParser {
+ public:
+ ApplicationVersionParser(const std::string& created_by,
+ ApplicationVersion& application_version)
+ : created_by_(created_by),
+ application_version_(application_version),
+ spaces_(" \t\v\r\n\f"),
+ digits_("0123456789") {}
+
+ void Parse() {
+ application_version_.application_ = "unknown";
+ application_version_.version = {0, 0, 0, "", "", ""};
+
+ if (!ParseApplicationName()) {
+ return;
+ }
+ if (!ParseVersion()) {
+ return;
+ }
+ if (!ParseBuildName()) {
+ return;
+ }
+ }
+
+ private:
+ bool IsSpace(const std::string& string, const size_t& offset) {
+ auto target = ::arrow::util::string_view(string).substr(offset, 1);
+ return target.find_first_of(spaces_) != ::arrow::util::string_view::npos;
+ }
+
+ void RemovePrecedingSpaces(const std::string& string, size_t& start,
+ const size_t& end) {
+ while (start < end && IsSpace(string, start)) {
+ ++start;
+ }
+ }
+
+ void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+ while (start < (end - 1) && (end - 1) < string.size() && IsSpace(string, end - 1)) {
+ --end;
+ }
+ }
+
+ bool ParseApplicationName() {
+ std::string version_mark(" version ");
+ auto version_mark_position = created_by_.find(version_mark);
+ size_t application_name_end;
+ // No VERSION and BUILD_NAME.
+ if (version_mark_position == std::string::npos) {
+ version_start_ = std::string::npos;
+ application_name_end = created_by_.size();
+ } else {
+ version_start_ = version_mark_position + version_mark.size();
+ application_name_end = version_mark_position;
+ }
+
+ size_t application_name_start = 0;
+ RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+ RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+ application_version_.application_ = created_by_.substr(
+ application_name_start, application_name_end - application_name_start);
+
+ return true;
+ }
+
+ bool ParseVersion() {
+ // No VERSION.
+ if (version_start_ == std::string::npos) {
+ return false;
+ }
+
+ RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+ version_end_ = created_by_.find(" (", version_start_);
+ // No BUILD_NAME.
+ if (version_end_ == std::string::npos) {
+ version_end_ = created_by_.size();
+ }
+ RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+ // No VERSION.
+ if (version_start_ == version_end_) {
+ return false;
+ }
+ version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+ if (!ParseVersionMajor()) {
+ return false;
+ }
+ if (!ParseVersionMinor()) {
+ return false;
+ }
+ if (!ParseVersionPatch()) {
+ return false;
+ }
+ if (!ParseVersionUnknown()) {
+ return false;
+ }
+ if (!ParseVersionPreRelease()) {
+ return false;
+ }
+ if (!ParseVersionBuildInfo()) {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool ParseVersionMajor() {
+ size_t version_major_start = 0;
+ auto version_major_end = version_string_.find_first_not_of(digits_);
+ // MAJOR only.
+ if (version_major_end == std::string::npos) {
+ version_major_end = version_string_.size();
+ version_parsing_position_ = version_major_end;
+ } else {
+ // No ".".
+ if (version_string_[version_major_end] != '.') {
+ return false;
+ }
+ // No MAJOR.
+ if (version_major_end == version_major_start) {
+ return false;
+ }
+ version_parsing_position_ = version_major_end + 1; // +1 is for '.'.
+ }
+ auto version_major_string = version_string_.substr(
+ version_major_start, version_major_end - version_major_start);
+ application_version_.version.major = atoi(version_major_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionMinor() {
+ auto version_minor_start = version_parsing_position_;
+ auto version_minor_end =
+ version_string_.find_first_not_of(digits_, version_minor_start);
+ // MAJOR.MINOR only.
+ if (version_minor_end == std::string::npos) {
+ version_minor_end = version_string_.size();
+ version_parsing_position_ = version_minor_end;
+ } else {
+ // No ".".
+ if (version_string_[version_minor_end] != '.') {
+ return false;
+ }
+ // No MINOR.
+ if (version_minor_end == version_minor_start) {
+ return false;
+ }
+ version_parsing_position_ = version_minor_end + 1; // +1 is for '.'.
+ }
+ auto version_minor_string = version_string_.substr(
+ version_minor_start, version_minor_end - version_minor_start);
+ application_version_.version.minor = atoi(version_minor_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionPatch() {
+ auto version_patch_start = version_parsing_position_;
+ auto version_patch_end =
+ version_string_.find_first_not_of(digits_, version_patch_start);
+ // No UNKNOWN, PRE_RELEASE and BUILD_INFO.
+ if (version_patch_end == std::string::npos) {
+ version_patch_end = version_string_.size();
+ }
+ // No PATCH.
+ if (version_patch_end == version_patch_start) {
+ return false;
+ }
+ auto version_patch_string = version_string_.substr(
+ version_patch_start, version_patch_end - version_patch_start);
+ application_version_.version.patch = atoi(version_patch_string.c_str());
+ version_parsing_position_ = version_patch_end;
+ return true;
+ }
+
+ bool ParseVersionUnknown() {
+ // No UNKNOWN.
+ if (version_parsing_position_ == version_string_.size()) {
+ return true;
+ }
+ auto version_unknown_start = version_parsing_position_;
+ auto version_unknown_end = version_string_.find_first_of("-+", version_unknown_start);
+ // No PRE_RELEASE and BUILD_INFO
+ if (version_unknown_end == std::string::npos) {
+ version_unknown_end = version_string_.size();
+ }
+ application_version_.version.unknown = version_string_.substr(
+ version_unknown_start, version_unknown_end - version_unknown_start);
+ version_parsing_position_ = version_unknown_end;
+ return true;
+ }
+
+ bool ParseVersionPreRelease() {
+ // No PRE_RELEASE.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '-') {
+ return true;
+ }
+
+ auto version_pre_release_start = version_parsing_position_ + 1; // +1 is for '-'.
+ auto version_pre_release_end =
+ version_string_.find_first_of("+", version_pre_release_start);
+ // No BUILD_INFO
+ if (version_pre_release_end == std::string::npos) {
+ version_pre_release_end = version_string_.size();
+ }
+ application_version_.version.pre_release = version_string_.substr(
+ version_pre_release_start, version_pre_release_end - version_pre_release_start);
+ version_parsing_position_ = version_pre_release_end;
+ return true;
+ }
+
+ bool ParseVersionBuildInfo() {
+ // No BUILD_INFO.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '+') {
+ return true;
+ }
+
+ auto version_build_info_start = version_parsing_position_ + 1; // +1 is for '+'.
+ application_version_.version.build_info =
+ version_string_.substr(version_build_info_start);
+ return true;
+ }
+
+ bool ParseBuildName() {
+ std::string build_mark(" (build ");
+ auto build_mark_position = created_by_.find(build_mark, version_end_);
+ // No BUILD_NAME.
+ if (build_mark_position == std::string::npos) {
+ return false;
+ }
+ auto build_name_start = build_mark_position + build_mark.size();
+ RemovePrecedingSpaces(created_by_, build_name_start, created_by_.size());
+ auto build_name_end = created_by_.find_first_of(")", build_name_start);
+ // No end ")".
+ if (build_name_end == std::string::npos) {
+ return false;
+ }
+ RemoveTrailingSpaces(created_by_, build_name_start, build_name_end);
+ application_version_.build_ =
+ created_by_.substr(build_name_start, build_name_end - build_name_start);
+
+ return true;
+ }
+
+ const std::string& created_by_;
+ ApplicationVersion& application_version_;
+
+ // For parsing.
+ std::string spaces_;
+ std::string digits_;
+ size_t version_parsing_position_;
+ size_t version_start_;
+ size_t version_end_;
+ std::string version_string_;
+};
+} // namespace
+
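+// For example, ApplicationVersion("parquet-cpp version 1.5.0-ab+cd (build xyz)")
+// parses to application_ = "parquet-cpp", version = {1, 5, 0, "", "ab", "cd"}
+// and build_ = "xyz".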
+ApplicationVersion::ApplicationVersion(const std::string& created_by) {
+ ApplicationVersionParser parser(created_by, *this);
+ parser.Parse();
+}
+
+bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
+ if (application_ != other_version.application_) return false;
+
+ if (version.major < other_version.version.major) return true;
+ if (version.major > other_version.version.major) return false;
+ DCHECK_EQ(version.major, other_version.version.major);
+ if (version.minor < other_version.version.minor) return true;
+ if (version.minor > other_version.version.minor) return false;
+ DCHECK_EQ(version.minor, other_version.version.minor);
+ return version.patch < other_version.version.patch;
+}
+
+bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const {
+ return application_ == other_version.application_ &&
+ version.major == other_version.version.major &&
+ version.minor == other_version.version.minor &&
+ version.patch == other_version.version.patch;
+}
+
+// Reference:
+// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
+// PARQUET-686 has more discussion on statistics
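+//
+// Example (assuming PARQUET_251_FIXED_VERSION corresponds to parquet-mr
+// 1.8.0): min/max for a SIGNED BYTE_ARRAY column written by "parquet-mr
+// version 1.6.0" are rejected by the PARQUET-251 check below, while the same
+// column written by parquet-mr 1.10.0 or later is accepted.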
+bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
+ EncodedStatistics& statistics,
+ SortOrder::type sort_order) const {
+  // From parquet-cpp 1.3.0 and parquet-mr 1.10.0 onwards, statistics are
+  // computed correctly for all types.
+ if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
+ (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
+    // Only SIGNED sort orders are valid, unless max and min are equal
+    // (in which case the sort order does not matter).
+ bool max_equals_min = statistics.has_min && statistics.has_max
+ ? statistics.min() == statistics.max()
+ : false;
+ if (SortOrder::SIGNED != sort_order && !max_equals_min) {
+ return false;
+ }
+
+ // Statistics of other types are OK
+ if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
+ return true;
+ }
+ }
+  // created_by may be unpopulated, which parquet-mr could cause around the
+  // same time as PARQUET-251; see PARQUET-297.
+ if (application_ == "unknown") {
+ return true;
+ }
+
+ // Unknown sort order has incorrect stats
+ if (SortOrder::UNKNOWN == sort_order) {
+ return false;
+ }
+
+ // PARQUET-251
+ if (VersionLt(PARQUET_251_FIXED_VERSION())) {
+ return false;
+ }
+
+ return true;
+}
+
+// MetaData Builders
+// row-group metadata
+class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
+ public:
+ explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column)
+ : owned_column_chunk_(new format::ColumnChunk),
+ properties_(std::move(props)),
+ column_(column) {
+ Init(owned_column_chunk_.get());
+ }
+
+ explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column,
+ format::ColumnChunk* column_chunk)
+ : properties_(std::move(props)), column_(column) {
+ Init(column_chunk);
+ }
+
+ const void* contents() const { return column_chunk_; }
+
+ // column chunk
+ void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
+
+ // column metadata
+ void SetStatistics(const EncodedStatistics& val) {
+ column_chunk_->meta_data.__set_statistics(ToThrift(val));
+ }
+
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset,
+ int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
+ bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats,
+ const std::shared_ptr<Encryptor>& encryptor) {
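+    // ColumnChunk.file_offset is set to the chunk's first page offset
+    // (the dictionary page when present) plus its compressed size, i.e. one
+    // byte past the end of the chunk.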
+ if (dictionary_page_offset > 0) {
+ column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
+ column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
+ } else {
+ column_chunk_->__set_file_offset(data_page_offset + compressed_size);
+ }
+ column_chunk_->__isset.meta_data = true;
+ column_chunk_->meta_data.__set_num_values(num_values);
+ if (index_page_offset >= 0) {
+ column_chunk_->meta_data.__set_index_page_offset(index_page_offset);
+ }
+ column_chunk_->meta_data.__set_data_page_offset(data_page_offset);
+ column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size);
+ column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
+
+ std::vector<format::Encoding::type> thrift_encodings;
+ if (has_dictionary) {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
+ if (properties_->version() == ParquetVersion::PARQUET_1_0) {
+ thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
+ } else {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding()));
+ }
+ } else { // Dictionary not enabled
+ thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path())));
+ }
+ thrift_encodings.push_back(ToThrift(Encoding::RLE));
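+    // RLE is always listed since repetition and definition levels are
+    // RLE-encoded.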
+ // Only PLAIN encoding is supported for fallback in V1
+ // TODO(majetideepak): Use user specified encoding for V2
+ if (dictionary_fallback) {
+ thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
+ }
+ column_chunk_->meta_data.__set_encodings(thrift_encodings);
+ std::vector<format::PageEncodingStats> thrift_encoding_stats;
+ // Add dictionary page encoding stats
+ for (const auto& entry : dict_encoding_stats) {
+ format::PageEncodingStats dict_enc_stat;
+ dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
+ dict_enc_stat.__set_encoding(ToThrift(entry.first));
+ dict_enc_stat.__set_count(entry.second);
+ thrift_encoding_stats.push_back(dict_enc_stat);
+ }
+ // Add data page encoding stats
+ for (const auto& entry : data_encoding_stats) {
+ format::PageEncodingStats data_enc_stat;
+ data_enc_stat.__set_page_type(format::PageType::DATA_PAGE);
+ data_enc_stat.__set_encoding(ToThrift(entry.first));
+ data_enc_stat.__set_count(entry.second);
+ thrift_encoding_stats.push_back(data_enc_stat);
+ }
+ column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
+
+ const auto& encrypt_md =
+ properties_->column_encryption_properties(column_->path()->ToDotString());
+ // column is encrypted
+ if (encrypt_md != nullptr && encrypt_md->is_encrypted()) {
+ column_chunk_->__isset.crypto_metadata = true;
+ format::ColumnCryptoMetaData ccmd;
+ if (encrypt_md->is_encrypted_with_footer_key()) {
+ // encrypted with footer key
+ ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+ ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey());
+ } else { // encrypted with column key
+ format::EncryptionWithColumnKey eck;
+ eck.__set_key_metadata(encrypt_md->key_metadata());
+ eck.__set_path_in_schema(column_->path()->ToDotVector());
+ ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+ ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
+ }
+ column_chunk_->__set_crypto_metadata(ccmd);
+
+ bool encrypted_footer =
+ properties_->file_encryption_properties()->encrypted_footer();
+ bool encrypt_metadata =
+ !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key();
+ if (encrypt_metadata) {
+ ThriftSerializer serializer;
+        // Serialize and encrypt the ColumnMetaData separately: thrift-serialize
+        // the structure, encrypt it with the column key, and store the result
+        // in encrypted_column_metadata.
+ uint8_t* serialized_data;
+ uint32_t serialized_len;
+
+ serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len,
+ &serialized_data);
+
+ std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
+ serialized_len);
+ unsigned encrypted_len =
+ encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
+
+ const char* temp =
+ const_cast<const char*>(reinterpret_cast<char*>(encrypted_data.data()));
+ std::string encrypted_column_metadata(temp, encrypted_len);
+ column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata);
+
+ if (encrypted_footer) {
+ column_chunk_->__isset.meta_data = false;
+ } else {
+ // Keep redacted metadata version for old readers
+ column_chunk_->__isset.meta_data = true;
+ column_chunk_->meta_data.__isset.statistics = false;
+ column_chunk_->meta_data.__isset.encoding_stats = false;
+ }
+ }
+ }
+ }
+
+ void WriteTo(::arrow::io::OutputStream* sink) {
+ ThriftSerializer serializer;
+ serializer.Serialize(column_chunk_, sink);
+ }
+
+ const ColumnDescriptor* descr() const { return column_; }
+ int64_t total_compressed_size() const {
+ return column_chunk_->meta_data.total_compressed_size;
+ }
+
+ private:
+ void Init(format::ColumnChunk* column_chunk) {
+ column_chunk_ = column_chunk;
+
+ column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type()));
+ column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector());
+ column_chunk_->meta_data.__set_codec(
+ ToThrift(properties_->compression(column_->path())));
+ }
+
+ format::ColumnChunk* column_chunk_;
+ std::unique_ptr<format::ColumnChunk> owned_column_chunk_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const ColumnDescriptor* column_;
+};
+
+std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents) {
+ return std::unique_ptr<ColumnChunkMetaDataBuilder>(
+ new ColumnChunkMetaDataBuilder(std::move(props), column, contents));
+}
+
+std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column) {
+ return std::unique_ptr<ColumnChunkMetaDataBuilder>(
+ new ColumnChunkMetaDataBuilder(std::move(props), column));
+}
+
+ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
+ new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {}
+
+ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
+ new ColumnChunkMetaDataBuilderImpl(
+ std::move(props), column,
+ reinterpret_cast<format::ColumnChunk*>(contents)))} {}
+
+ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default;
+
+const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); }
+
+void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
+ impl_->set_file_path(path);
+}
+
+void ColumnChunkMetaDataBuilder::Finish(
+ int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset,
+ int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size,
+ bool has_dictionary, bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset,
+ compressed_size, uncompressed_size, has_dictionary, dictionary_fallback,
+ dict_encoding_stats, data_encoding_stats, encryptor);
+}
+
+void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) {
+ impl_->WriteTo(sink);
+}
+
+const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const {
+ return impl_->descr();
+}
+
+void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) {
+ impl_->SetStatistics(result);
+}
+
+int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
+ public:
+ explicit RowGroupMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema, void* contents)
+ : properties_(std::move(props)), schema_(schema), next_column_(0) {
+ row_group_ = reinterpret_cast<format::RowGroup*>(contents);
+ InitializeColumns(schema->num_columns());
+ }
+
+ ColumnChunkMetaDataBuilder* NextColumnChunk() {
+ if (!(next_column_ < num_columns())) {
+ std::stringstream ss;
+ ss << "The schema only has " << num_columns()
+ << " columns, requested metadata for column: " << next_column_;
+ throw ParquetException(ss.str());
+ }
+ auto column = schema_->Column(next_column_);
+ auto column_builder = ColumnChunkMetaDataBuilder::Make(
+ properties_, column, &row_group_->columns[next_column_++]);
+ auto column_builder_ptr = column_builder.get();
+ column_builders_.push_back(std::move(column_builder));
+ return column_builder_ptr;
+ }
+
+ int current_column() { return next_column_ - 1; }
+
+ void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) {
+ if (!(next_column_ == schema_->num_columns())) {
+ std::stringstream ss;
+ ss << "Only " << next_column_ - 1 << " out of " << schema_->num_columns()
+ << " columns are initialized";
+ throw ParquetException(ss.str());
+ }
+
+ int64_t file_offset = 0;
+ int64_t total_compressed_size = 0;
+ for (int i = 0; i < schema_->num_columns(); i++) {
+ if (!(row_group_->columns[i].file_offset >= 0)) {
+ std::stringstream ss;
+ ss << "Column " << i << " is not complete.";
+ throw ParquetException(ss.str());
+ }
+ if (i == 0) {
+ file_offset = row_group_->columns[0].file_offset;
+ }
+ // sometimes column metadata is encrypted and not available to read,
+ // so we must get total_compressed_size from column builder
+ total_compressed_size += column_builders_[i]->total_compressed_size();
+ }
+
+ row_group_->__set_file_offset(file_offset);
+ row_group_->__set_total_compressed_size(total_compressed_size);
+ row_group_->__set_total_byte_size(total_bytes_written);
+ row_group_->__set_ordinal(row_group_ordinal);
+ }
+
+ void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }
+
+ int num_columns() { return static_cast<int>(row_group_->columns.size()); }
+
+ int64_t num_rows() { return row_group_->num_rows; }
+
+ private:
+ void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); }
+
+ format::RowGroup* row_group_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const SchemaDescriptor* schema_;
+ std::vector<std::unique_ptr<ColumnChunkMetaDataBuilder>> column_builders_;
+ int next_column_;
+};
+
+std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
+ void* contents) {
+ return std::unique_ptr<RowGroupMetaDataBuilder>(
+ new RowGroupMetaDataBuilder(std::move(props), schema_, contents));
+}
+
+RowGroupMetaDataBuilder::RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema_,
+ void* contents)
+ : impl_{new RowGroupMetaDataBuilderImpl(std::move(props), schema_, contents)} {}
+
+RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() = default;
+
+ColumnChunkMetaDataBuilder* RowGroupMetaDataBuilder::NextColumnChunk() {
+ return impl_->NextColumnChunk();
+}
+
+int RowGroupMetaDataBuilder::current_column() const { return impl_->current_column(); }
+
+int RowGroupMetaDataBuilder::num_columns() { return impl_->num_columns(); }
+
+int64_t RowGroupMetaDataBuilder::num_rows() { return impl_->num_rows(); }
+
+void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) {
+ impl_->set_num_rows(num_rows);
+}
+
+void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written,
+ int16_t row_group_ordinal) {
+ impl_->Finish(total_bytes_written, row_group_ordinal);
+}
+
+// file metadata
+// TODO(PARQUET-595) Support key_value_metadata
+class FileMetaDataBuilder::FileMetaDataBuilderImpl {
+ public:
+ explicit FileMetaDataBuilderImpl(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : metadata_(new format::FileMetaData()),
+ properties_(std::move(props)),
+ schema_(schema),
+ key_value_metadata_(std::move(key_value_metadata)) {
+ if (properties_->file_encryption_properties() != nullptr &&
+ properties_->file_encryption_properties()->encrypted_footer()) {
+ crypto_metadata_.reset(new format::FileCryptoMetaData());
+ }
+ }
+
+ RowGroupMetaDataBuilder* AppendRowGroup() {
+ row_groups_.emplace_back();
+ current_row_group_builder_ =
+ RowGroupMetaDataBuilder::Make(properties_, schema_, &row_groups_.back());
+ return current_row_group_builder_.get();
+ }
+
+ std::unique_ptr<FileMetaData> Finish() {
+ int64_t total_rows = 0;
+ for (auto row_group : row_groups_) {
+ total_rows += row_group.num_rows;
+ }
+ metadata_->__set_num_rows(total_rows);
+ metadata_->__set_row_groups(row_groups_);
+
+ if (key_value_metadata_) {
+ metadata_->key_value_metadata.clear();
+ metadata_->key_value_metadata.reserve(key_value_metadata_->size());
+ for (int64_t i = 0; i < key_value_metadata_->size(); ++i) {
+ format::KeyValue kv_pair;
+ kv_pair.__set_key(key_value_metadata_->key(i));
+ kv_pair.__set_value(key_value_metadata_->value(i));
+ metadata_->key_value_metadata.push_back(kv_pair);
+ }
+ metadata_->__isset.key_value_metadata = true;
+ }
+
+ int32_t file_version = 0;
+ switch (properties_->version()) {
+ case ParquetVersion::PARQUET_1_0:
+ file_version = 1;
+ break;
+ case ParquetVersion::PARQUET_2_0:
+ file_version = 2;
+ break;
+ default:
+ break;
+ }
+ metadata_->__set_version(file_version);
+ metadata_->__set_created_by(properties_->created_by());
+
+    // Users cannot set the `ColumnOrder` since we do not have user-defined
+    // sort orders in the spec yet.
+    // We always default to `TYPE_DEFINED_ORDER`. We can expose it in
+    // the API once we have user-defined sort orders in the Parquet format.
+    // TypeDefinedOrder means the SortOrder is chosen based on the
+    // ConvertedType/PhysicalType.
+ format::TypeDefinedOrder type_defined_order;
+ format::ColumnOrder column_order;
+ column_order.__set_TYPE_ORDER(type_defined_order);
+ column_order.__isset.TYPE_ORDER = true;
+ metadata_->column_orders.resize(schema_->num_columns(), column_order);
+ metadata_->__isset.column_orders = true;
+
+ // if plaintext footer, set footer signing algorithm
+ auto file_encryption_properties = properties_->file_encryption_properties();
+ if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) {
+ EncryptionAlgorithm signing_algorithm;
+ EncryptionAlgorithm algo = file_encryption_properties->algorithm();
+ signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique;
+ signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix;
+ if (!algo.aad.supply_aad_prefix) {
+ signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix;
+ }
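+      // Footer signing always uses AES_GCM_V1, since the GCM tag serves as
+      // the footer signature even when the file's modules use
+      // AES_GCM_CTR_V1.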
+ signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
+
+ metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm));
+ const std::string& footer_signing_key_metadata =
+ file_encryption_properties->footer_key_metadata();
+ if (footer_signing_key_metadata.size() > 0) {
+ metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata);
+ }
+ }
+
+ ToParquet(static_cast<parquet::schema::GroupNode*>(schema_->schema_root().get()),
+ &metadata_->schema);
+ auto file_meta_data = std::unique_ptr<FileMetaData>(new FileMetaData());
+ file_meta_data->impl_->metadata_ = std::move(metadata_);
+ file_meta_data->impl_->InitSchema();
+ file_meta_data->impl_->InitKeyValueMetadata();
+ return file_meta_data;
+ }
+
+ std::unique_ptr<FileCryptoMetaData> BuildFileCryptoMetaData() {
+ if (crypto_metadata_ == nullptr) {
+ return nullptr;
+ }
+
+ auto file_encryption_properties = properties_->file_encryption_properties();
+
+ crypto_metadata_->__set_encryption_algorithm(
+ ToThrift(file_encryption_properties->algorithm()));
+ std::string key_metadata = file_encryption_properties->footer_key_metadata();
+
+ if (!key_metadata.empty()) {
+ crypto_metadata_->__set_key_metadata(key_metadata);
+ }
+
+ std::unique_ptr<FileCryptoMetaData> file_crypto_metadata =
+ std::unique_ptr<FileCryptoMetaData>(new FileCryptoMetaData());
+ file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_);
+
+ return file_crypto_metadata;
+ }
+
+ protected:
+ std::unique_ptr<format::FileMetaData> metadata_;
+ std::unique_ptr<format::FileCryptoMetaData> crypto_metadata_;
+
+ private:
+ const std::shared_ptr<WriterProperties> properties_;
+ std::vector<format::RowGroup> row_groups_;
+
+ std::unique_ptr<RowGroupMetaDataBuilder> current_row_group_builder_;
+ const SchemaDescriptor* schema_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+};
+
+std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ return std::unique_ptr<FileMetaDataBuilder>(
+ new FileMetaDataBuilder(schema, std::move(props), std::move(key_value_metadata)));
+}
+
+FileMetaDataBuilder::FileMetaDataBuilder(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : impl_{std::unique_ptr<FileMetaDataBuilderImpl>(new FileMetaDataBuilderImpl(
+ schema, std::move(props), std::move(key_value_metadata)))} {}
+
+FileMetaDataBuilder::~FileMetaDataBuilder() = default;
+
+RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() {
+ return impl_->AppendRowGroup();
+}
+
+std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() { return impl_->Finish(); }
+
+std::unique_ptr<FileCryptoMetaData> FileMetaDataBuilder::GetCryptoMetaData() {
+ return impl_->BuildFileCryptoMetaData();
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
index 1865115e423..b432c20cf64 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
@@ -1,484 +1,484 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <map>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-class ColumnDescriptor;
-class EncodedStatistics;
-class Statistics;
-class SchemaDescriptor;
-
-class FileCryptoMetaData;
-class InternalFileDecryptor;
-class Decryptor;
-class Encryptor;
-class FooterSigningEncryptor;
-
-namespace schema {
-
-class ColumnPath;
-
-} // namespace schema
-
-using KeyValueMetadata = ::arrow::KeyValueMetadata;
-
-class PARQUET_EXPORT ApplicationVersion {
- public:
- // Known Versions with Issues
- static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
- static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
- static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
- static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
-
- // Application that wrote the file, e.g. "IMPALA"
- std::string application_;
- // Build name
- std::string build_;
-
- // Version of the application that wrote the file, expressed as
- // (<major>.<minor>.<patch>). Unmatched parts default to 0.
- // "1.2.3" => {1, 2, 3}
- // "1.2" => {1, 2, 0}
- // "1.2-cdh5" => {1, 2, 0}
- struct {
- int major;
- int minor;
- int patch;
- std::string unknown;
- std::string pre_release;
- std::string build_info;
- } version;
-
- ApplicationVersion() = default;
- explicit ApplicationVersion(const std::string& created_by);
- ApplicationVersion(std::string application, int major, int minor, int patch);
-
- // Returns true if version is strictly less than other_version
- bool VersionLt(const ApplicationVersion& other_version) const;
-
- // Returns true if version is equal to other_version
- bool VersionEq(const ApplicationVersion& other_version) const;
-
- // Checks if the Version has the correct statistics for a given column
- bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
- SortOrder::type sort_order = SortOrder::SIGNED) const;
-};
-
-class PARQUET_EXPORT ColumnCryptoMetaData {
- public:
- static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
- ~ColumnCryptoMetaData();
-
- bool Equals(const ColumnCryptoMetaData& other) const;
-
- std::shared_ptr<schema::ColumnPath> path_in_schema() const;
- bool encrypted_with_footer_key() const;
- const std::string& key_metadata() const;
-
- private:
- explicit ColumnCryptoMetaData(const uint8_t* metadata);
-
- class ColumnCryptoMetaDataImpl;
- std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
-};
-
-/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
-struct PageEncodingStats {
- PageType::type page_type;
- Encoding::type encoding;
- int32_t count;
-};
-
-/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
-class PARQUET_EXPORT ColumnChunkMetaData {
- public:
- // API convenience to get a MetaData accessor
- static std::unique_ptr<ColumnChunkMetaData> Make(
- const void* metadata, const ColumnDescriptor* descr,
- const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
- int16_t column_ordinal = -1,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- ~ColumnChunkMetaData();
-
- bool Equals(const ColumnChunkMetaData& other) const;
-
- // column chunk
- int64_t file_offset() const;
-
- // parameter is only used when a dataset is spread across multiple files
- const std::string& file_path() const;
-
- // column metadata
- bool is_metadata_set() const;
- Type::type type() const;
- int64_t num_values() const;
- std::shared_ptr<schema::ColumnPath> path_in_schema() const;
- bool is_stats_set() const;
- std::shared_ptr<Statistics> statistics() const;
-
- Compression::type compression() const;
- // Indicate if the ColumnChunk compression is supported by the currently
- // compiled parquet library.
- bool can_decompress() const;
-
- const std::vector<Encoding::type>& encodings() const;
- const std::vector<PageEncodingStats>& encoding_stats() const;
- bool has_dictionary_page() const;
- int64_t dictionary_page_offset() const;
- int64_t data_page_offset() const;
- bool has_index_page() const;
- int64_t index_page_offset() const;
- int64_t total_compressed_size() const;
- int64_t total_uncompressed_size() const;
- std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
-
- private:
- explicit ColumnChunkMetaData(
- const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
- int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
- // PIMPL Idiom
- class ColumnChunkMetaDataImpl;
- std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
-};
-
-/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
-class PARQUET_EXPORT RowGroupMetaData {
- public:
- /// \brief Create a RowGroupMetaData from a serialized thrift message.
- static std::unique_ptr<RowGroupMetaData> Make(
- const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version = NULLPTR,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- ~RowGroupMetaData();
-
- bool Equals(const RowGroupMetaData& other) const;
-
- /// \brief The number of columns in this row group. The order must match the
- /// parent's column ordering.
- int num_columns() const;
-
- /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
- ///
- /// WARNING, the returned object references a memory location in its parent
- /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
- /// object.
- ///
- /// \param[in] index of the ColumnChunkMetaData to retrieve.
- ///
- /// \throws ParquetException if the index is out of bounds.
- std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
-
- /// \brief Number of rows in this row group.
- int64_t num_rows() const;
-
- /// \brief Total byte size of all the uncompressed column data in this row group.
- int64_t total_byte_size() const;
-
- /// \brief Total byte size of all the compressed (and potentially encrypted)
- /// column data in this row group.
- ///
- /// This information is optional and may be 0 if omitted.
- int64_t total_compressed_size() const;
-
- /// \brief Byte offset from beginning of file to first page (data or
- /// dictionary) in this row group
- ///
- /// The file_offset field that this method exposes is optional. This method
- /// will return 0 if that field is not set to a meaningful value.
- int64_t file_offset() const;
- // Return const-pointer to make it clear that this object is not to be copied
- const SchemaDescriptor* schema() const;
- // Indicate if all of the RowGroup's ColumnChunks can be decompressed.
- bool can_decompress() const;
-
- private:
- explicit RowGroupMetaData(
- const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version = NULLPTR,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
- // PIMPL Idiom
- class RowGroupMetaDataImpl;
- std::unique_ptr<RowGroupMetaDataImpl> impl_;
-};
-
-class FileMetaDataBuilder;
-
-/// \brief FileMetaData is a proxy around format::FileMetaData.
-class PARQUET_EXPORT FileMetaData {
- public:
- /// \brief Create a FileMetaData from a serialized thrift message.
- static std::shared_ptr<FileMetaData> Make(
- const void* serialized_metadata, uint32_t* inout_metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- ~FileMetaData();
-
- bool Equals(const FileMetaData& other) const;
-
- /// \brief The number of top-level columns in the schema.
- ///
- /// Parquet thrift definition requires that nested schema elements are
- /// flattened. This method returns the number of columns in the un-flattened
- /// version.
- int num_columns() const;
-
- /// \brief The number of flattened schema elements.
- ///
- /// Parquet thrift definition requires that nested schema elements are
- /// flattened. This method returns the total number of elements in the
- /// flattened list.
- int num_schema_elements() const;
-
- /// \brief The total number of rows.
- int64_t num_rows() const;
-
- /// \brief The number of row groups in the file.
- int num_row_groups() const;
-
- /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
- ///
- /// WARNING, the returned object references a memory location in its parent
- /// (FileMetaData) object. Hence, the parent must outlive the returned object.
- ///
- /// \param[in] index of the RowGroup to retrieve.
- ///
- /// \throws ParquetException if the index is out of bounds.
- std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
-
- /// \brief Return the version of the file.
- ParquetVersion::type version() const;
-
- /// \brief Return the application's user-agent string of the writer.
- const std::string& created_by() const;
-
- /// \brief Return the application's version of the writer.
- const ApplicationVersion& writer_version() const;
-
- /// \brief Size of the original thrift encoded metadata footer.
- uint32_t size() const;
-
- /// \brief Indicate if all of the FileMetaData's RowGroups can be decompressed.
- ///
- /// This will return false if any page in any RowGroup is compressed with a
- /// compression format that is not compiled into the current parquet library.
- bool can_decompress() const;
-
- bool is_encryption_algorithm_set() const;
- EncryptionAlgorithm encryption_algorithm() const;
- const std::string& footer_signing_key_metadata() const;
-
- /// \brief Verify signature of FileMetaData when file is encrypted but footer
- /// is not encrypted (plaintext footer).
- bool VerifySignature(const void* signature);
-
- void WriteTo(::arrow::io::OutputStream* dst,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
-
- /// \brief Return Thrift-serialized representation of the metadata as a
- /// string
- std::string SerializeToString() const;
-
- // Return const-pointer to make it clear that this object is not to be copied
- const SchemaDescriptor* schema() const;
-
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
-
- /// \brief Set a file path for all ColumnChunks in all RowGroups.
- ///
- /// Commonly used by systems (Dask, Spark) that generate a metadata-only
- /// parquet file. The path is usually relative to that index file.
- ///
- /// \param[in] path to set.
- void set_file_path(const std::string& path);
-
- /// \brief Merge row groups from another metadata file into this one.
- ///
- /// The schema of the input FileMetaData must be equal to the
- /// schema of this object.
- ///
- /// This is used by systems that create an aggregate metadata-only file by
- /// concatenating the row groups of multiple files. This newly created
- /// metadata file acts as an index of all available row groups.
- ///
- /// \param[in] other FileMetaData to merge the row groups from.
- ///
- /// \throws ParquetException if schemas are not equal.
- void AppendRowGroups(const FileMetaData& other);
-
- /// \brief Return a FileMetaData containing a subset of the row groups in this
- /// FileMetaData.
- std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
-
- private:
- friend FileMetaDataBuilder;
- friend class SerializedFile;
-
- explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
-
- // PIMPL Idiom
- FileMetaData();
- class FileMetaDataImpl;
- std::unique_ptr<FileMetaDataImpl> impl_;
-};
-
-class PARQUET_EXPORT FileCryptoMetaData {
- public:
- // API convenience to get a MetaData accessor
- static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
- uint32_t* metadata_len);
- ~FileCryptoMetaData();
-
- EncryptionAlgorithm encryption_algorithm() const;
- const std::string& key_metadata() const;
-
- void WriteTo(::arrow::io::OutputStream* dst) const;
-
- private:
- friend FileMetaDataBuilder;
- FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
-
- // PIMPL Idiom
- FileCryptoMetaData();
- class FileCryptoMetaDataImpl;
- std::unique_ptr<FileCryptoMetaDataImpl> impl_;
-};
-
-// Builder API
-class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
- public:
- // API convenience to get a MetaData reader
- static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
-
- static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
- void* contents);
-
- ~ColumnChunkMetaDataBuilder();
-
- // column chunk
- // Used when a dataset is spread across multiple files
- void set_file_path(const std::string& path);
- // column metadata
- void SetStatistics(const EncodedStatistics& stats);
- // get the column descriptor
- const ColumnDescriptor* descr() const;
-
- int64_t total_compressed_size() const;
- // commit the metadata
-
- void Finish(int64_t num_values, int64_t dictionary_page_offset,
- int64_t index_page_offset, int64_t data_page_offset,
- int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
- bool dictionary_fallback,
- const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
- const std::map<Encoding::type, int32_t>& data_encoding_stats_,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
-
- // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
- const void* contents() const;
-
- // For writing metadata at end of column chunk
- void WriteTo(::arrow::io::OutputStream* sink);
-
- private:
- explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column);
- explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column, void* contents);
- // PIMPL Idiom
- class ColumnChunkMetaDataBuilderImpl;
- std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
-};
-
-class PARQUET_EXPORT RowGroupMetaDataBuilder {
- public:
- // API convenience to get a MetaData reader
- static std::unique_ptr<RowGroupMetaDataBuilder> Make(
- std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
- void* contents);
-
- ~RowGroupMetaDataBuilder();
-
- ColumnChunkMetaDataBuilder* NextColumnChunk();
- int num_columns();
- int64_t num_rows();
- int current_column() const;
-
- void set_num_rows(int64_t num_rows);
-
- // commit the metadata
- void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
-
- private:
- explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const SchemaDescriptor* schema_, void* contents);
- // PIMPL Idiom
- class RowGroupMetaDataBuilderImpl;
- std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
-};
-
-class PARQUET_EXPORT FileMetaDataBuilder {
- public:
- // API convenience to get a MetaData reader
- static std::unique_ptr<FileMetaDataBuilder> Make(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
-
- ~FileMetaDataBuilder();
-
- // The prior RowGroupMetaDataBuilder (if any) is destroyed
- RowGroupMetaDataBuilder* AppendRowGroup();
-
- // Complete the Thrift structure
- std::unique_ptr<FileMetaData> Finish();
-
- // crypto metadata
- std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
-
- private:
- explicit FileMetaDataBuilder(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
- // PIMPL Idiom
- class FileMetaDataBuilderImpl;
- std::unique_ptr<FileMetaDataBuilderImpl> impl_;
-};
-
-PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class ColumnDescriptor;
+class EncodedStatistics;
+class Statistics;
+class SchemaDescriptor;
+
+class FileCryptoMetaData;
+class InternalFileDecryptor;
+class Decryptor;
+class Encryptor;
+class FooterSigningEncryptor;
+
+namespace schema {
+
+class ColumnPath;
+
+} // namespace schema
+
+using KeyValueMetadata = ::arrow::KeyValueMetadata;
+
+class PARQUET_EXPORT ApplicationVersion {
+ public:
+ // Known Versions with Issues
+ static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
+ static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
+ static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
+ static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
+
+ // Application that wrote the file, e.g. "IMPALA"
+ std::string application_;
+ // Build name
+ std::string build_;
+
+ // Version of the application that wrote the file, expressed as
+ // (<major>.<minor>.<patch>). Unmatched parts default to 0.
+ // "1.2.3" => {1, 2, 3}
+ // "1.2" => {1, 2, 0}
+ // "1.2-cdh5" => {1, 2, 0}
+ struct {
+ int major;
+ int minor;
+ int patch;
+ std::string unknown;
+ std::string pre_release;
+ std::string build_info;
+ } version;
+
+ ApplicationVersion() = default;
+ explicit ApplicationVersion(const std::string& created_by);
+ ApplicationVersion(std::string application, int major, int minor, int patch);
+
+ // Returns true if version is strictly less than other_version
+ bool VersionLt(const ApplicationVersion& other_version) const;
+
+ // Returns true if version is equal to other_version
+ bool VersionEq(const ApplicationVersion& other_version) const;
+
+ // Checks if the Version has the correct statistics for a given column
+ bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
+ SortOrder::type sort_order = SortOrder::SIGNED) const;
+};
+
+class PARQUET_EXPORT ColumnCryptoMetaData {
+ public:
+ static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
+ ~ColumnCryptoMetaData();
+
+ bool Equals(const ColumnCryptoMetaData& other) const;
+
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+ bool encrypted_with_footer_key() const;
+ const std::string& key_metadata() const;
+
+ private:
+ explicit ColumnCryptoMetaData(const uint8_t* metadata);
+
+ class ColumnCryptoMetaDataImpl;
+ std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
+};
+
+/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
+struct PageEncodingStats {
+ PageType::type page_type;
+ Encoding::type encoding;
+ int32_t count;
+};
+
+/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
+class PARQUET_EXPORT ColumnChunkMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::unique_ptr<ColumnChunkMetaData> Make(
+ const void* metadata, const ColumnDescriptor* descr,
+ const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
+ int16_t column_ordinal = -1,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~ColumnChunkMetaData();
+
+ bool Equals(const ColumnChunkMetaData& other) const;
+
+ // column chunk
+ int64_t file_offset() const;
+
+ // parameter is only used when a dataset is spread across multiple files
+ const std::string& file_path() const;
+
+ // column metadata
+ bool is_metadata_set() const;
+ Type::type type() const;
+ int64_t num_values() const;
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+ bool is_stats_set() const;
+ std::shared_ptr<Statistics> statistics() const;
+
+ Compression::type compression() const;
+ // Indicate if the ColumnChunk compression is supported by the currently
+ // compiled parquet library.
+ bool can_decompress() const;
+
+ const std::vector<Encoding::type>& encodings() const;
+ const std::vector<PageEncodingStats>& encoding_stats() const;
+ bool has_dictionary_page() const;
+ int64_t dictionary_page_offset() const;
+ int64_t data_page_offset() const;
+ bool has_index_page() const;
+ int64_t index_page_offset() const;
+ int64_t total_compressed_size() const;
+ int64_t total_uncompressed_size() const;
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
+
+ private:
+ explicit ColumnChunkMetaData(
+ const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
+ int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataImpl;
+ std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
+};
+
+/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
+class PARQUET_EXPORT RowGroupMetaData {
+ public:
+ /// \brief Create a RowGroupMetaData from a serialized thrift message.
+ static std::unique_ptr<RowGroupMetaData> Make(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~RowGroupMetaData();
+
+ bool Equals(const RowGroupMetaData& other) const;
+
+ /// \brief The number of columns in this row group. The order must match the
+ /// parent's column ordering.
+ int num_columns() const;
+
+ /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
+ ///
+ /// WARNING, the returned object references a memory location in its parent
+ /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
+ /// object.
+ ///
+ /// \param[in] index of the ColumnChunkMetaData to retrieve.
+ ///
+ /// \throws ParquetException if the index is out of bounds.
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
+
+ /// \brief Number of rows in this row group.
+ int64_t num_rows() const;
+
+ /// \brief Total byte size of all the uncompressed column data in this row group.
+ int64_t total_byte_size() const;
+
+ /// \brief Total byte size of all the compressed (and potentially encrypted)
+ /// column data in this row group.
+ ///
+ /// This information is optional and may be 0 if omitted.
+ int64_t total_compressed_size() const;
+
+ /// \brief Byte offset from beginning of file to first page (data or
+ /// dictionary) in this row group
+ ///
+ /// The file_offset field that this method exposes is optional. This method
+ /// will return 0 if that field is not set to a meaningful value.
+ int64_t file_offset() const;
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const;
+ // Indicate if all of the RowGroup's ColumnChunks can be decompressed.
+ bool can_decompress() const;
+
+ private:
+ explicit RowGroupMetaData(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+ // PIMPL Idiom
+ class RowGroupMetaDataImpl;
+ std::unique_ptr<RowGroupMetaDataImpl> impl_;
+};
+
+class FileMetaDataBuilder;
+
+/// \brief FileMetaData is a proxy around format::FileMetaData.
+class PARQUET_EXPORT FileMetaData {
+ public:
+ /// \brief Create a FileMetaData from a serialized thrift message.
+ static std::shared_ptr<FileMetaData> Make(
+ const void* serialized_metadata, uint32_t* inout_metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~FileMetaData();
+
+ bool Equals(const FileMetaData& other) const;
+
+ /// \brief The number of top-level columns in the schema.
+ ///
+ /// Parquet thrift definition requires that nested schema elements are
+ /// flattened. This method returns the number of columns in the un-flattened
+ /// version.
+ int num_columns() const;
+
+ /// \brief The number of flattened schema elements.
+ ///
+ /// Parquet thrift definition requires that nested schema elements are
+ /// flattened. This method returns the total number of elements in the
+ /// flattened list.
+ int num_schema_elements() const;
+
+ /// \brief The total number of rows.
+ int64_t num_rows() const;
+
+ /// \brief The number of row groups in the file.
+ int num_row_groups() const;
+
+ /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
+ ///
+ /// WARNING, the returned object references a memory location in its parent
+ /// (FileMetaData) object. Hence, the parent must outlive the returned object.
+ ///
+ /// \param[in] index of the RowGroup to retrieve.
+ ///
+ /// \throws ParquetException if the index is out of bounds.
+ std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
+
+ /// \brief Return the version of the file.
+ ParquetVersion::type version() const;
+
+ /// \brief Return the application's user-agent string of the writer.
+ const std::string& created_by() const;
+
+ /// \brief Return the application's version of the writer.
+ const ApplicationVersion& writer_version() const;
+
+ /// \brief Size of the original thrift encoded metadata footer.
+ uint32_t size() const;
+
+ /// \brief Indicate if all of the FileMetaData's RowGroups can be decompressed.
+ ///
+ /// This will return false if any page in any RowGroup is compressed with a
+ /// compression format that is not compiled into the current parquet library.
+ bool can_decompress() const;
+
+ bool is_encryption_algorithm_set() const;
+ EncryptionAlgorithm encryption_algorithm() const;
+ const std::string& footer_signing_key_metadata() const;
+
+ /// \brief Verify signature of FileMetaData when file is encrypted but footer
+ /// is not encrypted (plaintext footer).
+ bool VerifySignature(const void* signature);
+
+ void WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
+
+ /// \brief Return Thrift-serialized representation of the metadata as a
+ /// string
+ std::string SerializeToString() const;
+
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const;
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+ /// \brief Set a file path for all ColumnChunks in all RowGroups.
+ ///
+ /// Commonly used by systems (Dask, Spark) that generate a metadata-only
+ /// parquet file. The path is usually relative to that index file.
+ ///
+ /// \param[in] path to set.
+ void set_file_path(const std::string& path);
+
+ /// \brief Merge row groups from another metadata file into this one.
+ ///
+ /// The schema of the input FileMetaData must be equal to the
+ /// schema of this object.
+ ///
+ /// This is used by systems that create an aggregate metadata-only file by
+ /// concatenating the row groups of multiple files. This newly created
+ /// metadata file acts as an index of all available row groups.
+ ///
+ /// \param[in] other FileMetaData to merge the row groups from.
+ ///
+ /// \throws ParquetException if schemas are not equal.
+ void AppendRowGroups(const FileMetaData& other);
+
+ /// \brief Return a FileMetaData containing a subset of the row groups in this
+ /// FileMetaData.
+ std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
+
+ private:
+ friend FileMetaDataBuilder;
+ friend class SerializedFile;
+
+ explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
+
+ // PIMPL Idiom
+ FileMetaData();
+ class FileMetaDataImpl;
+ std::unique_ptr<FileMetaDataImpl> impl_;
+};
+
+class PARQUET_EXPORT FileCryptoMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
+ uint32_t* metadata_len);
+ ~FileCryptoMetaData();
+
+ EncryptionAlgorithm encryption_algorithm() const;
+ const std::string& key_metadata() const;
+
+ void WriteTo(::arrow::io::OutputStream* dst) const;
+
+ private:
+ friend FileMetaDataBuilder;
+ FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
+
+ // PIMPL Idiom
+ FileCryptoMetaData();
+ class FileCryptoMetaDataImpl;
+ std::unique_ptr<FileCryptoMetaDataImpl> impl_;
+};
+
+// Builder API
+class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
+ public:
+ // API convenience to get a MetaData reader
+ static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
+
+ static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents);
+
+ ~ColumnChunkMetaDataBuilder();
+
+ // column chunk
+ // Used when a dataset is spread across multiple files
+ void set_file_path(const std::string& path);
+ // column metadata
+ void SetStatistics(const EncodedStatistics& stats);
+ // get the column descriptor
+ const ColumnDescriptor* descr() const;
+
+ int64_t total_compressed_size() const;
+ // commit the metadata
+
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset,
+ int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
+ bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats_,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
+
+ // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
+ const void* contents() const;
+
+ // For writing metadata at end of column chunk
+ void WriteTo(::arrow::io::OutputStream* sink);
+
+ private:
+ explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column);
+ explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column, void* contents);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataBuilderImpl;
+ std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
+};
+
+class PARQUET_EXPORT RowGroupMetaDataBuilder {
+ public:
+ // API convenience to get a MetaData reader
+ static std::unique_ptr<RowGroupMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
+ void* contents);
+
+ ~RowGroupMetaDataBuilder();
+
+ ColumnChunkMetaDataBuilder* NextColumnChunk();
+ int num_columns();
+ int64_t num_rows();
+ int current_column() const;
+
+ void set_num_rows(int64_t num_rows);
+
+ // commit the metadata
+ void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
+
+ private:
+ explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema_, void* contents);
+ // PIMPL Idiom
+ class RowGroupMetaDataBuilderImpl;
+ std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
+};
+
+class PARQUET_EXPORT FileMetaDataBuilder {
+ public:
+ // API convenience to get a MetaData reader
+ static std::unique_ptr<FileMetaDataBuilder> Make(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+
+ ~FileMetaDataBuilder();
+
+ // The prior RowGroupMetaDataBuilder (if any) is destroyed
+ RowGroupMetaDataBuilder* AppendRowGroup();
+
+ // Complete the Thrift structure
+ std::unique_ptr<FileMetaData> Finish();
+
+ // crypto metadata
+ std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
+
+ private:
+ explicit FileMetaDataBuilder(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+ // PIMPL Idiom
+ class FileMetaDataBuilderImpl;
+ std::unique_ptr<FileMetaDataBuilderImpl> impl_;
+};
+
+PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
+
+} // namespace parquet
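On the read side, a FileMetaData is usually obtained from ParquetFileReader rather than built by hand. A minimal sketch of inspecting a footer, with "example.parquet" as a placeholder path; per the header's lifetime warning, each RowGroupMetaData borrows from its parent FileMetaData:

#include <memory>

#include "arrow/io/file.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"

int main() {
  std::shared_ptr<::arrow::io::ReadableFile> infile;
  PARQUET_ASSIGN_OR_THROW(
      infile, ::arrow::io::ReadableFile::Open("example.parquet"));

  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::Open(infile);
  std::shared_ptr<parquet::FileMetaData> md = reader->metadata();

  // Each RowGroupMetaData references memory owned by its parent FileMetaData,
  // so `md` must outlive `rg`.
  for (int i = 0; i < md->num_row_groups(); ++i) {
    std::unique_ptr<parquet::RowGroupMetaData> rg = md->RowGroup(i);
    // Inspect rg->num_rows(), rg->total_byte_size(), rg->ColumnChunk(j), ...
  }
  return 0;
}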
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
index 07a936e0412..69b38478172 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
@@ -1,222 +1,222 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-// Note - The x86 and x64 versions do _not_ produce the same results, as the
-// algorithms are optimized for their respective platforms. You can still
-// compile and run any of them on any platform, but your performance with the
-// non-native version will be less than optimal.
-
-#include "parquet/murmur3.h"
-
-namespace parquet {
-
-#if defined(_MSC_VER)
-
-#define FORCE_INLINE __forceinline
-#define ROTL64(x, y) _rotl64(x, y)
-
-#else // defined(_MSC_VER)
-
-#define FORCE_INLINE inline __attribute__((always_inline))
-inline uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); }
-#define ROTL64(x, y) rotl64(x, y)
-
-#endif // !defined(_MSC_VER)
-
-#define BIG_CONSTANT(x) (x##LLU)
-
-//-----------------------------------------------------------------------------
-// Block read - if your platform needs to do endian-swapping or can only
-// handle aligned reads, do the conversion here
-
-FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) { return p[i]; }
-
-FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) { return p[i]; }
-
-//-----------------------------------------------------------------------------
-// Finalization mix - force all bits of a hash block to avalanche
-
-FORCE_INLINE uint32_t fmix32(uint32_t h) {
- h ^= h >> 16;
- h *= 0x85ebca6b;
- h ^= h >> 13;
- h *= 0xc2b2ae35;
- h ^= h >> 16;
-
- return h;
-}
-
-//----------
-
-FORCE_INLINE uint64_t fmix64(uint64_t k) {
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xff51afd7ed558ccd);
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
- k ^= k >> 33;
-
- return k;
-}
-
-//-----------------------------------------------------------------------------
-
-void Hash_x64_128(const void* key, const int len, const uint32_t seed, uint64_t out[2]) {
- const uint8_t* data = (const uint8_t*)key;
- const int nblocks = len / 16;
-
- uint64_t h1 = seed;
- uint64_t h2 = seed;
-
- const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
- const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
-
- //----------
- // body
-
- const uint64_t* blocks = (const uint64_t*)(data);
-
- for (int i = 0; i < nblocks; i++) {
- uint64_t k1 = getblock64(blocks, i * 2 + 0);
- uint64_t k2 = getblock64(blocks, i * 2 + 1);
-
- k1 *= c1;
- k1 = ROTL64(k1, 31);
- k1 *= c2;
- h1 ^= k1;
-
- h1 = ROTL64(h1, 27);
- h1 += h2;
- h1 = h1 * 5 + 0x52dce729;
-
- k2 *= c2;
- k2 = ROTL64(k2, 33);
- k2 *= c1;
- h2 ^= k2;
-
- h2 = ROTL64(h2, 31);
- h2 += h1;
- h2 = h2 * 5 + 0x38495ab5;
- }
-
- //----------
- // tail
-
- const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
-
- uint64_t k1 = 0;
- uint64_t k2 = 0;
-
- switch (len & 15) {
- case 15:
- k2 ^= ((uint64_t)tail[14]) << 48; // fall through
- case 14:
- k2 ^= ((uint64_t)tail[13]) << 40; // fall through
- case 13:
- k2 ^= ((uint64_t)tail[12]) << 32; // fall through
- case 12:
- k2 ^= ((uint64_t)tail[11]) << 24; // fall through
- case 11:
- k2 ^= ((uint64_t)tail[10]) << 16; // fall through
- case 10:
- k2 ^= ((uint64_t)tail[9]) << 8; // fall through
- case 9:
- k2 ^= ((uint64_t)tail[8]) << 0;
- k2 *= c2;
- k2 = ROTL64(k2, 33);
- k2 *= c1;
- h2 ^= k2; // fall through
-
- case 8:
- k1 ^= ((uint64_t)tail[7]) << 56; // fall through
- case 7:
- k1 ^= ((uint64_t)tail[6]) << 48; // fall through
- case 6:
- k1 ^= ((uint64_t)tail[5]) << 40; // fall through
- case 5:
- k1 ^= ((uint64_t)tail[4]) << 32; // fall through
- case 4:
- k1 ^= ((uint64_t)tail[3]) << 24; // fall through
- case 3:
- k1 ^= ((uint64_t)tail[2]) << 16; // fall through
- case 2:
- k1 ^= ((uint64_t)tail[1]) << 8; // fall through
- case 1:
- k1 ^= ((uint64_t)tail[0]) << 0;
- k1 *= c1;
- k1 = ROTL64(k1, 31);
- k1 *= c2;
- h1 ^= k1;
- }
-
- //----------
- // finalization
-
- h1 ^= len;
- h2 ^= len;
-
- h1 += h2;
- h2 += h1;
-
- h1 = fmix64(h1);
- h2 = fmix64(h2);
-
- h1 += h2;
- h2 += h1;
-
- reinterpret_cast<uint64_t*>(out)[0] = h1;
- reinterpret_cast<uint64_t*>(out)[1] = h2;
-}
-
-template <typename T>
-uint64_t HashHelper(T value, uint32_t seed) {
- uint64_t output[2];
- Hash_x64_128(reinterpret_cast<void*>(&value), sizeof(T), seed, output);
- return output[0];
-}
-
-uint64_t MurmurHash3::Hash(int32_t value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(int64_t value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(float value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(double value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(const FLBA* value, uint32_t len) const {
- uint64_t out[2];
- Hash_x64_128(reinterpret_cast<const void*>(value->ptr), len, seed_, out);
- return out[0];
-}
-
-uint64_t MurmurHash3::Hash(const Int96* value) const {
- uint64_t out[2];
- Hash_x64_128(reinterpret_cast<const void*>(value->value), sizeof(value->value), seed_,
- out);
- return out[0];
-}
-
-uint64_t MurmurHash3::Hash(const ByteArray* value) const {
- uint64_t out[2];
- Hash_x64_128(reinterpret_cast<const void*>(value->ptr), value->len, seed_, out);
- return out[0];
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "parquet/murmur3.h"
+
+namespace parquet {
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE __forceinline
+#define ROTL64(x, y) _rotl64(x, y)
+
+#else // defined(_MSC_VER)
+
+#define FORCE_INLINE inline __attribute__((always_inline))
+inline uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); }
+#define ROTL64(x, y) rotl64(x, y)
+
+#endif // !defined(_MSC_VER)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) { return p[i]; }
+
+FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) { return p[i]; }
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32(uint32_t h) {
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix64(uint64_t k) {
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+ k ^= k >> 33;
+
+ return k;
+}
+
+//-----------------------------------------------------------------------------
+
+void Hash_x64_128(const void* key, const int len, const uint32_t seed, uint64_t out[2]) {
+ const uint8_t* data = (const uint8_t*)key;
+ const int nblocks = len / 16;
+
+ uint64_t h1 = seed;
+ uint64_t h2 = seed;
+
+ const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+ const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+ //----------
+ // body
+
+ const uint64_t* blocks = (const uint64_t*)(data);
+
+ for (int i = 0; i < nblocks; i++) {
+ uint64_t k1 = getblock64(blocks, i * 2 + 0);
+ uint64_t k2 = getblock64(blocks, i * 2 + 1);
+
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = ROTL64(h1, 27);
+ h1 += h2;
+ h1 = h1 * 5 + 0x52dce729;
+
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ h2 = ROTL64(h2, 31);
+ h2 += h1;
+ h2 = h2 * 5 + 0x38495ab5;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
+
+ uint64_t k1 = 0;
+ uint64_t k2 = 0;
+
+ switch (len & 15) {
+ case 15:
+ k2 ^= ((uint64_t)tail[14]) << 48; // fall through
+ case 14:
+ k2 ^= ((uint64_t)tail[13]) << 40; // fall through
+ case 13:
+ k2 ^= ((uint64_t)tail[12]) << 32; // fall through
+ case 12:
+ k2 ^= ((uint64_t)tail[11]) << 24; // fall through
+ case 11:
+ k2 ^= ((uint64_t)tail[10]) << 16; // fall through
+ case 10:
+ k2 ^= ((uint64_t)tail[9]) << 8; // fall through
+ case 9:
+ k2 ^= ((uint64_t)tail[8]) << 0;
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2; // fall through
+
+ case 8:
+ k1 ^= ((uint64_t)tail[7]) << 56; // fall through
+ case 7:
+ k1 ^= ((uint64_t)tail[6]) << 48; // fall through
+ case 6:
+ k1 ^= ((uint64_t)tail[5]) << 40; // fall through
+ case 5:
+ k1 ^= ((uint64_t)tail[4]) << 32; // fall through
+ case 4:
+ k1 ^= ((uint64_t)tail[3]) << 24; // fall through
+ case 3:
+ k1 ^= ((uint64_t)tail[2]) << 16; // fall through
+ case 2:
+ k1 ^= ((uint64_t)tail[1]) << 8; // fall through
+ case 1:
+ k1 ^= ((uint64_t)tail[0]) << 0;
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ reinterpret_cast<uint64_t*>(out)[0] = h1;
+ reinterpret_cast<uint64_t*>(out)[1] = h2;
+}
+
+template <typename T>
+uint64_t HashHelper(T value, uint32_t seed) {
+ uint64_t output[2];
+ Hash_x64_128(reinterpret_cast<void*>(&value), sizeof(T), seed, output);
+ return output[0];
+}
+
+uint64_t MurmurHash3::Hash(int32_t value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(int64_t value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(float value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(double value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(const FLBA* value, uint32_t len) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->ptr), len, seed_, out);
+ return out[0];
+}
+
+uint64_t MurmurHash3::Hash(const Int96* value) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->value), sizeof(value->value), seed_,
+ out);
+ return out[0];
+}
+
+uint64_t MurmurHash3::Hash(const ByteArray* value) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->ptr), value->len, seed_, out);
+ return out[0];
+}
+
+} // namespace parquet
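A short usage sketch for the hasher defined above; the values and output formatting are illustrative only:

#include <cstdint>
#include <iostream>
#include <string>

#include "parquet/murmur3.h"
#include "parquet/types.h"

int main() {
  parquet::MurmurHash3 hasher;

  // Fixed-width values are hashed by value.
  uint64_t h1 = hasher.Hash(static_cast<int64_t>(42));

  // Variable-length data goes through the ByteArray (length + pointer) view.
  std::string s = "parquet";
  parquet::ByteArray ba(static_cast<uint32_t>(s.size()),
                        reinterpret_cast<const uint8_t*>(s.data()));
  uint64_t h2 = hasher.Hash(&ba);

  std::cout << std::hex << h1 << " " << h2 << std::endl;
  return 0;
}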
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
index acf7088e44b..2dcb8b5bffa 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
@@ -1,54 +1,54 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-#pragma once
-
-#include <cstdint>
-
-#include "parquet/hasher.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-/// Source:
-/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
-/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class)
-class PARQUET_EXPORT MurmurHash3 : public Hasher {
- public:
- MurmurHash3() : seed_(DEFAULT_SEED) {}
- uint64_t Hash(int32_t value) const override;
- uint64_t Hash(int64_t value) const override;
- uint64_t Hash(float value) const override;
- uint64_t Hash(double value) const override;
- uint64_t Hash(const Int96* value) const override;
- uint64_t Hash(const ByteArray* value) const override;
- uint64_t Hash(const FLBA* val, uint32_t len) const override;
-
- private:
- // Default seed for the hash, taken from the Bloom filter in parquet-mr;
- // it was generated by Java's System.nanoTime().
- static constexpr int DEFAULT_SEED = 1361930890;
-
- uint32_t seed_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#pragma once
+
+#include <cstdint>
+
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// Source:
+/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
+/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class)
+class PARQUET_EXPORT MurmurHash3 : public Hasher {
+ public:
+ MurmurHash3() : seed_(DEFAULT_SEED) {}
+ uint64_t Hash(int32_t value) const override;
+ uint64_t Hash(int64_t value) const override;
+ uint64_t Hash(float value) const override;
+ uint64_t Hash(double value) const override;
+ uint64_t Hash(const Int96* value) const override;
+ uint64_t Hash(const ByteArray* value) const override;
+ uint64_t Hash(const FLBA* val, uint32_t len) const override;
+
+ private:
+ // Default seed for the hash, taken from the Bloom filter in parquet-mr;
+ // it was generated by Java's System.nanoTime().
+ static constexpr int DEFAULT_SEED = 1361930890;
+
+ uint32_t seed_;
+};
+
+} // namespace parquet
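Because MurmurHash3 inherits the Hasher abstract class, consumers such as the Bloom filter code can stay algorithm-agnostic. A minimal sketch, where HashKey is a hypothetical helper used only for illustration:

#include <memory>

#include "parquet/hasher.h"
#include "parquet/murmur3.h"

// All Hash() overloads are const, so callers can take a const reference and
// remain independent of the concrete hash implementation.
uint64_t HashKey(const parquet::Hasher& hasher, int32_t key) {
  return hasher.Hash(key);
}

int main() {
  std::unique_ptr<parquet::Hasher> hasher(new parquet::MurmurHash3());
  return HashKey(*hasher, 7) == 0 ? 1 : 0;
}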
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc b/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
index 5c355c28be1..70ed6f73df3 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
@@ -1,41 +1,41 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/platform.h"
-
-#include <cstdint>
-#include <memory>
-#include <utility>
-
-#include "arrow/io/memory.h"
-
-#include "parquet/exception.h"
-
-namespace parquet {
-
-std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(MemoryPool* pool) {
- PARQUET_ASSIGN_OR_THROW(auto stream, ::arrow::io::BufferOutputStream::Create(
- kDefaultOutputStreamSize, pool));
- return stream;
-}
-
-std::shared_ptr<ResizableBuffer> AllocateBuffer(MemoryPool* pool, int64_t size) {
- PARQUET_ASSIGN_OR_THROW(auto result, ::arrow::AllocateResizableBuffer(size, pool));
- return std::move(result);
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/platform.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/io/memory.h"
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(MemoryPool* pool) {
+ PARQUET_ASSIGN_OR_THROW(auto stream, ::arrow::io::BufferOutputStream::Create(
+ kDefaultOutputStreamSize, pool));
+ return stream;
+}
+
+std::shared_ptr<ResizableBuffer> AllocateBuffer(MemoryPool* pool, int64_t size) {
+ PARQUET_ASSIGN_OR_THROW(auto result, ::arrow::AllocateResizableBuffer(size, pool));
+ return std::move(result);
+}
+
+} // namespace parquet
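A small sketch exercising the two helpers defined above; the written bytes and buffer sizes are arbitrary:

#include <memory>

#include "arrow/buffer.h"
#include "arrow/io/memory.h"
#include "parquet/exception.h"
#include "parquet/platform.h"

int main() {
  // In-memory sink, created at kDefaultOutputStreamSize and growing on demand.
  std::shared_ptr<::arrow::io::BufferOutputStream> sink =
      parquet::CreateOutputStream();
  PARQUET_THROW_NOT_OK(sink->Write("abc", 3));

  std::shared_ptr<::arrow::Buffer> written;
  PARQUET_ASSIGN_OR_THROW(written, sink->Finish());

  // Resizable scratch buffer drawn from the default memory pool.
  std::shared_ptr<parquet::ResizableBuffer> scratch =
      parquet::AllocateBuffer(::arrow::default_memory_pool(), 64);
  PARQUET_THROW_NOT_OK(scratch->Resize(128));

  return written->size() == 3 ? 0 : 1;
}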
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/platform.h b/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
index 00a193f144a..cd41aa7f5c6 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
@@ -1,110 +1,110 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "arrow/buffer.h" // IWYU pragma: export
-#include "arrow/io/interfaces.h" // IWYU pragma: export
-#include "arrow/status.h" // IWYU pragma: export
-#include "arrow/type_fwd.h" // IWYU pragma: export
-#include "arrow/util/macros.h" // IWYU pragma: export
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-
-#if defined(_MSC_VER)
-#pragma warning(push)
-// Disable warning for STL types usage in DLL interface
-// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
-#pragma warning(disable : 4275 4251)
-// Disable diamond inheritance warnings
-#pragma warning(disable : 4250)
-// Disable macro redefinition warnings
-#pragma warning(disable : 4005)
-// Disable extern before exported template warnings
-#pragma warning(disable : 4910)
-#else
-#pragma GCC diagnostic ignored "-Wattributes"
-#endif
-
-#ifdef PARQUET_STATIC
-#define PARQUET_EXPORT
-#elif defined(PARQUET_EXPORTING)
-#define PARQUET_EXPORT __declspec(dllexport)
-#else
-#define PARQUET_EXPORT __declspec(dllimport)
-#endif
-
-#define PARQUET_NO_EXPORT
-
-#else // Not Windows
-#ifndef PARQUET_EXPORT
-#define PARQUET_EXPORT __attribute__((visibility("default")))
-#endif
-#ifndef PARQUET_NO_EXPORT
-#define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
-#endif
-#endif // Non-Windows
-
-// This is a complicated topic, some reading on it:
-// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
-#if defined(_MSC_VER) || defined(__clang__)
-#define PARQUET_TEMPLATE_CLASS_EXPORT
-#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
-#else
-#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
-#define PARQUET_TEMPLATE_EXPORT
-#endif
-
-#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN
-
-#define PARQUET_NORETURN ARROW_NORETURN
-#define PARQUET_DEPRECATED ARROW_DEPRECATED
-
-// If ARROW_VALGRIND is set when compiling unit tests, also define
-// PARQUET_VALGRIND
-#ifdef ARROW_VALGRIND
-#define PARQUET_VALGRIND
-#endif
-
-namespace parquet {
-
-using Buffer = ::arrow::Buffer;
-using Codec = ::arrow::util::Codec;
-using Compression = ::arrow::Compression;
-using MemoryPool = ::arrow::MemoryPool;
-using MutableBuffer = ::arrow::MutableBuffer;
-using ResizableBuffer = ::arrow::ResizableBuffer;
-using ArrowInputFile = ::arrow::io::RandomAccessFile;
-using ArrowInputStream = ::arrow::io::InputStream;
-using ArrowOutputStream = ::arrow::io::OutputStream;
-
-constexpr int64_t kDefaultOutputStreamSize = 1024;
-
-constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);
-
-PARQUET_EXPORT
-std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
-PARQUET_EXPORT
-std::shared_ptr<ResizableBuffer> AllocateBuffer(
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h" // IWYU pragma: export
+#include "arrow/io/interfaces.h" // IWYU pragma: export
+#include "arrow/status.h" // IWYU pragma: export
+#include "arrow/type_fwd.h" // IWYU pragma: export
+#include "arrow/util/macros.h" // IWYU pragma: export
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// Disable warning for STL types usage in DLL interface
+// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
+#pragma warning(disable : 4275 4251)
+// Disable diamond inheritance warnings
+#pragma warning(disable : 4250)
+// Disable macro redefinition warnings
+#pragma warning(disable : 4005)
+// Disable extern before exported template warnings
+#pragma warning(disable : 4910)
+#else
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+#ifdef PARQUET_STATIC
+#define PARQUET_EXPORT
+#elif defined(PARQUET_EXPORTING)
+#define PARQUET_EXPORT __declspec(dllexport)
+#else
+#define PARQUET_EXPORT __declspec(dllimport)
+#endif
+
+#define PARQUET_NO_EXPORT
+
+#else // Not Windows
+#ifndef PARQUET_EXPORT
+#define PARQUET_EXPORT __attribute__((visibility("default")))
+#endif
+#ifndef PARQUET_NO_EXPORT
+#define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+#endif // Non-Windows
+
+// This is a complicated topic; some reading on it:
+// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
+#if defined(_MSC_VER) || defined(__clang__)
+#define PARQUET_TEMPLATE_CLASS_EXPORT
+#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
+#else
+#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
+#define PARQUET_TEMPLATE_EXPORT
+#endif
+
+#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN
+
+#define PARQUET_NORETURN ARROW_NORETURN
+#define PARQUET_DEPRECATED ARROW_DEPRECATED
+
+// If ARROW_VALGRIND is set when compiling unit tests, also define
+// PARQUET_VALGRIND
+#ifdef ARROW_VALGRIND
+#define PARQUET_VALGRIND
+#endif
+
+namespace parquet {
+
+using Buffer = ::arrow::Buffer;
+using Codec = ::arrow::util::Codec;
+using Compression = ::arrow::Compression;
+using MemoryPool = ::arrow::MemoryPool;
+using MutableBuffer = ::arrow::MutableBuffer;
+using ResizableBuffer = ::arrow::ResizableBuffer;
+using ArrowInputFile = ::arrow::io::RandomAccessFile;
+using ArrowInputStream = ::arrow::io::InputStream;
+using ArrowOutputStream = ::arrow::io::OutputStream;
+
+constexpr int64_t kDefaultOutputStreamSize = 1024;
+
+constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);
+
+PARQUET_EXPORT
+std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+PARQUET_EXPORT
+std::shared_ptr<ResizableBuffer> AllocateBuffer(
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
+
+} // namespace parquet
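A minimal usage sketch of the two helpers declared above, assuming only the default Arrow memory pool and the PARQUET_THROW_NOT_OK / PARQUET_ASSIGN_OR_THROW macros from parquet/exception.h; the sizes are illustrative, not prescribed:

    #include <iostream>
    #include "parquet/exception.h"
    #include "parquet/platform.h"

    int main() {
      // Both helpers default to ::arrow::default_memory_pool().
      auto sink = parquet::CreateOutputStream();
      PARQUET_THROW_NOT_OK(sink->Write("abc", 3));
      PARQUET_ASSIGN_OR_THROW(auto contents, sink->Finish());

      // A resizable 64-byte scratch buffer from the same pool.
      auto scratch = parquet::AllocateBuffer(::arrow::default_memory_pool(), 64);
      std::cout << contents->size() << " " << scratch->size() << std::endl;
      return 0;
    }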
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
index dfd4bd802ee..df2b4c50b5d 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
@@ -1,297 +1,297 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/printer.h"
-
-#include <cstdint>
-#include <cstdio>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <vector>
-
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/string.h"
-
-#include "parquet/column_scanner.h"
-#include "parquet/exception.h"
-#include "parquet/file_reader.h"
-#include "parquet/metadata.h"
-#include "parquet/schema.h"
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-class ColumnReader;
-
-// ----------------------------------------------------------------------
-// ParquetFilePrinter::DebugPrint
-
-// The fixed column width below is just an example default.
-#define COL_WIDTH 30
-
-void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values, bool format_dump,
- bool print_key_value_metadata, const char* filename) {
- const FileMetaData* file_metadata = fileReader->metadata().get();
-
- stream << "File Name: " << filename << "\n";
- stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
- stream << "Created By: " << file_metadata->created_by() << "\n";
- stream << "Total rows: " << file_metadata->num_rows() << "\n";
-
- if (print_key_value_metadata && file_metadata->key_value_metadata()) {
- auto key_value_metadata = file_metadata->key_value_metadata();
- int64_t size_of_key_value_metadata = key_value_metadata->size();
- stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
- for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
- stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
- << key_value_metadata->value(i) << "\n";
- }
- }
-
- stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
- stream << "Number of Real Columns: "
- << file_metadata->schema()->group_node()->field_count() << "\n";
-
- if (selected_columns.size() == 0) {
- for (int i = 0; i < file_metadata->num_columns(); i++) {
- selected_columns.push_back(i);
- }
- } else {
- for (auto i : selected_columns) {
- if (i < 0 || i >= file_metadata->num_columns()) {
- throw ParquetException("Selected column is out of range");
- }
- }
- }
-
- stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
- stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
- for (auto i : selected_columns) {
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
- << TypeToString(descr->physical_type());
- const auto& logical_type = descr->logical_type();
- if (!logical_type->is_none()) {
- stream << " / " << logical_type->ToString();
- }
- if (descr->converted_type() != ConvertedType::NONE) {
- stream << " / " << ConvertedTypeToString(descr->converted_type());
- if (descr->converted_type() == ConvertedType::DECIMAL) {
- stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
- }
- }
- stream << ")" << std::endl;
- }
-
- for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
- stream << "--- Row Group: " << r << " ---\n";
-
- auto group_reader = fileReader->RowGroup(r);
- std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
-
- stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
- stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size()
- << " ---\n";
- stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
-
- // Print column metadata
- for (auto i : selected_columns) {
- auto column_chunk = group_metadata->ColumnChunk(i);
- std::shared_ptr<Statistics> stats = column_chunk->statistics();
-
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values();
- if (column_chunk->is_stats_set()) {
- std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << ", Null Values: " << stats->null_count()
- << ", Distinct Values: " << stats->distinct_count() << std::endl
- << " Max: " << FormatStatValue(descr->physical_type(), max)
- << ", Min: " << FormatStatValue(descr->physical_type(), min);
- } else {
- stream << " Statistics Not Set";
- }
- stream << std::endl
- << " Compression: "
- << ::arrow::internal::AsciiToUpper(
- Codec::GetCodecAsString(column_chunk->compression()))
- << ", Encodings:";
- for (auto encoding : column_chunk->encodings()) {
- stream << " " << EncodingToString(encoding);
- }
- stream << std::endl
- << " Uncompressed Size: " << column_chunk->total_uncompressed_size()
- << ", Compressed Size: " << column_chunk->total_compressed_size()
- << std::endl;
- }
-
- if (!print_values) {
- continue;
- }
- stream << "--- Values ---\n";
-
- static constexpr int bufsize = COL_WIDTH + 1;
- char buffer[bufsize];
-
- // Create readers for selected columns and print contents
- std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
- int j = 0;
- for (auto i : selected_columns) {
- std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
- // This is OK in this method as long as the RowGroupReader does not get
- // deleted
- auto& scanner = scanners[j++] = Scanner::Make(col_reader);
-
- if (format_dump) {
- stream << "Column " << i << std::endl;
- while (scanner->HasNext()) {
- scanner->PrintNext(stream, 0, true);
- stream << "\n";
- }
- continue;
- }
-
- snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
- file_metadata->schema()->Column(i)->name().c_str());
- stream << buffer << '|';
- }
- if (format_dump) {
- continue;
- }
- stream << "\n";
-
- bool hasRow;
- do {
- hasRow = false;
- for (auto scanner : scanners) {
- if (scanner->HasNext()) {
- hasRow = true;
- scanner->PrintNext(stream, COL_WIDTH);
- stream << '|';
- }
- }
- stream << "\n";
- } while (hasRow);
- }
-}
-
-void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
- const char* filename) {
- const FileMetaData* file_metadata = fileReader->metadata().get();
- stream << "{\n";
- stream << " \"FileName\": \"" << filename << "\",\n";
- stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version())
- << "\",\n";
- stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
- stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
- stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
- stream << " \"NumberOfRealColumns\": \""
- << file_metadata->schema()->group_node()->field_count() << "\",\n";
- stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
-
- if (selected_columns.size() == 0) {
- for (int i = 0; i < file_metadata->num_columns(); i++) {
- selected_columns.push_back(i);
- }
- } else {
- for (auto i : selected_columns) {
- if (i < 0 || i >= file_metadata->num_columns()) {
- throw ParquetException("Selected column is out of range");
- }
- }
- }
-
- stream << " \"Columns\": [\n";
- int c = 0;
- for (auto i : selected_columns) {
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << " { \"Id\": \"" << i << "\","
- << " \"Name\": \"" << descr->path()->ToDotString() << "\","
- << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
- << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
- << "\","
- << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
- c++;
- if (c != static_cast<int>(selected_columns.size())) {
- stream << ",\n";
- }
- }
-
- stream << "\n ],\n \"RowGroups\": [\n";
- for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
- stream << " {\n \"Id\": \"" << r << "\", ";
-
- auto group_reader = fileReader->RowGroup(r);
- std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
-
- stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
- stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
- << "\", ";
- stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
-
- // Print column metadata
- stream << " \"ColumnChunks\": [\n";
- int c1 = 0;
- for (auto i : selected_columns) {
- auto column_chunk = group_metadata->ColumnChunk(i);
- std::shared_ptr<Statistics> stats = column_chunk->statistics();
-
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << " {\"Id\": \"" << i << "\", \"Values\": \""
- << column_chunk->num_values() << "\", "
- << "\"StatsSet\": ";
- if (column_chunk->is_stats_set()) {
- stream << "\"True\", \"Stats\": {";
- std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
- << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
- << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
- << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
- << "\" },";
- } else {
- stream << "\"False\",";
- }
- stream << "\n \"Compression\": \""
- << ::arrow::internal::AsciiToUpper(
- Codec::GetCodecAsString(column_chunk->compression()))
- << "\", \"Encodings\": \"";
- for (auto encoding : column_chunk->encodings()) {
- stream << EncodingToString(encoding) << " ";
- }
- stream << "\", "
- << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
- << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
-
- // end of a ColumnChunk
- stream << "\" }";
- c1++;
- if (c1 != static_cast<int>(selected_columns.size())) {
- stream << ",\n";
- }
- }
-
- stream << "\n ]\n }";
- if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
- stream << ",\n";
- }
- }
- stream << "\n ]\n}\n";
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/printer.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/string.h"
+
+#include "parquet/column_scanner.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class ColumnReader;
+
+// ----------------------------------------------------------------------
+// ParquetFilePrinter::DebugPrint
+
+// The fixed column width below is just an example default.
+#define COL_WIDTH 30
+
+void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+ bool print_values, bool format_dump,
+ bool print_key_value_metadata, const char* filename) {
+ const FileMetaData* file_metadata = fileReader->metadata().get();
+
+ stream << "File Name: " << filename << "\n";
+ stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
+ stream << "Created By: " << file_metadata->created_by() << "\n";
+ stream << "Total rows: " << file_metadata->num_rows() << "\n";
+
+ if (print_key_value_metadata && file_metadata->key_value_metadata()) {
+ auto key_value_metadata = file_metadata->key_value_metadata();
+ int64_t size_of_key_value_metadata = key_value_metadata->size();
+ stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
+ for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
+ stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
+ << key_value_metadata->value(i) << "\n";
+ }
+ }
+
+ stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
+ stream << "Number of Real Columns: "
+ << file_metadata->schema()->group_node()->field_count() << "\n";
+
+ if (selected_columns.size() == 0) {
+ for (int i = 0; i < file_metadata->num_columns(); i++) {
+ selected_columns.push_back(i);
+ }
+ } else {
+ for (auto i : selected_columns) {
+ if (i < 0 || i >= file_metadata->num_columns()) {
+ throw ParquetException("Selected column is out of range");
+ }
+ }
+ }
+
+ stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
+ stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
+ for (auto i : selected_columns) {
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
+ << TypeToString(descr->physical_type());
+ const auto& logical_type = descr->logical_type();
+ if (!logical_type->is_none()) {
+ stream << " / " << logical_type->ToString();
+ }
+ if (descr->converted_type() != ConvertedType::NONE) {
+ stream << " / " << ConvertedTypeToString(descr->converted_type());
+ if (descr->converted_type() == ConvertedType::DECIMAL) {
+ stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+ }
+ }
+ stream << ")" << std::endl;
+ }
+
+ for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+ stream << "--- Row Group: " << r << " ---\n";
+
+ auto group_reader = fileReader->RowGroup(r);
+ std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+ stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
+ stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size()
+ << " ---\n";
+ stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
+
+ // Print column metadata
+ for (auto i : selected_columns) {
+ auto column_chunk = group_metadata->ColumnChunk(i);
+ std::shared_ptr<Statistics> stats = column_chunk->statistics();
+
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values();
+ if (column_chunk->is_stats_set()) {
+ std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+ stream << ", Null Values: " << stats->null_count()
+ << ", Distinct Values: " << stats->distinct_count() << std::endl
+ << " Max: " << FormatStatValue(descr->physical_type(), max)
+ << ", Min: " << FormatStatValue(descr->physical_type(), min);
+ } else {
+ stream << " Statistics Not Set";
+ }
+ stream << std::endl
+ << " Compression: "
+ << ::arrow::internal::AsciiToUpper(
+ Codec::GetCodecAsString(column_chunk->compression()))
+ << ", Encodings:";
+ for (auto encoding : column_chunk->encodings()) {
+ stream << " " << EncodingToString(encoding);
+ }
+ stream << std::endl
+ << " Uncompressed Size: " << column_chunk->total_uncompressed_size()
+ << ", Compressed Size: " << column_chunk->total_compressed_size()
+ << std::endl;
+ }
+
+ if (!print_values) {
+ continue;
+ }
+ stream << "--- Values ---\n";
+
+ static constexpr int bufsize = COL_WIDTH + 1;
+ char buffer[bufsize];
+
+ // Create readers for selected columns and print contents
+ std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
+ int j = 0;
+ for (auto i : selected_columns) {
+ std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+ // This is OK in this method as long as the RowGroupReader does not get
+ // deleted
+ auto& scanner = scanners[j++] = Scanner::Make(col_reader);
+
+ if (format_dump) {
+ stream << "Column " << i << std::endl;
+ while (scanner->HasNext()) {
+ scanner->PrintNext(stream, 0, true);
+ stream << "\n";
+ }
+ continue;
+ }
+
+ snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
+ file_metadata->schema()->Column(i)->name().c_str());
+ stream << buffer << '|';
+ }
+ if (format_dump) {
+ continue;
+ }
+ stream << "\n";
+
+ bool hasRow;
+ do {
+ hasRow = false;
+ for (auto scanner : scanners) {
+ if (scanner->HasNext()) {
+ hasRow = true;
+ scanner->PrintNext(stream, COL_WIDTH);
+ stream << '|';
+ }
+ }
+ stream << "\n";
+ } while (hasRow);
+ }
+}
+
+void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
+ const char* filename) {
+ const FileMetaData* file_metadata = fileReader->metadata().get();
+ stream << "{\n";
+ stream << " \"FileName\": \"" << filename << "\",\n";
+ stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version())
+ << "\",\n";
+ stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
+ stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
+ stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
+ stream << " \"NumberOfRealColumns\": \""
+ << file_metadata->schema()->group_node()->field_count() << "\",\n";
+ stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
+
+ if (selected_columns.size() == 0) {
+ for (int i = 0; i < file_metadata->num_columns(); i++) {
+ selected_columns.push_back(i);
+ }
+ } else {
+ for (auto i : selected_columns) {
+ if (i < 0 || i >= file_metadata->num_columns()) {
+ throw ParquetException("Selected column is out of range");
+ }
+ }
+ }
+
+ stream << " \"Columns\": [\n";
+ int c = 0;
+ for (auto i : selected_columns) {
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << " { \"Id\": \"" << i << "\","
+ << " \"Name\": \"" << descr->path()->ToDotString() << "\","
+ << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
+ << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
+ << "\","
+ << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
+ c++;
+ if (c != static_cast<int>(selected_columns.size())) {
+ stream << ",\n";
+ }
+ }
+
+ stream << "\n ],\n \"RowGroups\": [\n";
+ for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+ stream << " {\n \"Id\": \"" << r << "\", ";
+
+ auto group_reader = fileReader->RowGroup(r);
+ std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+ stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
+ stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
+ << "\", ";
+ stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
+
+ // Print column metadata
+ stream << " \"ColumnChunks\": [\n";
+ int c1 = 0;
+ for (auto i : selected_columns) {
+ auto column_chunk = group_metadata->ColumnChunk(i);
+ std::shared_ptr<Statistics> stats = column_chunk->statistics();
+
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << " {\"Id\": \"" << i << "\", \"Values\": \""
+ << column_chunk->num_values() << "\", "
+ << "\"StatsSet\": ";
+ if (column_chunk->is_stats_set()) {
+ stream << "\"True\", \"Stats\": {";
+ std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+ stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
+ << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
+ << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
+ << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
+ << "\" },";
+ } else {
+ stream << "\"False\",";
+ }
+ stream << "\n \"Compression\": \""
+ << ::arrow::internal::AsciiToUpper(
+ Codec::GetCodecAsString(column_chunk->compression()))
+ << "\", \"Encodings\": \"";
+ for (auto encoding : column_chunk->encodings()) {
+ stream << EncodingToString(encoding) << " ";
+ }
+ stream << "\", "
+ << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
+ << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
+
+ // end of a ColumnChunk
+ stream << "\" }";
+ c1++;
+ if (c1 != static_cast<int>(selected_columns.size())) {
+ stream << ",\n";
+ }
+ }
+
+ stream << "\n ]\n }";
+ if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
+ stream << ",\n";
+ }
+ }
+ stream << "\n ]\n}\n";
+}
+
+} // namespace parquet
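A hedged sketch of driving DebugPrint through the public reader API; "example.parquet" is a placeholder path, and ParquetFileReader::OpenFile comes from parquet/file_reader.h:

    #include <iostream>
    #include <list>
    #include <memory>
    #include "parquet/file_reader.h"
    #include "parquet/printer.h"

    int main() {
      // OpenFile throws ParquetException if the placeholder path cannot be read.
      std::unique_ptr<parquet::ParquetFileReader> reader =
          parquet::ParquetFileReader::OpenFile("example.parquet");
      parquet::ParquetFilePrinter printer(reader.get());
      // An empty list selects every column; out-of-range ids would throw.
      printer.DebugPrint(std::cout, std::list<int>{}, /*print_values=*/true,
                         /*format_dump=*/false, /*print_key_value_metadata=*/true,
                         "example.parquet");
      return 0;
    }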
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/printer.h b/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
index 6bdf5b456fa..b29b1bd6d7a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
@@ -1,46 +1,46 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <iosfwd>
-#include <list>
-
-#include "parquet/platform.h"
-
-namespace parquet {
-
-class ParquetFileReader;
-
-class PARQUET_EXPORT ParquetFilePrinter {
- private:
- ParquetFileReader* fileReader;
-
- public:
- explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
- ~ParquetFilePrinter() {}
-
- void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values = false, bool format_dump = false,
- bool print_key_value_metadata = false,
- const char* filename = "No Name");
-
- void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
- const char* filename = "No Name");
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <list>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+
+class ParquetFileReader;
+
+class PARQUET_EXPORT ParquetFilePrinter {
+ private:
+ ParquetFileReader* fileReader;
+
+ public:
+ explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
+ ~ParquetFilePrinter() {}
+
+ void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+ bool print_values = false, bool format_dump = false,
+ bool print_key_value_metadata = false,
+ const char* filename = "No Name");
+
+ void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
+ const char* filename = "No Name");
+};
+
+} // namespace parquet
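Since the printer holds only a raw ParquetFileReader*, the caller owns the reader's lifetime. A sketch of capturing JSONPrint output into a string; MetadataAsJson is a made-up helper name:

    #include <list>
    #include <memory>
    #include <sstream>
    #include <string>
    #include "parquet/file_reader.h"
    #include "parquet/printer.h"

    std::string MetadataAsJson(const std::string& path) {
      auto reader = parquet::ParquetFileReader::OpenFile(path);
      parquet::ParquetFilePrinter printer(reader.get());
      std::ostringstream out;
      // An empty column list means all columns, as with DebugPrint.
      printer.JSONPrint(out, std::list<int>{}, path.c_str());
      return out.str();  // reader is destroyed here, after the printer is done
    }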
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc b/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
index 93638dbe28a..1a28fb81e40 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
@@ -1,64 +1,64 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <sstream>
-#include <utility>
-
-#include "parquet/properties.h"
-
-#include "arrow/io/buffered.h"
-#include "arrow/io/memory.h"
-#include "arrow/util/logging.h"
-
-namespace parquet {
-
-std::shared_ptr<ArrowInputStream> ReaderProperties::GetStream(
- std::shared_ptr<ArrowInputFile> source, int64_t start, int64_t num_bytes) {
- if (buffered_stream_enabled_) {
- // ARROW-6180 / PARQUET-1636 Create isolated reader that references segment
- // of source
- std::shared_ptr<::arrow::io::InputStream> safe_stream =
- ::arrow::io::RandomAccessFile::GetStream(source, start, num_bytes);
- PARQUET_ASSIGN_OR_THROW(
- auto stream, ::arrow::io::BufferedInputStream::Create(buffer_size_, pool_,
- safe_stream, num_bytes));
- return std::move(stream);
- } else {
- PARQUET_ASSIGN_OR_THROW(auto data, source->ReadAt(start, num_bytes));
-
- if (data->size() != num_bytes) {
- std::stringstream ss;
- ss << "Tried reading " << num_bytes << " bytes starting at position " << start
- << " from file but only got " << data->size();
- throw ParquetException(ss.str());
- }
- return std::make_shared<::arrow::io::BufferReader>(data);
- }
-}
-
-ArrowReaderProperties default_arrow_reader_properties() {
- static ArrowReaderProperties default_reader_props;
- return default_reader_props;
-}
-
-std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties() {
- static std::shared_ptr<ArrowWriterProperties> default_writer_properties =
- ArrowWriterProperties::Builder().build();
- return default_writer_properties;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+#include <utility>
+
+#include "parquet/properties.h"
+
+#include "arrow/io/buffered.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/logging.h"
+
+namespace parquet {
+
+std::shared_ptr<ArrowInputStream> ReaderProperties::GetStream(
+ std::shared_ptr<ArrowInputFile> source, int64_t start, int64_t num_bytes) {
+ if (buffered_stream_enabled_) {
+ // ARROW-6180 / PARQUET-1636 Create isolated reader that references segment
+ // of source
+ std::shared_ptr<::arrow::io::InputStream> safe_stream =
+ ::arrow::io::RandomAccessFile::GetStream(source, start, num_bytes);
+ PARQUET_ASSIGN_OR_THROW(
+ auto stream, ::arrow::io::BufferedInputStream::Create(buffer_size_, pool_,
+ safe_stream, num_bytes));
+ return std::move(stream);
+ } else {
+ PARQUET_ASSIGN_OR_THROW(auto data, source->ReadAt(start, num_bytes));
+
+ if (data->size() != num_bytes) {
+ std::stringstream ss;
+ ss << "Tried reading " << num_bytes << " bytes starting at position " << start
+ << " from file but only got " << data->size();
+ throw ParquetException(ss.str());
+ }
+ return std::make_shared<::arrow::io::BufferReader>(data);
+ }
+}
+
+ArrowReaderProperties default_arrow_reader_properties() {
+ static ArrowReaderProperties default_reader_props;
+ return default_reader_props;
+}
+
+std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties() {
+ static std::shared_ptr<ArrowWriterProperties> default_writer_properties =
+ ArrowWriterProperties::Builder().build();
+ return default_writer_properties;
+}
+
+} // namespace parquet
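A sketch exercising GetStream on both code paths above; the in-memory BufferReader stands in for any RandomAccessFile source, and the tiny sizes are illustrative:

    #include <memory>
    #include "arrow/buffer.h"
    #include "arrow/io/memory.h"
    #include "parquet/properties.h"

    int main() {
      auto source = std::make_shared<::arrow::io::BufferReader>(
          ::arrow::Buffer::FromString("0123456789"));
      parquet::ReaderProperties props;
      props.enable_buffered_stream();  // wrap ReadAt calls in a bounded buffer
      props.set_buffer_size(4);        // smaller than num_bytes, capping memory use
      auto stream = props.GetStream(source, /*start=*/2, /*num_bytes=*/6);
      // With buffering disabled instead, the same call would ReadAt all 6 bytes
      // at once and throw if fewer bytes were available.
      return 0;
    }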
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/properties.h b/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
index d217b8efa52..bc86f98ef7f 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
@@ -1,813 +1,813 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-
-#include "arrow/io/caching.h"
-#include "arrow/type.h"
-#include "arrow/util/compression.h"
-#include "parquet/encryption/encryption.h"
-#include "parquet/exception.h"
-#include "parquet/parquet_version.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/type_fwd.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-/// Determines use of Parquet Format version >= 2.0.0 logical types. For
-/// example, when writing from Arrow data structures, PARQUET_2_0 will enable
-/// use of INT_* and UINT_* converted types as well as nanosecond timestamps
-/// stored physically as INT64. Since some Parquet implementations do not
-/// support the logical types added in the 2.0.0 format version, if you want to
-/// maximize compatibility of your files you may want to use PARQUET_1_0.
-///
-/// Note that the 2.x format version series also introduced new serialized
-/// data page metadata and on disk data page layout. To enable this, use
-/// ParquetDataPageVersion.
-struct ParquetVersion;
-
-/// Controls serialization format of data pages. parquet-format v2.0.0
-/// introduced a new data page metadata type DataPageV2 and serialized page
-/// structure (for example, encoded levels are no longer compressed). Prior to
-/// the completion of PARQUET-457 in 2020, this library did not implement
-/// DataPageV2 correctly, so if you use the V2 data page format, you may have
-/// forward compatibility issues (older versions of the library will be unable
-/// to read the files). Note that some Parquet implementations do not implement
-/// DataPageV2 at all.
-enum class ParquetDataPageVersion { V1, V2 };
-
-/// Align the default buffer size to a small multiple of a page size.
-constexpr int64_t kDefaultBufferSize = 4096 * 4;
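Since the format-version and data-page-version knobs are independent, here is a hedged sketch of pinning both for maximum compatibility through the WriterProperties::Builder declared later in this header; the variable name is arbitrary:

    #include "parquet/properties.h"

    // Favor compatibility: 1.0 logical types and V1 data pages.
    auto compat_props = parquet::WriterProperties::Builder()
                            .version(parquet::ParquetVersion::PARQUET_1_0)
                            ->data_page_version(parquet::ParquetDataPageVersion::V1)
                            ->build();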
-
-class PARQUET_EXPORT ReaderProperties {
- public:
- explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
- : pool_(pool) {}
-
- MemoryPool* memory_pool() const { return pool_; }
-
- std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
- int64_t start, int64_t num_bytes);
-
- /// Buffered stream reading allows the user to control the memory usage of
-  /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls are
-  /// wrapped in a buffered reader that uses a fixed-size buffer (of size
-  /// `buffer_size()`) instead of the full size of the ReadAt.
-  ///
-  /// The primary reason for this control knob is resource control, not
-  /// performance.
- bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
- void enable_buffered_stream() { buffered_stream_enabled_ = true; }
- void disable_buffered_stream() { buffered_stream_enabled_ = false; }
-
- int64_t buffer_size() const { return buffer_size_; }
- void set_buffer_size(int64_t size) { buffer_size_ = size; }
-
- void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
- file_decryption_properties_ = std::move(decryption);
- }
-
- const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
- return file_decryption_properties_;
- }
-
- private:
- MemoryPool* pool_;
- int64_t buffer_size_ = kDefaultBufferSize;
- bool buffered_stream_enabled_ = false;
- std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
-};
-
-ReaderProperties PARQUET_EXPORT default_reader_properties();
-
-static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
-static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
-static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
-static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
-static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
-static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
-static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
-static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
-static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
-static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
-
-class PARQUET_EXPORT ColumnProperties {
- public:
- ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
- Compression::type codec = DEFAULT_COMPRESSION_TYPE,
- bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
- bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
- size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
- : encoding_(encoding),
- codec_(codec),
- dictionary_enabled_(dictionary_enabled),
- statistics_enabled_(statistics_enabled),
- max_stats_size_(max_stats_size),
- compression_level_(Codec::UseDefaultCompressionLevel()) {}
-
- void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
-
- void set_compression(Compression::type codec) { codec_ = codec; }
-
- void set_dictionary_enabled(bool dictionary_enabled) {
- dictionary_enabled_ = dictionary_enabled;
- }
-
- void set_statistics_enabled(bool statistics_enabled) {
- statistics_enabled_ = statistics_enabled;
- }
-
- void set_max_statistics_size(size_t max_stats_size) {
- max_stats_size_ = max_stats_size;
- }
-
- void set_compression_level(int compression_level) {
- compression_level_ = compression_level;
- }
-
- Encoding::type encoding() const { return encoding_; }
-
- Compression::type compression() const { return codec_; }
-
- bool dictionary_enabled() const { return dictionary_enabled_; }
-
- bool statistics_enabled() const { return statistics_enabled_; }
-
- size_t max_statistics_size() const { return max_stats_size_; }
-
- int compression_level() const { return compression_level_; }
-
- private:
- Encoding::type encoding_;
- Compression::type codec_;
- bool dictionary_enabled_;
- bool statistics_enabled_;
- size_t max_stats_size_;
- int compression_level_;
-};
-
-class PARQUET_EXPORT WriterProperties {
- public:
- class Builder {
- public:
- Builder()
- : pool_(::arrow::default_memory_pool()),
- dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
- write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
- max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
- pagesize_(kDefaultDataPageSize),
- version_(ParquetVersion::PARQUET_1_0),
- data_page_version_(ParquetDataPageVersion::V1),
- created_by_(DEFAULT_CREATED_BY) {}
- virtual ~Builder() {}
-
- Builder* memory_pool(MemoryPool* pool) {
- pool_ = pool;
- return this;
- }
-
- Builder* enable_dictionary() {
- default_column_properties_.set_dictionary_enabled(true);
- return this;
- }
-
- Builder* disable_dictionary() {
- default_column_properties_.set_dictionary_enabled(false);
- return this;
- }
-
- Builder* enable_dictionary(const std::string& path) {
- dictionary_enabled_[path] = true;
- return this;
- }
-
- Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->enable_dictionary(path->ToDotString());
- }
-
- Builder* disable_dictionary(const std::string& path) {
- dictionary_enabled_[path] = false;
- return this;
- }
-
- Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->disable_dictionary(path->ToDotString());
- }
-
- Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
- dictionary_pagesize_limit_ = dictionary_psize_limit;
- return this;
- }
-
- Builder* write_batch_size(int64_t write_batch_size) {
- write_batch_size_ = write_batch_size;
- return this;
- }
-
- Builder* max_row_group_length(int64_t max_row_group_length) {
- max_row_group_length_ = max_row_group_length;
- return this;
- }
-
- Builder* data_pagesize(int64_t pg_size) {
- pagesize_ = pg_size;
- return this;
- }
-
- Builder* data_page_version(ParquetDataPageVersion data_page_version) {
- data_page_version_ = data_page_version;
- return this;
- }
-
- Builder* version(ParquetVersion::type version) {
- version_ = version;
- return this;
- }
-
- Builder* created_by(const std::string& created_by) {
- created_by_ = created_by;
- return this;
- }
-
- /**
- * Define the encoding that is used when we don't utilise dictionary encoding.
- *
-     * This applies either when dictionary encoding is disabled or when we
-     * fall back because the dictionary grew too large.
- */
- Builder* encoding(Encoding::type encoding_type) {
- if (encoding_type == Encoding::PLAIN_DICTIONARY ||
- encoding_type == Encoding::RLE_DICTIONARY) {
- throw ParquetException("Can't use dictionary encoding as fallback encoding");
- }
-
- default_column_properties_.set_encoding(encoding_type);
- return this;
- }
-
- /**
- * Define the encoding that is used when we don't utilise dictionary encoding.
- *
-     * This applies either when dictionary encoding is disabled or when we
-     * fall back because the dictionary grew too large.
- */
- Builder* encoding(const std::string& path, Encoding::type encoding_type) {
- if (encoding_type == Encoding::PLAIN_DICTIONARY ||
- encoding_type == Encoding::RLE_DICTIONARY) {
- throw ParquetException("Can't use dictionary encoding as fallback encoding");
- }
-
- encodings_[path] = encoding_type;
- return this;
- }
-
- /**
- * Define the encoding that is used when we don't utilise dictionary encoding.
- *
-     * This applies either when dictionary encoding is disabled or when we
-     * fall back because the dictionary grew too large.
- */
- Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
- Encoding::type encoding_type) {
- return this->encoding(path->ToDotString(), encoding_type);
- }
-
- Builder* compression(Compression::type codec) {
- default_column_properties_.set_compression(codec);
- return this;
- }
-
- Builder* max_statistics_size(size_t max_stats_sz) {
- default_column_properties_.set_max_statistics_size(max_stats_sz);
- return this;
- }
-
- Builder* compression(const std::string& path, Compression::type codec) {
- codecs_[path] = codec;
- return this;
- }
-
- Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
- Compression::type codec) {
- return this->compression(path->ToDotString(), codec);
- }
-
-    /// \brief Specify the default compression level for the compressor in
-    /// every column. If a column does not have an explicitly specified
-    /// compression level, this default is used.
-    ///
-    /// The provided compression level is compressor-specific. Users must
-    /// familiarize themselves with the levels available for the selected
-    /// compressor. If the compressor does not support selecting different
-    /// compression levels, calling this function has no effect.
-    /// Parquet and Arrow do not validate the passed compression level. If no
-    /// level is selected by the user, or if the special
-    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
-    /// compression level.
- Builder* compression_level(int compression_level) {
- default_column_properties_.set_compression_level(compression_level);
- return this;
- }
-
- /// \brief Specify a compression level for the compressor for the column
- /// described by path.
- ///
-    /// The provided compression level is compressor-specific. Users must
-    /// familiarize themselves with the levels available for the selected
-    /// compressor. If the compressor does not support selecting different
-    /// compression levels, calling this function has no effect.
-    /// Parquet and Arrow do not validate the passed compression level. If no
-    /// level is selected by the user, or if the special
-    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
-    /// compression level.
- Builder* compression_level(const std::string& path, int compression_level) {
- codecs_compression_level_[path] = compression_level;
- return this;
- }
-
- /// \brief Specify a compression level for the compressor for the column
- /// described by path.
- ///
-    /// The provided compression level is compressor-specific. Users must
-    /// familiarize themselves with the levels available for the selected
-    /// compressor. If the compressor does not support selecting different
-    /// compression levels, calling this function has no effect.
-    /// Parquet and Arrow do not validate the passed compression level. If no
-    /// level is selected by the user, or if the special
-    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
-    /// compression level.
- Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
- int compression_level) {
- return this->compression_level(path->ToDotString(), compression_level);
- }
-
- Builder* encryption(
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
- file_encryption_properties_ = std::move(file_encryption_properties);
- return this;
- }
-
- Builder* enable_statistics() {
- default_column_properties_.set_statistics_enabled(true);
- return this;
- }
-
- Builder* disable_statistics() {
- default_column_properties_.set_statistics_enabled(false);
- return this;
- }
-
- Builder* enable_statistics(const std::string& path) {
- statistics_enabled_[path] = true;
- return this;
- }
-
- Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->enable_statistics(path->ToDotString());
- }
-
- Builder* disable_statistics(const std::string& path) {
- statistics_enabled_[path] = false;
- return this;
- }
-
- Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->disable_statistics(path->ToDotString());
- }
-
- std::shared_ptr<WriterProperties> build() {
- std::unordered_map<std::string, ColumnProperties> column_properties;
- auto get = [&](const std::string& key) -> ColumnProperties& {
- auto it = column_properties.find(key);
- if (it == column_properties.end())
- return column_properties[key] = default_column_properties_;
- else
- return it->second;
- };
-
- for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
- for (const auto& item : codecs_) get(item.first).set_compression(item.second);
- for (const auto& item : codecs_compression_level_)
- get(item.first).set_compression_level(item.second);
- for (const auto& item : dictionary_enabled_)
- get(item.first).set_dictionary_enabled(item.second);
- for (const auto& item : statistics_enabled_)
- get(item.first).set_statistics_enabled(item.second);
-
- return std::shared_ptr<WriterProperties>(new WriterProperties(
- pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
- pagesize_, version_, created_by_, std::move(file_encryption_properties_),
- default_column_properties_, column_properties, data_page_version_));
- }
-
- private:
- MemoryPool* pool_;
- int64_t dictionary_pagesize_limit_;
- int64_t write_batch_size_;
- int64_t max_row_group_length_;
- int64_t pagesize_;
- ParquetVersion::type version_;
- ParquetDataPageVersion data_page_version_;
- std::string created_by_;
-
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
-
- // Settings used for each column unless overridden in any of the maps below
- ColumnProperties default_column_properties_;
- std::unordered_map<std::string, Encoding::type> encodings_;
- std::unordered_map<std::string, Compression::type> codecs_;
- std::unordered_map<std::string, int32_t> codecs_compression_level_;
- std::unordered_map<std::string, bool> dictionary_enabled_;
- std::unordered_map<std::string, bool> statistics_enabled_;
- };
-
- inline MemoryPool* memory_pool() const { return pool_; }
-
- inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
-
- inline int64_t write_batch_size() const { return write_batch_size_; }
-
- inline int64_t max_row_group_length() const { return max_row_group_length_; }
-
- inline int64_t data_pagesize() const { return pagesize_; }
-
- inline ParquetDataPageVersion data_page_version() const {
- return parquet_data_page_version_;
- }
-
- inline ParquetVersion::type version() const { return parquet_version_; }
-
- inline std::string created_by() const { return parquet_created_by_; }
-
- inline Encoding::type dictionary_index_encoding() const {
- if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
- return Encoding::PLAIN_DICTIONARY;
- } else {
- return Encoding::RLE_DICTIONARY;
- }
- }
-
- inline Encoding::type dictionary_page_encoding() const {
- if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
- return Encoding::PLAIN_DICTIONARY;
- } else {
- return Encoding::PLAIN;
- }
- }
-
- const ColumnProperties& column_properties(
- const std::shared_ptr<schema::ColumnPath>& path) const {
- auto it = column_properties_.find(path->ToDotString());
- if (it != column_properties_.end()) return it->second;
- return default_column_properties_;
- }
-
- Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).encoding();
- }
-
- Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).compression();
- }
-
- int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).compression_level();
- }
-
- bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).dictionary_enabled();
- }
-
- bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).statistics_enabled();
- }
-
- size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).max_statistics_size();
- }
-
- inline FileEncryptionProperties* file_encryption_properties() const {
- return file_encryption_properties_.get();
- }
-
- std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
- const std::string& path) const {
- if (file_encryption_properties_) {
- return file_encryption_properties_->column_encryption_properties(path);
- } else {
- return NULLPTR;
- }
- }
-
- private:
- explicit WriterProperties(
- MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
- int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
- const std::string& created_by,
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
- const ColumnProperties& default_column_properties,
- const std::unordered_map<std::string, ColumnProperties>& column_properties,
- ParquetDataPageVersion data_page_version)
- : pool_(pool),
- dictionary_pagesize_limit_(dictionary_pagesize_limit),
- write_batch_size_(write_batch_size),
- max_row_group_length_(max_row_group_length),
- pagesize_(pagesize),
- parquet_data_page_version_(data_page_version),
- parquet_version_(version),
- parquet_created_by_(created_by),
- file_encryption_properties_(file_encryption_properties),
- default_column_properties_(default_column_properties),
- column_properties_(column_properties) {}
-
- MemoryPool* pool_;
- int64_t dictionary_pagesize_limit_;
- int64_t write_batch_size_;
- int64_t max_row_group_length_;
- int64_t pagesize_;
- ParquetDataPageVersion parquet_data_page_version_;
- ParquetVersion::type parquet_version_;
- std::string parquet_created_by_;
-
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
-
- ColumnProperties default_column_properties_;
- std::unordered_map<std::string, ColumnProperties> column_properties_;
-};
-
-PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
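A hedged sketch of the fluent Builder above; "page.views" is a made-up column path and the numeric values are illustrative only:

    #include "parquet/properties.h"

    auto writer_props =
        parquet::WriterProperties::Builder()
            .compression(parquet::Compression::SNAPPY)  // file-wide default codec
            ->compression("page.views", parquet::Compression::ZSTD)  // per-column
            ->compression_level("page.views", 6)  // codec-specific, not validated
            ->disable_dictionary("page.views")
            ->max_row_group_length(1 << 20)
            ->build();

The first call is made on the temporary Builder with `.`; every subsequent call chains through the returned Builder pointer.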
-
-// ----------------------------------------------------------------------
-// Properties specific to Apache Arrow columnar read and write
-
-static constexpr bool kArrowDefaultUseThreads = false;
-
-// Default number of rows to read when using ::arrow::RecordBatchReader
-static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
-
-/// EXPERIMENTAL: Properties for configuring FileReader behavior.
-class PARQUET_EXPORT ArrowReaderProperties {
- public:
- explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
- : use_threads_(use_threads),
- read_dict_indices_(),
- batch_size_(kArrowDefaultBatchSize),
- pre_buffer_(false),
- cache_options_(::arrow::io::CacheOptions::Defaults()),
- coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
-
- void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
-
- bool use_threads() const { return use_threads_; }
-
- void set_read_dictionary(int column_index, bool read_dict) {
- if (read_dict) {
- read_dict_indices_.insert(column_index);
- } else {
- read_dict_indices_.erase(column_index);
- }
- }
- bool read_dictionary(int column_index) const {
- if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
- return true;
- } else {
- return false;
- }
- }
-
- void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
-
- int64_t batch_size() const { return batch_size_; }
-
- /// Enable read coalescing.
- ///
- /// When enabled, the Arrow reader will pre-buffer necessary regions
- /// of the file in-memory. This is intended to improve performance on
- /// high-latency filesystems (e.g. Amazon S3).
- void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
-
- bool pre_buffer() const { return pre_buffer_; }
-
- /// Set options for read coalescing. This can be used to tune the
- /// implementation for characteristics of different filesystems.
- void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
-
- const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
-
- /// Set execution context for read coalescing.
- void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
-
- const ::arrow::io::IOContext& io_context() const { return io_context_; }
-
- /// Set timestamp unit to use for deprecated INT96-encoded timestamps
- /// (default is NANO).
- void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
- coerce_int96_timestamp_unit_ = unit;
- }
-
- ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
- return coerce_int96_timestamp_unit_;
- }
-
- private:
- bool use_threads_;
- std::unordered_set<int> read_dict_indices_;
- int64_t batch_size_;
- bool pre_buffer_;
- ::arrow::io::IOContext io_context_;
- ::arrow::io::CacheOptions cache_options_;
- ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
-};
-
-/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
-PARQUET_EXPORT
-ArrowReaderProperties default_arrow_reader_properties();
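A hedged sketch of tuning this EXPERIMENTAL read path for a high-latency filesystem; MakeS3TunedProps and its numbers are hypothetical:

    #include "parquet/properties.h"

    parquet::ArrowReaderProperties MakeS3TunedProps() {
      parquet::ArrowReaderProperties props(/*use_threads=*/true);
      props.set_batch_size(32 * 1024);  // rows per ::arrow::RecordBatch
      props.set_pre_buffer(true);       // coalesce reads, e.g. for Amazon S3
      props.set_cache_options(::arrow::io::CacheOptions::Defaults());
      props.set_coerce_int96_timestamp_unit(::arrow::TimeUnit::MICRO);
      return props;
    }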
-
-class PARQUET_EXPORT ArrowWriterProperties {
- public:
- enum EngineVersion {
- V1, // Supports only nested lists.
- V2 // Full support for all nesting combinations
- };
- class Builder {
- public:
- Builder()
- : write_timestamps_as_int96_(false),
- coerce_timestamps_enabled_(false),
- coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
- truncated_timestamps_allowed_(false),
- store_schema_(false),
- // TODO: At some point we should flip this.
- compliant_nested_types_(false),
- engine_version_(V2) {}
- virtual ~Builder() = default;
-
- Builder* disable_deprecated_int96_timestamps() {
- write_timestamps_as_int96_ = false;
- return this;
- }
-
- Builder* enable_deprecated_int96_timestamps() {
- write_timestamps_as_int96_ = true;
- return this;
- }
-
- Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
- coerce_timestamps_enabled_ = true;
- coerce_timestamps_unit_ = unit;
- return this;
- }
-
- Builder* allow_truncated_timestamps() {
- truncated_timestamps_allowed_ = true;
- return this;
- }
-
- Builder* disallow_truncated_timestamps() {
- truncated_timestamps_allowed_ = false;
- return this;
- }
-
- /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
- /// to enable certain read options (like "read_dictionary") to be set
- /// automatically
- Builder* store_schema() {
- store_schema_ = true;
- return this;
- }
-
- Builder* enable_compliant_nested_types() {
- compliant_nested_types_ = true;
- return this;
- }
-
- Builder* disable_compliant_nested_types() {
- compliant_nested_types_ = false;
- return this;
- }
-
- Builder* set_engine_version(EngineVersion version) {
- engine_version_ = version;
- return this;
- }
-
- std::shared_ptr<ArrowWriterProperties> build() {
- return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
- write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
- truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
- engine_version_));
- }
-
- private:
- bool write_timestamps_as_int96_;
-
- bool coerce_timestamps_enabled_;
- ::arrow::TimeUnit::type coerce_timestamps_unit_;
- bool truncated_timestamps_allowed_;
-
- bool store_schema_;
- bool compliant_nested_types_;
- EngineVersion engine_version_;
- };
-
- bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
-
- bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
- ::arrow::TimeUnit::type coerce_timestamps_unit() const {
- return coerce_timestamps_unit_;
- }
-
- bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
-
- bool store_schema() const { return store_schema_; }
-
-  /// \brief Enable nested type naming according to the Parquet specification.
-  ///
-  /// Older versions of Arrow wrote out field names for nested lists based on
-  /// the name of the field. According to the Parquet specification, they
-  /// should always be "element".
- bool compliant_nested_types() const { return compliant_nested_types_; }
-
- /// \brief The underlying engine version to use when writing Arrow data.
- ///
-  /// V2 is currently the latest; V1 is considered deprecated but left in
-  /// place in case bugs are detected in V2.
- EngineVersion engine_version() const { return engine_version_; }
-
- private:
- explicit ArrowWriterProperties(bool write_nanos_as_int96,
- bool coerce_timestamps_enabled,
- ::arrow::TimeUnit::type coerce_timestamps_unit,
- bool truncated_timestamps_allowed, bool store_schema,
- bool compliant_nested_types,
- EngineVersion engine_version)
- : write_timestamps_as_int96_(write_nanos_as_int96),
- coerce_timestamps_enabled_(coerce_timestamps_enabled),
- coerce_timestamps_unit_(coerce_timestamps_unit),
- truncated_timestamps_allowed_(truncated_timestamps_allowed),
- store_schema_(store_schema),
- compliant_nested_types_(compliant_nested_types),
- engine_version_(engine_version) {}
-
- const bool write_timestamps_as_int96_;
- const bool coerce_timestamps_enabled_;
- const ::arrow::TimeUnit::type coerce_timestamps_unit_;
- const bool truncated_timestamps_allowed_;
- const bool store_schema_;
- const bool compliant_nested_types_;
- const EngineVersion engine_version_;
-};
-
-/// \brief State object used for writing Arrow data directly to a Parquet
-/// column chunk. API possibly not stable
-struct ArrowWriteContext {
- ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
- : memory_pool(memory_pool),
- properties(properties),
- data_buffer(AllocateBuffer(memory_pool)),
- def_levels_buffer(AllocateBuffer(memory_pool)) {}
-
- template <typename T>
- ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
- ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
- *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
- return ::arrow::Status::OK();
- }
-
- MemoryPool* memory_pool;
- const ArrowWriterProperties* properties;
-
- // Buffer used for storing the data of an array converted to the physical type
- // as expected by parquet-cpp.
- std::shared_ptr<ResizableBuffer> data_buffer;
-
-  // We rely on the shared ownership of this buffer
- std::shared_ptr<ResizableBuffer> def_levels_buffer;
-};
-
-PARQUET_EXPORT
-std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/io/caching.h"
+#include "arrow/type.h"
+#include "arrow/util/compression.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/exception.h"
+#include "parquet/parquet_version.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// Determines use of Parquet Format version >= 2.0.0 logical types. For
+/// example, when writing from Arrow data structures, PARQUET_2_0 will enable
+/// use of INT_* and UINT_* converted types as well as nanosecond timestamps
+/// stored physically as INT64. Since some Parquet implementations do not
+/// support the logical types added in the 2.0.0 format version, if you want to
+/// maximize compatibility of your files you may want to use PARQUET_1_0.
+///
+/// Note that the 2.x format version series also introduced new serialized
+/// data page metadata and on disk data page layout. To enable this, use
+/// ParquetDataPageVersion.
+struct ParquetVersion;
+
+/// Controls serialization format of data pages. parquet-format v2.0.0
+/// introduced a new data page metadata type DataPageV2 and serialized page
+/// structure (for example, encoded levels are no longer compressed). Prior to
+/// the completion of PARQUET-457 in 2020, this library did not implement
+/// DataPageV2 correctly, so if you use the V2 data page format, you may have
+/// forward compatibility issues (older versions of the library will be unable
+/// to read the files). Note that some Parquet implementations do not implement
+/// DataPageV2 at all.
+enum class ParquetDataPageVersion { V1, V2 };
+
+/// Align the default buffer size to a small multiple of a page size.
+constexpr int64_t kDefaultBufferSize = 4096 * 4;
+
+class PARQUET_EXPORT ReaderProperties {
+ public:
+ explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
+ : pool_(pool) {}
+
+ MemoryPool* memory_pool() const { return pool_; }
+
+ std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
+ int64_t start, int64_t num_bytes);
+
+  /// Buffered stream reading allows the user to control the memory usage of
+  /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls
+  /// are wrapped in a buffered reader that uses a fixed-size buffer (of size
+  /// `buffer_size()`) instead of reading the full extent of the ReadAt.
+  ///
+  /// The primary reason for this control knob is resource control, not
+  /// performance.
+ bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
+ void enable_buffered_stream() { buffered_stream_enabled_ = true; }
+ void disable_buffered_stream() { buffered_stream_enabled_ = false; }
+
+ int64_t buffer_size() const { return buffer_size_; }
+ void set_buffer_size(int64_t size) { buffer_size_ = size; }
+
+ void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
+ file_decryption_properties_ = std::move(decryption);
+ }
+
+ const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
+ return file_decryption_properties_;
+ }
+
+ private:
+ MemoryPool* pool_;
+ int64_t buffer_size_ = kDefaultBufferSize;
+ bool buffered_stream_enabled_ = false;
+ std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
+};
+
+ReaderProperties PARQUET_EXPORT default_reader_properties();
+
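+// A minimal usage sketch for the properties above (the buffer size is
+// illustrative, not a recommendation):
+//
+//   ReaderProperties props = default_reader_properties();
+//   props.enable_buffered_stream();
+//   props.set_buffer_size(64 * 1024);  // each ReadAt goes through a 64 KiB buffer
+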
+static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
+static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
+static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
+static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
+static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
+static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
+
+class PARQUET_EXPORT ColumnProperties {
+ public:
+ ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
+ Compression::type codec = DEFAULT_COMPRESSION_TYPE,
+ bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
+ bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
+ size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
+ : encoding_(encoding),
+ codec_(codec),
+ dictionary_enabled_(dictionary_enabled),
+ statistics_enabled_(statistics_enabled),
+ max_stats_size_(max_stats_size),
+ compression_level_(Codec::UseDefaultCompressionLevel()) {}
+
+ void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
+
+ void set_compression(Compression::type codec) { codec_ = codec; }
+
+ void set_dictionary_enabled(bool dictionary_enabled) {
+ dictionary_enabled_ = dictionary_enabled;
+ }
+
+ void set_statistics_enabled(bool statistics_enabled) {
+ statistics_enabled_ = statistics_enabled;
+ }
+
+ void set_max_statistics_size(size_t max_stats_size) {
+ max_stats_size_ = max_stats_size;
+ }
+
+ void set_compression_level(int compression_level) {
+ compression_level_ = compression_level;
+ }
+
+ Encoding::type encoding() const { return encoding_; }
+
+ Compression::type compression() const { return codec_; }
+
+ bool dictionary_enabled() const { return dictionary_enabled_; }
+
+ bool statistics_enabled() const { return statistics_enabled_; }
+
+ size_t max_statistics_size() const { return max_stats_size_; }
+
+ int compression_level() const { return compression_level_; }
+
+ private:
+ Encoding::type encoding_;
+ Compression::type codec_;
+ bool dictionary_enabled_;
+ bool statistics_enabled_;
+ size_t max_stats_size_;
+ int compression_level_;
+};
+
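+// ColumnProperties is a plain value type; a hypothetical per-column override
+// could be prepared like this before handing it to a writer:
+//
+//   ColumnProperties cp;
+//   cp.set_compression(Compression::GZIP);
+//   cp.set_compression_level(6);  // compressor-specific, not validated here
+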
+class PARQUET_EXPORT WriterProperties {
+ public:
+ class Builder {
+ public:
+ Builder()
+ : pool_(::arrow::default_memory_pool()),
+ dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+ write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+ max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
+ pagesize_(kDefaultDataPageSize),
+ version_(ParquetVersion::PARQUET_1_0),
+ data_page_version_(ParquetDataPageVersion::V1),
+ created_by_(DEFAULT_CREATED_BY) {}
+ virtual ~Builder() {}
+
+ Builder* memory_pool(MemoryPool* pool) {
+ pool_ = pool;
+ return this;
+ }
+
+ Builder* enable_dictionary() {
+ default_column_properties_.set_dictionary_enabled(true);
+ return this;
+ }
+
+ Builder* disable_dictionary() {
+ default_column_properties_.set_dictionary_enabled(false);
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = true;
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->enable_dictionary(path->ToDotString());
+ }
+
+ Builder* disable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = false;
+ return this;
+ }
+
+ Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->disable_dictionary(path->ToDotString());
+ }
+
+ Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
+ dictionary_pagesize_limit_ = dictionary_psize_limit;
+ return this;
+ }
+
+ Builder* write_batch_size(int64_t write_batch_size) {
+ write_batch_size_ = write_batch_size;
+ return this;
+ }
+
+ Builder* max_row_group_length(int64_t max_row_group_length) {
+ max_row_group_length_ = max_row_group_length;
+ return this;
+ }
+
+ Builder* data_pagesize(int64_t pg_size) {
+ pagesize_ = pg_size;
+ return this;
+ }
+
+ Builder* data_page_version(ParquetDataPageVersion data_page_version) {
+ data_page_version_ = data_page_version;
+ return this;
+ }
+
+ Builder* version(ParquetVersion::type version) {
+ version_ = version;
+ return this;
+ }
+
+ Builder* created_by(const std::string& created_by) {
+ created_by_ = created_by;
+ return this;
+ }
+
+    /**
+     * Define the encoding that is used when we do not use dictionary encoding.
+     *
+     * This applies either when dictionary encoding is disabled or when the
+     * writer falls back because the dictionary grew too large.
+     */
+ Builder* encoding(Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
+
+ default_column_properties_.set_encoding(encoding_type);
+ return this;
+ }
+
+    /**
+     * Define the encoding that is used when we do not use dictionary encoding.
+     *
+     * This applies either when dictionary encoding is disabled or when the
+     * writer falls back because the dictionary grew too large.
+     */
+ Builder* encoding(const std::string& path, Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
+
+ encodings_[path] = encoding_type;
+ return this;
+ }
+
+    /**
+     * Define the encoding that is used when we do not use dictionary encoding.
+     *
+     * This applies either when dictionary encoding is disabled or when the
+     * writer falls back because the dictionary grew too large.
+     */
+ Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
+ Encoding::type encoding_type) {
+ return this->encoding(path->ToDotString(), encoding_type);
+ }
+
+ Builder* compression(Compression::type codec) {
+ default_column_properties_.set_compression(codec);
+ return this;
+ }
+
+ Builder* max_statistics_size(size_t max_stats_sz) {
+ default_column_properties_.set_max_statistics_size(max_stats_sz);
+ return this;
+ }
+
+ Builder* compression(const std::string& path, Compression::type codec) {
+ codecs_[path] = codec;
+ return this;
+ }
+
+ Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
+ Compression::type codec) {
+ return this->compression(path->ToDotString(), codec);
+ }
+
+    /// \brief Specify the default compression level for the compressor in
+    /// every column. If a column does not have an explicitly specified
+    /// compression level, this default is used.
+    ///
+    /// The provided compression level is compressor specific. Users should
+    /// familiarize themselves with the available levels for the selected
+    /// compressor. If the compressor does not support selecting different
+    /// compression levels, calling this function has no effect.
+    /// Parquet and Arrow do not validate the passed compression level. If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+ Builder* compression_level(int compression_level) {
+ default_column_properties_.set_compression_level(compression_level);
+ return this;
+ }
+
+ /// \brief Specify a compression level for the compressor for the column
+ /// described by path.
+ ///
+    /// The provided compression level is compressor specific. Users should
+    /// familiarize themselves with the available levels for the selected
+    /// compressor. If the compressor does not support selecting different
+    /// compression levels, calling this function has no effect.
+    /// Parquet and Arrow do not validate the passed compression level. If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+ Builder* compression_level(const std::string& path, int compression_level) {
+ codecs_compression_level_[path] = compression_level;
+ return this;
+ }
+
+ /// \brief Specify a compression level for the compressor for the column
+ /// described by path.
+ ///
+    /// The provided compression level is compressor specific. Users should
+    /// familiarize themselves with the available levels for the selected
+    /// compressor. If the compressor does not support selecting different
+    /// compression levels, calling this function has no effect.
+    /// Parquet and Arrow do not validate the passed compression level. If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+ Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
+ int compression_level) {
+ return this->compression_level(path->ToDotString(), compression_level);
+ }
+
+ Builder* encryption(
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
+ file_encryption_properties_ = std::move(file_encryption_properties);
+ return this;
+ }
+
+ Builder* enable_statistics() {
+ default_column_properties_.set_statistics_enabled(true);
+ return this;
+ }
+
+ Builder* disable_statistics() {
+ default_column_properties_.set_statistics_enabled(false);
+ return this;
+ }
+
+ Builder* enable_statistics(const std::string& path) {
+ statistics_enabled_[path] = true;
+ return this;
+ }
+
+ Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->enable_statistics(path->ToDotString());
+ }
+
+ Builder* disable_statistics(const std::string& path) {
+ statistics_enabled_[path] = false;
+ return this;
+ }
+
+ Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->disable_statistics(path->ToDotString());
+ }
+
+ std::shared_ptr<WriterProperties> build() {
+ std::unordered_map<std::string, ColumnProperties> column_properties;
+ auto get = [&](const std::string& key) -> ColumnProperties& {
+ auto it = column_properties.find(key);
+ if (it == column_properties.end())
+ return column_properties[key] = default_column_properties_;
+ else
+ return it->second;
+ };
+
+ for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
+ for (const auto& item : codecs_) get(item.first).set_compression(item.second);
+ for (const auto& item : codecs_compression_level_)
+ get(item.first).set_compression_level(item.second);
+ for (const auto& item : dictionary_enabled_)
+ get(item.first).set_dictionary_enabled(item.second);
+ for (const auto& item : statistics_enabled_)
+ get(item.first).set_statistics_enabled(item.second);
+
+ return std::shared_ptr<WriterProperties>(new WriterProperties(
+ pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
+ pagesize_, version_, created_by_, std::move(file_encryption_properties_),
+ default_column_properties_, column_properties, data_page_version_));
+ }
+
+ private:
+ MemoryPool* pool_;
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
+ int64_t pagesize_;
+ ParquetVersion::type version_;
+ ParquetDataPageVersion data_page_version_;
+ std::string created_by_;
+
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+ // Settings used for each column unless overridden in any of the maps below
+ ColumnProperties default_column_properties_;
+ std::unordered_map<std::string, Encoding::type> encodings_;
+ std::unordered_map<std::string, Compression::type> codecs_;
+ std::unordered_map<std::string, int32_t> codecs_compression_level_;
+ std::unordered_map<std::string, bool> dictionary_enabled_;
+ std::unordered_map<std::string, bool> statistics_enabled_;
+ };
+
+ inline MemoryPool* memory_pool() const { return pool_; }
+
+ inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
+
+ inline int64_t write_batch_size() const { return write_batch_size_; }
+
+ inline int64_t max_row_group_length() const { return max_row_group_length_; }
+
+ inline int64_t data_pagesize() const { return pagesize_; }
+
+ inline ParquetDataPageVersion data_page_version() const {
+ return parquet_data_page_version_;
+ }
+
+ inline ParquetVersion::type version() const { return parquet_version_; }
+
+ inline std::string created_by() const { return parquet_created_by_; }
+
+ inline Encoding::type dictionary_index_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::RLE_DICTIONARY;
+ }
+ }
+
+ inline Encoding::type dictionary_page_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::PLAIN;
+ }
+ }
+
+ const ColumnProperties& column_properties(
+ const std::shared_ptr<schema::ColumnPath>& path) const {
+ auto it = column_properties_.find(path->ToDotString());
+ if (it != column_properties_.end()) return it->second;
+ return default_column_properties_;
+ }
+
+ Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).encoding();
+ }
+
+ Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).compression();
+ }
+
+ int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).compression_level();
+ }
+
+ bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).dictionary_enabled();
+ }
+
+ bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).statistics_enabled();
+ }
+
+ size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).max_statistics_size();
+ }
+
+ inline FileEncryptionProperties* file_encryption_properties() const {
+ return file_encryption_properties_.get();
+ }
+
+ std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+ const std::string& path) const {
+ if (file_encryption_properties_) {
+ return file_encryption_properties_->column_encryption_properties(path);
+ } else {
+ return NULLPTR;
+ }
+ }
+
+ private:
+ explicit WriterProperties(
+ MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
+ int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
+ const std::string& created_by,
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+ const ColumnProperties& default_column_properties,
+ const std::unordered_map<std::string, ColumnProperties>& column_properties,
+ ParquetDataPageVersion data_page_version)
+ : pool_(pool),
+ dictionary_pagesize_limit_(dictionary_pagesize_limit),
+ write_batch_size_(write_batch_size),
+ max_row_group_length_(max_row_group_length),
+ pagesize_(pagesize),
+ parquet_data_page_version_(data_page_version),
+ parquet_version_(version),
+ parquet_created_by_(created_by),
+ file_encryption_properties_(file_encryption_properties),
+ default_column_properties_(default_column_properties),
+ column_properties_(column_properties) {}
+
+ MemoryPool* pool_;
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
+ int64_t pagesize_;
+ ParquetDataPageVersion parquet_data_page_version_;
+ ParquetVersion::type parquet_version_;
+ std::string parquet_created_by_;
+
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+ ColumnProperties default_column_properties_;
+ std::unordered_map<std::string, ColumnProperties> column_properties_;
+};
+
+PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
+
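+// A builder sketch using the API above (column paths and values are
+// illustrative):
+//
+//   std::shared_ptr<WriterProperties> props =
+//       WriterProperties::Builder()
+//           .compression(Compression::SNAPPY)         // default for all columns
+//           ->compression("col_a", Compression::ZSTD)  // per-column override
+//           ->compression_level("col_a", 5)
+//           ->disable_dictionary("col_b")
+//           ->max_row_group_length(1 << 20)
+//           ->build();
+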
+// ----------------------------------------------------------------------
+// Properties specific to Apache Arrow columnar read and write
+
+static constexpr bool kArrowDefaultUseThreads = false;
+
+// Default number of rows to read when using ::arrow::RecordBatchReader
+static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
+
+/// EXPERIMENTAL: Properties for configuring FileReader behavior.
+class PARQUET_EXPORT ArrowReaderProperties {
+ public:
+ explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
+ : use_threads_(use_threads),
+ read_dict_indices_(),
+ batch_size_(kArrowDefaultBatchSize),
+ pre_buffer_(false),
+ cache_options_(::arrow::io::CacheOptions::Defaults()),
+ coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
+
+ void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
+
+ bool use_threads() const { return use_threads_; }
+
+ void set_read_dictionary(int column_index, bool read_dict) {
+ if (read_dict) {
+ read_dict_indices_.insert(column_index);
+ } else {
+ read_dict_indices_.erase(column_index);
+ }
+ }
+ bool read_dictionary(int column_index) const {
+ if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
+
+ int64_t batch_size() const { return batch_size_; }
+
+ /// Enable read coalescing.
+ ///
+ /// When enabled, the Arrow reader will pre-buffer necessary regions
+ /// of the file in-memory. This is intended to improve performance on
+ /// high-latency filesystems (e.g. Amazon S3).
+ void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
+
+ bool pre_buffer() const { return pre_buffer_; }
+
+ /// Set options for read coalescing. This can be used to tune the
+ /// implementation for characteristics of different filesystems.
+ void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
+
+ const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
+
+ /// Set execution context for read coalescing.
+ void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
+
+ const ::arrow::io::IOContext& io_context() const { return io_context_; }
+
+ /// Set timestamp unit to use for deprecated INT96-encoded timestamps
+ /// (default is NANO).
+ void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
+ coerce_int96_timestamp_unit_ = unit;
+ }
+
+ ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
+ return coerce_int96_timestamp_unit_;
+ }
+
+ private:
+ bool use_threads_;
+ std::unordered_set<int> read_dict_indices_;
+ int64_t batch_size_;
+ bool pre_buffer_;
+ ::arrow::io::IOContext io_context_;
+ ::arrow::io::CacheOptions cache_options_;
+ ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
+};
+
+/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
+PARQUET_EXPORT
+ArrowReaderProperties default_arrow_reader_properties();
+
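+// A usage sketch (the column index and batch size are illustrative):
+//
+//   ArrowReaderProperties props = default_arrow_reader_properties();
+//   props.set_use_threads(true);
+//   props.set_batch_size(32 * 1024);     // rows per RecordBatch
+//   props.set_read_dictionary(0, true);  // decode column 0 to dictionary type
+//   props.set_pre_buffer(true);          // coalesce reads on high-latency FS
+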
+class PARQUET_EXPORT ArrowWriterProperties {
+ public:
+ enum EngineVersion {
+ V1, // Supports only nested lists.
+ V2 // Full support for all nesting combinations
+ };
+ class Builder {
+ public:
+ Builder()
+ : write_timestamps_as_int96_(false),
+ coerce_timestamps_enabled_(false),
+ coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
+ truncated_timestamps_allowed_(false),
+ store_schema_(false),
+ // TODO: At some point we should flip this.
+ compliant_nested_types_(false),
+ engine_version_(V2) {}
+ virtual ~Builder() = default;
+
+ Builder* disable_deprecated_int96_timestamps() {
+ write_timestamps_as_int96_ = false;
+ return this;
+ }
+
+ Builder* enable_deprecated_int96_timestamps() {
+ write_timestamps_as_int96_ = true;
+ return this;
+ }
+
+ Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
+ coerce_timestamps_enabled_ = true;
+ coerce_timestamps_unit_ = unit;
+ return this;
+ }
+
+ Builder* allow_truncated_timestamps() {
+ truncated_timestamps_allowed_ = true;
+ return this;
+ }
+
+ Builder* disallow_truncated_timestamps() {
+ truncated_timestamps_allowed_ = false;
+ return this;
+ }
+
+ /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
+ /// to enable certain read options (like "read_dictionary") to be set
+ /// automatically
+ Builder* store_schema() {
+ store_schema_ = true;
+ return this;
+ }
+
+ Builder* enable_compliant_nested_types() {
+ compliant_nested_types_ = true;
+ return this;
+ }
+
+ Builder* disable_compliant_nested_types() {
+ compliant_nested_types_ = false;
+ return this;
+ }
+
+ Builder* set_engine_version(EngineVersion version) {
+ engine_version_ = version;
+ return this;
+ }
+
+ std::shared_ptr<ArrowWriterProperties> build() {
+ return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
+ write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
+ truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
+ engine_version_));
+ }
+
+ private:
+ bool write_timestamps_as_int96_;
+
+ bool coerce_timestamps_enabled_;
+ ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ bool truncated_timestamps_allowed_;
+
+ bool store_schema_;
+ bool compliant_nested_types_;
+ EngineVersion engine_version_;
+ };
+
+ bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
+
+ bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
+ ::arrow::TimeUnit::type coerce_timestamps_unit() const {
+ return coerce_timestamps_unit_;
+ }
+
+ bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
+
+ bool store_schema() const { return store_schema_; }
+
+  /// \brief Enable nested type naming according to the Parquet specification.
+  ///
+  /// Older versions of Arrow wrote out field names for nested lists based on
+  /// the name of the field. According to the Parquet specification, they
+  /// should always be "element".
+ bool compliant_nested_types() const { return compliant_nested_types_; }
+
+ /// \brief The underlying engine version to use when writing Arrow data.
+ ///
+  /// V2 is currently the latest; V1 is considered deprecated but left in
+  /// place in case bugs are detected in V2.
+ EngineVersion engine_version() const { return engine_version_; }
+
+ private:
+ explicit ArrowWriterProperties(bool write_nanos_as_int96,
+ bool coerce_timestamps_enabled,
+ ::arrow::TimeUnit::type coerce_timestamps_unit,
+ bool truncated_timestamps_allowed, bool store_schema,
+ bool compliant_nested_types,
+ EngineVersion engine_version)
+ : write_timestamps_as_int96_(write_nanos_as_int96),
+ coerce_timestamps_enabled_(coerce_timestamps_enabled),
+ coerce_timestamps_unit_(coerce_timestamps_unit),
+ truncated_timestamps_allowed_(truncated_timestamps_allowed),
+ store_schema_(store_schema),
+ compliant_nested_types_(compliant_nested_types),
+ engine_version_(engine_version) {}
+
+ const bool write_timestamps_as_int96_;
+ const bool coerce_timestamps_enabled_;
+ const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ const bool truncated_timestamps_allowed_;
+ const bool store_schema_;
+ const bool compliant_nested_types_;
+ const EngineVersion engine_version_;
+};
+
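+// A builder sketch mirroring the API above (the time unit choice is
+// illustrative):
+//
+//   std::shared_ptr<ArrowWriterProperties> arrow_props =
+//       ArrowWriterProperties::Builder()
+//           .coerce_timestamps(::arrow::TimeUnit::MICRO)
+//           ->allow_truncated_timestamps()
+//           ->store_schema()
+//           ->build();
+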
+/// \brief State object used for writing Arrow data directly to a Parquet
+/// column chunk. API possibly not stable
+struct ArrowWriteContext {
+ ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
+ : memory_pool(memory_pool),
+ properties(properties),
+ data_buffer(AllocateBuffer(memory_pool)),
+ def_levels_buffer(AllocateBuffer(memory_pool)) {}
+
+ template <typename T>
+ ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
+ ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
+ *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
+ return ::arrow::Status::OK();
+ }
+
+ MemoryPool* memory_pool;
+ const ArrowWriterProperties* properties;
+
+ // Buffer used for storing the data of an array converted to the physical type
+ // as expected by parquet-cpp.
+ std::shared_ptr<ResizableBuffer> data_buffer;
+
+  // We rely on the shared ownership of this buffer
+ std::shared_ptr<ResizableBuffer> def_levels_buffer;
+};
+
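+// GetScratchData() above resizes the shared data buffer and returns a typed
+// pointer into it; a hypothetical caller could use it like this:
+//
+//   int32_t* scratch = nullptr;
+//   ARROW_RETURN_NOT_OK(ctx.GetScratchData<int32_t>(num_values, &scratch));
+//   // fill scratch[0..num_values) with the physical representation
+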
+PARQUET_EXPORT
+std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc b/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
index cfa6bdb2912..fe4e10d8514 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
@@ -1,945 +1,945 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema.h"
-
-#include <algorithm>
-#include <cstring>
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-#include "arrow/util/logging.h"
-#include "parquet/exception.h"
-#include "parquet/schema_internal.h"
-#include "parquet/thrift_internal.h"
-
-using parquet::format::SchemaElement;
-
-namespace parquet {
-
-namespace schema {
-
-namespace {
-
-void ThrowInvalidLogicalType(const LogicalType& logical_type) {
- std::stringstream ss;
- ss << "Invalid logical type: " << logical_type.ToString();
- throw ParquetException(ss.str());
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// ColumnPath
-
-std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
- std::stringstream ss(dotstring);
- std::string item;
- std::vector<std::string> path;
- while (std::getline(ss, item, '.')) {
- path.push_back(item);
- }
- return std::make_shared<ColumnPath>(std::move(path));
-}
-
-std::shared_ptr<ColumnPath> ColumnPath::FromNode(const Node& node) {
- // Build the path in reverse order as we traverse the nodes to the top
- std::vector<std::string> rpath_;
- const Node* cursor = &node;
- // The schema node is not part of the ColumnPath
- while (cursor->parent()) {
- rpath_.push_back(cursor->name());
- cursor = cursor->parent();
- }
-
- // Build ColumnPath in correct order
- std::vector<std::string> path(rpath_.crbegin(), rpath_.crend());
- return std::make_shared<ColumnPath>(std::move(path));
-}
-
-std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
- std::vector<std::string> path;
-  path.reserve(path_.size() + 1);
-  path.assign(path_.cbegin(), path_.cend());
-  path.push_back(node_name);
-
- return std::make_shared<ColumnPath>(std::move(path));
-}
-
-std::string ColumnPath::ToDotString() const {
- std::stringstream ss;
- for (auto it = path_.cbegin(); it != path_.cend(); ++it) {
- if (it != path_.cbegin()) {
- ss << ".";
- }
- ss << *it;
- }
- return ss.str();
-}
-
-const std::vector<std::string>& ColumnPath::ToDotVector() const { return path_; }
-
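-// Round-trip sketch for the helpers above (the dotstring is illustrative):
-//
-//   auto path = ColumnPath::FromDotString("a.b.c");
-//   path->ToDotString();               // "a.b.c"
-//   path->extend("d")->ToDotString();  // "a.b.c.d"
-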
-// ----------------------------------------------------------------------
-// Base node
-
-const std::shared_ptr<ColumnPath> Node::path() const {
- // TODO(itaiin): Cache the result, or more precisely, cache ->ToDotString()
- // since it is being used to access the leaf nodes
- return ColumnPath::FromNode(*this);
-}
-
-bool Node::EqualsInternal(const Node* other) const {
- return type_ == other->type_ && name_ == other->name_ &&
- repetition_ == other->repetition_ && converted_type_ == other->converted_type_ &&
- field_id_ == other->field_id() &&
- logical_type_->Equals(*(other->logical_type()));
-}
-
-void Node::SetParent(const Node* parent) { parent_ = parent; }
-
-// ----------------------------------------------------------------------
-// Primitive node
-
-PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
- Type::type type, ConvertedType::type converted_type,
- int length, int precision, int scale, int id)
- : Node(Node::PRIMITIVE, name, repetition, converted_type, id),
- physical_type_(type),
- type_length_(length) {
- std::stringstream ss;
-
- // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
- // set to true, but Impala will raise an incompatible metadata in such cases
- memset(&decimal_metadata_, 0, sizeof(decimal_metadata_));
-
- // Check if the physical and logical types match
- // Mapping referred from Apache parquet-mr as on 2016-02-22
- switch (converted_type) {
- case ConvertedType::NONE:
- // Logical type not set
- break;
- case ConvertedType::UTF8:
- case ConvertedType::JSON:
- case ConvertedType::BSON:
- if (type != Type::BYTE_ARRAY) {
- ss << ConvertedTypeToString(converted_type);
- ss << " can only annotate BYTE_ARRAY fields";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::DECIMAL:
- if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) &&
- (type != Type::FIXED_LEN_BYTE_ARRAY)) {
- ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
- throw ParquetException(ss.str());
- }
- if (precision <= 0) {
- ss << "Invalid DECIMAL precision: " << precision
- << ". Precision must be a number between 1 and 38 inclusive";
- throw ParquetException(ss.str());
- }
- if (scale < 0) {
- ss << "Invalid DECIMAL scale: " << scale
- << ". Scale must be a number between 0 and precision inclusive";
- throw ParquetException(ss.str());
- }
- if (scale > precision) {
- ss << "Invalid DECIMAL scale " << scale;
- ss << " cannot be greater than precision " << precision;
- throw ParquetException(ss.str());
- }
- decimal_metadata_.isset = true;
- decimal_metadata_.precision = precision;
- decimal_metadata_.scale = scale;
- break;
- case ConvertedType::DATE:
- case ConvertedType::TIME_MILLIS:
- case ConvertedType::UINT_8:
- case ConvertedType::UINT_16:
- case ConvertedType::UINT_32:
- case ConvertedType::INT_8:
- case ConvertedType::INT_16:
- case ConvertedType::INT_32:
- if (type != Type::INT32) {
- ss << ConvertedTypeToString(converted_type);
- ss << " can only annotate INT32";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::TIME_MICROS:
- case ConvertedType::TIMESTAMP_MILLIS:
- case ConvertedType::TIMESTAMP_MICROS:
- case ConvertedType::UINT_64:
- case ConvertedType::INT_64:
- if (type != Type::INT64) {
- ss << ConvertedTypeToString(converted_type);
- ss << " can only annotate INT64";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::INTERVAL:
- if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
- ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::ENUM:
- if (type != Type::BYTE_ARRAY) {
- ss << "ENUM can only annotate BYTE_ARRAY fields";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::NA:
- // NA can annotate any type
- break;
- default:
- ss << ConvertedTypeToString(converted_type);
- ss << " cannot be applied to a primitive type";
- throw ParquetException(ss.str());
- }
- // For forward compatibility, create an equivalent logical type
- logical_type_ = LogicalType::FromConvertedType(converted_type_, decimal_metadata_);
- if (!(logical_type_ && !logical_type_->is_nested() &&
- logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- if (type == Type::FIXED_LEN_BYTE_ARRAY) {
- if (length <= 0) {
- ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
- throw ParquetException(ss.str());
- }
- type_length_ = length;
- }
-}
-
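-// Construction sketch exercising the DECIMAL checks above; Make() is the
-// factory declared in parquet/schema.h (the values are illustrative):
-//
-//   auto n = PrimitiveNode::Make("d", Repetition::OPTIONAL, Type::BYTE_ARRAY,
-//                                ConvertedType::DECIMAL, /*length=*/-1,
-//                                /*precision=*/10, /*scale=*/2);
-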
-PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type,
- Type::type physical_type, int physical_length, int id)
- : Node(Node::PRIMITIVE, name, repetition, std::move(logical_type), id),
- physical_type_(physical_type),
- type_length_(physical_length) {
- std::stringstream error;
- if (logical_type_) {
- // Check for logical type <=> node type consistency
- if (!logical_type_->is_nested()) {
- // Check for logical type <=> physical type consistency
- if (logical_type_->is_applicable(physical_type, physical_length)) {
- // For backward compatibility, assign equivalent legacy
- // converted type (if possible)
- converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
- } else {
- error << logical_type_->ToString();
- error << " can not be applied to primitive type ";
- error << TypeToString(physical_type);
- throw ParquetException(error.str());
- }
- } else {
- error << "Nested logical type ";
- error << logical_type_->ToString();
- error << " can not be applied to non-group node";
- throw ParquetException(error.str());
- }
- } else {
- logical_type_ = NoLogicalType::Make();
- converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
- }
- if (!(logical_type_ && !logical_type_->is_nested() &&
- logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- if (physical_type == Type::FIXED_LEN_BYTE_ARRAY) {
- if (physical_length <= 0) {
- error << "Invalid FIXED_LEN_BYTE_ARRAY length: " << physical_length;
- throw ParquetException(error.str());
- }
- }
-}
-
-bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
- bool is_equal = true;
- if (physical_type_ != other->physical_type_) {
- return false;
- }
- if (converted_type_ == ConvertedType::DECIMAL) {
- is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) &&
- (decimal_metadata_.scale == other->decimal_metadata_.scale);
- }
- if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
- is_equal &= (type_length_ == other->type_length_);
- }
- return is_equal;
-}
-
-bool PrimitiveNode::Equals(const Node* other) const {
- if (!Node::EqualsInternal(other)) {
- return false;
- }
- return EqualsInternal(static_cast<const PrimitiveNode*>(other));
-}
-
-void PrimitiveNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
-
-void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
- visitor->Visit(this);
-}
-
-// ----------------------------------------------------------------------
-// Group node
-
-GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields, ConvertedType::type converted_type, int id)
- : Node(Node::GROUP, name, repetition, converted_type, id), fields_(fields) {
- // For forward compatibility, create an equivalent logical type
- logical_type_ = LogicalType::FromConvertedType(converted_type_);
- if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
- logical_type_->is_compatible(converted_type_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- field_name_to_idx_.clear();
- auto field_idx = 0;
- for (NodePtr& field : fields_) {
- field->SetParent(this);
- field_name_to_idx_.emplace(field->name(), field_idx++);
- }
-}
-
-GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- std::shared_ptr<const LogicalType> logical_type, int id)
- : Node(Node::GROUP, name, repetition, std::move(logical_type), id), fields_(fields) {
- if (logical_type_) {
- // Check for logical type <=> node type consistency
- if (logical_type_->is_nested()) {
- // For backward compatibility, assign equivalent legacy converted type (if possible)
- converted_type_ = logical_type_->ToConvertedType(nullptr);
- } else {
- std::stringstream error;
- error << "Logical type ";
- error << logical_type_->ToString();
- error << " can not be applied to group node";
- throw ParquetException(error.str());
- }
- } else {
- logical_type_ = NoLogicalType::Make();
- converted_type_ = logical_type_->ToConvertedType(nullptr);
- }
- if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
- logical_type_->is_compatible(converted_type_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- field_name_to_idx_.clear();
- auto field_idx = 0;
- for (NodePtr& field : fields_) {
- field->SetParent(this);
- field_name_to_idx_.emplace(field->name(), field_idx++);
- }
-}
-
-bool GroupNode::EqualsInternal(const GroupNode* other) const {
- if (this == other) {
- return true;
- }
- if (this->field_count() != other->field_count()) {
- return false;
- }
- for (int i = 0; i < this->field_count(); ++i) {
- if (!this->field(i)->Equals(other->field(i).get())) {
- return false;
- }
- }
- return true;
-}
-
-bool GroupNode::Equals(const Node* other) const {
- if (!Node::EqualsInternal(other)) {
- return false;
- }
- return EqualsInternal(static_cast<const GroupNode*>(other));
-}
-
-int GroupNode::FieldIndex(const std::string& name) const {
- auto search = field_name_to_idx_.find(name);
- if (search == field_name_to_idx_.end()) {
- // Not found
- return -1;
- }
- return search->second;
-}
-
-int GroupNode::FieldIndex(const Node& node) const {
- auto search = field_name_to_idx_.equal_range(node.name());
- for (auto it = search.first; it != search.second; ++it) {
- const int idx = it->second;
- if (&node == field(idx).get()) {
- return idx;
- }
- }
- return -1;
-}
-
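-// Lookup sketch for the two overloads above (the field name is illustrative):
-//
-//   int idx = group->FieldIndex("my_field");  // -1 when the name is absent
-//   if (idx >= 0) { /* group->field(idx) is the matching child */ }
-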
-void GroupNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
-
-void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { visitor->Visit(this); }
-
-// ----------------------------------------------------------------------
-// Node construction from Parquet metadata
-
-std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element,
- NodeVector fields) {
- const format::SchemaElement* element =
- static_cast<const format::SchemaElement*>(opaque_element);
-
- int field_id = -1;
- if (element->__isset.field_id) {
- field_id = element->field_id;
- }
-
- std::unique_ptr<GroupNode> group_node;
- if (element->__isset.logicalType) {
- // updated writer with logical type present
- group_node = std::unique_ptr<GroupNode>(
- new GroupNode(element->name, LoadEnumSafe(&element->repetition_type), fields,
- LogicalType::FromThrift(element->logicalType), field_id));
- } else {
- group_node = std::unique_ptr<GroupNode>(new GroupNode(
- element->name, LoadEnumSafe(&element->repetition_type), fields,
- (element->__isset.converted_type ? LoadEnumSafe(&element->converted_type)
- : ConvertedType::NONE),
- field_id));
- }
-
- return std::unique_ptr<Node>(group_node.release());
-}
-
-std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element) {
- const format::SchemaElement* element =
- static_cast<const format::SchemaElement*>(opaque_element);
-
- int field_id = -1;
- if (element->__isset.field_id) {
- field_id = element->field_id;
- }
-
- std::unique_ptr<PrimitiveNode> primitive_node;
- if (element->__isset.logicalType) {
- // updated writer with logical type present
- primitive_node = std::unique_ptr<PrimitiveNode>(
- new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type),
- LogicalType::FromThrift(element->logicalType),
- LoadEnumSafe(&element->type), element->type_length, field_id));
- } else if (element->__isset.converted_type) {
- // legacy writer with converted type present
- primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
- element->name, LoadEnumSafe(&element->repetition_type),
- LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
- element->type_length, element->precision, element->scale, field_id));
- } else {
- // logical type not present
- primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
- element->name, LoadEnumSafe(&element->repetition_type), NoLogicalType::Make(),
- LoadEnumSafe(&element->type), element->type_length, field_id));
- }
-
- // Return as unique_ptr to the base type
- return std::unique_ptr<Node>(primitive_node.release());
-}
-
-bool GroupNode::HasRepeatedFields() const {
- for (int i = 0; i < this->field_count(); ++i) {
- auto field = this->field(i);
- if (field->repetition() == Repetition::REPEATED) {
- return true;
- }
-    if (field->is_group()) {
-      const auto& group = static_cast<const GroupNode&>(*field);
-      // Recurse, but keep scanning the remaining fields if this subtree
-      // has no repeated fields.
-      if (group.HasRepeatedFields()) {
-        return true;
-      }
-    }
- }
- return false;
-}
-
-void GroupNode::ToParquet(void* opaque_element) const {
- format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
- element->__set_name(name_);
- element->__set_num_children(field_count());
- element->__set_repetition_type(ToThrift(repetition_));
- if (converted_type_ != ConvertedType::NONE) {
- element->__set_converted_type(ToThrift(converted_type_));
- }
- if (field_id_ >= 0) {
- element->__set_field_id(field_id_);
- }
- if (logical_type_ && logical_type_->is_serialized()) {
- element->__set_logicalType(logical_type_->ToThrift());
- }
- return;
-}
-
-void PrimitiveNode::ToParquet(void* opaque_element) const {
- format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
- element->__set_name(name_);
- element->__set_repetition_type(ToThrift(repetition_));
- if (converted_type_ != ConvertedType::NONE) {
- if (converted_type_ != ConvertedType::NA) {
- element->__set_converted_type(ToThrift(converted_type_));
- } else {
- // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
- // Never emit it (see PARQUET-1990 for discussion).
- if (!logical_type_ || !logical_type_->is_null()) {
- throw ParquetException(
- "ConvertedType::NA is obsolete, please use LogicalType::Null instead");
- }
- }
- }
- if (field_id_ >= 0) {
- element->__set_field_id(field_id_);
- }
- if (logical_type_ && logical_type_->is_serialized() &&
- // TODO(tpboudreau): remove the following conjunct to enable serialization
- // of IntervalTypes after parquet.thrift recognizes them
- !logical_type_->is_interval()) {
- element->__set_logicalType(logical_type_->ToThrift());
- }
- element->__set_type(ToThrift(physical_type_));
- if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
- element->__set_type_length(type_length_);
- }
- if (decimal_metadata_.isset) {
- element->__set_precision(decimal_metadata_.precision);
- element->__set_scale(decimal_metadata_.scale);
- }
- return;
-}
-
-// ----------------------------------------------------------------------
-// Schema converters
-
-std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length) {
- if (elements[0].num_children == 0) {
- if (length == 1) {
- // Degenerate case of Parquet file with no columns
- return GroupNode::FromParquet(elements, {});
- } else {
- throw ParquetException(
- "Parquet schema had multiple nodes but root had no children");
- }
- }
-
- // We don't check that the root node is repeated since this is not
- // consistently set by implementations
-
- int pos = 0;
-
- std::function<std::unique_ptr<Node>()> NextNode = [&]() {
- if (pos == length) {
- throw ParquetException("Malformed schema: not enough elements");
- }
- const SchemaElement& element = elements[pos++];
- const void* opaque_element = static_cast<const void*>(&element);
-
- if (element.num_children == 0 && element.__isset.type) {
- // Leaf (primitive) node: always has a type
- return PrimitiveNode::FromParquet(opaque_element);
- } else {
- // Group node (may have 0 children, but cannot have a type)
- NodeVector fields;
- for (int i = 0; i < element.num_children; ++i) {
- std::unique_ptr<Node> field = NextNode();
- fields.push_back(NodePtr(field.release()));
- }
- return GroupNode::FromParquet(opaque_element, std::move(fields));
- }
- };
- return NextNode();
-}
-
-std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
- if (schema.empty()) {
- throw ParquetException("Empty file schema (no root)");
- }
- std::unique_ptr<Node> root = Unflatten(&schema[0], static_cast<int>(schema.size()));
- std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
- descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
- return descr;
-}
-
-class SchemaVisitor : public Node::ConstVisitor {
- public:
- explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
- : elements_(elements) {}
-
- void Visit(const Node* node) override {
- format::SchemaElement element;
- node->ToParquet(&element);
- elements_->push_back(element);
-
- if (node->is_group()) {
- const GroupNode* group_node = static_cast<const GroupNode*>(node);
- for (int i = 0; i < group_node->field_count(); ++i) {
- group_node->field(i)->VisitConst(this);
- }
- }
- }
-
- private:
- std::vector<format::SchemaElement>* elements_;
-};
-
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
- SchemaVisitor visitor(out);
- schema->VisitConst(&visitor);
-}
-
-// ----------------------------------------------------------------------
-// Schema printing
-
-static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
- switch (repetition) {
- case Repetition::REQUIRED:
- stream << "required";
- break;
- case Repetition::OPTIONAL:
- stream << "optional";
- break;
- case Repetition::REPEATED:
- stream << "repeated";
- break;
- default:
- break;
- }
-}
-
-static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
- switch (node->physical_type()) {
- case Type::BOOLEAN:
- stream << "boolean";
- break;
- case Type::INT32:
- stream << "int32";
- break;
- case Type::INT64:
- stream << "int64";
- break;
- case Type::INT96:
- stream << "int96";
- break;
- case Type::FLOAT:
- stream << "float";
- break;
- case Type::DOUBLE:
- stream << "double";
- break;
- case Type::BYTE_ARRAY:
- stream << "binary";
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- stream << "fixed_len_byte_array(" << node->type_length() << ")";
- break;
- default:
- break;
- }
-}
-
-static void PrintConvertedType(const PrimitiveNode* node, std::ostream& stream) {
- auto lt = node->converted_type();
- auto la = node->logical_type();
- if (la && la->is_valid() && !la->is_none()) {
- stream << " (" << la->ToString() << ")";
- } else if (lt == ConvertedType::DECIMAL) {
- stream << " (" << ConvertedTypeToString(lt) << "("
- << node->decimal_metadata().precision << "," << node->decimal_metadata().scale
- << "))";
- } else if (lt != ConvertedType::NONE) {
- stream << " (" << ConvertedTypeToString(lt) << ")";
- }
-}
-
-struct SchemaPrinter : public Node::ConstVisitor {
-  explicit SchemaPrinter(std::ostream& stream, int indent_width)
-      : stream_(stream), indent_(0), indent_width_(indent_width) {}
-
- void Indent() {
- if (indent_ > 0) {
- std::string spaces(indent_, ' ');
- stream_ << spaces;
- }
- }
-
- void Visit(const Node* node) {
- Indent();
- if (node->is_group()) {
- Visit(static_cast<const GroupNode*>(node));
- } else {
- // Primitive
- Visit(static_cast<const PrimitiveNode*>(node));
- }
- }
-
- void Visit(const PrimitiveNode* node) {
- PrintRepLevel(node->repetition(), stream_);
- stream_ << " ";
- PrintType(node, stream_);
- stream_ << " field_id=" << node->field_id() << " " << node->name();
- PrintConvertedType(node, stream_);
- stream_ << ";" << std::endl;
- }
-
- void Visit(const GroupNode* node) {
- PrintRepLevel(node->repetition(), stream_);
- stream_ << " group "
- << "field_id=" << node->field_id() << " " << node->name();
- auto lt = node->converted_type();
- auto la = node->logical_type();
- if (la && la->is_valid() && !la->is_none()) {
- stream_ << " (" << la->ToString() << ")";
- } else if (lt != ConvertedType::NONE) {
- stream_ << " (" << ConvertedTypeToString(lt) << ")";
- }
- stream_ << " {" << std::endl;
-
- indent_ += indent_width_;
- for (int i = 0; i < node->field_count(); ++i) {
- node->field(i)->VisitConst(this);
- }
- indent_ -= indent_width_;
- Indent();
- stream_ << "}" << std::endl;
- }
-
- std::ostream& stream_;
- int indent_;
- int indent_width_;
-};
-
-void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
- SchemaPrinter printer(stream, indent_width);
- printer.Visit(schema);
-}
-
-} // namespace schema
-
-using schema::ColumnPath;
-using schema::GroupNode;
-using schema::Node;
-using schema::NodePtr;
-using schema::PrimitiveNode;
-
-void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
- Init(NodePtr(schema.release()));
-}
-
-class SchemaUpdater : public Node::Visitor {
- public:
- explicit SchemaUpdater(const std::vector<ColumnOrder>& column_orders)
- : column_orders_(column_orders), leaf_count_(0) {}
-
- void Visit(Node* node) override {
- if (node->is_group()) {
- GroupNode* group_node = static_cast<GroupNode*>(node);
- for (int i = 0; i < group_node->field_count(); ++i) {
- group_node->field(i)->Visit(this);
- }
- } else { // leaf node
- PrimitiveNode* leaf_node = static_cast<PrimitiveNode*>(node);
- leaf_node->SetColumnOrder(column_orders_[leaf_count_++]);
- }
- }
-
- private:
- const std::vector<ColumnOrder>& column_orders_;
- int leaf_count_;
-};
-
-void SchemaDescriptor::updateColumnOrders(const std::vector<ColumnOrder>& column_orders) {
- if (static_cast<int>(column_orders.size()) != num_columns()) {
- throw ParquetException("Malformed schema: not enough ColumnOrder values");
- }
- SchemaUpdater visitor(column_orders);
- const_cast<GroupNode*>(group_node_)->Visit(&visitor);
-}
-
-void SchemaDescriptor::Init(NodePtr schema) {
- schema_ = std::move(schema);
-
- if (!schema_->is_group()) {
- throw ParquetException("Must initialize with a schema group");
- }
-
- group_node_ = static_cast<const GroupNode*>(schema_.get());
- leaves_.clear();
-
- for (int i = 0; i < group_node_->field_count(); ++i) {
- BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
- }
-}
-
-bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
- if (this->num_columns() != other.num_columns()) {
- return false;
- }
-
- for (int i = 0; i < this->num_columns(); ++i) {
- if (!this->Column(i)->Equals(*other.Column(i))) {
- return false;
- }
- }
-
- return true;
-}
-
-void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
- int16_t max_rep_level, const NodePtr& base) {
- if (node->is_optional()) {
- ++max_def_level;
- } else if (node->is_repeated()) {
- // Repeated fields add a definition level. This is used to distinguish
- // between an empty list and a list with an item in it.
- ++max_rep_level;
- ++max_def_level;
- }
-
- // Now, walk the schema and create a ColumnDescriptor for each leaf node
- if (node->is_group()) {
- const GroupNode* group = static_cast<const GroupNode*>(node.get());
- for (int i = 0; i < group->field_count(); ++i) {
- BuildTree(group->field(i), max_def_level, max_rep_level, base);
- }
- } else {
- node_to_leaf_index_[static_cast<const PrimitiveNode*>(node.get())] =
- static_cast<int>(leaves_.size());
-
- // Primitive node, append to leaves
- leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
- leaf_to_base_.emplace(static_cast<int>(leaves_.size()) - 1, base);
- leaf_to_idx_.emplace(node->path()->ToDotString(),
- static_cast<int>(leaves_.size()) - 1);
- }
-}
-
-int SchemaDescriptor::GetColumnIndex(const PrimitiveNode& node) const {
- auto it = node_to_leaf_index_.find(&node);
- if (it == node_to_leaf_index_.end()) {
- return -1;
- }
- return it->second;
-}
-
-ColumnDescriptor::ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
- int16_t max_repetition_level,
- const SchemaDescriptor* schema_descr)
- : node_(std::move(node)),
- max_definition_level_(max_definition_level),
- max_repetition_level_(max_repetition_level) {
- if (!node_->is_primitive()) {
- throw ParquetException("Must be a primitive type");
- }
- primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
-}
-
-bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
- return primitive_node_->Equals(other.primitive_node_) &&
- max_repetition_level() == other.max_repetition_level() &&
- max_definition_level() == other.max_definition_level();
-}
-
-const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
- DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
- return &leaves_[i];
-}
-
-int SchemaDescriptor::ColumnIndex(const std::string& node_path) const {
- auto search = leaf_to_idx_.find(node_path);
- if (search == leaf_to_idx_.end()) {
- // Not found
- return -1;
- }
- return search->second;
-}
-
-int SchemaDescriptor::ColumnIndex(const Node& node) const {
- auto search = leaf_to_idx_.equal_range(node.path()->ToDotString());
- for (auto it = search.first; it != search.second; ++it) {
- const int idx = it->second;
- if (&node == Column(idx)->schema_node().get()) {
- return idx;
- }
- }
- return -1;
-}
-
-const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const {
- DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
- return leaf_to_base_.find(i)->second.get();
-}
-
-bool SchemaDescriptor::HasRepeatedFields() const {
- return group_node_->HasRepeatedFields();
-}
-
-std::string SchemaDescriptor::ToString() const {
- std::ostringstream ss;
- PrintSchema(schema_.get(), ss);
- return ss.str();
-}
-
-std::string ColumnDescriptor::ToString() const {
- std::ostringstream ss;
- ss << "column descriptor = {" << std::endl
- << " name: " << name() << "," << std::endl
- << " path: " << path()->ToDotString() << "," << std::endl
- << " physical_type: " << TypeToString(physical_type()) << "," << std::endl
- << " converted_type: " << ConvertedTypeToString(converted_type()) << ","
- << std::endl
- << " logical_type: " << logical_type()->ToString() << "," << std::endl
- << " max_definition_level: " << max_definition_level() << "," << std::endl
- << " max_repetition_level: " << max_repetition_level() << "," << std::endl;
-
- if (physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) {
- ss << " length: " << type_length() << "," << std::endl;
- }
-
- if (converted_type() == parquet::ConvertedType::DECIMAL) {
- ss << " precision: " << type_precision() << "," << std::endl
- << " scale: " << type_scale() << "," << std::endl;
- }
-
- ss << "}";
- return ss.str();
-}
-
-int ColumnDescriptor::type_scale() const {
- return primitive_node_->decimal_metadata().scale;
-}
-
-int ColumnDescriptor::type_precision() const {
- return primitive_node_->decimal_metadata().precision;
-}
-
-int ColumnDescriptor::type_length() const { return primitive_node_->type_length(); }
-
-const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
- return primitive_node_->path();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema.h"
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/logging.h"
+#include "parquet/exception.h"
+#include "parquet/schema_internal.h"
+#include "parquet/thrift_internal.h"
+
+using parquet::format::SchemaElement;
+
+namespace parquet {
+
+namespace schema {
+
+namespace {
+
+void ThrowInvalidLogicalType(const LogicalType& logical_type) {
+ std::stringstream ss;
+ ss << "Invalid logical type: " << logical_type.ToString();
+ throw ParquetException(ss.str());
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// ColumnPath
+
+std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
+ std::stringstream ss(dotstring);
+ std::string item;
+ std::vector<std::string> path;
+ while (std::getline(ss, item, '.')) {
+ path.push_back(item);
+ }
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::shared_ptr<ColumnPath> ColumnPath::FromNode(const Node& node) {
+ // Build the path in reverse order as we traverse the nodes to the top
+ std::vector<std::string> rpath;
+ const Node* cursor = &node;
+ // The schema node is not part of the ColumnPath
+ while (cursor->parent()) {
+ rpath.push_back(cursor->name());
+ cursor = cursor->parent();
+ }
+
+ // Build ColumnPath in correct order
+ std::vector<std::string> path(rpath.crbegin(), rpath.crend());
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
+ std::vector<std::string> path;
+ path.reserve(path_.size() + 1);
+ path.assign(path_.cbegin(), path_.cend());
+ path.push_back(node_name);
+
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::string ColumnPath::ToDotString() const {
+ std::stringstream ss;
+ for (auto it = path_.cbegin(); it != path_.cend(); ++it) {
+ if (it != path_.cbegin()) {
+ ss << ".";
+ }
+ ss << *it;
+ }
+ return ss.str();
+}
+
+const std::vector<std::string>& ColumnPath::ToDotVector() const { return path_; }
+
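+// Illustrative usage (editorial sketch, not part of the original file):
+// FromDotString and ToDotString round-trip a dotted path, splitting on '.':
+//
+//   auto path = ColumnPath::FromDotString("a.b.c");  // yields {"a","b","c"}
+//   assert(path->ToDotString() == "a.b.c");
+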
+// ----------------------------------------------------------------------
+// Base node
+
+const std::shared_ptr<ColumnPath> Node::path() const {
+ // TODO(itaiin): Cache the result, or more precisely, cache ->ToDotString()
+ // since it is being used to access the leaf nodes
+ return ColumnPath::FromNode(*this);
+}
+
+bool Node::EqualsInternal(const Node* other) const {
+ return type_ == other->type_ && name_ == other->name_ &&
+ repetition_ == other->repetition_ && converted_type_ == other->converted_type_ &&
+ field_id_ == other->field_id() &&
+ logical_type_->Equals(*(other->logical_type()));
+}
+
+void Node::SetParent(const Node* parent) { parent_ = parent; }
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+ Type::type type, ConvertedType::type converted_type,
+ int length, int precision, int scale, int id)
+ : Node(Node::PRIMITIVE, name, repetition, converted_type, id),
+ physical_type_(type),
+ type_length_(length) {
+ std::stringstream ss;
+
+ // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
+ // set to true, but Impala raises an "incompatible metadata" error in such
+ // cases, so zero-initialize it instead
+ memset(&decimal_metadata_, 0, sizeof(decimal_metadata_));
+
+ // Check if the physical and logical types match
+ // Mapping mirrors Apache parquet-mr as of 2016-02-22
+ switch (converted_type) {
+ case ConvertedType::NONE:
+ // Logical type not set
+ break;
+ case ConvertedType::UTF8:
+ case ConvertedType::JSON:
+ case ConvertedType::BSON:
+ if (type != Type::BYTE_ARRAY) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::DECIMAL:
+ if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) &&
+ (type != Type::FIXED_LEN_BYTE_ARRAY)) {
+ ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
+ throw ParquetException(ss.str());
+ }
+ if (precision <= 0) {
+ ss << "Invalid DECIMAL precision: " << precision
+ << ". Precision must be a number between 1 and 38 inclusive";
+ throw ParquetException(ss.str());
+ }
+ if (scale < 0) {
+ ss << "Invalid DECIMAL scale: " << scale
+ << ". Scale must be a number between 0 and precision inclusive";
+ throw ParquetException(ss.str());
+ }
+ if (scale > precision) {
+ ss << "Invalid DECIMAL scale " << scale;
+ ss << " cannot be greater than precision " << precision;
+ throw ParquetException(ss.str());
+ }
+ decimal_metadata_.isset = true;
+ decimal_metadata_.precision = precision;
+ decimal_metadata_.scale = scale;
+ break;
+ case ConvertedType::DATE:
+ case ConvertedType::TIME_MILLIS:
+ case ConvertedType::UINT_8:
+ case ConvertedType::UINT_16:
+ case ConvertedType::UINT_32:
+ case ConvertedType::INT_8:
+ case ConvertedType::INT_16:
+ case ConvertedType::INT_32:
+ if (type != Type::INT32) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate INT32";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::TIME_MICROS:
+ case ConvertedType::TIMESTAMP_MILLIS:
+ case ConvertedType::TIMESTAMP_MICROS:
+ case ConvertedType::UINT_64:
+ case ConvertedType::INT_64:
+ if (type != Type::INT64) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate INT64";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::INTERVAL:
+ if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
+ ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::ENUM:
+ if (type != Type::BYTE_ARRAY) {
+ ss << "ENUM can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::NA:
+ // NA can annotate any type
+ break;
+ default:
+ ss << ConvertedTypeToString(converted_type);
+ ss << " cannot be applied to a primitive type";
+ throw ParquetException(ss.str());
+ }
+ // For forward compatibility, create an equivalent logical type
+ logical_type_ = LogicalType::FromConvertedType(converted_type_, decimal_metadata_);
+ if (!(logical_type_ && !logical_type_->is_nested() &&
+ logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ if (type == Type::FIXED_LEN_BYTE_ARRAY) {
+ if (length <= 0) {
+ ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
+ throw ParquetException(ss.str());
+ }
+ type_length_ = length;
+ }
+}
+
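+// Illustrative sketch (editorial, not part of the original file): the
+// converted-type checks above reject mismatched annotations at construction
+// time, via the public factory declared in schema.h:
+//
+//   // OK: DECIMAL(9, 2) stored as INT32
+//   auto ok = PrimitiveNode::Make("price", Repetition::REQUIRED, Type::INT32,
+//                                 ConvertedType::DECIMAL, /*length=*/-1,
+//                                 /*precision=*/9, /*scale=*/2);
+//   // Throws ParquetException: UTF8 can only annotate BYTE_ARRAY fields
+//   auto bad = PrimitiveNode::Make("s", Repetition::REQUIRED, Type::INT32,
+//                                  ConvertedType::UTF8);
+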
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type physical_type, int physical_length, int id)
+ : Node(Node::PRIMITIVE, name, repetition, std::move(logical_type), id),
+ physical_type_(physical_type),
+ type_length_(physical_length) {
+ std::stringstream error;
+ if (logical_type_) {
+ // Check for logical type <=> node type consistency
+ if (!logical_type_->is_nested()) {
+ // Check for logical type <=> physical type consistency
+ if (logical_type_->is_applicable(physical_type, physical_length)) {
+ // For backward compatibility, assign equivalent legacy
+ // converted type (if possible)
+ converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
+ } else {
+ error << logical_type_->ToString();
+ error << " can not be applied to primitive type ";
+ error << TypeToString(physical_type);
+ throw ParquetException(error.str());
+ }
+ } else {
+ error << "Nested logical type ";
+ error << logical_type_->ToString();
+ error << " can not be applied to non-group node";
+ throw ParquetException(error.str());
+ }
+ } else {
+ logical_type_ = NoLogicalType::Make();
+ converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
+ }
+ if (!(logical_type_ && !logical_type_->is_nested() &&
+ logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ if (physical_type == Type::FIXED_LEN_BYTE_ARRAY) {
+ if (physical_length <= 0) {
+ error << "Invalid FIXED_LEN_BYTE_ARRAY length: " << physical_length;
+ throw ParquetException(error.str());
+ }
+ }
+}
+
+bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
+ bool is_equal = true;
+ if (physical_type_ != other->physical_type_) {
+ return false;
+ }
+ if (converted_type_ == ConvertedType::DECIMAL) {
+ is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) &&
+ (decimal_metadata_.scale == other->decimal_metadata_.scale);
+ }
+ if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ is_equal &= (type_length_ == other->type_length_);
+ }
+ return is_equal;
+}
+
+bool PrimitiveNode::Equals(const Node* other) const {
+ if (!Node::EqualsInternal(other)) {
+ return false;
+ }
+ return EqualsInternal(static_cast<const PrimitiveNode*>(other));
+}
+
+void PrimitiveNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
+
+void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
+ visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, ConvertedType::type converted_type, int id)
+ : Node(Node::GROUP, name, repetition, converted_type, id), fields_(fields) {
+ // For forward compatibility, create an equivalent logical type
+ logical_type_ = LogicalType::FromConvertedType(converted_type_);
+ if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
+ logical_type_->is_compatible(converted_type_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ field_name_to_idx_.clear();
+ auto field_idx = 0;
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ field_name_to_idx_.emplace(field->name(), field_idx++);
+ }
+}
+
+GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ std::shared_ptr<const LogicalType> logical_type, int id)
+ : Node(Node::GROUP, name, repetition, std::move(logical_type), id), fields_(fields) {
+ if (logical_type_) {
+ // Check for logical type <=> node type consistency
+ if (logical_type_->is_nested()) {
+ // For backward compatibility, assign equivalent legacy converted type (if possible)
+ converted_type_ = logical_type_->ToConvertedType(nullptr);
+ } else {
+ std::stringstream error;
+ error << "Logical type ";
+ error << logical_type_->ToString();
+ error << " can not be applied to group node";
+ throw ParquetException(error.str());
+ }
+ } else {
+ logical_type_ = NoLogicalType::Make();
+ converted_type_ = logical_type_->ToConvertedType(nullptr);
+ }
+ if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
+ logical_type_->is_compatible(converted_type_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ field_name_to_idx_.clear();
+ auto field_idx = 0;
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ field_name_to_idx_.emplace(field->name(), field_idx++);
+ }
+}
+
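+// Note (editorial): field_name_to_idx_ is an unordered_multimap, so duplicate
+// field names are permitted. FieldIndex(const std::string&) then returns an
+// unspecified match, while FieldIndex(const Node&) below disambiguates by
+// comparing node pointers.
+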
+bool GroupNode::EqualsInternal(const GroupNode* other) const {
+ if (this == other) {
+ return true;
+ }
+ if (this->field_count() != other->field_count()) {
+ return false;
+ }
+ for (int i = 0; i < this->field_count(); ++i) {
+ if (!this->field(i)->Equals(other->field(i).get())) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool GroupNode::Equals(const Node* other) const {
+ if (!Node::EqualsInternal(other)) {
+ return false;
+ }
+ return EqualsInternal(static_cast<const GroupNode*>(other));
+}
+
+int GroupNode::FieldIndex(const std::string& name) const {
+ auto search = field_name_to_idx_.find(name);
+ if (search == field_name_to_idx_.end()) {
+ // Not found
+ return -1;
+ }
+ return search->second;
+}
+
+int GroupNode::FieldIndex(const Node& node) const {
+ auto search = field_name_to_idx_.equal_range(node.name());
+ for (auto it = search.first; it != search.second; ++it) {
+ const int idx = it->second;
+ if (&node == field(idx).get()) {
+ return idx;
+ }
+ }
+ return -1;
+}
+
+void GroupNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
+
+void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { visitor->Visit(this); }
+
+// ----------------------------------------------------------------------
+// Node construction from Parquet metadata
+
+std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element,
+ NodeVector fields) {
+ const format::SchemaElement* element =
+ static_cast<const format::SchemaElement*>(opaque_element);
+
+ int field_id = -1;
+ if (element->__isset.field_id) {
+ field_id = element->field_id;
+ }
+
+ std::unique_ptr<GroupNode> group_node;
+ if (element->__isset.logicalType) {
+ // updated writer with logical type present
+ group_node = std::unique_ptr<GroupNode>(
+ new GroupNode(element->name, LoadEnumSafe(&element->repetition_type), fields,
+ LogicalType::FromThrift(element->logicalType), field_id));
+ } else {
+ group_node = std::unique_ptr<GroupNode>(new GroupNode(
+ element->name, LoadEnumSafe(&element->repetition_type), fields,
+ (element->__isset.converted_type ? LoadEnumSafe(&element->converted_type)
+ : ConvertedType::NONE),
+ field_id));
+ }
+
+ return std::unique_ptr<Node>(group_node.release());
+}
+
+std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element) {
+ const format::SchemaElement* element =
+ static_cast<const format::SchemaElement*>(opaque_element);
+
+ int field_id = -1;
+ if (element->__isset.field_id) {
+ field_id = element->field_id;
+ }
+
+ std::unique_ptr<PrimitiveNode> primitive_node;
+ if (element->__isset.logicalType) {
+ // updated writer with logical type present
+ primitive_node = std::unique_ptr<PrimitiveNode>(
+ new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type),
+ LogicalType::FromThrift(element->logicalType),
+ LoadEnumSafe(&element->type), element->type_length, field_id));
+ } else if (element->__isset.converted_type) {
+ // legacy writer with converted type present
+ primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
+ element->name, LoadEnumSafe(&element->repetition_type),
+ LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
+ element->type_length, element->precision, element->scale, field_id));
+ } else {
+ // logical type not present
+ primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
+ element->name, LoadEnumSafe(&element->repetition_type), NoLogicalType::Make(),
+ LoadEnumSafe(&element->type), element->type_length, field_id));
+ }
+
+ // Return as unique_ptr to the base type
+ return std::unique_ptr<Node>(primitive_node.release());
+}
+
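+// Note (editorial): when a writer sets both logicalType and converted_type,
+// the logicalType branch above wins; the legacy converted_type is then
+// re-derived from it inside the PrimitiveNode constructor.
+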
+bool GroupNode::HasRepeatedFields() const {
+ for (int i = 0; i < this->field_count(); ++i) {
+ auto field = this->field(i);
+ if (field->repetition() == Repetition::REPEATED) {
+ return true;
+ }
+ if (field->is_group()) {
+ const auto& group = static_cast<const GroupNode&>(*field);
+ // Recurse without returning early on a negative result: a later
+ // sibling may still contain a repeated field.
+ if (group.HasRepeatedFields()) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void GroupNode::ToParquet(void* opaque_element) const {
+ format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
+ element->__set_name(name_);
+ element->__set_num_children(field_count());
+ element->__set_repetition_type(ToThrift(repetition_));
+ if (converted_type_ != ConvertedType::NONE) {
+ element->__set_converted_type(ToThrift(converted_type_));
+ }
+ if (field_id_ >= 0) {
+ element->__set_field_id(field_id_);
+ }
+ if (logical_type_ && logical_type_->is_serialized()) {
+ element->__set_logicalType(logical_type_->ToThrift());
+ }
+}
+
+void PrimitiveNode::ToParquet(void* opaque_element) const {
+ format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
+ element->__set_name(name_);
+ element->__set_repetition_type(ToThrift(repetition_));
+ if (converted_type_ != ConvertedType::NONE) {
+ if (converted_type_ != ConvertedType::NA) {
+ element->__set_converted_type(ToThrift(converted_type_));
+ } else {
+ // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
+ // Never emit it (see PARQUET-1990 for discussion).
+ if (!logical_type_ || !logical_type_->is_null()) {
+ throw ParquetException(
+ "ConvertedType::NA is obsolete, please use LogicalType::Null instead");
+ }
+ }
+ }
+ if (field_id_ >= 0) {
+ element->__set_field_id(field_id_);
+ }
+ if (logical_type_ && logical_type_->is_serialized() &&
+ // TODO(tpboudreau): remove the following conjunct to enable serialization
+ // of IntervalTypes after parquet.thrift recognizes them
+ !logical_type_->is_interval()) {
+ element->__set_logicalType(logical_type_->ToThrift());
+ }
+ element->__set_type(ToThrift(physical_type_));
+ if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ element->__set_type_length(type_length_);
+ }
+ if (decimal_metadata_.isset) {
+ element->__set_precision(decimal_metadata_.precision);
+ element->__set_scale(decimal_metadata_.scale);
+ }
+}
+
+// ----------------------------------------------------------------------
+// Schema converters
+
+std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length) {
+ if (elements[0].num_children == 0) {
+ if (length == 1) {
+ // Degenerate case of Parquet file with no columns
+ return GroupNode::FromParquet(elements, {});
+ } else {
+ throw ParquetException(
+ "Parquet schema had multiple nodes but root had no children");
+ }
+ }
+
+ // We don't check that the root node is repeated since this is not
+ // consistently set by implementations
+
+ int pos = 0;
+
+ std::function<std::unique_ptr<Node>()> NextNode = [&]() {
+ if (pos == length) {
+ throw ParquetException("Malformed schema: not enough elements");
+ }
+ const SchemaElement& element = elements[pos++];
+ const void* opaque_element = static_cast<const void*>(&element);
+
+ if (element.num_children == 0 && element.__isset.type) {
+ // Leaf (primitive) node: always has a type
+ return PrimitiveNode::FromParquet(opaque_element);
+ } else {
+ // Group node (may have 0 children, but cannot have a type)
+ NodeVector fields;
+ for (int i = 0; i < element.num_children; ++i) {
+ std::unique_ptr<Node> field = NextNode();
+ fields.push_back(NodePtr(field.release()));
+ }
+ return GroupNode::FromParquet(opaque_element, std::move(fields));
+ }
+ };
+ return NextNode();
+}
+
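+// Worked example (editorial sketch, not part of the original file): Unflatten
+// consumes the flat, depth-first SchemaElement list produced by Thrift. For
+//
+//   required group schema {      <- elements[0], num_children = 2
+//     required int32 id;         <- elements[1]
+//     optional group name {      <- elements[2], num_children = 1
+//       optional binary first;   <- elements[3]
+//     }
+//   }
+//
+// NextNode() is entered once per element and recurses num_children times for
+// each group, rebuilding the tree in a single left-to-right pass.
+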
+std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
+ if (schema.empty()) {
+ throw ParquetException("Empty file schema (no root)");
+ }
+ std::unique_ptr<Node> root = Unflatten(&schema[0], static_cast<int>(schema.size()));
+ std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
+ descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
+ return descr;
+}
+
+class SchemaVisitor : public Node::ConstVisitor {
+ public:
+ explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
+ : elements_(elements) {}
+
+ void Visit(const Node* node) override {
+ format::SchemaElement element;
+ node->ToParquet(&element);
+ elements_->push_back(element);
+
+ if (node->is_group()) {
+ const GroupNode* group_node = static_cast<const GroupNode*>(node);
+ for (int i = 0; i < group_node->field_count(); ++i) {
+ group_node->field(i)->VisitConst(this);
+ }
+ }
+ }
+
+ private:
+ std::vector<format::SchemaElement>* elements_;
+};
+
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
+ SchemaVisitor visitor(out);
+ schema->VisitConst(&visitor);
+}
+
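+// Illustrative sketch (editorial; `descr` is a hypothetical
+// std::shared_ptr<SchemaDescriptor>): ToParquet and FromParquet are inverses
+// over the flattened representation:
+//
+//   std::vector<format::SchemaElement> elements;
+//   ToParquet(descr->group_node(), &elements);  // flatten, depth-first
+//   auto restored = FromParquet(elements);      // rebuild a descriptor
+//   assert(restored->Equals(*descr));
+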
+// ----------------------------------------------------------------------
+// Schema printing
+
+static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
+ switch (repetition) {
+ case Repetition::REQUIRED:
+ stream << "required";
+ break;
+ case Repetition::OPTIONAL:
+ stream << "optional";
+ break;
+ case Repetition::REPEATED:
+ stream << "repeated";
+ break;
+ default:
+ break;
+ }
+}
+
+static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
+ switch (node->physical_type()) {
+ case Type::BOOLEAN:
+ stream << "boolean";
+ break;
+ case Type::INT32:
+ stream << "int32";
+ break;
+ case Type::INT64:
+ stream << "int64";
+ break;
+ case Type::INT96:
+ stream << "int96";
+ break;
+ case Type::FLOAT:
+ stream << "float";
+ break;
+ case Type::DOUBLE:
+ stream << "double";
+ break;
+ case Type::BYTE_ARRAY:
+ stream << "binary";
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ stream << "fixed_len_byte_array(" << node->type_length() << ")";
+ break;
+ default:
+ break;
+ }
+}
+
+static void PrintConvertedType(const PrimitiveNode* node, std::ostream& stream) {
+ auto lt = node->converted_type();
+ auto la = node->logical_type();
+ if (la && la->is_valid() && !la->is_none()) {
+ stream << " (" << la->ToString() << ")";
+ } else if (lt == ConvertedType::DECIMAL) {
+ stream << " (" << ConvertedTypeToString(lt) << "("
+ << node->decimal_metadata().precision << "," << node->decimal_metadata().scale
+ << "))";
+ } else if (lt != ConvertedType::NONE) {
+ stream << " (" << ConvertedTypeToString(lt) << ")";
+ }
+}
+
+struct SchemaPrinter : public Node::ConstVisitor {
+ explicit SchemaPrinter(std::ostream& stream, int indent_width)
+ : stream_(stream), indent_(0), indent_width_(indent_width) {}
+
+ void Indent() {
+ if (indent_ > 0) {
+ std::string spaces(indent_, ' ');
+ stream_ << spaces;
+ }
+ }
+
+ void Visit(const Node* node) {
+ Indent();
+ if (node->is_group()) {
+ Visit(static_cast<const GroupNode*>(node));
+ } else {
+ // Primitive
+ Visit(static_cast<const PrimitiveNode*>(node));
+ }
+ }
+
+ void Visit(const PrimitiveNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " ";
+ PrintType(node, stream_);
+ stream_ << " field_id=" << node->field_id() << " " << node->name();
+ PrintConvertedType(node, stream_);
+ stream_ << ";" << std::endl;
+ }
+
+ void Visit(const GroupNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " group "
+ << "field_id=" << node->field_id() << " " << node->name();
+ auto lt = node->converted_type();
+ auto la = node->logical_type();
+ if (la && la->is_valid() && !la->is_none()) {
+ stream_ << " (" << la->ToString() << ")";
+ } else if (lt != ConvertedType::NONE) {
+ stream_ << " (" << ConvertedTypeToString(lt) << ")";
+ }
+ stream_ << " {" << std::endl;
+
+ indent_ += indent_width_;
+ for (int i = 0; i < node->field_count(); ++i) {
+ node->field(i)->VisitConst(this);
+ }
+ indent_ -= indent_width_;
+ Indent();
+ stream_ << "}" << std::endl;
+ }
+
+ std::ostream& stream_;
+ int indent_;
+ int indent_width_;
+};
+
+void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
+ SchemaPrinter printer(stream, indent_width);
+ printer.Visit(schema);
+}
+
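+// Illustrative usage (editorial sketch): printing any schema node, e.g. one
+// built with GroupNode::Make(), using the default two-space indent:
+//
+//   parquet::schema::PrintSchema(root.get(), std::cout);
+//
+// A required int32 leaf named "id" with no assigned field id renders as:
+//
+//   required int32 field_id=-1 id;
+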
+} // namespace schema
+
+using schema::ColumnPath;
+using schema::GroupNode;
+using schema::Node;
+using schema::NodePtr;
+using schema::PrimitiveNode;
+
+void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
+ Init(NodePtr(schema.release()));
+}
+
+class SchemaUpdater : public Node::Visitor {
+ public:
+ explicit SchemaUpdater(const std::vector<ColumnOrder>& column_orders)
+ : column_orders_(column_orders), leaf_count_(0) {}
+
+ void Visit(Node* node) override {
+ if (node->is_group()) {
+ GroupNode* group_node = static_cast<GroupNode*>(node);
+ for (int i = 0; i < group_node->field_count(); ++i) {
+ group_node->field(i)->Visit(this);
+ }
+ } else { // leaf node
+ PrimitiveNode* leaf_node = static_cast<PrimitiveNode*>(node);
+ leaf_node->SetColumnOrder(column_orders_[leaf_count_++]);
+ }
+ }
+
+ private:
+ const std::vector<ColumnOrder>& column_orders_;
+ int leaf_count_;
+};
+
+void SchemaDescriptor::updateColumnOrders(const std::vector<ColumnOrder>& column_orders) {
+ if (static_cast<int>(column_orders.size()) != num_columns()) {
+ throw ParquetException("Malformed schema: not enough ColumnOrder values");
+ }
+ SchemaUpdater visitor(column_orders);
+ const_cast<GroupNode*>(group_node_)->Visit(&visitor);
+}
+
+void SchemaDescriptor::Init(NodePtr schema) {
+ schema_ = std::move(schema);
+
+ if (!schema_->is_group()) {
+ throw ParquetException("Must initialize with a schema group");
+ }
+
+ group_node_ = static_cast<const GroupNode*>(schema_.get());
+ leaves_.clear();
+
+ for (int i = 0; i < group_node_->field_count(); ++i) {
+ BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
+ }
+}
+
+bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
+ if (this->num_columns() != other.num_columns()) {
+ return false;
+ }
+
+ for (int i = 0; i < this->num_columns(); ++i) {
+ if (!this->Column(i)->Equals(*other.Column(i))) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level, const NodePtr& base) {
+ if (node->is_optional()) {
+ ++max_def_level;
+ } else if (node->is_repeated()) {
+ // Repeated fields add a definition level. This is used to distinguish
+ // between an empty list and a list with an item in it.
+ ++max_rep_level;
+ ++max_def_level;
+ }
+
+ // Now, walk the schema and create a ColumnDescriptor for each leaf node
+ if (node->is_group()) {
+ const GroupNode* group = static_cast<const GroupNode*>(node.get());
+ for (int i = 0; i < group->field_count(); ++i) {
+ BuildTree(group->field(i), max_def_level, max_rep_level, base);
+ }
+ } else {
+ node_to_leaf_index_[static_cast<const PrimitiveNode*>(node.get())] =
+ static_cast<int>(leaves_.size());
+
+ // Primitive node, append to leaves
+ leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
+ leaf_to_base_.emplace(static_cast<int>(leaves_.size()) - 1, base);
+ leaf_to_idx_.emplace(node->path()->ToDotString(),
+ static_cast<int>(leaves_.size()) - 1);
+ }
+}
+
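+// Worked example (editorial sketch): for the schema
+//
+//   optional group a {
+//     repeated group b {
+//       optional int32 c;
+//     }
+//   }
+//
+// BuildTree gives the leaf `a.b.c` max_def_level = 3 (one level each for
+// optional `a`, repeated `b`, and optional `c`) and max_rep_level = 1 (for
+// repeated `b`), and records `a` as the leaf's root node in leaf_to_base_.
+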
+int SchemaDescriptor::GetColumnIndex(const PrimitiveNode& node) const {
+ auto it = node_to_leaf_index_.find(&node);
+ if (it == node_to_leaf_index_.end()) {
+ return -1;
+ }
+ return it->second;
+}
+
+ColumnDescriptor::ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
+ int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr)
+ : node_(std::move(node)),
+ max_definition_level_(max_definition_level),
+ max_repetition_level_(max_repetition_level) {
+ if (!node_->is_primitive()) {
+ throw ParquetException("Must be a primitive type");
+ }
+ primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
+}
+
+bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
+ return primitive_node_->Equals(other.primitive_node_) &&
+ max_repetition_level() == other.max_repetition_level() &&
+ max_definition_level() == other.max_definition_level();
+}
+
+const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return &leaves_[i];
+}
+
+int SchemaDescriptor::ColumnIndex(const std::string& node_path) const {
+ auto search = leaf_to_idx_.find(node_path);
+ if (search == leaf_to_idx_.end()) {
+ // Not found
+ return -1;
+ }
+ return search->second;
+}
+
+int SchemaDescriptor::ColumnIndex(const Node& node) const {
+ auto search = leaf_to_idx_.equal_range(node.path()->ToDotString());
+ for (auto it = search.first; it != search.second; ++it) {
+ const int idx = it->second;
+ if (&node == Column(idx)->schema_node().get()) {
+ return idx;
+ }
+ }
+ return -1;
+}
+
+const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return leaf_to_base_.find(i)->second.get();
+}
+
+bool SchemaDescriptor::HasRepeatedFields() const {
+ return group_node_->HasRepeatedFields();
+}
+
+std::string SchemaDescriptor::ToString() const {
+ std::ostringstream ss;
+ PrintSchema(schema_.get(), ss);
+ return ss.str();
+}
+
+std::string ColumnDescriptor::ToString() const {
+ std::ostringstream ss;
+ ss << "column descriptor = {" << std::endl
+ << " name: " << name() << "," << std::endl
+ << " path: " << path()->ToDotString() << "," << std::endl
+ << " physical_type: " << TypeToString(physical_type()) << "," << std::endl
+ << " converted_type: " << ConvertedTypeToString(converted_type()) << ","
+ << std::endl
+ << " logical_type: " << logical_type()->ToString() << "," << std::endl
+ << " max_definition_level: " << max_definition_level() << "," << std::endl
+ << " max_repetition_level: " << max_repetition_level() << "," << std::endl;
+
+ if (physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) {
+ ss << " length: " << type_length() << "," << std::endl;
+ }
+
+ if (converted_type() == parquet::ConvertedType::DECIMAL) {
+ ss << " precision: " << type_precision() << "," << std::endl
+ << " scale: " << type_scale() << "," << std::endl;
+ }
+
+ ss << "}";
+ return ss.str();
+}
+
+int ColumnDescriptor::type_scale() const {
+ return primitive_node_->decimal_metadata().scale;
+}
+
+int ColumnDescriptor::type_precision() const {
+ return primitive_node_->decimal_metadata().precision;
+}
+
+int ColumnDescriptor::type_length() const { return primitive_node_->type_length(); }
+
+const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
+ return primitive_node_->path();
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema.h b/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
index 7dcfa7d144e..63fc4706c7e 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
@@ -1,494 +1,494 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// This module contains the logical parquet-cpp types (independent of Thrift
-// structures), schema nodes, and related type tools
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-class SchemaDescriptor;
-
-namespace schema {
-
-class Node;
-
-// List encodings: using the terminology from Impala to define different styles
-// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
-// the converted type named in the Parquet metadata is ConvertedType::LIST we
-// use that terminology here. It also helps distinguish from the *_ARRAY
-// primitive types.
-//
-// One-level encoding: Only allows required lists with required cells
-// repeated value_type name
-//
-// Two-level encoding: Enables optional lists with only required cells
-// <required/optional> group list
-// repeated value_type item
-//
-// Three-level encoding: Enables optional lists with optional cells
-// <required/optional> group bag
-// repeated group list
-// <required/optional> value_type item
-//
-// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
-// the non-repeated nodes set to required.
-//
-// The "official" encoding recommended in the Parquet spec is the 3-level, and
-// we use that as the default when creating list types. For semantic completeness
-// we allow the other two. Since all types of encodings will occur "in the
-// wild" we need to be able to interpret the associated definition levels in
-// the context of the actual encoding used in the file.
-//
-// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
-// SchemaElement, which could make things challenging if we are trying to infer
-// that a sequence of nodes semantically represents an array according to one
-// of these encodings (versus a struct containing an array). We should refuse
-// the temptation to guess, as they say.
-struct ListEncoding {
- enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
-};
-
-class PARQUET_EXPORT ColumnPath {
- public:
- ColumnPath() : path_() {}
- explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
- explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
-
- static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
- static std::shared_ptr<ColumnPath> FromNode(const Node& node);
-
- std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
- std::string ToDotString() const;
- const std::vector<std::string>& ToDotVector() const;
-
- protected:
- std::vector<std::string> path_;
-};
-
-// Base class for logical schema types. A type has a name, repetition level,
-// and optionally a logical type (ConvertedType in Parquet metadata parlance)
-class PARQUET_EXPORT Node {
- public:
- enum type { PRIMITIVE, GROUP };
-
- virtual ~Node() {}
-
- bool is_primitive() const { return type_ == Node::PRIMITIVE; }
-
- bool is_group() const { return type_ == Node::GROUP; }
-
- bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
-
- bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
-
- bool is_required() const { return repetition_ == Repetition::REQUIRED; }
-
- virtual bool Equals(const Node* other) const = 0;
-
- const std::string& name() const { return name_; }
-
- Node::type node_type() const { return type_; }
-
- Repetition::type repetition() const { return repetition_; }
-
- ConvertedType::type converted_type() const { return converted_type_; }
-
- const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
-
- /// \brief The field_id value for the serialized SchemaElement. If the
- /// field_id is less than 0 (e.g. -1), it will not be set when serialized to
- /// Thrift.
- int field_id() const { return field_id_; }
-
- PARQUET_DEPRECATED("id() is deprecated. Use field_id() instead")
- int id() const { return field_id_; }
-
- const Node* parent() const { return parent_; }
-
- const std::shared_ptr<ColumnPath> path() const;
-
- virtual void ToParquet(void* element) const = 0;
-
- // Node::Visitor abstract class for walking schemas with the visitor pattern
- class Visitor {
- public:
- virtual ~Visitor() {}
-
- virtual void Visit(Node* node) = 0;
- };
- class ConstVisitor {
- public:
- virtual ~ConstVisitor() {}
-
- virtual void Visit(const Node* node) = 0;
- };
-
- virtual void Visit(Visitor* visitor) = 0;
- virtual void VisitConst(ConstVisitor* visitor) const = 0;
-
- protected:
- friend class GroupNode;
-
- Node(Node::type type, const std::string& name, Repetition::type repetition,
- ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
- : type_(type),
- name_(name),
- repetition_(repetition),
- converted_type_(converted_type),
- field_id_(field_id),
- parent_(NULLPTR) {}
-
- Node(Node::type type, const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
- : type_(type),
- name_(name),
- repetition_(repetition),
- logical_type_(std::move(logical_type)),
- field_id_(field_id),
- parent_(NULLPTR) {}
-
- Node::type type_;
- std::string name_;
- Repetition::type repetition_;
- ConvertedType::type converted_type_;
- std::shared_ptr<const LogicalType> logical_type_;
- int field_id_;
- // Nodes should not be shared, they have a single parent.
- const Node* parent_;
-
- bool EqualsInternal(const Node* other) const;
- void SetParent(const Node* p_parent);
-
- private:
- PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
-};
-
-// Save our breath all over the place with these typedefs
-typedef std::shared_ptr<Node> NodePtr;
-typedef std::vector<NodePtr> NodeVector;
-
-// A type that is one of the primitive Parquet storage types. In addition to
-// the other type metadata (name, repetition level, logical type), also has the
-// physical storage type and their type-specific metadata (byte width, decimal
-// parameters)
-class PARQUET_EXPORT PrimitiveNode : public Node {
- public:
- static std::unique_ptr<Node> FromParquet(const void* opaque_element);
-
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- Type::type type,
- ConvertedType::type converted_type = ConvertedType::NONE,
- int length = -1, int precision = -1, int scale = -1,
- int field_id = -1) {
- return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
- precision, scale, field_id));
- }
-
- // If no logical type, pass LogicalType::None() or nullptr
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type,
- Type::type primitive_type, int primitive_length = -1,
- int field_id = -1) {
- return NodePtr(new PrimitiveNode(name, repetition, logical_type, primitive_type,
- primitive_length, field_id));
- }
-
- bool Equals(const Node* other) const override;
-
- Type::type physical_type() const { return physical_type_; }
-
- ColumnOrder column_order() const { return column_order_; }
-
- void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
-
- int32_t type_length() const { return type_length_; }
-
- const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
-
- void ToParquet(void* element) const override;
- void Visit(Visitor* visitor) override;
- void VisitConst(ConstVisitor* visitor) const override;
-
- private:
- PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
- ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
- int precision = -1, int scale = -1, int field_id = -1);
-
- PrimitiveNode(const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type,
- Type::type primitive_type, int primitive_length = -1, int field_id = -1);
-
- Type::type physical_type_;
- int32_t type_length_;
- DecimalMetadata decimal_metadata_;
- ColumnOrder column_order_;
-
- // For FIXED_LEN_BYTE_ARRAY
- void SetTypeLength(int32_t length) { type_length_ = length; }
-
- bool EqualsInternal(const PrimitiveNode* other) const;
-
- FRIEND_TEST(TestPrimitiveNode, Attrs);
- FRIEND_TEST(TestPrimitiveNode, Equals);
- FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
- FRIEND_TEST(TestPrimitiveNode, FromParquet);
-};
-
-class PARQUET_EXPORT GroupNode : public Node {
- public:
- static std::unique_ptr<Node> FromParquet(const void* opaque_element,
- NodeVector fields = {});
-
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- ConvertedType::type converted_type = ConvertedType::NONE,
- int field_id = -1) {
- return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
- }
-
- // If no logical type, pass nullptr
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- std::shared_ptr<const LogicalType> logical_type,
- int field_id = -1) {
- return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id));
- }
-
- bool Equals(const Node* other) const override;
-
- NodePtr field(int i) const { return fields_[i]; }
- // Get the index of a field by its name, or negative value if not found.
- // If several fields share the same name, it is unspecified which one
- // is returned.
- int FieldIndex(const std::string& name) const;
- // Get the index of a field by its node, or negative value if not found.
- int FieldIndex(const Node& node) const;
-
- int field_count() const { return static_cast<int>(fields_.size()); }
-
- void ToParquet(void* element) const override;
- void Visit(Visitor* visitor) override;
- void VisitConst(ConstVisitor* visitor) const override;
-
- /// \brief Return true if this node or any child node has REPEATED repetition
- /// type
- bool HasRepeatedFields() const;
-
- private:
- GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
-
- GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
- int field_id = -1);
-
- NodeVector fields_;
- bool EqualsInternal(const GroupNode* other) const;
-
- // Mapping between field name to the field index
- std::unordered_multimap<std::string, int> field_name_to_idx_;
-
- FRIEND_TEST(TestGroupNode, Attrs);
- FRIEND_TEST(TestGroupNode, Equals);
- FRIEND_TEST(TestGroupNode, FieldIndex);
- FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
-};
-
-// ----------------------------------------------------------------------
-// Convenience primitive type factory functions
-
-#define PRIMITIVE_FACTORY(FuncName, TYPE) \
- static inline NodePtr FuncName(const std::string& name, \
- Repetition::type repetition = Repetition::OPTIONAL, \
- int field_id = -1) { \
- return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
- /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
- }
-
-PRIMITIVE_FACTORY(Boolean, BOOLEAN)
-PRIMITIVE_FACTORY(Int32, INT32)
-PRIMITIVE_FACTORY(Int64, INT64)
-PRIMITIVE_FACTORY(Int96, INT96)
-PRIMITIVE_FACTORY(Float, FLOAT)
-PRIMITIVE_FACTORY(Double, DOUBLE)
-PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
-
-void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
- int indent_width = 2);
-
-} // namespace schema
-
-// The ColumnDescriptor encapsulates information necessary to interpret
-// primitive column data in the context of a particular schema. We have to
-// examine the node structure of a column's path to the root in the schema tree
-// to be able to reassemble the nested structure from the repetition and
-// definition levels.
-class PARQUET_EXPORT ColumnDescriptor {
- public:
- ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
- int16_t max_repetition_level,
- const SchemaDescriptor* schema_descr = NULLPTR);
-
- bool Equals(const ColumnDescriptor& other) const;
-
- int16_t max_definition_level() const { return max_definition_level_; }
-
- int16_t max_repetition_level() const { return max_repetition_level_; }
-
- Type::type physical_type() const { return primitive_node_->physical_type(); }
-
- ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
-
- const std::shared_ptr<const LogicalType>& logical_type() const {
- return primitive_node_->logical_type();
- }
-
- ColumnOrder column_order() const { return primitive_node_->column_order(); }
-
- SortOrder::type sort_order() const {
- auto la = logical_type();
- auto pt = physical_type();
- return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
- }
-
- const std::string& name() const { return primitive_node_->name(); }
-
- const std::shared_ptr<schema::ColumnPath> path() const;
-
- const schema::NodePtr& schema_node() const { return node_; }
-
- std::string ToString() const;
-
- int type_length() const;
-
- int type_precision() const;
-
- int type_scale() const;
-
- private:
- schema::NodePtr node_;
- const schema::PrimitiveNode* primitive_node_;
-
- int16_t max_definition_level_;
- int16_t max_repetition_level_;
-};
-
-// Container for the converted Parquet schema with a computed information from
-// the schema analysis needed for file reading
-//
-// * Column index to Node
-// * Max repetition / definition levels for each primitive node
-//
-// The ColumnDescriptor objects produced by this class can be used to assist in
-// the reconstruction of fully materialized data structures from the
-// repetition-definition level encoding of nested data
-//
-// TODO(wesm): this object can be recomputed from a Schema
-class PARQUET_EXPORT SchemaDescriptor {
- public:
- SchemaDescriptor() {}
- ~SchemaDescriptor() {}
-
- // Analyze the schema
- void Init(std::unique_ptr<schema::Node> schema);
- void Init(schema::NodePtr schema);
-
- const ColumnDescriptor* Column(int i) const;
-
- // Get the index of a column by its dotstring path, or negative value if not found.
- // If several columns share the same dotstring path, it is unspecified which one
- // is returned.
- int ColumnIndex(const std::string& node_path) const;
- // Get the index of a column by its node, or negative value if not found.
- int ColumnIndex(const schema::Node& node) const;
-
- bool Equals(const SchemaDescriptor& other) const;
-
- // The number of physical columns appearing in the file
- int num_columns() const { return static_cast<int>(leaves_.size()); }
-
- const schema::NodePtr& schema_root() const { return schema_; }
-
- const schema::GroupNode* group_node() const { return group_node_; }
-
- // Returns the root (child of the schema root) node of the leaf(column) node
- const schema::Node* GetColumnRoot(int i) const;
-
- const std::string& name() const { return group_node_->name(); }
-
- std::string ToString() const;
-
- void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
-
- /// \brief Return column index corresponding to a particular
- /// PrimitiveNode. Returns -1 if not found
- int GetColumnIndex(const schema::PrimitiveNode& node) const;
-
- /// \brief Return true if any field or their children have REPEATED repetition
- /// type
- bool HasRepeatedFields() const;
-
- private:
- friend class ColumnDescriptor;
-
- // Root Node
- schema::NodePtr schema_;
- // Root Node
- const schema::GroupNode* group_node_;
-
- void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
- int16_t max_rep_level, const schema::NodePtr& base);
-
- // Result of leaf node / tree analysis
- std::vector<ColumnDescriptor> leaves_;
-
- std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
-
- // Mapping between leaf nodes and root group of leaf (first node
- // below the schema's root group)
- //
- // For example, the leaf `a.b.c.d` would have a link back to `a`
- //
- // -- a <------
- // -- -- b |
- // -- -- -- c |
- // -- -- -- -- d
- std::unordered_map<int, schema::NodePtr> leaf_to_base_;
-
- // Mapping between ColumnPath DotString to the leaf index
- std::unordered_multimap<std::string, int> leaf_to_idx_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class SchemaDescriptor;
+
+namespace schema {
+
+class Node;
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+// repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+// <required/optional> group list
+// repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+// <required/optional> group bag
+// repeated group list
+// <required/optional> value_type item
+//
+// The 2-level and 1-level encodings are equivalent to the 3-level encoding
+// with the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+struct ListEncoding {
+ enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
+};
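+
+// For illustration only, a minimal sketch of the three encodings above, built
+// with the factory helpers declared later in this file; field names
+// ("my_list", "item") are placeholders:
+//
+//   // Three-level (the spec-recommended default):
+//   NodePtr item = PrimitiveNode::Make("item", Repetition::OPTIONAL, Type::INT32);
+//   NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {item});
+//   NodePtr array = GroupNode::Make("my_list", Repetition::OPTIONAL, {list},
+//                                   ConvertedType::LIST);
+//
+//   // One-level: a bare repeated primitive stands in for the whole list.
+//   NodePtr flat = PrimitiveNode::Make("my_list", Repetition::REPEATED, Type::INT32);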
+
+class PARQUET_EXPORT ColumnPath {
+ public:
+ ColumnPath() : path_() {}
+ explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+ explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+
+ static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+ static std::shared_ptr<ColumnPath> FromNode(const Node& node);
+
+ std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+ std::string ToDotString() const;
+ const std::vector<std::string>& ToDotVector() const;
+
+ protected:
+ std::vector<std::string> path_;
+};
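+
+// A minimal usage sketch of the helpers above; the dotstring "a.b.c" is a
+// placeholder:
+//
+//   std::shared_ptr<ColumnPath> path = ColumnPath::FromDotString("a.b.c");
+//   std::shared_ptr<ColumnPath> extended = path->extend("d");
+//   std::string dotted = extended->ToDotString();  // "a.b.c.d"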
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class PARQUET_EXPORT Node {
+ public:
+ enum type { PRIMITIVE, GROUP };
+
+ virtual ~Node() {}
+
+ bool is_primitive() const { return type_ == Node::PRIMITIVE; }
+
+ bool is_group() const { return type_ == Node::GROUP; }
+
+ bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
+
+ bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
+
+ bool is_required() const { return repetition_ == Repetition::REQUIRED; }
+
+ virtual bool Equals(const Node* other) const = 0;
+
+ const std::string& name() const { return name_; }
+
+ Node::type node_type() const { return type_; }
+
+ Repetition::type repetition() const { return repetition_; }
+
+ ConvertedType::type converted_type() const { return converted_type_; }
+
+ const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
+
+ /// \brief The field_id value for the serialized SchemaElement. If the
+ /// field_id is less than 0 (e.g. -1), it will not be set when serialized to
+ /// Thrift.
+ int field_id() const { return field_id_; }
+
+ PARQUET_DEPRECATED("id() is deprecated. Use field_id() instead")
+ int id() const { return field_id_; }
+
+ const Node* parent() const { return parent_; }
+
+ const std::shared_ptr<ColumnPath> path() const;
+
+ virtual void ToParquet(void* element) const = 0;
+
+ // Node::Visitor abstract class for walking schemas with the visitor pattern
+ class Visitor {
+ public:
+ virtual ~Visitor() {}
+
+ virtual void Visit(Node* node) = 0;
+ };
+ class ConstVisitor {
+ public:
+ virtual ~ConstVisitor() {}
+
+ virtual void Visit(const Node* node) = 0;
+ };
+
+ virtual void Visit(Visitor* visitor) = 0;
+ virtual void VisitConst(ConstVisitor* visitor) const = 0;
+
+ protected:
+ friend class GroupNode;
+
+ Node(Node::type type, const std::string& name, Repetition::type repetition,
+ ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
+ : type_(type),
+ name_(name),
+ repetition_(repetition),
+ converted_type_(converted_type),
+ field_id_(field_id),
+ parent_(NULLPTR) {}
+
+ Node(Node::type type, const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
+ : type_(type),
+ name_(name),
+ repetition_(repetition),
+ logical_type_(std::move(logical_type)),
+ field_id_(field_id),
+ parent_(NULLPTR) {}
+
+ Node::type type_;
+ std::string name_;
+ Repetition::type repetition_;
+ ConvertedType::type converted_type_;
+ std::shared_ptr<const LogicalType> logical_type_;
+ int field_id_;
+ // Nodes should not be shared, they have a single parent.
+ const Node* parent_;
+
+ bool EqualsInternal(const Node* other) const;
+ void SetParent(const Node* p_parent);
+
+ private:
+ PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
+};
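+
+// A minimal sketch of the visitor hooks above: a ConstVisitor that collects
+// node names while walking a schema. The class name `NameCollector` is a
+// placeholder, not upstream API:
+//
+//   class NameCollector : public Node::ConstVisitor {
+//    public:
+//     void Visit(const Node* node) override { names_.push_back(node->name()); }
+//     const std::vector<std::string>& names() const { return names_; }
+//
+//    private:
+//     std::vector<std::string> names_;
+//   };
+//
+//   // node->VisitConst(&collector) then dispatches on the concrete node type.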
+
+// Save our breath all over the place with these typedefs
+typedef std::shared_ptr<Node> NodePtr;
+typedef std::vector<NodePtr> NodeVector;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), it also has
+// the physical storage type and its type-specific metadata (byte width,
+// decimal parameters).
+class PARQUET_EXPORT PrimitiveNode : public Node {
+ public:
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element);
+
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ Type::type type,
+ ConvertedType::type converted_type = ConvertedType::NONE,
+ int length = -1, int precision = -1, int scale = -1,
+ int field_id = -1) {
+ return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
+ precision, scale, field_id));
+ }
+
+ // If no logical type, pass LogicalType::None() or nullptr
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type primitive_type, int primitive_length = -1,
+ int field_id = -1) {
+ return NodePtr(new PrimitiveNode(name, repetition, logical_type, primitive_type,
+ primitive_length, field_id));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ Type::type physical_type() const { return physical_type_; }
+
+ ColumnOrder column_order() const { return column_order_; }
+
+ void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
+
+ int32_t type_length() const { return type_length_; }
+
+ const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
+
+ void ToParquet(void* element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+ PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
+ ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
+ int precision = -1, int scale = -1, int field_id = -1);
+
+ PrimitiveNode(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type primitive_type, int primitive_length = -1, int field_id = -1);
+
+ Type::type physical_type_;
+ int32_t type_length_;
+ DecimalMetadata decimal_metadata_;
+ ColumnOrder column_order_;
+
+ // For FIXED_LEN_BYTE_ARRAY
+ void SetTypeLength(int32_t length) { type_length_ = length; }
+
+ bool EqualsInternal(const PrimitiveNode* other) const;
+
+ FRIEND_TEST(TestPrimitiveNode, Attrs);
+ FRIEND_TEST(TestPrimitiveNode, Equals);
+ FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
+ FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
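+
+// For illustration, two equivalent ways to declare a decimal(9, 2) column with
+// the factories above, assuming the LogicalType::Decimal() factory from
+// parquet/types.h; the field name "price" is a placeholder:
+//
+//   // ConvertedType-style:
+//   NodePtr a = PrimitiveNode::Make("price", Repetition::REQUIRED,
+//                                   Type::FIXED_LEN_BYTE_ARRAY,
+//                                   ConvertedType::DECIMAL,
+//                                   /*length=*/4, /*precision=*/9, /*scale=*/2);
+//
+//   // LogicalType-style:
+//   NodePtr b = PrimitiveNode::Make("price", Repetition::REQUIRED,
+//                                   LogicalType::Decimal(/*precision=*/9, /*scale=*/2),
+//                                   Type::FIXED_LEN_BYTE_ARRAY,
+//                                   /*primitive_length=*/4);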
+
+class PARQUET_EXPORT GroupNode : public Node {
+ public:
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element,
+ NodeVector fields = {});
+
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ ConvertedType::type converted_type = ConvertedType::NONE,
+ int field_id = -1) {
+ return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
+ }
+
+ // If no logical type, pass nullptr
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ std::shared_ptr<const LogicalType> logical_type,
+ int field_id = -1) {
+ return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ NodePtr field(int i) const { return fields_[i]; }
+ // Get the index of a field by its name, or negative value if not found.
+ // If several fields share the same name, it is unspecified which one
+ // is returned.
+ int FieldIndex(const std::string& name) const;
+ // Get the index of a field by its node, or negative value if not found.
+ int FieldIndex(const Node& node) const;
+
+ int field_count() const { return static_cast<int>(fields_.size()); }
+
+ void ToParquet(void* element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ /// \brief Return true if this node or any child node has REPEATED repetition
+ /// type
+ bool HasRepeatedFields() const;
+
+ private:
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
+
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
+ int field_id = -1);
+
+ NodeVector fields_;
+ bool EqualsInternal(const GroupNode* other) const;
+
+ // Mapping between field name to the field index
+ std::unordered_multimap<std::string, int> field_name_to_idx_;
+
+ FRIEND_TEST(TestGroupNode, Attrs);
+ FRIEND_TEST(TestGroupNode, Equals);
+ FRIEND_TEST(TestGroupNode, FieldIndex);
+ FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+#define PRIMITIVE_FACTORY(FuncName, TYPE) \
+ static inline NodePtr FuncName(const std::string& name, \
+ Repetition::type repetition = Repetition::OPTIONAL, \
+ int field_id = -1) { \
+ return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
+ /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
+ }
+
+PRIMITIVE_FACTORY(Boolean, BOOLEAN)
+PRIMITIVE_FACTORY(Int32, INT32)
+PRIMITIVE_FACTORY(Int64, INT64)
+PRIMITIVE_FACTORY(Int96, INT96)
+PRIMITIVE_FACTORY(Float, FLOAT)
+PRIMITIVE_FACTORY(Double, DOUBLE)
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
+
+void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
+ int indent_width = 2);
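+
+// A minimal sketch tying the pieces above together (schema and field names are
+// placeholders; printing assumes <iostream>):
+//
+//   NodeVector fields;
+//   fields.push_back(Int64("id", Repetition::REQUIRED));
+//   fields.push_back(ByteArray("payload", Repetition::OPTIONAL));
+//   NodePtr root = GroupNode::Make("schema", Repetition::REQUIRED, fields);
+//   PrintSchema(root.get(), std::cout);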
+
+} // namespace schema
+
+// The ColumnDescriptor encapsulates information necessary to interpret
+// primitive column data in the context of a particular schema. We have to
+// examine the node structure of a column's path to the root in the schema tree
+// to be able to reassemble the nested structure from the repetition and
+// definition levels.
+class PARQUET_EXPORT ColumnDescriptor {
+ public:
+ ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
+ int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr = NULLPTR);
+
+ bool Equals(const ColumnDescriptor& other) const;
+
+ int16_t max_definition_level() const { return max_definition_level_; }
+
+ int16_t max_repetition_level() const { return max_repetition_level_; }
+
+ Type::type physical_type() const { return primitive_node_->physical_type(); }
+
+ ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
+
+ const std::shared_ptr<const LogicalType>& logical_type() const {
+ return primitive_node_->logical_type();
+ }
+
+ ColumnOrder column_order() const { return primitive_node_->column_order(); }
+
+ SortOrder::type sort_order() const {
+ auto la = logical_type();
+ auto pt = physical_type();
+ return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
+ }
+
+ const std::string& name() const { return primitive_node_->name(); }
+
+ const std::shared_ptr<schema::ColumnPath> path() const;
+
+ const schema::NodePtr& schema_node() const { return node_; }
+
+ std::string ToString() const;
+
+ int type_length() const;
+
+ int type_precision() const;
+
+ int type_scale() const;
+
+ private:
+ schema::NodePtr node_;
+ const schema::PrimitiveNode* primitive_node_;
+
+ int16_t max_definition_level_;
+ int16_t max_repetition_level_;
+};
+
+// Container for the converted Parquet schema, with computed information from
+// the schema analysis needed for file reading
+//
+// * Column index to Node
+// * Max repetition / definition levels for each primitive node
+//
+// The ColumnDescriptor objects produced by this class can be used to assist in
+// the reconstruction of fully materialized data structures from the
+// repetition-definition level encoding of nested data
+//
+// TODO(wesm): this object can be recomputed from a Schema
+class PARQUET_EXPORT SchemaDescriptor {
+ public:
+ SchemaDescriptor() {}
+ ~SchemaDescriptor() {}
+
+ // Analyze the schema
+ void Init(std::unique_ptr<schema::Node> schema);
+ void Init(schema::NodePtr schema);
+
+ const ColumnDescriptor* Column(int i) const;
+
+ // Get the index of a column by its dotstring path, or negative value if not found.
+ // If several columns share the same dotstring path, it is unspecified which one
+ // is returned.
+ int ColumnIndex(const std::string& node_path) const;
+ // Get the index of a column by its node, or negative value if not found.
+ int ColumnIndex(const schema::Node& node) const;
+
+ bool Equals(const SchemaDescriptor& other) const;
+
+ // The number of physical columns appearing in the file
+ int num_columns() const { return static_cast<int>(leaves_.size()); }
+
+ const schema::NodePtr& schema_root() const { return schema_; }
+
+ const schema::GroupNode* group_node() const { return group_node_; }
+
+ // Returns the root node (child of the schema root) of the i-th leaf (column) node
+ const schema::Node* GetColumnRoot(int i) const;
+
+ const std::string& name() const { return group_node_->name(); }
+
+ std::string ToString() const;
+
+ void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
+
+ /// \brief Return column index corresponding to a particular
+ /// PrimitiveNode. Returns -1 if not found
+ int GetColumnIndex(const schema::PrimitiveNode& node) const;
+
+ /// \brief Return true if any field or their children have REPEATED repetition
+ /// type
+ bool HasRepeatedFields() const;
+
+ private:
+ friend class ColumnDescriptor;
+
+ // Root Node of the schema tree
+ schema::NodePtr schema_;
+ // Root Node, downcast to GroupNode
+ const schema::GroupNode* group_node_;
+
+ void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level, const schema::NodePtr& base);
+
+ // Result of leaf node / tree analysis
+ std::vector<ColumnDescriptor> leaves_;
+
+ std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
+
+ // Mapping between leaf nodes and root group of leaf (first node
+ // below the schema's root group)
+ //
+ // For example, the leaf `a.b.c.d` would have a link back to `a`
+ //
+ // -- a <------
+ // -- -- b |
+ // -- -- -- c |
+ // -- -- -- -- d
+ std::unordered_map<int, schema::NodePtr> leaf_to_base_;
+
+ // Mapping between ColumnPath DotString to the leaf index
+ std::unordered_multimap<std::string, int> leaf_to_idx_;
+};
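+
+// For illustration, typical SchemaDescriptor usage given a root GroupNode such
+// as the one sketched after the factory functions above (error handling
+// elided):
+//
+//   SchemaDescriptor descr;
+//   descr.Init(root);  // root: schema::NodePtr to the schema's root group
+//   for (int i = 0; i < descr.num_columns(); ++i) {
+//     const ColumnDescriptor* col = descr.Column(i);
+//     // col->max_definition_level(), col->max_repetition_level(), ...
+//   }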
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
index c0cfffc87e2..42102884bb0 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
@@ -1,54 +1,54 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Non-public Thrift schema serialization utilities
-
-#pragma once
-
-#include <memory>
-#include <vector>
-
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-namespace format {
-class SchemaElement;
-}
-
-namespace schema {
-
-// ----------------------------------------------------------------------
-// Conversion from Parquet Thrift metadata
-
-PARQUET_EXPORT
-std::shared_ptr<SchemaDescriptor> FromParquet(
- const std::vector<format::SchemaElement>& schema);
-
-PARQUET_EXPORT
-std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length);
-
-// ----------------------------------------------------------------------
-// Conversion to Parquet Thrift metadata
-
-PARQUET_EXPORT
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
-
-} // namespace schema
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Non-public Thrift schema serialization utilities
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace format {
+class SchemaElement;
+}
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Conversion from Parquet Thrift metadata
+
+PARQUET_EXPORT
+std::shared_ptr<SchemaDescriptor> FromParquet(
+ const std::vector<format::SchemaElement>& schema);
+
+PARQUET_EXPORT
+std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length);
+
+// ----------------------------------------------------------------------
+// Conversion to Parquet Thrift metadata
+
+PARQUET_EXPORT
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
+
+} // namespace schema
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
index 72341590e75..3b037ac74bf 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
@@ -1,885 +1,885 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/statistics.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <type_traits>
-#include <utility>
-
-#include "arrow/array.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/encoding.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-using arrow::default_memory_pool;
-using arrow::MemoryPool;
-using arrow::internal::checked_cast;
-using arrow::util::SafeCopy;
-
-namespace parquet {
-namespace {
-
-// ----------------------------------------------------------------------
-// Comparator implementations
-
-constexpr int value_length(int value_length, const ByteArray& value) { return value.len; }
-constexpr int value_length(int type_length, const FLBA& value) { return type_length; }
-
-template <typename DType, bool is_signed>
-struct CompareHelper {
- using T = typename DType::c_type;
-
- static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
- "T is an unsigned numeric");
-
- constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
- constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }
-
- // MSVC17 workaround: std::isnan is not overloaded for integral types,
- // as the C++11 standard requires.
- template <typename T1 = T>
- static ::arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val,
- T fallback) {
- return std::isnan(val) ? fallback : val;
- }
-
- template <typename T1 = T>
- static ::arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce(
- T val, T fallback) {
- return val;
- }
-
- static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }
-
- static T Min(int type_length, T a, T b) { return a < b ? a : b; }
- static T Max(int type_length, T a, T b) { return a < b ? b : a; }
-};
-
-template <typename DType>
-struct UnsignedCompareHelperBase {
- using T = typename DType::c_type;
- using UCType = typename std::make_unsigned<T>::type;
-
- static_assert(!std::is_same<T, UCType>::value, "T is unsigned");
- static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");
-
- // NOTE: according to the C++ spec, unsigned-to-signed conversion is
- // implementation-defined if the original value does not fit in the signed type
- // (i.e., two's complement cannot be assumed even on mainstream machines,
- // because the compiler may decide otherwise). Hence the use of `SafeCopy`
- // below for deterministic bit-casting.
- // (see "Integer conversions" in
- // https://en.cppreference.com/w/cpp/language/implicit_conversion)
-
- static const T DefaultMin() { return SafeCopy<T>(std::numeric_limits<UCType>::max()); }
- static const T DefaultMax() { return 0; }
-
- static T Coalesce(T val, T fallback) { return val; }
-
- static bool Compare(int type_length, T a, T b) {
- return SafeCopy<UCType>(a) < SafeCopy<UCType>(b);
- }
-
- static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; }
- static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; }
-};
-
-template <>
-struct CompareHelper<Int32Type, false> : public UnsignedCompareHelperBase<Int32Type> {};
-
-template <>
-struct CompareHelper<Int64Type, false> : public UnsignedCompareHelperBase<Int64Type> {};
-
-template <bool is_signed>
-struct CompareHelper<Int96Type, is_signed> {
- using T = typename Int96Type::c_type;
- using msb_type = typename std::conditional<is_signed, int32_t, uint32_t>::type;
-
- static T DefaultMin() {
- uint32_t kMsbMax = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::max());
- uint32_t kMax = std::numeric_limits<uint32_t>::max();
- return {kMax, kMax, kMsbMax};
- }
- static T DefaultMax() {
- uint32_t kMsbMin = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::min());
- uint32_t kMin = std::numeric_limits<uint32_t>::min();
- return {kMin, kMin, kMsbMin};
- }
- static T Coalesce(T val, T fallback) { return val; }
-
- static inline bool Compare(int type_length, const T& a, const T& b) {
- if (a.value[2] != b.value[2]) {
- // Only the most significant word is compared using signed comparison.
- // For little-endian Int96, this is the last 32-bit word.
- return SafeCopy<msb_type>(a.value[2]) < SafeCopy<msb_type>(b.value[2]);
- } else if (a.value[1] != b.value[1]) {
- return (a.value[1] < b.value[1]);
- }
- return (a.value[0] < b.value[0]);
- }
-
- static T Min(int type_length, const T& a, const T& b) {
- return Compare(0, a, b) ? a : b;
- }
- static T Max(int type_length, const T& a, const T& b) {
- return Compare(0, a, b) ? b : a;
- }
-};
-
-template <typename T, bool is_signed>
-struct BinaryLikeComparer {};
-
-template <typename T>
-struct BinaryLikeComparer<T, /*is_signed=*/false> {
- static bool Compare(int type_length, const T& a, const T& b) {
- int a_length = value_length(type_length, a);
- int b_length = value_length(type_length, b);
- // Unsigned comparison is used for non-numeric types, so straight
- // lexicographic comparison makes sense (a.ptr is always unsigned).
- return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length);
- }
-};
-
-template <typename T>
-struct BinaryLikeComparer<T, /*is_signed=*/true> {
- static bool Compare(int type_length, const T& a, const T& b) {
- // Signed comparison is used for integers encoded as big-endian
- // two's complement (e.g. decimals).
- int a_length = value_length(type_length, a);
- int b_length = value_length(type_length, b);
-
- // At least one of the lengths is zero.
- if (a_length == 0 || b_length == 0) {
- return a_length == 0 && b_length > 0;
- }
-
- int8_t first_a = *a.ptr;
- int8_t first_b = *b.ptr;
- // We can short-circuit for differently signed numbers or
- // for equal-length byte arrays that have different first bytes.
- // The equal-length requirement is necessary for sign extension cases:
- // e.g. 0xFF80 should be equal to 0x80 (due to big-endian sign extension).
- if ((0x80 & first_a) != (0x80 & first_b) ||
- (a_length == b_length && first_a != first_b)) {
- return first_a < first_b;
- }
- // When the lengths are unequal and the numbers are of the same
- // sign we need to do comparison by sign extending the shorter
- // value first, and once we get to equal sized arrays, lexicographical
- // unsigned comparison of everything but the first byte is sufficient.
- const uint8_t* a_start = a.ptr;
- const uint8_t* b_start = b.ptr;
- if (a_length != b_length) {
- const uint8_t* lead_start = nullptr;
- const uint8_t* lead_end = nullptr;
- if (a_length > b_length) {
- int lead_length = a_length - b_length;
- lead_start = a.ptr;
- lead_end = a.ptr + lead_length;
- a_start += lead_length;
- } else {
- DCHECK_LT(a_length, b_length);
- int lead_length = b_length - a_length;
- lead_start = b.ptr;
- lead_end = b.ptr + lead_length;
- b_start += lead_length;
- }
- // Compare extra bytes to the sign extension of the first
- // byte of the other number.
- uint8_t extension = first_a < 0 ? 0xFF : 0;
- bool not_equal = std::any_of(lead_start, lead_end,
- [extension](uint8_t a) { return extension != a; });
- if (not_equal) {
- // Since sign-extension bytes are extreme values for unsigned bytes:
- //
- // Four cases exist:
- // negative values:
- // b is the longer value.
- // b must be the lesser value: return false
- // else:
- // a must be the lesser value: return true
- //
- // positive values:
- // b is the longer value.
- // values in b must be greater than a: return true
- // else:
- // values in a must be greater than b: return false
- bool negative_values = first_a < 0;
- bool b_longer = a_length < b_length;
- return negative_values != b_longer;
- }
- } else {
- a_start++;
- b_start++;
- }
- return std::lexicographical_compare(a_start, a.ptr + a_length, b_start,
- b.ptr + b_length);
- }
-};
-
-template <typename DType, bool is_signed>
-struct BinaryLikeCompareHelperBase {
- using T = typename DType::c_type;
-
- static T DefaultMin() { return {}; }
- static T DefaultMax() { return {}; }
- static T Coalesce(T val, T fallback) { return val; }
-
- static inline bool Compare(int type_length, const T& a, const T& b) {
- return BinaryLikeComparer<T, is_signed>::Compare(type_length, a, b);
- }
- static T Min(int type_length, const T& a, const T& b) {
- if (a.ptr == nullptr) return b;
- if (b.ptr == nullptr) return a;
- return Compare(type_length, a, b) ? a : b;
- }
-
- static T Max(int type_length, const T& a, const T& b) {
- if (a.ptr == nullptr) return b;
- if (b.ptr == nullptr) return a;
- return Compare(type_length, a, b) ? b : a;
- }
-};
-
-template <bool is_signed>
-struct CompareHelper<ByteArrayType, is_signed>
- : public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {};
-
-template <bool is_signed>
-struct CompareHelper<FLBAType, is_signed>
- : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
-
-using ::arrow::util::optional;
-
-template <typename T>
-::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
-CleanStatistic(std::pair<T, T> min_max) {
- return min_max;
-}
-
-// In case of floating point types, the following rules are applied (as per
-// upstream parquet-mr):
-// - If any of min/max is NaN, return nothing.
-// - If min is 0.0f, replace with -0.0f
-// - If max is -0.0f, replace with 0.0f
-template <typename T>
-::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>>
-CleanStatistic(std::pair<T, T> min_max) {
- T min = min_max.first;
- T max = min_max.second;
-
- // Ignore if either value is NaN.
- if (std::isnan(min) || std::isnan(max)) {
- return ::arrow::util::nullopt;
- }
-
- if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) {
- return ::arrow::util::nullopt;
- }
-
- T zero{};
-
- if (min == zero && !std::signbit(min)) {
- min = -min;
- }
-
- if (max == zero && std::signbit(max)) {
- max = -max;
- }
-
- return {{min, max}};
-}
-
-optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max) {
- if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
- return ::arrow::util::nullopt;
- }
- return min_max;
-}
-
-optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
- std::pair<ByteArray, ByteArray> min_max) {
- if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
- return ::arrow::util::nullopt;
- }
- return min_max;
-}
-
-template <bool is_signed, typename DType>
-class TypedComparatorImpl : virtual public TypedComparator<DType> {
- public:
- using T = typename DType::c_type;
- using Helper = CompareHelper<DType, is_signed>;
-
- explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}
-
- bool CompareInline(const T& a, const T& b) const {
- return Helper::Compare(type_length_, a, b);
- }
-
- bool Compare(const T& a, const T& b) override { return CompareInline(a, b); }
-
- std::pair<T, T> GetMinMax(const T* values, int64_t length) override {
- DCHECK_GT(length, 0);
-
- T min = Helper::DefaultMin();
- T max = Helper::DefaultMax();
-
- for (int64_t i = 0; i < length; i++) {
- auto val = values[i];
- min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin()));
- max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax()));
- }
-
- return {min, max};
- }
-
- std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- DCHECK_GT(length, 0);
-
- T min = Helper::DefaultMin();
- T max = Helper::DefaultMax();
-
- ::arrow::internal::VisitSetBitRunsVoid(
- valid_bits, valid_bits_offset, length, [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; i++) {
- const auto val = values[i + position];
- min = Helper::Min(type_length_, min,
- Helper::Coalesce(val, Helper::DefaultMin()));
- max = Helper::Max(type_length_, max,
- Helper::Coalesce(val, Helper::DefaultMax()));
- }
- });
-
- return {min, max};
- }
-
- std::pair<T, T> GetMinMax(const ::arrow::Array& values) override;
-
- private:
- int type_length_;
-};
-
-// ARROW-11675: A hand-written version of GetMinMax(), to work around
-// what looks like an MSVC code generation bug.
-// This does not seem to be required for GetMinMaxSpaced().
-template <>
-std::pair<int32_t, int32_t>
-TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
- int64_t length) {
- DCHECK_GT(length, 0);
-
- const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
- uint32_t min = std::numeric_limits<uint32_t>::max();
- uint32_t max = std::numeric_limits<uint32_t>::lowest();
-
- for (int64_t i = 0; i < length; i++) {
- const auto val = unsigned_values[i];
- min = std::min<uint32_t>(min, val);
- max = std::max<uint32_t>(max, val);
- }
-
- return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
-}
-
-template <bool is_signed, typename DType>
-std::pair<typename DType::c_type, typename DType::c_type>
-TypedComparatorImpl<is_signed, DType>::GetMinMax(const ::arrow::Array& values) {
- ParquetException::NYI(values.type()->ToString());
-}
-
-template <bool is_signed>
-std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper(
- const TypedComparatorImpl<is_signed, ByteArrayType>& comparator,
- const ::arrow::Array& values) {
- using Helper = CompareHelper<ByteArrayType, is_signed>;
-
- ByteArray min = Helper::DefaultMin();
- ByteArray max = Helper::DefaultMax();
- constexpr int type_length = -1;
-
- const auto valid_func = [&](ByteArray val) {
- min = Helper::Min(type_length, val, min);
- max = Helper::Max(type_length, val, max);
- };
- const auto null_func = [&]() {};
-
- if (::arrow::is_binary_like(values.type_id())) {
- ::arrow::VisitArrayDataInline<::arrow::BinaryType>(
- *values.data(), std::move(valid_func), std::move(null_func));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- ::arrow::VisitArrayDataInline<::arrow::LargeBinaryType>(
- *values.data(), std::move(valid_func), std::move(null_func));
- }
-
- return {min, max};
-}
-
-template <>
-std::pair<ByteArray, ByteArray> TypedComparatorImpl<true, ByteArrayType>::GetMinMax(
- const ::arrow::Array& values) {
- return GetMinMaxBinaryHelper<true>(*this, values);
-}
-
-template <>
-std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMinMax(
- const ::arrow::Array& values) {
- return GetMinMaxBinaryHelper<false>(*this, values);
-}
-
-template <typename DType>
-class TypedStatisticsImpl : public TypedStatistics<DType> {
- public:
- using T = typename DType::c_type;
-
- TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
- : descr_(descr),
- pool_(pool),
- min_buffer_(AllocateBuffer(pool_, 0)),
- max_buffer_(AllocateBuffer(pool_, 0)) {
- auto comp = Comparator::Make(descr);
- comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
- Reset();
- has_null_count_ = true;
- has_distinct_count_ = true;
- }
-
- TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
- int64_t distinct_count)
- : pool_(default_memory_pool()),
- min_buffer_(AllocateBuffer(pool_, 0)),
- max_buffer_(AllocateBuffer(pool_, 0)) {
- IncrementNumValues(num_values);
- IncrementNullCount(null_count);
- IncrementDistinctCount(distinct_count);
-
- Copy(min, &min_, min_buffer_.get());
- Copy(max, &max_, max_buffer_.get());
- has_min_max_ = true;
- }
-
- TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
- const std::string& encoded_max, int64_t num_values,
- int64_t null_count, int64_t distinct_count, bool has_min_max,
- bool has_null_count, bool has_distinct_count, MemoryPool* pool)
- : TypedStatisticsImpl(descr, pool) {
- IncrementNumValues(num_values);
- if (has_null_count) {
- IncrementNullCount(null_count);
- }
- if (has_distinct_count) {
- IncrementDistinctCount(distinct_count);
- }
-
- if (!encoded_min.empty()) {
- PlainDecode(encoded_min, &min_);
- }
- if (!encoded_max.empty()) {
- PlainDecode(encoded_max, &max_);
- }
- has_min_max_ = has_min_max;
- }
-
- bool HasDistinctCount() const override { return has_distinct_count_; }
- bool HasMinMax() const override { return has_min_max_; }
- bool HasNullCount() const override { return has_null_count_; }
-
- bool Equals(const Statistics& raw_other) const override {
- if (physical_type() != raw_other.physical_type()) return false;
-
- const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);
-
- if (has_min_max_ != other.has_min_max_) return false;
-
- return (!has_min_max_ || MinMaxEqual(other)) && null_count() == other.null_count() &&
- distinct_count() == other.distinct_count() &&
- num_values() == other.num_values();
- }
-
- bool MinMaxEqual(const TypedStatisticsImpl& other) const;
-
- void Reset() override {
- ResetCounts();
- has_min_max_ = false;
- has_distinct_count_ = false;
- has_null_count_ = false;
- }
-
- void SetMinMax(const T& arg_min, const T& arg_max) override {
- SetMinMaxPair({arg_min, arg_max});
- }
-
- void Merge(const TypedStatistics<DType>& other) override {
- this->num_values_ += other.num_values();
- if (other.HasNullCount()) {
- this->statistics_.null_count += other.null_count();
- }
- if (other.HasDistinctCount()) {
- this->statistics_.distinct_count += other.distinct_count();
- }
- if (other.HasMinMax()) {
- SetMinMax(other.min(), other.max());
- }
- }
-
- void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
- void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_spaced,
- int64_t num_not_null, int64_t num_null) override;
-
- void Update(const ::arrow::Array& values) override {
- IncrementNullCount(values.null_count());
- IncrementNumValues(values.length() - values.null_count());
-
- if (values.null_count() == values.length()) {
- return;
- }
-
- SetMinMaxPair(comparator_->GetMinMax(values));
- }
-
- const T& min() const override { return min_; }
-
- const T& max() const override { return max_; }
-
- Type::type physical_type() const override { return descr_->physical_type(); }
-
- const ColumnDescriptor* descr() const override { return descr_; }
-
- std::string EncodeMin() const override {
- std::string s;
- if (HasMinMax()) this->PlainEncode(min_, &s);
- return s;
- }
-
- std::string EncodeMax() const override {
- std::string s;
- if (HasMinMax()) this->PlainEncode(max_, &s);
- return s;
- }
-
- EncodedStatistics Encode() override {
- EncodedStatistics s;
- if (HasMinMax()) {
- s.set_min(this->EncodeMin());
- s.set_max(this->EncodeMax());
- }
- if (HasNullCount()) {
- s.set_null_count(this->null_count());
- }
- return s;
- }
-
- int64_t null_count() const override { return statistics_.null_count; }
- int64_t distinct_count() const override { return statistics_.distinct_count; }
- int64_t num_values() const override { return num_values_; }
-
- private:
- const ColumnDescriptor* descr_;
- bool has_min_max_ = false;
- bool has_null_count_ = false;
- bool has_distinct_count_ = false;
- T min_;
- T max_;
- ::arrow::MemoryPool* pool_;
- int64_t num_values_ = 0;
- EncodedStatistics statistics_;
- std::shared_ptr<TypedComparator<DType>> comparator_;
- std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
-
- void PlainEncode(const T& src, std::string* dst) const;
- void PlainDecode(const std::string& src, T* dst) const;
-
- void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }
-
- void IncrementNullCount(int64_t n) {
- statistics_.null_count += n;
- has_null_count_ = true;
- }
-
- void IncrementNumValues(int64_t n) { num_values_ += n; }
-
- void IncrementDistinctCount(int64_t n) {
- statistics_.distinct_count += n;
- has_distinct_count_ = true;
- }
-
- void ResetCounts() {
- this->statistics_.null_count = 0;
- this->statistics_.distinct_count = 0;
- this->num_values_ = 0;
- }
-
- void SetMinMaxPair(std::pair<T, T> min_max) {
- // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
- auto maybe_min_max = CleanStatistic(min_max);
- if (!maybe_min_max) return;
-
- auto min = maybe_min_max.value().first;
- auto max = maybe_min_max.value().second;
-
- if (!has_min_max_) {
- has_min_max_ = true;
- Copy(min, &min_, min_buffer_.get());
- Copy(max, &max_, max_buffer_.get());
- } else {
- Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get());
- Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get());
- }
- }
-};
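-
-// For illustration, a typical lifecycle of the implementation above through
-// the public Statistics factory defined below; `descr`, `values` and `n` are
-// placeholders, and the memory pool argument is assumed to default:
-//
-//   std::shared_ptr<Statistics> stats = Statistics::Make(descr);
-//   auto typed = std::static_pointer_cast<TypedStatistics<Int64Type>>(stats);
-//   typed->Update(values, /*num_not_null=*/n, /*num_null=*/0);
-//   EncodedStatistics encoded = typed->Encode();  // carries min/max/null_count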
-
-template <>
-inline bool TypedStatisticsImpl<FLBAType>::MinMaxEqual(
- const TypedStatisticsImpl<FLBAType>& other) const {
- uint32_t len = descr_->type_length();
- return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 &&
- std::memcmp(max_.ptr, other.max_.ptr, len) == 0;
-}
-
-template <typename DType>
-bool TypedStatisticsImpl<DType>::MinMaxEqual(
- const TypedStatisticsImpl<DType>& other) const {
- return min_ == other.min_ && max_ == other.max_;
-}
-
-template <>
-inline void TypedStatisticsImpl<FLBAType>::Copy(const FLBA& src, FLBA* dst,
- ResizableBuffer* buffer) {
- if (dst->ptr == src.ptr) return;
- uint32_t len = descr_->type_length();
- PARQUET_THROW_NOT_OK(buffer->Resize(len, false));
- std::memcpy(buffer->mutable_data(), src.ptr, len);
- *dst = FLBA(buffer->data());
-}
-
-template <>
-inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteArray* dst,
- ResizableBuffer* buffer) {
- if (dst->ptr == src.ptr) return;
- PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
- std::memcpy(buffer->mutable_data(), src.ptr, src.len);
- *dst = ByteArray(src.len, buffer->data());
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
- int64_t num_null) {
- DCHECK_GE(num_not_null, 0);
- DCHECK_GE(num_null, 0);
-
- IncrementNullCount(num_null);
- IncrementNumValues(num_not_null);
-
- if (num_not_null == 0) return;
- SetMinMaxPair(comparator_->GetMinMax(values, num_not_null));
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- int64_t num_not_null, int64_t num_null) {
- DCHECK_GE(num_not_null, 0);
- DCHECK_GE(num_null, 0);
-
- IncrementNullCount(num_null);
- IncrementNumValues(num_not_null);
-
- if (num_not_null == 0) return;
-
- int64_t length = num_null + num_not_null;
- SetMinMaxPair(
- comparator_->GetMinMaxSpaced(values, length, valid_bits, valid_bits_offset));
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) const {
- auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, pool_);
- encoder->Put(&src, 1);
- auto buffer = encoder->FlushValues();
- auto ptr = reinterpret_cast<const char*>(buffer->data());
- dst->assign(ptr, buffer->size());
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) const {
- auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
- decoder->SetData(1, reinterpret_cast<const uint8_t*>(src.c_str()),
- static_cast<int>(src.size()));
- decoder->Decode(dst, 1);
-}
-
-template <>
-void TypedStatisticsImpl<ByteArrayType>::PlainEncode(const T& src,
- std::string* dst) const {
- dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
-}
-
-template <>
-void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
- T* dst) const {
- dst->len = static_cast<uint32_t>(src.size());
- dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// Public factory functions
-
-std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
- SortOrder::type sort_order,
- int type_length) {
- if (SortOrder::SIGNED == sort_order) {
- switch (physical_type) {
- case Type::BOOLEAN:
- return std::make_shared<TypedComparatorImpl<true, BooleanType>>();
- case Type::INT32:
- return std::make_shared<TypedComparatorImpl<true, Int32Type>>();
- case Type::INT64:
- return std::make_shared<TypedComparatorImpl<true, Int64Type>>();
- case Type::INT96:
- return std::make_shared<TypedComparatorImpl<true, Int96Type>>();
- case Type::FLOAT:
- return std::make_shared<TypedComparatorImpl<true, FloatType>>();
- case Type::DOUBLE:
- return std::make_shared<TypedComparatorImpl<true, DoubleType>>();
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
- default:
- ParquetException::NYI("Signed Compare not implemented");
- }
- } else if (SortOrder::UNSIGNED == sort_order) {
- switch (physical_type) {
- case Type::INT32:
- return std::make_shared<TypedComparatorImpl<false, Int32Type>>();
- case Type::INT64:
- return std::make_shared<TypedComparatorImpl<false, Int64Type>>();
- case Type::INT96:
- return std::make_shared<TypedComparatorImpl<false, Int96Type>>();
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<false, ByteArrayType>>();
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<false, FLBAType>>(type_length);
- default:
- ParquetException::NYI("Unsigned Compare not implemented");
- }
- } else {
- throw ParquetException("UNKNOWN Sort Order");
- }
- return nullptr;
-}
-
-std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
- return Make(descr->physical_type(), descr->sort_order(), descr->type_length());
-}
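-
-// Usage sketch for the factories above; the values are illustrative and the
-// trailing type_length is only meaningful for FIXED_LEN_BYTE_ARRAY:
-//
-//   auto comp = Comparator::Make(Type::INT32, SortOrder::UNSIGNED,
-//                                /*type_length=*/-1);
-//   auto typed = std::static_pointer_cast<TypedComparator<Int32Type>>(comp);
-//   bool lt = typed->Compare(-1, 1);  // false: 0xFFFFFFFF > 1 when unsigned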
-
-std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool) {
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
- case Type::INT32:
- return std::make_shared<TypedStatisticsImpl<Int32Type>>(descr, pool);
- case Type::INT64:
- return std::make_shared<TypedStatisticsImpl<Int64Type>>(descr, pool);
- case Type::FLOAT:
- return std::make_shared<TypedStatisticsImpl<FloatType>>(descr, pool);
- case Type::DOUBLE:
- return std::make_shared<TypedStatisticsImpl<DoubleType>>(descr, pool);
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedStatisticsImpl<ByteArrayType>>(descr, pool);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedStatisticsImpl<FLBAType>>(descr, pool);
- default:
- ParquetException::NYI("Statistics not implemented");
- }
-}
-
-std::shared_ptr<Statistics> Statistics::Make(Type::type physical_type, const void* min,
- const void* max, int64_t num_values,
- int64_t null_count, int64_t distinct_count) {
-#define MAKE_STATS(CAP_TYPE, KLASS) \
- case Type::CAP_TYPE: \
- return std::make_shared<TypedStatisticsImpl<KLASS>>( \
- *reinterpret_cast<const typename KLASS::c_type*>(min), \
- *reinterpret_cast<const typename KLASS::c_type*>(max), num_values, null_count, \
- distinct_count)
-
- switch (physical_type) {
- MAKE_STATS(BOOLEAN, BooleanType);
- MAKE_STATS(INT32, Int32Type);
- MAKE_STATS(INT64, Int64Type);
- MAKE_STATS(FLOAT, FloatType);
- MAKE_STATS(DOUBLE, DoubleType);
- MAKE_STATS(BYTE_ARRAY, ByteArrayType);
- MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
- default:
- break;
- }
-#undef MAKE_STATS
- DCHECK(false) << "Cannot reach here";
- return nullptr;
-}
-
-std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
- const std::string& encoded_min,
- const std::string& encoded_max,
- int64_t num_values, int64_t null_count,
- int64_t distinct_count, bool has_min_max,
- bool has_null_count, bool has_distinct_count,
- ::arrow::MemoryPool* pool) {
-#define MAKE_STATS(CAP_TYPE, KLASS) \
- case Type::CAP_TYPE: \
- return std::make_shared<TypedStatisticsImpl<KLASS>>( \
- descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
- has_min_max, has_null_count, has_distinct_count, pool)
-
- switch (descr->physical_type()) {
- MAKE_STATS(BOOLEAN, BooleanType);
- MAKE_STATS(INT32, Int32Type);
- MAKE_STATS(INT64, Int64Type);
- MAKE_STATS(FLOAT, FloatType);
- MAKE_STATS(DOUBLE, DoubleType);
- MAKE_STATS(BYTE_ARRAY, ByteArrayType);
- MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
- default:
- break;
- }
-#undef MAKE_STATS
- DCHECK(false) << "Cannot reach here";
- return nullptr;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/statistics.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/encoding.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+using arrow::default_memory_pool;
+using arrow::MemoryPool;
+using arrow::internal::checked_cast;
+using arrow::util::SafeCopy;
+
+namespace parquet {
+namespace {
+
+// ----------------------------------------------------------------------
+// Comparator implementations
+
+constexpr int value_length(int value_length, const ByteArray& value) { return value.len; }
+constexpr int value_length(int type_length, const FLBA& value) { return type_length; }
+
+template <typename DType, bool is_signed>
+struct CompareHelper {
+ using T = typename DType::c_type;
+
+ static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
+ "T is an unsigned numeric");
+
+ constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
+ constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }
+
+ // MSVC17 workaround: std::isnan is not overloaded for integral types,
+ // as the C++11 standard requires.
+ template <typename T1 = T>
+ static ::arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val,
+ T fallback) {
+ return std::isnan(val) ? fallback : val;
+ }
+
+ template <typename T1 = T>
+ static ::arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce(
+ T val, T fallback) {
+ return val;
+ }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }
+
+ static T Min(int type_length, T a, T b) { return a < b ? a : b; }
+ static T Max(int type_length, T a, T b) { return a < b ? b : a; }
+};
+
+template <typename DType>
+struct UnsignedCompareHelperBase {
+ using T = typename DType::c_type;
+ using UCType = typename std::make_unsigned<T>::type;
+
+ static_assert(!std::is_same<T, UCType>::value, "T is unsigned");
+ static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");
+
+ // NOTE: according to the C++ spec, unsigned-to-signed conversion is
+ // implementation-defined if the original value does not fit in the signed type
+ // (i.e., two's complement cannot be assumed even on mainstream machines,
+ // because the compiler may decide otherwise). Hence the use of `SafeCopy`
+ // below for deterministic bit-casting.
+ // (see "Integer conversions" in
+ // https://en.cppreference.com/w/cpp/language/implicit_conversion)
+
+ static const T DefaultMin() { return SafeCopy<T>(std::numeric_limits<UCType>::max()); }
+ static const T DefaultMax() { return 0; }
+
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static bool Compare(int type_length, T a, T b) {
+ return SafeCopy<UCType>(a) < SafeCopy<UCType>(b);
+ }
+
+ static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; }
+ static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; }
+};
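+
+// Worked example of the bit-cast comparison above, with T = int32_t:
+// a = -1 has bit pattern 0xFFFFFFFF and b = 1 has 0x00000001. Signed order
+// gives a < b, but SafeCopy<uint32_t>(a) == 4294967295 > 1, so under the
+// unsigned sort order Compare(-1, 1) is false.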
+
+template <>
+struct CompareHelper<Int32Type, false> : public UnsignedCompareHelperBase<Int32Type> {};
+
+template <>
+struct CompareHelper<Int64Type, false> : public UnsignedCompareHelperBase<Int64Type> {};
+
+template <bool is_signed>
+struct CompareHelper<Int96Type, is_signed> {
+ using T = typename Int96Type::c_type;
+ using msb_type = typename std::conditional<is_signed, int32_t, uint32_t>::type;
+
+ static T DefaultMin() {
+ uint32_t kMsbMax = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::max());
+ uint32_t kMax = std::numeric_limits<uint32_t>::max();
+ return {kMax, kMax, kMsbMax};
+ }
+ static T DefaultMax() {
+ uint32_t kMsbMin = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::min());
+ uint32_t kMin = std::numeric_limits<uint32_t>::min();
+ return {kMin, kMin, kMsbMin};
+ }
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) {
+ if (a.value[2] != b.value[2]) {
+ // Only the most significant word is compared using signed comparison.
+ // For little-endian Int96, this is the last 32-bit word.
+ return SafeCopy<msb_type>(a.value[2]) < SafeCopy<msb_type>(b.value[2]);
+ } else if (a.value[1] != b.value[1]) {
+ return (a.value[1] < b.value[1]);
+ }
+ return (a.value[0] < b.value[0]);
+ }
+
+ static T Min(int type_length, const T& a, const T& b) {
+ return Compare(0, a, b) ? a : b;
+ }
+ static T Max(int type_length, const T& a, const T& b) {
+ return Compare(0, a, b) ? b : a;
+ }
+};
+
+template <typename T, bool is_signed>
+struct BinaryLikeComparer {};
+
+template <typename T>
+struct BinaryLikeComparer<T, /*is_signed=*/false> {
+ static bool Compare(int type_length, const T& a, const T& b) {
+ int a_length = value_length(type_length, a);
+ int b_length = value_length(type_length, b);
+ // Unsigned comparison is used for non-numeric types, so straight
+ // lexicographic comparison makes sense (a.ptr is always unsigned).
+ return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length);
+ }
+};
+
+template <typename T>
+struct BinaryLikeComparer<T, /*is_signed=*/true> {
+ static bool Compare(int type_length, const T& a, const T& b) {
+ // Signed comparison is used for integers encoded as big-endian
+ // two's complement (e.g. decimals).
+ int a_length = value_length(type_length, a);
+ int b_length = value_length(type_length, b);
+
+ // At least one of the lengths is zero.
+ if (a_length == 0 || b_length == 0) {
+ return a_length == 0 && b_length > 0;
+ }
+
+ int8_t first_a = *a.ptr;
+ int8_t first_b = *b.ptr;
+ // We can short-circuit for differently signed numbers or
+ // for equal-length byte arrays that have different first bytes.
+ // The equal-length requirement is necessary for sign extension cases:
+ // e.g. 0xFF80 should be equal to 0x80 (due to big-endian sign extension).
+ if ((0x80 & first_a) != (0x80 & first_b) ||
+ (a_length == b_length && first_a != first_b)) {
+ return first_a < first_b;
+ }
+ // When the lengths are unequal and the numbers are of the same
+ // sign we need to do comparison by sign extending the shorter
+ // value first, and once we get to equal sized arrays, lexicographical
+ // unsigned comparison of everything but the first byte is sufficient.
+ const uint8_t* a_start = a.ptr;
+ const uint8_t* b_start = b.ptr;
+ if (a_length != b_length) {
+ const uint8_t* lead_start = nullptr;
+ const uint8_t* lead_end = nullptr;
+ if (a_length > b_length) {
+ int lead_length = a_length - b_length;
+ lead_start = a.ptr;
+ lead_end = a.ptr + lead_length;
+ a_start += lead_length;
+ } else {
+ DCHECK_LT(a_length, b_length);
+ int lead_length = b_length - a_length;
+ lead_start = b.ptr;
+ lead_end = b.ptr + lead_length;
+ b_start += lead_length;
+ }
+ // Compare extra bytes to the sign extension of the first
+ // byte of the other number.
+ uint8_t extension = first_a < 0 ? 0xFF : 0;
+ bool not_equal = std::any_of(lead_start, lead_end,
+ [extension](uint8_t a) { return extension != a; });
+ if (not_equal) {
+ // Since sign-extension bytes are extreme values for unsigned bytes:
+ //
+ // Four cases exist:
+ // negative values:
+ // b is the longer value.
+ // b must be the lesser value: return false
+ // else:
+ // a must be the lesser value: return true
+ //
+ // positive values:
+ // b is the longer value.
+ // values in b must be greater than a: return true
+ // else:
+ // values in a must be greater than b: return false
+ bool negative_values = first_a < 0;
+ bool b_longer = a_length < b_length;
+ return negative_values != b_longer;
+ }
+ } else {
+ a_start++;
+ b_start++;
+ }
+ return std::lexicographical_compare(a_start, a.ptr + a_length, b_start,
+ b.ptr + b_length);
+ }
+};
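+
+// Worked example (illustrative): under big-endian two's complement,
+// {0x90} (-112) and {0xFF, 0x90} (its sign extension to two bytes) compare
+// equal: the lead byte 0xFF matches the sign extension of 0x90, so only the
+// trailing bytes are compared. For {0x90} (-112) vs {0xFF, 0x10} (-240), the
+// lead byte again matches the extension and the unsigned compare of
+// 0x90 vs 0x10 orders -240 below -112 as required.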
+
+template <typename DType, bool is_signed>
+struct BinaryLikeCompareHelperBase {
+ using T = typename DType::c_type;
+
+ static T DefaultMin() { return {}; }
+ static T DefaultMax() { return {}; }
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) {
+ return BinaryLikeComparer<T, is_signed>::Compare(type_length, a, b);
+ }
+ static T Min(int type_length, const T& a, const T& b) {
+ if (a.ptr == nullptr) return b;
+ if (b.ptr == nullptr) return a;
+ return Compare(type_length, a, b) ? a : b;
+ }
+
+ static T Max(int type_length, const T& a, const T& b) {
+ if (a.ptr == nullptr) return b;
+ if (b.ptr == nullptr) return a;
+ return Compare(type_length, a, b) ? b : a;
+ }
+};
+
+template <bool is_signed>
+struct CompareHelper<ByteArrayType, is_signed>
+ : public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {};
+
+template <bool is_signed>
+struct CompareHelper<FLBAType, is_signed>
+ : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
+
+using ::arrow::util::optional;
+
+template <typename T>
+::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
+CleanStatistic(std::pair<T, T> min_max) {
+ return min_max;
+}
+
+// In case of floating point types, the following rules are applied (as per
+// upstream parquet-mr):
+// - If any of min/max is NaN, return nothing.
+// - If min is 0.0f, replace with -0.0f
+// - If max is -0.0f, replace with 0.0f
+template <typename T>
+::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>>
+CleanStatistic(std::pair<T, T> min_max) {
+ T min = min_max.first;
+ T max = min_max.second;
+
+ // Ignore if either value is NaN.
+ if (std::isnan(min) || std::isnan(max)) {
+ return ::arrow::util::nullopt;
+ }
+
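+ // min/max were never updated from their defaults (e.g. all values were
+ // NaN and got coalesced away): there is nothing to report.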
+ if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) {
+ return ::arrow::util::nullopt;
+ }
+
+ T zero{};
+
+ if (min == zero && !std::signbit(min)) {
+ min = -min;
+ }
+
+ if (max == zero && std::signbit(max)) {
+ max = -max;
+ }
+
+ return {{min, max}};
+}
+
+optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max) {
+ if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
+ return ::arrow::util::nullopt;
+ }
+ return min_max;
+}
+
+optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
+ std::pair<ByteArray, ByteArray> min_max) {
+ if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
+ return ::arrow::util::nullopt;
+ }
+ return min_max;
+}
+
+template <bool is_signed, typename DType>
+class TypedComparatorImpl : virtual public TypedComparator<DType> {
+ public:
+ using T = typename DType::c_type;
+ using Helper = CompareHelper<DType, is_signed>;
+
+ explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}
+
+ bool CompareInline(const T& a, const T& b) const {
+ return Helper::Compare(type_length_, a, b);
+ }
+
+ bool Compare(const T& a, const T& b) override { return CompareInline(a, b); }
+
+ std::pair<T, T> GetMinMax(const T* values, int64_t length) override {
+ DCHECK_GT(length, 0);
+
+ T min = Helper::DefaultMin();
+ T max = Helper::DefaultMax();
+
+ for (int64_t i = 0; i < length; i++) {
+ auto val = values[i];
+ min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin()));
+ max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax()));
+ }
+
+ return {min, max};
+ }
+
+ std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ DCHECK_GT(length, 0);
+
+ T min = Helper::DefaultMin();
+ T max = Helper::DefaultMax();
+
+ ::arrow::internal::VisitSetBitRunsVoid(
+ valid_bits, valid_bits_offset, length, [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ const auto val = values[i + position];
+ min = Helper::Min(type_length_, min,
+ Helper::Coalesce(val, Helper::DefaultMin()));
+ max = Helper::Max(type_length_, max,
+ Helper::Coalesce(val, Helper::DefaultMax()));
+ }
+ });
+
+ return {min, max};
+ }
+
+ std::pair<T, T> GetMinMax(const ::arrow::Array& values) override;
+
+ private:
+ int type_length_;
+};
+
+// ARROW-11675: A hand-written version of GetMinMax(), to work around
+// what looks like a MSVC code generation bug.
+// This does not seem to be required for GetMinMaxSpaced().
+template <>
+std::pair<int32_t, int32_t>
+TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
+ int64_t length) {
+ DCHECK_GT(length, 0);
+
+ const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
+ uint32_t min = std::numeric_limits<uint32_t>::max();
+ uint32_t max = std::numeric_limits<uint32_t>::lowest();
+
+ for (int64_t i = 0; i < length; i++) {
+ const auto val = unsigned_values[i];
+ min = std::min<uint32_t>(min, val);
+ max = std::max<uint32_t>(max, val);
+ }
+
+ return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
+}
+
+template <bool is_signed, typename DType>
+std::pair<typename DType::c_type, typename DType::c_type>
+TypedComparatorImpl<is_signed, DType>::GetMinMax(const ::arrow::Array& values) {
+ ParquetException::NYI(values.type()->ToString());
+}
+
+template <bool is_signed>
+std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper(
+ const TypedComparatorImpl<is_signed, ByteArrayType>& comparator,
+ const ::arrow::Array& values) {
+ using Helper = CompareHelper<ByteArrayType, is_signed>;
+
+ ByteArray min = Helper::DefaultMin();
+ ByteArray max = Helper::DefaultMax();
+ constexpr int type_length = -1;
+
+ const auto valid_func = [&](ByteArray val) {
+ min = Helper::Min(type_length, val, min);
+ max = Helper::Max(type_length, val, max);
+ };
+ const auto null_func = [&]() {};
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ ::arrow::VisitArrayDataInline<::arrow::BinaryType>(
+ *values.data(), std::move(valid_func), std::move(null_func));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ ::arrow::VisitArrayDataInline<::arrow::LargeBinaryType>(
+ *values.data(), std::move(valid_func), std::move(null_func));
+ }
+
+ return {min, max};
+}
+
+template <>
+std::pair<ByteArray, ByteArray> TypedComparatorImpl<true, ByteArrayType>::GetMinMax(
+ const ::arrow::Array& values) {
+ return GetMinMaxBinaryHelper<true>(*this, values);
+}
+
+template <>
+std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMinMax(
+ const ::arrow::Array& values) {
+ return GetMinMaxBinaryHelper<false>(*this, values);
+}
+
+template <typename DType>
+class TypedStatisticsImpl : public TypedStatistics<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
+ : descr_(descr),
+ pool_(pool),
+ min_buffer_(AllocateBuffer(pool_, 0)),
+ max_buffer_(AllocateBuffer(pool_, 0)) {
+ auto comp = Comparator::Make(descr);
+ comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
+ Reset();
+ has_null_count_ = true;
+ has_distinct_count_ = true;
+ }
+
+ TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count)
+ : pool_(default_memory_pool()),
+ min_buffer_(AllocateBuffer(pool_, 0)),
+ max_buffer_(AllocateBuffer(pool_, 0)) {
+ IncrementNumValues(num_values);
+ IncrementNullCount(null_count);
+ IncrementDistinctCount(distinct_count);
+
+ Copy(min, &min_, min_buffer_.get());
+ Copy(max, &max_, max_buffer_.get());
+ has_min_max_ = true;
+ }
+
+ TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count, bool has_min_max,
+ bool has_null_count, bool has_distinct_count, MemoryPool* pool)
+ : TypedStatisticsImpl(descr, pool) {
+ IncrementNumValues(num_values);
+ // Honor the supplied flags: the delegated constructor defaulted both
+ // has_null_count_ and has_distinct_count_ to true.
+ if (has_null_count) {
+ IncrementNullCount(null_count);
+ } else {
+ has_null_count_ = false;
+ }
+ if (has_distinct_count) {
+ IncrementDistinctCount(distinct_count);
+ } else {
+ has_distinct_count_ = false;
+ }
+
+ if (!encoded_min.empty()) {
+ PlainDecode(encoded_min, &min_);
+ }
+ if (!encoded_max.empty()) {
+ PlainDecode(encoded_max, &max_);
+ }
+ has_min_max_ = has_min_max;
+ }
+
+ bool HasDistinctCount() const override { return has_distinct_count_; }
+ bool HasMinMax() const override { return has_min_max_; }
+ bool HasNullCount() const override { return has_null_count_; }
+
+ bool Equals(const Statistics& raw_other) const override {
+ if (physical_type() != raw_other.physical_type()) return false;
+
+ const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);
+
+ if (has_min_max_ != other.has_min_max_) return false;
+
+ return (!has_min_max_ || MinMaxEqual(other)) && null_count() == other.null_count() &&
+ distinct_count() == other.distinct_count() &&
+ num_values() == other.num_values();
+ }
+
+ bool MinMaxEqual(const TypedStatisticsImpl& other) const;
+
+ void Reset() override {
+ ResetCounts();
+ has_min_max_ = false;
+ has_distinct_count_ = false;
+ has_null_count_ = false;
+ }
+
+ void SetMinMax(const T& arg_min, const T& arg_max) override {
+ SetMinMaxPair({arg_min, arg_max});
+ }
+
+ void Merge(const TypedStatistics<DType>& other) override {
+ this->num_values_ += other.num_values();
+ if (other.HasNullCount()) {
+ this->statistics_.null_count += other.null_count();
+ }
+ if (other.HasDistinctCount()) {
+ this->statistics_.distinct_count += other.distinct_count();
+ }
+ if (other.HasMinMax()) {
+ SetMinMax(other.min(), other.max());
+ }
+ }
+
+ void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
+ void UpdateSpaced(const T* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t num_not_null, int64_t num_null) override;
+
+ void Update(const ::arrow::Array& values) override {
+ IncrementNullCount(values.null_count());
+ IncrementNumValues(values.length() - values.null_count());
+
+ if (values.null_count() == values.length()) {
+ return;
+ }
+
+ SetMinMaxPair(comparator_->GetMinMax(values));
+ }
+
+ const T& min() const override { return min_; }
+
+ const T& max() const override { return max_; }
+
+ Type::type physical_type() const override { return descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return descr_; }
+
+ std::string EncodeMin() const override {
+ std::string s;
+ if (HasMinMax()) this->PlainEncode(min_, &s);
+ return s;
+ }
+
+ std::string EncodeMax() const override {
+ std::string s;
+ if (HasMinMax()) this->PlainEncode(max_, &s);
+ return s;
+ }
+
+ EncodedStatistics Encode() override {
+ EncodedStatistics s;
+ if (HasMinMax()) {
+ s.set_min(this->EncodeMin());
+ s.set_max(this->EncodeMax());
+ }
+ if (HasNullCount()) {
+ s.set_null_count(this->null_count());
+ }
+ return s;
+ }
+
+ int64_t null_count() const override { return statistics_.null_count; }
+ int64_t distinct_count() const override { return statistics_.distinct_count; }
+ int64_t num_values() const override { return num_values_; }
+
+ private:
+ const ColumnDescriptor* descr_ = nullptr;
+ bool has_min_max_ = false;
+ bool has_null_count_ = false;
+ bool has_distinct_count_ = false;
+ T min_;
+ T max_;
+ ::arrow::MemoryPool* pool_;
+ int64_t num_values_ = 0;
+ EncodedStatistics statistics_;
+ std::shared_ptr<TypedComparator<DType>> comparator_;
+ std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
+
+ void PlainEncode(const T& src, std::string* dst) const;
+ void PlainDecode(const std::string& src, T* dst) const;
+
+ void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }
+
+ void IncrementNullCount(int64_t n) {
+ statistics_.null_count += n;
+ has_null_count_ = true;
+ }
+
+ void IncrementNumValues(int64_t n) { num_values_ += n; }
+
+ void IncrementDistinctCount(int64_t n) {
+ statistics_.distinct_count += n;
+ has_distinct_count_ = true;
+ }
+
+ void ResetCounts() {
+ this->statistics_.null_count = 0;
+ this->statistics_.distinct_count = 0;
+ this->num_values_ = 0;
+ }
+
+ void SetMinMaxPair(std::pair<T, T> min_max) {
+ // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
+ auto maybe_min_max = CleanStatistic(min_max);
+ if (!maybe_min_max) return;
+
+ auto min = maybe_min_max.value().first;
+ auto max = maybe_min_max.value().second;
+
+ if (!has_min_max_) {
+ has_min_max_ = true;
+ Copy(min, &min_, min_buffer_.get());
+ Copy(max, &max_, max_buffer_.get());
+ } else {
+ Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get());
+ Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get());
+ }
+ }
+};
+
+template <>
+inline bool TypedStatisticsImpl<FLBAType>::MinMaxEqual(
+ const TypedStatisticsImpl<FLBAType>& other) const {
+ uint32_t len = descr_->type_length();
+ return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 &&
+ std::memcmp(max_.ptr, other.max_.ptr, len) == 0;
+}
+
+template <typename DType>
+bool TypedStatisticsImpl<DType>::MinMaxEqual(
+ const TypedStatisticsImpl<DType>& other) const {
+ return min_ == other.min_ && max_ == other.max_;
+}
+
+template <>
+inline void TypedStatisticsImpl<FLBAType>::Copy(const FLBA& src, FLBA* dst,
+ ResizableBuffer* buffer) {
+ if (dst->ptr == src.ptr) return;
+ uint32_t len = descr_->type_length();
+ PARQUET_THROW_NOT_OK(buffer->Resize(len, false));
+ std::memcpy(buffer->mutable_data(), src.ptr, len);
+ *dst = FLBA(buffer->data());
+}
+
+template <>
+inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteArray* dst,
+ ResizableBuffer* buffer) {
+ if (dst->ptr == src.ptr) return;
+ PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
+ std::memcpy(buffer->mutable_data(), src.ptr, src.len);
+ *dst = ByteArray(src.len, buffer->data());
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
+ int64_t num_null) {
+ DCHECK_GE(num_not_null, 0);
+ DCHECK_GE(num_null, 0);
+
+ IncrementNullCount(num_null);
+ IncrementNumValues(num_not_null);
+
+ if (num_not_null == 0) return;
+ SetMinMaxPair(comparator_->GetMinMax(values, num_not_null));
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ int64_t num_not_null, int64_t num_null) {
+ DCHECK_GE(num_not_null, 0);
+ DCHECK_GE(num_null, 0);
+
+ IncrementNullCount(num_null);
+ IncrementNumValues(num_not_null);
+
+ if (num_not_null == 0) return;
+
+ int64_t length = num_null + num_not_null;
+ SetMinMaxPair(
+ comparator_->GetMinMaxSpaced(values, length, valid_bits, valid_bits_offset));
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) const {
+ auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, pool_);
+ encoder->Put(&src, 1);
+ auto buffer = encoder->FlushValues();
+ auto ptr = reinterpret_cast<const char*>(buffer->data());
+ dst->assign(ptr, buffer->size());
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) const {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ decoder->SetData(1, reinterpret_cast<const uint8_t*>(src.c_str()),
+ static_cast<int>(src.size()));
+ decoder->Decode(dst, 1);
+}
+
+template <>
+void TypedStatisticsImpl<ByteArrayType>::PlainEncode(const T& src,
+ std::string* dst) const {
+ dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
+}
+
+template <>
+void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
+ T* dst) const {
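+ // Note: dst->ptr aliases the decoded string's buffer; the caller must keep
+ // src alive for as long as *dst is in use.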
+ dst->len = static_cast<uint32_t>(src.size());
+ dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Public factory functions
+
+std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length) {
+ if (SortOrder::SIGNED == sort_order) {
+ switch (physical_type) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedComparatorImpl<true, BooleanType>>();
+ case Type::INT32:
+ return std::make_shared<TypedComparatorImpl<true, Int32Type>>();
+ case Type::INT64:
+ return std::make_shared<TypedComparatorImpl<true, Int64Type>>();
+ case Type::INT96:
+ return std::make_shared<TypedComparatorImpl<true, Int96Type>>();
+ case Type::FLOAT:
+ return std::make_shared<TypedComparatorImpl<true, FloatType>>();
+ case Type::DOUBLE:
+ return std::make_shared<TypedComparatorImpl<true, DoubleType>>();
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
+ default:
+ ParquetException::NYI("Signed Compare not implemented");
+ }
+ } else if (SortOrder::UNSIGNED == sort_order) {
+ switch (physical_type) {
+ case Type::INT32:
+ return std::make_shared<TypedComparatorImpl<false, Int32Type>>();
+ case Type::INT64:
+ return std::make_shared<TypedComparatorImpl<false, Int64Type>>();
+ case Type::INT96:
+ return std::make_shared<TypedComparatorImpl<false, Int96Type>>();
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<false, ByteArrayType>>();
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<false, FLBAType>>(type_length);
+ default:
+ ParquetException::NYI("Unsigned Compare not implemented");
+ }
+ } else {
+ throw ParquetException("UNKNOWN Sort Order");
+ }
+ return nullptr;
+}
+
+std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
+ return Make(descr->physical_type(), descr->sort_order(), descr->type_length());
+}
+
+std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
+ case Type::INT32:
+ return std::make_shared<TypedStatisticsImpl<Int32Type>>(descr, pool);
+ case Type::INT64:
+ return std::make_shared<TypedStatisticsImpl<Int64Type>>(descr, pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedStatisticsImpl<FloatType>>(descr, pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedStatisticsImpl<DoubleType>>(descr, pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedStatisticsImpl<ByteArrayType>>(descr, pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedStatisticsImpl<FLBAType>>(descr, pool);
+ default:
+ ParquetException::NYI("Statistics not implemented");
+ }
+}
+
+std::shared_ptr<Statistics> Statistics::Make(Type::type physical_type, const void* min,
+ const void* max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count) {
+#define MAKE_STATS(CAP_TYPE, KLASS) \
+ case Type::CAP_TYPE: \
+ return std::make_shared<TypedStatisticsImpl<KLASS>>( \
+ *reinterpret_cast<const typename KLASS::c_type*>(min), \
+ *reinterpret_cast<const typename KLASS::c_type*>(max), num_values, null_count, \
+ distinct_count)
+
+ switch (physical_type) {
+ MAKE_STATS(BOOLEAN, BooleanType);
+ MAKE_STATS(INT32, Int32Type);
+ MAKE_STATS(INT64, Int64Type);
+ MAKE_STATS(FLOAT, FloatType);
+ MAKE_STATS(DOUBLE, DoubleType);
+ MAKE_STATS(BYTE_ARRAY, ByteArrayType);
+ MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
+ default:
+ break;
+ }
+#undef MAKE_STATS
+ DCHECK(false) << "Cannot reach here";
+ return nullptr;
+}
+
+std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
+ const std::string& encoded_min,
+ const std::string& encoded_max,
+ int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max,
+ bool has_null_count, bool has_distinct_count,
+ ::arrow::MemoryPool* pool) {
+#define MAKE_STATS(CAP_TYPE, KLASS) \
+ case Type::CAP_TYPE: \
+ return std::make_shared<TypedStatisticsImpl<KLASS>>( \
+ descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
+ has_min_max, has_null_count, has_distinct_count, pool)
+
+ switch (descr->physical_type()) {
+ MAKE_STATS(BOOLEAN, BooleanType);
+ MAKE_STATS(INT32, Int32Type);
+ MAKE_STATS(INT64, Int64Type);
+ MAKE_STATS(FLOAT, FloatType);
+ MAKE_STATS(DOUBLE, DoubleType);
+ MAKE_STATS(BYTE_ARRAY, ByteArrayType);
+ MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
+ default:
+ break;
+ }
+#undef MAKE_STATS
+ DCHECK(false) << "Cannot reach here";
+ return nullptr;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
index 18f68f21b87..1242180000c 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
@@ -1,342 +1,342 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-class BinaryArray;
-
-} // namespace arrow
-
-namespace parquet {
-
-class ColumnDescriptor;
-
-// ----------------------------------------------------------------------
-// Value comparator interfaces
-
-/// \brief Base class for value comparators. Generally used with
-/// TypedComparator<T>
-class PARQUET_EXPORT Comparator {
- public:
- virtual ~Comparator() {}
-
- /// \brief Create a comparator explicitly from physical type and
- /// sort order
- /// \param[in] physical_type the physical type for the typed
- /// comparator
- /// \param[in] sort_order either SortOrder::SIGNED or
- /// SortOrder::UNSIGNED
- /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
- static std::shared_ptr<Comparator> Make(Type::type physical_type,
- SortOrder::type sort_order,
- int type_length = -1);
-
- /// \brief Create typed comparator inferring default sort order from
- /// ColumnDescriptor
- /// \param[in] descr the Parquet column schema
- static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
-};
-
-/// \brief Interface for comparison of physical types according to the
-/// semantics of a particular logical type.
-template <typename DType>
-class TypedComparator : public Comparator {
- public:
- using T = typename DType::c_type;
-
- /// \brief Scalar comparison of two elements; returns true if the first
- /// is strictly less than the second
- virtual bool Compare(const T& a, const T& b) = 0;
-
- /// \brief Compute maximum and minimum elements in a batch of
- /// elements without any nulls
- virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
-
- /// \brief Compute minimum and maximum elements from an Arrow array. Only
- /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
- /// / arrow::BinaryArray
- virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
-
- /// \brief Compute maximum and minimum elements in a batch of
- /// elements with accompanying bitmap indicating which elements are
- /// included (bit set) and excluded (bit not set)
- ///
- /// \param[in] values the sequence of values
- /// \param[in] length the length of the sequence
- /// \param[in] valid_bits a bitmap indicating which elements are
- /// included (1) or excluded (0)
- /// \param[in] valid_bits_offset the bit offset into the bitmap of
- /// the first element in the sequence
- virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) = 0;
-};
-
-/// \brief Typed version of Comparator::Make
-template <typename DType>
-std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
- SortOrder::type sort_order,
- int type_length = -1) {
- return std::static_pointer_cast<TypedComparator<DType>>(
- Comparator::Make(physical_type, sort_order, type_length));
-}
-
-/// \brief Typed version of Comparator::Make
-template <typename DType>
-std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
- return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
-}
-
-// ----------------------------------------------------------------------
-
-/// \brief Structure representing encoded statistics to be written to
-/// and read from Parquet serialized metadata
-class PARQUET_EXPORT EncodedStatistics {
- std::shared_ptr<std::string> max_, min_;
- bool is_signed_ = false;
-
- public:
- EncodedStatistics()
- : max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
-
- const std::string& max() const { return *max_; }
- const std::string& min() const { return *min_; }
-
- int64_t null_count = 0;
- int64_t distinct_count = 0;
-
- bool has_min = false;
- bool has_max = false;
- bool has_null_count = false;
- bool has_distinct_count = false;
-
- // From parquet-mr
- // Rather than truncating, don't write stats larger than the max size. The
- // rationale is that some engines may use the minimum value in the page as
- // the true minimum for aggregations, and there is no way to mark that a
- // value has been truncated and is only a lower bound, not a value in the page.
- void ApplyStatSizeLimits(size_t length) {
- if (max_->length() > length) {
- has_max = false;
- }
- if (min_->length() > length) {
- has_min = false;
- }
- }
-
- bool is_set() const {
- return has_min || has_max || has_null_count || has_distinct_count;
- }
-
- bool is_signed() const { return is_signed_; }
-
- void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
-
- EncodedStatistics& set_max(const std::string& value) {
- *max_ = value;
- has_max = true;
- return *this;
- }
-
- EncodedStatistics& set_min(const std::string& value) {
- *min_ = value;
- has_min = true;
- return *this;
- }
-
- EncodedStatistics& set_null_count(int64_t value) {
- null_count = value;
- has_null_count = true;
- return *this;
- }
-
- EncodedStatistics& set_distinct_count(int64_t value) {
- distinct_count = value;
- has_distinct_count = true;
- return *this;
- }
-};
-
-/// \brief Base type for computing column statistics while writing a file
-class PARQUET_EXPORT Statistics {
- public:
- virtual ~Statistics() {}
-
- /// \brief Create a new statistics instance given a column schema
- /// definition
- /// \param[in] descr the column schema
- /// \param[in] pool a memory pool to use for any memory allocations, optional
- static std::shared_ptr<Statistics> Make(
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- /// \brief Create a new statistics instance given a column schema
- /// definition and pre-existing state
- /// \param[in] descr the column schema
- /// \param[in] encoded_min the encoded minimum value
- /// \param[in] encoded_max the encoded maximum value
- /// \param[in] num_values total number of values
- /// \param[in] null_count number of null values
- /// \param[in] distinct_count number of distinct values
- /// \param[in] has_min_max whether the min/max statistics are set
- /// \param[in] has_null_count whether the null_count statistics are set
- /// \param[in] has_distinct_count whether the distinct_count statistics are set
- /// \param[in] pool a memory pool to use for any memory allocations, optional
- static std::shared_ptr<Statistics> Make(
- const ColumnDescriptor* descr, const std::string& encoded_min,
- const std::string& encoded_max, int64_t num_values, int64_t null_count,
- int64_t distinct_count, bool has_min_max, bool has_null_count,
- bool has_distinct_count,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- /// \brief Return true if the count of null values is set
- virtual bool HasNullCount() const = 0;
-
- /// \brief The number of null values, may not be set
- virtual int64_t null_count() const = 0;
-
- /// \brief Return true if the count of distinct values is set
- virtual bool HasDistinctCount() const = 0;
-
- /// \brief The number of distinct values, may not be set
- virtual int64_t distinct_count() const = 0;
-
- /// \brief The total number of values in the column
- virtual int64_t num_values() const = 0;
-
- /// \brief Return true if the min and max statistics are set. Obtain
- /// with TypedStatistics<T>::min and max
- virtual bool HasMinMax() const = 0;
-
- /// \brief Reset state of object to initial (no data observed) state
- virtual void Reset() = 0;
-
- /// \brief Plain-encoded minimum value
- virtual std::string EncodeMin() const = 0;
-
- /// \brief Plain-encoded maximum value
- virtual std::string EncodeMax() const = 0;
-
- /// \brief The finalized encoded form of the statistics for transport
- virtual EncodedStatistics Encode() = 0;
-
- /// \brief The physical type of the column schema
- virtual Type::type physical_type() const = 0;
-
- /// \brief The full type descriptor from the column schema
- virtual const ColumnDescriptor* descr() const = 0;
-
- /// \brief Check two Statistics for equality
- virtual bool Equals(const Statistics& other) const = 0;
-
- protected:
- static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
- const void* max, int64_t num_values,
- int64_t null_count, int64_t distinct_count);
-};
-
-/// \brief A typed implementation of Statistics
-template <typename DType>
-class TypedStatistics : public Statistics {
- public:
- using T = typename DType::c_type;
-
- /// \brief The current minimum value
- virtual const T& min() const = 0;
-
- /// \brief The current maximum value
- virtual const T& max() const = 0;
-
- /// \brief Update state with state of another Statistics object
- virtual void Merge(const TypedStatistics<DType>& other) = 0;
-
- /// \brief Batch statistics update
- virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
-
- /// \brief Batch statistics update with supplied validity bitmap
- virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
- int64_t valid_bits_offset, int64_t num_not_null,
- int64_t num_null) = 0;
-
- /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
- /// conversion to a primitive Parquet C type. Only implemented for certain
- /// Parquet type / Arrow type combinations like BYTE_ARRAY /
- /// arrow::BinaryArray
- virtual void Update(const ::arrow::Array& values) = 0;
-
- /// \brief Set min and max values to particular values
- virtual void SetMinMax(const T& min, const T& max) = 0;
-};
-
-using BoolStatistics = TypedStatistics<BooleanType>;
-using Int32Statistics = TypedStatistics<Int32Type>;
-using Int64Statistics = TypedStatistics<Int64Type>;
-using FloatStatistics = TypedStatistics<FloatType>;
-using DoubleStatistics = TypedStatistics<DoubleType>;
-using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
-using FLBAStatistics = TypedStatistics<FLBAType>;
-
-/// \brief Typed version of Statistics::Make
-template <typename DType>
-std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
-}
-
-/// \brief Create Statistics initialized to a particular state
-/// \param[in] min the minimum value
-/// \param[in] max the maximum value
-/// \param[in] num_values number of values
-/// \param[in] null_count number of null values
-/// \param[in] distinct_count number of distinct values
-template <typename DType>
-std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
- const typename DType::c_type& max,
- int64_t num_values,
- int64_t null_count,
- int64_t distinct_count) {
- return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
- DType::type_num, &min, &max, num_values, null_count, distinct_count));
-}
-
-/// \brief Typed version of Statistics::Make
-template <typename DType>
-std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
- const ColumnDescriptor* descr, const std::string& encoded_min,
- const std::string& encoded_max, int64_t num_values, int64_t null_count,
- int64_t distinct_count, bool has_min_max, bool has_null_count,
- bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
- descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
- has_min_max, has_null_count, has_distinct_count, pool));
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class BinaryArray;
+
+} // namespace arrow
+
+namespace parquet {
+
+class ColumnDescriptor;
+
+// ----------------------------------------------------------------------
+// Value comparator interfaces
+
+/// \brief Base class for value comparators. Generally used with
+/// TypedComparator<T>
+class PARQUET_EXPORT Comparator {
+ public:
+ virtual ~Comparator() {}
+
+ /// \brief Create a comparator explicitly from physical type and
+ /// sort order
+ /// \param[in] physical_type the physical type for the typed
+ /// comparator
+ /// \param[in] sort_order either SortOrder::SIGNED or
+ /// SortOrder::UNSIGNED
+ /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
+ static std::shared_ptr<Comparator> Make(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length = -1);
+
+ /// \brief Create typed comparator inferring default sort order from
+ /// ColumnDescriptor
+ /// \param[in] descr the Parquet column schema
+ static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
+};
+
+/// \brief Interface for comparison of physical types according to the
+/// semantics of a particular logical type.
+template <typename DType>
+class TypedComparator : public Comparator {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief Scalar comparison of two elements; returns true if the first
+ /// is strictly less than the second
+ virtual bool Compare(const T& a, const T& b) = 0;
+
+ /// \brief Compute maximum and minimum elements in a batch of
+ /// elements without any nulls
+ virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
+
+ /// \brief Compute minimum and maximum elements from an Arrow array. Only
+ /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
+ /// / arrow::BinaryArray
+ virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
+
+ /// \brief Compute maximum and minimum elements in a batch of
+ /// elements with accompanying bitmap indicating which elements are
+ /// included (bit set) and excluded (bit not set)
+ ///
+ /// \param[in] values the sequence of values
+ /// \param[in] length the length of the sequence
+ /// \param[in] valid_bits a bitmap indicating which elements are
+ /// included (1) or excluded (0)
+ /// \param[in] valid_bits_offset the bit offset into the bitmap of
+ /// the first element in the sequence
+ virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) = 0;
+};
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length = -1) {
+ return std::static_pointer_cast<TypedComparator<DType>>(
+ Comparator::Make(physical_type, sort_order, type_length));
+}
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
+ return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
+}
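+
+// Illustrative usage (a sketch, not part of the declarations above; `values`
+// and `n` stand for a caller-supplied buffer and its length):
+//
+//   auto cmp = MakeComparator<Int32Type>(Type::INT32, SortOrder::SIGNED);
+//   bool less = cmp->Compare(-1, 1);           // true under signed order
+//   auto min_max = cmp->GetMinMax(values, n);  // pair of {min, max}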
+
+// ----------------------------------------------------------------------
+
+/// \brief Structure representing encoded statistics to be written to
+/// and read from Parquet serialized metadata
+class PARQUET_EXPORT EncodedStatistics {
+ std::shared_ptr<std::string> max_, min_;
+ bool is_signed_ = false;
+
+ public:
+ EncodedStatistics()
+ : max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
+
+ const std::string& max() const { return *max_; }
+ const std::string& min() const { return *min_; }
+
+ int64_t null_count = 0;
+ int64_t distinct_count = 0;
+
+ bool has_min = false;
+ bool has_max = false;
+ bool has_null_count = false;
+ bool has_distinct_count = false;
+
+ // From parquet-mr
+ // Rather than truncating, don't write stats larger than the max size. The
+ // rationale is that some engines may use the minimum value in the page as
+ // the true minimum for aggregations, and there is no way to mark that a
+ // value has been truncated and is only a lower bound, not a value in the page.
+ void ApplyStatSizeLimits(size_t length) {
+ if (max_->length() > length) {
+ has_max = false;
+ }
+ if (min_->length() > length) {
+ has_min = false;
+ }
+ }
+
+ bool is_set() const {
+ return has_min || has_max || has_null_count || has_distinct_count;
+ }
+
+ bool is_signed() const { return is_signed_; }
+
+ void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
+
+ EncodedStatistics& set_max(const std::string& value) {
+ *max_ = value;
+ has_max = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_min(const std::string& value) {
+ *min_ = value;
+ has_min = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_null_count(int64_t value) {
+ null_count = value;
+ has_null_count = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_distinct_count(int64_t value) {
+ distinct_count = value;
+ has_distinct_count = true;
+ return *this;
+ }
+};
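+
+// Illustrative: a writer might drop oversized statistics before serializing:
+//
+//   encoded.ApplyStatSizeLimits(4096);
+//   if (encoded.is_set()) { /* copy into Thrift metadata */ }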
+
+/// \brief Base type for computing column statistics while writing a file
+class PARQUET_EXPORT Statistics {
+ public:
+ virtual ~Statistics() {}
+
+ /// \brief Create a new statistics instance given a column schema
+ /// definition
+ /// \param[in] descr the column schema
+ /// \param[in] pool a memory pool to use for any memory allocations, optional
+ static std::shared_ptr<Statistics> Make(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ /// \brief Create a new statistics instance given a column schema
+ /// definition and pre-existing state
+ /// \param[in] descr the column schema
+ /// \param[in] encoded_min the encoded minimum value
+ /// \param[in] encoded_max the encoded maximum value
+ /// \param[in] num_values total number of values
+ /// \param[in] null_count number of null values
+ /// \param[in] distinct_count number of distinct values
+ /// \param[in] has_min_max whether the min/max statistics are set
+ /// \param[in] has_null_count whether the null_count statistics are set
+ /// \param[in] has_distinct_count whether the distinct_count statistics are set
+ /// \param[in] pool a memory pool to use for any memory allocations, optional
+ static std::shared_ptr<Statistics> Make(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ /// \brief Return true if the count of null values is set
+ virtual bool HasNullCount() const = 0;
+
+ /// \brief The number of null values, may not be set
+ virtual int64_t null_count() const = 0;
+
+ /// \brief Return true if the count of distinct values is set
+ virtual bool HasDistinctCount() const = 0;
+
+ /// \brief The number of distinct values, may not be set
+ virtual int64_t distinct_count() const = 0;
+
+ /// \brief The total number of values in the column
+ virtual int64_t num_values() const = 0;
+
+ /// \brief Return true if the min and max statistics are set. Obtain
+ /// with TypedStatistics<T>::min and max
+ virtual bool HasMinMax() const = 0;
+
+ /// \brief Reset state of object to initial (no data observed) state
+ virtual void Reset() = 0;
+
+ /// \brief Plain-encoded minimum value
+ virtual std::string EncodeMin() const = 0;
+
+ /// \brief Plain-encoded maximum value
+ virtual std::string EncodeMax() const = 0;
+
+ /// \brief The finalized encoded form of the statistics for transport
+ virtual EncodedStatistics Encode() = 0;
+
+ /// \brief The physical type of the column schema
+ virtual Type::type physical_type() const = 0;
+
+ /// \brief The full type descriptor from the column schema
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ /// \brief Check two Statistics for equality
+ virtual bool Equals(const Statistics& other) const = 0;
+
+ protected:
+ static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
+ const void* max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count);
+};
+
+/// \brief A typed implementation of Statistics
+template <typename DType>
+class TypedStatistics : public Statistics {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief The current minimum value
+ virtual const T& min() const = 0;
+
+ /// \brief The current maximum value
+ virtual const T& max() const = 0;
+
+ /// \brief Update state with state of another Statistics object
+ virtual void Merge(const TypedStatistics<DType>& other) = 0;
+
+ /// \brief Batch statistics update
+ virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
+
+ /// \brief Batch statistics update with supplied validity bitmap
+ virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t num_not_null,
+ int64_t num_null) = 0;
+
+ /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
+ /// conversion to a primitive Parquet C type. Only implemented for certain
+ /// Parquet type / Arrow type combinations like BYTE_ARRAY /
+ /// arrow::BinaryArray
+ virtual void Update(const ::arrow::Array& values) = 0;
+
+ /// \brief Set min and max values to particular values
+ virtual void SetMinMax(const T& min, const T& max) = 0;
+};
+
+using BoolStatistics = TypedStatistics<BooleanType>;
+using Int32Statistics = TypedStatistics<Int32Type>;
+using Int64Statistics = TypedStatistics<Int64Type>;
+using FloatStatistics = TypedStatistics<FloatType>;
+using DoubleStatistics = TypedStatistics<DoubleType>;
+using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
+using FLBAStatistics = TypedStatistics<FLBAType>;
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
+}
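+
+// Illustrative usage (a sketch; `descr` is assumed to be a valid
+// ColumnDescriptor* and `values`/`n` a non-null value buffer and its length):
+//
+//   auto stats = MakeStatistics<Int64Type>(descr);
+//   stats->Update(values, /*num_not_null=*/n, /*num_null=*/0);
+//   EncodedStatistics encoded = stats->Encode();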
+
+/// \brief Create Statistics initialized to a particular state
+/// \param[in] min the minimum value
+/// \param[in] max the maximum value
+/// \param[in] num_values number of values
+/// \param[in] null_count number of null values
+/// \param[in] distinct_count number of distinct values
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
+ const typename DType::c_type& max,
+ int64_t num_values,
+ int64_t null_count,
+ int64_t distinct_count) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+ DType::type_num, &min, &max, num_values, null_count, distinct_count));
+}
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+ descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
+ has_min_max, has_null_count, has_distinct_count, pool));
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
index 9a7cc8cdf86..af7a35ddbc1 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
@@ -1,521 +1,521 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/stream_reader.h"
-
-#include <set>
-#include <utility>
-
-namespace parquet {
-
-constexpr int64_t StreamReader::kBatchSizeOne;
-
-// The converted type expected by the stream reader does not always
-// exactly match the schema in the Parquet file. The following
-// is a list of converted types which are allowed instead of the
-// expected converted type.
-// Each pair given is:
-// {<StreamReader expected type>, <Parquet file converted type>}
-// So for example {ConvertedType::INT_32, ConvertedType::NONE} means
-// that if the StreamReader was expecting the converted type INT_32,
-// then it will allow the Parquet file to use the converted type
-// NONE.
-//
-static const std::set<std::pair<ConvertedType::type, ConvertedType::type> >
- converted_type_exceptions = {{ConvertedType::INT_32, ConvertedType::NONE},
- {ConvertedType::INT_64, ConvertedType::NONE},
- {ConvertedType::INT_32, ConvertedType::DECIMAL},
- {ConvertedType::INT_64, ConvertedType::DECIMAL},
- {ConvertedType::UTF8, ConvertedType::NONE}};
-
-StreamReader::StreamReader(std::unique_ptr<ParquetFileReader> reader)
- : file_reader_{std::move(reader)}, eof_{false} {
- file_metadata_ = file_reader_->metadata();
-
- auto schema = file_metadata_->schema();
- auto group_node = schema->group_node();
-
- nodes_.resize(schema->num_columns());
-
- for (auto i = 0; i < schema->num_columns(); ++i) {
- nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
- }
- NextRowGroup();
-}
-
-int StreamReader::num_columns() const {
- // Check for file metadata, i.e. the object was not default-constructed.
- if (file_metadata_) {
- return file_metadata_->num_columns();
- }
- return 0;
-}
-
-int64_t StreamReader::num_rows() const {
- // Check for file metadata, i.e. the object was not default-constructed.
- if (file_metadata_) {
- return file_metadata_->num_rows();
- }
- return 0;
-}
-
-StreamReader& StreamReader::operator>>(bool& v) {
- CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
- Read<BoolReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int8_t& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_8);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint8_t& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_8);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int16_t& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_16);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint16_t& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_16);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int32_t& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_32);
- Read<Int32Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint32_t& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_32);
- Read<Int32Reader>(reinterpret_cast<int32_t*>(&v));
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int64_t& v) {
- CheckColumn(Type::INT64, ConvertedType::INT_64);
- Read<Int64Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint64_t& v) {
- CheckColumn(Type::INT64, ConvertedType::UINT_64);
- Read<Int64Reader>(reinterpret_cast<int64_t*>(&v));
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(std::chrono::milliseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
- int64_t tmp;
- Read<Int64Reader>(&tmp);
- v = std::chrono::milliseconds{tmp};
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(std::chrono::microseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
- int64_t tmp;
- Read<Int64Reader>(&tmp);
- v = std::chrono::microseconds{tmp};
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(float& v) {
- CheckColumn(Type::FLOAT, ConvertedType::NONE);
- Read<FloatReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(double& v) {
- CheckColumn(Type::DOUBLE, ConvertedType::NONE);
- Read<DoubleReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(char& v) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
- FixedLenByteArray flba;
-
- Read(&flba);
- v = static_cast<char>(flba.ptr[0]);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(std::string& v) {
- CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
- ByteArray ba;
-
- Read(&ba);
- v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<bool>& v) {
- CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
- ReadOptional<BoolReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int8_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_8);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint8_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_8);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int16_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_16);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint16_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_16);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int32_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_32);
- ReadOptional<Int32Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint32_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_32);
- ReadOptional<Int32Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int64_t>& v) {
- CheckColumn(Type::INT64, ConvertedType::INT_64);
- ReadOptional<Int64Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint64_t>& v) {
- CheckColumn(Type::INT64, ConvertedType::UINT_64);
- ReadOptional<Int64Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<float>& v) {
- CheckColumn(Type::FLOAT, ConvertedType::NONE);
- ReadOptional<FloatReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<double>& v) {
- CheckColumn(Type::DOUBLE, ConvertedType::NONE);
- ReadOptional<DoubleReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<std::chrono::milliseconds>& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
- ReadOptional<Int64Reader, int64_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<std::chrono::microseconds>& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
- ReadOptional<Int64Reader, int64_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<char>& v) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
- FixedLenByteArray flba;
-
- if (ReadOptional(&flba)) {
- v = static_cast<char>(flba.ptr[0]);
- } else {
- v.reset();
- }
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<std::string>& v) {
- CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
- ByteArray ba;
-
- if (ReadOptional(&ba)) {
- v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
- } else {
- v.reset();
- }
- return *this;
-}
-
-void StreamReader::ReadFixedLength(char* ptr, int len) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, len);
- FixedLenByteArray flba;
- Read(&flba);
- std::memcpy(ptr, flba.ptr, len);
-}
-
-void StreamReader::Read(ByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read != 1) {
- ThrowReadFailedException(node);
- }
-}
-
-bool StreamReader::ReadOptional(ByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read == 1) {
- return true;
- } else if ((values_read == 0) && (def_level == 0)) {
- return false;
- }
- ThrowReadFailedException(node);
-}
-
-void StreamReader::Read(FixedLenByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader =
- static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read != 1) {
- ThrowReadFailedException(node);
- }
-}
-
-bool StreamReader::ReadOptional(FixedLenByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader =
- static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read == 1) {
- return true;
- } else if ((values_read == 0) && (def_level == 0)) {
- return false;
- }
- ThrowReadFailedException(node);
-}
-
-void StreamReader::EndRow() {
- if (!file_reader_) {
- throw ParquetException("StreamReader not initialized");
- }
- if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
- throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
- " of " + std::to_string(nodes_.size()) + " columns read");
- }
- column_index_ = 0;
- ++current_row_;
-
- if (!column_readers_[0]->HasNext()) {
- NextRowGroup();
- }
-}
-
-void StreamReader::NextRowGroup() {
- // Find the next non-empty row group.
- while (row_group_index_ < file_metadata_->num_row_groups()) {
- row_group_reader_ = file_reader_->RowGroup(row_group_index_);
- ++row_group_index_;
-
- column_readers_.resize(file_metadata_->num_columns());
-
- for (int i = 0; i < file_metadata_->num_columns(); ++i) {
- column_readers_[i] = row_group_reader_->Column(i);
- }
- if (column_readers_[0]->HasNext()) {
- row_group_row_offset_ = current_row_;
- return;
- }
- }
- // No more row groups found.
- SetEof();
-}
-
-void StreamReader::SetEof() {
- // Do not reset file_metadata_ to ensure queries on the number of
- // rows/columns still function.
- eof_ = true;
- file_reader_.reset();
- row_group_reader_.reset();
- column_readers_.clear();
- nodes_.clear();
-}
-
-int64_t StreamReader::SkipRows(int64_t num_rows_to_skip) {
- if (0 != column_index_) {
- throw ParquetException("Must finish reading current row before skipping rows.");
- }
- int64_t num_rows_remaining_to_skip = num_rows_to_skip;
-
- while (!eof_ && (num_rows_remaining_to_skip > 0)) {
- int64_t num_rows_in_row_group = row_group_reader_->metadata()->num_rows();
- int64_t num_rows_remaining_in_row_group =
- num_rows_in_row_group - current_row_ - row_group_row_offset_;
-
- if (num_rows_remaining_in_row_group > num_rows_remaining_to_skip) {
- for (auto reader : column_readers_) {
- SkipRowsInColumn(reader.get(), num_rows_remaining_to_skip);
- }
- current_row_ += num_rows_remaining_to_skip;
- num_rows_remaining_to_skip = 0;
- } else {
- num_rows_remaining_to_skip -= num_rows_remaining_in_row_group;
- current_row_ += num_rows_remaining_in_row_group;
- NextRowGroup();
- }
- }
- return num_rows_to_skip - num_rows_remaining_to_skip;
-}
-
-int64_t StreamReader::SkipColumns(int64_t num_columns_to_skip) {
- int64_t num_columns_skipped = 0;
-
- if (!eof_) {
- for (; (num_columns_to_skip > num_columns_skipped) &&
- static_cast<std::size_t>(column_index_) < nodes_.size();
- ++column_index_) {
- SkipRowsInColumn(column_readers_[column_index_].get(), 1);
- ++num_columns_skipped;
- }
- }
- return num_columns_skipped;
-}
-
-void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip) {
- int64_t num_skipped = 0;
-
- switch (reader->type()) {
- case Type::BOOLEAN:
- num_skipped = static_cast<BoolReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::INT32:
- num_skipped = static_cast<Int32Reader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::INT64:
- num_skipped = static_cast<Int64Reader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::BYTE_ARRAY:
- num_skipped = static_cast<ByteArrayReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- num_skipped = static_cast<FixedLenByteArrayReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::FLOAT:
- num_skipped = static_cast<FloatReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::DOUBLE:
- num_skipped = static_cast<DoubleReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::INT96:
- num_skipped = static_cast<Int96Reader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::UNDEFINED:
- throw ParquetException("Unexpected type: " + TypeToString(reader->type()));
- break;
- }
- if (num_rows_to_skip != num_skipped) {
- throw ParquetException("Skipped " + std::to_string(num_skipped) + "/" +
- std::to_string(num_rows_to_skip) + " rows in column " +
- reader->descr()->name());
- }
-}
-
-void StreamReader::CheckColumn(Type::type physical_type,
- ConvertedType::type converted_type, int length) {
- if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
- if (eof_) {
- ParquetException::EofException();
- }
- throw ParquetException("Column index out-of-bounds. Index " +
- std::to_string(column_index_) + " is invalid for " +
- std::to_string(nodes_.size()) + " columns");
- }
- const auto& node = nodes_[column_index_];
-
- if (physical_type != node->physical_type()) {
- throw ParquetException("Column physical type mismatch. Column '" + node->name() +
- "' has physical type '" + TypeToString(node->physical_type()) +
- "' not '" + TypeToString(physical_type) + "'");
- }
- if (converted_type != node->converted_type()) {
- // The converted type does not always match the value
- // provided, so check the set of exceptions.
- if (converted_type_exceptions.find({converted_type, node->converted_type()}) ==
- converted_type_exceptions.end()) {
- throw ParquetException("Column converted type mismatch. Column '" + node->name() +
- "' has converted type '" +
- ConvertedTypeToString(node->converted_type()) + "' not '" +
- ConvertedTypeToString(converted_type) + "'");
- }
- }
- // Length must be exact.
- if (length != node->type_length()) {
- throw ParquetException("Column length mismatch. Column '" + node->name() +
- "' has length " + std::to_string(node->type_length()) +
- "] not " + std::to_string(length));
- }
-} // namespace parquet
-
-void StreamReader::ThrowReadFailedException(
- const std::shared_ptr<schema::PrimitiveNode>& node) {
- throw ParquetException("Failed to read value for column '" + node->name() +
- "' on row " + std::to_string(current_row_));
-}
-
-StreamReader& operator>>(StreamReader& os, EndRowType) {
- os.EndRow();
- return os;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/stream_reader.h"
+
+#include <set>
+#include <utility>
+
+namespace parquet {
+
+constexpr int64_t StreamReader::kBatchSizeOne;
+
+// The converted type expected by the stream reader does not always
+// exactly match with the schema in the Parquet file. The following
+// is a list of converted types which are allowed instead of the
+// expected converted type.
+// Each pair given is:
+// {<StreamReader expected type>, <Parquet file converted type>}
+// So for example {ConvertedType::INT_32, ConvertedType::NONE} means
+// that if the StreamReader was expecting the converted type INT_32,
+// then it will allow the Parquet file to use the converted type
+// NONE.
+//
+static const std::set<std::pair<ConvertedType::type, ConvertedType::type> >
+ converted_type_exceptions = {{ConvertedType::INT_32, ConvertedType::NONE},
+ {ConvertedType::INT_64, ConvertedType::NONE},
+ {ConvertedType::INT_32, ConvertedType::DECIMAL},
+ {ConvertedType::INT_64, ConvertedType::DECIMAL},
+ {ConvertedType::UTF8, ConvertedType::NONE}};
+
+StreamReader::StreamReader(std::unique_ptr<ParquetFileReader> reader)
+ : file_reader_{std::move(reader)}, eof_{false} {
+ file_metadata_ = file_reader_->metadata();
+
+ auto schema = file_metadata_->schema();
+ auto group_node = schema->group_node();
+
+ nodes_.resize(schema->num_columns());
+
+ for (auto i = 0; i < schema->num_columns(); ++i) {
+ nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
+ }
+ NextRowGroup();
+}
+
+int StreamReader::num_columns() const {
+ // Check for file metadata i.e. object is not default constructed.
+ if (file_metadata_) {
+ return file_metadata_->num_columns();
+ }
+ return 0;
+}
+
+int64_t StreamReader::num_rows() const {
+ // Check for file metadata i.e. object is not default constructed.
+ if (file_metadata_) {
+ return file_metadata_->num_rows();
+ }
+ return 0;
+}
+
+StreamReader& StreamReader::operator>>(bool& v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ Read<BoolReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int8_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint8_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int16_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint16_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int32_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ Read<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint32_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ Read<Int32Reader>(reinterpret_cast<int32_t*>(&v));
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int64_t& v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ Read<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint64_t& v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ Read<Int64Reader>(reinterpret_cast<int64_t*>(&v));
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::chrono::milliseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ int64_t tmp;
+ Read<Int64Reader>(&tmp);
+ v = std::chrono::milliseconds{tmp};
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::chrono::microseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ int64_t tmp;
+ Read<Int64Reader>(&tmp);
+ v = std::chrono::microseconds{tmp};
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(float& v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ Read<FloatReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(double& v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ Read<DoubleReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(char& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
+ FixedLenByteArray flba;
+
+ Read(&flba);
+ v = static_cast<char>(flba.ptr[0]);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::string& v) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+ ByteArray ba;
+
+ Read(&ba);
+ v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<bool>& v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ ReadOptional<BoolReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int8_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint8_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int16_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint16_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int32_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ ReadOptional<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint32_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ ReadOptional<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int64_t>& v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ ReadOptional<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint64_t>& v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ ReadOptional<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<float>& v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ ReadOptional<FloatReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<double>& v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ ReadOptional<DoubleReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::chrono::milliseconds>& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ ReadOptional<Int64Reader, int64_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::chrono::microseconds>& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ ReadOptional<Int64Reader, int64_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<char>& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
+ FixedLenByteArray flba;
+
+ if (ReadOptional(&flba)) {
+ v = static_cast<char>(flba.ptr[0]);
+ } else {
+ v.reset();
+ }
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::string>& v) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+ ByteArray ba;
+
+ if (ReadOptional(&ba)) {
+ v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
+ } else {
+ v.reset();
+ }
+ return *this;
+}
+
+void StreamReader::ReadFixedLength(char* ptr, int len) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, len);
+ FixedLenByteArray flba;
+ Read(&flba);
+ std::memcpy(ptr, flba.ptr, len);
+}
+
+void StreamReader::Read(ByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+}
+
+bool StreamReader::ReadOptional(ByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read == 1) {
+ return true;
+ } else if ((values_read == 0) && (def_level == 0)) {
+ return false;
+ }
+ ThrowReadFailedException(node);
+}
+
+void StreamReader::Read(FixedLenByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader =
+ static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+}
+
+bool StreamReader::ReadOptional(FixedLenByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader =
+ static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read == 1) {
+ return true;
+ } else if ((values_read == 0) && (def_level == 0)) {
+ return false;
+ }
+ ThrowReadFailedException(node);
+}
+
+void StreamReader::EndRow() {
+ if (!file_reader_) {
+ throw ParquetException("StreamReader not initialized");
+ }
+ if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
+ throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
+ " of " + std::to_string(nodes_.size()) + " columns read");
+ }
+ column_index_ = 0;
+ ++current_row_;
+
+ if (!column_readers_[0]->HasNext()) {
+ NextRowGroup();
+ }
+}
+
+void StreamReader::NextRowGroup() {
+ // Find the next non-empty row group.
+ while (row_group_index_ < file_metadata_->num_row_groups()) {
+ row_group_reader_ = file_reader_->RowGroup(row_group_index_);
+ ++row_group_index_;
+
+ column_readers_.resize(file_metadata_->num_columns());
+
+ for (int i = 0; i < file_metadata_->num_columns(); ++i) {
+ column_readers_[i] = row_group_reader_->Column(i);
+ }
+ if (column_readers_[0]->HasNext()) {
+ row_group_row_offset_ = current_row_;
+ return;
+ }
+ }
+ // No more row groups found.
+ SetEof();
+}
+
+void StreamReader::SetEof() {
+ // Do not reset file_metadata_ to ensure queries on the number of
+ // rows/columns still function.
+ eof_ = true;
+ file_reader_.reset();
+ row_group_reader_.reset();
+ column_readers_.clear();
+ nodes_.clear();
+}
+
+int64_t StreamReader::SkipRows(int64_t num_rows_to_skip) {
+ if (0 != column_index_) {
+ throw ParquetException("Must finish reading current row before skipping rows.");
+ }
+ int64_t num_rows_remaining_to_skip = num_rows_to_skip;
+
+ while (!eof_ && (num_rows_remaining_to_skip > 0)) {
+ int64_t num_rows_in_row_group = row_group_reader_->metadata()->num_rows();
+ int64_t num_rows_remaining_in_row_group =
+ num_rows_in_row_group - current_row_ - row_group_row_offset_;
+
+ if (num_rows_remaining_in_row_group > num_rows_remaining_to_skip) {
+ for (auto reader : column_readers_) {
+ SkipRowsInColumn(reader.get(), num_rows_remaining_to_skip);
+ }
+ current_row_ += num_rows_remaining_to_skip;
+ num_rows_remaining_to_skip = 0;
+ } else {
+ num_rows_remaining_to_skip -= num_rows_remaining_in_row_group;
+ current_row_ += num_rows_remaining_in_row_group;
+ NextRowGroup();
+ }
+ }
+ return num_rows_to_skip - num_rows_remaining_to_skip;
+}
+
+int64_t StreamReader::SkipColumns(int64_t num_columns_to_skip) {
+ int64_t num_columns_skipped = 0;
+
+ if (!eof_) {
+ for (; (num_columns_to_skip > num_columns_skipped) &&
+ static_cast<std::size_t>(column_index_) < nodes_.size();
+ ++column_index_) {
+ SkipRowsInColumn(column_readers_[column_index_].get(), 1);
+ ++num_columns_skipped;
+ }
+ }
+ return num_columns_skipped;
+}
+
+void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip) {
+ int64_t num_skipped = 0;
+
+ switch (reader->type()) {
+ case Type::BOOLEAN:
+ num_skipped = static_cast<BoolReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT32:
+ num_skipped = static_cast<Int32Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT64:
+ num_skipped = static_cast<Int64Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::BYTE_ARRAY:
+ num_skipped = static_cast<ByteArrayReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ num_skipped = static_cast<FixedLenByteArrayReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::FLOAT:
+ num_skipped = static_cast<FloatReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::DOUBLE:
+ num_skipped = static_cast<DoubleReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT96:
+ num_skipped = static_cast<Int96Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::UNDEFINED:
+ throw ParquetException("Unexpected type: " + TypeToString(reader->type()));
+ break;
+ }
+ if (num_rows_to_skip != num_skipped) {
+ throw ParquetException("Skipped " + std::to_string(num_skipped) + "/" +
+ std::to_string(num_rows_to_skip) + " rows in column " +
+ reader->descr()->name());
+ }
+}
+
+void StreamReader::CheckColumn(Type::type physical_type,
+ ConvertedType::type converted_type, int length) {
+ if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
+ if (eof_) {
+ ParquetException::EofException();
+ }
+ throw ParquetException("Column index out-of-bounds. Index " +
+ std::to_string(column_index_) + " is invalid for " +
+ std::to_string(nodes_.size()) + " columns");
+ }
+ const auto& node = nodes_[column_index_];
+
+ if (physical_type != node->physical_type()) {
+ throw ParquetException("Column physical type mismatch. Column '" + node->name() +
+ "' has physical type '" + TypeToString(node->physical_type()) +
+ "' not '" + TypeToString(physical_type) + "'");
+ }
+ if (converted_type != node->converted_type()) {
+ // The converted type does not always match the value
+ // provided, so check the set of exceptions.
+ if (converted_type_exceptions.find({converted_type, node->converted_type()}) ==
+ converted_type_exceptions.end()) {
+ throw ParquetException("Column converted type mismatch. Column '" + node->name() +
+ "' has converted type '" +
+ ConvertedTypeToString(node->converted_type()) + "' not '" +
+ ConvertedTypeToString(converted_type) + "'");
+ }
+ }
+ // Length must be exact.
+ if (length != node->type_length()) {
+ throw ParquetException("Column length mismatch. Column '" + node->name() +
+ "' has length " + std::to_string(node->type_length()) +
+ "] not " + std::to_string(length));
+ }
+} // namespace parquet
+
+void StreamReader::ThrowReadFailedException(
+ const std::shared_ptr<schema::PrimitiveNode>& node) {
+ throw ParquetException("Failed to read value for column '" + node->name() +
+ "' on row " + std::to_string(current_row_));
+}
+
+StreamReader& operator>>(StreamReader& os, EndRowType) {
+ os.EndRow();
+ return os;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
index 806b0e8ad9a..3dfebb27146 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
@@ -1,299 +1,299 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <array>
-#include <chrono>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/util/optional.h"
-#include "parquet/column_reader.h"
-#include "parquet/file_reader.h"
-#include "parquet/stream_writer.h"
-
-namespace parquet {
-
-/// \brief A class for reading Parquet files using an input stream type API.
-///
-/// The values given must be of the correct type, i.e. the type must
-/// match the file schema exactly, otherwise a ParquetException will
-/// be thrown.
-///
-/// The user must explicitly advance to the next row using the
-/// EndRow() function or EndRow input manipulator.
-///
-/// Required and optional fields are supported:
-/// - Required fields are read using operator>>(T)
-/// - Optional fields are read with
-/// operator>>(arrow::util::optional<T>)
-///
-/// Note that operator>>(arrow::util::optional<T>) can be used to read
-/// required fields.
-///
-/// Similarly operator>>(T) can be used to read optional fields.
-/// However, if the value is not present then a ParquetException will
-/// be raised.
-///
-/// Currently there is no support for repeated fields.
-///
-class PARQUET_EXPORT StreamReader {
- public:
- template <typename T>
- using optional = ::arrow::util::optional<T>;
-
- // N.B. Default constructed objects are not usable. This
- // constructor is provided so that the object may be move
- // assigned afterwards.
- StreamReader() = default;
-
- explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
-
- ~StreamReader() = default;
-
- bool eof() const { return eof_; }
-
- int current_column() const { return column_index_; }
-
- int64_t current_row() const { return current_row_; }
-
- int num_columns() const;
-
- int64_t num_rows() const;
-
- // Moving is possible.
- StreamReader(StreamReader&&) = default;
- StreamReader& operator=(StreamReader&&) = default;
-
- // Copying is not allowed.
- StreamReader(const StreamReader&) = delete;
- StreamReader& operator=(const StreamReader&) = delete;
-
- StreamReader& operator>>(bool& v);
-
- StreamReader& operator>>(int8_t& v);
-
- StreamReader& operator>>(uint8_t& v);
-
- StreamReader& operator>>(int16_t& v);
-
- StreamReader& operator>>(uint16_t& v);
-
- StreamReader& operator>>(int32_t& v);
-
- StreamReader& operator>>(uint32_t& v);
-
- StreamReader& operator>>(int64_t& v);
-
- StreamReader& operator>>(uint64_t& v);
-
- StreamReader& operator>>(std::chrono::milliseconds& v);
-
- StreamReader& operator>>(std::chrono::microseconds& v);
-
- StreamReader& operator>>(float& v);
-
- StreamReader& operator>>(double& v);
-
- StreamReader& operator>>(char& v);
-
- template <int N>
- StreamReader& operator>>(char (&v)[N]) {
- ReadFixedLength(v, N);
- return *this;
- }
-
- template <std::size_t N>
- StreamReader& operator>>(std::array<char, N>& v) {
- ReadFixedLength(v.data(), static_cast<int>(N));
- return *this;
- }
-
- // N.B. Cannot allow reading into an arbitrary char pointer as the
- // length cannot be verified. It would also shadow the
- // char[N] input operator.
- // StreamReader& operator>>(char * v);
-
- StreamReader& operator>>(std::string& v);
-
- // Input operators for optional fields.
-
- StreamReader& operator>>(optional<bool>& v);
-
- StreamReader& operator>>(optional<int8_t>& v);
-
- StreamReader& operator>>(optional<uint8_t>& v);
-
- StreamReader& operator>>(optional<int16_t>& v);
-
- StreamReader& operator>>(optional<uint16_t>& v);
-
- StreamReader& operator>>(optional<int32_t>& v);
-
- StreamReader& operator>>(optional<uint32_t>& v);
-
- StreamReader& operator>>(optional<int64_t>& v);
-
- StreamReader& operator>>(optional<uint64_t>& v);
-
- StreamReader& operator>>(optional<float>& v);
-
- StreamReader& operator>>(optional<double>& v);
-
- StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
-
- StreamReader& operator>>(optional<std::chrono::microseconds>& v);
-
- StreamReader& operator>>(optional<char>& v);
-
- StreamReader& operator>>(optional<std::string>& v);
-
- template <std::size_t N>
- StreamReader& operator>>(optional<std::array<char, N>>& v) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
- FixedLenByteArray flba;
- if (ReadOptional(&flba)) {
- v = std::array<char, N>{};
- std::memcpy(v->data(), flba.ptr, N);
- } else {
- v.reset();
- }
- return *this;
- }
-
- /// \brief Terminate the current row and advance to the next one.
- /// \throws ParquetException if not all columns in the row were
- /// read or skipped.
- void EndRow();
-
- /// \brief Skip the data in the next columns.
- /// If the number of columns exceeds the number remaining on the
- /// current row, skipping is terminated; it does _not_ continue
- /// skipping columns on the next row.
- /// Skipping columns still requires the use of 'EndRow' even if all
- /// remaining columns were skipped.
- /// \return Number of columns actually skipped.
- int64_t SkipColumns(int64_t num_columns_to_skip);
-
- /// \brief Skip the data in the next rows.
- /// Skipping of rows is not allowed if reading of data for the
- /// current row is not finished.
- /// Skipping of rows will be terminated if the end of file is
- /// reached.
- /// \return Number of rows actually skipped.
- int64_t SkipRows(int64_t num_rows_to_skip);
-
- protected:
- [[noreturn]] void ThrowReadFailedException(
- const std::shared_ptr<schema::PrimitiveNode>& node);
-
- template <typename ReaderType, typename T>
- void Read(T* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read != 1) {
- ThrowReadFailedException(node);
- }
- }
-
- template <typename ReaderType, typename ReadType, typename T>
- void Read(T* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- ReadType tmp;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
-
- if (values_read == 1) {
- *v = tmp;
- } else {
- ThrowReadFailedException(node);
- }
- }
-
- template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
- void ReadOptional(optional<T>* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- ReadType tmp;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
-
- if (values_read == 1) {
- *v = T(tmp);
- } else if ((values_read == 0) && (def_level == 0)) {
- v->reset();
- } else {
- ThrowReadFailedException(node);
- }
- }
-
- void ReadFixedLength(char* ptr, int len);
-
- void Read(ByteArray* v);
-
- void Read(FixedLenByteArray* v);
-
- bool ReadOptional(ByteArray* v);
-
- bool ReadOptional(FixedLenByteArray* v);
-
- void NextRowGroup();
-
- void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
- int length = 0);
-
- void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
-
- void SetEof();
-
- private:
- std::unique_ptr<ParquetFileReader> file_reader_;
- std::shared_ptr<FileMetaData> file_metadata_;
- std::shared_ptr<RowGroupReader> row_group_reader_;
- std::vector<std::shared_ptr<ColumnReader>> column_readers_;
- std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
-
- bool eof_{true};
- int row_group_index_{0};
- int column_index_{0};
- int64_t current_row_{0};
- int64_t row_group_row_offset_{0};
-
- static constexpr int64_t kBatchSizeOne = 1;
-};
-
-PARQUET_EXPORT
-StreamReader& operator>>(StreamReader&, EndRowType);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/optional.h"
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/stream_writer.h"
+
+namespace parquet {
+
+/// \brief A class for reading Parquet files using an input stream type API.
+///
+/// The values given must be of the correct type, i.e. the type must
+/// match the file schema exactly, otherwise a ParquetException will
+/// be thrown.
+///
+/// The user must explicitly advance to the next row using the
+/// EndRow() function or EndRow input manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are read using operator>>(T)
+/// - Optional fields are read with
+/// operator>>(arrow::util::optional<T>)
+///
+/// Note that operator>>(arrow::util::optional<T>) can be used to read
+/// required fields.
+///
+/// Similarly operator>>(T) can be used to read optional fields.
+/// However, if the value is not present then a ParquetException will
+/// be raised.
+///
+/// Currently there is no support for repeated fields.
+///
+class PARQUET_EXPORT StreamReader {
+ public:
+ template <typename T>
+ using optional = ::arrow::util::optional<T>;
+
+ // N.B. Default constructed objects are not usable. This
+ // constructor is provided so that the object may be move
+ // assigned afterwards.
+ StreamReader() = default;
+
+ explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
+
+ ~StreamReader() = default;
+
+ bool eof() const { return eof_; }
+
+ int current_column() const { return column_index_; }
+
+ int64_t current_row() const { return current_row_; }
+
+ int num_columns() const;
+
+ int64_t num_rows() const;
+
+ // Moving is possible.
+ StreamReader(StreamReader&&) = default;
+ StreamReader& operator=(StreamReader&&) = default;
+
+ // Copying is not allowed.
+ StreamReader(const StreamReader&) = delete;
+ StreamReader& operator=(const StreamReader&) = delete;
+
+ StreamReader& operator>>(bool& v);
+
+ StreamReader& operator>>(int8_t& v);
+
+ StreamReader& operator>>(uint8_t& v);
+
+ StreamReader& operator>>(int16_t& v);
+
+ StreamReader& operator>>(uint16_t& v);
+
+ StreamReader& operator>>(int32_t& v);
+
+ StreamReader& operator>>(uint32_t& v);
+
+ StreamReader& operator>>(int64_t& v);
+
+ StreamReader& operator>>(uint64_t& v);
+
+ StreamReader& operator>>(std::chrono::milliseconds& v);
+
+ StreamReader& operator>>(std::chrono::microseconds& v);
+
+ StreamReader& operator>>(float& v);
+
+ StreamReader& operator>>(double& v);
+
+ StreamReader& operator>>(char& v);
+
+ template <int N>
+ StreamReader& operator>>(char (&v)[N]) {
+ ReadFixedLength(v, N);
+ return *this;
+ }
+
+ template <std::size_t N>
+ StreamReader& operator>>(std::array<char, N>& v) {
+ ReadFixedLength(v.data(), static_cast<int>(N));
+ return *this;
+ }
+
+ // N.B. Cannot allow reading into an arbitrary char pointer as the
+ // length cannot be verified. It would also shadow the
+ // char[N] input operator.
+ // StreamReader& operator>>(char * v);
+
+ StreamReader& operator>>(std::string& v);
+
+ // Input operators for optional fields.
+
+ StreamReader& operator>>(optional<bool>& v);
+
+ StreamReader& operator>>(optional<int8_t>& v);
+
+ StreamReader& operator>>(optional<uint8_t>& v);
+
+ StreamReader& operator>>(optional<int16_t>& v);
+
+ StreamReader& operator>>(optional<uint16_t>& v);
+
+ StreamReader& operator>>(optional<int32_t>& v);
+
+ StreamReader& operator>>(optional<uint32_t>& v);
+
+ StreamReader& operator>>(optional<int64_t>& v);
+
+ StreamReader& operator>>(optional<uint64_t>& v);
+
+ StreamReader& operator>>(optional<float>& v);
+
+ StreamReader& operator>>(optional<double>& v);
+
+ StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
+
+ StreamReader& operator>>(optional<std::chrono::microseconds>& v);
+
+ StreamReader& operator>>(optional<char>& v);
+
+ StreamReader& operator>>(optional<std::string>& v);
+
+ template <std::size_t N>
+ StreamReader& operator>>(optional<std::array<char, N>>& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
+ FixedLenByteArray flba;
+ if (ReadOptional(&flba)) {
+ v = std::array<char, N>{};
+ std::memcpy(v->data(), flba.ptr, N);
+ } else {
+ v.reset();
+ }
+ return *this;
+ }
+
+ /// \brief Terminate the current row and advance to the next one.
+ /// \throws ParquetException if not all columns in the row were
+ /// read or skipped.
+ void EndRow();
+
+ /// \brief Skip the data in the next columns.
+ /// If the number of columns exceeds the number remaining on the
+ /// current row, skipping is terminated; it does _not_ continue
+ /// skipping columns on the next row.
+ /// Skipping columns still requires the use of 'EndRow' even if all
+ /// remaining columns were skipped.
+ /// \return Number of columns actually skipped.
+ int64_t SkipColumns(int64_t num_columns_to_skip);
+
+ /// \brief Skip the data in the next rows.
+ /// Skipping of rows is not allowed if reading of data for the
+ /// current row is not finished.
+ /// Skipping of rows will be terminated if the end of file is
+ /// reached.
+ /// \return Number of rows actually skipped.
+ int64_t SkipRows(int64_t num_rows_to_skip);
+
+ protected:
+ [[noreturn]] void ThrowReadFailedException(
+ const std::shared_ptr<schema::PrimitiveNode>& node);
+
+ template <typename ReaderType, typename T>
+ void Read(T* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+ }
+
+ template <typename ReaderType, typename ReadType, typename T>
+ void Read(T* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ ReadType tmp;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+ if (values_read == 1) {
+ *v = tmp;
+ } else {
+ ThrowReadFailedException(node);
+ }
+ }
+
+ template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
+ void ReadOptional(optional<T>* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ ReadType tmp;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+ if (values_read == 1) {
+ *v = T(tmp);
+ } else if ((values_read == 0) && (def_level == 0)) {
+ v->reset();
+ } else {
+ ThrowReadFailedException(node);
+ }
+ }
+
+ void ReadFixedLength(char* ptr, int len);
+
+ void Read(ByteArray* v);
+
+ void Read(FixedLenByteArray* v);
+
+ bool ReadOptional(ByteArray* v);
+
+ bool ReadOptional(FixedLenByteArray* v);
+
+ void NextRowGroup();
+
+ void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+ int length = 0);
+
+ void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
+
+ void SetEof();
+
+ private:
+ std::unique_ptr<ParquetFileReader> file_reader_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+ std::shared_ptr<RowGroupReader> row_group_reader_;
+ std::vector<std::shared_ptr<ColumnReader>> column_readers_;
+ std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
+
+ bool eof_{true};
+ int row_group_index_{0};
+ int column_index_{0};
+ int64_t current_row_{0};
+ int64_t row_group_row_offset_{0};
+
+ static constexpr int64_t kBatchSizeOne = 1;
+};
+
+PARQUET_EXPORT
+StreamReader& operator>>(StreamReader&, EndRowType);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
index 253ebf1bc91..2ebbd3c5e23 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
@@ -1,324 +1,324 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/stream_writer.h"
-
-#include <utility>
-
-namespace parquet {
-
-int64_t StreamWriter::default_row_group_size_{512 * 1024 * 1024}; // 512MB
-
-constexpr int16_t StreamWriter::kDefLevelZero;
-constexpr int16_t StreamWriter::kDefLevelOne;
-constexpr int16_t StreamWriter::kRepLevelZero;
-constexpr int64_t StreamWriter::kBatchSizeOne;
-
-StreamWriter::FixedStringView::FixedStringView(const char* data_ptr)
- : data{data_ptr}, size{std::strlen(data_ptr)} {}
-
-StreamWriter::FixedStringView::FixedStringView(const char* data_ptr, std::size_t data_len)
- : data{data_ptr}, size{data_len} {}
-
-StreamWriter::StreamWriter(std::unique_ptr<ParquetFileWriter> writer)
- : file_writer_{std::move(writer)},
- row_group_writer_{file_writer_->AppendBufferedRowGroup()} {
- auto schema = file_writer_->schema();
- auto group_node = schema->group_node();
-
- nodes_.resize(schema->num_columns());
-
- for (auto i = 0; i < schema->num_columns(); ++i) {
- nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
- }
-}
-
-void StreamWriter::SetDefaultMaxRowGroupSize(int64_t max_size) {
- default_row_group_size_ = max_size;
-}
-
-void StreamWriter::SetMaxRowGroupSize(int64_t max_size) {
- max_row_group_size_ = max_size;
-}
-
-int StreamWriter::num_columns() const { return static_cast<int>(nodes_.size()); }
-
-StreamWriter& StreamWriter::operator<<(bool v) {
- CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
- return Write<BoolWriter>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(int8_t v) {
- CheckColumn(Type::INT32, ConvertedType::INT_8);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(uint8_t v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_8);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(int16_t v) {
- CheckColumn(Type::INT32, ConvertedType::INT_16);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(uint16_t v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_16);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(int32_t v) {
- CheckColumn(Type::INT32, ConvertedType::INT_32);
- return Write<Int32Writer>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(uint32_t v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_32);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(int64_t v) {
- CheckColumn(Type::INT64, ConvertedType::INT_64);
- return Write<Int64Writer>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(uint64_t v) {
- CheckColumn(Type::INT64, ConvertedType::UINT_64);
- return Write<Int64Writer>(static_cast<int64_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(const std::chrono::milliseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
- return Write<Int64Writer>(static_cast<int64_t>(v.count()));
-}
-
-StreamWriter& StreamWriter::operator<<(const std::chrono::microseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
- return Write<Int64Writer>(static_cast<int64_t>(v.count()));
-}
-
-StreamWriter& StreamWriter::operator<<(float v) {
- CheckColumn(Type::FLOAT, ConvertedType::NONE);
- return Write<FloatWriter>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(double v) {
- CheckColumn(Type::DOUBLE, ConvertedType::NONE);
- return Write<DoubleWriter>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(char v) { return WriteFixedLength(&v, 1); }
-
-StreamWriter& StreamWriter::operator<<(FixedStringView v) {
- return WriteFixedLength(v.data, v.size);
-}
-
-StreamWriter& StreamWriter::operator<<(const char* v) {
- return WriteVariableLength(v, std::strlen(v));
-}
-
-StreamWriter& StreamWriter::operator<<(const std::string& v) {
- return WriteVariableLength(v.data(), v.size());
-}
-
-StreamWriter& StreamWriter::operator<<(::arrow::util::string_view v) {
- return WriteVariableLength(v.data(), v.size());
-}
-
-StreamWriter& StreamWriter::WriteVariableLength(const char* data_ptr,
- std::size_t data_len) {
- CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
-
- auto writer = static_cast<ByteArrayWriter*>(row_group_writer_->column(column_index_++));
-
- if (data_ptr != nullptr) {
- ByteArray ba_value;
-
- ba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
- ba_value.len = static_cast<uint32_t>(data_len);
-
- writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &ba_value);
- } else {
- writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
- }
- if (max_row_group_size_ > 0) {
- row_group_size_ += writer->EstimatedBufferedValueBytes();
- }
- return *this;
-}
-
-StreamWriter& StreamWriter::WriteFixedLength(const char* data_ptr, std::size_t data_len) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE,
- static_cast<int>(data_len));
-
- auto writer =
- static_cast<FixedLenByteArrayWriter*>(row_group_writer_->column(column_index_++));
-
- if (data_ptr != nullptr) {
- FixedLenByteArray flba_value;
-
- flba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
- writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &flba_value);
- } else {
- writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
- }
- if (max_row_group_size_ > 0) {
- row_group_size_ += writer->EstimatedBufferedValueBytes();
- }
- return *this;
-}
-
-void StreamWriter::CheckColumn(Type::type physical_type,
- ConvertedType::type converted_type, int length) {
- if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
- throw ParquetException("Column index out-of-bounds. Index " +
- std::to_string(column_index_) + " is invalid for " +
- std::to_string(nodes_.size()) + " columns");
- }
- const auto& node = nodes_[column_index_];
-
- if (physical_type != node->physical_type()) {
- throw ParquetException("Column physical type mismatch. Column '" + node->name() +
- "' has physical type '" + TypeToString(node->physical_type()) +
- "' not '" + TypeToString(physical_type) + "'");
- }
- if (converted_type != node->converted_type()) {
- throw ParquetException("Column converted type mismatch. Column '" + node->name() +
- "' has converted type[" +
- ConvertedTypeToString(node->converted_type()) + "] not '" +
- ConvertedTypeToString(converted_type) + "'");
- }
- // Length must be exact.
- // A shorter fixed-length array is not acceptable as it would
- // result in out-of-bounds reads.
- //
- if (length != node->type_length()) {
- throw ParquetException("Column length mismatch. Column '" + node->name() +
- "' has length " + std::to_string(node->type_length()) +
- " not " + std::to_string(length));
- }
-}
-
-int64_t StreamWriter::SkipColumns(int num_columns_to_skip) {
- int num_columns_skipped = 0;
-
- for (; (num_columns_to_skip > num_columns_skipped) &&
- static_cast<std::size_t>(column_index_) < nodes_.size();
- ++num_columns_skipped) {
- const auto& node = nodes_[column_index_];
-
- if (node->is_required()) {
- throw ParquetException("Cannot skip column '" + node->name() +
- "' as it is required.");
- }
- auto writer = row_group_writer_->column(column_index_++);
-
- WriteNullValue(writer);
- }
- return num_columns_skipped;
-}
-
-void StreamWriter::WriteNullValue(ColumnWriter* writer) {
- switch (writer->type()) {
- case Type::BOOLEAN:
- static_cast<BoolWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::INT32:
- static_cast<Int32Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::INT64:
- static_cast<Int64Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::BYTE_ARRAY:
- static_cast<ByteArrayWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- static_cast<FixedLenByteArrayWriter*>(writer)->WriteBatch(
- kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
- break;
- case Type::FLOAT:
- static_cast<FloatWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::DOUBLE:
- static_cast<DoubleWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::INT96:
- case Type::UNDEFINED:
- throw ParquetException("Unexpected type: " + TypeToString(writer->type()));
- break;
- }
-}
-
-void StreamWriter::SkipOptionalColumn() {
- if (SkipColumns(1) != 1) {
- throw ParquetException("Failed to skip optional column at column index " +
- std::to_string(column_index_));
- }
-}
-
-void StreamWriter::EndRow() {
- if (!file_writer_) {
- throw ParquetException("StreamWriter not initialized");
- }
- if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
- throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
- " of " + std::to_string(nodes_.size()) + " columns written");
- }
- column_index_ = 0;
- ++current_row_;
-
- if (max_row_group_size_ > 0) {
- if (row_group_size_ > max_row_group_size_) {
- EndRowGroup();
- }
- // Initialize for each row with size already written
- // (compressed + uncompressed).
- //
- row_group_size_ = row_group_writer_->total_bytes_written() +
- row_group_writer_->total_compressed_bytes();
- }
-}
-
-void StreamWriter::EndRowGroup() {
- if (!file_writer_) {
- throw ParquetException("StreamWriter not initialized");
- }
- // Avoid creating empty row groups.
- if (row_group_writer_->num_rows() > 0) {
- row_group_writer_->Close();
- row_group_writer_.reset(file_writer_->AppendBufferedRowGroup());
- }
-}
-
-StreamWriter& operator<<(StreamWriter& os, EndRowType) {
- os.EndRow();
- return os;
-}
-
-StreamWriter& operator<<(StreamWriter& os, EndRowGroupType) {
- os.EndRowGroup();
- return os;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/stream_writer.h"
+
+#include <utility>
+
+namespace parquet {
+
+int64_t StreamWriter::default_row_group_size_{512 * 1024 * 1024}; // 512MB
+
+constexpr int16_t StreamWriter::kDefLevelZero;
+constexpr int16_t StreamWriter::kDefLevelOne;
+constexpr int16_t StreamWriter::kRepLevelZero;
+constexpr int64_t StreamWriter::kBatchSizeOne;
+
+StreamWriter::FixedStringView::FixedStringView(const char* data_ptr)
+ : data{data_ptr}, size{std::strlen(data_ptr)} {}
+
+StreamWriter::FixedStringView::FixedStringView(const char* data_ptr, std::size_t data_len)
+ : data{data_ptr}, size{data_len} {}
+
+StreamWriter::StreamWriter(std::unique_ptr<ParquetFileWriter> writer)
+ : file_writer_{std::move(writer)},
+ row_group_writer_{file_writer_->AppendBufferedRowGroup()} {
+ auto schema = file_writer_->schema();
+ auto group_node = schema->group_node();
+
+ nodes_.resize(schema->num_columns());
+
+ for (auto i = 0; i < schema->num_columns(); ++i) {
+ nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
+ }
+}
+
+void StreamWriter::SetDefaultMaxRowGroupSize(int64_t max_size) {
+ default_row_group_size_ = max_size;
+}
+
+void StreamWriter::SetMaxRowGroupSize(int64_t max_size) {
+ max_row_group_size_ = max_size;
+}
+
+int StreamWriter::num_columns() const { return static_cast<int>(nodes_.size()); }
+
+StreamWriter& StreamWriter::operator<<(bool v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ return Write<BoolWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(int8_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(uint8_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int16_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(uint16_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int32_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ return Write<Int32Writer>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(uint32_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int64_t v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ return Write<Int64Writer>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(uint64_t v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ return Write<Int64Writer>(static_cast<int64_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::chrono::milliseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ return Write<Int64Writer>(static_cast<int64_t>(v.count()));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::chrono::microseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ return Write<Int64Writer>(static_cast<int64_t>(v.count()));
+}
+
+StreamWriter& StreamWriter::operator<<(float v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ return Write<FloatWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(double v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ return Write<DoubleWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(char v) { return WriteFixedLength(&v, 1); }
+
+StreamWriter& StreamWriter::operator<<(FixedStringView v) {
+ return WriteFixedLength(v.data, v.size);
+}
+
+StreamWriter& StreamWriter::operator<<(const char* v) {
+ return WriteVariableLength(v, std::strlen(v));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::string& v) {
+ return WriteVariableLength(v.data(), v.size());
+}
+
+StreamWriter& StreamWriter::operator<<(::arrow::util::string_view v) {
+ return WriteVariableLength(v.data(), v.size());
+}
+
+StreamWriter& StreamWriter::WriteVariableLength(const char* data_ptr,
+ std::size_t data_len) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+
+ auto writer = static_cast<ByteArrayWriter*>(row_group_writer_->column(column_index_++));
+
+ if (data_ptr != nullptr) {
+ ByteArray ba_value;
+
+ ba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
+ ba_value.len = static_cast<uint32_t>(data_len);
+
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &ba_value);
+ } else {
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ }
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+}
+
+StreamWriter& StreamWriter::WriteFixedLength(const char* data_ptr, std::size_t data_len) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE,
+ static_cast<int>(data_len));
+
+ auto writer =
+ static_cast<FixedLenByteArrayWriter*>(row_group_writer_->column(column_index_++));
+
+ if (data_ptr != nullptr) {
+ FixedLenByteArray flba_value;
+
+ flba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &flba_value);
+ } else {
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ }
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+}
+
+void StreamWriter::CheckColumn(Type::type physical_type,
+ ConvertedType::type converted_type, int length) {
+ if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
+ throw ParquetException("Column index out-of-bounds. Index " +
+ std::to_string(column_index_) + " is invalid for " +
+ std::to_string(nodes_.size()) + " columns");
+ }
+ const auto& node = nodes_[column_index_];
+
+ if (physical_type != node->physical_type()) {
+ throw ParquetException("Column physical type mismatch. Column '" + node->name() +
+ "' has physical type '" + TypeToString(node->physical_type()) +
+ "' not '" + TypeToString(physical_type) + "'");
+ }
+ if (converted_type != node->converted_type()) {
+ throw ParquetException("Column converted type mismatch. Column '" + node->name() +
+ "' has converted type[" +
+ ConvertedTypeToString(node->converted_type()) + "] not '" +
+ ConvertedTypeToString(converted_type) + "'");
+ }
+  // The length must match exactly.
+  // A shorter fixed-length array is not acceptable as it would
+  // result in out-of-bounds reads.
+  //
+ if (length != node->type_length()) {
+ throw ParquetException("Column length mismatch. Column '" + node->name() +
+ "' has length " + std::to_string(node->type_length()) +
+ " not " + std::to_string(length));
+ }
+}
+
+int64_t StreamWriter::SkipColumns(int num_columns_to_skip) {
+ int num_columns_skipped = 0;
+
+ for (; (num_columns_to_skip > num_columns_skipped) &&
+ static_cast<std::size_t>(column_index_) < nodes_.size();
+ ++num_columns_skipped) {
+ const auto& node = nodes_[column_index_];
+
+ if (node->is_required()) {
+ throw ParquetException("Cannot skip column '" + node->name() +
+ "' as it is required.");
+ }
+ auto writer = row_group_writer_->column(column_index_++);
+
+ WriteNullValue(writer);
+ }
+ return num_columns_skipped;
+}
+
+void StreamWriter::WriteNullValue(ColumnWriter* writer) {
+ switch (writer->type()) {
+ case Type::BOOLEAN:
+ static_cast<BoolWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT32:
+ static_cast<Int32Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT64:
+ static_cast<Int64Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::BYTE_ARRAY:
+ static_cast<ByteArrayWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ static_cast<FixedLenByteArrayWriter*>(writer)->WriteBatch(
+ kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ break;
+ case Type::FLOAT:
+ static_cast<FloatWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::DOUBLE:
+ static_cast<DoubleWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT96:
+ case Type::UNDEFINED:
+ throw ParquetException("Unexpected type: " + TypeToString(writer->type()));
+ break;
+ }
+}
+
+void StreamWriter::SkipOptionalColumn() {
+ if (SkipColumns(1) != 1) {
+ throw ParquetException("Failed to skip optional column at column index " +
+ std::to_string(column_index_));
+ }
+}
+
+void StreamWriter::EndRow() {
+ if (!file_writer_) {
+ throw ParquetException("StreamWriter not initialized");
+ }
+ if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
+ throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
+ " of " + std::to_string(nodes_.size()) + " columns written");
+ }
+ column_index_ = 0;
+ ++current_row_;
+
+ if (max_row_group_size_ > 0) {
+ if (row_group_size_ > max_row_group_size_) {
+ EndRowGroup();
+ }
+    // Re-initialize for each row with the size already written
+    // (compressed + uncompressed).
+    //
+ row_group_size_ = row_group_writer_->total_bytes_written() +
+ row_group_writer_->total_compressed_bytes();
+ }
+}
+
+void StreamWriter::EndRowGroup() {
+ if (!file_writer_) {
+ throw ParquetException("StreamWriter not initialized");
+ }
+ // Avoid creating empty row groups.
+ if (row_group_writer_->num_rows() > 0) {
+ row_group_writer_->Close();
+ row_group_writer_.reset(file_writer_->AppendBufferedRowGroup());
+ }
+}
+
+StreamWriter& operator<<(StreamWriter& os, EndRowType) {
+ os.EndRow();
+ return os;
+}
+
+StreamWriter& operator<<(StreamWriter& os, EndRowGroupType) {
+ os.EndRowGroup();
+ return os;
+}
+
+} // namespace parquet
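
All of the writers above funnel one value at a time through ColumnWriter::WriteBatch()
with a definition level that encodes presence: kDefLevelOne for a real value,
kDefLevelZero for a null, and kRepLevelZero throughout because the stream API only
supports flat (non-repeated) schemas. A freestanding sketch of that convention (the
helper name is hypothetical, not part of this diff):

    // Write one cell of an OPTIONAL, non-repeated INT32 column.
    // nullptr encodes a null; definition level 1 means "present", 0 means
    // "null"; the repetition level is always 0 for a flat schema.
    void WriteOptionalInt32(parquet::Int32Writer* writer, const int32_t* value) {
      constexpr int64_t kBatchSizeOne = 1;
      constexpr int16_t kRepLevelZero = 0;
      const int16_t def_level = (value != nullptr) ? 1 : 0;
      writer->WriteBatch(kBatchSizeOne, &def_level, &kRepLevelZero, value);
    }
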
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
index d0db850c341..ebd9a278a2b 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
@@ -1,243 +1,243 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <array>
-#include <chrono>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/util/optional.h"
-#include "arrow/util/string_view.h"
-#include "parquet/column_writer.h"
-#include "parquet/file_writer.h"
-
-namespace parquet {
-
-/// \brief A class for writing Parquet files using an output stream type API.
-///
-/// The values given must be of the correct type, i.e. the type must
-/// match the file schema exactly; otherwise a ParquetException will
-/// be thrown.
-///
-/// The user must explicitly indicate the end of the row using the
-/// EndRow() function or EndRow output manipulator.
-///
-/// A maximum row group size can be configured; the default is
-/// 512MB. Alternatively, the row group size can be set to zero and
-/// the user can create new row groups by calling the EndRowGroup()
-/// function or using the EndRowGroup output manipulator.
-///
-/// Required and optional fields are supported:
-/// - Required fields are written using operator<<(T)
-/// - Optional fields are written using
-/// operator<<(arrow::util::optional<T>).
-///
-/// Note that operator<<(T) can be used to write optional fields.
-///
-/// Similarly, operator<<(arrow::util::optional<T>) can be used to
-/// write required fields. However, if the optional parameter does
-/// not have a value (i.e. it is nullopt), a ParquetException will be
-/// raised.
-///
-/// Currently there is no support for repeated fields.
-///
-class PARQUET_EXPORT StreamWriter {
- public:
- template <typename T>
- using optional = ::arrow::util::optional<T>;
-
-  // N.B. Default-constructed objects are not usable. This constructor
-  // is provided so that the object may be move-assigned afterwards.
- StreamWriter() = default;
-
- explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
-
- ~StreamWriter() = default;
-
- static void SetDefaultMaxRowGroupSize(int64_t max_size);
-
- void SetMaxRowGroupSize(int64_t max_size);
-
- int current_column() const { return column_index_; }
-
- int64_t current_row() const { return current_row_; }
-
- int num_columns() const;
-
- // Moving is possible.
- StreamWriter(StreamWriter&&) = default;
- StreamWriter& operator=(StreamWriter&&) = default;
-
- // Copying is not allowed.
- StreamWriter(const StreamWriter&) = delete;
- StreamWriter& operator=(const StreamWriter&) = delete;
-
- /// \brief Output operators for required fields.
- /// These can also be used for optional fields when a value must be set.
- StreamWriter& operator<<(bool v);
-
- StreamWriter& operator<<(int8_t v);
-
- StreamWriter& operator<<(uint8_t v);
-
- StreamWriter& operator<<(int16_t v);
-
- StreamWriter& operator<<(uint16_t v);
-
- StreamWriter& operator<<(int32_t v);
-
- StreamWriter& operator<<(uint32_t v);
-
- StreamWriter& operator<<(int64_t v);
-
- StreamWriter& operator<<(uint64_t v);
-
- StreamWriter& operator<<(const std::chrono::milliseconds& v);
-
- StreamWriter& operator<<(const std::chrono::microseconds& v);
-
- StreamWriter& operator<<(float v);
-
- StreamWriter& operator<<(double v);
-
- StreamWriter& operator<<(char v);
-
- /// \brief Helper class to write fixed length strings.
- /// This is useful as the standard string view (such as
- /// arrow::util::string_view) is for variable length data.
- struct PARQUET_EXPORT FixedStringView {
- FixedStringView() = default;
-
- explicit FixedStringView(const char* data_ptr);
-
- FixedStringView(const char* data_ptr, std::size_t data_len);
-
- const char* data{NULLPTR};
- std::size_t size{0};
- };
-
- /// \brief Output operators for fixed length strings.
- template <int N>
- StreamWriter& operator<<(const char (&v)[N]) {
- return WriteFixedLength(v, N);
- }
- template <std::size_t N>
- StreamWriter& operator<<(const std::array<char, N>& v) {
- return WriteFixedLength(v.data(), N);
- }
- StreamWriter& operator<<(FixedStringView v);
-
- /// \brief Output operators for variable length strings.
- StreamWriter& operator<<(const char* v);
- StreamWriter& operator<<(const std::string& v);
- StreamWriter& operator<<(::arrow::util::string_view v);
-
- /// \brief Output operator for optional fields.
- template <typename T>
- StreamWriter& operator<<(const optional<T>& v) {
- if (v) {
- return operator<<(*v);
- }
- SkipOptionalColumn();
- return *this;
- }
-
-  /// \brief Skip the next N columns of optional data. If there are
-  /// fewer than N columns remaining, the excess columns are ignored.
- /// \throws ParquetException if there is an attempt to skip any
- /// required column.
- /// \return Number of columns actually skipped.
- int64_t SkipColumns(int num_columns_to_skip);
-
-  /// \brief Terminate the current row and advance to the next one.
-  /// \throws ParquetException if not all columns in the row were
-  /// written or skipped.
- void EndRow();
-
-  /// \brief Terminate the current row group and create a new one.
- void EndRowGroup();
-
- protected:
- template <typename WriterType, typename T>
- StreamWriter& Write(const T v) {
- auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
-
- writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
-
- if (max_row_group_size_ > 0) {
- row_group_size_ += writer->EstimatedBufferedValueBytes();
- }
- return *this;
- }
-
- StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len);
-
- StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
-
- void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
- int length = -1);
-
- /// \brief Skip the next column which must be optional.
- /// \throws ParquetException if the next column does not exist or is
- /// not optional.
- void SkipOptionalColumn();
-
- void WriteNullValue(ColumnWriter* writer);
-
- private:
- using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
-
- struct null_deleter {
- void operator()(void*) {}
- };
-
- int32_t column_index_{0};
- int64_t current_row_{0};
- int64_t row_group_size_{0};
- int64_t max_row_group_size_{default_row_group_size_};
-
- std::unique_ptr<ParquetFileWriter> file_writer_;
- std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
- std::vector<node_ptr_type> nodes_;
-
- static constexpr int16_t kDefLevelZero = 0;
- static constexpr int16_t kDefLevelOne = 1;
- static constexpr int16_t kRepLevelZero = 0;
- static constexpr int64_t kBatchSizeOne = 1;
-
- static int64_t default_row_group_size_;
-};
-
-struct PARQUET_EXPORT EndRowType {};
-constexpr EndRowType EndRow = {};
-
-struct PARQUET_EXPORT EndRowGroupType {};
-constexpr EndRowGroupType EndRowGroup = {};
-
-PARQUET_EXPORT
-StreamWriter& operator<<(StreamWriter&, EndRowType);
-
-PARQUET_EXPORT
-StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/optional.h"
+#include "arrow/util/string_view.h"
+#include "parquet/column_writer.h"
+#include "parquet/file_writer.h"
+
+namespace parquet {
+
+/// \brief A class for writing Parquet files using an output stream type API.
+///
+/// The values given must be of the correct type, i.e. the type must
+/// match the file schema exactly; otherwise a ParquetException will
+/// be thrown.
+///
+/// The user must explicitly indicate the end of the row using the
+/// EndRow() function or EndRow output manipulator.
+///
+/// A maximum row group size can be configured; the default is
+/// 512MB. Alternatively, the row group size can be set to zero and
+/// the user can create new row groups by calling the EndRowGroup()
+/// function or using the EndRowGroup output manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are written using operator<<(T)
+/// - Optional fields are written using
+/// operator<<(arrow::util::optional<T>).
+///
+/// Note that operator<<(T) can be used to write optional fields.
+///
+/// Similarly, operator<<(arrow::util::optional<T>) can be used to
+/// write required fields. However, if the optional parameter does
+/// not have a value (i.e. it is nullopt), a ParquetException will be
+/// raised.
+///
+/// Currently there is no support for repeated fields.
+///
+class PARQUET_EXPORT StreamWriter {
+ public:
+ template <typename T>
+ using optional = ::arrow::util::optional<T>;
+
+  // N.B. Default-constructed objects are not usable. This constructor
+  // is provided so that the object may be move-assigned afterwards.
+ StreamWriter() = default;
+
+ explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
+
+ ~StreamWriter() = default;
+
+ static void SetDefaultMaxRowGroupSize(int64_t max_size);
+
+ void SetMaxRowGroupSize(int64_t max_size);
+
+ int current_column() const { return column_index_; }
+
+ int64_t current_row() const { return current_row_; }
+
+ int num_columns() const;
+
+ // Moving is possible.
+ StreamWriter(StreamWriter&&) = default;
+ StreamWriter& operator=(StreamWriter&&) = default;
+
+ // Copying is not allowed.
+ StreamWriter(const StreamWriter&) = delete;
+ StreamWriter& operator=(const StreamWriter&) = delete;
+
+ /// \brief Output operators for required fields.
+ /// These can also be used for optional fields when a value must be set.
+ StreamWriter& operator<<(bool v);
+
+ StreamWriter& operator<<(int8_t v);
+
+ StreamWriter& operator<<(uint8_t v);
+
+ StreamWriter& operator<<(int16_t v);
+
+ StreamWriter& operator<<(uint16_t v);
+
+ StreamWriter& operator<<(int32_t v);
+
+ StreamWriter& operator<<(uint32_t v);
+
+ StreamWriter& operator<<(int64_t v);
+
+ StreamWriter& operator<<(uint64_t v);
+
+ StreamWriter& operator<<(const std::chrono::milliseconds& v);
+
+ StreamWriter& operator<<(const std::chrono::microseconds& v);
+
+ StreamWriter& operator<<(float v);
+
+ StreamWriter& operator<<(double v);
+
+ StreamWriter& operator<<(char v);
+
+ /// \brief Helper class to write fixed length strings.
+ /// This is useful as the standard string view (such as
+ /// arrow::util::string_view) is for variable length data.
+ struct PARQUET_EXPORT FixedStringView {
+ FixedStringView() = default;
+
+ explicit FixedStringView(const char* data_ptr);
+
+ FixedStringView(const char* data_ptr, std::size_t data_len);
+
+ const char* data{NULLPTR};
+ std::size_t size{0};
+ };
+
+ /// \brief Output operators for fixed length strings.
+ template <int N>
+ StreamWriter& operator<<(const char (&v)[N]) {
+ return WriteFixedLength(v, N);
+ }
+ template <std::size_t N>
+ StreamWriter& operator<<(const std::array<char, N>& v) {
+ return WriteFixedLength(v.data(), N);
+ }
+ StreamWriter& operator<<(FixedStringView v);
+
+ /// \brief Output operators for variable length strings.
+ StreamWriter& operator<<(const char* v);
+ StreamWriter& operator<<(const std::string& v);
+ StreamWriter& operator<<(::arrow::util::string_view v);
+
+ /// \brief Output operator for optional fields.
+ template <typename T>
+ StreamWriter& operator<<(const optional<T>& v) {
+ if (v) {
+ return operator<<(*v);
+ }
+ SkipOptionalColumn();
+ return *this;
+ }
+
+  /// \brief Skip the next N columns of optional data. If there are
+  /// fewer than N columns remaining, the excess columns are ignored.
+ /// \throws ParquetException if there is an attempt to skip any
+ /// required column.
+ /// \return Number of columns actually skipped.
+ int64_t SkipColumns(int num_columns_to_skip);
+
+  /// \brief Terminate the current row and advance to the next one.
+  /// \throws ParquetException if not all columns in the row were
+  /// written or skipped.
+ void EndRow();
+
+  /// \brief Terminate the current row group and create a new one.
+ void EndRowGroup();
+
+ protected:
+ template <typename WriterType, typename T>
+ StreamWriter& Write(const T v) {
+ auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
+
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
+
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+ }
+
+ StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len);
+
+ StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
+
+ void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+ int length = -1);
+
+ /// \brief Skip the next column which must be optional.
+ /// \throws ParquetException if the next column does not exist or is
+ /// not optional.
+ void SkipOptionalColumn();
+
+ void WriteNullValue(ColumnWriter* writer);
+
+ private:
+ using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
+
+ struct null_deleter {
+ void operator()(void*) {}
+ };
+
+ int32_t column_index_{0};
+ int64_t current_row_{0};
+ int64_t row_group_size_{0};
+ int64_t max_row_group_size_{default_row_group_size_};
+
+ std::unique_ptr<ParquetFileWriter> file_writer_;
+ std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
+ std::vector<node_ptr_type> nodes_;
+
+ static constexpr int16_t kDefLevelZero = 0;
+ static constexpr int16_t kDefLevelOne = 1;
+ static constexpr int16_t kRepLevelZero = 0;
+ static constexpr int64_t kBatchSizeOne = 1;
+
+ static int64_t default_row_group_size_;
+};
+
+struct PARQUET_EXPORT EndRowType {};
+constexpr EndRowType EndRow = {};
+
+struct PARQUET_EXPORT EndRowGroupType {};
+constexpr EndRowGroupType EndRowGroup = {};
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowType);
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
+
+} // namespace parquet
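
The header above amounts to a manipulator-driven API in the spirit of std::ostream.
A minimal end-to-end sketch of using it; the file name, column names and values
below are illustrative only, not part of this diff:

    #include "arrow/io/file.h"
    #include "parquet/exception.h"
    #include "parquet/stream_writer.h"

    int main() {
      // Two columns: a required INT32 and an optional UTF8 string.
      parquet::schema::NodeVector fields;
      fields.push_back(parquet::schema::PrimitiveNode::Make(
          "id", parquet::Repetition::REQUIRED, parquet::Type::INT32,
          parquet::ConvertedType::INT_32));
      fields.push_back(parquet::schema::PrimitiveNode::Make(
          "name", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY,
          parquet::ConvertedType::UTF8));
      auto schema = std::static_pointer_cast<parquet::schema::GroupNode>(
          parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED,
                                           fields));

      std::shared_ptr<arrow::io::FileOutputStream> outfile;
      PARQUET_ASSIGN_OR_THROW(outfile,
                              arrow::io::FileOutputStream::Open("example.parquet"));
      parquet::StreamWriter os{parquet::ParquetFileWriter::Open(outfile, schema)};

      // Use std::string (not a bare literal) for the UTF8 column: a literal
      // would bind to the fixed-length char-array operator<< above.
      os << 1 << std::string("alice") << parquet::EndRow;
      os << 2 << arrow::util::optional<std::string>{} << parquet::EndRow;  // null
      os << parquet::EndRowGroup;
      return 0;
    }

Per the class comment, EndRow is mandatory for every row, while EndRowGroup is only
needed when the automatic size-based splitting is disabled.
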
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map b/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
index 4bf032dd584..9df019e5fcd 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
@@ -1,40 +1,40 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-{
- # Symbols marked as 'local' are not exported by the DSO and thus may not
- # be used by client applications.
- local:
- # devtoolset / static-libstdc++ symbols
- __cxa_*;
- __once_proxy;
-
- extern "C++" {
- # boost
- boost::*;
-
- # thrift
- apache::thrift::*;
-
- # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically
- # links c++11 symbols into binaries so that the result may be executed on
- # a system with an older libstdc++ which doesn't include the necessary
- # c++11 symbols.
- std::*;
- *std::__once_call*;
- };
-};
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+{
+ # Symbols marked as 'local' are not exported by the DSO and thus may not
+ # be used by client applications.
+ local:
+ # devtoolset / static-libstdc++ symbols
+ __cxa_*;
+ __once_proxy;
+
+ extern "C++" {
+ # boost
+ boost::*;
+
+ # thrift
+ apache::thrift::*;
+
+ # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically
+ # links c++11 symbols into binaries so that the result may be executed on
+ # a system with an older libstdc++ which doesn't include the necessary
+ # c++11 symbols.
+ std::*;
+ *std::__once_call*;
+ };
+};
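
For context: a version script like this takes effect at link time (with the GNU
toolchain, roughly -Wl,--version-script=symbols.map on the link line of the parquet
shared library). Symbols matching the 'local:' patterns are omitted from the DSO's
dynamic symbol table, so the statically linked libstdc++, boost and thrift symbols
listed above cannot leak out and collide with different copies in client
applications.
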
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
index ea7df209621..443d948e30a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
@@ -1,494 +1,494 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/util/windows_compatibility.h"
-
-#include <cstdint>
-// Check if thrift version < 0.11.0
-// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
-#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
-#include <boost/shared_ptr.hpp>
-#else
-#include <memory>
-#endif
-#include <string>
-#include <vector>
-
-// TCompactProtocol requires some #defines to work right.
-#define SIGNED_RIGHT_SHIFT_IS 1
-#define ARITHMETIC_RIGHT_SHIFT 1
-#include <thrift/TApplicationException.h>
-#include <thrift/protocol/TCompactProtocol.h>
-#include <thrift/protocol/TDebugProtocol.h>
-
-#include <thrift/protocol/TBinaryProtocol.h>
-#include <thrift/transport/TBufferTransports.h>
-#include <sstream>
-
-#include "arrow/util/logging.h"
-
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-
-#include "generated/parquet_types.h"  // IWYU pragma: export
-
-namespace parquet {
-
-// Check if thrift version < 0.11.0
-// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
-#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
-using ::boost::shared_ptr;
-#else
-using ::std::shared_ptr;
-#endif
-
-// ----------------------------------------------------------------------
-// Convert Thrift enums to Parquet enums
-
-// Unsafe enum converters (input is not checked for validity)
-
-static inline Type::type FromThriftUnsafe(format::Type::type type) {
- return static_cast<Type::type>(type);
-}
-
-static inline ConvertedType::type FromThriftUnsafe(format::ConvertedType::type type) {
- // item 0 is NONE
- return static_cast<ConvertedType::type>(static_cast<int>(type) + 1);
-}
-
-static inline Repetition::type FromThriftUnsafe(format::FieldRepetitionType::type type) {
- return static_cast<Repetition::type>(type);
-}
-
-static inline Encoding::type FromThriftUnsafe(format::Encoding::type type) {
- return static_cast<Encoding::type>(type);
-}
-
-static inline PageType::type FromThriftUnsafe(format::PageType::type type) {
- return static_cast<PageType::type>(type);
-}
-
-static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type type) {
- switch (type) {
- case format::CompressionCodec::UNCOMPRESSED:
- return Compression::UNCOMPRESSED;
- case format::CompressionCodec::SNAPPY:
- return Compression::SNAPPY;
- case format::CompressionCodec::GZIP:
- return Compression::GZIP;
- case format::CompressionCodec::LZO:
- return Compression::LZO;
- case format::CompressionCodec::BROTLI:
- return Compression::BROTLI;
- case format::CompressionCodec::LZ4:
- return Compression::LZ4_HADOOP;
- case format::CompressionCodec::LZ4_RAW:
- return Compression::LZ4;
- case format::CompressionCodec::ZSTD:
- return Compression::ZSTD;
- default:
- DCHECK(false) << "Cannot reach here";
- return Compression::UNCOMPRESSED;
- }
-}
-
-namespace internal {
-
-template <typename T>
-struct ThriftEnumTypeTraits {};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::Type::type> {
- using ParquetEnum = Type;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::ConvertedType::type> {
- using ParquetEnum = ConvertedType;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::FieldRepetitionType::type> {
- using ParquetEnum = Repetition;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::Encoding::type> {
- using ParquetEnum = Encoding;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::PageType::type> {
- using ParquetEnum = PageType;
-};
-
-// If the parquet file is corrupted, the decoded enum value may fall outside
-// the range of defined values, which is undefined behaviour. This facility
-// prevents that by loading the value as the underlying integer type and
-// checking that it is in range.
-
-template <typename EnumType,
- typename EnumTypeRaw = typename std::underlying_type<EnumType>::type>
-inline static EnumTypeRaw LoadEnumRaw(const EnumType* in) {
- EnumTypeRaw raw_value;
- // Use memcpy(), as a regular cast would be undefined behaviour on invalid values
- memcpy(&raw_value, in, sizeof(EnumType));
- return raw_value;
-}
-
-template <typename ApiType>
-struct SafeLoader {
- using ApiTypeEnum = typename ApiType::type;
- using ApiTypeRawEnum = typename std::underlying_type<ApiTypeEnum>::type;
-
- template <typename ThriftType>
- inline static ApiTypeRawEnum LoadRaw(const ThriftType* in) {
- static_assert(sizeof(ApiTypeEnum) == sizeof(ThriftType),
- "parquet type should always be the same size as thrift type");
- return static_cast<ApiTypeRawEnum>(LoadEnumRaw(in));
- }
-
- template <typename ThriftType, bool IsUnsigned = true>
- inline static ApiTypeEnum LoadChecked(
- const typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
- auto raw_value = LoadRaw(in);
- if (ARROW_PREDICT_FALSE(raw_value >=
- static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
- return ApiType::UNDEFINED;
- }
- return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
- }
-
- template <typename ThriftType, bool IsUnsigned = false>
- inline static ApiTypeEnum LoadChecked(
- const typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
- auto raw_value = LoadRaw(in);
- if (ARROW_PREDICT_FALSE(raw_value >=
- static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
- raw_value < 0)) {
- return ApiType::UNDEFINED;
- }
- return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
- }
-
- template <typename ThriftType>
- inline static ApiTypeEnum Load(const ThriftType* in) {
- return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
- }
-};
-
-} // namespace internal
-
-// Safe enum loader: will check for invalid enum value before converting
-
-template <typename ThriftType,
- typename ParquetEnum =
- typename internal::ThriftEnumTypeTraits<ThriftType>::ParquetEnum>
-inline typename ParquetEnum::type LoadEnumSafe(const ThriftType* in) {
- return internal::SafeLoader<ParquetEnum>::Load(in);
-}
-
-inline typename Compression::type LoadEnumSafe(const format::CompressionCodec::type* in) {
- const auto raw_value = internal::LoadEnumRaw(in);
- // Check bounds manually, as Compression::type doesn't have the same values
- // as format::CompressionCodec.
- const auto min_value =
- static_cast<decltype(raw_value)>(format::CompressionCodec::UNCOMPRESSED);
- const auto max_value =
- static_cast<decltype(raw_value)>(format::CompressionCodec::LZ4_RAW);
- if (raw_value < min_value || raw_value > max_value) {
- return Compression::UNCOMPRESSED;
- }
- return FromThriftUnsafe(*in);
-}
-
-// Safe non-enum converters
-
-static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) {
- return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique,
- aesGcmV1.supply_aad_prefix};
-}
-
-static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
- return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique,
- aesGcmCtrV1.supply_aad_prefix};
-}
-
-static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) {
- EncryptionAlgorithm encryption_algorithm;
-
- if (encryption.__isset.AES_GCM_V1) {
- encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
- encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1);
- } else if (encryption.__isset.AES_GCM_CTR_V1) {
- encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1;
- encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1);
- } else {
- throw ParquetException("Unsupported algorithm");
- }
- return encryption_algorithm;
-}
-
-// ----------------------------------------------------------------------
-// Convert Thrift enums from Parquet enums
-
-static inline format::Type::type ToThrift(Type::type type) {
- return static_cast<format::Type::type>(type);
-}
-
-static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
- // item 0 is NONE
- DCHECK_NE(type, ConvertedType::NONE);
- // it is forbidden to emit "NA" (PARQUET-1990)
- DCHECK_NE(type, ConvertedType::NA);
- DCHECK_NE(type, ConvertedType::UNDEFINED);
- return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
-}
-
-static inline format::FieldRepetitionType::type ToThrift(Repetition::type type) {
- return static_cast<format::FieldRepetitionType::type>(type);
-}
-
-static inline format::Encoding::type ToThrift(Encoding::type type) {
- return static_cast<format::Encoding::type>(type);
-}
-
-static inline format::CompressionCodec::type ToThrift(Compression::type type) {
- switch (type) {
- case Compression::UNCOMPRESSED:
- return format::CompressionCodec::UNCOMPRESSED;
- case Compression::SNAPPY:
- return format::CompressionCodec::SNAPPY;
- case Compression::GZIP:
- return format::CompressionCodec::GZIP;
- case Compression::LZO:
- return format::CompressionCodec::LZO;
- case Compression::BROTLI:
- return format::CompressionCodec::BROTLI;
- case Compression::LZ4:
- return format::CompressionCodec::LZ4_RAW;
- case Compression::LZ4_HADOOP:
- // Deprecated "LZ4" Parquet compression has Hadoop-specific framing
- return format::CompressionCodec::LZ4;
- case Compression::ZSTD:
- return format::CompressionCodec::ZSTD;
- default:
- DCHECK(false) << "Cannot reach here";
- return format::CompressionCodec::UNCOMPRESSED;
- }
-}
-
-static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
- format::Statistics statistics;
- if (stats.has_min) {
- statistics.__set_min_value(stats.min());
- // If the order is SIGNED, then the old min value must be set too.
-    // This is for backward compatibility.
- if (stats.is_signed()) {
- statistics.__set_min(stats.min());
- }
- }
- if (stats.has_max) {
- statistics.__set_max_value(stats.max());
- // If the order is SIGNED, then the old max value must be set too.
-    // This is for backward compatibility.
- if (stats.is_signed()) {
- statistics.__set_max(stats.max());
- }
- }
- if (stats.has_null_count) {
- statistics.__set_null_count(stats.null_count);
- }
- if (stats.has_distinct_count) {
- statistics.__set_distinct_count(stats.distinct_count);
- }
-
- return statistics;
-}
-
-static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) {
- format::AesGcmV1 aesGcmV1;
- // aad_file_unique is always set
- aesGcmV1.__set_aad_file_unique(aad.aad_file_unique);
- aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
- if (!aad.aad_prefix.empty()) {
- aesGcmV1.__set_aad_prefix(aad.aad_prefix);
- }
- return aesGcmV1;
-}
-
-static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) {
- format::AesGcmCtrV1 aesGcmCtrV1;
- // aad_file_unique is always set
- aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique);
- aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
- if (!aad.aad_prefix.empty()) {
- aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix);
- }
- return aesGcmCtrV1;
-}
-
-static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) {
- format::EncryptionAlgorithm encryption_algorithm;
- if (encryption.algorithm == ParquetCipher::AES_GCM_V1) {
- encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad));
- } else {
- encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad));
- }
- return encryption_algorithm;
-}
-
-// ----------------------------------------------------------------------
-// Thrift struct serialization / deserialization utilities
-
-using ThriftBuffer = apache::thrift::transport::TMemoryBuffer;
-
-template <class T>
-inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
- T* deserialized_msg) {
- // Deserialize msg bytes into c++ thrift msg using memory transport.
- shared_ptr<ThriftBuffer> tmem_transport(
- new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
- apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
- // Protect against CPU and memory bombs
- tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
- // Structs in the thrift definition are relatively large (at least 300 bytes).
- // This limits total memory to the same order of magnitude as stringSize.
- tproto_factory.setContainerSizeLimit(1000 * 1000);
- shared_ptr<apache::thrift::protocol::TProtocol> tproto = //
- tproto_factory.getProtocol(tmem_transport);
- try {
- deserialized_msg->read(tproto.get());
- } catch (std::exception& e) {
- std::stringstream ss;
- ss << "Couldn't deserialize thrift: " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- uint32_t bytes_left = tmem_transport->available_read();
- *len = *len - bytes_left;
-}
-
-// Deserialize a thrift message from buf/len. buf/len must at least contain
-// all the bytes needed to store the thrift message. On return, len will be
-// set to the actual length of the header.
-template <class T>
-inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg,
- const std::shared_ptr<Decryptor>& decryptor = NULLPTR) {
- // thrift message is not encrypted
- if (decryptor == NULLPTR) {
- DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg);
- } else { // thrift message is encrypted
- uint32_t clen;
- clen = *len;
- // decrypt
- std::shared_ptr<ResizableBuffer> decrypted_buffer =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
- decryptor->pool(),
- static_cast<int64_t>(clen - decryptor->CiphertextSizeDelta())));
- const uint8_t* cipher_buf = buf;
- uint32_t decrypted_buffer_len =
- decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data());
- if (decrypted_buffer_len <= 0) {
- throw ParquetException("Couldn't decrypt buffer\n");
- }
- *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta();
- DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len,
- deserialized_msg);
- }
-}
-
-/// Utility class to serialize thrift objects to a binary format. This object
-/// should be reused where possible so that its underlying memory is reused.
-/// Note: thrift will encode NULLs into the serialized buffer, so it is not
-/// valid to treat it as a string.
-class ThriftSerializer {
- public:
- explicit ThriftSerializer(int initial_buffer_size = 1024)
- : mem_buffer_(new ThriftBuffer(initial_buffer_size)) {
- apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> factory;
- protocol_ = factory.getProtocol(mem_buffer_);
- }
-
- /// Serialize obj into a memory buffer. The result is returned in buffer/len. The
- /// memory returned is owned by this object and will be invalid when another object
- /// is serialized.
- template <class T>
- void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) {
- SerializeObject(obj);
- mem_buffer_->getBuffer(buffer, len);
- }
-
- template <class T>
- void SerializeToString(const T* obj, std::string* result) {
- SerializeObject(obj);
- *result = mem_buffer_->getBufferAsString();
- }
-
- template <class T>
- int64_t Serialize(const T* obj, ArrowOutputStream* out,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR) {
- uint8_t* out_buffer;
- uint32_t out_length;
- SerializeToBuffer(obj, &out_length, &out_buffer);
-
- // obj is not encrypted
- if (encryptor == NULLPTR) {
- PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length));
- return static_cast<int64_t>(out_length);
- } else { // obj is encrypted
- return SerializeEncryptedObj(out, out_buffer, out_length, encryptor);
- }
- }
-
- private:
- template <class T>
- void SerializeObject(const T* obj) {
- try {
- mem_buffer_->resetBuffer();
- obj->write(protocol_.get());
- } catch (std::exception& e) {
- std::stringstream ss;
- ss << "Couldn't serialize thrift: " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- }
-
- int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer,
- uint32_t out_length,
- const std::shared_ptr<Encryptor>& encryptor) {
- std::shared_ptr<ResizableBuffer> cipher_buffer =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
- encryptor->pool(),
- static_cast<int64_t>(encryptor->CiphertextSizeDelta() + out_length)));
- int cipher_buffer_len =
- encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data());
-
- PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len));
- return static_cast<int64_t>(cipher_buffer_len);
- }
-
- shared_ptr<ThriftBuffer> mem_buffer_;
- shared_ptr<apache::thrift::protocol::TProtocol> protocol_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#include <cstdint>
+// Check if thrift version < 0.11.0
+// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
+#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
+#include <boost/shared_ptr.hpp>
+#else
+#include <memory>
+#endif
+#include <string>
+#include <vector>
+
+// TCompactProtocol requires some #defines to work right.
+#define SIGNED_RIGHT_SHIFT_IS 1
+#define ARITHMETIC_RIGHT_SHIFT 1
+#include <thrift/TApplicationException.h>
+#include <thrift/protocol/TCompactProtocol.h>
+#include <thrift/protocol/TDebugProtocol.h>
+
+#include <thrift/protocol/TBinaryProtocol.h>
+#include <thrift/transport/TBufferTransports.h>
+#include <sstream>
+
+#include "arrow/util/logging.h"
+
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+#include "generated/parquet_types.h"  // IWYU pragma: export
+
+namespace parquet {
+
+// Check if thrift version < 0.11.0
+// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
+#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
+using ::boost::shared_ptr;
+#else
+using ::std::shared_ptr;
+#endif
+
+// ----------------------------------------------------------------------
+// Convert Thrift enums to Parquet enums
+
+// Unsafe enum converters (input is not checked for validity)
+
+static inline Type::type FromThriftUnsafe(format::Type::type type) {
+ return static_cast<Type::type>(type);
+}
+
+static inline ConvertedType::type FromThriftUnsafe(format::ConvertedType::type type) {
+ // item 0 is NONE
+ return static_cast<ConvertedType::type>(static_cast<int>(type) + 1);
+}
+
+static inline Repetition::type FromThriftUnsafe(format::FieldRepetitionType::type type) {
+ return static_cast<Repetition::type>(type);
+}
+
+static inline Encoding::type FromThriftUnsafe(format::Encoding::type type) {
+ return static_cast<Encoding::type>(type);
+}
+
+static inline PageType::type FromThriftUnsafe(format::PageType::type type) {
+ return static_cast<PageType::type>(type);
+}
+
+static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type type) {
+ switch (type) {
+ case format::CompressionCodec::UNCOMPRESSED:
+ return Compression::UNCOMPRESSED;
+ case format::CompressionCodec::SNAPPY:
+ return Compression::SNAPPY;
+ case format::CompressionCodec::GZIP:
+ return Compression::GZIP;
+ case format::CompressionCodec::LZO:
+ return Compression::LZO;
+ case format::CompressionCodec::BROTLI:
+ return Compression::BROTLI;
+ case format::CompressionCodec::LZ4:
+ return Compression::LZ4_HADOOP;
+ case format::CompressionCodec::LZ4_RAW:
+ return Compression::LZ4;
+ case format::CompressionCodec::ZSTD:
+ return Compression::ZSTD;
+ default:
+ DCHECK(false) << "Cannot reach here";
+ return Compression::UNCOMPRESSED;
+ }
+}
+
+namespace internal {
+
+template <typename T>
+struct ThriftEnumTypeTraits {};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::Type::type> {
+ using ParquetEnum = Type;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::ConvertedType::type> {
+ using ParquetEnum = ConvertedType;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::FieldRepetitionType::type> {
+ using ParquetEnum = Repetition;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::Encoding::type> {
+ using ParquetEnum = Encoding;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::PageType::type> {
+ using ParquetEnum = PageType;
+};
+
+// If the parquet file is corrupted, the decoded enum value may fall outside
+// the range of defined values, which is undefined behaviour. This facility
+// prevents that by loading the value as the underlying integer type and
+// checking that it is in range.
+
+template <typename EnumType,
+ typename EnumTypeRaw = typename std::underlying_type<EnumType>::type>
+inline static EnumTypeRaw LoadEnumRaw(const EnumType* in) {
+ EnumTypeRaw raw_value;
+ // Use memcpy(), as a regular cast would be undefined behaviour on invalid values
+ memcpy(&raw_value, in, sizeof(EnumType));
+ return raw_value;
+}
+
+template <typename ApiType>
+struct SafeLoader {
+ using ApiTypeEnum = typename ApiType::type;
+ using ApiTypeRawEnum = typename std::underlying_type<ApiTypeEnum>::type;
+
+ template <typename ThriftType>
+ inline static ApiTypeRawEnum LoadRaw(const ThriftType* in) {
+ static_assert(sizeof(ApiTypeEnum) == sizeof(ThriftType),
+ "parquet type should always be the same size as thrift type");
+ return static_cast<ApiTypeRawEnum>(LoadEnumRaw(in));
+ }
+
+ template <typename ThriftType, bool IsUnsigned = true>
+ inline static ApiTypeEnum LoadChecked(
+ const typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
+ auto raw_value = LoadRaw(in);
+ if (ARROW_PREDICT_FALSE(raw_value >=
+ static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
+ return ApiType::UNDEFINED;
+ }
+ return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
+ }
+
+ template <typename ThriftType, bool IsUnsigned = false>
+ inline static ApiTypeEnum LoadChecked(
+ const typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
+ auto raw_value = LoadRaw(in);
+ if (ARROW_PREDICT_FALSE(raw_value >=
+ static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
+ raw_value < 0)) {
+ return ApiType::UNDEFINED;
+ }
+ return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
+ }
+
+ template <typename ThriftType>
+ inline static ApiTypeEnum Load(const ThriftType* in) {
+ return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
+ }
+};
+
+} // namespace internal
+
+// Safe enum loader: will check for invalid enum value before converting
+
+template <typename ThriftType,
+ typename ParquetEnum =
+ typename internal::ThriftEnumTypeTraits<ThriftType>::ParquetEnum>
+inline typename ParquetEnum::type LoadEnumSafe(const ThriftType* in) {
+ return internal::SafeLoader<ParquetEnum>::Load(in);
+}
+
+inline typename Compression::type LoadEnumSafe(const format::CompressionCodec::type* in) {
+ const auto raw_value = internal::LoadEnumRaw(in);
+ // Check bounds manually, as Compression::type doesn't have the same values
+ // as format::CompressionCodec.
+ const auto min_value =
+ static_cast<decltype(raw_value)>(format::CompressionCodec::UNCOMPRESSED);
+ const auto max_value =
+ static_cast<decltype(raw_value)>(format::CompressionCodec::LZ4_RAW);
+ if (raw_value < min_value || raw_value > max_value) {
+ return Compression::UNCOMPRESSED;
+ }
+ return FromThriftUnsafe(*in);
+}
+
+// Safe non-enum converters
+
+static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) {
+ return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique,
+ aesGcmV1.supply_aad_prefix};
+}
+
+static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
+ return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique,
+ aesGcmCtrV1.supply_aad_prefix};
+}
+
+static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) {
+ EncryptionAlgorithm encryption_algorithm;
+
+ if (encryption.__isset.AES_GCM_V1) {
+ encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
+ encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1);
+ } else if (encryption.__isset.AES_GCM_CTR_V1) {
+ encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1;
+ encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1);
+ } else {
+ throw ParquetException("Unsupported algorithm");
+ }
+ return encryption_algorithm;
+}
+
+// ----------------------------------------------------------------------
+// Convert Thrift enums from Parquet enums
+
+static inline format::Type::type ToThrift(Type::type type) {
+ return static_cast<format::Type::type>(type);
+}
+
+static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
+ // item 0 is NONE
+ DCHECK_NE(type, ConvertedType::NONE);
+ // it is forbidden to emit "NA" (PARQUET-1990)
+ DCHECK_NE(type, ConvertedType::NA);
+ DCHECK_NE(type, ConvertedType::UNDEFINED);
+ return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
+}
+
+static inline format::FieldRepetitionType::type ToThrift(Repetition::type type) {
+ return static_cast<format::FieldRepetitionType::type>(type);
+}
+
+static inline format::Encoding::type ToThrift(Encoding::type type) {
+ return static_cast<format::Encoding::type>(type);
+}
+
+static inline format::CompressionCodec::type ToThrift(Compression::type type) {
+ switch (type) {
+ case Compression::UNCOMPRESSED:
+ return format::CompressionCodec::UNCOMPRESSED;
+ case Compression::SNAPPY:
+ return format::CompressionCodec::SNAPPY;
+ case Compression::GZIP:
+ return format::CompressionCodec::GZIP;
+ case Compression::LZO:
+ return format::CompressionCodec::LZO;
+ case Compression::BROTLI:
+ return format::CompressionCodec::BROTLI;
+ case Compression::LZ4:
+ return format::CompressionCodec::LZ4_RAW;
+ case Compression::LZ4_HADOOP:
+ // Deprecated "LZ4" Parquet compression has Hadoop-specific framing
+ return format::CompressionCodec::LZ4;
+ case Compression::ZSTD:
+ return format::CompressionCodec::ZSTD;
+ default:
+ DCHECK(false) << "Cannot reach here";
+ return format::CompressionCodec::UNCOMPRESSED;
+ }
+}
+
+static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
+ format::Statistics statistics;
+ if (stats.has_min) {
+ statistics.__set_min_value(stats.min());
+ // If the order is SIGNED, then the old min value must be set too.
+    // This is for backward compatibility.
+ if (stats.is_signed()) {
+ statistics.__set_min(stats.min());
+ }
+ }
+ if (stats.has_max) {
+ statistics.__set_max_value(stats.max());
+ // If the order is SIGNED, then the old max value must be set too.
+    // This is for backward compatibility.
+ if (stats.is_signed()) {
+ statistics.__set_max(stats.max());
+ }
+ }
+ if (stats.has_null_count) {
+ statistics.__set_null_count(stats.null_count);
+ }
+ if (stats.has_distinct_count) {
+ statistics.__set_distinct_count(stats.distinct_count);
+ }
+
+ return statistics;
+}
+
+static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) {
+ format::AesGcmV1 aesGcmV1;
+ // aad_file_unique is always set
+ aesGcmV1.__set_aad_file_unique(aad.aad_file_unique);
+ aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
+ if (!aad.aad_prefix.empty()) {
+ aesGcmV1.__set_aad_prefix(aad.aad_prefix);
+ }
+ return aesGcmV1;
+}
+
+static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) {
+ format::AesGcmCtrV1 aesGcmCtrV1;
+ // aad_file_unique is always set
+ aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique);
+ aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
+ if (!aad.aad_prefix.empty()) {
+ aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix);
+ }
+ return aesGcmCtrV1;
+}
+
+static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) {
+ format::EncryptionAlgorithm encryption_algorithm;
+ if (encryption.algorithm == ParquetCipher::AES_GCM_V1) {
+ encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad));
+ } else {
+ encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad));
+ }
+ return encryption_algorithm;
+}
+
+// ----------------------------------------------------------------------
+// Thrift struct serialization / deserialization utilities
+
+using ThriftBuffer = apache::thrift::transport::TMemoryBuffer;
+
+template <class T>
+inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
+ T* deserialized_msg) {
+ // Deserialize msg bytes into c++ thrift msg using memory transport.
+ shared_ptr<ThriftBuffer> tmem_transport(
+ new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
+ apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
+ // Protect against CPU and memory bombs
+ tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
+ // Structs in the thrift definition are relatively large (at least 300 bytes).
+ // This limits total memory to the same order of magnitude as stringSize.
+ tproto_factory.setContainerSizeLimit(1000 * 1000);
+ shared_ptr<apache::thrift::protocol::TProtocol> tproto = //
+ tproto_factory.getProtocol(tmem_transport);
+ try {
+ deserialized_msg->read(tproto.get());
+ } catch (std::exception& e) {
+ std::stringstream ss;
+ ss << "Couldn't deserialize thrift: " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ uint32_t bytes_left = tmem_transport->available_read();
+ *len = *len - bytes_left;
+}
+
+// Deserialize a thrift message from buf/len. buf/len must at least contain
+// all the bytes needed to store the thrift message. On return, len will be
+// set to the actual length of the header.
+template <class T>
+inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg,
+ const std::shared_ptr<Decryptor>& decryptor = NULLPTR) {
+ // thrift message is not encrypted
+ if (decryptor == NULLPTR) {
+ DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg);
+ } else { // thrift message is encrypted
+ uint32_t clen;
+ clen = *len;
+ // decrypt
+ std::shared_ptr<ResizableBuffer> decrypted_buffer =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
+ decryptor->pool(),
+ static_cast<int64_t>(clen - decryptor->CiphertextSizeDelta())));
+ const uint8_t* cipher_buf = buf;
+ uint32_t decrypted_buffer_len =
+ decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data());
+ if (decrypted_buffer_len <= 0) {
+ throw ParquetException("Couldn't decrypt buffer\n");
+ }
+ *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta();
+ DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len,
+ deserialized_msg);
+ }
+}
+
+/// Utility class to serialize thrift objects to a binary format. This object
+/// should be reused where possible so that its underlying memory is reused.
+/// Note: thrift will encode NULLs into the serialized buffer, so it is not
+/// valid to treat it as a string.
+class ThriftSerializer {
+ public:
+ explicit ThriftSerializer(int initial_buffer_size = 1024)
+ : mem_buffer_(new ThriftBuffer(initial_buffer_size)) {
+ apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> factory;
+ protocol_ = factory.getProtocol(mem_buffer_);
+ }
+
+ /// Serialize obj into a memory buffer. The result is returned in buffer/len. The
+ /// memory returned is owned by this object and will be invalid when another object
+ /// is serialized.
+ template <class T>
+ void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) {
+ SerializeObject(obj);
+ mem_buffer_->getBuffer(buffer, len);
+ }
+
+ template <class T>
+ void SerializeToString(const T* obj, std::string* result) {
+ SerializeObject(obj);
+ *result = mem_buffer_->getBufferAsString();
+ }
+
+ template <class T>
+ int64_t Serialize(const T* obj, ArrowOutputStream* out,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR) {
+ uint8_t* out_buffer;
+ uint32_t out_length;
+ SerializeToBuffer(obj, &out_length, &out_buffer);
+
+ // obj is not encrypted
+ if (encryptor == NULLPTR) {
+ PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length));
+ return static_cast<int64_t>(out_length);
+ } else { // obj is encrypted
+ return SerializeEncryptedObj(out, out_buffer, out_length, encryptor);
+ }
+ }
+
+ private:
+ template <class T>
+ void SerializeObject(const T* obj) {
+ try {
+ mem_buffer_->resetBuffer();
+ obj->write(protocol_.get());
+ } catch (std::exception& e) {
+ std::stringstream ss;
+ ss << "Couldn't serialize thrift: " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+
+ int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer,
+ uint32_t out_length,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ std::shared_ptr<ResizableBuffer> cipher_buffer =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
+ encryptor->pool(),
+ static_cast<int64_t>(encryptor->CiphertextSizeDelta() + out_length)));
+ int cipher_buffer_len =
+ encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data());
+
+ PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len));
+ return static_cast<int64_t>(cipher_buffer_len);
+ }
+
+ shared_ptr<ThriftBuffer> mem_buffer_;
+ shared_ptr<apache::thrift::protocol::TProtocol> protocol_;
+};
+
+} // namespace parquet
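
The SafeLoader machinery above boils down to one idiom: never cast untrusted bytes
directly to an enum; memcpy() them into the underlying integer type and range-check
first. A self-contained sketch of the same idiom (the Codec enum here is
hypothetical, not the real parquet type):

    #include <cstring>
    #include <type_traits>

    enum class Codec : int { UNCOMPRESSED = 0, SNAPPY = 1, GZIP = 2, UNDEFINED = 3 };

    // Deserialized bytes can hold any bit pattern, and producing an enum with
    // an out-of-range value is undefined behaviour. Load the raw integer and
    // bounds-check it before converting.
    Codec LoadCodecSafe(const void* wire_bytes) {
      std::underlying_type<Codec>::type raw;
      std::memcpy(&raw, wire_bytes, sizeof(raw));
      if (raw < 0 || raw >= static_cast<int>(Codec::UNDEFINED)) {
        return Codec::UNDEFINED;  // corrupted input maps to the sentinel
      }
      return static_cast<Codec>(raw);
    }
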
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
index a427f5a9591..2153ea63efb 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
@@ -1,43 +1,43 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-namespace parquet {
-
-struct ParquetVersion {
- enum type { PARQUET_1_0, PARQUET_2_0 };
-};
-
-class FileMetaData;
-class SchemaDescriptor;
-
-class ReaderProperties;
-class ArrowReaderProperties;
-
-class WriterProperties;
-class WriterPropertiesBuilder;
-class ArrowWriterProperties;
-class ArrowWriterPropertiesBuilder;
-
-namespace arrow {
-
-class FileWriter;
-class FileReader;
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace parquet {
+
+struct ParquetVersion {
+ enum type { PARQUET_1_0, PARQUET_2_0 };
+};
+
+class FileMetaData;
+class SchemaDescriptor;
+
+class ReaderProperties;
+class ArrowReaderProperties;
+
+class WriterProperties;
+class WriterPropertiesBuilder;
+class ArrowWriterProperties;
+class ArrowWriterPropertiesBuilder;
+
+namespace arrow {
+
+class FileWriter;
+class FileReader;
+
+} // namespace arrow
+} // namespace parquet
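
As context for why this header exists: forward declarations let dependent
headers name these classes without pulling in their full definitions. A
hypothetical consumer (not part of this diff):

  // some_reader_api.h -- only needs the names, so type_fwd.h suffices and
  // the heavier parquet headers can be included in the .cc file instead.
  #include "parquet/type_fwd.h"

  void InspectFooter(const parquet::FileMetaData& metadata,
                     const parquet::ReaderProperties& properties);
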
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/types.cc b/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
index ef23c40662b..35cc43639b8 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
@@ -1,1567 +1,1567 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <cmath>
-#include <cstdint>
-#include <memory>
-#include <sstream>
-#include <string>
-
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/compression.h"
-#include "arrow/util/logging.h"
-
-#include "parquet/exception.h"
-#include "parquet/types.h"
-
-#include "generated/parquet_types.h"
-
-using arrow::internal::checked_cast;
-using arrow::util::Codec;
-
-namespace parquet {
-
-bool IsCodecSupported(Compression::type codec) {
- switch (codec) {
- case Compression::UNCOMPRESSED:
- case Compression::SNAPPY:
- case Compression::GZIP:
- case Compression::BROTLI:
- case Compression::ZSTD:
- case Compression::LZ4:
- case Compression::LZ4_HADOOP:
- return true;
- default:
- return false;
- }
-}
-
-std::unique_ptr<Codec> GetCodec(Compression::type codec) {
- return GetCodec(codec, Codec::UseDefaultCompressionLevel());
-}
-
-std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
- std::unique_ptr<Codec> result;
- if (codec == Compression::LZO) {
- throw ParquetException(
- "While LZO compression is supported by the Parquet format in "
- "general, it is currently not supported by the C++ implementation.");
- }
-
- if (!IsCodecSupported(codec)) {
- std::stringstream ss;
- ss << "Codec type " << Codec::GetCodecAsString(codec)
- << " not supported in Parquet format";
- throw ParquetException(ss.str());
- }
-
- PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, compression_level));
- return result;
-}
-
-std::string FormatStatValue(Type::type parquet_type, ::arrow::util::string_view val) {
- std::stringstream result;
-
- const char* bytes = val.data();
- switch (parquet_type) {
- case Type::BOOLEAN:
- result << reinterpret_cast<const bool*>(bytes)[0];
- break;
- case Type::INT32:
- result << reinterpret_cast<const int32_t*>(bytes)[0];
- break;
- case Type::INT64:
- result << reinterpret_cast<const int64_t*>(bytes)[0];
- break;
- case Type::DOUBLE:
- result << reinterpret_cast<const double*>(bytes)[0];
- break;
- case Type::FLOAT:
- result << reinterpret_cast<const float*>(bytes)[0];
- break;
- case Type::INT96: {
- auto const i32_val = reinterpret_cast<const int32_t*>(bytes);
- result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
- break;
- }
- case Type::BYTE_ARRAY: {
- return std::string(val);
- }
- case Type::FIXED_LEN_BYTE_ARRAY: {
- return std::string(val);
- }
- case Type::UNDEFINED:
- default:
- break;
- }
- return result.str();
-}
-
-std::string EncodingToString(Encoding::type t) {
- switch (t) {
- case Encoding::PLAIN:
- return "PLAIN";
- case Encoding::PLAIN_DICTIONARY:
- return "PLAIN_DICTIONARY";
- case Encoding::RLE:
- return "RLE";
- case Encoding::BIT_PACKED:
- return "BIT_PACKED";
- case Encoding::DELTA_BINARY_PACKED:
- return "DELTA_BINARY_PACKED";
- case Encoding::DELTA_LENGTH_BYTE_ARRAY:
- return "DELTA_LENGTH_BYTE_ARRAY";
- case Encoding::DELTA_BYTE_ARRAY:
- return "DELTA_BYTE_ARRAY";
- case Encoding::RLE_DICTIONARY:
- return "RLE_DICTIONARY";
- case Encoding::BYTE_STREAM_SPLIT:
- return "BYTE_STREAM_SPLIT";
- default:
- return "UNKNOWN";
- }
-}
-
-std::string TypeToString(Type::type t) {
- switch (t) {
- case Type::BOOLEAN:
- return "BOOLEAN";
- case Type::INT32:
- return "INT32";
- case Type::INT64:
- return "INT64";
- case Type::INT96:
- return "INT96";
- case Type::FLOAT:
- return "FLOAT";
- case Type::DOUBLE:
- return "DOUBLE";
- case Type::BYTE_ARRAY:
- return "BYTE_ARRAY";
- case Type::FIXED_LEN_BYTE_ARRAY:
- return "FIXED_LEN_BYTE_ARRAY";
- case Type::UNDEFINED:
- default:
- return "UNKNOWN";
- }
-}
-
-std::string ConvertedTypeToString(ConvertedType::type t) {
- switch (t) {
- case ConvertedType::NONE:
- return "NONE";
- case ConvertedType::UTF8:
- return "UTF8";
- case ConvertedType::MAP:
- return "MAP";
- case ConvertedType::MAP_KEY_VALUE:
- return "MAP_KEY_VALUE";
- case ConvertedType::LIST:
- return "LIST";
- case ConvertedType::ENUM:
- return "ENUM";
- case ConvertedType::DECIMAL:
- return "DECIMAL";
- case ConvertedType::DATE:
- return "DATE";
- case ConvertedType::TIME_MILLIS:
- return "TIME_MILLIS";
- case ConvertedType::TIME_MICROS:
- return "TIME_MICROS";
- case ConvertedType::TIMESTAMP_MILLIS:
- return "TIMESTAMP_MILLIS";
- case ConvertedType::TIMESTAMP_MICROS:
- return "TIMESTAMP_MICROS";
- case ConvertedType::UINT_8:
- return "UINT_8";
- case ConvertedType::UINT_16:
- return "UINT_16";
- case ConvertedType::UINT_32:
- return "UINT_32";
- case ConvertedType::UINT_64:
- return "UINT_64";
- case ConvertedType::INT_8:
- return "INT_8";
- case ConvertedType::INT_16:
- return "INT_16";
- case ConvertedType::INT_32:
- return "INT_32";
- case ConvertedType::INT_64:
- return "INT_64";
- case ConvertedType::JSON:
- return "JSON";
- case ConvertedType::BSON:
- return "BSON";
- case ConvertedType::INTERVAL:
- return "INTERVAL";
- case ConvertedType::UNDEFINED:
- default:
- return "UNKNOWN";
- }
-}
-
-int GetTypeByteSize(Type::type parquet_type) {
- switch (parquet_type) {
- case Type::BOOLEAN:
- return type_traits<BooleanType::type_num>::value_byte_size;
- case Type::INT32:
- return type_traits<Int32Type::type_num>::value_byte_size;
- case Type::INT64:
- return type_traits<Int64Type::type_num>::value_byte_size;
- case Type::INT96:
- return type_traits<Int96Type::type_num>::value_byte_size;
- case Type::DOUBLE:
- return type_traits<DoubleType::type_num>::value_byte_size;
- case Type::FLOAT:
- return type_traits<FloatType::type_num>::value_byte_size;
- case Type::BYTE_ARRAY:
- return type_traits<ByteArrayType::type_num>::value_byte_size;
- case Type::FIXED_LEN_BYTE_ARRAY:
- return type_traits<FLBAType::type_num>::value_byte_size;
- case Type::UNDEFINED:
- default:
- return 0;
- }
- return 0;
-}
-
-// Return the Sort Order of the Parquet Physical Types
-SortOrder::type DefaultSortOrder(Type::type primitive) {
- switch (primitive) {
- case Type::BOOLEAN:
- case Type::INT32:
- case Type::INT64:
- case Type::FLOAT:
- case Type::DOUBLE:
- return SortOrder::SIGNED;
- case Type::BYTE_ARRAY:
- case Type::FIXED_LEN_BYTE_ARRAY:
- return SortOrder::UNSIGNED;
- case Type::INT96:
- case Type::UNDEFINED:
- return SortOrder::UNKNOWN;
- }
- return SortOrder::UNKNOWN;
-}
-
-// Return the SortOrder of the Parquet Types using Logical or Physical Types
-SortOrder::type GetSortOrder(ConvertedType::type converted, Type::type primitive) {
- if (converted == ConvertedType::NONE) return DefaultSortOrder(primitive);
- switch (converted) {
- case ConvertedType::INT_8:
- case ConvertedType::INT_16:
- case ConvertedType::INT_32:
- case ConvertedType::INT_64:
- case ConvertedType::DATE:
- case ConvertedType::TIME_MICROS:
- case ConvertedType::TIME_MILLIS:
- case ConvertedType::TIMESTAMP_MICROS:
- case ConvertedType::TIMESTAMP_MILLIS:
- return SortOrder::SIGNED;
- case ConvertedType::UINT_8:
- case ConvertedType::UINT_16:
- case ConvertedType::UINT_32:
- case ConvertedType::UINT_64:
- case ConvertedType::ENUM:
- case ConvertedType::UTF8:
- case ConvertedType::BSON:
- case ConvertedType::JSON:
- return SortOrder::UNSIGNED;
- case ConvertedType::DECIMAL:
- case ConvertedType::LIST:
- case ConvertedType::MAP:
- case ConvertedType::MAP_KEY_VALUE:
- case ConvertedType::INTERVAL:
- case ConvertedType::NONE: // required instead of default
- case ConvertedType::NA: // required instead of default
- case ConvertedType::UNDEFINED:
- return SortOrder::UNKNOWN;
- }
- return SortOrder::UNKNOWN;
-}
-
-SortOrder::type GetSortOrder(const std::shared_ptr<const LogicalType>& logical_type,
- Type::type primitive) {
- SortOrder::type o = SortOrder::UNKNOWN;
- if (logical_type && logical_type->is_valid()) {
- o = (logical_type->is_none() ? DefaultSortOrder(primitive)
- : logical_type->sort_order());
- }
- return o;
-}
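
A short sketch of how these resolve in practice (results follow directly
from the switches above):

  SortOrder::type a = DefaultSortOrder(Type::INT32);                  // SIGNED
  SortOrder::type b = GetSortOrder(ConvertedType::UTF8,
                                   Type::BYTE_ARRAY);                 // UNSIGNED
  // With a logical type, its own sort_order() wins unless it is NONE,
  // in which case the physical default applies:
  SortOrder::type c = GetSortOrder(LogicalType::String(),
                                   Type::BYTE_ARRAY);                 // UNSIGNED
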
-
-ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
-ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
-
-// Static methods for LogicalType class
-
-std::shared_ptr<const LogicalType> LogicalType::FromConvertedType(
- const ConvertedType::type converted_type,
- const schema::DecimalMetadata converted_decimal_metadata) {
- switch (converted_type) {
- case ConvertedType::UTF8:
- return StringLogicalType::Make();
- case ConvertedType::MAP_KEY_VALUE:
- case ConvertedType::MAP:
- return MapLogicalType::Make();
- case ConvertedType::LIST:
- return ListLogicalType::Make();
- case ConvertedType::ENUM:
- return EnumLogicalType::Make();
- case ConvertedType::DECIMAL:
- return DecimalLogicalType::Make(converted_decimal_metadata.precision,
- converted_decimal_metadata.scale);
- case ConvertedType::DATE:
- return DateLogicalType::Make();
- case ConvertedType::TIME_MILLIS:
- return TimeLogicalType::Make(true, LogicalType::TimeUnit::MILLIS);
- case ConvertedType::TIME_MICROS:
- return TimeLogicalType::Make(true, LogicalType::TimeUnit::MICROS);
- case ConvertedType::TIMESTAMP_MILLIS:
- return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MILLIS,
- /*is_from_converted_type=*/true,
- /*force_set_converted_type=*/false);
- case ConvertedType::TIMESTAMP_MICROS:
- return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MICROS,
- /*is_from_converted_type=*/true,
- /*force_set_converted_type=*/false);
- case ConvertedType::INTERVAL:
- return IntervalLogicalType::Make();
- case ConvertedType::INT_8:
- return IntLogicalType::Make(8, true);
- case ConvertedType::INT_16:
- return IntLogicalType::Make(16, true);
- case ConvertedType::INT_32:
- return IntLogicalType::Make(32, true);
- case ConvertedType::INT_64:
- return IntLogicalType::Make(64, true);
- case ConvertedType::UINT_8:
- return IntLogicalType::Make(8, false);
- case ConvertedType::UINT_16:
- return IntLogicalType::Make(16, false);
- case ConvertedType::UINT_32:
- return IntLogicalType::Make(32, false);
- case ConvertedType::UINT_64:
- return IntLogicalType::Make(64, false);
- case ConvertedType::JSON:
- return JSONLogicalType::Make();
- case ConvertedType::BSON:
- return BSONLogicalType::Make();
- case ConvertedType::NA:
- return NullLogicalType::Make();
- case ConvertedType::NONE:
- return NoLogicalType::Make();
- case ConvertedType::UNDEFINED:
- return UndefinedLogicalType::Make();
- }
- return UndefinedLogicalType::Make();
-}
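
A sketch of this lifting in use (the metadata literal mirrors the default
argument above):

  schema::DecimalMetadata no_decimal{false, -1, -1};
  auto ts = LogicalType::FromConvertedType(ConvertedType::TIMESTAMP_MICROS,
                                           no_decimal);
  // ts is Timestamp(isAdjustedToUTC=true, MICROS) built with
  // is_from_converted_type=true, so is_serialized() reports false and no
  // LogicalType is written back for a file that never stored one.
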
-
-std::shared_ptr<const LogicalType> LogicalType::FromThrift(
- const format::LogicalType& type) {
- if (type.__isset.STRING) {
- return StringLogicalType::Make();
- } else if (type.__isset.MAP) {
- return MapLogicalType::Make();
- } else if (type.__isset.LIST) {
- return ListLogicalType::Make();
- } else if (type.__isset.ENUM) {
- return EnumLogicalType::Make();
- } else if (type.__isset.DECIMAL) {
- return DecimalLogicalType::Make(type.DECIMAL.precision, type.DECIMAL.scale);
- } else if (type.__isset.DATE) {
- return DateLogicalType::Make();
- } else if (type.__isset.TIME) {
- LogicalType::TimeUnit::unit unit;
- if (type.TIME.unit.__isset.MILLIS) {
- unit = LogicalType::TimeUnit::MILLIS;
- } else if (type.TIME.unit.__isset.MICROS) {
- unit = LogicalType::TimeUnit::MICROS;
- } else if (type.TIME.unit.__isset.NANOS) {
- unit = LogicalType::TimeUnit::NANOS;
- } else {
- unit = LogicalType::TimeUnit::UNKNOWN;
- }
- return TimeLogicalType::Make(type.TIME.isAdjustedToUTC, unit);
- } else if (type.__isset.TIMESTAMP) {
- LogicalType::TimeUnit::unit unit;
- if (type.TIMESTAMP.unit.__isset.MILLIS) {
- unit = LogicalType::TimeUnit::MILLIS;
- } else if (type.TIMESTAMP.unit.__isset.MICROS) {
- unit = LogicalType::TimeUnit::MICROS;
- } else if (type.TIMESTAMP.unit.__isset.NANOS) {
- unit = LogicalType::TimeUnit::NANOS;
- } else {
- unit = LogicalType::TimeUnit::UNKNOWN;
- }
- return TimestampLogicalType::Make(type.TIMESTAMP.isAdjustedToUTC, unit);
- // TODO(tpboudreau): activate the commented code after parquet.thrift
- // recognizes IntervalType as a LogicalType
- //} else if (type.__isset.INTERVAL) {
- // return IntervalLogicalType::Make();
- } else if (type.__isset.INTEGER) {
- return IntLogicalType::Make(static_cast<int>(type.INTEGER.bitWidth),
- type.INTEGER.isSigned);
- } else if (type.__isset.UNKNOWN) {
- return NullLogicalType::Make();
- } else if (type.__isset.JSON) {
- return JSONLogicalType::Make();
- } else if (type.__isset.BSON) {
- return BSONLogicalType::Make();
- } else if (type.__isset.UUID) {
- return UUIDLogicalType::Make();
- } else {
- throw ParquetException("Metadata contains Thrift LogicalType that is not recognized");
- }
-}
-
-std::shared_ptr<const LogicalType> LogicalType::String() {
- return StringLogicalType::Make();
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Map() { return MapLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::List() { return ListLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::Enum() { return EnumLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::Decimal(int32_t precision,
- int32_t scale) {
- return DecimalLogicalType::Make(precision, scale);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Date() { return DateLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::Time(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
- DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
- return TimeLogicalType::Make(is_adjusted_to_utc, time_unit);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Timestamp(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type, bool force_set_converted_type) {
- DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
- return TimestampLogicalType::Make(is_adjusted_to_utc, time_unit, is_from_converted_type,
- force_set_converted_type);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Interval() {
- return IntervalLogicalType::Make();
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Int(int bit_width, bool is_signed) {
- DCHECK(bit_width == 64 || bit_width == 32 || bit_width == 16 || bit_width == 8);
- return IntLogicalType::Make(bit_width, is_signed);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Null() { return NullLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::JSON() { return JSONLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::BSON() { return BSONLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::UUID() { return UUIDLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
-
-/*
- * The logical type implementation classes are built in four layers: (1) the base
- * layer, which establishes the interface and provides generally reusable implementations
- * for the ToJSON() and Equals() methods; (2) an intermediate derived layer for the
- * "compatibility" methods, which provides implementations for is_compatible() and
- * ToConvertedType(); (3) another intermediate layer for the "applicability" methods
- * that provides several implementations for the is_applicable() method; and (4) the
- * final derived classes, one for each logical type, which supply implementations
- * for those methods that remain virtual (usually just ToString() and ToThrift()) or
- * otherwise need to be overridden.
- */
-
-// LogicalTypeImpl base class
-
-class LogicalType::Impl {
- public:
- virtual bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const = 0;
-
- virtual bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata = {
- false, -1, -1}) const = 0;
-
- virtual ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const = 0;
-
- virtual std::string ToString() const = 0;
-
- virtual bool is_serialized() const {
- return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
- }
-
- virtual std::string ToJSON() const {
- std::stringstream json;
- json << R"({"Type": ")" << ToString() << R"("})";
- return json.str();
- }
-
- virtual format::LogicalType ToThrift() const {
- // logical types inheriting this method should never be serialized
- std::stringstream ss;
- ss << "Logical type " << ToString() << " should not be serialized";
- throw ParquetException(ss.str());
- }
-
- virtual bool Equals(const LogicalType& other) const { return other.type() == type_; }
-
- LogicalType::Type::type type() const { return type_; }
-
- SortOrder::type sort_order() const { return order_; }
-
- Impl(const Impl&) = delete;
- Impl& operator=(const Impl&) = delete;
- virtual ~Impl() noexcept {}
-
- class Compatible;
- class SimpleCompatible;
- class Incompatible;
-
- class Applicable;
- class SimpleApplicable;
- class TypeLengthApplicable;
- class UniversalApplicable;
- class Inapplicable;
-
- class String;
- class Map;
- class List;
- class Enum;
- class Decimal;
- class Date;
- class Time;
- class Timestamp;
- class Interval;
- class Int;
- class Null;
- class JSON;
- class BSON;
- class UUID;
- class No;
- class Undefined;
-
- protected:
- Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
- Impl() = default;
-
- private:
- LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
- SortOrder::type order_ = SortOrder::UNKNOWN;
-};
-
-// Special methods for public LogicalType class
-
-LogicalType::LogicalType() = default;
-LogicalType::~LogicalType() noexcept = default;
-
-// Delegating methods for public LogicalType class
-
-bool LogicalType::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- return impl_->is_applicable(primitive_type, primitive_length);
-}
-
-bool LogicalType::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- return impl_->is_compatible(converted_type, converted_decimal_metadata);
-}
-
-ConvertedType::type LogicalType::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- return impl_->ToConvertedType(out_decimal_metadata);
-}
-
-std::string LogicalType::ToString() const { return impl_->ToString(); }
-
-std::string LogicalType::ToJSON() const { return impl_->ToJSON(); }
-
-format::LogicalType LogicalType::ToThrift() const { return impl_->ToThrift(); }
-
-bool LogicalType::Equals(const LogicalType& other) const { return impl_->Equals(other); }
-
-LogicalType::Type::type LogicalType::type() const { return impl_->type(); }
-
-SortOrder::type LogicalType::sort_order() const { return impl_->sort_order(); }
-
-// Type checks for public LogicalType class
-
-bool LogicalType::is_string() const { return impl_->type() == LogicalType::Type::STRING; }
-bool LogicalType::is_map() const { return impl_->type() == LogicalType::Type::MAP; }
-bool LogicalType::is_list() const { return impl_->type() == LogicalType::Type::LIST; }
-bool LogicalType::is_enum() const { return impl_->type() == LogicalType::Type::ENUM; }
-bool LogicalType::is_decimal() const {
- return impl_->type() == LogicalType::Type::DECIMAL;
-}
-bool LogicalType::is_date() const { return impl_->type() == LogicalType::Type::DATE; }
-bool LogicalType::is_time() const { return impl_->type() == LogicalType::Type::TIME; }
-bool LogicalType::is_timestamp() const {
- return impl_->type() == LogicalType::Type::TIMESTAMP;
-}
-bool LogicalType::is_interval() const {
- return impl_->type() == LogicalType::Type::INTERVAL;
-}
-bool LogicalType::is_int() const { return impl_->type() == LogicalType::Type::INT; }
-bool LogicalType::is_null() const { return impl_->type() == LogicalType::Type::NIL; }
-bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::JSON; }
-bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
-bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
-bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
-bool LogicalType::is_valid() const {
- return impl_->type() != LogicalType::Type::UNDEFINED;
-}
-bool LogicalType::is_invalid() const { return !is_valid(); }
-bool LogicalType::is_nested() const {
- return (impl_->type() == LogicalType::Type::LIST) ||
- (impl_->type() == LogicalType::Type::MAP);
-}
-bool LogicalType::is_nonnested() const { return !is_nested(); }
-bool LogicalType::is_serialized() const { return impl_->is_serialized(); }
-
-// LogicalTypeImpl intermediate "compatibility" classes
-
-class LogicalType::Impl::Compatible : public virtual LogicalType::Impl {
- protected:
- Compatible() = default;
-};
-
-#define set_decimal_metadata(m___, i___, p___, s___) \
- { \
- if (m___) { \
- (m___)->isset = (i___); \
- (m___)->scale = (s___); \
- (m___)->precision = (p___); \
- } \
- }
-
-#define reset_decimal_metadata(m___) \
- { set_decimal_metadata(m___, false, -1, -1); }
-
-// For logical types that always translate to the same converted type
-class LogicalType::Impl::SimpleCompatible : public virtual LogicalType::Impl::Compatible {
- public:
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override {
- return (converted_type == converted_type_) && !converted_decimal_metadata.isset;
- }
-
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override {
- reset_decimal_metadata(out_decimal_metadata);
- return converted_type_;
- }
-
- protected:
- explicit SimpleCompatible(ConvertedType::type c) : converted_type_(c) {}
-
- private:
- ConvertedType::type converted_type_ = ConvertedType::NA;
-};
-
-// For logical types that have no corresponding converted type
-class LogicalType::Impl::Incompatible : public virtual LogicalType::Impl {
- public:
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override {
- return (converted_type == ConvertedType::NONE ||
- converted_type == ConvertedType::NA) &&
- !converted_decimal_metadata.isset;
- }
-
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override {
- reset_decimal_metadata(out_decimal_metadata);
- return ConvertedType::NONE;
- }
-
- protected:
- Incompatible() = default;
-};
-
-// LogicalTypeImpl intermediate "applicability" classes
-
-class LogicalType::Impl::Applicable : public virtual LogicalType::Impl {
- protected:
- Applicable() = default;
-};
-
-// For logical types that can apply only to a single
-// physical type
-class LogicalType::Impl::SimpleApplicable : public virtual LogicalType::Impl::Applicable {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return primitive_type == type_;
- }
-
- protected:
- explicit SimpleApplicable(parquet::Type::type t) : type_(t) {}
-
- private:
- parquet::Type::type type_;
-};
-
-// For logical types that can apply only to a particular
-// physical type and physical length combination
-class LogicalType::Impl::TypeLengthApplicable
- : public virtual LogicalType::Impl::Applicable {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return primitive_type == type_ && primitive_length == length_;
- }
-
- protected:
- TypeLengthApplicable(parquet::Type::type t, int32_t l) : type_(t), length_(l) {}
-
- private:
- parquet::Type::type type_;
- int32_t length_;
-};
-
-// For logical types that can apply to any physical type
-class LogicalType::Impl::UniversalApplicable
- : public virtual LogicalType::Impl::Applicable {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return true;
- }
-
- protected:
- UniversalApplicable() = default;
-};
-
-// For logical types that can never apply to any primitive
-// physical type
-class LogicalType::Impl::Inapplicable : public virtual LogicalType::Impl {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return false;
- }
-
- protected:
- Inapplicable() = default;
-};
-
-// LogicalType implementation final classes
-
-#define OVERRIDE_TOSTRING(n___) \
- std::string ToString() const override { return #n___; }
-
-#define OVERRIDE_TOTHRIFT(t___, s___) \
- format::LogicalType ToThrift() const override { \
- format::LogicalType type; \
- format::t___ subtype; \
- type.__set_##s___(subtype); \
- return type; \
- }
-
-class LogicalType::Impl::String final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class StringLogicalType;
-
- OVERRIDE_TOSTRING(String)
- OVERRIDE_TOTHRIFT(StringType, STRING)
-
- private:
- String()
- : LogicalType::Impl(LogicalType::Type::STRING, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::UTF8),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-// Each public logical type class's Make() creation method instantiates a corresponding
-// LogicalType::Impl::* object and installs that implementation in the logical type
-// it returns.
-
-#define GENERATE_MAKE(a___) \
- std::shared_ptr<const LogicalType> a___##LogicalType::Make() { \
- auto* logical_type = new a___##LogicalType(); \
- logical_type->impl_.reset(new LogicalType::Impl::a___()); \
- return std::shared_ptr<const LogicalType>(logical_type); \
- }
-
-GENERATE_MAKE(String)
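
For readers tracing the macro, GENERATE_MAKE(String) expands to roughly:

  std::shared_ptr<const LogicalType> StringLogicalType::Make() {
    auto* logical_type = new StringLogicalType();
    logical_type->impl_.reset(new LogicalType::Impl::String());
    return std::shared_ptr<const LogicalType>(logical_type);
  }
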
-
-class LogicalType::Impl::Map final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::Inapplicable {
- public:
- friend class MapLogicalType;
-
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override {
- return (converted_type == ConvertedType::MAP ||
- converted_type == ConvertedType::MAP_KEY_VALUE) &&
- !converted_decimal_metadata.isset;
- }
-
- OVERRIDE_TOSTRING(Map)
- OVERRIDE_TOTHRIFT(MapType, MAP)
-
- private:
- Map()
- : LogicalType::Impl(LogicalType::Type::MAP, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::MAP) {}
-};
-
-GENERATE_MAKE(Map)
-
-class LogicalType::Impl::List final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::Inapplicable {
- public:
- friend class ListLogicalType;
-
- OVERRIDE_TOSTRING(List)
- OVERRIDE_TOTHRIFT(ListType, LIST)
-
- private:
- List()
- : LogicalType::Impl(LogicalType::Type::LIST, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::LIST) {}
-};
-
-GENERATE_MAKE(List)
-
-class LogicalType::Impl::Enum final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class EnumLogicalType;
-
- OVERRIDE_TOSTRING(Enum)
- OVERRIDE_TOTHRIFT(EnumType, ENUM)
-
- private:
- Enum()
- : LogicalType::Impl(LogicalType::Type::ENUM, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::ENUM),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-GENERATE_MAKE(Enum)
-
-// The parameterized logical types (currently Decimal, Time, Timestamp, and Int)
-// generally can't reuse the simple method implementations available in the base and
-// intermediate classes and must (re)implement them all
-
-class LogicalType::Impl::Decimal final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::Applicable {
- public:
- friend class DecimalLogicalType;
-
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- int32_t precision() const { return precision_; }
- int32_t scale() const { return scale_; }
-
- private:
- Decimal(int32_t p, int32_t s)
- : LogicalType::Impl(LogicalType::Type::DECIMAL, SortOrder::SIGNED),
- precision_(p),
- scale_(s) {}
- int32_t precision_ = -1;
- int32_t scale_ = -1;
-};
-
-bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- bool ok = false;
- switch (primitive_type) {
- case parquet::Type::INT32: {
- ok = (1 <= precision_) && (precision_ <= 9);
- } break;
- case parquet::Type::INT64: {
- ok = (1 <= precision_) && (precision_ <= 18);
- if (precision_ < 10) {
- // FIXME(tpb): warn that INT32 could be used
- }
- } break;
- case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
- ok = precision_ <= static_cast<int32_t>(std::floor(
- std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
- } break;
- case parquet::Type::BYTE_ARRAY: {
- ok = true;
- } break;
- default: {
- } break;
- }
- return ok;
-}
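
Worked example of the FIXED_LEN_BYTE_ARRAY bound above: a length-16 FLBA
holds a two's-complement integer with 8*16 - 1 = 127 magnitude bits, so the
check computes floor(log10(2^127)) = floor(38.23) = 38; Decimal(38, s) is
therefore the widest decimal that fits, matching the usual 128-bit maximum
of 38 digits.

  auto d = DecimalLogicalType::Make(/*precision=*/38, /*scale=*/10);
  bool fits = d->is_applicable(parquet::Type::FIXED_LEN_BYTE_ARRAY,
                               /*primitive_length=*/16);  // true
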
-
-bool LogicalType::Impl::Decimal::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- return converted_type == ConvertedType::DECIMAL &&
- (converted_decimal_metadata.isset &&
- converted_decimal_metadata.scale == scale_ &&
- converted_decimal_metadata.precision == precision_);
-}
-
-ConvertedType::type LogicalType::Impl::Decimal::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- set_decimal_metadata(out_decimal_metadata, true, precision_, scale_);
- return ConvertedType::DECIMAL;
-}
-
-std::string LogicalType::Impl::Decimal::ToString() const {
- std::stringstream type;
- type << "Decimal(precision=" << precision_ << ", scale=" << scale_ << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Decimal::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Decimal", "precision": )" << precision_ << R"(, "scale": )"
- << scale_ << "}";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Decimal::ToThrift() const {
- format::LogicalType type;
- format::DecimalType decimal_type;
- decimal_type.__set_precision(precision_);
- decimal_type.__set_scale(scale_);
- type.__set_DECIMAL(decimal_type);
- return type;
-}
-
-bool LogicalType::Impl::Decimal::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_decimal()) {
- const auto& other_decimal = checked_cast<const DecimalLogicalType&>(other);
- eq = (precision_ == other_decimal.precision() && scale_ == other_decimal.scale());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> DecimalLogicalType::Make(int32_t precision,
- int32_t scale) {
- if (precision < 1) {
- throw ParquetException(
- "Precision must be greater than or equal to 1 for Decimal logical type");
- }
- if (scale < 0 || scale > precision) {
- throw ParquetException(
- "Scale must be a non-negative integer that does not exceed precision for "
- "Decimal logical type");
- }
- auto* logical_type = new DecimalLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Decimal(precision, scale));
- return std::shared_ptr<const LogicalType>(logical_type);
-}
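
Because validation happens in Make(), invalid parameter combinations fail
fast instead of producing a malformed schema. A sketch:

  // DecimalLogicalType::Make(0, 0) throws: precision must be >= 1.
  // DecimalLogicalType::Make(5, 6) throws: scale must not exceed precision.
  auto price = DecimalLogicalType::Make(/*precision=*/10, /*scale=*/2);
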
-
-int32_t DecimalLogicalType::precision() const {
- return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).precision();
-}
-
-int32_t DecimalLogicalType::scale() const {
- return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).scale();
-}
-
-class LogicalType::Impl::Date final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class DateLogicalType;
-
- OVERRIDE_TOSTRING(Date)
- OVERRIDE_TOTHRIFT(DateType, DATE)
-
- private:
- Date()
- : LogicalType::Impl(LogicalType::Type::DATE, SortOrder::SIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::DATE),
- LogicalType::Impl::SimpleApplicable(parquet::Type::INT32) {}
-};
-
-GENERATE_MAKE(Date)
-
-#define time_unit_string(u___) \
- ((u___) == LogicalType::TimeUnit::MILLIS \
- ? "milliseconds" \
- : ((u___) == LogicalType::TimeUnit::MICROS \
- ? "microseconds" \
- : ((u___) == LogicalType::TimeUnit::NANOS ? "nanoseconds" : "unknown")))
-
-class LogicalType::Impl::Time final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::Applicable {
- public:
- friend class TimeLogicalType;
-
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- bool is_adjusted_to_utc() const { return adjusted_; }
- LogicalType::TimeUnit::unit time_unit() const { return unit_; }
-
- private:
- Time(bool a, LogicalType::TimeUnit::unit u)
- : LogicalType::Impl(LogicalType::Type::TIME, SortOrder::SIGNED),
- adjusted_(a),
- unit_(u) {}
- bool adjusted_ = false;
- LogicalType::TimeUnit::unit unit_;
-};
-
-bool LogicalType::Impl::Time::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- return (primitive_type == parquet::Type::INT32 &&
- unit_ == LogicalType::TimeUnit::MILLIS) ||
- (primitive_type == parquet::Type::INT64 &&
- (unit_ == LogicalType::TimeUnit::MICROS ||
- unit_ == LogicalType::TimeUnit::NANOS));
-}
-
-bool LogicalType::Impl::Time::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- if (converted_decimal_metadata.isset) {
- return false;
- } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MILLIS) {
- return converted_type == ConvertedType::TIME_MILLIS;
- } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MICROS) {
- return converted_type == ConvertedType::TIME_MICROS;
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
-}
-
-ConvertedType::type LogicalType::Impl::Time::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- reset_decimal_metadata(out_decimal_metadata);
- if (adjusted_) {
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- return ConvertedType::TIME_MILLIS;
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- return ConvertedType::TIME_MICROS;
- }
- }
- return ConvertedType::NONE;
-}
-
-std::string LogicalType::Impl::Time::ToString() const {
- std::stringstream type;
- type << "Time(isAdjustedToUTC=" << std::boolalpha << adjusted_
- << ", timeUnit=" << time_unit_string(unit_) << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Time::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Time", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
- << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"("})";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Time::ToThrift() const {
- format::LogicalType type;
- format::TimeType time_type;
- format::TimeUnit time_unit;
- DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- format::MilliSeconds millis;
- time_unit.__set_MILLIS(millis);
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- format::MicroSeconds micros;
- time_unit.__set_MICROS(micros);
- } else if (unit_ == LogicalType::TimeUnit::NANOS) {
- format::NanoSeconds nanos;
- time_unit.__set_NANOS(nanos);
- }
- time_type.__set_isAdjustedToUTC(adjusted_);
- time_type.__set_unit(time_unit);
- type.__set_TIME(time_type);
- return type;
-}
-
-bool LogicalType::Impl::Time::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_time()) {
- const auto& other_time = checked_cast<const TimeLogicalType&>(other);
- eq =
- (adjusted_ == other_time.is_adjusted_to_utc() && unit_ == other_time.time_unit());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> TimeLogicalType::Make(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
- if (time_unit == LogicalType::TimeUnit::MILLIS ||
- time_unit == LogicalType::TimeUnit::MICROS ||
- time_unit == LogicalType::TimeUnit::NANOS) {
- auto* logical_type = new TimeLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Time(is_adjusted_to_utc, time_unit));
- return std::shared_ptr<const LogicalType>(logical_type);
- } else {
- throw ParquetException(
- "TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type");
- }
-}
-
-bool TimeLogicalType::is_adjusted_to_utc() const {
- return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).is_adjusted_to_utc();
-}
-
-LogicalType::TimeUnit::unit TimeLogicalType::time_unit() const {
- return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).time_unit();
-}
-
-class LogicalType::Impl::Timestamp final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class TimestampLogicalType;
-
- bool is_serialized() const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- bool is_adjusted_to_utc() const { return adjusted_; }
- LogicalType::TimeUnit::unit time_unit() const { return unit_; }
-
- bool is_from_converted_type() const { return is_from_converted_type_; }
- bool force_set_converted_type() const { return force_set_converted_type_; }
-
- private:
- Timestamp(bool adjusted, LogicalType::TimeUnit::unit unit, bool is_from_converted_type,
- bool force_set_converted_type)
- : LogicalType::Impl(LogicalType::Type::TIMESTAMP, SortOrder::SIGNED),
- LogicalType::Impl::SimpleApplicable(parquet::Type::INT64),
- adjusted_(adjusted),
- unit_(unit),
- is_from_converted_type_(is_from_converted_type),
- force_set_converted_type_(force_set_converted_type) {}
- bool adjusted_ = false;
- LogicalType::TimeUnit::unit unit_;
- bool is_from_converted_type_ = false;
- bool force_set_converted_type_ = false;
-};
-
-bool LogicalType::Impl::Timestamp::is_serialized() const {
- return !is_from_converted_type_;
-}
-
-bool LogicalType::Impl::Timestamp::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- if (converted_decimal_metadata.isset) {
- return false;
- } else if (unit_ == LogicalType::TimeUnit::MILLIS) {
- if (adjusted_ || force_set_converted_type_) {
- return converted_type == ConvertedType::TIMESTAMP_MILLIS;
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- if (adjusted_ || force_set_converted_type_) {
- return converted_type == ConvertedType::TIMESTAMP_MICROS;
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
-}
-
-ConvertedType::type LogicalType::Impl::Timestamp::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- reset_decimal_metadata(out_decimal_metadata);
- if (adjusted_ || force_set_converted_type_) {
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- return ConvertedType::TIMESTAMP_MILLIS;
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- return ConvertedType::TIMESTAMP_MICROS;
- }
- }
- return ConvertedType::NONE;
-}
-
-std::string LogicalType::Impl::Timestamp::ToString() const {
- std::stringstream type;
- type << "Timestamp(isAdjustedToUTC=" << std::boolalpha << adjusted_
- << ", timeUnit=" << time_unit_string(unit_)
- << ", is_from_converted_type=" << is_from_converted_type_
- << ", force_set_converted_type=" << force_set_converted_type_ << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Timestamp::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Timestamp", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
- << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"(")"
- << R"(, "is_from_converted_type": )" << is_from_converted_type_
- << R"(, "force_set_converted_type": )" << force_set_converted_type_ << R"(})";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Timestamp::ToThrift() const {
- format::LogicalType type;
- format::TimestampType timestamp_type;
- format::TimeUnit time_unit;
- DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- format::MilliSeconds millis;
- time_unit.__set_MILLIS(millis);
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- format::MicroSeconds micros;
- time_unit.__set_MICROS(micros);
- } else if (unit_ == LogicalType::TimeUnit::NANOS) {
- format::NanoSeconds nanos;
- time_unit.__set_NANOS(nanos);
- }
- timestamp_type.__set_isAdjustedToUTC(adjusted_);
- timestamp_type.__set_unit(time_unit);
- type.__set_TIMESTAMP(timestamp_type);
- return type;
-}
-
-bool LogicalType::Impl::Timestamp::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_timestamp()) {
- const auto& other_timestamp = checked_cast<const TimestampLogicalType&>(other);
- eq = (adjusted_ == other_timestamp.is_adjusted_to_utc() &&
- unit_ == other_timestamp.time_unit());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> TimestampLogicalType::Make(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type, bool force_set_converted_type) {
- if (time_unit == LogicalType::TimeUnit::MILLIS ||
- time_unit == LogicalType::TimeUnit::MICROS ||
- time_unit == LogicalType::TimeUnit::NANOS) {
- auto* logical_type = new TimestampLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Timestamp(
- is_adjusted_to_utc, time_unit, is_from_converted_type, force_set_converted_type));
- return std::shared_ptr<const LogicalType>(logical_type);
- } else {
- throw ParquetException(
- "TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type");
- }
-}
-
-bool TimestampLogicalType::is_adjusted_to_utc() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).is_adjusted_to_utc();
-}
-
-LogicalType::TimeUnit::unit TimestampLogicalType::time_unit() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).time_unit();
-}
-
-bool TimestampLogicalType::is_from_converted_type() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
- .is_from_converted_type();
-}
-
-bool TimestampLogicalType::force_set_converted_type() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
- .force_set_converted_type();
-}
-
-class LogicalType::Impl::Interval final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::TypeLengthApplicable {
- public:
- friend class IntervalLogicalType;
-
- OVERRIDE_TOSTRING(Interval)
- // TODO(tpboudreau): uncomment the following line to enable serialization after
- // parquet.thrift recognizes IntervalType as a ConvertedType
- // OVERRIDE_TOTHRIFT(IntervalType, INTERVAL)
-
- private:
- Interval()
- : LogicalType::Impl(LogicalType::Type::INTERVAL, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::INTERVAL),
- LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 12) {
- }
-};
-
-GENERATE_MAKE(Interval)
-
-class LogicalType::Impl::Int final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::Applicable {
- public:
- friend class IntLogicalType;
-
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- int bit_width() const { return width_; }
- bool is_signed() const { return signed_; }
-
- private:
- Int(int w, bool s)
- : LogicalType::Impl(LogicalType::Type::INT,
- (s ? SortOrder::SIGNED : SortOrder::UNSIGNED)),
- width_(w),
- signed_(s) {}
- int width_ = 0;
- bool signed_ = false;
-};
-
-bool LogicalType::Impl::Int::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- return (primitive_type == parquet::Type::INT32 && width_ <= 32) ||
- (primitive_type == parquet::Type::INT64 && width_ == 64);
-}
-
-bool LogicalType::Impl::Int::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- if (converted_decimal_metadata.isset) {
- return false;
- } else if (signed_ && width_ == 8) {
- return converted_type == ConvertedType::INT_8;
- } else if (signed_ && width_ == 16) {
- return converted_type == ConvertedType::INT_16;
- } else if (signed_ && width_ == 32) {
- return converted_type == ConvertedType::INT_32;
- } else if (signed_ && width_ == 64) {
- return converted_type == ConvertedType::INT_64;
- } else if (!signed_ && width_ == 8) {
- return converted_type == ConvertedType::UINT_8;
- } else if (!signed_ && width_ == 16) {
- return converted_type == ConvertedType::UINT_16;
- } else if (!signed_ && width_ == 32) {
- return converted_type == ConvertedType::UINT_32;
- } else if (!signed_ && width_ == 64) {
- return converted_type == ConvertedType::UINT_64;
- } else {
- return false;
- }
-}
-
-ConvertedType::type LogicalType::Impl::Int::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- reset_decimal_metadata(out_decimal_metadata);
- if (signed_) {
- switch (width_) {
- case 8:
- return ConvertedType::INT_8;
- case 16:
- return ConvertedType::INT_16;
- case 32:
- return ConvertedType::INT_32;
- case 64:
- return ConvertedType::INT_64;
- }
- } else { // unsigned
- switch (width_) {
- case 8:
- return ConvertedType::UINT_8;
- case 16:
- return ConvertedType::UINT_16;
- case 32:
- return ConvertedType::UINT_32;
- case 64:
- return ConvertedType::UINT_64;
- }
- }
- return ConvertedType::NONE;
-}
-
-std::string LogicalType::Impl::Int::ToString() const {
- std::stringstream type;
- type << "Int(bitWidth=" << width_ << ", isSigned=" << std::boolalpha << signed_ << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Int::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Int", "bitWidth": )" << width_ << R"(, "isSigned": )"
- << std::boolalpha << signed_ << "}";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Int::ToThrift() const {
- format::LogicalType type;
- format::IntType int_type;
- DCHECK(width_ == 64 || width_ == 32 || width_ == 16 || width_ == 8);
- int_type.__set_bitWidth(static_cast<int8_t>(width_));
- int_type.__set_isSigned(signed_);
- type.__set_INTEGER(int_type);
- return type;
-}
-
-bool LogicalType::Impl::Int::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_int()) {
- const auto& other_int = checked_cast<const IntLogicalType&>(other);
- eq = (width_ == other_int.bit_width() && signed_ == other_int.is_signed());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> IntLogicalType::Make(int bit_width, bool is_signed) {
- if (bit_width == 8 || bit_width == 16 || bit_width == 32 || bit_width == 64) {
- auto* logical_type = new IntLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Int(bit_width, is_signed));
- return std::shared_ptr<const LogicalType>(logical_type);
- } else {
- throw ParquetException(
- "Bit width must be exactly 8, 16, 32, or 64 for Int logical type");
- }
-}
-
-int IntLogicalType::bit_width() const {
- return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).bit_width();
-}
-
-bool IntLogicalType::is_signed() const {
- return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).is_signed();
-}
-
-class LogicalType::Impl::Null final : public LogicalType::Impl::Incompatible,
- public LogicalType::Impl::UniversalApplicable {
- public:
- friend class NullLogicalType;
-
- OVERRIDE_TOSTRING(Null)
- OVERRIDE_TOTHRIFT(NullType, UNKNOWN)
-
- private:
- Null() : LogicalType::Impl(LogicalType::Type::NIL, SortOrder::UNKNOWN) {}
-};
-
-GENERATE_MAKE(Null)
-
-class LogicalType::Impl::JSON final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class JSONLogicalType;
-
- OVERRIDE_TOSTRING(JSON)
- OVERRIDE_TOTHRIFT(JsonType, JSON)
-
- private:
- JSON()
- : LogicalType::Impl(LogicalType::Type::JSON, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::JSON),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-GENERATE_MAKE(JSON)
-
-class LogicalType::Impl::BSON final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class BSONLogicalType;
-
- OVERRIDE_TOSTRING(BSON)
- OVERRIDE_TOTHRIFT(BsonType, BSON)
-
- private:
- BSON()
- : LogicalType::Impl(LogicalType::Type::BSON, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::BSON),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-GENERATE_MAKE(BSON)
-
-class LogicalType::Impl::UUID final : public LogicalType::Impl::Incompatible,
- public LogicalType::Impl::TypeLengthApplicable {
- public:
- friend class UUIDLogicalType;
-
- OVERRIDE_TOSTRING(UUID)
- OVERRIDE_TOTHRIFT(UUIDType, UUID)
-
- private:
- UUID()
- : LogicalType::Impl(LogicalType::Type::UUID, SortOrder::UNSIGNED),
- LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 16) {
- }
-};
-
-GENERATE_MAKE(UUID)
-
-class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::UniversalApplicable {
- public:
- friend class NoLogicalType;
-
- OVERRIDE_TOSTRING(None)
-
- private:
- No()
- : LogicalType::Impl(LogicalType::Type::NONE, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::NONE) {}
-};
-
-GENERATE_MAKE(No)
-
-class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::UniversalApplicable {
- public:
- friend class UndefinedLogicalType;
-
- OVERRIDE_TOSTRING(Undefined)
-
- private:
- Undefined()
- : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
-};
-
-GENERATE_MAKE(Undefined)
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/logging.h"
+
+#include "parquet/exception.h"
+#include "parquet/types.h"
+
+#include "generated/parquet_types.h"
+
+using arrow::internal::checked_cast;
+using arrow::util::Codec;
+
+namespace parquet {
+
+bool IsCodecSupported(Compression::type codec) {
+ switch (codec) {
+ case Compression::UNCOMPRESSED:
+ case Compression::SNAPPY:
+ case Compression::GZIP:
+ case Compression::BROTLI:
+ case Compression::ZSTD:
+ case Compression::LZ4:
+ case Compression::LZ4_HADOOP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec) {
+ return GetCodec(codec, Codec::UseDefaultCompressionLevel());
+}
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
+ std::unique_ptr<Codec> result;
+ if (codec == Compression::LZO) {
+ throw ParquetException(
+ "While LZO compression is supported by the Parquet format in "
+ "general, it is currently not supported by the C++ implementation.");
+ }
+
+ if (!IsCodecSupported(codec)) {
+ std::stringstream ss;
+ ss << "Codec type " << Codec::GetCodecAsString(codec)
+ << " not supported in Parquet format";
+ throw ParquetException(ss.str());
+ }
+
+ PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, compression_level));
+ return result;
+}
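
A sketch of the codec factory in use (the compression level is illustrative):

  // Unsupported-in-C++ codecs such as LZO throw, as does anything that
  // IsCodecSupported() rejects.
  std::unique_ptr<Codec> zstd = GetCodec(Compression::ZSTD,
                                         /*compression_level=*/5);
  std::unique_ptr<Codec> snappy = GetCodec(Compression::SNAPPY);  // default level
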
+
+std::string FormatStatValue(Type::type parquet_type, ::arrow::util::string_view val) {
+ std::stringstream result;
+
+ const char* bytes = val.data();
+ switch (parquet_type) {
+ case Type::BOOLEAN:
+ result << reinterpret_cast<const bool*>(bytes)[0];
+ break;
+ case Type::INT32:
+ result << reinterpret_cast<const int32_t*>(bytes)[0];
+ break;
+ case Type::INT64:
+ result << reinterpret_cast<const int64_t*>(bytes)[0];
+ break;
+ case Type::DOUBLE:
+ result << reinterpret_cast<const double*>(bytes)[0];
+ break;
+ case Type::FLOAT:
+ result << reinterpret_cast<const float*>(bytes)[0];
+ break;
+ case Type::INT96: {
+ auto const i32_val = reinterpret_cast<const int32_t*>(bytes);
+ result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
+ break;
+ }
+ case Type::BYTE_ARRAY: {
+ return std::string(val);
+ }
+ case Type::FIXED_LEN_BYTE_ARRAY: {
+ return std::string(val);
+ }
+ case Type::UNDEFINED:
+ default:
+ break;
+ }
+ return result.str();
+}
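+
+// The value is reinterpreted in place from the raw statistics bytes, so callers
+// pass a view over the physical value. A minimal sketch (editor's example, not
+// upstream code):
+//
+//   int32_t v = 42;
+//   std::string s = FormatStatValue(
+//       Type::INT32,
+//       ::arrow::util::string_view(reinterpret_cast<const char*>(&v), sizeof(v)));
+//   // s == "42"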
+
+std::string EncodingToString(Encoding::type t) {
+ switch (t) {
+ case Encoding::PLAIN:
+ return "PLAIN";
+ case Encoding::PLAIN_DICTIONARY:
+ return "PLAIN_DICTIONARY";
+ case Encoding::RLE:
+ return "RLE";
+ case Encoding::BIT_PACKED:
+ return "BIT_PACKED";
+ case Encoding::DELTA_BINARY_PACKED:
+ return "DELTA_BINARY_PACKED";
+ case Encoding::DELTA_LENGTH_BYTE_ARRAY:
+ return "DELTA_LENGTH_BYTE_ARRAY";
+ case Encoding::DELTA_BYTE_ARRAY:
+ return "DELTA_BYTE_ARRAY";
+ case Encoding::RLE_DICTIONARY:
+ return "RLE_DICTIONARY";
+ case Encoding::BYTE_STREAM_SPLIT:
+ return "BYTE_STREAM_SPLIT";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+std::string TypeToString(Type::type t) {
+ switch (t) {
+ case Type::BOOLEAN:
+ return "BOOLEAN";
+ case Type::INT32:
+ return "INT32";
+ case Type::INT64:
+ return "INT64";
+ case Type::INT96:
+ return "INT96";
+ case Type::FLOAT:
+ return "FLOAT";
+ case Type::DOUBLE:
+ return "DOUBLE";
+ case Type::BYTE_ARRAY:
+ return "BYTE_ARRAY";
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return "FIXED_LEN_BYTE_ARRAY";
+ case Type::UNDEFINED:
+ default:
+ return "UNKNOWN";
+ }
+}
+
+std::string ConvertedTypeToString(ConvertedType::type t) {
+ switch (t) {
+ case ConvertedType::NONE:
+ return "NONE";
+ case ConvertedType::UTF8:
+ return "UTF8";
+ case ConvertedType::MAP:
+ return "MAP";
+ case ConvertedType::MAP_KEY_VALUE:
+ return "MAP_KEY_VALUE";
+ case ConvertedType::LIST:
+ return "LIST";
+ case ConvertedType::ENUM:
+ return "ENUM";
+ case ConvertedType::DECIMAL:
+ return "DECIMAL";
+ case ConvertedType::DATE:
+ return "DATE";
+ case ConvertedType::TIME_MILLIS:
+ return "TIME_MILLIS";
+ case ConvertedType::TIME_MICROS:
+ return "TIME_MICROS";
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return "TIMESTAMP_MILLIS";
+ case ConvertedType::TIMESTAMP_MICROS:
+ return "TIMESTAMP_MICROS";
+ case ConvertedType::UINT_8:
+ return "UINT_8";
+ case ConvertedType::UINT_16:
+ return "UINT_16";
+ case ConvertedType::UINT_32:
+ return "UINT_32";
+ case ConvertedType::UINT_64:
+ return "UINT_64";
+ case ConvertedType::INT_8:
+ return "INT_8";
+ case ConvertedType::INT_16:
+ return "INT_16";
+ case ConvertedType::INT_32:
+ return "INT_32";
+ case ConvertedType::INT_64:
+ return "INT_64";
+ case ConvertedType::JSON:
+ return "JSON";
+ case ConvertedType::BSON:
+ return "BSON";
+ case ConvertedType::INTERVAL:
+ return "INTERVAL";
+ case ConvertedType::UNDEFINED:
+ default:
+ return "UNKNOWN";
+ }
+}
+
+int GetTypeByteSize(Type::type parquet_type) {
+ switch (parquet_type) {
+ case Type::BOOLEAN:
+ return type_traits<BooleanType::type_num>::value_byte_size;
+ case Type::INT32:
+ return type_traits<Int32Type::type_num>::value_byte_size;
+ case Type::INT64:
+ return type_traits<Int64Type::type_num>::value_byte_size;
+ case Type::INT96:
+ return type_traits<Int96Type::type_num>::value_byte_size;
+ case Type::DOUBLE:
+ return type_traits<DoubleType::type_num>::value_byte_size;
+ case Type::FLOAT:
+ return type_traits<FloatType::type_num>::value_byte_size;
+ case Type::BYTE_ARRAY:
+ return type_traits<ByteArrayType::type_num>::value_byte_size;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return type_traits<FLBAType::type_num>::value_byte_size;
+ case Type::UNDEFINED:
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+// Return the default sort order for a Parquet physical type.
+SortOrder::type DefaultSortOrder(Type::type primitive) {
+ switch (primitive) {
+ case Type::BOOLEAN:
+ case Type::INT32:
+ case Type::INT64:
+ case Type::FLOAT:
+ case Type::DOUBLE:
+ return SortOrder::SIGNED;
+ case Type::BYTE_ARRAY:
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return SortOrder::UNSIGNED;
+ case Type::INT96:
+ case Type::UNDEFINED:
+ return SortOrder::UNKNOWN;
+ }
+ return SortOrder::UNKNOWN;
+}
+
+// Return the sort order for a Parquet type, using the legacy converted type (or
+// the logical type, in the overload below) when present and the physical type's
+// default otherwise.
+SortOrder::type GetSortOrder(ConvertedType::type converted, Type::type primitive) {
+ if (converted == ConvertedType::NONE) return DefaultSortOrder(primitive);
+ switch (converted) {
+ case ConvertedType::INT_8:
+ case ConvertedType::INT_16:
+ case ConvertedType::INT_32:
+ case ConvertedType::INT_64:
+ case ConvertedType::DATE:
+ case ConvertedType::TIME_MICROS:
+ case ConvertedType::TIME_MILLIS:
+ case ConvertedType::TIMESTAMP_MICROS:
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return SortOrder::SIGNED;
+ case ConvertedType::UINT_8:
+ case ConvertedType::UINT_16:
+ case ConvertedType::UINT_32:
+ case ConvertedType::UINT_64:
+ case ConvertedType::ENUM:
+ case ConvertedType::UTF8:
+ case ConvertedType::BSON:
+ case ConvertedType::JSON:
+ return SortOrder::UNSIGNED;
+ case ConvertedType::DECIMAL:
+ case ConvertedType::LIST:
+ case ConvertedType::MAP:
+ case ConvertedType::MAP_KEY_VALUE:
+ case ConvertedType::INTERVAL:
+ case ConvertedType::NONE: // required instead of default
+ case ConvertedType::NA: // required instead of default
+ case ConvertedType::UNDEFINED:
+ return SortOrder::UNKNOWN;
+ }
+ return SortOrder::UNKNOWN;
+}
+
+SortOrder::type GetSortOrder(const std::shared_ptr<const LogicalType>& logical_type,
+ Type::type primitive) {
+ SortOrder::type o = SortOrder::UNKNOWN;
+ if (logical_type && logical_type->is_valid()) {
+ o = (logical_type->is_none() ? DefaultSortOrder(primitive)
+ : logical_type->sort_order());
+ }
+ return o;
+}
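+
+// For example (illustrative), an unsigned converted type overrides the signed
+// default of its physical type:
+//
+//   GetSortOrder(ConvertedType::UINT_32, Type::INT32);  // SortOrder::UNSIGNED
+//   GetSortOrder(ConvertedType::NONE, Type::INT32);     // SortOrder::SIGNED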
+
+ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
+ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
+
+// Static methods for LogicalType class
+
+std::shared_ptr<const LogicalType> LogicalType::FromConvertedType(
+ const ConvertedType::type converted_type,
+ const schema::DecimalMetadata converted_decimal_metadata) {
+ switch (converted_type) {
+ case ConvertedType::UTF8:
+ return StringLogicalType::Make();
+ case ConvertedType::MAP_KEY_VALUE:
+ case ConvertedType::MAP:
+ return MapLogicalType::Make();
+ case ConvertedType::LIST:
+ return ListLogicalType::Make();
+ case ConvertedType::ENUM:
+ return EnumLogicalType::Make();
+ case ConvertedType::DECIMAL:
+ return DecimalLogicalType::Make(converted_decimal_metadata.precision,
+ converted_decimal_metadata.scale);
+ case ConvertedType::DATE:
+ return DateLogicalType::Make();
+ case ConvertedType::TIME_MILLIS:
+ return TimeLogicalType::Make(true, LogicalType::TimeUnit::MILLIS);
+ case ConvertedType::TIME_MICROS:
+ return TimeLogicalType::Make(true, LogicalType::TimeUnit::MICROS);
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MILLIS,
+ /*is_from_converted_type=*/true,
+ /*force_set_converted_type=*/false);
+ case ConvertedType::TIMESTAMP_MICROS:
+ return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MICROS,
+ /*is_from_converted_type=*/true,
+ /*force_set_converted_type=*/false);
+ case ConvertedType::INTERVAL:
+ return IntervalLogicalType::Make();
+ case ConvertedType::INT_8:
+ return IntLogicalType::Make(8, true);
+ case ConvertedType::INT_16:
+ return IntLogicalType::Make(16, true);
+ case ConvertedType::INT_32:
+ return IntLogicalType::Make(32, true);
+ case ConvertedType::INT_64:
+ return IntLogicalType::Make(64, true);
+ case ConvertedType::UINT_8:
+ return IntLogicalType::Make(8, false);
+ case ConvertedType::UINT_16:
+ return IntLogicalType::Make(16, false);
+ case ConvertedType::UINT_32:
+ return IntLogicalType::Make(32, false);
+ case ConvertedType::UINT_64:
+ return IntLogicalType::Make(64, false);
+ case ConvertedType::JSON:
+ return JSONLogicalType::Make();
+ case ConvertedType::BSON:
+ return BSONLogicalType::Make();
+ case ConvertedType::NA:
+ return NullLogicalType::Make();
+ case ConvertedType::NONE:
+ return NoLogicalType::Make();
+ case ConvertedType::UNDEFINED:
+ return UndefinedLogicalType::Make();
+ }
+ return UndefinedLogicalType::Make();
+}
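+
+// Editor's sketch of a typical translation (not upstream code):
+//
+//   auto t = LogicalType::FromConvertedType(ConvertedType::TIMESTAMP_MICROS);
+//   // t->is_timestamp() is true, and because the type was derived from a legacy
+//   // converted type, t->is_serialized() is false (see Impl::Timestamp below).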
+
+std::shared_ptr<const LogicalType> LogicalType::FromThrift(
+ const format::LogicalType& type) {
+ if (type.__isset.STRING) {
+ return StringLogicalType::Make();
+ } else if (type.__isset.MAP) {
+ return MapLogicalType::Make();
+ } else if (type.__isset.LIST) {
+ return ListLogicalType::Make();
+ } else if (type.__isset.ENUM) {
+ return EnumLogicalType::Make();
+ } else if (type.__isset.DECIMAL) {
+ return DecimalLogicalType::Make(type.DECIMAL.precision, type.DECIMAL.scale);
+ } else if (type.__isset.DATE) {
+ return DateLogicalType::Make();
+ } else if (type.__isset.TIME) {
+ LogicalType::TimeUnit::unit unit;
+ if (type.TIME.unit.__isset.MILLIS) {
+ unit = LogicalType::TimeUnit::MILLIS;
+ } else if (type.TIME.unit.__isset.MICROS) {
+ unit = LogicalType::TimeUnit::MICROS;
+ } else if (type.TIME.unit.__isset.NANOS) {
+ unit = LogicalType::TimeUnit::NANOS;
+ } else {
+ unit = LogicalType::TimeUnit::UNKNOWN;
+ }
+ return TimeLogicalType::Make(type.TIME.isAdjustedToUTC, unit);
+ } else if (type.__isset.TIMESTAMP) {
+ LogicalType::TimeUnit::unit unit;
+ if (type.TIMESTAMP.unit.__isset.MILLIS) {
+ unit = LogicalType::TimeUnit::MILLIS;
+ } else if (type.TIMESTAMP.unit.__isset.MICROS) {
+ unit = LogicalType::TimeUnit::MICROS;
+ } else if (type.TIMESTAMP.unit.__isset.NANOS) {
+ unit = LogicalType::TimeUnit::NANOS;
+ } else {
+ unit = LogicalType::TimeUnit::UNKNOWN;
+ }
+ return TimestampLogicalType::Make(type.TIMESTAMP.isAdjustedToUTC, unit);
+ // TODO(tpboudreau): activate the commented code after parquet.thrift
+ // recognizes IntervalType as a LogicalType
+ //} else if (type.__isset.INTERVAL) {
+ // return IntervalLogicalType::Make();
+ } else if (type.__isset.INTEGER) {
+ return IntLogicalType::Make(static_cast<int>(type.INTEGER.bitWidth),
+ type.INTEGER.isSigned);
+ } else if (type.__isset.UNKNOWN) {
+ return NullLogicalType::Make();
+ } else if (type.__isset.JSON) {
+ return JSONLogicalType::Make();
+ } else if (type.__isset.BSON) {
+ return BSONLogicalType::Make();
+ } else if (type.__isset.UUID) {
+ return UUIDLogicalType::Make();
+ } else {
+ throw ParquetException("Metadata contains Thrift LogicalType that is not recognized");
+ }
+}
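+
+// Editor's sketch of the reverse direction, using the Thrift-generated setters
+// from parquet_types.h (illustrative only):
+//
+//   format::LogicalType t;
+//   t.__set_STRING(format::StringType());
+//   auto logical = LogicalType::FromThrift(t);  // Equals() LogicalType::String()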
+
+std::shared_ptr<const LogicalType> LogicalType::String() {
+ return StringLogicalType::Make();
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Map() { return MapLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::List() { return ListLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Enum() { return EnumLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Decimal(int32_t precision,
+ int32_t scale) {
+ return DecimalLogicalType::Make(precision, scale);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Date() { return DateLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Time(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
+ DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
+ return TimeLogicalType::Make(is_adjusted_to_utc, time_unit);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Timestamp(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type, bool force_set_converted_type) {
+ DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
+ return TimestampLogicalType::Make(is_adjusted_to_utc, time_unit, is_from_converted_type,
+ force_set_converted_type);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Interval() {
+ return IntervalLogicalType::Make();
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Int(int bit_width, bool is_signed) {
+ DCHECK(bit_width == 64 || bit_width == 32 || bit_width == 16 || bit_width == 8);
+ return IntLogicalType::Make(bit_width, is_signed);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Null() { return NullLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::JSON() { return JSONLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::BSON() { return BSONLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::UUID() { return UUIDLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
+
+/*
+ * The logical type implementation classes are built in four layers: (1) the base
+ * layer, which establishes the interface and provides generally reusable implementations
+ * for the ToJSON() and Equals() methods; (2) an intermediate derived layer for the
+ * "compatibility" methods, which provides implementations for is_compatible() and
+ * ToConvertedType(); (3) another intermediate layer for the "applicability" methods
+ * that provides several implementations for the is_applicable() method; and (4) the
+ * final derived classes, one for each logical type, which supply implementations
+ * for those methods that remain virtual (usually just ToString() and ToThrift()) or
+ * otherwise need to be overridden.
+ */
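+
+// Because the intermediate layers inherit LogicalType::Impl virtually, each final
+// class initializes the single shared Impl base exactly once in its own
+// member-initializer list (visible in the constructors below), sidestepping any
+// diamond-inheritance ambiguity between the compatibility and applicability
+// mix-ins.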
+
+// LogicalTypeImpl base class
+
+class LogicalType::Impl {
+ public:
+ virtual bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const = 0;
+
+ virtual bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata = {
+ false, -1, -1}) const = 0;
+
+ virtual ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const = 0;
+
+ virtual std::string ToString() const = 0;
+
+ virtual bool is_serialized() const {
+ return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
+ }
+
+ virtual std::string ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": ")" << ToString() << R"("})";
+ return json.str();
+ }
+
+ virtual format::LogicalType ToThrift() const {
+ // logical types inheriting this method should never be serialized
+ std::stringstream ss;
+ ss << "Logical type " << ToString() << " should not be serialized";
+ throw ParquetException(ss.str());
+ }
+
+ virtual bool Equals(const LogicalType& other) const { return other.type() == type_; }
+
+ LogicalType::Type::type type() const { return type_; }
+
+ SortOrder::type sort_order() const { return order_; }
+
+ Impl(const Impl&) = delete;
+ Impl& operator=(const Impl&) = delete;
+ virtual ~Impl() noexcept {}
+
+ class Compatible;
+ class SimpleCompatible;
+ class Incompatible;
+
+ class Applicable;
+ class SimpleApplicable;
+ class TypeLengthApplicable;
+ class UniversalApplicable;
+ class Inapplicable;
+
+ class String;
+ class Map;
+ class List;
+ class Enum;
+ class Decimal;
+ class Date;
+ class Time;
+ class Timestamp;
+ class Interval;
+ class Int;
+ class Null;
+ class JSON;
+ class BSON;
+ class UUID;
+ class No;
+ class Undefined;
+
+ protected:
+ Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
+ Impl() = default;
+
+ private:
+ LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
+ SortOrder::type order_ = SortOrder::UNKNOWN;
+};
+
+// Special methods for public LogicalType class
+
+LogicalType::LogicalType() = default;
+LogicalType::~LogicalType() noexcept = default;
+
+// Delegating methods for public LogicalType class
+
+bool LogicalType::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return impl_->is_applicable(primitive_type, primitive_length);
+}
+
+bool LogicalType::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ return impl_->is_compatible(converted_type, converted_decimal_metadata);
+}
+
+ConvertedType::type LogicalType::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ return impl_->ToConvertedType(out_decimal_metadata);
+}
+
+std::string LogicalType::ToString() const { return impl_->ToString(); }
+
+std::string LogicalType::ToJSON() const { return impl_->ToJSON(); }
+
+format::LogicalType LogicalType::ToThrift() const { return impl_->ToThrift(); }
+
+bool LogicalType::Equals(const LogicalType& other) const { return impl_->Equals(other); }
+
+LogicalType::Type::type LogicalType::type() const { return impl_->type(); }
+
+SortOrder::type LogicalType::sort_order() const { return impl_->sort_order(); }
+
+// Type checks for public LogicalType class
+
+bool LogicalType::is_string() const { return impl_->type() == LogicalType::Type::STRING; }
+bool LogicalType::is_map() const { return impl_->type() == LogicalType::Type::MAP; }
+bool LogicalType::is_list() const { return impl_->type() == LogicalType::Type::LIST; }
+bool LogicalType::is_enum() const { return impl_->type() == LogicalType::Type::ENUM; }
+bool LogicalType::is_decimal() const {
+ return impl_->type() == LogicalType::Type::DECIMAL;
+}
+bool LogicalType::is_date() const { return impl_->type() == LogicalType::Type::DATE; }
+bool LogicalType::is_time() const { return impl_->type() == LogicalType::Type::TIME; }
+bool LogicalType::is_timestamp() const {
+ return impl_->type() == LogicalType::Type::TIMESTAMP;
+}
+bool LogicalType::is_interval() const {
+ return impl_->type() == LogicalType::Type::INTERVAL;
+}
+bool LogicalType::is_int() const { return impl_->type() == LogicalType::Type::INT; }
+bool LogicalType::is_null() const { return impl_->type() == LogicalType::Type::NIL; }
+bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::JSON; }
+bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
+bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
+bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
+bool LogicalType::is_valid() const {
+ return impl_->type() != LogicalType::Type::UNDEFINED;
+}
+bool LogicalType::is_invalid() const { return !is_valid(); }
+bool LogicalType::is_nested() const {
+ return (impl_->type() == LogicalType::Type::LIST) ||
+ (impl_->type() == LogicalType::Type::MAP);
+}
+bool LogicalType::is_nonnested() const { return !is_nested(); }
+bool LogicalType::is_serialized() const { return impl_->is_serialized(); }
+
+// LogicalTypeImpl intermediate "compatibility" classes
+
+class LogicalType::Impl::Compatible : public virtual LogicalType::Impl {
+ protected:
+ Compatible() = default;
+};
+
+#define set_decimal_metadata(m___, i___, p___, s___) \
+ { \
+ if (m___) { \
+ (m___)->isset = (i___); \
+ (m___)->scale = (s___); \
+ (m___)->precision = (p___); \
+ } \
+ }
+
+#define reset_decimal_metadata(m___) \
+ { set_decimal_metadata(m___, false, -1, -1); }
+
+// For logical types that always translate to the same converted type
+class LogicalType::Impl::SimpleCompatible : public virtual LogicalType::Impl::Compatible {
+ public:
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == converted_type_) && !converted_decimal_metadata.isset;
+ }
+
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override {
+ reset_decimal_metadata(out_decimal_metadata);
+ return converted_type_;
+ }
+
+ protected:
+ explicit SimpleCompatible(ConvertedType::type c) : converted_type_(c) {}
+
+ private:
+ ConvertedType::type converted_type_ = ConvertedType::NA;
+};
+
+// For logical types that have no corresponding converted type
+class LogicalType::Impl::Incompatible : public virtual LogicalType::Impl {
+ public:
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == ConvertedType::NONE ||
+ converted_type == ConvertedType::NA) &&
+ !converted_decimal_metadata.isset;
+ }
+
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override {
+ reset_decimal_metadata(out_decimal_metadata);
+ return ConvertedType::NONE;
+ }
+
+ protected:
+ Incompatible() = default;
+};
+
+// LogicalTypeImpl intermediate "applicability" classes
+
+class LogicalType::Impl::Applicable : public virtual LogicalType::Impl {
+ protected:
+ Applicable() = default;
+};
+
+// For logical types that can apply only to a single
+// physical type
+class LogicalType::Impl::SimpleApplicable : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return primitive_type == type_;
+ }
+
+ protected:
+ explicit SimpleApplicable(parquet::Type::type t) : type_(t) {}
+
+ private:
+ parquet::Type::type type_;
+};
+
+// For logical types that can apply only to a particular
+// physical type and physical length combination
+class LogicalType::Impl::TypeLengthApplicable
+ : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return primitive_type == type_ && primitive_length == length_;
+ }
+
+ protected:
+ TypeLengthApplicable(parquet::Type::type t, int32_t l) : type_(t), length_(l) {}
+
+ private:
+ parquet::Type::type type_;
+ int32_t length_;
+};
+
+// For logical types that can apply to any physical type
+class LogicalType::Impl::UniversalApplicable
+ : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return true;
+ }
+
+ protected:
+ UniversalApplicable() = default;
+};
+
+// For logical types that can never apply to any primitive
+// physical type
+class LogicalType::Impl::Inapplicable : public virtual LogicalType::Impl {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return false;
+ }
+
+ protected:
+ Inapplicable() = default;
+};
+
+// LogicalType implementation final classes
+
+#define OVERRIDE_TOSTRING(n___) \
+ std::string ToString() const override { return #n___; }
+
+#define OVERRIDE_TOTHRIFT(t___, s___) \
+ format::LogicalType ToThrift() const override { \
+ format::LogicalType type; \
+ format::t___ subtype; \
+ type.__set_##s___(subtype); \
+ return type; \
+ }
+
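+// For instance, OVERRIDE_TOTHRIFT(StringType, STRING) in the String class below
+// expands, roughly, to:
+//
+//   format::LogicalType ToThrift() const override {
+//     format::LogicalType type;
+//     format::StringType subtype;
+//     type.__set_STRING(subtype);
+//     return type;
+//   }
+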
+class LogicalType::Impl::String final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class StringLogicalType;
+
+ OVERRIDE_TOSTRING(String)
+ OVERRIDE_TOTHRIFT(StringType, STRING)
+
+ private:
+ String()
+ : LogicalType::Impl(LogicalType::Type::STRING, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::UTF8),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+// Each public logical type class's Make() creation method instantiates a corresponding
+// LogicalType::Impl::* object and installs that implementation in the logical type
+// it returns.
+
+#define GENERATE_MAKE(a___) \
+ std::shared_ptr<const LogicalType> a___##LogicalType::Make() { \
+ auto* logical_type = new a___##LogicalType(); \
+ logical_type->impl_.reset(new LogicalType::Impl::a___()); \
+ return std::shared_ptr<const LogicalType>(logical_type); \
+ }
+
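+// For example, the GENERATE_MAKE(String) invocation below expands, roughly, to:
+//
+//   std::shared_ptr<const LogicalType> StringLogicalType::Make() {
+//     auto* logical_type = new StringLogicalType();
+//     logical_type->impl_.reset(new LogicalType::Impl::String());
+//     return std::shared_ptr<const LogicalType>(logical_type);
+//   }
+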
+GENERATE_MAKE(String)
+
+class LogicalType::Impl::Map final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::Inapplicable {
+ public:
+ friend class MapLogicalType;
+
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == ConvertedType::MAP ||
+ converted_type == ConvertedType::MAP_KEY_VALUE) &&
+ !converted_decimal_metadata.isset;
+ }
+
+ OVERRIDE_TOSTRING(Map)
+ OVERRIDE_TOTHRIFT(MapType, MAP)
+
+ private:
+ Map()
+ : LogicalType::Impl(LogicalType::Type::MAP, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::MAP) {}
+};
+
+GENERATE_MAKE(Map)
+
+class LogicalType::Impl::List final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::Inapplicable {
+ public:
+ friend class ListLogicalType;
+
+ OVERRIDE_TOSTRING(List)
+ OVERRIDE_TOTHRIFT(ListType, LIST)
+
+ private:
+ List()
+ : LogicalType::Impl(LogicalType::Type::LIST, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::LIST) {}
+};
+
+GENERATE_MAKE(List)
+
+class LogicalType::Impl::Enum final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class EnumLogicalType;
+
+ OVERRIDE_TOSTRING(Enum)
+ OVERRIDE_TOTHRIFT(EnumType, ENUM)
+
+ private:
+ Enum()
+ : LogicalType::Impl(LogicalType::Type::ENUM, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::ENUM),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(Enum)
+
+// The parameterized logical types (currently Decimal, Time, Timestamp, and Int)
+// generally can't reuse the simple method implementations available in the base
+// and intermediate classes, so they (re)implement all of them.
+
+class LogicalType::Impl::Decimal final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class DecimalLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ int32_t precision() const { return precision_; }
+ int32_t scale() const { return scale_; }
+
+ private:
+ Decimal(int32_t p, int32_t s)
+ : LogicalType::Impl(LogicalType::Type::DECIMAL, SortOrder::SIGNED),
+ precision_(p),
+ scale_(s) {}
+ int32_t precision_ = -1;
+ int32_t scale_ = -1;
+};
+
+bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ bool ok = false;
+ switch (primitive_type) {
+ case parquet::Type::INT32: {
+ ok = (1 <= precision_) && (precision_ <= 9);
+ } break;
+ case parquet::Type::INT64: {
+ ok = (1 <= precision_) && (precision_ <= 18);
+ if (precision_ < 10) {
+ // FIXME(tpb): warn that INT32 could be used
+ }
+ } break;
+ case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ ok = precision_ <= static_cast<int32_t>(std::floor(
+ std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
+ } break;
+ case parquet::Type::BYTE_ARRAY: {
+ ok = true;
+ } break;
+ default: {
+ } break;
+ }
+ return ok;
+}
+
+bool LogicalType::Impl::Decimal::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ return converted_type == ConvertedType::DECIMAL &&
+ (converted_decimal_metadata.isset &&
+ converted_decimal_metadata.scale == scale_ &&
+ converted_decimal_metadata.precision == precision_);
+}
+
+ConvertedType::type LogicalType::Impl::Decimal::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ set_decimal_metadata(out_decimal_metadata, true, precision_, scale_);
+ return ConvertedType::DECIMAL;
+}
+
+std::string LogicalType::Impl::Decimal::ToString() const {
+ std::stringstream type;
+ type << "Decimal(precision=" << precision_ << ", scale=" << scale_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Decimal::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Decimal", "precision": )" << precision_ << R"(, "scale": )"
+ << scale_ << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Decimal::ToThrift() const {
+ format::LogicalType type;
+ format::DecimalType decimal_type;
+ decimal_type.__set_precision(precision_);
+ decimal_type.__set_scale(scale_);
+ type.__set_DECIMAL(decimal_type);
+ return type;
+}
+
+bool LogicalType::Impl::Decimal::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_decimal()) {
+ const auto& other_decimal = checked_cast<const DecimalLogicalType&>(other);
+ eq = (precision_ == other_decimal.precision() && scale_ == other_decimal.scale());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> DecimalLogicalType::Make(int32_t precision,
+ int32_t scale) {
+ if (precision < 1) {
+ throw ParquetException(
+ "Precision must be greater than or equal to 1 for Decimal logical type");
+ }
+ if (scale < 0 || scale > precision) {
+ throw ParquetException(
+ "Scale must be a non-negative integer that does not exceed precision for "
+ "Decimal logical type");
+ }
+ auto* logical_type = new DecimalLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Decimal(precision, scale));
+ return std::shared_ptr<const LogicalType>(logical_type);
+}
+
+int32_t DecimalLogicalType::precision() const {
+ return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).precision();
+}
+
+int32_t DecimalLogicalType::scale() const {
+ return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).scale();
+}
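+
+// Illustrative (editor's sketch): precision determines which physical types the
+// Decimal annotation can apply to.
+//
+//   auto d = DecimalLogicalType::Make(/*precision=*/10, /*scale=*/2);
+//   // d->is_applicable(parquet::Type::INT32) is false (INT32 needs precision <= 9);
+//   // d->is_applicable(parquet::Type::INT64) is true  (precision <= 18).
+//   // DecimalLogicalType::Make(0, 0) throws ParquetException.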
+
+class LogicalType::Impl::Date final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class DateLogicalType;
+
+ OVERRIDE_TOSTRING(Date)
+ OVERRIDE_TOTHRIFT(DateType, DATE)
+
+ private:
+ Date()
+ : LogicalType::Impl(LogicalType::Type::DATE, SortOrder::SIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::DATE),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::INT32) {}
+};
+
+GENERATE_MAKE(Date)
+
+#define time_unit_string(u___) \
+ ((u___) == LogicalType::TimeUnit::MILLIS \
+ ? "milliseconds" \
+ : ((u___) == LogicalType::TimeUnit::MICROS \
+ ? "microseconds" \
+ : ((u___) == LogicalType::TimeUnit::NANOS ? "nanoseconds" : "unknown")))
+
+class LogicalType::Impl::Time final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class TimeLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ bool is_adjusted_to_utc() const { return adjusted_; }
+ LogicalType::TimeUnit::unit time_unit() const { return unit_; }
+
+ private:
+ Time(bool a, LogicalType::TimeUnit::unit u)
+ : LogicalType::Impl(LogicalType::Type::TIME, SortOrder::SIGNED),
+ adjusted_(a),
+ unit_(u) {}
+ bool adjusted_ = false;
+ LogicalType::TimeUnit::unit unit_;
+};
+
+bool LogicalType::Impl::Time::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return (primitive_type == parquet::Type::INT32 &&
+ unit_ == LogicalType::TimeUnit::MILLIS) ||
+ (primitive_type == parquet::Type::INT64 &&
+ (unit_ == LogicalType::TimeUnit::MICROS ||
+ unit_ == LogicalType::TimeUnit::NANOS));
+}
+
+bool LogicalType::Impl::Time::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MILLIS) {
+ return converted_type == ConvertedType::TIME_MILLIS;
+ } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MICROS) {
+ return converted_type == ConvertedType::TIME_MICROS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Time::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (adjusted_) {
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ return ConvertedType::TIME_MILLIS;
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ return ConvertedType::TIME_MICROS;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Time::ToString() const {
+ std::stringstream type;
+ type << "Time(isAdjustedToUTC=" << std::boolalpha << adjusted_
+ << ", timeUnit=" << time_unit_string(unit_) << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Time::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Time", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
+ << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"("})";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Time::ToThrift() const {
+ format::LogicalType type;
+ format::TimeType time_type;
+ format::TimeUnit time_unit;
+ DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ format::MilliSeconds millis;
+ time_unit.__set_MILLIS(millis);
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ format::MicroSeconds micros;
+ time_unit.__set_MICROS(micros);
+ } else if (unit_ == LogicalType::TimeUnit::NANOS) {
+ format::NanoSeconds nanos;
+ time_unit.__set_NANOS(nanos);
+ }
+ time_type.__set_isAdjustedToUTC(adjusted_);
+ time_type.__set_unit(time_unit);
+ type.__set_TIME(time_type);
+ return type;
+}
+
+bool LogicalType::Impl::Time::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_time()) {
+ const auto& other_time = checked_cast<const TimeLogicalType&>(other);
+ eq =
+ (adjusted_ == other_time.is_adjusted_to_utc() && unit_ == other_time.time_unit());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> TimeLogicalType::Make(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
+ if (time_unit == LogicalType::TimeUnit::MILLIS ||
+ time_unit == LogicalType::TimeUnit::MICROS ||
+ time_unit == LogicalType::TimeUnit::NANOS) {
+ auto* logical_type = new TimeLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Time(is_adjusted_to_utc, time_unit));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type");
+ }
+}
+
+bool TimeLogicalType::is_adjusted_to_utc() const {
+ return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).is_adjusted_to_utc();
+}
+
+LogicalType::TimeUnit::unit TimeLogicalType::time_unit() const {
+ return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).time_unit();
+}
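+
+// Illustrative: the time unit determines the admissible physical type.
+//
+//   auto t = TimeLogicalType::Make(/*is_adjusted_to_utc=*/true,
+//                                  LogicalType::TimeUnit::MILLIS);
+//   // t->is_applicable(parquet::Type::INT32) is true; MICROS and NANOS times
+//   // require parquet::Type::INT64 instead.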
+
+class LogicalType::Impl::Timestamp final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class TimestampLogicalType;
+
+ bool is_serialized() const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ bool is_adjusted_to_utc() const { return adjusted_; }
+ LogicalType::TimeUnit::unit time_unit() const { return unit_; }
+
+ bool is_from_converted_type() const { return is_from_converted_type_; }
+ bool force_set_converted_type() const { return force_set_converted_type_; }
+
+ private:
+ Timestamp(bool adjusted, LogicalType::TimeUnit::unit unit, bool is_from_converted_type,
+ bool force_set_converted_type)
+ : LogicalType::Impl(LogicalType::Type::TIMESTAMP, SortOrder::SIGNED),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::INT64),
+ adjusted_(adjusted),
+ unit_(unit),
+ is_from_converted_type_(is_from_converted_type),
+ force_set_converted_type_(force_set_converted_type) {}
+ bool adjusted_ = false;
+ LogicalType::TimeUnit::unit unit_;
+ bool is_from_converted_type_ = false;
+ bool force_set_converted_type_ = false;
+};
+
+bool LogicalType::Impl::Timestamp::is_serialized() const {
+ return !is_from_converted_type_;
+}
+
+bool LogicalType::Impl::Timestamp::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ if (adjusted_ || force_set_converted_type_) {
+ return converted_type == ConvertedType::TIMESTAMP_MILLIS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ if (adjusted_ || force_set_converted_type_) {
+ return converted_type == ConvertedType::TIMESTAMP_MICROS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Timestamp::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (adjusted_ || force_set_converted_type_) {
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ return ConvertedType::TIMESTAMP_MILLIS;
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ return ConvertedType::TIMESTAMP_MICROS;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Timestamp::ToString() const {
+ std::stringstream type;
+ type << "Timestamp(isAdjustedToUTC=" << std::boolalpha << adjusted_
+ << ", timeUnit=" << time_unit_string(unit_)
+ << ", is_from_converted_type=" << is_from_converted_type_
+ << ", force_set_converted_type=" << force_set_converted_type_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Timestamp::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Timestamp", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
+ << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"(")"
+ << R"(, "is_from_converted_type": )" << is_from_converted_type_
+ << R"(, "force_set_converted_type": )" << force_set_converted_type_ << R"(})";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Timestamp::ToThrift() const {
+ format::LogicalType type;
+ format::TimestampType timestamp_type;
+ format::TimeUnit time_unit;
+ DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ format::MilliSeconds millis;
+ time_unit.__set_MILLIS(millis);
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ format::MicroSeconds micros;
+ time_unit.__set_MICROS(micros);
+ } else if (unit_ == LogicalType::TimeUnit::NANOS) {
+ format::NanoSeconds nanos;
+ time_unit.__set_NANOS(nanos);
+ }
+ timestamp_type.__set_isAdjustedToUTC(adjusted_);
+ timestamp_type.__set_unit(time_unit);
+ type.__set_TIMESTAMP(timestamp_type);
+ return type;
+}
+
+bool LogicalType::Impl::Timestamp::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_timestamp()) {
+ const auto& other_timestamp = checked_cast<const TimestampLogicalType&>(other);
+ eq = (adjusted_ == other_timestamp.is_adjusted_to_utc() &&
+ unit_ == other_timestamp.time_unit());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> TimestampLogicalType::Make(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type, bool force_set_converted_type) {
+ if (time_unit == LogicalType::TimeUnit::MILLIS ||
+ time_unit == LogicalType::TimeUnit::MICROS ||
+ time_unit == LogicalType::TimeUnit::NANOS) {
+ auto* logical_type = new TimestampLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Timestamp(
+ is_adjusted_to_utc, time_unit, is_from_converted_type, force_set_converted_type));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type");
+ }
+}
+
+bool TimestampLogicalType::is_adjusted_to_utc() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).is_adjusted_to_utc();
+}
+
+LogicalType::TimeUnit::unit TimestampLogicalType::time_unit() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).time_unit();
+}
+
+bool TimestampLogicalType::is_from_converted_type() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
+ .is_from_converted_type();
+}
+
+bool TimestampLogicalType::force_set_converted_type() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
+ .force_set_converted_type();
+}
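+
+// Editor's sketch: a UTC-adjusted millisecond timestamp round-trips to the
+// legacy converted type, while nanoseconds have no legacy equivalent.
+//
+//   auto ts = TimestampLogicalType::Make(/*is_adjusted_to_utc=*/true,
+//                                        LogicalType::TimeUnit::MILLIS);
+//   schema::DecimalMetadata md;
+//   // ts->ToConvertedType(&md) == ConvertedType::TIMESTAMP_MILLIS;
+//   // with TimeUnit::NANOS the result would be ConvertedType::NONE.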
+
+class LogicalType::Impl::Interval final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::TypeLengthApplicable {
+ public:
+ friend class IntervalLogicalType;
+
+ OVERRIDE_TOSTRING(Interval)
+ // TODO(tpboudreau): uncomment the following line to enable serialization after
+ // parquet.thrift recognizes IntervalType as a ConvertedType
+ // OVERRIDE_TOTHRIFT(IntervalType, INTERVAL)
+
+ private:
+ Interval()
+ : LogicalType::Impl(LogicalType::Type::INTERVAL, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::INTERVAL),
+ LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 12) {
+ }
+};
+
+GENERATE_MAKE(Interval)
+
+class LogicalType::Impl::Int final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class IntLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ int bit_width() const { return width_; }
+ bool is_signed() const { return signed_; }
+
+ private:
+ Int(int w, bool s)
+ : LogicalType::Impl(LogicalType::Type::INT,
+ (s ? SortOrder::SIGNED : SortOrder::UNSIGNED)),
+ width_(w),
+ signed_(s) {}
+ int width_ = 0;
+ bool signed_ = false;
+};
+
+bool LogicalType::Impl::Int::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return (primitive_type == parquet::Type::INT32 && width_ <= 32) ||
+ (primitive_type == parquet::Type::INT64 && width_ == 64);
+}
+
+bool LogicalType::Impl::Int::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (signed_ && width_ == 8) {
+ return converted_type == ConvertedType::INT_8;
+ } else if (signed_ && width_ == 16) {
+ return converted_type == ConvertedType::INT_16;
+ } else if (signed_ && width_ == 32) {
+ return converted_type == ConvertedType::INT_32;
+ } else if (signed_ && width_ == 64) {
+ return converted_type == ConvertedType::INT_64;
+ } else if (!signed_ && width_ == 8) {
+ return converted_type == ConvertedType::UINT_8;
+ } else if (!signed_ && width_ == 16) {
+ return converted_type == ConvertedType::UINT_16;
+ } else if (!signed_ && width_ == 32) {
+ return converted_type == ConvertedType::UINT_32;
+ } else if (!signed_ && width_ == 64) {
+ return converted_type == ConvertedType::UINT_64;
+ } else {
+ return false;
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Int::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (signed_) {
+ switch (width_) {
+ case 8:
+ return ConvertedType::INT_8;
+ case 16:
+ return ConvertedType::INT_16;
+ case 32:
+ return ConvertedType::INT_32;
+ case 64:
+ return ConvertedType::INT_64;
+ }
+ } else { // unsigned
+ switch (width_) {
+ case 8:
+ return ConvertedType::UINT_8;
+ case 16:
+ return ConvertedType::UINT_16;
+ case 32:
+ return ConvertedType::UINT_32;
+ case 64:
+ return ConvertedType::UINT_64;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Int::ToString() const {
+ std::stringstream type;
+ type << "Int(bitWidth=" << width_ << ", isSigned=" << std::boolalpha << signed_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Int::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Int", "bitWidth": )" << width_ << R"(, "isSigned": )"
+ << std::boolalpha << signed_ << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Int::ToThrift() const {
+ format::LogicalType type;
+ format::IntType int_type;
+ DCHECK(width_ == 64 || width_ == 32 || width_ == 16 || width_ == 8);
+ int_type.__set_bitWidth(static_cast<int8_t>(width_));
+ int_type.__set_isSigned(signed_);
+ type.__set_INTEGER(int_type);
+ return type;
+}
+
+bool LogicalType::Impl::Int::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_int()) {
+ const auto& other_int = checked_cast<const IntLogicalType&>(other);
+ eq = (width_ == other_int.bit_width() && signed_ == other_int.is_signed());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> IntLogicalType::Make(int bit_width, bool is_signed) {
+ if (bit_width == 8 || bit_width == 16 || bit_width == 32 || bit_width == 64) {
+ auto* logical_type = new IntLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Int(bit_width, is_signed));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "Bit width must be exactly 8, 16, 32, or 64 for Int logical type");
+ }
+}
+
+int IntLogicalType::bit_width() const {
+ return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).bit_width();
+}
+
+bool IntLogicalType::is_signed() const {
+ return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).is_signed();
+}
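+
+// Illustrative: signedness selects the sort order and bit width constrains the
+// physical type.
+//
+//   auto u16 = IntLogicalType::Make(/*bit_width=*/16, /*is_signed=*/false);
+//   // u16->sort_order() == SortOrder::UNSIGNED;
+//   // u16->is_applicable(parquet::Type::INT32) is true (width <= 32).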
+
+class LogicalType::Impl::Null final : public LogicalType::Impl::Incompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class NullLogicalType;
+
+ OVERRIDE_TOSTRING(Null)
+ OVERRIDE_TOTHRIFT(NullType, UNKNOWN)
+
+ private:
+ Null() : LogicalType::Impl(LogicalType::Type::NIL, SortOrder::UNKNOWN) {}
+};
+
+GENERATE_MAKE(Null)
+
+class LogicalType::Impl::JSON final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class JSONLogicalType;
+
+ OVERRIDE_TOSTRING(JSON)
+ OVERRIDE_TOTHRIFT(JsonType, JSON)
+
+ private:
+ JSON()
+ : LogicalType::Impl(LogicalType::Type::JSON, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::JSON),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(JSON)
+
+class LogicalType::Impl::BSON final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class BSONLogicalType;
+
+ OVERRIDE_TOSTRING(BSON)
+ OVERRIDE_TOTHRIFT(BsonType, BSON)
+
+ private:
+ BSON()
+ : LogicalType::Impl(LogicalType::Type::BSON, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::BSON),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(BSON)
+
+class LogicalType::Impl::UUID final : public LogicalType::Impl::Incompatible,
+ public LogicalType::Impl::TypeLengthApplicable {
+ public:
+ friend class UUIDLogicalType;
+
+ OVERRIDE_TOSTRING(UUID)
+ OVERRIDE_TOTHRIFT(UUIDType, UUID)
+
+ private:
+ UUID()
+ : LogicalType::Impl(LogicalType::Type::UUID, SortOrder::UNSIGNED),
+ LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 16) {
+ }
+};
+
+GENERATE_MAKE(UUID)
+
+class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class NoLogicalType;
+
+ OVERRIDE_TOSTRING(None)
+
+ private:
+ No()
+ : LogicalType::Impl(LogicalType::Type::NONE, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::NONE) {}
+};
+
+GENERATE_MAKE(No)
+
+class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class UndefinedLogicalType;
+
+ OVERRIDE_TOSTRING(Undefined)
+
+ private:
+ Undefined()
+ : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
+};
+
+GENERATE_MAKE(Undefined)
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/types.h b/contrib/libs/apache/arrow/cpp/src/parquet/types.h
index c25719830ec..40981d9bf1a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/types.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/types.h
@@ -1,765 +1,765 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <sstream>
-#include <string>
-
-#include "arrow/util/string_view.h"
-
-#include "parquet/platform.h"
-#include "parquet/type_fwd.h"
-
-#ifdef _WIN32
-
-// Repetition::OPTIONAL conflicts with a #define, so we undefine it
-#ifdef OPTIONAL
-#undef OPTIONAL
-#endif
-
-#endif // _WIN32
-
-namespace arrow {
-namespace util {
-
-class Codec;
-
-} // namespace util
-} // namespace arrow
-
-namespace parquet {
-
-// ----------------------------------------------------------------------
-// Metadata enums to match Thrift metadata
-//
-// The reason we maintain our own enums is to avoid transitive dependency on
-// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
-// public API. After building parquet-cpp, you should not need to include
-// Thrift headers in your application. This means some boilerplate to convert
-// between our types and Parquet's Thrift types.
-//
-// We can also add special values like NONE to distinguish between metadata
-// values being set and not set. As an example, consider ConvertedType and
-// CompressionCodec.
-
-// Mirrors parquet::Type
-struct Type {
- enum type {
- BOOLEAN = 0,
- INT32 = 1,
- INT64 = 2,
- INT96 = 3,
- FLOAT = 4,
- DOUBLE = 5,
- BYTE_ARRAY = 6,
- FIXED_LEN_BYTE_ARRAY = 7,
- // Should always be last element.
- UNDEFINED = 8
- };
-};
-
-// Mirrors parquet::ConvertedType
-struct ConvertedType {
- enum type {
- NONE, // Not a real converted type, but means no converted type is specified
- UTF8,
- MAP,
- MAP_KEY_VALUE,
- LIST,
- ENUM,
- DECIMAL,
- DATE,
- TIME_MILLIS,
- TIME_MICROS,
- TIMESTAMP_MILLIS,
- TIMESTAMP_MICROS,
- UINT_8,
- UINT_16,
- UINT_32,
- UINT_64,
- INT_8,
- INT_16,
- INT_32,
- INT_64,
- JSON,
- BSON,
- INTERVAL,
- // DEPRECATED INVALID ConvertedType for all-null data.
- // Only useful for reading legacy files written out by interim Parquet C++ releases.
- // For writing, always emit LogicalType::Null instead.
- // See PARQUET-1990.
- NA = 25,
- UNDEFINED = 26 // Not a real converted type; should always be last element
- };
-};
-
-// forward declaration
-namespace format {
-
-class LogicalType;
-
-}
-
-// Mirrors parquet::FieldRepetitionType
-struct Repetition {
- enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
-};
-
-// Reference:
-// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
-// format/converter/ParquetMetadataConverter.java
-// Sort order for page and column statistics. Types are associated with sort
-// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
-// aggregated using a sort order. As of parquet-format version 2.3.1, the
-// order used to aggregate stats is always SIGNED and is not stored in the
-// Parquet file. These stats are discarded for types that need unsigned.
-// See PARQUET-686.
-struct SortOrder {
- enum type { SIGNED, UNSIGNED, UNKNOWN };
-};
-
-namespace schema {
-
-struct DecimalMetadata {
- bool isset;
- int32_t scale;
- int32_t precision;
-};
-
-} // namespace schema
-
-/// \brief Implementation of parquet.thrift LogicalType types.
-class PARQUET_EXPORT LogicalType {
- public:
- struct Type {
- enum type {
- UNDEFINED = 0, // Not a real logical type
- STRING = 1,
- MAP,
- LIST,
- ENUM,
- DECIMAL,
- DATE,
- TIME,
- TIMESTAMP,
- INTERVAL,
- INT,
- NIL, // Thrift NullType: annotates data that is always null
- JSON,
- BSON,
- UUID,
- NONE // Not a real logical type; should always be last element
- };
- };
-
- struct TimeUnit {
- enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
- };
-
- /// \brief If possible, return a logical type equivalent to the given legacy
- /// converted type (and decimal metadata if applicable).
- static std::shared_ptr<const LogicalType> FromConvertedType(
- const parquet::ConvertedType::type converted_type,
- const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
- -1});
-
- /// \brief Return the logical type represented by the Thrift intermediary object.
- static std::shared_ptr<const LogicalType> FromThrift(
- const parquet::format::LogicalType& thrift_logical_type);
-
- /// \brief Return the explicitly requested logical type.
- static std::shared_ptr<const LogicalType> String();
- static std::shared_ptr<const LogicalType> Map();
- static std::shared_ptr<const LogicalType> List();
- static std::shared_ptr<const LogicalType> Enum();
- static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
- static std::shared_ptr<const LogicalType> Date();
- static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
- LogicalType::TimeUnit::unit time_unit);
-
- /// \brief Create a Timestamp logical type
- /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
- /// \param[in] time_unit the resolution of the timestamp
- /// \param[in] is_from_converted_type if true, the timestamp was generated
- /// by translating a legacy converted type of TIMESTAMP_MILLIS or
- /// TIMESTAMP_MICROS. Default is false.
- /// \param[in] force_set_converted_type if true, always set the
- /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
- /// metadata. Default is false
- static std::shared_ptr<const LogicalType> Timestamp(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type = false, bool force_set_converted_type = false);
-
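-  // Illustrative call (editor's note, not upstream): the two converted-type
-  // flags default to false, so typical callers write
-  //   LogicalType::Timestamp(/*is_adjusted_to_utc=*/true,
-  //                          LogicalType::TimeUnit::MICROS);
-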
- static std::shared_ptr<const LogicalType> Interval();
- static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
-
- /// \brief Create a logical type for data that's always null
- ///
- /// Any physical type can be annotated with this logical type.
- static std::shared_ptr<const LogicalType> Null();
-
- static std::shared_ptr<const LogicalType> JSON();
- static std::shared_ptr<const LogicalType> BSON();
- static std::shared_ptr<const LogicalType> UUID();
-
- /// \brief Create a placeholder for when no logical type is specified
- static std::shared_ptr<const LogicalType> None();
-
- /// \brief Return true if this logical type is consistent with the given underlying
- /// physical type.
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const;
-
- /// \brief Return true if this logical type is equivalent to the given legacy converted
- /// type (and decimal metadata if applicable).
- bool is_compatible(parquet::ConvertedType::type converted_type,
- parquet::schema::DecimalMetadata converted_decimal_metadata = {
- false, -1, -1}) const;
-
- /// \brief If possible, return the legacy converted type (and decimal metadata if
- /// applicable) equivalent to this logical type.
- parquet::ConvertedType::type ToConvertedType(
- parquet::schema::DecimalMetadata* out_decimal_metadata) const;
-
- /// \brief Return a printable representation of this logical type.
- std::string ToString() const;
-
- /// \brief Return a JSON representation of this logical type.
- std::string ToJSON() const;
-
- /// \brief Return a serializable Thrift object for this logical type.
- parquet::format::LogicalType ToThrift() const;
-
- /// \brief Return true if the given logical type is equivalent to this logical type.
- bool Equals(const LogicalType& other) const;
-
- /// \brief Return the enumerated type of this logical type.
- LogicalType::Type::type type() const;
-
- /// \brief Return the appropriate sort order for this logical type.
- SortOrder::type sort_order() const;
-
- // Type checks ...
- bool is_string() const;
- bool is_map() const;
- bool is_list() const;
- bool is_enum() const;
- bool is_decimal() const;
- bool is_date() const;
- bool is_time() const;
- bool is_timestamp() const;
- bool is_interval() const;
- bool is_int() const;
- bool is_null() const;
- bool is_JSON() const;
- bool is_BSON() const;
- bool is_UUID() const;
- bool is_none() const;
- /// \brief Return true if this logical type is of a known type.
- bool is_valid() const;
- bool is_invalid() const;
- /// \brief Return true if this logical type is suitable for a schema GroupNode.
- bool is_nested() const;
- bool is_nonnested() const;
- /// \brief Return true if this logical type is included in the Thrift output for its
- /// node.
- bool is_serialized() const;
-
- LogicalType(const LogicalType&) = delete;
- LogicalType& operator=(const LogicalType&) = delete;
- virtual ~LogicalType() noexcept;
-
- protected:
- LogicalType();
-
- class Impl;
- std::unique_ptr<const Impl> impl_;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
-class PARQUET_EXPORT StringLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- StringLogicalType() = default;
-};
-
-/// \brief Allowed for group nodes only.
-class PARQUET_EXPORT MapLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- MapLogicalType() = default;
-};
-
-/// \brief Allowed for group nodes only.
-class PARQUET_EXPORT ListLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- ListLogicalType() = default;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
-class PARQUET_EXPORT EnumLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- EnumLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
-/// depending on the precision.
-class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
- int32_t precision() const;
- int32_t scale() const;
-
- private:
- DecimalLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32.
-class PARQUET_EXPORT DateLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- DateLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
-class PARQUET_EXPORT TimeLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
- LogicalType::TimeUnit::unit time_unit);
- bool is_adjusted_to_utc() const;
- LogicalType::TimeUnit::unit time_unit() const;
-
- private:
- TimeLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT64.
-class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
- LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type = false,
- bool force_set_converted_type = false);
- bool is_adjusted_to_utc() const;
- LogicalType::TimeUnit::unit time_unit() const;
-
- /// \brief If true, will not set LogicalType in Thrift metadata
- bool is_from_converted_type() const;
-
- /// \brief If true, will set ConvertedType for micros and millis
- /// resolution in legacy ConvertedType Thrift metadata
- bool force_set_converted_type() const;
-
- private:
- TimestampLogicalType() = default;
-};
-
-/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
-class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- IntervalLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
-/// (for bit width 64).
-class PARQUET_EXPORT IntLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
- int bit_width() const;
- bool is_signed() const;
-
- private:
- IntLogicalType() = default;
-};
-
-/// \brief Allowed for any physical type.
-class PARQUET_EXPORT NullLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- NullLogicalType() = default;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY.
-class PARQUET_EXPORT JSONLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- JSONLogicalType() = default;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY.
-class PARQUET_EXPORT BSONLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- BSONLogicalType() = default;
-};
-
-/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
-/// must encode raw UUID bytes.
-class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- UUIDLogicalType() = default;
-};
-
-/// \brief Allowed for any physical type.
-class PARQUET_EXPORT NoLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- NoLogicalType() = default;
-};
-
-// Internal API, for unrecognized logical types
-class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- UndefinedLogicalType() = default;
-};
-
-// Data encodings. Mirrors parquet::Encoding
-struct Encoding {
- enum type {
- PLAIN = 0,
- PLAIN_DICTIONARY = 2,
- RLE = 3,
- BIT_PACKED = 4,
- DELTA_BINARY_PACKED = 5,
- DELTA_LENGTH_BYTE_ARRAY = 6,
- DELTA_BYTE_ARRAY = 7,
- RLE_DICTIONARY = 8,
- BYTE_STREAM_SPLIT = 9,
- // Should always be last element (except UNKNOWN)
- UNDEFINED = 10,
- UNKNOWN = 999
- };
-};
-
-// Exposed data encodings. This is the encoding of the data as it is exposed
-// to the reader, rather than the encoding of the data in the file. E.g., data
-// stored as RLE_DICTIONARY in the file can be returned as dictionary indices
-// after RLE decoding, in which case the exposed encoding is DICTIONARY.
-enum class ExposedEncoding {
- NO_ENCODING = 0, // data is not encoded, i.e. already decoded during reading
- DICTIONARY = 1
-};
-
-/// \brief Return true if Parquet supports indicated compression type
-PARQUET_EXPORT
-bool IsCodecSupported(Compression::type codec);
-
-PARQUET_EXPORT
-std::unique_ptr<Codec> GetCodec(Compression::type codec);
-
-PARQUET_EXPORT
-std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
-
-struct ParquetCipher {
- enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
-};
-
-struct AadMetadata {
- std::string aad_prefix;
- std::string aad_file_unique;
- bool supply_aad_prefix;
-};
-
-struct EncryptionAlgorithm {
- ParquetCipher::type algorithm;
- AadMetadata aad;
-};
-
-// Mirrors parquet::PageType
-struct PageType {
- enum type {
- DATA_PAGE,
- INDEX_PAGE,
- DICTIONARY_PAGE,
- DATA_PAGE_V2,
- // Should always be last element
- UNDEFINED
- };
-};
-
-class ColumnOrder {
- public:
- enum type { UNDEFINED, TYPE_DEFINED_ORDER };
- explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
- // Default to Type Defined Order
- ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
- ColumnOrder::type get_order() { return column_order_; }
-
- static ColumnOrder undefined_;
- static ColumnOrder type_defined_;
-
- private:
- ColumnOrder::type column_order_;
-};
-
-// ----------------------------------------------------------------------
-
-struct ByteArray {
- ByteArray() : len(0), ptr(NULLPTR) {}
- ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
-
- ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion
- : ByteArray(static_cast<uint32_t>(view.size()),
- reinterpret_cast<const uint8_t*>(view.data())) {}
- uint32_t len;
- const uint8_t* ptr;
-};
-
-inline bool operator==(const ByteArray& left, const ByteArray& right) {
- return left.len == right.len &&
- (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
-}
-
-inline bool operator!=(const ByteArray& left, const ByteArray& right) {
- return !(left == right);
-}
-
-struct FixedLenByteArray {
- FixedLenByteArray() : ptr(NULLPTR) {}
- explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
- const uint8_t* ptr;
-};
-
-using FLBA = FixedLenByteArray;
-
-// Julian day at unix epoch.
-//
-// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
-// the Julian day count starting from noon Universal time, with Julian day
-// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
-// in the proleptic Julian calendar (November 24, 4714 BC, in the proleptic
-// Gregorian calendar).
-constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
-constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
-constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
-constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
-constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
-
-MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
-STRUCT_END(Int96, 12);
-
-inline bool operator==(const Int96& left, const Int96& right) {
- return std::equal(left.value, left.value + 3, right.value);
-}
-
-inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
-
-static inline std::string ByteArrayToString(const ByteArray& a) {
- return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
-}
-
-static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
- std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
-}
-
-struct DecodedInt96 {
- uint64_t days_since_epoch;
- uint64_t nanoseconds;
-};
-
-static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
-  // We do the computations in the unsigned domain to avoid undefined
-  // behaviour on overflow.
- DecodedInt96 result;
- result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
- result.nanoseconds = 0;
-
- memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
- return result;
-}
-
-static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
- decoded.nanoseconds);
-}
-
-static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
- return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
- microseconds);
-}
-
-static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
- return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
- milliseconds);
-}
-
-static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
- return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
-}
-
-static inline std::string Int96ToString(const Int96& a) {
- std::ostringstream result;
- std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
- return result.str();
-}
-
-static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
- std::ostringstream result;
- std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
- return result.str();
-}
-
-template <Type::type TYPE>
-struct type_traits {};
-
-template <>
-struct type_traits<Type::BOOLEAN> {
- using value_type = bool;
-
- static constexpr int value_byte_size = 1;
- static constexpr const char* printf_code = "d";
-};
-
-template <>
-struct type_traits<Type::INT32> {
- using value_type = int32_t;
-
- static constexpr int value_byte_size = 4;
- static constexpr const char* printf_code = "d";
-};
-
-template <>
-struct type_traits<Type::INT64> {
- using value_type = int64_t;
-
- static constexpr int value_byte_size = 8;
- static constexpr const char* printf_code = "ld";
-};
-
-template <>
-struct type_traits<Type::INT96> {
- using value_type = Int96;
-
- static constexpr int value_byte_size = 12;
- static constexpr const char* printf_code = "s";
-};
-
-template <>
-struct type_traits<Type::FLOAT> {
- using value_type = float;
-
- static constexpr int value_byte_size = 4;
- static constexpr const char* printf_code = "f";
-};
-
-template <>
-struct type_traits<Type::DOUBLE> {
- using value_type = double;
-
- static constexpr int value_byte_size = 8;
- static constexpr const char* printf_code = "lf";
-};
-
-template <>
-struct type_traits<Type::BYTE_ARRAY> {
- using value_type = ByteArray;
-
- static constexpr int value_byte_size = sizeof(ByteArray);
- static constexpr const char* printf_code = "s";
-};
-
-template <>
-struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
- using value_type = FixedLenByteArray;
-
- static constexpr int value_byte_size = sizeof(FixedLenByteArray);
- static constexpr const char* printf_code = "s";
-};
-
-template <Type::type TYPE>
-struct PhysicalType {
- using c_type = typename type_traits<TYPE>::value_type;
- static constexpr Type::type type_num = TYPE;
-};
-
-using BooleanType = PhysicalType<Type::BOOLEAN>;
-using Int32Type = PhysicalType<Type::INT32>;
-using Int64Type = PhysicalType<Type::INT64>;
-using Int96Type = PhysicalType<Type::INT96>;
-using FloatType = PhysicalType<Type::FLOAT>;
-using DoubleType = PhysicalType<Type::DOUBLE>;
-using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
-using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
-
-template <typename Type>
-inline std::string format_fwf(int width) {
- std::stringstream ss;
- ss << "%-" << width << type_traits<Type::type_num>::printf_code;
- return ss.str();
-}
-
-PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
-
-PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
-
-PARQUET_EXPORT std::string TypeToString(Type::type t);
-
-PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
- ::arrow::util::string_view val);
-
-PARQUET_EXPORT int GetTypeByteSize(Type::type t);
-
-PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
-
-PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
- Type::type primitive);
-
-PARQUET_EXPORT SortOrder::type GetSortOrder(
- const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/string_view.h"
+
+#include "parquet/platform.h"
+#include "parquet/type_fwd.h"
+
+#ifdef _WIN32
+
+// Repetition::OPTIONAL conflicts with a #define, so we undefine it
+#ifdef OPTIONAL
+#undef OPTIONAL
+#endif
+
+#endif // _WIN32
+
+namespace arrow {
+namespace util {
+
+class Codec;
+
+} // namespace util
+} // namespace arrow
+
+namespace parquet {
+
+// ----------------------------------------------------------------------
+// Metadata enums to match Thrift metadata
+//
+// The reason we maintain our own enums is to avoid transitive dependency on
+// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
+// public API. After building parquet-cpp, you should not need to include
+// Thrift headers in your application. This means some boilerplate to convert
+// between our types and Parquet's Thrift types.
+//
+// We can also add special values like NONE to distinguish between metadata
+// values being set and not set. As an example, consider ConvertedType and
+// CompressionCodec.
+
+// Mirrors parquet::Type
+struct Type {
+ enum type {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7,
+ // Should always be last element.
+ UNDEFINED = 8
+ };
+};
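+
+// Illustrative sketch (not part of the upstream header): because these enums
+// mirror the Thrift definitions, user code can branch on physical types
+// without pulling in any Thrift headers. TypeToString() is declared near the
+// bottom of this file.
+//
+//   #include <iostream>
+//   #include "parquet/types.h"
+//
+//   void PrintPhysicalType(parquet::Type::type t) {
+//     std::cout << parquet::TypeToString(t) << std::endl;  // e.g. "INT32"
+//   }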
+
+// Mirrors parquet::ConvertedType
+struct ConvertedType {
+ enum type {
+ NONE, // Not a real converted type, but means no converted type is specified
+ UTF8,
+ MAP,
+ MAP_KEY_VALUE,
+ LIST,
+ ENUM,
+ DECIMAL,
+ DATE,
+ TIME_MILLIS,
+ TIME_MICROS,
+ TIMESTAMP_MILLIS,
+ TIMESTAMP_MICROS,
+ UINT_8,
+ UINT_16,
+ UINT_32,
+ UINT_64,
+ INT_8,
+ INT_16,
+ INT_32,
+ INT_64,
+ JSON,
+ BSON,
+ INTERVAL,
+ // DEPRECATED INVALID ConvertedType for all-null data.
+ // Only useful for reading legacy files written out by interim Parquet C++ releases.
+ // For writing, always emit LogicalType::Null instead.
+ // See PARQUET-1990.
+ NA = 25,
+ UNDEFINED = 26 // Not a real converted type; should always be last element
+ };
+};
+
+// forward declaration
+namespace format {
+
+class LogicalType;
+
+}  // namespace format
+
+// Mirrors parquet::FieldRepetitionType
+struct Repetition {
+ enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
+};
+
+// Reference:
+// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
+// format/converter/ParquetMetadataConverter.java
+// Sort order for page and column statistics. Types are associated with sort
+// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
+// aggregated using a sort order. As of parquet-format version 2.3.1, the
+// order used to aggregate stats is always SIGNED and is not stored in the
+// Parquet file. These stats are discarded for types that need unsigned.
+// See PARQUET-686.
+struct SortOrder {
+ enum type { SIGNED, UNSIGNED, UNKNOWN };
+};
+
+namespace schema {
+
+struct DecimalMetadata {
+ bool isset;
+ int32_t scale;
+ int32_t precision;
+};
+
+} // namespace schema
+
+/// \brief Implementation of parquet.thrift LogicalType types.
+class PARQUET_EXPORT LogicalType {
+ public:
+ struct Type {
+ enum type {
+ UNDEFINED = 0, // Not a real logical type
+ STRING = 1,
+ MAP,
+ LIST,
+ ENUM,
+ DECIMAL,
+ DATE,
+ TIME,
+ TIMESTAMP,
+ INTERVAL,
+ INT,
+ NIL, // Thrift NullType: annotates data that is always null
+ JSON,
+ BSON,
+ UUID,
+ NONE // Not a real logical type; should always be last element
+ };
+ };
+
+ struct TimeUnit {
+ enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
+ };
+
+ /// \brief If possible, return a logical type equivalent to the given legacy
+ /// converted type (and decimal metadata if applicable).
+ static std::shared_ptr<const LogicalType> FromConvertedType(
+ const parquet::ConvertedType::type converted_type,
+ const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
+ -1});
+
+ /// \brief Return the logical type represented by the Thrift intermediary object.
+ static std::shared_ptr<const LogicalType> FromThrift(
+ const parquet::format::LogicalType& thrift_logical_type);
+
+ /// \brief Return the explicitly requested logical type.
+ static std::shared_ptr<const LogicalType> String();
+ static std::shared_ptr<const LogicalType> Map();
+ static std::shared_ptr<const LogicalType> List();
+ static std::shared_ptr<const LogicalType> Enum();
+ static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
+ static std::shared_ptr<const LogicalType> Date();
+ static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit);
+
+ /// \brief Create a Timestamp logical type
+ /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
+ /// \param[in] time_unit the resolution of the timestamp
+ /// \param[in] is_from_converted_type if true, the timestamp was generated
+ /// by translating a legacy converted type of TIMESTAMP_MILLIS or
+ /// TIMESTAMP_MICROS. Default is false.
+ /// \param[in] force_set_converted_type if true, always set the
+ /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
+ /// metadata. Default is false
+ static std::shared_ptr<const LogicalType> Timestamp(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type = false, bool force_set_converted_type = false);
+
+ static std::shared_ptr<const LogicalType> Interval();
+ static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
+
+ /// \brief Create a logical type for data that's always null
+ ///
+ /// Any physical type can be annotated with this logical type.
+ static std::shared_ptr<const LogicalType> Null();
+
+ static std::shared_ptr<const LogicalType> JSON();
+ static std::shared_ptr<const LogicalType> BSON();
+ static std::shared_ptr<const LogicalType> UUID();
+
+ /// \brief Create a placeholder for when no logical type is specified
+ static std::shared_ptr<const LogicalType> None();
+
+ /// \brief Return true if this logical type is consistent with the given underlying
+ /// physical type.
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const;
+
+ /// \brief Return true if this logical type is equivalent to the given legacy converted
+ /// type (and decimal metadata if applicable).
+ bool is_compatible(parquet::ConvertedType::type converted_type,
+ parquet::schema::DecimalMetadata converted_decimal_metadata = {
+ false, -1, -1}) const;
+
+ /// \brief If possible, return the legacy converted type (and decimal metadata if
+ /// applicable) equivalent to this logical type.
+ parquet::ConvertedType::type ToConvertedType(
+ parquet::schema::DecimalMetadata* out_decimal_metadata) const;
+
+ /// \brief Return a printable representation of this logical type.
+ std::string ToString() const;
+
+ /// \brief Return a JSON representation of this logical type.
+ std::string ToJSON() const;
+
+ /// \brief Return a serializable Thrift object for this logical type.
+ parquet::format::LogicalType ToThrift() const;
+
+ /// \brief Return true if the given logical type is equivalent to this logical type.
+ bool Equals(const LogicalType& other) const;
+
+ /// \brief Return the enumerated type of this logical type.
+ LogicalType::Type::type type() const;
+
+ /// \brief Return the appropriate sort order for this logical type.
+ SortOrder::type sort_order() const;
+
+ // Type checks ...
+ bool is_string() const;
+ bool is_map() const;
+ bool is_list() const;
+ bool is_enum() const;
+ bool is_decimal() const;
+ bool is_date() const;
+ bool is_time() const;
+ bool is_timestamp() const;
+ bool is_interval() const;
+ bool is_int() const;
+ bool is_null() const;
+ bool is_JSON() const;
+ bool is_BSON() const;
+ bool is_UUID() const;
+ bool is_none() const;
+ /// \brief Return true if this logical type is of a known type.
+ bool is_valid() const;
+ bool is_invalid() const;
+ /// \brief Return true if this logical type is suitable for a schema GroupNode.
+ bool is_nested() const;
+ bool is_nonnested() const;
+ /// \brief Return true if this logical type is included in the Thrift output for its
+ /// node.
+ bool is_serialized() const;
+
+ LogicalType(const LogicalType&) = delete;
+ LogicalType& operator=(const LogicalType&) = delete;
+ virtual ~LogicalType() noexcept;
+
+ protected:
+ LogicalType();
+
+ class Impl;
+ std::unique_ptr<const Impl> impl_;
+};
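+
+// Minimal usage sketch (illustrative, not part of the upstream header): the
+// static factories above build immutable logical types, and is_applicable()
+// checks them against a physical type.
+//
+//   auto decimal = parquet::LogicalType::Decimal(/*precision=*/10, /*scale=*/2);
+//   decimal->is_applicable(parquet::Type::INT64);    // true: precision 10 fits
+//   decimal->is_applicable(parquet::Type::BOOLEAN);  // false
+//   decimal->ToString();  // printable representation, e.g. for diagnostics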
+
+/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
+class PARQUET_EXPORT StringLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ StringLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT MapLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ MapLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT ListLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ ListLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
+class PARQUET_EXPORT EnumLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ EnumLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
+/// depending on the precision.
+class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
+ int32_t precision() const;
+ int32_t scale() const;
+
+ private:
+ DecimalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32.
+class PARQUET_EXPORT DateLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ DateLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
+class PARQUET_EXPORT TimeLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit);
+ bool is_adjusted_to_utc() const;
+ LogicalType::TimeUnit::unit time_unit() const;
+
+ private:
+ TimeLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT64.
+class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type = false,
+ bool force_set_converted_type = false);
+ bool is_adjusted_to_utc() const;
+ LogicalType::TimeUnit::unit time_unit() const;
+
+ /// \brief If true, will not set LogicalType in Thrift metadata
+ bool is_from_converted_type() const;
+
+ /// \brief If true, will set ConvertedType for micros and millis
+ /// resolution in legacy ConvertedType Thrift metadata
+ bool force_set_converted_type() const;
+
+ private:
+ TimestampLogicalType() = default;
+};
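+
+// Sketch of the legacy interplay (illustrative only): a UTC-adjusted
+// millisecond timestamp maps back to the legacy ConvertedType
+// TIMESTAMP_MILLIS via ToConvertedType(); a non-UTC-adjusted timestamp has no
+// converted-type equivalent and yields NONE (unless force_set_converted_type
+// is set).
+//
+//   auto ts = parquet::LogicalType::Timestamp(
+//       /*is_adjusted_to_utc=*/true, parquet::LogicalType::TimeUnit::MILLIS);
+//   parquet::schema::DecimalMetadata dm;
+//   ts->ToConvertedType(&dm);  // ConvertedType::TIMESTAMP_MILLIS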
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
+class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ IntervalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
+/// (for bit width 64).
+class PARQUET_EXPORT IntLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
+ int bit_width() const;
+ bool is_signed() const;
+
+ private:
+ IntLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NullLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ NullLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT JSONLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ JSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT BSONLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ BSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
+/// must encode raw UUID bytes.
+class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ UUIDLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NoLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ NoLogicalType() = default;
+};
+
+// Internal API, for unrecognized logical types
+class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ UndefinedLogicalType() = default;
+};
+
+// Data encodings. Mirrors parquet::Encoding
+struct Encoding {
+ enum type {
+ PLAIN = 0,
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8,
+ BYTE_STREAM_SPLIT = 9,
+ // Should always be last element (except UNKNOWN)
+ UNDEFINED = 10,
+ UNKNOWN = 999
+ };
+};
+
+// Exposed data encodings. This is the encoding of the data as it is exposed
+// to the reader, rather than the encoding of the data in the file. E.g., data
+// stored as RLE_DICTIONARY in the file can be returned as dictionary indices
+// after RLE decoding, in which case the exposed encoding is DICTIONARY.
+enum class ExposedEncoding {
+ NO_ENCODING = 0, // data is not encoded, i.e. already decoded during reading
+ DICTIONARY = 1
+};
+
+/// \brief Return true if Parquet supports indicated compression type
+PARQUET_EXPORT
+bool IsCodecSupported(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
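+
+// Usage sketch (illustrative, not part of the upstream header; Compression
+// and Codec come from Arrow via parquet/platform.h): callers can probe codec
+// support before requesting an instance.
+//
+//   if (parquet::IsCodecSupported(arrow::Compression::SNAPPY)) {
+//     std::unique_ptr<arrow::util::Codec> codec =
+//         parquet::GetCodec(arrow::Compression::SNAPPY);
+//     // ... use the codec to compress/decompress page data
+//   }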
+
+struct ParquetCipher {
+ enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
+};
+
+struct AadMetadata {
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+};
+
+struct EncryptionAlgorithm {
+ ParquetCipher::type algorithm;
+ AadMetadata aad;
+};
+
+// Mirrors parquet::PageType
+struct PageType {
+ enum type {
+ DATA_PAGE,
+ INDEX_PAGE,
+ DICTIONARY_PAGE,
+ DATA_PAGE_V2,
+ // Should always be last element
+ UNDEFINED
+ };
+};
+
+class ColumnOrder {
+ public:
+ enum type { UNDEFINED, TYPE_DEFINED_ORDER };
+ explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
+ // Default to Type Defined Order
+ ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
+ ColumnOrder::type get_order() { return column_order_; }
+
+ static ColumnOrder undefined_;
+ static ColumnOrder type_defined_;
+
+ private:
+ ColumnOrder::type column_order_;
+};
+
+// ----------------------------------------------------------------------
+
+struct ByteArray {
+ ByteArray() : len(0), ptr(NULLPTR) {}
+ ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
+
+ ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion
+ : ByteArray(static_cast<uint32_t>(view.size()),
+ reinterpret_cast<const uint8_t*>(view.data())) {}
+ uint32_t len;
+ const uint8_t* ptr;
+};
+
+inline bool operator==(const ByteArray& left, const ByteArray& right) {
+ return left.len == right.len &&
+ (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
+}
+
+inline bool operator!=(const ByteArray& left, const ByteArray& right) {
+ return !(left == right);
+}
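+
+// Usage sketch (illustrative): ByteArray is a non-owning view, so the
+// underlying buffer must outlive it; operator== compares length and bytes.
+//
+//   std::string payload = "hello";
+//   parquet::ByteArray a(payload);  // implicit via string_view
+//   parquet::ByteArray b(static_cast<uint32_t>(payload.size()),
+//                        reinterpret_cast<const uint8_t*>(payload.data()));
+//   assert(a == b);  // same length, same bytes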
+
+struct FixedLenByteArray {
+ FixedLenByteArray() : ptr(NULLPTR) {}
+ explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
+ const uint8_t* ptr;
+};
+
+using FLBA = FixedLenByteArray;
+
+// Julian day at unix epoch.
+//
+// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
+// the Julian day count starting from noon Universal time, with Julian day
+// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
+// in the proleptic Julian calendar (November 24, 4714 BC, in the proleptic
+// Gregorian calendar).
+constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
+constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
+constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
+constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
+constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
+
+MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
+STRUCT_END(Int96, 12);
+
+inline bool operator==(const Int96& left, const Int96& right) {
+ return std::equal(left.value, left.value + 3, right.value);
+}
+
+inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
+
+static inline std::string ByteArrayToString(const ByteArray& a) {
+ return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
+}
+
+static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
+ std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
+}
+
+struct DecodedInt96 {
+ uint64_t days_since_epoch;
+ uint64_t nanoseconds;
+};
+
+static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
+  // We do the computations in the unsigned domain to avoid undefined
+  // behaviour on overflow.
+ DecodedInt96 result;
+ result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
+ result.nanoseconds = 0;
+
+ memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
+ return result;
+}
+
+static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
+ decoded.nanoseconds);
+}
+
+static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
+ microseconds);
+}
+
+static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
+ milliseconds);
+}
+
+static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
+}
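+
+// Worked sketch (illustrative): an Int96 timestamp keeps the nanoseconds
+// within the day in its first two words and the Julian day number in the
+// third. At the Unix epoch, value[2] equals kJulianToUnixEpochDays (2440588)
+// and the nanosecond field is zero, so every accessor above returns 0.
+//
+//   parquet::Int96 epoch;
+//   epoch.value[2] = static_cast<uint32_t>(parquet::kJulianToUnixEpochDays);
+//   parquet::Int96SetNanoSeconds(epoch, 0);
+//   parquet::Int96GetSeconds(epoch);      // 0
+//   parquet::Int96GetNanoSeconds(epoch);  // 0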
+
+static inline std::string Int96ToString(const Int96& a) {
+ std::ostringstream result;
+ std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
+ return result.str();
+}
+
+static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
+ std::ostringstream result;
+ std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
+ return result.str();
+}
+
+template <Type::type TYPE>
+struct type_traits {};
+
+template <>
+struct type_traits<Type::BOOLEAN> {
+ using value_type = bool;
+
+ static constexpr int value_byte_size = 1;
+ static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT32> {
+ using value_type = int32_t;
+
+ static constexpr int value_byte_size = 4;
+ static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT64> {
+ using value_type = int64_t;
+
+ static constexpr int value_byte_size = 8;
+ static constexpr const char* printf_code = "ld";
+};
+
+template <>
+struct type_traits<Type::INT96> {
+ using value_type = Int96;
+
+ static constexpr int value_byte_size = 12;
+ static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FLOAT> {
+ using value_type = float;
+
+ static constexpr int value_byte_size = 4;
+ static constexpr const char* printf_code = "f";
+};
+
+template <>
+struct type_traits<Type::DOUBLE> {
+ using value_type = double;
+
+ static constexpr int value_byte_size = 8;
+ static constexpr const char* printf_code = "lf";
+};
+
+template <>
+struct type_traits<Type::BYTE_ARRAY> {
+ using value_type = ByteArray;
+
+ static constexpr int value_byte_size = sizeof(ByteArray);
+ static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
+ using value_type = FixedLenByteArray;
+
+ static constexpr int value_byte_size = sizeof(FixedLenByteArray);
+ static constexpr const char* printf_code = "s";
+};
+
+template <Type::type TYPE>
+struct PhysicalType {
+ using c_type = typename type_traits<TYPE>::value_type;
+ static constexpr Type::type type_num = TYPE;
+};
+
+using BooleanType = PhysicalType<Type::BOOLEAN>;
+using Int32Type = PhysicalType<Type::INT32>;
+using Int64Type = PhysicalType<Type::INT64>;
+using Int96Type = PhysicalType<Type::INT96>;
+using FloatType = PhysicalType<Type::FLOAT>;
+using DoubleType = PhysicalType<Type::DOUBLE>;
+using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
+
+template <typename Type>
+inline std::string format_fwf(int width) {
+ std::stringstream ss;
+ ss << "%-" << width << type_traits<Type::type_num>::printf_code;
+ return ss.str();
+}
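+
+// Usage sketch (illustrative): format_fwf() assembles a left-aligned,
+// fixed-width printf format string from the per-type printf codes above.
+//
+//   parquet::format_fwf<parquet::Int32Type>(10);  // "%-10d"
+//   parquet::format_fwf<parquet::FloatType>(8);   // "%-8f"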
+
+PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
+
+PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
+
+PARQUET_EXPORT std::string TypeToString(Type::type t);
+
+PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
+ ::arrow::util::string_view val);
+
+PARQUET_EXPORT int GetTypeByteSize(Type::type t);
+
+PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
+ Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(
+ const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h b/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
index 31ca04c8b66..6e5b6b330e6 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
@@ -1,30 +1,30 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/util/windows_compatibility.h"
-
-#ifdef _WIN32
-
-// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from
-// above, so we undefine it
-#ifdef OPTIONAL
-#undef OPTIONAL
-#endif
-
-#endif
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#ifdef _WIN32
+
+// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from
+// above, so we undefine it
+#ifdef OPTIONAL
+#undef OPTIONAL
+#endif
+
+#endif
diff --git a/contrib/libs/apache/arrow/src/arrow/util/config.h b/contrib/libs/apache/arrow/src/arrow/util/config.h
index 2d46017e47e..8464984a732 100644
--- a/contrib/libs/apache/arrow/src/arrow/util/config.h
+++ b/contrib/libs/apache/arrow/src/arrow/util/config.h
@@ -15,18 +15,18 @@
// specific language governing permissions and limitations
// under the License.
-#define ARROW_VERSION_MAJOR 5
+#define ARROW_VERSION_MAJOR 5
#define ARROW_VERSION_MINOR 0
#define ARROW_VERSION_PATCH 0
#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
-#define ARROW_VERSION_STRING "5.0.0"
+#define ARROW_VERSION_STRING "5.0.0"
-#define ARROW_SO_VERSION "500"
-#define ARROW_FULL_SO_VERSION "500.0.0"
+#define ARROW_SO_VERSION "500"
+#define ARROW_FULL_SO_VERSION "500.0.0"
#define ARROW_CXX_COMPILER_ID "GNU"
-#define ARROW_CXX_COMPILER_VERSION "10.2.0"
+#define ARROW_CXX_COMPILER_VERSION "10.2.0"
#define ARROW_CXX_COMPILER_FLAGS " -fdiagnostics-color=always -O3 -DNDEBUG"
#define ARROW_GIT_ID ""
@@ -34,17 +34,17 @@
#define ARROW_PACKAGE_KIND ""
-#define ARROW_COMPUTE
+#define ARROW_COMPUTE
#define ARROW_CSV
-/* #undef ARROW_DATASET */
-/* #undef ARROW_FILESYSTEM */
-/* #undef ARROW_FLIGHT */
-#define ARROW_IPC
-/* #undef ARROW_JSON */
-
+/* #undef ARROW_DATASET */
+/* #undef ARROW_FILESYSTEM */
+/* #undef ARROW_FLIGHT */
+#define ARROW_IPC
+/* #undef ARROW_JSON */
+
/* #undef ARROW_S3 */
-#ifdef __GNUC__
-#define ARROW_USE_NATIVE_INT128
-#endif
+#ifdef __GNUC__
+#define ARROW_USE_NATIVE_INT128
+#endif
/* #undef GRPCPP_PP_INCLUDE */
diff --git a/contrib/libs/apache/arrow/src/parquet/parquet_version.h b/contrib/libs/apache/arrow/src/parquet/parquet_version.h
index e3eec9c46a3..b850604dad6 100644
--- a/contrib/libs/apache/arrow/src/parquet/parquet_version.h
+++ b/contrib/libs/apache/arrow/src/parquet/parquet_version.h
@@ -1,31 +1,31 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PARQUET_VERSION_H
-#define PARQUET_VERSION_H
-
-#define PARQUET_VERSION_MAJOR 5
-#define PARQUET_VERSION_MINOR 0
-#define PARQUET_VERSION_PATCH 0
-
-#define PARQUET_SO_VERSION "500"
-#define PARQUET_FULL_SO_VERSION "500.0.0"
-
-// define the parquet created by version
-#define CREATED_BY_VERSION "parquet-cpp-arrow version 5.0.0"
-
-#endif // PARQUET_VERSION_H
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_VERSION_H
+#define PARQUET_VERSION_H
+
+#define PARQUET_VERSION_MAJOR 5
+#define PARQUET_VERSION_MINOR 0
+#define PARQUET_VERSION_PATCH 0
+
+#define PARQUET_SO_VERSION "500"
+#define PARQUET_FULL_SO_VERSION "500.0.0"
+
+// define the parquet created by version
+#define CREATED_BY_VERSION "parquet-cpp-arrow version 5.0.0"
+
+#endif // PARQUET_VERSION_H
diff --git a/contrib/libs/apache/arrow/ya.make b/contrib/libs/apache/arrow/ya.make
index 27b9235d9e9..ffc8e1955bc 100644
--- a/contrib/libs/apache/arrow/ya.make
+++ b/contrib/libs/apache/arrow/ya.make
@@ -7,9 +7,9 @@ OWNER(
g:cpp-contrib
)
-VERSION(5.0.0)
+VERSION(5.0.0)
-ORIGINAL_SOURCE(https://github.com/apache/arrow/archive/apache-arrow-5.0.0.tar.gz)
+ORIGINAL_SOURCE(https://github.com/apache/arrow/archive/apache-arrow-5.0.0.tar.gz)
LICENSE(
Apache-2.0 AND
@@ -28,33 +28,33 @@ LICENSE(
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
PEERDIR(
- contrib/libs/apache/orc
+ contrib/libs/apache/orc
contrib/libs/brotli/dec
contrib/libs/brotli/enc
contrib/libs/double-conversion
contrib/libs/lz4
- contrib/libs/re2
+ contrib/libs/re2
contrib/libs/snappy
contrib/libs/utf8proc
contrib/libs/xxhash
contrib/libs/zlib
contrib/libs/zstd
- contrib/restricted/boost
- contrib/restricted/fast_float
- contrib/restricted/thrift
- contrib/restricted/uriparser
+ contrib/restricted/boost
+ contrib/restricted/fast_float
+ contrib/restricted/thrift
+ contrib/restricted/uriparser
)
ADDINCL(
- GLOBAL contrib/libs/apache/arrow/cpp/src
- GLOBAL contrib/libs/apache/arrow/src
- contrib/libs/apache/arrow/cpp/src/generated
- contrib/libs/apache/orc/c++/include
+ GLOBAL contrib/libs/apache/arrow/cpp/src
+ GLOBAL contrib/libs/apache/arrow/src
+ contrib/libs/apache/arrow/cpp/src/generated
+ contrib/libs/apache/orc/c++/include
contrib/libs/flatbuffers/include
contrib/libs/lz4
- contrib/libs/re2
+ contrib/libs/re2
contrib/libs/utf8proc
- contrib/libs/zstd/include
+ contrib/libs/zstd/include
contrib/restricted/boost
)
@@ -64,101 +64,101 @@ NO_UTIL()
CFLAGS(
GLOBAL -DARROW_STATIC
- -DARROW_EXPORTING
+ -DARROW_EXPORTING
-DARROW_WITH_BROTLI
-DARROW_WITH_LZ4
- -DARROW_WITH_RE2
+ -DARROW_WITH_RE2
-DARROW_WITH_SNAPPY
-DARROW_WITH_TIMING_TESTS
-DARROW_WITH_UTF8PROC
-DARROW_WITH_ZLIB
-DARROW_WITH_ZSTD
- -DHAVE_INTTYPES_H
- -DHAVE_NETDB_H
- -DPARQUET_EXPORTING
+ -DHAVE_INTTYPES_H
+ -DHAVE_NETDB_H
+ -DPARQUET_EXPORTING
-DURI_STATIC_BUILD
)
-IF (NOT OS_WINDOWS)
+IF (NOT OS_WINDOWS)
CFLAGS(
-DHAVE_NETINET_IN_H
)
-ENDIF()
-
+ENDIF()
+
SRCS(
- cpp/src/arrow/adapters/orc/adapter.cc
- cpp/src/arrow/adapters/orc/adapter_util.cc
- cpp/src/arrow/array/array_base.cc
- cpp/src/arrow/array/array_binary.cc
- cpp/src/arrow/array/array_decimal.cc
- cpp/src/arrow/array/array_dict.cc
- cpp/src/arrow/array/array_nested.cc
- cpp/src/arrow/array/array_primitive.cc
- cpp/src/arrow/array/builder_adaptive.cc
- cpp/src/arrow/array/builder_base.cc
- cpp/src/arrow/array/builder_binary.cc
- cpp/src/arrow/array/builder_decimal.cc
- cpp/src/arrow/array/builder_dict.cc
- cpp/src/arrow/array/builder_nested.cc
- cpp/src/arrow/array/builder_primitive.cc
- cpp/src/arrow/array/builder_union.cc
- cpp/src/arrow/array/concatenate.cc
- cpp/src/arrow/array/data.cc
- cpp/src/arrow/array/diff.cc
- cpp/src/arrow/array/util.cc
- cpp/src/arrow/array/validate.cc
- cpp/src/arrow/buffer.cc
- cpp/src/arrow/builder.cc
- cpp/src/arrow/c/bridge.cc
- cpp/src/arrow/chunked_array.cc
- cpp/src/arrow/compare.cc
- cpp/src/arrow/compute/api_aggregate.cc
- cpp/src/arrow/compute/api_scalar.cc
- cpp/src/arrow/compute/api_vector.cc
- cpp/src/arrow/compute/cast.cc
- cpp/src/arrow/compute/exec.cc
- cpp/src/arrow/compute/exec/exec_plan.cc
- cpp/src/arrow/compute/exec/expression.cc
- cpp/src/arrow/compute/exec/key_compare.cc
- cpp/src/arrow/compute/exec/key_encode.cc
- cpp/src/arrow/compute/exec/key_hash.cc
- cpp/src/arrow/compute/exec/key_map.cc
- cpp/src/arrow/compute/exec/util.cc
- cpp/src/arrow/compute/function.cc
- cpp/src/arrow/compute/function_internal.cc
- cpp/src/arrow/compute/kernel.cc
- cpp/src/arrow/compute/kernels/aggregate_basic.cc
- cpp/src/arrow/compute/kernels/aggregate_mode.cc
- cpp/src/arrow/compute/kernels/aggregate_quantile.cc
- cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
- cpp/src/arrow/compute/kernels/aggregate_var_std.cc
- cpp/src/arrow/compute/kernels/codegen_internal.cc
- cpp/src/arrow/compute/kernels/hash_aggregate.cc
- cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
- cpp/src/arrow/compute/kernels/scalar_boolean.cc
- cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
- cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
- cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
- cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
- cpp/src/arrow/compute/kernels/scalar_cast_string.cc
- cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
- cpp/src/arrow/compute/kernels/scalar_compare.cc
- cpp/src/arrow/compute/kernels/scalar_fill_null.cc
- cpp/src/arrow/compute/kernels/scalar_if_else.cc
- cpp/src/arrow/compute/kernels/scalar_nested.cc
- cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
- cpp/src/arrow/compute/kernels/scalar_string.cc
- cpp/src/arrow/compute/kernels/scalar_temporal.cc
- cpp/src/arrow/compute/kernels/scalar_validity.cc
- cpp/src/arrow/compute/kernels/util_internal.cc
- cpp/src/arrow/compute/kernels/vector_hash.cc
- cpp/src/arrow/compute/kernels/vector_nested.cc
- cpp/src/arrow/compute/kernels/vector_replace.cc
- cpp/src/arrow/compute/kernels/vector_selection.cc
- cpp/src/arrow/compute/kernels/vector_sort.cc
- cpp/src/arrow/compute/registry.cc
- cpp/src/arrow/config.cc
+ cpp/src/arrow/adapters/orc/adapter.cc
+ cpp/src/arrow/adapters/orc/adapter_util.cc
+ cpp/src/arrow/array/array_base.cc
+ cpp/src/arrow/array/array_binary.cc
+ cpp/src/arrow/array/array_decimal.cc
+ cpp/src/arrow/array/array_dict.cc
+ cpp/src/arrow/array/array_nested.cc
+ cpp/src/arrow/array/array_primitive.cc
+ cpp/src/arrow/array/builder_adaptive.cc
+ cpp/src/arrow/array/builder_base.cc
+ cpp/src/arrow/array/builder_binary.cc
+ cpp/src/arrow/array/builder_decimal.cc
+ cpp/src/arrow/array/builder_dict.cc
+ cpp/src/arrow/array/builder_nested.cc
+ cpp/src/arrow/array/builder_primitive.cc
+ cpp/src/arrow/array/builder_union.cc
+ cpp/src/arrow/array/concatenate.cc
+ cpp/src/arrow/array/data.cc
+ cpp/src/arrow/array/diff.cc
+ cpp/src/arrow/array/util.cc
+ cpp/src/arrow/array/validate.cc
+ cpp/src/arrow/buffer.cc
+ cpp/src/arrow/builder.cc
+ cpp/src/arrow/c/bridge.cc
+ cpp/src/arrow/chunked_array.cc
+ cpp/src/arrow/compare.cc
+ cpp/src/arrow/compute/api_aggregate.cc
+ cpp/src/arrow/compute/api_scalar.cc
+ cpp/src/arrow/compute/api_vector.cc
+ cpp/src/arrow/compute/cast.cc
+ cpp/src/arrow/compute/exec.cc
+ cpp/src/arrow/compute/exec/exec_plan.cc
+ cpp/src/arrow/compute/exec/expression.cc
+ cpp/src/arrow/compute/exec/key_compare.cc
+ cpp/src/arrow/compute/exec/key_encode.cc
+ cpp/src/arrow/compute/exec/key_hash.cc
+ cpp/src/arrow/compute/exec/key_map.cc
+ cpp/src/arrow/compute/exec/util.cc
+ cpp/src/arrow/compute/function.cc
+ cpp/src/arrow/compute/function_internal.cc
+ cpp/src/arrow/compute/kernel.cc
+ cpp/src/arrow/compute/kernels/aggregate_basic.cc
+ cpp/src/arrow/compute/kernels/aggregate_mode.cc
+ cpp/src/arrow/compute/kernels/aggregate_quantile.cc
+ cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
+ cpp/src/arrow/compute/kernels/aggregate_var_std.cc
+ cpp/src/arrow/compute/kernels/codegen_internal.cc
+ cpp/src/arrow/compute/kernels/hash_aggregate.cc
+ cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
+ cpp/src/arrow/compute/kernels/scalar_boolean.cc
+ cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
+ cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
+ cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
+ cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
+ cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
+ cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+ cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
+ cpp/src/arrow/compute/kernels/scalar_compare.cc
+ cpp/src/arrow/compute/kernels/scalar_fill_null.cc
+ cpp/src/arrow/compute/kernels/scalar_if_else.cc
+ cpp/src/arrow/compute/kernels/scalar_nested.cc
+ cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
+ cpp/src/arrow/compute/kernels/scalar_string.cc
+ cpp/src/arrow/compute/kernels/scalar_temporal.cc
+ cpp/src/arrow/compute/kernels/scalar_validity.cc
+ cpp/src/arrow/compute/kernels/util_internal.cc
+ cpp/src/arrow/compute/kernels/vector_hash.cc
+ cpp/src/arrow/compute/kernels/vector_nested.cc
+ cpp/src/arrow/compute/kernels/vector_replace.cc
+ cpp/src/arrow/compute/kernels/vector_selection.cc
+ cpp/src/arrow/compute/kernels/vector_sort.cc
+ cpp/src/arrow/compute/registry.cc
+ cpp/src/arrow/config.cc
cpp/src/arrow/csv/chunker.cc
cpp/src/arrow/csv/column_builder.cc
cpp/src/arrow/csv/column_decoder.cc
@@ -167,111 +167,111 @@ SRCS(
cpp/src/arrow/csv/parser.cc
cpp/src/arrow/csv/reader.cc
cpp/src/arrow/csv/writer.cc
- cpp/src/arrow/datum.cc
- cpp/src/arrow/device.cc
- cpp/src/arrow/extension_type.cc
- cpp/src/arrow/io/buffered.cc
- cpp/src/arrow/io/caching.cc
- cpp/src/arrow/io/compressed.cc
- cpp/src/arrow/io/file.cc
- cpp/src/arrow/io/interfaces.cc
- cpp/src/arrow/io/memory.cc
- cpp/src/arrow/io/slow.cc
- cpp/src/arrow/io/stdio.cc
- cpp/src/arrow/io/transform.cc
- cpp/src/arrow/ipc/dictionary.cc
- cpp/src/arrow/ipc/feather.cc
- cpp/src/arrow/ipc/message.cc
- cpp/src/arrow/ipc/metadata_internal.cc
- cpp/src/arrow/ipc/options.cc
- cpp/src/arrow/ipc/reader.cc
- cpp/src/arrow/ipc/writer.cc
- cpp/src/arrow/memory_pool.cc
- cpp/src/arrow/pretty_print.cc
- cpp/src/arrow/record_batch.cc
- cpp/src/arrow/result.cc
- cpp/src/arrow/scalar.cc
- cpp/src/arrow/sparse_tensor.cc
- cpp/src/arrow/status.cc
- cpp/src/arrow/table.cc
- cpp/src/arrow/table_builder.cc
- cpp/src/arrow/tensor.cc
- cpp/src/arrow/tensor/coo_converter.cc
- cpp/src/arrow/tensor/csf_converter.cc
- cpp/src/arrow/tensor/csx_converter.cc
- cpp/src/arrow/type.cc
- cpp/src/arrow/util/basic_decimal.cc
- cpp/src/arrow/util/bit_block_counter.cc
- cpp/src/arrow/util/bit_run_reader.cc
- cpp/src/arrow/util/bit_util.cc
- cpp/src/arrow/util/bitmap.cc
- cpp/src/arrow/util/bitmap_builders.cc
- cpp/src/arrow/util/bitmap_ops.cc
- cpp/src/arrow/util/bpacking.cc
- cpp/src/arrow/util/cancel.cc
- cpp/src/arrow/util/compression.cc
- cpp/src/arrow/util/compression_brotli.cc
- cpp/src/arrow/util/compression_lz4.cc
- cpp/src/arrow/util/compression_snappy.cc
- cpp/src/arrow/util/compression_zlib.cc
- cpp/src/arrow/util/compression_zstd.cc
- cpp/src/arrow/util/cpu_info.cc
- cpp/src/arrow/util/decimal.cc
- cpp/src/arrow/util/delimiting.cc
- cpp/src/arrow/util/formatting.cc
- cpp/src/arrow/util/future.cc
- cpp/src/arrow/util/int_util.cc
- cpp/src/arrow/util/io_util.cc
- cpp/src/arrow/util/key_value_metadata.cc
- cpp/src/arrow/util/logging.cc
- cpp/src/arrow/util/memory.cc
- cpp/src/arrow/util/mutex.cc
- cpp/src/arrow/util/string.cc
- cpp/src/arrow/util/string_builder.cc
- cpp/src/arrow/util/task_group.cc
- cpp/src/arrow/util/tdigest.cc
- cpp/src/arrow/util/thread_pool.cc
- cpp/src/arrow/util/time.cc
- cpp/src/arrow/util/trie.cc
- cpp/src/arrow/util/uri.cc
- cpp/src/arrow/util/utf8.cc
- cpp/src/arrow/util/value_parsing.cc
- cpp/src/arrow/vendored/base64.cpp
- cpp/src/arrow/vendored/datetime/tz.cpp
- cpp/src/arrow/vendored/musl/strptime.c
- cpp/src/arrow/visitor.cc
- cpp/src/generated/parquet_constants.cpp
- cpp/src/generated/parquet_types.cpp
- cpp/src/parquet/arrow/path_internal.cc
- cpp/src/parquet/arrow/reader.cc
- cpp/src/parquet/arrow/reader_internal.cc
- cpp/src/parquet/arrow/schema.cc
- cpp/src/parquet/arrow/schema_internal.cc
- cpp/src/parquet/arrow/writer.cc
- cpp/src/parquet/bloom_filter.cc
- cpp/src/parquet/column_reader.cc
- cpp/src/parquet/column_scanner.cc
- cpp/src/parquet/column_writer.cc
- cpp/src/parquet/encoding.cc
- cpp/src/parquet/encryption/encryption.cc
- cpp/src/parquet/encryption/encryption_internal_nossl.cc
- cpp/src/parquet/encryption/internal_file_decryptor.cc
- cpp/src/parquet/encryption/internal_file_encryptor.cc
- cpp/src/parquet/exception.cc
- cpp/src/parquet/file_reader.cc
- cpp/src/parquet/file_writer.cc
- cpp/src/parquet/level_comparison.cc
- cpp/src/parquet/level_conversion.cc
- cpp/src/parquet/metadata.cc
- cpp/src/parquet/murmur3.cc
- cpp/src/parquet/platform.cc
- cpp/src/parquet/printer.cc
- cpp/src/parquet/properties.cc
- cpp/src/parquet/schema.cc
- cpp/src/parquet/statistics.cc
- cpp/src/parquet/stream_reader.cc
- cpp/src/parquet/stream_writer.cc
- cpp/src/parquet/types.cc
+ cpp/src/arrow/datum.cc
+ cpp/src/arrow/device.cc
+ cpp/src/arrow/extension_type.cc
+ cpp/src/arrow/io/buffered.cc
+ cpp/src/arrow/io/caching.cc
+ cpp/src/arrow/io/compressed.cc
+ cpp/src/arrow/io/file.cc
+ cpp/src/arrow/io/interfaces.cc
+ cpp/src/arrow/io/memory.cc
+ cpp/src/arrow/io/slow.cc
+ cpp/src/arrow/io/stdio.cc
+ cpp/src/arrow/io/transform.cc
+ cpp/src/arrow/ipc/dictionary.cc
+ cpp/src/arrow/ipc/feather.cc
+ cpp/src/arrow/ipc/message.cc
+ cpp/src/arrow/ipc/metadata_internal.cc
+ cpp/src/arrow/ipc/options.cc
+ cpp/src/arrow/ipc/reader.cc
+ cpp/src/arrow/ipc/writer.cc
+ cpp/src/arrow/memory_pool.cc
+ cpp/src/arrow/pretty_print.cc
+ cpp/src/arrow/record_batch.cc
+ cpp/src/arrow/result.cc
+ cpp/src/arrow/scalar.cc
+ cpp/src/arrow/sparse_tensor.cc
+ cpp/src/arrow/status.cc
+ cpp/src/arrow/table.cc
+ cpp/src/arrow/table_builder.cc
+ cpp/src/arrow/tensor.cc
+ cpp/src/arrow/tensor/coo_converter.cc
+ cpp/src/arrow/tensor/csf_converter.cc
+ cpp/src/arrow/tensor/csx_converter.cc
+ cpp/src/arrow/type.cc
+ cpp/src/arrow/util/basic_decimal.cc
+ cpp/src/arrow/util/bit_block_counter.cc
+ cpp/src/arrow/util/bit_run_reader.cc
+ cpp/src/arrow/util/bit_util.cc
+ cpp/src/arrow/util/bitmap.cc
+ cpp/src/arrow/util/bitmap_builders.cc
+ cpp/src/arrow/util/bitmap_ops.cc
+ cpp/src/arrow/util/bpacking.cc
+ cpp/src/arrow/util/cancel.cc
+ cpp/src/arrow/util/compression.cc
+ cpp/src/arrow/util/compression_brotli.cc
+ cpp/src/arrow/util/compression_lz4.cc
+ cpp/src/arrow/util/compression_snappy.cc
+ cpp/src/arrow/util/compression_zlib.cc
+ cpp/src/arrow/util/compression_zstd.cc
+ cpp/src/arrow/util/cpu_info.cc
+ cpp/src/arrow/util/decimal.cc
+ cpp/src/arrow/util/delimiting.cc
+ cpp/src/arrow/util/formatting.cc
+ cpp/src/arrow/util/future.cc
+ cpp/src/arrow/util/int_util.cc
+ cpp/src/arrow/util/io_util.cc
+ cpp/src/arrow/util/key_value_metadata.cc
+ cpp/src/arrow/util/logging.cc
+ cpp/src/arrow/util/memory.cc
+ cpp/src/arrow/util/mutex.cc
+ cpp/src/arrow/util/string.cc
+ cpp/src/arrow/util/string_builder.cc
+ cpp/src/arrow/util/task_group.cc
+ cpp/src/arrow/util/tdigest.cc
+ cpp/src/arrow/util/thread_pool.cc
+ cpp/src/arrow/util/time.cc
+ cpp/src/arrow/util/trie.cc
+ cpp/src/arrow/util/uri.cc
+ cpp/src/arrow/util/utf8.cc
+ cpp/src/arrow/util/value_parsing.cc
+ cpp/src/arrow/vendored/base64.cpp
+ cpp/src/arrow/vendored/datetime/tz.cpp
+ cpp/src/arrow/vendored/musl/strptime.c
+ cpp/src/arrow/visitor.cc
+ cpp/src/generated/parquet_constants.cpp
+ cpp/src/generated/parquet_types.cpp
+ cpp/src/parquet/arrow/path_internal.cc
+ cpp/src/parquet/arrow/reader.cc
+ cpp/src/parquet/arrow/reader_internal.cc
+ cpp/src/parquet/arrow/schema.cc
+ cpp/src/parquet/arrow/schema_internal.cc
+ cpp/src/parquet/arrow/writer.cc
+ cpp/src/parquet/bloom_filter.cc
+ cpp/src/parquet/column_reader.cc
+ cpp/src/parquet/column_scanner.cc
+ cpp/src/parquet/column_writer.cc
+ cpp/src/parquet/encoding.cc
+ cpp/src/parquet/encryption/encryption.cc
+ cpp/src/parquet/encryption/encryption_internal_nossl.cc
+ cpp/src/parquet/encryption/internal_file_decryptor.cc
+ cpp/src/parquet/encryption/internal_file_encryptor.cc
+ cpp/src/parquet/exception.cc
+ cpp/src/parquet/file_reader.cc
+ cpp/src/parquet/file_writer.cc
+ cpp/src/parquet/level_comparison.cc
+ cpp/src/parquet/level_conversion.cc
+ cpp/src/parquet/metadata.cc
+ cpp/src/parquet/murmur3.cc
+ cpp/src/parquet/platform.cc
+ cpp/src/parquet/printer.cc
+ cpp/src/parquet/properties.cc
+ cpp/src/parquet/schema.cc
+ cpp/src/parquet/statistics.cc
+ cpp/src/parquet/stream_reader.cc
+ cpp/src/parquet/stream_writer.cc
+ cpp/src/parquet/types.cc
)
END()
diff --git a/contrib/libs/apache/avro/.yandex_meta/devtools.licenses.report b/contrib/libs/apache/avro/.yandex_meta/devtools.licenses.report
index 0c6b8995a02..2b71d33024e 100644
--- a/contrib/libs/apache/avro/.yandex_meta/devtools.licenses.report
+++ b/contrib/libs/apache/avro/.yandex_meta/devtools.licenses.report
@@ -44,7 +44,7 @@ BELONGS ya.make
KEEP Apache-2.0 807648c73ed5fa476be45e9005a86248
BELONGS ya.make
-FILE_INCLUDE NOTICE found in files: api/AvroParse.hh at line 3, api/AvroSerialize.hh at line 3, api/AvroTraits.hh at line 3, api/Compiler.hh at line 3, api/Config.hh at line 3, api/DataFile.hh at line 3, api/Decoder.hh at line 3, api/Encoder.hh at line 3, api/Exception.hh at line 3, api/Generic.hh at line 3, api/GenericDatum.hh at line 3, api/Layout.hh at line 3, api/LogicalType.hh at line 3, api/Node.hh at line 3, api/NodeConcepts.hh at line 3, api/NodeImpl.hh at line 3, api/Parser.hh at line 3, api/Reader.hh at line 3, api/Resolver.hh at line 3, api/ResolverSchema.hh at line 3, api/ResolvingReader.hh at line 3, api/Schema.hh at line 3, api/SchemaResolution.hh at line 3, api/Serializer.hh at line 3, api/Specific.hh at line 3, api/Stream.hh at line 3, api/Types.hh at line 3, api/ValidSchema.hh at line 3, api/Validator.hh at line 3, api/Writer.hh at line 3, api/Zigzag.hh at line 3, api/buffer/Buffer.hh at line 3, api/buffer/BufferReader.hh at line 3, api/buffer/detail/BufferDetail.hh at line 3, api/buffer/detail/BufferDetailIterator.hh at line 3, impl/BinaryDecoder.cc at line 3, impl/BinaryEncoder.cc at line 3, impl/Compiler.cc at line 3, impl/DataFile.cc at line 3, impl/FileStream.cc at line 3, impl/Generic.cc at line 3, impl/GenericDatum.cc at line 3, impl/LogicalType.cc at line 3, impl/Node.cc at line 3, impl/NodeImpl.cc at line 3, impl/Resolver.cc at line 4, impl/ResolverSchema.cc at line 4, impl/Schema.cc at line 3, impl/Stream.cc at line 3, impl/Types.cc at line 3, impl/ValidSchema.cc at line 3, impl/Validator.cc at line 3, impl/Zigzag.cc at line 3, impl/json/JsonDom.cc at line 3, impl/json/JsonDom.hh at line 3, impl/json/JsonIO.cc at line 3, impl/json/JsonIO.hh at line 3, impl/parsing/JsonCodec.cc at line 3, impl/parsing/ResolvingDecoder.cc at line 3, impl/parsing/Symbol.cc at line 3, impl/parsing/Symbol.hh at line 3, impl/parsing/ValidatingCodec.cc at line 3, impl/parsing/ValidatingCodec.hh at line 3
+FILE_INCLUDE NOTICE found in files: api/AvroParse.hh at line 3, api/AvroSerialize.hh at line 3, api/AvroTraits.hh at line 3, api/Compiler.hh at line 3, api/Config.hh at line 3, api/DataFile.hh at line 3, api/Decoder.hh at line 3, api/Encoder.hh at line 3, api/Exception.hh at line 3, api/Generic.hh at line 3, api/GenericDatum.hh at line 3, api/Layout.hh at line 3, api/LogicalType.hh at line 3, api/Node.hh at line 3, api/NodeConcepts.hh at line 3, api/NodeImpl.hh at line 3, api/Parser.hh at line 3, api/Reader.hh at line 3, api/Resolver.hh at line 3, api/ResolverSchema.hh at line 3, api/ResolvingReader.hh at line 3, api/Schema.hh at line 3, api/SchemaResolution.hh at line 3, api/Serializer.hh at line 3, api/Specific.hh at line 3, api/Stream.hh at line 3, api/Types.hh at line 3, api/ValidSchema.hh at line 3, api/Validator.hh at line 3, api/Writer.hh at line 3, api/Zigzag.hh at line 3, api/buffer/Buffer.hh at line 3, api/buffer/BufferReader.hh at line 3, api/buffer/detail/BufferDetail.hh at line 3, api/buffer/detail/BufferDetailIterator.hh at line 3, impl/BinaryDecoder.cc at line 3, impl/BinaryEncoder.cc at line 3, impl/Compiler.cc at line 3, impl/DataFile.cc at line 3, impl/FileStream.cc at line 3, impl/Generic.cc at line 3, impl/GenericDatum.cc at line 3, impl/LogicalType.cc at line 3, impl/Node.cc at line 3, impl/NodeImpl.cc at line 3, impl/Resolver.cc at line 4, impl/ResolverSchema.cc at line 4, impl/Schema.cc at line 3, impl/Stream.cc at line 3, impl/Types.cc at line 3, impl/ValidSchema.cc at line 3, impl/Validator.cc at line 3, impl/Zigzag.cc at line 3, impl/json/JsonDom.cc at line 3, impl/json/JsonDom.hh at line 3, impl/json/JsonIO.cc at line 3, impl/json/JsonIO.hh at line 3, impl/parsing/JsonCodec.cc at line 3, impl/parsing/ResolvingDecoder.cc at line 3, impl/parsing/Symbol.cc at line 3, impl/parsing/Symbol.hh at line 3, impl/parsing/ValidatingCodec.cc at line 3, impl/parsing/ValidatingCodec.hh at line 3
Note: matched license text is too long. Read it in the source files.
Scancode info:
Original SPDX id: Apache-2.0
@@ -52,8 +52,8 @@ FILE_INCLUDE NOTICE found in files: api/AvroParse.hh at line 3, api/AvroSerializ
Match type : NOTICE
Links : http://www.apache.org/licenses/, http://www.apache.org/licenses/LICENSE-2.0, https://spdx.org/licenses/Apache-2.0
Files with this license:
- api/AvroParse.hh [2:16]
- api/AvroSerialize.hh [2:16]
+ api/AvroParse.hh [2:16]
+ api/AvroSerialize.hh [2:16]
api/AvroTraits.hh [2:16]
api/Compiler.hh [2:16]
api/Config.hh [2:16]
@@ -68,20 +68,20 @@ FILE_INCLUDE NOTICE found in files: api/AvroParse.hh at line 3, api/AvroSerializ
api/Node.hh [2:16]
api/NodeConcepts.hh [2:16]
api/NodeImpl.hh [2:16]
- api/Parser.hh [2:16]
+ api/Parser.hh [2:16]
api/Reader.hh [2:16]
api/Resolver.hh [2:16]
api/ResolverSchema.hh [2:16]
- api/ResolvingReader.hh [2:16]
+ api/ResolvingReader.hh [2:16]
api/Schema.hh [2:16]
api/SchemaResolution.hh [2:16]
- api/Serializer.hh [2:16]
+ api/Serializer.hh [2:16]
api/Specific.hh [2:16]
api/Stream.hh [2:16]
api/Types.hh [2:16]
api/ValidSchema.hh [2:16]
api/Validator.hh [2:16]
- api/Writer.hh [2:16]
+ api/Writer.hh [2:16]
api/Zigzag.hh [2:16]
api/buffer/Buffer.hh [2:16]
api/buffer/BufferReader.hh [2:16]
diff --git a/contrib/libs/apache/avro/AUTHORS b/contrib/libs/apache/avro/AUTHORS
index 3ebe7c3f828..ce1e0107b4d 100644
--- a/contrib/libs/apache/avro/AUTHORS
+++ b/contrib/libs/apache/avro/AUTHORS
@@ -1,4 +1,4 @@
-
-See https://avro.apache.org/ for a list of authors
-
-
+
+See https://avro.apache.org/ for a list of authors
+
+
diff --git a/contrib/libs/apache/avro/ChangeLog b/contrib/libs/apache/avro/ChangeLog
index 317b99f4d64..596d5ebc249 100644
--- a/contrib/libs/apache/avro/ChangeLog
+++ b/contrib/libs/apache/avro/ChangeLog
@@ -1 +1 @@
-Refer to CHANGES.txt in the root of the avro repository for the change log
+Refer to CHANGES.txt in the root of the avro repository for the change log
diff --git a/contrib/libs/apache/avro/LICENSE b/contrib/libs/apache/avro/LICENSE
index d641439cded..af50186a8b0 100644
--- a/contrib/libs/apache/avro/LICENSE
+++ b/contrib/libs/apache/avro/LICENSE
@@ -1,261 +1,261 @@
-
- Apache License
- Version 2.0, January 2004
- https://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- https://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-----------------------------------------------------------------------
-License for the m4 macros used by the C++ implementation:
-
-Files:
-* lang/c++/m4/m4_ax_boost_system.m4
- Copyright (c) 2008 Thomas Porschberg <[email protected]>
- Copyright (c) 2008 Michael Tindal
- Copyright (c) 2008 Daniel Casimiro <[email protected]>
-* lang/c++/m4/m4_ax_boost_asio.m4
- Copyright (c) 2008 Thomas Porschberg <[email protected]>
- Copyright (c) 2008 Pete Greenwell <[email protected]>
-* lang/c++/m4/m4_ax_boost_filesystem.m4
- Copyright (c) 2009 Thomas Porschberg <[email protected]>
- Copyright (c) 2009 Michael Tindal
- Copyright (c) 2009 Roman Rybalko <[email protected]>
-* lang/c++/m4/m4_ax_boost_thread.m4
- Copyright (c) 2009 Thomas Porschberg <[email protected]>
- Copyright (c) 2009 Michael Tindal
-* lang/c++/m4/m4_ax_boost_regex.m4
- Copyright (c) 2008 Thomas Porschberg <[email protected]>
- Copyright (c) 2008 Michael Tindal
-* lang/c++/m4/m4_ax_boost_base.m4
- Copyright (c) 2008 Thomas Porschberg <[email protected]>
-
-License text:
-| Copying and distribution of this file, with or without modification, are
-| permitted in any medium without royalty provided the copyright notice
-| and this notice are preserved. This file is offered as-is, without any
-| warranty.
-
-----------------------------------------------------------------------
-License for the AVRO_BOOT_NO_TRAIT code in the C++ implementation:
-File: lang/c++/api/Boost.hh
-
-| Boost Software License - Version 1.0 - August 17th, 2003
-|
-| Permission is hereby granted, free of charge, to any person or organization
-| obtaining a copy of the software and accompanying documentation covered by
-| this license (the "Software") to use, reproduce, display, distribute,
-| execute, and transmit the Software, and to prepare derivative works of the
-| Software, and to permit third-parties to whom the Software is furnished to
-| do so, all subject to the following:
-|
-| The copyright notices in the Software and this entire statement, including
-| the above license grant, this restriction and the following disclaimer,
-| must be included in all copies of the Software, in whole or in part, and
-| all derivative works of the Software, unless such copies or derivative
-| works are solely in the form of machine-executable object code generated by
-| a source language processor.
-|
-| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-| FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-| SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-| FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-| ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-| DEALINGS IN THE SOFTWARE.
-
+
+ Apache License
+ Version 2.0, January 2004
+ https://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+----------------------------------------------------------------------
+License for the m4 macros used by the C++ implementation:
+
+Files:
+* lang/c++/m4/m4_ax_boost_system.m4
+ Copyright (c) 2008 Thomas Porschberg <[email protected]>
+ Copyright (c) 2008 Michael Tindal
+ Copyright (c) 2008 Daniel Casimiro <[email protected]>
+* lang/c++/m4/m4_ax_boost_asio.m4
+ Copyright (c) 2008 Thomas Porschberg <[email protected]>
+ Copyright (c) 2008 Pete Greenwell <[email protected]>
+* lang/c++/m4/m4_ax_boost_filesystem.m4
+ Copyright (c) 2009 Thomas Porschberg <[email protected]>
+ Copyright (c) 2009 Michael Tindal
+ Copyright (c) 2009 Roman Rybalko <[email protected]>
+* lang/c++/m4/m4_ax_boost_thread.m4
+ Copyright (c) 2009 Thomas Porschberg <[email protected]>
+ Copyright (c) 2009 Michael Tindal
+* lang/c++/m4/m4_ax_boost_regex.m4
+ Copyright (c) 2008 Thomas Porschberg <[email protected]>
+ Copyright (c) 2008 Michael Tindal
+* lang/c++/m4/m4_ax_boost_base.m4
+ Copyright (c) 2008 Thomas Porschberg <[email protected]>
+
+License text:
+| Copying and distribution of this file, with or without modification, are
+| permitted in any medium without royalty provided the copyright notice
+| and this notice are preserved. This file is offered as-is, without any
+| warranty.
+
+----------------------------------------------------------------------
+License for the AVRO_BOOT_NO_TRAIT code in the C++ implementation:
+File: lang/c++/api/Boost.hh
+
+| Boost Software License - Version 1.0 - August 17th, 2003
+|
+| Permission is hereby granted, free of charge, to any person or organization
+| obtaining a copy of the software and accompanying documentation covered by
+| this license (the "Software") to use, reproduce, display, distribute,
+| execute, and transmit the Software, and to prepare derivative works of the
+| Software, and to permit third-parties to whom the Software is furnished to
+| do so, all subject to the following:
+|
+| The copyright notices in the Software and this entire statement, including
+| the above license grant, this restriction and the following disclaimer,
+| must be included in all copies of the Software, in whole or in part, and
+| all derivative works of the Software, unless such copies or derivative
+| works are solely in the form of machine-executable object code generated by
+| a source language processor.
+|
+| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+| FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+| SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+| FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+| ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+| DEALINGS IN THE SOFTWARE.
+
diff --git a/contrib/libs/apache/avro/MSBUILD.md b/contrib/libs/apache/avro/MSBUILD.md
index 11ffd9851e9..dfd339bc033 100644
--- a/contrib/libs/apache/avro/MSBUILD.md
+++ b/contrib/libs/apache/avro/MSBUILD.md
@@ -1,33 +1,33 @@
-<!--
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements. See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership. The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License. You may obtain a copy of the License at
-
-https://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-# Visual Studio 2019 Build Instructions
-
-## Prerequisites
-
- * Microsoft Visual Studio 2019.
- * CMake >= 3.12 (should be supplied as part of VS2019 installation).
- * Clone [https://github.com/spektom/snappy-visual-cpp](https://github.com/spektom/snappy-visual-cpp), and follow build instructions in `README.md`.
- * Install Boost from [https://netcologne.dl.sourceforge.net/project/boost/boost-binaries/1.68.0/boost_1_68_0-msvc-14.1-64.exe](https://netcologne.dl.sourceforge.net/project/boost/boost-binaries/1.68.0/boost_1_68_0-msvc-14.1-64.exe).
- * Add `C:\<path to>\boost_1_68_0\lib64-msvc-14.1` to the PATH environment variable.
-
-## Building
-
- cd lang\c++
- cmake -G "Visual Studio 16 2019" -DBOOST_ROOT=C:\<path to>\boost_1_68_0 -DBOOST_INCLUDEDIR=c:\<path to>\boost_1_68_0\boost -DBOOST_LIBRARYDIR=c:\<path to>\boost_1_68_0\lib64-msvc-14.1 -DSNAPPY_INCLUDE_DIR=C:\<path to>\snappy-visual-cpp -DSNAPPY_LIBRARIES=C:\<path to>\snappy-visual-cpp\x64\Release\snappy.lib ..
- msbuild Avro-cpp.sln /p:Configuration=Release /p:Platform=x64
-
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+# Visual Studio 2019 Build Instructions
+
+## Prerequisites
+
+ * Microsoft Visual Studio 2019.
+ * CMake >= 3.12 (should be supplied as part of VS2019 installation).
+ * Clone [https://github.com/spektom/snappy-visual-cpp](https://github.com/spektom/snappy-visual-cpp), and follow build instructions in `README.md`.
+ * Install Boost from [https://netcologne.dl.sourceforge.net/project/boost/boost-binaries/1.68.0/boost_1_68_0-msvc-14.1-64.exe](https://netcologne.dl.sourceforge.net/project/boost/boost-binaries/1.68.0/boost_1_68_0-msvc-14.1-64.exe).
+ * Add `C:\<path to>\boost_1_68_0\lib64-msvc-14.1` to the PATH environment variable.
+
+## Building
+
+ cd lang\c++
+ cmake -G "Visual Studio 16 2019" -DBOOST_ROOT=C:\<path to>\boost_1_68_0 -DBOOST_INCLUDEDIR=c:\<path to>\boost_1_68_0\boost -DBOOST_LIBRARYDIR=c:\<path to>\boost_1_68_0\lib64-msvc-14.1 -DSNAPPY_INCLUDE_DIR=C:\<path to>\snappy-visual-cpp -DSNAPPY_LIBRARIES=C:\<path to>\snappy-visual-cpp\x64\Release\snappy.lib ..
+ msbuild Avro-cpp.sln /p:Configuration=Release /p:Platform=x64
+
diff --git a/contrib/libs/apache/avro/NEWS b/contrib/libs/apache/avro/NEWS
index 415280b0a2c..8b7b12d8d2a 100644
--- a/contrib/libs/apache/avro/NEWS
+++ b/contrib/libs/apache/avro/NEWS
@@ -1,5 +1,5 @@
-
-For news, visit the Avro web site at
-https://avro.apache.org/
-
-
+
+For news, visit the Avro web site at
+https://avro.apache.org/
+
+
diff --git a/contrib/libs/apache/avro/NOTICE b/contrib/libs/apache/avro/NOTICE
index 859ea81d74f..35e863231e2 100644
--- a/contrib/libs/apache/avro/NOTICE
+++ b/contrib/libs/apache/avro/NOTICE
@@ -1,6 +1,6 @@
-Apache Avro
-Copyright 2010-2015 The Apache Software Foundation
-
-This product includes software developed at
-The Apache Software Foundation (https://www.apache.org/).
-
+Apache Avro
+Copyright 2010-2015 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (https://www.apache.org/).
+
diff --git a/contrib/libs/apache/avro/README b/contrib/libs/apache/avro/README
index 6b081f13a86..e03d42023e1 100644
--- a/contrib/libs/apache/avro/README
+++ b/contrib/libs/apache/avro/README
@@ -1,69 +1,69 @@
-Avro C++ README.txt
-
-The C++ port is thus far incomplete. Currently, it contains:
-
- - Serializer/Parser- objects for writing/reading raw binary.
-
- - xxxSchema- objects for composing schemas.
-
- - ValidSchema- a schema object that has been converted to a parse tree
- (with some sanity checks).
-
- - ValidSchema.toJson() writes the schema as a JSON object.
-
- - ValidatingSerializer/ValidatingParser- check that reads/writes
- match the expected schema type (more expensive than the raw
- serializer/parser but they detect errors, and allow dynamic
- discovery of parsed data/attributes).
-
- - Compiler (compileJsonSchema())- converts a Json string schema to a
- ValidSchema.
-
- - Code Generation (experimental) - given a schema it generates C++
- objects of the same data types, and the code to serialize and parse
- it.
-
-What's missing: Rpc containers are not yet implemented. Documentation is sparse.
-
-INSTRUCTIONS
-
-Pre-requisites:
-
-Compiling requires the Boost headers and the Boost regex library. The Snappy compression library is optional: if Snappy is available, support for Snappy compression is built; otherwise it is skipped. (Please see the OS-specific instructions on how to install Boost and Snappy.)
-
-Building requires CMake 2.6 or later.
-
-To generate a Makefile under Unix, MacOS (using GNU) or Cygwin use:
-
-mkdir build
-cd build
-cmake -G "Unix Makefiles" ..
-
-If it doesn't work, either you are missing some packages (boost, flex, or bison),
-or you need to help CMake locate them.
-
-If the Makefile is configured correctly, then you can make and run tests:
-
- make
- ctest
-
-To install
-
- make package
-
-and then untar the generated .tar.gz file.
-
-To build and test on MacOS (using Xcode)
-
-mkdir build.mac
-cd build.mac
-cmake -G Xcode
-
-xcodebuild -configuration Release
-ctest -C Release
-
-If a debug version is required, replace 'Release' above with 'Debug'.
-
-Note: The LICENSE and NOTICE files in the lang/c++ source directory are used to
-build the binary distribution. The LICENSE and NOTICE information for the Avro
-C++ source distribution is in the root directory.
+Avro C++ README.txt
+
+The C++ port is thus far incomplete. Currently, it contains:
+
+ - Serializer/Parser- objects for writing/reading raw binary.
+
+ - xxxSchema- objects for composing schemas.
+
+ - ValidSchema- a schema object that has been converted to a parse tree
+ (with some sanity checks).
+
+ - ValidSchema.toJson() writes the schema as a JSON object.
+
+ - ValidatingSerializer/ValidatingParser- check that reads/writes
+ match the expected schema type (more expensive than the raw
+ serializer/parser but they detect errors, and allow dynamic
+ discovery of parsed data/attributes).
+
+ - Compiler (compileJsonSchema())- converts a Json string schema to a
+ ValidSchema.
+
+ - Code Generation (experimental) - given a schema it generates C++
+ objects of the same data types, and the code to serialize and parse
+ it.
+
+What's missing: Rpc containers are not yet implemented. Documentation is sparse.
+
+INSTRUCTIONS
+
+Pre-requisites:
+
+Compiling requires the Boost headers and the Boost regex library. The Snappy compression library is optional: if Snappy is available, support for Snappy compression is built; otherwise it is skipped. (Please see the OS-specific instructions on how to install Boost and Snappy.)
+
+Building requires CMake 2.6 or later.
+
+To generate a Makefile under Unix, MacOS (using GNU) or Cygwin use:
+
+mkdir build
+cd build
+cmake -G "Unix Makefiles" ..
+
+If it doesn't work, either you are missing some packages (boost, flex, or bison),
+or you need to help CMake locate them.
+
+If the Makefile is configured correctly, then you can make and run tests:
+
+ make
+ ctest
+
+To install
+
+ make package
+
+and then untar the generated .tar.gz file.
+
+To build and test on MacOS (using Xcode)
+
+mkdir build.mac
+cd build.mac
+cmake -G Xcode
+
+xcodebuild -configuration Release
+ctest -C Release
+
+If a debug version is required, replace 'Release' above with 'Debug'.
+
+Note: The LICENSE and NOTICE files in the lang/c++ source directory are used to
+build the binary distribution. The LICENSE and NOTICE information for the Avro
+C++ source distribution is in the root directory.
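The README's mention of compileJsonSchema() is easiest to see with a concrete call. Below is a minimal sketch, assuming the compileJsonSchemaFromString() convenience entry point declared in api/Compiler.hh; the Point record schema is a made-up example, not part of the Avro sources.

    #include <iostream>
    #include <string>

    #include "Compiler.hh"
    #include "ValidSchema.hh"

    int main() {
        // A made-up two-field record schema, used only for illustration.
        const std::string json =
            "{\"type\":\"record\",\"name\":\"Point\",\"fields\":["
            "{\"name\":\"x\",\"type\":\"int\"},"
            "{\"name\":\"y\",\"type\":\"int\"}]}";

        // Compile the JSON text into the checked parse tree (ValidSchema).
        avro::ValidSchema schema = avro::compileJsonSchemaFromString(json);

        // ValidSchema::toJson() writes the schema back out as JSON.
        schema.toJson(std::cout);
        return 0;
    }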
diff --git a/contrib/libs/apache/avro/api/AvroParse.hh b/contrib/libs/apache/avro/api/AvroParse.hh
index dd2b98c419d..999ff861221 100644
--- a/contrib/libs/apache/avro/api/AvroParse.hh
+++ b/contrib/libs/apache/avro/api/AvroParse.hh
@@ -1,85 +1,85 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_AvroParse_hh__
-#define avro_AvroParse_hh__
-
-#include "Config.hh"
-#include "AvroTraits.hh"
-#include "ResolvingReader.hh"
-
-/// \file
-///
-/// Standalone parse functions for Avro types.
-
-namespace avro {
-
-/// The main parse entry point function. Takes a parser (either validating or
-/// plain) and the object that should receive the parsed data.
-
-template <typename Reader, typename T>
-void parse(Reader &p, T& val)
-{
- parse(p, val, is_serializable<T>());
-}
-
-template <typename T>
-void parse(ResolvingReader &p, T& val)
-{
- translatingParse(p, val, is_serializable<T>());
-}
-
-/// The type trait must be is_serializable; otherwise this overload forces the compiler to complain.
-
-template <typename Reader, typename T>
-void parse(Reader &p, T& val, const std::false_type &)
-{
- static_assert(sizeof(T) == 0, "Not a valid type to parse");
-}
-
-template <typename Reader, typename T>
-void translatingParse(Reader &p, T& val, const std::false_type &)
-{
- static_assert(sizeof(T) == 0, "Not a valid type to parse");
-}
-
-// @{
-
-/// The remainder of the file includes default implementations for serializable types.
-
-
-template <typename Reader, typename T>
-void parse(Reader &p, T &val, const std::true_type &) {
- p.readValue(val);
-}
-
-template <typename Reader>
-void parse(Reader &p, std::vector<uint8_t> &val, const std::true_type &) {
- p.readBytes(val);
-}
-
-template<typename T>
-void translatingParse(ResolvingReader &p, T& val, const std::true_type &) {
- p.parse(val);
-}
-
-// @}
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_AvroParse_hh__
+#define avro_AvroParse_hh__
+
+#include "Config.hh"
+#include "AvroTraits.hh"
+#include "ResolvingReader.hh"
+
+/// \file
+///
+/// Standalone parse functions for Avro types.
+
+namespace avro {
+
+/// The main parse entry point function. Takes a parser (either validating or
+/// plain) and the object that should receive the parsed data.
+
+template <typename Reader, typename T>
+void parse(Reader &p, T& val)
+{
+ parse(p, val, is_serializable<T>());
+}
+
+template <typename T>
+void parse(ResolvingReader &p, T& val)
+{
+ translatingParse(p, val, is_serializable<T>());
+}
+
+/// The type trait must be is_serializable; otherwise this overload forces the compiler to complain.
+
+template <typename Reader, typename T>
+void parse(Reader &p, T& val, const std::false_type &)
+{
+ static_assert(sizeof(T) == 0, "Not a valid type to parse");
+}
+
+template <typename Reader, typename T>
+void translatingParse(Reader &p, T& val, const std::false_type &)
+{
+ static_assert(sizeof(T) == 0, "Not a valid type to parse");
+}
+
+// @{
+
+/// The remainder of the file includes default implementations for serializable types.
+
+
+template <typename Reader, typename T>
+void parse(Reader &p, T &val, const std::true_type &) {
+ p.readValue(val);
+}
+
+template <typename Reader>
+void parse(Reader &p, std::vector<uint8_t> &val, const std::true_type &) {
+ p.readBytes(val);
+}
+
+template<typename T>
+void translatingParse(ResolvingReader &p, T& val, const std::true_type &) {
+ p.parse(val);
+}
+
+// @}
+
+} // namespace avro
+
+#endif
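The dispatch in AvroParse.hh is worth spelling out: the public two-argument parse() re-dispatches on is_serializable<T>(), so supported types reach Reader::readValue() while everything else lands on the false_type overload and its static_assert. Here is a self-contained sketch of the same idiom, using hypothetical MiniReader and is_mini_serializable stand-ins rather than the real Avro types.

    #include <iostream>
    #include <type_traits>

    // Hypothetical stand-ins for an Avro reader and the trait.
    struct MiniReader {
        void readValue(int &v) { v = 42; }
    };

    template <typename T> struct is_mini_serializable : std::false_type {};
    template <> struct is_mini_serializable<int> : std::true_type {};

    // Chosen when the trait says the type is serializable.
    template <typename Reader, typename T>
    void parse(Reader &r, T &val, const std::true_type &) {
        r.readValue(val);
    }

    // Fallback: the dependent condition keeps the assert from firing
    // unless this overload is actually instantiated.
    template <typename Reader, typename T>
    void parse(Reader &, T &, const std::false_type &) {
        static_assert(sizeof(T) == 0, "Not a valid type to parse");
    }

    // Public entry point, re-dispatching on the trait as AvroParse.hh does.
    template <typename Reader, typename T>
    void parse(Reader &r, T &val) {
        parse(r, val, is_mini_serializable<T>());
    }

    int main() {
        MiniReader r;
        int x = 0;
        parse(r, x);             // dispatches to the true_type overload
        std::cout << x << "\n";  // prints 42
        // parse(r, 'c');        // would trip the static_assert
        return 0;
    }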
diff --git a/contrib/libs/apache/avro/api/AvroSerialize.hh b/contrib/libs/apache/avro/api/AvroSerialize.hh
index 9495940c9cb..365e8ff6485 100644
--- a/contrib/libs/apache/avro/api/AvroSerialize.hh
+++ b/contrib/libs/apache/avro/api/AvroSerialize.hh
@@ -1,66 +1,66 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_AvroSerialize_hh__
-#define avro_AvroSerialize_hh__
-
-#include "Config.hh"
-#include "AvroTraits.hh"
-
-/// \file
-///
-/// Standalone serialize functions for Avro types.
-
-namespace avro {
-
-/// The main serializer entry point function. Takes a serializer (either validating or
-/// plain) and the object that should be serialized.
-
-template <typename Writer, typename T>
-void serialize(Writer &s, const T& val)
-{
- serialize(s, val, is_serializable<T>());
-}
-
-/// The type trait must be is_serializable; otherwise this overload forces the compiler to complain.
-
-template <typename Writer, typename T>
-void serialize(Writer &s, const T& val, const std::false_type &)
-{
- static_assert(sizeof(T) == 0, "Not a valid type to serialize");
-}
-
-/// The remainder of the file includes default implementations for serializable types.
-
-// @{
-
-template <typename Writer, typename T>
-void serialize(Writer &s, T val, const std::true_type &) {
- s.writeValue(val);
-}
-
-template <typename Writer>
-void serialize(Writer &s, const std::vector<uint8_t> &val, const std::true_type &) {
- s.writeBytes(val.data(), val.size());
-}
-
-// @}
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_AvroSerialize_hh__
+#define avro_AvroSerialize_hh__
+
+#include "Config.hh"
+#include "AvroTraits.hh"
+
+/// \file
+///
+/// Standalone serialize functions for Avro types.
+
+namespace avro {
+
+/// The main serializer entry point function. Takes a serializer (either validating or
+/// plain) and the object that should be serialized.
+
+template <typename Writer, typename T>
+void serialize(Writer &s, const T& val)
+{
+ serialize(s, val, is_serializable<T>());
+}
+
+/// The type trait must be is_serializable; otherwise this overload forces the compiler to complain.
+
+template <typename Writer, typename T>
+void serialize(Writer &s, const T& val, const std::false_type &)
+{
+ static_assert(sizeof(T) == 0, "Not a valid type to serialize");
+}
+
+/// The remainder of the file includes default implementations for serializable types.
+
+// @{
+
+template <typename Writer, typename T>
+void serialize(Writer &s, T val, const std::true_type &) {
+ s.writeValue(val);
+}
+
+template <typename Writer>
+void serialize(Writer &s, const std::vector<uint8_t> &val, const std::true_type &) {
+ s.writeBytes(val.data(), val.size());
+}
+
+// @}
+
+} // namespace avro
+
+#endif
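One detail shared by AvroSerialize.hh and AvroParse.hh deserves a note: the fallback overloads assert on sizeof(T) == 0 rather than on false. A non-dependent static_assert(false, ...) would be ill-formed as soon as the template is parsed, even if the overload is never selected (C++23 later relaxed this for uninstantiated templates, but the dependent form is portable). A short sketch of the idiom, with a hypothetical reject() helper:

    template <typename T>
    void reject() {
        // static_assert(false, "bad type");      // fires even if never called (pre-C++23)
        static_assert(sizeof(T) == 0, "bad type"); // fires only when reject<T>() is instantiated
    }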
diff --git a/contrib/libs/apache/avro/api/AvroTraits.hh b/contrib/libs/apache/avro/api/AvroTraits.hh
index 91e2130c76f..d4d76efb2ec 100644
--- a/contrib/libs/apache/avro/api/AvroTraits.hh
+++ b/contrib/libs/apache/avro/api/AvroTraits.hh
@@ -1,116 +1,116 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_AvroTraits_hh__
-#define avro_AvroTraits_hh__
-
-#include "Config.hh"
-#include "Types.hh"
-#include <stdint.h>
-#include <type_traits>
-
-/** @file
- *
- * This header contains type traits and similar utilities used by the library.
- */
-namespace avro {
-
-/**
- * Define an is_serializable trait for types we can serialize natively.
- * New types will need to define the trait as well.
- */
-template <typename T>
-struct is_serializable : public std::false_type{};
-
-template <typename T>
-struct is_promotable : public std::false_type{};
-
-template <typename T>
-struct type_to_avro {
- static const Type type = AVRO_NUM_TYPES;
-};
-
-/**
- * Check if a \p T is a complete type i.e. it is defined as opposed to just
- * declared.
- *
- * is_defined<T>::value will be true or false depending on whether T is a
- * complete type or not respectively.
- */
-template <class T>
-struct is_defined {
-
- typedef char yes[1];
-
- typedef char no[2];
-
- template <class U> static yes& test(char(*)[sizeof(U)]) { throw 0; };
-
- template <class U> static no& test(...) { throw 0; };
-
- static const bool value = sizeof(test<T>(0)) == sizeof(yes);
-};
-
-/**
- * Similar to is_defined, but used to check if T is not defined.
- *
- * is_not_defined<T>::value will be true or false depending on whether T is an
- * incomplete type or not respectively.
- */
-template <class T>
-struct is_not_defined {
-
- typedef char yes[1];
-
- typedef char no[2];
-
- template <class U> static yes& test(char(*)[sizeof(U)]) { throw 0; };
-
- template <class U> static no& test(...) { throw 0; };
-
- static const bool value = sizeof(test<T>(0)) == sizeof(no);
-};
-
-#define DEFINE_PRIMITIVE(CTYPE, AVROTYPE) \
-template <> \
-struct is_serializable<CTYPE> : public std::true_type{}; \
-\
-template <> \
-struct type_to_avro<CTYPE> { \
- static const Type type = AVROTYPE; \
-};
-
-#define DEFINE_PROMOTABLE_PRIMITIVE(CTYPE, AVROTYPE) \
-template <> \
-struct is_promotable<CTYPE> : public std::true_type{}; \
-\
-DEFINE_PRIMITIVE(CTYPE, AVROTYPE)
-
-DEFINE_PROMOTABLE_PRIMITIVE(int32_t, AVRO_INT)
-DEFINE_PROMOTABLE_PRIMITIVE(int64_t, AVRO_LONG)
-DEFINE_PROMOTABLE_PRIMITIVE(float, AVRO_FLOAT)
-DEFINE_PRIMITIVE(double, AVRO_DOUBLE)
-DEFINE_PRIMITIVE(bool, AVRO_BOOL)
-DEFINE_PRIMITIVE(Null, AVRO_NULL)
-DEFINE_PRIMITIVE(std::string, AVRO_STRING)
-DEFINE_PRIMITIVE(std::vector<uint8_t>, AVRO_BYTES)
-
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_AvroTraits_hh__
+#define avro_AvroTraits_hh__
+
+#include "Config.hh"
+#include "Types.hh"
+#include <stdint.h>
+#include <type_traits>
+
+/** @file
+ *
+ * This header contains type traits and similar utilities used by the library.
+ */
+namespace avro {
+
+/**
+ * Define an is_serializable trait for types we can serialize natively.
+ * New types will need to define the trait as well.
+ */
+template <typename T>
+struct is_serializable : public std::false_type{};
+
+template <typename T>
+struct is_promotable : public std::false_type{};
+
+template <typename T>
+struct type_to_avro {
+ static const Type type = AVRO_NUM_TYPES;
+};
+
+/**
+ * Check if \p T is a complete type, i.e. it is defined as opposed to just
+ * declared.
+ *
+ * is_defined<T>::value will be true or false depending on whether T is a
+ * complete type or not respectively.
+ */
+template <class T>
+struct is_defined {
+
+ typedef char yes[1];
+
+ typedef char no[2];
+
+ template <class U> static yes& test(char(*)[sizeof(U)]) { throw 0; };
+
+ template <class U> static no& test(...) { throw 0; };
+
+ static const bool value = sizeof(test<T>(0)) == sizeof(yes);
+};
+
+/**
+ * Similar to is_defined, but used to check if T is not defined.
+ *
+ * is_not_defined<T>::value will be true or false depending on whether T is an
+ * incomplete type or not respectively.
+ */
+template <class T>
+struct is_not_defined {
+
+ typedef char yes[1];
+
+ typedef char no[2];
+
+ template <class U> static yes& test(char(*)[sizeof(U)]) { throw 0; };
+
+ template <class U> static no& test(...) { throw 0; };
+
+ static const bool value = sizeof(test<T>(0)) == sizeof(no);
+};
+
+#define DEFINE_PRIMITIVE(CTYPE, AVROTYPE) \
+template <> \
+struct is_serializable<CTYPE> : public std::true_type{}; \
+\
+template <> \
+struct type_to_avro<CTYPE> { \
+ static const Type type = AVROTYPE; \
+};
+
+#define DEFINE_PROMOTABLE_PRIMITIVE(CTYPE, AVROTYPE) \
+template <> \
+struct is_promotable<CTYPE> : public std::true_type{}; \
+\
+DEFINE_PRIMITIVE(CTYPE, AVROTYPE)
+
+DEFINE_PROMOTABLE_PRIMITIVE(int32_t, AVRO_INT)
+DEFINE_PROMOTABLE_PRIMITIVE(int64_t, AVRO_LONG)
+DEFINE_PROMOTABLE_PRIMITIVE(float, AVRO_FLOAT)
+DEFINE_PRIMITIVE(double, AVRO_DOUBLE)
+DEFINE_PRIMITIVE(bool, AVRO_BOOL)
+DEFINE_PRIMITIVE(Null, AVRO_NULL)
+DEFINE_PRIMITIVE(std::string, AVRO_STRING)
+DEFINE_PRIMITIVE(std::vector<uint8_t>, AVRO_BYTES)
+
+
+} // namespace avro
+
+#endif
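Since the traits above are pure compile-time machinery, their behavior is easiest to see in static_asserts. A self-contained check under the mappings registered above; Declared and Defined are throwaway illustration types:

    #include <cstdint>
    #include <string>
    #include "AvroTraits.hh"

    struct Declared;      // declared but never defined
    struct Defined {};    // complete type

    // Mappings registered via DEFINE_PRIMITIVE / DEFINE_PROMOTABLE_PRIMITIVE:
    static_assert(avro::is_serializable<int64_t>::value, "int64_t maps to AVRO_LONG");
    static_assert(avro::is_promotable<int32_t>::value, "int32_t can be promoted");
    static_assert(!avro::is_promotable<double>::value, "double is serializable, not promotable");
    static_assert(avro::type_to_avro<std::string>::type == avro::AVRO_STRING, "string mapping");
    static_assert(!avro::is_serializable<void*>::value, "unregistered types default to false");

    // Completeness detection via the sizeof-based SFINAE trick:
    static_assert(avro::is_defined<Defined>::value, "complete type detected");
    static_assert(avro::is_not_defined<Declared>::value, "incomplete type detected");

    int main() { return 0; }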
diff --git a/contrib/libs/apache/avro/api/Compiler.hh b/contrib/libs/apache/avro/api/Compiler.hh
index 892d60580d0..3df875cd164 100644
--- a/contrib/libs/apache/avro/api/Compiler.hh
+++ b/contrib/libs/apache/avro/api/Compiler.hh
@@ -1,63 +1,63 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Compiler_hh__
-#define avro_Compiler_hh__
-
-#include "Config.hh"
-#include <stdint.h>
-#include <istream>
-
-namespace avro {
-
-class AVRO_DECL InputStream;
-
-/// This class is used to implement an avro spec parser using a flex/bison
-/// compiler. In order for the lexer to be reentrant, this class provides a
-/// lexer object for each parse. The bison parser also uses this class to
-/// build up an avro parse tree as the avro spec is parsed.
-
-class AVRO_DECL ValidSchema;
-
-/// Given a stream comtaining a JSON schema, compiles the schema to a
-/// ValidSchema object. Throws if the schema cannot be compiled to a valid
-/// schema
-
-AVRO_DECL void compileJsonSchema(std::istream &is, ValidSchema &schema);
-
-/// Non-throwing version of compileJsonSchema.
-///
-/// \return True if no error, false if error (with the error string set)
-///
-
-AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema,
- std::string &error);
-
-AVRO_DECL ValidSchema compileJsonSchemaFromStream(InputStream& is);
-
-AVRO_DECL ValidSchema compileJsonSchemaFromMemory(const uint8_t* input, size_t len);
-
-AVRO_DECL ValidSchema compileJsonSchemaFromString(const char* input);
-
-AVRO_DECL ValidSchema compileJsonSchemaFromString(const std::string& input);
-
-AVRO_DECL ValidSchema compileJsonSchemaFromFile(const char* filename);
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Compiler_hh__
+#define avro_Compiler_hh__
+
+#include "Config.hh"
+#include <stdint.h>
+#include <istream>
+
+namespace avro {
+
+class AVRO_DECL InputStream;
+
+/// This class is used to implement an avro spec parser using a flex/bison
+/// compiler. In order for the lexer to be reentrant, this class provides a
+/// lexer object for each parse. The bison parser also uses this class to
+/// build up an avro parse tree as the avro spec is parsed.
+
+class AVRO_DECL ValidSchema;
+
+/// Given a stream containing a JSON schema, compiles the schema to a
+/// ValidSchema object. Throws if the schema cannot be compiled to a valid
+/// schema.
+
+AVRO_DECL void compileJsonSchema(std::istream &is, ValidSchema &schema);
+
+/// Non-throwing version of compileJsonSchema.
+///
+/// \return True if no error, false if error (with the error string set)
+///
+
+AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema,
+ std::string &error);
+
+AVRO_DECL ValidSchema compileJsonSchemaFromStream(InputStream& is);
+
+AVRO_DECL ValidSchema compileJsonSchemaFromMemory(const uint8_t* input, size_t len);
+
+AVRO_DECL ValidSchema compileJsonSchemaFromString(const char* input);
+
+AVRO_DECL ValidSchema compileJsonSchemaFromString(const std::string& input);
+
+AVRO_DECL ValidSchema compileJsonSchemaFromFile(const char* filename);
+
+} // namespace avro
+
+#endif
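The throwing and non-throwing forms pair up as in the sketch below; the record schema text is illustrative:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include "Compiler.hh"
    #include "ValidSchema.hh"

    int main() {
        // Throwing form: convenient when a malformed schema is a programming error.
        avro::ValidSchema point = avro::compileJsonSchemaFromString(
            R"({"type":"record","name":"Point","fields":[)"
            R"({"name":"x","type":"double"},{"name":"y","type":"double"}]})");

        // Non-throwing form: reports the parser error through the out-parameter.
        std::istringstream in("not a schema");
        avro::ValidSchema bad;
        std::string error;
        if (!avro::compileJsonSchema(in, bad, error)) {
            std::cerr << "schema rejected: " << error << '\n';
        }
        return 0;
    }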
diff --git a/contrib/libs/apache/avro/api/Config.hh b/contrib/libs/apache/avro/api/Config.hh
index 69d36f2abe1..6daa843e9c9 100644
--- a/contrib/libs/apache/avro/api/Config.hh
+++ b/contrib/libs/apache/avro/api/Config.hh
@@ -1,45 +1,45 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Config_hh
-#define avro_Config_hh
-
-// Windows DLL suport
-
-#ifdef _WIN32
-#pragma warning (disable: 4275 4251)
-
-#if defined(AVRO_DYN_LINK)
-#ifdef AVRO_SOURCE
-# define AVRO_DECL __declspec(dllexport)
-#else
-# define AVRO_DECL __declspec(dllimport)
-#endif // AVRO_SOURCE
-#endif // AVRO_DYN_LINK
-
-#include <intsafe.h>
-typedef SSIZE_T ssize_t;
-
-#endif // _WIN32
-
-#ifndef AVRO_DECL
-#define AVRO_DECL
-#endif
-
-#endif
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Config_hh
+#define avro_Config_hh
+
+// Windows DLL support
+
+#ifdef _WIN32
+#pragma warning (disable: 4275 4251)
+
+#if defined(AVRO_DYN_LINK)
+#ifdef AVRO_SOURCE
+# define AVRO_DECL __declspec(dllexport)
+#else
+# define AVRO_DECL __declspec(dllimport)
+#endif // AVRO_SOURCE
+#endif // AVRO_DYN_LINK
+
+#include <intsafe.h>
+typedef SSIZE_T ssize_t;
+
+#endif // _WIN32
+
+#ifndef AVRO_DECL
+#define AVRO_DECL
+#endif
+
+#endif
+
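The net effect of the block above: with AVRO_DYN_LINK defined on Windows, AVRO_DECL expands to dllexport while building the library (AVRO_SOURCE set) and to dllimport while consuming it; everywhere else it expands to nothing. A declaration sketch (SchemaRegistry is a made-up class for illustration only):

    #include "Config.hh"

    // One header serves both sides of the DLL boundary:
    //   building avro:   AVRO_DYN_LINK + AVRO_SOURCE  -> __declspec(dllexport)
    //   consuming avro:  AVRO_DYN_LINK only           -> __declspec(dllimport)
    //   static or non-Windows builds: AVRO_DECL expands to nothing.
    class AVRO_DECL SchemaRegistry {
    public:
        void add(const char* name);
    };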
diff --git a/contrib/libs/apache/avro/api/DataFile.hh b/contrib/libs/apache/avro/api/DataFile.hh
index 50169106b19..cc333d70ba0 100644
--- a/contrib/libs/apache/avro/api/DataFile.hh
+++ b/contrib/libs/apache/avro/api/DataFile.hh
@@ -1,419 +1,419 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_DataFile_hh__
-#define avro_DataFile_hh__
-
-#include "Config.hh"
-#include "Encoder.hh"
-#include "buffer/Buffer.hh"
-#include "ValidSchema.hh"
-#include "Specific.hh"
-#include "Stream.hh"
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "array"
-#include "boost/utility.hpp"
-#include <boost/iostreams/filtering_stream.hpp>
-
-namespace avro {
-
-/** Specify type of compression to use when writing data files. */
-enum Codec {
- NULL_CODEC,
- DEFLATE_CODEC,
-
-#ifdef SNAPPY_CODEC_AVAILABLE
- SNAPPY_CODEC
-#endif
-
-};
-
-const int SyncSize = 16;
-/**
- * The sync value.
- */
-typedef std::array<uint8_t, SyncSize> DataFileSync;
-
-/**
- * Type-independent portion of DataFileWriter.
- * At any given point in time, at most one file can be written using
- * this object.
- */
-class AVRO_DECL DataFileWriterBase : boost::noncopyable {
- const std::string filename_;
- const ValidSchema schema_;
- const EncoderPtr encoderPtr_;
- const size_t syncInterval_;
- Codec codec_;
-
- std::unique_ptr<OutputStream> stream_;
- std::unique_ptr<OutputStream> buffer_;
- const DataFileSync sync_;
- int64_t objectCount_;
-
- typedef std::map<std::string, std::vector<uint8_t> > Metadata;
-
- Metadata metadata_;
- int64_t lastSync_;
-
- static std::unique_ptr<OutputStream> makeStream(const char* filename);
- static DataFileSync makeSync();
-
- void writeHeader();
- void setMetadata(const std::string& key, const std::string& value);
-
- /**
- * Generates a sync marker in the file.
- */
- void sync();
-
- /**
- * Shared constructor portion since we aren't using C++11
- */
- void init(const ValidSchema &schema, size_t syncInterval, const Codec &codec);
-
-public:
- /**
- * Returns the current encoder for this writer.
- */
- Encoder& encoder() const { return *encoderPtr_; }
-
- /**
- * Returns true if the buffer has sufficient data for a sync to be
- * inserted.
- */
- void syncIfNeeded();
-
- /**
- * Returns the byte offset (within the current file) of the start of the current block being written.
- */
- uint64_t getCurrentBlockStart();
-
- /**
- * Increments the object count.
- */
- void incr() {
- ++objectCount_;
- }
- /**
- * Constructs a data file writer with the given sync interval and name.
- */
- DataFileWriterBase(const char* filename, const ValidSchema& schema,
- size_t syncInterval, Codec codec = NULL_CODEC);
- DataFileWriterBase(std::unique_ptr<OutputStream> outputStream,
- const ValidSchema& schema, size_t syncInterval, Codec codec);
-
- ~DataFileWriterBase();
- /**
- * Closes the current file. Once closed this datafile object cannot be
- * used for writing any more.
- */
- void close();
-
- /**
- * Returns the schema for this data file.
- */
- const ValidSchema& schema() const { return schema_; }
-
- /**
- * Flushes any unwritten data into the file.
- */
- void flush();
-};
-
-/**
- * An Avro datafile that can store objects of type T.
- */
-template <typename T>
-class DataFileWriter : boost::noncopyable {
- std::unique_ptr<DataFileWriterBase> base_;
-public:
- /**
- * Constructs a new data file.
- */
- DataFileWriter(const char* filename, const ValidSchema& schema,
- size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC) :
- base_(new DataFileWriterBase(filename, schema, syncInterval, codec)) { }
-
- DataFileWriter(std::unique_ptr<OutputStream> outputStream, const ValidSchema& schema,
- size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC) :
- base_(new DataFileWriterBase(std::move(outputStream), schema, syncInterval, codec)) { }
-
- /**
- * Writes the given piece of data into the file.
- */
- void write(const T& datum) {
- base_->syncIfNeeded();
- avro::encode(base_->encoder(), datum);
- base_->incr();
- }
-
- /**
- * Returns the byte offset (within the current file) of the start of the current block being written.
- */
- uint64_t getCurrentBlockStart() { return base_->getCurrentBlockStart(); }
-
-
- /**
- * Closes the current file. Once closed this datafile object cannot be
- * used for writing any more.
- */
- void close() { base_->close(); }
-
- /**
- * Returns the schema for this data file.
- */
- const ValidSchema& schema() const { return base_->schema(); }
-
- /**
- * Flushes any unwritten data into the file.
- */
- void flush() { base_->flush(); }
-};
-
-/**
- * The type independent portion of reader.
- */
-class AVRO_DECL DataFileReaderBase : boost::noncopyable {
- const std::string filename_;
- const std::unique_ptr<InputStream> stream_;
- const DecoderPtr decoder_;
- int64_t objectCount_;
- bool eof_;
- Codec codec_;
- int64_t blockStart_;
- int64_t blockEnd_;
-
- ValidSchema readerSchema_;
- ValidSchema dataSchema_;
- DecoderPtr dataDecoder_;
- std::unique_ptr<InputStream> dataStream_;
- typedef std::map<std::string, std::vector<uint8_t> > Metadata;
-
- Metadata metadata_;
- DataFileSync sync_;
-
- // for compressed buffer
- std::unique_ptr<boost::iostreams::filtering_istream> os_;
- std::vector<char> compressed_;
- std::string uncompressed;
- void readHeader();
-
- void readDataBlock();
- void doSeek(int64_t position);
-public:
- /**
- * Returns the current decoder for this reader.
- */
- Decoder& decoder() { return *dataDecoder_; }
-
- /**
- * Returns true if and only if there is more to read.
- */
- bool hasMore();
-
- /**
- * Decrements the number of objects yet to read.
- */
- void decr() { --objectCount_; }
-
- /**
- * Constructs the reader for the given file and the reader is
- * expected to use the schema that is used with data.
- * This function should be called exactly once after constructing
- * the DataFileReaderBase object.
- */
- DataFileReaderBase(const char* filename);
-
- DataFileReaderBase(std::unique_ptr<InputStream> inputStream);
-
- /**
- * Initializes the reader so that the reader and writer schemas
- * are the same.
- */
- void init();
-
- /**
- * Initializes the reader to read objects according to the given
- * schema. This gives an opportunity for the reader to see the schema
- * in the data file before deciding the right schema to use for reading.
- * This must be called exactly once after constructing the
- * DataFileReaderBase object.
- */
- void init(const ValidSchema& readerSchema);
-
- /**
- * Returns the schema for this object.
- */
- const ValidSchema& readerSchema() { return readerSchema_; }
-
- /**
- * Returns the schema stored with the data file.
- */
- const ValidSchema& dataSchema() { return dataSchema_; }
-
- /**
- * Closes the reader. No further operation is possible on this reader.
- */
- void close();
-
- /**
- * Move to a specific, known synchronization point, for example one returned
- * from tell() after sync().
- */
- void seek(int64_t position);
-
- /**
- * Move to the next synchronization point after a position. To process a
- * range of file entries, call this with the starting position, then check
- * pastSync() with the end point before each use of decoder().
- */
- void sync(int64_t position);
-
- /**
- * Return true if past the next synchronization point after a position.
- */
- bool pastSync(int64_t position);
-
- /**
- * Return the last synchronization point before our current position.
- */
- int64_t previousSync();
-};
-
-/**
- * Reads the contents of data file one after another.
- */
-template <typename T>
-class DataFileReader : boost::noncopyable {
- std::unique_ptr<DataFileReaderBase> base_;
-public:
- /**
- * Constructs the reader for the given file and the reader is
- * expected to use the given schema.
- */
- DataFileReader(const char* filename, const ValidSchema& readerSchema) :
- base_(new DataFileReaderBase(filename)) {
- base_->init(readerSchema);
- }
-
- DataFileReader(std::unique_ptr<InputStream> inputStream, const ValidSchema& readerSchema) :
- base_(new DataFileReaderBase(std::move(inputStream))) {
- base_->init(readerSchema);
- }
-
- /**
- * Constructs the reader for the given file and the reader is
- * expected to use the schema that is used with data.
- */
- DataFileReader(const char* filename) :
- base_(new DataFileReaderBase(filename)) {
- base_->init();
- }
-
- DataFileReader(std::unique_ptr<InputStream> inputStream) :
- base_(new DataFileReaderBase(std::move(inputStream))) {
- base_->init();
- }
-
- /**
- * Constructs a reader using the reader base. This form of constructor
- * allows the user to examine the schema of a given file and then
- * decide to use the right type of data to be deserialize. Without this
- * the user must know the type of data for the template _before_
- * he knows the schema within the file.
- * The schema present in the data file will be used for reading
- * from this reader.
- */
- DataFileReader(std::unique_ptr<DataFileReaderBase> base) : base_(std::move(base)) {
- base_->init();
- }
-
- /**
- * Constructs a reader using the reader base. This form of constructor
- * allows the user to examine the schema of a given file and then
- * decide to use the right type of data to be deserialize. Without this
- * the user must know the type of data for the template _before_
- * he knows the schema within the file.
- * The argument readerSchema will be used for reading
- * from this reader.
- */
- DataFileReader(std::unique_ptr<DataFileReaderBase> base,
- const ValidSchema& readerSchema) : base_(std::move(base)) {
- base_->init(readerSchema);
- }
-
- /**
- * Reads the next entry from the data file.
- * \return true if an object has been successfully read into \p datum and
- * false if there are no more entries in the file.
- */
- bool read(T& datum) {
- if (base_->hasMore()) {
- base_->decr();
- avro::decode(base_->decoder(), datum);
- return true;
- }
- return false;
- }
-
- /**
- * Returns the schema for this object.
- */
- const ValidSchema& readerSchema() { return base_->readerSchema(); }
-
- /**
- * Returns the schema stored with the data file.
- */
- const ValidSchema& dataSchema() { return base_->dataSchema(); }
-
- /**
- * Closes the reader. No further operation is possible on this reader.
- */
- void close() { return base_->close(); }
-
- /**
- * Move to a specific, known synchronization point, for example one returned
- * from previousSync().
- */
- void seek(int64_t position) { base_->seek(position); }
-
- /**
- * Move to the next synchronization point after a position. To process a
- * range of file entries, call this with the starting position, then check
- * pastSync() with the end point before each call to read().
- */
- void sync(int64_t position) { base_->sync(position); }
-
- /**
- * Return true if past the next synchronization point after a position.
- */
- bool pastSync(int64_t position) { return base_->pastSync(position); }
-
- /**
- * Return the last synchronization point before our current position.
- */
- int64_t previousSync() { return base_->previousSync(); }
-};
-
-} // namespace avro
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_DataFile_hh__
+#define avro_DataFile_hh__
+
+#include "Config.hh"
+#include "Encoder.hh"
+#include "buffer/Buffer.hh"
+#include "ValidSchema.hh"
+#include "Specific.hh"
+#include "Stream.hh"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <array>
+#include <boost/utility.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+
+namespace avro {
+
+/** Specify type of compression to use when writing data files. */
+enum Codec {
+ NULL_CODEC,
+ DEFLATE_CODEC,
+
+#ifdef SNAPPY_CODEC_AVAILABLE
+ SNAPPY_CODEC
+#endif
+
+};
+
+const int SyncSize = 16;
+/**
+ * The sync value.
+ */
+typedef std::array<uint8_t, SyncSize> DataFileSync;
+
+/**
+ * Type-independent portion of DataFileWriter.
+ * At any given point in time, at most one file can be written using
+ * this object.
+ */
+class AVRO_DECL DataFileWriterBase : boost::noncopyable {
+ const std::string filename_;
+ const ValidSchema schema_;
+ const EncoderPtr encoderPtr_;
+ const size_t syncInterval_;
+ Codec codec_;
+
+ std::unique_ptr<OutputStream> stream_;
+ std::unique_ptr<OutputStream> buffer_;
+ const DataFileSync sync_;
+ int64_t objectCount_;
+
+ typedef std::map<std::string, std::vector<uint8_t> > Metadata;
+
+ Metadata metadata_;
+ int64_t lastSync_;
+
+ static std::unique_ptr<OutputStream> makeStream(const char* filename);
+ static DataFileSync makeSync();
+
+ void writeHeader();
+ void setMetadata(const std::string& key, const std::string& value);
+
+ /**
+ * Generates a sync marker in the file.
+ */
+ void sync();
+
+ /**
+ * Shared constructor logic, factored out so both constructors can use it.
+ */
+ void init(const ValidSchema &schema, size_t syncInterval, const Codec &codec);
+
+public:
+ /**
+ * Returns the current encoder for this writer.
+ */
+ Encoder& encoder() const { return *encoderPtr_; }
+
+ /**
+ * Writes a sync marker if the buffer has accumulated enough data for a
+ * sync to be inserted.
+ */
+ void syncIfNeeded();
+
+ /**
+ * Returns the byte offset (within the current file) of the start of the current block being written.
+ */
+ uint64_t getCurrentBlockStart();
+
+ /**
+ * Increments the object count.
+ */
+ void incr() {
+ ++objectCount_;
+ }
+ /**
+ * Constructs a data file writer with the given sync interval and name.
+ */
+ DataFileWriterBase(const char* filename, const ValidSchema& schema,
+ size_t syncInterval, Codec codec = NULL_CODEC);
+ DataFileWriterBase(std::unique_ptr<OutputStream> outputStream,
+ const ValidSchema& schema, size_t syncInterval, Codec codec);
+
+ ~DataFileWriterBase();
+ /**
+ * Closes the current file. Once closed this datafile object cannot be
+ * used for writing any more.
+ */
+ void close();
+
+ /**
+ * Returns the schema for this data file.
+ */
+ const ValidSchema& schema() const { return schema_; }
+
+ /**
+ * Flushes any unwritten data into the file.
+ */
+ void flush();
+};
+
+/**
+ * An Avro datafile that can store objects of type T.
+ */
+template <typename T>
+class DataFileWriter : boost::noncopyable {
+ std::unique_ptr<DataFileWriterBase> base_;
+public:
+ /**
+ * Constructs a new data file.
+ */
+ DataFileWriter(const char* filename, const ValidSchema& schema,
+ size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC) :
+ base_(new DataFileWriterBase(filename, schema, syncInterval, codec)) { }
+
+ DataFileWriter(std::unique_ptr<OutputStream> outputStream, const ValidSchema& schema,
+ size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC) :
+ base_(new DataFileWriterBase(std::move(outputStream), schema, syncInterval, codec)) { }
+
+ /**
+ * Writes the given piece of data into the file.
+ */
+ void write(const T& datum) {
+ base_->syncIfNeeded();
+ avro::encode(base_->encoder(), datum);
+ base_->incr();
+ }
+
+ /**
+ * Returns the byte offset (within the current file) of the start of the current block being written.
+ */
+ uint64_t getCurrentBlockStart() { return base_->getCurrentBlockStart(); }
+
+
+ /**
+ * Closes the current file. Once closed this datafile object cannot be
+ * used for writing any more.
+ */
+ void close() { base_->close(); }
+
+ /**
+ * Returns the schema for this data file.
+ */
+ const ValidSchema& schema() const { return base_->schema(); }
+
+ /**
+ * Flushes any unwritten data into the file.
+ */
+ void flush() { base_->flush(); }
+};
+
+/**
+ * The type-independent portion of DataFileReader.
+ */
+class AVRO_DECL DataFileReaderBase : boost::noncopyable {
+ const std::string filename_;
+ const std::unique_ptr<InputStream> stream_;
+ const DecoderPtr decoder_;
+ int64_t objectCount_;
+ bool eof_;
+ Codec codec_;
+ int64_t blockStart_;
+ int64_t blockEnd_;
+
+ ValidSchema readerSchema_;
+ ValidSchema dataSchema_;
+ DecoderPtr dataDecoder_;
+ std::unique_ptr<InputStream> dataStream_;
+ typedef std::map<std::string, std::vector<uint8_t> > Metadata;
+
+ Metadata metadata_;
+ DataFileSync sync_;
+
+ // for compressed buffer
+ std::unique_ptr<boost::iostreams::filtering_istream> os_;
+ std::vector<char> compressed_;
+ std::string uncompressed;
+ void readHeader();
+
+ void readDataBlock();
+ void doSeek(int64_t position);
+public:
+ /**
+ * Returns the current decoder for this reader.
+ */
+ Decoder& decoder() { return *dataDecoder_; }
+
+ /**
+ * Returns true if and only if there is more to read.
+ */
+ bool hasMore();
+
+ /**
+ * Decrements the number of objects yet to read.
+ */
+ void decr() { --objectCount_; }
+
+ /**
+ * Constructs the reader for the given file; the schema stored in the
+ * data file will be used for reading.
+ * One of the init() functions must be called exactly once after
+ * constructing the DataFileReaderBase object.
+ */
+ DataFileReaderBase(const char* filename);
+
+ DataFileReaderBase(std::unique_ptr<InputStream> inputStream);
+
+ /**
+ * Initializes the reader so that the reader and writer schemas
+ * are the same.
+ */
+ void init();
+
+ /**
+ * Initializes the reader to read objects according to the given
+ * schema. This gives an opportunity for the reader to see the schema
+ * in the data file before deciding the right schema to use for reading.
+ * This must be called exactly once after constructing the
+ * DataFileReaderBase object.
+ */
+ void init(const ValidSchema& readerSchema);
+
+ /**
+ * Returns the schema for this object.
+ */
+ const ValidSchema& readerSchema() { return readerSchema_; }
+
+ /**
+ * Returns the schema stored with the data file.
+ */
+ const ValidSchema& dataSchema() { return dataSchema_; }
+
+ /**
+ * Closes the reader. No further operation is possible on this reader.
+ */
+ void close();
+
+ /**
+ * Move to a specific, known synchronization point, for example one returned
+ * from tell() after sync().
+ */
+ void seek(int64_t position);
+
+ /**
+ * Move to the next synchronization point after a position. To process a
+ * range of file entries, call this with the starting position, then check
+ * pastSync() with the end point before each use of decoder().
+ */
+ void sync(int64_t position);
+
+ /**
+ * Return true if past the next synchronization point after a position.
+ */
+ bool pastSync(int64_t position);
+
+ /**
+ * Return the last synchronization point before our current position.
+ */
+ int64_t previousSync();
+};
+
+/**
+ * Reads the contents of a data file one object after another.
+ */
+template <typename T>
+class DataFileReader : boost::noncopyable {
+ std::unique_ptr<DataFileReaderBase> base_;
+public:
+ /**
+ * Constructs the reader for the given file; the reader will use the
+ * given schema for reading.
+ */
+ DataFileReader(const char* filename, const ValidSchema& readerSchema) :
+ base_(new DataFileReaderBase(filename)) {
+ base_->init(readerSchema);
+ }
+
+ DataFileReader(std::unique_ptr<InputStream> inputStream, const ValidSchema& readerSchema) :
+ base_(new DataFileReaderBase(std::move(inputStream))) {
+ base_->init(readerSchema);
+ }
+
+ /**
+ * Constructs the reader for the given file; the schema stored in the
+ * data file will be used for reading.
+ */
+ DataFileReader(const char* filename) :
+ base_(new DataFileReaderBase(filename)) {
+ base_->init();
+ }
+
+ DataFileReader(std::unique_ptr<InputStream> inputStream) :
+ base_(new DataFileReaderBase(std::move(inputStream))) {
+ base_->init();
+ }
+
+ /**
+ * Constructs a reader using the reader base. This form of constructor
+ * allows the user to examine the schema of a given file and then
+ * choose the right type of data to deserialize. Without this,
+ * the user must know the type of data for the template _before_
+ * knowing the schema within the file.
+ * The schema present in the data file will be used for reading
+ * from this reader.
+ */
+ DataFileReader(std::unique_ptr<DataFileReaderBase> base) : base_(std::move(base)) {
+ base_->init();
+ }
+
+ /**
+ * Constructs a reader using the reader base. This form of constructor
+ * allows the user to examine the schema of a given file and then
+ * choose the right type of data to deserialize. Without this,
+ * the user must know the type of data for the template _before_
+ * knowing the schema within the file.
+ * The argument readerSchema will be used for reading
+ * from this reader.
+ */
+ DataFileReader(std::unique_ptr<DataFileReaderBase> base,
+ const ValidSchema& readerSchema) : base_(std::move(base)) {
+ base_->init(readerSchema);
+ }
+
+ /**
+ * Reads the next entry from the data file.
+ * \return true if an object has been successfully read into \p datum and
+ * false if there are no more entries in the file.
+ */
+ bool read(T& datum) {
+ if (base_->hasMore()) {
+ base_->decr();
+ avro::decode(base_->decoder(), datum);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Returns the schema for this object.
+ */
+ const ValidSchema& readerSchema() { return base_->readerSchema(); }
+
+ /**
+ * Returns the schema stored with the data file.
+ */
+ const ValidSchema& dataSchema() { return base_->dataSchema(); }
+
+ /**
+ * Closes the reader. No further operation is possible on this reader.
+ */
+ void close() { return base_->close(); }
+
+ /**
+ * Move to a specific, known synchronization point, for example one returned
+ * from previousSync().
+ */
+ void seek(int64_t position) { base_->seek(position); }
+
+ /**
+ * Move to the next synchronization point after a position. To process a
+ * range of file entries, call this with the starting position, then check
+ * pastSync() with the end point before each call to read().
+ */
+ void sync(int64_t position) { base_->sync(position); }
+
+ /**
+ * Return true if past the next synchronization point after a position.
+ */
+ bool pastSync(int64_t position) { return base_->pastSync(position); }
+
+ /**
+ * Return the last synchronization point before our current position.
+ */
+ int64_t previousSync() { return base_->previousSync(); }
+};
+
+} // namespace avro
+#endif
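A round-trip sketch for the two templates above, assuming an illustrative file name; int32_t is used because the library provides avro::encode/avro::decode for it out of the box (via Specific.hh):

    #include <iostream>
    #include "Compiler.hh"
    #include "DataFile.hh"

    int main() {
        avro::ValidSchema schema = avro::compileJsonSchemaFromString("\"int\"");

        {   // Write a run of ints; close() flushes the final block and sync marker.
            avro::DataFileWriter<int32_t> writer("ints.avro", schema);
            for (int32_t i = 0; i < 100; ++i) {
                writer.write(i);
            }
            writer.close();
        }

        // Read everything back; read() returns false once the file is exhausted.
        avro::DataFileReader<int32_t> reader("ints.avro");
        int32_t value;
        while (reader.read(value)) {
            std::cout << value << '\n';
        }
        reader.close();
        return 0;
    }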
diff --git a/contrib/libs/apache/avro/api/Decoder.hh b/contrib/libs/apache/avro/api/Decoder.hh
index 5356d79f32f..c57fc05505f 100644
--- a/contrib/libs/apache/avro/api/Decoder.hh
+++ b/contrib/libs/apache/avro/api/Decoder.hh
@@ -1,226 +1,226 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Decoder_hh__
-#define avro_Decoder_hh__
-
-#include "Config.hh"
-#include <stdint.h>
-#include <string>
-#include <vector>
-#include <memory>
-
-#include "ValidSchema.hh"
-#include "Stream.hh"
-
-/// \file
-///
-/// Low level support for decoding avro values.
-/// This class has two types of functions. One type of functions support
-/// decoding of leaf values (for example, decodeLong and
-/// decodeString). These functions have analogs in Encoder.
-///
-/// The other type of functions support decoding of maps and arrays.
-/// These functions are arrayStart, startItem, and arrayEnd
-/// (and similar functions for maps).
-
-namespace avro {
-
-/**
- * Decoder is an interface implemented by every decoder capable
- * of decoding Avro data.
- */
-class AVRO_DECL Decoder {
-public:
- virtual ~Decoder() { };
- /// All future decoding will come from is, which should be valid
- /// until replaced by another call to init() or this Decoder is
- /// destructed.
- virtual void init(InputStream& is) = 0;
-
- /// Decodes a null from the current stream.
- virtual void decodeNull() = 0;
-
- /// Decodes a bool from the current stream
- virtual bool decodeBool() = 0;
-
- /// Decodes a 32-bit int from the current stream.
- virtual int32_t decodeInt() = 0;
-
- /// Decodes a 64-bit signed int from the current stream.
- virtual int64_t decodeLong() = 0;
-
- /// Decodes a single-precision floating point number from current stream.
- virtual float decodeFloat() = 0;
-
- /// Decodes a double-precision floating point number from current stream.
- virtual double decodeDouble() = 0;
-
- /// Decodes a UTF-8 string from the current stream.
- std::string decodeString() {
- std::string result;
- decodeString(result);
- return result;
- }
-
- /**
- * Decodes a UTF-8 string from the stream and assigns it to value.
- */
- virtual void decodeString(std::string& value) = 0;
-
- /// Skips a string on the current stream.
- virtual void skipString() = 0;
-
- /// Decodes arbitray binary data from the current stream.
- std::vector<uint8_t> decodeBytes() {
- std::vector<uint8_t> result;
- decodeBytes(result);
- return result;
- }
-
- /// Decodes arbitrary binary data from the current stream and puts it
- /// in value.
- virtual void decodeBytes(std::vector<uint8_t>& value) = 0;
-
- /// Skips bytes on the current stream.
- virtual void skipBytes() = 0;
-
- /**
- * Decodes fixed length binary from the current stream.
- * \param[in] n The size (byte count) of the fixed being read.
- * \return The fixed data that has been read. The size of the returned
- * vector is guaranteed to be equal to \p n.
- */
- std::vector<uint8_t> decodeFixed(size_t n) {
- std::vector<uint8_t> result;
- decodeFixed(n, result);
- return result;
- }
-
- /**
- * Decodes a fixed from the current stream.
- * \param[in] n The size (byte count) of the fixed being read.
- * \param[out] value The value that receives the fixed. The vector will
- * be size-adjusted based on the fixed's size.
- */
- virtual void decodeFixed(size_t n, std::vector<uint8_t>& value) = 0;
-
- /// Skips fixed length binary on the current stream.
- virtual void skipFixed(size_t n) = 0;
-
- /// Decodes enum from the current stream.
- virtual size_t decodeEnum() = 0;
-
- /// Start decoding an array. Returns the number of entries in first chunk.
- virtual size_t arrayStart() = 0;
-
- /// Returns the number of entries in next chunk. 0 if last.
- virtual size_t arrayNext() = 0;
-
- /// Tries to skip an array. If it can, it returns 0. Otherwise
- /// it returns the number of elements to be skipped. The client
- /// should skip the individual items. In such cases, skipArray
- /// is identical to arrayStart.
- virtual size_t skipArray() = 0;
-
- /// Start decoding a map. Returns the number of entries in first chunk.
- virtual size_t mapStart() = 0;
-
- /// Returns the number of entries in next chunk. 0 if last.
- virtual size_t mapNext() = 0;
-
- /// Tries to skip a map. If it can, it returns 0. Otherwise
- /// it returns the number of elements to be skipped. The client
- /// should skip the individual items. In such cases, skipMap
- /// is identical to mapStart.
- virtual size_t skipMap() = 0;
-
- /// Decodes a branch of a union. The actual value is to follow.
- virtual size_t decodeUnionIndex() = 0;
-
- /// Drains any additional data at the end of the current entry in a stream.
- /// It also returns any unused bytes back to any underlying input stream.
- /// One situation this happens is when the reader's schema and
- /// the writer's schema are records but are different and the writer's
- /// record has more fields at the end of the record.
- /// Leaving such data unread is usually not a problem. If multiple
- /// records are stored consecutively in a stream (e.g. Avro data file)
- /// any attempt to read the next record will automatically skip
- /// those extra fields of the current record. It would still leave
- /// the extra fields at the end of the last record in the stream.
- /// This would mean that the stream is not in a good state. For example,
- /// if some non-avro information is stored at the end of the stream,
- /// the consumers of such data would see the bytes left behind
- /// by the avro decoder. Similar set of problems occur if the Decoder
- /// consumes more than what it should.
- virtual void drain() = 0;
-};
-
-/**
- * Shared pointer to Decoder.
- */
-typedef std::shared_ptr<Decoder> DecoderPtr;
-
-/**
- * ResolvingDecoder is derived from \ref Decoder, with an additional
- * function to obtain the field ordering of fields within a record.
- */
-class AVRO_DECL ResolvingDecoder : public Decoder {
-public:
- /// Returns the order of fields for records.
- /// The order of fields could be different from the order of their
- /// order in the schema because the writer's field order could
- /// be different. In order to avoid buffering and later use,
- /// we return the values in the writer's field order.
- virtual const std::vector<size_t>& fieldOrder() = 0;
-};
-
-/**
- * Shared pointer to ResolvingDecoder.
- */
-typedef std::shared_ptr<ResolvingDecoder> ResolvingDecoderPtr;
-/**
- * Returns an decoder that can decode binary Avro standard.
- */
-AVRO_DECL DecoderPtr binaryDecoder();
-
-/**
- * Returns an decoder that validates sequence of calls to an underlying
- * Decoder against the given schema.
- */
-AVRO_DECL DecoderPtr validatingDecoder(const ValidSchema& schema,
- const DecoderPtr& base);
-
-/**
- * Returns an decoder that can decode Avro standard for JSON.
- */
-AVRO_DECL DecoderPtr jsonDecoder(const ValidSchema& schema);
-
-/**
- * Returns a decoder that decodes avro data from base written according to
- * writerSchema and resolves against readerSchema.
- * The client uses the decoder as if the data were written using readerSchema.
- * // FIXME: Handle out of order fields.
- */
-AVRO_DECL ResolvingDecoderPtr resolvingDecoder(const ValidSchema& writer,
- const ValidSchema& reader, const DecoderPtr& base);
-
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Decoder_hh__
+#define avro_Decoder_hh__
+
+#include "Config.hh"
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include <memory>
+
+#include "ValidSchema.hh"
+#include "Stream.hh"
+
+/// \file
+///
+/// Low-level support for decoding Avro values.
+/// This file declares two kinds of functions. One kind supports
+/// decoding of leaf values (for example, decodeLong and
+/// decodeString). These functions have analogs in Encoder.
+///
+/// The other kind supports decoding of maps and arrays.
+/// These functions are arrayStart, arrayNext, and skipArray
+/// (and similar functions for maps).
+
+namespace avro {
+
+/**
+ * Decoder is an interface implemented by every decoder capable
+ * of decoding Avro data.
+ */
+class AVRO_DECL Decoder {
+public:
+ virtual ~Decoder() { };
+ /// All future decoding will come from is, which should be valid
+ /// until replaced by another call to init() or this Decoder is
+ /// destructed.
+ virtual void init(InputStream& is) = 0;
+
+ /// Decodes a null from the current stream.
+ virtual void decodeNull() = 0;
+
+ /// Decodes a bool from the current stream
+ virtual bool decodeBool() = 0;
+
+ /// Decodes a 32-bit int from the current stream.
+ virtual int32_t decodeInt() = 0;
+
+ /// Decodes a 64-bit signed int from the current stream.
+ virtual int64_t decodeLong() = 0;
+
+ /// Decodes a single-precision floating point number from current stream.
+ virtual float decodeFloat() = 0;
+
+ /// Decodes a double-precision floating point number from current stream.
+ virtual double decodeDouble() = 0;
+
+ /// Decodes a UTF-8 string from the current stream.
+ std::string decodeString() {
+ std::string result;
+ decodeString(result);
+ return result;
+ }
+
+ /**
+ * Decodes a UTF-8 string from the stream and assigns it to value.
+ */
+ virtual void decodeString(std::string& value) = 0;
+
+ /// Skips a string on the current stream.
+ virtual void skipString() = 0;
+
+ /// Decodes arbitrary binary data from the current stream.
+ std::vector<uint8_t> decodeBytes() {
+ std::vector<uint8_t> result;
+ decodeBytes(result);
+ return result;
+ }
+
+ /// Decodes arbitrary binary data from the current stream and puts it
+ /// in value.
+ virtual void decodeBytes(std::vector<uint8_t>& value) = 0;
+
+ /// Skips bytes on the current stream.
+ virtual void skipBytes() = 0;
+
+ /**
+ * Decodes fixed length binary from the current stream.
+ * \param[in] n The size (byte count) of the fixed being read.
+ * \return The fixed data that has been read. The size of the returned
+ * vector is guaranteed to be equal to \p n.
+ */
+ std::vector<uint8_t> decodeFixed(size_t n) {
+ std::vector<uint8_t> result;
+ decodeFixed(n, result);
+ return result;
+ }
+
+ /**
+ * Decodes a fixed from the current stream.
+ * \param[in] n The size (byte count) of the fixed being read.
+ * \param[out] value The value that receives the fixed. The vector will
+ * be size-adjusted based on the fixed's size.
+ */
+ virtual void decodeFixed(size_t n, std::vector<uint8_t>& value) = 0;
+
+ /// Skips fixed length binary on the current stream.
+ virtual void skipFixed(size_t n) = 0;
+
+ /// Decodes enum from the current stream.
+ virtual size_t decodeEnum() = 0;
+
+ /// Start decoding an array. Returns the number of entries in first chunk.
+ virtual size_t arrayStart() = 0;
+
+ /// Returns the number of entries in the next chunk, or 0 if this was the last chunk.
+ virtual size_t arrayNext() = 0;
+
+ /// Tries to skip an array. If it can, it returns 0. Otherwise
+ /// it returns the number of elements to be skipped. The client
+ /// should skip the individual items. In such cases, skipArray
+ /// is identical to arrayStart.
+ virtual size_t skipArray() = 0;
+
+ /// Start decoding a map. Returns the number of entries in first chunk.
+ virtual size_t mapStart() = 0;
+
+ /// Returns the number of entries in the next chunk, or 0 if this was the last chunk.
+ virtual size_t mapNext() = 0;
+
+ /// Tries to skip a map. If it can, it returns 0. Otherwise
+ /// it returns the number of elements to be skipped. The client
+ /// should skip the individual items. In such cases, skipMap
+ /// is identical to mapStart.
+ virtual size_t skipMap() = 0;
+
+ /// Decodes a branch of a union. The actual value is to follow.
+ virtual size_t decodeUnionIndex() = 0;
+
+ /// Drains any additional data at the end of the current entry in a stream.
+ /// It also returns any unused bytes back to any underlying input stream.
+ /// One situation in which this happens is when the reader's schema and
+ /// the writer's schema are records but are different and the writer's
+ /// record has more fields at the end of the record.
+ /// Leaving such data unread is usually not a problem. If multiple
+ /// records are stored consecutively in a stream (e.g. Avro data file)
+ /// any attempt to read the next record will automatically skip
+ /// those extra fields of the current record. It would still leave
+ /// the extra fields at the end of the last record in the stream.
+ /// This would mean that the stream is not in a good state. For example,
+ /// if some non-avro information is stored at the end of the stream,
+ /// the consumers of such data would see the bytes left behind
+ /// by the avro decoder. A similar set of problems occurs if the Decoder
+ /// consumes more than it should.
+ virtual void drain() = 0;
+};
+
+/**
+ * Shared pointer to Decoder.
+ */
+typedef std::shared_ptr<Decoder> DecoderPtr;
+
+/**
+ * ResolvingDecoder is derived from \ref Decoder, with an additional
+ * function to obtain the field ordering of fields within a record.
+ */
+class AVRO_DECL ResolvingDecoder : public Decoder {
+public:
+ /// Returns the order of fields for records.
+ /// The order of fields could be different from their order in the
+ /// schema because the writer's field order could be different.
+ /// To avoid buffering values for later use, they are returned in
+ /// the writer's field order.
+ virtual const std::vector<size_t>& fieldOrder() = 0;
+};
+
+/**
+ * Shared pointer to ResolvingDecoder.
+ */
+typedef std::shared_ptr<ResolvingDecoder> ResolvingDecoderPtr;
+/**
+ * Returns a decoder that can decode the Avro binary encoding.
+ */
+AVRO_DECL DecoderPtr binaryDecoder();
+
+/**
+ * Returns a decoder that validates the sequence of calls to an underlying
+ * Decoder against the given schema.
+ */
+AVRO_DECL DecoderPtr validatingDecoder(const ValidSchema& schema,
+ const DecoderPtr& base);
+
+/**
+ * Returns a decoder that can decode the Avro JSON encoding.
+ */
+AVRO_DECL DecoderPtr jsonDecoder(const ValidSchema& schema);
+
+/**
+ * Returns a decoder that decodes Avro data from \p base written according to
+ * the writer's schema and resolves it against the reader's schema.
+ * The client uses the decoder as if the data were written using the reader's schema.
+ * // FIXME: Handle out of order fields.
+ */
+AVRO_DECL ResolvingDecoderPtr resolvingDecoder(const ValidSchema& writer,
+ const ValidSchema& reader, const DecoderPtr& base);
+
+
+} // namespace avro
+
+#endif
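To make the leaf/composite split concrete, a sketch that encodes and then decodes an array of longs in memory. It assumes the memoryOutputStream/memoryInputStream helpers from Stream.hh; the encoding half uses the Encoder interface restored below.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include "Decoder.hh"
    #include "Encoder.hh"
    #include "Stream.hh"

    int main() {
        // Encode one array block of three longs into a memory stream.
        std::unique_ptr<avro::OutputStream> out = avro::memoryOutputStream();
        avro::EncoderPtr e = avro::binaryEncoder();
        e->init(*out);
        e->arrayStart();
        e->setItemCount(3);
        for (int64_t v : {int64_t(10), int64_t(20), int64_t(30)}) {
            e->startItem();
            e->encodeLong(v);
        }
        e->arrayEnd();
        e->flush();

        // Decode it back chunk by chunk, mirroring arrayStart/arrayNext.
        std::unique_ptr<avro::InputStream> in = avro::memoryInputStream(*out);
        avro::DecoderPtr d = avro::binaryDecoder();
        d->init(*in);
        for (size_t n = d->arrayStart(); n != 0; n = d->arrayNext()) {
            for (size_t i = 0; i < n; ++i) {
                std::cout << d->decodeLong() << '\n';
            }
        }
        return 0;
    }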
diff --git a/contrib/libs/apache/avro/api/Encoder.hh b/contrib/libs/apache/avro/api/Encoder.hh
index 7849e934586..0d1f198e76c 100644
--- a/contrib/libs/apache/avro/api/Encoder.hh
+++ b/contrib/libs/apache/avro/api/Encoder.hh
@@ -1,173 +1,173 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Encoder_hh__
-#define avro_Encoder_hh__
-
-#include "Config.hh"
-#include <stdint.h>
-#include <string>
-#include <vector>
-#include <memory>
-
-#include "ValidSchema.hh"
-#include "Stream.hh"
-
-/// \file
-///
-/// Low level support for encoding avro values.
-/// This class has two types of funtions. One type of functions support
-/// the writing of leaf values (for example, encodeLong and
-/// encodeString). These functions have analogs in Decoder.
-///
-/// The other type of functions support the writing of maps and arrays.
-/// These functions are arrayStart, startItem, and arrayEnd
-/// (and similar functions for maps).
-/// Some implementations of Encoder handle the
-/// buffering required to break large maps and arrays into blocks,
-/// which is necessary for applications that want to do streaming.
-
-namespace avro {
-
-/**
- * The abstract base class for all Avro encoders. The implementations
- * differ in the method of encoding (binary vresus JSON) or in capabilities
- * such as ability to verify the order of invocation of different functions.
- */
-class AVRO_DECL Encoder {
-public:
- virtual ~Encoder() { };
- /// All future encodings will go to os, which should be valid until
- /// it is reset with another call to init() or the encoder is
- /// destructed.
- virtual void init(OutputStream& os) = 0;
-
- /// Flushes any data in internal buffers.
- virtual void flush() = 0;
-
- /// Returns the number of bytes produced so far.
- /// For a meaningful value, do a flush() before invoking this function.
- virtual int64_t byteCount() const = 0;
-
- /// Encodes a null to the current stream.
- virtual void encodeNull() = 0;
-
- /// Encodes a bool to the current stream
- virtual void encodeBool(bool b) = 0;
-
- /// Encodes a 32-bit int to the current stream.
- virtual void encodeInt(int32_t i) = 0;
-
- /// Encodes a 64-bit signed int to the current stream.
- virtual void encodeLong(int64_t l) = 0;
-
- /// Encodes a single-precision floating point number to the current stream.
- virtual void encodeFloat(float f) = 0;
-
- /// Encodes a double-precision floating point number to the current stream.
- virtual void encodeDouble(double d) = 0;
-
- /// Encodes a UTF-8 string to the current stream.
- virtual void encodeString(const std::string& s) = 0;
-
- /**
- * Encodes aribtray binary data into tthe current stream as Avro "bytes"
- * data type.
- * \param bytes Where the data is
- * \param len Number of bytes at \p bytes.
- */
- virtual void encodeBytes(const uint8_t *bytes, size_t len) = 0;
-
- /**
- * Encodes aribtray binary data into tthe current stream as Avro "bytes"
- * data type.
- * \param bytes The data.
- */
- void encodeBytes(const std::vector<uint8_t>& bytes) {
- uint8_t b = 0;
- encodeBytes(bytes.empty() ? &b : bytes.data(), bytes.size());
- }
-
- /// Encodes fixed length binary to the current stream.
- virtual void encodeFixed(const uint8_t *bytes, size_t len) = 0;
-
- /**
-     * Encodes an Avro "fixed" value.
-     * \param bytes The fixed data; its length is taken as the size
-     * of the fixed.
- */
- void encodeFixed(const std::vector<uint8_t>& bytes) {
- encodeFixed(bytes.data(), bytes.size());
- }
-
-    /// Encodes an enum to the current stream.
- virtual void encodeEnum(size_t e) = 0;
-
- /// Indicates that an array of items is being encoded.
- virtual void arrayStart() = 0;
-
-    /// Indicates that the current array of items has ended.
- virtual void arrayEnd() = 0;
-
- /// Indicates that a map of items is being encoded.
- virtual void mapStart() = 0;
-
-    /// Indicates that the current map of items has ended.
- virtual void mapEnd() = 0;
-
-    /// Indicates that \p count items are to follow in the current array
-    /// or map.
- virtual void setItemCount(size_t count) = 0;
-
-    /// Marks the beginning of an item in the current array or map.
- virtual void startItem() = 0;
-
- /// Encodes a branch of a union. The actual value is to follow.
- virtual void encodeUnionIndex(size_t e) = 0;
-};
-
-/**
- * Shared pointer to Encoder.
- */
-typedef std::shared_ptr<Encoder> EncoderPtr;
-
-/**
- * Returns an encoder that encodes to the Avro binary format.
- */
-AVRO_DECL EncoderPtr binaryEncoder();
-
-/**
- * Returns an encoder that validates the sequence of calls to an underlying
- * Encoder against the given schema.
- */
-AVRO_DECL EncoderPtr validatingEncoder(const ValidSchema& schema,
- const EncoderPtr& base);
-
-/**
- * Returns an encoder that encodes to the Avro JSON format.
- */
-AVRO_DECL EncoderPtr jsonEncoder(const ValidSchema& schema);
-
-/**
- * Returns an encoder that encodes to pretty-printed Avro JSON.
- */
-AVRO_DECL EncoderPtr jsonPrettyEncoder(const ValidSchema& schema);
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Encoder_hh__
+#define avro_Encoder_hh__
+
+#include "Config.hh"
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include <memory>
+
+#include "ValidSchema.hh"
+#include "Stream.hh"
+
+/// \file
+///
+/// Low-level support for encoding Avro values.
+/// This class has two types of functions. One type supports
+/// the writing of leaf values (for example, encodeLong and
+/// encodeString). These functions have analogs in Decoder.
+///
+/// The other type of functions support the writing of maps and arrays.
+/// These functions are arrayStart, startItem, and arrayEnd
+/// (and similar functions for maps).
+/// Some implementations of Encoder handle the
+/// buffering required to break large maps and arrays into blocks,
+/// which is necessary for applications that want to do streaming.
+
+namespace avro {
+
+/**
+ * The abstract base class for all Avro encoders. The implementations
+ * differ in the method of encoding (binary versus JSON) or in capabilities
+ * such as the ability to verify the order of invocation of different functions.
+ */
+class AVRO_DECL Encoder {
+public:
+ virtual ~Encoder() { };
+ /// All future encodings will go to os, which should be valid until
+ /// it is reset with another call to init() or the encoder is
+ /// destructed.
+ virtual void init(OutputStream& os) = 0;
+
+ /// Flushes any data in internal buffers.
+ virtual void flush() = 0;
+
+ /// Returns the number of bytes produced so far.
+ /// For a meaningful value, do a flush() before invoking this function.
+ virtual int64_t byteCount() const = 0;
+
+ /// Encodes a null to the current stream.
+ virtual void encodeNull() = 0;
+
+    /// Encodes a bool to the current stream.
+ virtual void encodeBool(bool b) = 0;
+
+ /// Encodes a 32-bit int to the current stream.
+ virtual void encodeInt(int32_t i) = 0;
+
+ /// Encodes a 64-bit signed int to the current stream.
+ virtual void encodeLong(int64_t l) = 0;
+
+ /// Encodes a single-precision floating point number to the current stream.
+ virtual void encodeFloat(float f) = 0;
+
+ /// Encodes a double-precision floating point number to the current stream.
+ virtual void encodeDouble(double d) = 0;
+
+ /// Encodes a UTF-8 string to the current stream.
+ virtual void encodeString(const std::string& s) = 0;
+
+ /**
+     * Encodes arbitrary binary data into the current stream as the Avro
+     * "bytes" data type.
+     * \param bytes Pointer to the data.
+     * \param len Number of bytes at \p bytes.
+ */
+ virtual void encodeBytes(const uint8_t *bytes, size_t len) = 0;
+
+ /**
+     * Encodes arbitrary binary data into the current stream as the Avro
+     * "bytes" data type.
+ * \param bytes The data.
+ */
+ void encodeBytes(const std::vector<uint8_t>& bytes) {
+ uint8_t b = 0;
+ encodeBytes(bytes.empty() ? &b : bytes.data(), bytes.size());
+ }
+
+ /// Encodes fixed length binary to the current stream.
+ virtual void encodeFixed(const uint8_t *bytes, size_t len) = 0;
+
+ /**
+     * Encodes an Avro "fixed" value.
+     * \param bytes The fixed data; its length is taken as the size
+     * of the fixed.
+ */
+ void encodeFixed(const std::vector<uint8_t>& bytes) {
+ encodeFixed(bytes.data(), bytes.size());
+ }
+
+    /// Encodes an enum to the current stream.
+ virtual void encodeEnum(size_t e) = 0;
+
+ /// Indicates that an array of items is being encoded.
+ virtual void arrayStart() = 0;
+
+    /// Indicates that the current array of items has ended.
+ virtual void arrayEnd() = 0;
+
+ /// Indicates that a map of items is being encoded.
+ virtual void mapStart() = 0;
+
+ /// Indicates that the current map of items have ended.
+    /// Indicates that the current map of items has ended.
+
+    /// Indicates that \p count items are to follow in the current array
+    /// or map.
+ virtual void setItemCount(size_t count) = 0;
+
+    /// Marks the beginning of an item in the current array or map.
+ virtual void startItem() = 0;
+
+ /// Encodes a branch of a union. The actual value is to follow.
+ virtual void encodeUnionIndex(size_t e) = 0;
+};
+
+/**
+ * Shared pointer to Encoder.
+ */
+typedef std::shared_ptr<Encoder> EncoderPtr;
+
+/**
+ * Returns an encoder that encodes to the Avro binary format.
+ */
+AVRO_DECL EncoderPtr binaryEncoder();
+
+/**
+ * Returns an encoder that validates the sequence of calls to an underlying
+ * Encoder against the given schema.
+ */
+AVRO_DECL EncoderPtr validatingEncoder(const ValidSchema& schema,
+ const EncoderPtr& base);
+
+/**
+ * Returns an encoder that encodes to the Avro JSON format.
+ */
+AVRO_DECL EncoderPtr jsonEncoder(const ValidSchema& schema);
+
+/**
+ * Returns an encoder that encodes to pretty-printed Avro JSON.
+ */
+AVRO_DECL EncoderPtr jsonPrettyEncoder(const ValidSchema& schema);
+
+} // namespace avro
+
+#endif
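
A minimal usage sketch for the Encoder API above, assuming the installed <avro/...> header layout and the memoryOutputStream() factory declared in Stream.hh; the function name and values are illustrative, not part of this diff:

    #include <avro/Encoder.hh>
    #include <avro/Stream.hh>
    #include <memory>

    int64_t encodeSample() {
        // Encoders write to an OutputStream; memoryOutputStream() is the
        // in-memory sink declared in Stream.hh.
        std::unique_ptr<avro::OutputStream> out = avro::memoryOutputStream();
        avro::EncoderPtr e = avro::binaryEncoder();
        e->init(*out);           // all subsequent encodings go to *out
        e->encodeLong(42);       // leaf value
        e->encodeString("hi");
        e->flush();              // flush before byteCount() for a meaningful value
        return e->byteCount();
    }
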
diff --git a/contrib/libs/apache/avro/api/Exception.hh b/contrib/libs/apache/avro/api/Exception.hh
index 7c5410f96be..4bcf63daa5a 100644
--- a/contrib/libs/apache/avro/api/Exception.hh
+++ b/contrib/libs/apache/avro/api/Exception.hh
@@ -1,46 +1,46 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Exception_hh__
-#define avro_Exception_hh__
-
-#include "Config.hh"
-#include <stdexcept>
-#include <boost/format.hpp>
-
-namespace avro {
-
-/// Wrapper for std::runtime_error that provides a convenience constructor
-/// for boost::format objects.
-
-class AVRO_DECL Exception : public virtual std::runtime_error
-{
- public:
-
- Exception(const std::string &msg) :
- std::runtime_error(msg)
- { }
-
- Exception(const boost::format &msg) :
-        std::runtime_error(boost::str(msg))
- { }
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Exception_hh__
+#define avro_Exception_hh__
+
+#include "Config.hh"
+#include <stdexcept>
+#include <boost/format.hpp>
+
+namespace avro {
+
+/// Wrapper for std::runtime_error that provides a convenience constructor
+/// for boost::format objects.
+
+class AVRO_DECL Exception : public virtual std::runtime_error
+{
+ public:
+
+ Exception(const std::string &msg) :
+ std::runtime_error(msg)
+ { }
+
+ Exception(const boost::format &msg) :
+        std::runtime_error(boost::str(msg))
+ { }
+};
+
+} // namespace avro
+
+#endif
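
A short sketch of the convenience constructor above in use; the function and message text are hypothetical:

    #include <avro/Exception.hh>
    #include <boost/format.hpp>
    #include <cstddef>

    void checkBranch(std::size_t branch, std::size_t limit) {
        if (branch >= limit) {
            // The Exception(const boost::format&) constructor applies
            // boost::str() to produce the message string.
            throw avro::Exception(
                boost::format("branch %1% out of range (max %2%)") % branch % limit);
        }
    }
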
diff --git a/contrib/libs/apache/avro/api/Generic.hh b/contrib/libs/apache/avro/api/Generic.hh
index e1b3a8290c8..12f8610f8da 100644
--- a/contrib/libs/apache/avro/api/Generic.hh
+++ b/contrib/libs/apache/avro/api/Generic.hh
@@ -1,149 +1,149 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Generic_hh__
-#define avro_Generic_hh__
-
-#include <boost/utility.hpp>
-
-#include "Config.hh"
-#include "Types.hh"
-#include "Encoder.hh"
-#include "Decoder.hh"
-#include "GenericDatum.hh"
-
-namespace avro {
-/**
- * A utility class to read generic datum from decoders.
- */
-class AVRO_DECL GenericReader : boost::noncopyable {
- const ValidSchema schema_;
- const bool isResolving_;
- const DecoderPtr decoder_;
-
- static void read(GenericDatum& datum, Decoder& d, bool isResolving);
-public:
- /**
- * Constructs a reader for the given schema using the given decoder.
- */
- GenericReader(const ValidSchema& s, const DecoderPtr& decoder);
-
- /**
-     * Constructs a reader for the given reader's schema \c readerSchema,
-     * using the given decoder, which holds data matching the writer's
-     * schema \c writerSchema.
- */
- GenericReader(const ValidSchema& writerSchema,
- const ValidSchema& readerSchema, const DecoderPtr& decoder);
-
- /**
- * Reads a value off the decoder.
- */
- void read(GenericDatum& datum) const;
-
- /**
-     * Drains any residual bytes in the input stream (e.g. because the
-     * reader's schema has no use for them) and returns unused bytes
-     * back to the underlying input stream.
- */
- void drain() {
- decoder_->drain();
- }
- /**
-     * Reads a generic datum from the decoder \p d into \p g.
- */
- static void read(Decoder& d, GenericDatum& g);
-
- /**
-     * Reads a generic datum from the stream, using the given schema \p s.
- */
- static void read(Decoder& d, GenericDatum& g, const ValidSchema& s);
-};
-
-
-/**
- * A utility class to write generic datum to encoders.
- */
-class AVRO_DECL GenericWriter : boost::noncopyable {
- const ValidSchema schema_;
- const EncoderPtr encoder_;
-
- static void write(const GenericDatum& datum, Encoder& e);
-public:
- /**
- * Constructs a writer for the given schema using the given encoder.
- */
- GenericWriter(const ValidSchema& s, const EncoderPtr& encoder);
-
- /**
- * Writes a value onto the encoder.
- */
- void write(const GenericDatum& datum) const;
-
- /**
-     * Writes a generic datum onto the stream.
- */
- static void write(Encoder& e, const GenericDatum& g);
-
- /**
-     * Writes a generic datum onto the stream, using the given schema.
- * Retained for backward compatibility.
- */
- static void write(Encoder& e, const GenericDatum& g, const ValidSchema&) {
- write(e, g);
- }
-};
-
-template <typename T> struct codec_traits;
-
-/**
- * Specialization of codec_traits for a GenericDatum along with its schema.
- * This is maintained for compatibility with old code. Please use the
- * cleaner codec_traits<GenericDatum> instead.
- */
-template <> struct codec_traits<std::pair<ValidSchema, GenericDatum> > {
- /** Encodes */
- static void encode(Encoder& e,
- const std::pair<ValidSchema, GenericDatum>& p) {
- GenericWriter::write(e, p.second, p.first);
- }
-
- /** Decodes */
- static void decode(Decoder& d, std::pair<ValidSchema, GenericDatum>& p) {
- GenericReader::read(d, p.second, p.first);
- }
-};
-
-/**
- * Specialization of codec_traits for GenericDatum.
- */
-template <> struct codec_traits<GenericDatum> {
- /** Encodes */
- static void encode(Encoder& e, const GenericDatum& g) {
- GenericWriter::write(e, g);
- }
-
- /** Decodes */
- static void decode(Decoder& d, GenericDatum& g) {
- GenericReader::read(d, g);
- }
-};
-
-} // namespace avro
-#endif
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Generic_hh__
+#define avro_Generic_hh__
+
+#include <boost/utility.hpp>
+
+#include "Config.hh"
+#include "Types.hh"
+#include "Encoder.hh"
+#include "Decoder.hh"
+#include "GenericDatum.hh"
+
+namespace avro {
+/**
+ * A utility class to read generic datum from decoders.
+ */
+class AVRO_DECL GenericReader : boost::noncopyable {
+ const ValidSchema schema_;
+ const bool isResolving_;
+ const DecoderPtr decoder_;
+
+ static void read(GenericDatum& datum, Decoder& d, bool isResolving);
+public:
+ /**
+ * Constructs a reader for the given schema using the given decoder.
+ */
+ GenericReader(const ValidSchema& s, const DecoderPtr& decoder);
+
+ /**
+     * Constructs a reader for the given reader's schema \c readerSchema,
+     * using the given decoder, which holds data matching the writer's
+     * schema \c writerSchema.
+ */
+ GenericReader(const ValidSchema& writerSchema,
+ const ValidSchema& readerSchema, const DecoderPtr& decoder);
+
+ /**
+ * Reads a value off the decoder.
+ */
+ void read(GenericDatum& datum) const;
+
+ /**
+     * Drains any residual bytes in the input stream (e.g. because the
+     * reader's schema has no use for them) and returns unused bytes
+     * back to the underlying input stream.
+ */
+ void drain() {
+ decoder_->drain();
+ }
+ /**
+     * Reads a generic datum from the decoder \p d into \p g.
+ */
+ static void read(Decoder& d, GenericDatum& g);
+
+ /**
+     * Reads a generic datum from the stream, using the given schema \p s.
+ */
+ static void read(Decoder& d, GenericDatum& g, const ValidSchema& s);
+};
+
+
+/**
+ * A utility class to write generic datum to encoders.
+ */
+class AVRO_DECL GenericWriter : boost::noncopyable {
+ const ValidSchema schema_;
+ const EncoderPtr encoder_;
+
+ static void write(const GenericDatum& datum, Encoder& e);
+public:
+ /**
+ * Constructs a writer for the given schema using the given encoder.
+ */
+ GenericWriter(const ValidSchema& s, const EncoderPtr& encoder);
+
+ /**
+ * Writes a value onto the encoder.
+ */
+ void write(const GenericDatum& datum) const;
+
+ /**
+     * Writes a generic datum onto the stream.
+ */
+ static void write(Encoder& e, const GenericDatum& g);
+
+ /**
+     * Writes a generic datum onto the stream, using the given schema.
+ * Retained for backward compatibility.
+ */
+ static void write(Encoder& e, const GenericDatum& g, const ValidSchema&) {
+ write(e, g);
+ }
+};
+
+template <typename T> struct codec_traits;
+
+/**
+ * Specialization of codec_traits for a GenericDatum along with its schema.
+ * This is maintained for compatibility with old code. Please use the
+ * cleaner codec_traits<GenericDatum> instead.
+ */
+template <> struct codec_traits<std::pair<ValidSchema, GenericDatum> > {
+ /** Encodes */
+ static void encode(Encoder& e,
+ const std::pair<ValidSchema, GenericDatum>& p) {
+ GenericWriter::write(e, p.second, p.first);
+ }
+
+ /** Decodes */
+ static void decode(Decoder& d, std::pair<ValidSchema, GenericDatum>& p) {
+ GenericReader::read(d, p.second, p.first);
+ }
+};
+
+/**
+ * Specialization of codec_traits for GenericDatum.
+ */
+template <> struct codec_traits<GenericDatum> {
+ /** Encodes */
+ static void encode(Encoder& e, const GenericDatum& g) {
+ GenericWriter::write(e, g);
+ }
+
+ /** Decodes */
+ static void decode(Decoder& d, GenericDatum& g) {
+ GenericReader::read(d, g);
+ }
+};
+
+} // namespace avro
+#endif
+
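
To make the reader/writer pairing above concrete, a sketch of a binary round trip through GenericWriter and GenericReader; it assumes compileJsonSchemaFromString() from Compiler.hh and the in-memory streams from Stream.hh, and the record schema is invented for illustration:

    #include <avro/Compiler.hh>
    #include <avro/Generic.hh>
    #include <avro/Stream.hh>
    #include <memory>

    avro::GenericDatum roundTrip() {
        avro::ValidSchema s = avro::compileJsonSchemaFromString(
            "{\"type\":\"record\",\"name\":\"r\","
            "\"fields\":[{\"name\":\"x\",\"type\":\"long\"}]}");

        avro::GenericDatum datum(s);
        datum.value<avro::GenericRecord>().field("x") =
            avro::GenericDatum(static_cast<int64_t>(7));

        std::unique_ptr<avro::OutputStream> out = avro::memoryOutputStream();
        avro::EncoderPtr e = avro::binaryEncoder();
        e->init(*out);
        avro::GenericWriter::write(*e, datum);    // writes per the datum's schema
        e->flush();

        std::unique_ptr<avro::InputStream> in = avro::memoryInputStream(*out);
        avro::DecoderPtr d = avro::binaryDecoder();
        d->init(*in);
        avro::GenericDatum result(s);
        avro::GenericReader::read(*d, result, s); // static read with explicit schema
        return result;
    }
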
diff --git a/contrib/libs/apache/avro/api/GenericDatum.hh b/contrib/libs/apache/avro/api/GenericDatum.hh
index ac0e5e5e204..15cd53becb2 100644
--- a/contrib/libs/apache/avro/api/GenericDatum.hh
+++ b/contrib/libs/apache/avro/api/GenericDatum.hh
@@ -1,576 +1,576 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_GenericDatum_hh__
-#define avro_GenericDatum_hh__
-
-#include <stdint.h>
-#include <vector>
-#include <map>
-#include <string>
-
-#if __cplusplus >= 201703L
-#include <any>
-#else
-#include "boost/any.hpp"
-#endif
-
-#include "LogicalType.hh"
-#include "Node.hh"
-#include "ValidSchema.hh"
-
-namespace avro {
-
-/**
- * Generic datum which can hold any Avro type. The datum has a type
- * and a value. The type is one of the Avro data types. The C++ type for
- * value corresponds to the Avro type.
- * \li An Avro <tt>null</tt> corresponds to no C++ type. It is illegal
- * to try to access values for <tt>null</tt>.
- * \li Avro <tt>boolean</tt> maps to C++ <tt>bool</tt>.
- * \li Avro <tt>int</tt> maps to C++ <tt>int32_t</tt>.
- * \li Avro <tt>long</tt> maps to C++ <tt>int64_t</tt>.
- * \li Avro <tt>float</tt> maps to C++ <tt>float</tt>.
- * \li Avro <tt>double</tt> maps to C++ <tt>double</tt>.
- * \li Avro <tt>string</tt> maps to C++ <tt>std::string</tt>.
- * \li Avro <tt>bytes</tt> maps to C++ <tt>std::vector&lt;uint8_t&gt;</tt>.
- * \li Avro <tt>fixed</tt> maps to C++ class <tt>GenericFixed</tt>.
- * \li Avro <tt>enum</tt> maps to C++ class <tt>GenericEnum</tt>.
- * \li Avro <tt>array</tt> maps to C++ class <tt>GenericArray</tt>.
- * \li Avro <tt>map</tt> maps to C++ class <tt>GenericMap</tt>.
- * \li There is no C++ type corresponding to Avro <tt>union</tt>. The
- * object should have the C++ type corresponding to one of the constituent
- * types of the union.
- *
- */
-class AVRO_DECL GenericDatum {
-protected:
- Type type_;
- LogicalType logicalType_;
-#if __cplusplus >= 201703L
- std::any value_;
-#else
- boost::any value_;
-#endif
-
- GenericDatum(Type t)
- : type_(t), logicalType_(LogicalType::NONE) { }
-
- GenericDatum(Type t, LogicalType logicalType)
- : type_(t), logicalType_(logicalType) { }
-
- template <typename T>
- GenericDatum(Type t, LogicalType logicalType, const T& v)
- : type_(t), logicalType_(logicalType), value_(v) { }
-
- void init(const NodePtr& schema);
-public:
- /**
- * The avro data type this datum holds.
- */
- Type type() const;
-
- /**
- * The avro logical type that augments the main data type this datum holds.
- */
- LogicalType logicalType() const;
-
- /**
- * Returns the value held by this datum.
- * T The type for the value. This must correspond to the
- * avro type returned by type().
- */
- template<typename T> const T& value() const;
-
- /**
- * Returns the reference to the value held by this datum, which
-     * can be used to change the contents. Please note that only the
-     * value can be changed; the data type of the value held cannot
-     * be changed.
- *
- * T The type for the value. This must correspond to the
- * avro type returned by type().
- */
- template<typename T> T& value();
-
- /**
- * Returns true if and only if this datum is a union.
- */
- bool isUnion() const { return type_ == AVRO_UNION; }
-
- /**
- * Returns the index of the current branch, if this is a union.
- * \sa isUnion().
- */
- size_t unionBranch() const;
-
- /**
- * Selects a new branch in the union if this is a union.
- * \sa isUnion().
- */
- void selectBranch(size_t branch);
-
- /// Makes a new AVRO_NULL datum.
- GenericDatum() : type_(AVRO_NULL), logicalType_(LogicalType::NONE) { }
-
- /// Makes a new AVRO_BOOL datum whose value is of type bool.
- GenericDatum(bool v)
- : type_(AVRO_BOOL), logicalType_(LogicalType::NONE), value_(v) { }
-
- /// Makes a new AVRO_INT datum whose value is of type int32_t.
- GenericDatum(int32_t v)
- : type_(AVRO_INT), logicalType_(LogicalType::NONE), value_(v) { }
-
- /// Makes a new AVRO_LONG datum whose value is of type int64_t.
- GenericDatum(int64_t v)
- : type_(AVRO_LONG), logicalType_(LogicalType::NONE), value_(v) { }
-
- /// Makes a new AVRO_FLOAT datum whose value is of type float.
- GenericDatum(float v)
- : type_(AVRO_FLOAT), logicalType_(LogicalType::NONE), value_(v) { }
-
- /// Makes a new AVRO_DOUBLE datum whose value is of type double.
- GenericDatum(double v)
- : type_(AVRO_DOUBLE), logicalType_(LogicalType::NONE), value_(v) { }
-
- /// Makes a new AVRO_STRING datum whose value is of type std::string.
- GenericDatum(const std::string& v)
- : type_(AVRO_STRING), logicalType_(LogicalType::NONE), value_(v) { }
-
- /// Makes a new AVRO_BYTES datum whose value is of type
- /// std::vector<uint8_t>.
- GenericDatum(const std::vector<uint8_t>& v) :
- type_(AVRO_BYTES), logicalType_(LogicalType::NONE), value_(v) { }
-
- /**
- * Constructs a datum corresponding to the given avro type.
-     * The value will be the appropriate default corresponding to the
- * data type.
- * \param schema The schema that defines the avro type.
- */
- GenericDatum(const NodePtr& schema);
-
- /**
-     * Constructs a datum corresponding to the given avro type and sets
- * the value.
- * \param schema The schema that defines the avro type.
- * \param v The value for this type.
- */
- template<typename T>
- GenericDatum(const NodePtr& schema, const T& v) :
- type_(schema->type()), logicalType_(schema->logicalType()) {
- init(schema);
-#if __cplusplus >= 201703L
- *std::any_cast<T>(&value_) = v;
-#else
- *boost::any_cast<T>(&value_) = v;
-#endif
- }
-
- /**
- * Constructs a datum corresponding to the given avro type.
-     * The value will be the appropriate default corresponding to the
- * data type.
- * \param schema The schema that defines the avro type.
- */
- GenericDatum(const ValidSchema& schema);
-};
-
-/**
- * The base class for all generic container types.
- */
-class AVRO_DECL GenericContainer {
- NodePtr schema_;
- static void assertType(const NodePtr& schema, Type type);
-protected:
- /**
- * Constructs a container corresponding to the given schema.
- */
- GenericContainer(Type type, const NodePtr& s) : schema_(s) {
- assertType(s, type);
- }
-
-public:
- /// Returns the schema for this object
- const NodePtr& schema() const {
- return schema_;
- }
-};
-
-/**
- * Generic container for unions.
- */
-class AVRO_DECL GenericUnion : public GenericContainer {
- size_t curBranch_;
- GenericDatum datum_;
-
-public:
- /**
-     * Constructs a generic union corresponding to the given schema \p schema,
-     * which should be of Avro type union. The datum is initialized to the
-     * first branch of the union.
- */
- GenericUnion(const NodePtr& schema) :
- GenericContainer(AVRO_UNION, schema), curBranch_(schema->leaves()) {
- selectBranch(0);
- }
-
- /**
- * Returns the index of the current branch.
- */
- size_t currentBranch() const { return curBranch_; }
-
- /**
- * Selects a new branch. The type for the value is changed accordingly.
- * \param branch The index for the selected branch.
- */
- void selectBranch(size_t branch) {
- if (curBranch_ != branch) {
- datum_ = GenericDatum(schema()->leafAt(branch));
- curBranch_ = branch;
- }
- }
-
- /**
- * Returns the datum corresponding to the currently selected branch
- * in this union.
- */
- GenericDatum& datum() {
- return datum_;
- }
-
- /**
- * Returns the datum corresponding to the currently selected branch
- * in this union.
- */
- const GenericDatum& datum() const {
- return datum_;
- }
-};
-
-/**
- * The generic container for Avro records.
- */
-class AVRO_DECL GenericRecord : public GenericContainer {
- std::vector<GenericDatum> fields_;
-public:
- /**
- * Constructs a generic record corresponding to the given schema \p schema,
- * which should be of Avro type record.
- */
- GenericRecord(const NodePtr& schema);
-
- /**
- * Returns the number of fields in the current record.
- */
- size_t fieldCount() const {
- return fields_.size();
- }
-
- /**
-     * Returns the index of the field with the given name \p name.
- */
- size_t fieldIndex(const std::string& name) const {
- size_t index = 0;
- if (!schema()->nameIndex(name, index)) {
- throw Exception("Invalid field name: " + name);
- }
- return index;
- }
-
- /**
-     * Returns true if a field with the given name \p name exists in this
-     * record, false otherwise.
- */
- bool hasField(const std::string& name) const {
- size_t index = 0;
- return schema()->nameIndex(name, index);
- }
-
- /**
- * Returns the field with the given name \p name.
- */
- const GenericDatum& field(const std::string& name) const {
- return fieldAt(fieldIndex(name));
- }
-
- /**
- * Returns the reference to the field with the given name \p name,
- * which can be used to change the contents.
- */
- GenericDatum& field(const std::string& name) {
- return fieldAt(fieldIndex(name));
- }
-
- /**
- * Returns the field at the given position \p pos.
- */
- const GenericDatum& fieldAt(size_t pos) const {
- return fields_[pos];
- }
-
- /**
- * Returns the reference to the field at the given position \p pos,
- * which can be used to change the contents.
- */
- GenericDatum& fieldAt(size_t pos) {
- return fields_[pos];
- }
-
- /**
- * Replaces the field at the given position \p pos with \p v.
- */
- void setFieldAt(size_t pos, const GenericDatum& v) {
- // assertSameType(v, schema()->leafAt(pos));
- fields_[pos] = v;
- }
-};
-
-/**
- * The generic container for Avro arrays.
- */
-class AVRO_DECL GenericArray : public GenericContainer {
-public:
- /**
- * The contents type for the array.
- */
- typedef std::vector<GenericDatum> Value;
-
- /**
- * Constructs a generic array corresponding to the given schema \p schema,
- * which should be of Avro type array.
- */
- GenericArray(const NodePtr& schema) : GenericContainer(AVRO_ARRAY, schema) {
- }
-
- /**
- * Returns the contents of this array.
- */
- const Value& value() const {
- return value_;
- }
-
- /**
- * Returns the reference to the contents of this array.
- */
- Value& value() {
- return value_;
- }
-private:
- Value value_;
-};
-
-/**
- * The generic container for Avro maps.
- */
-class AVRO_DECL GenericMap : public GenericContainer {
-public:
- /**
- * The contents type for the map.
- */
- typedef std::vector<std::pair<std::string, GenericDatum> > Value;
-
- /**
- * Constructs a generic map corresponding to the given schema \p schema,
- * which should be of Avro type map.
- */
- GenericMap(const NodePtr& schema) : GenericContainer(AVRO_MAP, schema) {
- }
-
- /**
- * Returns the contents of this map.
- */
- const Value& value() const {
- return value_;
- }
-
- /**
- * Returns the reference to the contents of this map.
- */
- Value& value() {
- return value_;
- }
-private:
- Value value_;
-};
-
-/**
- * Generic container for Avro enum.
- */
-class AVRO_DECL GenericEnum : public GenericContainer {
- size_t value_;
-
- static size_t index(const NodePtr& schema, const std::string& symbol) {
- size_t result;
- if (schema->nameIndex(symbol, result)) {
- return result;
- }
- throw Exception("No such symbol");
- }
-
-public:
- /**
- * Constructs a generic enum corresponding to the given schema \p schema,
- * which should be of Avro type enum.
- */
- GenericEnum(const NodePtr& schema) :
- GenericContainer(AVRO_ENUM, schema), value_(0) {
- }
-
- GenericEnum(const NodePtr& schema, const std::string& symbol) :
- GenericContainer(AVRO_ENUM, schema), value_(index(schema, symbol)) {
- }
-
- /**
- * Returns the symbol corresponding to the cardinal \p n. If the
-     * value for \p n is not within the limits, an exception is thrown.
- */
- const std::string& symbol(size_t n) {
- if (n < schema()->names()) {
- return schema()->nameAt(n);
- }
- throw Exception("Not as many symbols");
- }
-
- /**
- * Returns the cardinal for the given symbol \c symbol. If the symbol
-     * is not defined for this enum, an exception is thrown.
- */
- size_t index(const std::string& symbol) const {
- return index(schema(), symbol);
- }
-
- /**
-     * Sets the value for this enum corresponding to the given symbol \c symbol.
- */
- size_t set(const std::string& symbol) {
- return value_ = index(symbol);
- }
-
- /**
-     * Sets the value for this enum corresponding to the given cardinal \c n.
- */
- void set(size_t n) {
- if (n < schema()->names()) {
- value_ = n;
- return;
- }
- throw Exception("Not as many symbols");
- }
-
- /**
- * Returns the cardinal for the current value of this enum.
- */
- size_t value() const {
- return value_;
- }
-
- /**
- * Returns the symbol for the current value of this enum.
- */
- const std::string& symbol() const {
- return schema()->nameAt(value_);
- }
-};
-
-/**
- * Generic container for Avro fixed.
- */
-class AVRO_DECL GenericFixed : public GenericContainer {
- std::vector<uint8_t> value_;
-public:
- /**
-     * Constructs a generic fixed corresponding to the given schema \p schema,
- * which should be of Avro type fixed.
- */
- GenericFixed(const NodePtr& schema) : GenericContainer(AVRO_FIXED, schema) {
- value_.resize(schema->fixedSize());
- }
-
- GenericFixed(const NodePtr& schema, const std::vector<uint8_t>& v) :
- GenericContainer(AVRO_FIXED, schema), value_(v) { }
-
- /**
- * Returns the contents of this fixed.
- */
- const std::vector<uint8_t>& value() const {
- return value_;
- }
-
- /**
- * Returns the reference to the contents of this fixed.
- */
- std::vector<uint8_t>& value() {
- return value_;
- }
-};
-
-inline Type GenericDatum::type() const {
- return (type_ == AVRO_UNION) ?
-#if __cplusplus >= 201703L
- std::any_cast<GenericUnion>(&value_)->datum().type() :
-#else
- boost::any_cast<GenericUnion>(&value_)->datum().type() :
-#endif
- type_;
-}
-
-inline LogicalType GenericDatum::logicalType() const {
- return logicalType_;
-}
-
-template<typename T> T& GenericDatum::value() {
- return (type_ == AVRO_UNION) ?
-#if __cplusplus >= 201703L
- std::any_cast<GenericUnion>(&value_)->datum().value<T>() :
- *std::any_cast<T>(&value_);
-#else
- boost::any_cast<GenericUnion>(&value_)->datum().value<T>() :
- *boost::any_cast<T>(&value_);
-#endif
-}
-
-template<typename T> const T& GenericDatum::value() const {
- return (type_ == AVRO_UNION) ?
-#if __cplusplus >= 201703L
- std::any_cast<GenericUnion>(&value_)->datum().value<T>() :
- *std::any_cast<T>(&value_);
-#else
- boost::any_cast<GenericUnion>(&value_)->datum().value<T>() :
- *boost::any_cast<T>(&value_);
-#endif
-}
-
-inline size_t GenericDatum::unionBranch() const {
-#if __cplusplus >= 201703L
- return std::any_cast<GenericUnion>(&value_)->currentBranch();
-#else
- return boost::any_cast<GenericUnion>(&value_)->currentBranch();
-#endif
-}
-
-inline void GenericDatum::selectBranch(size_t branch) {
-#if __cplusplus >= 201703L
- std::any_cast<GenericUnion>(&value_)->selectBranch(branch);
-#else
- boost::any_cast<GenericUnion>(&value_)->selectBranch(branch);
-#endif
-}
-
-} // namespace avro
-#endif // avro_GenericDatum_hh__
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_GenericDatum_hh__
+#define avro_GenericDatum_hh__
+
+#include <stdint.h>
+#include <vector>
+#include <map>
+#include <string>
+
+#if __cplusplus >= 201703L
+#include <any>
+#else
+#include "boost/any.hpp"
+#endif
+
+#include "LogicalType.hh"
+#include "Node.hh"
+#include "ValidSchema.hh"
+
+namespace avro {
+
+/**
+ * Generic datum which can hold any Avro type. The datum has a type
+ * and a value. The type is one of the Avro data types. The C++ type for
+ * value corresponds to the Avro type.
+ * \li An Avro <tt>null</tt> corresponds to no C++ type. It is illegal
+ * to try to access values for <tt>null</tt>.
+ * \li Avro <tt>boolean</tt> maps to C++ <tt>bool</tt>.
+ * \li Avro <tt>int</tt> maps to C++ <tt>int32_t</tt>.
+ * \li Avro <tt>long</tt> maps to C++ <tt>int64_t</tt>.
+ * \li Avro <tt>float</tt> maps to C++ <tt>float</tt>.
+ * \li Avro <tt>double</tt> maps to C++ <tt>double</tt>.
+ * \li Avro <tt>string</tt> maps to C++ <tt>std::string</tt>.
+ * \li Avro <tt>bytes</tt> maps to C++ <tt>std::vector&lt;uint8_t&gt;</tt>.
+ * \li Avro <tt>fixed</tt> maps to C++ class <tt>GenericFixed</tt>.
+ * \li Avro <tt>enum</tt> maps to C++ class <tt>GenericEnum</tt>.
+ * \li Avro <tt>array</tt> maps to C++ class <tt>GenericArray</tt>.
+ * \li Avro <tt>map</tt> maps to C++ class <tt>GenericMap</tt>.
+ * \li There is no C++ type corresponding to Avro <tt>union</tt>. The
+ * object should have the C++ type corresponding to one of the constituent
+ * types of the union.
+ *
+ */
+class AVRO_DECL GenericDatum {
+protected:
+ Type type_;
+ LogicalType logicalType_;
+#if __cplusplus >= 201703L
+ std::any value_;
+#else
+ boost::any value_;
+#endif
+
+ GenericDatum(Type t)
+ : type_(t), logicalType_(LogicalType::NONE) { }
+
+ GenericDatum(Type t, LogicalType logicalType)
+ : type_(t), logicalType_(logicalType) { }
+
+ template <typename T>
+ GenericDatum(Type t, LogicalType logicalType, const T& v)
+ : type_(t), logicalType_(logicalType), value_(v) { }
+
+ void init(const NodePtr& schema);
+public:
+ /**
+ * The avro data type this datum holds.
+ */
+ Type type() const;
+
+ /**
+ * The avro logical type that augments the main data type this datum holds.
+ */
+ LogicalType logicalType() const;
+
+ /**
+ * Returns the value held by this datum.
+ * T The type for the value. This must correspond to the
+ * avro type returned by type().
+ */
+ template<typename T> const T& value() const;
+
+ /**
+ * Returns the reference to the value held by this datum, which
+     * can be used to change the contents. Please note that only the
+     * value can be changed; the data type of the value held cannot
+     * be changed.
+ *
+ * T The type for the value. This must correspond to the
+ * avro type returned by type().
+ */
+ template<typename T> T& value();
+
+ /**
+ * Returns true if and only if this datum is a union.
+ */
+ bool isUnion() const { return type_ == AVRO_UNION; }
+
+ /**
+ * Returns the index of the current branch, if this is a union.
+ * \sa isUnion().
+ */
+ size_t unionBranch() const;
+
+ /**
+ * Selects a new branch in the union if this is a union.
+ * \sa isUnion().
+ */
+ void selectBranch(size_t branch);
+
+ /// Makes a new AVRO_NULL datum.
+ GenericDatum() : type_(AVRO_NULL), logicalType_(LogicalType::NONE) { }
+
+ /// Makes a new AVRO_BOOL datum whose value is of type bool.
+ GenericDatum(bool v)
+ : type_(AVRO_BOOL), logicalType_(LogicalType::NONE), value_(v) { }
+
+ /// Makes a new AVRO_INT datum whose value is of type int32_t.
+ GenericDatum(int32_t v)
+ : type_(AVRO_INT), logicalType_(LogicalType::NONE), value_(v) { }
+
+ /// Makes a new AVRO_LONG datum whose value is of type int64_t.
+ GenericDatum(int64_t v)
+ : type_(AVRO_LONG), logicalType_(LogicalType::NONE), value_(v) { }
+
+ /// Makes a new AVRO_FLOAT datum whose value is of type float.
+ GenericDatum(float v)
+ : type_(AVRO_FLOAT), logicalType_(LogicalType::NONE), value_(v) { }
+
+ /// Makes a new AVRO_DOUBLE datum whose value is of type double.
+ GenericDatum(double v)
+ : type_(AVRO_DOUBLE), logicalType_(LogicalType::NONE), value_(v) { }
+
+ /// Makes a new AVRO_STRING datum whose value is of type std::string.
+ GenericDatum(const std::string& v)
+ : type_(AVRO_STRING), logicalType_(LogicalType::NONE), value_(v) { }
+
+ /// Makes a new AVRO_BYTES datum whose value is of type
+ /// std::vector<uint8_t>.
+ GenericDatum(const std::vector<uint8_t>& v) :
+ type_(AVRO_BYTES), logicalType_(LogicalType::NONE), value_(v) { }
+
+ /**
+ * Constructs a datum corresponding to the given avro type.
+     * The value will be the appropriate default corresponding to the
+ * data type.
+ * \param schema The schema that defines the avro type.
+ */
+ GenericDatum(const NodePtr& schema);
+
+ /**
+     * Constructs a datum corresponding to the given avro type and sets
+ * the value.
+ * \param schema The schema that defines the avro type.
+ * \param v The value for this type.
+ */
+ template<typename T>
+ GenericDatum(const NodePtr& schema, const T& v) :
+ type_(schema->type()), logicalType_(schema->logicalType()) {
+ init(schema);
+#if __cplusplus >= 201703L
+ *std::any_cast<T>(&value_) = v;
+#else
+ *boost::any_cast<T>(&value_) = v;
+#endif
+ }
+
+ /**
+ * Constructs a datum corresponding to the given avro type.
+     * The value will be the appropriate default corresponding to the
+ * data type.
+ * \param schema The schema that defines the avro type.
+ */
+ GenericDatum(const ValidSchema& schema);
+};
+
+/**
+ * The base class for all generic container types.
+ */
+class AVRO_DECL GenericContainer {
+ NodePtr schema_;
+ static void assertType(const NodePtr& schema, Type type);
+protected:
+ /**
+ * Constructs a container corresponding to the given schema.
+ */
+ GenericContainer(Type type, const NodePtr& s) : schema_(s) {
+ assertType(s, type);
+ }
+
+public:
+ /// Returns the schema for this object
+ const NodePtr& schema() const {
+ return schema_;
+ }
+};
+
+/**
+ * Generic container for unions.
+ */
+class AVRO_DECL GenericUnion : public GenericContainer {
+ size_t curBranch_;
+ GenericDatum datum_;
+
+public:
+ /**
+     * Constructs a generic union corresponding to the given schema \p schema,
+     * which should be of Avro type union. The datum is initialized to the
+     * first branch of the union.
+ */
+ GenericUnion(const NodePtr& schema) :
+ GenericContainer(AVRO_UNION, schema), curBranch_(schema->leaves()) {
+ selectBranch(0);
+ }
+
+ /**
+ * Returns the index of the current branch.
+ */
+ size_t currentBranch() const { return curBranch_; }
+
+ /**
+ * Selects a new branch. The type for the value is changed accordingly.
+ * \param branch The index for the selected branch.
+ */
+ void selectBranch(size_t branch) {
+ if (curBranch_ != branch) {
+ datum_ = GenericDatum(schema()->leafAt(branch));
+ curBranch_ = branch;
+ }
+ }
+
+ /**
+ * Returns the datum corresponding to the currently selected branch
+ * in this union.
+ */
+ GenericDatum& datum() {
+ return datum_;
+ }
+
+ /**
+ * Returns the datum corresponding to the currently selected branch
+ * in this union.
+ */
+ const GenericDatum& datum() const {
+ return datum_;
+ }
+};
+
+/**
+ * The generic container for Avro records.
+ */
+class AVRO_DECL GenericRecord : public GenericContainer {
+ std::vector<GenericDatum> fields_;
+public:
+ /**
+ * Constructs a generic record corresponding to the given schema \p schema,
+ * which should be of Avro type record.
+ */
+ GenericRecord(const NodePtr& schema);
+
+ /**
+ * Returns the number of fields in the current record.
+ */
+ size_t fieldCount() const {
+ return fields_.size();
+ }
+
+ /**
+     * Returns the index of the field with the given name \p name.
+ */
+ size_t fieldIndex(const std::string& name) const {
+ size_t index = 0;
+ if (!schema()->nameIndex(name, index)) {
+ throw Exception("Invalid field name: " + name);
+ }
+ return index;
+ }
+
+ /**
+     * Returns true if a field with the given name \p name exists in this
+     * record, false otherwise.
+ */
+ bool hasField(const std::string& name) const {
+ size_t index = 0;
+ return schema()->nameIndex(name, index);
+ }
+
+ /**
+ * Returns the field with the given name \p name.
+ */
+ const GenericDatum& field(const std::string& name) const {
+ return fieldAt(fieldIndex(name));
+ }
+
+ /**
+ * Returns the reference to the field with the given name \p name,
+ * which can be used to change the contents.
+ */
+ GenericDatum& field(const std::string& name) {
+ return fieldAt(fieldIndex(name));
+ }
+
+ /**
+ * Returns the field at the given position \p pos.
+ */
+ const GenericDatum& fieldAt(size_t pos) const {
+ return fields_[pos];
+ }
+
+ /**
+ * Returns the reference to the field at the given position \p pos,
+ * which can be used to change the contents.
+ */
+ GenericDatum& fieldAt(size_t pos) {
+ return fields_[pos];
+ }
+
+ /**
+ * Replaces the field at the given position \p pos with \p v.
+ */
+ void setFieldAt(size_t pos, const GenericDatum& v) {
+ // assertSameType(v, schema()->leafAt(pos));
+ fields_[pos] = v;
+ }
+};
+
+/**
+ * The generic container for Avro arrays.
+ */
+class AVRO_DECL GenericArray : public GenericContainer {
+public:
+ /**
+ * The contents type for the array.
+ */
+ typedef std::vector<GenericDatum> Value;
+
+ /**
+ * Constructs a generic array corresponding to the given schema \p schema,
+ * which should be of Avro type array.
+ */
+ GenericArray(const NodePtr& schema) : GenericContainer(AVRO_ARRAY, schema) {
+ }
+
+ /**
+ * Returns the contents of this array.
+ */
+ const Value& value() const {
+ return value_;
+ }
+
+ /**
+ * Returns the reference to the contents of this array.
+ */
+ Value& value() {
+ return value_;
+ }
+private:
+ Value value_;
+};
+
+/**
+ * The generic container for Avro maps.
+ */
+class AVRO_DECL GenericMap : public GenericContainer {
+public:
+ /**
+ * The contents type for the map.
+ */
+ typedef std::vector<std::pair<std::string, GenericDatum> > Value;
+
+ /**
+ * Constructs a generic map corresponding to the given schema \p schema,
+ * which should be of Avro type map.
+ */
+ GenericMap(const NodePtr& schema) : GenericContainer(AVRO_MAP, schema) {
+ }
+
+ /**
+ * Returns the contents of this map.
+ */
+ const Value& value() const {
+ return value_;
+ }
+
+ /**
+ * Returns the reference to the contents of this map.
+ */
+ Value& value() {
+ return value_;
+ }
+private:
+ Value value_;
+};
+
+/**
+ * Generic container for Avro enum.
+ */
+class AVRO_DECL GenericEnum : public GenericContainer {
+ size_t value_;
+
+ static size_t index(const NodePtr& schema, const std::string& symbol) {
+ size_t result;
+ if (schema->nameIndex(symbol, result)) {
+ return result;
+ }
+ throw Exception("No such symbol");
+ }
+
+public:
+ /**
+ * Constructs a generic enum corresponding to the given schema \p schema,
+ * which should be of Avro type enum.
+ */
+ GenericEnum(const NodePtr& schema) :
+ GenericContainer(AVRO_ENUM, schema), value_(0) {
+ }
+
+ GenericEnum(const NodePtr& schema, const std::string& symbol) :
+ GenericContainer(AVRO_ENUM, schema), value_(index(schema, symbol)) {
+ }
+
+ /**
+ * Returns the symbol corresponding to the cardinal \p n. If the
+     * value for \p n is not within the limits, an exception is thrown.
+ */
+ const std::string& symbol(size_t n) {
+ if (n < schema()->names()) {
+ return schema()->nameAt(n);
+ }
+ throw Exception("Not as many symbols");
+ }
+
+ /**
+ * Returns the cardinal for the given symbol \c symbol. If the symbol
+     * is not defined for this enum, an exception is thrown.
+ */
+ size_t index(const std::string& symbol) const {
+ return index(schema(), symbol);
+ }
+
+ /**
+     * Sets the value for this enum corresponding to the given symbol \c symbol.
+ */
+ size_t set(const std::string& symbol) {
+ return value_ = index(symbol);
+ }
+
+ /**
+     * Sets the value for this enum corresponding to the given cardinal \c n.
+ */
+ void set(size_t n) {
+ if (n < schema()->names()) {
+ value_ = n;
+ return;
+ }
+ throw Exception("Not as many symbols");
+ }
+
+ /**
+ * Returns the cardinal for the current value of this enum.
+ */
+ size_t value() const {
+ return value_;
+ }
+
+ /**
+ * Returns the symbol for the current value of this enum.
+ */
+ const std::string& symbol() const {
+ return schema()->nameAt(value_);
+ }
+};
+
+/**
+ * Generic container for Avro fixed.
+ */
+class AVRO_DECL GenericFixed : public GenericContainer {
+ std::vector<uint8_t> value_;
+public:
+ /**
+     * Constructs a generic fixed corresponding to the given schema \p schema,
+ * which should be of Avro type fixed.
+ */
+ GenericFixed(const NodePtr& schema) : GenericContainer(AVRO_FIXED, schema) {
+ value_.resize(schema->fixedSize());
+ }
+
+ GenericFixed(const NodePtr& schema, const std::vector<uint8_t>& v) :
+ GenericContainer(AVRO_FIXED, schema), value_(v) { }
+
+ /**
+ * Returns the contents of this fixed.
+ */
+ const std::vector<uint8_t>& value() const {
+ return value_;
+ }
+
+ /**
+ * Returns the reference to the contents of this fixed.
+ */
+ std::vector<uint8_t>& value() {
+ return value_;
+ }
+};
+
+inline Type GenericDatum::type() const {
+ return (type_ == AVRO_UNION) ?
+#if __cplusplus >= 201703L
+ std::any_cast<GenericUnion>(&value_)->datum().type() :
+#else
+ boost::any_cast<GenericUnion>(&value_)->datum().type() :
+#endif
+ type_;
+}
+
+inline LogicalType GenericDatum::logicalType() const {
+ return logicalType_;
+}
+
+template<typename T> T& GenericDatum::value() {
+ return (type_ == AVRO_UNION) ?
+#if __cplusplus >= 201703L
+ std::any_cast<GenericUnion>(&value_)->datum().value<T>() :
+ *std::any_cast<T>(&value_);
+#else
+ boost::any_cast<GenericUnion>(&value_)->datum().value<T>() :
+ *boost::any_cast<T>(&value_);
+#endif
+}
+
+template<typename T> const T& GenericDatum::value() const {
+ return (type_ == AVRO_UNION) ?
+#if __cplusplus >= 201703L
+ std::any_cast<GenericUnion>(&value_)->datum().value<T>() :
+ *std::any_cast<T>(&value_);
+#else
+ boost::any_cast<GenericUnion>(&value_)->datum().value<T>() :
+ *boost::any_cast<T>(&value_);
+#endif
+}
+
+inline size_t GenericDatum::unionBranch() const {
+#if __cplusplus >= 201703L
+ return std::any_cast<GenericUnion>(&value_)->currentBranch();
+#else
+ return boost::any_cast<GenericUnion>(&value_)->currentBranch();
+#endif
+}
+
+inline void GenericDatum::selectBranch(size_t branch) {
+#if __cplusplus >= 201703L
+ std::any_cast<GenericUnion>(&value_)->selectBranch(branch);
+#else
+ boost::any_cast<GenericUnion>(&value_)->selectBranch(branch);
+#endif
+}
+
+} // namespace avro
+#endif // avro_GenericDatum_hh__
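
A brief sketch of value access on GenericDatum as documented above; the template argument must match the Avro type reported by type():

    #include <avro/GenericDatum.hh>
    #include <cassert>
    #include <string>

    void datumDemo() {
        avro::GenericDatum d(std::string("abc"));  // an AVRO_STRING datum
        assert(d.type() == avro::AVRO_STRING);

        d.value<std::string>() = "xyz";            // the value may change...
        assert(d.value<std::string>() == "xyz");   // ...but its type may not
    }
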
diff --git a/contrib/libs/apache/avro/api/Layout.hh b/contrib/libs/apache/avro/api/Layout.hh
index ffd810c8b06..693bed1b424 100644
--- a/contrib/libs/apache/avro/api/Layout.hh
+++ b/contrib/libs/apache/avro/api/Layout.hh
@@ -1,83 +1,83 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Layout_hh__
-#define avro_Layout_hh__
-
-#include <boost/noncopyable.hpp>
-#include "Config.hh"
-
-/// \file Layout.hh
-///
-
-namespace avro {
-
-class AVRO_DECL Layout : private boost::noncopyable {
-
- protected:
-
- Layout(size_t offset = 0) :
- offset_(offset)
- {}
-
- public:
-
- size_t offset() const {
- return offset_;
- }
-
- virtual ~Layout() {}
-
- private:
-
- const size_t offset_;
-};
-
-class AVRO_DECL PrimitiveLayout : public Layout {
-
- public:
-
- PrimitiveLayout(size_t offset = 0) :
- Layout(offset)
- {}
-};
-
-class AVRO_DECL CompoundLayout : public Layout {
-
- public:
-
- CompoundLayout(size_t offset = 0) :
- Layout(offset)
- {}
-
- void add(std::unique_ptr<Layout> &layout) {
- layouts_.push_back(std::move(layout));
- }
-
- const Layout &at (size_t idx) const {
- return *layouts_.at(idx);
- }
-
- private:
-
- std::vector<std::unique_ptr<Layout> > layouts_;
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Layout_hh__
+#define avro_Layout_hh__
+
+#include <boost/noncopyable.hpp>
+#include "Config.hh"
+
+/// \file Layout.hh
+///
+
+namespace avro {
+
+class AVRO_DECL Layout : private boost::noncopyable {
+
+ protected:
+
+ Layout(size_t offset = 0) :
+ offset_(offset)
+ {}
+
+ public:
+
+ size_t offset() const {
+ return offset_;
+ }
+
+ virtual ~Layout() {}
+
+ private:
+
+ const size_t offset_;
+};
+
+class AVRO_DECL PrimitiveLayout : public Layout {
+
+ public:
+
+ PrimitiveLayout(size_t offset = 0) :
+ Layout(offset)
+ {}
+};
+
+class AVRO_DECL CompoundLayout : public Layout {
+
+ public:
+
+ CompoundLayout(size_t offset = 0) :
+ Layout(offset)
+ {}
+
+ void add(std::unique_ptr<Layout> &layout) {
+ layouts_.push_back(std::move(layout));
+ }
+
+ const Layout &at (size_t idx) const {
+ return *layouts_.at(idx);
+ }
+
+ private:
+
+ std::vector<std::unique_ptr<Layout> > layouts_;
+};
+
+} // namespace avro
+
+#endif
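
A brief sketch of composing the layouts above; the offsets are illustrative. (As shown, the header relies on transitive includes for std::vector and std::unique_ptr.)

    #include <avro/Layout.hh>
    #include <memory>

    void buildLayout() {
        avro::CompoundLayout compound(0);        // offset of the compound itself

        // add() takes the unique_ptr by reference and moves from it, so the
        // compound layout owns the child afterwards.
        std::unique_ptr<avro::Layout> child(new avro::PrimitiveLayout(8));
        compound.add(child);

        const avro::Layout& l = compound.at(0);  // bounds-checked access
        (void)l.offset();                        // == 8
    }
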
diff --git a/contrib/libs/apache/avro/api/LogicalType.hh b/contrib/libs/apache/avro/api/LogicalType.hh
index 33972788fb2..ba9d86db984 100644
--- a/contrib/libs/apache/avro/api/LogicalType.hh
+++ b/contrib/libs/apache/avro/api/LogicalType.hh
@@ -1,65 +1,65 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_LogicalType_hh__
-#define avro_LogicalType_hh__
-
-#include <iostream>
-
-#include "Config.hh"
-
-namespace avro {
-
-class AVRO_DECL LogicalType {
- public:
- enum Type {
- NONE,
- DECIMAL,
- DATE,
- TIME_MILLIS,
- TIME_MICROS,
- TIMESTAMP_MILLIS,
- TIMESTAMP_MICROS,
- DURATION,
- UUID
- };
-
- explicit LogicalType(Type type);
-
- Type type() const;
-
- // Precision and scale can only be set for the DECIMAL logical type.
- // Precision must be positive and scale must be either positive or zero. The
- // setters will throw an exception if they are called on any type other
- // than DECIMAL.
- void setPrecision(int precision);
- int precision() const { return precision_; }
- void setScale(int scale);
- int scale() const { return scale_; }
-
- void printJson(std::ostream& os) const;
-
- private:
- Type type_;
- int precision_;
- int scale_;
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_LogicalType_hh__
+#define avro_LogicalType_hh__
+
+#include <iostream>
+
+#include "Config.hh"
+
+namespace avro {
+
+class AVRO_DECL LogicalType {
+ public:
+ enum Type {
+ NONE,
+ DECIMAL,
+ DATE,
+ TIME_MILLIS,
+ TIME_MICROS,
+ TIMESTAMP_MILLIS,
+ TIMESTAMP_MICROS,
+ DURATION,
+ UUID
+ };
+
+ explicit LogicalType(Type type);
+
+ Type type() const;
+
+ // Precision and scale can only be set for the DECIMAL logical type.
+ // Precision must be positive and scale must be either positive or zero. The
+ // setters will throw an exception if they are called on any type other
+ // than DECIMAL.
+ void setPrecision(int precision);
+ int precision() const { return precision_; }
+ void setScale(int scale);
+ int scale() const { return scale_; }
+
+ void printJson(std::ostream& os) const;
+
+ private:
+ Type type_;
+ int precision_;
+ int scale_;
+};
+
+} // namespace avro
+
+#endif
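
The precision/scale contract in LogicalType.hh above (the setters are valid only for DECIMAL; precision must be positive, scale zero or positive) implies a small call pattern. A minimal sketch against just the declarations shown, not a tested program:

    #include "LogicalType.hh"

    // Build a decimal(10, 2) logical type. Calling either setter on a
    // non-DECIMAL instance throws, per the contract documented in the header.
    avro::LogicalType makeDecimal10_2() {
        avro::LogicalType lt(avro::LogicalType::DECIMAL);
        lt.setPrecision(10); // must be positive
        lt.setScale(2);      // must be zero or positive
        return lt;
    }
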
diff --git a/contrib/libs/apache/avro/api/Node.hh b/contrib/libs/apache/avro/api/Node.hh
index 205b06f00eb..b8bbf1d5730 100644
--- a/contrib/libs/apache/avro/api/Node.hh
+++ b/contrib/libs/apache/avro/api/Node.hh
@@ -1,211 +1,211 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Node_hh__
-#define avro_Node_hh__
-
-#include "Config.hh"
-
-#include <cassert>
-#include <memory>
-#include <boost/noncopyable.hpp>
-
-#include "Exception.hh"
-#include "LogicalType.hh"
-#include "Types.hh"
-#include "SchemaResolution.hh"
-
-namespace avro {
-
-class Node;
-class GenericDatum;
-
-typedef std::shared_ptr<Node> NodePtr;
-
-class AVRO_DECL Name {
- std::string ns_;
- std::string simpleName_;
-public:
- Name() { }
- Name(const std::string& fullname);
- Name(const std::string& simpleName, const std::string& ns) : ns_(ns), simpleName_(simpleName) { check(); }
-
- const std::string fullname() const;
- const std::string& ns() const { return ns_; }
- const std::string& simpleName() const { return simpleName_; }
-
- void ns(const std::string& n) { ns_ = n; }
- void simpleName(const std::string& n) { simpleName_ = n; }
- void fullname(const std::string& n);
-
- bool operator < (const Name& n) const;
- void check() const;
- bool operator == (const Name& n) const;
- bool operator != (const Name& n) const { return !((*this) == n); }
- void clear() {
- ns_.clear();
- simpleName_.clear();
- }
- operator std::string() const {
- return fullname();
- }
-};
-
-inline
-std::ostream& operator << (std::ostream& os, const Name& n) {
- return os << n.fullname();
-}
-
-/// Node is the building block for parse trees. Each node represents an avro
-/// type. Compound types have leaf nodes that represent the types they are
-/// composed of.
-///
-/// The user does not use the Node object directly, they interface with Schema
-/// objects.
-///
-/// The Node object uses reference-counted pointers. This is so that schemas
-/// may be reused in other schemas, without needing to worry about memory
-/// deallocation for nodes that are added to multiple schema parse trees.
-///
-/// Node has minimal implementation, serving as an abstract base class for
-/// different node types.
-///
-
-class AVRO_DECL Node : private boost::noncopyable
-{
- public:
-
- Node(Type type) :
- type_(type),
- logicalType_(LogicalType::NONE),
- locked_(false)
- {}
-
- virtual ~Node();
-
- Type type() const {
- return type_;
- }
-
- LogicalType logicalType() const {
- return logicalType_;
- }
-
- void setLogicalType(LogicalType logicalType);
-
- void lock() {
- locked_ = true;
- }
-
- bool locked() const {
- return locked_;
- }
-
- virtual bool hasName() const = 0;
-
- void setName(const Name &name) {
- checkLock();
- checkName(name);
- doSetName(name);
- }
- virtual const Name &name() const = 0;
-
- virtual const std::string &getDoc() const = 0;
- void setDoc(const std::string &doc) {
- checkLock();
- doSetDoc(doc);
- }
-
- void addLeaf(const NodePtr &newLeaf) {
- checkLock();
- doAddLeaf(newLeaf);
- }
- virtual size_t leaves() const = 0;
- virtual const NodePtr& leafAt(int index) const = 0;
- virtual const GenericDatum& defaultValueAt(int index) {
- throw Exception(boost::format("No default value at: %1%") % index);
- }
-
- void addName(const std::string &name) {
- checkLock();
- checkName(name);
- doAddName(name);
- }
- virtual size_t names() const = 0;
- virtual const std::string &nameAt(int index) const = 0;
- virtual bool nameIndex(const std::string &name, size_t &index) const = 0;
-
- void setFixedSize(int size) {
- checkLock();
- doSetFixedSize(size);
- }
- virtual int fixedSize() const = 0;
-
- virtual bool isValid() const = 0;
-
- virtual SchemaResolution resolve(const Node &reader) const = 0;
-
- virtual void printJson(std::ostream &os, int depth) const = 0;
-
- virtual void printBasicInfo(std::ostream &os) const = 0;
-
- virtual void setLeafToSymbolic(int index, const NodePtr &node) = 0;
-
- // Serialize the default value GenericDatum g for the node contained
- // in a record node.
- virtual void printDefaultToJson(const GenericDatum& g, std::ostream &os,
- int depth) const = 0;
-
- protected:
-
- void checkLock() const {
- if(locked()) {
- throw Exception("Cannot modify locked schema");
- }
- }
-
- virtual void checkName(const Name &name) const {
- name.check();
- }
-
- virtual void doSetName(const Name &name) = 0;
- virtual void doSetDoc(const std::string &name) = 0;
-
- virtual void doAddLeaf(const NodePtr &newLeaf) = 0;
- virtual void doAddName(const std::string &name) = 0;
- virtual void doSetFixedSize(int size) = 0;
-
- private:
-
- const Type type_;
- LogicalType logicalType_;
- bool locked_;
-};
-
-} // namespace avro
-
-namespace std {
-inline std::ostream& operator<<(std::ostream& os, const avro::Node& n)
-{
- n.printJson(os, 0);
- return os;
-}
-}
-
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Node_hh__
+#define avro_Node_hh__
+
+#include "Config.hh"
+
+#include <cassert>
+#include <memory>
+#include <boost/noncopyable.hpp>
+
+#include "Exception.hh"
+#include "LogicalType.hh"
+#include "Types.hh"
+#include "SchemaResolution.hh"
+
+namespace avro {
+
+class Node;
+class GenericDatum;
+
+typedef std::shared_ptr<Node> NodePtr;
+
+class AVRO_DECL Name {
+ std::string ns_;
+ std::string simpleName_;
+public:
+ Name() { }
+ Name(const std::string& fullname);
+ Name(const std::string& simpleName, const std::string& ns) : ns_(ns), simpleName_(simpleName) { check(); }
+
+ const std::string fullname() const;
+ const std::string& ns() const { return ns_; }
+ const std::string& simpleName() const { return simpleName_; }
+
+ void ns(const std::string& n) { ns_ = n; }
+ void simpleName(const std::string& n) { simpleName_ = n; }
+ void fullname(const std::string& n);
+
+ bool operator < (const Name& n) const;
+ void check() const;
+ bool operator == (const Name& n) const;
+ bool operator != (const Name& n) const { return !((*this) == n); }
+ void clear() {
+ ns_.clear();
+ simpleName_.clear();
+ }
+ operator std::string() const {
+ return fullname();
+ }
+};
+
+inline
+std::ostream& operator << (std::ostream& os, const Name& n) {
+ return os << n.fullname();
+}
+
+/// Node is the building block for parse trees. Each node represents an avro
+/// type. Compound types have leaf nodes that represent the types they are
+/// composed of.
+///
+/// Users do not use the Node object directly; they interface with Schema
+/// objects.
+///
+/// The Node object uses reference-counted pointers. This is so that schemas
+/// may be reused in other schemas, without needing to worry about memory
+/// deallocation for nodes that are added to multiple schema parse trees.
+///
+/// Node has a minimal implementation, serving as an abstract base class for
+/// different node types.
+///
+
+class AVRO_DECL Node : private boost::noncopyable
+{
+ public:
+
+ Node(Type type) :
+ type_(type),
+ logicalType_(LogicalType::NONE),
+ locked_(false)
+ {}
+
+ virtual ~Node();
+
+ Type type() const {
+ return type_;
+ }
+
+ LogicalType logicalType() const {
+ return logicalType_;
+ }
+
+ void setLogicalType(LogicalType logicalType);
+
+ void lock() {
+ locked_ = true;
+ }
+
+ bool locked() const {
+ return locked_;
+ }
+
+ virtual bool hasName() const = 0;
+
+ void setName(const Name &name) {
+ checkLock();
+ checkName(name);
+ doSetName(name);
+ }
+ virtual const Name &name() const = 0;
+
+ virtual const std::string &getDoc() const = 0;
+ void setDoc(const std::string &doc) {
+ checkLock();
+ doSetDoc(doc);
+ }
+
+ void addLeaf(const NodePtr &newLeaf) {
+ checkLock();
+ doAddLeaf(newLeaf);
+ }
+ virtual size_t leaves() const = 0;
+ virtual const NodePtr& leafAt(int index) const = 0;
+ virtual const GenericDatum& defaultValueAt(int index) {
+ throw Exception(boost::format("No default value at: %1%") % index);
+ }
+
+ void addName(const std::string &name) {
+ checkLock();
+ checkName(name);
+ doAddName(name);
+ }
+ virtual size_t names() const = 0;
+ virtual const std::string &nameAt(int index) const = 0;
+ virtual bool nameIndex(const std::string &name, size_t &index) const = 0;
+
+ void setFixedSize(int size) {
+ checkLock();
+ doSetFixedSize(size);
+ }
+ virtual int fixedSize() const = 0;
+
+ virtual bool isValid() const = 0;
+
+ virtual SchemaResolution resolve(const Node &reader) const = 0;
+
+ virtual void printJson(std::ostream &os, int depth) const = 0;
+
+ virtual void printBasicInfo(std::ostream &os) const = 0;
+
+ virtual void setLeafToSymbolic(int index, const NodePtr &node) = 0;
+
+ // Serialize the default value GenericDatum g for the node contained
+ // in a record node.
+ virtual void printDefaultToJson(const GenericDatum& g, std::ostream &os,
+ int depth) const = 0;
+
+ protected:
+
+ void checkLock() const {
+ if(locked()) {
+ throw Exception("Cannot modify locked schema");
+ }
+ }
+
+ virtual void checkName(const Name &name) const {
+ name.check();
+ }
+
+ virtual void doSetName(const Name &name) = 0;
+ virtual void doSetDoc(const std::string &name) = 0;
+
+ virtual void doAddLeaf(const NodePtr &newLeaf) = 0;
+ virtual void doAddName(const std::string &name) = 0;
+ virtual void doSetFixedSize(int size) = 0;
+
+ private:
+
+ const Type type_;
+ LogicalType logicalType_;
+ bool locked_;
+};
+
+} // namespace avro
+
+namespace std {
+inline std::ostream& operator<<(std::ostream& os, const avro::Node& n)
+{
+ n.printJson(os, 0);
+ return os;
+}
+}
+
+
+#endif
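
Name in Node.hh above pairs each accessor with a setter and streams as fullname(). A short sketch of the round trip, assuming the fullname constructor splits namespace and simple name on the last dot (its definition lives in the implementation file, so the exact parse here is an assumption):

    #include "Node.hh"
    #include <iostream>

    void nameDemo() {
        avro::Name n("com.example.MyRecord");
        std::cout << n.ns() << '\n';         // assumed: "com.example"
        std::cout << n.simpleName() << '\n'; // assumed: "MyRecord"
        std::cout << n << '\n';              // operator<< prints fullname()
    }
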
diff --git a/contrib/libs/apache/avro/api/NodeConcepts.hh b/contrib/libs/apache/avro/api/NodeConcepts.hh
index e914d925b60..36d4f413f1f 100644
--- a/contrib/libs/apache/avro/api/NodeConcepts.hh
+++ b/contrib/libs/apache/avro/api/NodeConcepts.hh
@@ -1,224 +1,224 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_NodeConcepts_hh__
-#define avro_NodeConcepts_hh__
-
-#include "Config.hh"
-
-#include <vector>
-#include <map>
-#include "Exception.hh"
-
-namespace avro {
-
-
-///
-/// The concept classes are used to simplify NodeImpl. Since different types
-/// of avro types carry different attributes, such as names, or field names for
-/// record members. Using the concept class of NoAttribute vs Attribute, the
-/// NodeImpl object can enable/disable the attribute, but the code is the same
-/// in either case.
-///
-/// Furthermore, attributes may have different types, for example, most
-/// attributes are strings, but fixed types have a size attribute, which is
-/// integer.
-///
-/// Since compound types are composed of other types, the leaf attribute
-/// concepts extend a NodeImpl to include leaf nodes, and attributes for leaf
-/// nodes, which are used to build parse trees.
-///
-///
-
-namespace concepts {
-
-template <typename Attribute>
-struct NoAttribute
-{
- static const bool hasAttribute = false;
-
- size_t size() const {
- return 0;
- }
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_NodeConcepts_hh__
+#define avro_NodeConcepts_hh__
+
+#include "Config.hh"
+
+#include <vector>
+#include <map>
+#include "Exception.hh"
+
+namespace avro {
+
+
+///
+/// The concept classes are used to simplify NodeImpl, since different avro
+/// types carry different attributes, such as names, or field names for
+/// record members. Using the concept class of NoAttribute vs Attribute, the
+/// NodeImpl object can enable/disable the attribute, but the code is the same
+/// in either case.
+///
+/// Furthermore, attributes may have different types, for example, most
+/// attributes are strings, but fixed types have a size attribute, which is
+/// integer.
+///
+/// Since compound types are composed of other types, the leaf attribute
+/// concepts extend a NodeImpl to include leaf nodes, and attributes for leaf
+/// nodes, which are used to build parse trees.
+///
+///
+
+namespace concepts {
+
+template <typename Attribute>
+struct NoAttribute
+{
+ static const bool hasAttribute = false;
+
+ size_t size() const {
+ return 0;
+ }
+
void add( const Attribute &) {
- // There must be an add function for the generic NodeImpl, but the
- // Node APIs ensure that it is never called, the throw here is
- // just in case
- throw Exception("This type does not have attribute");
- }
-
+        // There must be an add function for the generic NodeImpl, but the
+        // Node APIs ensure that it is never called; the throw here is
+        // just in case
+ throw Exception("This type does not have attribute");
+ }
+
const Attribute &get(size_t = 0) const {
- // There must be an get function for the generic NodeImpl, but the
- // Node APIs ensure that it is never called, the throw here is
- // just in case
- throw Exception("This type does not have attribute");
- // even though this code is unreachable the compiler requires it
- static const Attribute empty = Attribute();
- return empty;
- }
-
+        // There must be a get function for the generic NodeImpl, but the
+        // Node APIs ensure that it is never called; the throw here is
+        // just in case
+        throw Exception("This type does not have attribute");
+        // even though this code is unreachable, the compiler requires it
+ static const Attribute empty = Attribute();
+ return empty;
+ }
+
Attribute &get(size_t = 0) {
- // There must be an get function for the generic NodeImpl, but the
- // Node APIs ensure that it is never called, the throw here is
- // just in case
- throw Exception("This type does not have attribute");
- }
-
-};
-
-template<typename Attribute>
-struct SingleAttribute
-{
- static const bool hasAttribute = true;
-
- SingleAttribute() : attr_()
- { }
-
- SingleAttribute(const Attribute& a) : attr_(a) { }
- // copy constructing from another single attribute is allowed
- SingleAttribute(const SingleAttribute<Attribute> &rhs) :
- attr_(rhs.attr_)
- { }
-
- // copy constructing from a no attribute is allowed
+        // There must be a get function for the generic NodeImpl, but the
+        // Node APIs ensure that it is never called; the throw here is
+        // just in case
+ throw Exception("This type does not have attribute");
+ }
+
+};
+
+template<typename Attribute>
+struct SingleAttribute
+{
+ static const bool hasAttribute = true;
+
+ SingleAttribute() : attr_()
+ { }
+
+ SingleAttribute(const Attribute& a) : attr_(a) { }
+ // copy constructing from another single attribute is allowed
+ SingleAttribute(const SingleAttribute<Attribute> &rhs) :
+ attr_(rhs.attr_)
+ { }
+
+ // copy constructing from a no attribute is allowed
SingleAttribute(const NoAttribute<Attribute> &) :
- attr_()
- { }
-
- size_t size() const {
- return 1;
- }
-
- void add(const Attribute &attr) {
- attr_ = attr;
- }
-
- const Attribute &get(size_t index = 0) const {
- if (index != 0) {
- throw Exception("SingleAttribute has only 1 value");
- }
- return attr_;
- }
-
- Attribute &get(size_t index = 0) {
- if (index != 0) {
- throw Exception("SingleAttribute has only 1 value");
- }
- return attr_;
- }
-
-private:
- template<typename T> friend struct MultiAttribute;
- Attribute attr_;
-};
-
-template<typename Attribute>
-struct MultiAttribute
-{
- static const bool hasAttribute = true;
-
- MultiAttribute()
- { }
-
- // copy constructing from another single attribute is allowed, it
- // pushes the attribute
- MultiAttribute(const SingleAttribute<Attribute> &rhs)
- {
- // since map is the only type that does this we know it's
- // final size will be two, so reserve
- attrs_.reserve(2);
- attrs_.push_back(rhs.attr_);
- }
-
- MultiAttribute(const MultiAttribute<Attribute> &rhs) :
- attrs_(rhs.attrs_)
- { }
-
+ attr_()
+ { }
+
+ size_t size() const {
+ return 1;
+ }
+
+ void add(const Attribute &attr) {
+ attr_ = attr;
+ }
+
+ const Attribute &get(size_t index = 0) const {
+ if (index != 0) {
+ throw Exception("SingleAttribute has only 1 value");
+ }
+ return attr_;
+ }
+
+ Attribute &get(size_t index = 0) {
+ if (index != 0) {
+ throw Exception("SingleAttribute has only 1 value");
+ }
+ return attr_;
+ }
+
+private:
+ template<typename T> friend struct MultiAttribute;
+ Attribute attr_;
+};
+
+template<typename Attribute>
+struct MultiAttribute
+{
+ static const bool hasAttribute = true;
+
+ MultiAttribute()
+ { }
+
+    // copy constructing from another single attribute is allowed; it
+    // pushes the attribute
+    MultiAttribute(const SingleAttribute<Attribute> &rhs)
+    {
+        // since map is the only type that does this, we know its
+        // final size will be two, so reserve
+ attrs_.reserve(2);
+ attrs_.push_back(rhs.attr_);
+ }
+
+ MultiAttribute(const MultiAttribute<Attribute> &rhs) :
+ attrs_(rhs.attrs_)
+ { }
+
MultiAttribute(const NoAttribute<Attribute> &)
- {}
-
- size_t size() const {
- return attrs_.size();
- }
-
- void add(const Attribute &attr) {
- attrs_.push_back(attr);
- }
-
- const Attribute &get(size_t index = 0) const {
- return attrs_.at(index);
- }
-
- Attribute &get(size_t index) {
- return attrs_.at(index);
- }
-
- private:
-
- std::vector<Attribute> attrs_;
-};
-
-
-template<typename T>
-struct NameIndexConcept {
-
+ {}
+
+ size_t size() const {
+ return attrs_.size();
+ }
+
+ void add(const Attribute &attr) {
+ attrs_.push_back(attr);
+ }
+
+ const Attribute &get(size_t index = 0) const {
+ return attrs_.at(index);
+ }
+
+ Attribute &get(size_t index) {
+ return attrs_.at(index);
+ }
+
+ private:
+
+ std::vector<Attribute> attrs_;
+};
+
+
+template<typename T>
+struct NameIndexConcept {
+
bool lookup(const std::string &, size_t &) const {
- throw Exception("Name index does not exist");
- return 0;
- }
-
+ throw Exception("Name index does not exist");
+ return 0;
+ }
+
bool add(const::std::string &, size_t) {
- throw Exception("Name index does not exist");
- return false;
- }
-};
-
-template<>
-struct NameIndexConcept < MultiAttribute<std::string> >
-{
- typedef std::map<std::string, size_t> IndexMap;
-
- bool lookup(const std::string &name, size_t &index) const {
- IndexMap::const_iterator iter = map_.find(name);
- if(iter == map_.end()) {
- return false;
- }
- index = iter->second;
- return true;
- }
-
- bool add(const::std::string &name, size_t index) {
- bool added = false;
- IndexMap::iterator lb = map_.lower_bound(name);
- if(lb == map_.end() || map_.key_comp()(name, lb->first)) {
- map_.insert(lb, IndexMap::value_type(name, index));
- added = true;
- }
- return added;
- }
-
- private:
-
- IndexMap map_;
-};
-
-} // namespace concepts
-} // namespace avro
-
-#endif
+ throw Exception("Name index does not exist");
+ return false;
+ }
+};
+
+template<>
+struct NameIndexConcept < MultiAttribute<std::string> >
+{
+ typedef std::map<std::string, size_t> IndexMap;
+
+ bool lookup(const std::string &name, size_t &index) const {
+ IndexMap::const_iterator iter = map_.find(name);
+ if(iter == map_.end()) {
+ return false;
+ }
+ index = iter->second;
+ return true;
+ }
+
+ bool add(const::std::string &name, size_t index) {
+ bool added = false;
+ IndexMap::iterator lb = map_.lower_bound(name);
+ if(lb == map_.end() || map_.key_comp()(name, lb->first)) {
+ map_.insert(lb, IndexMap::value_type(name, index));
+ added = true;
+ }
+ return added;
+ }
+
+ private:
+
+ IndexMap map_;
+};
+
+} // namespace concepts
+} // namespace avro
+
+#endif
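
The three attribute concepts in NodeConcepts.hh above expose one size()/add()/get() surface so NodeImpl can treat any arity uniformly. A minimal sketch using only what the header declares:

    #include "NodeConcepts.hh"
    #include <string>

    void conceptsDemo() {
        avro::concepts::SingleAttribute<int> one;
        one.add(42);            // overwrites the single stored value
        int v = one.get();      // get(i) throws for any i != 0

        avro::concepts::MultiAttribute<std::string> many;
        many.add("first");
        many.add("second");     // many.size() is now 2

        // avro::concepts::NoAttribute<int> none;
        // none.add(1);         // would throw "This type does not have attribute"
        (void)v;
    }
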
diff --git a/contrib/libs/apache/avro/api/NodeImpl.hh b/contrib/libs/apache/avro/api/NodeImpl.hh
index debce720a6c..f0f1cd9def8 100644
--- a/contrib/libs/apache/avro/api/NodeImpl.hh
+++ b/contrib/libs/apache/avro/api/NodeImpl.hh
@@ -1,619 +1,619 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_NodeImpl_hh__
-#define avro_NodeImpl_hh__
-
-#include "Config.hh"
-#include "GenericDatum.hh"
-
-#include <limits>
-#include <set>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-#include <memory>
-
-#include "Node.hh"
-#include "NodeConcepts.hh"
-
-namespace avro {
-
-/// Implementation details for Node. NodeImpl represents all the avro types,
-/// whose properties are enabled and disabled by selecting concept classes.
-
-template
-<
- class NameConcept,
- class LeavesConcept,
- class LeafNamesConcept,
- class SizeConcept
->
-class NodeImpl : public Node
-{
-
- protected:
-
- NodeImpl(Type type) :
- Node(type),
- nameAttribute_(),
- docAttribute_(),
- leafAttributes_(),
- leafNameAttributes_(),
- sizeAttribute_()
- { }
-
- NodeImpl(Type type,
- const NameConcept &name,
- const LeavesConcept &leaves,
- const LeafNamesConcept &leafNames,
- const SizeConcept &size) :
- Node(type),
- nameAttribute_(name),
- docAttribute_(),
- leafAttributes_(leaves),
- leafNameAttributes_(leafNames),
- sizeAttribute_(size)
- { }
-
- // Ctor with "doc"
- NodeImpl(Type type,
- const NameConcept &name,
- const concepts::SingleAttribute<std::string> &doc,
- const LeavesConcept &leaves,
- const LeafNamesConcept &leafNames,
- const SizeConcept &size) :
- Node(type),
- nameAttribute_(name),
- docAttribute_(doc),
- leafAttributes_(leaves),
- leafNameAttributes_(leafNames),
- sizeAttribute_(size)
- {}
-
- void swap(NodeImpl& impl) {
- std::swap(nameAttribute_, impl.nameAttribute_);
- std::swap(docAttribute_, impl.docAttribute_);
- std::swap(leafAttributes_, impl.leafAttributes_);
- std::swap(leafNameAttributes_, impl.leafNameAttributes_);
- std::swap(sizeAttribute_, impl.sizeAttribute_);
- std::swap(nameIndex_, impl.nameIndex_);
- }
-
- bool hasName() const {
- // e.g.: true for single and multiattributes, false for noattributes.
- return NameConcept::hasAttribute;
- }
-
- void doSetName(const Name &name) {
- nameAttribute_.add(name);
- }
-
- const Name &name() const {
- return nameAttribute_.get();
- }
-
- void doSetDoc(const std::string &doc) {
- docAttribute_.add(doc);
- }
-
- const std::string &getDoc() const {
- return docAttribute_.get();
- }
-
- void doAddLeaf(const NodePtr &newLeaf) {
- leafAttributes_.add(newLeaf);
- }
-
- size_t leaves() const {
- return leafAttributes_.size();
- }
-
- const NodePtr &leafAt(int index) const {
- return leafAttributes_.get(index);
- }
-
- void doAddName(const std::string &name) {
- if (! nameIndex_.add(name, leafNameAttributes_.size())) {
- throw Exception(boost::format("Cannot add duplicate name: %1%") % name);
- }
- leafNameAttributes_.add(name);
- }
-
- size_t names() const {
- return leafNameAttributes_.size();
- }
-
- const std::string &nameAt(int index) const {
- return leafNameAttributes_.get(index);
- }
-
- bool nameIndex(const std::string &name, size_t &index) const {
- return nameIndex_.lookup(name, index);
- }
-
- void doSetFixedSize(int size) {
- sizeAttribute_.add(size);
- }
-
- int fixedSize() const {
- return sizeAttribute_.get();
- }
-
- virtual bool isValid() const = 0;
-
- void printBasicInfo(std::ostream &os) const;
-
- void setLeafToSymbolic(int index, const NodePtr &node);
-
- SchemaResolution furtherResolution(const Node &reader) const {
- SchemaResolution match = RESOLVE_NO_MATCH;
-
- if (reader.type() == AVRO_SYMBOLIC) {
-
- // resolve the symbolic type, and check again
- const NodePtr &node = reader.leafAt(0);
- match = resolve(*node);
- }
- else if(reader.type() == AVRO_UNION) {
-
- // in this case, need to see if there is an exact match for the
- // writer's type, or if not, the first one that can be promoted to a
- // match
-
- for(size_t i= 0; i < reader.leaves(); ++i) {
-
- const NodePtr &node = reader.leafAt(i);
- SchemaResolution thisMatch = resolve(*node);
-
- // if matched then the search is done
- if(thisMatch == RESOLVE_MATCH) {
- match = thisMatch;
- break;
- }
-
- // thisMatch is either no match, or promotable, this will set match to
- // promotable if it hasn't been set already
- if (match == RESOLVE_NO_MATCH) {
- match = thisMatch;
- }
- }
- }
-
- return match;
- }
-
- NameConcept nameAttribute_;
-
- // Rem: NameConcept type is HasName (= SingleAttribute<Name>), we use std::string instead
- concepts::SingleAttribute<std::string> docAttribute_; /** Doc used to compare schemas */
-
- LeavesConcept leafAttributes_;
- LeafNamesConcept leafNameAttributes_;
- SizeConcept sizeAttribute_;
- concepts::NameIndexConcept<LeafNamesConcept> nameIndex_;
-};
-
-typedef concepts::NoAttribute<Name> NoName;
-typedef concepts::SingleAttribute<Name> HasName;
-
-typedef concepts::SingleAttribute<std::string> HasDoc;
-
-typedef concepts::NoAttribute<NodePtr> NoLeaves;
-typedef concepts::SingleAttribute<NodePtr> SingleLeaf;
-typedef concepts::MultiAttribute<NodePtr> MultiLeaves;
-
-typedef concepts::NoAttribute<std::string> NoLeafNames;
-typedef concepts::MultiAttribute<std::string> LeafNames;
-
-typedef concepts::NoAttribute<int> NoSize;
-typedef concepts::SingleAttribute<int> HasSize;
-
-typedef NodeImpl< NoName, NoLeaves, NoLeafNames, NoSize > NodeImplPrimitive;
-typedef NodeImpl< HasName, NoLeaves, NoLeafNames, NoSize > NodeImplSymbolic;
-
-typedef NodeImpl< HasName, MultiLeaves, LeafNames, NoSize > NodeImplRecord;
-typedef NodeImpl< HasName, NoLeaves, LeafNames, NoSize > NodeImplEnum;
-typedef NodeImpl< NoName, SingleLeaf, NoLeafNames, NoSize > NodeImplArray;
-typedef NodeImpl< NoName, MultiLeaves, NoLeafNames, NoSize > NodeImplMap;
-typedef NodeImpl< NoName, MultiLeaves, NoLeafNames, NoSize > NodeImplUnion;
-typedef NodeImpl< HasName, NoLeaves, NoLeafNames, HasSize > NodeImplFixed;
-
-class AVRO_DECL NodePrimitive : public NodeImplPrimitive
-{
- public:
-
- explicit NodePrimitive(Type type) :
- NodeImplPrimitive(type)
- { }
-
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- return true;
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-};
-
-class AVRO_DECL NodeSymbolic : public NodeImplSymbolic
-{
- typedef std::weak_ptr<Node> NodeWeakPtr;
-
- public:
-
- NodeSymbolic() :
- NodeImplSymbolic(AVRO_SYMBOLIC)
- { }
-
- explicit NodeSymbolic(const HasName &name) :
- NodeImplSymbolic(AVRO_SYMBOLIC, name, NoLeaves(), NoLeafNames(), NoSize())
- { }
-
- NodeSymbolic(const HasName &name, const NodePtr n) :
- NodeImplSymbolic(AVRO_SYMBOLIC, name, NoLeaves(), NoLeafNames(), NoSize()), actualNode_(n)
- { }
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- return (nameAttribute_.size() == 1);
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-
- bool isSet() const {
- return (actualNode_.lock() != 0);
- }
-
- NodePtr getNode() const {
- NodePtr node = actualNode_.lock();
- if(!node) {
- throw Exception(boost::format("Could not follow symbol %1%") % name());
- }
- return node;
- }
-
- void setNode(const NodePtr &node) {
- actualNode_ = node;
- }
-
- protected:
-
- NodeWeakPtr actualNode_;
-
-};
-
-class AVRO_DECL NodeRecord : public NodeImplRecord {
- std::vector<GenericDatum> defaultValues;
-public:
- NodeRecord() : NodeImplRecord(AVRO_RECORD) { }
- NodeRecord(const HasName &name, const MultiLeaves &fields,
- const LeafNames &fieldsNames,
- const std::vector<GenericDatum>& dv) :
- NodeImplRecord(AVRO_RECORD, name, fields, fieldsNames, NoSize()),
- defaultValues(dv) {
- for (size_t i = 0; i < leafNameAttributes_.size(); ++i) {
- if (!nameIndex_.add(leafNameAttributes_.get(i), i)) {
- throw Exception(boost::format(
- "Cannot add duplicate field: %1%") %
- leafNameAttributes_.get(i));
- }
- }
- }
-
- NodeRecord(const HasName &name, const HasDoc &doc, const MultiLeaves &fields,
- const LeafNames &fieldsNames,
- const std::vector<GenericDatum> &dv) :
- NodeImplRecord(AVRO_RECORD, name, doc, fields, fieldsNames, NoSize()),
- defaultValues(dv) {
- for (size_t i = 0; i < leafNameAttributes_.size(); ++i) {
- if (!nameIndex_.add(leafNameAttributes_.get(i), i)) {
- throw Exception(boost::format(
- "Cannot add duplicate field: %1%") %
- leafNameAttributes_.get(i));
- }
- }
- }
-
- void swap(NodeRecord& r) {
- NodeImplRecord::swap(r);
- defaultValues.swap(r.defaultValues);
- }
-
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- return ((nameAttribute_.size() == 1) &&
- (leafAttributes_.size() == leafNameAttributes_.size()));
- }
-
- const GenericDatum& defaultValueAt(int index) {
- return defaultValues[index];
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-};
-
-class AVRO_DECL NodeEnum : public NodeImplEnum
-{
- public:
-
- NodeEnum() :
- NodeImplEnum(AVRO_ENUM)
- { }
-
- NodeEnum(const HasName &name, const LeafNames &symbols) :
- NodeImplEnum(AVRO_ENUM, name, NoLeaves(), symbols, NoSize())
- {
- for(size_t i=0; i < leafNameAttributes_.size(); ++i) {
- if(!nameIndex_.add(leafNameAttributes_.get(i), i)) {
- throw Exception(boost::format("Cannot add duplicate enum: %1%") % leafNameAttributes_.get(i));
- }
- }
- }
-
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- return (
- (nameAttribute_.size() == 1) &&
- (leafNameAttributes_.size() > 0)
- );
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-};
-
-class AVRO_DECL NodeArray : public NodeImplArray
-{
- public:
-
- NodeArray() :
- NodeImplArray(AVRO_ARRAY)
- { }
-
- explicit NodeArray(const SingleLeaf &items) :
- NodeImplArray(AVRO_ARRAY, NoName(), items, NoLeafNames(), NoSize())
- { }
-
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- return (leafAttributes_.size() == 1);
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-};
-
-class AVRO_DECL NodeMap : public NodeImplMap
-{
- public:
-
- NodeMap() :
- NodeImplMap(AVRO_MAP)
- {
- NodePtr key(new NodePrimitive(AVRO_STRING));
- doAddLeaf(key);
- }
-
- explicit NodeMap(const SingleLeaf &values) :
- NodeImplMap(AVRO_MAP, NoName(), values, NoLeafNames(), NoSize())
- {
- // need to add the key for the map too
- NodePtr key(new NodePrimitive(AVRO_STRING));
- doAddLeaf(key);
-
- // key goes before value
- std::swap(leafAttributes_.get(0), leafAttributes_.get(1));
- }
-
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- return (leafAttributes_.size() == 2);
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-};
-
-class AVRO_DECL NodeUnion : public NodeImplUnion
-{
- public:
-
- NodeUnion() :
- NodeImplUnion(AVRO_UNION)
- { }
-
- explicit NodeUnion(const MultiLeaves &types) :
- NodeImplUnion(AVRO_UNION, NoName(), types, NoLeafNames(), NoSize())
- { }
-
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- std::set<std::string> seen;
- if (leafAttributes_.size() >= 1) {
- for (size_t i = 0; i < leafAttributes_.size(); ++i) {
- std::string name;
- const NodePtr& n = leafAttributes_.get(i);
- switch (n->type()) {
- case AVRO_STRING:
- name = "string";
- break;
- case AVRO_BYTES:
- name = "bytes";
- break;
- case AVRO_INT:
- name = "int";
- break;
- case AVRO_LONG:
- name = "long";
- break;
- case AVRO_FLOAT:
- name = "float";
- break;
- case AVRO_DOUBLE:
- name = "double";
- break;
- case AVRO_BOOL:
- name = "bool";
- break;
- case AVRO_NULL:
- name = "null";
- break;
- case AVRO_ARRAY:
- name = "array";
- break;
- case AVRO_MAP:
- name = "map";
- break;
- case AVRO_RECORD:
- case AVRO_ENUM:
- case AVRO_UNION:
- case AVRO_FIXED:
- case AVRO_SYMBOLIC:
- name = n->name().fullname();
- break;
- default:
- return false;
- }
- if (seen.find(name) != seen.end()) {
- return false;
- }
- seen.insert(name);
- }
- return true;
- }
- return false;
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-};
-
-class AVRO_DECL NodeFixed : public NodeImplFixed
-{
- public:
-
- NodeFixed() :
- NodeImplFixed(AVRO_FIXED)
- { }
-
- NodeFixed(const HasName &name, const HasSize &size) :
- NodeImplFixed(AVRO_FIXED, name, NoLeaves(), NoLeafNames(), size)
- { }
-
- SchemaResolution resolve(const Node &reader) const;
-
- void printJson(std::ostream &os, int depth) const;
-
- bool isValid() const {
- return (
- (nameAttribute_.size() == 1) &&
- (sizeAttribute_.size() == 1)
- );
- }
-
- void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
-};
-
-template < class A, class B, class C, class D >
-inline void
-NodeImpl<A,B,C,D>::setLeafToSymbolic(int index, const NodePtr &node)
-{
- if(!B::hasAttribute) {
- throw Exception("Cannot change leaf node for nonexistent leaf");
- }
-
- NodePtr &replaceNode = const_cast<NodePtr &>(leafAttributes_.get(index));
- if(replaceNode->name() != node->name()) {
- throw Exception("Symbolic name does not match the name of the schema it references");
- }
-
- NodePtr symbol(new NodeSymbolic);
- NodeSymbolic *ptr = static_cast<NodeSymbolic *> (symbol.get());
-
- ptr->setName(node->name());
- ptr->setNode(node);
- replaceNode.swap(symbol);
-}
-
-template < class A, class B, class C, class D >
-inline void
-NodeImpl<A,B,C,D>::printBasicInfo(std::ostream &os) const
-{
- os << type();
- if(hasName()) {
- os << ' ' << nameAttribute_.get();
- }
-
- if(D::hasAttribute) {
- os << " " << sizeAttribute_.get();
- }
- os << '\n';
- int count = leaves();
- count = count ? count : names();
- for(int i= 0; i < count; ++i) {
- if( C::hasAttribute ) {
- os << "name " << nameAt(i) << '\n';
- }
- if( type() != AVRO_SYMBOLIC && leafAttributes_.hasAttribute) {
- leafAt(i)->printBasicInfo(os);
- }
- }
- if(isCompound(type())) {
- os << "end " << type() << '\n';
- }
-}
-
-
-inline NodePtr resolveSymbol(const NodePtr &node)
-{
- if(node->type() != AVRO_SYMBOLIC) {
- throw Exception("Only symbolic nodes may be resolved");
- }
- std::shared_ptr<NodeSymbolic> symNode = std::static_pointer_cast<NodeSymbolic>(node);
- return symNode->getNode();
-}
-
-template< typename T >
-inline std::string intToHex(T i)
-{
- std::stringstream stream;
- stream << "\\u"
- << std::setfill('0') << std::setw(sizeof(T))
- << std::hex << i;
- return stream.str();
-}
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_NodeImpl_hh__
+#define avro_NodeImpl_hh__
+
+#include "Config.hh"
+#include "GenericDatum.hh"
+
+#include <limits>
+#include <set>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <memory>
+
+#include "Node.hh"
+#include "NodeConcepts.hh"
+
+namespace avro {
+
+/// Implementation details for Node. NodeImpl represents all the avro types,
+/// whose properties are enabled and disabled by selecting concept classes.
+
+template
+<
+ class NameConcept,
+ class LeavesConcept,
+ class LeafNamesConcept,
+ class SizeConcept
+>
+class NodeImpl : public Node
+{
+
+ protected:
+
+ NodeImpl(Type type) :
+ Node(type),
+ nameAttribute_(),
+ docAttribute_(),
+ leafAttributes_(),
+ leafNameAttributes_(),
+ sizeAttribute_()
+ { }
+
+ NodeImpl(Type type,
+ const NameConcept &name,
+ const LeavesConcept &leaves,
+ const LeafNamesConcept &leafNames,
+ const SizeConcept &size) :
+ Node(type),
+ nameAttribute_(name),
+ docAttribute_(),
+ leafAttributes_(leaves),
+ leafNameAttributes_(leafNames),
+ sizeAttribute_(size)
+ { }
+
+ // Ctor with "doc"
+ NodeImpl(Type type,
+ const NameConcept &name,
+ const concepts::SingleAttribute<std::string> &doc,
+ const LeavesConcept &leaves,
+ const LeafNamesConcept &leafNames,
+ const SizeConcept &size) :
+ Node(type),
+ nameAttribute_(name),
+ docAttribute_(doc),
+ leafAttributes_(leaves),
+ leafNameAttributes_(leafNames),
+ sizeAttribute_(size)
+ {}
+
+ void swap(NodeImpl& impl) {
+ std::swap(nameAttribute_, impl.nameAttribute_);
+ std::swap(docAttribute_, impl.docAttribute_);
+ std::swap(leafAttributes_, impl.leafAttributes_);
+ std::swap(leafNameAttributes_, impl.leafNameAttributes_);
+ std::swap(sizeAttribute_, impl.sizeAttribute_);
+ std::swap(nameIndex_, impl.nameIndex_);
+ }
+
+ bool hasName() const {
+ // e.g.: true for single and multiattributes, false for noattributes.
+ return NameConcept::hasAttribute;
+ }
+
+ void doSetName(const Name &name) {
+ nameAttribute_.add(name);
+ }
+
+ const Name &name() const {
+ return nameAttribute_.get();
+ }
+
+ void doSetDoc(const std::string &doc) {
+ docAttribute_.add(doc);
+ }
+
+ const std::string &getDoc() const {
+ return docAttribute_.get();
+ }
+
+ void doAddLeaf(const NodePtr &newLeaf) {
+ leafAttributes_.add(newLeaf);
+ }
+
+ size_t leaves() const {
+ return leafAttributes_.size();
+ }
+
+ const NodePtr &leafAt(int index) const {
+ return leafAttributes_.get(index);
+ }
+
+ void doAddName(const std::string &name) {
+ if (! nameIndex_.add(name, leafNameAttributes_.size())) {
+ throw Exception(boost::format("Cannot add duplicate name: %1%") % name);
+ }
+ leafNameAttributes_.add(name);
+ }
+
+ size_t names() const {
+ return leafNameAttributes_.size();
+ }
+
+ const std::string &nameAt(int index) const {
+ return leafNameAttributes_.get(index);
+ }
+
+ bool nameIndex(const std::string &name, size_t &index) const {
+ return nameIndex_.lookup(name, index);
+ }
+
+ void doSetFixedSize(int size) {
+ sizeAttribute_.add(size);
+ }
+
+ int fixedSize() const {
+ return sizeAttribute_.get();
+ }
+
+ virtual bool isValid() const = 0;
+
+ void printBasicInfo(std::ostream &os) const;
+
+ void setLeafToSymbolic(int index, const NodePtr &node);
+
+ SchemaResolution furtherResolution(const Node &reader) const {
+ SchemaResolution match = RESOLVE_NO_MATCH;
+
+ if (reader.type() == AVRO_SYMBOLIC) {
+
+ // resolve the symbolic type, and check again
+ const NodePtr &node = reader.leafAt(0);
+ match = resolve(*node);
+ }
+ else if(reader.type() == AVRO_UNION) {
+
+ // in this case, need to see if there is an exact match for the
+ // writer's type, or if not, the first one that can be promoted to a
+ // match
+
+ for(size_t i= 0; i < reader.leaves(); ++i) {
+
+ const NodePtr &node = reader.leafAt(i);
+ SchemaResolution thisMatch = resolve(*node);
+
+ // if matched then the search is done
+ if(thisMatch == RESOLVE_MATCH) {
+ match = thisMatch;
+ break;
+ }
+
+                // thisMatch is either no match or promotable; this will set match to
+                // promotable if it hasn't been set already
+ if (match == RESOLVE_NO_MATCH) {
+ match = thisMatch;
+ }
+ }
+ }
+
+ return match;
+ }
+
+ NameConcept nameAttribute_;
+
+    // Rem: NameConcept type is HasName (= SingleAttribute<Name>); for doc we use std::string instead
+ concepts::SingleAttribute<std::string> docAttribute_; /** Doc used to compare schemas */
+
+ LeavesConcept leafAttributes_;
+ LeafNamesConcept leafNameAttributes_;
+ SizeConcept sizeAttribute_;
+ concepts::NameIndexConcept<LeafNamesConcept> nameIndex_;
+};
+
+typedef concepts::NoAttribute<Name> NoName;
+typedef concepts::SingleAttribute<Name> HasName;
+
+typedef concepts::SingleAttribute<std::string> HasDoc;
+
+typedef concepts::NoAttribute<NodePtr> NoLeaves;
+typedef concepts::SingleAttribute<NodePtr> SingleLeaf;
+typedef concepts::MultiAttribute<NodePtr> MultiLeaves;
+
+typedef concepts::NoAttribute<std::string> NoLeafNames;
+typedef concepts::MultiAttribute<std::string> LeafNames;
+
+typedef concepts::NoAttribute<int> NoSize;
+typedef concepts::SingleAttribute<int> HasSize;
+
+typedef NodeImpl< NoName, NoLeaves, NoLeafNames, NoSize > NodeImplPrimitive;
+typedef NodeImpl< HasName, NoLeaves, NoLeafNames, NoSize > NodeImplSymbolic;
+
+typedef NodeImpl< HasName, MultiLeaves, LeafNames, NoSize > NodeImplRecord;
+typedef NodeImpl< HasName, NoLeaves, LeafNames, NoSize > NodeImplEnum;
+typedef NodeImpl< NoName, SingleLeaf, NoLeafNames, NoSize > NodeImplArray;
+typedef NodeImpl< NoName, MultiLeaves, NoLeafNames, NoSize > NodeImplMap;
+typedef NodeImpl< NoName, MultiLeaves, NoLeafNames, NoSize > NodeImplUnion;
+typedef NodeImpl< HasName, NoLeaves, NoLeafNames, HasSize > NodeImplFixed;
+
+class AVRO_DECL NodePrimitive : public NodeImplPrimitive
+{
+ public:
+
+ explicit NodePrimitive(Type type) :
+ NodeImplPrimitive(type)
+ { }
+
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ return true;
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+};
+
+class AVRO_DECL NodeSymbolic : public NodeImplSymbolic
+{
+ typedef std::weak_ptr<Node> NodeWeakPtr;
+
+ public:
+
+ NodeSymbolic() :
+ NodeImplSymbolic(AVRO_SYMBOLIC)
+ { }
+
+ explicit NodeSymbolic(const HasName &name) :
+ NodeImplSymbolic(AVRO_SYMBOLIC, name, NoLeaves(), NoLeafNames(), NoSize())
+ { }
+
+ NodeSymbolic(const HasName &name, const NodePtr n) :
+ NodeImplSymbolic(AVRO_SYMBOLIC, name, NoLeaves(), NoLeafNames(), NoSize()), actualNode_(n)
+ { }
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ return (nameAttribute_.size() == 1);
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+
+ bool isSet() const {
+ return (actualNode_.lock() != 0);
+ }
+
+ NodePtr getNode() const {
+ NodePtr node = actualNode_.lock();
+ if(!node) {
+ throw Exception(boost::format("Could not follow symbol %1%") % name());
+ }
+ return node;
+ }
+
+ void setNode(const NodePtr &node) {
+ actualNode_ = node;
+ }
+
+ protected:
+
+ NodeWeakPtr actualNode_;
+
+};
+
+class AVRO_DECL NodeRecord : public NodeImplRecord {
+ std::vector<GenericDatum> defaultValues;
+public:
+ NodeRecord() : NodeImplRecord(AVRO_RECORD) { }
+ NodeRecord(const HasName &name, const MultiLeaves &fields,
+ const LeafNames &fieldsNames,
+ const std::vector<GenericDatum>& dv) :
+ NodeImplRecord(AVRO_RECORD, name, fields, fieldsNames, NoSize()),
+ defaultValues(dv) {
+ for (size_t i = 0; i < leafNameAttributes_.size(); ++i) {
+ if (!nameIndex_.add(leafNameAttributes_.get(i), i)) {
+ throw Exception(boost::format(
+ "Cannot add duplicate field: %1%") %
+ leafNameAttributes_.get(i));
+ }
+ }
+ }
+
+ NodeRecord(const HasName &name, const HasDoc &doc, const MultiLeaves &fields,
+ const LeafNames &fieldsNames,
+ const std::vector<GenericDatum> &dv) :
+ NodeImplRecord(AVRO_RECORD, name, doc, fields, fieldsNames, NoSize()),
+ defaultValues(dv) {
+ for (size_t i = 0; i < leafNameAttributes_.size(); ++i) {
+ if (!nameIndex_.add(leafNameAttributes_.get(i), i)) {
+ throw Exception(boost::format(
+ "Cannot add duplicate field: %1%") %
+ leafNameAttributes_.get(i));
+ }
+ }
+ }
+
+ void swap(NodeRecord& r) {
+ NodeImplRecord::swap(r);
+ defaultValues.swap(r.defaultValues);
+ }
+
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ return ((nameAttribute_.size() == 1) &&
+ (leafAttributes_.size() == leafNameAttributes_.size()));
+ }
+
+ const GenericDatum& defaultValueAt(int index) {
+ return defaultValues[index];
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+};
+
+class AVRO_DECL NodeEnum : public NodeImplEnum
+{
+ public:
+
+ NodeEnum() :
+ NodeImplEnum(AVRO_ENUM)
+ { }
+
+ NodeEnum(const HasName &name, const LeafNames &symbols) :
+ NodeImplEnum(AVRO_ENUM, name, NoLeaves(), symbols, NoSize())
+ {
+ for(size_t i=0; i < leafNameAttributes_.size(); ++i) {
+ if(!nameIndex_.add(leafNameAttributes_.get(i), i)) {
+ throw Exception(boost::format("Cannot add duplicate enum: %1%") % leafNameAttributes_.get(i));
+ }
+ }
+ }
+
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ return (
+ (nameAttribute_.size() == 1) &&
+ (leafNameAttributes_.size() > 0)
+ );
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+};
+
+class AVRO_DECL NodeArray : public NodeImplArray
+{
+ public:
+
+ NodeArray() :
+ NodeImplArray(AVRO_ARRAY)
+ { }
+
+ explicit NodeArray(const SingleLeaf &items) :
+ NodeImplArray(AVRO_ARRAY, NoName(), items, NoLeafNames(), NoSize())
+ { }
+
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ return (leafAttributes_.size() == 1);
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+};
+
+class AVRO_DECL NodeMap : public NodeImplMap
+{
+ public:
+
+ NodeMap() :
+ NodeImplMap(AVRO_MAP)
+ {
+ NodePtr key(new NodePrimitive(AVRO_STRING));
+ doAddLeaf(key);
+ }
+
+ explicit NodeMap(const SingleLeaf &values) :
+ NodeImplMap(AVRO_MAP, NoName(), values, NoLeafNames(), NoSize())
+ {
+ // need to add the key for the map too
+ NodePtr key(new NodePrimitive(AVRO_STRING));
+ doAddLeaf(key);
+
+ // key goes before value
+ std::swap(leafAttributes_.get(0), leafAttributes_.get(1));
+ }
+
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ return (leafAttributes_.size() == 2);
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+};
+
+class AVRO_DECL NodeUnion : public NodeImplUnion
+{
+ public:
+
+ NodeUnion() :
+ NodeImplUnion(AVRO_UNION)
+ { }
+
+ explicit NodeUnion(const MultiLeaves &types) :
+ NodeImplUnion(AVRO_UNION, NoName(), types, NoLeafNames(), NoSize())
+ { }
+
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ std::set<std::string> seen;
+ if (leafAttributes_.size() >= 1) {
+ for (size_t i = 0; i < leafAttributes_.size(); ++i) {
+ std::string name;
+ const NodePtr& n = leafAttributes_.get(i);
+ switch (n->type()) {
+ case AVRO_STRING:
+ name = "string";
+ break;
+ case AVRO_BYTES:
+ name = "bytes";
+ break;
+ case AVRO_INT:
+ name = "int";
+ break;
+ case AVRO_LONG:
+ name = "long";
+ break;
+ case AVRO_FLOAT:
+ name = "float";
+ break;
+ case AVRO_DOUBLE:
+ name = "double";
+ break;
+ case AVRO_BOOL:
+ name = "bool";
+ break;
+ case AVRO_NULL:
+ name = "null";
+ break;
+ case AVRO_ARRAY:
+ name = "array";
+ break;
+ case AVRO_MAP:
+ name = "map";
+ break;
+ case AVRO_RECORD:
+ case AVRO_ENUM:
+ case AVRO_UNION:
+ case AVRO_FIXED:
+ case AVRO_SYMBOLIC:
+ name = n->name().fullname();
+ break;
+ default:
+ return false;
+ }
+ if (seen.find(name) != seen.end()) {
+ return false;
+ }
+ seen.insert(name);
+ }
+ return true;
+ }
+ return false;
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+};
+
+class AVRO_DECL NodeFixed : public NodeImplFixed
+{
+ public:
+
+ NodeFixed() :
+ NodeImplFixed(AVRO_FIXED)
+ { }
+
+ NodeFixed(const HasName &name, const HasSize &size) :
+ NodeImplFixed(AVRO_FIXED, name, NoLeaves(), NoLeafNames(), size)
+ { }
+
+ SchemaResolution resolve(const Node &reader) const;
+
+ void printJson(std::ostream &os, int depth) const;
+
+ bool isValid() const {
+ return (
+ (nameAttribute_.size() == 1) &&
+ (sizeAttribute_.size() == 1)
+ );
+ }
+
+ void printDefaultToJson(const GenericDatum& g, std::ostream &os, int depth) const;
+};
+
+template < class A, class B, class C, class D >
+inline void
+NodeImpl<A,B,C,D>::setLeafToSymbolic(int index, const NodePtr &node)
+{
+ if(!B::hasAttribute) {
+ throw Exception("Cannot change leaf node for nonexistent leaf");
+ }
+
+ NodePtr &replaceNode = const_cast<NodePtr &>(leafAttributes_.get(index));
+ if(replaceNode->name() != node->name()) {
+ throw Exception("Symbolic name does not match the name of the schema it references");
+ }
+
+ NodePtr symbol(new NodeSymbolic);
+ NodeSymbolic *ptr = static_cast<NodeSymbolic *> (symbol.get());
+
+ ptr->setName(node->name());
+ ptr->setNode(node);
+ replaceNode.swap(symbol);
+}
+
+template < class A, class B, class C, class D >
+inline void
+NodeImpl<A,B,C,D>::printBasicInfo(std::ostream &os) const
+{
+ os << type();
+ if(hasName()) {
+ os << ' ' << nameAttribute_.get();
+ }
+
+ if(D::hasAttribute) {
+ os << " " << sizeAttribute_.get();
+ }
+ os << '\n';
+ int count = leaves();
+ count = count ? count : names();
+ for(int i= 0; i < count; ++i) {
+ if( C::hasAttribute ) {
+ os << "name " << nameAt(i) << '\n';
+ }
+ if( type() != AVRO_SYMBOLIC && leafAttributes_.hasAttribute) {
+ leafAt(i)->printBasicInfo(os);
+ }
+ }
+ if(isCompound(type())) {
+ os << "end " << type() << '\n';
+ }
+}
+
+
+inline NodePtr resolveSymbol(const NodePtr &node)
+{
+ if(node->type() != AVRO_SYMBOLIC) {
+ throw Exception("Only symbolic nodes may be resolved");
+ }
+ std::shared_ptr<NodeSymbolic> symNode = std::static_pointer_cast<NodeSymbolic>(node);
+ return symNode->getNode();
+}
+
+template< typename T >
+inline std::string intToHex(T i)
+{
+ std::stringstream stream;
+ stream << "\\u"
+ << std::setfill('0') << std::setw(sizeof(T))
+ << std::hex << i;
+ return stream.str();
+}
+
+} // namespace avro
+
+#endif
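
NodeUnion::isValid in NodeImpl.hh above rejects unions whose branches resolve to duplicate type names. A hedged sketch of that behavior, assuming the AVRO_* enumerators from Types.hh and linkage against the library's out-of-line definitions:

    #include "NodeImpl.hh"

    bool unionDemo() {
        using namespace avro;
        MultiLeaves branches; // concepts::MultiAttribute<NodePtr>
        branches.add(NodePtr(new NodePrimitive(AVRO_STRING)));
        branches.add(NodePtr(new NodePrimitive(AVRO_INT)));
        NodeUnion ok(branches);  // distinct branch names: valid

        branches.add(NodePtr(new NodePrimitive(AVRO_STRING)));
        NodeUnion dup(branches); // "string" appears twice: invalid
        return ok.isValid() && !dup.isValid(); // expected: true
    }
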
diff --git a/contrib/libs/apache/avro/api/Parser.hh b/contrib/libs/apache/avro/api/Parser.hh
index fdf28fb0d58..3ba6af65e30 100644
--- a/contrib/libs/apache/avro/api/Parser.hh
+++ b/contrib/libs/apache/avro/api/Parser.hh
@@ -1,151 +1,151 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Parser_hh__
-#define avro_Parser_hh__
-
-#include "Config.hh"
-#include "Reader.hh"
-
-#include <array>
-
-namespace avro {
-
-///
-/// Class that wraps a reader or ValidatingReade with an interface that uses
-/// explicit get* names instead of getValue
-///
-
-template<class Reader>
-class Parser : private boost::noncopyable
-{
-
- public:
-
- // Constructor only works with Writer
- explicit Parser(const InputBuffer &in) :
- reader_(in)
- {}
-
- /// Constructor only works with ValidatingWriter
- Parser(const ValidSchema &schema, const InputBuffer &in) :
- reader_(schema, in)
- {}
-
- void readNull() {
- Null null;
- reader_.readValue(null);
- }
-
- bool readBool() {
- bool val;
- reader_.readValue(val);
- return val;
- }
-
- int32_t readInt() {
- int32_t val;
- reader_.readValue(val);
- return val;
- }
-
- int64_t readLong() {
- int64_t val;
- reader_.readValue(val);
- return val;
- }
-
- float readFloat() {
- float val;
- reader_.readValue(val);
- return val;
- }
-
- double readDouble() {
- double val;
- reader_.readValue(val);
- return val;
- }
-
- void readString(std::string &val) {
- reader_.readValue(val);
- }
-
- void readBytes(std::vector<uint8_t> &val) {
- reader_.readBytes(val);
- }
-
- template <size_t N>
- void readFixed(uint8_t (&val)[N]) {
- reader_.readFixed(val);
- }
-
- template<size_t N>
- void readFixed(std::array<uint8_t, N> &val) {
- reader_.readFixed(val);
- }
-
- void readRecord() {
- reader_.readRecord();
- }
-
- void readRecordEnd() {
- reader_.readRecordEnd();
- }
-
- int64_t readArrayBlockSize() {
- return reader_.readArrayBlockSize();
- }
-
- int64_t readUnion() {
- return reader_.readUnion();
- }
-
- int64_t readEnum() {
- return reader_.readEnum();
- }
-
- int64_t readMapBlockSize() {
- return reader_.readMapBlockSize();
- }
-
- private:
-
- friend Type nextType(Parser<ValidatingReader> &p);
- friend bool currentRecordName(Parser<ValidatingReader> &p, std::string &name);
- friend bool nextFieldName(Parser<ValidatingReader> &p, std::string &name);
-
- Reader reader_;
-
-};
-
-inline Type nextType(Parser<ValidatingReader> &p) {
- return p.reader_.nextType();
-}
-
-inline bool currentRecordName(Parser<ValidatingReader> &p, std::string &name) {
- return p.reader_.currentRecordName(name);
-}
-
-inline bool nextFieldName(Parser<ValidatingReader> &p, std::string &name) {
- return p.reader_.nextFieldName(name);
-}
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Parser_hh__
+#define avro_Parser_hh__
+
+#include "Config.hh"
+#include "Reader.hh"
+
+#include <array>
+
+namespace avro {
+
+///
+/// Class that wraps a Reader or ValidatingReader with an interface that uses
+/// explicit get* names instead of getValue
+///
+
+template<class Reader>
+class Parser : private boost::noncopyable
+{
+
+ public:
+
+    /// Constructor only works with Reader
+ explicit Parser(const InputBuffer &in) :
+ reader_(in)
+ {}
+
+    /// Constructor only works with ValidatingReader
+ Parser(const ValidSchema &schema, const InputBuffer &in) :
+ reader_(schema, in)
+ {}
+
+ void readNull() {
+ Null null;
+ reader_.readValue(null);
+ }
+
+ bool readBool() {
+ bool val;
+ reader_.readValue(val);
+ return val;
+ }
+
+ int32_t readInt() {
+ int32_t val;
+ reader_.readValue(val);
+ return val;
+ }
+
+ int64_t readLong() {
+ int64_t val;
+ reader_.readValue(val);
+ return val;
+ }
+
+ float readFloat() {
+ float val;
+ reader_.readValue(val);
+ return val;
+ }
+
+ double readDouble() {
+ double val;
+ reader_.readValue(val);
+ return val;
+ }
+
+ void readString(std::string &val) {
+ reader_.readValue(val);
+ }
+
+ void readBytes(std::vector<uint8_t> &val) {
+ reader_.readBytes(val);
+ }
+
+ template <size_t N>
+ void readFixed(uint8_t (&val)[N]) {
+ reader_.readFixed(val);
+ }
+
+ template<size_t N>
+ void readFixed(std::array<uint8_t, N> &val) {
+ reader_.readFixed(val);
+ }
+
+ void readRecord() {
+ reader_.readRecord();
+ }
+
+ void readRecordEnd() {
+ reader_.readRecordEnd();
+ }
+
+ int64_t readArrayBlockSize() {
+ return reader_.readArrayBlockSize();
+ }
+
+ int64_t readUnion() {
+ return reader_.readUnion();
+ }
+
+ int64_t readEnum() {
+ return reader_.readEnum();
+ }
+
+ int64_t readMapBlockSize() {
+ return reader_.readMapBlockSize();
+ }
+
+ private:
+
+ friend Type nextType(Parser<ValidatingReader> &p);
+ friend bool currentRecordName(Parser<ValidatingReader> &p, std::string &name);
+ friend bool nextFieldName(Parser<ValidatingReader> &p, std::string &name);
+
+ Reader reader_;
+
+};
+
+inline Type nextType(Parser<ValidatingReader> &p) {
+ return p.reader_.nextType();
+}
+
+inline bool currentRecordName(Parser<ValidatingReader> &p, std::string &name) {
+ return p.reader_.currentRecordName(name);
+}
+
+inline bool nextFieldName(Parser<ValidatingReader> &p, std::string &name) {
+ return p.reader_.nextFieldName(name);
+}
+
+} // namespace avro
+
+#endif
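A minimal sketch of the wrapper above in use, assuming a hypothetical schema describing a record of a long followed by a string, with the ValidSchema and a matching InputBuffer supplied by the caller:

    #include <string>
    #include "Parser.hh"

    // Sketch only: 'schema' and 'input' come from the application;
    // the record layout {long, string} is hypothetical.
    void readPerson(const avro::ValidSchema &schema,
                    const avro::InputBuffer &input) {
        // The validating variant checks every read against the schema.
        avro::Parser<avro::ValidatingReader> parser(schema, input);

        parser.readRecord();             // enter the record
        int64_t id = parser.readLong();  // field 1: long
        std::string name;
        parser.readString(name);         // field 2: string
        parser.readRecordEnd();          // leave the record
        (void)id;
        (void)name;
    }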
diff --git a/contrib/libs/apache/avro/api/Reader.hh b/contrib/libs/apache/avro/api/Reader.hh
index 4f514fbbe93..26488145a22 100644
--- a/contrib/libs/apache/avro/api/Reader.hh
+++ b/contrib/libs/apache/avro/api/Reader.hh
@@ -1,209 +1,209 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Reader_hh__
-#define avro_Reader_hh__
-
-#include <stdint.h>
-#include <vector>
-#include <array>
-#include <boost/noncopyable.hpp>
-
-#include "Config.hh"
-#include "Zigzag.hh"
-#include "Types.hh"
-#include "Validator.hh"
-#include "buffer/BufferReader.hh"
-
-namespace avro {
-
-///
-/// Parses from an avro encoding to the requested type. Assumes the next item
-/// in the avro binary data is the expected type.
-///
-
-template<class ValidatorType>
-class ReaderImpl : private boost::noncopyable
-{
-
- public:
-
- explicit ReaderImpl(const InputBuffer &buffer) :
- reader_(buffer)
- {}
-
- ReaderImpl(const ValidSchema &schema, const InputBuffer &buffer) :
- validator_(schema),
- reader_(buffer)
- {}
-
- void readValue(Null &) {
- validator_.checkTypeExpected(AVRO_NULL);
- }
-
- void readValue(bool &val) {
- validator_.checkTypeExpected(AVRO_BOOL);
- uint8_t ival = 0;
- reader_.read(ival);
- val = (ival != 0);
- }
-
- void readValue(int32_t &val) {
- validator_.checkTypeExpected(AVRO_INT);
- uint32_t encoded = static_cast<uint32_t>(readVarInt());
- val = decodeZigzag32(encoded);
- }
-
- void readValue(int64_t &val) {
- validator_.checkTypeExpected(AVRO_LONG);
- uint64_t encoded = readVarInt();
- val = decodeZigzag64(encoded);
- }
-
- void readValue(float &val) {
- validator_.checkTypeExpected(AVRO_FLOAT);
- union {
- float f;
- uint32_t i;
- } v;
- reader_.read(v.i);
- val = v.f;
- }
-
- void readValue(double &val) {
- validator_.checkTypeExpected(AVRO_DOUBLE);
- union {
- double d;
- uint64_t i;
- } v;
- reader_.read(v.i);
- val = v.d;
- }
-
- void readValue(std::string &val) {
- validator_.checkTypeExpected(AVRO_STRING);
- size_t size = static_cast<size_t>(readSize());
- reader_.read(val, size);
- }
-
- void readBytes(std::vector<uint8_t> &val) {
- validator_.checkTypeExpected(AVRO_BYTES);
- size_t size = static_cast<size_t>(readSize());
- val.resize(size);
- reader_.read(reinterpret_cast<char *>(val.data()), size);
- }
-
- void readFixed(uint8_t *val, size_t size) {
- validator_.checkFixedSizeExpected(size);
- reader_.read(reinterpret_cast<char *>(val), size);
- }
-
- template <size_t N>
- void readFixed(uint8_t (&val)[N]) {
- this->readFixed(val, N);
- }
-
- template <size_t N>
- void readFixed(std::array<uint8_t, N> &val) {
- this->readFixed(val.data(), N);
- }
-
- void readRecord() {
- validator_.checkTypeExpected(AVRO_RECORD);
- validator_.checkTypeExpected(AVRO_LONG);
- validator_.setCount(1);
- }
-
- void readRecordEnd() {
- validator_.checkTypeExpected(AVRO_RECORD);
- validator_.checkTypeExpected(AVRO_LONG);
- validator_.setCount(0);
- }
-
- int64_t readArrayBlockSize() {
- validator_.checkTypeExpected(AVRO_ARRAY);
- return readCount();
- }
-
- int64_t readUnion() {
- validator_.checkTypeExpected(AVRO_UNION);
- return readCount();
- }
-
- int64_t readEnum() {
- validator_.checkTypeExpected(AVRO_ENUM);
- return readCount();
- }
-
- int64_t readMapBlockSize() {
- validator_.checkTypeExpected(AVRO_MAP);
- return readCount();
- }
-
- Type nextType() const {
- return validator_.nextTypeExpected();
- }
-
- bool currentRecordName(std::string &name) const {
- return validator_.getCurrentRecordName(name);
- }
-
- bool nextFieldName(std::string &name) const {
- return validator_.getNextFieldName(name);
- }
-
- private:
-
- uint64_t readVarInt() {
- uint64_t encoded = 0;
- uint8_t val = 0;
- int shift = 0;
- do {
- reader_.read(val);
- uint64_t newbits = static_cast<uint64_t>(val & 0x7f) << shift;
- encoded |= newbits;
- shift += 7;
- } while (val & 0x80);
-
- return encoded;
- }
-
- int64_t readSize() {
- uint64_t encoded = readVarInt();
- int64_t size = decodeZigzag64(encoded);
- return size;
- }
-
- int64_t readCount() {
- validator_.checkTypeExpected(AVRO_LONG);
- int64_t count = readSize();
- validator_.setCount(count);
- return count;
- }
-
- ValidatorType validator_;
- BufferReader reader_;
-
-};
-
-typedef ReaderImpl<NullValidator> Reader;
-typedef ReaderImpl<Validator> ValidatingReader;
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Reader_hh__
+#define avro_Reader_hh__
+
+#include <stdint.h>
+#include <vector>
+#include <array>
+#include <boost/noncopyable.hpp>
+
+#include "Config.hh"
+#include "Zigzag.hh"
+#include "Types.hh"
+#include "Validator.hh"
+#include "buffer/BufferReader.hh"
+
+namespace avro {
+
+///
+/// Parses from an avro encoding to the requested type. Assumes the next item
+/// in the avro binary data is the expected type.
+///
+
+template<class ValidatorType>
+class ReaderImpl : private boost::noncopyable
+{
+
+ public:
+
+ explicit ReaderImpl(const InputBuffer &buffer) :
+ reader_(buffer)
+ {}
+
+ ReaderImpl(const ValidSchema &schema, const InputBuffer &buffer) :
+ validator_(schema),
+ reader_(buffer)
+ {}
+
+ void readValue(Null &) {
+ validator_.checkTypeExpected(AVRO_NULL);
+ }
+
+ void readValue(bool &val) {
+ validator_.checkTypeExpected(AVRO_BOOL);
+ uint8_t ival = 0;
+ reader_.read(ival);
+ val = (ival != 0);
+ }
+
+ void readValue(int32_t &val) {
+ validator_.checkTypeExpected(AVRO_INT);
+ uint32_t encoded = static_cast<uint32_t>(readVarInt());
+ val = decodeZigzag32(encoded);
+ }
+
+ void readValue(int64_t &val) {
+ validator_.checkTypeExpected(AVRO_LONG);
+ uint64_t encoded = readVarInt();
+ val = decodeZigzag64(encoded);
+ }
+
+ void readValue(float &val) {
+ validator_.checkTypeExpected(AVRO_FLOAT);
+ union {
+ float f;
+ uint32_t i;
+ } v;
+ reader_.read(v.i);
+ val = v.f;
+ }
+
+ void readValue(double &val) {
+ validator_.checkTypeExpected(AVRO_DOUBLE);
+ union {
+ double d;
+ uint64_t i;
+ } v;
+ reader_.read(v.i);
+ val = v.d;
+ }
+
+ void readValue(std::string &val) {
+ validator_.checkTypeExpected(AVRO_STRING);
+ size_t size = static_cast<size_t>(readSize());
+ reader_.read(val, size);
+ }
+
+ void readBytes(std::vector<uint8_t> &val) {
+ validator_.checkTypeExpected(AVRO_BYTES);
+ size_t size = static_cast<size_t>(readSize());
+ val.resize(size);
+ reader_.read(reinterpret_cast<char *>(val.data()), size);
+ }
+
+ void readFixed(uint8_t *val, size_t size) {
+ validator_.checkFixedSizeExpected(size);
+ reader_.read(reinterpret_cast<char *>(val), size);
+ }
+
+ template <size_t N>
+ void readFixed(uint8_t (&val)[N]) {
+ this->readFixed(val, N);
+ }
+
+ template <size_t N>
+ void readFixed(std::array<uint8_t, N> &val) {
+ this->readFixed(val.data(), N);
+ }
+
+ void readRecord() {
+ validator_.checkTypeExpected(AVRO_RECORD);
+ validator_.checkTypeExpected(AVRO_LONG);
+ validator_.setCount(1);
+ }
+
+ void readRecordEnd() {
+ validator_.checkTypeExpected(AVRO_RECORD);
+ validator_.checkTypeExpected(AVRO_LONG);
+ validator_.setCount(0);
+ }
+
+ int64_t readArrayBlockSize() {
+ validator_.checkTypeExpected(AVRO_ARRAY);
+ return readCount();
+ }
+
+ int64_t readUnion() {
+ validator_.checkTypeExpected(AVRO_UNION);
+ return readCount();
+ }
+
+ int64_t readEnum() {
+ validator_.checkTypeExpected(AVRO_ENUM);
+ return readCount();
+ }
+
+ int64_t readMapBlockSize() {
+ validator_.checkTypeExpected(AVRO_MAP);
+ return readCount();
+ }
+
+ Type nextType() const {
+ return validator_.nextTypeExpected();
+ }
+
+ bool currentRecordName(std::string &name) const {
+ return validator_.getCurrentRecordName(name);
+ }
+
+ bool nextFieldName(std::string &name) const {
+ return validator_.getNextFieldName(name);
+ }
+
+ private:
+
+ uint64_t readVarInt() {
+ uint64_t encoded = 0;
+ uint8_t val = 0;
+ int shift = 0;
+ do {
+ reader_.read(val);
+ uint64_t newbits = static_cast<uint64_t>(val & 0x7f) << shift;
+ encoded |= newbits;
+ shift += 7;
+ } while (val & 0x80);
+
+ return encoded;
+ }
+
+ int64_t readSize() {
+ uint64_t encoded = readVarInt();
+ int64_t size = decodeZigzag64(encoded);
+ return size;
+ }
+
+ int64_t readCount() {
+ validator_.checkTypeExpected(AVRO_LONG);
+ int64_t count = readSize();
+ validator_.setCount(count);
+ return count;
+ }
+
+ ValidatorType validator_;
+ BufferReader reader_;
+
+};
+
+typedef ReaderImpl<NullValidator> Reader;
+typedef ReaderImpl<Validator> ValidatingReader;
+
+} // namespace avro
+
+#endif
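The int/long paths above combine two encodings: a base-128 varint (readVarInt) and a zigzag mapping (decodeZigzag64 from Zigzag.hh). A standalone sketch of the same logic over a plain byte array, with the zigzag step re-derived here for illustration (assumed to match the Zigzag.hh version):

    #include <stdint.h>

    // Zigzag interleaves signed values into unsigned ones:
    // 0, -1, 1, -2, 2, ... map to 0, 1, 2, 3, 4, ...
    static int64_t zigzag64(uint64_t v) {
        return static_cast<int64_t>(v >> 1) ^ -static_cast<int64_t>(v & 1);
    }

    // Base-128 varint: 7 payload bits per byte, lowest group first,
    // high bit set on every byte except the last. Mirrors readVarInt.
    static int64_t readLongAt(const uint8_t *p) {
        uint64_t encoded = 0;
        int shift = 0;
        uint8_t b;
        do {
            b = *p++;
            encoded |= static_cast<uint64_t>(b & 0x7f) << shift;
            shift += 7;
        } while (b & 0x80);
        return zigzag64(encoded);
    }

    // Example: the single byte 0x04 decodes to varint 4, zigzag 2;
    // the single byte 0x03 decodes to varint 3, zigzag -2.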
diff --git a/contrib/libs/apache/avro/api/Resolver.hh b/contrib/libs/apache/avro/api/Resolver.hh
index a0ffcbeac77..91c6de1cb71 100644
--- a/contrib/libs/apache/avro/api/Resolver.hh
+++ b/contrib/libs/apache/avro/api/Resolver.hh
@@ -1,57 +1,57 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Resolver_hh__
-#define avro_Resolver_hh__
-
-#include <memory>
-#include <boost/noncopyable.hpp>
-#include <stdint.h>
-
-#include "Config.hh"
-#include "Reader.hh"
-
-/// \file Resolver.hh
-///
-
-namespace avro {
-
-class ValidSchema;
-class Layout;
-
-class AVRO_DECL Resolver : private boost::noncopyable
-{
-
- public:
-
- virtual void parse(Reader &reader, uint8_t *address) const = 0;
- virtual ~Resolver() {}
-
-};
-
-std::unique_ptr<Resolver> constructResolver(
-    const ValidSchema &writerSchema,
- const ValidSchema &readerSchema,
- const Layout &readerLayout
- );
-
-
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Resolver_hh__
+#define avro_Resolver_hh__
+
+#include <memory>
+#include <boost/noncopyable.hpp>
+#include <stdint.h>
+
+#include "Config.hh"
+#include "Reader.hh"
+
+/// \file Resolver.hh
+///
+
+namespace avro {
+
+class ValidSchema;
+class Layout;
+
+class AVRO_DECL Resolver : private boost::noncopyable
+{
+
+ public:
+
+ virtual void parse(Reader &reader, uint8_t *address) const = 0;
+ virtual ~Resolver() {}
+
+};
+
+std::unique_ptr<Resolver> constructResolver(
+    const ValidSchema &writerSchema,
+ const ValidSchema &readerSchema,
+ const Layout &readerLayout
+ );
+
+
+
+} // namespace avro
+
+#endif
diff --git a/contrib/libs/apache/avro/api/ResolverSchema.hh b/contrib/libs/apache/avro/api/ResolverSchema.hh
index 9048a22b9a2..f9420cdfa55 100644
--- a/contrib/libs/apache/avro/api/ResolverSchema.hh
+++ b/contrib/libs/apache/avro/api/ResolverSchema.hh
@@ -1,56 +1,56 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_ResolverSchema_hh__
-#define avro_ResolverSchema_hh__
-
-#include <boost/noncopyable.hpp>
-#include <stdint.h>
-#include <memory>
-
-#include "Config.hh"
-#include "Reader.hh"
-
-/// \file ResolverSchema.hh
-///
-
-namespace avro {
-
-class ValidSchema;
-class Layout;
-class Resolver;
-
-class AVRO_DECL ResolverSchema {
-
- public:
-
- ResolverSchema(const ValidSchema &writer, const ValidSchema &reader, const Layout &readerLayout);
-
- private:
-
- friend class ResolvingReader;
-
- void parse(Reader &reader, uint8_t *address);
-
- std::shared_ptr<Resolver> resolver_;
-
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_ResolverSchema_hh__
+#define avro_ResolverSchema_hh__
+
+#include <boost/noncopyable.hpp>
+#include <stdint.h>
+#include <memory>
+
+#include "Config.hh"
+#include "Reader.hh"
+
+/// \file ResolverSchema.hh
+///
+
+namespace avro {
+
+class ValidSchema;
+class Layout;
+class Resolver;
+
+class AVRO_DECL ResolverSchema {
+
+ public:
+
+ ResolverSchema(const ValidSchema &writer, const ValidSchema &reader, const Layout &readerLayout);
+
+ private:
+
+ friend class ResolvingReader;
+
+ void parse(Reader &reader, uint8_t *address);
+
+ std::shared_ptr<Resolver> resolver_;
+
+};
+
+} // namespace avro
+
+#endif
diff --git a/contrib/libs/apache/avro/api/ResolvingReader.hh b/contrib/libs/apache/avro/api/ResolvingReader.hh
index 806e64da56e..7588e5e08d5 100644
--- a/contrib/libs/apache/avro/api/ResolvingReader.hh
+++ b/contrib/libs/apache/avro/api/ResolvingReader.hh
@@ -1,54 +1,54 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_ResolvingReader_hh__
-#define avro_ResolvingReader_hh__
-
-#include <stdint.h>
-#include <boost/noncopyable.hpp>
-
-#include "Config.hh"
-#include "ResolverSchema.hh"
-#include "Reader.hh"
-
-namespace avro {
-
-class AVRO_DECL ResolvingReader : private boost::noncopyable
-{
-
- public:
-
- ResolvingReader(const ResolverSchema &schema, const InputBuffer &in) :
- reader_(in),
- schema_(schema)
- {}
-
- template<typename T>
- void parse(T &object) {
- schema_.parse(reader_, reinterpret_cast<uint8_t *>(&object));
- }
-
- private:
-
- Reader reader_;
- ResolverSchema schema_;
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_ResolvingReader_hh__
+#define avro_ResolvingReader_hh__
+
+#include <stdint.h>
+#include <boost/noncopyable.hpp>
+
+#include "Config.hh"
+#include "ResolverSchema.hh"
+#include "Reader.hh"
+
+namespace avro {
+
+class AVRO_DECL ResolvingReader : private boost::noncopyable
+{
+
+ public:
+
+ ResolvingReader(const ResolverSchema &schema, const InputBuffer &in) :
+ reader_(in),
+ schema_(schema)
+ {}
+
+ template<typename T>
+ void parse(T &object) {
+ schema_.parse(reader_, reinterpret_cast<uint8_t *>(&object));
+ }
+
+ private:
+
+ Reader reader_;
+ ResolverSchema schema_;
+};
+
+} // namespace avro
+
+#endif
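A sketch of the intended flow, assuming the caller has already built a Layout describing the target struct (layout construction is application-specific and elided here) and holds writer/reader schemas plus an input buffer:

    #include "ResolvingReader.hh"

    // Hypothetical target type; the Layout must describe its offsets.
    struct Person {
        int64_t id;
        double  score;
    };

    void readWithResolution(const avro::ValidSchema &writerSchema,
                            const avro::ValidSchema &readerSchema,
                            const avro::Layout &layout,
                            const avro::InputBuffer &input) {
        avro::ResolverSchema rs(writerSchema, readerSchema, layout);
        avro::ResolvingReader reader(rs, input);

        Person p;
        reader.parse(p);  // decodes writer-format data into Person
    }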
diff --git a/contrib/libs/apache/avro/api/Schema.hh b/contrib/libs/apache/avro/api/Schema.hh
index b8ad92c8256..ee0e79c6ebc 100644
--- a/contrib/libs/apache/avro/api/Schema.hh
+++ b/contrib/libs/apache/avro/api/Schema.hh
@@ -1,146 +1,146 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Schema_hh__
-#define avro_Schema_hh__
-
-#include "Config.hh"
-#include "NodeImpl.hh"
-#include <string>
-
-/// \file
-///
-/// Schemas for representing all the avro types. The compound schema objects
-/// allow composition from other schemas.
-///
-
-namespace avro {
-
-
-/// The root Schema object is a base class. Nobody constructs this class directly.
-
-class AVRO_DECL Schema {
-public:
-
- virtual ~Schema();
-
- Type type() const {
- return node_->type();
- }
-
- const NodePtr &root() const {
- return node_;
- }
-
- NodePtr &root() {
- return node_;
- }
-
- protected:
- Schema();
- explicit Schema(const NodePtr &node);
- explicit Schema(Node *node);
-
- NodePtr node_;
-};
-
-class AVRO_DECL NullSchema : public Schema {
-public:
- NullSchema(): Schema(new NodePrimitive(AVRO_NULL)) {}
-};
-
-class AVRO_DECL BoolSchema : public Schema {
-public:
- BoolSchema(): Schema(new NodePrimitive(AVRO_BOOL)) {}
-};
-
-class AVRO_DECL IntSchema : public Schema {
-public:
- IntSchema(): Schema(new NodePrimitive(AVRO_INT)) {}
-};
-
-class AVRO_DECL LongSchema : public Schema {
-public:
- LongSchema(): Schema(new NodePrimitive(AVRO_LONG)) {}
-};
-
-class AVRO_DECL FloatSchema : public Schema {
-public:
- FloatSchema(): Schema(new NodePrimitive(AVRO_FLOAT)) {}
-};
-
-class AVRO_DECL DoubleSchema : public Schema {
-public:
- DoubleSchema(): Schema(new NodePrimitive(AVRO_DOUBLE)) {}
-};
-
-class AVRO_DECL StringSchema : public Schema {
-public:
- StringSchema(): Schema(new NodePrimitive(AVRO_STRING)) {}
-};
-
-class AVRO_DECL BytesSchema : public Schema {
-public:
- BytesSchema(): Schema(new NodePrimitive(AVRO_BYTES)) {}
-};
-
-class AVRO_DECL RecordSchema : public Schema {
-public:
- RecordSchema(const std::string &name);
- void addField(const std::string &name, const Schema &fieldSchema);
-
- std::string getDoc() const;
- void setDoc(const std::string &);
-};
-
-class AVRO_DECL EnumSchema : public Schema {
-public:
- EnumSchema(const std::string &name);
- void addSymbol(const std::string &symbol);
-};
-
-class AVRO_DECL ArraySchema : public Schema {
-public:
- ArraySchema(const Schema &itemsSchema);
- ArraySchema(const ArraySchema &itemsSchema);
-};
-
-class AVRO_DECL MapSchema : public Schema {
-public:
- MapSchema(const Schema &valuesSchema);
- MapSchema(const MapSchema &itemsSchema);
-};
-
-class AVRO_DECL UnionSchema : public Schema {
-public:
- UnionSchema();
- void addType(const Schema &typeSchema);
-};
-
-class AVRO_DECL FixedSchema : public Schema {
-public:
- FixedSchema(int size, const std::string &name);
-};
-
-class AVRO_DECL SymbolicSchema : public Schema {
-public:
- SymbolicSchema(const Name& name, const NodePtr& link);
-};
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Schema_hh__
+#define avro_Schema_hh__
+
+#include "Config.hh"
+#include "NodeImpl.hh"
+#include <string>
+
+/// \file
+///
+/// Schemas for representing all the avro types. The compound schema objects
+/// allow composition from other schemas.
+///
+
+namespace avro {
+
+
+/// The root Schema object is a base class. Nobody constructs this class directly.
+
+class AVRO_DECL Schema {
+public:
+
+ virtual ~Schema();
+
+ Type type() const {
+ return node_->type();
+ }
+
+ const NodePtr &root() const {
+ return node_;
+ }
+
+ NodePtr &root() {
+ return node_;
+ }
+
+ protected:
+ Schema();
+ explicit Schema(const NodePtr &node);
+ explicit Schema(Node *node);
+
+ NodePtr node_;
+};
+
+class AVRO_DECL NullSchema : public Schema {
+public:
+ NullSchema(): Schema(new NodePrimitive(AVRO_NULL)) {}
+};
+
+class AVRO_DECL BoolSchema : public Schema {
+public:
+ BoolSchema(): Schema(new NodePrimitive(AVRO_BOOL)) {}
+};
+
+class AVRO_DECL IntSchema : public Schema {
+public:
+ IntSchema(): Schema(new NodePrimitive(AVRO_INT)) {}
+};
+
+class AVRO_DECL LongSchema : public Schema {
+public:
+ LongSchema(): Schema(new NodePrimitive(AVRO_LONG)) {}
+};
+
+class AVRO_DECL FloatSchema : public Schema {
+public:
+ FloatSchema(): Schema(new NodePrimitive(AVRO_FLOAT)) {}
+};
+
+class AVRO_DECL DoubleSchema : public Schema {
+public:
+ DoubleSchema(): Schema(new NodePrimitive(AVRO_DOUBLE)) {}
+};
+
+class AVRO_DECL StringSchema : public Schema {
+public:
+ StringSchema(): Schema(new NodePrimitive(AVRO_STRING)) {}
+};
+
+class AVRO_DECL BytesSchema : public Schema {
+public:
+ BytesSchema(): Schema(new NodePrimitive(AVRO_BYTES)) {}
+};
+
+class AVRO_DECL RecordSchema : public Schema {
+public:
+ RecordSchema(const std::string &name);
+ void addField(const std::string &name, const Schema &fieldSchema);
+
+ std::string getDoc() const;
+ void setDoc(const std::string &);
+};
+
+class AVRO_DECL EnumSchema : public Schema {
+public:
+ EnumSchema(const std::string &name);
+ void addSymbol(const std::string &symbol);
+};
+
+class AVRO_DECL ArraySchema : public Schema {
+public:
+ ArraySchema(const Schema &itemsSchema);
+ ArraySchema(const ArraySchema &itemsSchema);
+};
+
+class AVRO_DECL MapSchema : public Schema {
+public:
+ MapSchema(const Schema &valuesSchema);
+ MapSchema(const MapSchema &itemsSchema);
+};
+
+class AVRO_DECL UnionSchema : public Schema {
+public:
+ UnionSchema();
+ void addType(const Schema &typeSchema);
+};
+
+class AVRO_DECL FixedSchema : public Schema {
+public:
+ FixedSchema(int size, const std::string &name);
+};
+
+class AVRO_DECL SymbolicSchema : public Schema {
+public:
+ SymbolicSchema(const Name& name, const NodePtr& link);
+};
+} // namespace avro
+
+#endif
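A sketch of composing these builders into a record schema; the record and field names are made up for illustration:

    #include "Schema.hh"

    // Roughly equivalent to the JSON schema
    // {"type":"record","name":"Point","fields":[
    //   {"name":"x","type":"double"},
    //   {"name":"y","type":"double"},
    //   {"name":"tags","type":{"type":"array","items":"string"}}]}
    avro::RecordSchema makePointSchema() {
        avro::RecordSchema record("Point");
        record.addField("x", avro::DoubleSchema());
        record.addField("y", avro::DoubleSchema());
        record.addField("tags", avro::ArraySchema(avro::StringSchema()));
        return record;
    }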
diff --git a/contrib/libs/apache/avro/api/SchemaResolution.hh b/contrib/libs/apache/avro/api/SchemaResolution.hh
index 765347d9ded..c3a39e1237c 100644
--- a/contrib/libs/apache/avro/api/SchemaResolution.hh
+++ b/contrib/libs/apache/avro/api/SchemaResolution.hh
@@ -1,55 +1,55 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_SchemaResolution_hh__
-#define avro_SchemaResolution_hh__
-
-#include "Config.hh"
-
-namespace avro {
-
-
-enum SchemaResolution {
-
- /// The schemas definitely do not match
-
- RESOLVE_NO_MATCH,
-
- /// The schemas match at a cursory level
- ///
- /// For records and enums, this means the name is the same, but it does not
- /// necessarily mean that every symbol or field is an exact match.
-
- RESOLVE_MATCH,
-
-    /// For primitives, a match may occur if the type is promotable. This means that the
-    /// writer matches the reader if the writer's type can be promoted to the specified type.
-
- //@{
-
- RESOLVE_PROMOTABLE_TO_LONG,
- RESOLVE_PROMOTABLE_TO_FLOAT,
- RESOLVE_PROMOTABLE_TO_DOUBLE,
-
- //@}
-
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_SchemaResolution_hh__
+#define avro_SchemaResolution_hh__
+
+#include "Config.hh"
+
+namespace avro {
+
+
+enum SchemaResolution {
+
+ /// The schemas definitely do not match
+
+ RESOLVE_NO_MATCH,
+
+ /// The schemas match at a cursory level
+ ///
+ /// For records and enums, this means the name is the same, but it does not
+ /// necessarily mean that every symbol or field is an exact match.
+
+ RESOLVE_MATCH,
+
+    /// For primitives, a match may occur if the type is promotable. This means that the
+    /// writer matches the reader if the writer's type can be promoted to the specified type.
+
+ //@{
+
+ RESOLVE_PROMOTABLE_TO_LONG,
+ RESOLVE_PROMOTABLE_TO_FLOAT,
+ RESOLVE_PROMOTABLE_TO_DOUBLE,
+
+ //@}
+
+};
+
+} // namespace avro
+
+#endif
diff --git a/contrib/libs/apache/avro/api/Serializer.hh b/contrib/libs/apache/avro/api/Serializer.hh
index 15a8878586e..3cc15b5a958 100644
--- a/contrib/libs/apache/avro/api/Serializer.hh
+++ b/contrib/libs/apache/avro/api/Serializer.hh
@@ -1,135 +1,135 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Serializer_hh__
-#define avro_Serializer_hh__
-
-#include <array>
-#include <boost/noncopyable.hpp>
-
-#include "Config.hh"
-#include "Writer.hh"
-
-namespace avro {
-
-/// Class that wraps a Writer or ValidatingWriter with an interface that uses
-/// explicit write* names instead of writeValue
-
-template<class Writer>
-class Serializer : private boost::noncopyable
-{
-
- public:
-
- /// Constructor only works with Writer
- explicit Serializer() :
- writer_()
- {}
-
- /// Constructor only works with ValidatingWriter
- Serializer(const ValidSchema &schema) :
- writer_(schema)
- {}
-
- void writeNull() {
- writer_.writeValue(Null());
- }
-
- void writeBool(bool val) {
- writer_.writeValue(val);
- }
-
- void writeInt(int32_t val) {
- writer_.writeValue(val);
- }
-
- void writeLong(int64_t val) {
- writer_.writeValue(val);
- }
-
- void writeFloat(float val) {
- writer_.writeValue(val);
- }
-
- void writeDouble(double val) {
- writer_.writeValue(val);
- }
-
- void writeBytes(const void *val, size_t size) {
-        writer_.writeBytes(val, size);
- }
-
- template <size_t N>
- void writeFixed(const uint8_t (&val)[N]) {
- writer_.writeFixed(val);
- }
-
- template <size_t N>
- void writeFixed(const std::array<uint8_t, N> &val) {
- writer_.writeFixed(val);
- }
-
- void writeString(const std::string &val) {
- writer_.writeValue(val);
- }
-
- void writeRecord() {
- writer_.writeRecord();
- }
-
- void writeRecordEnd() {
- writer_.writeRecordEnd();
- }
-
- void writeArrayBlock(int64_t size) {
- writer_.writeArrayBlock(size);
- }
-
- void writeArrayEnd() {
- writer_.writeArrayEnd();
- }
-
- void writeMapBlock(int64_t size) {
- writer_.writeMapBlock(size);
- }
-
- void writeMapEnd() {
- writer_.writeMapEnd();
- }
-
- void writeUnion(int64_t choice) {
- writer_.writeUnion(choice);
- }
-
- void writeEnum(int64_t choice) {
- writer_.writeEnum(choice);
- }
-
- InputBuffer buffer() const {
- return writer_.buffer();
- }
-
- private:
-
- Writer writer_;
-
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Serializer_hh__
+#define avro_Serializer_hh__
+
+#include <array>
+#include <boost/noncopyable.hpp>
+
+#include "Config.hh"
+#include "Writer.hh"
+
+namespace avro {
+
+/// Class that wraps a Writer or ValidatingWriter with an interface that uses
+/// explicit write* names instead of writeValue
+
+template<class Writer>
+class Serializer : private boost::noncopyable
+{
+
+ public:
+
+ /// Constructor only works with Writer
+ explicit Serializer() :
+ writer_()
+ {}
+
+ /// Constructor only works with ValidatingWriter
+ Serializer(const ValidSchema &schema) :
+ writer_(schema)
+ {}
+
+ void writeNull() {
+ writer_.writeValue(Null());
+ }
+
+ void writeBool(bool val) {
+ writer_.writeValue(val);
+ }
+
+ void writeInt(int32_t val) {
+ writer_.writeValue(val);
+ }
+
+ void writeLong(int64_t val) {
+ writer_.writeValue(val);
+ }
+
+ void writeFloat(float val) {
+ writer_.writeValue(val);
+ }
+
+ void writeDouble(double val) {
+ writer_.writeValue(val);
+ }
+
+ void writeBytes(const void *val, size_t size) {
+        writer_.writeBytes(val, size);
+ }
+
+ template <size_t N>
+ void writeFixed(const uint8_t (&val)[N]) {
+ writer_.writeFixed(val);
+ }
+
+ template <size_t N>
+ void writeFixed(const std::array<uint8_t, N> &val) {
+ writer_.writeFixed(val);
+ }
+
+ void writeString(const std::string &val) {
+ writer_.writeValue(val);
+ }
+
+ void writeRecord() {
+ writer_.writeRecord();
+ }
+
+ void writeRecordEnd() {
+ writer_.writeRecordEnd();
+ }
+
+ void writeArrayBlock(int64_t size) {
+ writer_.writeArrayBlock(size);
+ }
+
+ void writeArrayEnd() {
+ writer_.writeArrayEnd();
+ }
+
+ void writeMapBlock(int64_t size) {
+ writer_.writeMapBlock(size);
+ }
+
+ void writeMapEnd() {
+ writer_.writeMapEnd();
+ }
+
+ void writeUnion(int64_t choice) {
+ writer_.writeUnion(choice);
+ }
+
+ void writeEnum(int64_t choice) {
+ writer_.writeEnum(choice);
+ }
+
+ InputBuffer buffer() const {
+ return writer_.buffer();
+ }
+
+ private:
+
+ Writer writer_;
+
+};
+
+} // namespace avro
+
+#endif
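The mirror image of the Parser sketch earlier: a minimal example that writes one {long, string} record and hands back the encoded buffer. It assumes ValidatingWriter is the schema-checked counterpart of Writer (as ValidatingReader is for Reader), and the record layout is hypothetical:

    #include "Serializer.hh"

    // Sketch only: the record layout {long, string} must match the
    // supplied schema.
    avro::InputBuffer writeAnswer(const avro::ValidSchema &schema) {
        avro::Serializer<avro::ValidatingWriter> s(schema);

        s.writeRecord();
        s.writeLong(42);
        s.writeString("answer");
        s.writeRecordEnd();

        return s.buffer();  // snapshot of everything written so far
    }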
diff --git a/contrib/libs/apache/avro/api/Specific.hh b/contrib/libs/apache/avro/api/Specific.hh
index 53741be4808..8572ffae48e 100644
--- a/contrib/libs/apache/avro/api/Specific.hh
+++ b/contrib/libs/apache/avro/api/Specific.hh
@@ -1,348 +1,348 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Codec_hh__
-#define avro_Codec_hh__
-
-#include <string>
-#include <vector>
-#include <map>
-#include <algorithm>
-#include "array"
-
-#include "boost/blank.hpp"
-
-#include "AvroTraits.hh"
-#include "Config.hh"
-#include "Encoder.hh"
-#include "Decoder.hh"
-
-/**
- * A bunch of templates and specializations for encoding and decoding
- * specific types.
- *
- * Primitive AVRO types BOOLEAN, INT, LONG, FLOAT, DOUBLE, STRING and BYTES
- * get decoded to and encoded from C++ types bool, int32_t, int64_t, float,
- * double, std::string and std::vector<uint8_t> respectively. In addition,
- * std::vector<T> for arbitrary type T gets encoded as an Avro array of T.
- * Similarly, std::map<std::string, T> for arbitrary type T gets encoded
- * as an Avro map with value type T.
- *
- * Users can have their custom types encoded/decoded by specializing
- * avro::codec_traits class for their types.
- */
-namespace avro {
-
-typedef boost::blank null;
-
-template <typename T> void encode(Encoder& e, const T& t);
-template <typename T> void decode(Decoder& d, T& t);
-
-/**
- * Codec_traits tells avro how to encode and decode an object of given type.
- *
- * The class is expected to have two static methods:
- * \li static void encode(Encoder& e, const T& value);
- * \li static void decode(Decoder& d, T& value);
- * The default is empty.
- */
-template <typename T>
-struct codec_traits;
-
-/**
- * codec_traits for Avro boolean.
- */
-template <> struct codec_traits<bool> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, bool b) {
- e.encodeBool(b);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, bool& b) {
- b = d.decodeBool();
- }
-};
-
-/**
- * codec_traits for Avro int.
- */
-template <> struct codec_traits<int32_t> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, int32_t i) {
- e.encodeInt(i);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, int32_t& i) {
- i = d.decodeInt();
- }
-};
-
-/**
- * codec_traits for Avro long.
- */
-template <> struct codec_traits<int64_t> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, int64_t l) {
- e.encodeLong(l);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, int64_t& l) {
- l = d.decodeLong();
- }
-};
-
-/**
- * codec_traits for Avro float.
- */
-template <> struct codec_traits<float> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, float f) {
- e.encodeFloat(f);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, float& f) {
- f = d.decodeFloat();
- }
-};
-
-/**
- * codec_traits for Avro double.
- */
-template <> struct codec_traits<double> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, double d) {
- e.encodeDouble(d);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, double& dbl) {
- dbl = d.decodeDouble();
- }
-};
-
-/**
- * codec_traits for Avro string.
- */
-template <> struct codec_traits<std::string> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, const std::string& s) {
- e.encodeString(s);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, std::string& s) {
- s = d.decodeString();
- }
-};
-
-/**
- * codec_traits for Avro bytes.
- */
-template <> struct codec_traits<std::vector<uint8_t> > {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, const std::vector<uint8_t>& b) {
- e.encodeBytes(b);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, std::vector<uint8_t>& s) {
- d.decodeBytes(s);
- }
-};
-
-/**
- * codec_traits for Avro fixed.
- */
-template <size_t N> struct codec_traits<std::array<uint8_t, N> > {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, const std::array<uint8_t, N>& b) {
- e.encodeFixed(b.data(), N);
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, std::array<uint8_t, N>& s) {
- std::vector<uint8_t> v(N);
- d.decodeFixed(N, v);
- std::copy(v.data(), v.data() + N, s.data());
- }
-};
-
-/**
- * codec_traits for Avro arrays.
- */
-template <typename T> struct codec_traits<std::vector<T> > {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, const std::vector<T>& b) {
- e.arrayStart();
- if (! b.empty()) {
- e.setItemCount(b.size());
- for (typename std::vector<T>::const_iterator it = b.begin();
- it != b.end(); ++it) {
- e.startItem();
- avro::encode(e, *it);
- }
- }
- e.arrayEnd();
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, std::vector<T>& s) {
- s.clear();
- for (size_t n = d.arrayStart(); n != 0; n = d.arrayNext()) {
- for (size_t i = 0; i < n; ++i) {
- T t;
- avro::decode(d, t);
- s.push_back(std::move(t));
- }
- }
- }
-};
-
-typedef codec_traits<std::vector<bool>::const_reference> bool_codec_traits;
-
-template <> struct codec_traits<std::conditional<avro::is_not_defined<bool_codec_traits>::value,
- std::vector<bool>::const_reference, void>::type> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, std::vector<bool>::const_reference b) {
- e.encodeBool(b);
- }
-};
-
-/**
- * codec_traits for Avro maps.
- */
-template <typename T> struct codec_traits<std::map<std::string, T> > {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, const std::map<std::string, T>& b) {
- e.mapStart();
- if (! b.empty()) {
- e.setItemCount(b.size());
- for (typename std::map<std::string, T>::const_iterator
- it = b.begin();
- it != b.end(); ++it) {
- e.startItem();
- avro::encode(e, it->first);
- avro::encode(e, it->second);
- }
- }
- e.mapEnd();
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, std::map<std::string, T>& s) {
- s.clear();
- for (size_t n = d.mapStart(); n != 0; n = d.mapNext()) {
- for (size_t i = 0; i < n; ++i) {
- std::string k;
- avro::decode(d, k);
- T& t = s[std::move(k)];
- avro::decode(d, t);
- }
- }
- }
-};
-
-/**
-* codec_traits for Avro null.
-*/
-template <> struct codec_traits<avro::null> {
- /**
- * Encodes a given value.
- */
- static void encode(Encoder& e, const avro::null&) {
- e.encodeNull();
- }
-
- /**
- * Decodes into a given value.
- */
- static void decode(Decoder& d, avro::null&) {
- d.decodeNull();
- }
-};
-
-
-
-/**
- * Generic encoder function that makes use of the codec_traits.
- */
-template <typename T>
-void encode(Encoder& e, const T& t) {
- codec_traits<T>::encode(e, t);
-}
-
-/**
- * Generic decoder function that makes use of the codec_traits.
- */
-template <typename T>
-void decode(Decoder& d, T& t) {
- codec_traits<T>::decode(d, t);
-}
-
-} // namespace avro
-
-#endif // avro_Codec_hh__
-
-
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Codec_hh__
+#define avro_Codec_hh__
+
+#include <string>
+#include <vector>
+#include <map>
+#include <algorithm>
+#include "array"
+
+#include "boost/blank.hpp"
+
+#include "AvroTraits.hh"
+#include "Config.hh"
+#include "Encoder.hh"
+#include "Decoder.hh"
+
+/**
+ * A bunch of templates and specializations for encoding and decoding
+ * specific types.
+ *
+ * Primitive AVRO types BOOLEAN, INT, LONG, FLOAT, DOUBLE, STRING and BYTES
+ * get decoded to and encoded from C++ types bool, int32_t, int64_t, float,
+ * double, std::string and std::vector<uint8_t> respectively. In addition,
+ * std::vector<T> for arbitrary type T gets encoded as an Avro array of T.
+ * Similarly, std::map<std::string, T> for arbitrary type T gets encoded
+ * as an Avro map with value type T.
+ *
+ * Users can have their custom types encoded/decoded by specializing
+ * avro::codec_traits class for their types.
+ */
+namespace avro {
+
+typedef boost::blank null;
+
+template <typename T> void encode(Encoder& e, const T& t);
+template <typename T> void decode(Decoder& d, T& t);
+
+/**
+ * Codec_traits tells avro how to encode and decode an object of given type.
+ *
+ * The class is expected to have two static methods:
+ * \li static void encode(Encoder& e, const T& value);
+ * \li static void decode(Decoder& d, T& value);
+ * The default is empty.
+ */
+template <typename T>
+struct codec_traits;
+
+/**
+ * codec_traits for Avro boolean.
+ */
+template <> struct codec_traits<bool> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, bool b) {
+ e.encodeBool(b);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, bool& b) {
+ b = d.decodeBool();
+ }
+};
+
+/**
+ * codec_traits for Avro int.
+ */
+template <> struct codec_traits<int32_t> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, int32_t i) {
+ e.encodeInt(i);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, int32_t& i) {
+ i = d.decodeInt();
+ }
+};
+
+/**
+ * codec_traits for Avro long.
+ */
+template <> struct codec_traits<int64_t> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, int64_t l) {
+ e.encodeLong(l);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, int64_t& l) {
+ l = d.decodeLong();
+ }
+};
+
+/**
+ * codec_traits for Avro float.
+ */
+template <> struct codec_traits<float> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, float f) {
+ e.encodeFloat(f);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, float& f) {
+ f = d.decodeFloat();
+ }
+};
+
+/**
+ * codec_traits for Avro double.
+ */
+template <> struct codec_traits<double> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, double d) {
+ e.encodeDouble(d);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, double& dbl) {
+ dbl = d.decodeDouble();
+ }
+};
+
+/**
+ * codec_traits for Avro string.
+ */
+template <> struct codec_traits<std::string> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, const std::string& s) {
+ e.encodeString(s);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, std::string& s) {
+ s = d.decodeString();
+ }
+};
+
+/**
+ * codec_traits for Avro bytes.
+ */
+template <> struct codec_traits<std::vector<uint8_t> > {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, const std::vector<uint8_t>& b) {
+ e.encodeBytes(b);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, std::vector<uint8_t>& s) {
+ d.decodeBytes(s);
+ }
+};
+
+/**
+ * codec_traits for Avro fixed.
+ */
+template <size_t N> struct codec_traits<std::array<uint8_t, N> > {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, const std::array<uint8_t, N>& b) {
+ e.encodeFixed(b.data(), N);
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, std::array<uint8_t, N>& s) {
+ std::vector<uint8_t> v(N);
+ d.decodeFixed(N, v);
+ std::copy(v.data(), v.data() + N, s.data());
+ }
+};
+
+/**
+ * codec_traits for Avro arrays.
+ */
+template <typename T> struct codec_traits<std::vector<T> > {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, const std::vector<T>& b) {
+ e.arrayStart();
+ if (! b.empty()) {
+ e.setItemCount(b.size());
+ for (typename std::vector<T>::const_iterator it = b.begin();
+ it != b.end(); ++it) {
+ e.startItem();
+ avro::encode(e, *it);
+ }
+ }
+ e.arrayEnd();
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, std::vector<T>& s) {
+ s.clear();
+ for (size_t n = d.arrayStart(); n != 0; n = d.arrayNext()) {
+ for (size_t i = 0; i < n; ++i) {
+ T t;
+ avro::decode(d, t);
+ s.push_back(std::move(t));
+ }
+ }
+ }
+};
+
+typedef codec_traits<std::vector<bool>::const_reference> bool_codec_traits;
+
+template <> struct codec_traits<std::conditional<avro::is_not_defined<bool_codec_traits>::value,
+ std::vector<bool>::const_reference, void>::type> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, std::vector<bool>::const_reference b) {
+ e.encodeBool(b);
+ }
+};
+
+/**
+ * codec_traits for Avro maps.
+ */
+template <typename T> struct codec_traits<std::map<std::string, T> > {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, const std::map<std::string, T>& b) {
+ e.mapStart();
+ if (! b.empty()) {
+ e.setItemCount(b.size());
+ for (typename std::map<std::string, T>::const_iterator
+ it = b.begin();
+ it != b.end(); ++it) {
+ e.startItem();
+ avro::encode(e, it->first);
+ avro::encode(e, it->second);
+ }
+ }
+ e.mapEnd();
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, std::map<std::string, T>& s) {
+ s.clear();
+ for (size_t n = d.mapStart(); n != 0; n = d.mapNext()) {
+ for (size_t i = 0; i < n; ++i) {
+ std::string k;
+ avro::decode(d, k);
+ T& t = s[std::move(k)];
+ avro::decode(d, t);
+ }
+ }
+ }
+};
+
+/**
+* codec_traits for Avro null.
+*/
+template <> struct codec_traits<avro::null> {
+ /**
+ * Encodes a given value.
+ */
+ static void encode(Encoder& e, const avro::null&) {
+ e.encodeNull();
+ }
+
+ /**
+ * Decodes into a given value.
+ */
+ static void decode(Decoder& d, avro::null&) {
+ d.decodeNull();
+ }
+};
+
+
+
+/**
+ * Generic encoder function that makes use of the codec_traits.
+ */
+template <typename T>
+void encode(Encoder& e, const T& t) {
+ codec_traits<T>::encode(e, t);
+}
+
+/**
+ * Generic decoder function that makes use of the codec_traits.
+ */
+template <typename T>
+void decode(Decoder& d, T& t) {
+ codec_traits<T>::decode(d, t);
+}
+
+} // namespace avro
+
+#endif // avro_Codec_hh__
+
+
+
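Following the header comment's invitation to plug custom types in by specializing avro::codec_traits, a sketch for a hypothetical two-field struct, built the same way as the primitive specializations above:

    #include "Specific.hh"

    // Hypothetical user-defined type.
    struct Point {
        double x;
        double y;
    };

    namespace avro {
    // Specializing codec_traits hooks Point into the generic
    // avro::encode / avro::decode entry points declared above.
    template <> struct codec_traits<Point> {
        static void encode(Encoder &e, const Point &p) {
            avro::encode(e, p.x);
            avro::encode(e, p.y);
        }
        static void decode(Decoder &d, Point &p) {
            avro::decode(d, p.x);
            avro::decode(d, p.y);
        }
    };
    } // namespace avro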
diff --git a/contrib/libs/apache/avro/api/Stream.hh b/contrib/libs/apache/avro/api/Stream.hh
index 508cb039807..6693fd5c249 100644
--- a/contrib/libs/apache/avro/api/Stream.hh
+++ b/contrib/libs/apache/avro/api/Stream.hh
@@ -1,483 +1,483 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Stream_hh__
-#define avro_Stream_hh__
-
-#include <memory>
-#include <string.h>
-#include <stdint.h>
-
-#include "boost/utility.hpp"
-
-#include "Config.hh"
-#include "Exception.hh"
-
-namespace avro {
-
-/**
- * A no-copy input stream.
- */
-class AVRO_DECL InputStream : boost::noncopyable {
-protected:
-
- /**
-     * An empty constructor.
- */
- InputStream() { }
-
-public:
- /**
- * Destructor.
- */
- virtual ~InputStream() { }
-
- /**
-     * Returns some of the available data.
- *
- * Returns true if some data is available, false if no more data is
- * available or an error has occurred.
- */
- virtual bool next(const uint8_t** data, size_t* len) = 0;
-
- /**
- * "Returns" back some of the data to the stream. The returned
- * data must be less than what was obtained in the last call to
- * next().
- */
- virtual void backup(size_t len) = 0;
-
- /**
-     * Skips the number of bytes specified by len.
- */
- virtual void skip(size_t len) = 0;
-
- /**
- * Returns the number of bytes read from this stream so far.
-     * All the bytes made available through next() are considered
-     * to be used unless returned using backup().
- */
- virtual size_t byteCount() const = 0;
-};
-
-typedef std::unique_ptr<InputStream> InputStreamPtr;
-
-/**
- * An InputStream which also supports seeking to a specific offset.
- */
-class AVRO_DECL SeekableInputStream : public InputStream {
-protected:
-
- /**
- * An empty constuctor.
- */
- SeekableInputStream() { }
-
-public:
- /**
- * Destructor.
- */
- virtual ~SeekableInputStream() { }
-
- /**
- * Seek to a specific position in the stream. This may invalidate pointers
- * returned from next(). This will also reset byteCount() to the given
- * position.
- */
- virtual void seek(int64_t position) = 0;
-};
-
-typedef std::unique_ptr<SeekableInputStream> SeekableInputStreamPtr;
-
-/**
- * A no-copy output stream.
- */
-class AVRO_DECL OutputStream : boost::noncopyable {
-protected:
-
- /**
- * An empty constuctor.
- */
- OutputStream() { }
-public:
-
- /**
- * Destructor.
- */
- virtual ~OutputStream() { }
-
- /**
- * Returns a buffer that can be written into.
- * On successful return, data has the pointer to the buffer
- * and len has the number of bytes available at data.
- */
- virtual bool next(uint8_t** data, size_t* len) = 0;
-
- /**
- * "Returns" back to the stream some of the buffer obtained
- * from in the last call to next().
- */
- virtual void backup(size_t len) = 0;
-
- /**
- * Number of bytes written so far into this stream. The whole buffer
- * returned by next() is assumed to be written unless some of
- * it was retutned using backup().
- */
- virtual uint64_t byteCount() const = 0;
-
- /**
- * Flushes any data remaining in the buffer to the stream's underlying
- * store, if any.
- */
- virtual void flush() = 0;
-};
-
-typedef std::unique_ptr<OutputStream> OutputStreamPtr;
-
-/**
- * Returns a new OutputStream, which grows in memory chunks of specified size.
- */
-AVRO_DECL OutputStreamPtr memoryOutputStream(size_t chunkSize = 4 * 1024);
-
-/**
- * Returns a new InputStream, with the data from the given byte array.
- * It does not copy the data, the byte array should remain valid
- * until the InputStream is used.
- */
-AVRO_DECL InputStreamPtr memoryInputStream(const uint8_t* data, size_t len);
-
-/**
- * Returns a new InputStream with the contents written into an
- * outputstream. The output stream must have been returned by
- * an earlier call to memoryOutputStream(). The contents for the new
- * input stream are the snapshot of the outputstream. One can construct
- * any number of memory input stream from a single memory output stream.
- */
-AVRO_DECL InputStreamPtr memoryInputStream(const OutputStream& source);
-
-/**
- * Returns the contents written so far into the output stream, which should
- * be a memory output stream. That is it must have been returned by a pervious
- * call to memoryOutputStream().
- */
-AVRO_DECL std::shared_ptr<std::vector<uint8_t> > snapshot(const OutputStream& source);
-
-/**
- * Returns a new OutputStream whose contents would be stored in a file.
- * Data is written in chunks of given buffer size.
- *
- * If there is a file with the given name, it is truncated and overwritten.
- * If there is no file with the given name, it is created.
- */
-AVRO_DECL OutputStreamPtr fileOutputStream(const char* filename,
- size_t bufferSize = 8 * 1024);
-
-/**
- * Returns a new InputStream whose contents come from the given file.
- * Data is read in chunks of given buffer size.
- */
-AVRO_DECL InputStreamPtr fileInputStream(
- const char *filename, size_t bufferSize = 8 * 1024);
-AVRO_DECL SeekableInputStreamPtr fileSeekableInputStream(
- const char *filename, size_t bufferSize = 8 * 1024);
-
-/**
- * Returns a new OutputStream whose contents will be sent to the given
- * std::ostream. The std::ostream object should outlive the returned
- * OutputStream.
- */
-AVRO_DECL OutputStreamPtr ostreamOutputStream(std::ostream& os,
- size_t bufferSize = 8 * 1024);
-
-/**
- * Returns a new InputStream whose contents come from the given
- * std::istream. The std::istream object should outlive the returned
- * InputStream.
- */
-AVRO_DECL InputStreamPtr istreamInputStream(
- std::istream &in, size_t bufferSize = 8 * 1024);
-
-/**
- * Returns a new InputStream whose contents come from the given
- * std::istream. Use this instead of istreamInputStream if
- * the istream does not support seekg (e.g. compressed streams).
- * The returned InputStream would read off bytes instead of seeking.
- * Of, course it has a performance penalty when reading instead of seeking;
- * So, use this only when seekg does not work.
- * The std::istream object should outlive the returned
- * InputStream.
- */
-AVRO_DECL InputStreamPtr nonSeekableIstreamInputStream(
- std::istream& is, size_t bufferSize = 8 * 1024);
-
-
-/** A convenience class for reading from an InputStream */
-struct StreamReader {
- /**
- * The underlying input stream.
- */
- InputStream* in_;
-
- /**
- * The next location to read from.
- */
- const uint8_t* next_;
-
- /**
- * One past the last valid location.
- */
- const uint8_t* end_;
-
- /**
- * Constructs an empty reader.
- */
- StreamReader() : in_(0), next_(0), end_(0) { }
-
- /**
- * Constructs a reader with the given underlying stream.
- */
- StreamReader(InputStream& in) : in_(0), next_(0), end_(0) { reset(in); }
-
- /**
- * Replaces the current input stream with the given one after backing up
- * the original one if required.
- */
- void reset(InputStream& is) {
- if (in_ != 0 && end_ != next_) {
- in_->backup(end_ - next_);
- }
- in_ = &is;
- next_ = end_ = 0;
- }
-
- /**
- * Read just one byte from the underlying stream. If there are no
- * more data, throws an exception.
- */
- uint8_t read() {
- if (next_ == end_) {
- more();
- }
- return *next_++;
- }
-
- /**
- * Reads the given number of bytes from the underlying stream.
- * If there are not that many bytes, throws an exception.
- */
- void readBytes(uint8_t* b, size_t n) {
- while (n > 0) {
- if (next_ == end_) {
- more();
- }
- size_t q = end_ - next_;
- if (q > n) {
- q = n;
- }
- ::memcpy(b, next_, q);
- next_ += q;
- b += q;
- n -= q;
- }
- }
-
- /**
- * Skips the given number of bytes. Of there are not so that many
- * bytes, throws an exception.
- */
- void skipBytes(size_t n) {
- if (n > static_cast<size_t>(end_ - next_)) {
- n -= end_ - next_;
- next_ = end_;
- in_->skip(n);
- } else {
- next_ += n;
- }
- }
-
- /**
- * Get as many byes from the underlying stream as possible in a single
- * chunk.
- * \return true if some data could be obtained. False is no more
- * data is available on the stream.
- */
- bool fill() {
- size_t n = 0;
- while (in_->next(&next_, &n)) {
- if (n != 0) {
- end_ = next_ + n;
- return true;
- }
- }
- return false;
- }
-
- /**
- * Tries to get more data and if it cannot, throws an exception.
- */
- void more() {
- if (! fill()) {
- throw Exception("EOF reached");
- }
- }
-
- /**
- * Returns true if and only if the end of stream is not reached.
- */
- bool hasMore() {
- return (next_ == end_) ? fill() : true;
- }
-
- /**
- * Returns unused bytes back to the underlying stream.
- * If unRead is true the last byte read is also pushed back.
- */
- void drain(bool unRead) {
- if (unRead) {
- --next_;
- }
- in_->backup(end_ - next_);
- end_ = next_;
- }
-};
-
-/**
- * A convinience class to write data into an OutputStream.
- */
-struct StreamWriter {
- /**
- * The underlying output stream for this writer.
- */
- OutputStream* out_;
-
- /**
- * The next location to write to.
- */
- uint8_t* next_;
-
- /**
- * One past the last location one can write to.
- */
- uint8_t* end_;
-
- /**
- * Constructs a writer with no underlying stream.
- */
- StreamWriter() : out_(0), next_(0), end_(0) { }
-
- /**
- * Constructs a new writer with the given underlying stream.
- */
- StreamWriter(OutputStream& out) : out_(0), next_(0), end_(0) { reset(out); }
-
- /**
- * Replaces the current underlying stream with a new one.
- * If required, it backs up unused bytes in the previous stream.
- */
- void reset(OutputStream& os) {
- if (out_ != 0 && end_ != next_) {
- out_->backup(end_ - next_);
- }
- out_ = &os;
- next_ = end_;
- }
-
- /**
- * Writes a single byte.
- */
- void write(uint8_t c) {
- if (next_ == end_) {
- more();
- }
- *next_++ = c;
- }
-
- /**
- * Writes the specified number of bytes starting at \p b.
- */
- void writeBytes(const uint8_t* b, size_t n) {
- while (n > 0) {
- if (next_ == end_) {
- more();
- }
- size_t q = end_ - next_;
- if (q > n) {
- q = n;
- }
- ::memcpy(next_, b, q);
- next_ += q;
- b += q;
- n -= q;
- }
- }
-
- /**
- * backs up upto the currently written data and flushes the
- * underlying stream.
- */
- void flush() {
- if (next_ != end_) {
- out_->backup(end_ - next_);
- next_ = end_;
- }
- out_->flush();
- }
-
- /**
- * Return the number of bytes written so far. For a meaningful
- * result, call this after a flush().
- */
- int64_t byteCount() const {
- return out_->byteCount();
- }
-
- /**
- * Gets more space to write to. Throws an exception it cannot.
- */
- void more() {
- size_t n = 0;
- while (out_->next(&next_, &n)) {
- if (n != 0) {
- end_ = next_ + n;
- return;
- }
- }
- throw Exception("EOF reached");
- }
-};
-
-/**
- * A convenience function to copy all the contents of an input stream into
- * an output stream.
- */
-inline void copy(InputStream& in, OutputStream& out)
-{
- const uint8_t *p = 0;
- size_t n = 0;
- StreamWriter w(out);
- while (in.next(&p, &n)) {
- w.writeBytes(p, n);
- }
- w.flush();
-}
-
-} // namespace avro
-#endif
-
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Stream_hh__
+#define avro_Stream_hh__
+
+#include <memory>
+#include <string.h>
+#include <stdint.h>
+
+#include "boost/utility.hpp"
+
+#include "Config.hh"
+#include "Exception.hh"
+
+namespace avro {
+
+/**
+ * A no-copy input stream.
+ */
+class AVRO_DECL InputStream : boost::noncopyable {
+protected:
+
+ /**
+ * An empty constructor.
+ */
+ InputStream() { }
+
+public:
+ /**
+ * Destructor.
+ */
+ virtual ~InputStream() { }
+
+ /**
+ * Returns some of the available data.
+ *
+ * Returns true if some data is available, false if no more data is
+ * available or an error has occurred.
+ */
+ virtual bool next(const uint8_t** data, size_t* len) = 0;
+
+ /**
+ * "Returns" back some of the data to the stream. The returned
+ * data must be less than what was obtained in the last call to
+ * next().
+ */
+ virtual void backup(size_t len) = 0;
+
+ /**
+ * Skips the number of bytes specified by len.
+ */
+ virtual void skip(size_t len) = 0;
+
+ /**
+ * Returns the number of bytes read from this stream so far.
+ * All the bytes made available through next() are considered
+ * to be used unless returned back using backup().
+ */
+ virtual size_t byteCount() const = 0;
+};
+
+typedef std::unique_ptr<InputStream> InputStreamPtr;
+
+/**
+ * An InputStream which also supports seeking to a specific offset.
+ */
+class AVRO_DECL SeekableInputStream : public InputStream {
+protected:
+
+ /**
+ * An empty constructor.
+ */
+ SeekableInputStream() { }
+
+public:
+ /**
+ * Destructor.
+ */
+ virtual ~SeekableInputStream() { }
+
+ /**
+ * Seek to a specific position in the stream. This may invalidate pointers
+ * returned from next(). This will also reset byteCount() to the given
+ * position.
+ */
+ virtual void seek(int64_t position) = 0;
+};
+
+typedef std::unique_ptr<SeekableInputStream> SeekableInputStreamPtr;
+
+/**
+ * A no-copy output stream.
+ */
+class AVRO_DECL OutputStream : boost::noncopyable {
+protected:
+
+ /**
+ * An empty constructor.
+ */
+ OutputStream() { }
+public:
+
+ /**
+ * Destructor.
+ */
+ virtual ~OutputStream() { }
+
+ /**
+ * Returns a buffer that can be written into.
+ * On successful return, data has the pointer to the buffer
+ * and len has the number of bytes available at data.
+ */
+ virtual bool next(uint8_t** data, size_t* len) = 0;
+
+ /**
+ * "Returns" back to the stream some of the buffer obtained
+ * in the last call to next().
+ */
+ virtual void backup(size_t len) = 0;
+
+ /**
+ * Number of bytes written so far into this stream. The whole buffer
+ * returned by next() is assumed to be written unless some of
+ * it was returned using backup().
+ */
+ virtual uint64_t byteCount() const = 0;
+
+ /**
+ * Flushes any data remaining in the buffer to the stream's underlying
+ * store, if any.
+ */
+ virtual void flush() = 0;
+};
+
+typedef std::unique_ptr<OutputStream> OutputStreamPtr;
+
+/**
+ * Returns a new OutputStream, which grows in memory chunks of specified size.
+ */
+AVRO_DECL OutputStreamPtr memoryOutputStream(size_t chunkSize = 4 * 1024);
+
+/**
+ * Returns a new InputStream, with the data from the given byte array.
+ * It does not copy the data; the byte array should remain valid
+ * for as long as the InputStream is in use.
+ */
+AVRO_DECL InputStreamPtr memoryInputStream(const uint8_t* data, size_t len);
+
+/**
+ * Returns a new InputStream with the contents written into an
+ * output stream. The output stream must have been returned by
+ * an earlier call to memoryOutputStream(). The contents of the new
+ * input stream are a snapshot of the output stream. One can construct
+ * any number of memory input streams from a single memory output stream.
+ */
+AVRO_DECL InputStreamPtr memoryInputStream(const OutputStream& source);
+
+/**
+ * Returns the contents written so far into the output stream, which should
+ * be a memory output stream. That is, it must have been returned by a previous
+ * call to memoryOutputStream().
+ */
+AVRO_DECL std::shared_ptr<std::vector<uint8_t> > snapshot(const OutputStream& source);
+
+/**
+ * Returns a new OutputStream whose contents will be stored in a file.
+ * Data is written in chunks of given buffer size.
+ *
+ * If there is a file with the given name, it is truncated and overwritten.
+ * If there is no file with the given name, it is created.
+ */
+AVRO_DECL OutputStreamPtr fileOutputStream(const char* filename,
+ size_t bufferSize = 8 * 1024);
+
+/**
+ * Returns a new InputStream whose contents come from the given file.
+ * Data is read in chunks of given buffer size.
+ */
+AVRO_DECL InputStreamPtr fileInputStream(
+ const char *filename, size_t bufferSize = 8 * 1024);
+AVRO_DECL SeekableInputStreamPtr fileSeekableInputStream(
+ const char *filename, size_t bufferSize = 8 * 1024);
+
+/**
+ * Returns a new OutputStream whose contents will be sent to the given
+ * std::ostream. The std::ostream object should outlive the returned
+ * OutputStream.
+ */
+AVRO_DECL OutputStreamPtr ostreamOutputStream(std::ostream& os,
+ size_t bufferSize = 8 * 1024);
+
+/**
+ * Returns a new InputStream whose contents come from the given
+ * std::istream. The std::istream object should outlive the returned
+ * InputStream.
+ */
+AVRO_DECL InputStreamPtr istreamInputStream(
+ std::istream &in, size_t bufferSize = 8 * 1024);
+
+/**
+ * Returns a new InputStream whose contents come from the given
+ * std::istream. Use this instead of istreamInputStream if
+ * the istream does not support seekg (e.g. compressed streams).
+ * The returned InputStream reads bytes sequentially instead of seeking.
+ * Of course, reading instead of seeking carries a performance penalty,
+ * so use this only when seekg does not work.
+ * The std::istream object should outlive the returned
+ * InputStream.
+ */
+AVRO_DECL InputStreamPtr nonSeekableIstreamInputStream(
+ std::istream& is, size_t bufferSize = 8 * 1024);
+
+
+/** A convenience class for reading from an InputStream */
+struct StreamReader {
+ /**
+ * The underlying input stream.
+ */
+ InputStream* in_;
+
+ /**
+ * The next location to read from.
+ */
+ const uint8_t* next_;
+
+ /**
+ * One past the last valid location.
+ */
+ const uint8_t* end_;
+
+ /**
+ * Constructs an empty reader.
+ */
+ StreamReader() : in_(0), next_(0), end_(0) { }
+
+ /**
+ * Constructs a reader with the given underlying stream.
+ */
+ StreamReader(InputStream& in) : in_(0), next_(0), end_(0) { reset(in); }
+
+ /**
+ * Replaces the current input stream with the given one after backing up
+ * the original one if required.
+ */
+ void reset(InputStream& is) {
+ if (in_ != 0 && end_ != next_) {
+ in_->backup(end_ - next_);
+ }
+ in_ = &is;
+ next_ = end_ = 0;
+ }
+
+ /**
+ * Reads just one byte from the underlying stream. If there is no
+ * more data, throws an exception.
+ */
+ uint8_t read() {
+ if (next_ == end_) {
+ more();
+ }
+ return *next_++;
+ }
+
+ /**
+ * Reads the given number of bytes from the underlying stream.
+ * If there are not that many bytes, throws an exception.
+ */
+ void readBytes(uint8_t* b, size_t n) {
+ while (n > 0) {
+ if (next_ == end_) {
+ more();
+ }
+ size_t q = end_ - next_;
+ if (q > n) {
+ q = n;
+ }
+ ::memcpy(b, next_, q);
+ next_ += q;
+ b += q;
+ n -= q;
+ }
+ }
+
+ /**
+ * Skips the given number of bytes. If there are not that many
+ * bytes, throws an exception.
+ */
+ void skipBytes(size_t n) {
+ if (n > static_cast<size_t>(end_ - next_)) {
+ n -= end_ - next_;
+ next_ = end_;
+ in_->skip(n);
+ } else {
+ next_ += n;
+ }
+ }
+
+ /**
+ * Gets as many bytes from the underlying stream as possible in a
+ * single chunk.
+ * \return true if some data could be obtained; false if no more
+ * data is available on the stream.
+ */
+ bool fill() {
+ size_t n = 0;
+ while (in_->next(&next_, &n)) {
+ if (n != 0) {
+ end_ = next_ + n;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Tries to get more data and if it cannot, throws an exception.
+ */
+ void more() {
+ if (! fill()) {
+ throw Exception("EOF reached");
+ }
+ }
+
+ /**
+ * Returns true if and only if the end of stream is not reached.
+ */
+ bool hasMore() {
+ return (next_ == end_) ? fill() : true;
+ }
+
+ /**
+ * Returns unused bytes back to the underlying stream.
+ * If unRead is true the last byte read is also pushed back.
+ */
+ void drain(bool unRead) {
+ if (unRead) {
+ --next_;
+ }
+ in_->backup(end_ - next_);
+ end_ = next_;
+ }
+};
+
+/**
+ * A convenience class for writing data into an OutputStream.
+ */
+struct StreamWriter {
+ /**
+ * The underlying output stream for this writer.
+ */
+ OutputStream* out_;
+
+ /**
+ * The next location to write to.
+ */
+ uint8_t* next_;
+
+ /**
+ * One past the last location one can write to.
+ */
+ uint8_t* end_;
+
+ /**
+ * Constructs a writer with no underlying stream.
+ */
+ StreamWriter() : out_(0), next_(0), end_(0) { }
+
+ /**
+ * Constructs a new writer with the given underlying stream.
+ */
+ StreamWriter(OutputStream& out) : out_(0), next_(0), end_(0) { reset(out); }
+
+ /**
+ * Replaces the current underlying stream with a new one.
+ * If required, it backs up unused bytes in the previous stream.
+ */
+ void reset(OutputStream& os) {
+ if (out_ != 0 && end_ != next_) {
+ out_->backup(end_ - next_);
+ }
+ out_ = &os;
+ next_ = end_;
+ }
+
+ /**
+ * Writes a single byte.
+ */
+ void write(uint8_t c) {
+ if (next_ == end_) {
+ more();
+ }
+ *next_++ = c;
+ }
+
+ /**
+ * Writes the specified number of bytes starting at \p b.
+ */
+ void writeBytes(const uint8_t* b, size_t n) {
+ while (n > 0) {
+ if (next_ == end_) {
+ more();
+ }
+ size_t q = end_ - next_;
+ if (q > n) {
+ q = n;
+ }
+ ::memcpy(next_, b, q);
+ next_ += q;
+ b += q;
+ n -= q;
+ }
+ }
+
+ /**
+ * Returns any unwritten buffer space to the underlying stream and
+ * flushes it.
+ */
+ void flush() {
+ if (next_ != end_) {
+ out_->backup(end_ - next_);
+ next_ = end_;
+ }
+ out_->flush();
+ }
+
+ /**
+ * Returns the number of bytes written so far. For a meaningful
+ * result, call this after a flush().
+ */
+ int64_t byteCount() const {
+ return out_->byteCount();
+ }
+
+ /**
+ * Gets more space to write to. Throws an exception if it cannot.
+ */
+ void more() {
+ size_t n = 0;
+ while (out_->next(&next_, &n)) {
+ if (n != 0) {
+ end_ = next_ + n;
+ return;
+ }
+ }
+ throw Exception("EOF reached");
+ }
+};
+
+/**
+ * A convenience function to copy all the contents of an input stream into
+ * an output stream.
+ */
+inline void copy(InputStream& in, OutputStream& out)
+{
+ const uint8_t *p = 0;
+ size_t n = 0;
+ StreamWriter w(out);
+ while (in.next(&p, &n)) {
+ w.writeBytes(p, n);
+ }
+ w.flush();
+}
+
+} // namespace avro
+#endif
+
+
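
A short sketch of how the pieces in this header compose: memoryOutputStream() supplies a growable buffer, StreamWriter/StreamReader do chunked byte I/O on top of it, and memoryInputStream(const OutputStream&) snapshots whatever has been written. All names below are declared above.

#include <avro/Stream.hh>
#include <cassert>
#include <cstdint>

int main() {
    avro::OutputStreamPtr out = avro::memoryOutputStream(64);

    avro::StreamWriter w(*out);
    const uint8_t payload[] = {1, 2, 3, 4};
    w.writeBytes(payload, sizeof(payload));
    w.flush();  // returns unused buffer space and flushes the stream

    avro::InputStreamPtr in = avro::memoryInputStream(*out);
    avro::StreamReader r(*in);
    uint8_t back[sizeof(payload)];
    r.readBytes(back, sizeof(back));  // throws if fewer bytes are available
    assert(!r.hasMore());             // the snapshot held exactly four bytes
    return 0;
}
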
diff --git a/contrib/libs/apache/avro/api/Types.hh b/contrib/libs/apache/avro/api/Types.hh
index f42399e96b5..a72647c483a 100644
--- a/contrib/libs/apache/avro/api/Types.hh
+++ b/contrib/libs/apache/avro/api/Types.hh
@@ -1,114 +1,114 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Types_hh__
-#define avro_Types_hh__
-
-#include <iostream>
-
-#include "Config.hh"
-
-namespace avro {
-
-/**
- * The "type" for the schema.
- */
-enum Type {
-
- AVRO_STRING, /*!< String */
- AVRO_BYTES, /*!< Sequence of variable length bytes data */
- AVRO_INT, /*!< 32-bit integer */
- AVRO_LONG, /*!< 64-bit integer */
- AVRO_FLOAT, /*!< Floating point number */
- AVRO_DOUBLE, /*!< Double precision floating point number */
- AVRO_BOOL, /*!< Boolean value */
- AVRO_NULL, /*!< Null */
-
- AVRO_RECORD, /*!< Record, a sequence of fields */
- AVRO_ENUM, /*!< Enumeration */
- AVRO_ARRAY, /*!< Homogeneous array of some specific type */
- AVRO_MAP, /*!< Homogeneous map from string to some specific type */
- AVRO_UNION, /*!< Union of one or more types */
- AVRO_FIXED, /*!< Fixed number of bytes */
-
- AVRO_NUM_TYPES, /*!< Marker */
-
- // The following is a pseudo-type used in implementation
-
- AVRO_SYMBOLIC = AVRO_NUM_TYPES, /*!< User internally to avoid circular references. */
- AVRO_UNKNOWN = -1 /*!< Used internally. */
-
-};
-
-/**
- * Returns true if and only if the given type is a primitive.
- * Primitive types are: string, bytes, int, long, float, double, boolean
- * and null
- */
-inline bool isPrimitive(Type t) {
- return (t >= AVRO_STRING) && (t < AVRO_RECORD);
-}
-
-/**
- * Returns true if and only if the given type is a non primitive valid type.
- * Primitive types are: string, bytes, int, long, float, double, boolean
- * and null
- */
-inline bool isCompound(Type t) {
- return (t>= AVRO_RECORD) && (t < AVRO_NUM_TYPES);
-}
-
-/**
- * Returns true if and only if the given type is a valid avro type.
- */
-inline bool isAvroType(Type t) {
- return (t >= AVRO_STRING) && (t < AVRO_NUM_TYPES);
-}
-
-/**
- * Returns true if and only if the given type is within the valid range
- * of enumeration.
- */
-inline bool isAvroTypeOrPseudoType(Type t) {
- return (t >= AVRO_STRING) && (t <= AVRO_NUM_TYPES);
-}
-
-/**
- * Converts the given type into a string. Useful for generating messages.
- */
-AVRO_DECL const std::string& toString(Type type);
-
-/**
- * Writes a string form of the given type into the given ostream.
- */
-AVRO_DECL std::ostream &operator<< (std::ostream &os, avro::Type type);
-
-/// define a type to identify Null in template functions
-struct AVRO_DECL Null { };
-
-/**
- * Writes schema for null \p null type to \p os.
- * \param os The ostream to write to.
- * \param null The value to be written.
- */
-std::ostream& operator<< (std::ostream &os, const Null &null);
-
-} // namespace avro
-
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Types_hh__
+#define avro_Types_hh__
+
+#include <iostream>
+
+#include "Config.hh"
+
+namespace avro {
+
+/**
+ * The "type" for the schema.
+ */
+enum Type {
+
+ AVRO_STRING, /*!< String */
+ AVRO_BYTES, /*!< Sequence of variable length bytes data */
+ AVRO_INT, /*!< 32-bit integer */
+ AVRO_LONG, /*!< 64-bit integer */
+ AVRO_FLOAT, /*!< Floating point number */
+ AVRO_DOUBLE, /*!< Double precision floating point number */
+ AVRO_BOOL, /*!< Boolean value */
+ AVRO_NULL, /*!< Null */
+
+ AVRO_RECORD, /*!< Record, a sequence of fields */
+ AVRO_ENUM, /*!< Enumeration */
+ AVRO_ARRAY, /*!< Homogeneous array of some specific type */
+ AVRO_MAP, /*!< Homogeneous map from string to some specific type */
+ AVRO_UNION, /*!< Union of one or more types */
+ AVRO_FIXED, /*!< Fixed number of bytes */
+
+ AVRO_NUM_TYPES, /*!< Marker */
+
+ // The following is a pseudo-type used in implementation
+
+ AVRO_SYMBOLIC = AVRO_NUM_TYPES, /*!< Used internally to avoid circular references. */
+ AVRO_UNKNOWN = -1 /*!< Used internally. */
+
+};
+
+/**
+ * Returns true if and only if the given type is a primitive.
+ * Primitive types are: string, bytes, int, long, float, double, boolean
+ * and null
+ */
+inline bool isPrimitive(Type t) {
+ return (t >= AVRO_STRING) && (t < AVRO_RECORD);
+}
+
+/**
+ * Returns true if and only if the given type is a valid non-primitive type.
+ * Primitive types are: string, bytes, int, long, float, double, boolean
+ * and null
+ */
+inline bool isCompound(Type t) {
+ return (t >= AVRO_RECORD) && (t < AVRO_NUM_TYPES);
+}
+
+/**
+ * Returns true if and only if the given type is a valid avro type.
+ */
+inline bool isAvroType(Type t) {
+ return (t >= AVRO_STRING) && (t < AVRO_NUM_TYPES);
+}
+
+/**
+ * Returns true if and only if the given type is within the valid range
+ * of enumeration.
+ */
+inline bool isAvroTypeOrPseudoType(Type t) {
+ return (t >= AVRO_STRING) && (t <= AVRO_NUM_TYPES);
+}
+
+/**
+ * Converts the given type into a string. Useful for generating messages.
+ */
+AVRO_DECL const std::string& toString(Type type);
+
+/**
+ * Writes a string form of the given type into the given ostream.
+ */
+AVRO_DECL std::ostream &operator<< (std::ostream &os, avro::Type type);
+
+/// define a type to identify Null in template functions
+struct AVRO_DECL Null { };
+
+/**
+ * Writes schema for null \p null type to \p os.
+ * \param os The ostream to write to.
+ * \param null The value to be written.
+ */
+std::ostream& operator<< (std::ostream &os, const Null &null);
+
+} // namespace avro
+
+
+#endif
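
The predicates above are simple range checks over the enum layout: primitives occupy [AVRO_STRING, AVRO_RECORD) and compound types occupy [AVRO_RECORD, AVRO_NUM_TYPES). A small usage sketch; the exact text printed by operator<< comes from toString(), which is implemented elsewhere in the library, so the "map" output is an assumption:

#include <avro/Types.hh>
#include <iostream>

int main() {
    bool a = avro::isPrimitive(avro::AVRO_LONG);    // true: long is primitive
    bool b = avro::isCompound(avro::AVRO_RECORD);   // true: record is compound
    bool c = avro::isAvroType(avro::AVRO_UNKNOWN);  // false: pseudo-type
    std::cout << avro::AVRO_MAP << '\n';            // presumably prints "map"
    return (a && b && !c) ? 0 : 1;
}
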
diff --git a/contrib/libs/apache/avro/api/ValidSchema.hh b/contrib/libs/apache/avro/api/ValidSchema.hh
index 7b0ec28bed1..eeff4ef6107 100644
--- a/contrib/libs/apache/avro/api/ValidSchema.hh
+++ b/contrib/libs/apache/avro/api/ValidSchema.hh
@@ -1,66 +1,66 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_ValidSchema_hh__
-#define avro_ValidSchema_hh__
-
-#include "Config.hh"
-#include "Node.hh"
-
-namespace avro {
-
-class AVRO_DECL Schema;
-
-/// A ValidSchema is basically a non-mutable Schema that has passed some
-/// minimum of sanity checks. Once validated, any Schema that is part of
-/// this ValidSchema is considered locked, and cannot be modified (an attempt
-/// to modify a locked Schema will throw). Also, as it is validated, any
-/// recursive duplications of schemas are replaced with symbolic links to the
-/// original.
-///
-/// Once a Schema is converted to a valid schema it can be used in validating
-/// parsers/serializers, converted to a json schema, etc.
-///
-
-class AVRO_DECL ValidSchema {
-public:
- explicit ValidSchema(const NodePtr &root);
- explicit ValidSchema(const Schema &schema);
- ValidSchema();
-
- void setSchema(const Schema &schema);
-
- const NodePtr &root() const {
- return root_;
- }
-
- void toJson(std::ostream &os) const;
- std::string toJson(bool prettyPrint = true) const;
-
- void toFlatList(std::ostream &os) const;
-
- protected:
- NodePtr root_;
-
- private:
- static std::string compactSchema(const std::string &schema);
-};
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_ValidSchema_hh__
+#define avro_ValidSchema_hh__
+
+#include "Config.hh"
+#include "Node.hh"
+
+namespace avro {
+
+class AVRO_DECL Schema;
+
+/// A ValidSchema is essentially an immutable Schema that has passed a
+/// minimum set of sanity checks. Once validated, any Schema that is part of
+/// this ValidSchema is considered locked, and cannot be modified (an attempt
+/// to modify a locked Schema will throw). Also, as it is validated, any
+/// recursive duplications of schemas are replaced with symbolic links to the
+/// original.
+///
+/// Once a Schema is converted to a valid schema it can be used in validating
+/// parsers/serializers, converted to a json schema, etc.
+///
+
+class AVRO_DECL ValidSchema {
+public:
+ explicit ValidSchema(const NodePtr &root);
+ explicit ValidSchema(const Schema &schema);
+ ValidSchema();
+
+ void setSchema(const Schema &schema);
+
+ const NodePtr &root() const {
+ return root_;
+ }
+
+ void toJson(std::ostream &os) const;
+ std::string toJson(bool prettyPrint = true) const;
+
+ void toFlatList(std::ostream &os) const;
+
+ protected:
+ NodePtr root_;
+
+ private:
+ static std::string compactSchema(const std::string &schema);
+};
+
+} // namespace avro
+
+#endif
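
In practice a ValidSchema is obtained by compiling JSON schema text rather than by building Schema objects by hand. A sketch, assuming the compileJsonSchemaFromString() helper from the library's Compiler.hh (not part of this hunk):

#include <avro/Compiler.hh>
#include <avro/ValidSchema.hh>
#include <iostream>

int main() {
    const char* json =
        "{\"type\":\"record\",\"name\":\"r\","
        "\"fields\":[{\"name\":\"f\",\"type\":\"long\"}]}";
    avro::ValidSchema schema = avro::compileJsonSchemaFromString(json);
    std::cout << schema.toJson(true);  // pretty-printed round trip
    return 0;
}
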
diff --git a/contrib/libs/apache/avro/api/Validator.hh b/contrib/libs/apache/avro/api/Validator.hh
index 3f542d611a4..8000f6fd225 100644
--- a/contrib/libs/apache/avro/api/Validator.hh
+++ b/contrib/libs/apache/avro/api/Validator.hh
@@ -1,161 +1,161 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Validating_hh__
-#define avro_Validating_hh__
-
-#include <boost/noncopyable.hpp>
-#include <vector>
-#include <stdint.h>
-
-#include "Config.hh"
-#include "Types.hh"
-#include "ValidSchema.hh"
-
-namespace avro {
-
-class AVRO_DECL NullValidator : private boost::noncopyable
-{
- public:
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Validating_hh__
+#define avro_Validating_hh__
+
+#include <boost/noncopyable.hpp>
+#include <vector>
+#include <stdint.h>
+
+#include "Config.hh"
+#include "Types.hh"
+#include "ValidSchema.hh"
+
+namespace avro {
+
+class AVRO_DECL NullValidator : private boost::noncopyable
+{
+ public:
+
explicit NullValidator(const ValidSchema &) {}
- NullValidator() {}
-
+ NullValidator() {}
+
void setCount(int64_t) {}
-
+
bool typeIsExpected(Type) const {
- return true;
- }
-
- Type nextTypeExpected() const {
- return AVRO_UNKNOWN;
- }
-
- int nextSizeExpected() const {
- return 0;
- }
-
+ return true;
+ }
+
+ Type nextTypeExpected() const {
+ return AVRO_UNKNOWN;
+ }
+
+ int nextSizeExpected() const {
+ return 0;
+ }
+
bool getCurrentRecordName(std::string &) const {
- return true;
- }
-
+ return true;
+ }
+
bool getNextFieldName(std::string &) const {
- return true;
- }
-
+ return true;
+ }
+
void checkTypeExpected(Type) { }
void checkFixedSizeExpected(int) { }
-
-
-};
-
-/// This class is used by both the ValidatingSerializer and ValidationParser
-/// objects. It advances the parse tree (containing logic how to advance
-/// through the various compound types, for example a record must advance
-/// through all leaf nodes but a union only skips to one), and reports which
-/// type is next.
-
-class AVRO_DECL Validator : private boost::noncopyable
-{
- public:
-
- explicit Validator(const ValidSchema &schema);
-
- void setCount(int64_t val);
-
- bool typeIsExpected(Type type) const {
- return (expectedTypesFlag_ & typeToFlag(type)) != 0;
- }
-
- Type nextTypeExpected() const {
- return nextType_;
- }
-
- int nextSizeExpected() const;
-
- bool getCurrentRecordName(std::string &name) const;
- bool getNextFieldName(std::string &name) const;
-
- void checkTypeExpected(Type type) {
- if(! typeIsExpected(type)) {
- throw Exception(
- boost::format("Type %1% does not match schema %2%")
- % type % nextType_
- );
- }
- advance();
- }
-
- void checkFixedSizeExpected(int size) {
- if( nextSizeExpected() != size) {
- throw Exception(
- boost::format("Wrong size for fixed, got %1%, expected %2%")
- % size % nextSizeExpected()
- );
- }
- checkTypeExpected(AVRO_FIXED);
- }
-
- private:
-
- typedef uint32_t flag_t;
-
- flag_t typeToFlag(Type type) const {
- flag_t flag = (1L << type);
- return flag;
- }
-
- void setupOperation(const NodePtr &node);
-
- void setWaitingForCount();
-
- void advance();
- void doAdvance();
-
- void enumAdvance();
- bool countingSetup();
- void countingAdvance();
- void unionAdvance();
- void fixedAdvance();
-
- void setupFlag(Type type);
-
- const ValidSchema schema_;
-
- Type nextType_;
- flag_t expectedTypesFlag_;
- bool compoundStarted_;
- bool waitingForCount_;
- int64_t count_;
-
- struct CompoundType {
- explicit CompoundType(const NodePtr &n) :
- node(n), pos(0)
- {}
- NodePtr node; ///< save the node
- size_t pos; ///< track the leaf position to visit
- };
-
- std::vector<CompoundType> compoundStack_;
- std::vector<size_t> counters_;
-
-};
-
-} // namespace avro
-
-#endif
+
+
+};
+
+/// This class is used by both the ValidatingSerializer and ValidationParser
+/// objects. It advances the parse tree (which contains the logic for moving
+/// through the various compound types; for example, a record must advance
+/// through all of its leaf nodes, whereas a union skips to just one) and
+/// reports which type is expected next.
+
+class AVRO_DECL Validator : private boost::noncopyable
+{
+ public:
+
+ explicit Validator(const ValidSchema &schema);
+
+ void setCount(int64_t val);
+
+ bool typeIsExpected(Type type) const {
+ return (expectedTypesFlag_ & typeToFlag(type)) != 0;
+ }
+
+ Type nextTypeExpected() const {
+ return nextType_;
+ }
+
+ int nextSizeExpected() const;
+
+ bool getCurrentRecordName(std::string &name) const;
+ bool getNextFieldName(std::string &name) const;
+
+ void checkTypeExpected(Type type) {
+ if(! typeIsExpected(type)) {
+ throw Exception(
+ boost::format("Type %1% does not match schema %2%")
+ % type % nextType_
+ );
+ }
+ advance();
+ }
+
+ void checkFixedSizeExpected(int size) {
+ if( nextSizeExpected() != size) {
+ throw Exception(
+ boost::format("Wrong size for fixed, got %1%, expected %2%")
+ % size % nextSizeExpected()
+ );
+ }
+ checkTypeExpected(AVRO_FIXED);
+ }
+
+ private:
+
+ typedef uint32_t flag_t;
+
+ flag_t typeToFlag(Type type) const {
+ flag_t flag = (1L << type);
+ return flag;
+ }
+
+ void setupOperation(const NodePtr &node);
+
+ void setWaitingForCount();
+
+ void advance();
+ void doAdvance();
+
+ void enumAdvance();
+ bool countingSetup();
+ void countingAdvance();
+ void unionAdvance();
+ void fixedAdvance();
+
+ void setupFlag(Type type);
+
+ const ValidSchema schema_;
+
+ Type nextType_;
+ flag_t expectedTypesFlag_;
+ bool compoundStarted_;
+ bool waitingForCount_;
+ int64_t count_;
+
+ struct CompoundType {
+ explicit CompoundType(const NodePtr &n) :
+ node(n), pos(0)
+ {}
+ NodePtr node; ///< save the node
+ size_t pos; ///< track the leaf position to visit
+ };
+
+ std::vector<CompoundType> compoundStack_;
+ std::vector<size_t> counters_;
+
+};
+
+} // namespace avro
+
+#endif
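
The Validator is normally driven indirectly by a validating writer or parser, but it can also be stepped by hand: each checkTypeExpected() call either advances the parse tree or throws. A sketch under stated assumptions: compileJsonSchemaFromString() comes from Compiler.hh, and a bare "long" schema is assumed to expect AVRO_LONG first.

#include <avro/Compiler.hh>
#include <avro/Validator.hh>

int main() {
    avro::ValidSchema s = avro::compileJsonSchemaFromString("\"long\"");
    avro::Validator v(s);
    if (!v.typeIsExpected(avro::AVRO_LONG)) return 1;  // assumed expectation
    v.checkTypeExpected(avro::AVRO_LONG);  // matches: advances the parse tree
    // v.checkTypeExpected(avro::AVRO_STRING);  // mismatch: would throw
    return 0;
}
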
diff --git a/contrib/libs/apache/avro/api/Writer.hh b/contrib/libs/apache/avro/api/Writer.hh
index 74b057ce65b..7e74ce403e0 100644
--- a/contrib/libs/apache/avro/api/Writer.hh
+++ b/contrib/libs/apache/avro/api/Writer.hh
@@ -1,186 +1,186 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Writer_hh__
-#define avro_Writer_hh__
-
-#include <array>
-#include <boost/noncopyable.hpp>
-
-#include "Config.hh"
-#include "buffer/Buffer.hh"
-#include "Zigzag.hh"
-#include "Types.hh"
-#include "Validator.hh"
-
-namespace avro {
-
-/// Class for writing avro data to a stream.
-
-template<class ValidatorType>
-class WriterImpl : private boost::noncopyable
-{
-
- public:
-
- WriterImpl() {}
-
- explicit WriterImpl(const ValidSchema &schema) :
- validator_(schema)
- {}
-
- void writeValue(const Null &) {
- validator_.checkTypeExpected(AVRO_NULL);
- }
-
- void writeValue(bool val) {
- validator_.checkTypeExpected(AVRO_BOOL);
- int8_t byte = (val != 0);
- buffer_.writeTo(byte);
- }
-
- void writeValue(int32_t val) {
- validator_.checkTypeExpected(AVRO_INT);
- std::array<uint8_t, 5> bytes;
- size_t size = encodeInt32(val, bytes);
- buffer_.writeTo(reinterpret_cast<const char *>(bytes.data()), size);
- }
-
- void writeValue(int64_t val) {
- validator_.checkTypeExpected(AVRO_LONG);
- putLong(val);
- }
-
- void writeValue(float val) {
- validator_.checkTypeExpected(AVRO_FLOAT);
- union {
- float f;
- int32_t i;
- } v;
-
- v.f = val;
- buffer_.writeTo(v.i);
- }
-
- void writeValue(double val) {
- validator_.checkTypeExpected(AVRO_DOUBLE);
- union {
- double d;
- int64_t i;
- } v;
-
- v.d = val;
- buffer_.writeTo(v.i);
- }
-
- void writeValue(const std::string &val) {
- validator_.checkTypeExpected(AVRO_STRING);
- putBytes(val.c_str(), val.size());
- }
-
- void writeBytes(const void *val, size_t size) {
- validator_.checkTypeExpected(AVRO_BYTES);
- putBytes(val, size);
- }
-
- template <size_t N>
- void writeFixed(const uint8_t (&val)[N]) {
- validator_.checkFixedSizeExpected(N);
- buffer_.writeTo(reinterpret_cast<const char *>(val), N);
- }
-
- template <size_t N>
- void writeFixed(const std::array<uint8_t, N> &val) {
- validator_.checkFixedSizeExpected(val.size());
- buffer_.writeTo(reinterpret_cast<const char *>(val.data()), val.size());
- }
-
- void writeRecord() {
- validator_.checkTypeExpected(AVRO_RECORD);
- validator_.checkTypeExpected(AVRO_LONG);
- validator_.setCount(1);
- }
-
- void writeRecordEnd() {
- validator_.checkTypeExpected(AVRO_RECORD);
- validator_.checkTypeExpected(AVRO_LONG);
- validator_.setCount(0);
- }
-
- void writeArrayBlock(int64_t size) {
- validator_.checkTypeExpected(AVRO_ARRAY);
- writeCount(size);
- }
-
- void writeArrayEnd() {
- writeArrayBlock(0);
- }
-
- void writeMapBlock(int64_t size) {
- validator_.checkTypeExpected(AVRO_MAP);
- writeCount(size);
- }
-
- void writeMapEnd() {
- writeMapBlock(0);
- }
-
- void writeUnion(int64_t choice) {
- validator_.checkTypeExpected(AVRO_UNION);
- writeCount(choice);
- }
-
- void writeEnum(int64_t choice) {
- validator_.checkTypeExpected(AVRO_ENUM);
- writeCount(choice);
- }
-
- InputBuffer buffer() const {
- return buffer_;
- }
-
- private:
-
- void putLong(int64_t val) {
- std::array<uint8_t, 10> bytes;
- size_t size = encodeInt64(val, bytes);
- buffer_.writeTo(reinterpret_cast<const char *>(bytes.data()), size);
- }
-
- void putBytes(const void *val, size_t size) {
- putLong(size);
- buffer_.writeTo(reinterpret_cast<const char *>(val), size);
- }
-
- void writeCount(int64_t count) {
- validator_.checkTypeExpected(AVRO_LONG);
- validator_.setCount(count);
- putLong(count);
- }
-
- ValidatorType validator_;
- OutputBuffer buffer_;
-
-};
-
-typedef WriterImpl<NullValidator> Writer;
-typedef WriterImpl<Validator> ValidatingWriter;
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Writer_hh__
+#define avro_Writer_hh__
+
+#include <array>
+#include <boost/noncopyable.hpp>
+
+#include "Config.hh"
+#include "buffer/Buffer.hh"
+#include "Zigzag.hh"
+#include "Types.hh"
+#include "Validator.hh"
+
+namespace avro {
+
+/// Class for writing avro data to a stream.
+
+template<class ValidatorType>
+class WriterImpl : private boost::noncopyable
+{
+
+ public:
+
+ WriterImpl() {}
+
+ explicit WriterImpl(const ValidSchema &schema) :
+ validator_(schema)
+ {}
+
+ void writeValue(const Null &) {
+ validator_.checkTypeExpected(AVRO_NULL);
+ }
+
+ void writeValue(bool val) {
+ validator_.checkTypeExpected(AVRO_BOOL);
+ int8_t byte = (val != 0);
+ buffer_.writeTo(byte);
+ }
+
+ void writeValue(int32_t val) {
+ validator_.checkTypeExpected(AVRO_INT);
+ std::array<uint8_t, 5> bytes;
+ size_t size = encodeInt32(val, bytes);
+ buffer_.writeTo(reinterpret_cast<const char *>(bytes.data()), size);
+ }
+
+ void writeValue(int64_t val) {
+ validator_.checkTypeExpected(AVRO_LONG);
+ putLong(val);
+ }
+
+ void writeValue(float val) {
+ validator_.checkTypeExpected(AVRO_FLOAT);
+ union {
+ float f;
+ int32_t i;
+ } v;
+
+ v.f = val;
+ buffer_.writeTo(v.i);
+ }
+
+ void writeValue(double val) {
+ validator_.checkTypeExpected(AVRO_DOUBLE);
+ union {
+ double d;
+ int64_t i;
+ } v;
+
+ v.d = val;
+ buffer_.writeTo(v.i);
+ }
+
+ void writeValue(const std::string &val) {
+ validator_.checkTypeExpected(AVRO_STRING);
+ putBytes(val.c_str(), val.size());
+ }
+
+ void writeBytes(const void *val, size_t size) {
+ validator_.checkTypeExpected(AVRO_BYTES);
+ putBytes(val, size);
+ }
+
+ template <size_t N>
+ void writeFixed(const uint8_t (&val)[N]) {
+ validator_.checkFixedSizeExpected(N);
+ buffer_.writeTo(reinterpret_cast<const char *>(val), N);
+ }
+
+ template <size_t N>
+ void writeFixed(const std::array<uint8_t, N> &val) {
+ validator_.checkFixedSizeExpected(val.size());
+ buffer_.writeTo(reinterpret_cast<const char *>(val.data()), val.size());
+ }
+
+ void writeRecord() {
+ validator_.checkTypeExpected(AVRO_RECORD);
+ validator_.checkTypeExpected(AVRO_LONG);
+ validator_.setCount(1);
+ }
+
+ void writeRecordEnd() {
+ validator_.checkTypeExpected(AVRO_RECORD);
+ validator_.checkTypeExpected(AVRO_LONG);
+ validator_.setCount(0);
+ }
+
+ void writeArrayBlock(int64_t size) {
+ validator_.checkTypeExpected(AVRO_ARRAY);
+ writeCount(size);
+ }
+
+ void writeArrayEnd() {
+ writeArrayBlock(0);
+ }
+
+ void writeMapBlock(int64_t size) {
+ validator_.checkTypeExpected(AVRO_MAP);
+ writeCount(size);
+ }
+
+ void writeMapEnd() {
+ writeMapBlock(0);
+ }
+
+ void writeUnion(int64_t choice) {
+ validator_.checkTypeExpected(AVRO_UNION);
+ writeCount(choice);
+ }
+
+ void writeEnum(int64_t choice) {
+ validator_.checkTypeExpected(AVRO_ENUM);
+ writeCount(choice);
+ }
+
+ InputBuffer buffer() const {
+ return buffer_;
+ }
+
+ private:
+
+ void putLong(int64_t val) {
+ std::array<uint8_t, 10> bytes;
+ size_t size = encodeInt64(val, bytes);
+ buffer_.writeTo(reinterpret_cast<const char *>(bytes.data()), size);
+ }
+
+ void putBytes(const void *val, size_t size) {
+ putLong(size);
+ buffer_.writeTo(reinterpret_cast<const char *>(val), size);
+ }
+
+ void writeCount(int64_t count) {
+ validator_.checkTypeExpected(AVRO_LONG);
+ validator_.setCount(count);
+ putLong(count);
+ }
+
+ ValidatorType validator_;
+ OutputBuffer buffer_;
+
+};
+
+typedef WriterImpl<NullValidator> Writer;
+typedef WriterImpl<Validator> ValidatingWriter;
+
+} // namespace avro
+
+#endif
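
With the NullValidator typedef, Writer performs no schema checking and simply encodes each value into its growing OutputBuffer. A minimal sketch using the names declared above; InputBuffer::size() is assumed from Buffer.hh:

#include <avro/Writer.hh>
#include <cstdint>
#include <string>

int main() {
    avro::Writer w;                       // NullValidator: no schema checks
    w.writeValue(int32_t(42));            // zigzag + varint encoded
    w.writeValue(3.5);                    // raw IEEE-754 bits
    w.writeValue(std::string("hi"));      // length-prefixed string
    avro::InputBuffer buf = w.buffer();   // read-only view of the bytes
    return buf.size() > 0 ? 0 : 1;        // size() assumed from Buffer.hh
}
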
diff --git a/contrib/libs/apache/avro/api/Zigzag.hh b/contrib/libs/apache/avro/api/Zigzag.hh
index d0259b8d50c..ed76aae9316 100644
--- a/contrib/libs/apache/avro/api/Zigzag.hh
+++ b/contrib/libs/apache/avro/api/Zigzag.hh
@@ -1,43 +1,43 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Encoding_hh__
-#define avro_Encoding_hh__
-
-#include <stdint.h>
-#include <array>
-#include <cstddef>
-
-#include "Config.hh"
-/// \file
-/// Functions for encoding and decoding integers with zigzag compression
-
-namespace avro {
-
-AVRO_DECL uint64_t encodeZigzag64(int64_t input);
-AVRO_DECL int64_t decodeZigzag64(uint64_t input);
-
-AVRO_DECL uint32_t encodeZigzag32(int32_t input);
-AVRO_DECL int32_t decodeZigzag32(uint32_t input);
-
-AVRO_DECL size_t encodeInt32(int32_t input, std::array<uint8_t, 5> &output);
-AVRO_DECL size_t encodeInt64(int64_t input, std::array<uint8_t, 10> &output);
-
-} // namespace avro
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Encoding_hh__
+#define avro_Encoding_hh__
+
+#include <stdint.h>
+#include <array>
+#include <cstddef>
+
+#include "Config.hh"
+/// \file
+/// Functions for encoding and decoding integers with zigzag compression
+
+namespace avro {
+
+AVRO_DECL uint64_t encodeZigzag64(int64_t input);
+AVRO_DECL int64_t decodeZigzag64(uint64_t input);
+
+AVRO_DECL uint32_t encodeZigzag32(int32_t input);
+AVRO_DECL int32_t decodeZigzag32(uint32_t input);
+
+AVRO_DECL size_t encodeInt32(int32_t input, std::array<uint8_t, 5> &output);
+AVRO_DECL size_t encodeInt64(int64_t input, std::array<uint8_t, 10> &output);
+
+} // namespace avro
+
+#endif
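
Zigzag encoding interleaves signed values so that small magnitudes map to small unsigned codes (0, -1, 1, -2, 2 become 0, 1, 2, 3, 4), which keeps the subsequent base-128 varint produced by encodeInt32()/encodeInt64() short. A sketch of the mapping:

#include <avro/Zigzag.hh>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
    assert(avro::encodeZigzag32(0) == 0u);
    assert(avro::encodeZigzag32(-1) == 1u);
    assert(avro::encodeZigzag32(1) == 2u);
    assert(avro::encodeZigzag32(-2) == 3u);
    assert(avro::decodeZigzag32(avro::encodeZigzag32(-12345)) == -12345);

    // encodeInt64() zigzags and then varint-encodes: -2 maps to 3,
    // which fits in the single byte 0x03.
    std::array<uint8_t, 10> buf;
    size_t n = avro::encodeInt64(-2, buf);
    assert(n == 1 && buf[0] == 0x03);
    return 0;
}
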
diff --git a/contrib/libs/apache/avro/api/buffer/Buffer.hh b/contrib/libs/apache/avro/api/buffer/Buffer.hh
index 7d7aaf8679c..b2a04dd6832 100644
--- a/contrib/libs/apache/avro/api/buffer/Buffer.hh
+++ b/contrib/libs/apache/avro/api/buffer/Buffer.hh
@@ -1,526 +1,526 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_Buffer_hh__
-#define avro_Buffer_hh__
-
-#ifndef _WIN32
-#include <sys/uio.h>
-#endif
-#include <vector>
-
-#include "../Config.hh"
-#include "detail/BufferDetail.hh"
-#include "detail/BufferDetailIterator.hh"
-
-/**
- * \file Buffer.hh
- *
- * \brief Definitions for InputBuffer and OutputBuffer classes
- *
- **/
-
-namespace avro {
-
-class OutputBuffer;
-class InputBuffer;
-
-
-/**
- * The OutputBuffer (write-only buffer)
- *
- * Use cases for OutputBuffer
- *
- * - write message to buffer using ostream class or directly
- * - append messages to headers
- * - building up streams of messages via append
- * - converting to read-only buffers for sending
- * - extracting parts of the messages into read-only buffers
- *
- * -# ASIO access:
- * - write to a buffer(s) by asio using iterator
- * - convert to read buffer for deserializing
- *
- * OutputBuffer is assignable and copy-constructable. On copy or assignment,
- * only a pointer is copied, so the two resulting copies are identical, so
- * modifying one will modify both.
- **/
-
-class AVRO_DECL OutputBuffer
-{
-
- public:
-
- typedef detail::size_type size_type;
- typedef detail::data_type data_type;
-
- /**
- * The asio library expects a const_iterator (the const-ness refers to the
- * fact that the underlying avro of buffers will not be modified, even
- * though the data in those buffers is being modified). The iterator
- * provides the list of addresses an operation can write to.
- **/
-
- typedef detail::OutputBufferIterator const_iterator;
-
- /**
- * Default constructor. Will pre-allocate at least the requested size, but
- * can grow larger on demand.
- *
- * Destructor uses the default, which resets a shared pointer, deleting the
- * underlying data if no other copies of exist.
- *
- * Copy and assignment operators are not explicitly provided because the
- * default ones work fine. The default makes only a shallow copy, so the
- * copies will refer to the same memory. This is required by asio
- * functions, which will implicitly make copies for asynchronous
- * operations. Therefore, the user must be careful that if they create
- * multiple copies of the same OutputBuffer, only one is being modified
- * otherwise undefined behavior may occur.
- *
- **/
-
- OutputBuffer(size_type reserveSize = 0) :
- pimpl_(new detail::BufferImpl)
- {
- if(reserveSize) {
- reserve(reserveSize);
- }
- }
-
- /**
- * Reserve enough space for a wroteTo() operation. When using writeTo(),
- * the buffer will grow dynamically as needed. But when using the iterator
- * to write (followed by wroteTo()), data may only be written to the space
- * available, so this ensures there is enough room in the buffer before
- * the write operation.
- **/
-
- void reserve(size_type reserveSize)
- {
- pimpl_->reserveFreeSpace(reserveSize);
- }
-
- /**
- * Write a block of data to the buffer. The buffer size will automatically
- * grow if the size is larger than what is currently free.
- **/
-
- size_type writeTo(const data_type *data, size_type size) {
- return pimpl_->writeTo(data, size);
- }
-
- /**
- * Write a single value to the buffer. The buffer size will automatically
- * grow if there is not room for the byte. The value must be a
- * "fundamental" type, e.g. int, float, etc. (otherwise use the other
- * writeTo tests).
- **/
-
- template<typename T>
- void writeTo(T val) {
- pimpl_->writeTo(val, std::is_fundamental<T>());
- }
-
- /**
- * Update the state of the buffer after writing through the iterator
- * interface. This function exists primarily for the boost:asio which
- * writes directly to the buffer using its iterator. In this case, the
- * internal state of the buffer does not reflect that the data was written
- * This informs the buffer how much data was written.
- *
- * The buffer does not automatically resize in this case, the bytes written
- * cannot exceed the amount of free space. Attempting to write more will
- * throw a std::length_error exception.
- **/
-
- size_type wroteTo(size_type size)
- {
- int wrote = 0;
- if(size) {
- if(size > freeSpace()) {
- throw std::length_error("Impossible to write more data than free space");
- }
- wrote = pimpl_->wroteTo(size);
- }
- return wrote;
- }
-
- /**
- * Does the buffer have any data?
- **/
-
- bool empty() const {
- return (pimpl_->size()==0);
- }
-
- /**
- * Returns the size of the buffer, in bytes.
- */
-
- size_type size() const {
- return pimpl_->size();
- }
-
- /**
- * Returns the current free space that is available to write to in the
- * buffer, in bytes. This is not a strict limit in size, as writeTo() can
- * automatically increase capacity if necessary.
- **/
-
- size_type freeSpace() const {
- return pimpl_->freeSpace();
- }
-
- /**
- * Appends the data in the argument to the end of this buffer. The
- * argument can be either an InputBuffer or OutputBuffer.
- *
- **/
-
- template <class BufferType>
- void append(const BufferType &buf) {
- // don't append an empty buffer
- if(buf.size()) {
- pimpl_->append(*(buf.pimpl_.get()));
- }
- }
-
- /**
- * Return an iterator pointing to the first data chunk of this buffer
- * that may be written to.
- **/
-
- const_iterator begin() const {
- return const_iterator(pimpl_->beginWrite());
- }
-
- /**
- * Return the end iterator for writing.
- **/
-
- const_iterator end() const {
- return const_iterator(pimpl_->endWrite());
- }
-
- /**
- * Discard any data in this buffer.
- **/
-
- void discardData()
- {
- pimpl_->discardData();
- }
-
- /**
- * Discard the specified number of bytes from this data, starting at the beginning.
- * Throws if the size is greater than the number of bytes.
- **/
-
- void discardData(size_t bytes)
- {
- if(bytes > 0) {
- if(bytes < pimpl_->size()) {
- pimpl_->discardData(bytes);
- }
- else if(bytes == pimpl_->size()) {
- pimpl_->discardData();
- }
- else {
- throw std::out_of_range("trying to discard more data than exists");
- }
- }
- }
-
- /**
- * Remove bytes from this buffer, starting from the beginning, and place
- * them into a new buffer. Throws if the number of requested bytes exceeds
-     * the size of the buffer. Data beyond the extracted bytes, along with any
-     * free space, remains in this buffer.
- **/
-
- InputBuffer extractData(size_type bytes);
-
- /**
- * Remove all bytes from this buffer, returning them in a new buffer.
- * After removing data, some freeSpace may remain in this buffer.
- **/
-
- InputBuffer extractData();
-
- /**
- * Clone this buffer, creating a copy that contains the same data.
- **/
-
- OutputBuffer clone() const
- {
- detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl(*pimpl_));
- return OutputBuffer(newImpl);
- }
-
- /**
- * Add unmanaged data to the buffer. The buffer will not automatically
- * free the data, but it will call the supplied function when the data is
- * no longer referenced by the buffer (or copies of the buffer).
- **/
-
- void appendForeignData(const data_type *data, size_type size, const detail::free_func &func) {
- pimpl_->appendForeignData(data, size, func);
- }
-
- /**
- * Returns the number of chunks that contain free space.
- **/
-
- int numChunks() const {
- return pimpl_->numFreeChunks();
- }
-
- /**
- * Returns the number of chunks that contain data
- **/
-
- int numDataChunks() const {
- return pimpl_->numDataChunks();
- }
-
- private:
-
- friend class InputBuffer;
- friend class BufferReader;
-
- explicit OutputBuffer(const detail::BufferImpl::SharedPtr &pimpl) :
- pimpl_(pimpl)
- { }
-
- detail::BufferImpl::SharedPtr pimpl_; ///< Must never be null.
-};
-
-/**
- * The InputBuffer (read-only buffer)
- *
- * InputBuffer is an immutable buffer that may be constructed from an
- * OutputBuffer, or by several of OutputBuffer's methods. Once the data is
- * transferred to an InputBuffer it cannot be modified, only read (via
- * BufferReader, istream, or its iterator).
- *
- * Assignments and copies are shallow copies.
- *
- * -# ASIO access: - iterate using const_iterator for sending messages
- *
- **/
-
-class AVRO_DECL InputBuffer
-{
-
- public:
-
- typedef detail::size_type size_type;
- typedef detail::data_type data_type;
-
- // needed for asio
- typedef detail::InputBufferIterator const_iterator;
-
- /**
- * Default InputBuffer creates an empty buffer.
- *
- * Copy/assignment functions use the default ones. They will do a shallow
- * copy, and because InputBuffer is immutable, the copies will be
- * identical.
- *
- * Destructor also uses the default, which resets a shared pointer,
-     * deleting the underlying data if no other copies of it exist.
- **/
-
- InputBuffer() :
- pimpl_(new detail::BufferImpl)
- { }
-
- /**
- * Construct an InputBuffer that contains the contents of an OutputBuffer.
- * The two buffers will have the same contents, but this copy will be
-     * immutable, while the OutputBuffer may still be written to.
- *
- * If you wish to move the data from the OutputBuffer to a new InputBuffer
- * (leaving only free space in the OutputBuffer),
- * OutputBuffer::extractData() will do this more efficiently.
- *
- * Implicit conversion is allowed.
- **/
-
- InputBuffer(const OutputBuffer &src) :
- pimpl_(new detail::BufferImpl(*src.pimpl_))
- { }
-
- /**
- * Does the buffer have any data?
- **/
-
- bool empty() const {
- return (pimpl_->size() == 0);
- }
-
- /**
- * Returns the size of the buffer, in bytes.
- **/
-
- size_type size() const {
- return pimpl_->size();
- }
-
- /**
- * Return an iterator pointing to the first data chunk of this buffer
- * that contains data.
- **/
-
- const_iterator begin() const {
- return const_iterator(pimpl_->beginRead());
- }
-
- /**
- * Return the end iterator.
- **/
-
- const_iterator end() const {
- return const_iterator(pimpl_->endRead());
- }
-
- /**
- * Returns the number of chunks containing data.
- **/
-
- int numChunks() const {
- return pimpl_->numDataChunks();
- }
-
-
- private:
-
- friend class OutputBuffer; // for append function
- friend class istreambuf;
- friend class BufferReader;
-
- explicit InputBuffer(const detail::BufferImpl::SharedPtr &pimpl) :
- pimpl_(pimpl)
- { }
-
- /**
-     * Class to indicate that a copy of an OutputBuffer to an InputBuffer
-     * should be a shallow copy, used to enable reading of the contents of an
-     * OutputBuffer without the need to convert it to an InputBuffer using a
-     * deep copy. It is private and only used by the BufferReader and istreambuf
- * classes.
- *
- * Writing to an OutputBuffer while it is being read may lead to undefined
- * behavior.
- **/
-
- class ShallowCopy {};
-
- /**
- * Make a shallow copy of an OutputBuffer in order to read it without
- * causing conversion overhead.
- **/
- InputBuffer(const OutputBuffer &src, const ShallowCopy &) :
- pimpl_(src.pimpl_)
- { }
-
- /**
- * Make a shallow copy of an InputBuffer. The default copy constructor
-     * already provides a shallow copy; this is just provided for generic
- * algorithms that wish to treat InputBuffer and OutputBuffer in the same
- * manner.
- **/
-
- InputBuffer(const InputBuffer &src, const ShallowCopy &) :
- pimpl_(src.pimpl_)
- { }
-
-
- detail::BufferImpl::ConstSharedPtr pimpl_; ///< Must never be null.
-};
-
-
-/*
- * Implementations of some OutputBuffer functions are inlined here
- * because the InputBuffer definition was required first.
- */
-
-inline InputBuffer OutputBuffer::extractData()
-{
- detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl);
- if(pimpl_->size()) {
- pimpl_->extractData(*newImpl);
- }
- return InputBuffer(newImpl);
-}
-
-inline InputBuffer OutputBuffer::extractData(size_type bytes)
-{
- if(bytes > pimpl_->size()) {
- throw std::out_of_range("trying to extract more data than exists");
- }
-
- detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl);
- if(bytes > 0) {
- if(bytes < pimpl_->size()) {
- pimpl_->extractData(*newImpl, bytes);
- }
- else {
- pimpl_->extractData(*newImpl);
- }
- }
-
- return InputBuffer(newImpl);
-}
-
-#ifndef _WIN32
-/**
- * Create an array of iovec structures from the buffer. This utility is used
- * to support writev and readv function calls. The caller should ensure the
- * buffer object is not deleted while using the iovec vector.
- *
- * If the BufferType is an InputBuffer, the iovec will point to the data that
- * already exists in the buffer, for reading.
- *
- * If the BufferType is an OutputBuffer, the iovec will point to the free
- * space, which may be written to. Before writing, the caller should call
- * OutputBuffer::reserve() to create enough room for the desired write (which
- * can be verified by calling OutputBuffer::freeSpace()), and after writing,
- * they MUST call OutputBuffer::wroteTo(), otherwise the buffer will not know
- * the space is not free anymore.
- *
- **/
-
-template<class BufferType>
-inline void toIovec(BufferType &buf, std::vector<struct iovec> &iov)
-{
- const int chunks = buf.numChunks();
- iov.resize(chunks);
- typename BufferType::const_iterator iter = buf.begin();
- for (int i = 0; i < chunks; ++i) {
- iov[i].iov_base = const_cast<typename BufferType::data_type *>(iter->data());
- iov[i].iov_len = iter->size();
- ++iter;
- }
-}
-#endif
-
-} // namespace
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_Buffer_hh__
+#define avro_Buffer_hh__
+
+#ifndef _WIN32
+#include <sys/uio.h>
+#endif
+#include <vector>
+
+#include "../Config.hh"
+#include "detail/BufferDetail.hh"
+#include "detail/BufferDetailIterator.hh"
+
+/**
+ * \file Buffer.hh
+ *
+ * \brief Definitions for InputBuffer and OutputBuffer classes
+ *
+ **/
+
+namespace avro {
+
+class OutputBuffer;
+class InputBuffer;
+
+
+/**
+ * The OutputBuffer (write-only buffer)
+ *
+ * Use cases for OutputBuffer
+ *
+ * - write message to buffer using ostream class or directly
+ * - append messages to headers
+ * - building up streams of messages via append
+ * - converting to read-only buffers for sending
+ * - extracting parts of the messages into read-only buffers
+ *
+ * -# ASIO access:
+ * - write to a buffer(s) by asio using iterator
+ * - convert to read buffer for deserializing
+ *
+ * OutputBuffer is assignable and copy-constructible. On copy or assignment,
+ * only a pointer is copied, so the two resulting copies refer to the same
+ * underlying data; modifying one will modify both.
+ **/
+
+class AVRO_DECL OutputBuffer
+{
+
+ public:
+
+ typedef detail::size_type size_type;
+ typedef detail::data_type data_type;
+
+ /**
+ * The asio library expects a const_iterator (the const-ness refers to the
+     * fact that the underlying list of buffers will not be modified, even
+ * though the data in those buffers is being modified). The iterator
+ * provides the list of addresses an operation can write to.
+ **/
+
+ typedef detail::OutputBufferIterator const_iterator;
+
+ /**
+ * Default constructor. Will pre-allocate at least the requested size, but
+ * can grow larger on demand.
+ *
+ * Destructor uses the default, which resets a shared pointer, deleting the
+     * underlying data if no other copies of it exist.
+ *
+ * Copy and assignment operators are not explicitly provided because the
+ * default ones work fine. The default makes only a shallow copy, so the
+ * copies will refer to the same memory. This is required by asio
+ * functions, which will implicitly make copies for asynchronous
+ * operations. Therefore, the user must be careful that if they create
+     * multiple copies of the same OutputBuffer, only one is modified at a
+     * time; otherwise undefined behavior may occur.
+ *
+ **/
+
+ OutputBuffer(size_type reserveSize = 0) :
+ pimpl_(new detail::BufferImpl)
+ {
+ if(reserveSize) {
+ reserve(reserveSize);
+ }
+ }
+
+ /**
+ * Reserve enough space for a wroteTo() operation. When using writeTo(),
+ * the buffer will grow dynamically as needed. But when using the iterator
+ * to write (followed by wroteTo()), data may only be written to the space
+ * available, so this ensures there is enough room in the buffer before
+ * the write operation.
+ **/
+
+ void reserve(size_type reserveSize)
+ {
+ pimpl_->reserveFreeSpace(reserveSize);
+ }
+
+ /**
+ * Write a block of data to the buffer. The buffer size will automatically
+ * grow if the size is larger than what is currently free.
+ **/
+
+ size_type writeTo(const data_type *data, size_type size) {
+ return pimpl_->writeTo(data, size);
+ }
+
+ /**
+ * Write a single value to the buffer. The buffer size will automatically
+ * grow if there is not room for the byte. The value must be a
+ * "fundamental" type, e.g. int, float, etc. (otherwise use the other
+     * writeTo() overloads).
+ **/
+
+ template<typename T>
+ void writeTo(T val) {
+ pimpl_->writeTo(val, std::is_fundamental<T>());
+ }
+
+ /**
+ * Update the state of the buffer after writing through the iterator
+     * interface. This function exists primarily for boost::asio, which
+     * writes directly to the buffer using its iterator. In this case, the
+     * internal state of the buffer does not reflect that the data was written.
+     * This informs the buffer how much data was written.
+     *
+     * The buffer does not automatically resize in this case; the bytes written
+ * cannot exceed the amount of free space. Attempting to write more will
+ * throw a std::length_error exception.
+ **/
+
+ size_type wroteTo(size_type size)
+ {
+        size_type wrote = 0;
+ if(size) {
+ if(size > freeSpace()) {
+ throw std::length_error("Impossible to write more data than free space");
+ }
+ wrote = pimpl_->wroteTo(size);
+ }
+ return wrote;
+ }
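+
+    /*
+     * A minimal sketch of the reserve()/iterator/wroteTo() protocol described
+     * above (an editorial illustration, not part of the original header; the
+     * 64-byte size and zero fill are arbitrary). The const_cast mirrors the
+     * one used by toIovec() below.
+     *
+     *     avro::OutputBuffer buf;
+     *     buf.reserve(64);                       // ensure 64 writable bytes
+     *     size_t left = 64;
+     *     for (avro::OutputBuffer::const_iterator it = buf.begin();
+     *          it != buf.end() && left > 0; ++it) {
+     *         size_t n = std::min<size_t>(it->size(), left);
+     *         std::memset(const_cast<char *>(it->data()), 0, n);
+     *         left -= n;
+     *     }
+     *     buf.wroteTo(64 - left);                // commit the bytes written
+     */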
+
+ /**
+ * Does the buffer have any data?
+ **/
+
+ bool empty() const {
+ return (pimpl_->size()==0);
+ }
+
+ /**
+ * Returns the size of the buffer, in bytes.
+ */
+
+ size_type size() const {
+ return pimpl_->size();
+ }
+
+ /**
+ * Returns the current free space that is available to write to in the
+ * buffer, in bytes. This is not a strict limit in size, as writeTo() can
+ * automatically increase capacity if necessary.
+ **/
+
+ size_type freeSpace() const {
+ return pimpl_->freeSpace();
+ }
+
+ /**
+ * Appends the data in the argument to the end of this buffer. The
+ * argument can be either an InputBuffer or OutputBuffer.
+ *
+ **/
+
+ template <class BufferType>
+ void append(const BufferType &buf) {
+ // don't append an empty buffer
+ if(buf.size()) {
+ pimpl_->append(*(buf.pimpl_.get()));
+ }
+ }
+
+ /**
+ * Return an iterator pointing to the first data chunk of this buffer
+ * that may be written to.
+ **/
+
+ const_iterator begin() const {
+ return const_iterator(pimpl_->beginWrite());
+ }
+
+ /**
+ * Return the end iterator for writing.
+ **/
+
+ const_iterator end() const {
+ return const_iterator(pimpl_->endWrite());
+ }
+
+ /**
+ * Discard any data in this buffer.
+ **/
+
+ void discardData()
+ {
+ pimpl_->discardData();
+ }
+
+ /**
+ * Discard the specified number of bytes from this data, starting at the beginning.
+ * Throws if the size is greater than the number of bytes.
+ **/
+
+ void discardData(size_t bytes)
+ {
+ if(bytes > 0) {
+ if(bytes < pimpl_->size()) {
+ pimpl_->discardData(bytes);
+ }
+ else if(bytes == pimpl_->size()) {
+ pimpl_->discardData();
+ }
+ else {
+ throw std::out_of_range("trying to discard more data than exists");
+ }
+ }
+ }
+
+ /**
+ * Remove bytes from this buffer, starting from the beginning, and place
+ * them into a new buffer. Throws if the number of requested bytes exceeds
+     * the size of the buffer. Data beyond the extracted bytes, along with any
+     * free space, remains in this buffer.
+ **/
+
+ InputBuffer extractData(size_type bytes);
+
+ /**
+ * Remove all bytes from this buffer, returning them in a new buffer.
+ * After removing data, some freeSpace may remain in this buffer.
+ **/
+
+ InputBuffer extractData();
+
+ /**
+ * Clone this buffer, creating a copy that contains the same data.
+ **/
+
+ OutputBuffer clone() const
+ {
+ detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl(*pimpl_));
+ return OutputBuffer(newImpl);
+ }
+
+ /**
+ * Add unmanaged data to the buffer. The buffer will not automatically
+ * free the data, but it will call the supplied function when the data is
+ * no longer referenced by the buffer (or copies of the buffer).
+ **/
+
+ void appendForeignData(const data_type *data, size_type size, const detail::free_func &func) {
+ pimpl_->appendForeignData(data, size, func);
+ }
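+
+    /*
+     * A sketch of appendForeignData() (editorial illustration; the malloc'd
+     * payload and the boost::bind cleanup functor are assumptions):
+     *
+     *     char *raw = static_cast<char *>(std::malloc(1024));
+     *     // ... fill raw with 1024 bytes of payload ...
+     *     avro::OutputBuffer buf;
+     *     buf.appendForeignData(raw, 1024, boost::bind(&std::free, raw));
+     *     // std::free(raw) runs once no buffer copy references it anymore
+     */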
+
+ /**
+ * Returns the number of chunks that contain free space.
+ **/
+
+ int numChunks() const {
+ return pimpl_->numFreeChunks();
+ }
+
+ /**
+ * Returns the number of chunks that contain data
+ **/
+
+ int numDataChunks() const {
+ return pimpl_->numDataChunks();
+ }
+
+ private:
+
+ friend class InputBuffer;
+ friend class BufferReader;
+
+ explicit OutputBuffer(const detail::BufferImpl::SharedPtr &pimpl) :
+ pimpl_(pimpl)
+ { }
+
+ detail::BufferImpl::SharedPtr pimpl_; ///< Must never be null.
+};
+
+/**
+ * The InputBuffer (read-only buffer)
+ *
+ * InputBuffer is an immutable buffer that may be constructed from an
+ * OutputBuffer, or by several of OutputBuffer's methods. Once the data is
+ * transferred to an InputBuffer it cannot be modified, only read (via
+ * BufferReader, istream, or its iterator).
+ *
+ * Assignments and copies are shallow copies.
+ *
+ * -# ASIO access: - iterate using const_iterator for sending messages
+ *
+ **/
+
+class AVRO_DECL InputBuffer
+{
+
+ public:
+
+ typedef detail::size_type size_type;
+ typedef detail::data_type data_type;
+
+ // needed for asio
+ typedef detail::InputBufferIterator const_iterator;
+
+ /**
+ * Default InputBuffer creates an empty buffer.
+ *
+ * Copy/assignment functions use the default ones. They will do a shallow
+ * copy, and because InputBuffer is immutable, the copies will be
+ * identical.
+ *
+ * Destructor also uses the default, which resets a shared pointer,
+     * deleting the underlying data if no other copies of it exist.
+ **/
+
+ InputBuffer() :
+ pimpl_(new detail::BufferImpl)
+ { }
+
+ /**
+ * Construct an InputBuffer that contains the contents of an OutputBuffer.
+ * The two buffers will have the same contents, but this copy will be
+     * immutable, while the OutputBuffer may still be written to.
+ *
+ * If you wish to move the data from the OutputBuffer to a new InputBuffer
+ * (leaving only free space in the OutputBuffer),
+ * OutputBuffer::extractData() will do this more efficiently.
+ *
+ * Implicit conversion is allowed.
+ **/
+
+ InputBuffer(const OutputBuffer &src) :
+ pimpl_(new detail::BufferImpl(*src.pimpl_))
+ { }
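+
+    /*
+     * Editorial illustration: because this conversion is implicit, an
+     * OutputBuffer may be passed wherever an InputBuffer is expected:
+     *
+     *     avro::OutputBuffer out;
+     *     out.writeTo("abc", 3);
+     *     avro::InputBuffer in = out;  // copies the chunk list; out stays writable
+     *     assert(in.size() == 3 && out.size() == 3);
+     */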
+
+ /**
+ * Does the buffer have any data?
+ **/
+
+ bool empty() const {
+ return (pimpl_->size() == 0);
+ }
+
+ /**
+ * Returns the size of the buffer, in bytes.
+ **/
+
+ size_type size() const {
+ return pimpl_->size();
+ }
+
+ /**
+ * Return an iterator pointing to the first data chunk of this buffer
+ * that contains data.
+ **/
+
+ const_iterator begin() const {
+ return const_iterator(pimpl_->beginRead());
+ }
+
+ /**
+ * Return the end iterator.
+ **/
+
+ const_iterator end() const {
+ return const_iterator(pimpl_->endRead());
+ }
+
+ /**
+ * Returns the number of chunks containing data.
+ **/
+
+ int numChunks() const {
+ return pimpl_->numDataChunks();
+ }
+
+
+ private:
+
+ friend class OutputBuffer; // for append function
+ friend class istreambuf;
+ friend class BufferReader;
+
+ explicit InputBuffer(const detail::BufferImpl::SharedPtr &pimpl) :
+ pimpl_(pimpl)
+ { }
+
+ /**
+     * Class to indicate that a copy of an OutputBuffer to an InputBuffer
+     * should be a shallow copy, used to enable reading of the contents of an
+     * OutputBuffer without the need to convert it to an InputBuffer using a
+     * deep copy. It is private and only used by the BufferReader and istreambuf
+ * classes.
+ *
+ * Writing to an OutputBuffer while it is being read may lead to undefined
+ * behavior.
+ **/
+
+ class ShallowCopy {};
+
+ /**
+ * Make a shallow copy of an OutputBuffer in order to read it without
+ * causing conversion overhead.
+ **/
+ InputBuffer(const OutputBuffer &src, const ShallowCopy &) :
+ pimpl_(src.pimpl_)
+ { }
+
+ /**
+ * Make a shallow copy of an InputBuffer. The default copy constructor
+     * already provides a shallow copy; this is just provided for generic
+ * algorithms that wish to treat InputBuffer and OutputBuffer in the same
+ * manner.
+ **/
+
+ InputBuffer(const InputBuffer &src, const ShallowCopy &) :
+ pimpl_(src.pimpl_)
+ { }
+
+
+ detail::BufferImpl::ConstSharedPtr pimpl_; ///< Must never be null.
+};
+
+
+/*
+ * Implementations of some OutputBuffer functions are inlined here
+ * because the InputBuffer definition was required first.
+ */
+
+inline InputBuffer OutputBuffer::extractData()
+{
+ detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl);
+ if(pimpl_->size()) {
+ pimpl_->extractData(*newImpl);
+ }
+ return InputBuffer(newImpl);
+}
+
+inline InputBuffer OutputBuffer::extractData(size_type bytes)
+{
+ if(bytes > pimpl_->size()) {
+ throw std::out_of_range("trying to extract more data than exists");
+ }
+
+ detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl);
+ if(bytes > 0) {
+ if(bytes < pimpl_->size()) {
+ pimpl_->extractData(*newImpl, bytes);
+ }
+ else {
+ pimpl_->extractData(*newImpl);
+ }
+ }
+
+ return InputBuffer(newImpl);
+}
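+
+/*
+ * A sketch of extractData() used for message framing (editorial illustration;
+ * the 4-byte length prefix is an assumption, not a format defined here):
+ *
+ *     const char header[4] = {0, 0, 0, 3};
+ *     avro::OutputBuffer out;
+ *     out.writeTo(header, sizeof(header));
+ *     out.writeTo("abc", 3);
+ *
+ *     avro::InputBuffer msg = out.extractData();  // out keeps only free space
+ *     assert(msg.size() == 7 && out.empty());
+ */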
+
+#ifndef _WIN32
+/**
+ * Create an array of iovec structures from the buffer. This utility is used
+ * to support writev and readv function calls. The caller should ensure the
+ * buffer object is not deleted while using the iovec vector.
+ *
+ * If the BufferType is an InputBuffer, the iovec will point to the data that
+ * already exists in the buffer, for reading.
+ *
+ * If the BufferType is an OutputBuffer, the iovec will point to the free
+ * space, which may be written to. Before writing, the caller should call
+ * OutputBuffer::reserve() to create enough room for the desired write (which
+ * can be verified by calling OutputBuffer::freeSpace()), and after writing,
+ * they MUST call OutputBuffer::wroteTo(), otherwise the buffer will not know
+ * the space is not free anymore.
+ *
+ **/
+
+template<class BufferType>
+inline void toIovec(BufferType &buf, std::vector<struct iovec> &iov)
+{
+ const int chunks = buf.numChunks();
+ iov.resize(chunks);
+ typename BufferType::const_iterator iter = buf.begin();
+ for (int i = 0; i < chunks; ++i) {
+ iov[i].iov_base = const_cast<typename BufferType::data_type *>(iter->data());
+ iov[i].iov_len = iter->size();
+ ++iter;
+ }
+}
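+
+/*
+ * A sketch of toIovec() feeding writev() (editorial illustration; the open
+ * file descriptor fd and the single-shot write are assumptions; writev is
+ * declared in <sys/uio.h>, already included above):
+ *
+ *     void sendAll(int fd, avro::InputBuffer &buf) {
+ *         std::vector<struct iovec> iov;
+ *         avro::toIovec(buf, iov);   // iovecs point into buf's data chunks
+ *         ssize_t n = ::writev(fd, iov.empty() ? NULL : &iov[0],
+ *                              static_cast<int>(iov.size()));
+ *         (void)n;  // a real caller must check n and retry partial writes
+ *     }
+ */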
+#endif
+
+} // namespace
+
+#endif
diff --git a/contrib/libs/apache/avro/api/buffer/BufferReader.hh b/contrib/libs/apache/avro/api/buffer/BufferReader.hh
index 83b6b4b3242..cc1b05880b1 100644
--- a/contrib/libs/apache/avro/api/buffer/BufferReader.hh
+++ b/contrib/libs/apache/avro/api/buffer/BufferReader.hh
@@ -1,289 +1,289 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_BufferReader_hh__
-#define avro_BufferReader_hh__
-
-#include <algorithm>
-#include <cstring>
-#include <string>
-#include <type_traits>
-#include "Buffer.hh"
-
-#ifdef min
-#undef min
-#endif
-/**
- * \file BufferReader.hh
- *
- * \brief Helper class for reading bytes from a buffer in a streaming manner,
- * without the overhead of istreams.
- *
- **/
-
-namespace avro {
-
-/**
- * Helper class for reading bytes from a buffer without worrying about
- * chunk boundaries. May read from an InputBuffer or OutputBuffer.
- *
- **/
-class AVRO_DECL BufferReader : private boost::noncopyable
-{
-
- public:
-
- typedef detail::data_type data_type;
- typedef detail::size_type size_type;
-
- private:
-
- size_type chunkRemaining() const {
- return iter_->dataSize() - chunkPos_;
- }
-
- void incrementChunk(size_type howmuch) {
- bytesRemaining_ -= howmuch;
- chunkPos_ += howmuch;
- if(chunkPos_ == iter_->dataSize()) {
- chunkPos_ = 0;
- ++iter_;
- }
- }
-
- void rewind() {
- iter_ = bufferImpl_->beginRead();
- bytesRemaining_ = bytes_;
- chunkPos_ = 0;
- }
-
- const data_type *addr() const {
- return iter_->tellReadPos() + chunkPos_;
- }
-
- public:
-
- BufferReader(const InputBuffer &buf) :
- bufferImpl_(buf.pimpl_),
- iter_(bufferImpl_->beginRead()),
- bytes_(bufferImpl_->size()),
- bytesRemaining_(bytes_),
- chunkPos_(0)
- { }
-
- BufferReader(const OutputBuffer &buf) :
- bufferImpl_(buf.pimpl_),
- iter_(bufferImpl_->beginRead()),
- bytes_(bufferImpl_->size()),
- bytesRemaining_(bytes_),
- chunkPos_(0)
- { }
-
- /**
-     * How many bytes remain unread in this buffer.
- **/
-
- size_type bytesRemaining() const {
- return bytesRemaining_;
- }
-
- /**
-     * How many bytes have been read from this buffer.
- **/
-
- size_type bytesRead() const {
- return bytes_ - bytesRemaining_;
- }
-
- /**
- * Read a block of data from the buffer.
- **/
-
- size_type read(data_type *data, size_type size) {
-
- if(size > bytesRemaining_) {
- size = bytesRemaining_;
- }
- size_type sizeToRead = size;
-
- while(sizeToRead) {
- const size_type toRead = std::min(sizeToRead, chunkRemaining());
- memcpy(data, addr(), toRead);
- sizeToRead -= toRead;
- data += toRead;
- incrementChunk(toRead);
- }
-
- return size;
- }
-
- /**
- * Read a block of data from the buffer.
- **/
-
- bool read(std::string &str, size_type size) {
- if(size > bytesRemaining_) {
- return false;
- }
-
- if(size <= chunkRemaining()) {
- fastStringRead(str, size);
- }
- else {
- slowStringRead(str, size);
- }
-
- return true;
- }
-
-
- /**
- * Read a single value from the buffer. The value must be a "fundamental"
-     * type, e.g. int, float, etc. (otherwise use the other read() overloads).
- *
- **/
-
- template<typename T>
- bool read(T &val) {
- return read(val, std::is_fundamental<T>());
- }
-
- /**
- * Skips a block of data from the buffer.
- **/
-
- bool skip(size_type bytes) {
- bool skipped = false;
- if(bytes <= bytesRemaining_) {
- doSkip(bytes);
- skipped = true;
- }
- return skipped;
- }
-
- /**
- * Seek to a position in the buffer.
- **/
-
- bool seek(size_type pos) {
- if(pos > bytes_) {
- return false;
- }
-
- size_type toSkip = pos;
- size_type curPos = bytesRead();
- // if the seek position is ahead, we can use skip to get there
- if(pos >= curPos) {
- toSkip -= curPos;
- }
- // if the seek position is ahead of the start of the chunk we can back up to
- // start of the chunk
- else if(pos >= (curPos - chunkPos_)) {
- curPos -= chunkPos_;
- bytesRemaining_ += chunkPos_;
- chunkPos_ = 0;
- toSkip -= curPos;
- }
- else {
- rewind();
- }
- doSkip(toSkip);
- return true;
- }
-
- bool peek(char &val) {
- bool ret = (bytesRemaining_ > 0);
- if(ret) {
- val = *(addr());
- }
- return ret;
- }
-
- InputBuffer copyData(size_type bytes) {
- if(bytes > bytesRemaining_) {
- // force no copy
- bytes = 0;
- }
- detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl);
- if(bytes) {
- bufferImpl_->copyData(*newImpl, iter_, chunkPos_, bytes);
- doSkip(bytes);
- }
- return InputBuffer(newImpl);
- }
-
- private:
-
- void doSkip(size_type sizeToSkip) {
-
- while(sizeToSkip) {
- const size_type toSkip = std::min(sizeToSkip, chunkRemaining());
- sizeToSkip -= toSkip;
- incrementChunk(toSkip);
- }
- }
-
- template<typename T>
- bool read(T &val, const std::true_type&)
- {
- if(sizeof(T) > bytesRemaining_) {
- return false;
- }
-
- if (sizeof(T) <= chunkRemaining()) {
- val = *(reinterpret_cast<const T*> (addr()));
- incrementChunk(sizeof(T));
- }
- else {
- read(reinterpret_cast<data_type *>(&val), sizeof(T));
- }
- return true;
- }
-
-    /// An uninstantiable overload, selected only if the std::is_fundamental check fails
- template<typename T>
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_BufferReader_hh__
+#define avro_BufferReader_hh__
+
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <type_traits>
+#include "Buffer.hh"
+
+#ifdef min
+#undef min
+#endif
+/**
+ * \file BufferReader.hh
+ *
+ * \brief Helper class for reading bytes from a buffer in a streaming manner,
+ * without the overhead of istreams.
+ *
+ **/
+
+namespace avro {
+
+/**
+ * Helper class for reading bytes from a buffer without worrying about
+ * chunk boundaries. May read from an InputBuffer or OutputBuffer.
+ *
+ **/
+class AVRO_DECL BufferReader : private boost::noncopyable
+{
+
+ public:
+
+ typedef detail::data_type data_type;
+ typedef detail::size_type size_type;
+
+ private:
+
+ size_type chunkRemaining() const {
+ return iter_->dataSize() - chunkPos_;
+ }
+
+ void incrementChunk(size_type howmuch) {
+ bytesRemaining_ -= howmuch;
+ chunkPos_ += howmuch;
+ if(chunkPos_ == iter_->dataSize()) {
+ chunkPos_ = 0;
+ ++iter_;
+ }
+ }
+
+ void rewind() {
+ iter_ = bufferImpl_->beginRead();
+ bytesRemaining_ = bytes_;
+ chunkPos_ = 0;
+ }
+
+ const data_type *addr() const {
+ return iter_->tellReadPos() + chunkPos_;
+ }
+
+ public:
+
+ BufferReader(const InputBuffer &buf) :
+ bufferImpl_(buf.pimpl_),
+ iter_(bufferImpl_->beginRead()),
+ bytes_(bufferImpl_->size()),
+ bytesRemaining_(bytes_),
+ chunkPos_(0)
+ { }
+
+ BufferReader(const OutputBuffer &buf) :
+ bufferImpl_(buf.pimpl_),
+ iter_(bufferImpl_->beginRead()),
+ bytes_(bufferImpl_->size()),
+ bytesRemaining_(bytes_),
+ chunkPos_(0)
+ { }
+
+ /**
+     * How many bytes remain unread in this buffer.
+ **/
+
+ size_type bytesRemaining() const {
+ return bytesRemaining_;
+ }
+
+ /**
+     * How many bytes have been read from this buffer.
+ **/
+
+ size_type bytesRead() const {
+ return bytes_ - bytesRemaining_;
+ }
+
+ /**
+ * Read a block of data from the buffer.
+ **/
+
+ size_type read(data_type *data, size_type size) {
+
+ if(size > bytesRemaining_) {
+ size = bytesRemaining_;
+ }
+ size_type sizeToRead = size;
+
+ while(sizeToRead) {
+ const size_type toRead = std::min(sizeToRead, chunkRemaining());
+ memcpy(data, addr(), toRead);
+ sizeToRead -= toRead;
+ data += toRead;
+ incrementChunk(toRead);
+ }
+
+ return size;
+ }
+
+ /**
+ * Read a block of data from the buffer.
+ **/
+
+ bool read(std::string &str, size_type size) {
+ if(size > bytesRemaining_) {
+ return false;
+ }
+
+ if(size <= chunkRemaining()) {
+ fastStringRead(str, size);
+ }
+ else {
+ slowStringRead(str, size);
+ }
+
+ return true;
+ }
+
+
+ /**
+ * Read a single value from the buffer. The value must be a "fundamental"
+     * type, e.g. int, float, etc. (otherwise use the other read() overloads).
+ *
+ **/
+
+ template<typename T>
+ bool read(T &val) {
+ return read(val, std::is_fundamental<T>());
+ }
+
+ /**
+ * Skips a block of data from the buffer.
+ **/
+
+ bool skip(size_type bytes) {
+ bool skipped = false;
+ if(bytes <= bytesRemaining_) {
+ doSkip(bytes);
+ skipped = true;
+ }
+ return skipped;
+ }
+
+ /**
+ * Seek to a position in the buffer.
+ **/
+
+ bool seek(size_type pos) {
+ if(pos > bytes_) {
+ return false;
+ }
+
+ size_type toSkip = pos;
+ size_type curPos = bytesRead();
+ // if the seek position is ahead, we can use skip to get there
+ if(pos >= curPos) {
+ toSkip -= curPos;
+ }
+ // if the seek position is ahead of the start of the chunk we can back up to
+ // start of the chunk
+ else if(pos >= (curPos - chunkPos_)) {
+ curPos -= chunkPos_;
+ bytesRemaining_ += chunkPos_;
+ chunkPos_ = 0;
+ toSkip -= curPos;
+ }
+ else {
+ rewind();
+ }
+ doSkip(toSkip);
+ return true;
+ }
+
+ bool peek(char &val) {
+ bool ret = (bytesRemaining_ > 0);
+ if(ret) {
+ val = *(addr());
+ }
+ return ret;
+ }
+
+ InputBuffer copyData(size_type bytes) {
+ if(bytes > bytesRemaining_) {
+ // force no copy
+ bytes = 0;
+ }
+ detail::BufferImpl::SharedPtr newImpl(new detail::BufferImpl);
+ if(bytes) {
+ bufferImpl_->copyData(*newImpl, iter_, chunkPos_, bytes);
+ doSkip(bytes);
+ }
+ return InputBuffer(newImpl);
+ }
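+
+    /*
+     * A usage sketch of this reader (editorial illustration; the
+     * length-prefixed string layout is an assumption, not a protocol of
+     * this library):
+     *
+     *     avro::OutputBuffer out;
+     *     int len = 5;
+     *     out.writeTo(len);               // single fundamental value
+     *     out.writeTo("hello", 5);        // raw block
+     *
+     *     avro::BufferReader reader(out); // reads without copying or draining
+     *     int n = 0;
+     *     std::string s;
+     *     bool ok = reader.read(n) && reader.read(s, n);
+     *     assert(ok && n == 5 && s == "hello");
+     */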
+
+ private:
+
+ void doSkip(size_type sizeToSkip) {
+
+ while(sizeToSkip) {
+ const size_type toSkip = std::min(sizeToSkip, chunkRemaining());
+ sizeToSkip -= toSkip;
+ incrementChunk(toSkip);
+ }
+ }
+
+ template<typename T>
+ bool read(T &val, const std::true_type&)
+ {
+ if(sizeof(T) > bytesRemaining_) {
+ return false;
+ }
+
+ if (sizeof(T) <= chunkRemaining()) {
+ val = *(reinterpret_cast<const T*> (addr()));
+ incrementChunk(sizeof(T));
+ }
+ else {
+ read(reinterpret_cast<data_type *>(&val), sizeof(T));
+ }
+ return true;
+ }
+
+    /// An uninstantiable overload, selected only if the std::is_fundamental check fails
+ template<typename T>
bool read(T &, const std::false_type&)
- {
- static_assert(sizeof(T) == 0, "Not a valid type to read");
- return false;
- }
-
- void fastStringRead(std::string &str, size_type sizeToCopy) {
- str.assign(addr(), sizeToCopy);
- incrementChunk(sizeToCopy);
- }
-
- void slowStringRead(std::string &str, size_type sizeToCopy) {
- str.clear();
- str.reserve(sizeToCopy);
- while(sizeToCopy) {
- const size_type toCopy = std::min(sizeToCopy, chunkRemaining());
- str.append(addr(), toCopy);
- sizeToCopy -= toCopy;
- incrementChunk(toCopy);
- }
- }
-
- detail::BufferImpl::ConstSharedPtr bufferImpl_;
- detail::BufferImpl::ChunkList::const_iterator iter_;
- size_type bytes_;
- size_type bytesRemaining_;
- size_type chunkPos_;
-};
-
-
-} // namespace
-
-#endif
+ {
+ static_assert(sizeof(T) == 0, "Not a valid type to read");
+ return false;
+ }
+
+ void fastStringRead(std::string &str, size_type sizeToCopy) {
+ str.assign(addr(), sizeToCopy);
+ incrementChunk(sizeToCopy);
+ }
+
+ void slowStringRead(std::string &str, size_type sizeToCopy) {
+ str.clear();
+ str.reserve(sizeToCopy);
+ while(sizeToCopy) {
+ const size_type toCopy = std::min(sizeToCopy, chunkRemaining());
+ str.append(addr(), toCopy);
+ sizeToCopy -= toCopy;
+ incrementChunk(toCopy);
+ }
+ }
+
+ detail::BufferImpl::ConstSharedPtr bufferImpl_;
+ detail::BufferImpl::ChunkList::const_iterator iter_;
+ size_type bytes_;
+ size_type bytesRemaining_;
+ size_type chunkPos_;
+};
+
+
+} // namespace
+
+#endif
diff --git a/contrib/libs/apache/avro/api/buffer/detail/BufferDetail.hh b/contrib/libs/apache/avro/api/buffer/detail/BufferDetail.hh
index 29a2e00b4e0..bbc2948f26e 100644
--- a/contrib/libs/apache/avro/api/buffer/detail/BufferDetail.hh
+++ b/contrib/libs/apache/avro/api/buffer/detail/BufferDetail.hh
@@ -1,555 +1,555 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_BufferDetail_hh__
-#define avro_BufferDetail_hh__
-
-#include <boost/shared_ptr.hpp>
-#include <boost/shared_array.hpp>
-#include <boost/static_assert.hpp>
-#include <boost/function.hpp>
-#include <boost/utility.hpp>
-#ifdef HAVE_BOOST_ASIO
-#include <boost/asio/buffer.hpp>
-#endif
-#include <exception>
-#include <cassert>
-#include <cstring>
-#include <algorithm>
-#include <iterator>
-#include <deque>
-
-/**
- * \file BufferDetail.hh
- *
- * \brief The implementation details for the Buffer class.
- *
- **/
-
-namespace avro {
-
-namespace detail {
-
-typedef char data_type;
-typedef size_t size_type;
-#ifdef HAVE_BOOST_ASIO
-typedef boost::asio::const_buffer ConstAsioBuffer;
-typedef boost::asio::mutable_buffer MutableAsioBuffer;
-#endif
-
-/// The size in bytes for blocks backing buffer chunks.
-const size_type kMinBlockSize = 4096;
-const size_type kMaxBlockSize = 16384;
-const size_type kDefaultBlockSize = kMinBlockSize;
-
-typedef boost::function<void(void)> free_func;
-
-/**
- * Simple class to hold a functor that executes on delete
- **/
-class CallOnDestroy {
- public:
- CallOnDestroy(const free_func &func) : func_(func)
- { }
- ~CallOnDestroy() {
- if (func_) {
- func_();
- }
- }
- private:
- free_func func_;
-};
-
-/**
- * \brief A chunk is the building block for buffers.
- *
- * A chunk is backed by a memory block, and internally it maintains information
- * about which area of the block it may use, and the portion of this area that
- * contains valid data. More than one chunk may share the same underlying
- * block, but the areas should never overlap. Chunk holds a shared pointer to
- * an array of bytes so that shared blocks are reference counted.
- *
- * When a chunk is copied, the copy shares the same underlying buffer, but the
- * copy receives its own copies of the start/cursor/end pointers, so each copy
- * can be manipulated independently. This allows different buffers to share
- * the same non-overlapping parts of a chunk, or even overlapping parts of a
- * chunk if the situation arises.
- *
- **/
-
-class Chunk
-{
-
- public:
-
- typedef boost::shared_ptr<Chunk> SharedPtr;
-
- /// Default constructor, allocates a new underlying block for this chunk.
- Chunk(size_type size) :
- underlyingBlock_(new data_type[size]),
- readPos_(underlyingBlock_.get()),
- writePos_(readPos_),
- endPos_(readPos_ + size)
- { }
-
- /// Foreign buffer constructor, uses the supplied data for this chunk, and
- /// only for reading.
- Chunk(const data_type *data, size_type size, const free_func &func) :
- callOnDestroy_(new CallOnDestroy(func)),
- readPos_(const_cast<data_type *>(data)),
- writePos_(readPos_ + size),
- endPos_(writePos_)
- { }
-
- private:
- // reference counted object will call a functor when it's destroyed
- boost::shared_ptr<CallOnDestroy> callOnDestroy_;
-
- public:
-
- /// Remove readable bytes from the front of the chunk by advancing the
- /// chunk start position.
- void truncateFront(size_type howMuch) {
- readPos_ += howMuch;
- assert(readPos_ <= writePos_);
- }
-
- /// Remove readable bytes from the back of the chunk by moving the
- /// chunk cursor position.
- void truncateBack(size_type howMuch) {
- writePos_ -= howMuch;
- assert(readPos_ <= writePos_);
- }
-
- /// Tell the position the next byte may be written to.
- data_type *tellWritePos() const {
- return writePos_;
- }
-
- /// Tell the position of the first byte containing valid data.
- const data_type *tellReadPos() const {
- return readPos_;
- }
-
- /// After a write operation, increment the write position.
- void incrementCursor(size_type howMuch) {
- writePos_ += howMuch;
- assert(writePos_ <= endPos_);
- }
-
- /// Tell how many bytes of data were written to this chunk.
- size_type dataSize() const {
- return (writePos_ - readPos_);
- }
-
- /// Tell how many bytes this chunk has available to write to.
- size_type freeSize() const {
- return (endPos_ - writePos_);
- }
-
- /// Tell how many bytes of data this chunk can hold (used and free).
- size_type capacity() const {
- return (endPos_ - readPos_);
- }
-
- private:
-
- friend bool operator==(const Chunk &lhs, const Chunk &rhs);
- friend bool operator!=(const Chunk &lhs, const Chunk &rhs);
-
- // more than one buffer can share an underlying block, so use SharedPtr
- boost::shared_array<data_type> underlyingBlock_;
-
- data_type *readPos_; ///< The first readable byte in the block
- data_type *writePos_; ///< The end of written data and start of free space
- data_type *endPos_; ///< Marks the end of the usable block area
-};
-
-/**
- * Compare underlying buffers and return true if they are equal
- **/
-inline bool operator==(const Chunk &lhs, const Chunk &rhs) {
- return lhs.underlyingBlock_ == rhs.underlyingBlock_;
-}
-
-/**
- * Compare underlying buffers and return true if they are unequal
- **/
-inline bool operator!=(const Chunk &lhs, const Chunk &rhs) {
- return lhs.underlyingBlock_ != rhs.underlyingBlock_;
-}
-
-
-/**
- * \brief Implementation details for Buffer class
- *
- * Internally, BufferImpl keeps two lists of chunks: one consists entirely of
- * chunks containing data, and the other contains chunks with free space.
- *
- *
- */
-
-class BufferImpl : boost::noncopyable
-{
-
- /// Add a new chunk to the list of chunks for this buffer, growing the
- /// buffer by the default block size.
- void allocChunkChecked(size_type size = kDefaultBlockSize)
- {
- writeChunks_.push_back(Chunk(size));
- freeSpace_ += writeChunks_.back().freeSize();
- }
-
- /// Add a new chunk to the list of chunks for this buffer, growing the
- /// buffer by the requested size, but within the range of a minimum and
- /// maximum.
- void allocChunk(size_type size)
- {
- if(size < kMinBlockSize) {
- size = kMinBlockSize;
- }
- else if (size > kMaxBlockSize) {
- size = kMaxBlockSize;
- }
- allocChunkChecked(size);
- }
-
- /// Update the state of the chunks after a write operation. This function
- /// ensures the chunk states are consistent with the write.
- void postWrite(size_type size)
- {
-
- // precondition to this function is that the writeChunk_.front()
- // contains the data that was just written, so make sure writeChunks_
- // is not empty:
-
- assert(size <= freeSpace_ && !writeChunks_.empty());
-
- // This is probably the one tricky part of BufferImpl. The data that
- // was written now exists in writeChunks_.front(). Now we must make
- // sure that same data exists in readChunks_.back().
- //
- // There are two cases:
- //
-        // 1. readChunks_.back() and writeChunks_.front() refer to the same
-        // underlying block, in which case they both just need their cursor
-        // updated to reflect the new state.
-        //
-        // 2. readChunks_.back() is not the same block as writeChunks_.front(),
-        // in which case it should be, since writeChunks_.front() contains
- // the next bit of data that will be appended to readChunks_, and
- // therefore needs to be copied there so we can proceed with updating
- // their state.
- //
-
-        // if readChunks_.back() is not the same as writeChunks_.front(), make a copy
- // of it there
-
- if(readChunks_.empty() || (readChunks_.back() != writeChunks_.front())) {
- const Chunk &curChunk = writeChunks_.front();
- readChunks_.push_back(curChunk);
-
- // Any data that existed in the write chunk previously doesn't
- // belong to this buffer (otherwise it would have already been
- // added to the readChunk_ list). Here, adjust the start of the
- // readChunk to begin after any data already existing in curChunk
-
- readChunks_.back().truncateFront( curChunk.dataSize());
- }
-
- assert(readChunks_.back().freeSize() == writeChunks_.front().freeSize());
-
- // update the states of both readChunks_ and writeChunks_ to indicate that they are
- // holding the new data
-
- readChunks_.back().incrementCursor(size);
- writeChunks_.front().incrementCursor(size);
- size_ += size;
- freeSpace_ -= size;
-
- // if there is no more free space in writeChunks_, the next write cannot use
- // it, so dispose of it now
-
- if(writeChunks_.front().freeSize() == 0) {
- writeChunks_.pop_front();
- }
- }
-
- public:
-
- typedef std::deque<Chunk> ChunkList;
- typedef boost::shared_ptr<BufferImpl> SharedPtr;
- typedef boost::shared_ptr<const BufferImpl> ConstSharedPtr;
-
- /// Default constructor, creates a buffer without any chunks
- BufferImpl() :
- freeSpace_(0),
- size_(0)
- { }
-
- /// Copy constructor, gets a copy of all the chunks with data.
- explicit BufferImpl(const BufferImpl &src) :
- readChunks_(src.readChunks_),
- freeSpace_(0),
- size_(src.size_)
- { }
-
- /// Amount of data held in this buffer.
- size_type size() const {
- return size_;
- }
-
- /// Capacity that may be written before the buffer must allocate more memory.
- size_type freeSpace() const {
- return freeSpace_;
- }
-
- /// Add enough free chunks to make the reservation size available.
- /// Actual amount may be more (rounded up to next chunk).
- void reserveFreeSpace(size_type reserveSize) {
- while(freeSpace_ < reserveSize) {
- allocChunk(reserveSize - freeSpace_);
- }
- }
-
-    /// Return the chunk list's begin iterator for reading.
- ChunkList::const_iterator beginRead() const {
- return readChunks_.begin();
- }
-
-    /// Return the chunk list's end iterator for reading.
- ChunkList::const_iterator endRead() const {
- return readChunks_.end();
- }
-
-    /// Return the chunk list's begin iterator for writing.
- ChunkList::const_iterator beginWrite() const {
- return writeChunks_.begin();
- }
-
-    /// Return the chunk list's end iterator for writing.
- ChunkList::const_iterator endWrite() const {
- return writeChunks_.end();
- }
-
- /// Write a single value to buffer, add a new chunk if necessary.
- template<typename T>
- void writeTo(T val, const std::true_type&)
- {
- if(freeSpace_ && (sizeof(T) <= writeChunks_.front().freeSize())) {
- // fast path, there's enough room in the writeable chunk to just
- // straight out copy it
- *(reinterpret_cast <T*> ( writeChunks_.front().tellWritePos()) ) = val;
- postWrite(sizeof(T));
- }
- else {
- // need to fixup chunks first, so use the regular memcpy
- // writeTo method
- writeTo(reinterpret_cast<data_type*>(&val), sizeof(T));
- }
- }
-
-    /// An uninstantiable overload, selected if the std::is_fundamental check
-    /// fails; it triggers a compile-time assert.
- template<typename T>
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_BufferDetail_hh__
+#define avro_BufferDetail_hh__
+
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/function.hpp>
+#include <boost/utility.hpp>
+#ifdef HAVE_BOOST_ASIO
+#include <boost/asio/buffer.hpp>
+#endif
+#include <exception>
+#include <cassert>
+#include <cstring>
+#include <algorithm>
+#include <iterator>
+#include <deque>
+
+/**
+ * \file BufferDetail.hh
+ *
+ * \brief The implementation details for the Buffer class.
+ *
+ **/
+
+namespace avro {
+
+namespace detail {
+
+typedef char data_type;
+typedef size_t size_type;
+#ifdef HAVE_BOOST_ASIO
+typedef boost::asio::const_buffer ConstAsioBuffer;
+typedef boost::asio::mutable_buffer MutableAsioBuffer;
+#endif
+
+/// The size in bytes for blocks backing buffer chunks.
+const size_type kMinBlockSize = 4096;
+const size_type kMaxBlockSize = 16384;
+const size_type kDefaultBlockSize = kMinBlockSize;
+
+typedef boost::function<void(void)> free_func;
+
+/**
+ * Simple class to hold a functor that executes on delete
+ **/
+class CallOnDestroy {
+ public:
+ CallOnDestroy(const free_func &func) : func_(func)
+ { }
+ ~CallOnDestroy() {
+ if (func_) {
+ func_();
+ }
+ }
+ private:
+ free_func func_;
+};
+
+/**
+ * \brief A chunk is the building block for buffers.
+ *
+ * A chunk is backed by a memory block, and internally it maintains information
+ * about which area of the block it may use, and the portion of this area that
+ * contains valid data. More than one chunk may share the same underlying
+ * block, but the areas should never overlap. Chunk holds a shared pointer to
+ * an array of bytes so that shared blocks are reference counted.
+ *
+ * When a chunk is copied, the copy shares the same underlying buffer, but the
+ * copy receives its own copies of the start/cursor/end pointers, so each copy
+ * can be manipulated independently. This allows different buffers to share
+ * the same non-overlapping parts of a chunk, or even overlapping parts of a
+ * chunk if the situation arises.
+ *
+ **/
+
+class Chunk
+{
+
+ public:
+
+ typedef boost::shared_ptr<Chunk> SharedPtr;
+
+ /// Default constructor, allocates a new underlying block for this chunk.
+ Chunk(size_type size) :
+ underlyingBlock_(new data_type[size]),
+ readPos_(underlyingBlock_.get()),
+ writePos_(readPos_),
+ endPos_(readPos_ + size)
+ { }
+
+ /// Foreign buffer constructor, uses the supplied data for this chunk, and
+ /// only for reading.
+ Chunk(const data_type *data, size_type size, const free_func &func) :
+ callOnDestroy_(new CallOnDestroy(func)),
+ readPos_(const_cast<data_type *>(data)),
+ writePos_(readPos_ + size),
+ endPos_(writePos_)
+ { }
+
+ private:
+ // reference counted object will call a functor when it's destroyed
+ boost::shared_ptr<CallOnDestroy> callOnDestroy_;
+
+ public:
+
+ /// Remove readable bytes from the front of the chunk by advancing the
+ /// chunk start position.
+ void truncateFront(size_type howMuch) {
+ readPos_ += howMuch;
+ assert(readPos_ <= writePos_);
+ }
+
+ /// Remove readable bytes from the back of the chunk by moving the
+ /// chunk cursor position.
+ void truncateBack(size_type howMuch) {
+ writePos_ -= howMuch;
+ assert(readPos_ <= writePos_);
+ }
+
+ /// Tell the position the next byte may be written to.
+ data_type *tellWritePos() const {
+ return writePos_;
+ }
+
+ /// Tell the position of the first byte containing valid data.
+ const data_type *tellReadPos() const {
+ return readPos_;
+ }
+
+ /// After a write operation, increment the write position.
+ void incrementCursor(size_type howMuch) {
+ writePos_ += howMuch;
+ assert(writePos_ <= endPos_);
+ }
+
+ /// Tell how many bytes of data were written to this chunk.
+ size_type dataSize() const {
+ return (writePos_ - readPos_);
+ }
+
+ /// Tell how many bytes this chunk has available to write to.
+ size_type freeSize() const {
+ return (endPos_ - writePos_);
+ }
+
+ /// Tell how many bytes of data this chunk can hold (used and free).
+ size_type capacity() const {
+ return (endPos_ - readPos_);
+ }
+
+ private:
+
+ friend bool operator==(const Chunk &lhs, const Chunk &rhs);
+ friend bool operator!=(const Chunk &lhs, const Chunk &rhs);
+
+ // more than one buffer can share an underlying block, so use SharedPtr
+ boost::shared_array<data_type> underlyingBlock_;
+
+ data_type *readPos_; ///< The first readable byte in the block
+ data_type *writePos_; ///< The end of written data and start of free space
+ data_type *endPos_; ///< Marks the end of the usable block area
+};
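+
+/*
+ * Editorial layout sketch of a Chunk over its underlying block, stated in
+ * terms of the accessors above:
+ *
+ *     underlyingBlock_                 endPos_
+ *     |                                |
+ *     v                                v
+ *     +-----------+==========+---------+
+ *     | truncated |   data   |  free   |
+ *     +-----------+==========+---------+
+ *                 ^          ^
+ *                 readPos_   writePos_
+ *
+ *     dataSize() == writePos_ - readPos_
+ *     freeSize() == endPos_  - writePos_
+ *     capacity() == endPos_  - readPos_
+ */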
+
+/**
+ * Compare underlying buffers and return true if they are equal
+ **/
+inline bool operator==(const Chunk &lhs, const Chunk &rhs) {
+ return lhs.underlyingBlock_ == rhs.underlyingBlock_;
+}
+
+/**
+ * Compare underlying buffers and return true if they are unequal
+ **/
+inline bool operator!=(const Chunk &lhs, const Chunk &rhs) {
+ return lhs.underlyingBlock_ != rhs.underlyingBlock_;
+}
+
+
+/**
+ * \brief Implementation details for Buffer class
+ *
+ * Internally, BufferImpl keeps two lists of chunks: one consists entirely of
+ * chunks containing data, and the other contains chunks with free space.
+ *
+ *
+ */
+
+class BufferImpl : boost::noncopyable
+{
+
+ /// Add a new chunk to the list of chunks for this buffer, growing the
+ /// buffer by the default block size.
+ void allocChunkChecked(size_type size = kDefaultBlockSize)
+ {
+ writeChunks_.push_back(Chunk(size));
+ freeSpace_ += writeChunks_.back().freeSize();
+ }
+
+ /// Add a new chunk to the list of chunks for this buffer, growing the
+ /// buffer by the requested size, but within the range of a minimum and
+ /// maximum.
+ void allocChunk(size_type size)
+ {
+ if(size < kMinBlockSize) {
+ size = kMinBlockSize;
+ }
+ else if (size > kMaxBlockSize) {
+ size = kMaxBlockSize;
+ }
+ allocChunkChecked(size);
+ }
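+
+    /*
+     * Editorial note: allocChunk() clamps to the block-size range above, so
+     * e.g. allocChunk(100) allocates a 4096-byte (kMinBlockSize) block and
+     * allocChunk(1 << 20) a 16384-byte (kMaxBlockSize) block; for large
+     * reservations reserveFreeSpace() therefore loops, adding several chunks.
+     */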
+
+ /// Update the state of the chunks after a write operation. This function
+ /// ensures the chunk states are consistent with the write.
+ void postWrite(size_type size)
+ {
+
+ // precondition to this function is that the writeChunk_.front()
+ // contains the data that was just written, so make sure writeChunks_
+ // is not empty:
+
+ assert(size <= freeSpace_ && !writeChunks_.empty());
+
+ // This is probably the one tricky part of BufferImpl. The data that
+ // was written now exists in writeChunks_.front(). Now we must make
+ // sure that same data exists in readChunks_.back().
+ //
+ // There are two cases:
+ //
+        // 1. readChunks_.back() and writeChunks_.front() refer to the same
+        // underlying block, in which case they both just need their cursor
+        // updated to reflect the new state.
+        //
+        // 2. readChunks_.back() is not the same block as writeChunks_.front(),
+        // in which case it should be, since writeChunks_.front() contains
+ // the next bit of data that will be appended to readChunks_, and
+ // therefore needs to be copied there so we can proceed with updating
+ // their state.
+ //
+
+        // if readChunks_.back() is not the same as writeChunks_.front(), make a copy
+ // of it there
+
+ if(readChunks_.empty() || (readChunks_.back() != writeChunks_.front())) {
+ const Chunk &curChunk = writeChunks_.front();
+ readChunks_.push_back(curChunk);
+
+ // Any data that existed in the write chunk previously doesn't
+ // belong to this buffer (otherwise it would have already been
+ // added to the readChunk_ list). Here, adjust the start of the
+ // readChunk to begin after any data already existing in curChunk
+
+ readChunks_.back().truncateFront( curChunk.dataSize());
+ }
+
+ assert(readChunks_.back().freeSize() == writeChunks_.front().freeSize());
+
+ // update the states of both readChunks_ and writeChunks_ to indicate that they are
+ // holding the new data
+
+ readChunks_.back().incrementCursor(size);
+ writeChunks_.front().incrementCursor(size);
+ size_ += size;
+ freeSpace_ -= size;
+
+ // if there is no more free space in writeChunks_, the next write cannot use
+ // it, so dispose of it now
+
+ if(writeChunks_.front().freeSize() == 0) {
+ writeChunks_.pop_front();
+ }
+ }
+
+ public:
+
+ typedef std::deque<Chunk> ChunkList;
+ typedef boost::shared_ptr<BufferImpl> SharedPtr;
+ typedef boost::shared_ptr<const BufferImpl> ConstSharedPtr;
+
+ /// Default constructor, creates a buffer without any chunks
+ BufferImpl() :
+ freeSpace_(0),
+ size_(0)
+ { }
+
+ /// Copy constructor, gets a copy of all the chunks with data.
+ explicit BufferImpl(const BufferImpl &src) :
+ readChunks_(src.readChunks_),
+ freeSpace_(0),
+ size_(src.size_)
+ { }
+
+ /// Amount of data held in this buffer.
+ size_type size() const {
+ return size_;
+ }
+
+ /// Capacity that may be written before the buffer must allocate more memory.
+ size_type freeSpace() const {
+ return freeSpace_;
+ }
+
+ /// Add enough free chunks to make the reservation size available.
+ /// Actual amount may be more (rounded up to next chunk).
+ void reserveFreeSpace(size_type reserveSize) {
+ while(freeSpace_ < reserveSize) {
+ allocChunk(reserveSize - freeSpace_);
+ }
+ }
+
+    /// Return the chunk list's begin iterator for reading.
+ ChunkList::const_iterator beginRead() const {
+ return readChunks_.begin();
+ }
+
+    /// Return the chunk list's end iterator for reading.
+ ChunkList::const_iterator endRead() const {
+ return readChunks_.end();
+ }
+
+    /// Return the chunk list's begin iterator for writing.
+ ChunkList::const_iterator beginWrite() const {
+ return writeChunks_.begin();
+ }
+
+ /// Return the chunk list's end iterator for writing.
+ ChunkList::const_iterator endWrite() const {
+ return writeChunks_.end();
+ }
+
+ /// Write a single value to buffer, add a new chunk if necessary.
+ template<typename T>
+ void writeTo(T val, const std::true_type&)
+ {
+ if(freeSpace_ && (sizeof(T) <= writeChunks_.front().freeSize())) {
+ // fast path: there's enough room in the writable chunk to copy
+ // the value directly
+ *(reinterpret_cast<T*>(writeChunks_.front().tellWritePos())) = val;
+ postWrite(sizeof(T));
+ }
+ else {
+ // need to fixup chunks first, so use the regular memcpy
+ // writeTo method
+ writeTo(reinterpret_cast<data_type*>(&val), sizeof(T));
+ }
+ }
+
+ /// An uninstantiable overload, selected when the is-fundamental check
+ /// fails; it triggers a compile-time assertion.
+ template<typename T>
void writeTo(T, const std::false_type&)
- {
- BOOST_STATIC_ASSERT(sizeof(T)==0);
- }
-
- /// Write a block of data to the buffer, adding new chunks if necessary.
- size_type writeTo(const data_type *data, size_type size)
- {
- size_type bytesLeft = size;
- while(bytesLeft) {
-
- if(freeSpace_ == 0) {
- allocChunkChecked();
- }
-
- Chunk &chunk = writeChunks_.front();
- size_type toCopy = std::min<size_type>(chunk.freeSize(), bytesLeft);
- assert(toCopy);
- memcpy(chunk.tellWritePos(), data, toCopy);
- postWrite(toCopy);
- data += toCopy;
- bytesLeft -= toCopy;
- }
- return size;
- }
-
- /// Update the internal state of the chunks after data was written via the write iterator.
- size_type wroteTo(size_type size)
- {
- assert(size <= freeSpace_);
- size_type bytesLeft = size;
- while (bytesLeft) {
-
- Chunk &chunk = writeChunks_.front();
- size_type wrote = std::min<size_type>(chunk.freeSize(), bytesLeft);
- assert(wrote);
- postWrite(wrote);
- bytesLeft -= wrote;
- }
- return size;
- }
-
- /// Append the chunks that have data in src to this buffer
- void append(const BufferImpl &src) {
- std::copy(src.readChunks_.begin(), src.readChunks_.end(), std::back_inserter(readChunks_));
- size_ += src.size_;
- }
-
- /// Remove all the chunks that contain data from this buffer.
- void discardData() {
- readChunks_.clear();
- size_ = 0;
- }
-
- /// Remove the specified amount of data from the chunks, starting at the front.
- void discardData(size_type bytes)
- {
- assert(bytes && bytes <= size_);
-
- size_type bytesToDiscard = bytes;
- while( bytesToDiscard ) {
-
- size_t currentSize = readChunks_.front().dataSize();
-
- // see if entire chunk is discarded
- if(currentSize <= bytesToDiscard) {
- readChunks_.pop_front();
- bytesToDiscard -= currentSize;
- }
- else {
- readChunks_.front().truncateFront(bytesToDiscard);
- bytesToDiscard = 0;
- }
- }
-
- size_ -= bytes;
- }
-
- /// Remove the specified amount of data from the chunks, moving the
- /// data to dest's chunks
- void extractData(BufferImpl &dest, size_type bytes)
- {
- assert(bytes && bytes <= size_);
-
- size_type bytesToExtract = bytes;
- while( bytesToExtract ) {
-
- size_t currentSize = readChunks_.front().dataSize();
- dest.readChunks_.push_back(readChunks_.front());
-
- // see if entire chunk was extracted
- if(currentSize <= bytesToExtract) {
- readChunks_.pop_front();
- bytesToExtract -= currentSize;
- }
- else {
- readChunks_.front().truncateFront(bytesToExtract);
- size_t excess = currentSize - bytesToExtract;
- dest.readChunks_.back().truncateBack(excess);
- bytesToExtract = 0;
- }
- }
-
- size_ -= bytes;
- dest.size_ += bytes;
- }
-
- /// Move data from this to the destination, leaving this buffer without data
- void extractData(BufferImpl &dest)
- {
- assert(dest.readChunks_.empty());
- dest.readChunks_.swap(readChunks_);
- dest.size_ = size_;
- size_ = 0;
- }
-
- /// Copy data to a different buffer by copying the chunks. It's
- /// a bit like extract, but without modifying the source buffer.
- void copyData(BufferImpl &dest,
- ChunkList::const_iterator iter,
- size_type offset,
- size_type bytes) const
- {
- // We are now positioned to start copying. Copy as many chunks as we
- // need; the first chunk may have a non-zero offset if the data to
- // copy is not at the start of the chunk.
- size_type copied = 0;
- while(copied < bytes) {
-
- dest.readChunks_.push_back(*iter);
-
- // offset only applies in the first chunk,
- // all subsequent chunks are copied from the start
- dest.readChunks_.back().truncateFront(offset);
- offset = 0;
-
- copied += dest.readChunks_.back().dataSize();
- ++iter;
- }
-
- // if the last chunk copied has more bytes than we need, truncate it
- size_type excess = copied - bytes;
- dest.readChunks_.back().truncateBack(excess);
-
- dest.size_ += bytes;
- }
-
- /// The number of chunks containing data. Used for debugging.
- int numDataChunks() const {
- return readChunks_.size();
- }
-
- /// The number of chunks containing free space (note that a chunk may
- /// not be entirely free). Used for debugging.
- int numFreeChunks() const {
- return writeChunks_.size();
- }
-
- /// Add unmanaged data to the buffer. The buffer will not automatically
- /// free the data, but it will call the supplied function when the data is
- /// no longer referenced by the buffer (or copies of the buffer).
- void appendForeignData(const data_type *data, size_type size, const free_func &func) {
- readChunks_.push_back(Chunk(data, size, func));
- size_ += size;
- }
-
- private:
-
- /// Assignment not allowed
- BufferImpl& operator=(const BufferImpl &src);
- /* {
- readChunks_.assign(src.readChunks_.begin(), src.readChunks_.end());
- size_ = src.size();
- return *this;
- } */
-
- ChunkList readChunks_; ///< chunks of this buffer containing data
- ChunkList writeChunks_; ///< chunks of this buffer containing free space
-
- size_type freeSpace_; ///< capacity of buffer before allocation required
- size_type size_; ///< amount of data in buffer
-
-};
-
-} // detail namespace
-
-} // namespace
-
-#endif
+ {
+ BOOST_STATIC_ASSERT(sizeof(T)==0);
+ }
+
+ /// Write a block of data to the buffer, adding new chunks if necessary.
+ size_type writeTo(const data_type *data, size_type size)
+ {
+ size_type bytesLeft = size;
+ while(bytesLeft) {
+
+ if(freeSpace_ == 0) {
+ allocChunkChecked();
+ }
+
+ Chunk &chunk = writeChunks_.front();
+ size_type toCopy = std::min<size_type>(chunk.freeSize(), bytesLeft);
+ assert(toCopy);
+ memcpy(chunk.tellWritePos(), data, toCopy);
+ postWrite(toCopy);
+ data += toCopy;
+ bytesLeft -= toCopy;
+ }
+ return size;
+ }
+
+ /// Update the internal state of the chunks after data was written via the write iterator.
+ size_type wroteTo(size_type size)
+ {
+ assert(size <= freeSpace_);
+ size_type bytesLeft = size;
+ while (bytesLeft) {
+
+ Chunk &chunk = writeChunks_.front();
+ size_type wrote = std::min<size_type>(chunk.freeSize(), bytesLeft);
+ assert(wrote);
+ postWrite(wrote);
+ bytesLeft -= wrote;
+ }
+ return size;
+ }
+
+ /// Append the chunks that have data in src to this buffer
+ void append(const BufferImpl &src) {
+ std::copy(src.readChunks_.begin(), src.readChunks_.end(), std::back_inserter(readChunks_));
+ size_ += src.size_;
+ }
+
+ /// Remove all the chunks that contain data from this buffer.
+ void discardData() {
+ readChunks_.clear();
+ size_ = 0;
+ }
+
+ /// Remove the specified amount of data from the chunks, starting at the front.
+ void discardData(size_type bytes)
+ {
+ assert(bytes && bytes <= size_);
+
+ size_type bytesToDiscard = bytes;
+ while( bytesToDiscard ) {
+
+ size_t currentSize = readChunks_.front().dataSize();
+
+ // see if entire chunk is discarded
+ if(currentSize <= bytesToDiscard) {
+ readChunks_.pop_front();
+ bytesToDiscard -= currentSize;
+ }
+ else {
+ readChunks_.front().truncateFront(bytesToDiscard);
+ bytesToDiscard = 0;
+ }
+ }
+
+ size_ -= bytes;
+ }
+
+ /// Remove the specified amount of data from the chunks, moving the
+ /// data to dest's chunks
+ void extractData(BufferImpl &dest, size_type bytes)
+ {
+ assert(bytes && bytes <= size_);
+
+ size_type bytesToExtract = bytes;
+ while( bytesToExtract ) {
+
+ size_t currentSize = readChunks_.front().dataSize();
+ dest.readChunks_.push_back(readChunks_.front());
+
+ // see if entire chunk was extracted
+ if(currentSize <= bytesToExtract) {
+ readChunks_.pop_front();
+ bytesToExtract -= currentSize;
+ }
+ else {
+ readChunks_.front().truncateFront(bytesToExtract);
+ size_t excess = currentSize - bytesToExtract;
+ dest.readChunks_.back().truncateBack(excess);
+ bytesToExtract = 0;
+ }
+ }
+
+ size_ -= bytes;
+ dest.size_ += bytes;
+ }
+
+ /// Move data from this to the destination, leaving this buffer without data
+ void extractData(BufferImpl &dest)
+ {
+ assert(dest.readChunks_.empty());
+ dest.readChunks_.swap(readChunks_);
+ dest.size_ = size_;
+ size_ = 0;
+ }
+
+ /// Copy data to a different buffer by copying the chunks. It's
+ /// a bit like extract, but without modifying the source buffer.
+ void copyData(BufferImpl &dest,
+ ChunkList::const_iterator iter,
+ size_type offset,
+ size_type bytes) const
+ {
+ // We are now positioned to start copying. Copy as many chunks as we
+ // need; the first chunk may have a non-zero offset if the data to
+ // copy is not at the start of the chunk.
+ size_type copied = 0;
+ while(copied < bytes) {
+
+ dest.readChunks_.push_back(*iter);
+
+ // offset only applies in the first chunk,
+ // all subsequent chunks are copied from the start
+ dest.readChunks_.back().truncateFront(offset);
+ offset = 0;
+
+ copied += dest.readChunks_.back().dataSize();
+ ++iter;
+ }
+
+ // if the last chunk copied has more bytes than we need, truncate it
+ size_type excess = copied - bytes;
+ dest.readChunks_.back().truncateBack(excess);
+
+ dest.size_ += bytes;
+ }
+
+ /// The number of chunks containing data. Used for debugging.
+ int numDataChunks() const {
+ return readChunks_.size();
+ }
+
+ /// The number of chunks containing free space (note that a chunk may
+ /// not be entirely free). Used for debugging.
+ int numFreeChunks() const {
+ return writeChunks_.size();
+ }
+
+ /// Add unmanaged data to the buffer. The buffer will not automatically
+ /// free the data, but it will call the supplied function when the data is
+ /// no longer referenced by the buffer (or copies of the buffer).
+ void appendForeignData(const data_type *data, size_type size, const free_func &func) {
+ readChunks_.push_back(Chunk(data, size, func));
+ size_ += size;
+ }
+
+ private:
+
+ /// Assignment not allowed
+ BufferImpl& operator=(const BufferImpl &src);
+ /* {
+ readChunks_.assign(src.readChunks_.begin(), src.readChunks_.end());
+ size_ = src.size();
+ return *this;
+ } */
+
+ ChunkList readChunks_; ///< chunks of this buffer containing data
+ ChunkList writeChunks_; ///< chunks of this buffer containing free space
+
+ size_type freeSpace_; ///< capacity of buffer before allocation required
+ size_type size_; ///< amount of data in buffer
+
+};
+
+} // detail namespace
+
+} // namespace
+
+#endif
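
The postWrite() logic in the hunk above is the subtle part of BufferImpl, so a minimal, self-contained sketch may help. It is not part of the diff: Chunk here is a simplified stand-in (the real chunks share reference-counted memory blocks), but the two-list bookkeeping is the same idea, with the front write chunk and the back read chunk viewing the same block and both cursors advancing after a write.

#include <algorithm>
#include <cassert>
#include <cstring>
#include <deque>
#include <iostream>
#include <memory>

struct Chunk {
    std::shared_ptr<char[]> block;  // storage shared between both lists
    size_t readPos = 0;             // start of valid data
    size_t writePos = 0;            // end of valid data / start of free space
    size_t capacity = 0;

    explicit Chunk(size_t cap) : block(new char[cap]), capacity(cap) {}

    size_t dataSize() const { return writePos - readPos; }
    size_t freeSize() const { return capacity - writePos; }
    char* tellWritePos() const { return block.get() + writePos; }
    void incrementCursor(size_t n) { writePos += n; }
    void truncateFront(size_t n) { readPos += n; }
    bool sameBlock(const Chunk& o) const { return block == o.block; }
};

struct MiniBuffer {
    std::deque<Chunk> readChunks, writeChunks;
    size_t size = 0, freeSpace = 0;

    void allocChunk(size_t cap) {
        writeChunks.push_back(Chunk(cap));
        freeSpace += cap;
    }

    // Mirrors BufferImpl::postWrite(): make readChunks.back() view the
    // block that was just written, then advance both cursors.
    void postWrite(size_t n) {
        assert(n <= freeSpace && !writeChunks.empty());
        if (readChunks.empty() ||
            !readChunks.back().sameBlock(writeChunks.front())) {
            Chunk copy = writeChunks.front();
            copy.truncateFront(copy.dataSize());  // skip data we don't own
            readChunks.push_back(copy);
        }
        readChunks.back().incrementCursor(n);
        writeChunks.front().incrementCursor(n);
        size += n;
        freeSpace -= n;
        if (writeChunks.front().freeSize() == 0)  // a full chunk can never
            writeChunks.pop_front();              // be written to again
    }

    void write(const char* data, size_t n) {
        while (n) {
            if (freeSpace == 0) allocChunk(8);    // tiny chunks, for the demo
            Chunk& c = writeChunks.front();
            size_t toCopy = std::min(c.freeSize(), n);
            std::memcpy(c.tellWritePos(), data, toCopy);
            postWrite(toCopy);
            data += toCopy;
            n -= toCopy;
        }
    }
};

int main() {
    MiniBuffer b;
    b.write("hello, chunked world", 20);
    std::cout << b.size << " bytes across " << b.readChunks.size()
              << " data chunks\n";  // 20 bytes across 3 data chunks
}

The copy inside postWrite is cheap because a chunk is only a view (two cursors plus a shared pointer); that is also why a full write chunk can be popped immediately while the read list keeps referencing the same block.
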
diff --git a/contrib/libs/apache/avro/api/buffer/detail/BufferDetailIterator.hh b/contrib/libs/apache/avro/api/buffer/detail/BufferDetailIterator.hh
index c05f219a64c..ad1dee6fe2a 100644
--- a/contrib/libs/apache/avro/api/buffer/detail/BufferDetailIterator.hh
+++ b/contrib/libs/apache/avro/api/buffer/detail/BufferDetailIterator.hh
@@ -1,230 +1,230 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_BufferDetailIterator_hh__
-#define avro_BufferDetailIterator_hh__
-
-#include "BufferDetail.hh"
-
-/**
- * \file BufferDetailIterator.hh
- *
- * \brief The implementation details for the Buffer iterators.
- **/
-
-namespace avro {
-
-namespace detail {
-
-/**
- * \brief Implements conversion from a chunk to asio::const_buffer
- *
- * Iterators for an InputBuffer will iterate over the list of chunks, so
- * internally they contain an iterator. But the iterator needs to be
- * convertible to an asio buffer for use in boost::asio functions. This class
- * wraps the iterator with a cast operator to do this conversion.
- **/
-
-struct InputIteratorHelper
-{
- /// Construct a helper with an unassigned iterator.
- InputIteratorHelper() :
- iter_()
- {}
-
- /// Construct a helper with an iterator.
- InputIteratorHelper(const BufferImpl::ChunkList::const_iterator &iter) :
- iter_(iter)
- {}
-
- /// The location of valid data in this chunk.
- const data_type *data() const {
- return iter_->tellReadPos();
- }
-
- /// The size of valid data in this chunk.
- size_type size() const {
- return iter_->dataSize();
- }
-
- /// Conversion operator. It doesn't check for null, because the only
- /// time the chunk should be null is when it's the end() iterator,
- /// which should never be dereferenced anyway.
-#ifdef HAVE_BOOST_ASIO
- operator ConstAsioBuffer() const {
- return ConstAsioBuffer(data(), size());
- }
-#endif
-
- BufferImpl::ChunkList::const_iterator iter_; ///< the current iterator
-};
-
-/**
- * \brief Implements conversion from a chunk to asio::buffer
- *
- * Iterators for an OutputBuffer will iterate over the list of chunks, so
- * internally they contain an iterator. But the iterator needs to be
- * convertible to an asio buffer for use in boost::asio functions. This class
- * wraps the iterator with a cast operator to do this conversion.
- */
-
-struct OutputIteratorHelper
-{
- /// Construct a helper with an unassigned iterator.
- OutputIteratorHelper() :
- iter_()
- {}
-
- /// Construct a helper with an iterator.
- OutputIteratorHelper(const BufferImpl::ChunkList::const_iterator &iter) :
- iter_(iter)
- {}
-
- /// The location of the first writable byte in this chunk.
- data_type *data() const {
- return iter_->tellWritePos();
- }
-
- /// The size of area that can be written in this chunk.
- size_type size() const {
- return iter_->freeSize();
- }
-
- /// Conversion operator. It doesn't check for null, because the only
- /// time the chunk should be null is when it's the end() iterator,
- /// which should never be dereferenced anyway.
-#ifdef HAVE_BOOST_ASIO
- operator MutableAsioBuffer() const {
- return MutableAsioBuffer(data(), size());
- }
-#endif
-
- BufferImpl::ChunkList::const_iterator iter_; ///< the current iterator
-};
-
-/**
- * \brief Implements the iterator for Buffer; it iterates through the
- * buffer's chunks.
- **/
-
-template<typename Helper>
-class BufferIterator
-{
-
- public:
-
- typedef BufferIterator<Helper> this_type;
-
- /**
- * @name Typedefs
- *
- * STL iterators define the following declarations. According to the
- * boost::asio documentation, the library expects the iterator to be
- * bidirectional; however, this implements only the forward iterator type.
- * So far this has not caused any problems with asio, but that may change
- * if future versions of asio require it.
- **/
-
- //@{
- typedef std::forward_iterator_tag iterator_category; // this is a lie to appease asio
- typedef Helper value_type;
- typedef std::ptrdiff_t difference_type;
- typedef value_type* pointer;
- typedef value_type& reference;
- //@}
-
- /// Construct an uninitialized iterator.
- BufferIterator() :
- helper_()
- { }
-
- /* The default implementations are good here
- /// Copy constructor.
- BufferIterator(const BufferIterator &src) :
- helper_(src.helper_)
- { }
- /// Assignment.
- this_type& operator= (const this_type &rhs) {
- helper_ = rhs.helper_;
- return *this;
- }
- */
-
- /// Construct iterator at the position in the buffer's chunk list.
- explicit BufferIterator(BufferImpl::ChunkList::const_iterator iter) :
- helper_(iter)
- { }
-
- /// Dereference iterator, returns InputIteratorHelper or OutputIteratorHelper wrapper.
- reference operator *() {
- return helper_;
- }
-
- /// Dereference iterator, returns const InputIteratorHelper or OutputIteratorHelper wrapper.
- const value_type &operator *() const {
- return helper_;
- }
-
- /// Dereference iterator, returns InputIteratorHelper or OutputIteratorHelper wrapper.
- pointer operator->() {
- return &helper_;
- }
-
- /// Dereference iterator, returns const InputIteratorHelper or OutputIteratorHelper wrapper.
- const value_type *operator->() const {
- return &helper_;
- }
-
- /// Increment to next chunk in list, or to end() iterator.
- this_type& operator++()
- {
- ++helper_.iter_;
- return *this;
- }
-
- /// Increment to next chunk in list, or to end() iterator.
- this_type operator++(int)
- {
- this_type ret = *this;
- ++helper_.iter_;
- return ret;
- }
-
- /// True if iterators point to same chunks.
- bool operator==(const this_type &rhs) const {
- return (helper_.iter_ == rhs.helper_.iter_);
- }
-
- /// True if iterators point to different chunks.
- bool operator!=(const this_type &rhs) const {
- return (helper_.iter_ != rhs.helper_.iter_);
- }
-
- private:
-
- Helper helper_;
-};
-
-typedef BufferIterator<InputIteratorHelper> InputBufferIterator;
-typedef BufferIterator<OutputIteratorHelper> OutputBufferIterator;
-
-} // detail namespace
-
-} // namespace
-
-#endif
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_BufferDetailIterator_hh__
+#define avro_BufferDetailIterator_hh__
+
+#include "BufferDetail.hh"
+
+/**
+ * \file BufferDetailIterator.hh
+ *
+ * \brief The implementation details for the Buffer iterators.
+ **/
+
+namespace avro {
+
+namespace detail {
+
+/**
+ * \brief Implements conversion from a chunk to asio::const_buffer
+ *
+ * Iterators for an InputBuffer will iterate over the list of chunks, so
+ * internally they contain an iterator. But the iterator needs to be
+ * convertible to an asio buffer for use in boost::asio functions. This class
+ * wraps the iterator with a cast operator to do this conversion.
+ **/
+
+struct InputIteratorHelper
+{
+ /// Construct a helper with an unassigned iterator.
+ InputIteratorHelper() :
+ iter_()
+ {}
+
+ /// Construct a helper with an iterator.
+ InputIteratorHelper(const BufferImpl::ChunkList::const_iterator &iter) :
+ iter_(iter)
+ {}
+
+ /// The location of valid data in this chunk.
+ const data_type *data() const {
+ return iter_->tellReadPos();
+ }
+
+ /// The size of valid data in this chunk.
+ size_type size() const {
+ return iter_->dataSize();
+ }
+
+ /// Conversion operator. It doesn't check for null, because the only
+ /// time the chunk should be null is when it's the end() iterator,
+ /// which should never be dereferenced anyway.
+#ifdef HAVE_BOOST_ASIO
+ operator ConstAsioBuffer() const {
+ return ConstAsioBuffer(data(), size());
+ }
+#endif
+
+ BufferImpl::ChunkList::const_iterator iter_; ///< the current iterator
+};
+
+/**
+ * \brief Implements conversion from a chunk to asio::buffer
+ *
+ * Iterators for an OutputBuffer will iterate over the list of chunks, so
+ * internally they contain an iterator. But the iterator needs to be
+ * convertible to an asio buffer for use in boost::asio functions. This class
+ * wraps the iterator with a cast operator to do this conversion.
+ */
+
+struct OutputIteratorHelper
+{
+ /// Construct a helper with an unassigned iterator.
+ OutputIteratorHelper() :
+ iter_()
+ {}
+
+ /// Construct a helper with an iterator.
+ OutputIteratorHelper(const BufferImpl::ChunkList::const_iterator &iter) :
+ iter_(iter)
+ {}
+
+ /// The location of the first writable byte in this chunk.
+ data_type *data() const {
+ return iter_->tellWritePos();
+ }
+
+ /// The size of area that can be written in this chunk.
+ size_type size() const {
+ return iter_->freeSize();
+ }
+
+ /// Conversion operator. It doesn't check for null, because the only
+ /// time the chunk should be null is when it's the end() iterator,
+ /// which should never be dereferenced anyway.
+#ifdef HAVE_BOOST_ASIO
+ operator MutableAsioBuffer() const {
+ return MutableAsioBuffer(data(), size());
+ }
+#endif
+
+ BufferImpl::ChunkList::const_iterator iter_; ///< the current iterator
+};
+
+/**
+ * \brief Implements the iterator for Buffer; it iterates through the
+ * buffer's chunks.
+ **/
+
+template<typename Helper>
+class BufferIterator
+{
+
+ public:
+
+ typedef BufferIterator<Helper> this_type;
+
+ /**
+ * @name Typedefs
+ *
+ * STL iterators define the following declarations. According to the
+ * boost::asio documentation, the library expects the iterator to be
+ * bidirectional; however, this implements only the forward iterator type.
+ * So far this has not caused any problems with asio, but that may change
+ * if future versions of asio require it.
+ **/
+
+ //@{
+ typedef std::forward_iterator_tag iterator_category; // this is a lie to appease asio
+ typedef Helper value_type;
+ typedef std::ptrdiff_t difference_type;
+ typedef value_type* pointer;
+ typedef value_type& reference;
+ //@}
+
+ /// Construct an uninitialized iterator.
+ BufferIterator() :
+ helper_()
+ { }
+
+ /* The default implementations are good here
+ /// Copy constructor.
+ BufferIterator(const BufferIterator &src) :
+ helper_(src.helper_)
+ { }
+ /// Assignment.
+ this_type& operator= (const this_type &rhs) {
+ helper_ = rhs.helper_;
+ return *this;
+ }
+ */
+
+ /// Construct iterator at the position in the buffer's chunk list.
+ explicit BufferIterator(BufferImpl::ChunkList::const_iterator iter) :
+ helper_(iter)
+ { }
+
+ /// Dereference iterator, returns InputIteratorHelper or OutputIteratorHelper wrapper.
+ reference operator *() {
+ return helper_;
+ }
+
+ /// Dereference iterator, returns const InputIteratorHelper or OutputIteratorHelper wrapper.
+ const value_type &operator *() const {
+ return helper_;
+ }
+
+ /// Dereference iterator, returns InputIteratorHelper or OutputIteratorHelper wrapper.
+ pointer operator->() {
+ return &helper_;
+ }
+
+ /// Dereference iterator, returns const InputIteratorHelper or OutputIteratorHelper wrapper.
+ const value_type *operator->() const {
+ return &helper_;
+ }
+
+ /// Increment to next chunk in list, or to end() iterator.
+ this_type& operator++()
+ {
+ ++helper_.iter_;
+ return *this;
+ }
+
+ /// Increment to next chunk in list, or to end() iterator.
+ this_type operator++(int)
+ {
+ this_type ret = *this;
+ ++helper_.iter_;
+ return ret;
+ }
+
+ /// True if iterators point to same chunks.
+ bool operator==(const this_type &rhs) const {
+ return (helper_.iter_ == rhs.helper_.iter_);
+ }
+
+ /// True if iterators point to different chunks.
+ bool operator!=(const this_type &rhs) const {
+ return (helper_.iter_ != rhs.helper_.iter_);
+ }
+
+ private:
+
+ Helper helper_;
+};
+
+typedef BufferIterator<InputIteratorHelper> InputBufferIterator;
+typedef BufferIterator<OutputIteratorHelper> OutputBufferIterator;
+
+} // detail namespace
+
+} // namespace
+
+#endif
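
The helper types in this hunk exist only to bridge two interfaces: a std::deque iterator over chunks on one side and boost::asio's (pointer, length) buffer views on the other. Below is a standalone sketch of the pattern, not part of the diff, with a simplified Chunk and a plain std::pair standing in for asio::const_buffer.

#include <cstddef>
#include <deque>
#include <iostream>
#include <string>
#include <utility>

struct Chunk {
    std::string data;  // stand-in for a shared memory block
    const char* tellReadPos() const { return data.data(); }
    std::size_t dataSize() const { return data.size(); }
};

using ChunkList = std::deque<Chunk>;
using ConstBufferView = std::pair<const char*, std::size_t>;  // asio-like view

// Wraps a chunk iterator and converts to the flat (pointer, length) shape.
struct InputIteratorHelper {
    ChunkList::const_iterator iter_;
    const char* data() const { return iter_->tellReadPos(); }
    std::size_t size() const { return iter_->dataSize(); }
    operator ConstBufferView() const { return {data(), size()}; }
};

// Forward iterator whose operator* yields the helper, not the chunk itself.
struct InputBufferIterator {
    InputIteratorHelper helper_;
    InputIteratorHelper& operator*() { return helper_; }
    InputBufferIterator& operator++() { ++helper_.iter_; return *this; }
    bool operator!=(const InputBufferIterator& rhs) const {
        return helper_.iter_ != rhs.helper_.iter_;
    }
};

int main() {
    ChunkList chunks = {{"hello, "}, {"chunked "}, {"world"}};
    InputBufferIterator it{{chunks.begin()}}, end{{chunks.end()}};
    for (; it != end; ++it) {
        ConstBufferView v = *it;  // implicit conversion, as asio would do
        std::cout.write(v.first, static_cast<std::streamsize>(v.second));
    }
    std::cout << '\n';
}

Dereferencing yields the helper rather than the chunk itself, so asio's scatter/gather machinery can consume a whole range of chunks through the implicit conversion without the buffer ever being flattened.
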
diff --git a/contrib/libs/apache/avro/avro/AvroParse.hh b/contrib/libs/apache/avro/avro/AvroParse.hh
index 1ce878e5d10..06b1930d17f 100644
--- a/contrib/libs/apache/avro/avro/AvroParse.hh
+++ b/contrib/libs/apache/avro/avro/AvroParse.hh
@@ -1 +1 @@
-#include "../api/AvroParse.hh" /* inclink generated by yamaker */
+#include "../api/AvroParse.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/AvroSerialize.hh b/contrib/libs/apache/avro/avro/AvroSerialize.hh
index 17f3da8cc5c..fd2c96a77bc 100644
--- a/contrib/libs/apache/avro/avro/AvroSerialize.hh
+++ b/contrib/libs/apache/avro/avro/AvroSerialize.hh
@@ -1 +1 @@
-#include "../api/AvroSerialize.hh" /* inclink generated by yamaker */
+#include "../api/AvroSerialize.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/AvroTraits.hh b/contrib/libs/apache/avro/avro/AvroTraits.hh
index edb19eb8400..7e6e49a1830 100644
--- a/contrib/libs/apache/avro/avro/AvroTraits.hh
+++ b/contrib/libs/apache/avro/avro/AvroTraits.hh
@@ -1 +1 @@
-#include "../api/AvroTraits.hh" /* inclink generated by yamaker */
+#include "../api/AvroTraits.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Compiler.hh b/contrib/libs/apache/avro/avro/Compiler.hh
index fa43ede18a6..9e80353938b 100644
--- a/contrib/libs/apache/avro/avro/Compiler.hh
+++ b/contrib/libs/apache/avro/avro/Compiler.hh
@@ -1 +1 @@
-#include "../api/Compiler.hh" /* inclink generated by yamaker */
+#include "../api/Compiler.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Config.hh b/contrib/libs/apache/avro/avro/Config.hh
index ae19fb85b8b..105b0f910c3 100644
--- a/contrib/libs/apache/avro/avro/Config.hh
+++ b/contrib/libs/apache/avro/avro/Config.hh
@@ -1 +1 @@
-#include "../api/Config.hh" /* inclink generated by yamaker */
+#include "../api/Config.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/DataFile.hh b/contrib/libs/apache/avro/avro/DataFile.hh
index 401cb5fc6da..1b7a87ca959 100644
--- a/contrib/libs/apache/avro/avro/DataFile.hh
+++ b/contrib/libs/apache/avro/avro/DataFile.hh
@@ -1 +1 @@
-#include "../api/DataFile.hh" /* inclink generated by yamaker */
+#include "../api/DataFile.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Decoder.hh b/contrib/libs/apache/avro/avro/Decoder.hh
index 1d8d79c4e59..b6b77aac74f 100644
--- a/contrib/libs/apache/avro/avro/Decoder.hh
+++ b/contrib/libs/apache/avro/avro/Decoder.hh
@@ -1 +1 @@
-#include "../api/Decoder.hh" /* inclink generated by yamaker */
+#include "../api/Decoder.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Encoder.hh b/contrib/libs/apache/avro/avro/Encoder.hh
index b390aee4df5..8d5d5225490 100644
--- a/contrib/libs/apache/avro/avro/Encoder.hh
+++ b/contrib/libs/apache/avro/avro/Encoder.hh
@@ -1 +1 @@
-#include "../api/Encoder.hh" /* inclink generated by yamaker */
+#include "../api/Encoder.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Exception.hh b/contrib/libs/apache/avro/avro/Exception.hh
index 63ad410a4c5..3037d8c087f 100644
--- a/contrib/libs/apache/avro/avro/Exception.hh
+++ b/contrib/libs/apache/avro/avro/Exception.hh
@@ -1 +1 @@
-#include "../api/Exception.hh" /* inclink generated by yamaker */
+#include "../api/Exception.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Generic.hh b/contrib/libs/apache/avro/avro/Generic.hh
index 8d649971835..29b4e93a16b 100644
--- a/contrib/libs/apache/avro/avro/Generic.hh
+++ b/contrib/libs/apache/avro/avro/Generic.hh
@@ -1 +1 @@
-#include "../api/Generic.hh" /* inclink generated by yamaker */
+#include "../api/Generic.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/GenericDatum.hh b/contrib/libs/apache/avro/avro/GenericDatum.hh
index e98719027e7..b894d3d6c05 100644
--- a/contrib/libs/apache/avro/avro/GenericDatum.hh
+++ b/contrib/libs/apache/avro/avro/GenericDatum.hh
@@ -1 +1 @@
-#include "../api/GenericDatum.hh" /* inclink generated by yamaker */
+#include "../api/GenericDatum.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Layout.hh b/contrib/libs/apache/avro/avro/Layout.hh
index f0c927217f7..89c307fa3b7 100644
--- a/contrib/libs/apache/avro/avro/Layout.hh
+++ b/contrib/libs/apache/avro/avro/Layout.hh
@@ -1 +1 @@
-#include "../api/Layout.hh" /* inclink generated by yamaker */
+#include "../api/Layout.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/LogicalType.hh b/contrib/libs/apache/avro/avro/LogicalType.hh
index 2a7eaa375ac..ec7b1450efd 100644
--- a/contrib/libs/apache/avro/avro/LogicalType.hh
+++ b/contrib/libs/apache/avro/avro/LogicalType.hh
@@ -1 +1 @@
-#include "../api/LogicalType.hh" /* inclink generated by yamaker */
+#include "../api/LogicalType.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Node.hh b/contrib/libs/apache/avro/avro/Node.hh
index 600d9e8e225..2673bee1d46 100644
--- a/contrib/libs/apache/avro/avro/Node.hh
+++ b/contrib/libs/apache/avro/avro/Node.hh
@@ -1 +1 @@
-#include "../api/Node.hh" /* inclink generated by yamaker */
+#include "../api/Node.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/NodeConcepts.hh b/contrib/libs/apache/avro/avro/NodeConcepts.hh
index 5e532e20b15..6a915727aa1 100644
--- a/contrib/libs/apache/avro/avro/NodeConcepts.hh
+++ b/contrib/libs/apache/avro/avro/NodeConcepts.hh
@@ -1 +1 @@
-#include "../api/NodeConcepts.hh" /* inclink generated by yamaker */
+#include "../api/NodeConcepts.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/NodeImpl.hh b/contrib/libs/apache/avro/avro/NodeImpl.hh
index d4893d50a89..23fe131b4a7 100644
--- a/contrib/libs/apache/avro/avro/NodeImpl.hh
+++ b/contrib/libs/apache/avro/avro/NodeImpl.hh
@@ -1 +1 @@
-#include "../api/NodeImpl.hh" /* inclink generated by yamaker */
+#include "../api/NodeImpl.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Parser.hh b/contrib/libs/apache/avro/avro/Parser.hh
index 28eb7c4e166..9c790dae758 100644
--- a/contrib/libs/apache/avro/avro/Parser.hh
+++ b/contrib/libs/apache/avro/avro/Parser.hh
@@ -1 +1 @@
-#include "../api/Parser.hh" /* inclink generated by yamaker */
+#include "../api/Parser.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Reader.hh b/contrib/libs/apache/avro/avro/Reader.hh
index cdfa6dec444..807fbc73a3c 100644
--- a/contrib/libs/apache/avro/avro/Reader.hh
+++ b/contrib/libs/apache/avro/avro/Reader.hh
@@ -1 +1 @@
-#include "../api/Reader.hh" /* inclink generated by yamaker */
+#include "../api/Reader.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Resolver.hh b/contrib/libs/apache/avro/avro/Resolver.hh
index bc3b3fb08f9..40b8ced1802 100644
--- a/contrib/libs/apache/avro/avro/Resolver.hh
+++ b/contrib/libs/apache/avro/avro/Resolver.hh
@@ -1 +1 @@
-#include "../api/Resolver.hh" /* inclink generated by yamaker */
+#include "../api/Resolver.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/ResolverSchema.hh b/contrib/libs/apache/avro/avro/ResolverSchema.hh
index f4d41e1de2e..ad5ae6018d9 100644
--- a/contrib/libs/apache/avro/avro/ResolverSchema.hh
+++ b/contrib/libs/apache/avro/avro/ResolverSchema.hh
@@ -1 +1 @@
-#include "../api/ResolverSchema.hh" /* inclink generated by yamaker */
+#include "../api/ResolverSchema.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/ResolvingReader.hh b/contrib/libs/apache/avro/avro/ResolvingReader.hh
index 04e49127471..3652448bc03 100644
--- a/contrib/libs/apache/avro/avro/ResolvingReader.hh
+++ b/contrib/libs/apache/avro/avro/ResolvingReader.hh
@@ -1 +1 @@
-#include "../api/ResolvingReader.hh" /* inclink generated by yamaker */
+#include "../api/ResolvingReader.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Schema.hh b/contrib/libs/apache/avro/avro/Schema.hh
index 515b9d58556..0a2c3371c05 100644
--- a/contrib/libs/apache/avro/avro/Schema.hh
+++ b/contrib/libs/apache/avro/avro/Schema.hh
@@ -1 +1 @@
-#include "../api/Schema.hh" /* inclink generated by yamaker */
+#include "../api/Schema.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/SchemaResolution.hh b/contrib/libs/apache/avro/avro/SchemaResolution.hh
index 3aef7ff60eb..e4fe3e9cdbe 100644
--- a/contrib/libs/apache/avro/avro/SchemaResolution.hh
+++ b/contrib/libs/apache/avro/avro/SchemaResolution.hh
@@ -1 +1 @@
-#include "../api/SchemaResolution.hh" /* inclink generated by yamaker */
+#include "../api/SchemaResolution.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Serializer.hh b/contrib/libs/apache/avro/avro/Serializer.hh
index 538b57dfa26..04327c2e58d 100644
--- a/contrib/libs/apache/avro/avro/Serializer.hh
+++ b/contrib/libs/apache/avro/avro/Serializer.hh
@@ -1 +1 @@
-#include "../api/Serializer.hh" /* inclink generated by yamaker */
+#include "../api/Serializer.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Specific.hh b/contrib/libs/apache/avro/avro/Specific.hh
index 4f8595c03fb..759a4d768ee 100644
--- a/contrib/libs/apache/avro/avro/Specific.hh
+++ b/contrib/libs/apache/avro/avro/Specific.hh
@@ -1 +1 @@
-#include "../api/Specific.hh" /* inclink generated by yamaker */
+#include "../api/Specific.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Stream.hh b/contrib/libs/apache/avro/avro/Stream.hh
index e056f5ae299..e575e0c80fb 100644
--- a/contrib/libs/apache/avro/avro/Stream.hh
+++ b/contrib/libs/apache/avro/avro/Stream.hh
@@ -1 +1 @@
-#include "../api/Stream.hh" /* inclink generated by yamaker */
+#include "../api/Stream.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Types.hh b/contrib/libs/apache/avro/avro/Types.hh
index ad7ee126f4f..bcc9a982ac1 100644
--- a/contrib/libs/apache/avro/avro/Types.hh
+++ b/contrib/libs/apache/avro/avro/Types.hh
@@ -1 +1 @@
-#include "../api/Types.hh" /* inclink generated by yamaker */
+#include "../api/Types.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/ValidSchema.hh b/contrib/libs/apache/avro/avro/ValidSchema.hh
index a0845d77b1d..2e0adf09dd4 100644
--- a/contrib/libs/apache/avro/avro/ValidSchema.hh
+++ b/contrib/libs/apache/avro/avro/ValidSchema.hh
@@ -1 +1 @@
-#include "../api/ValidSchema.hh" /* inclink generated by yamaker */
+#include "../api/ValidSchema.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Validator.hh b/contrib/libs/apache/avro/avro/Validator.hh
index bca184c46d2..f0bb3673579 100644
--- a/contrib/libs/apache/avro/avro/Validator.hh
+++ b/contrib/libs/apache/avro/avro/Validator.hh
@@ -1 +1 @@
-#include "../api/Validator.hh" /* inclink generated by yamaker */
+#include "../api/Validator.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Writer.hh b/contrib/libs/apache/avro/avro/Writer.hh
index f0ebac09ddf..af44f03bf1a 100644
--- a/contrib/libs/apache/avro/avro/Writer.hh
+++ b/contrib/libs/apache/avro/avro/Writer.hh
@@ -1 +1 @@
-#include "../api/Writer.hh" /* inclink generated by yamaker */
+#include "../api/Writer.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/avro/Zigzag.hh b/contrib/libs/apache/avro/avro/Zigzag.hh
index db263917cab..8b2235f012d 100644
--- a/contrib/libs/apache/avro/avro/Zigzag.hh
+++ b/contrib/libs/apache/avro/avro/Zigzag.hh
@@ -1 +1 @@
-#include "../api/Zigzag.hh" /* inclink generated by yamaker */
+#include "../api/Zigzag.hh" /* inclink generated by yamaker */
diff --git a/contrib/libs/apache/avro/impl/BinaryDecoder.cc b/contrib/libs/apache/avro/impl/BinaryDecoder.cc
index 71cbf9f1078..a6e6055b7f1 100644
--- a/contrib/libs/apache/avro/impl/BinaryDecoder.cc
+++ b/contrib/libs/apache/avro/impl/BinaryDecoder.cc
@@ -1,252 +1,252 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define __STDC_LIMIT_MACROS
-
-#include <memory>
-#include "Decoder.hh"
-#include "Zigzag.hh"
-#include "Exception.hh"
-
-namespace avro {
-
-using std::make_shared;
-
-class BinaryDecoder : public Decoder {
- StreamReader in_;
- const uint8_t* next_;
- const uint8_t* end_;
-
- void init(InputStream& ib);
- void decodeNull();
- bool decodeBool();
- int32_t decodeInt();
- int64_t decodeLong();
- float decodeFloat();
- double decodeDouble();
- void decodeString(std::string& value);
- void skipString();
- void decodeBytes(std::vector<uint8_t>& value);
- void skipBytes();
- void decodeFixed(size_t n, std::vector<uint8_t>& value);
- void skipFixed(size_t n);
- size_t decodeEnum();
- size_t arrayStart();
- size_t arrayNext();
- size_t skipArray();
- size_t mapStart();
- size_t mapNext();
- size_t skipMap();
- size_t decodeUnionIndex();
-
- int64_t doDecodeLong();
- size_t doDecodeItemCount();
- size_t doDecodeLength();
- void drain();
- void more();
-};
-
-DecoderPtr binaryDecoder()
-{
- return make_shared<BinaryDecoder>();
-}
-
-void BinaryDecoder::init(InputStream& is)
-{
- in_.reset(is);
-}
-
-void BinaryDecoder::decodeNull()
-{
-}
-
-bool BinaryDecoder::decodeBool()
-{
- uint8_t v = in_.read();
- if (v == 0) {
- return false;
- } else if (v == 1) {
- return true;
- }
- throw Exception("Invalid value for bool");
-}
-
-int32_t BinaryDecoder::decodeInt()
-{
- int64_t val = doDecodeLong();
- if (val < INT32_MIN || val > INT32_MAX) {
- throw Exception(
- boost::format("Value out of range for Avro int: %1%") % val);
- }
- return static_cast<int32_t>(val);
-}
-
-int64_t BinaryDecoder::decodeLong()
-{
- return doDecodeLong();
-}
-
-float BinaryDecoder::decodeFloat()
-{
- float result;
- in_.readBytes(reinterpret_cast<uint8_t *>(&result), sizeof(float));
- return result;
-}
-
-double BinaryDecoder::decodeDouble()
-{
- double result;
- in_.readBytes(reinterpret_cast<uint8_t *>(&result), sizeof(double));
- return result;
-}
-
-size_t BinaryDecoder::doDecodeLength()
-{
- ssize_t len = decodeInt();
- if (len < 0) {
- throw Exception(
- boost::format("Cannot have negative length: %1%") % len);
- }
- return len;
-}
-
-void BinaryDecoder::drain()
-{
- in_.drain(false);
-}
-
-void BinaryDecoder::decodeString(std::string& value)
-{
- size_t len = doDecodeLength();
- value.resize(len);
- if (len > 0) {
- in_.readBytes(const_cast<uint8_t*>(
- reinterpret_cast<const uint8_t*>(value.c_str())), len);
- }
-}
-
-void BinaryDecoder::skipString()
-{
- size_t len = doDecodeLength();
- in_.skipBytes(len);
-}
-
-void BinaryDecoder::decodeBytes(std::vector<uint8_t>& value)
-{
- size_t len = doDecodeLength();
- value.resize(len);
- if (len > 0) {
- in_.readBytes(value.data(), len);
- }
-}
-
-void BinaryDecoder::skipBytes()
-{
- size_t len = doDecodeLength();
- in_.skipBytes(len);
-}
-
-void BinaryDecoder::decodeFixed(size_t n, std::vector<uint8_t>& value)
-{
- value.resize(n);
- if (n > 0) {
- in_.readBytes(value.data(), n);
- }
-}
-
-void BinaryDecoder::skipFixed(size_t n)
-{
- in_.skipBytes(n);
-}
-
-size_t BinaryDecoder::decodeEnum()
-{
- return static_cast<size_t>(doDecodeLong());
-}
-
-size_t BinaryDecoder::arrayStart()
-{
- return doDecodeItemCount();
-}
-
-size_t BinaryDecoder::doDecodeItemCount()
-{
- int64_t result = doDecodeLong();
- if (result < 0) {
- doDecodeLong();
- return static_cast<size_t>(-result);
- }
- return static_cast<size_t>(result);
-}
-
-size_t BinaryDecoder::arrayNext()
-{
- return static_cast<size_t>(doDecodeLong());
-}
-
-size_t BinaryDecoder::skipArray()
-{
- for (;;) {
- int64_t r = doDecodeLong();
- if (r < 0) {
- size_t n = static_cast<size_t>(doDecodeLong());
- in_.skipBytes(n);
- } else {
- return static_cast<size_t>(r);
- }
- }
-}
-
-size_t BinaryDecoder::mapStart()
-{
- return doDecodeItemCount();
-}
-
-size_t BinaryDecoder::mapNext()
-{
- return doDecodeItemCount();
-}
-
-size_t BinaryDecoder::skipMap()
-{
- return skipArray();
-}
-
-size_t BinaryDecoder::decodeUnionIndex()
-{
- return static_cast<size_t>(doDecodeLong());
-}
-
-int64_t BinaryDecoder::doDecodeLong() {
- uint64_t encoded = 0;
- int shift = 0;
- uint8_t u;
- do {
- if (shift >= 64) {
- throw Exception("Invalid Avro varint");
- }
- u = in_.read();
- encoded |= static_cast<uint64_t>(u & 0x7f) << shift;
- shift += 7;
- } while (u & 0x80);
-
- return decodeZigzag64(encoded);
-}
-
-} // namespace avro
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define __STDC_LIMIT_MACROS
+
+#include <memory>
+#include "Decoder.hh"
+#include "Zigzag.hh"
+#include "Exception.hh"
+
+namespace avro {
+
+using std::make_shared;
+
+class BinaryDecoder : public Decoder {
+ StreamReader in_;
+ const uint8_t* next_;
+ const uint8_t* end_;
+
+ void init(InputStream& ib);
+ void decodeNull();
+ bool decodeBool();
+ int32_t decodeInt();
+ int64_t decodeLong();
+ float decodeFloat();
+ double decodeDouble();
+ void decodeString(std::string& value);
+ void skipString();
+ void decodeBytes(std::vector<uint8_t>& value);
+ void skipBytes();
+ void decodeFixed(size_t n, std::vector<uint8_t>& value);
+ void skipFixed(size_t n);
+ size_t decodeEnum();
+ size_t arrayStart();
+ size_t arrayNext();
+ size_t skipArray();
+ size_t mapStart();
+ size_t mapNext();
+ size_t skipMap();
+ size_t decodeUnionIndex();
+
+ int64_t doDecodeLong();
+ size_t doDecodeItemCount();
+ size_t doDecodeLength();
+ void drain();
+ void more();
+};
+
+DecoderPtr binaryDecoder()
+{
+ return make_shared<BinaryDecoder>();
+}
+
+void BinaryDecoder::init(InputStream& is)
+{
+ in_.reset(is);
+}
+
+void BinaryDecoder::decodeNull()
+{
+}
+
+bool BinaryDecoder::decodeBool()
+{
+ uint8_t v = in_.read();
+ if (v == 0) {
+ return false;
+ } else if (v == 1) {
+ return true;
+ }
+ throw Exception("Invalid value for bool");
+}
+
+int32_t BinaryDecoder::decodeInt()
+{
+ int64_t val = doDecodeLong();
+ if (val < INT32_MIN || val > INT32_MAX) {
+ throw Exception(
+ boost::format("Value out of range for Avro int: %1%") % val);
+ }
+ return static_cast<int32_t>(val);
+}
+
+int64_t BinaryDecoder::decodeLong()
+{
+ return doDecodeLong();
+}
+
+float BinaryDecoder::decodeFloat()
+{
+ float result;
+ in_.readBytes(reinterpret_cast<uint8_t *>(&result), sizeof(float));
+ return result;
+}
+
+double BinaryDecoder::decodeDouble()
+{
+ double result;
+ in_.readBytes(reinterpret_cast<uint8_t *>(&result), sizeof(double));
+ return result;
+}
+
+size_t BinaryDecoder::doDecodeLength()
+{
+ ssize_t len = decodeInt();
+ if (len < 0) {
+ throw Exception(
+ boost::format("Cannot have negative length: %1%") % len);
+ }
+ return len;
+}
+
+void BinaryDecoder::drain()
+{
+ in_.drain(false);
+}
+
+void BinaryDecoder::decodeString(std::string& value)
+{
+ size_t len = doDecodeLength();
+ value.resize(len);
+ if (len > 0) {
+ in_.readBytes(const_cast<uint8_t*>(
+ reinterpret_cast<const uint8_t*>(value.c_str())), len);
+ }
+}
+
+void BinaryDecoder::skipString()
+{
+ size_t len = doDecodeLength();
+ in_.skipBytes(len);
+}
+
+void BinaryDecoder::decodeBytes(std::vector<uint8_t>& value)
+{
+ size_t len = doDecodeLength();
+ value.resize(len);
+ if (len > 0) {
+ in_.readBytes(value.data(), len);
+ }
+}
+
+void BinaryDecoder::skipBytes()
+{
+ size_t len = doDecodeLength();
+ in_.skipBytes(len);
+}
+
+void BinaryDecoder::decodeFixed(size_t n, std::vector<uint8_t>& value)
+{
+ value.resize(n);
+ if (n > 0) {
+ in_.readBytes(value.data(), n);
+ }
+}
+
+void BinaryDecoder::skipFixed(size_t n)
+{
+ in_.skipBytes(n);
+}
+
+size_t BinaryDecoder::decodeEnum()
+{
+ return static_cast<size_t>(doDecodeLong());
+}
+
+size_t BinaryDecoder::arrayStart()
+{
+ return doDecodeItemCount();
+}
+
+size_t BinaryDecoder::doDecodeItemCount()
+{
+ int64_t result = doDecodeLong();
+ if (result < 0) {
+ doDecodeLong();
+ return static_cast<size_t>(-result);
+ }
+ return static_cast<size_t>(result);
+}
+
+size_t BinaryDecoder::arrayNext()
+{
+ return static_cast<size_t>(doDecodeLong());
+}
+
+size_t BinaryDecoder::skipArray()
+{
+ for (;;) {
+ int64_t r = doDecodeLong();
+ if (r < 0) {
+ size_t n = static_cast<size_t>(doDecodeLong());
+ in_.skipBytes(n);
+ } else {
+ return static_cast<size_t>(r);
+ }
+ }
+}
+
+size_t BinaryDecoder::mapStart()
+{
+ return doDecodeItemCount();
+}
+
+size_t BinaryDecoder::mapNext()
+{
+ return doDecodeItemCount();
+}
+
+size_t BinaryDecoder::skipMap()
+{
+ return skipArray();
+}
+
+size_t BinaryDecoder::decodeUnionIndex()
+{
+ return static_cast<size_t>(doDecodeLong());
+}
+
+int64_t BinaryDecoder::doDecodeLong() {
+ uint64_t encoded = 0;
+ int shift = 0;
+ uint8_t u;
+ do {
+ if (shift >= 64) {
+ throw Exception("Invalid Avro varint");
+ }
+ u = in_.read();
+ encoded |= static_cast<uint64_t>(u & 0x7f) << shift;
+ shift += 7;
+ } while (u & 0x80);
+
+ return decodeZigzag64(encoded);
+}
+
+} // namespace avro
+
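Every integer this decoder reads, including string lengths and array block counts, passes through doDecodeLong(): a little-endian base-128 varint whose payload is zigzag-encoded. decodeZigzag64 itself lives in Zigzag.hh; the standalone sketch below (not part of the diff) writes it out using its standard definition, (n >> 1) ^ -(n & 1), and otherwise renders the same loop.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

static int64_t decodeZigzag64(uint64_t n) {
    // 0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, ...
    return static_cast<int64_t>((n >> 1) ^ -(n & 1));
}

// Reads one varint from `in` starting at `pos`, advancing `pos`.
static int64_t decodeLong(const std::vector<uint8_t>& in, std::size_t& pos) {
    uint64_t encoded = 0;
    int shift = 0;
    uint8_t u;
    do {
        if (shift >= 64) throw std::runtime_error("Invalid Avro varint");
        if (pos >= in.size()) throw std::runtime_error("Unexpected end of input");
        u = in[pos++];
        encoded |= static_cast<uint64_t>(u & 0x7f) << shift;  // low 7 bits
        shift += 7;
    } while (u & 0x80);  // high bit set means more bytes follow
    return decodeZigzag64(encoded);
}

int main() {
    // zigzag(-2) == 3, one byte on the wire; zigzag(75) == 150, two bytes
    std::vector<uint8_t> wire = {0x03, 0x96, 0x01};
    std::size_t pos = 0;
    long long a = decodeLong(wire, pos);
    long long b = decodeLong(wire, pos);
    std::printf("%lld %lld\n", a, b);  // prints: -2 75
}

This also explains doDecodeItemCount() above: a negative block count in arrayStart()/mapStart() is followed by a byte size on the wire, which is why a second doDecodeLong() is issued before the count is negated.
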
diff --git a/contrib/libs/apache/avro/impl/BinaryEncoder.cc b/contrib/libs/apache/avro/impl/BinaryEncoder.cc
index 5ceb872f8cd..1fa83fe8ff8 100644
--- a/contrib/libs/apache/avro/impl/BinaryEncoder.cc
+++ b/contrib/libs/apache/avro/impl/BinaryEncoder.cc
@@ -1,168 +1,168 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Encoder.hh"
-#include "Zigzag.hh"
-#include <array>
-
-namespace avro {
-
-using std::make_shared;
-
-class BinaryEncoder : public Encoder {
- StreamWriter out_;
-
- void init(OutputStream& os);
- void flush();
- int64_t byteCount() const;
- void encodeNull();
- void encodeBool(bool b);
- void encodeInt(int32_t i);
- void encodeLong(int64_t l);
- void encodeFloat(float f);
- void encodeDouble(double d);
- void encodeString(const std::string& s);
- void encodeBytes(const uint8_t *bytes, size_t len);
- void encodeFixed(const uint8_t *bytes, size_t len);
- void encodeEnum(size_t e);
- void arrayStart();
- void arrayEnd();
- void mapStart();
- void mapEnd();
- void setItemCount(size_t count);
- void startItem();
- void encodeUnionIndex(size_t e);
-
- void doEncodeLong(int64_t l);
-};
-
-EncoderPtr binaryEncoder()
-{
- return make_shared<BinaryEncoder>();
-}
-
-void BinaryEncoder::init(OutputStream& os)
-{
- out_.reset(os);
-}
-
-void BinaryEncoder::flush()
-{
- out_.flush();
-}
-
-void BinaryEncoder::encodeNull()
-{
-}
-
-void BinaryEncoder::encodeBool(bool b)
-{
- out_.write(b ? 1 : 0);
-}
-
-void BinaryEncoder::encodeInt(int32_t i)
-{
- doEncodeLong(i);
-}
-
-void BinaryEncoder::encodeLong(int64_t l)
-{
- doEncodeLong(l);
-}
-
-void BinaryEncoder::encodeFloat(float f)
-{
- const uint8_t* p = reinterpret_cast<const uint8_t*>(&f);
- out_.writeBytes(p, sizeof(float));
-}
-
-void BinaryEncoder::encodeDouble(double d)
-{
- const uint8_t* p = reinterpret_cast<const uint8_t*>(&d);
- out_.writeBytes(p, sizeof(double));
-}
-
-void BinaryEncoder::encodeString(const std::string& s)
-{
- doEncodeLong(s.size());
- out_.writeBytes(reinterpret_cast<const uint8_t*>(s.c_str()), s.size());
-}
-
-void BinaryEncoder::encodeBytes(const uint8_t *bytes, size_t len)
-{
- doEncodeLong(len);
- out_.writeBytes(bytes, len);
-}
-
-void BinaryEncoder::encodeFixed(const uint8_t *bytes, size_t len)
-{
- out_.writeBytes(bytes, len);
-}
-
-void BinaryEncoder::encodeEnum(size_t e)
-{
- doEncodeLong(e);
-}
-
-void BinaryEncoder::arrayStart()
-{
-}
-
-void BinaryEncoder::arrayEnd()
-{
- doEncodeLong(0);
-}
-
-void BinaryEncoder::mapStart()
-{
-}
-
-void BinaryEncoder::mapEnd()
-{
- doEncodeLong(0);
-}
-
-void BinaryEncoder::setItemCount(size_t count)
-{
- if (count == 0) {
- throw Exception("Count cannot be zero");
- }
- doEncodeLong(count);
-}
-
-void BinaryEncoder::startItem()
-{
-}
-
-void BinaryEncoder::encodeUnionIndex(size_t e)
-{
- doEncodeLong(e);
-}
-
-int64_t BinaryEncoder::byteCount() const {
- return out_.byteCount();
-}
-
-
-void BinaryEncoder::doEncodeLong(int64_t l)
-{
- std::array<uint8_t, 10> bytes;
- size_t size = encodeInt64(l, bytes);
- out_.writeBytes(bytes.data(), size);
-}
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Encoder.hh"
+#include "Zigzag.hh"
+#include <array>
+
+namespace avro {
+
+using std::make_shared;
+
+class BinaryEncoder : public Encoder {
+ StreamWriter out_;
+
+ void init(OutputStream& os);
+ void flush();
+ int64_t byteCount() const;
+ void encodeNull();
+ void encodeBool(bool b);
+ void encodeInt(int32_t i);
+ void encodeLong(int64_t l);
+ void encodeFloat(float f);
+ void encodeDouble(double d);
+ void encodeString(const std::string& s);
+ void encodeBytes(const uint8_t *bytes, size_t len);
+ void encodeFixed(const uint8_t *bytes, size_t len);
+ void encodeEnum(size_t e);
+ void arrayStart();
+ void arrayEnd();
+ void mapStart();
+ void mapEnd();
+ void setItemCount(size_t count);
+ void startItem();
+ void encodeUnionIndex(size_t e);
+
+ void doEncodeLong(int64_t l);
+};
+
+EncoderPtr binaryEncoder()
+{
+ return make_shared<BinaryEncoder>();
+}
+
+void BinaryEncoder::init(OutputStream& os)
+{
+ out_.reset(os);
+}
+
+void BinaryEncoder::flush()
+{
+ out_.flush();
+}
+
+void BinaryEncoder::encodeNull()
+{
+}
+
+void BinaryEncoder::encodeBool(bool b)
+{
+ out_.write(b ? 1 : 0);
+}
+
+void BinaryEncoder::encodeInt(int32_t i)
+{
+ doEncodeLong(i);
+}
+
+void BinaryEncoder::encodeLong(int64_t l)
+{
+ doEncodeLong(l);
+}
+
+void BinaryEncoder::encodeFloat(float f)
+{
+ const uint8_t* p = reinterpret_cast<const uint8_t*>(&f);
+ out_.writeBytes(p, sizeof(float));
+}
+
+void BinaryEncoder::encodeDouble(double d)
+{
+ const uint8_t* p = reinterpret_cast<const uint8_t*>(&d);
+ out_.writeBytes(p, sizeof(double));
+}
+
+void BinaryEncoder::encodeString(const std::string& s)
+{
+ doEncodeLong(s.size());
+ out_.writeBytes(reinterpret_cast<const uint8_t*>(s.c_str()), s.size());
+}
+
+void BinaryEncoder::encodeBytes(const uint8_t *bytes, size_t len)
+{
+ doEncodeLong(len);
+ out_.writeBytes(bytes, len);
+}
+
+void BinaryEncoder::encodeFixed(const uint8_t *bytes, size_t len)
+{
+ out_.writeBytes(bytes, len);
+}
+
+void BinaryEncoder::encodeEnum(size_t e)
+{
+ doEncodeLong(e);
+}
+
+void BinaryEncoder::arrayStart()
+{
+}
+
+void BinaryEncoder::arrayEnd()
+{
+ doEncodeLong(0);
+}
+
+void BinaryEncoder::mapStart()
+{
+}
+
+void BinaryEncoder::mapEnd()
+{
+ doEncodeLong(0);
+}
+
+void BinaryEncoder::setItemCount(size_t count)
+{
+ if (count == 0) {
+ throw Exception("Count cannot be zero");
+ }
+ doEncodeLong(count);
+}
+
+void BinaryEncoder::startItem()
+{
+}
+
+void BinaryEncoder::encodeUnionIndex(size_t e)
+{
+ doEncodeLong(e);
+}
+
+int64_t BinaryEncoder::byteCount() const {
+ return out_.byteCount();
+}
+
+
+void BinaryEncoder::doEncodeLong(int64_t l)
+{
+ std::array<uint8_t, 10> bytes;
+ size_t size = encodeInt64(l, bytes);
+ out_.writeBytes(bytes.data(), size);
+}
+} // namespace avro
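doEncodeLong() is the mirror image: zigzag-map the signed value, then emit base-128 varint bytes, never more than ten of them for 64 bits (hence the std::array<uint8_t, 10>). encodeInt64 comes from Zigzag.hh; the following is a standalone sketch of what it does, not the library routine itself.

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

static uint64_t encodeZigzag64(int64_t n) {
    // Maps 0, -1, 1, -2, ... to 0, 1, 2, 3, ... so small magnitudes stay short.
    return (static_cast<uint64_t>(n) << 1) ^ static_cast<uint64_t>(n >> 63);
}

// Emits `v` as a zigzag varint; returns the number of bytes written.
static std::size_t encodeInt64Sketch(int64_t v, std::array<uint8_t, 10>& out) {
    uint64_t n = encodeZigzag64(v);
    std::size_t size = 0;
    while (n >= 0x80) {
        out[size++] = static_cast<uint8_t>(n) | 0x80;  // low 7 bits + more flag
        n >>= 7;
    }
    out[size++] = static_cast<uint8_t>(n);  // last byte, high bit clear
    return size;
}

int main() {
    std::array<uint8_t, 10> bytes;
    std::size_t size = encodeInt64Sketch(-2, bytes);
    for (std::size_t i = 0; i < size; ++i)
        std::printf("%02x ", static_cast<unsigned>(bytes[i]));
    std::printf("\n");  // prints: 03   (zigzag(-2) == 3)
}

This pairing also shows why setItemCount() refuses a zero count: a zero long is exactly the block terminator that arrayEnd() and mapEnd() write.
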
diff --git a/contrib/libs/apache/avro/impl/Compiler.cc b/contrib/libs/apache/avro/impl/Compiler.cc
index 6453db8f17f..8b1de49a1d3 100644
--- a/contrib/libs/apache/avro/impl/Compiler.cc
+++ b/contrib/libs/apache/avro/impl/Compiler.cc
@@ -1,591 +1,591 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <boost/algorithm/string/replace.hpp>
-#include <sstream>
-
-#include "Compiler.hh"
-#include "Types.hh"
-#include "Schema.hh"
-#include "ValidSchema.hh"
-#include "Stream.hh"
-
-#include "json/JsonDom.hh"
-
-using std::string;
-using std::map;
-using std::vector;
-using std::pair;
-using std::make_pair;
-
-namespace avro {
-using json::Entity;
-using json::Object;
-using json::Array;
-using json::EntityType;
-
-typedef map<Name, NodePtr> SymbolTable;
-
-
-// #define DEBUG_VERBOSE
-
-static NodePtr makePrimitive(const string& t)
-{
- if (t == "null") {
- return NodePtr(new NodePrimitive(AVRO_NULL));
- } else if (t == "boolean") {
- return NodePtr(new NodePrimitive(AVRO_BOOL));
- } else if (t == "int") {
- return NodePtr(new NodePrimitive(AVRO_INT));
- } else if (t == "long") {
- return NodePtr(new NodePrimitive(AVRO_LONG));
- } else if (t == "float") {
- return NodePtr(new NodePrimitive(AVRO_FLOAT));
- } else if (t == "double") {
- return NodePtr(new NodePrimitive(AVRO_DOUBLE));
- } else if (t == "string") {
- return NodePtr(new NodePrimitive(AVRO_STRING));
- } else if (t == "bytes") {
- return NodePtr(new NodePrimitive(AVRO_BYTES));
- } else {
- return NodePtr();
- }
-}
-
-static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string &ns);
-
-template <typename T>
-concepts::SingleAttribute<T> asSingleAttribute(const T& t)
-{
- concepts::SingleAttribute<T> n;
- n.add(t);
- return n;
-}
-
-static bool isFullName(const string &s)
-{
- return s.find('.') != string::npos;
-}
-
-static Name getName(const string &name, const string &ns)
-{
- return (isFullName(name)) ? Name(name) : Name(name, ns);
-}
-
-static NodePtr makeNode(const string &t, SymbolTable &st, const string &ns)
-{
- NodePtr result = makePrimitive(t);
- if (result) {
- return result;
- }
- Name n = getName(t, ns);
-
- SymbolTable::const_iterator it = st.find(n);
- if (it != st.end()) {
- return NodePtr(new NodeSymbolic(asSingleAttribute(n), it->second));
- }
- throw Exception(boost::format("Unknown type: %1%") % n.fullname());
-}
-
-/** Returns true if the field is present in the container. */
-// May be absent for optional (non-mandatory) fields.
-bool containsField(const Object& m, const string& fieldName) {
- Object::const_iterator it = m.find(fieldName);
- return (it != m.end());
-}
-
-const json::Object::const_iterator findField(const Entity& e,
- const Object& m, const string& fieldName)
-{
- Object::const_iterator it = m.find(fieldName);
- if (it == m.end()) {
- throw Exception(boost::format("Missing Json field \"%1%\": %2%") %
- fieldName % e.toString());
- } else {
- return it;
- }
-}
-
-template <typename T> void ensureType(const Entity &e, const string &name)
-{
- if (e.type() != json::type_traits<T>::type()) {
- throw Exception(boost::format("Json field \"%1%\" is not a %2%: %3%") %
- name % json::type_traits<T>::name() % e.toString());
- }
-}
-
-string getStringField(const Entity &e, const Object &m,
- const string &fieldName)
-{
- Object::const_iterator it = findField(e, m, fieldName);
- ensureType<string>(it->second, fieldName);
- return it->second.stringValue();
-}
-
-const Array& getArrayField(const Entity& e, const Object& m,
- const string& fieldName)
-{
- Object::const_iterator it = findField(e, m, fieldName);
- ensureType<Array >(it->second, fieldName);
- return it->second.arrayValue();
-}
-
-const int64_t getLongField(const Entity& e, const Object& m,
- const string& fieldName)
-{
- Object::const_iterator it = findField(e, m, fieldName);
- ensureType<int64_t>(it->second, fieldName);
- return it->second.longValue();
-}
-
-// Unescape double quotes (") for de-serialization. This method complements the
-// method NodeImpl::escape() which is used for serialization.
-static void unescape(string& s) {
- boost::replace_all(s, "\\\"", "\"");
-}
-
-const string getDocField(const Entity& e, const Object& m)
-{
- string doc = getStringField(e, m, "doc");
- unescape(doc);
- return doc;
-}
-
-struct Field {
- const string name;
- const NodePtr schema;
- const GenericDatum defaultValue;
- Field(const string& n, const NodePtr& v, GenericDatum dv) :
- name(n), schema(v), defaultValue(dv) { }
-};
-
-static void assertType(const Entity& e, EntityType et)
-{
- if (e.type() != et) {
- throw Exception(boost::format("Unexpected type for default value: "
- "Expected %1%, but found %2% in line %3%") %
- json::typeToString(et) % json::typeToString(e.type()) %
- e.line());
- }
-}
-
-static vector<uint8_t> toBin(const string& s)
-{
- vector<uint8_t> result(s.size());
- if (s.size() > 0) {
- std::copy(s.c_str(), s.c_str() + s.size(), result.data());
- }
- return result;
-}
-
-static GenericDatum makeGenericDatum(NodePtr n,
- const Entity& e, const SymbolTable& st)
-{
- Type t = n->type();
- EntityType dt = e.type();
-
- if (t == AVRO_SYMBOLIC) {
- n = st.find(n->name())->second;
- t = n->type();
- }
- switch (t) {
- case AVRO_STRING:
- assertType(e, json::etString);
- return GenericDatum(e.stringValue());
- case AVRO_BYTES:
- assertType(e, json::etString);
- return GenericDatum(toBin(e.bytesValue()));
- case AVRO_INT:
- assertType(e, json::etLong);
- return GenericDatum(static_cast<int32_t>(e.longValue()));
- case AVRO_LONG:
- assertType(e, json::etLong);
- return GenericDatum(e.longValue());
- case AVRO_FLOAT:
- if (dt == json::etLong) {
- return GenericDatum(static_cast<float>(e.longValue()));
- }
- assertType(e, json::etDouble);
- return GenericDatum(static_cast<float>(e.doubleValue()));
- case AVRO_DOUBLE:
- if (dt == json::etLong) {
- return GenericDatum(static_cast<double>(e.longValue()));
- }
- assertType(e, json::etDouble);
- return GenericDatum(e.doubleValue());
- case AVRO_BOOL:
- assertType(e, json::etBool);
- return GenericDatum(e.boolValue());
- case AVRO_NULL:
- assertType(e, json::etNull);
- return GenericDatum();
- case AVRO_RECORD:
- {
- assertType(e, json::etObject);
- GenericRecord result(n);
- const map<string, Entity>& v = e.objectValue();
- for (size_t i = 0; i < n->leaves(); ++i) {
- map<string, Entity>::const_iterator it = v.find(n->nameAt(i));
- if (it == v.end()) {
- throw Exception(boost::format(
- "No value found in default for %1%") % n->nameAt(i));
- }
- result.setFieldAt(i,
- makeGenericDatum(n->leafAt(i), it->second, st));
- }
- return GenericDatum(n, result);
- }
- case AVRO_ENUM:
- assertType(e, json::etString);
- return GenericDatum(n, GenericEnum(n, e.stringValue()));
- case AVRO_ARRAY:
- {
- assertType(e, json::etArray);
- GenericArray result(n);
- const vector<Entity>& elements = e.arrayValue();
- for (vector<Entity>::const_iterator it = elements.begin();
- it != elements.end(); ++it) {
- result.value().push_back(makeGenericDatum(n->leafAt(0), *it, st));
- }
- return GenericDatum(n, result);
- }
- case AVRO_MAP:
- {
- assertType(e, json::etObject);
- GenericMap result(n);
- const map<string, Entity>& v = e.objectValue();
- for (map<string, Entity>::const_iterator it = v.begin();
- it != v.end(); ++it) {
- result.value().push_back(make_pair(it->first,
- makeGenericDatum(n->leafAt(1), it->second, st)));
- }
- return GenericDatum(n, result);
- }
- case AVRO_UNION:
- {
- GenericUnion result(n);
- result.selectBranch(0);
- result.datum() = makeGenericDatum(n->leafAt(0), e, st);
- return GenericDatum(n, result);
- }
- case AVRO_FIXED:
- assertType(e, json::etString);
- return GenericDatum(n, GenericFixed(n, toBin(e.bytesValue())));
- default:
- throw Exception(boost::format("Unknown type: %1%") % t);
- }
- return GenericDatum();
-}
-
-
-static Field makeField(const Entity& e, SymbolTable& st, const string& ns)
-{
- const Object& m = e.objectValue();
- const string& n = getStringField(e, m, "name");
- Object::const_iterator it = findField(e, m, "type");
- map<string, Entity>::const_iterator it2 = m.find("default");
- NodePtr node = makeNode(it->second, st, ns);
- if (containsField(m, "doc")) {
- node->setDoc(getDocField(e, m));
- }
- GenericDatum d = (it2 == m.end()) ? GenericDatum() :
- makeGenericDatum(node, it2->second, st);
- return Field(n, node, d);
-}
-
-// Extended makeRecordNode (with doc).
-static NodePtr makeRecordNode(const Entity& e, const Name& name,
- const string* doc, const Object& m,
- SymbolTable& st, const string& ns) {
- const Array& v = getArrayField(e, m, "fields");
- concepts::MultiAttribute<string> fieldNames;
- concepts::MultiAttribute<NodePtr> fieldValues;
- vector<GenericDatum> defaultValues;
-
- for (Array::const_iterator it = v.begin(); it != v.end(); ++it) {
- Field f = makeField(*it, st, ns);
- fieldNames.add(f.name);
- fieldValues.add(f.schema);
- defaultValues.push_back(f.defaultValue);
- }
- NodeRecord* node;
- if (doc == NULL) {
- node = new NodeRecord(asSingleAttribute(name), fieldValues, fieldNames,
- defaultValues);
- } else {
- node = new NodeRecord(asSingleAttribute(name), asSingleAttribute(*doc),
- fieldValues, fieldNames, defaultValues);
- }
- return NodePtr(node);
-}
-
-static LogicalType makeLogicalType(const Entity& e, const Object& m) {
- if (!containsField(m, "logicalType")) {
- return LogicalType(LogicalType::NONE);
- }
-
- const std::string& typeField = getStringField(e, m, "logicalType");
-
- if (typeField == "decimal") {
- LogicalType decimalType(LogicalType::DECIMAL);
- try {
- decimalType.setPrecision(getLongField(e, m, "precision"));
- if (containsField(m, "scale")) {
- decimalType.setScale(getLongField(e, m, "scale"));
- }
- } catch (Exception& ex) {
- // If any part of the logical type is malformed, per the standard we
- // must ignore the whole attribute.
- return LogicalType(LogicalType::NONE);
- }
- return decimalType;
- }
-
- LogicalType::Type t = LogicalType::NONE;
- if (typeField == "date")
- t = LogicalType::DATE;
- else if (typeField == "time-millis")
- t = LogicalType::TIME_MILLIS;
- else if (typeField == "time-micros")
- t = LogicalType::TIME_MICROS;
- else if (typeField == "timestamp-millis")
- t = LogicalType::TIMESTAMP_MILLIS;
- else if (typeField == "timestamp-micros")
- t = LogicalType::TIMESTAMP_MICROS;
- else if (typeField == "duration")
- t = LogicalType::DURATION;
- else if (typeField == "uuid")
- t = LogicalType::UUID;
- return LogicalType(t);
-}
-
-static NodePtr makeEnumNode(const Entity& e,
- const Name& name, const Object& m)
-{
- const Array& v = getArrayField(e, m, "symbols");
- concepts::MultiAttribute<string> symbols;
- for (Array::const_iterator it = v.begin(); it != v.end(); ++it) {
- if (it->type() != json::etString) {
- throw Exception(boost::format("Enum symbol not a string: %1%") %
- it->toString());
- }
- symbols.add(it->stringValue());
- }
- NodePtr node = NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
- if (containsField(m, "doc")) {
- node->setDoc(getDocField(e, m));
- }
- return node;
-}
-
-static NodePtr makeFixedNode(const Entity& e,
- const Name& name, const Object& m)
-{
- int v = static_cast<int>(getLongField(e, m, "size"));
- if (v <= 0) {
- throw Exception(boost::format("Size for fixed is not positive: %1%") %
- e.toString());
- }
- NodePtr node =
- NodePtr(new NodeFixed(asSingleAttribute(name), asSingleAttribute(v)));
- if (containsField(m, "doc")) {
- node->setDoc(getDocField(e, m));
- }
- return node;
-}
-
-static NodePtr makeArrayNode(const Entity& e, const Object& m,
- SymbolTable& st, const string& ns)
-{
- Object::const_iterator it = findField(e, m, "items");
- NodePtr node = NodePtr(new NodeArray(
- asSingleAttribute(makeNode(it->second, st, ns))));
- if (containsField(m, "doc")) {
- node->setDoc(getDocField(e, m));
- }
- return node;
-}
-
-static NodePtr makeMapNode(const Entity& e, const Object& m,
- SymbolTable& st, const string& ns)
-{
- Object::const_iterator it = findField(e, m, "values");
-
- NodePtr node = NodePtr(new NodeMap(
- asSingleAttribute(makeNode(it->second, st, ns))));
- if (containsField(m, "doc")) {
- node->setDoc(getDocField(e, m));
- }
- return node;
-}
-
-static Name getName(const Entity& e, const Object& m, const string& ns)
-{
- const string& name = getStringField(e, m, "name");
-
- if (isFullName(name)) {
- return Name(name);
- } else {
- Object::const_iterator it = m.find("namespace");
- if (it != m.end()) {
- if (it->second.type() != json::type_traits<string>::type()) {
- throw Exception(boost::format(
- "Json field \"%1%\" is not a %2%: %3%") %
- "namespace" % json::type_traits<string>::name() %
- it->second.toString());
- }
- Name result = Name(name, it->second.stringValue());
- return result;
- }
- return Name(name, ns);
- }
-}
-
-static NodePtr makeNode(const Entity& e, const Object& m,
- SymbolTable& st, const string& ns)
-{
- const string& type = getStringField(e, m, "type");
- NodePtr result;
- if (type == "record" || type == "error" ||
- type == "enum" || type == "fixed") {
- Name nm = getName(e, m, ns);
- if (type == "record" || type == "error") {
- result = NodePtr(new NodeRecord());
- st[nm] = result;
-            // Get the record's doc, if present
- if (containsField(m, "doc")) {
- string doc = getDocField(e, m);
-
- NodePtr r = makeRecordNode(e, nm, &doc, m, st, nm.ns());
- (std::dynamic_pointer_cast<NodeRecord>(r))->swap(
- *std::dynamic_pointer_cast<NodeRecord>(result));
- } else { // No doc
- NodePtr r =
- makeRecordNode(e, nm, NULL, m, st, nm.ns());
- (std::dynamic_pointer_cast<NodeRecord>(r))
- ->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
- }
- } else {
- result = (type == "enum") ? makeEnumNode(e, nm, m) :
- makeFixedNode(e, nm, m);
- st[nm] = result;
- }
- } else if (type == "array") {
- result = makeArrayNode(e, m, st, ns);
- } else if (type == "map") {
- result = makeMapNode(e, m, st, ns);
- } else {
- result = makePrimitive(type);
- }
-
- if (result) {
- try {
- result->setLogicalType(makeLogicalType(e, m));
- } catch (Exception& ex) {
- // Per the standard we must ignore the logical type attribute if it
- // is malformed.
- }
- return result;
- }
-
- throw Exception(boost::format("Unknown type definition: %1%")
- % e.toString());
-}
-
-static NodePtr makeNode(const Entity& e, const Array& m,
- SymbolTable& st, const string& ns)
-{
- concepts::MultiAttribute<NodePtr> mm;
- for (Array::const_iterator it = m.begin(); it != m.end(); ++it) {
- mm.add(makeNode(*it, st, ns));
- }
- return NodePtr(new NodeUnion(mm));
-}
-
-static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string& ns)
-{
- switch (e.type()) {
- case json::etString:
- return makeNode(e.stringValue(), st, ns);
- case json::etObject:
- return makeNode(e, e.objectValue(), st, ns);
- case json::etArray:
- return makeNode(e, e.arrayValue(), st, ns);
- default:
- throw Exception(boost::format("Invalid Avro type: %1%") % e.toString());
- }
-}
-
-ValidSchema compileJsonSchemaFromStream(InputStream& is)
-{
- json::Entity e = json::loadEntity(is);
- SymbolTable st;
- NodePtr n = makeNode(e, st, "");
- return ValidSchema(n);
-}
-
-AVRO_DECL ValidSchema compileJsonSchemaFromFile(const char* filename)
-{
- std::unique_ptr<InputStream> s = fileInputStream(filename);
- return compileJsonSchemaFromStream(*s);
-}
-
-AVRO_DECL ValidSchema compileJsonSchemaFromMemory(const uint8_t* input, size_t len)
-{
- return compileJsonSchemaFromStream(*memoryInputStream(input, len));
-}
-
-AVRO_DECL ValidSchema compileJsonSchemaFromString(const char* input)
-{
- return compileJsonSchemaFromMemory(reinterpret_cast<const uint8_t*>(input),
- ::strlen(input));
-}
-
-AVRO_DECL ValidSchema compileJsonSchemaFromString(const string& input)
-{
- return compileJsonSchemaFromMemory(
- reinterpret_cast<const uint8_t*>(input.data()), input.size());
-}
-
-static ValidSchema compile(std::istream& is)
-{
- std::unique_ptr<InputStream> in = istreamInputStream(is);
- return compileJsonSchemaFromStream(*in);
-}
-
-void compileJsonSchema(std::istream &is, ValidSchema &schema)
-{
- if (!is.good()) {
- throw Exception("Input stream is not good");
- }
-
- schema = compile(is);
-}
-
-AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, string &error)
-{
- try {
- compileJsonSchema(is, schema);
- return true;
- } catch (const Exception &e) {
- error = e.what();
- return false;
- }
-
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <boost/algorithm/string/replace.hpp>
+#include <sstream>
+
+#include "Compiler.hh"
+#include "Types.hh"
+#include "Schema.hh"
+#include "ValidSchema.hh"
+#include "Stream.hh"
+
+#include "json/JsonDom.hh"
+
+using std::string;
+using std::map;
+using std::vector;
+using std::pair;
+using std::make_pair;
+
+namespace avro {
+using json::Entity;
+using json::Object;
+using json::Array;
+using json::EntityType;
+
+typedef map<Name, NodePtr> SymbolTable;
+
+
+// #define DEBUG_VERBOSE
+
+static NodePtr makePrimitive(const string& t)
+{
+ if (t == "null") {
+ return NodePtr(new NodePrimitive(AVRO_NULL));
+ } else if (t == "boolean") {
+ return NodePtr(new NodePrimitive(AVRO_BOOL));
+ } else if (t == "int") {
+ return NodePtr(new NodePrimitive(AVRO_INT));
+ } else if (t == "long") {
+ return NodePtr(new NodePrimitive(AVRO_LONG));
+ } else if (t == "float") {
+ return NodePtr(new NodePrimitive(AVRO_FLOAT));
+ } else if (t == "double") {
+ return NodePtr(new NodePrimitive(AVRO_DOUBLE));
+ } else if (t == "string") {
+ return NodePtr(new NodePrimitive(AVRO_STRING));
+ } else if (t == "bytes") {
+ return NodePtr(new NodePrimitive(AVRO_BYTES));
+ } else {
+ return NodePtr();
+ }
+}
+
+static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string &ns);
+
+template <typename T>
+concepts::SingleAttribute<T> asSingleAttribute(const T& t)
+{
+ concepts::SingleAttribute<T> n;
+ n.add(t);
+ return n;
+}
+
+static bool isFullName(const string &s)
+{
+ return s.find('.') != string::npos;
+}
+
+static Name getName(const string &name, const string &ns)
+{
+ return (isFullName(name)) ? Name(name) : Name(name, ns);
+}
+
+static NodePtr makeNode(const string &t, SymbolTable &st, const string &ns)
+{
+ NodePtr result = makePrimitive(t);
+ if (result) {
+ return result;
+ }
+ Name n = getName(t, ns);
+
+ SymbolTable::const_iterator it = st.find(n);
+ if (it != st.end()) {
+ return NodePtr(new NodeSymbolic(asSingleAttribute(n), it->second));
+ }
+ throw Exception(boost::format("Unknown type: %1%") % n.fullname());
+}
+
+/** Returns true if the field is present in the container. */
+// May be absent for optional (non-mandatory) fields.
+bool containsField(const Object& m, const string& fieldName) {
+ Object::const_iterator it = m.find(fieldName);
+ return (it != m.end());
+}
+
+const json::Object::const_iterator findField(const Entity& e,
+ const Object& m, const string& fieldName)
+{
+ Object::const_iterator it = m.find(fieldName);
+ if (it == m.end()) {
+ throw Exception(boost::format("Missing Json field \"%1%\": %2%") %
+ fieldName % e.toString());
+ } else {
+ return it;
+ }
+}
+
+template <typename T> void ensureType(const Entity &e, const string &name)
+{
+ if (e.type() != json::type_traits<T>::type()) {
+ throw Exception(boost::format("Json field \"%1%\" is not a %2%: %3%") %
+ name % json::type_traits<T>::name() % e.toString());
+ }
+}
+
+string getStringField(const Entity &e, const Object &m,
+ const string &fieldName)
+{
+ Object::const_iterator it = findField(e, m, fieldName);
+ ensureType<string>(it->second, fieldName);
+ return it->second.stringValue();
+}
+
+const Array& getArrayField(const Entity& e, const Object& m,
+ const string& fieldName)
+{
+ Object::const_iterator it = findField(e, m, fieldName);
+ ensureType<Array >(it->second, fieldName);
+ return it->second.arrayValue();
+}
+
+const int64_t getLongField(const Entity& e, const Object& m,
+ const string& fieldName)
+{
+ Object::const_iterator it = findField(e, m, fieldName);
+ ensureType<int64_t>(it->second, fieldName);
+ return it->second.longValue();
+}
+
+// Unescape double quotes (") for de-serialization. This method complements the
+// method NodeImpl::escape() which is used for serialization.
+static void unescape(string& s) {
+ boost::replace_all(s, "\\\"", "\"");
+}
+
+const string getDocField(const Entity& e, const Object& m)
+{
+ string doc = getStringField(e, m, "doc");
+ unescape(doc);
+ return doc;
+}
+
+struct Field {
+ const string name;
+ const NodePtr schema;
+ const GenericDatum defaultValue;
+ Field(const string& n, const NodePtr& v, GenericDatum dv) :
+ name(n), schema(v), defaultValue(dv) { }
+};
+
+static void assertType(const Entity& e, EntityType et)
+{
+ if (e.type() != et) {
+ throw Exception(boost::format("Unexpected type for default value: "
+ "Expected %1%, but found %2% in line %3%") %
+ json::typeToString(et) % json::typeToString(e.type()) %
+ e.line());
+ }
+}
+
+static vector<uint8_t> toBin(const string& s)
+{
+ vector<uint8_t> result(s.size());
+ if (s.size() > 0) {
+ std::copy(s.c_str(), s.c_str() + s.size(), result.data());
+ }
+ return result;
+}
+
+static GenericDatum makeGenericDatum(NodePtr n,
+ const Entity& e, const SymbolTable& st)
+{
+ Type t = n->type();
+ EntityType dt = e.type();
+
+ if (t == AVRO_SYMBOLIC) {
+ n = st.find(n->name())->second;
+ t = n->type();
+ }
+ switch (t) {
+ case AVRO_STRING:
+ assertType(e, json::etString);
+ return GenericDatum(e.stringValue());
+ case AVRO_BYTES:
+ assertType(e, json::etString);
+ return GenericDatum(toBin(e.bytesValue()));
+ case AVRO_INT:
+ assertType(e, json::etLong);
+ return GenericDatum(static_cast<int32_t>(e.longValue()));
+ case AVRO_LONG:
+ assertType(e, json::etLong);
+ return GenericDatum(e.longValue());
+ case AVRO_FLOAT:
+ if (dt == json::etLong) {
+ return GenericDatum(static_cast<float>(e.longValue()));
+ }
+ assertType(e, json::etDouble);
+ return GenericDatum(static_cast<float>(e.doubleValue()));
+ case AVRO_DOUBLE:
+ if (dt == json::etLong) {
+ return GenericDatum(static_cast<double>(e.longValue()));
+ }
+ assertType(e, json::etDouble);
+ return GenericDatum(e.doubleValue());
+ case AVRO_BOOL:
+ assertType(e, json::etBool);
+ return GenericDatum(e.boolValue());
+ case AVRO_NULL:
+ assertType(e, json::etNull);
+ return GenericDatum();
+ case AVRO_RECORD:
+ {
+ assertType(e, json::etObject);
+ GenericRecord result(n);
+ const map<string, Entity>& v = e.objectValue();
+ for (size_t i = 0; i < n->leaves(); ++i) {
+ map<string, Entity>::const_iterator it = v.find(n->nameAt(i));
+ if (it == v.end()) {
+ throw Exception(boost::format(
+ "No value found in default for %1%") % n->nameAt(i));
+ }
+ result.setFieldAt(i,
+ makeGenericDatum(n->leafAt(i), it->second, st));
+ }
+ return GenericDatum(n, result);
+ }
+ case AVRO_ENUM:
+ assertType(e, json::etString);
+ return GenericDatum(n, GenericEnum(n, e.stringValue()));
+ case AVRO_ARRAY:
+ {
+ assertType(e, json::etArray);
+ GenericArray result(n);
+ const vector<Entity>& elements = e.arrayValue();
+ for (vector<Entity>::const_iterator it = elements.begin();
+ it != elements.end(); ++it) {
+ result.value().push_back(makeGenericDatum(n->leafAt(0), *it, st));
+ }
+ return GenericDatum(n, result);
+ }
+ case AVRO_MAP:
+ {
+ assertType(e, json::etObject);
+ GenericMap result(n);
+ const map<string, Entity>& v = e.objectValue();
+ for (map<string, Entity>::const_iterator it = v.begin();
+ it != v.end(); ++it) {
+ result.value().push_back(make_pair(it->first,
+ makeGenericDatum(n->leafAt(1), it->second, st)));
+ }
+ return GenericDatum(n, result);
+ }
+ case AVRO_UNION:
+ {
+ GenericUnion result(n);
+ result.selectBranch(0);
+ result.datum() = makeGenericDatum(n->leafAt(0), e, st);
+ return GenericDatum(n, result);
+ }
+ case AVRO_FIXED:
+ assertType(e, json::etString);
+ return GenericDatum(n, GenericFixed(n, toBin(e.bytesValue())));
+ default:
+ throw Exception(boost::format("Unknown type: %1%") % t);
+ }
+ return GenericDatum();
+}
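One detail worth calling out in `makeGenericDatum` above: for `AVRO_UNION` the default value is decoded against the first branch only (`selectBranch(0)`), which matches the Avro specification's rule for union defaults. A hedged sketch, assuming the installed `<avro/...>` header layout:

```cpp
#include <avro/Compiler.hh>
#include <avro/ValidSchema.hh>

int main() {
    // The "default" for a union field must match the union's *first*
    // branch -- here "null" -- exactly as selectBranch(0) above implies.
    const char* json =
        "{\"type\": \"record\", \"name\": \"R\", \"fields\": ["
        "  {\"name\": \"note\", \"type\": [\"null\", \"string\"],"
        "   \"default\": null}]}";
    avro::ValidSchema s = avro::compileJsonSchemaFromString(json);
    return 0;
}
```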
+
+
+static Field makeField(const Entity& e, SymbolTable& st, const string& ns)
+{
+ const Object& m = e.objectValue();
+ const string& n = getStringField(e, m, "name");
+ Object::const_iterator it = findField(e, m, "type");
+ map<string, Entity>::const_iterator it2 = m.find("default");
+ NodePtr node = makeNode(it->second, st, ns);
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ GenericDatum d = (it2 == m.end()) ? GenericDatum() :
+ makeGenericDatum(node, it2->second, st);
+ return Field(n, node, d);
+}
+
+// Extended makeRecordNode (with doc).
+static NodePtr makeRecordNode(const Entity& e, const Name& name,
+ const string* doc, const Object& m,
+ SymbolTable& st, const string& ns) {
+ const Array& v = getArrayField(e, m, "fields");
+ concepts::MultiAttribute<string> fieldNames;
+ concepts::MultiAttribute<NodePtr> fieldValues;
+ vector<GenericDatum> defaultValues;
+
+ for (Array::const_iterator it = v.begin(); it != v.end(); ++it) {
+ Field f = makeField(*it, st, ns);
+ fieldNames.add(f.name);
+ fieldValues.add(f.schema);
+ defaultValues.push_back(f.defaultValue);
+ }
+ NodeRecord* node;
+ if (doc == NULL) {
+ node = new NodeRecord(asSingleAttribute(name), fieldValues, fieldNames,
+ defaultValues);
+ } else {
+ node = new NodeRecord(asSingleAttribute(name), asSingleAttribute(*doc),
+ fieldValues, fieldNames, defaultValues);
+ }
+ return NodePtr(node);
+}
+
+static LogicalType makeLogicalType(const Entity& e, const Object& m) {
+ if (!containsField(m, "logicalType")) {
+ return LogicalType(LogicalType::NONE);
+ }
+
+ const std::string& typeField = getStringField(e, m, "logicalType");
+
+ if (typeField == "decimal") {
+ LogicalType decimalType(LogicalType::DECIMAL);
+ try {
+ decimalType.setPrecision(getLongField(e, m, "precision"));
+ if (containsField(m, "scale")) {
+ decimalType.setScale(getLongField(e, m, "scale"));
+ }
+ } catch (Exception& ex) {
+ // If any part of the logical type is malformed, per the standard we
+ // must ignore the whole attribute.
+ return LogicalType(LogicalType::NONE);
+ }
+ return decimalType;
+ }
+
+ LogicalType::Type t = LogicalType::NONE;
+ if (typeField == "date")
+ t = LogicalType::DATE;
+ else if (typeField == "time-millis")
+ t = LogicalType::TIME_MILLIS;
+ else if (typeField == "time-micros")
+ t = LogicalType::TIME_MICROS;
+ else if (typeField == "timestamp-millis")
+ t = LogicalType::TIMESTAMP_MILLIS;
+ else if (typeField == "timestamp-micros")
+ t = LogicalType::TIMESTAMP_MICROS;
+ else if (typeField == "duration")
+ t = LogicalType::DURATION;
+ else if (typeField == "uuid")
+ t = LogicalType::UUID;
+ return LogicalType(t);
+}
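`makeLogicalType` silently falls back to `LogicalType::NONE` when a `decimal` annotation is malformed, as the Avro spec requires. A small sketch of a well-formed decimal schema that would exercise the precision/scale path above (include paths assume the installed `<avro/...>` layout):

```cpp
#include <avro/Compiler.hh>
#include <avro/ValidSchema.hh>

int main() {
    // A bytes schema annotated with a decimal logical type; "precision"
    // is mandatory, "scale" optional, per the parsing logic above.
    const char* json =
        "{\"type\": \"bytes\","
        " \"logicalType\": \"decimal\","
        " \"precision\": 10,"
        " \"scale\": 2}";
    avro::ValidSchema schema = avro::compileJsonSchemaFromString(json);
    return 0;
}
```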
+
+static NodePtr makeEnumNode(const Entity& e,
+ const Name& name, const Object& m)
+{
+ const Array& v = getArrayField(e, m, "symbols");
+ concepts::MultiAttribute<string> symbols;
+ for (Array::const_iterator it = v.begin(); it != v.end(); ++it) {
+ if (it->type() != json::etString) {
+ throw Exception(boost::format("Enum symbol not a string: %1%") %
+ it->toString());
+ }
+ symbols.add(it->stringValue());
+ }
+ NodePtr node = NodePtr(new NodeEnum(asSingleAttribute(name), symbols));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
+}
+
+static NodePtr makeFixedNode(const Entity& e,
+ const Name& name, const Object& m)
+{
+ int v = static_cast<int>(getLongField(e, m, "size"));
+ if (v <= 0) {
+ throw Exception(boost::format("Size for fixed is not positive: %1%") %
+ e.toString());
+ }
+ NodePtr node =
+ NodePtr(new NodeFixed(asSingleAttribute(name), asSingleAttribute(v)));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
+}
+
+static NodePtr makeArrayNode(const Entity& e, const Object& m,
+ SymbolTable& st, const string& ns)
+{
+ Object::const_iterator it = findField(e, m, "items");
+ NodePtr node = NodePtr(new NodeArray(
+ asSingleAttribute(makeNode(it->second, st, ns))));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
+}
+
+static NodePtr makeMapNode(const Entity& e, const Object& m,
+ SymbolTable& st, const string& ns)
+{
+ Object::const_iterator it = findField(e, m, "values");
+
+ NodePtr node = NodePtr(new NodeMap(
+ asSingleAttribute(makeNode(it->second, st, ns))));
+ if (containsField(m, "doc")) {
+ node->setDoc(getDocField(e, m));
+ }
+ return node;
+}
+
+static Name getName(const Entity& e, const Object& m, const string& ns)
+{
+ const string& name = getStringField(e, m, "name");
+
+ if (isFullName(name)) {
+ return Name(name);
+ } else {
+ Object::const_iterator it = m.find("namespace");
+ if (it != m.end()) {
+ if (it->second.type() != json::type_traits<string>::type()) {
+ throw Exception(boost::format(
+ "Json field \"%1%\" is not a %2%: %3%") %
+ "namespace" % json::type_traits<string>::name() %
+ it->second.toString());
+ }
+ Name result = Name(name, it->second.stringValue());
+ return result;
+ }
+ return Name(name, ns);
+ }
+}
+
+static NodePtr makeNode(const Entity& e, const Object& m,
+ SymbolTable& st, const string& ns)
+{
+ const string& type = getStringField(e, m, "type");
+ NodePtr result;
+ if (type == "record" || type == "error" ||
+ type == "enum" || type == "fixed") {
+ Name nm = getName(e, m, ns);
+ if (type == "record" || type == "error") {
+ result = NodePtr(new NodeRecord());
+ st[nm] = result;
+            // Get the record's doc, if present
+ if (containsField(m, "doc")) {
+ string doc = getDocField(e, m);
+
+ NodePtr r = makeRecordNode(e, nm, &doc, m, st, nm.ns());
+ (std::dynamic_pointer_cast<NodeRecord>(r))->swap(
+ *std::dynamic_pointer_cast<NodeRecord>(result));
+ } else { // No doc
+ NodePtr r =
+ makeRecordNode(e, nm, NULL, m, st, nm.ns());
+ (std::dynamic_pointer_cast<NodeRecord>(r))
+ ->swap(*std::dynamic_pointer_cast<NodeRecord>(result));
+ }
+ } else {
+ result = (type == "enum") ? makeEnumNode(e, nm, m) :
+ makeFixedNode(e, nm, m);
+ st[nm] = result;
+ }
+ } else if (type == "array") {
+ result = makeArrayNode(e, m, st, ns);
+ } else if (type == "map") {
+ result = makeMapNode(e, m, st, ns);
+ } else {
+ result = makePrimitive(type);
+ }
+
+ if (result) {
+ try {
+ result->setLogicalType(makeLogicalType(e, m));
+ } catch (Exception& ex) {
+ // Per the standard we must ignore the logical type attribute if it
+ // is malformed.
+ }
+ return result;
+ }
+
+ throw Exception(boost::format("Unknown type definition: %1%")
+ % e.toString());
+}
+
+static NodePtr makeNode(const Entity& e, const Array& m,
+ SymbolTable& st, const string& ns)
+{
+ concepts::MultiAttribute<NodePtr> mm;
+ for (Array::const_iterator it = m.begin(); it != m.end(); ++it) {
+ mm.add(makeNode(*it, st, ns));
+ }
+ return NodePtr(new NodeUnion(mm));
+}
+
+static NodePtr makeNode(const json::Entity& e, SymbolTable& st, const string& ns)
+{
+ switch (e.type()) {
+ case json::etString:
+ return makeNode(e.stringValue(), st, ns);
+ case json::etObject:
+ return makeNode(e, e.objectValue(), st, ns);
+ case json::etArray:
+ return makeNode(e, e.arrayValue(), st, ns);
+ default:
+ throw Exception(boost::format("Invalid Avro type: %1%") % e.toString());
+ }
+}
+
+ValidSchema compileJsonSchemaFromStream(InputStream& is)
+{
+ json::Entity e = json::loadEntity(is);
+ SymbolTable st;
+ NodePtr n = makeNode(e, st, "");
+ return ValidSchema(n);
+}
+
+AVRO_DECL ValidSchema compileJsonSchemaFromFile(const char* filename)
+{
+ std::unique_ptr<InputStream> s = fileInputStream(filename);
+ return compileJsonSchemaFromStream(*s);
+}
+
+AVRO_DECL ValidSchema compileJsonSchemaFromMemory(const uint8_t* input, size_t len)
+{
+ return compileJsonSchemaFromStream(*memoryInputStream(input, len));
+}
+
+AVRO_DECL ValidSchema compileJsonSchemaFromString(const char* input)
+{
+ return compileJsonSchemaFromMemory(reinterpret_cast<const uint8_t*>(input),
+ ::strlen(input));
+}
+
+AVRO_DECL ValidSchema compileJsonSchemaFromString(const string& input)
+{
+ return compileJsonSchemaFromMemory(
+ reinterpret_cast<const uint8_t*>(input.data()), input.size());
+}
+
+static ValidSchema compile(std::istream& is)
+{
+ std::unique_ptr<InputStream> in = istreamInputStream(is);
+ return compileJsonSchemaFromStream(*in);
+}
+
+void compileJsonSchema(std::istream &is, ValidSchema &schema)
+{
+ if (!is.good()) {
+ throw Exception("Input stream is not good");
+ }
+
+ schema = compile(is);
+}
+
+AVRO_DECL bool compileJsonSchema(std::istream &is, ValidSchema &schema, string &error)
+{
+ try {
+ compileJsonSchema(is, schema);
+ return true;
+ } catch (const Exception &e) {
+ error = e.what();
+ return false;
+ }
+
+}
+
+} // namespace avro
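For reference, a hedged usage sketch of the entry points this file defines; the `<avro/...>` include paths are assumed from the installed layout. The `bool` overload reports schema errors through its out-parameter rather than throwing:

```cpp
#include <avro/Compiler.hh>
#include <avro/ValidSchema.hh>
#include <iostream>
#include <sstream>
#include <string>

int main() {
    const std::string json =
        "{\"type\": \"record\", \"name\": \"Point\","
        " \"fields\": ["
        "   {\"name\": \"x\", \"type\": \"int\", \"default\": 0},"
        "   {\"name\": \"y\", \"type\": \"int\", \"default\": 0}]}";

    // Throwing form.
    avro::ValidSchema schema = avro::compileJsonSchemaFromString(json);

    // Non-throwing form: collects the error message instead.
    std::istringstream bad("{\"type\": \"no-such-type\"}");
    avro::ValidSchema other;
    std::string error;
    if (!avro::compileJsonSchema(bad, other, error)) {
        std::cerr << "schema rejected: " << error << "\n";
    }
    return 0;
}
```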
diff --git a/contrib/libs/apache/avro/impl/DataFile.cc b/contrib/libs/apache/avro/impl/DataFile.cc
index e20e6058276..8b92440d04a 100644
--- a/contrib/libs/apache/avro/impl/DataFile.cc
+++ b/contrib/libs/apache/avro/impl/DataFile.cc
@@ -1,600 +1,600 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "DataFile.hh"
-#include "Compiler.hh"
-#include "Exception.hh"
-
-#include <sstream>
-
-#include <boost/random/mersenne_twister.hpp>
-#include <boost/iostreams/device/file.hpp>
-#include <boost/iostreams/filter/gzip.hpp>
-#include <boost/iostreams/filter/zlib.hpp>
-#include <boost/crc.hpp> // for boost::crc_32_type
-
-#ifdef SNAPPY_CODEC_AVAILABLE
-#include <snappy.h>
-#endif
-
-namespace avro {
-using std::unique_ptr;
-using std::ostringstream;
-using std::istringstream;
-using std::vector;
-using std::copy;
-using std::string;
-
-using std::array;
-
-namespace {
-const string AVRO_SCHEMA_KEY("avro.schema");
-const string AVRO_CODEC_KEY("avro.codec");
-const string AVRO_NULL_CODEC("null");
-const string AVRO_DEFLATE_CODEC("deflate");
-
-#ifdef SNAPPY_CODEC_AVAILABLE
-const string AVRO_SNAPPY_CODEC = "snappy";
-#endif
-
-const size_t minSyncInterval = 32;
-const size_t maxSyncInterval = 1u << 30;
-
-boost::iostreams::zlib_params get_zlib_params() {
- boost::iostreams::zlib_params ret;
- ret.method = boost::iostreams::zlib::deflated;
- ret.noheader = true;
- return ret;
-}
-}
-
-DataFileWriterBase::DataFileWriterBase(const char* filename, const ValidSchema& schema, size_t syncInterval,
- Codec codec) :
- filename_(filename),
- schema_(schema),
- encoderPtr_(binaryEncoder()),
- syncInterval_(syncInterval),
- codec_(codec),
- stream_(fileOutputStream(filename)),
- buffer_(memoryOutputStream()),
- sync_(makeSync()),
- objectCount_(0),
- lastSync_(0)
-{
- init(schema, syncInterval, codec);
-}
-
-DataFileWriterBase::DataFileWriterBase(std::unique_ptr<OutputStream> outputStream,
- const ValidSchema& schema, size_t syncInterval, Codec codec) :
- filename_(),
- schema_(schema),
- encoderPtr_(binaryEncoder()),
- syncInterval_(syncInterval),
- codec_(codec),
- stream_(std::move(outputStream)),
- buffer_(memoryOutputStream()),
- sync_(makeSync()),
- objectCount_(0),
- lastSync_(0)
-{
- init(schema, syncInterval, codec);
-}
-
-void DataFileWriterBase::init(const ValidSchema &schema, size_t syncInterval, const Codec &codec) {
- if (syncInterval < minSyncInterval || syncInterval > maxSyncInterval) {
- throw Exception(boost::format("Invalid sync interval: %1%. "
- "Should be between %2% and %3%") % syncInterval %
- minSyncInterval % maxSyncInterval);
- }
- setMetadata(AVRO_CODEC_KEY, AVRO_NULL_CODEC);
-
- if (codec_ == NULL_CODEC) {
- setMetadata(AVRO_CODEC_KEY, AVRO_NULL_CODEC);
- } else if (codec_ == DEFLATE_CODEC) {
- setMetadata(AVRO_CODEC_KEY, AVRO_DEFLATE_CODEC);
-#ifdef SNAPPY_CODEC_AVAILABLE
- } else if (codec_ == SNAPPY_CODEC) {
- setMetadata(AVRO_CODEC_KEY, AVRO_SNAPPY_CODEC);
-#endif
- } else {
- throw Exception(boost::format("Unknown codec: %1%") % codec);
- }
- setMetadata(AVRO_SCHEMA_KEY, schema.toJson(false));
-
- writeHeader();
- encoderPtr_->init(*buffer_);
-
- lastSync_ = stream_->byteCount();
-}
-
-
-DataFileWriterBase::~DataFileWriterBase()
-{
- if (stream_.get()) {
- close();
- }
-}
-
-void DataFileWriterBase::close()
-{
- flush();
- stream_.reset();
-}
-
-void DataFileWriterBase::sync()
-{
- encoderPtr_->flush();
-
- encoderPtr_->init(*stream_);
- avro::encode(*encoderPtr_, objectCount_);
- if (codec_ == NULL_CODEC) {
- int64_t byteCount = buffer_->byteCount();
- avro::encode(*encoderPtr_, byteCount);
- encoderPtr_->flush();
- std::unique_ptr<InputStream> in = memoryInputStream(*buffer_);
- copy(*in, *stream_);
- } else if (codec_ == DEFLATE_CODEC) {
- std::vector<char> buf;
- {
- boost::iostreams::filtering_ostream os;
- os.push(boost::iostreams::zlib_compressor(get_zlib_params()));
- os.push(boost::iostreams::back_inserter(buf));
- const uint8_t* data;
- size_t len;
-
- std::unique_ptr<InputStream> input = memoryInputStream(*buffer_);
- while (input->next(&data, &len)) {
- boost::iostreams::write(os, reinterpret_cast<const char*>(data), len);
- }
- } // make sure all is flushed
- std::unique_ptr<InputStream> in = memoryInputStream(
- reinterpret_cast<const uint8_t*>(buf.data()), buf.size());
- int64_t byteCount = buf.size();
- avro::encode(*encoderPtr_, byteCount);
- encoderPtr_->flush();
- copy(*in, *stream_);
-#ifdef SNAPPY_CODEC_AVAILABLE
- } else if (codec_ == SNAPPY_CODEC) {
- std::vector<char> temp;
- std::string compressed;
- boost::crc_32_type crc;
- {
- boost::iostreams::filtering_ostream os;
- os.push(boost::iostreams::back_inserter(temp));
- const uint8_t* data;
- size_t len;
-
- std::unique_ptr<InputStream> input = memoryInputStream(*buffer_);
- while (input->next(&data, &len)) {
- boost::iostreams::write(os, reinterpret_cast<const char*>(data),
- len);
- }
- } // make sure all is flushed
-
- crc.process_bytes(reinterpret_cast<const char*>(temp.data()),
- temp.size());
- // For Snappy, add the CRC32 checksum
- int32_t checksum = crc();
-
- // Now compress
- size_t compressed_size = snappy::Compress(
- reinterpret_cast<const char*>(temp.data()), temp.size(),
- &compressed);
- temp.clear();
- {
- boost::iostreams::filtering_ostream os;
- os.push(boost::iostreams::back_inserter(temp));
- boost::iostreams::write(os, compressed.c_str(), compressed_size);
- }
- temp.push_back((checksum >> 24) & 0xFF);
- temp.push_back((checksum >> 16) & 0xFF);
- temp.push_back((checksum >> 8) & 0xFF);
- temp.push_back(checksum & 0xFF);
- std::unique_ptr<InputStream> in = memoryInputStream(
- reinterpret_cast<const uint8_t*>(temp.data()), temp.size());
- int64_t byteCount = temp.size();
- avro::encode(*encoderPtr_, byteCount);
- encoderPtr_->flush();
- copy(*in, *stream_);
-#endif
- }
-
- encoderPtr_->init(*stream_);
- avro::encode(*encoderPtr_, sync_);
- encoderPtr_->flush();
-
- lastSync_ = stream_->byteCount();
-
- buffer_ = memoryOutputStream();
- encoderPtr_->init(*buffer_);
- objectCount_ = 0;
-}
-
-void DataFileWriterBase::syncIfNeeded()
-{
- encoderPtr_->flush();
- if (buffer_->byteCount() >= syncInterval_) {
- sync();
- }
-}
-
-uint64_t DataFileWriterBase::getCurrentBlockStart()
-{
- return lastSync_;
-}
-
-void DataFileWriterBase::flush()
-{
- sync();
-}
-
-boost::mt19937 random(static_cast<uint32_t>(time(0)));
-
-DataFileSync DataFileWriterBase::makeSync()
-{
- DataFileSync sync;
- for (size_t i = 0; i < sync.size(); ++i) {
- sync[i] = random();
- }
- return sync;
-}
-
-typedef array<uint8_t, 4> Magic;
-static Magic magic = { { 'O', 'b', 'j', '\x01' } };
-
-void DataFileWriterBase::writeHeader()
-{
- encoderPtr_->init(*stream_);
- avro::encode(*encoderPtr_, magic);
- avro::encode(*encoderPtr_, metadata_);
- avro::encode(*encoderPtr_, sync_);
- encoderPtr_->flush();
-}
-
-void DataFileWriterBase::setMetadata(const string& key, const string& value)
-{
- vector<uint8_t> v(value.size());
- copy(value.begin(), value.end(), v.begin());
- metadata_[key] = v;
-}
-
-DataFileReaderBase::DataFileReaderBase(const char* filename) :
- filename_(filename), stream_(fileSeekableInputStream(filename)),
- decoder_(binaryDecoder()), objectCount_(0), eof_(false), blockStart_(-1),
- blockEnd_(-1)
-{
- readHeader();
-}
-
-DataFileReaderBase::DataFileReaderBase(std::unique_ptr<InputStream> inputStream) :
- filename_(""), stream_(std::move(inputStream)),
- decoder_(binaryDecoder()), objectCount_(0), eof_(false)
-{
- readHeader();
-}
-
-void DataFileReaderBase::init()
-{
- readerSchema_ = dataSchema_;
- dataDecoder_ = binaryDecoder();
- readDataBlock();
-}
-
-void DataFileReaderBase::init(const ValidSchema& readerSchema)
-{
- readerSchema_ = readerSchema;
- dataDecoder_ = (readerSchema_.toJson(true) != dataSchema_.toJson(true)) ?
- resolvingDecoder(dataSchema_, readerSchema_, binaryDecoder()) :
- binaryDecoder();
- readDataBlock();
-}
-
-static void drain(InputStream& in)
-{
- const uint8_t *p = 0;
- size_t n = 0;
- while (in.next(&p, &n));
-}
-
-char hex(unsigned int x)
-{
- return x + (x < 10 ? '0' : ('a' - 10));
-}
-
-std::ostream& operator << (std::ostream& os, const DataFileSync& s)
-{
- for (size_t i = 0; i < s.size(); ++i) {
- os << hex(s[i] / 16) << hex(s[i] % 16) << ' ';
- }
- os << std::endl;
- return os;
-}
-
-
-bool DataFileReaderBase::hasMore()
-{
- for (; ;) {
- if (eof_) {
- return false;
- } else if (objectCount_ != 0) {
- return true;
- }
-
- dataDecoder_->init(*dataStream_);
- drain(*dataStream_);
- DataFileSync s;
- decoder_->init(*stream_);
- avro::decode(*decoder_, s);
- if (s != sync_) {
- throw Exception("Sync mismatch");
- }
- readDataBlock();
- }
-}
-
-class BoundedInputStream : public InputStream {
- InputStream& in_;
- size_t limit_;
-
- bool next(const uint8_t** data, size_t* len) {
- if (limit_ != 0 && in_.next(data, len)) {
- if (*len > limit_) {
- in_.backup(*len - limit_);
- *len = limit_;
- }
- limit_ -= *len;
- return true;
- }
- return false;
- }
-
- void backup(size_t len) {
- in_.backup(len);
- limit_ += len;
- }
-
- void skip(size_t len) {
- if (len > limit_) {
- len = limit_;
- }
- in_.skip(len);
- limit_ -= len;
- }
-
- size_t byteCount() const {
- return in_.byteCount();
- }
-
-public:
- BoundedInputStream(InputStream& in, size_t limit) :
- in_(in), limit_(limit) { }
-};
-
-unique_ptr<InputStream> boundedInputStream(InputStream& in, size_t limit)
-{
- return unique_ptr<InputStream>(new BoundedInputStream(in, limit));
-}
-
-void DataFileReaderBase::readDataBlock()
-{
- decoder_->init(*stream_);
- blockStart_ = stream_->byteCount();
- const uint8_t* p = 0;
- size_t n = 0;
- if (! stream_->next(&p, &n)) {
- eof_ = true;
- return;
- }
- stream_->backup(n);
- avro::decode(*decoder_, objectCount_);
- int64_t byteCount;
- avro::decode(*decoder_, byteCount);
- decoder_->init(*stream_);
- blockEnd_ = stream_->byteCount() + byteCount;
-
- unique_ptr<InputStream> st = boundedInputStream(*stream_, static_cast<size_t>(byteCount));
- if (codec_ == NULL_CODEC) {
- dataDecoder_->init(*st);
- dataStream_ = std::move(st);
-#ifdef SNAPPY_CODEC_AVAILABLE
- } else if (codec_ == SNAPPY_CODEC) {
- boost::crc_32_type crc;
- uint32_t checksum = 0;
- compressed_.clear();
- uncompressed.clear();
- const uint8_t* data;
- size_t len;
- while (st->next(&data, &len)) {
- compressed_.insert(compressed_.end(), data, data + len);
- }
- len = compressed_.size();
- int b1 = compressed_[len - 4] & 0xFF;
- int b2 = compressed_[len - 3] & 0xFF;
- int b3 = compressed_[len - 2] & 0xFF;
- int b4 = compressed_[len - 1] & 0xFF;
-
- checksum = (b1 << 24) + (b2 << 16) + (b3 << 8) + (b4);
- if (!snappy::Uncompress(reinterpret_cast<const char*>(compressed_.data()),
- len - 4, &uncompressed)) {
-            throw Exception(
-                "Snappy reported an error while decompressing");
- }
- crc.process_bytes(uncompressed.c_str(), uncompressed.size());
- uint32_t c = crc();
- if (checksum != c) {
- throw Exception(boost::format("Checksum did not match for Snappy compression: Expected: %1%, computed: %2%") % checksum % c);
- }
- os_.reset(new boost::iostreams::filtering_istream());
- os_->push(
- boost::iostreams::basic_array_source<char>(uncompressed.c_str(),
- uncompressed.size()));
- std::unique_ptr<InputStream> in = istreamInputStream(*os_);
-
- dataDecoder_->init(*in);
- dataStream_ = std::move(in);
-#endif
- } else {
- compressed_.clear();
- const uint8_t* data;
- size_t len;
- while (st->next(&data, &len)) {
- compressed_.insert(compressed_.end(), data, data + len);
- }
- os_.reset(new boost::iostreams::filtering_istream());
- os_->push(boost::iostreams::zlib_decompressor(get_zlib_params()));
- os_->push(boost::iostreams::basic_array_source<char>(
- compressed_.data(), compressed_.size()));
-
- std::unique_ptr<InputStream> in = nonSeekableIstreamInputStream(*os_);
- dataDecoder_->init(*in);
- dataStream_ = std::move(in);
- }
-}
-
-void DataFileReaderBase::close()
-{
-}
-
-static string toString(const vector<uint8_t>& v)
-{
- string result;
- result.resize(v.size());
- copy(v.begin(), v.end(), result.begin());
- return result;
-}
-
-static ValidSchema makeSchema(const vector<uint8_t>& v)
-{
- istringstream iss(toString(v));
- ValidSchema vs;
- compileJsonSchema(iss, vs);
- return ValidSchema(vs);
-}
-
-void DataFileReaderBase::readHeader()
-{
- decoder_->init(*stream_);
- Magic m;
- avro::decode(*decoder_, m);
- if (magic != m) {
- throw Exception("Invalid data file. Magic does not match: "
- + filename_);
- }
- avro::decode(*decoder_, metadata_);
- Metadata::const_iterator it = metadata_.find(AVRO_SCHEMA_KEY);
- if (it == metadata_.end()) {
- throw Exception("No schema in metadata");
- }
-
- dataSchema_ = makeSchema(it->second);
- if (! readerSchema_.root()) {
- readerSchema_ = dataSchema();
- }
-
- it = metadata_.find(AVRO_CODEC_KEY);
- if (it != metadata_.end() && toString(it->second) == AVRO_DEFLATE_CODEC) {
- codec_ = DEFLATE_CODEC;
-#ifdef SNAPPY_CODEC_AVAILABLE
- } else if (it != metadata_.end()
- && toString(it->second) == AVRO_SNAPPY_CODEC) {
- codec_ = SNAPPY_CODEC;
-#endif
- } else {
- codec_ = NULL_CODEC;
- if (it != metadata_.end() && toString(it->second) != AVRO_NULL_CODEC) {
- throw Exception("Unknown codec in data file: " + toString(it->second));
- }
- }
-
- avro::decode(*decoder_, sync_);
- decoder_->init(*stream_);
- blockStart_ = stream_->byteCount();
-}
-
-void DataFileReaderBase::doSeek(int64_t position)
-{
- if (SeekableInputStream *ss = dynamic_cast<SeekableInputStream *>(stream_.get())) {
- if (!eof_) {
- dataDecoder_->init(*dataStream_);
- drain(*dataStream_);
- }
- decoder_->init(*stream_);
- ss->seek(position);
- eof_ = false;
- } else {
- throw Exception("seek not supported on non-SeekableInputStream");
- }
-}
-
-void DataFileReaderBase::seek(int64_t position)
-{
- doSeek(position);
- readDataBlock();
-}
-
-void DataFileReaderBase::sync(int64_t position)
-{
- doSeek(position);
- DataFileSync sync_buffer;
- const uint8_t *p = 0;
- size_t n = 0;
- size_t i = 0;
- while (i < SyncSize) {
- if (n == 0 && !stream_->next(&p, &n)) {
- eof_ = true;
- return;
- }
- int len =
- std::min(static_cast<size_t>(SyncSize - i), n);
- memcpy(&sync_buffer[i], p, len);
- p += len;
- n -= len;
- i += len;
- }
- for (;;) {
- size_t j = 0;
- for (; j < SyncSize; ++j) {
- if (sync_[j] != sync_buffer[(i + j) % SyncSize]) {
- break;
- }
- }
- if (j == SyncSize) {
- // Found the sync marker!
- break;
- }
- if (n == 0 && !stream_->next(&p, &n)) {
- eof_ = true;
- return;
- }
- sync_buffer[i++ % SyncSize] = *p++;
- --n;
- }
- stream_->backup(n);
- readDataBlock();
-}
-
-bool DataFileReaderBase::pastSync(int64_t position) {
- return !hasMore() || blockStart_ >= position + SyncSize;
-}
-
-int64_t DataFileReaderBase::previousSync() {
- return blockStart_;
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DataFile.hh"
+#include "Compiler.hh"
+#include "Exception.hh"
+
+#include <sstream>
+
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+#include <boost/crc.hpp> // for boost::crc_32_type
+
+#ifdef SNAPPY_CODEC_AVAILABLE
+#include <snappy.h>
+#endif
+
+namespace avro {
+using std::unique_ptr;
+using std::ostringstream;
+using std::istringstream;
+using std::vector;
+using std::copy;
+using std::string;
+
+using std::array;
+
+namespace {
+const string AVRO_SCHEMA_KEY("avro.schema");
+const string AVRO_CODEC_KEY("avro.codec");
+const string AVRO_NULL_CODEC("null");
+const string AVRO_DEFLATE_CODEC("deflate");
+
+#ifdef SNAPPY_CODEC_AVAILABLE
+const string AVRO_SNAPPY_CODEC = "snappy";
+#endif
+
+const size_t minSyncInterval = 32;
+const size_t maxSyncInterval = 1u << 30;
+
+boost::iostreams::zlib_params get_zlib_params() {
+ boost::iostreams::zlib_params ret;
+ ret.method = boost::iostreams::zlib::deflated;
+ ret.noheader = true;
+ return ret;
+}
+}
+
+DataFileWriterBase::DataFileWriterBase(const char* filename, const ValidSchema& schema, size_t syncInterval,
+ Codec codec) :
+ filename_(filename),
+ schema_(schema),
+ encoderPtr_(binaryEncoder()),
+ syncInterval_(syncInterval),
+ codec_(codec),
+ stream_(fileOutputStream(filename)),
+ buffer_(memoryOutputStream()),
+ sync_(makeSync()),
+ objectCount_(0),
+ lastSync_(0)
+{
+ init(schema, syncInterval, codec);
+}
+
+DataFileWriterBase::DataFileWriterBase(std::unique_ptr<OutputStream> outputStream,
+ const ValidSchema& schema, size_t syncInterval, Codec codec) :
+ filename_(),
+ schema_(schema),
+ encoderPtr_(binaryEncoder()),
+ syncInterval_(syncInterval),
+ codec_(codec),
+ stream_(std::move(outputStream)),
+ buffer_(memoryOutputStream()),
+ sync_(makeSync()),
+ objectCount_(0),
+ lastSync_(0)
+{
+ init(schema, syncInterval, codec);
+}
+
+void DataFileWriterBase::init(const ValidSchema &schema, size_t syncInterval, const Codec &codec) {
+ if (syncInterval < minSyncInterval || syncInterval > maxSyncInterval) {
+ throw Exception(boost::format("Invalid sync interval: %1%. "
+ "Should be between %2% and %3%") % syncInterval %
+ minSyncInterval % maxSyncInterval);
+ }
+ setMetadata(AVRO_CODEC_KEY, AVRO_NULL_CODEC);
+
+ if (codec_ == NULL_CODEC) {
+ setMetadata(AVRO_CODEC_KEY, AVRO_NULL_CODEC);
+ } else if (codec_ == DEFLATE_CODEC) {
+ setMetadata(AVRO_CODEC_KEY, AVRO_DEFLATE_CODEC);
+#ifdef SNAPPY_CODEC_AVAILABLE
+ } else if (codec_ == SNAPPY_CODEC) {
+ setMetadata(AVRO_CODEC_KEY, AVRO_SNAPPY_CODEC);
+#endif
+ } else {
+ throw Exception(boost::format("Unknown codec: %1%") % codec);
+ }
+ setMetadata(AVRO_SCHEMA_KEY, schema.toJson(false));
+
+ writeHeader();
+ encoderPtr_->init(*buffer_);
+
+ lastSync_ = stream_->byteCount();
+}
+
+
+DataFileWriterBase::~DataFileWriterBase()
+{
+ if (stream_.get()) {
+ close();
+ }
+}
+
+void DataFileWriterBase::close()
+{
+ flush();
+ stream_.reset();
+}
+
+void DataFileWriterBase::sync()
+{
+ encoderPtr_->flush();
+
+ encoderPtr_->init(*stream_);
+ avro::encode(*encoderPtr_, objectCount_);
+ if (codec_ == NULL_CODEC) {
+ int64_t byteCount = buffer_->byteCount();
+ avro::encode(*encoderPtr_, byteCount);
+ encoderPtr_->flush();
+ std::unique_ptr<InputStream> in = memoryInputStream(*buffer_);
+ copy(*in, *stream_);
+ } else if (codec_ == DEFLATE_CODEC) {
+ std::vector<char> buf;
+ {
+ boost::iostreams::filtering_ostream os;
+ os.push(boost::iostreams::zlib_compressor(get_zlib_params()));
+ os.push(boost::iostreams::back_inserter(buf));
+ const uint8_t* data;
+ size_t len;
+
+ std::unique_ptr<InputStream> input = memoryInputStream(*buffer_);
+ while (input->next(&data, &len)) {
+ boost::iostreams::write(os, reinterpret_cast<const char*>(data), len);
+ }
+ } // make sure all is flushed
+ std::unique_ptr<InputStream> in = memoryInputStream(
+ reinterpret_cast<const uint8_t*>(buf.data()), buf.size());
+ int64_t byteCount = buf.size();
+ avro::encode(*encoderPtr_, byteCount);
+ encoderPtr_->flush();
+ copy(*in, *stream_);
+#ifdef SNAPPY_CODEC_AVAILABLE
+ } else if (codec_ == SNAPPY_CODEC) {
+ std::vector<char> temp;
+ std::string compressed;
+ boost::crc_32_type crc;
+ {
+ boost::iostreams::filtering_ostream os;
+ os.push(boost::iostreams::back_inserter(temp));
+ const uint8_t* data;
+ size_t len;
+
+ std::unique_ptr<InputStream> input = memoryInputStream(*buffer_);
+ while (input->next(&data, &len)) {
+ boost::iostreams::write(os, reinterpret_cast<const char*>(data),
+ len);
+ }
+ } // make sure all is flushed
+
+ crc.process_bytes(reinterpret_cast<const char*>(temp.data()),
+ temp.size());
+ // For Snappy, add the CRC32 checksum
+ int32_t checksum = crc();
+
+ // Now compress
+ size_t compressed_size = snappy::Compress(
+ reinterpret_cast<const char*>(temp.data()), temp.size(),
+ &compressed);
+ temp.clear();
+ {
+ boost::iostreams::filtering_ostream os;
+ os.push(boost::iostreams::back_inserter(temp));
+ boost::iostreams::write(os, compressed.c_str(), compressed_size);
+ }
+ temp.push_back((checksum >> 24) & 0xFF);
+ temp.push_back((checksum >> 16) & 0xFF);
+ temp.push_back((checksum >> 8) & 0xFF);
+ temp.push_back(checksum & 0xFF);
+ std::unique_ptr<InputStream> in = memoryInputStream(
+ reinterpret_cast<const uint8_t*>(temp.data()), temp.size());
+ int64_t byteCount = temp.size();
+ avro::encode(*encoderPtr_, byteCount);
+ encoderPtr_->flush();
+ copy(*in, *stream_);
+#endif
+ }
+
+ encoderPtr_->init(*stream_);
+ avro::encode(*encoderPtr_, sync_);
+ encoderPtr_->flush();
+
+ lastSync_ = stream_->byteCount();
+
+ buffer_ = memoryOutputStream();
+ encoderPtr_->init(*buffer_);
+ objectCount_ = 0;
+}
+
+void DataFileWriterBase::syncIfNeeded()
+{
+ encoderPtr_->flush();
+ if (buffer_->byteCount() >= syncInterval_) {
+ sync();
+ }
+}
+
+uint64_t DataFileWriterBase::getCurrentBlockStart()
+{
+ return lastSync_;
+}
+
+void DataFileWriterBase::flush()
+{
+ sync();
+}
+
+boost::mt19937 random(static_cast<uint32_t>(time(0)));
+
+DataFileSync DataFileWriterBase::makeSync()
+{
+ DataFileSync sync;
+ for (size_t i = 0; i < sync.size(); ++i) {
+ sync[i] = random();
+ }
+ return sync;
+}
+
+typedef array<uint8_t, 4> Magic;
+static Magic magic = { { 'O', 'b', 'j', '\x01' } };
+
+void DataFileWriterBase::writeHeader()
+{
+ encoderPtr_->init(*stream_);
+ avro::encode(*encoderPtr_, magic);
+ avro::encode(*encoderPtr_, metadata_);
+ avro::encode(*encoderPtr_, sync_);
+ encoderPtr_->flush();
+}
+
+void DataFileWriterBase::setMetadata(const string& key, const string& value)
+{
+ vector<uint8_t> v(value.size());
+ copy(value.begin(), value.end(), v.begin());
+ metadata_[key] = v;
+}
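A hedged usage sketch for the writer machinery above, assuming the public `DataFileWriter<T>` template from `DataFile.hh` and the primitive `codec_traits` from `Specific.hh`; `"ints.avro"` is a hypothetical file name, and the sync interval and codec are left at their defaults:

```cpp
#include <avro/Compiler.hh>
#include <avro/DataFile.hh>
#include <avro/Specific.hh>
#include <avro/ValidSchema.hh>

int main() {
    // A bare JSON string is itself a valid schema for a primitive type.
    avro::ValidSchema schema = avro::compileJsonSchemaFromString("\"int\"");
    avro::DataFileWriter<int32_t> writer("ints.avro", schema);
    for (int32_t i = 0; i < 1000; ++i) {
        writer.write(i);  // syncIfNeeded() flushes a block past syncInterval
    }
    writer.close();  // flush() writes the final block and sync marker
    return 0;
}
```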
+
+DataFileReaderBase::DataFileReaderBase(const char* filename) :
+ filename_(filename), stream_(fileSeekableInputStream(filename)),
+ decoder_(binaryDecoder()), objectCount_(0), eof_(false), blockStart_(-1),
+ blockEnd_(-1)
+{
+ readHeader();
+}
+
+DataFileReaderBase::DataFileReaderBase(std::unique_ptr<InputStream> inputStream) :
+ filename_(""), stream_(std::move(inputStream)),
+ decoder_(binaryDecoder()), objectCount_(0), eof_(false)
+{
+ readHeader();
+}
+
+void DataFileReaderBase::init()
+{
+ readerSchema_ = dataSchema_;
+ dataDecoder_ = binaryDecoder();
+ readDataBlock();
+}
+
+void DataFileReaderBase::init(const ValidSchema& readerSchema)
+{
+ readerSchema_ = readerSchema;
+ dataDecoder_ = (readerSchema_.toJson(true) != dataSchema_.toJson(true)) ?
+ resolvingDecoder(dataSchema_, readerSchema_, binaryDecoder()) :
+ binaryDecoder();
+ readDataBlock();
+}
+
+static void drain(InputStream& in)
+{
+ const uint8_t *p = 0;
+ size_t n = 0;
+ while (in.next(&p, &n));
+}
+
+char hex(unsigned int x)
+{
+ return x + (x < 10 ? '0' : ('a' - 10));
+}
+
+std::ostream& operator << (std::ostream& os, const DataFileSync& s)
+{
+ for (size_t i = 0; i < s.size(); ++i) {
+ os << hex(s[i] / 16) << hex(s[i] % 16) << ' ';
+ }
+ os << std::endl;
+ return os;
+}
+
+
+bool DataFileReaderBase::hasMore()
+{
+ for (; ;) {
+ if (eof_) {
+ return false;
+ } else if (objectCount_ != 0) {
+ return true;
+ }
+
+ dataDecoder_->init(*dataStream_);
+ drain(*dataStream_);
+ DataFileSync s;
+ decoder_->init(*stream_);
+ avro::decode(*decoder_, s);
+ if (s != sync_) {
+ throw Exception("Sync mismatch");
+ }
+ readDataBlock();
+ }
+}
+
+class BoundedInputStream : public InputStream {
+ InputStream& in_;
+ size_t limit_;
+
+ bool next(const uint8_t** data, size_t* len) {
+ if (limit_ != 0 && in_.next(data, len)) {
+ if (*len > limit_) {
+ in_.backup(*len - limit_);
+ *len = limit_;
+ }
+ limit_ -= *len;
+ return true;
+ }
+ return false;
+ }
+
+ void backup(size_t len) {
+ in_.backup(len);
+ limit_ += len;
+ }
+
+ void skip(size_t len) {
+ if (len > limit_) {
+ len = limit_;
+ }
+ in_.skip(len);
+ limit_ -= len;
+ }
+
+ size_t byteCount() const {
+ return in_.byteCount();
+ }
+
+public:
+ BoundedInputStream(InputStream& in, size_t limit) :
+ in_(in), limit_(limit) { }
+};
+
+unique_ptr<InputStream> boundedInputStream(InputStream& in, size_t limit)
+{
+ return unique_ptr<InputStream>(new BoundedInputStream(in, limit));
+}
+
+void DataFileReaderBase::readDataBlock()
+{
+ decoder_->init(*stream_);
+ blockStart_ = stream_->byteCount();
+ const uint8_t* p = 0;
+ size_t n = 0;
+ if (! stream_->next(&p, &n)) {
+ eof_ = true;
+ return;
+ }
+ stream_->backup(n);
+ avro::decode(*decoder_, objectCount_);
+ int64_t byteCount;
+ avro::decode(*decoder_, byteCount);
+ decoder_->init(*stream_);
+ blockEnd_ = stream_->byteCount() + byteCount;
+
+ unique_ptr<InputStream> st = boundedInputStream(*stream_, static_cast<size_t>(byteCount));
+ if (codec_ == NULL_CODEC) {
+ dataDecoder_->init(*st);
+ dataStream_ = std::move(st);
+#ifdef SNAPPY_CODEC_AVAILABLE
+ } else if (codec_ == SNAPPY_CODEC) {
+ boost::crc_32_type crc;
+ uint32_t checksum = 0;
+ compressed_.clear();
+ uncompressed.clear();
+ const uint8_t* data;
+ size_t len;
+ while (st->next(&data, &len)) {
+ compressed_.insert(compressed_.end(), data, data + len);
+ }
+ len = compressed_.size();
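+ // The last four bytes of a Snappy-compressed Avro block carry a
+ // big-endian CRC32 of the uncompressed data.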
+ int b1 = compressed_[len - 4] & 0xFF;
+ int b2 = compressed_[len - 3] & 0xFF;
+ int b3 = compressed_[len - 2] & 0xFF;
+ int b4 = compressed_[len - 1] & 0xFF;
+
+ checksum = (b1 << 24) + (b2 << 16) + (b3 << 8) + (b4);
+ if (!snappy::Uncompress(reinterpret_cast<const char*>(compressed_.data()),
+ len - 4, &uncompressed)) {
+ throw Exception("Snappy decompression reported an error");
+ }
+ crc.process_bytes(uncompressed.c_str(), uncompressed.size());
+ uint32_t c = crc();
+ if (checksum != c) {
+ throw Exception(boost::format("Checksum did not match for Snappy compression: Expected: %1%, computed: %2%") % checksum % c);
+ }
+ os_.reset(new boost::iostreams::filtering_istream());
+ os_->push(
+ boost::iostreams::basic_array_source<char>(uncompressed.c_str(),
+ uncompressed.size()));
+ std::unique_ptr<InputStream> in = istreamInputStream(*os_);
+
+ dataDecoder_->init(*in);
+ dataStream_ = std::move(in);
+#endif
+ } else {
+ compressed_.clear();
+ const uint8_t* data;
+ size_t len;
+ while (st->next(&data, &len)) {
+ compressed_.insert(compressed_.end(), data, data + len);
+ }
+ // boost::iostreams::write(os, reinterpret_cast<const char*>(data), len);
+ os_.reset(new boost::iostreams::filtering_istream());
+ os_->push(boost::iostreams::zlib_decompressor(get_zlib_params()));
+ os_->push(boost::iostreams::basic_array_source<char>(
+ compressed_.data(), compressed_.size()));
+
+ std::unique_ptr<InputStream> in = nonSeekableIstreamInputStream(*os_);
+ dataDecoder_->init(*in);
+ dataStream_ = std::move(in);
+ }
+}
+
+void DataFileReaderBase::close()
+{
+}
+
+static string toString(const vector<uint8_t>& v)
+{
+ string result;
+ result.resize(v.size());
+ copy(v.begin(), v.end(), result.begin());
+ return result;
+}
+
+static ValidSchema makeSchema(const vector<uint8_t>& v)
+{
+ istringstream iss(toString(v));
+ ValidSchema vs;
+ compileJsonSchema(iss, vs);
+ return ValidSchema(vs);
+}
+
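+// Validates the file magic, reads the metadata map (schema and codec),
+// and records the sync marker plus the offset of the first data block.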
+void DataFileReaderBase::readHeader()
+{
+ decoder_->init(*stream_);
+ Magic m;
+ avro::decode(*decoder_, m);
+ if (magic != m) {
+ throw Exception("Invalid data file. Magic does not match: "
+ + filename_);
+ }
+ avro::decode(*decoder_, metadata_);
+ Metadata::const_iterator it = metadata_.find(AVRO_SCHEMA_KEY);
+ if (it == metadata_.end()) {
+ throw Exception("No schema in metadata");
+ }
+
+ dataSchema_ = makeSchema(it->second);
+ if (! readerSchema_.root()) {
+ readerSchema_ = dataSchema();
+ }
+
+ it = metadata_.find(AVRO_CODEC_KEY);
+ if (it != metadata_.end() && toString(it->second) == AVRO_DEFLATE_CODEC) {
+ codec_ = DEFLATE_CODEC;
+#ifdef SNAPPY_CODEC_AVAILABLE
+ } else if (it != metadata_.end()
+ && toString(it->second) == AVRO_SNAPPY_CODEC) {
+ codec_ = SNAPPY_CODEC;
+#endif
+ } else {
+ codec_ = NULL_CODEC;
+ if (it != metadata_.end() && toString(it->second) != AVRO_NULL_CODEC) {
+ throw Exception("Unknown codec in data file: " + toString(it->second));
+ }
+ }
+
+ avro::decode(*decoder_, sync_);
+ decoder_->init(*stream_);
+ blockStart_ = stream_->byteCount();
+}
+
+void DataFileReaderBase::doSeek(int64_t position)
+{
+ if (SeekableInputStream *ss = dynamic_cast<SeekableInputStream *>(stream_.get())) {
+ if (!eof_) {
+ dataDecoder_->init(*dataStream_);
+ drain(*dataStream_);
+ }
+ decoder_->init(*stream_);
+ ss->seek(position);
+ eof_ = false;
+ } else {
+ throw Exception("seek not supported on non-SeekableInputStream");
+ }
+}
+
+void DataFileReaderBase::seek(int64_t position)
+{
+ doSeek(position);
+ readDataBlock();
+}
+
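+// Positions the reader at the first block boundary at or after `position`:
+// seeks there, then scans forward for the next sync marker, treating
+// sync_buffer as a circular window of the last SyncSize bytes seen.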
+void DataFileReaderBase::sync(int64_t position)
+{
+ doSeek(position);
+ DataFileSync sync_buffer;
+ const uint8_t *p = 0;
+ size_t n = 0;
+ size_t i = 0;
+ while (i < SyncSize) {
+ if (n == 0 && !stream_->next(&p, &n)) {
+ eof_ = true;
+ return;
+ }
+ int len =
+ std::min(static_cast<size_t>(SyncSize - i), n);
+ memcpy(&sync_buffer[i], p, len);
+ p += len;
+ n -= len;
+ i += len;
+ }
+ for (;;) {
+ size_t j = 0;
+ for (; j < SyncSize; ++j) {
+ if (sync_[j] != sync_buffer[(i + j) % SyncSize]) {
+ break;
+ }
+ }
+ if (j == SyncSize) {
+ // Found the sync marker!
+ break;
+ }
+ if (n == 0 && !stream_->next(&p, &n)) {
+ eof_ = true;
+ return;
+ }
+ sync_buffer[i++ % SyncSize] = *p++;
+ --n;
+ }
+ stream_->backup(n);
+ readDataBlock();
+}
+
+bool DataFileReaderBase::pastSync(int64_t position) {
+ return !hasMore() || blockStart_ >= position + SyncSize;
+}
+
+int64_t DataFileReaderBase::previousSync() {
+ return blockStart_;
+}
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/FileStream.cc b/contrib/libs/apache/avro/impl/FileStream.cc
index ed601b4c6fa..03013a1224b 100644
--- a/contrib/libs/apache/avro/impl/FileStream.cc
+++ b/contrib/libs/apache/avro/impl/FileStream.cc
@@ -1,397 +1,397 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <fstream>
-#include "Stream.hh"
-#ifndef _WIN32
-#include "unistd.h"
-#include "fcntl.h"
-#include "errno.h"
-
-#ifndef O_BINARY
-#define O_BINARY 0
-#endif
-#else
-#include "Windows.h"
-
-#ifdef min
-#undef min
-#endif
-#endif
-
-using std::unique_ptr;
-using std::istream;
-using std::ostream;
-
-namespace avro {
-namespace {
-struct BufferCopyIn {
- virtual ~BufferCopyIn() { }
- virtual void seek(size_t len) = 0;
- virtual bool read(uint8_t* b, size_t toRead, size_t& actual) = 0;
-
-};
-
-struct FileBufferCopyIn : public BufferCopyIn {
-#ifdef _WIN32
- HANDLE h_;
- FileBufferCopyIn(const char* filename) :
- h_(::CreateFileA(filename, GENERIC_READ, 0, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL)) {
- if (h_ == INVALID_HANDLE_VALUE) {
- throw Exception(boost::format("Cannot open file: %1%") % ::GetLastError());
- }
- }
-
- ~FileBufferCopyIn() {
- ::CloseHandle(h_);
- }
-
- void seek(size_t len) {
- if (::SetFilePointer(h_, len, NULL, FILE_CURRENT) == INVALID_SET_FILE_POINTER && ::GetLastError() != NO_ERROR) {
- throw Exception(boost::format("Cannot skip file: %1%") % ::GetLastError());
- }
- }
-
- bool read(uint8_t* b, size_t toRead, size_t& actual) {
- DWORD dw = 0;
- if (! ::ReadFile(h_, b, toRead, &dw, NULL)) {
- throw Exception(boost::format("Cannot read file: %1%") % ::GetLastError());
- }
- actual = static_cast<size_t>(dw);
- return actual != 0;
- }
-#else
- const int fd_;
-
- FileBufferCopyIn(const char* filename) :
- fd_(open(filename, O_RDONLY | O_BINARY)) {
- if (fd_ < 0) {
- throw Exception(boost::format("Cannot open file: %1%") %
- ::strerror(errno));
- }
- }
-
- ~FileBufferCopyIn() {
- ::close(fd_);
- }
-
- void seek(size_t len) {
- off_t r = ::lseek(fd_, len, SEEK_CUR);
- if (r == static_cast<off_t>(-1)) {
- throw Exception(boost::format("Cannot skip file: %1%") %
- strerror(errno));
- }
- }
-
- bool read(uint8_t* b, size_t toRead, size_t& actual) {
- int n = ::read(fd_, b, toRead);
- if (n > 0) {
- actual = n;
- return true;
- }
- return false;
- }
-#endif
-
-};
-
-struct IStreamBufferCopyIn : public BufferCopyIn {
- istream& is_;
-
- IStreamBufferCopyIn(istream& is) : is_(is) {
- }
-
- void seek(size_t len) {
- if (! is_.seekg(len, std::ios_base::cur)) {
- throw Exception("Cannot skip stream");
- }
- }
-
- bool read(uint8_t* b, size_t toRead, size_t& actual) {
- is_.read(reinterpret_cast<char*>(b), toRead);
- if (is_.bad()) {
- return false;
- }
- actual = static_cast<size_t>(is_.gcount());
- return (! is_.eof() || actual != 0);
- }
-
-};
-
-struct NonSeekableIStreamBufferCopyIn : public IStreamBufferCopyIn {
- NonSeekableIStreamBufferCopyIn(istream& is) : IStreamBufferCopyIn(is) { }
-
- void seek(size_t len) {
- const size_t bufSize = 4096;
- uint8_t buf[bufSize];
- while (len > 0) {
- size_t n = std::min(len, bufSize);
- is_.read(reinterpret_cast<char*>(buf), n);
- if (is_.bad()) {
- throw Exception("Cannot skip stream");
- }
- size_t actual = static_cast<size_t>(is_.gcount());
- if (is_.eof() && actual == 0) {
- throw Exception("Cannot skip stream");
- }
- len -= n;
- }
- }
-};
-
-}
-
-class BufferCopyInInputStream : public SeekableInputStream {
- const size_t bufferSize_;
- uint8_t* const buffer_;
- unique_ptr<BufferCopyIn> in_;
- size_t byteCount_;
- uint8_t* next_;
- size_t available_;
-
- bool next(const uint8_t** data, size_t *size) {
- if (available_ == 0 && ! fill()) {
- return false;
- }
- *data = next_;
- *size = available_;
- next_ += available_;
- byteCount_ += available_;
- available_ = 0;
- return true;
- }
-
- void backup(size_t len) {
- next_ -= len;
- available_ += len;
- byteCount_ -= len;
- }
-
- void skip(size_t len) {
- while (len > 0) {
- if (available_ == 0) {
- in_->seek(len);
- byteCount_ += len;
- return;
- }
- size_t n = std::min(available_, len);
- available_ -= n;
- next_ += n;
- len -= n;
- byteCount_ += n;
- }
- }
-
- size_t byteCount() const { return byteCount_; }
-
- bool fill() {
- size_t n = 0;
- if (in_->read(buffer_, bufferSize_, n)) {
- next_ = buffer_;
- available_ = n;
- return true;
- }
- return false;
- }
-
- void seek(int64_t position) {
- // BufferCopyIn::seek is relative to byteCount_, whereas position is
- // absolute.
- in_->seek(position - byteCount_ - available_);
- byteCount_ = position;
- available_ = 0;
- }
-
-public:
- BufferCopyInInputStream(unique_ptr<BufferCopyIn> in, size_t bufferSize) :
- bufferSize_(bufferSize),
- buffer_(new uint8_t[bufferSize]),
- in_(std::move(in)),
- byteCount_(0),
- next_(buffer_),
- available_(0) { }
-
- ~BufferCopyInInputStream() {
- delete[] buffer_;
- }
-};
-
-namespace {
-struct BufferCopyOut {
- virtual ~BufferCopyOut() { }
- virtual void write(const uint8_t* b, size_t len) = 0;
-};
-
-struct FileBufferCopyOut : public BufferCopyOut {
-#ifdef _WIN32
- HANDLE h_;
- FileBufferCopyOut(const char* filename) :
- h_(::CreateFileA(filename, GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL)) {
- if (h_ == INVALID_HANDLE_VALUE) {
- throw Exception(boost::format("Cannot open file: %1%") % ::GetLastError());
- }
- }
-
- ~FileBufferCopyOut() {
- ::CloseHandle(h_);
- }
-
- void write(const uint8_t* b, size_t len) {
- while (len > 0) {
- DWORD dw = 0;
- if (! ::WriteFile(h_, b, len, &dw, NULL)) {
- throw Exception(boost::format("Cannot read file: %1%") % ::GetLastError());
- }
- b += dw;
- len -= dw;
- }
- }
-#else
- const int fd_;
-
- FileBufferCopyOut(const char* filename) :
- fd_(::open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644)) {
-
- if (fd_ < 0) {
- throw Exception(boost::format("Cannot open file: %1%") %
- ::strerror(errno));
- }
- }
-
- ~FileBufferCopyOut() {
- ::close(fd_);
- }
-
- void write(const uint8_t* b, size_t len) {
- if (::write(fd_, b, len) < 0) {
- throw Exception(boost::format("Cannot write file: %1%") %
- ::strerror(errno));
- }
- }
-#endif
-
-};
-
-struct OStreamBufferCopyOut : public BufferCopyOut {
- ostream& os_;
-
- OStreamBufferCopyOut(ostream& os) : os_(os) {
- }
-
- void write(const uint8_t* b, size_t len) {
- os_.write(reinterpret_cast<const char*>(b), len);
- }
-
-};
-
-}
-
-class BufferCopyOutputStream : public OutputStream {
- size_t bufferSize_;
- uint8_t* const buffer_;
- unique_ptr<BufferCopyOut> out_;
- uint8_t* next_;
- size_t available_;
- size_t byteCount_;
-
- // Invariant: byteCount_ == bytesWritten + bufferSize_ - available_;
- bool next(uint8_t** data, size_t* len) {
- if (available_ == 0) {
- flush();
- }
- *data = next_;
- *len = available_;
- next_ += available_;
- byteCount_ += available_;
- available_ = 0;
- return true;
- }
-
- void backup(size_t len) {
- available_ += len;
- next_ -= len;
- byteCount_ -= len;
- }
-
- uint64_t byteCount() const {
- return byteCount_;
- }
-
- void flush() {
- out_->write(buffer_, bufferSize_ - available_);
- next_ = buffer_;
- available_ = bufferSize_;
- }
-
-public:
- BufferCopyOutputStream(unique_ptr<BufferCopyOut> out, size_t bufferSize) :
- bufferSize_(bufferSize),
- buffer_(new uint8_t[bufferSize]),
- out_(std::move(out)),
- next_(buffer_),
- available_(bufferSize_), byteCount_(0) { }
-
- ~BufferCopyOutputStream() {
- delete[] buffer_;
- }
-};
-
-unique_ptr<InputStream> fileInputStream(const char* filename,
- size_t bufferSize)
-{
- unique_ptr<BufferCopyIn> in(new FileBufferCopyIn(filename));
- return unique_ptr<InputStream>( new BufferCopyInInputStream(std::move(in), bufferSize));
-}
-
-unique_ptr<SeekableInputStream> fileSeekableInputStream(const char* filename,
- size_t bufferSize)
-{
- unique_ptr<BufferCopyIn> in(new FileBufferCopyIn(filename));
- return unique_ptr<SeekableInputStream>( new BufferCopyInInputStream(std::move(in),
- bufferSize));
-}
-
-unique_ptr<InputStream> istreamInputStream(istream& is, size_t bufferSize)
-{
- unique_ptr<BufferCopyIn> in(new IStreamBufferCopyIn(is));
- return unique_ptr<InputStream>( new BufferCopyInInputStream(std::move(in), bufferSize));
-}
-
-unique_ptr<InputStream> nonSeekableIstreamInputStream(
- istream& is, size_t bufferSize)
-{
- unique_ptr<BufferCopyIn> in(new NonSeekableIStreamBufferCopyIn(is));
- return unique_ptr<InputStream>( new BufferCopyInInputStream(std::move(in), bufferSize));
-}
-
-unique_ptr<OutputStream> fileOutputStream(const char* filename,
- size_t bufferSize)
-{
- unique_ptr<BufferCopyOut> out(new FileBufferCopyOut(filename));
- return unique_ptr<OutputStream>(new BufferCopyOutputStream(std::move(out), bufferSize));
-}
-
-unique_ptr<OutputStream> ostreamOutputStream(ostream& os,
- size_t bufferSize)
-{
- unique_ptr<BufferCopyOut> out(new OStreamBufferCopyOut(os));
- return unique_ptr<OutputStream>(new BufferCopyOutputStream(std::move(out), bufferSize));
-}
-
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fstream>
+#include "Stream.hh"
+#ifndef _WIN32
+#include "unistd.h"
+#include "fcntl.h"
+#include "errno.h"
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+#else
+#include "Windows.h"
+
+#ifdef min
+#undef min
+#endif
+#endif
+
+using std::unique_ptr;
+using std::istream;
+using std::ostream;
+
+namespace avro {
+namespace {
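+// Minimal byte-source abstraction over a POSIX fd, Win32 HANDLE, or
+// std::istream; BufferCopyInInputStream layers buffering on top of it.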
+struct BufferCopyIn {
+ virtual ~BufferCopyIn() { }
+ virtual void seek(size_t len) = 0;
+ virtual bool read(uint8_t* b, size_t toRead, size_t& actual) = 0;
+
+};
+
+struct FileBufferCopyIn : public BufferCopyIn {
+#ifdef _WIN32
+ HANDLE h_;
+ FileBufferCopyIn(const char* filename) :
+ h_(::CreateFileA(filename, GENERIC_READ, 0, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL)) {
+ if (h_ == INVALID_HANDLE_VALUE) {
+ throw Exception(boost::format("Cannot open file: %1%") % ::GetLastError());
+ }
+ }
+
+ ~FileBufferCopyIn() {
+ ::CloseHandle(h_);
+ }
+
+ void seek(size_t len) {
+ if (::SetFilePointer(h_, len, NULL, FILE_CURRENT) == INVALID_SET_FILE_POINTER && ::GetLastError() != NO_ERROR) {
+ throw Exception(boost::format("Cannot skip file: %1%") % ::GetLastError());
+ }
+ }
+
+ bool read(uint8_t* b, size_t toRead, size_t& actual) {
+ DWORD dw = 0;
+ if (! ::ReadFile(h_, b, toRead, &dw, NULL)) {
+ throw Exception(boost::format("Cannot read file: %1%") % ::GetLastError());
+ }
+ actual = static_cast<size_t>(dw);
+ return actual != 0;
+ }
+#else
+ const int fd_;
+
+ FileBufferCopyIn(const char* filename) :
+ fd_(open(filename, O_RDONLY | O_BINARY)) {
+ if (fd_ < 0) {
+ throw Exception(boost::format("Cannot open file: %1%") %
+ ::strerror(errno));
+ }
+ }
+
+ ~FileBufferCopyIn() {
+ ::close(fd_);
+ }
+
+ void seek(size_t len) {
+ off_t r = ::lseek(fd_, len, SEEK_CUR);
+ if (r == static_cast<off_t>(-1)) {
+ throw Exception(boost::format("Cannot skip file: %1%") %
+ strerror(errno));
+ }
+ }
+
+ bool read(uint8_t* b, size_t toRead, size_t& actual) {
+ int n = ::read(fd_, b, toRead);
+ if (n > 0) {
+ actual = n;
+ return true;
+ }
+ return false;
+ }
+#endif
+
+};
+
+struct IStreamBufferCopyIn : public BufferCopyIn {
+ istream& is_;
+
+ IStreamBufferCopyIn(istream& is) : is_(is) {
+ }
+
+ void seek(size_t len) {
+ if (! is_.seekg(len, std::ios_base::cur)) {
+ throw Exception("Cannot skip stream");
+ }
+ }
+
+ bool read(uint8_t* b, size_t toRead, size_t& actual) {
+ is_.read(reinterpret_cast<char*>(b), toRead);
+ if (is_.bad()) {
+ return false;
+ }
+ actual = static_cast<size_t>(is_.gcount());
+ return (! is_.eof() || actual != 0);
+ }
+
+};
+
+struct NonSeekableIStreamBufferCopyIn : public IStreamBufferCopyIn {
+ NonSeekableIStreamBufferCopyIn(istream& is) : IStreamBufferCopyIn(is) { }
+
+ void seek(size_t len) {
+ const size_t bufSize = 4096;
+ uint8_t buf[bufSize];
+ while (len > 0) {
+ size_t n = std::min(len, bufSize);
+ is_.read(reinterpret_cast<char*>(buf), n);
+ if (is_.bad()) {
+ throw Exception("Cannot skip stream");
+ }
+ size_t actual = static_cast<size_t>(is_.gcount());
+ if (is_.eof() && actual == 0) {
+ throw Exception("Cannot skip stream");
+ }
+ len -= n;
+ }
+ }
+};
+
+}
+
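+// Buffering SeekableInputStream over a BufferCopyIn source; byteCount_
+// tracks the logical read position, and backup() simply returns bytes to
+// the in-memory buffer.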
+class BufferCopyInInputStream : public SeekableInputStream {
+ const size_t bufferSize_;
+ uint8_t* const buffer_;
+ unique_ptr<BufferCopyIn> in_;
+ size_t byteCount_;
+ uint8_t* next_;
+ size_t available_;
+
+ bool next(const uint8_t** data, size_t *size) {
+ if (available_ == 0 && ! fill()) {
+ return false;
+ }
+ *data = next_;
+ *size = available_;
+ next_ += available_;
+ byteCount_ += available_;
+ available_ = 0;
+ return true;
+ }
+
+ void backup(size_t len) {
+ next_ -= len;
+ available_ += len;
+ byteCount_ -= len;
+ }
+
+ void skip(size_t len) {
+ while (len > 0) {
+ if (available_ == 0) {
+ in_->seek(len);
+ byteCount_ += len;
+ return;
+ }
+ size_t n = std::min(available_, len);
+ available_ -= n;
+ next_ += n;
+ len -= n;
+ byteCount_ += n;
+ }
+ }
+
+ size_t byteCount() const { return byteCount_; }
+
+ bool fill() {
+ size_t n = 0;
+ if (in_->read(buffer_, bufferSize_, n)) {
+ next_ = buffer_;
+ available_ = n;
+ return true;
+ }
+ return false;
+ }
+
+ void seek(int64_t position) {
+ // BufferCopyIn::seek is relative to byteCount_, whereas position is
+ // absolute.
+ in_->seek(position - byteCount_ - available_);
+ byteCount_ = position;
+ available_ = 0;
+ }
+
+public:
+ BufferCopyInInputStream(unique_ptr<BufferCopyIn> in, size_t bufferSize) :
+ bufferSize_(bufferSize),
+ buffer_(new uint8_t[bufferSize]),
+ in_(std::move(in)),
+ byteCount_(0),
+ next_(buffer_),
+ available_(0) { }
+
+ ~BufferCopyInInputStream() {
+ delete[] buffer_;
+ }
+};
+
+namespace {
+struct BufferCopyOut {
+ virtual ~BufferCopyOut() { }
+ virtual void write(const uint8_t* b, size_t len) = 0;
+};
+
+struct FileBufferCopyOut : public BufferCopyOut {
+#ifdef _WIN32
+ HANDLE h_;
+ FileBufferCopyOut(const char* filename) :
+ h_(::CreateFileA(filename, GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL)) {
+ if (h_ == INVALID_HANDLE_VALUE) {
+ throw Exception(boost::format("Cannot open file: %1%") % ::GetLastError());
+ }
+ }
+
+ ~FileBufferCopyOut() {
+ ::CloseHandle(h_);
+ }
+
+ void write(const uint8_t* b, size_t len) {
+ while (len > 0) {
+ DWORD dw = 0;
+ if (! ::WriteFile(h_, b, len, &dw, NULL)) {
+ throw Exception(boost::format("Cannot read file: %1%") % ::GetLastError());
+ }
+ b += dw;
+ len -= dw;
+ }
+ }
+#else
+ const int fd_;
+
+ FileBufferCopyOut(const char* filename) :
+ fd_(::open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644)) {
+
+ if (fd_ < 0) {
+ throw Exception(boost::format("Cannot open file: %1%") %
+ ::strerror(errno));
+ }
+ }
+
+ ~FileBufferCopyOut() {
+ ::close(fd_);
+ }
+
+ void write(const uint8_t* b, size_t len) {
+ if (::write(fd_, b, len) < 0) {
+ throw Exception(boost::format("Cannot write file: %1%") %
+ ::strerror(errno));
+ }
+ }
+#endif
+
+};
+
+struct OStreamBufferCopyOut : public BufferCopyOut {
+ ostream& os_;
+
+ OStreamBufferCopyOut(ostream& os) : os_(os) {
+ }
+
+ void write(const uint8_t* b, size_t len) {
+ os_.write(reinterpret_cast<const char*>(b), len);
+ }
+
+};
+
+}
+
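+// Buffering OutputStream over a BufferCopyOut sink; flush() writes the
+// filled portion of the buffer and resets it for reuse.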
+class BufferCopyOutputStream : public OutputStream {
+ size_t bufferSize_;
+ uint8_t* const buffer_;
+ unique_ptr<BufferCopyOut> out_;
+ uint8_t* next_;
+ size_t available_;
+ size_t byteCount_;
+
+ // Invariant: byteCount_ == bytesWritten + bufferSize_ - available_;
+ bool next(uint8_t** data, size_t* len) {
+ if (available_ == 0) {
+ flush();
+ }
+ *data = next_;
+ *len = available_;
+ next_ += available_;
+ byteCount_ += available_;
+ available_ = 0;
+ return true;
+ }
+
+ void backup(size_t len) {
+ available_ += len;
+ next_ -= len;
+ byteCount_ -= len;
+ }
+
+ uint64_t byteCount() const {
+ return byteCount_;
+ }
+
+ void flush() {
+ out_->write(buffer_, bufferSize_ - available_);
+ next_ = buffer_;
+ available_ = bufferSize_;
+ }
+
+public:
+ BufferCopyOutputStream(unique_ptr<BufferCopyOut> out, size_t bufferSize) :
+ bufferSize_(bufferSize),
+ buffer_(new uint8_t[bufferSize]),
+ out_(std::move(out)),
+ next_(buffer_),
+ available_(bufferSize_), byteCount_(0) { }
+
+ ~BufferCopyOutputStream() {
+ delete[] buffer_;
+ }
+};
+
+unique_ptr<InputStream> fileInputStream(const char* filename,
+ size_t bufferSize)
+{
+ unique_ptr<BufferCopyIn> in(new FileBufferCopyIn(filename));
+ return unique_ptr<InputStream>( new BufferCopyInInputStream(std::move(in), bufferSize));
+}
+
+unique_ptr<SeekableInputStream> fileSeekableInputStream(const char* filename,
+ size_t bufferSize)
+{
+ unique_ptr<BufferCopyIn> in(new FileBufferCopyIn(filename));
+ return unique_ptr<SeekableInputStream>( new BufferCopyInInputStream(std::move(in),
+ bufferSize));
+}
+
+unique_ptr<InputStream> istreamInputStream(istream& is, size_t bufferSize)
+{
+ unique_ptr<BufferCopyIn> in(new IStreamBufferCopyIn(is));
+ return unique_ptr<InputStream>( new BufferCopyInInputStream(std::move(in), bufferSize));
+}
+
+unique_ptr<InputStream> nonSeekableIstreamInputStream(
+ istream& is, size_t bufferSize)
+{
+ unique_ptr<BufferCopyIn> in(new NonSeekableIStreamBufferCopyIn(is));
+ return unique_ptr<InputStream>( new BufferCopyInInputStream(std::move(in), bufferSize));
+}
+
+unique_ptr<OutputStream> fileOutputStream(const char* filename,
+ size_t bufferSize)
+{
+ unique_ptr<BufferCopyOut> out(new FileBufferCopyOut(filename));
+ return unique_ptr<OutputStream>(new BufferCopyOutputStream(std::move(out), bufferSize));
+}
+
+unique_ptr<OutputStream> ostreamOutputStream(ostream& os,
+ size_t bufferSize)
+{
+ unique_ptr<BufferCopyOut> out(new OStreamBufferCopyOut(os));
+ return unique_ptr<OutputStream>(new BufferCopyOutputStream(std::move(out), bufferSize));
+}
+
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/Generic.cc b/contrib/libs/apache/avro/impl/Generic.cc
index 8efb7e9ac46..5420e528c82 100644
--- a/contrib/libs/apache/avro/impl/Generic.cc
+++ b/contrib/libs/apache/avro/impl/Generic.cc
@@ -1,260 +1,260 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Generic.hh"
-#include <sstream>
-
-namespace avro {
-
-using std::string;
-using std::vector;
-using std::ostringstream;
-
-typedef vector<uint8_t> bytes;
-
-void GenericContainer::assertType(const NodePtr& schema, Type type) {
- if (schema->type() != type) {
- throw Exception(boost::format("Schema type %1 expected %2") %
- toString(schema->type()) % toString(type));
- }
-}
-
-GenericReader::GenericReader(const ValidSchema& s, const DecoderPtr& decoder) :
- schema_(s), isResolving_(dynamic_cast<ResolvingDecoder*>(&(*decoder)) != 0),
- decoder_(decoder)
-{
-}
-
-GenericReader::GenericReader(const ValidSchema& writerSchema,
- const ValidSchema& readerSchema, const DecoderPtr& decoder) :
- schema_(readerSchema),
- isResolving_(true),
- decoder_(resolvingDecoder(writerSchema, readerSchema, decoder))
-{
-}
-
-void GenericReader::read(GenericDatum& datum) const
-{
- datum = GenericDatum(schema_.root());
- read(datum, *decoder_, isResolving_);
-}
-
-void GenericReader::read(GenericDatum& datum, Decoder& d, bool isResolving)
-{
- if (datum.isUnion()) {
- datum.selectBranch(d.decodeUnionIndex());
- }
- switch (datum.type()) {
- case AVRO_NULL:
- d.decodeNull();
- break;
- case AVRO_BOOL:
- datum.value<bool>() = d.decodeBool();
- break;
- case AVRO_INT:
- datum.value<int32_t>() = d.decodeInt();
- break;
- case AVRO_LONG:
- datum.value<int64_t>() = d.decodeLong();
- break;
- case AVRO_FLOAT:
- datum.value<float>() = d.decodeFloat();
- break;
- case AVRO_DOUBLE:
- datum.value<double>() = d.decodeDouble();
- break;
- case AVRO_STRING:
- d.decodeString(datum.value<string>());
- break;
- case AVRO_BYTES:
- d.decodeBytes(datum.value<bytes>());
- break;
- case AVRO_FIXED:
- {
- GenericFixed& f = datum.value<GenericFixed>();
- d.decodeFixed(f.schema()->fixedSize(), f.value());
- }
- break;
- case AVRO_RECORD:
- {
- GenericRecord& r = datum.value<GenericRecord>();
- size_t c = r.schema()->leaves();
- if (isResolving) {
- std::vector<size_t> fo =
- static_cast<ResolvingDecoder&>(d).fieldOrder();
- for (size_t i = 0; i < c; ++i) {
- read(r.fieldAt(fo[i]), d, isResolving);
- }
- } else {
- for (size_t i = 0; i < c; ++i) {
- read(r.fieldAt(i), d, isResolving);
- }
- }
- }
- break;
- case AVRO_ENUM:
- datum.value<GenericEnum>().set(d.decodeEnum());
- break;
- case AVRO_ARRAY:
- {
- GenericArray& v = datum.value<GenericArray>();
- vector<GenericDatum>& r = v.value();
- const NodePtr& nn = v.schema()->leafAt(0);
- r.resize(0);
- size_t start = 0;
- for (size_t m = d.arrayStart(); m != 0; m = d.arrayNext()) {
- r.resize(r.size() + m);
- for (; start < r.size(); ++start) {
- r[start] = GenericDatum(nn);
- read(r[start], d, isResolving);
- }
- }
- }
- break;
- case AVRO_MAP:
- {
- GenericMap& v = datum.value<GenericMap>();
- GenericMap::Value& r = v.value();
- const NodePtr& nn = v.schema()->leafAt(1);
- r.resize(0);
- size_t start = 0;
- for (size_t m = d.mapStart(); m != 0; m = d.mapNext()) {
- r.resize(r.size() + m);
- for (; start < r.size(); ++start) {
- d.decodeString(r[start].first);
- r[start].second = GenericDatum(nn);
- read(r[start].second, d, isResolving);
- }
- }
- }
- break;
- default:
- throw Exception(boost::format("Unknown schema type %1%") %
- toString(datum.type()));
- }
-}
-
-void GenericReader::read(Decoder& d, GenericDatum& g, const ValidSchema& s)
-{
- g = GenericDatum(s);
- read(d, g);
-}
-
-void GenericReader::read(Decoder& d, GenericDatum& g)
-{
- read(g, d, dynamic_cast<ResolvingDecoder*>(&d) != 0);
-}
-
-GenericWriter::GenericWriter(const ValidSchema& s, const EncoderPtr& encoder) :
- schema_(s), encoder_(encoder)
-{
-}
-
-void GenericWriter::write(const GenericDatum& datum) const
-{
- write(datum, *encoder_);
-}
-
-void GenericWriter::write(const GenericDatum& datum, Encoder& e)
-{
- if (datum.isUnion()) {
- e.encodeUnionIndex(datum.unionBranch());
- }
- switch (datum.type()) {
- case AVRO_NULL:
- e.encodeNull();
- break;
- case AVRO_BOOL:
- e.encodeBool(datum.value<bool>());
- break;
- case AVRO_INT:
- e.encodeInt(datum.value<int32_t>());
- break;
- case AVRO_LONG:
- e.encodeLong(datum.value<int64_t>());
- break;
- case AVRO_FLOAT:
- e.encodeFloat(datum.value<float>());
- break;
- case AVRO_DOUBLE:
- e.encodeDouble(datum.value<double>());
- break;
- case AVRO_STRING:
- e.encodeString(datum.value<string>());
- break;
- case AVRO_BYTES:
- e.encodeBytes(datum.value<bytes>());
- break;
- case AVRO_FIXED:
- e.encodeFixed(datum.value<GenericFixed>().value());
- break;
- case AVRO_RECORD:
- {
- const GenericRecord& r = datum.value<GenericRecord>();
- size_t c = r.schema()->leaves();
- for (size_t i = 0; i < c; ++i) {
- write(r.fieldAt(i), e);
- }
- }
- break;
- case AVRO_ENUM:
- e.encodeEnum(datum.value<GenericEnum>().value());
- break;
- case AVRO_ARRAY:
- {
- const GenericArray::Value& r = datum.value<GenericArray>().value();
- e.arrayStart();
- if (! r.empty()) {
- e.setItemCount(r.size());
- for (GenericArray::Value::const_iterator it = r.begin();
- it != r.end(); ++it) {
- e.startItem();
- write(*it, e);
- }
- }
- e.arrayEnd();
- }
- break;
- case AVRO_MAP:
- {
- const GenericMap::Value& r = datum.value<GenericMap>().value();
- e.mapStart();
- if (! r.empty()) {
- e.setItemCount(r.size());
- for (GenericMap::Value::const_iterator it = r.begin();
- it != r.end(); ++it) {
- e.startItem();
- e.encodeString(it->first);
- write(it->second, e);
- }
- }
- e.mapEnd();
- }
- break;
- default:
- throw Exception(boost::format("Unknown schema type %1%") %
- toString(datum.type()));
- }
-}
-
-void GenericWriter::write(Encoder& e, const GenericDatum& g)
-{
- write(g, e);
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Generic.hh"
+#include <sstream>
+
+namespace avro {
+
+using std::string;
+using std::vector;
+using std::ostringstream;
+
+typedef vector<uint8_t> bytes;
+
+void GenericContainer::assertType(const NodePtr& schema, Type type) {
+ if (schema->type() != type) {
+ throw Exception(boost::format("Schema type %1 expected %2") %
+ toString(schema->type()) % toString(type));
+ }
+}
+
+GenericReader::GenericReader(const ValidSchema& s, const DecoderPtr& decoder) :
+ schema_(s), isResolving_(dynamic_cast<ResolvingDecoder*>(&(*decoder)) != 0),
+ decoder_(decoder)
+{
+}
+
+GenericReader::GenericReader(const ValidSchema& writerSchema,
+ const ValidSchema& readerSchema, const DecoderPtr& decoder) :
+ schema_(readerSchema),
+ isResolving_(true),
+ decoder_(resolvingDecoder(writerSchema, readerSchema, decoder))
+{
+}
+
+void GenericReader::read(GenericDatum& datum) const
+{
+ datum = GenericDatum(schema_.root());
+ read(datum, *decoder_, isResolving_);
+}
+
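+// Decodes one datum. With a resolving decoder, record fields arrive in the
+// writer's order, so fieldOrder() supplies the reader-side field indices.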
+void GenericReader::read(GenericDatum& datum, Decoder& d, bool isResolving)
+{
+ if (datum.isUnion()) {
+ datum.selectBranch(d.decodeUnionIndex());
+ }
+ switch (datum.type()) {
+ case AVRO_NULL:
+ d.decodeNull();
+ break;
+ case AVRO_BOOL:
+ datum.value<bool>() = d.decodeBool();
+ break;
+ case AVRO_INT:
+ datum.value<int32_t>() = d.decodeInt();
+ break;
+ case AVRO_LONG:
+ datum.value<int64_t>() = d.decodeLong();
+ break;
+ case AVRO_FLOAT:
+ datum.value<float>() = d.decodeFloat();
+ break;
+ case AVRO_DOUBLE:
+ datum.value<double>() = d.decodeDouble();
+ break;
+ case AVRO_STRING:
+ d.decodeString(datum.value<string>());
+ break;
+ case AVRO_BYTES:
+ d.decodeBytes(datum.value<bytes>());
+ break;
+ case AVRO_FIXED:
+ {
+ GenericFixed& f = datum.value<GenericFixed>();
+ d.decodeFixed(f.schema()->fixedSize(), f.value());
+ }
+ break;
+ case AVRO_RECORD:
+ {
+ GenericRecord& r = datum.value<GenericRecord>();
+ size_t c = r.schema()->leaves();
+ if (isResolving) {
+ std::vector<size_t> fo =
+ static_cast<ResolvingDecoder&>(d).fieldOrder();
+ for (size_t i = 0; i < c; ++i) {
+ read(r.fieldAt(fo[i]), d, isResolving);
+ }
+ } else {
+ for (size_t i = 0; i < c; ++i) {
+ read(r.fieldAt(i), d, isResolving);
+ }
+ }
+ }
+ break;
+ case AVRO_ENUM:
+ datum.value<GenericEnum>().set(d.decodeEnum());
+ break;
+ case AVRO_ARRAY:
+ {
+ GenericArray& v = datum.value<GenericArray>();
+ vector<GenericDatum>& r = v.value();
+ const NodePtr& nn = v.schema()->leafAt(0);
+ r.resize(0);
+ size_t start = 0;
+ for (size_t m = d.arrayStart(); m != 0; m = d.arrayNext()) {
+ r.resize(r.size() + m);
+ for (; start < r.size(); ++start) {
+ r[start] = GenericDatum(nn);
+ read(r[start], d, isResolving);
+ }
+ }
+ }
+ break;
+ case AVRO_MAP:
+ {
+ GenericMap& v = datum.value<GenericMap>();
+ GenericMap::Value& r = v.value();
+ const NodePtr& nn = v.schema()->leafAt(1);
+ r.resize(0);
+ size_t start = 0;
+ for (size_t m = d.mapStart(); m != 0; m = d.mapNext()) {
+ r.resize(r.size() + m);
+ for (; start < r.size(); ++start) {
+ d.decodeString(r[start].first);
+ r[start].second = GenericDatum(nn);
+ read(r[start].second, d, isResolving);
+ }
+ }
+ }
+ break;
+ default:
+ throw Exception(boost::format("Unknown schema type %1%") %
+ toString(datum.type()));
+ }
+}
+
+void GenericReader::read(Decoder& d, GenericDatum& g, const ValidSchema& s)
+{
+ g = GenericDatum(s);
+ read(d, g);
+}
+
+void GenericReader::read(Decoder& d, GenericDatum& g)
+{
+ read(g, d, dynamic_cast<ResolvingDecoder*>(&d) != 0);
+}
+
+GenericWriter::GenericWriter(const ValidSchema& s, const EncoderPtr& encoder) :
+ schema_(s), encoder_(encoder)
+{
+}
+
+void GenericWriter::write(const GenericDatum& datum) const
+{
+ write(datum, *encoder_);
+}
+
+void GenericWriter::write(const GenericDatum& datum, Encoder& e)
+{
+ if (datum.isUnion()) {
+ e.encodeUnionIndex(datum.unionBranch());
+ }
+ switch (datum.type()) {
+ case AVRO_NULL:
+ e.encodeNull();
+ break;
+ case AVRO_BOOL:
+ e.encodeBool(datum.value<bool>());
+ break;
+ case AVRO_INT:
+ e.encodeInt(datum.value<int32_t>());
+ break;
+ case AVRO_LONG:
+ e.encodeLong(datum.value<int64_t>());
+ break;
+ case AVRO_FLOAT:
+ e.encodeFloat(datum.value<float>());
+ break;
+ case AVRO_DOUBLE:
+ e.encodeDouble(datum.value<double>());
+ break;
+ case AVRO_STRING:
+ e.encodeString(datum.value<string>());
+ break;
+ case AVRO_BYTES:
+ e.encodeBytes(datum.value<bytes>());
+ break;
+ case AVRO_FIXED:
+ e.encodeFixed(datum.value<GenericFixed>().value());
+ break;
+ case AVRO_RECORD:
+ {
+ const GenericRecord& r = datum.value<GenericRecord>();
+ size_t c = r.schema()->leaves();
+ for (size_t i = 0; i < c; ++i) {
+ write(r.fieldAt(i), e);
+ }
+ }
+ break;
+ case AVRO_ENUM:
+ e.encodeEnum(datum.value<GenericEnum>().value());
+ break;
+ case AVRO_ARRAY:
+ {
+ const GenericArray::Value& r = datum.value<GenericArray>().value();
+ e.arrayStart();
+ if (! r.empty()) {
+ e.setItemCount(r.size());
+ for (GenericArray::Value::const_iterator it = r.begin();
+ it != r.end(); ++it) {
+ e.startItem();
+ write(*it, e);
+ }
+ }
+ e.arrayEnd();
+ }
+ break;
+ case AVRO_MAP:
+ {
+ const GenericMap::Value& r = datum.value<GenericMap>().value();
+ e.mapStart();
+ if (! r.empty()) {
+ e.setItemCount(r.size());
+ for (GenericMap::Value::const_iterator it = r.begin();
+ it != r.end(); ++it) {
+ e.startItem();
+ e.encodeString(it->first);
+ write(it->second, e);
+ }
+ }
+ e.mapEnd();
+ }
+ break;
+ default:
+ throw Exception(boost::format("Unknown schema type %1%") %
+ toString(datum.type()));
+ }
+}
+
+void GenericWriter::write(Encoder& e, const GenericDatum& g)
+{
+ write(g, e);
+}
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/GenericDatum.cc b/contrib/libs/apache/avro/impl/GenericDatum.cc
index cdf9006eef2..855e1c4a55b 100644
--- a/contrib/libs/apache/avro/impl/GenericDatum.cc
+++ b/contrib/libs/apache/avro/impl/GenericDatum.cc
@@ -1,105 +1,105 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "GenericDatum.hh"
-#include "NodeImpl.hh"
-
-using std::string;
-using std::vector;
-
-namespace avro {
-
-GenericDatum::GenericDatum(const ValidSchema& schema) :
- type_(schema.root()->type()),
- logicalType_(schema.root()->logicalType())
-{
- init(schema.root());
-}
-
-GenericDatum::GenericDatum(const NodePtr& schema) :
- type_(schema->type()),
- logicalType_(schema->logicalType())
-{
- init(schema);
-}
-
-void GenericDatum::init(const NodePtr& schema)
-{
- NodePtr sc = schema;
- if (type_ == AVRO_SYMBOLIC) {
- sc = resolveSymbol(schema);
- type_ = sc->type();
- logicalType_ = sc->logicalType();
- }
- switch (type_) {
- case AVRO_NULL:
- break;
- case AVRO_BOOL:
- value_ = bool();
- break;
- case AVRO_INT:
- value_ = int32_t();
- break;
- case AVRO_LONG:
- value_ = int64_t();
- break;
- case AVRO_FLOAT:
- value_ = float();
- break;
- case AVRO_DOUBLE:
- value_ = double();
- break;
- case AVRO_STRING:
- value_ = string();
- break;
- case AVRO_BYTES:
- value_ = vector<uint8_t>();
- break;
- case AVRO_FIXED:
- value_ = GenericFixed(sc);
- break;
- case AVRO_RECORD:
- value_ = GenericRecord(sc);
- break;
- case AVRO_ENUM:
- value_ = GenericEnum(sc);
- break;
- case AVRO_ARRAY:
- value_ = GenericArray(sc);
- break;
- case AVRO_MAP:
- value_ = GenericMap(sc);
- break;
- case AVRO_UNION:
- value_ = GenericUnion(sc);
- break;
- default:
- throw Exception(boost::format("Unknown schema type %1%") %
- toString(type_));
- }
-}
-
-GenericRecord::GenericRecord(const NodePtr& schema) :
- GenericContainer(AVRO_RECORD, schema) {
- fields_.resize(schema->leaves());
- for (size_t i = 0; i < schema->leaves(); ++i) {
- fields_[i] = GenericDatum(schema->leafAt(i));
- }
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenericDatum.hh"
+#include "NodeImpl.hh"
+
+using std::string;
+using std::vector;
+
+namespace avro {
+
+GenericDatum::GenericDatum(const ValidSchema& schema) :
+ type_(schema.root()->type()),
+ logicalType_(schema.root()->logicalType())
+{
+ init(schema.root());
+}
+
+GenericDatum::GenericDatum(const NodePtr& schema) :
+ type_(schema->type()),
+ logicalType_(schema->logicalType())
+{
+ init(schema);
+}
+
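+// Default-initializes value_ for the schema's type, first resolving any
+// symbolic reference to the node it names.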
+void GenericDatum::init(const NodePtr& schema)
+{
+ NodePtr sc = schema;
+ if (type_ == AVRO_SYMBOLIC) {
+ sc = resolveSymbol(schema);
+ type_ = sc->type();
+ logicalType_ = sc->logicalType();
+ }
+ switch (type_) {
+ case AVRO_NULL:
+ break;
+ case AVRO_BOOL:
+ value_ = bool();
+ break;
+ case AVRO_INT:
+ value_ = int32_t();
+ break;
+ case AVRO_LONG:
+ value_ = int64_t();
+ break;
+ case AVRO_FLOAT:
+ value_ = float();
+ break;
+ case AVRO_DOUBLE:
+ value_ = double();
+ break;
+ case AVRO_STRING:
+ value_ = string();
+ break;
+ case AVRO_BYTES:
+ value_ = vector<uint8_t>();
+ break;
+ case AVRO_FIXED:
+ value_ = GenericFixed(sc);
+ break;
+ case AVRO_RECORD:
+ value_ = GenericRecord(sc);
+ break;
+ case AVRO_ENUM:
+ value_ = GenericEnum(sc);
+ break;
+ case AVRO_ARRAY:
+ value_ = GenericArray(sc);
+ break;
+ case AVRO_MAP:
+ value_ = GenericMap(sc);
+ break;
+ case AVRO_UNION:
+ value_ = GenericUnion(sc);
+ break;
+ default:
+ throw Exception(boost::format("Unknown schema type %1%") %
+ toString(type_));
+ }
+}
+
+GenericRecord::GenericRecord(const NodePtr& schema) :
+ GenericContainer(AVRO_RECORD, schema) {
+ fields_.resize(schema->leaves());
+ for (size_t i = 0; i < schema->leaves(); ++i) {
+ fields_[i] = GenericDatum(schema->leafAt(i));
+ }
+}
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/LogicalType.cc b/contrib/libs/apache/avro/impl/LogicalType.cc
index a0d9cc3b6fe..9c755903922 100644
--- a/contrib/libs/apache/avro/impl/LogicalType.cc
+++ b/contrib/libs/apache/avro/impl/LogicalType.cc
@@ -1,84 +1,84 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Exception.hh"
-#include "LogicalType.hh"
-
-namespace avro {
-
-LogicalType::LogicalType(Type type)
- : type_(type), precision_(0), scale_(0) {}
-
-LogicalType::Type LogicalType::type() const {
- return type_;
-}
-
-void LogicalType::setPrecision(int precision) {
- if (type_ != DECIMAL) {
- throw Exception("Only logical type DECIMAL can have precision");
- }
- if (precision <= 0) {
- throw Exception(boost::format("Precision cannot be: %1%") % precision);
- }
- precision_ = precision;
-}
-
-void LogicalType::setScale(int scale) {
- if (type_ != DECIMAL) {
- throw Exception("Only logical type DECIMAL can have scale");
- }
- if (scale < 0) {
- throw Exception(boost::format("Scale cannot be: %1%") % scale);
- }
- scale_ = scale;
-}
-
-void LogicalType::printJson(std::ostream& os) const {
- switch (type_) {
- case LogicalType::NONE:
- break;
- case LogicalType::DECIMAL:
- os << "\"logicalType\": \"decimal\"";
- os << ", \"precision\": " << precision_;
- os << ", \"scale\": " << scale_;
- break;
- case DATE:
- os << "\"logicalType\": \"date\"";
- break;
- case TIME_MILLIS:
- os << "\"logicalType\": \"time-millis\"";
- break;
- case TIME_MICROS:
- os << "\"logicalType\": \"time-micros\"";
- break;
- case TIMESTAMP_MILLIS:
- os << "\"logicalType\": \"timestamp-millis\"";
- break;
- case TIMESTAMP_MICROS:
- os << "\"logicalType\": \"timestamp-micros\"";
- break;
- case DURATION:
- os << "\"logicalType\": \"duration\"";
- break;
- case UUID:
- os << "\"logicalType\": \"uuid\"";
- break;
- }
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Exception.hh"
+#include "LogicalType.hh"
+
+namespace avro {
+
+LogicalType::LogicalType(Type type)
+ : type_(type), precision_(0), scale_(0) {}
+
+LogicalType::Type LogicalType::type() const {
+ return type_;
+}
+
+void LogicalType::setPrecision(int precision) {
+ if (type_ != DECIMAL) {
+ throw Exception("Only logical type DECIMAL can have precision");
+ }
+ if (precision <= 0) {
+ throw Exception(boost::format("Precision cannot be: %1%") % precision);
+ }
+ precision_ = precision;
+}
+
+void LogicalType::setScale(int scale) {
+ if (type_ != DECIMAL) {
+ throw Exception("Only logical type DECIMAL can have scale");
+ }
+ if (scale < 0) {
+ throw Exception(boost::format("Scale cannot be: %1%") % scale);
+ }
+ scale_ = scale;
+}
+
+void LogicalType::printJson(std::ostream& os) const {
+ switch (type_) {
+ case LogicalType::NONE:
+ break;
+ case LogicalType::DECIMAL:
+ os << "\"logicalType\": \"decimal\"";
+ os << ", \"precision\": " << precision_;
+ os << ", \"scale\": " << scale_;
+ break;
+ case DATE:
+ os << "\"logicalType\": \"date\"";
+ break;
+ case TIME_MILLIS:
+ os << "\"logicalType\": \"time-millis\"";
+ break;
+ case TIME_MICROS:
+ os << "\"logicalType\": \"time-micros\"";
+ break;
+ case TIMESTAMP_MILLIS:
+ os << "\"logicalType\": \"timestamp-millis\"";
+ break;
+ case TIMESTAMP_MICROS:
+ os << "\"logicalType\": \"timestamp-micros\"";
+ break;
+ case DURATION:
+ os << "\"logicalType\": \"duration\"";
+ break;
+ case UUID:
+ os << "\"logicalType\": \"uuid\"";
+ break;
+ }
+}
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/Node.cc b/contrib/libs/apache/avro/impl/Node.cc
index bb510cc1471..87b3652b4d9 100644
--- a/contrib/libs/apache/avro/impl/Node.cc
+++ b/contrib/libs/apache/avro/impl/Node.cc
@@ -1,161 +1,161 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cmath>
-
-#include "Node.hh"
-
-namespace avro {
-
-using std::string;
-
-Node::~Node()
-{ }
-
-Name::Name(const std::string& name)
-{
- fullname(name);
-}
-
-const string Name::fullname() const
-{
- return (ns_.empty()) ? simpleName_ : ns_ + "." + simpleName_;
-}
-
-void Name::fullname(const string& name)
-{
- string::size_type n = name.find_last_of('.');
- if (n == string::npos) {
- simpleName_ = name;
- ns_.clear();
- } else {
- ns_ = name.substr(0, n);
- simpleName_ = name.substr(n + 1);
- }
- check();
-}
-
-bool Name::operator < (const Name& n) const
-{
- return (ns_ < n.ns_) ? true :
- (n.ns_ < ns_) ? false :
- (simpleName_ < n.simpleName_);
-}
-
-static bool invalidChar1(char c)
-{
- return !isalnum(c) && c != '_' && c != '.' && c != '$';
-}
-
-static bool invalidChar2(char c)
-{
- return !isalnum(c) && c != '_';
-}
-
-void Name::check() const
-{
- if (! ns_.empty() && (ns_[0] == '.' || ns_[ns_.size() - 1] == '.' || std::find_if(ns_.begin(), ns_.end(), invalidChar1) != ns_.end())) {
- throw Exception("Invalid namespace: " + ns_);
- }
- if (simpleName_.empty() || std::find_if(simpleName_.begin(), simpleName_.end(), invalidChar2) != simpleName_.end()) {
- throw Exception("Invalid name: " + simpleName_);
- }
-}
-
-bool Name::operator == (const Name& n) const
-{
- return ns_ == n.ns_ && simpleName_ == n.simpleName_;
-}
-
-void Node::setLogicalType(LogicalType logicalType) {
- checkLock();
-
- // Check that the logical type is applicable to the node type.
- switch (logicalType.type()) {
- case LogicalType::NONE:
- break;
- case LogicalType::DECIMAL: {
- if (type_ != AVRO_BYTES && type_ != AVRO_FIXED) {
- throw Exception("DECIMAL logical type can annotate "
- "only BYTES or FIXED type");
- }
- if (type_ == AVRO_FIXED) {
- // Max precision that can be supported by the current size of
- // the FIXED type.
- long maxPrecision = floor(log10(2.0) * (8.0 * fixedSize() - 1));
- if (logicalType.precision() > maxPrecision) {
- throw Exception(
- boost::format(
- "DECIMAL precision %1% is too large for the "
- "FIXED type of size %2%, precision cannot be "
- "larget than %3%") % logicalType.precision() %
- fixedSize() % maxPrecision);
- }
- }
- if (logicalType.scale() > logicalType.precision()) {
- throw Exception("DECIMAL scale cannot exceed precision");
- }
- break;
- }
- case LogicalType::DATE:
- if (type_ != AVRO_INT) {
- throw Exception("DATE logical type can only annotate INT type");
- }
- break;
- case LogicalType::TIME_MILLIS:
- if (type_ != AVRO_INT) {
- throw Exception("TIME-MILLIS logical type can only annotate "
- "INT type");
- }
- break;
- case LogicalType::TIME_MICROS:
- if (type_ != AVRO_LONG) {
- throw Exception("TIME-MICROS logical type can only annotate "
- "LONG type");
- }
- break;
- case LogicalType::TIMESTAMP_MILLIS:
- if (type_ != AVRO_LONG) {
- throw Exception("TIMESTAMP-MILLIS logical type can only annotate "
- "LONG type");
- }
- break;
- case LogicalType::TIMESTAMP_MICROS:
- if (type_ != AVRO_LONG) {
- throw Exception("TIMESTAMP-MICROS logical type can only annotate "
- "LONG type");
- }
- break;
- case LogicalType::DURATION:
- if (type_ != AVRO_FIXED || fixedSize() != 12) {
- throw Exception("DURATION logical type can only annotate "
- "FIXED type of size 12");
- }
- break;
- case LogicalType::UUID:
- if (type_ != AVRO_STRING) {
- throw Exception("UUID logical type can only annotate "
- "STRING type");
- }
- break;
- }
-
- logicalType_ = logicalType;
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cmath>
+
+#include "Node.hh"
+
+namespace avro {
+
+using std::string;
+
+Node::~Node()
+{ }
+
+Name::Name(const std::string& name)
+{
+ fullname(name);
+}
+
+const string Name::fullname() const
+{
+ return (ns_.empty()) ? simpleName_ : ns_ + "." + simpleName_;
+}
+
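+// Splits a dotted fullname at the last '.' into namespace and simple name;
+// a name without a dot has an empty namespace.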
+void Name::fullname(const string& name)
+{
+ string::size_type n = name.find_last_of('.');
+ if (n == string::npos) {
+ simpleName_ = name;
+ ns_.clear();
+ } else {
+ ns_ = name.substr(0, n);
+ simpleName_ = name.substr(n + 1);
+ }
+ check();
+}
+
+bool Name::operator < (const Name& n) const
+{
+ return (ns_ < n.ns_) ? true :
+ (n.ns_ < ns_) ? false :
+ (simpleName_ < n.simpleName_);
+}
+
+static bool invalidChar1(char c)
+{
+ return !isalnum(c) && c != '_' && c != '.' && c != '$';
+}
+
+static bool invalidChar2(char c)
+{
+ return !isalnum(c) && c != '_';
+}
+
+void Name::check() const
+{
+ if (! ns_.empty() && (ns_[0] == '.' || ns_[ns_.size() - 1] == '.' || std::find_if(ns_.begin(), ns_.end(), invalidChar1) != ns_.end())) {
+ throw Exception("Invalid namespace: " + ns_);
+ }
+ if (simpleName_.empty() || std::find_if(simpleName_.begin(), simpleName_.end(), invalidChar2) != simpleName_.end()) {
+ throw Exception("Invalid name: " + simpleName_);
+ }
+}
+
+bool Name::operator == (const Name& n) const
+{
+ return ns_ == n.ns_ && simpleName_ == n.simpleName_;
+}
+
+void Node::setLogicalType(LogicalType logicalType) {
+ checkLock();
+
+ // Check that the logical type is applicable to the node type.
+ switch (logicalType.type()) {
+ case LogicalType::NONE:
+ break;
+ case LogicalType::DECIMAL: {
+ if (type_ != AVRO_BYTES && type_ != AVRO_FIXED) {
+ throw Exception("DECIMAL logical type can annotate "
+ "only BYTES or FIXED type");
+ }
+ if (type_ == AVRO_FIXED) {
+ // Max precision that can be supported by the current size of
+ // the FIXED type.
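+ // An n-byte two's-complement value holds magnitudes below 2^(8n-1),
+ // i.e. at least floor((8n-1) * log10(2)) full decimal digits.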
+ long maxPrecision = floor(log10(2.0) * (8.0 * fixedSize() - 1));
+ if (logicalType.precision() > maxPrecision) {
+ throw Exception(
+ boost::format(
+ "DECIMAL precision %1% is too large for the "
+ "FIXED type of size %2%, precision cannot be "
+ "larget than %3%") % logicalType.precision() %
+ fixedSize() % maxPrecision);
+ }
+ }
+ if (logicalType.scale() > logicalType.precision()) {
+ throw Exception("DECIMAL scale cannot exceed precision");
+ }
+ break;
+ }
+ case LogicalType::DATE:
+ if (type_ != AVRO_INT) {
+ throw Exception("DATE logical type can only annotate INT type");
+ }
+ break;
+ case LogicalType::TIME_MILLIS:
+ if (type_ != AVRO_INT) {
+ throw Exception("TIME-MILLIS logical type can only annotate "
+ "INT type");
+ }
+ break;
+ case LogicalType::TIME_MICROS:
+ if (type_ != AVRO_LONG) {
+ throw Exception("TIME-MICROS logical type can only annotate "
+ "LONG type");
+ }
+ break;
+ case LogicalType::TIMESTAMP_MILLIS:
+ if (type_ != AVRO_LONG) {
+ throw Exception("TIMESTAMP-MILLIS logical type can only annotate "
+ "LONG type");
+ }
+ break;
+ case LogicalType::TIMESTAMP_MICROS:
+ if (type_ != AVRO_LONG) {
+ throw Exception("TIMESTAMP-MICROS logical type can only annotate "
+ "LONG type");
+ }
+ break;
+ case LogicalType::DURATION:
+ if (type_ != AVRO_FIXED || fixedSize() != 12) {
+ throw Exception("DURATION logical type can only annotate "
+ "FIXED type of size 12");
+ }
+ break;
+ case LogicalType::UUID:
+ if (type_ != AVRO_STRING) {
+ throw Exception("UUID logical type can only annotate "
+ "STRING type");
+ }
+ break;
+ }
+
+ logicalType_ = logicalType;
+}
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/NodeImpl.cc b/contrib/libs/apache/avro/impl/NodeImpl.cc
index 4a0acb92c0b..124152c4eec 100644
--- a/contrib/libs/apache/avro/impl/NodeImpl.cc
+++ b/contrib/libs/apache/avro/impl/NodeImpl.cc
@@ -1,547 +1,547 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include <sstream>
-#include <iomanip>
-#include <boost/algorithm/string/replace.hpp>
-#include "NodeImpl.hh"
-
-
-using std::string;
-namespace avro {
-
-namespace {
-
-// Escape string for serialization.
-string escape(const string &unescaped) {
- string s;
- s.reserve(unescaped.length());
- for (std::string::const_iterator it = unescaped.begin(); it != unescaped.end(); ++it) {
- char c = *it;
- switch (c) {
- case '\\':
- case '"':
- case '/':
- s += '\\';
- s += c;
- break;
- case '\b':
- s += '\\';
- s += 'b';
- break;
- case '\f':
- s += '\f';
- break;
- case '\n':
- s += '\\';
- s += 'n';
- break;
- case '\r':
- s += '\\';
- s += 'r';
- break;
- case '\t':
- s += '\\';
- s += 't';
- break;
- default:
- if (!std::iscntrl(c, std::locale::classic())) {
- s += c;
- continue;
- }
- s += intToHex(static_cast<unsigned int>(c));
- break;
- }
- }
- return s;
-}
-
-// Wrap an indentation in a struct for ostream operator<<
-struct indent {
- indent(int depth) :
- d(depth)
- { }
- int d;
-};
-
-/// ostream operator for indent
-std::ostream& operator <<(std::ostream &os, indent x)
-{
- static const string spaces(" ");
- while (x.d--) {
- os << spaces;
- }
- return os;
-}
-
-} // anonymous namespace
-
-const int kByteStringSize = 6;
-
-SchemaResolution
-NodePrimitive::resolve(const Node &reader) const
-{
- if (type() == reader.type()) {
- return RESOLVE_MATCH;
- }
-
- switch ( type() ) {
-
- case AVRO_INT:
-
- if ( reader.type() == AVRO_LONG ) {
- return RESOLVE_PROMOTABLE_TO_LONG;
- }
-
- // fall-through intentional
-
- case AVRO_LONG:
-
- if (reader.type() == AVRO_FLOAT) {
- return RESOLVE_PROMOTABLE_TO_FLOAT;
- }
-
- // fall-through intentional
-
- case AVRO_FLOAT:
-
- if (reader.type() == AVRO_DOUBLE) {
- return RESOLVE_PROMOTABLE_TO_DOUBLE;
- }
-
- default:
- break;
- }
-
- return furtherResolution(reader);
-}
-
-SchemaResolution
-NodeRecord::resolve(const Node &reader) const
-{
- if (reader.type() == AVRO_RECORD) {
- if (name() == reader.name()) {
- return RESOLVE_MATCH;
- }
- }
- return furtherResolution(reader);
-}
-
-SchemaResolution
-NodeEnum::resolve(const Node &reader) const
-{
- if (reader.type() == AVRO_ENUM) {
- return (name() == reader.name()) ? RESOLVE_MATCH : RESOLVE_NO_MATCH;
- }
- return furtherResolution(reader);
-}
-
-SchemaResolution
-NodeArray::resolve(const Node &reader) const
-{
- if (reader.type() == AVRO_ARRAY) {
- const NodePtr &arrayType = leafAt(0);
- return arrayType->resolve(*reader.leafAt(0));
- }
- return furtherResolution(reader);
-}
-
-SchemaResolution
-NodeMap::resolve(const Node &reader) const
-{
- if (reader.type() == AVRO_MAP) {
- const NodePtr &mapType = leafAt(1);
- return mapType->resolve(*reader.leafAt(1));
- }
- return furtherResolution(reader);
-}
-
-SchemaResolution
-NodeUnion::resolve(const Node &reader) const
-{
-
- // If the writer is union, resolution only needs to occur when the selected
- // type of the writer is known, so this function is not very helpful.
- //
- // In this case, this function returns if there is a possible match given
- // any writer type, so just search type by type returning the best match
- // found.
-
- SchemaResolution match = RESOLVE_NO_MATCH;
- for (size_t i=0; i < leaves(); ++i) {
- const NodePtr &node = leafAt(i);
- SchemaResolution thisMatch = node->resolve(reader);
- if (thisMatch == RESOLVE_MATCH) {
- match = thisMatch;
- break;
- }
- if (match == RESOLVE_NO_MATCH) {
- match = thisMatch;
- }
- }
- return match;
-}
-
-SchemaResolution
-NodeFixed::resolve(const Node &reader) const
-{
- if (reader.type() == AVRO_FIXED) {
- return (
- (reader.fixedSize() == fixedSize()) &&
- (reader.name() == name())
- ) ?
- RESOLVE_MATCH : RESOLVE_NO_MATCH;
- }
- return furtherResolution(reader);
-}
-
-SchemaResolution
-NodeSymbolic::resolve(const Node &reader) const
-{
- const NodePtr &node = leafAt(0);
- return node->resolve(reader);
-}
-
-void
-NodePrimitive::printJson(std::ostream &os, int depth) const
-{
- bool hasLogicalType = logicalType().type() != LogicalType::NONE;
-
- if (hasLogicalType) {
- os << "{\n" << indent(depth) << "\"type\": ";
- }
-
- os << '\"' << type() << '\"';
-
- if (hasLogicalType) {
- os << ",\n" << indent(depth);
- logicalType().printJson(os);
- os << "\n}";
- }
- if (getDoc().size()) {
- os << ",\n" << indent(depth) << "\"doc\": \""
- << escape(getDoc()) << "\"";
- }
-}
-
-void
-NodeSymbolic::printJson(std::ostream &os, int depth) const
-{
- os << '\"' << nameAttribute_.get() << '\"';
- if (getDoc().size()) {
- os << ",\n" << indent(depth) << "\"doc\": \""
- << escape(getDoc()) << "\"";
- }
-}
-
-static void printName(std::ostream& os, const Name& n, int depth)
-{
- if (!n.ns().empty()) {
- os << indent(depth) << "\"namespace\": \"" << n.ns() << "\",\n";
- }
- os << indent(depth) << "\"name\": \"" << n.simpleName() << "\",\n";
-}
-
-void
-NodeRecord::printJson(std::ostream &os, int depth) const
-{
- os << "{\n";
- os << indent(++depth) << "\"type\": \"record\",\n";
- printName(os, nameAttribute_.get(), depth);
- if (getDoc().size()) {
- os << indent(depth) << "\"doc\": \""
- << escape(getDoc()) << "\",\n";
- }
- os << indent(depth) << "\"fields\": [";
-
- size_t fields = leafAttributes_.size();
- ++depth;
- // Serialize "default" field:
- assert(defaultValues.empty() || (defaultValues.size() == fields));
- for (size_t i = 0; i < fields; ++i) {
- if (i > 0) {
- os << ',';
- }
- os << '\n' << indent(depth) << "{\n";
- os << indent(++depth) << "\"name\": \"" << leafNameAttributes_.get(i) << "\",\n";
- os << indent(depth) << "\"type\": ";
- leafAttributes_.get(i)->printJson(os, depth);
-
- if (!defaultValues.empty()) {
- if (!defaultValues[i].isUnion() &&
- defaultValues[i].type() == AVRO_NULL) {
- // No "default" field.
- } else {
- os << ",\n" << indent(depth) << "\"default\": ";
- leafAttributes_.get(i)->printDefaultToJson(defaultValues[i], os,
- depth);
- }
- }
- os << '\n';
- os << indent(--depth) << '}';
- }
- os << '\n' << indent(--depth) << "]\n";
- os << indent(--depth) << '}';
-}
-
-void NodePrimitive::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- assert(isPrimitive(g.type()));
-
- switch (g.type()) {
- case AVRO_NULL:
- os << "null";
- break;
- case AVRO_BOOL:
- os << (g.value<bool>() ? "true" : "false");
- break;
- case AVRO_INT:
- os << g.value<int32_t>();
- break;
- case AVRO_LONG:
- os << g.value<int64_t>();
- break;
- case AVRO_FLOAT:
- os << g.value<float>();
- break;
- case AVRO_DOUBLE:
- os << g.value<double>();
- break;
- case AVRO_STRING:
- os << "\"" << escape(g.value<string>()) << "\"";
- break;
- case AVRO_BYTES: {
- // Convert to a string:
- const std::vector<uint8_t> &vg = g.value<std::vector<uint8_t> >();
- string s;
- s.resize(vg.size() * kByteStringSize);
- for (unsigned int i = 0; i < vg.size(); i++) {
- string hex_string = intToHex(static_cast<int>(vg[i]));
- s.replace(i*kByteStringSize, kByteStringSize, hex_string);
- }
- os << "\"" << s << "\"";
- } break;
- default:
- break;
- }
-}
-
-void NodeEnum::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- assert(g.type() == AVRO_ENUM);
- os << "\"" << g.value<GenericEnum>().symbol() << "\"";
-}
-
-void NodeFixed::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- assert(g.type() == AVRO_FIXED);
- // ex: "\uOOff"
- // Convert to a string
- const std::vector<uint8_t> &vg = g.value<GenericFixed>().value();
- string s;
- s.resize(vg.size() * kByteStringSize);
- for (unsigned int i = 0; i < vg.size(); i++) {
- string hex_string = intToHex(static_cast<int>(vg[i]));
- s.replace(i*kByteStringSize, kByteStringSize, hex_string);
- }
- os << "\"" << s << "\"";
-}
-
-void NodeUnion::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- leafAt(0)->printDefaultToJson(g, os, depth);
-}
-
-void NodeArray::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- assert(g.type() == AVRO_ARRAY);
- // ex: "default": [1]
- if (g.value<GenericArray>().value().empty()) {
- os << "[]";
- } else {
- os << "[\n";
- depth++;
-
- // Serialize all values of the array with recursive calls:
- for (unsigned int i = 0; i < g.value<GenericArray>().value().size(); i++) {
- if (i > 0) {
- os << ",\n";
- }
- os << indent(depth);
- leafAt(0)->printDefaultToJson(g.value<GenericArray>().value()[i], os,
- depth);
- }
- os << "\n" << indent(--depth) << "]";
- }
-}
-
-void NodeSymbolic::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- getNode()->printDefaultToJson(g, os, depth);
-}
-
-void NodeRecord::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- assert(g.type() == AVRO_RECORD);
- if (g.value<GenericRecord>().fieldCount() == 0) {
- os << "{}";
- } else {
- os << "{\n";
-
- // Serialize all fields of the record with recursive calls:
- for (unsigned int i = 0; i < g.value<GenericRecord>().fieldCount(); i++) {
- if (i == 0) {
- ++depth;
- } else { // i > 0
- os << ",\n";
- }
-
- os << indent(depth) << "\"";
- assert(i < leaves());
- os << leafNameAttributes_.get(i);
- os << "\": ";
-
- // Recursive call on child node to be able to get the name attribute
- // (In case of a record we need the name of the leaves (contained in
- // 'this'))
- leafAt(i)->printDefaultToJson(g.value<GenericRecord>().fieldAt(i), os,
- depth);
- }
- os << "\n" << indent(--depth) << "}";
- }
-}
-
-void NodeMap::printDefaultToJson(const GenericDatum &g, std::ostream &os,
- int depth) const {
- assert(g.type() == AVRO_MAP);
- //{"a": 1}
- if (g.value<GenericMap>().value().empty()) {
- os << "{}";
- } else {
- os << "{\n";
-
- for (unsigned int i = 0; i < g.value<GenericMap>().value().size(); i++) {
- if (i == 0) {
- ++depth;
- } else {
- os << ",\n";
- }
- os << indent(depth) << "\"" << g.value<GenericMap>().value()[i].first
- << "\": ";
-
- leafAt(i)->printDefaultToJson(g.value<GenericMap>().value()[i].second, os,
- depth);
- }
- os << "\n" << indent(--depth) << "}";
- }
-}
-
-void
-NodeEnum::printJson(std::ostream &os, int depth) const
-{
- os << "{\n";
- os << indent(++depth) << "\"type\": \"enum\",\n";
- if (getDoc().size()) {
- os << indent(depth) << "\"doc\": \""
- << escape(getDoc()) << "\",\n";
- }
- printName(os, nameAttribute_.get(), depth);
- os << indent(depth) << "\"symbols\": [\n";
-
- int names = leafNameAttributes_.size();
- ++depth;
- for (int i = 0; i < names; ++i) {
- if (i > 0) {
- os << ",\n";
- }
- os << indent(depth) << '\"' << leafNameAttributes_.get(i) << '\"';
- }
- os << '\n';
- os << indent(--depth) << "]\n";
- os << indent(--depth) << '}';
-}
-
-void
-NodeArray::printJson(std::ostream &os, int depth) const
-{
- os << "{\n";
- os << indent(depth+1) << "\"type\": \"array\",\n";
- if (getDoc().size()) {
- os << indent(depth+1) << "\"doc\": \""
- << escape(getDoc()) << "\",\n";
- }
- os << indent(depth+1) << "\"items\": ";
- leafAttributes_.get()->printJson(os, depth+1);
- os << '\n';
- os << indent(depth) << '}';
-}
-
-void
-NodeMap::printJson(std::ostream &os, int depth) const
-{
- os << "{\n";
- os << indent(depth+1) <<"\"type\": \"map\",\n";
- if (getDoc().size()) {
- os << indent(depth+1) << "\"doc\": \""
- << escape(getDoc()) << "\",\n";
- }
- os << indent(depth+1) << "\"values\": ";
- leafAttributes_.get(1)->printJson(os, depth+1);
- os << '\n';
- os << indent(depth) << '}';
-}
-
-void
-NodeUnion::printJson(std::ostream &os, int depth) const
-{
- os << "[\n";
- int fields = leafAttributes_.size();
- ++depth;
- for (int i = 0; i < fields; ++i) {
- if (i > 0) {
- os << ",\n";
- }
- os << indent(depth);
- leafAttributes_.get(i)->printJson(os, depth);
- }
- os << '\n';
- os << indent(--depth) << ']';
-}
-
-void
-NodeFixed::printJson(std::ostream &os, int depth) const
-{
- os << "{\n";
- os << indent(++depth) << "\"type\": \"fixed\",\n";
- if (getDoc().size()) {
- os << indent(depth) << "\"doc\": \""
- << escape(getDoc()) << "\",\n";
- }
- printName(os, nameAttribute_.get(), depth);
- os << indent(depth) << "\"size\": " << sizeAttribute_.get();
-
- if (logicalType().type() != LogicalType::NONE) {
- os << ",\n" << indent(depth);
- logicalType().printJson(os);
- }
-
- os << "\n" << indent(--depth) << '}';
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <sstream>
+#include <iomanip>
+#include <boost/algorithm/string/replace.hpp>
+#include "NodeImpl.hh"
+
+
+using std::string;
+namespace avro {
+
+namespace {
+
+// Escape string for serialization.
+string escape(const string &unescaped) {
+ string s;
+ s.reserve(unescaped.length());
+ for (std::string::const_iterator it = unescaped.begin(); it != unescaped.end(); ++it) {
+ char c = *it;
+ switch (c) {
+ case '\\':
+ case '"':
+ case '/':
+ s += '\\';
+ s += c;
+ break;
+ case '\b':
+ s += '\\';
+ s += 'b';
+ break;
+ case '\f':
+ s += '\\';
+ s += 'f';
+ break;
+ case '\n':
+ s += '\\';
+ s += 'n';
+ break;
+ case '\r':
+ s += '\\';
+ s += 'r';
+ break;
+ case '\t':
+ s += '\\';
+ s += 't';
+ break;
+ default:
+ if (!std::iscntrl(c, std::locale::classic())) {
+ s += c;
+ continue;
+ }
+ s += intToHex(static_cast<unsigned int>(c));
+ break;
+ }
+ }
+ return s;
+}
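+
+// Worked example (editor's illustration): escape("say \"hi\"\n") produces
+// the JSON-safe text say \"hi\"\n, i.e. backslash, quote and '/' are
+// backslash-escaped, \b, \f, \n, \r and \t get their two-character escapes,
+// and any other control character is emitted as \u00XX via intToHex().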
+
+// Wrap an indentation in a struct for ostream operator<<
+struct indent {
+ indent(int depth) :
+ d(depth)
+ { }
+ int d;
+};
+
+/// ostream operator for indent
+std::ostream& operator <<(std::ostream &os, indent x)
+{
+ static const string spaces(" ");
+ while (x.d--) {
+ os << spaces;
+ }
+ return os;
+}
+
+} // anonymous namespace
+
+const int kByteStringSize = 6;
+
+SchemaResolution
+NodePrimitive::resolve(const Node &reader) const
+{
+ if (type() == reader.type()) {
+ return RESOLVE_MATCH;
+ }
+
+ switch ( type() ) {
+
+ case AVRO_INT:
+
+ if ( reader.type() == AVRO_LONG ) {
+ return RESOLVE_PROMOTABLE_TO_LONG;
+ }
+
+ // fall-through intentional
+
+ case AVRO_LONG:
+
+ if (reader.type() == AVRO_FLOAT) {
+ return RESOLVE_PROMOTABLE_TO_FLOAT;
+ }
+
+ // fall-through intentional
+
+ case AVRO_FLOAT:
+
+ if (reader.type() == AVRO_DOUBLE) {
+ return RESOLVE_PROMOTABLE_TO_DOUBLE;
+ }
+
+ default:
+ break;
+ }
+
+ return furtherResolution(reader);
+}
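+
+// Example of the promotion chain above (editor's illustration): resolving a
+// writer of AVRO_INT against a reader of AVRO_DOUBLE enters the AVRO_INT
+// case, falls through the AVRO_LONG check, and returns
+// RESOLVE_PROMOTABLE_TO_DOUBLE from the AVRO_FLOAT case, matching the
+// int -> long -> float -> double promotions of the Avro spec.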
+
+SchemaResolution
+NodeRecord::resolve(const Node &reader) const
+{
+ if (reader.type() == AVRO_RECORD) {
+ if (name() == reader.name()) {
+ return RESOLVE_MATCH;
+ }
+ }
+ return furtherResolution(reader);
+}
+
+SchemaResolution
+NodeEnum::resolve(const Node &reader) const
+{
+ if (reader.type() == AVRO_ENUM) {
+ return (name() == reader.name()) ? RESOLVE_MATCH : RESOLVE_NO_MATCH;
+ }
+ return furtherResolution(reader);
+}
+
+SchemaResolution
+NodeArray::resolve(const Node &reader) const
+{
+ if (reader.type() == AVRO_ARRAY) {
+ const NodePtr &arrayType = leafAt(0);
+ return arrayType->resolve(*reader.leafAt(0));
+ }
+ return furtherResolution(reader);
+}
+
+SchemaResolution
+NodeMap::resolve(const Node &reader) const
+{
+ if (reader.type() == AVRO_MAP) {
+ const NodePtr &mapType = leafAt(1);
+ return mapType->resolve(*reader.leafAt(1));
+ }
+ return furtherResolution(reader);
+}
+
+SchemaResolution
+NodeUnion::resolve(const Node &reader) const
+{
+
+ // If the writer is a union, resolution only needs to occur when the
+ // selected branch of the writer is known, so this function is of limited
+ // use.
+ //
+ // Instead, this function reports whether any writer branch could match
+ // the reader: it searches branch by branch and returns the best match
+ // found.
+
+ SchemaResolution match = RESOLVE_NO_MATCH;
+ for (size_t i=0; i < leaves(); ++i) {
+ const NodePtr &node = leafAt(i);
+ SchemaResolution thisMatch = node->resolve(reader);
+ if (thisMatch == RESOLVE_MATCH) {
+ match = thisMatch;
+ break;
+ }
+ if (match == RESOLVE_NO_MATCH) {
+ match = thisMatch;
+ }
+ }
+ return match;
+}
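+
+// Example (editor's illustration): for a writer union ["int", "string"] and
+// a reader schema "long", the "int" branch resolves to
+// RESOLVE_PROMOTABLE_TO_LONG and the "string" branch to RESOLVE_NO_MATCH;
+// no branch matches exactly, so the first non-NO_MATCH result,
+// RESOLVE_PROMOTABLE_TO_LONG, is returned.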
+
+SchemaResolution
+NodeFixed::resolve(const Node &reader) const
+{
+ if (reader.type() == AVRO_FIXED) {
+ return (
+ (reader.fixedSize() == fixedSize()) &&
+ (reader.name() == name())
+ ) ?
+ RESOLVE_MATCH : RESOLVE_NO_MATCH;
+ }
+ return furtherResolution(reader);
+}
+
+SchemaResolution
+NodeSymbolic::resolve(const Node &reader) const
+{
+ const NodePtr &node = leafAt(0);
+ return node->resolve(reader);
+}
+
+void
+NodePrimitive::printJson(std::ostream &os, int depth) const
+{
+ bool hasLogicalType = logicalType().type() != LogicalType::NONE;
+
+ if (hasLogicalType) {
+ os << "{\n" << indent(depth) << "\"type\": ";
+ }
+
+ os << '\"' << type() << '\"';
+
+ if (hasLogicalType) {
+ os << ",\n" << indent(depth);
+ logicalType().printJson(os);
+ os << "\n}";
+ }
+ if (getDoc().size()) {
+ os << ",\n" << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\"";
+ }
+}
+
+void
+NodeSymbolic::printJson(std::ostream &os, int depth) const
+{
+ os << '\"' << nameAttribute_.get() << '\"';
+ if (getDoc().size()) {
+ os << ",\n" << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\"";
+ }
+}
+
+static void printName(std::ostream& os, const Name& n, int depth)
+{
+ if (!n.ns().empty()) {
+ os << indent(depth) << "\"namespace\": \"" << n.ns() << "\",\n";
+ }
+ os << indent(depth) << "\"name\": \"" << n.simpleName() << "\",\n";
+}
+
+void
+NodeRecord::printJson(std::ostream &os, int depth) const
+{
+ os << "{\n";
+ os << indent(++depth) << "\"type\": \"record\",\n";
+ printName(os, nameAttribute_.get(), depth);
+ if (getDoc().size()) {
+ os << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
+ os << indent(depth) << "\"fields\": [";
+
+ size_t fields = leafAttributes_.size();
+ ++depth;
+ // Serialize "default" field:
+ assert(defaultValues.empty() || (defaultValues.size() == fields));
+ for (size_t i = 0; i < fields; ++i) {
+ if (i > 0) {
+ os << ',';
+ }
+ os << '\n' << indent(depth) << "{\n";
+ os << indent(++depth) << "\"name\": \"" << leafNameAttributes_.get(i) << "\",\n";
+ os << indent(depth) << "\"type\": ";
+ leafAttributes_.get(i)->printJson(os, depth);
+
+ if (!defaultValues.empty()) {
+ if (!defaultValues[i].isUnion() &&
+ defaultValues[i].type() == AVRO_NULL) {
+ // No "default" field.
+ } else {
+ os << ",\n" << indent(depth) << "\"default\": ";
+ leafAttributes_.get(i)->printDefaultToJson(defaultValues[i], os,
+ depth);
+ }
+ }
+ os << '\n';
+ os << indent(--depth) << '}';
+ }
+ os << '\n' << indent(--depth) << "]\n";
+ os << indent(--depth) << '}';
+}
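+
+// Illustrative output (editor's sketch): for a record "R" with one int
+// field "f" defaulting to 1, the method above emits JSON of the form
+// {
+//     "type": "record",
+//     "name": "R",
+//     "fields": [
+//         {
+//             "name": "f",
+//             "type": "int",
+//             "default": 1
+//         }
+//     ]
+// }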
+
+void NodePrimitive::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ assert(isPrimitive(g.type()));
+
+ switch (g.type()) {
+ case AVRO_NULL:
+ os << "null";
+ break;
+ case AVRO_BOOL:
+ os << (g.value<bool>() ? "true" : "false");
+ break;
+ case AVRO_INT:
+ os << g.value<int32_t>();
+ break;
+ case AVRO_LONG:
+ os << g.value<int64_t>();
+ break;
+ case AVRO_FLOAT:
+ os << g.value<float>();
+ break;
+ case AVRO_DOUBLE:
+ os << g.value<double>();
+ break;
+ case AVRO_STRING:
+ os << "\"" << escape(g.value<string>()) << "\"";
+ break;
+ case AVRO_BYTES: {
+ // Convert to a string:
+ const std::vector<uint8_t> &vg = g.value<std::vector<uint8_t> >();
+ string s;
+ s.resize(vg.size() * kByteStringSize);
+ for (unsigned int i = 0; i < vg.size(); i++) {
+ string hex_string = intToHex(static_cast<int>(vg[i]));
+ s.replace(i*kByteStringSize, kByteStringSize, hex_string);
+ }
+ os << "\"" << s << "\"";
+ } break;
+ default:
+ break;
+ }
+}
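+
+// Editor's note: kByteStringSize is 6 because each byte of a BYTES or FIXED
+// default is rendered as a six-character escape, e.g. byte 0xff becomes
+// \u00ff, so a two-byte default {0x01, 0xff} prints as "\u0001\u00ff".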
+
+void NodeEnum::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ assert(g.type() == AVRO_ENUM);
+ os << "\"" << g.value<GenericEnum>().symbol() << "\"";
+}
+
+void NodeFixed::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ assert(g.type() == AVRO_FIXED);
+ // ex: "\uOOff"
+ // Convert to a string
+ const std::vector<uint8_t> &vg = g.value<GenericFixed>().value();
+ string s;
+ s.resize(vg.size() * kByteStringSize);
+ for (unsigned int i = 0; i < vg.size(); i++) {
+ string hex_string = intToHex(static_cast<int>(vg[i]));
+ s.replace(i*kByteStringSize, kByteStringSize, hex_string);
+ }
+ os << "\"" << s << "\"";
+}
+
+void NodeUnion::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ leafAt(0)->printDefaultToJson(g, os, depth);
+}
+
+void NodeArray::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ assert(g.type() == AVRO_ARRAY);
+ // ex: "default": [1]
+ if (g.value<GenericArray>().value().empty()) {
+ os << "[]";
+ } else {
+ os << "[\n";
+ depth++;
+
+ // Serialize all values of the array with recursive calls:
+ for (unsigned int i = 0; i < g.value<GenericArray>().value().size(); i++) {
+ if (i > 0) {
+ os << ",\n";
+ }
+ os << indent(depth);
+ leafAt(0)->printDefaultToJson(g.value<GenericArray>().value()[i], os,
+ depth);
+ }
+ os << "\n" << indent(--depth) << "]";
+ }
+}
+
+void NodeSymbolic::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ getNode()->printDefaultToJson(g, os, depth);
+}
+
+void NodeRecord::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ assert(g.type() == AVRO_RECORD);
+ if (g.value<GenericRecord>().fieldCount() == 0) {
+ os << "{}";
+ } else {
+ os << "{\n";
+
+ // Serialize all fields of the record with recursive calls:
+ for (unsigned int i = 0; i < g.value<GenericRecord>().fieldCount(); i++) {
+ if (i == 0) {
+ ++depth;
+ } else { // i > 0
+ os << ",\n";
+ }
+
+ os << indent(depth) << "\"";
+ assert(i < leaves());
+ os << leafNameAttributes_.get(i);
+ os << "\": ";
+
+ // Recursive call on child node to be able to get the name attribute
+ // (In case of a record we need the name of the leaves (contained in
+ // 'this'))
+ leafAt(i)->printDefaultToJson(g.value<GenericRecord>().fieldAt(i), os,
+ depth);
+ }
+ os << "\n" << indent(--depth) << "}";
+ }
+}
+
+void NodeMap::printDefaultToJson(const GenericDatum &g, std::ostream &os,
+ int depth) const {
+ assert(g.type() == AVRO_MAP);
+ //{"a": 1}
+ if (g.value<GenericMap>().value().empty()) {
+ os << "{}";
+ } else {
+ os << "{\n";
+
+ for (unsigned int i = 0; i < g.value<GenericMap>().value().size(); i++) {
+ if (i == 0) {
+ ++depth;
+ } else {
+ os << ",\n";
+ }
+ os << indent(depth) << "\"" << g.value<GenericMap>().value()[i].first
+ << "\": ";
+
+ // Map values all share the schema at leafAt(1); leafAt(0) is the key.
+ leafAt(1)->printDefaultToJson(g.value<GenericMap>().value()[i].second, os,
+ depth);
+ }
+ os << "\n" << indent(--depth) << "}";
+ }
+}
+
+void
+NodeEnum::printJson(std::ostream &os, int depth) const
+{
+ os << "{\n";
+ os << indent(++depth) << "\"type\": \"enum\",\n";
+ if (getDoc().size()) {
+ os << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
+ printName(os, nameAttribute_.get(), depth);
+ os << indent(depth) << "\"symbols\": [\n";
+
+ int names = leafNameAttributes_.size();
+ ++depth;
+ for (int i = 0; i < names; ++i) {
+ if (i > 0) {
+ os << ",\n";
+ }
+ os << indent(depth) << '\"' << leafNameAttributes_.get(i) << '\"';
+ }
+ os << '\n';
+ os << indent(--depth) << "]\n";
+ os << indent(--depth) << '}';
+}
+
+void
+NodeArray::printJson(std::ostream &os, int depth) const
+{
+ os << "{\n";
+ os << indent(depth+1) << "\"type\": \"array\",\n";
+ if (getDoc().size()) {
+ os << indent(depth+1) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
+ os << indent(depth+1) << "\"items\": ";
+ leafAttributes_.get()->printJson(os, depth+1);
+ os << '\n';
+ os << indent(depth) << '}';
+}
+
+void
+NodeMap::printJson(std::ostream &os, int depth) const
+{
+ os << "{\n";
+ os << indent(depth+1) <<"\"type\": \"map\",\n";
+ if (getDoc().size()) {
+ os << indent(depth+1) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
+ os << indent(depth+1) << "\"values\": ";
+ leafAttributes_.get(1)->printJson(os, depth+1);
+ os << '\n';
+ os << indent(depth) << '}';
+}
+
+void
+NodeUnion::printJson(std::ostream &os, int depth) const
+{
+ os << "[\n";
+ int fields = leafAttributes_.size();
+ ++depth;
+ for (int i = 0; i < fields; ++i) {
+ if (i > 0) {
+ os << ",\n";
+ }
+ os << indent(depth);
+ leafAttributes_.get(i)->printJson(os, depth);
+ }
+ os << '\n';
+ os << indent(--depth) << ']';
+}
+
+void
+NodeFixed::printJson(std::ostream &os, int depth) const
+{
+ os << "{\n";
+ os << indent(++depth) << "\"type\": \"fixed\",\n";
+ if (getDoc().size()) {
+ os << indent(depth) << "\"doc\": \""
+ << escape(getDoc()) << "\",\n";
+ }
+ printName(os, nameAttribute_.get(), depth);
+ os << indent(depth) << "\"size\": " << sizeAttribute_.get();
+
+ if (logicalType().type() != LogicalType::NONE) {
+ os << ",\n" << indent(depth);
+ logicalType().printJson(os);
+ }
+
+ os << "\n" << indent(--depth) << '}';
+}
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/Resolver.cc b/contrib/libs/apache/avro/impl/Resolver.cc
index 43467c028dd..209f99d98ef 100644
--- a/contrib/libs/apache/avro/impl/Resolver.cc
+++ b/contrib/libs/apache/avro/impl/Resolver.cc
@@ -1,872 +1,872 @@
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <memory>
-#include "Resolver.hh"
-#include "Layout.hh"
-#include "NodeImpl.hh"
-#include "ValidSchema.hh"
-#include "Reader.hh"
-#include "AvroTraits.hh"
-
-namespace avro {
-using std::unique_ptr;
-
-class ResolverFactory;
-typedef std::shared_ptr<Resolver> ResolverPtr;
-typedef std::vector<std::unique_ptr<Resolver> > ResolverPtrVector;
-
-// #define DEBUG_VERBOSE
-
-#ifdef DEBUG_VERBOSE
-#define DEBUG_OUT(str) std::cout << str << '\n'
-#else
-class NoOp {};
-template<typename T> NoOp& operator<<(NoOp &noOp, const T&) {
- return noOp;
-}
-NoOp noop;
-#define DEBUG_OUT(str) noop << str
-#endif
-
-template<typename T>
-class PrimitiveSkipper : public Resolver
-{
- public:
-
- PrimitiveSkipper() :
- Resolver()
- {}
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- T val;
- reader.readValue(val);
- DEBUG_OUT("Skipping " << val);
- }
-};
-
-template<typename T>
-class PrimitiveParser : public Resolver
-{
- public:
-
- PrimitiveParser(const PrimitiveLayout &offset) :
- Resolver(),
- offset_(offset.offset())
- {}
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- T* location = reinterpret_cast<T *> (address + offset_);
- reader.readValue(*location);
- DEBUG_OUT("Reading " << *location);
- }
-
- private:
-
- size_t offset_;
-};
-
-template<typename WT, typename RT>
-class PrimitivePromoter : public Resolver
-{
- public:
-
- PrimitivePromoter(const PrimitiveLayout &offset) :
- Resolver(),
- offset_(offset.offset())
- {}
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- parseIt<WT>(reader, address);
- }
-
- private:
-
- void parseIt(Reader &reader, uint8_t *address, const std::true_type &) const
- {
- WT val;
- reader.readValue(val);
- RT *location = reinterpret_cast<RT *> (address + offset_);
- *location = static_cast<RT>(val);
- DEBUG_OUT("Promoting " << val);
- }
-
- void parseIt(Reader &reader, uint8_t *address, const std::false_type &) const
- { }
-
- template<typename T>
- void parseIt(Reader &reader, uint8_t *address) const
- {
- parseIt(reader, address, is_promotable<T>());
- }
-
- size_t offset_;
-};
-
-template <>
-class PrimitiveSkipper<std::vector<uint8_t> > : public Resolver
-{
- public:
-
- PrimitiveSkipper() :
- Resolver()
- {}
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- std::vector<uint8_t> val;
- reader.readBytes(val);
- DEBUG_OUT("Skipping bytes");
- }
-};
-
-template <>
-class PrimitiveParser<std::vector<uint8_t> > : public Resolver
-{
- public:
-
- PrimitiveParser(const PrimitiveLayout &offset) :
- Resolver(),
- offset_(offset.offset())
- {}
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- std::vector<uint8_t> *location = reinterpret_cast<std::vector<uint8_t> *> (address + offset_);
- reader.readBytes(*location);
- DEBUG_OUT("Reading bytes");
- }
-
- private:
-
- size_t offset_;
-};
-
-class RecordSkipper : public Resolver
-{
- public:
-
- RecordSkipper(ResolverFactory &factory, const NodePtr &writer);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Skipping record");
-
- reader.readRecord();
- size_t steps = resolvers_.size();
- for(size_t i = 0; i < steps; ++i) {
- resolvers_[i]->parse(reader, address);
- }
- }
-
- protected:
-
- ResolverPtrVector resolvers_;
-
-};
-
-class RecordParser : public Resolver
-{
- public:
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Reading record");
-
- reader.readRecord();
- size_t steps = resolvers_.size();
- for(size_t i = 0; i < steps; ++i) {
- resolvers_[i]->parse(reader, address);
- }
- }
-
- RecordParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
-
- protected:
-
- ResolverPtrVector resolvers_;
-
-};
-
-
-class MapSkipper : public Resolver
-{
- public:
-
- MapSkipper(ResolverFactory &factory, const NodePtr &writer);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Skipping map");
-
- std::string key;
- int64_t size = 0;
- do {
- size = reader.readMapBlockSize();
- for(int64_t i = 0; i < size; ++i) {
- reader.readValue(key);
- resolver_->parse(reader, address);
- }
- } while (size != 0);
- }
-
- protected:
-
- ResolverPtr resolver_;
-};
-
-
-class MapParser : public Resolver
-{
- public:
-
- typedef uint8_t *(*GenericMapSetter)(uint8_t *map, const std::string &key);
-
- MapParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Reading map");
-
- uint8_t *mapAddress = address + offset_;
-
- std::string key;
- GenericMapSetter* setter = reinterpret_cast<GenericMapSetter *> (address + setFuncOffset_);
-
- int64_t size = 0;
- do {
- size = reader.readMapBlockSize();
- for(int64_t i = 0; i < size; ++i) {
- reader.readValue(key);
-
- // create a new map entry and get the address
- uint8_t *location = (*setter)(mapAddress, key);
- resolver_->parse(reader, location);
- }
- } while (size != 0);
- }
-
- protected:
-
- ResolverPtr resolver_;
- size_t offset_;
- size_t setFuncOffset_;
-};
-
-class ArraySkipper : public Resolver
-{
- public:
-
- ArraySkipper(ResolverFactory &factory, const NodePtr &writer);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Skipping array");
-
- int64_t size = 0;
- do {
- size = reader.readArrayBlockSize();
- for(int64_t i = 0; i < size; ++i) {
- resolver_->parse(reader, address);
- }
- } while (size != 0);
- }
-
- protected:
-
- ResolverPtr resolver_;
-};
-
-typedef uint8_t *(*GenericArraySetter)(uint8_t *array);
-
-class ArrayParser : public Resolver
-{
- public:
-
- ArrayParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Reading array");
-
- uint8_t *arrayAddress = address + offset_;
-
- GenericArraySetter* setter = reinterpret_cast<GenericArraySetter *> (address + setFuncOffset_);
-
- int64_t size = 0;
- do {
- size = reader.readArrayBlockSize();
- for(int64_t i = 0; i < size; ++i) {
- // create a new map entry and get the address
- uint8_t *location = (*setter)(arrayAddress);
- resolver_->parse(reader, location);
- }
- } while (size != 0);
- }
-
- protected:
-
- ArrayParser() :
- Resolver()
- {}
-
- ResolverPtr resolver_;
- size_t offset_;
- size_t setFuncOffset_;
-};
-
-class EnumSkipper : public Resolver
-{
- public:
-
- EnumSkipper(ResolverFactory &factory, const NodePtr &writer) :
- Resolver()
- { }
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- int64_t val = reader.readEnum();
- DEBUG_OUT("Skipping enum" << val);
- }
-};
-
-class EnumParser : public Resolver
-{
- public:
-
- enum EnumRepresentation {
- VAL
- };
-
- EnumParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
- Resolver(),
- offset_(offsets.at(0).offset()),
- readerSize_(reader->names())
- {
- const size_t writerSize = writer->names();
-
- mapping_.reserve(writerSize);
-
- for(size_t i = 0; i < writerSize; ++i) {
- const std::string &name = writer->nameAt(i);
- size_t readerIndex = readerSize_;
- reader->nameIndex(name, readerIndex);
- mapping_.push_back(readerIndex);
- }
- }
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- size_t val = static_cast<size_t>(reader.readEnum());
- assert(static_cast<size_t>(val) < mapping_.size());
-
- if(mapping_[val] < readerSize_) {
- EnumRepresentation* location = reinterpret_cast<EnumRepresentation *> (address + offset_);
- *location = static_cast<EnumRepresentation>(mapping_[val]);
- DEBUG_OUT("Setting enum" << *location);
- }
- }
-
-protected:
-
- size_t offset_;
- size_t readerSize_;
- std::vector<size_t> mapping_;
-
-};
-
-class UnionSkipper : public Resolver
-{
- public:
-
- UnionSkipper(ResolverFactory &factory, const NodePtr &writer);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Skipping union");
- size_t choice = static_cast<size_t>(reader.readUnion());
- resolvers_[choice]->parse(reader, address);
- }
-
- protected:
-
- ResolverPtrVector resolvers_;
-};
-
-
-class UnionParser : public Resolver
-{
- public:
-
- typedef uint8_t *(*GenericUnionSetter)(uint8_t *, int64_t);
-
- UnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Reading union");
- size_t writerChoice = static_cast<size_t>(reader.readUnion());
- int64_t *readerChoice = reinterpret_cast<int64_t *>(address + choiceOffset_);
-
- *readerChoice = choiceMapping_[writerChoice];
- GenericUnionSetter* setter = reinterpret_cast<GenericUnionSetter *> (address + setFuncOffset_);
- uint8_t *value = reinterpret_cast<uint8_t *> (address + offset_);
- uint8_t *location = (*setter)(value, *readerChoice);
-
- resolvers_[writerChoice]->parse(reader, location);
- }
-
- protected:
-
- ResolverPtrVector resolvers_;
- std::vector<int64_t> choiceMapping_;
- size_t offset_;
- size_t choiceOffset_;
- size_t setFuncOffset_;
-};
-
-class UnionToNonUnionParser : public Resolver
-{
- public:
-
- typedef uint8_t *(*GenericUnionSetter)(uint8_t *, int64_t);
-
- UnionToNonUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const Layout &offsets);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Reading union to non-union");
- size_t choice = static_cast<size_t>(reader.readUnion());
- resolvers_[choice]->parse(reader, address);
- }
-
- protected:
-
- ResolverPtrVector resolvers_;
-};
-
-class NonUnionToUnionParser : public Resolver
-{
- public:
-
- typedef uint8_t *(*GenericUnionSetter)(uint8_t *, int64_t);
-
- NonUnionToUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Reading non-union to union");
-
- int64_t *choice = reinterpret_cast<int64_t *>(address + choiceOffset_);
- *choice = choice_;
- GenericUnionSetter* setter = reinterpret_cast<GenericUnionSetter *> (address + setFuncOffset_);
- uint8_t *value = reinterpret_cast<uint8_t *> (address + offset_);
- uint8_t *location = (*setter)(value, choice_);
-
- resolver_->parse(reader, location);
- }
-
- protected:
-
- ResolverPtr resolver_;
- size_t choice_;
- size_t offset_;
- size_t choiceOffset_;
- size_t setFuncOffset_;
-};
-
-class FixedSkipper : public Resolver
-{
- public:
-
- FixedSkipper(ResolverFactory &factory, const NodePtr &writer) :
- Resolver()
- {
- size_ = writer->fixedSize();
- }
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Skipping fixed");
- std::unique_ptr<uint8_t[]> val(new uint8_t[size_]);
- reader.readFixed(&val[0], size_);
- }
-
- protected:
-
- int size_;
-
-};
-
-class FixedParser : public Resolver
-{
- public:
-
- FixedParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
- Resolver()
- {
- size_ = writer->fixedSize();
- offset_ = offsets.at(0).offset();
- }
-
- virtual void parse(Reader &reader, uint8_t *address) const
- {
- DEBUG_OUT("Reading fixed");
- uint8_t *location = reinterpret_cast<uint8_t *> (address + offset_);
- reader.readFixed(location, size_);
- }
-
- protected:
-
- int size_;
- size_t offset_;
-
-};
-
-
-class ResolverFactory : private boost::noncopyable {
-
- template<typename T>
- unique_ptr<Resolver>
- constructPrimitiveSkipper(const NodePtr &writer)
- {
- return unique_ptr<Resolver>(new PrimitiveSkipper<T>());
- }
-
- template<typename T>
- unique_ptr<Resolver>
- constructPrimitive(const NodePtr &writer, const NodePtr &reader, const Layout &offset)
- {
- unique_ptr<Resolver> instruction;
-
- SchemaResolution match = writer->resolve(*reader);
-
- if (match == RESOLVE_NO_MATCH) {
- instruction = unique_ptr<Resolver>(new PrimitiveSkipper<T>());
- }
- else if (reader->type() == AVRO_UNION) {
- const CompoundLayout &compoundLayout = static_cast<const CompoundLayout &>(offset);
- instruction = unique_ptr<Resolver>(new NonUnionToUnionParser(*this, writer, reader, compoundLayout));
- }
- else if (match == RESOLVE_MATCH) {
- const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
- instruction = unique_ptr<Resolver>(new PrimitiveParser<T>(primitiveLayout));
- }
- else if(match == RESOLVE_PROMOTABLE_TO_LONG) {
- const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
- instruction = unique_ptr<Resolver>(new PrimitivePromoter<T, int64_t>(primitiveLayout));
- }
- else if(match == RESOLVE_PROMOTABLE_TO_FLOAT) {
- const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
- instruction = unique_ptr<Resolver>(new PrimitivePromoter<T, float>(primitiveLayout));
- }
- else if(match == RESOLVE_PROMOTABLE_TO_DOUBLE) {
- const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
- instruction = unique_ptr<Resolver>(new PrimitivePromoter<T, double>(primitiveLayout));
- }
- else {
- assert(0);
- }
- return instruction;
- }
-
- template<typename Skipper>
- unique_ptr<Resolver>
- constructCompoundSkipper(const NodePtr &writer)
- {
- return unique_ptr<Resolver>(new Skipper(*this, writer));
- }
-
-
- template<typename Parser, typename Skipper>
- unique_ptr<Resolver>
- constructCompound(const NodePtr &writer, const NodePtr &reader, const Layout &offset)
- {
- unique_ptr<Resolver> instruction;
-
- SchemaResolution match = RESOLVE_NO_MATCH;
-
- match = writer->resolve(*reader);
-
- if (match == RESOLVE_NO_MATCH) {
- instruction = unique_ptr<Resolver>(new Skipper(*this, writer));
- }
- else if(writer->type() != AVRO_UNION && reader->type() == AVRO_UNION) {
- const CompoundLayout &compoundLayout = dynamic_cast<const CompoundLayout &>(offset);
- instruction = unique_ptr<Resolver>(new NonUnionToUnionParser(*this, writer, reader, compoundLayout));
- }
- else if(writer->type() == AVRO_UNION && reader->type() != AVRO_UNION) {
- instruction = unique_ptr<Resolver>(new UnionToNonUnionParser(*this, writer, reader, offset));
- }
- else {
- const CompoundLayout &compoundLayout = dynamic_cast<const CompoundLayout &>(offset);
- instruction = unique_ptr<Resolver>(new Parser(*this, writer, reader, compoundLayout));
- }
-
- return instruction;
- }
-
- public:
-
- unique_ptr<Resolver>
- construct(const NodePtr &writer, const NodePtr &reader, const Layout &offset)
- {
-
- typedef unique_ptr<Resolver> (ResolverFactory::*BuilderFunc)(const NodePtr &writer, const NodePtr &reader, const Layout &offset);
-
- NodePtr currentWriter = (writer->type() == AVRO_SYMBOLIC) ?
- resolveSymbol(writer) : writer;
-
- NodePtr currentReader = (reader->type() == AVRO_SYMBOLIC) ?
- resolveSymbol(reader) : reader;
-
- static const BuilderFunc funcs[] = {
- &ResolverFactory::constructPrimitive<std::string>,
- &ResolverFactory::constructPrimitive<std::vector<uint8_t> >,
- &ResolverFactory::constructPrimitive<int32_t>,
- &ResolverFactory::constructPrimitive<int64_t>,
- &ResolverFactory::constructPrimitive<float>,
- &ResolverFactory::constructPrimitive<double>,
- &ResolverFactory::constructPrimitive<bool>,
- &ResolverFactory::constructPrimitive<Null>,
- &ResolverFactory::constructCompound<RecordParser, RecordSkipper>,
- &ResolverFactory::constructCompound<EnumParser, EnumSkipper>,
- &ResolverFactory::constructCompound<ArrayParser, ArraySkipper>,
- &ResolverFactory::constructCompound<MapParser, MapSkipper>,
- &ResolverFactory::constructCompound<UnionParser, UnionSkipper>,
- &ResolverFactory::constructCompound<FixedParser, FixedSkipper>
- };
-
- static_assert((sizeof(funcs)/sizeof(BuilderFunc)) == (AVRO_NUM_TYPES),
- "Invalid number of builder functions");
-
- BuilderFunc func = funcs[currentWriter->type()];
- assert(func);
-
- return ((this)->*(func))(currentWriter, currentReader, offset);
- }
-
- unique_ptr<Resolver>
- skipper(const NodePtr &writer)
- {
-
- typedef unique_ptr<Resolver> (ResolverFactory::*BuilderFunc)(const NodePtr &writer);
-
- NodePtr currentWriter = (writer->type() == AVRO_SYMBOLIC) ?
- writer->leafAt(0) : writer;
-
- static const BuilderFunc funcs[] = {
- &ResolverFactory::constructPrimitiveSkipper<std::string>,
- &ResolverFactory::constructPrimitiveSkipper<std::vector<uint8_t> >,
- &ResolverFactory::constructPrimitiveSkipper<int32_t>,
- &ResolverFactory::constructPrimitiveSkipper<int64_t>,
- &ResolverFactory::constructPrimitiveSkipper<float>,
- &ResolverFactory::constructPrimitiveSkipper<double>,
- &ResolverFactory::constructPrimitiveSkipper<bool>,
- &ResolverFactory::constructPrimitiveSkipper<Null>,
- &ResolverFactory::constructCompoundSkipper<RecordSkipper>,
- &ResolverFactory::constructCompoundSkipper<EnumSkipper>,
- &ResolverFactory::constructCompoundSkipper<ArraySkipper>,
- &ResolverFactory::constructCompoundSkipper<MapSkipper>,
- &ResolverFactory::constructCompoundSkipper<UnionSkipper>,
- &ResolverFactory::constructCompoundSkipper<FixedSkipper>
- };
-
- static_assert((sizeof(funcs)/sizeof(BuilderFunc)) == (AVRO_NUM_TYPES),
- "Invalid number of builder functions");
-
- BuilderFunc func = funcs[currentWriter->type()];
- assert(func);
-
- return ((this)->*(func))(currentWriter);
- }
-};
-
-
-RecordSkipper::RecordSkipper(ResolverFactory &factory, const NodePtr &writer) :
- Resolver()
-{
- size_t leaves = writer->leaves();
- resolvers_.reserve(leaves);
- for(size_t i = 0; i < leaves; ++i) {
- const NodePtr &w = writer->leafAt(i);
- resolvers_.push_back(factory.skipper(w));
- }
-}
-
-RecordParser::RecordParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
- Resolver()
-{
- size_t leaves = writer->leaves();
- resolvers_.reserve(leaves);
- for(size_t i = 0; i < leaves; ++i) {
-
- const NodePtr &w = writer->leafAt(i);
-
- const std::string &name = writer->nameAt(i);
-
- size_t readerIndex = 0;
- bool found = reader->nameIndex(name, readerIndex);
-
- if(found) {
- const NodePtr &r = reader->leafAt(readerIndex);
- resolvers_.push_back(factory.construct(w, r, offsets.at(readerIndex)));
- }
- else {
- resolvers_.push_back(factory.skipper(w));
- }
- }
-}
-
-MapSkipper::MapSkipper(ResolverFactory &factory, const NodePtr &writer) :
- Resolver(),
- resolver_(factory.skipper(writer->leafAt(1)))
-{ }
-
-MapParser::MapParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
- Resolver(),
- resolver_(factory.construct(writer->leafAt(1), reader->leafAt(1), offsets.at(1))),
- offset_(offsets.offset()),
- setFuncOffset_( offsets.at(0).offset())
-{ }
-
-ArraySkipper::ArraySkipper(ResolverFactory &factory, const NodePtr &writer) :
- Resolver(),
- resolver_(factory.skipper(writer->leafAt(0)))
-{ }
-
-ArrayParser::ArrayParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
- Resolver(),
- resolver_(factory.construct(writer->leafAt(0), reader->leafAt(0), offsets.at(1))),
- offset_(offsets.offset()),
- setFuncOffset_(offsets.at(0).offset())
-{ }
-
-UnionSkipper::UnionSkipper(ResolverFactory &factory, const NodePtr &writer) :
- Resolver()
-{
- size_t leaves = writer->leaves();
- resolvers_.reserve(leaves);
- for(size_t i = 0; i < leaves; ++i) {
- const NodePtr &w = writer->leafAt(i);
- resolvers_.push_back(factory.skipper(w));
- }
-}
-
-namespace {
-
- // assumes the writer is NOT a union, and the reader IS a union
-
-SchemaResolution
-checkUnionMatch(const NodePtr &writer, const NodePtr &reader, size_t &index)
-{
- SchemaResolution bestMatch = RESOLVE_NO_MATCH;
-
- index = 0;
- size_t leaves = reader->leaves();
-
- for(size_t i=0; i < leaves; ++i) {
-
- const NodePtr &leaf = reader->leafAt(i);
- SchemaResolution newMatch = writer->resolve(*leaf);
-
- if(newMatch == RESOLVE_MATCH) {
- bestMatch = newMatch;
- index = i;
- break;
- }
- if(bestMatch == RESOLVE_NO_MATCH) {
- bestMatch = newMatch;
- index = i;
- }
- }
-
- return bestMatch;
-}
-
-};
-
-UnionParser::UnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
- Resolver(),
- offset_(offsets.offset()),
- choiceOffset_(offsets.at(0).offset()),
- setFuncOffset_(offsets.at(1).offset())
-{
-
- size_t leaves = writer->leaves();
- resolvers_.reserve(leaves);
- choiceMapping_.reserve(leaves);
- for(size_t i = 0; i < leaves; ++i) {
-
- // for each writer, we need a schema match for the reader
- const NodePtr &w = writer->leafAt(i);
- size_t index = 0;
-
- SchemaResolution match = checkUnionMatch(w, reader, index);
-
- if(match == RESOLVE_NO_MATCH) {
- resolvers_.push_back(factory.skipper(w));
- // push back a non-sensical number
- choiceMapping_.push_back(reader->leaves());
- }
- else {
- const NodePtr &r = reader->leafAt(index);
- resolvers_.push_back(factory.construct(w, r, offsets.at(index+2)));
- choiceMapping_.push_back(index);
- }
- }
-}
-
-NonUnionToUnionParser::NonUnionToUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
- Resolver(),
- offset_(offsets.offset()),
- choiceOffset_(offsets.at(0).offset()),
- setFuncOffset_(offsets.at(1).offset())
-{
-#ifndef NDEBUG
- SchemaResolution bestMatch =
-#endif
- checkUnionMatch(writer, reader, choice_);
- assert(bestMatch != RESOLVE_NO_MATCH);
- resolver_ = factory.construct(writer, reader->leafAt(choice_), offsets.at(choice_+2));
-}
-
-UnionToNonUnionParser::UnionToNonUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const Layout &offsets) :
- Resolver()
-{
- size_t leaves = writer->leaves();
- resolvers_.reserve(leaves);
- for(size_t i = 0; i < leaves; ++i) {
- const NodePtr &w = writer->leafAt(i);
- resolvers_.push_back(factory.construct(w, reader, offsets));
- }
-}
-
-unique_ptr<Resolver> constructResolver(const ValidSchema &writerSchema,
- const ValidSchema &readerSchema,
- const Layout &readerLayout)
-{
- ResolverFactory factory;
- return factory.construct(writerSchema.root(), readerSchema.root(), readerLayout);
-}
-
-} // namespace avro
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <memory>
+#include "Resolver.hh"
+#include "Layout.hh"
+#include "NodeImpl.hh"
+#include "ValidSchema.hh"
+#include "Reader.hh"
+#include "AvroTraits.hh"
+
+namespace avro {
+using std::unique_ptr;
+
+class ResolverFactory;
+typedef std::shared_ptr<Resolver> ResolverPtr;
+typedef std::vector<std::unique_ptr<Resolver> > ResolverPtrVector;
+
+// #define DEBUG_VERBOSE
+
+#ifdef DEBUG_VERBOSE
+#define DEBUG_OUT(str) std::cout << str << '\n'
+#else
+class NoOp {};
+template<typename T> NoOp& operator<<(NoOp &noOp, const T&) {
+ return noOp;
+}
+NoOp noop;
+#define DEBUG_OUT(str) noop << str
+#endif
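+
+// Editor's note: with DEBUG_VERBOSE undefined, DEBUG_OUT(str) expands to
+// noop << str, so DEBUG_OUT("Reading " << *location) becomes a chain of
+// NoOp insertions with no output; the expression still type-checks
+// identically in both build modes.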
+
+template<typename T>
+class PrimitiveSkipper : public Resolver
+{
+ public:
+
+ PrimitiveSkipper() :
+ Resolver()
+ {}
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ T val;
+ reader.readValue(val);
+ DEBUG_OUT("Skipping " << val);
+ }
+};
+
+template<typename T>
+class PrimitiveParser : public Resolver
+{
+ public:
+
+ PrimitiveParser(const PrimitiveLayout &offset) :
+ Resolver(),
+ offset_(offset.offset())
+ {}
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ T* location = reinterpret_cast<T *> (address + offset_);
+ reader.readValue(*location);
+ DEBUG_OUT("Reading " << *location);
+ }
+
+ private:
+
+ size_t offset_;
+};
+
+template<typename WT, typename RT>
+class PrimitivePromoter : public Resolver
+{
+ public:
+
+ PrimitivePromoter(const PrimitiveLayout &offset) :
+ Resolver(),
+ offset_(offset.offset())
+ {}
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ parseIt<WT>(reader, address);
+ }
+
+ private:
+
+ void parseIt(Reader &reader, uint8_t *address, const std::true_type &) const
+ {
+ WT val;
+ reader.readValue(val);
+ RT *location = reinterpret_cast<RT *> (address + offset_);
+ *location = static_cast<RT>(val);
+ DEBUG_OUT("Promoting " << val);
+ }
+
+ void parseIt(Reader &reader, uint8_t *address, const std::false_type &) const
+ { }
+
+ template<typename T>
+ void parseIt(Reader &reader, uint8_t *address) const
+ {
+ parseIt(reader, address, is_promotable<T>());
+ }
+
+ size_t offset_;
+};
+
+template <>
+class PrimitiveSkipper<std::vector<uint8_t> > : public Resolver
+{
+ public:
+
+ PrimitiveSkipper() :
+ Resolver()
+ {}
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ std::vector<uint8_t> val;
+ reader.readBytes(val);
+ DEBUG_OUT("Skipping bytes");
+ }
+};
+
+template <>
+class PrimitiveParser<std::vector<uint8_t> > : public Resolver
+{
+ public:
+
+ PrimitiveParser(const PrimitiveLayout &offset) :
+ Resolver(),
+ offset_(offset.offset())
+ {}
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ std::vector<uint8_t> *location = reinterpret_cast<std::vector<uint8_t> *> (address + offset_);
+ reader.readBytes(*location);
+ DEBUG_OUT("Reading bytes");
+ }
+
+ private:
+
+ size_t offset_;
+};
+
+class RecordSkipper : public Resolver
+{
+ public:
+
+ RecordSkipper(ResolverFactory &factory, const NodePtr &writer);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Skipping record");
+
+ reader.readRecord();
+ size_t steps = resolvers_.size();
+ for(size_t i = 0; i < steps; ++i) {
+ resolvers_[i]->parse(reader, address);
+ }
+ }
+
+ protected:
+
+ ResolverPtrVector resolvers_;
+
+};
+
+class RecordParser : public Resolver
+{
+ public:
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Reading record");
+
+ reader.readRecord();
+ size_t steps = resolvers_.size();
+ for(size_t i = 0; i < steps; ++i) {
+ resolvers_[i]->parse(reader, address);
+ }
+ }
+
+ RecordParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
+
+ protected:
+
+ ResolverPtrVector resolvers_;
+
+};
+
+
+class MapSkipper : public Resolver
+{
+ public:
+
+ MapSkipper(ResolverFactory &factory, const NodePtr &writer);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Skipping map");
+
+ std::string key;
+ int64_t size = 0;
+ do {
+ size = reader.readMapBlockSize();
+ for(int64_t i = 0; i < size; ++i) {
+ reader.readValue(key);
+ resolver_->parse(reader, address);
+ }
+ } while (size != 0);
+ }
+
+ protected:
+
+ ResolverPtr resolver_;
+};
+
+
+class MapParser : public Resolver
+{
+ public:
+
+ typedef uint8_t *(*GenericMapSetter)(uint8_t *map, const std::string &key);
+
+ MapParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Reading map");
+
+ uint8_t *mapAddress = address + offset_;
+
+ std::string key;
+ GenericMapSetter* setter = reinterpret_cast<GenericMapSetter *> (address + setFuncOffset_);
+
+ int64_t size = 0;
+ do {
+ size = reader.readMapBlockSize();
+ for(int64_t i = 0; i < size; ++i) {
+ reader.readValue(key);
+
+ // create a new map entry and get the address
+ uint8_t *location = (*setter)(mapAddress, key);
+ resolver_->parse(reader, location);
+ }
+ } while (size != 0);
+ }
+
+ protected:
+
+ ResolverPtr resolver_;
+ size_t offset_;
+ size_t setFuncOffset_;
+};
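+
+// Editor's sketch of the mechanism above: offsets.at(0) holds the location
+// of a GenericMapSetter function pointer inside the target object, and
+// offsets.offset() the location of the map itself. parse() calls the setter
+// once per key to allocate the next entry, then decodes the value directly
+// into the address the setter returns.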
+
+class ArraySkipper : public Resolver
+{
+ public:
+
+ ArraySkipper(ResolverFactory &factory, const NodePtr &writer);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Skipping array");
+
+ int64_t size = 0;
+ do {
+ size = reader.readArrayBlockSize();
+ for(int64_t i = 0; i < size; ++i) {
+ resolver_->parse(reader, address);
+ }
+ } while (size != 0);
+ }
+
+ protected:
+
+ ResolverPtr resolver_;
+};
+
+typedef uint8_t *(*GenericArraySetter)(uint8_t *array);
+
+class ArrayParser : public Resolver
+{
+ public:
+
+ ArrayParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Reading array");
+
+ uint8_t *arrayAddress = address + offset_;
+
+ GenericArraySetter* setter = reinterpret_cast<GenericArraySetter *> (address + setFuncOffset_);
+
+ int64_t size = 0;
+ do {
+ size = reader.readArrayBlockSize();
+ for(int64_t i = 0; i < size; ++i) {
+ // create a new map entry and get the address
+ uint8_t *location = (*setter)(arrayAddress);
+ resolver_->parse(reader, location);
+ }
+ } while (size != 0);
+ }
+
+ protected:
+
+ ArrayParser() :
+ Resolver()
+ {}
+
+ ResolverPtr resolver_;
+ size_t offset_;
+ size_t setFuncOffset_;
+};
+
+class EnumSkipper : public Resolver
+{
+ public:
+
+ EnumSkipper(ResolverFactory &factory, const NodePtr &writer) :
+ Resolver()
+ { }
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ int64_t val = reader.readEnum();
+ DEBUG_OUT("Skipping enum" << val);
+ }
+};
+
+class EnumParser : public Resolver
+{
+ public:
+
+ enum EnumRepresentation {
+ VAL
+ };
+
+ EnumParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
+ Resolver(),
+ offset_(offsets.at(0).offset()),
+ readerSize_(reader->names())
+ {
+ const size_t writerSize = writer->names();
+
+ mapping_.reserve(writerSize);
+
+ for(size_t i = 0; i < writerSize; ++i) {
+ const std::string &name = writer->nameAt(i);
+ size_t readerIndex = readerSize_;
+ reader->nameIndex(name, readerIndex);
+ mapping_.push_back(readerIndex);
+ }
+ }
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ size_t val = static_cast<size_t>(reader.readEnum());
+ assert(static_cast<size_t>(val) < mapping_.size());
+
+ if(mapping_[val] < readerSize_) {
+ EnumRepresentation* location = reinterpret_cast<EnumRepresentation *> (address + offset_);
+ *location = static_cast<EnumRepresentation>(mapping_[val]);
+ DEBUG_OUT("Setting enum" << *location);
+ }
+ }
+
+protected:
+
+ size_t offset_;
+ size_t readerSize_;
+ std::vector<size_t> mapping_;
+
+};
+
+class UnionSkipper : public Resolver
+{
+ public:
+
+ UnionSkipper(ResolverFactory &factory, const NodePtr &writer);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Skipping union");
+ size_t choice = static_cast<size_t>(reader.readUnion());
+ resolvers_[choice]->parse(reader, address);
+ }
+
+ protected:
+
+ ResolverPtrVector resolvers_;
+};
+
+
+class UnionParser : public Resolver
+{
+ public:
+
+ typedef uint8_t *(*GenericUnionSetter)(uint8_t *, int64_t);
+
+ UnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Reading union");
+ size_t writerChoice = static_cast<size_t>(reader.readUnion());
+ int64_t *readerChoice = reinterpret_cast<int64_t *>(address + choiceOffset_);
+
+ *readerChoice = choiceMapping_[writerChoice];
+ GenericUnionSetter* setter = reinterpret_cast<GenericUnionSetter *> (address + setFuncOffset_);
+ uint8_t *value = reinterpret_cast<uint8_t *> (address + offset_);
+ uint8_t *location = (*setter)(value, *readerChoice);
+
+ resolvers_[writerChoice]->parse(reader, location);
+ }
+
+ protected:
+
+ ResolverPtrVector resolvers_;
+ std::vector<int64_t> choiceMapping_;
+ size_t offset_;
+ size_t choiceOffset_;
+ size_t setFuncOffset_;
+};
+
+class UnionToNonUnionParser : public Resolver
+{
+ public:
+
+ typedef uint8_t *(*GenericUnionSetter)(uint8_t *, int64_t);
+
+ UnionToNonUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const Layout &offsets);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Reading union to non-union");
+ size_t choice = static_cast<size_t>(reader.readUnion());
+ resolvers_[choice]->parse(reader, address);
+ }
+
+ protected:
+
+ ResolverPtrVector resolvers_;
+};
+
+class NonUnionToUnionParser : public Resolver
+{
+ public:
+
+ typedef uint8_t *(*GenericUnionSetter)(uint8_t *, int64_t);
+
+ NonUnionToUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets);
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Reading non-union to union");
+
+ int64_t *choice = reinterpret_cast<int64_t *>(address + choiceOffset_);
+ *choice = choice_;
+ GenericUnionSetter* setter = reinterpret_cast<GenericUnionSetter *> (address + setFuncOffset_);
+ uint8_t *value = reinterpret_cast<uint8_t *> (address + offset_);
+ uint8_t *location = (*setter)(value, choice_);
+
+ resolver_->parse(reader, location);
+ }
+
+ protected:
+
+ ResolverPtr resolver_;
+ size_t choice_;
+ size_t offset_;
+ size_t choiceOffset_;
+ size_t setFuncOffset_;
+};
+
+class FixedSkipper : public Resolver
+{
+ public:
+
+ FixedSkipper(ResolverFactory &factory, const NodePtr &writer) :
+ Resolver()
+ {
+ size_ = writer->fixedSize();
+ }
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Skipping fixed");
+ std::unique_ptr<uint8_t[]> val(new uint8_t[size_]);
+ reader.readFixed(&val[0], size_);
+ }
+
+ protected:
+
+ int size_;
+
+};
+
+class FixedParser : public Resolver
+{
+ public:
+
+ FixedParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
+ Resolver()
+ {
+ size_ = writer->fixedSize();
+ offset_ = offsets.at(0).offset();
+ }
+
+ virtual void parse(Reader &reader, uint8_t *address) const
+ {
+ DEBUG_OUT("Reading fixed");
+ uint8_t *location = reinterpret_cast<uint8_t *> (address + offset_);
+ reader.readFixed(location, size_);
+ }
+
+ protected:
+
+ int size_;
+ size_t offset_;
+
+};
+
+
+class ResolverFactory : private boost::noncopyable {
+
+ template<typename T>
+ unique_ptr<Resolver>
+ constructPrimitiveSkipper(const NodePtr &writer)
+ {
+ return unique_ptr<Resolver>(new PrimitiveSkipper<T>());
+ }
+
+ template<typename T>
+ unique_ptr<Resolver>
+ constructPrimitive(const NodePtr &writer, const NodePtr &reader, const Layout &offset)
+ {
+ unique_ptr<Resolver> instruction;
+
+ SchemaResolution match = writer->resolve(*reader);
+
+ if (match == RESOLVE_NO_MATCH) {
+ instruction = unique_ptr<Resolver>(new PrimitiveSkipper<T>());
+ }
+ else if (reader->type() == AVRO_UNION) {
+ const CompoundLayout &compoundLayout = static_cast<const CompoundLayout &>(offset);
+ instruction = unique_ptr<Resolver>(new NonUnionToUnionParser(*this, writer, reader, compoundLayout));
+ }
+ else if (match == RESOLVE_MATCH) {
+ const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
+ instruction = unique_ptr<Resolver>(new PrimitiveParser<T>(primitiveLayout));
+ }
+ else if(match == RESOLVE_PROMOTABLE_TO_LONG) {
+ const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
+ instruction = unique_ptr<Resolver>(new PrimitivePromoter<T, int64_t>(primitiveLayout));
+ }
+ else if(match == RESOLVE_PROMOTABLE_TO_FLOAT) {
+ const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
+ instruction = unique_ptr<Resolver>(new PrimitivePromoter<T, float>(primitiveLayout));
+ }
+ else if(match == RESOLVE_PROMOTABLE_TO_DOUBLE) {
+ const PrimitiveLayout &primitiveLayout = static_cast<const PrimitiveLayout &>(offset);
+ instruction = unique_ptr<Resolver>(new PrimitivePromoter<T, double>(primitiveLayout));
+ }
+ else {
+ assert(0);
+ }
+ return instruction;
+ }
+
+ template<typename Skipper>
+ unique_ptr<Resolver>
+ constructCompoundSkipper(const NodePtr &writer)
+ {
+ return unique_ptr<Resolver>(new Skipper(*this, writer));
+ }
+
+
+ template<typename Parser, typename Skipper>
+ unique_ptr<Resolver>
+ constructCompound(const NodePtr &writer, const NodePtr &reader, const Layout &offset)
+ {
+ unique_ptr<Resolver> instruction;
+
+        SchemaResolution match = writer->resolve(*reader);
+
+ if (match == RESOLVE_NO_MATCH) {
+ instruction = unique_ptr<Resolver>(new Skipper(*this, writer));
+ }
+ else if(writer->type() != AVRO_UNION && reader->type() == AVRO_UNION) {
+ const CompoundLayout &compoundLayout = dynamic_cast<const CompoundLayout &>(offset);
+ instruction = unique_ptr<Resolver>(new NonUnionToUnionParser(*this, writer, reader, compoundLayout));
+ }
+ else if(writer->type() == AVRO_UNION && reader->type() != AVRO_UNION) {
+ instruction = unique_ptr<Resolver>(new UnionToNonUnionParser(*this, writer, reader, offset));
+ }
+ else {
+ const CompoundLayout &compoundLayout = dynamic_cast<const CompoundLayout &>(offset);
+ instruction = unique_ptr<Resolver>(new Parser(*this, writer, reader, compoundLayout));
+ }
+
+ return instruction;
+ }
+
+ public:
+
+ unique_ptr<Resolver>
+ construct(const NodePtr &writer, const NodePtr &reader, const Layout &offset)
+ {
+
+ typedef unique_ptr<Resolver> (ResolverFactory::*BuilderFunc)(const NodePtr &writer, const NodePtr &reader, const Layout &offset);
+
+ NodePtr currentWriter = (writer->type() == AVRO_SYMBOLIC) ?
+ resolveSymbol(writer) : writer;
+
+ NodePtr currentReader = (reader->type() == AVRO_SYMBOLIC) ?
+ resolveSymbol(reader) : reader;
+
+ static const BuilderFunc funcs[] = {
+ &ResolverFactory::constructPrimitive<std::string>,
+ &ResolverFactory::constructPrimitive<std::vector<uint8_t> >,
+ &ResolverFactory::constructPrimitive<int32_t>,
+ &ResolverFactory::constructPrimitive<int64_t>,
+ &ResolverFactory::constructPrimitive<float>,
+ &ResolverFactory::constructPrimitive<double>,
+ &ResolverFactory::constructPrimitive<bool>,
+ &ResolverFactory::constructPrimitive<Null>,
+ &ResolverFactory::constructCompound<RecordParser, RecordSkipper>,
+ &ResolverFactory::constructCompound<EnumParser, EnumSkipper>,
+ &ResolverFactory::constructCompound<ArrayParser, ArraySkipper>,
+ &ResolverFactory::constructCompound<MapParser, MapSkipper>,
+ &ResolverFactory::constructCompound<UnionParser, UnionSkipper>,
+ &ResolverFactory::constructCompound<FixedParser, FixedSkipper>
+ };
+
+ static_assert((sizeof(funcs)/sizeof(BuilderFunc)) == (AVRO_NUM_TYPES),
+ "Invalid number of builder functions");
+
+ BuilderFunc func = funcs[currentWriter->type()];
+ assert(func);
+
+ return ((this)->*(func))(currentWriter, currentReader, offset);
+ }
+
+ unique_ptr<Resolver>
+ skipper(const NodePtr &writer)
+ {
+
+ typedef unique_ptr<Resolver> (ResolverFactory::*BuilderFunc)(const NodePtr &writer);
+
+ NodePtr currentWriter = (writer->type() == AVRO_SYMBOLIC) ?
+ writer->leafAt(0) : writer;
+
+ static const BuilderFunc funcs[] = {
+ &ResolverFactory::constructPrimitiveSkipper<std::string>,
+ &ResolverFactory::constructPrimitiveSkipper<std::vector<uint8_t> >,
+ &ResolverFactory::constructPrimitiveSkipper<int32_t>,
+ &ResolverFactory::constructPrimitiveSkipper<int64_t>,
+ &ResolverFactory::constructPrimitiveSkipper<float>,
+ &ResolverFactory::constructPrimitiveSkipper<double>,
+ &ResolverFactory::constructPrimitiveSkipper<bool>,
+ &ResolverFactory::constructPrimitiveSkipper<Null>,
+ &ResolverFactory::constructCompoundSkipper<RecordSkipper>,
+ &ResolverFactory::constructCompoundSkipper<EnumSkipper>,
+ &ResolverFactory::constructCompoundSkipper<ArraySkipper>,
+ &ResolverFactory::constructCompoundSkipper<MapSkipper>,
+ &ResolverFactory::constructCompoundSkipper<UnionSkipper>,
+ &ResolverFactory::constructCompoundSkipper<FixedSkipper>
+ };
+
+ static_assert((sizeof(funcs)/sizeof(BuilderFunc)) == (AVRO_NUM_TYPES),
+ "Invalid number of builder functions");
+
+ BuilderFunc func = funcs[currentWriter->type()];
+ assert(func);
+
+ return ((this)->*(func))(currentWriter);
+ }
+};
+
+
+RecordSkipper::RecordSkipper(ResolverFactory &factory, const NodePtr &writer) :
+ Resolver()
+{
+ size_t leaves = writer->leaves();
+ resolvers_.reserve(leaves);
+ for(size_t i = 0; i < leaves; ++i) {
+ const NodePtr &w = writer->leafAt(i);
+ resolvers_.push_back(factory.skipper(w));
+ }
+}
+
+RecordParser::RecordParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
+ Resolver()
+{
+ size_t leaves = writer->leaves();
+ resolvers_.reserve(leaves);
+ for(size_t i = 0; i < leaves; ++i) {
+
+ const NodePtr &w = writer->leafAt(i);
+
+ const std::string &name = writer->nameAt(i);
+
+ size_t readerIndex = 0;
+ bool found = reader->nameIndex(name, readerIndex);
+
+ if(found) {
+ const NodePtr &r = reader->leafAt(readerIndex);
+ resolvers_.push_back(factory.construct(w, r, offsets.at(readerIndex)));
+ }
+ else {
+ resolvers_.push_back(factory.skipper(w));
+ }
+ }
+}
+
+MapSkipper::MapSkipper(ResolverFactory &factory, const NodePtr &writer) :
+ Resolver(),
+ resolver_(factory.skipper(writer->leafAt(1)))
+{ }
+
+MapParser::MapParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
+ Resolver(),
+ resolver_(factory.construct(writer->leafAt(1), reader->leafAt(1), offsets.at(1))),
+ offset_(offsets.offset()),
+ setFuncOffset_( offsets.at(0).offset())
+{ }
+
+ArraySkipper::ArraySkipper(ResolverFactory &factory, const NodePtr &writer) :
+ Resolver(),
+ resolver_(factory.skipper(writer->leafAt(0)))
+{ }
+
+ArrayParser::ArrayParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
+ Resolver(),
+ resolver_(factory.construct(writer->leafAt(0), reader->leafAt(0), offsets.at(1))),
+ offset_(offsets.offset()),
+ setFuncOffset_(offsets.at(0).offset())
+{ }
+
+UnionSkipper::UnionSkipper(ResolverFactory &factory, const NodePtr &writer) :
+ Resolver()
+{
+ size_t leaves = writer->leaves();
+ resolvers_.reserve(leaves);
+ for(size_t i = 0; i < leaves; ++i) {
+ const NodePtr &w = writer->leafAt(i);
+ resolvers_.push_back(factory.skipper(w));
+ }
+}
+
+namespace {
+
+// assumes the writer is NOT a union, and the reader IS a union
+
+SchemaResolution
+checkUnionMatch(const NodePtr &writer, const NodePtr &reader, size_t &index)
+{
+ SchemaResolution bestMatch = RESOLVE_NO_MATCH;
+
+ index = 0;
+ size_t leaves = reader->leaves();
+
+ for(size_t i=0; i < leaves; ++i) {
+
+ const NodePtr &leaf = reader->leafAt(i);
+ SchemaResolution newMatch = writer->resolve(*leaf);
+
+ if(newMatch == RESOLVE_MATCH) {
+ bestMatch = newMatch;
+ index = i;
+ break;
+ }
+ if(bestMatch == RESOLVE_NO_MATCH) {
+ bestMatch = newMatch;
+ index = i;
+ }
+ }
+
+ return bestMatch;
+}
+
+} // anonymous namespace
+
+UnionParser::UnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
+ Resolver(),
+ offset_(offsets.offset()),
+ choiceOffset_(offsets.at(0).offset()),
+ setFuncOffset_(offsets.at(1).offset())
+{
+
+ size_t leaves = writer->leaves();
+ resolvers_.reserve(leaves);
+ choiceMapping_.reserve(leaves);
+ for(size_t i = 0; i < leaves; ++i) {
+
+ // for each writer, we need a schema match for the reader
+ const NodePtr &w = writer->leafAt(i);
+ size_t index = 0;
+
+ SchemaResolution match = checkUnionMatch(w, reader, index);
+
+ if(match == RESOLVE_NO_MATCH) {
+ resolvers_.push_back(factory.skipper(w));
+            // push back an out-of-range index as a sentinel
+ choiceMapping_.push_back(reader->leaves());
+ }
+ else {
+ const NodePtr &r = reader->leafAt(index);
+ resolvers_.push_back(factory.construct(w, r, offsets.at(index+2)));
+ choiceMapping_.push_back(index);
+ }
+ }
+}
+
+NonUnionToUnionParser::NonUnionToUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const CompoundLayout &offsets) :
+ Resolver(),
+ offset_(offsets.offset()),
+ choiceOffset_(offsets.at(0).offset()),
+ setFuncOffset_(offsets.at(1).offset())
+{
+#ifndef NDEBUG
+ SchemaResolution bestMatch =
+#endif
+ checkUnionMatch(writer, reader, choice_);
+ assert(bestMatch != RESOLVE_NO_MATCH);
+ resolver_ = factory.construct(writer, reader->leafAt(choice_), offsets.at(choice_+2));
+}
+
+UnionToNonUnionParser::UnionToNonUnionParser(ResolverFactory &factory, const NodePtr &writer, const NodePtr &reader, const Layout &offsets) :
+ Resolver()
+{
+ size_t leaves = writer->leaves();
+ resolvers_.reserve(leaves);
+ for(size_t i = 0; i < leaves; ++i) {
+ const NodePtr &w = writer->leafAt(i);
+ resolvers_.push_back(factory.construct(w, reader, offsets));
+ }
+}
+
+unique_ptr<Resolver> constructResolver(const ValidSchema &writerSchema,
+ const ValidSchema &readerSchema,
+ const Layout &readerLayout)
+{
+ ResolverFactory factory;
+ return factory.construct(writerSchema.root(), readerSchema.root(), readerLayout);
+}
+
+} // namespace avro
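
A minimal sketch of how the resolver machinery above is typically driven, assuming a
Layout matching the reader's in-memory type has already been generated; the MyRecord
struct, the decodeOne helper, and the Layout.hh/Reader.hh header names are illustrative
assumptions, not part of this diff:

    #include "Layout.hh"
    #include "Reader.hh"
    #include "ResolverSchema.hh"
    #include "ValidSchema.hh"

    // Hypothetical reader-side type; real layouts come from the code
    // generator that pairs with this resolver machinery.
    struct MyRecord { int32_t id; double score; };

    void decodeOne(const avro::ValidSchema &writerSchema,
                   const avro::ValidSchema &readerSchema,
                   const avro::Layout &readerLayout,
                   avro::Reader &reader)
    {
        MyRecord rec;
        // ResolverSchema (defined in ResolverSchema.cc below) wraps
        // constructResolver() and writes each decoded field at the
        // offset the layout records for it.
        avro::ResolverSchema resolver(writerSchema, readerSchema, readerLayout);
        resolver.parse(reader, reinterpret_cast<uint8_t *>(&rec));
    }
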
diff --git a/contrib/libs/apache/avro/impl/ResolverSchema.cc b/contrib/libs/apache/avro/impl/ResolverSchema.cc
index f42946d6929..6ce8592559d 100644
--- a/contrib/libs/apache/avro/impl/ResolverSchema.cc
+++ b/contrib/libs/apache/avro/impl/ResolverSchema.cc
@@ -1,39 +1,39 @@
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ResolverSchema.hh"
-#include "Resolver.hh"
-#include "ValidSchema.hh"
-
-namespace avro {
-
-ResolverSchema::ResolverSchema(
- const ValidSchema &writerSchema,
- const ValidSchema &readerSchema,
- const Layout &readerLayout) :
- resolver_(constructResolver(writerSchema, readerSchema, readerLayout))
-{ }
-
-void
-ResolverSchema::parse(Reader &reader, uint8_t *address)
-{
- resolver_->parse(reader, address);
-}
-
-} // namespace avro
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ResolverSchema.hh"
+#include "Resolver.hh"
+#include "ValidSchema.hh"
+
+namespace avro {
+
+ResolverSchema::ResolverSchema(
+ const ValidSchema &writerSchema,
+ const ValidSchema &readerSchema,
+ const Layout &readerLayout) :
+ resolver_(constructResolver(writerSchema, readerSchema, readerLayout))
+{ }
+
+void
+ResolverSchema::parse(Reader &reader, uint8_t *address)
+{
+ resolver_->parse(reader, address);
+}
+
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/Schema.cc b/contrib/libs/apache/avro/impl/Schema.cc
index e6cfa45c2e8..077212895bf 100644
--- a/contrib/libs/apache/avro/impl/Schema.cc
+++ b/contrib/libs/apache/avro/impl/Schema.cc
@@ -1,139 +1,139 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "Schema.hh"
-
-namespace avro {
-
-Schema::Schema()
-{ }
-
-Schema::~Schema()
-{ }
-
-Schema::Schema(const NodePtr &node) :
- node_(node)
-{ }
-
-Schema::Schema(Node *node) :
- node_(node)
-{ }
-
-RecordSchema::RecordSchema(const std::string &name) :
- Schema(new NodeRecord)
-{
- node_->setName(name);
-}
-
-void
-RecordSchema::addField(const std::string &name, const Schema &fieldSchema)
-{
- // add the name first. it will throw if the name is a duplicate, preventing
- // the leaf from being added
- node_->addName(name);
-
- node_->addLeaf(fieldSchema.root());
-}
-
-std::string RecordSchema::getDoc() const
-{
- return node_->getDoc();
-}
-void RecordSchema::setDoc(const std::string& doc)
-{
- node_->setDoc(doc);
-}
-
-EnumSchema::EnumSchema(const std::string &name) :
- Schema(new NodeEnum)
-{
- node_->setName(name);
-}
-
-void
-EnumSchema::addSymbol(const std::string &symbol)
-{
- node_->addName(symbol);
-}
-
-ArraySchema::ArraySchema(const Schema &itemsSchema) :
- Schema(new NodeArray)
-{
- node_->addLeaf(itemsSchema.root());
-}
-
-ArraySchema::ArraySchema(const ArraySchema &itemsSchema) :
- Schema(new NodeArray)
-{
- node_->addLeaf(itemsSchema.root());
-}
-
-MapSchema::MapSchema(const Schema &valuesSchema) :
- Schema(new NodeMap)
-{
- node_->addLeaf(valuesSchema.root());
-}
-
-MapSchema::MapSchema(const MapSchema &valuesSchema) :
- Schema(new NodeMap)
-{
- node_->addLeaf(valuesSchema.root());
-}
-
-UnionSchema::UnionSchema() :
- Schema(new NodeUnion)
-{ }
-
-void
-UnionSchema::addType(const Schema &typeSchema)
-{
- if(typeSchema.type() == AVRO_UNION) {
- throw Exception("Cannot add unions to unions");
- }
-
- if(typeSchema.type() == AVRO_RECORD) {
- // check for duplicate records
- size_t types = node_->leaves();
- for(size_t i = 0; i < types; ++i) {
- const NodePtr &leaf = node_->leafAt(i);
- // TODO, more checks?
- if(leaf->type() == AVRO_RECORD && leaf->name() == typeSchema.root()->name()) {
- throw Exception("Records in unions cannot have duplicate names");
- }
- }
- }
-
- node_->addLeaf(typeSchema.root());
-}
-
-FixedSchema::FixedSchema(int size, const std::string &name) :
- Schema(new NodeFixed)
-{
- node_->setFixedSize(size);
- node_->setName(name);
-}
-
-SymbolicSchema::SymbolicSchema(const Name &name, const NodePtr& link) :
- Schema(new NodeSymbolic(HasName(name), link))
-{
-}
-
-
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "Schema.hh"
+
+namespace avro {
+
+Schema::Schema()
+{ }
+
+Schema::~Schema()
+{ }
+
+Schema::Schema(const NodePtr &node) :
+ node_(node)
+{ }
+
+Schema::Schema(Node *node) :
+ node_(node)
+{ }
+
+RecordSchema::RecordSchema(const std::string &name) :
+ Schema(new NodeRecord)
+{
+ node_->setName(name);
+}
+
+void
+RecordSchema::addField(const std::string &name, const Schema &fieldSchema)
+{
+ // add the name first. it will throw if the name is a duplicate, preventing
+ // the leaf from being added
+ node_->addName(name);
+
+ node_->addLeaf(fieldSchema.root());
+}
+
+std::string RecordSchema::getDoc() const
+{
+ return node_->getDoc();
+}
+void RecordSchema::setDoc(const std::string& doc)
+{
+ node_->setDoc(doc);
+}
+
+EnumSchema::EnumSchema(const std::string &name) :
+ Schema(new NodeEnum)
+{
+ node_->setName(name);
+}
+
+void
+EnumSchema::addSymbol(const std::string &symbol)
+{
+ node_->addName(symbol);
+}
+
+ArraySchema::ArraySchema(const Schema &itemsSchema) :
+ Schema(new NodeArray)
+{
+ node_->addLeaf(itemsSchema.root());
+}
+
+ArraySchema::ArraySchema(const ArraySchema &itemsSchema) :
+ Schema(new NodeArray)
+{
+ node_->addLeaf(itemsSchema.root());
+}
+
+MapSchema::MapSchema(const Schema &valuesSchema) :
+ Schema(new NodeMap)
+{
+ node_->addLeaf(valuesSchema.root());
+}
+
+MapSchema::MapSchema(const MapSchema &valuesSchema) :
+ Schema(new NodeMap)
+{
+ node_->addLeaf(valuesSchema.root());
+}
+
+UnionSchema::UnionSchema() :
+ Schema(new NodeUnion)
+{ }
+
+void
+UnionSchema::addType(const Schema &typeSchema)
+{
+ if(typeSchema.type() == AVRO_UNION) {
+ throw Exception("Cannot add unions to unions");
+ }
+
+ if(typeSchema.type() == AVRO_RECORD) {
+ // check for duplicate records
+ size_t types = node_->leaves();
+ for(size_t i = 0; i < types; ++i) {
+ const NodePtr &leaf = node_->leafAt(i);
+ // TODO, more checks?
+ if(leaf->type() == AVRO_RECORD && leaf->name() == typeSchema.root()->name()) {
+ throw Exception("Records in unions cannot have duplicate names");
+ }
+ }
+ }
+
+ node_->addLeaf(typeSchema.root());
+}
+
+FixedSchema::FixedSchema(int size, const std::string &name) :
+ Schema(new NodeFixed)
+{
+ node_->setFixedSize(size);
+ node_->setName(name);
+}
+
+SymbolicSchema::SymbolicSchema(const Name &name, const NodePtr& link) :
+ Schema(new NodeSymbolic(HasName(name), link))
+{
+}
+
+
+
+} // namespace avro
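
A short usage sketch for the schema-building API above. NullSchema is used elsewhere in
this library; IntSchema and StringSchema are assumed to be the sibling primitive
wrappers declared alongside these classes in Schema.hh:

    #include "Schema.hh"
    #include "ValidSchema.hh"

    avro::ValidSchema makeContactSchema()
    {
        avro::RecordSchema record("Contact");
        record.addField("id", avro::IntSchema());      // addName throws on duplicates
        record.addField("name", avro::StringSchema());

        avro::UnionSchema phone;                       // ["null", "string"]
        phone.addType(avro::NullSchema());
        phone.addType(avro::StringSchema());           // a nested union would throw
        record.addField("phone", phone);

        return avro::ValidSchema(record);              // validates and resolves names
    }
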
diff --git a/contrib/libs/apache/avro/impl/Stream.cc b/contrib/libs/apache/avro/impl/Stream.cc
index 7023f3f2136..ab7849e5b62 100644
--- a/contrib/libs/apache/avro/impl/Stream.cc
+++ b/contrib/libs/apache/avro/impl/Stream.cc
@@ -1,198 +1,198 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Stream.hh"
-#include <vector>
-
-namespace avro {
-
-using std::vector;
-
-class MemoryInputStream : public InputStream {
- const std::vector<uint8_t*>& data_;
- const size_t chunkSize_;
- const size_t size_;
- const size_t available_;
- size_t cur_;
- size_t curLen_;
-
- size_t maxLen() {
- size_t n = (cur_ == (size_ - 1)) ? available_ : chunkSize_;
- if (n == curLen_) {
- if (cur_ == (size_ - 1)) {
- return 0;
- }
- ++cur_;
- n = (cur_ == (size_ - 1)) ? available_ : chunkSize_;
- curLen_ = 0;
- }
- return n;
- }
-
-public:
- MemoryInputStream(const std::vector<uint8_t*>& b,
- size_t chunkSize, size_t available) :
- data_(b), chunkSize_(chunkSize), size_(b.size()),
- available_(available), cur_(0), curLen_(0) { }
-
- bool next(const uint8_t** data, size_t* len) {
- if (size_t n = maxLen()) {
- *data = data_[cur_] + curLen_;
- *len = n - curLen_;
- curLen_ = n;
- return true;
- }
- return false;
- }
-
- void backup(size_t len) {
- curLen_ -= len;
- }
-
- void skip(size_t len) {
- while (len > 0) {
- if (size_t n = maxLen()) {
- if ((curLen_ + len) < n) {
- n = curLen_ + len;
- }
- len -= n - curLen_;
- curLen_ = n;
- } else {
- break;
- }
- }
- }
-
- size_t byteCount() const {
- return cur_ * chunkSize_ + curLen_;
- }
-};
-
-class MemoryInputStream2 : public InputStream {
- const uint8_t* const data_;
- const size_t size_;
- size_t curLen_;
-public:
- MemoryInputStream2(const uint8_t *data, size_t len)
- : data_(data), size_(len), curLen_(0) { }
-
- bool next(const uint8_t** data, size_t* len) {
- if (curLen_ == size_) {
- return false;
- }
- *data = &data_[curLen_];
- *len = size_ - curLen_;
- curLen_ = size_;
- return true;
- }
-
- void backup(size_t len) {
- curLen_ -= len;
- }
-
- void skip(size_t len) {
- if (len > (size_ - curLen_)) {
- len = size_ - curLen_;
- }
- curLen_ += len;
- }
-
- size_t byteCount() const {
- return curLen_;
- }
-};
-
-class MemoryOutputStream : public OutputStream {
-public:
- const size_t chunkSize_;
- std::vector<uint8_t*> data_;
- size_t available_;
- size_t byteCount_;
-
- MemoryOutputStream(size_t chunkSize) : chunkSize_(chunkSize),
- available_(0), byteCount_(0) { }
- ~MemoryOutputStream() {
- for (std::vector<uint8_t*>::const_iterator it = data_.begin();
- it != data_.end(); ++it) {
- delete[] *it;
- }
- }
-
- bool next(uint8_t** data, size_t* len) {
- if (available_ == 0) {
- data_.push_back(new uint8_t[chunkSize_]);
- available_ = chunkSize_;
- }
- *data = &data_.back()[chunkSize_ - available_];
- *len = available_;
- byteCount_ += available_;
- available_ = 0;
- return true;
- }
-
- void backup(size_t len) {
- available_ += len;
- byteCount_ -= len;
- }
-
- uint64_t byteCount() const {
- return byteCount_;
- }
-
- void flush() { }
-};
-
-std::unique_ptr<OutputStream> memoryOutputStream(size_t chunkSize)
-{
- return std::unique_ptr<OutputStream>(new MemoryOutputStream(chunkSize));
-}
-
-std::unique_ptr<InputStream> memoryInputStream(const uint8_t* data, size_t len)
-{
- return std::unique_ptr<InputStream>(new MemoryInputStream2(data, len));
-}
-
-std::unique_ptr<InputStream> memoryInputStream(const OutputStream& source)
-{
- const MemoryOutputStream& mos =
- dynamic_cast<const MemoryOutputStream&>(source);
- return (mos.data_.empty()) ?
- std::unique_ptr<InputStream>(new MemoryInputStream2(0, 0)) :
- std::unique_ptr<InputStream>(new MemoryInputStream(mos.data_,
- mos.chunkSize_,
- (mos.chunkSize_ - mos.available_)));
-}
-
-std::shared_ptr<std::vector<uint8_t> > snapshot(const OutputStream& source)
-{
- const MemoryOutputStream& mos =
- dynamic_cast<const MemoryOutputStream&>(source);
- std::shared_ptr<std::vector<uint8_t> > result(new std::vector<uint8_t>());
- size_t c = mos.byteCount_;
- result->reserve(mos.byteCount_);
- for (vector<uint8_t*>::const_iterator it = mos.data_.begin();
- it != mos.data_.end(); ++it) {
- size_t n = std::min(c, mos.chunkSize_);
- std::copy(*it, *it + n, std::back_inserter(*result));
- c -= n;
- }
- return result;
-}
-
-} // namespace avro
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Stream.hh"
+#include <vector>
+
+namespace avro {
+
+using std::vector;
+
+class MemoryInputStream : public InputStream {
+ const std::vector<uint8_t*>& data_;
+ const size_t chunkSize_;
+ const size_t size_;
+ const size_t available_;
+ size_t cur_;
+ size_t curLen_;
+
+ size_t maxLen() {
+ size_t n = (cur_ == (size_ - 1)) ? available_ : chunkSize_;
+ if (n == curLen_) {
+ if (cur_ == (size_ - 1)) {
+ return 0;
+ }
+ ++cur_;
+ n = (cur_ == (size_ - 1)) ? available_ : chunkSize_;
+ curLen_ = 0;
+ }
+ return n;
+ }
+
+public:
+ MemoryInputStream(const std::vector<uint8_t*>& b,
+ size_t chunkSize, size_t available) :
+ data_(b), chunkSize_(chunkSize), size_(b.size()),
+ available_(available), cur_(0), curLen_(0) { }
+
+ bool next(const uint8_t** data, size_t* len) {
+ if (size_t n = maxLen()) {
+ *data = data_[cur_] + curLen_;
+ *len = n - curLen_;
+ curLen_ = n;
+ return true;
+ }
+ return false;
+ }
+
+ void backup(size_t len) {
+ curLen_ -= len;
+ }
+
+ void skip(size_t len) {
+ while (len > 0) {
+ if (size_t n = maxLen()) {
+ if ((curLen_ + len) < n) {
+ n = curLen_ + len;
+ }
+ len -= n - curLen_;
+ curLen_ = n;
+ } else {
+ break;
+ }
+ }
+ }
+
+ size_t byteCount() const {
+ return cur_ * chunkSize_ + curLen_;
+ }
+};
+
+class MemoryInputStream2 : public InputStream {
+ const uint8_t* const data_;
+ const size_t size_;
+ size_t curLen_;
+public:
+ MemoryInputStream2(const uint8_t *data, size_t len)
+ : data_(data), size_(len), curLen_(0) { }
+
+ bool next(const uint8_t** data, size_t* len) {
+ if (curLen_ == size_) {
+ return false;
+ }
+ *data = &data_[curLen_];
+ *len = size_ - curLen_;
+ curLen_ = size_;
+ return true;
+ }
+
+ void backup(size_t len) {
+ curLen_ -= len;
+ }
+
+ void skip(size_t len) {
+ if (len > (size_ - curLen_)) {
+ len = size_ - curLen_;
+ }
+ curLen_ += len;
+ }
+
+ size_t byteCount() const {
+ return curLen_;
+ }
+};
+
+class MemoryOutputStream : public OutputStream {
+public:
+ const size_t chunkSize_;
+ std::vector<uint8_t*> data_;
+ size_t available_;
+ size_t byteCount_;
+
+ MemoryOutputStream(size_t chunkSize) : chunkSize_(chunkSize),
+ available_(0), byteCount_(0) { }
+ ~MemoryOutputStream() {
+ for (std::vector<uint8_t*>::const_iterator it = data_.begin();
+ it != data_.end(); ++it) {
+ delete[] *it;
+ }
+ }
+
+ bool next(uint8_t** data, size_t* len) {
+ if (available_ == 0) {
+ data_.push_back(new uint8_t[chunkSize_]);
+ available_ = chunkSize_;
+ }
+ *data = &data_.back()[chunkSize_ - available_];
+ *len = available_;
+ byteCount_ += available_;
+ available_ = 0;
+ return true;
+ }
+
+ void backup(size_t len) {
+ available_ += len;
+ byteCount_ -= len;
+ }
+
+ uint64_t byteCount() const {
+ return byteCount_;
+ }
+
+ void flush() { }
+};
+
+std::unique_ptr<OutputStream> memoryOutputStream(size_t chunkSize)
+{
+ return std::unique_ptr<OutputStream>(new MemoryOutputStream(chunkSize));
+}
+
+std::unique_ptr<InputStream> memoryInputStream(const uint8_t* data, size_t len)
+{
+ return std::unique_ptr<InputStream>(new MemoryInputStream2(data, len));
+}
+
+std::unique_ptr<InputStream> memoryInputStream(const OutputStream& source)
+{
+ const MemoryOutputStream& mos =
+ dynamic_cast<const MemoryOutputStream&>(source);
+ return (mos.data_.empty()) ?
+ std::unique_ptr<InputStream>(new MemoryInputStream2(0, 0)) :
+ std::unique_ptr<InputStream>(new MemoryInputStream(mos.data_,
+ mos.chunkSize_,
+ (mos.chunkSize_ - mos.available_)));
+}
+
+std::shared_ptr<std::vector<uint8_t> > snapshot(const OutputStream& source)
+{
+ const MemoryOutputStream& mos =
+ dynamic_cast<const MemoryOutputStream&>(source);
+ std::shared_ptr<std::vector<uint8_t> > result(new std::vector<uint8_t>());
+ size_t c = mos.byteCount_;
+ result->reserve(mos.byteCount_);
+ for (vector<uint8_t*>::const_iterator it = mos.data_.begin();
+ it != mos.data_.end(); ++it) {
+ size_t n = std::min(c, mos.chunkSize_);
+ std::copy(*it, *it + n, std::back_inserter(*result));
+ c -= n;
+ }
+ return result;
+}
+
+} // namespace avro
+
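A small round-trip sketch for the in-memory streams above. Note that the InputStream
returned by memoryInputStream(const OutputStream&) reads the output stream's chunks in
place, so the source stream must outlive it:

    #include "Stream.hh"
    #include <cstring>
    #include <memory>
    #include <vector>

    void roundTrip()
    {
        std::unique_ptr<avro::OutputStream> out = avro::memoryOutputStream(1024);
        uint8_t *buf;
        size_t len;
        out->next(&buf, &len);        // borrow a whole 1024-byte chunk
        std::memcpy(buf, "avro", 4);
        out->backup(len - 4);         // hand back the unused tail of the chunk

        // snapshot() copies the 4 written bytes into a fresh vector.
        std::shared_ptr<std::vector<uint8_t> > bytes = avro::snapshot(*out);

        // memoryInputStream() instead iterates the chunks without copying.
        std::unique_ptr<avro::InputStream> in = avro::memoryInputStream(*out);
        const uint8_t *data;
        size_t n;
        while (in->next(&data, &n)) {
            // consume n bytes starting at data
        }
    }
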
diff --git a/contrib/libs/apache/avro/impl/Types.cc b/contrib/libs/apache/avro/impl/Types.cc
index 6c9d702b831..e7485fe8ac4 100644
--- a/contrib/libs/apache/avro/impl/Types.cc
+++ b/contrib/libs/apache/avro/impl/Types.cc
@@ -1,82 +1,82 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include <string>
-#include "Types.hh"
-
-namespace avro {
-
-namespace strings {
-const std::string typeToString[] = {
- "string",
- "bytes",
- "int",
- "long",
- "float",
- "double",
- "boolean",
- "null",
- "record",
- "enum",
- "array",
- "map",
- "union",
- "fixed",
- "symbolic"
-};
-
-static_assert((sizeof(typeToString)/sizeof(std::string)) == (AVRO_NUM_TYPES+1),
- "Incorrect Avro typeToString");
-
-} // namespace strings
-
-
-// this static assert exists because a 32 bit integer is used as a bit-flag for each type,
-// and it would be a problem for this flag if we ever supported more than 32 types
-static_assert(AVRO_NUM_TYPES < 32, "Too many Avro types");
-
-const std::string& toString(Type type)
-{
- static std::string undefinedType = "Undefined type";
- if (isAvroTypeOrPseudoType(type)) {
- return strings::typeToString[type];
- } else {
- return undefinedType;
- }
-}
-
-std::ostream &operator<< (std::ostream &os, Type type)
-{
- if(isAvroTypeOrPseudoType(type)) {
- os << strings::typeToString[type];
- }
- else {
- os << static_cast<int>(type);
- }
- return os;
-}
-
-std::ostream &operator<< (std::ostream &os, const Null &)
-{
- os << "(null value)";
- return os;
-}
-
-} // namespace avro
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <string>
+#include "Types.hh"
+
+namespace avro {
+
+namespace strings {
+const std::string typeToString[] = {
+ "string",
+ "bytes",
+ "int",
+ "long",
+ "float",
+ "double",
+ "boolean",
+ "null",
+ "record",
+ "enum",
+ "array",
+ "map",
+ "union",
+ "fixed",
+ "symbolic"
+};
+
+static_assert((sizeof(typeToString)/sizeof(std::string)) == (AVRO_NUM_TYPES+1),
+ "Incorrect Avro typeToString");
+
+} // namespace strings
+
+
+// this static assert exists because a 32 bit integer is used as a bit-flag for each type,
+// and it would be a problem for this flag if we ever supported more than 32 types
+static_assert(AVRO_NUM_TYPES < 32, "Too many Avro types");
+
+const std::string& toString(Type type)
+{
+ static std::string undefinedType = "Undefined type";
+ if (isAvroTypeOrPseudoType(type)) {
+ return strings::typeToString[type];
+ } else {
+ return undefinedType;
+ }
+}
+
+std::ostream &operator<< (std::ostream &os, Type type)
+{
+ if(isAvroTypeOrPseudoType(type)) {
+ os << strings::typeToString[type];
+ }
+ else {
+ os << static_cast<int>(type);
+ }
+ return os;
+}
+
+std::ostream &operator<< (std::ostream &os, const Null &)
+{
+ os << "(null value)";
+ return os;
+}
+
+} // namespace avro
+
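For illustration, the helpers above make Type values printable directly:

    #include "Types.hh"
    #include <iostream>

    void printTypes()
    {
        std::cout << avro::toString(avro::AVRO_RECORD) << '\n'; // "record"
        std::cout << avro::AVRO_MAP << '\n';                    // "map", via operator<<
        std::cout << avro::Null() << '\n';                      // "(null value)"
    }
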
diff --git a/contrib/libs/apache/avro/impl/ValidSchema.cc b/contrib/libs/apache/avro/impl/ValidSchema.cc
index 74a3f845c5e..6e9762ccba8 100644
--- a/contrib/libs/apache/avro/impl/ValidSchema.cc
+++ b/contrib/libs/apache/avro/impl/ValidSchema.cc
@@ -1,193 +1,193 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <boost/format.hpp>
-#include <cctype>
-#include <sstream>
-
-#include "ValidSchema.hh"
-#include "Schema.hh"
-#include "Node.hh"
-
-using std::string;
-using std::ostringstream;
-using std::make_pair;
-using boost::format;
-using std::shared_ptr;
-using std::static_pointer_cast;
-
-namespace avro {
-typedef std::map<Name, NodePtr> SymbolMap;
-
-static bool validate(const NodePtr &node, SymbolMap &symbolMap)
-{
- if (! node->isValid()) {
- throw Exception(format("Schema is invalid, due to bad node of type %1%")
- % node->type());
- }
-
- if (node->hasName()) {
- const Name& nm = node->name();
- SymbolMap::iterator it = symbolMap.find(nm);
- bool found = it != symbolMap.end() && nm == it->first;
-
- if (node->type() == AVRO_SYMBOLIC) {
- if (! found) {
- throw Exception(format("Symbolic name \"%1%\" is unknown") %
- node->name());
- }
-
- shared_ptr<NodeSymbolic> symNode =
- static_pointer_cast<NodeSymbolic>(node);
-
- // if the symbolic link is already resolved, we return true,
- // otherwise returning false will force it to be resolved
- return symNode->isSet();
- }
-
- if (found) {
- return false;
- }
- symbolMap.insert(it, make_pair(nm, node));
- }
-
- node->lock();
- size_t leaves = node->leaves();
- for (size_t i = 0; i < leaves; ++i) {
- const NodePtr &leaf(node->leafAt(i));
-
- if (! validate(leaf, symbolMap)) {
-
- // if validate returns false it means a node with this name already
- // existed in the map, instead of keeping this node twice in the
- // map (which could potentially create circular shared pointer
- // links that could not be easily freed), replace this node with a
- // symbolic link to the original one.
-
- node->setLeafToSymbolic(i, symbolMap.find(leaf->name())->second);
- }
- }
-
- return true;
-}
-
-static void validate(const NodePtr& p)
-{
- SymbolMap m;
- validate(p, m);
-}
-
-ValidSchema::ValidSchema(const NodePtr &root) : root_(root)
-{
- validate(root_);
-}
-
-ValidSchema::ValidSchema(const Schema &schema) : root_(schema.root())
-{
- validate(root_);
-}
-
-ValidSchema::ValidSchema() : root_(NullSchema().root())
-{
- validate(root_);
-}
-
-void
-ValidSchema::setSchema(const Schema &schema)
-{
- root_ = schema.root();
- validate(root_);
-}
-
-void
-ValidSchema::toJson(std::ostream &os) const
-{
- root_->printJson(os, 0);
- os << '\n';
-}
-
-string
-ValidSchema::toJson(bool prettyPrint) const
-{
- ostringstream oss;
- toJson(oss);
- if (!prettyPrint) {
- return compactSchema(oss.str());
- }
- return oss.str();
-}
-
-void
-ValidSchema::toFlatList(std::ostream &os) const
-{
- root_->printBasicInfo(os);
-}
-
-/*
- * compactSchema compacts and returns a formatted string representation
- * of a ValidSchema object by removing the whitespaces outside of the quoted
- * field names and values. It can handle the cases where the quoted value is
- * in UTF-8 format. Note that this method is not responsible for validating
- * the schema.
- */
-string ValidSchema::compactSchema(const string& schema) {
- bool insideQuote = false;
- size_t newPos = 0;
- string data(schema.data());
-
- for (size_t currentPos = 0; currentPos < schema.size(); currentPos++) {
- if (!insideQuote && std::isspace(data[currentPos])) {
- // Skip the white spaces outside quotes.
- continue;
- }
-
- if (data[currentPos] == '\"') {
- // It is valid for a quote to be part of the value for some fields,
- // e.g., the "doc" field. In that case, the quote is expected to be
- // escaped inside the schema. Since the escape character '\\' could
- // be escaped itself, we need to check whether there are an even
- // number of consecutive slashes prior to the quote.
- int leadingSlashes = 0;
- for (int i = newPos - 1; i >= 0; i--) {
- if (data[i] == '\\') {
- leadingSlashes++;
- } else {
- break;
- }
- }
- if (leadingSlashes % 2 == 0) {
- // Found a real quote which identifies either the start or the
- // end of a field name or value.
- insideQuote = !insideQuote;
- }
- }
- data[newPos++] = data[currentPos];
- }
-
- if (insideQuote) {
- throw Exception("Schema is not well formed with mismatched quotes");
- }
-
- if (newPos < schema.size()) {
- data.resize(newPos);
- }
- return data;
-}
-
-} // namespace avro
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <boost/format.hpp>
+#include <cctype>
+#include <sstream>
+
+#include "ValidSchema.hh"
+#include "Schema.hh"
+#include "Node.hh"
+
+using std::string;
+using std::ostringstream;
+using std::make_pair;
+using boost::format;
+using std::shared_ptr;
+using std::static_pointer_cast;
+
+namespace avro {
+typedef std::map<Name, NodePtr> SymbolMap;
+
+static bool validate(const NodePtr &node, SymbolMap &symbolMap)
+{
+ if (! node->isValid()) {
+ throw Exception(format("Schema is invalid, due to bad node of type %1%")
+ % node->type());
+ }
+
+ if (node->hasName()) {
+ const Name& nm = node->name();
+ SymbolMap::iterator it = symbolMap.find(nm);
+ bool found = it != symbolMap.end() && nm == it->first;
+
+ if (node->type() == AVRO_SYMBOLIC) {
+ if (! found) {
+ throw Exception(format("Symbolic name \"%1%\" is unknown") %
+ node->name());
+ }
+
+ shared_ptr<NodeSymbolic> symNode =
+ static_pointer_cast<NodeSymbolic>(node);
+
+ // if the symbolic link is already resolved, we return true,
+ // otherwise returning false will force it to be resolved
+ return symNode->isSet();
+ }
+
+ if (found) {
+ return false;
+ }
+ symbolMap.insert(it, make_pair(nm, node));
+ }
+
+ node->lock();
+ size_t leaves = node->leaves();
+ for (size_t i = 0; i < leaves; ++i) {
+ const NodePtr &leaf(node->leafAt(i));
+
+ if (! validate(leaf, symbolMap)) {
+
+            // if validate returns false, it means a node with this name already
+            // existed in the map. Instead of keeping this node twice in the
+            // map (which could create circular shared-pointer links that could
+            // not easily be freed), replace this node with a symbolic link to
+            // the original one.
+
+ node->setLeafToSymbolic(i, symbolMap.find(leaf->name())->second);
+ }
+ }
+
+ return true;
+}
+
+static void validate(const NodePtr& p)
+{
+ SymbolMap m;
+ validate(p, m);
+}
+
+ValidSchema::ValidSchema(const NodePtr &root) : root_(root)
+{
+ validate(root_);
+}
+
+ValidSchema::ValidSchema(const Schema &schema) : root_(schema.root())
+{
+ validate(root_);
+}
+
+ValidSchema::ValidSchema() : root_(NullSchema().root())
+{
+ validate(root_);
+}
+
+void
+ValidSchema::setSchema(const Schema &schema)
+{
+ root_ = schema.root();
+ validate(root_);
+}
+
+void
+ValidSchema::toJson(std::ostream &os) const
+{
+ root_->printJson(os, 0);
+ os << '\n';
+}
+
+string
+ValidSchema::toJson(bool prettyPrint) const
+{
+ ostringstream oss;
+ toJson(oss);
+ if (!prettyPrint) {
+ return compactSchema(oss.str());
+ }
+ return oss.str();
+}
+
+void
+ValidSchema::toFlatList(std::ostream &os) const
+{
+ root_->printBasicInfo(os);
+}
+
+/*
+ * compactSchema compacts and returns a formatted string representation
+ * of a ValidSchema object by removing the whitespaces outside of the quoted
+ * field names and values. It can handle the cases where the quoted value is
+ * in UTF-8 format. Note that this method is not responsible for validating
+ * the schema.
+ */
+string ValidSchema::compactSchema(const string& schema) {
+ bool insideQuote = false;
+ size_t newPos = 0;
+ string data(schema.data());
+
+ for (size_t currentPos = 0; currentPos < schema.size(); currentPos++) {
+ if (!insideQuote && std::isspace(data[currentPos])) {
+ // Skip the white spaces outside quotes.
+ continue;
+ }
+
+ if (data[currentPos] == '\"') {
+ // It is valid for a quote to be part of the value for some fields,
+ // e.g., the "doc" field. In that case, the quote is expected to be
+ // escaped inside the schema. Since the escape character '\\' could
+ // be escaped itself, we need to check whether there are an even
+ // number of consecutive slashes prior to the quote.
+ int leadingSlashes = 0;
+ for (int i = newPos - 1; i >= 0; i--) {
+ if (data[i] == '\\') {
+ leadingSlashes++;
+ } else {
+ break;
+ }
+ }
+ if (leadingSlashes % 2 == 0) {
+ // Found a real quote which identifies either the start or the
+ // end of a field name or value.
+ insideQuote = !insideQuote;
+ }
+ }
+ data[newPos++] = data[currentPos];
+ }
+
+ if (insideQuote) {
+ throw Exception("Schema is not well formed with mismatched quotes");
+ }
+
+ if (newPos < schema.size()) {
+ data.resize(newPos);
+ }
+ return data;
+}
+
+} // namespace avro
+
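A brief sketch of the two toJson() forms above; the single-line output is produced by
routing the pretty-printed JSON through compactSchema():

    #include "ValidSchema.hh"
    #include <iostream>

    void dumpSchema(const avro::ValidSchema &schema)
    {
        std::cout << schema.toJson(true);   // pretty-printed, as printJson emits

        // compactSchema strips whitespace outside quoted names and values,
        // e.g.  { "type" : "int" }  becomes  {"type":"int"}
        std::cout << schema.toJson(false) << '\n';
    }
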
diff --git a/contrib/libs/apache/avro/impl/Validator.cc b/contrib/libs/apache/avro/impl/Validator.cc
index 2e74b06b66b..139081693d4 100644
--- a/contrib/libs/apache/avro/impl/Validator.cc
+++ b/contrib/libs/apache/avro/impl/Validator.cc
@@ -1,301 +1,301 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Validator.hh"
-#include "ValidSchema.hh"
-#include "NodeImpl.hh"
-
-namespace avro {
-
-Validator::Validator(const ValidSchema &schema) :
- schema_(schema),
- nextType_(AVRO_NULL),
- expectedTypesFlag_(0),
- compoundStarted_(false),
- waitingForCount_(false),
- count_(0)
-{
- setupOperation(schema_.root());
-}
-
-void
-Validator::setWaitingForCount()
-{
- waitingForCount_ = true;
- count_ = 0;
- expectedTypesFlag_ = typeToFlag(AVRO_INT) | typeToFlag(AVRO_LONG);
- nextType_ = AVRO_LONG;
-}
-
-void
-Validator::enumAdvance()
-{
- if(compoundStarted_) {
- setWaitingForCount();
- compoundStarted_ = false;
- }
- else {
- waitingForCount_ = false;
- compoundStack_.pop_back();
- }
-}
-
-bool
-Validator::countingSetup()
-{
- bool proceed = true;
- if(compoundStarted_) {
- setWaitingForCount();
- compoundStarted_ = false;
- proceed = false;
- }
- else if(waitingForCount_) {
- waitingForCount_ = false;
- if(count_ == 0) {
- compoundStack_.pop_back();
- proceed = false;
- }
- else {
- counters_.push_back(static_cast<size_t>(count_));
- }
- }
-
- return proceed;
-}
-
-void
-Validator::countingAdvance()
-{
- if(countingSetup()) {
-
- size_t index = (compoundStack_.back().pos)++;
- const NodePtr &node = compoundStack_.back().node;
-
- if(index < node->leaves() ) {
- setupOperation(node->leafAt(index));
- }
- else {
- compoundStack_.back().pos = 0;
- int count = --counters_.back();
- if(count == 0) {
- counters_.pop_back();
- compoundStarted_ = true;
- nextType_ = node->type();
- expectedTypesFlag_ = typeToFlag(nextType_);
- }
- else {
- size_t index = (compoundStack_.back().pos)++;
- setupOperation(node->leafAt(index));
- }
- }
- }
-}
-
-void
-Validator::unionAdvance()
-{
- if(compoundStarted_) {
- setWaitingForCount();
- compoundStarted_ = false;
- }
- else {
- waitingForCount_ = false;
- NodePtr node = compoundStack_.back().node;
-
- if(count_ < static_cast<int64_t>(node->leaves())) {
- compoundStack_.pop_back();
- setupOperation(node->leafAt(static_cast<int>(count_)));
- }
- else {
- throw Exception(
- boost::format("Union selection out of range, got %1%," \
- " expecting 0-%2%")
- % count_ % (node->leaves() -1)
- );
- }
- }
-}
-
-void
-Validator::fixedAdvance()
-{
- compoundStarted_ = false;
- compoundStack_.pop_back();
-}
-
-int
-Validator::nextSizeExpected() const
-{
- return compoundStack_.back().node->fixedSize();
-}
-
-void
-Validator::doAdvance()
-{
- typedef void (Validator::*AdvanceFunc)();
-
- // only the compound types need advance functions here
- static const AdvanceFunc funcs[] = {
- 0, // string
- 0, // bytes
- 0, // int
- 0, // long
- 0, // float
- 0, // double
- 0, // bool
- 0, // null
- &Validator::countingAdvance, // Record is treated like counting with count == 1
- &Validator::enumAdvance,
- &Validator::countingAdvance,
- &Validator::countingAdvance,
- &Validator::unionAdvance,
- &Validator::fixedAdvance
- };
- static_assert((sizeof(funcs)/sizeof(AdvanceFunc)) == (AVRO_NUM_TYPES),
- "Invalid number of advance functions");
-
- expectedTypesFlag_ = 0;
- // loop until we encounter a next expected type, or we've exited all compound types
- while(!expectedTypesFlag_ && !compoundStack_.empty() ) {
-
- Type type = compoundStack_.back().node->type();
-
- AdvanceFunc func = funcs[type];
-
- // only compound functions are put on the status stack so it is ok to
- // assume that func is not null
- assert(func);
-
- ((this)->*(func))();
- }
-
- if(compoundStack_.empty()) {
- nextType_ = AVRO_NULL;
- }
-}
-
-void Validator::advance()
-{
- if(!waitingForCount_) {
- doAdvance();
- }
-}
-
-void
-Validator::setCount(int64_t count)
-{
- if(!waitingForCount_) {
- throw Exception("Not expecting count");
- }
- else if(count_ < 0) {
- throw Exception("Count cannot be negative");
- }
- count_ = count;
-
- doAdvance();
-}
-
-void
-Validator::setupFlag(Type type)
-{
- // use flags instead of strictly types, so that we can be more lax about the type
- // (for example, a long should be able to accept an int type, but not vice versa)
- static const flag_t flags[] = {
- typeToFlag(AVRO_STRING) | typeToFlag(AVRO_BYTES),
- typeToFlag(AVRO_STRING) | typeToFlag(AVRO_BYTES),
- typeToFlag(AVRO_INT),
- typeToFlag(AVRO_INT) | typeToFlag(AVRO_LONG),
- typeToFlag(AVRO_FLOAT),
- typeToFlag(AVRO_DOUBLE),
- typeToFlag(AVRO_BOOL),
- typeToFlag(AVRO_NULL),
- typeToFlag(AVRO_RECORD),
- typeToFlag(AVRO_ENUM),
- typeToFlag(AVRO_ARRAY),
- typeToFlag(AVRO_MAP),
- typeToFlag(AVRO_UNION),
- typeToFlag(AVRO_FIXED)
- };
- static_assert((sizeof(flags)/sizeof(flag_t)) == (AVRO_NUM_TYPES),
- "Invalid number of avro type flags");
-
- expectedTypesFlag_ = flags[type];
-}
-
-void
-Validator::setupOperation(const NodePtr &node)
-{
- nextType_ = node->type();
-
- if(nextType_ == AVRO_SYMBOLIC) {
- NodePtr actualNode = resolveSymbol(node);
- assert(actualNode);
- setupOperation(actualNode);
- return;
- }
-
- assert(nextType_ < AVRO_SYMBOLIC);
-
- setupFlag(nextType_);
-
- if(!isPrimitive(nextType_)) {
- compoundStack_.push_back(CompoundType(node));
- compoundStarted_ = true;
- }
-}
-
-bool
-Validator::getCurrentRecordName(std::string &name) const
-{
- bool found = false;
- name.clear();
-
- int idx = -1;
- // if the top of the stack is a record I want this record name
- if(!compoundStack_.empty() && (isPrimitive(nextType_) || nextType_ == AVRO_RECORD)) {
- idx = compoundStack_.size() -1;
- }
- else {
- idx = compoundStack_.size() -2;
- }
-
- if(idx >= 0 && compoundStack_[idx].node->type() == AVRO_RECORD) {
- name = compoundStack_[idx].node->name().simpleName();
- found = true;
- }
- return found;
-}
-
-bool
-Validator::getNextFieldName(std::string &name) const
-{
- bool found = false;
- name.clear();
- int idx = isCompound(nextType_) ? compoundStack_.size()-2 : compoundStack_.size()-1;
- if(idx >= 0 && compoundStack_[idx].node->type() == AVRO_RECORD) {
- size_t pos = compoundStack_[idx].pos-1;
- const NodePtr &node = compoundStack_[idx].node;
- if(pos < node->leaves()) {
- name = node->nameAt(pos);
- found = true;
- }
- }
- return found;
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Validator.hh"
+#include "ValidSchema.hh"
+#include "NodeImpl.hh"
+
+namespace avro {
+
+Validator::Validator(const ValidSchema &schema) :
+ schema_(schema),
+ nextType_(AVRO_NULL),
+ expectedTypesFlag_(0),
+ compoundStarted_(false),
+ waitingForCount_(false),
+ count_(0)
+{
+ setupOperation(schema_.root());
+}
+
+void
+Validator::setWaitingForCount()
+{
+ waitingForCount_ = true;
+ count_ = 0;
+ expectedTypesFlag_ = typeToFlag(AVRO_INT) | typeToFlag(AVRO_LONG);
+ nextType_ = AVRO_LONG;
+}
+
+void
+Validator::enumAdvance()
+{
+ if(compoundStarted_) {
+ setWaitingForCount();
+ compoundStarted_ = false;
+ }
+ else {
+ waitingForCount_ = false;
+ compoundStack_.pop_back();
+ }
+}
+
+bool
+Validator::countingSetup()
+{
+ bool proceed = true;
+ if(compoundStarted_) {
+ setWaitingForCount();
+ compoundStarted_ = false;
+ proceed = false;
+ }
+ else if(waitingForCount_) {
+ waitingForCount_ = false;
+ if(count_ == 0) {
+ compoundStack_.pop_back();
+ proceed = false;
+ }
+ else {
+ counters_.push_back(static_cast<size_t>(count_));
+ }
+ }
+
+ return proceed;
+}
+
+void
+Validator::countingAdvance()
+{
+ if(countingSetup()) {
+
+ size_t index = (compoundStack_.back().pos)++;
+ const NodePtr &node = compoundStack_.back().node;
+
+ if(index < node->leaves() ) {
+ setupOperation(node->leafAt(index));
+ }
+ else {
+ compoundStack_.back().pos = 0;
+ int count = --counters_.back();
+ if(count == 0) {
+ counters_.pop_back();
+ compoundStarted_ = true;
+ nextType_ = node->type();
+ expectedTypesFlag_ = typeToFlag(nextType_);
+ }
+ else {
+ size_t nextIndex = (compoundStack_.back().pos)++;
+ setupOperation(node->leafAt(nextIndex));
+ }
+ }
+ }
+}
+
+void
+Validator::unionAdvance()
+{
+ if(compoundStarted_) {
+ setWaitingForCount();
+ compoundStarted_ = false;
+ }
+ else {
+ waitingForCount_ = false;
+ NodePtr node = compoundStack_.back().node;
+
+ if(count_ < static_cast<int64_t>(node->leaves())) {
+ compoundStack_.pop_back();
+ setupOperation(node->leafAt(static_cast<int>(count_)));
+ }
+ else {
+ throw Exception(
+ boost::format("Union selection out of range, got %1%," \
+ " expecting 0-%2%")
+ % count_ % (node->leaves() -1)
+ );
+ }
+ }
+}
+
+void
+Validator::fixedAdvance()
+{
+ compoundStarted_ = false;
+ compoundStack_.pop_back();
+}
+
+int
+Validator::nextSizeExpected() const
+{
+ return compoundStack_.back().node->fixedSize();
+}
+
+void
+Validator::doAdvance()
+{
+ typedef void (Validator::*AdvanceFunc)();
+
+ // only the compound types need advance functions here
+ static const AdvanceFunc funcs[] = {
+ 0, // string
+ 0, // bytes
+ 0, // int
+ 0, // long
+ 0, // float
+ 0, // double
+ 0, // bool
+ 0, // null
+ &Validator::countingAdvance, // Record is treated like counting with count == 1
+ &Validator::enumAdvance,
+ &Validator::countingAdvance,
+ &Validator::countingAdvance,
+ &Validator::unionAdvance,
+ &Validator::fixedAdvance
+ };
+ static_assert((sizeof(funcs)/sizeof(AdvanceFunc)) == (AVRO_NUM_TYPES),
+ "Invalid number of advance functions");
+
+ expectedTypesFlag_ = 0;
+ // loop until we encounter a next expected type, or we've exited all compound types
+ while(!expectedTypesFlag_ && !compoundStack_.empty() ) {
+
+ Type type = compoundStack_.back().node->type();
+
+ AdvanceFunc func = funcs[type];
+
+ // only compound functions are put on the status stack so it is ok to
+ // assume that func is not null
+ assert(func);
+
+ ((this)->*(func))();
+ }
+
+ if(compoundStack_.empty()) {
+ nextType_ = AVRO_NULL;
+ }
+}
+
+void Validator::advance()
+{
+ if(!waitingForCount_) {
+ doAdvance();
+ }
+}
+
+void
+Validator::setCount(int64_t count)
+{
+ if(!waitingForCount_) {
+ throw Exception("Not expecting count");
+ }
+ else if(count < 0) {
+ throw Exception("Count cannot be negative");
+ }
+ count_ = count;
+
+ doAdvance();
+}
+
+void
+Validator::setupFlag(Type type)
+{
+ // use flags instead of strict types, so that we can be more lax about the type
+ // (for example, a long should be able to accept an int type, but not vice versa)
+ static const flag_t flags[] = {
+ typeToFlag(AVRO_STRING) | typeToFlag(AVRO_BYTES),
+ typeToFlag(AVRO_STRING) | typeToFlag(AVRO_BYTES),
+ typeToFlag(AVRO_INT),
+ typeToFlag(AVRO_INT) | typeToFlag(AVRO_LONG),
+ typeToFlag(AVRO_FLOAT),
+ typeToFlag(AVRO_DOUBLE),
+ typeToFlag(AVRO_BOOL),
+ typeToFlag(AVRO_NULL),
+ typeToFlag(AVRO_RECORD),
+ typeToFlag(AVRO_ENUM),
+ typeToFlag(AVRO_ARRAY),
+ typeToFlag(AVRO_MAP),
+ typeToFlag(AVRO_UNION),
+ typeToFlag(AVRO_FIXED)
+ };
+ static_assert((sizeof(flags)/sizeof(flag_t)) == (AVRO_NUM_TYPES),
+ "Invalid number of avro type flags");
+
+ expectedTypesFlag_ = flags[type];
+}
+
+void
+Validator::setupOperation(const NodePtr &node)
+{
+ nextType_ = node->type();
+
+ if(nextType_ == AVRO_SYMBOLIC) {
+ NodePtr actualNode = resolveSymbol(node);
+ assert(actualNode);
+ setupOperation(actualNode);
+ return;
+ }
+
+ assert(nextType_ < AVRO_SYMBOLIC);
+
+ setupFlag(nextType_);
+
+ if(!isPrimitive(nextType_)) {
+ compoundStack_.push_back(CompoundType(node));
+ compoundStarted_ = true;
+ }
+}
+
+bool
+Validator::getCurrentRecordName(std::string &name) const
+{
+ bool found = false;
+ name.clear();
+
+ int idx = -1;
+ // if the top of the stack is a record, we want that record's name
+ if(!compoundStack_.empty() && (isPrimitive(nextType_) || nextType_ == AVRO_RECORD)) {
+ idx = compoundStack_.size() -1;
+ }
+ else {
+ idx = compoundStack_.size() -2;
+ }
+
+ if(idx >= 0 && compoundStack_[idx].node->type() == AVRO_RECORD) {
+ name = compoundStack_[idx].node->name().simpleName();
+ found = true;
+ }
+ return found;
+}
+
+bool
+Validator::getNextFieldName(std::string &name) const
+{
+ bool found = false;
+ name.clear();
+ int idx = isCompound(nextType_) ? compoundStack_.size()-2 : compoundStack_.size()-1;
+ if(idx >= 0 && compoundStack_[idx].node->type() == AVRO_RECORD) {
+ size_t pos = compoundStack_[idx].pos-1;
+ const NodePtr &node = compoundStack_[idx].node;
+ if(pos < node->leaves()) {
+ name = node->nameAt(pos);
+ found = true;
+ }
+ }
+ return found;
+}
+
+} // namespace avro
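
The flag table in setupFlag above widens each expected type into a set of acceptable wire types: an expected long also admits an int, but not the reverse. A standalone illustration of that flag-widening idea follows; the enum, typeToFlag, and flag values here are simplified stand-ins, not the library's own definitions.

    #include <cstdint>
    #include <iostream>

    enum Type { T_INT = 0, T_LONG = 1 };
    typedef uint32_t flag_t;
    inline flag_t typeToFlag(Type t) { return flag_t(1) << t; }

    int main() {
        // mirrors the table above: an expected long is encoded as "int or long"
        const flag_t expectLong = typeToFlag(T_INT) | typeToFlag(T_LONG);
        const flag_t expectInt = typeToFlag(T_INT);
        std::cout << ((expectLong & typeToFlag(T_INT)) != 0)   // 1: int widens to long
                  << ((expectInt & typeToFlag(T_LONG)) != 0)   // 0: long does not narrow
                  << std::endl;
        return 0;
    }
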
diff --git a/contrib/libs/apache/avro/impl/Zigzag.cc b/contrib/libs/apache/avro/impl/Zigzag.cc
index 06db5b4e7ba..d4844440a72 100644
--- a/contrib/libs/apache/avro/impl/Zigzag.cc
+++ b/contrib/libs/apache/avro/impl/Zigzag.cc
@@ -1,86 +1,86 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "Zigzag.hh"
-
-namespace avro {
-
-uint64_t
-encodeZigzag64(int64_t input)
-{
- // cppcheck-suppress shiftTooManyBitsSigned
- return ((input << 1) ^ (input >> 63));
-}
-
-int64_t
-decodeZigzag64(uint64_t input)
-{
- return static_cast<int64_t>(((input >> 1) ^ -(static_cast<int64_t>(input) & 1)));
-}
-
-uint32_t
-encodeZigzag32(int32_t input)
-{
- // cppcheck-suppress shiftTooManyBitsSigned
- return ((input << 1) ^ (input >> 31));
-}
-
-int32_t
-decodeZigzag32(uint32_t input)
-{
- return static_cast<int32_t>(((input >> 1) ^ -(static_cast<int64_t>(input) & 1)));
-}
-
-size_t
-encodeInt64(int64_t input, std::array<uint8_t, 10> &output)
-{
- // get the zigzag encoding
- uint64_t val = encodeZigzag64(input);
-
- // put values in an array of bytes with variable length encoding
- const int mask = 0x7F;
- output[0] = val & mask;
- size_t bytesOut = 1;
- while( val >>=7 ) {
- output[bytesOut-1] |= 0x80;
- output[bytesOut++] = (val & mask);
- }
-
- return bytesOut;
-}
-
-size_t
-encodeInt32(int32_t input, std::array<uint8_t, 5> &output)
-{
- // get the zigzag encoding
- uint32_t val = encodeZigzag32(input);
-
- // put values in an array of bytes with variable length encoding
- const int mask = 0x7F;
- output[0] = val & mask;
- size_t bytesOut = 1;
- while( val >>=7 ) {
- output[bytesOut-1] |= 0x80;
- output[bytesOut++] = (val & mask);
- }
-
- return bytesOut;
-}
-
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "Zigzag.hh"
+
+namespace avro {
+
+uint64_t
+encodeZigzag64(int64_t input)
+{
+ // cppcheck-suppress shiftTooManyBitsSigned
+ return ((input << 1) ^ (input >> 63));
+}
+
+int64_t
+decodeZigzag64(uint64_t input)
+{
+ return static_cast<int64_t>(((input >> 1) ^ -(static_cast<int64_t>(input) & 1)));
+}
+
+uint32_t
+encodeZigzag32(int32_t input)
+{
+ // cppcheck-suppress shiftTooManyBitsSigned
+ return ((input << 1) ^ (input >> 31));
+}
+
+int32_t
+decodeZigzag32(uint32_t input)
+{
+ return static_cast<int32_t>(((input >> 1) ^ -(static_cast<int64_t>(input) & 1)));
+}
+
+size_t
+encodeInt64(int64_t input, std::array<uint8_t, 10> &output)
+{
+ // get the zigzag encoding
+ uint64_t val = encodeZigzag64(input);
+
+ // put values in an array of bytes with variable length encoding
+ const int mask = 0x7F;
+ output[0] = val & mask;
+ size_t bytesOut = 1;
+ while( val >>=7 ) {
+ output[bytesOut-1] |= 0x80;
+ output[bytesOut++] = (val & mask);
+ }
+
+ return bytesOut;
+}
+
+size_t
+encodeInt32(int32_t input, std::array<uint8_t, 5> &output)
+{
+ // get the zigzag encoding
+ uint32_t val = encodeZigzag32(input);
+
+ // put values in an array of bytes with variable length encoding
+ const int mask = 0x7F;
+ output[0] = val & mask;
+ size_t bytesOut = 1;
+ while( val >>=7 ) {
+ output[bytesOut-1] |= 0x80;
+ output[bytesOut++] = (val & mask);
+ }
+
+ return bytesOut;
+}
+
+} // namespace avro
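
The zigzag mapping above sends 0, -1, 1, -2, 2, ... to 0, 1, 2, 3, 4, ..., so small magnitudes stay small before the varint loop emits 7 bits per byte with the high bit set on every byte except the last. A minimal standalone check, assuming Zigzag.hh is on the include path:

    #include <array>
    #include <cassert>
    #include <cstdint>
    #include "Zigzag.hh"

    int main() {
        assert(avro::encodeZigzag64(0) == 0);
        assert(avro::encodeZigzag64(-1) == 1);
        assert(avro::encodeZigzag64(1) == 2);
        assert(avro::decodeZigzag64(avro::encodeZigzag64(-1234567)) == -1234567);

        // zigzag(64) == 128, which needs two varint bytes: 0x80 (continuation), 0x01
        std::array<uint8_t, 10> buf;
        size_t n = avro::encodeInt64(64, buf);
        assert(n == 2 && buf[0] == 0x80 && buf[1] == 0x01);
        return 0;
    }
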
diff --git a/contrib/libs/apache/avro/impl/json/JsonDom.cc b/contrib/libs/apache/avro/impl/json/JsonDom.cc
index ac4d8c9bfca..8a41912be26 100644
--- a/contrib/libs/apache/avro/impl/json/JsonDom.cc
+++ b/contrib/libs/apache/avro/impl/json/JsonDom.cc
@@ -1,203 +1,203 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "JsonDom.hh"
-
-#include <stdexcept>
-
-#include <string.h>
-
-#include "Stream.hh"
-#include "JsonIO.hh"
-
-using std::string;
-using boost::format;
-
-namespace avro {
-namespace json {
-const char* typeToString(EntityType t)
-{
- switch (t) {
- case etNull: return "null";
- case etBool: return "bool";
- case etLong: return "long";
- case etDouble: return "double";
- case etString: return "string";
- case etArray: return "array";
- case etObject: return "object";
- default: return "unknown";
- }
-}
-
-Entity readEntity(JsonParser& p)
-{
- switch (p.peek()) {
- case JsonParser::tkNull:
- p.advance();
- return Entity(p.line());
- case JsonParser::tkBool:
- p.advance();
- return Entity(p.boolValue(), p.line());
- case JsonParser::tkLong:
- p.advance();
- return Entity(p.longValue(), p.line());
- case JsonParser::tkDouble:
- p.advance();
- return Entity(p.doubleValue(), p.line());
- case JsonParser::tkString:
- p.advance();
- return Entity(std::make_shared<String>(p.rawString()), p.line());
- case JsonParser::tkArrayStart:
- {
- size_t l = p.line();
- p.advance();
- std::shared_ptr<Array> v = std::make_shared<Array>();
- while (p.peek() != JsonParser::tkArrayEnd) {
- v->push_back(readEntity(p));
- }
- p.advance();
- return Entity(v, l);
- }
- case JsonParser::tkObjectStart:
- {
- size_t l = p.line();
- p.advance();
- std::shared_ptr<Object> v = std::make_shared<Object>();
- while (p.peek() != JsonParser::tkObjectEnd) {
- p.advance();
- std::string k = p.stringValue();
- Entity n = readEntity(p);
- v->insert(std::make_pair(k, n));
- }
- p.advance();
- return Entity(v, l);
- }
- default:
- throw std::domain_error(JsonParser::toString(p.peek()));
- }
-
-}
-
-Entity loadEntity(const char* text)
-{
- return loadEntity(reinterpret_cast<const uint8_t*>(text), ::strlen(text));
-}
-
-Entity loadEntity(InputStream& in)
-{
- JsonParser p;
- p.init(in);
- return readEntity(p);
-}
-
-Entity loadEntity(const uint8_t* text, size_t len)
-{
- std::unique_ptr<InputStream> in = memoryInputStream(text, len);
- return loadEntity(*in);
-}
-
-void writeEntity(JsonGenerator<JsonNullFormatter>& g, const Entity& n)
-{
- switch (n.type()) {
- case etNull:
- g.encodeNull();
- break;
- case etBool:
- g.encodeBool(n.boolValue());
- break;
- case etLong:
- g.encodeNumber(n.longValue());
- break;
- case etDouble:
- g.encodeNumber(n.doubleValue());
- break;
- case etString:
- g.encodeString(n.stringValue());
- break;
- case etArray:
- {
- g.arrayStart();
- const Array& v = n.arrayValue();
- for (Array::const_iterator it = v.begin();
- it != v.end(); ++it) {
- writeEntity(g, *it);
- }
- g.arrayEnd();
- }
- break;
- case etObject:
- {
- g.objectStart();
- const Object& v = n.objectValue();
- for (Object::const_iterator it = v.begin(); it != v.end(); ++it) {
- g.encodeString(it->first);
- writeEntity(g, it->second);
- }
- g.objectEnd();
- }
- break;
- }
-}
-
-void Entity::ensureType(EntityType type) const
-{
- if (type_ != type) {
- format msg = format("Invalid type. Expected \"%1%\" actual %2%") %
- typeToString(type) % typeToString(type_);
- throw Exception(msg);
- }
-}
-
-String Entity::stringValue() const {
- ensureType(etString);
- return JsonParser::toStringValue(**boost::any_cast<std::shared_ptr<String> >(&value_));
-}
-
-String Entity::bytesValue() const {
- ensureType(etString);
- return JsonParser::toBytesValue(**boost::any_cast<std::shared_ptr<String> >(&value_));
-}
-
-std::string Entity::toString() const
-{
- std::unique_ptr<OutputStream> out = memoryOutputStream();
- JsonGenerator<JsonNullFormatter> g;
- g.init(*out);
- writeEntity(g, *this);
- g.flush();
- std::unique_ptr<InputStream> in = memoryInputStream(*out);
- const uint8_t *p = 0;
- size_t n = 0;
- size_t c = 0;
- while (in->next(&p, &n)) {
- c += n;
- }
- std::string result;
- result.resize(c);
- c = 0;
- std::unique_ptr<InputStream> in2 = memoryInputStream(*out);
- while (in2->next(&p, &n)) {
- ::memcpy(&result[c], p, n);
- c += n;
- }
- return result;
-}
-
-}
-}
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JsonDom.hh"
+
+#include <stdexcept>
+
+#include <string.h>
+
+#include "Stream.hh"
+#include "JsonIO.hh"
+
+using std::string;
+using boost::format;
+
+namespace avro {
+namespace json {
+const char* typeToString(EntityType t)
+{
+ switch (t) {
+ case etNull: return "null";
+ case etBool: return "bool";
+ case etLong: return "long";
+ case etDouble: return "double";
+ case etString: return "string";
+ case etArray: return "array";
+ case etObject: return "object";
+ default: return "unknown";
+ }
+}
+
+Entity readEntity(JsonParser& p)
+{
+ switch (p.peek()) {
+ case JsonParser::tkNull:
+ p.advance();
+ return Entity(p.line());
+ case JsonParser::tkBool:
+ p.advance();
+ return Entity(p.boolValue(), p.line());
+ case JsonParser::tkLong:
+ p.advance();
+ return Entity(p.longValue(), p.line());
+ case JsonParser::tkDouble:
+ p.advance();
+ return Entity(p.doubleValue(), p.line());
+ case JsonParser::tkString:
+ p.advance();
+ return Entity(std::make_shared<String>(p.rawString()), p.line());
+ case JsonParser::tkArrayStart:
+ {
+ size_t l = p.line();
+ p.advance();
+ std::shared_ptr<Array> v = std::make_shared<Array>();
+ while (p.peek() != JsonParser::tkArrayEnd) {
+ v->push_back(readEntity(p));
+ }
+ p.advance();
+ return Entity(v, l);
+ }
+ case JsonParser::tkObjectStart:
+ {
+ size_t l = p.line();
+ p.advance();
+ std::shared_ptr<Object> v = std::make_shared<Object>();
+ while (p.peek() != JsonParser::tkObjectEnd) {
+ p.advance();
+ std::string k = p.stringValue();
+ Entity n = readEntity(p);
+ v->insert(std::make_pair(k, n));
+ }
+ p.advance();
+ return Entity(v, l);
+ }
+ default:
+ throw std::domain_error(JsonParser::toString(p.peek()));
+ }
+
+}
+
+Entity loadEntity(const char* text)
+{
+ return loadEntity(reinterpret_cast<const uint8_t*>(text), ::strlen(text));
+}
+
+Entity loadEntity(InputStream& in)
+{
+ JsonParser p;
+ p.init(in);
+ return readEntity(p);
+}
+
+Entity loadEntity(const uint8_t* text, size_t len)
+{
+ std::unique_ptr<InputStream> in = memoryInputStream(text, len);
+ return loadEntity(*in);
+}
+
+void writeEntity(JsonGenerator<JsonNullFormatter>& g, const Entity& n)
+{
+ switch (n.type()) {
+ case etNull:
+ g.encodeNull();
+ break;
+ case etBool:
+ g.encodeBool(n.boolValue());
+ break;
+ case etLong:
+ g.encodeNumber(n.longValue());
+ break;
+ case etDouble:
+ g.encodeNumber(n.doubleValue());
+ break;
+ case etString:
+ g.encodeString(n.stringValue());
+ break;
+ case etArray:
+ {
+ g.arrayStart();
+ const Array& v = n.arrayValue();
+ for (Array::const_iterator it = v.begin();
+ it != v.end(); ++it) {
+ writeEntity(g, *it);
+ }
+ g.arrayEnd();
+ }
+ break;
+ case etObject:
+ {
+ g.objectStart();
+ const Object& v = n.objectValue();
+ for (Object::const_iterator it = v.begin(); it != v.end(); ++it) {
+ g.encodeString(it->first);
+ writeEntity(g, it->second);
+ }
+ g.objectEnd();
+ }
+ break;
+ }
+}
+
+void Entity::ensureType(EntityType type) const
+{
+ if (type_ != type) {
+ format msg = format("Invalid type. Expected \"%1%\" actual %2%") %
+ typeToString(type) % typeToString(type_);
+ throw Exception(msg);
+ }
+}
+
+String Entity::stringValue() const {
+ ensureType(etString);
+ return JsonParser::toStringValue(**boost::any_cast<std::shared_ptr<String> >(&value_));
+}
+
+String Entity::bytesValue() const {
+ ensureType(etString);
+ return JsonParser::toBytesValue(**boost::any_cast<std::shared_ptr<String> >(&value_));
+}
+
+std::string Entity::toString() const
+{
+ std::unique_ptr<OutputStream> out = memoryOutputStream();
+ JsonGenerator<JsonNullFormatter> g;
+ g.init(*out);
+ writeEntity(g, *this);
+ g.flush();
+ std::unique_ptr<InputStream> in = memoryInputStream(*out);
+ const uint8_t *p = 0;
+ size_t n = 0;
+ size_t c = 0;
+ while (in->next(&p, &n)) {
+ c += n;
+ }
+ std::string result;
+ result.resize(c);
+ c = 0;
+ std::unique_ptr<InputStream> in2 = memoryInputStream(*out);
+ while (in2->next(&p, &n)) {
+ ::memcpy(&result[c], p, n);
+ c += n;
+ }
+ return result;
+}
+
+}
+}
+
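
A minimal sketch of driving the DOM built above: loadEntity parses an in-memory JSON document into an Entity tree, and toString re-serializes it. The include path and sample document are illustrative.

    #include "json/JsonDom.hh"
    #include <iostream>

    int main() {
        avro::json::Entity e = avro::json::loadEntity(
            "{\"name\": \"avro\", \"count\": 3}");
        const avro::json::Object& obj = e.objectValue();  // std::map<std::string, Entity>
        std::cout << obj.find("name")->second.stringValue() << " "
                  << obj.find("count")->second.longValue() << "\n";
        std::cout << e.toString() << "\n";  // round-trips through JsonGenerator
        return 0;
    }
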
diff --git a/contrib/libs/apache/avro/impl/json/JsonDom.hh b/contrib/libs/apache/avro/impl/json/JsonDom.hh
index e1f549dfeab..7eb412aa2f3 100644
--- a/contrib/libs/apache/avro/impl/json/JsonDom.hh
+++ b/contrib/libs/apache/avro/impl/json/JsonDom.hh
@@ -1,162 +1,162 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_json_JsonDom_hh__
-#define avro_json_JsonDom_hh__
-
-#include <iostream>
-#include <stdint.h>
-#include <map>
-#include <string>
-#include <vector>
-#include <memory>
-
-#include "boost/any.hpp"
-#include "Config.hh"
-
-namespace avro {
-
-class AVRO_DECL InputStream;
-
-namespace json {
-class Entity;
-
-typedef bool Bool;
-typedef int64_t Long;
-typedef double Double;
-typedef std::string String;
-typedef std::vector<Entity> Array;
-typedef std::map<std::string, Entity> Object;
-
-class AVRO_DECL JsonParser;
-class JsonNullFormatter;
-
-template<typename F = JsonNullFormatter>
-class AVRO_DECL JsonGenerator;
-
-enum EntityType {
- etNull,
- etBool,
- etLong,
- etDouble,
- etString,
- etArray,
- etObject
-};
-
-const char* typeToString(EntityType t);
-
-class AVRO_DECL Entity {
- EntityType type_;
- boost::any value_;
- size_t line_; // not const: a const member would make Entity noncopyable
-
- void ensureType(EntityType) const;
-public:
- Entity(size_t line = 0) : type_(etNull), line_(line) { }
- Entity(Bool v, size_t line = 0) : type_(etBool), value_(v), line_(line) { }
- Entity(Long v, size_t line = 0) : type_(etLong), value_(v), line_(line) { }
- Entity(Double v, size_t line = 0) : type_(etDouble), value_(v), line_(line) { }
- Entity(const std::shared_ptr<String>& v, size_t line = 0) : type_(etString), value_(v), line_(line) { }
- Entity(const std::shared_ptr<Array>& v, size_t line = 0) : type_(etArray), value_(v), line_(line) { }
- Entity(const std::shared_ptr<Object>& v, size_t line = 0) : type_(etObject), value_(v), line_(line) { }
-
- EntityType type() const { return type_; }
-
- size_t line() const { return line_; }
-
- Bool boolValue() const {
- ensureType(etBool);
- return boost::any_cast<Bool>(value_);
- }
-
- Long longValue() const {
- ensureType(etLong);
- return boost::any_cast<Long>(value_);
- }
-
- Double doubleValue() const {
- ensureType(etDouble);
- return boost::any_cast<Double>(value_);
- }
-
- String stringValue() const;
-
- String bytesValue() const;
-
- const Array& arrayValue() const {
- ensureType(etArray);
- return **boost::any_cast<std::shared_ptr<Array> >(&value_);
- }
-
- const Object& objectValue() const {
- ensureType(etObject);
- return **boost::any_cast<std::shared_ptr<Object> >(&value_);
- }
-
- std::string toString() const;
-};
-
-template <typename T>
-struct type_traits {
-};
-
-template <> struct type_traits<bool> {
- static EntityType type() { return etBool; }
- static const char* name() { return "bool"; }
-};
-
-template <> struct type_traits<int64_t> {
- static EntityType type() { return etLong; }
- static const char* name() { return "long"; }
-};
-
-template <> struct type_traits<double> {
- static EntityType type() { return etDouble; }
- static const char* name() { return "double"; }
-};
-
-template <> struct type_traits<std::string> {
- static EntityType type() { return etString; }
- static const char* name() { return "string"; }
-};
-
-template <> struct type_traits<std::vector<Entity> > {
- static EntityType type() { return etArray; }
- static const char* name() { return "array"; }
-};
-
-template <> struct type_traits<std::map<std::string, Entity> > {
- static EntityType type() { return etObject; }
- static const char* name() { return "object"; }
-};
-
-AVRO_DECL Entity readEntity(JsonParser& p);
-
-AVRO_DECL Entity loadEntity(InputStream& in);
-AVRO_DECL Entity loadEntity(const char* text);
-AVRO_DECL Entity loadEntity(const uint8_t* text, size_t len);
-
-void writeEntity(JsonGenerator<JsonNullFormatter>& g, const Entity& n);
-
-}
-}
-
-#endif
-
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_json_JsonDom_hh__
+#define avro_json_JsonDom_hh__
+
+#include <iostream>
+#include <stdint.h>
+#include <map>
+#include <string>
+#include <vector>
+#include <memory>
+
+#include "boost/any.hpp"
+#include "Config.hh"
+
+namespace avro {
+
+class AVRO_DECL InputStream;
+
+namespace json {
+class Entity;
+
+typedef bool Bool;
+typedef int64_t Long;
+typedef double Double;
+typedef std::string String;
+typedef std::vector<Entity> Array;
+typedef std::map<std::string, Entity> Object;
+
+class AVRO_DECL JsonParser;
+class JsonNullFormatter;
+
+template<typename F = JsonNullFormatter>
+class AVRO_DECL JsonGenerator;
+
+enum EntityType {
+ etNull,
+ etBool,
+ etLong,
+ etDouble,
+ etString,
+ etArray,
+ etObject
+};
+
+const char* typeToString(EntityType t);
+
+class AVRO_DECL Entity {
+ EntityType type_;
+ boost::any value_;
+ size_t line_; // not const: a const member would make Entity noncopyable
+
+ void ensureType(EntityType) const;
+public:
+ Entity(size_t line = 0) : type_(etNull), line_(line) { }
+ Entity(Bool v, size_t line = 0) : type_(etBool), value_(v), line_(line) { }
+ Entity(Long v, size_t line = 0) : type_(etLong), value_(v), line_(line) { }
+ Entity(Double v, size_t line = 0) : type_(etDouble), value_(v), line_(line) { }
+ Entity(const std::shared_ptr<String>& v, size_t line = 0) : type_(etString), value_(v), line_(line) { }
+ Entity(const std::shared_ptr<Array>& v, size_t line = 0) : type_(etArray), value_(v), line_(line) { }
+ Entity(const std::shared_ptr<Object>& v, size_t line = 0) : type_(etObject), value_(v), line_(line) { }
+
+ EntityType type() const { return type_; }
+
+ size_t line() const { return line_; }
+
+ Bool boolValue() const {
+ ensureType(etBool);
+ return boost::any_cast<Bool>(value_);
+ }
+
+ Long longValue() const {
+ ensureType(etLong);
+ return boost::any_cast<Long>(value_);
+ }
+
+ Double doubleValue() const {
+ ensureType(etDouble);
+ return boost::any_cast<Double>(value_);
+ }
+
+ String stringValue() const;
+
+ String bytesValue() const;
+
+ const Array& arrayValue() const {
+ ensureType(etArray);
+ return **boost::any_cast<std::shared_ptr<Array> >(&value_);
+ }
+
+ const Object& objectValue() const {
+ ensureType(etObject);
+ return **boost::any_cast<std::shared_ptr<Object> >(&value_);
+ }
+
+ std::string toString() const;
+};
+
+template <typename T>
+struct type_traits {
+};
+
+template <> struct type_traits<bool> {
+ static EntityType type() { return etBool; }
+ static const char* name() { return "bool"; }
+};
+
+template <> struct type_traits<int64_t> {
+ static EntityType type() { return etLong; }
+ static const char* name() { return "long"; }
+};
+
+template <> struct type_traits<double> {
+ static EntityType type() { return etDouble; }
+ static const char* name() { return "double"; }
+};
+
+template <> struct type_traits<std::string> {
+ static EntityType type() { return etString; }
+ static const char* name() { return "string"; }
+};
+
+template <> struct type_traits<std::vector<Entity> > {
+ static EntityType type() { return etArray; }
+ static const char* name() { return "array"; }
+};
+
+template <> struct type_traits<std::map<std::string, Entity> > {
+ static EntityType type() { return etObject; }
+ static const char* name() { return "object"; }
+};
+
+AVRO_DECL Entity readEntity(JsonParser& p);
+
+AVRO_DECL Entity loadEntity(InputStream& in);
+AVRO_DECL Entity loadEntity(const char* text);
+AVRO_DECL Entity loadEntity(const uint8_t* text, size_t len);
+
+void writeEntity(JsonGenerator<JsonNullFormatter>& g, const Entity& n);
+
+}
+}
+
+#endif
+
+
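
A small illustration of the type_traits specializations declared above, which map supported C++ types to their EntityType tags and display names (include path illustrative):

    #include "json/JsonDom.hh"
    #include <cstdint>
    #include <iostream>
    #include <string>

    int main() {
        using avro::json::type_traits;
        using avro::json::typeToString;
        // each specialization pairs a C++ type with its JSON entity tag
        std::cout << type_traits<int64_t>::name() << " -> "
                  << typeToString(type_traits<int64_t>::type()) << "\n";
        std::cout << type_traits<std::string>::name() << " -> "
                  << typeToString(type_traits<std::string>::type()) << "\n";
        return 0;
    }
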
diff --git a/contrib/libs/apache/avro/impl/json/JsonIO.cc b/contrib/libs/apache/avro/impl/json/JsonIO.cc
index c11a722ad4d..d09ea2315f5 100644
--- a/contrib/libs/apache/avro/impl/json/JsonIO.cc
+++ b/contrib/libs/apache/avro/impl/json/JsonIO.cc
@@ -1,442 +1,442 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "JsonIO.hh"
-
-namespace avro {
-namespace json {
-
-using std::ostringstream;
-using std::string;
-
-const char* const
-JsonParser::tokenNames[] = {
- "Null",
- "Bool",
- "Integer",
- "Double",
- "String",
- "Array start",
- "Array end",
- "Object start",
- "Object end",
-};
-
-char JsonParser::next()
-{
- char ch = hasNext ? nextChar : ' ';
- while (isspace(ch)) {
- if (ch == '\n') {
- line_++;
- }
- ch = in_.read();
- }
- hasNext = false;
- return ch;
-}
-
-void JsonParser::expectToken(Token tk)
-{
- if (advance() != tk) {
- if (tk == tkDouble) {
- if(cur() == tkString
- && (sv == "Infinity" || sv == "-Infinity" || sv == "NaN")) {
- curToken = tkDouble;
- dv = sv == "Infinity" ?
- std::numeric_limits<double>::infinity() :
- sv == "-Infinity" ?
- -std::numeric_limits<double>::infinity() :
- std::numeric_limits<double>::quiet_NaN();
- return;
- } else if (cur() == tkLong) {
- dv = double(lv);
- return;
- }
- }
- ostringstream oss;
- oss << "Incorrect token in the stream. Expected: "
- << JsonParser::toString(tk) << ", found "
- << JsonParser::toString(cur());
- throw Exception(oss.str());
- }
-}
-
-JsonParser::Token JsonParser::doAdvance()
-{
- char ch = next();
- if (ch == ']') {
- if (curState == stArray0 || curState == stArrayN) {
- curState = stateStack.top();
- stateStack.pop();
- return tkArrayEnd;
- } else {
- throw unexpected(ch);
- }
- } else if (ch == '}') {
- if (curState == stObject0 || curState == stObjectN) {
- curState = stateStack.top();
- stateStack.pop();
- return tkObjectEnd;
- } else {
- throw unexpected(ch);
- }
- } else if (ch == ',') {
- if (curState != stObjectN && curState != stArrayN) {
- throw unexpected(ch);
- }
- if (curState == stObjectN) {
- curState = stObject0;
- }
- ch = next();
- } else if (ch == ':') {
- if (curState != stKey) {
- throw unexpected(ch);
- }
- curState = stObjectN;
- ch = next();
- }
-
- if (curState == stObject0) {
- if (ch != '"') {
- throw unexpected(ch);
- }
- curState = stKey;
- } else if (curState == stArray0) {
- curState = stArrayN;
- }
-
- switch (ch) {
- case '[':
- stateStack.push(curState);
- curState = stArray0;
- return tkArrayStart;
- case '{':
- stateStack.push(curState);
- curState = stObject0;
- return tkObjectStart;
- case '"':
- return tryString();
- case 't':
- bv = true;
- return tryLiteral("rue", 3, tkBool);
- case 'f':
- bv = false;
- return tryLiteral("alse", 4, tkBool);
- case 'n':
- return tryLiteral("ull", 3, tkNull);
- default:
- if (isdigit(ch) || ch == '-') {
- return tryNumber(ch);
- } else {
- throw unexpected(ch);
- }
- }
-}
-
-JsonParser::Token JsonParser::tryNumber(char ch)
-{
- sv.clear();
- sv.push_back(ch);
-
- hasNext = false;
- int state = (ch == '-') ? 0 : (ch == '0') ? 1 : 2;
- for (; ;) {
- switch (state) {
- case 0:
- if (in_.hasMore()) {
- ch = in_.read();
- if (isdigit(ch)) {
- state = (ch == '0') ? 1 : 2;
- sv.push_back(ch);
- continue;
- }
- hasNext = true;
- }
- break;
- case 1:
- if (in_.hasMore()) {
- ch = in_.read();
- if (ch == '.') {
- state = 3;
- sv.push_back(ch);
- continue;
- } else if (ch == 'e' || ch == 'E') {
- sv.push_back(ch);
- state = 5;
- continue;
- }
- hasNext = true;
- }
- break;
- case 2:
- if (in_.hasMore()) {
- ch = in_.read();
- if (isdigit(ch)) {
- sv.push_back(ch);
- continue;
- } else if (ch == '.') {
- state = 3;
- sv.push_back(ch);
- continue;
- } else if (ch == 'e' || ch == 'E') {
- sv.push_back(ch);
- state = 5;
- continue;
- }
- hasNext = true;
- }
- break;
- case 3:
- case 6:
- if (in_.hasMore()) {
- ch = in_.read();
- if (isdigit(ch)) {
- sv.push_back(ch);
- state++;
- continue;
- }
- hasNext = true;
- }
- break;
- case 4:
- if (in_.hasMore()) {
- ch = in_.read();
- if (isdigit(ch)) {
- sv.push_back(ch);
- continue;
- } else if (ch == 'e' || ch == 'E') {
- sv.push_back(ch);
- state = 5;
- continue;
- }
- hasNext = true;
- }
- break;
- case 5:
- if (in_.hasMore()) {
- ch = in_.read();
- if (ch == '+' || ch == '-') {
- sv.push_back(ch);
- state = 6;
- continue;
- } else if (isdigit(ch)) {
- sv.push_back(ch);
- state = 7;
- continue;
- }
- hasNext = true;
- }
- break;
- case 7:
- if (in_.hasMore()) {
- ch = in_.read();
- if (isdigit(ch)) {
- sv.push_back(ch);
- continue;
- }
- hasNext = true;
- }
- break;
- }
- if (state == 1 || state == 2 || state == 4 || state == 7) {
- if (hasNext) {
- nextChar = ch;
- }
- std::istringstream iss(sv);
- if (state == 1 || state == 2) {
- iss >> lv;
- return tkLong;
- } else {
- iss >> dv;
- return tkDouble;
- }
- } else {
- if (hasNext) {
- throw unexpected(ch);
- } else {
- throw Exception("Unexpected EOF");
- }
- }
- }
-}
-
-JsonParser::Token JsonParser::tryString()
-{
- sv.clear();
- for ( ; ;) {
- char ch = in_.read();
- if (ch == '"') {
- return tkString;
- } else if (ch == '\\') {
- ch = in_.read();
- switch (ch) {
- case '"':
- case '\\':
- case '/':
- case 'b':
- case 'f':
- case 'n':
- case 'r':
- case 't':
- sv.push_back('\\');
- sv.push_back(ch);
- break;
- case 'u':
- case 'U':
- {
- uint32_t n = 0;
- char e[4];
- in_.readBytes(reinterpret_cast<uint8_t*>(e), 4);
- sv.push_back('\\');
- sv.push_back(ch);
- for (int i = 0; i < 4; i++) {
- n *= 16;
- char c = e[i];
- if (isdigit(c) ||
- (c >= 'a' && c <= 'f') ||
- (c >= 'A' && c <= 'F')) {
- sv.push_back(c);
- } else {
- throw unexpected(c);
- }
- }
- }
- break;
- default:
- throw unexpected(ch);
- }
- } else {
- sv.push_back(ch);
- }
- }
-}
-
-
-string JsonParser::decodeString(const string& s, bool binary)
-{
- string result;
- for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
- char ch = *it;
- if (ch == '\\') {
- ch = *++it;
- switch (ch) {
- case '"':
- case '\\':
- case '/':
- result.push_back(ch);
- continue;
- case 'b':
- result.push_back('\b');
- continue;
- case 'f':
- result.push_back('\f');
- continue;
- case 'n':
- result.push_back('\n');
- continue;
- case 'r':
- result.push_back('\r');
- continue;
- case 't':
- result.push_back('\t');
- continue;
- case 'u':
- case 'U':
- {
- uint32_t n = 0;
- char e[4];
- for (int i = 0; i < 4; i++) {
- n *= 16;
- char c = *++it;
- e[i] = c;
- if (isdigit(c)) {
- n += c - '0';
- } else if (c >= 'a' && c <= 'f') {
- n += c - 'a' + 10;
- } else if (c >= 'A' && c <= 'F') {
- n += c - 'A' + 10;
- }
- }
- if (binary) {
- if (n > 0xff) {
- throw Exception(boost::format(
- "Invalid byte for binary: %1%%2%") % ch %
- string(e, 4));
- } else {
- result.push_back(n);
- continue;
- }
- }
- if (n < 0x80) {
- result.push_back(n);
- } else if (n < 0x800) {
- result.push_back((n >> 6) | 0xc0);
- result.push_back((n & 0x3f) | 0x80);
- } else if (n < 0x10000) {
- result.push_back((n >> 12) | 0xe0);
- result.push_back(((n >> 6)& 0x3f) | 0x80);
- result.push_back((n & 0x3f) | 0x80);
- } else if (n < 0x110000) {
- result.push_back((n >> 18) | 0xf0);
- result.push_back(((n >> 12)& 0x3f) | 0x80);
- result.push_back(((n >> 6)& 0x3f) | 0x80);
- result.push_back((n & 0x3f) | 0x80);
- } else {
- throw Exception(boost::format(
- "Invalid unicode value: %1%i%2%") % ch %
- string(e, 4));
- }
- }
- continue;
- }
- } else {
- result.push_back(ch);
- }
- }
- return result;
-}
-
-Exception JsonParser::unexpected(unsigned char c)
-{
- std::ostringstream oss;
- oss << "Unexpected character in json " << toHex(c / 16) << toHex(c % 16);
- return Exception(oss.str());
-}
-
-JsonParser::Token JsonParser::tryLiteral(const char exp[], size_t n, Token tk)
-{
- char c[100];
- in_.readBytes(reinterpret_cast<uint8_t*>(c), n);
- for (size_t i = 0; i < n; ++i) {
- if (c[i] != exp[i]) {
- throw unexpected(c[i]);
- }
- }
- if (in_.hasMore()) {
- nextChar = in_.read();
- if (isdigit(nextChar) || isalpha(nextChar)) {
- throw unexpected(nextChar);
- }
- hasNext = true;
- }
- return tk;
-}
-
-}
-}
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "JsonIO.hh"
+
+namespace avro {
+namespace json {
+
+using std::ostringstream;
+using std::string;
+
+const char* const
+JsonParser::tokenNames[] = {
+ "Null",
+ "Bool",
+ "Integer",
+ "Double",
+ "String",
+ "Array start",
+ "Array end",
+ "Object start",
+ "Object end",
+};
+
+char JsonParser::next()
+{
+ char ch = hasNext ? nextChar : ' ';
+ while (isspace(ch)) {
+ if (ch == '\n') {
+ line_++;
+ }
+ ch = in_.read();
+ }
+ hasNext = false;
+ return ch;
+}
+
+void JsonParser::expectToken(Token tk)
+{
+ if (advance() != tk) {
+ if (tk == tkDouble) {
+ if(cur() == tkString
+ && (sv == "Infinity" || sv == "-Infinity" || sv == "NaN")) {
+ curToken = tkDouble;
+ dv = sv == "Infinity" ?
+ std::numeric_limits<double>::infinity() :
+ sv == "-Infinity" ?
+ -std::numeric_limits<double>::infinity() :
+ std::numeric_limits<double>::quiet_NaN();
+ return;
+ } else if (cur() == tkLong) {
+ dv = double(lv);
+ return;
+ }
+ }
+ ostringstream oss;
+ oss << "Incorrect token in the stream. Expected: "
+ << JsonParser::toString(tk) << ", found "
+ << JsonParser::toString(cur());
+ throw Exception(oss.str());
+ }
+}
+
+JsonParser::Token JsonParser::doAdvance()
+{
+ char ch = next();
+ if (ch == ']') {
+ if (curState == stArray0 || curState == stArrayN) {
+ curState = stateStack.top();
+ stateStack.pop();
+ return tkArrayEnd;
+ } else {
+ throw unexpected(ch);
+ }
+ } else if (ch == '}') {
+ if (curState == stObject0 || curState == stObjectN) {
+ curState = stateStack.top();
+ stateStack.pop();
+ return tkObjectEnd;
+ } else {
+ throw unexpected(ch);
+ }
+ } else if (ch == ',') {
+ if (curState != stObjectN && curState != stArrayN) {
+ throw unexpected(ch);
+ }
+ if (curState == stObjectN) {
+ curState = stObject0;
+ }
+ ch = next();
+ } else if (ch == ':') {
+ if (curState != stKey) {
+ throw unexpected(ch);
+ }
+ curState = stObjectN;
+ ch = next();
+ }
+
+ if (curState == stObject0) {
+ if (ch != '"') {
+ throw unexpected(ch);
+ }
+ curState = stKey;
+ } else if (curState == stArray0) {
+ curState = stArrayN;
+ }
+
+ switch (ch) {
+ case '[':
+ stateStack.push(curState);
+ curState = stArray0;
+ return tkArrayStart;
+ case '{':
+ stateStack.push(curState);
+ curState = stObject0;
+ return tkObjectStart;
+ case '"':
+ return tryString();
+ case 't':
+ bv = true;
+ return tryLiteral("rue", 3, tkBool);
+ case 'f':
+ bv = false;
+ return tryLiteral("alse", 4, tkBool);
+ case 'n':
+ return tryLiteral("ull", 3, tkNull);
+ default:
+ if (isdigit(ch) || ch == '-') {
+ return tryNumber(ch);
+ } else {
+ throw unexpected(ch);
+ }
+ }
+}
+
+JsonParser::Token JsonParser::tryNumber(char ch)
+{
+ sv.clear();
+ sv.push_back(ch);
+
+ hasNext = false;
+ int state = (ch == '-') ? 0 : (ch == '0') ? 1 : 2;
+ for (; ;) {
+ switch (state) {
+ case 0:
+ if (in_.hasMore()) {
+ ch = in_.read();
+ if (isdigit(ch)) {
+ state = (ch == '0') ? 1 : 2;
+ sv.push_back(ch);
+ continue;
+ }
+ hasNext = true;
+ }
+ break;
+ case 1:
+ if (in_.hasMore()) {
+ ch = in_.read();
+ if (ch == '.') {
+ state = 3;
+ sv.push_back(ch);
+ continue;
+ } else if (ch == 'e' || ch == 'E') {
+ sv.push_back(ch);
+ state = 5;
+ continue;
+ }
+ hasNext = true;
+ }
+ break;
+ case 2:
+ if (in_.hasMore()) {
+ ch = in_.read();
+ if (isdigit(ch)) {
+ sv.push_back(ch);
+ continue;
+ } else if (ch == '.') {
+ state = 3;
+ sv.push_back(ch);
+ continue;
+ } else if (ch == 'e' || ch == 'E') {
+ sv.push_back(ch);
+ state = 5;
+ continue;
+ }
+ hasNext = true;
+ }
+ break;
+ case 3:
+ case 6:
+ if (in_.hasMore()) {
+ ch = in_.read();
+ if (isdigit(ch)) {
+ sv.push_back(ch);
+ state++;
+ continue;
+ }
+ hasNext = true;
+ }
+ break;
+ case 4:
+ if (in_.hasMore()) {
+ ch = in_.read();
+ if (isdigit(ch)) {
+ sv.push_back(ch);
+ continue;
+ } else if (ch == 'e' || ch == 'E') {
+ sv.push_back(ch);
+ state = 5;
+ continue;
+ }
+ hasNext = true;
+ }
+ break;
+ case 5:
+ if (in_.hasMore()) {
+ ch = in_.read();
+ if (ch == '+' || ch == '-') {
+ sv.push_back(ch);
+ state = 6;
+ continue;
+ } else if (isdigit(ch)) {
+ sv.push_back(ch);
+ state = 7;
+ continue;
+ }
+ hasNext = true;
+ }
+ break;
+ case 7:
+ if (in_.hasMore()) {
+ ch = in_.read();
+ if (isdigit(ch)) {
+ sv.push_back(ch);
+ continue;
+ }
+ hasNext = true;
+ }
+ break;
+ }
+ if (state == 1 || state == 2 || state == 4 || state == 7) {
+ if (hasNext) {
+ nextChar = ch;
+ }
+ std::istringstream iss(sv);
+ if (state == 1 || state == 2) {
+ iss >> lv;
+ return tkLong;
+ } else {
+ iss >> dv;
+ return tkDouble;
+ }
+ } else {
+ if (hasNext) {
+ throw unexpected(ch);
+ } else {
+ throw Exception("Unexpected EOF");
+ }
+ }
+ }
+}
+
+JsonParser::Token JsonParser::tryString()
+{
+ sv.clear();
+ for ( ; ;) {
+ char ch = in_.read();
+ if (ch == '"') {
+ return tkString;
+ } else if (ch == '\\') {
+ ch = in_.read();
+ switch (ch) {
+ case '"':
+ case '\\':
+ case '/':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ sv.push_back('\\');
+ sv.push_back(ch);
+ break;
+ case 'u':
+ case 'U':
+ {
+ uint32_t n = 0;
+ char e[4];
+ in_.readBytes(reinterpret_cast<uint8_t*>(e), 4);
+ sv.push_back('\\');
+ sv.push_back(ch);
+ for (int i = 0; i < 4; i++) {
+ n *= 16;
+ char c = e[i];
+ if (isdigit(c) ||
+ (c >= 'a' && c <= 'f') ||
+ (c >= 'A' && c <= 'F')) {
+ sv.push_back(c);
+ } else {
+ throw unexpected(c);
+ }
+ }
+ }
+ break;
+ default:
+ throw unexpected(ch);
+ }
+ } else {
+ sv.push_back(ch);
+ }
+ }
+}
+
+
+string JsonParser::decodeString(const string& s, bool binary)
+{
+ string result;
+ for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
+ char ch = *it;
+ if (ch == '\\') {
+ ch = *++it;
+ switch (ch) {
+ case '"':
+ case '\\':
+ case '/':
+ result.push_back(ch);
+ continue;
+ case 'b':
+ result.push_back('\b');
+ continue;
+ case 'f':
+ result.push_back('\f');
+ continue;
+ case 'n':
+ result.push_back('\n');
+ continue;
+ case 'r':
+ result.push_back('\r');
+ continue;
+ case 't':
+ result.push_back('\t');
+ continue;
+ case 'u':
+ case 'U':
+ {
+ uint32_t n = 0;
+ char e[4];
+ for (int i = 0; i < 4; i++) {
+ n *= 16;
+ char c = *++it;
+ e[i] = c;
+ if (isdigit(c)) {
+ n += c - '0';
+ } else if (c >= 'a' && c <= 'f') {
+ n += c - 'a' + 10;
+ } else if (c >= 'A' && c <= 'F') {
+ n += c - 'A' + 10;
+ }
+ }
+ if (binary) {
+ if (n > 0xff) {
+ throw Exception(boost::format(
+ "Invalid byte for binary: %1%%2%") % ch %
+ string(e, 4));
+ } else {
+ result.push_back(n);
+ continue;
+ }
+ }
+ if (n < 0x80) {
+ result.push_back(n);
+ } else if (n < 0x800) {
+ result.push_back((n >> 6) | 0xc0);
+ result.push_back((n & 0x3f) | 0x80);
+ } else if (n < 0x10000) {
+ result.push_back((n >> 12) | 0xe0);
+ result.push_back(((n >> 6)& 0x3f) | 0x80);
+ result.push_back((n & 0x3f) | 0x80);
+ } else if (n < 0x110000) {
+ result.push_back((n >> 18) | 0xf0);
+ result.push_back(((n >> 12)& 0x3f) | 0x80);
+ result.push_back(((n >> 6)& 0x3f) | 0x80);
+ result.push_back((n & 0x3f) | 0x80);
+ } else {
+ throw Exception(boost::format(
+ "Invalid unicode value: %1%i%2%") % ch %
+ string(e, 4));
+ }
+ }
+ continue;
+ }
+ } else {
+ result.push_back(ch);
+ }
+ }
+ return result;
+}
+
+Exception JsonParser::unexpected(unsigned char c)
+{
+ std::ostringstream oss;
+ oss << "Unexpected character in json " << toHex(c / 16) << toHex(c % 16);
+ return Exception(oss.str());
+}
+
+JsonParser::Token JsonParser::tryLiteral(const char exp[], size_t n, Token tk)
+{
+ char c[100];
+ in_.readBytes(reinterpret_cast<uint8_t*>(c), n);
+ for (size_t i = 0; i < n; ++i) {
+ if (c[i] != exp[i]) {
+ throw unexpected(c[i]);
+ }
+ }
+ if (in_.hasMore()) {
+ nextChar = in_.read();
+ if (isdigit(nextChar) || isalpha(nextChar)) {
+ throw unexpected(nextChar);
+ }
+ hasNext = true;
+ }
+ return tk;
+}
+
+}
+}
+
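
The parser above is a pull parser: each advance() or expectToken() consumes one token, with one character of lookahead and a state stack tracking array/object nesting. A rough sketch of driving it over an in-memory buffer, using the memoryInputStream factory seen earlier in JsonDom.cc:

    #include "json/JsonIO.hh"
    #include <cstring>
    #include <iostream>
    #include <memory>

    int main() {
        const char* text = "[1, \"two\", 3.5]";
        std::unique_ptr<avro::InputStream> in = avro::memoryInputStream(
            reinterpret_cast<const uint8_t*>(text), ::strlen(text));

        avro::json::JsonParser p;
        p.init(*in);
        p.expectToken(avro::json::JsonParser::tkArrayStart);
        p.expectToken(avro::json::JsonParser::tkLong);
        std::cout << p.longValue() << "\n";    // 1
        p.expectToken(avro::json::JsonParser::tkString);
        std::cout << p.stringValue() << "\n";  // two
        p.expectToken(avro::json::JsonParser::tkDouble);
        std::cout << p.doubleValue() << "\n";  // 3.5
        p.expectToken(avro::json::JsonParser::tkArrayEnd);
        return 0;
    }
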
diff --git a/contrib/libs/apache/avro/impl/json/JsonIO.hh b/contrib/libs/apache/avro/impl/json/JsonIO.hh
index 5ae7ae07dc5..705c5fc4fd7 100644
--- a/contrib/libs/apache/avro/impl/json/JsonIO.hh
+++ b/contrib/libs/apache/avro/impl/json/JsonIO.hh
@@ -1,482 +1,482 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_json_JsonIO_hh__
-#define avro_json_JsonIO_hh__
-
-#include <locale>
-#include <stack>
-#include <string>
-#include <sstream>
-#include <boost/math/special_functions/fpclassify.hpp>
-#include <boost/lexical_cast.hpp>
-#include <boost/utility.hpp>
-
-#include "Config.hh"
-#include "Stream.hh"
-
-namespace avro {
-namespace json {
-
-inline char toHex(unsigned int n) {
- return (n < 10) ? (n + '0') : (n + 'a' - 10);
-}
-
-
-class AVRO_DECL JsonParser : boost::noncopyable {
-public:
- enum Token {
- tkNull,
- tkBool,
- tkLong,
- tkDouble,
- tkString,
- tkArrayStart,
- tkArrayEnd,
- tkObjectStart,
- tkObjectEnd
- };
-
- size_t line() const { return line_; }
-
-private:
- enum State {
- stValue, // Expect a data type
- stArray0, // Expect a data type or ']'
- stArrayN, // Expect a ',' or ']'
- stObject0, // Expect a string or a '}'
- stObjectN, // Expect a ',' or '}'
- stKey // Expect a ':'
- };
- std::stack<State> stateStack;
- State curState;
- bool hasNext;
- char nextChar;
- bool peeked;
-
- StreamReader in_;
- Token curToken;
- bool bv;
- int64_t lv;
- double dv;
- std::string sv;
- size_t line_;
-
- Token doAdvance();
- Token tryLiteral(const char exp[], size_t n, Token tk);
- Token tryNumber(char ch);
- Token tryString();
- Exception unexpected(unsigned char ch);
- char next();
-
- static std::string decodeString(const std::string& s, bool binary);
-
-public:
- JsonParser() : curState(stValue), hasNext(false), peeked(false), line_(1) { }
-
- void init(InputStream& is) {
- // Clear by swapping with an empty stack
- std::stack<State>().swap(stateStack);
- curState = stValue;
- hasNext = false;
- peeked = false;
- line_ = 1;
- in_.reset(is);
- }
-
- Token advance() {
- if (! peeked) {
- curToken = doAdvance();
- } else {
- peeked = false;
- }
- return curToken;
- }
-
- Token peek() {
- if (! peeked) {
- curToken = doAdvance();
- peeked = true;
- }
- return curToken;
- }
-
- void expectToken(Token tk);
-
- bool boolValue() const {
- return bv;
- }
-
- Token cur() const {
- return curToken;
- }
-
- double doubleValue() const {
- return dv;
- }
-
- int64_t longValue() const {
- return lv;
- }
-
- const std::string& rawString() const {
- return sv;
- }
-
- std::string stringValue() const {
- return decodeString(sv, false);
- }
-
- std::string bytesValue() const {
- return decodeString(sv, true);
- }
-
- void drain() {
- if (!stateStack.empty() || peeked) {
- throw Exception("Invalid state for draining");
- }
- in_.drain(hasNext);
- hasNext = false;
- }
-
- /**
- * Return UTF-8 encoded string value.
- */
- static std::string toStringValue(const std::string& sv) {
- return decodeString(sv, false);
- }
-
- /**
- * Return byte-encoded string value. It is an error if the input
- * JSON string contains unicode characters greater than "\u00ff".
- */
- static std::string toBytesValue(const std::string& sv) {
- return decodeString(sv, true);
- }
-
- static const char* const tokenNames[];
-
- static const char* toString(Token tk) {
- return tokenNames[tk];
- }
-};
-
-class AVRO_DECL JsonNullFormatter {
-public:
- JsonNullFormatter(StreamWriter&) { }
-
- void handleObjectStart() {}
- void handleObjectEnd() {}
- void handleValueEnd() {}
- void handleColon() {}
-};
-
-class AVRO_DECL JsonPrettyFormatter {
- StreamWriter& out_;
- size_t level_;
- std::vector<uint8_t> indent_;
-
- static const int CHARS_PER_LEVEL = 2;
-
- void printIndent() {
- size_t charsToIndent = level_ * CHARS_PER_LEVEL;
- if (indent_.size() < charsToIndent) {
- indent_.resize(charsToIndent * 2, ' ');
- }
- out_.writeBytes(indent_.data(), charsToIndent);
- }
-public:
- JsonPrettyFormatter(StreamWriter& out) : out_(out), level_(0), indent_(10, ' ') { }
-
- void handleObjectStart() {
- out_.write('\n');
- ++level_;
- printIndent();
- }
-
- void handleObjectEnd() {
- out_.write('\n');
- --level_;
- printIndent();
- }
-
- void handleValueEnd() {
- out_.write('\n');
- printIndent();
- }
-
- void handleColon() {
- out_.write(' ');
- }
-};
-
-template <class F>
-class AVRO_DECL JsonGenerator {
- StreamWriter out_;
- F formatter_;
- enum State {
- stStart,
- stArray0,
- stArrayN,
- stMap0,
- stMapN,
- stKey,
- };
-
- std::stack<State> stateStack;
- State top;
-
- void write(const char *b, const char* p) {
- if (b != p) {
- out_.writeBytes(reinterpret_cast<const uint8_t*>(b), p - b);
- }
- }
-
- void escape(char c, const char* b, const char *p) {
- write(b, p);
- out_.write('\\');
- out_.write(c);
- }
-
- void escapeCtl(char c) {
- escapeUnicode(static_cast<uint8_t>(c));
- }
-
- void writeHex(char c) {
- out_.write(toHex((static_cast<unsigned char>(c)) / 16));
- out_.write(toHex((static_cast<unsigned char>(c)) % 16));
- }
-
- void escapeUnicode(uint32_t c) {
- out_.write('\\');
- out_.write('u');
- writeHex((c >> 8) & 0xff);
- writeHex(c & 0xff);
- }
- void doEncodeString(const char* b, size_t len, bool binary) {
- const char* e = b + len;
- out_.write('"');
- for (const char* p = b; p != e; p++) {
- if ((*p & 0x80) != 0) {
- write(b, p);
- if (binary) {
- escapeCtl(*p);
- } else if ((*p & 0x40) == 0) {
- throw Exception("Invalid UTF-8 sequence");
- } else {
- int more = 1;
- uint32_t value = 0;
- if ((*p & 0x20) != 0) {
- more++;
- if ((*p & 0x10) != 0) {
- more++;
- if ((*p & 0x08) != 0) {
- throw Exception("Invalid UTF-8 sequence");
- } else {
- value = *p & 0x07;
- }
- } else {
- value = *p & 0x0f;
- }
- } else {
- value = *p & 0x1f;
- }
- for (int i = 0; i < more; ++i) {
- if (++p == e || (*p & 0xc0) != 0x80) {
- throw Exception("Invalid UTF-8 sequence");
- }
- value <<= 6;
- value |= *p & 0x3f;
- }
- escapeUnicode(value);
- }
- } else {
- switch (*p) {
- case '\\':
- case '"':
- case '/':
- escape(*p, b, p);
- break;
- case '\b':
- escape('b', b, p);
- break;
- case '\f':
- escape('f', b, p);
- break;
- case '\n':
- escape('n', b, p);
- break;
- case '\r':
- escape('r', b, p);
- break;
- case '\t':
- escape('t', b, p);
- break;
- default:
- if (std::iscntrl(*p, std::locale::classic())) {
- write(b, p);
- escapeCtl(*p);
- break;
- } else {
- continue;
- }
- }
- }
- b = p + 1;
- }
- write(b, e);
- out_.write('"');
- }
-
- void sep() {
- if (top == stArrayN) {
- out_.write(',');
- formatter_.handleValueEnd();
- } else if (top == stArray0) {
- top = stArrayN;
- }
- }
-
- void sep2() {
- if (top == stKey) {
- top = stMapN;
- }
- }
-
-public:
- JsonGenerator() : formatter_(out_), top(stStart) { }
-
- void init(OutputStream& os) {
- out_.reset(os);
- }
-
- void flush() {
- out_.flush();
- }
-
- int64_t byteCount() const {
- return out_.byteCount();
- }
-
- void encodeNull() {
- sep();
- out_.writeBytes(reinterpret_cast<const uint8_t*>("null"), 4);
- sep2();
- }
-
- void encodeBool(bool b) {
- sep();
- if (b) {
- out_.writeBytes(reinterpret_cast<const uint8_t*>("true"), 4);
- } else {
- out_.writeBytes(reinterpret_cast<const uint8_t*>("false"), 5);
- }
- sep2();
- }
-
- template <typename T>
- void encodeNumber(T t) {
- sep();
- std::ostringstream oss;
- oss << boost::lexical_cast<std::string>(t);
- const std::string& s = oss.str();
- out_.writeBytes(reinterpret_cast<const uint8_t*>(s.data()), s.size());
- sep2();
- }
-
- void encodeNumber(double t) {
- sep();
- std::ostringstream oss;
- if (boost::math::isfinite(t)) {
- oss << boost::lexical_cast<std::string>(t);
- } else if (boost::math::isnan(t)) {
- oss << "NaN";
- } else if (t == std::numeric_limits<double>::infinity()) {
- oss << "Infinity";
- } else {
- oss << "-Infinity";
- }
- const std::string& s = oss.str();
- out_.writeBytes(reinterpret_cast<const uint8_t*>(s.data()), s.size());
- sep2();
- }
-
-
- void encodeString(const std::string& s) {
- if (top == stMap0) {
- top = stKey;
- } else if (top == stMapN) {
- out_.write(',');
- formatter_.handleValueEnd();
- top = stKey;
- } else if (top == stKey) {
- top = stMapN;
- } else {
- sep();
- }
- doEncodeString(s.c_str(), s.size(), false);
- if (top == stKey) {
- out_.write(':');
- formatter_.handleColon();
- }
- }
-
- void encodeBinary(const uint8_t* bytes, size_t len) {
- sep();
- doEncodeString(reinterpret_cast<const char *>(bytes), len, true);
- sep2();
- }
-
- void arrayStart() {
- sep();
- stateStack.push(top);
- top = stArray0;
- out_.write('[');
- formatter_.handleObjectStart();
- }
-
- void arrayEnd() {
- top = stateStack.top();
- stateStack.pop();
- formatter_.handleObjectEnd();
- out_.write(']');
- sep2();
- }
-
- void objectStart() {
- sep();
- stateStack.push(top);
- top = stMap0;
- out_.write('{');
- formatter_.handleObjectStart();
- }
-
- void objectEnd() {
- top = stateStack.top();
- stateStack.pop();
- formatter_.handleObjectEnd();
- out_.write('}');
- sep2();
- }
-
-};
-
-}
-}
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_json_JsonIO_hh__
+#define avro_json_JsonIO_hh__
+
+#include <locale>
+#include <stack>
+#include <string>
+#include <sstream>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/utility.hpp>
+
+#include "Config.hh"
+#include "Stream.hh"
+
+namespace avro {
+namespace json {
+
+inline char toHex(unsigned int n) {
+ return (n < 10) ? (n + '0') : (n + 'a' - 10);
+}
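+// For example, toHex(3) == '3', toHex(10) == 'a', toHex(15) == 'f'. The
+// callers below always pass a value in [0, 16), so no range check is needed.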
+
+
+class AVRO_DECL JsonParser : boost::noncopyable {
+public:
+ enum Token {
+ tkNull,
+ tkBool,
+ tkLong,
+ tkDouble,
+ tkString,
+ tkArrayStart,
+ tkArrayEnd,
+ tkObjectStart,
+ tkObjectEnd
+ };
+
+ size_t line() const { return line_; }
+
+private:
+ enum State {
+ stValue, // Expect a data type
+ stArray0, // Expect a data type or ']'
+ stArrayN, // Expect a ',' or ']'
+ stObject0, // Expect a string or a '}'
+ stObjectN, // Expect a ',' or '}'
+ stKey // Expect a ':'
+ };
+ std::stack<State> stateStack;
+ State curState;
+ bool hasNext;
+ char nextChar;
+ bool peeked;
+
+ StreamReader in_;
+ Token curToken;
+ bool bv;
+ int64_t lv;
+ double dv;
+ std::string sv;
+ size_t line_;
+
+ Token doAdvance();
+ Token tryLiteral(const char exp[], size_t n, Token tk);
+ Token tryNumber(char ch);
+ Token tryString();
+ Exception unexpected(unsigned char ch);
+ char next();
+
+ static std::string decodeString(const std::string& s, bool binary);
+
+public:
+ JsonParser() : curState(stValue), hasNext(false), peeked(false), line_(1) { }
+
+ void init(InputStream& is) {
+ // Clear by swapping with an empty stack
+ std::stack<State>().swap(stateStack);
+ curState = stValue;
+ hasNext = false;
+ peeked = false;
+ line_ = 1;
+ in_.reset(is);
+ }
+
+ Token advance() {
+ if (! peeked) {
+ curToken = doAdvance();
+ } else {
+ peeked = false;
+ }
+ return curToken;
+ }
+
+ Token peek() {
+ if (! peeked) {
+ curToken = doAdvance();
+ peeked = true;
+ }
+ return curToken;
+ }
+
+ void expectToken(Token tk);
+
+ bool boolValue() const {
+ return bv;
+ }
+
+ Token cur() const {
+ return curToken;
+ }
+
+ double doubleValue() const {
+ return dv;
+ }
+
+ int64_t longValue() const {
+ return lv;
+ }
+
+ const std::string& rawString() const {
+ return sv;
+ }
+
+ std::string stringValue() const {
+ return decodeString(sv, false);
+ }
+
+ std::string bytesValue() const {
+ return decodeString(sv, true);
+ }
+
+ void drain() {
+ if (!stateStack.empty() || peeked) {
+ throw Exception("Invalid state for draining");
+ }
+ in_.drain(hasNext);
+ hasNext = false;
+ }
+
+ /**
+ * Return UTF-8 encoded string value.
+ */
+ static std::string toStringValue(const std::string& sv) {
+ return decodeString(sv, false);
+ }
+
+ /**
+     * Return byte-encoded string value. It is an error if the input
+     * JSON string contains Unicode code points above "\u00ff".
+ */
+ static std::string toBytesValue(const std::string& sv) {
+ return decodeString(sv, true);
+ }
+
+ static const char* const tokenNames[];
+
+ static const char* toString(Token tk) {
+ return tokenNames[tk];
+ }
+};
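+// Illustrative token stream: for the input {"a": [1, true]} successive
+// advance() calls yield tkObjectStart, tkString ("a"), tkArrayStart,
+// tkLong (1), tkBool (true), tkArrayEnd and tkObjectEnd; peek() returns the
+// same token as the next advance() without consuming it.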
+
+class AVRO_DECL JsonNullFormatter {
+public:
+ JsonNullFormatter(StreamWriter&) { }
+
+ void handleObjectStart() {}
+ void handleObjectEnd() {}
+ void handleValueEnd() {}
+ void handleColon() {}
+};
+
+class AVRO_DECL JsonPrettyFormatter {
+ StreamWriter& out_;
+ size_t level_;
+ std::vector<uint8_t> indent_;
+
+ static const int CHARS_PER_LEVEL = 2;
+
+ void printIndent() {
+ size_t charsToIndent = level_ * CHARS_PER_LEVEL;
+ if (indent_.size() < charsToIndent) {
+ indent_.resize(charsToIndent * 2, ' ');
+ }
+ out_.writeBytes(indent_.data(), charsToIndent);
+ }
+public:
+ JsonPrettyFormatter(StreamWriter& out) : out_(out), level_(0), indent_(10, ' ') { }
+
+ void handleObjectStart() {
+ out_.write('\n');
+ ++level_;
+ printIndent();
+ }
+
+ void handleObjectEnd() {
+ out_.write('\n');
+ --level_;
+ printIndent();
+ }
+
+ void handleValueEnd() {
+ out_.write('\n');
+ printIndent();
+ }
+
+ void handleColon() {
+ out_.write(' ');
+ }
+};
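+// Illustrative output: plugged into JsonGenerator below, this formatter turns
+// the compact {"a":1} into
+//   {
+//     "a": 1
+//   }
+// i.e. a newline plus CHARS_PER_LEVEL spaces per nesting level, and a single
+// space after each ':'.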
+
+template <class F>
+class AVRO_DECL JsonGenerator {
+ StreamWriter out_;
+ F formatter_;
+ enum State {
+ stStart,
+ stArray0,
+ stArrayN,
+ stMap0,
+ stMapN,
+ stKey,
+ };
+
+ std::stack<State> stateStack;
+ State top;
+
+ void write(const char *b, const char* p) {
+ if (b != p) {
+ out_.writeBytes(reinterpret_cast<const uint8_t*>(b), p - b);
+ }
+ }
+
+ void escape(char c, const char* b, const char *p) {
+ write(b, p);
+ out_.write('\\');
+ out_.write(c);
+ }
+
+ void escapeCtl(char c) {
+ escapeUnicode(static_cast<uint8_t>(c));
+ }
+
+ void writeHex(char c) {
+ out_.write(toHex((static_cast<unsigned char>(c)) / 16));
+ out_.write(toHex((static_cast<unsigned char>(c)) % 16));
+ }
+
+ void escapeUnicode(uint32_t c) {
+ out_.write('\\');
+ out_.write('u');
+ writeHex((c >> 8) & 0xff);
+ writeHex(c & 0xff);
+ }
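+    // Example: escapeUnicode(0xe9) emits the six characters \u00e9. Only the
+    // low 16 bits of c are written, so code points above U+FFFF would be
+    // truncated rather than encoded as surrogate pairs.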
+ void doEncodeString(const char* b, size_t len, bool binary) {
+ const char* e = b + len;
+ out_.write('"');
+ for (const char* p = b; p != e; p++) {
+ if ((*p & 0x80) != 0) {
+ write(b, p);
+ if (binary) {
+ escapeCtl(*p);
+ } else if ((*p & 0x40) == 0) {
+ throw Exception("Invalid UTF-8 sequence");
+ } else {
+ int more = 1;
+ uint32_t value = 0;
+ if ((*p & 0x20) != 0) {
+ more++;
+ if ((*p & 0x10) != 0) {
+ more++;
+ if ((*p & 0x08) != 0) {
+ throw Exception("Invalid UTF-8 sequence");
+ } else {
+ value = *p & 0x07;
+ }
+ } else {
+ value = *p & 0x0f;
+ }
+ } else {
+ value = *p & 0x1f;
+ }
+ for (int i = 0; i < more; ++i) {
+ if (++p == e || (*p & 0xc0) != 0x80) {
+ throw Exception("Invalid UTF-8 sequence");
+ }
+ value <<= 6;
+ value |= *p & 0x3f;
+ }
+ escapeUnicode(value);
+ }
+ } else {
+ switch (*p) {
+ case '\\':
+ case '"':
+ case '/':
+ escape(*p, b, p);
+ break;
+ case '\b':
+ escape('b', b, p);
+ break;
+ case '\f':
+ escape('f', b, p);
+ break;
+ case '\n':
+ escape('n', b, p);
+ break;
+ case '\r':
+ escape('r', b, p);
+ break;
+ case '\t':
+ escape('t', b, p);
+ break;
+ default:
+ if (std::iscntrl(*p, std::locale::classic())) {
+ write(b, p);
+ escapeCtl(*p);
+ break;
+ } else {
+ continue;
+ }
+ }
+ }
+ b = p + 1;
+ }
+ write(b, e);
+ out_.write('"');
+ }
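+    // Example: with binary == false the two-byte UTF-8 sequence 0xC3 0xA9
+    // ("é") is decoded to code point 0xE9 and emitted as \u00e9; with
+    // binary == true each byte >= 0x80 is escaped on its own, giving
+    // \u00c3\u00a9. Control characters with no short escape of their own
+    // (i.e. other than \b, \f, \n, \r, \t) also go through escapeCtl().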
+
+ void sep() {
+ if (top == stArrayN) {
+ out_.write(',');
+ formatter_.handleValueEnd();
+ } else if (top == stArray0) {
+ top = stArrayN;
+ }
+ }
+
+ void sep2() {
+ if (top == stKey) {
+ top = stMapN;
+ }
+ }
+
+public:
+ JsonGenerator() : formatter_(out_), top(stStart) { }
+
+ void init(OutputStream& os) {
+ out_.reset(os);
+ }
+
+ void flush() {
+ out_.flush();
+ }
+
+ int64_t byteCount() const {
+ return out_.byteCount();
+ }
+
+ void encodeNull() {
+ sep();
+ out_.writeBytes(reinterpret_cast<const uint8_t*>("null"), 4);
+ sep2();
+ }
+
+ void encodeBool(bool b) {
+ sep();
+ if (b) {
+ out_.writeBytes(reinterpret_cast<const uint8_t*>("true"), 4);
+ } else {
+ out_.writeBytes(reinterpret_cast<const uint8_t*>("false"), 5);
+ }
+ sep2();
+ }
+
+ template <typename T>
+ void encodeNumber(T t) {
+ sep();
+ std::ostringstream oss;
+ oss << boost::lexical_cast<std::string>(t);
+ const std::string& s = oss.str();
+ out_.writeBytes(reinterpret_cast<const uint8_t*>(s.data()), s.size());
+ sep2();
+ }
+
+ void encodeNumber(double t) {
+ sep();
+ std::ostringstream oss;
+ if (boost::math::isfinite(t)) {
+ oss << boost::lexical_cast<std::string>(t);
+ } else if (boost::math::isnan(t)) {
+ oss << "NaN";
+ } else if (t == std::numeric_limits<double>::infinity()) {
+ oss << "Infinity";
+ } else {
+ oss << "-Infinity";
+ }
+ const std::string& s = oss.str();
+ out_.writeBytes(reinterpret_cast<const uint8_t*>(s.data()), s.size());
+ sep2();
+ }
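+    // Note: NaN, Infinity and -Infinity are not legal JSON number tokens.
+    // The Avro JsonEncoder (JsonCodec.cc) avoids emitting them by encoding
+    // non-finite floats and doubles as the strings "NaN", "Infinity" and
+    // "-Infinity"; the bare spellings produced by this overload are a
+    // fallback for direct users of JsonGenerator.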
+
+
+ void encodeString(const std::string& s) {
+ if (top == stMap0) {
+ top = stKey;
+ } else if (top == stMapN) {
+ out_.write(',');
+ formatter_.handleValueEnd();
+ top = stKey;
+ } else if (top == stKey) {
+ top = stMapN;
+ } else {
+ sep();
+ }
+ doEncodeString(s.c_str(), s.size(), false);
+ if (top == stKey) {
+ out_.write(':');
+ formatter_.handleColon();
+ }
+ }
+
+ void encodeBinary(const uint8_t* bytes, size_t len) {
+ sep();
+ doEncodeString(reinterpret_cast<const char *>(bytes), len, true);
+ sep2();
+ }
+
+ void arrayStart() {
+ sep();
+ stateStack.push(top);
+ top = stArray0;
+ out_.write('[');
+ formatter_.handleObjectStart();
+ }
+
+ void arrayEnd() {
+ top = stateStack.top();
+ stateStack.pop();
+ formatter_.handleObjectEnd();
+ out_.write(']');
+ sep2();
+ }
+
+ void objectStart() {
+ sep();
+ stateStack.push(top);
+ top = stMap0;
+ out_.write('{');
+ formatter_.handleObjectStart();
+ }
+
+ void objectEnd() {
+ top = stateStack.top();
+ stateStack.pop();
+ formatter_.handleObjectEnd();
+ out_.write('}');
+ sep2();
+ }
+
+};
+
+}
+}
+
+#endif
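+
+// Illustrative usage sketch (not part of the header itself; assumes the
+// memoryOutputStream() helper declared in Stream.hh): emitting {"a":1}
+// with the compact formatter.
+//
+//   std::unique_ptr<avro::OutputStream> os = avro::memoryOutputStream();
+//   avro::json::JsonGenerator<avro::json::JsonNullFormatter> g;
+//   g.init(*os);
+//   g.objectStart();
+//   g.encodeString("a");   // the first string inside a map becomes the key
+//   g.encodeNumber(1);
+//   g.objectEnd();
+//   g.flush();             // *os now holds the text {"a":1}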
diff --git a/contrib/libs/apache/avro/impl/parsing/JsonCodec.cc b/contrib/libs/apache/avro/impl/parsing/JsonCodec.cc
index 8bca2984aee..73271fca55b 100644
--- a/contrib/libs/apache/avro/impl/parsing/JsonCodec.cc
+++ b/contrib/libs/apache/avro/impl/parsing/JsonCodec.cc
@@ -1,718 +1,718 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define __STDC_LIMIT_MACROS
-
-#include <string>
-#include <map>
-#include <algorithm>
-#include <ctype.h>
-#include <memory>
-#include <boost/math/special_functions/fpclassify.hpp>
-
-#include "ValidatingCodec.hh"
-#include "Symbol.hh"
-#include "ValidSchema.hh"
-#include "Decoder.hh"
-#include "Encoder.hh"
-#include "NodeImpl.hh"
-
-#include "../json/JsonIO.hh"
-
-namespace avro {
-
-namespace parsing {
-
-using std::make_shared;
-
-using std::map;
-using std::vector;
-using std::string;
-using std::reverse;
-using std::ostringstream;
-using std::istringstream;
-
-using avro::json::JsonParser;
-using avro::json::JsonGenerator;
-using avro::json::JsonNullFormatter;
-
-class JsonGrammarGenerator : public ValidatingGrammarGenerator {
- ProductionPtr doGenerate(const NodePtr& n,
- std::map<NodePtr, ProductionPtr> &m);
-};
-
-static std::string nameOf(const NodePtr& n)
-{
- if (n->hasName()) {
- return n->name();
- }
- std::ostringstream oss;
- oss << n->type();
- return oss.str();
-}
-
-ProductionPtr JsonGrammarGenerator::doGenerate(const NodePtr& n,
- std::map<NodePtr, ProductionPtr> &m) {
- switch (n->type()) {
- case AVRO_NULL:
- case AVRO_BOOL:
- case AVRO_INT:
- case AVRO_LONG:
- case AVRO_FLOAT:
- case AVRO_DOUBLE:
- case AVRO_STRING:
- case AVRO_BYTES:
- case AVRO_FIXED:
- case AVRO_ARRAY:
- case AVRO_MAP:
- case AVRO_SYMBOLIC:
- return ValidatingGrammarGenerator::doGenerate(n, m);
- case AVRO_RECORD:
- {
- ProductionPtr result = make_shared<Production>();
-
- m.erase(n);
-
- size_t c = n->leaves();
- result->reserve(2 + 2 * c);
- result->push_back(Symbol::recordStartSymbol());
- for (size_t i = 0; i < c; ++i) {
- const NodePtr& leaf = n->leafAt(i);
- ProductionPtr v = doGenerate(leaf, m);
- result->push_back(Symbol::fieldSymbol(n->nameAt(i)));
- copy(v->rbegin(), v->rend(), back_inserter(*result));
- }
- result->push_back(Symbol::recordEndSymbol());
- reverse(result->begin(), result->end());
-
- m[n] = result;
- return make_shared<Production>(1, Symbol::indirect(result));
- }
- case AVRO_ENUM:
- {
- vector<string> nn;
- size_t c = n->names();
- nn.reserve(c);
- for (size_t i = 0; i < c; ++i) {
- nn.push_back(n->nameAt(i));
- }
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::nameListSymbol(nn));
- result->push_back(Symbol::enumSymbol());
- m[n] = result;
- return result;
- }
- case AVRO_UNION:
- {
- size_t c = n->leaves();
-
- vector<ProductionPtr> vv;
- vv.reserve(c);
-
- vector<string> names;
- names.reserve(c);
-
- for (size_t i = 0; i < c; ++i) {
- const NodePtr& nn = n->leafAt(i);
- ProductionPtr v = doGenerate(nn, m);
- if (nn->type() != AVRO_NULL) {
- ProductionPtr v2 = make_shared<Production>();
- v2->push_back(Symbol::recordEndSymbol());
- copy(v->begin(), v->end(), back_inserter(*v2));
- v.swap(v2);
- }
- vv.push_back(v);
- names.push_back(nameOf(nn));
- }
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::alternative(vv));
- result->push_back(Symbol::nameListSymbol(names));
- result->push_back(Symbol::unionSymbol());
- return result;
- }
- default:
- throw Exception("Unknown node type");
- }
-}
-
-static void expectToken(JsonParser& in, JsonParser::Token tk)
-{
- in.expectToken(tk);
-}
-
-class JsonDecoderHandler {
- JsonParser& in_;
-public:
- JsonDecoderHandler(JsonParser& p) : in_(p) { }
- size_t handle(const Symbol& s) {
- switch (s.kind()) {
- case Symbol::sRecordStart:
- expectToken(in_, JsonParser::tkObjectStart);
- break;
- case Symbol::sRecordEnd:
- expectToken(in_, JsonParser::tkObjectEnd);
- break;
- case Symbol::sField:
- expectToken(in_, JsonParser::tkString);
- if (s.extra<string>() != in_.stringValue()) {
- throw Exception("Incorrect field");
- }
- break;
- default:
- break;
- }
- return 0;
- }
-};
-
-template <typename P>
-class JsonDecoder : public Decoder {
- JsonParser in_;
- JsonDecoderHandler handler_;
- P parser_;
-
- void init(InputStream& is);
- void decodeNull();
- bool decodeBool();
- int32_t decodeInt();
- int64_t decodeLong();
- float decodeFloat();
- double decodeDouble();
- void decodeString(string& value);
- void skipString();
- void decodeBytes(vector<uint8_t>& value);
- void skipBytes();
- void decodeFixed(size_t n, vector<uint8_t>& value);
- void skipFixed(size_t n);
- size_t decodeEnum();
- size_t arrayStart();
- size_t arrayNext();
- size_t skipArray();
- size_t mapStart();
- size_t mapNext();
- size_t skipMap();
- size_t decodeUnionIndex();
-
- void expect(JsonParser::Token tk);
- void skipComposite();
- void drain();
-public:
-
- JsonDecoder(const ValidSchema& s) :
- handler_(in_),
- parser_(JsonGrammarGenerator().generate(s), NULL, handler_) { }
-
-};
-
-template <typename P>
-void JsonDecoder<P>::init(InputStream& is)
-{
- in_.init(is);
- parser_.reset();
-}
-
-template <typename P>
-void JsonDecoder<P>::expect(JsonParser::Token tk)
-{
- expectToken(in_, tk);
-}
-
-template <typename P>
-void JsonDecoder<P>::decodeNull()
-{
- parser_.advance(Symbol::sNull);
- expect(JsonParser::tkNull);
-}
-
-template <typename P>
-bool JsonDecoder<P>::decodeBool()
-{
- parser_.advance(Symbol::sBool);
- expect(JsonParser::tkBool);
- bool result = in_.boolValue();
- return result;
-}
-
-template <typename P>
-int32_t JsonDecoder<P>::decodeInt()
-{
- parser_.advance(Symbol::sInt);
- expect(JsonParser::tkLong);
- int64_t result = in_.longValue();
- if (result < INT32_MIN || result > INT32_MAX) {
- throw Exception(boost::format("Value out of range for Avro int: %1%")
- % result);
- }
- return static_cast<int32_t>(result);
-}
-
-template <typename P>
-int64_t JsonDecoder<P>::decodeLong()
-{
- parser_.advance(Symbol::sLong);
- expect(JsonParser::tkLong);
- int64_t result = in_.longValue();
- return result;
-}
-
-template <typename P>
-float JsonDecoder<P>::decodeFloat()
-{
- parser_.advance(Symbol::sFloat);
- expect(JsonParser::tkDouble);
- double result = in_.doubleValue();
- return static_cast<float>(result);
-}
-
-template <typename P>
-double JsonDecoder<P>::decodeDouble()
-{
- parser_.advance(Symbol::sDouble);
- expect(JsonParser::tkDouble);
- double result = in_.doubleValue();
- return result;
-}
-
-template <typename P>
-void JsonDecoder<P>::decodeString(string& value)
-{
- parser_.advance(Symbol::sString);
- expect(JsonParser::tkString);
- value = in_.stringValue();
-}
-
-template <typename P>
-void JsonDecoder<P>::skipString()
-{
- parser_.advance(Symbol::sString);
- expect(JsonParser::tkString);
-}
-
-static vector<uint8_t> toBytes(const string& s)
-{
- return vector<uint8_t>(s.begin(), s.end());
-}
-
-template <typename P>
-void JsonDecoder<P>::decodeBytes(vector<uint8_t>& value )
-{
- parser_.advance(Symbol::sBytes);
- expect(JsonParser::tkString);
- value = toBytes(in_.bytesValue());
-}
-
-template <typename P>
-void JsonDecoder<P>::skipBytes()
-{
- parser_.advance(Symbol::sBytes);
- expect(JsonParser::tkString);
-}
-
-template <typename P>
-void JsonDecoder<P>::decodeFixed(size_t n, vector<uint8_t>& value)
-{
- parser_.advance(Symbol::sFixed);
- parser_.assertSize(n);
- expect(JsonParser::tkString);
- value = toBytes(in_.bytesValue());
- if (value.size() != n) {
- throw Exception("Incorrect value for fixed");
- }
-}
-
-template <typename P>
-void JsonDecoder<P>::skipFixed(size_t n)
-{
- parser_.advance(Symbol::sFixed);
- parser_.assertSize(n);
- expect(JsonParser::tkString);
- vector<uint8_t> result = toBytes(in_.bytesValue());
- if (result.size() != n) {
- throw Exception("Incorrect value for fixed");
- }
-}
-
-template <typename P>
-size_t JsonDecoder<P>::decodeEnum()
-{
- parser_.advance(Symbol::sEnum);
- expect(JsonParser::tkString);
- size_t result = parser_.indexForName(in_.stringValue());
- return result;
-}
-
-template <typename P>
-size_t JsonDecoder<P>::arrayStart()
-{
- parser_.advance(Symbol::sArrayStart);
- parser_.pushRepeatCount(0);
- expect(JsonParser::tkArrayStart);
- return arrayNext();
-}
-
-template <typename P>
-size_t JsonDecoder<P>::arrayNext()
-{
- parser_.processImplicitActions();
- if (in_.peek() == JsonParser::tkArrayEnd) {
- in_.advance();
- parser_.popRepeater();
- parser_.advance(Symbol::sArrayEnd);
- return 0;
- }
- parser_.nextRepeatCount(1);
- return 1;
-}
-
-template<typename P>
-void JsonDecoder<P>::skipComposite()
-{
- size_t level = 0;
- for (; ;) {
- switch (in_.advance()) {
- case JsonParser::tkArrayStart:
- case JsonParser::tkObjectStart:
- ++level;
- continue;
- case JsonParser::tkArrayEnd:
- case JsonParser::tkObjectEnd:
- if (level == 0) {
- return;
- }
- --level;
- continue;
- default:
- continue;
- }
- }
-}
-
-template<typename P>
-void JsonDecoder<P>::drain()
-{
- parser_.processImplicitActions();
- in_.drain();
-}
-
-template <typename P>
-size_t JsonDecoder<P>::skipArray()
-{
- parser_.advance(Symbol::sArrayStart);
- parser_.pop();
- parser_.advance(Symbol::sArrayEnd);
- expect(JsonParser::tkArrayStart);
- skipComposite();
- return 0;
-}
-
-template <typename P>
-size_t JsonDecoder<P>::mapStart()
-{
- parser_.advance(Symbol::sMapStart);
- parser_.pushRepeatCount(0);
- expect(JsonParser::tkObjectStart);
- return mapNext();
-}
-
-template <typename P>
-size_t JsonDecoder<P>::mapNext()
-{
- parser_.processImplicitActions();
- if (in_.peek() == JsonParser::tkObjectEnd) {
- in_.advance();
- parser_.popRepeater();
- parser_.advance(Symbol::sMapEnd);
- return 0;
- }
- parser_.nextRepeatCount(1);
- return 1;
-}
-
-template <typename P>
-size_t JsonDecoder<P>::skipMap()
-{
- parser_.advance(Symbol::sMapStart);
- parser_.pop();
- parser_.advance(Symbol::sMapEnd);
- expect(JsonParser::tkObjectStart);
- skipComposite();
- return 0;
-}
-
-template <typename P>
-size_t JsonDecoder<P>::decodeUnionIndex()
-{
- parser_.advance(Symbol::sUnion);
-
- size_t result;
- if (in_.peek() == JsonParser::tkNull) {
- result = parser_.indexForName("null");
- } else {
- expect(JsonParser::tkObjectStart);
- expect(JsonParser::tkString);
- result = parser_.indexForName(in_.stringValue());
- }
- parser_.selectBranch(result);
- return result;
-}
-
-template<typename F = JsonNullFormatter>
-class JsonHandler {
- JsonGenerator<F>& generator_;
-public:
- JsonHandler(JsonGenerator<F>& g) : generator_(g) { }
- size_t handle(const Symbol& s) {
- switch (s.kind()) {
- case Symbol::sRecordStart:
- generator_.objectStart();
- break;
- case Symbol::sRecordEnd:
- generator_.objectEnd();
- break;
- case Symbol::sField:
- generator_.encodeString(s.extra<string>());
- break;
- default:
- break;
- }
- return 0;
- }
-};
-
-template <typename P, typename F = JsonNullFormatter>
-class JsonEncoder : public Encoder {
- JsonGenerator<F> out_;
- JsonHandler<F> handler_;
- P parser_;
-
- void init(OutputStream& os);
- void flush();
- int64_t byteCount() const;
- void encodeNull();
- void encodeBool(bool b);
- void encodeInt(int32_t i);
- void encodeLong(int64_t l);
- void encodeFloat(float f);
- void encodeDouble(double d);
- void encodeString(const std::string& s);
- void encodeBytes(const uint8_t *bytes, size_t len);
- void encodeFixed(const uint8_t *bytes, size_t len);
- void encodeEnum(size_t e);
- void arrayStart();
- void arrayEnd();
- void mapStart();
- void mapEnd();
- void setItemCount(size_t count);
- void startItem();
- void encodeUnionIndex(size_t e);
-public:
- JsonEncoder(const ValidSchema& schema) :
- handler_(out_),
- parser_(JsonGrammarGenerator().generate(schema), NULL, handler_) { }
-};
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::init(OutputStream& os)
-{
- out_.init(os);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::flush()
-{
- parser_.processImplicitActions();
- out_.flush();
-}
-
-template<typename P, typename F>
-int64_t JsonEncoder<P, F>::byteCount() const
-{
- return out_.byteCount();
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeNull()
-{
- parser_.advance(Symbol::sNull);
- out_.encodeNull();
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeBool(bool b)
-{
- parser_.advance(Symbol::sBool);
- out_.encodeBool(b);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeInt(int32_t i)
-{
- parser_.advance(Symbol::sInt);
- out_.encodeNumber(i);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeLong(int64_t l)
-{
- parser_.advance(Symbol::sLong);
- out_.encodeNumber(l);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeFloat(float f)
-{
- parser_.advance(Symbol::sFloat);
- if (f == std::numeric_limits<float>::infinity()) {
- out_.encodeString("Infinity");
- } else if (f == -std::numeric_limits<float>::infinity()) {
- out_.encodeString("-Infinity");
- } else if (boost::math::isnan(f)) {
- out_.encodeString("NaN");
- } else {
- out_.encodeNumber(f);
- }
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeDouble(double d)
-{
- parser_.advance(Symbol::sDouble);
- if (d == std::numeric_limits<double>::infinity()) {
- out_.encodeString("Infinity");
- } else if (d == -std::numeric_limits<double>::infinity()) {
- out_.encodeString("-Infinity");
- } else if (boost::math::isnan(d)) {
- out_.encodeString("NaN");
- } else {
- out_.encodeNumber(d);
- }
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeString(const std::string& s)
-{
- parser_.advance(Symbol::sString);
- out_.encodeString(s);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeBytes(const uint8_t *bytes, size_t len)
-{
- parser_.advance(Symbol::sBytes);
- out_.encodeBinary(bytes, len);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeFixed(const uint8_t *bytes, size_t len)
-{
- parser_.advance(Symbol::sFixed);
- parser_.assertSize(len);
- out_.encodeBinary(bytes, len);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeEnum(size_t e)
-{
- parser_.advance(Symbol::sEnum);
- const string& s = parser_.nameForIndex(e);
- out_.encodeString(s);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::arrayStart()
-{
- parser_.advance(Symbol::sArrayStart);
- parser_.pushRepeatCount(0);
- out_.arrayStart();
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::arrayEnd()
-{
- parser_.popRepeater();
- parser_.advance(Symbol::sArrayEnd);
- out_.arrayEnd();
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::mapStart()
-{
- parser_.advance(Symbol::sMapStart);
- parser_.pushRepeatCount(0);
- out_.objectStart();
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::mapEnd()
-{
- parser_.popRepeater();
- parser_.advance(Symbol::sMapEnd);
- out_.objectEnd();
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::setItemCount(size_t count)
-{
- parser_.nextRepeatCount(count);
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::startItem()
-{
- parser_.processImplicitActions();
- if (parser_.top() != Symbol::sRepeater) {
-        throw Exception("startItem called when not at an item boundary");
- }
-}
-
-template<typename P, typename F>
-void JsonEncoder<P, F>::encodeUnionIndex(size_t e)
-{
- parser_.advance(Symbol::sUnion);
-
- const std::string name = parser_.nameForIndex(e);
-
- if (name != "null") {
- out_.objectStart();
- out_.encodeString(name);
- }
- parser_.selectBranch(e);
-}
-
-} // namespace parsing
-
-DecoderPtr jsonDecoder(const ValidSchema& s)
-{
- return std::make_shared<parsing::JsonDecoder<
- parsing::SimpleParser<parsing::JsonDecoderHandler> > >(s);
-}
-
-EncoderPtr jsonEncoder(const ValidSchema& schema)
-{
- return std::make_shared<parsing::JsonEncoder<
- parsing::SimpleParser<parsing::JsonHandler<avro::json::JsonNullFormatter> >, avro::json::JsonNullFormatter> >(schema);
-}
-
-EncoderPtr jsonPrettyEncoder(const ValidSchema& schema)
-{
- return std::make_shared<parsing::JsonEncoder<
- parsing::SimpleParser<parsing::JsonHandler<avro::json::JsonPrettyFormatter> >, avro::json::JsonPrettyFormatter> >(schema);
-}
-
-} // namespace avro
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define __STDC_LIMIT_MACROS
+
+#include <string>
+#include <map>
+#include <algorithm>
+#include <ctype.h>
+#include <memory>
+#include <boost/math/special_functions/fpclassify.hpp>
+
+#include "ValidatingCodec.hh"
+#include "Symbol.hh"
+#include "ValidSchema.hh"
+#include "Decoder.hh"
+#include "Encoder.hh"
+#include "NodeImpl.hh"
+
+#include "../json/JsonIO.hh"
+
+namespace avro {
+
+namespace parsing {
+
+using std::make_shared;
+
+using std::map;
+using std::vector;
+using std::string;
+using std::reverse;
+using std::ostringstream;
+using std::istringstream;
+
+using avro::json::JsonParser;
+using avro::json::JsonGenerator;
+using avro::json::JsonNullFormatter;
+
+class JsonGrammarGenerator : public ValidatingGrammarGenerator {
+ ProductionPtr doGenerate(const NodePtr& n,
+ std::map<NodePtr, ProductionPtr> &m);
+};
+
+static std::string nameOf(const NodePtr& n)
+{
+ if (n->hasName()) {
+ return n->name();
+ }
+ std::ostringstream oss;
+ oss << n->type();
+ return oss.str();
+}
+
+ProductionPtr JsonGrammarGenerator::doGenerate(const NodePtr& n,
+ std::map<NodePtr, ProductionPtr> &m) {
+ switch (n->type()) {
+ case AVRO_NULL:
+ case AVRO_BOOL:
+ case AVRO_INT:
+ case AVRO_LONG:
+ case AVRO_FLOAT:
+ case AVRO_DOUBLE:
+ case AVRO_STRING:
+ case AVRO_BYTES:
+ case AVRO_FIXED:
+ case AVRO_ARRAY:
+ case AVRO_MAP:
+ case AVRO_SYMBOLIC:
+ return ValidatingGrammarGenerator::doGenerate(n, m);
+ case AVRO_RECORD:
+ {
+ ProductionPtr result = make_shared<Production>();
+
+ m.erase(n);
+
+ size_t c = n->leaves();
+ result->reserve(2 + 2 * c);
+ result->push_back(Symbol::recordStartSymbol());
+ for (size_t i = 0; i < c; ++i) {
+ const NodePtr& leaf = n->leafAt(i);
+ ProductionPtr v = doGenerate(leaf, m);
+ result->push_back(Symbol::fieldSymbol(n->nameAt(i)));
+ copy(v->rbegin(), v->rend(), back_inserter(*result));
+ }
+ result->push_back(Symbol::recordEndSymbol());
+ reverse(result->begin(), result->end());
+
+ m[n] = result;
+ return make_shared<Production>(1, Symbol::indirect(result));
+ }
+ case AVRO_ENUM:
+ {
+ vector<string> nn;
+ size_t c = n->names();
+ nn.reserve(c);
+ for (size_t i = 0; i < c; ++i) {
+ nn.push_back(n->nameAt(i));
+ }
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::nameListSymbol(nn));
+ result->push_back(Symbol::enumSymbol());
+ m[n] = result;
+ return result;
+ }
+ case AVRO_UNION:
+ {
+ size_t c = n->leaves();
+
+ vector<ProductionPtr> vv;
+ vv.reserve(c);
+
+ vector<string> names;
+ names.reserve(c);
+
+ for (size_t i = 0; i < c; ++i) {
+ const NodePtr& nn = n->leafAt(i);
+ ProductionPtr v = doGenerate(nn, m);
+ if (nn->type() != AVRO_NULL) {
+ ProductionPtr v2 = make_shared<Production>();
+ v2->push_back(Symbol::recordEndSymbol());
+ copy(v->begin(), v->end(), back_inserter(*v2));
+ v.swap(v2);
+ }
+ vv.push_back(v);
+ names.push_back(nameOf(nn));
+ }
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::alternative(vv));
+ result->push_back(Symbol::nameListSymbol(names));
+ result->push_back(Symbol::unionSymbol());
+ return result;
+ }
+ default:
+ throw Exception("Unknown node type");
+ }
+}
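+// Note: within each case the production is assembled in source order and then
+// reversed (or copied via rbegin()/rend()) so that the table-driven parser
+// can pop expected symbols off a stack in source order; Symbol::indirect
+// keeps recursive record schemas from expanding forever.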
+
+static void expectToken(JsonParser& in, JsonParser::Token tk)
+{
+ in.expectToken(tk);
+}
+
+class JsonDecoderHandler {
+ JsonParser& in_;
+public:
+ JsonDecoderHandler(JsonParser& p) : in_(p) { }
+ size_t handle(const Symbol& s) {
+ switch (s.kind()) {
+ case Symbol::sRecordStart:
+ expectToken(in_, JsonParser::tkObjectStart);
+ break;
+ case Symbol::sRecordEnd:
+ expectToken(in_, JsonParser::tkObjectEnd);
+ break;
+ case Symbol::sField:
+ expectToken(in_, JsonParser::tkString);
+ if (s.extra<string>() != in_.stringValue()) {
+ throw Exception("Incorrect field");
+ }
+ break;
+ default:
+ break;
+ }
+ return 0;
+ }
+};
+
+template <typename P>
+class JsonDecoder : public Decoder {
+ JsonParser in_;
+ JsonDecoderHandler handler_;
+ P parser_;
+
+ void init(InputStream& is);
+ void decodeNull();
+ bool decodeBool();
+ int32_t decodeInt();
+ int64_t decodeLong();
+ float decodeFloat();
+ double decodeDouble();
+ void decodeString(string& value);
+ void skipString();
+ void decodeBytes(vector<uint8_t>& value);
+ void skipBytes();
+ void decodeFixed(size_t n, vector<uint8_t>& value);
+ void skipFixed(size_t n);
+ size_t decodeEnum();
+ size_t arrayStart();
+ size_t arrayNext();
+ size_t skipArray();
+ size_t mapStart();
+ size_t mapNext();
+ size_t skipMap();
+ size_t decodeUnionIndex();
+
+ void expect(JsonParser::Token tk);
+ void skipComposite();
+ void drain();
+public:
+
+ JsonDecoder(const ValidSchema& s) :
+ handler_(in_),
+ parser_(JsonGrammarGenerator().generate(s), NULL, handler_) { }
+
+};
+
+template <typename P>
+void JsonDecoder<P>::init(InputStream& is)
+{
+ in_.init(is);
+ parser_.reset();
+}
+
+template <typename P>
+void JsonDecoder<P>::expect(JsonParser::Token tk)
+{
+ expectToken(in_, tk);
+}
+
+template <typename P>
+void JsonDecoder<P>::decodeNull()
+{
+ parser_.advance(Symbol::sNull);
+ expect(JsonParser::tkNull);
+}
+
+template <typename P>
+bool JsonDecoder<P>::decodeBool()
+{
+ parser_.advance(Symbol::sBool);
+ expect(JsonParser::tkBool);
+ bool result = in_.boolValue();
+ return result;
+}
+
+template <typename P>
+int32_t JsonDecoder<P>::decodeInt()
+{
+ parser_.advance(Symbol::sInt);
+ expect(JsonParser::tkLong);
+ int64_t result = in_.longValue();
+ if (result < INT32_MIN || result > INT32_MAX) {
+ throw Exception(boost::format("Value out of range for Avro int: %1%")
+ % result);
+ }
+ return static_cast<int32_t>(result);
+}
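+// Example: the JSON text 3000000000 tokenizes as tkLong, but decodeInt()
+// rejects it here with "Value out of range for Avro int" because it exceeds
+// INT32_MAX.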
+
+template <typename P>
+int64_t JsonDecoder<P>::decodeLong()
+{
+ parser_.advance(Symbol::sLong);
+ expect(JsonParser::tkLong);
+ int64_t result = in_.longValue();
+ return result;
+}
+
+template <typename P>
+float JsonDecoder<P>::decodeFloat()
+{
+ parser_.advance(Symbol::sFloat);
+ expect(JsonParser::tkDouble);
+ double result = in_.doubleValue();
+ return static_cast<float>(result);
+}
+
+template <typename P>
+double JsonDecoder<P>::decodeDouble()
+{
+ parser_.advance(Symbol::sDouble);
+ expect(JsonParser::tkDouble);
+ double result = in_.doubleValue();
+ return result;
+}
+
+template <typename P>
+void JsonDecoder<P>::decodeString(string& value)
+{
+ parser_.advance(Symbol::sString);
+ expect(JsonParser::tkString);
+ value = in_.stringValue();
+}
+
+template <typename P>
+void JsonDecoder<P>::skipString()
+{
+ parser_.advance(Symbol::sString);
+ expect(JsonParser::tkString);
+}
+
+static vector<uint8_t> toBytes(const string& s)
+{
+ return vector<uint8_t>(s.begin(), s.end());
+}
+
+template <typename P>
+void JsonDecoder<P>::decodeBytes(vector<uint8_t>& value )
+{
+ parser_.advance(Symbol::sBytes);
+ expect(JsonParser::tkString);
+ value = toBytes(in_.bytesValue());
+}
+
+template <typename P>
+void JsonDecoder<P>::skipBytes()
+{
+ parser_.advance(Symbol::sBytes);
+ expect(JsonParser::tkString);
+}
+
+template <typename P>
+void JsonDecoder<P>::decodeFixed(size_t n, vector<uint8_t>& value)
+{
+ parser_.advance(Symbol::sFixed);
+ parser_.assertSize(n);
+ expect(JsonParser::tkString);
+ value = toBytes(in_.bytesValue());
+ if (value.size() != n) {
+ throw Exception("Incorrect value for fixed");
+ }
+}
+
+template <typename P>
+void JsonDecoder<P>::skipFixed(size_t n)
+{
+ parser_.advance(Symbol::sFixed);
+ parser_.assertSize(n);
+ expect(JsonParser::tkString);
+ vector<uint8_t> result = toBytes(in_.bytesValue());
+ if (result.size() != n) {
+ throw Exception("Incorrect value for fixed");
+ }
+}
+
+template <typename P>
+size_t JsonDecoder<P>::decodeEnum()
+{
+ parser_.advance(Symbol::sEnum);
+ expect(JsonParser::tkString);
+ size_t result = parser_.indexForName(in_.stringValue());
+ return result;
+}
+
+template <typename P>
+size_t JsonDecoder<P>::arrayStart()
+{
+ parser_.advance(Symbol::sArrayStart);
+ parser_.pushRepeatCount(0);
+ expect(JsonParser::tkArrayStart);
+ return arrayNext();
+}
+
+template <typename P>
+size_t JsonDecoder<P>::arrayNext()
+{
+ parser_.processImplicitActions();
+ if (in_.peek() == JsonParser::tkArrayEnd) {
+ in_.advance();
+ parser_.popRepeater();
+ parser_.advance(Symbol::sArrayEnd);
+ return 0;
+ }
+ parser_.nextRepeatCount(1);
+ return 1;
+}
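+// Note: unlike Avro binary, JSON carries no element counts, so arrayStart()
+// and arrayNext() report items one at a time: 1 while elements remain and 0
+// once ']' has been consumed.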
+
+template<typename P>
+void JsonDecoder<P>::skipComposite()
+{
+ size_t level = 0;
+ for (; ;) {
+ switch (in_.advance()) {
+ case JsonParser::tkArrayStart:
+ case JsonParser::tkObjectStart:
+ ++level;
+ continue;
+ case JsonParser::tkArrayEnd:
+ case JsonParser::tkObjectEnd:
+ if (level == 0) {
+ return;
+ }
+ --level;
+ continue;
+ default:
+ continue;
+ }
+ }
+}
+
+template<typename P>
+void JsonDecoder<P>::drain()
+{
+ parser_.processImplicitActions();
+ in_.drain();
+}
+
+template <typename P>
+size_t JsonDecoder<P>::skipArray()
+{
+ parser_.advance(Symbol::sArrayStart);
+ parser_.pop();
+ parser_.advance(Symbol::sArrayEnd);
+ expect(JsonParser::tkArrayStart);
+ skipComposite();
+ return 0;
+}
+
+template <typename P>
+size_t JsonDecoder<P>::mapStart()
+{
+ parser_.advance(Symbol::sMapStart);
+ parser_.pushRepeatCount(0);
+ expect(JsonParser::tkObjectStart);
+ return mapNext();
+}
+
+template <typename P>
+size_t JsonDecoder<P>::mapNext()
+{
+ parser_.processImplicitActions();
+ if (in_.peek() == JsonParser::tkObjectEnd) {
+ in_.advance();
+ parser_.popRepeater();
+ parser_.advance(Symbol::sMapEnd);
+ return 0;
+ }
+ parser_.nextRepeatCount(1);
+ return 1;
+}
+
+template <typename P>
+size_t JsonDecoder<P>::skipMap()
+{
+ parser_.advance(Symbol::sMapStart);
+ parser_.pop();
+ parser_.advance(Symbol::sMapEnd);
+ expect(JsonParser::tkObjectStart);
+ skipComposite();
+ return 0;
+}
+
+template <typename P>
+size_t JsonDecoder<P>::decodeUnionIndex()
+{
+ parser_.advance(Symbol::sUnion);
+
+ size_t result;
+ if (in_.peek() == JsonParser::tkNull) {
+ result = parser_.indexForName("null");
+ } else {
+ expect(JsonParser::tkObjectStart);
+ expect(JsonParser::tkString);
+ result = parser_.indexForName(in_.stringValue());
+ }
+ parser_.selectBranch(result);
+ return result;
+}
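+// Example: Avro's JSON encoding wraps a non-null union branch in a
+// single-entry object, so a ["null","int"] union holding 5 reads as
+// {"int": 5}, while the null branch is the bare token null; that is the
+// distinction the peek() == tkNull test above makes.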
+
+template<typename F = JsonNullFormatter>
+class JsonHandler {
+ JsonGenerator<F>& generator_;
+public:
+ JsonHandler(JsonGenerator<F>& g) : generator_(g) { }
+ size_t handle(const Symbol& s) {
+ switch (s.kind()) {
+ case Symbol::sRecordStart:
+ generator_.objectStart();
+ break;
+ case Symbol::sRecordEnd:
+ generator_.objectEnd();
+ break;
+ case Symbol::sField:
+ generator_.encodeString(s.extra<string>());
+ break;
+ default:
+ break;
+ }
+ return 0;
+ }
+};
+
+template <typename P, typename F = JsonNullFormatter>
+class JsonEncoder : public Encoder {
+ JsonGenerator<F> out_;
+ JsonHandler<F> handler_;
+ P parser_;
+
+ void init(OutputStream& os);
+ void flush();
+ int64_t byteCount() const;
+ void encodeNull();
+ void encodeBool(bool b);
+ void encodeInt(int32_t i);
+ void encodeLong(int64_t l);
+ void encodeFloat(float f);
+ void encodeDouble(double d);
+ void encodeString(const std::string& s);
+ void encodeBytes(const uint8_t *bytes, size_t len);
+ void encodeFixed(const uint8_t *bytes, size_t len);
+ void encodeEnum(size_t e);
+ void arrayStart();
+ void arrayEnd();
+ void mapStart();
+ void mapEnd();
+ void setItemCount(size_t count);
+ void startItem();
+ void encodeUnionIndex(size_t e);
+public:
+ JsonEncoder(const ValidSchema& schema) :
+ handler_(out_),
+ parser_(JsonGrammarGenerator().generate(schema), NULL, handler_) { }
+};
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::init(OutputStream& os)
+{
+ out_.init(os);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::flush()
+{
+ parser_.processImplicitActions();
+ out_.flush();
+}
+
+template<typename P, typename F>
+int64_t JsonEncoder<P, F>::byteCount() const
+{
+ return out_.byteCount();
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeNull()
+{
+ parser_.advance(Symbol::sNull);
+ out_.encodeNull();
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeBool(bool b)
+{
+ parser_.advance(Symbol::sBool);
+ out_.encodeBool(b);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeInt(int32_t i)
+{
+ parser_.advance(Symbol::sInt);
+ out_.encodeNumber(i);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeLong(int64_t l)
+{
+ parser_.advance(Symbol::sLong);
+ out_.encodeNumber(l);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeFloat(float f)
+{
+ parser_.advance(Symbol::sFloat);
+ if (f == std::numeric_limits<float>::infinity()) {
+ out_.encodeString("Infinity");
+ } else if (f == -std::numeric_limits<float>::infinity()) {
+ out_.encodeString("-Infinity");
+ } else if (boost::math::isnan(f)) {
+ out_.encodeString("NaN");
+ } else {
+ out_.encodeNumber(f);
+ }
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeDouble(double d)
+{
+ parser_.advance(Symbol::sDouble);
+ if (d == std::numeric_limits<double>::infinity()) {
+ out_.encodeString("Infinity");
+ } else if (d == -std::numeric_limits<double>::infinity()) {
+ out_.encodeString("-Infinity");
+ } else if (boost::math::isnan(d)) {
+ out_.encodeString("NaN");
+ } else {
+ out_.encodeNumber(d);
+ }
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeString(const std::string& s)
+{
+ parser_.advance(Symbol::sString);
+ out_.encodeString(s);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeBytes(const uint8_t *bytes, size_t len)
+{
+ parser_.advance(Symbol::sBytes);
+ out_.encodeBinary(bytes, len);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeFixed(const uint8_t *bytes, size_t len)
+{
+ parser_.advance(Symbol::sFixed);
+ parser_.assertSize(len);
+ out_.encodeBinary(bytes, len);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeEnum(size_t e)
+{
+ parser_.advance(Symbol::sEnum);
+ const string& s = parser_.nameForIndex(e);
+ out_.encodeString(s);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::arrayStart()
+{
+ parser_.advance(Symbol::sArrayStart);
+ parser_.pushRepeatCount(0);
+ out_.arrayStart();
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::arrayEnd()
+{
+ parser_.popRepeater();
+ parser_.advance(Symbol::sArrayEnd);
+ out_.arrayEnd();
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::mapStart()
+{
+ parser_.advance(Symbol::sMapStart);
+ parser_.pushRepeatCount(0);
+ out_.objectStart();
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::mapEnd()
+{
+ parser_.popRepeater();
+ parser_.advance(Symbol::sMapEnd);
+ out_.objectEnd();
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::setItemCount(size_t count)
+{
+ parser_.nextRepeatCount(count);
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::startItem()
+{
+ parser_.processImplicitActions();
+ if (parser_.top() != Symbol::sRepeater) {
+        throw Exception("startItem called when not at an item boundary");
+ }
+}
+
+template<typename P, typename F>
+void JsonEncoder<P, F>::encodeUnionIndex(size_t e)
+{
+ parser_.advance(Symbol::sUnion);
+
+ const std::string name = parser_.nameForIndex(e);
+
+ if (name != "null") {
+ out_.objectStart();
+ out_.encodeString(name);
+ }
+ parser_.selectBranch(e);
+}
+
+} // namespace parsing
+
+DecoderPtr jsonDecoder(const ValidSchema& s)
+{
+ return std::make_shared<parsing::JsonDecoder<
+ parsing::SimpleParser<parsing::JsonDecoderHandler> > >(s);
+}
+
+EncoderPtr jsonEncoder(const ValidSchema& schema)
+{
+ return std::make_shared<parsing::JsonEncoder<
+ parsing::SimpleParser<parsing::JsonHandler<avro::json::JsonNullFormatter> >, avro::json::JsonNullFormatter> >(schema);
+}
+
+EncoderPtr jsonPrettyEncoder(const ValidSchema& schema)
+{
+ return std::make_shared<parsing::JsonEncoder<
+ parsing::SimpleParser<parsing::JsonHandler<avro::json::JsonPrettyFormatter> >, avro::json::JsonPrettyFormatter> >(schema);
+}
+
+} // namespace avro
+
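+// Illustrative round trip (a sketch; assumes a schema compiled elsewhere,
+// e.g. with compileJsonSchemaFromString from Compiler.hh, plus the stream
+// helpers from Stream.hh):
+//
+//   avro::ValidSchema schema = ...;         // say, {"type": "int"}
+//   avro::EncoderPtr e = avro::jsonEncoder(schema);
+//   std::unique_ptr<avro::OutputStream> os = avro::memoryOutputStream();
+//   e->init(*os);
+//   e->encodeInt(42);
+//   e->flush();                             // *os now holds the text 42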
diff --git a/contrib/libs/apache/avro/impl/parsing/ResolvingDecoder.cc b/contrib/libs/apache/avro/impl/parsing/ResolvingDecoder.cc
index f6dbacabcf1..2e33eaa8d01 100644
--- a/contrib/libs/apache/avro/impl/parsing/ResolvingDecoder.cc
+++ b/contrib/libs/apache/avro/impl/parsing/ResolvingDecoder.cc
@@ -1,740 +1,740 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define __STDC_LIMIT_MACROS
-
-#include <string>
-#include <stack>
-#include <map>
-#include <algorithm>
-#include <memory>
-#include <ctype.h>
-
-#include "ValidatingCodec.hh"
-#include "Symbol.hh"
-#include "Types.hh"
-#include "ValidSchema.hh"
-#include "Decoder.hh"
-#include "Encoder.hh"
-#include "NodeImpl.hh"
-#include "Generic.hh"
-#include "Stream.hh"
-
-namespace avro {
-
-using std::make_shared;
-
-namespace parsing {
-
-using std::shared_ptr;
-using std::static_pointer_cast;
-using std::make_shared;
-
-using std::unique_ptr;
-using std::map;
-using std::pair;
-using std::vector;
-using std::string;
-using std::reverse;
-using std::ostringstream;
-using std::istringstream;
-using std::stack;
-using std::find_if;
-using std::make_pair;
-
-typedef pair<NodePtr, NodePtr> NodePair;
-
-class ResolvingGrammarGenerator : public ValidatingGrammarGenerator {
- ProductionPtr doGenerate2(const NodePtr& writer,
- const NodePtr& reader, map<NodePair, ProductionPtr> &m,
- map<NodePtr, ProductionPtr> &m2);
- ProductionPtr resolveRecords(const NodePtr& writer,
- const NodePtr& reader, map<NodePair, ProductionPtr> &m,
- map<NodePtr, ProductionPtr> &m2);
- ProductionPtr resolveUnion(const NodePtr& writer,
- const NodePtr& reader, map<NodePair, ProductionPtr> &m,
- map<NodePtr, ProductionPtr> &m2);
-
- static vector<pair<string, size_t> > fields(const NodePtr& n) {
- vector<pair<string, size_t> > result;
- size_t c = n->names();
- for (size_t i = 0; i < c; ++i) {
- result.push_back(make_pair(n->nameAt(i), i));
- }
- return result;
- }
-
- static int bestBranch(const NodePtr& writer, const NodePtr& reader);
-
- ProductionPtr getWriterProduction(const NodePtr& n,
- map<NodePtr, ProductionPtr>& m2);
-
-public:
- Symbol generate(
- const ValidSchema& writer, const ValidSchema& reader);
-};
-
-Symbol ResolvingGrammarGenerator::generate(
- const ValidSchema& writer, const ValidSchema& reader) {
- map<NodePtr, ProductionPtr> m2;
-
- const NodePtr& rr = reader.root();
- const NodePtr& rw = writer.root();
- ProductionPtr backup = ValidatingGrammarGenerator::doGenerate(rw, m2);
- fixup(backup, m2);
-
- map<NodePair, ProductionPtr> m;
- ProductionPtr main = doGenerate2(rw, rr, m, m2);
- fixup(main, m);
- return Symbol::rootSymbol(main, backup);
-}
-
-int ResolvingGrammarGenerator::bestBranch(const NodePtr& writer,
- const NodePtr& reader)
-{
- Type t = writer->type();
-
- const size_t c = reader->leaves();
- for (size_t j = 0; j < c; ++j) {
- NodePtr r = reader->leafAt(j);
- if (r->type() == AVRO_SYMBOLIC) {
- r = resolveSymbol(r);
- }
- if (t == r->type()) {
- if (r->hasName()) {
- if (r->name() == writer->name()) {
- return j;
- }
- } else {
- return j;
- }
- }
- }
-
- for (size_t j = 0; j < c; ++j) {
- const NodePtr& r = reader->leafAt(j);
- Type rt = r->type();
- switch (t) {
- case AVRO_INT:
- if (rt == AVRO_LONG || rt == AVRO_DOUBLE || rt == AVRO_FLOAT) {
- return j;
- }
- break;
- case AVRO_LONG:
- case AVRO_FLOAT:
- if (rt == AVRO_DOUBLE) {
- return j;
- }
- break;
- default:
- break;
- }
- }
- return -1;
-}
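-// Note: branch selection first looks for an exact type match (and, for named
-// types, a name match); only then are the numeric promotions tried: a writer
-// int matches a reader long, float or double, and a writer long or float
-// matches a reader double.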
-
-static shared_ptr<vector<uint8_t> > getAvroBinary(
- const GenericDatum& defaultValue)
-{
- EncoderPtr e = binaryEncoder();
- unique_ptr<OutputStream> os = memoryOutputStream();
- e->init(*os);
- GenericWriter::write(*e, defaultValue);
- e->flush();
- return snapshot(*os);
-}
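-// Note: reader-side default values are serialized to Avro binary once, up
-// front; when the writer lacks the field, ResolvingDecoderHandler (below)
-// replays those bytes through a binaryDecoder(), so defaults reuse the
-// normal decode path.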
-
-template<typename T1, typename T2>
-struct equalsFirst
-{
- const T1& v_;
- equalsFirst(const T1& v) : v_(v) { }
- bool operator()(const pair<T1, T2>& p) {
- return p.first == v_;
- }
-};
-
-ProductionPtr ResolvingGrammarGenerator::getWriterProduction(
- const NodePtr& n, map<NodePtr, ProductionPtr>& m2)
-{
- const NodePtr& nn = (n->type() == AVRO_SYMBOLIC) ?
- static_cast<const NodeSymbolic& >(*n).getNode() : n;
- map<NodePtr, ProductionPtr>::const_iterator it2 = m2.find(nn);
- if (it2 != m2.end()) {
- return it2->second;
- } else {
- ProductionPtr result = ValidatingGrammarGenerator::doGenerate(nn, m2);
- fixup(result, m2);
- return result;
- }
-}
-
-ProductionPtr ResolvingGrammarGenerator::resolveRecords(
- const NodePtr& writer, const NodePtr& reader,
- map<NodePair, ProductionPtr>& m,
- map<NodePtr, ProductionPtr>& m2)
-{
- ProductionPtr result = make_shared<Production>();
-
- vector<pair<string, size_t> > wf = fields(writer);
- vector<pair<string, size_t> > rf = fields(reader);
- vector<size_t> fieldOrder;
- fieldOrder.reserve(reader->names());
-
- /*
-     * We look for each writer field in the reader. If found, we recursively
-     * resolve the corresponding fields and then erase the reader field.
-     * If no matching reader field is found, we arrange to skip the writer
-     * field.
- */
- for (vector<pair<string, size_t> >::const_iterator it = wf.begin();
- it != wf.end(); ++it) {
- vector<pair<string, size_t> >::iterator it2 =
- find_if(rf.begin(), rf.end(),
- equalsFirst<string, size_t>(it->first));
- if (it2 != rf.end()) {
- ProductionPtr p = doGenerate2(writer->leafAt(it->second),
- reader->leafAt(it2->second), m, m2);
- copy(p->rbegin(), p->rend(), back_inserter(*result));
- fieldOrder.push_back(it2->second);
- rf.erase(it2);
- } else {
- ProductionPtr p = getWriterProduction(
- writer->leafAt(it->second), m2);
- result->push_back(Symbol::skipStart());
- if (p->size() == 1) {
- result->push_back((*p)[0]);
- } else {
- result->push_back(Symbol::indirect(p));
- }
- }
- }
-
- /*
-     * Examine the reader fields left over (i.e. those that didn't have a
-     * corresponding writer field).
- */
- for (vector<pair<string, size_t> >::const_iterator it = rf.begin();
- it != rf.end(); ++it) {
-
- NodePtr s = reader->leafAt(it->second);
- fieldOrder.push_back(it->second);
-
- if (s->type() == AVRO_SYMBOLIC) {
- s = resolveSymbol(s);
- }
- shared_ptr<vector<uint8_t> > defaultBinary =
- getAvroBinary(reader->defaultValueAt(it->second));
- result->push_back(Symbol::defaultStartAction(defaultBinary));
- map<NodePair, shared_ptr<Production> >::const_iterator it2 =
- m.find(NodePair(s, s));
- ProductionPtr p = (it2 == m.end()) ?
- doGenerate2(s, s, m, m2) : it2->second;
- copy(p->rbegin(), p->rend(), back_inserter(*result));
- result->push_back(Symbol::defaultEndAction());
- }
- reverse(result->begin(), result->end());
- result->push_back(Symbol::sizeListAction(fieldOrder));
- result->push_back(Symbol::recordAction());
-
- return result;
-
-}
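-// Note: fieldOrder lists, in decode order, the index of each decoded field
-// within the reader schema; it is surfaced through
-// ResolvingDecoder::fieldOrder() so callers can fill the reader's datum while
-// consuming fields in the writer's order.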
-
-ProductionPtr ResolvingGrammarGenerator::resolveUnion(
- const NodePtr& writer, const NodePtr& reader,
- map<NodePair, ProductionPtr>& m,
- map<NodePtr, ProductionPtr>& m2)
-{
- vector<ProductionPtr> v;
- size_t c = writer->leaves();
- v.reserve(c);
- for (size_t i = 0; i < c; ++i) {
- ProductionPtr p = doGenerate2(writer->leafAt(i), reader, m, m2);
- v.push_back(p);
- }
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::alternative(v));
- result->push_back(Symbol::writerUnionAction());
- return result;
-}
-
-ProductionPtr ResolvingGrammarGenerator::doGenerate2(
- const NodePtr& w, const NodePtr& r,
- map<NodePair, ProductionPtr> &m,
- map<NodePtr, ProductionPtr> &m2)
-{
- const NodePtr writer = w->type() == AVRO_SYMBOLIC ? resolveSymbol(w) : w;
- const NodePtr reader = r->type() == AVRO_SYMBOLIC ? resolveSymbol(r) : r;
- Type writerType = writer->type();
- Type readerType = reader->type();
-
- if (writerType == readerType) {
- switch (writerType) {
- case AVRO_NULL:
- return make_shared<Production>(1, Symbol::nullSymbol());
- case AVRO_BOOL:
- return make_shared<Production>(1, Symbol::boolSymbol());
- case AVRO_INT:
- return make_shared<Production>(1, Symbol::intSymbol());
- case AVRO_LONG:
- return make_shared<Production>(1, Symbol::longSymbol());
- case AVRO_FLOAT:
- return make_shared<Production>(1, Symbol::floatSymbol());
- case AVRO_DOUBLE:
- return make_shared<Production>(1, Symbol::doubleSymbol());
- case AVRO_STRING:
- return make_shared<Production>(1, Symbol::stringSymbol());
- case AVRO_BYTES:
- return make_shared<Production>(1, Symbol::bytesSymbol());
- case AVRO_FIXED:
- if (writer->name() == reader->name() &&
- writer->fixedSize() == reader->fixedSize()) {
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::sizeCheckSymbol(reader->fixedSize()));
- result->push_back(Symbol::fixedSymbol());
- m[make_pair(writer, reader)] = result;
- return result;
- }
- break;
- case AVRO_RECORD:
- if (writer->name() == reader->name()) {
- const pair<NodePtr, NodePtr> key(writer, reader);
- map<NodePair, ProductionPtr>::const_iterator kp = m.find(key);
- if (kp != m.end()) {
- return (kp->second) ? kp->second :
- make_shared<Production>(1, Symbol::placeholder(key));
- }
- m[key] = ProductionPtr();
- ProductionPtr result = resolveRecords(writer, reader, m, m2);
- m[key] = result;
- return make_shared<Production>(1, Symbol::indirect(result));
- }
- break;
-
- case AVRO_ENUM:
- if (writer->name() == reader->name()) {
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::enumAdjustSymbol(writer, reader));
- result->push_back(Symbol::enumSymbol());
- m[make_pair(writer, reader)] = result;
- return result;
- }
- break;
-
- case AVRO_ARRAY:
- {
- ProductionPtr p = getWriterProduction(writer->leafAt(0), m2);
- ProductionPtr p2 = doGenerate2(writer->leafAt(0), reader->leafAt(0), m, m2);
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::arrayEndSymbol());
- result->push_back(Symbol::repeater(p2, p, true));
- result->push_back(Symbol::arrayStartSymbol());
- return result;
- }
- case AVRO_MAP:
- {
- ProductionPtr pp =
- doGenerate2(writer->leafAt(1),reader->leafAt(1), m, m2);
- ProductionPtr v(new Production(*pp));
- v->push_back(Symbol::stringSymbol());
-
- ProductionPtr pp2 = getWriterProduction(writer->leafAt(1), m2);
- ProductionPtr v2(new Production(*pp2));
-
- v2->push_back(Symbol::stringSymbol());
-
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::mapEndSymbol());
- result->push_back(Symbol::repeater(v, v2, false));
- result->push_back(Symbol::mapStartSymbol());
- return result;
- }
- case AVRO_UNION:
- return resolveUnion(writer, reader, m, m2);
- case AVRO_SYMBOLIC:
- {
- shared_ptr<NodeSymbolic> w =
- static_pointer_cast<NodeSymbolic>(writer);
- shared_ptr<NodeSymbolic> r =
- static_pointer_cast<NodeSymbolic>(reader);
- NodePair p(w->getNode(), r->getNode());
- map<NodePair, ProductionPtr>::iterator it = m.find(p);
- if (it != m.end() && it->second) {
- return it->second;
- } else {
- m[p] = ProductionPtr();
- return make_shared<Production>(1, Symbol::placeholder(p));
- }
- }
- default:
- throw Exception("Unknown node type");
- }
- } else if (writerType == AVRO_UNION) {
- return resolveUnion(writer, reader, m, m2);
- } else {
- switch (readerType) {
- case AVRO_LONG:
- if (writerType == AVRO_INT) {
- return make_shared<Production>(1,
- Symbol::resolveSymbol(Symbol::sInt, Symbol::sLong));
- }
- break;
- case AVRO_FLOAT:
- if (writerType == AVRO_INT || writerType == AVRO_LONG) {
- return make_shared<Production>(1,
- Symbol::resolveSymbol(writerType == AVRO_INT ?
- Symbol::sInt : Symbol::sLong, Symbol::sFloat));
- }
- break;
- case AVRO_DOUBLE:
- if (writerType == AVRO_INT || writerType == AVRO_LONG
- || writerType == AVRO_FLOAT) {
- return make_shared<Production>(1,
- Symbol::resolveSymbol(writerType == AVRO_INT ?
- Symbol::sInt : writerType == AVRO_LONG ?
- Symbol::sLong : Symbol::sFloat, Symbol::sDouble));
- }
- break;
-
- case AVRO_UNION:
- {
- int j = bestBranch(writer, reader);
- if (j >= 0) {
- ProductionPtr p = doGenerate2(writer, reader->leafAt(j), m, m2);
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::unionAdjustSymbol(j, p));
- result->push_back(Symbol::unionSymbol());
- return result;
- }
- }
- break;
- case AVRO_NULL:
- case AVRO_BOOL:
- case AVRO_INT:
- case AVRO_STRING:
- case AVRO_BYTES:
- case AVRO_ENUM:
- case AVRO_ARRAY:
- case AVRO_MAP:
- case AVRO_RECORD:
- break;
- default:
- throw Exception("Unknown node type");
- }
- }
- return make_shared<Production>(1, Symbol::error(writer, reader));
-}
-
-class ResolvingDecoderHandler {
- shared_ptr<vector<uint8_t> > defaultData_;
- unique_ptr<InputStream> inp_;
- DecoderPtr backup_;
- DecoderPtr& base_;
- const DecoderPtr binDecoder;
- public:
- ResolvingDecoderHandler(DecoderPtr& base) : base_(base),
- binDecoder(binaryDecoder()) { }
- size_t handle(const Symbol& s) {
- switch (s.kind()) {
- case Symbol::sWriterUnion:
- return base_->decodeUnionIndex();
- case Symbol::sDefaultStart:
- defaultData_ = s.extra<shared_ptr<vector<uint8_t> > >();
- backup_ = base_;
- inp_ = memoryInputStream(&(*defaultData_)[0], defaultData_->size());
- base_ = binDecoder;
- base_->init(*inp_);
- return 0;
- case Symbol::sDefaultEnd:
- base_= backup_;
- backup_.reset();
- return 0;
- default:
- return 0;
- }
- }
-
- void reset()
- {
- if (backup_ != NULL)
- {
- base_= backup_;
- backup_.reset();
- }
- }
-};
-
-template <typename Parser>
-class ResolvingDecoderImpl : public ResolvingDecoder
-{
- DecoderPtr base_;
- ResolvingDecoderHandler handler_;
- Parser parser_;
-
- void init(InputStream& is);
- void decodeNull();
- bool decodeBool();
- int32_t decodeInt();
- int64_t decodeLong();
- float decodeFloat();
- double decodeDouble();
- void decodeString(string& value);
- void skipString();
- void decodeBytes(vector<uint8_t>& value);
- void skipBytes();
- void decodeFixed(size_t n, vector<uint8_t>& value);
- void skipFixed(size_t n);
- size_t decodeEnum();
- size_t arrayStart();
- size_t arrayNext();
- size_t skipArray();
- size_t mapStart();
- size_t mapNext();
- size_t skipMap();
- size_t decodeUnionIndex();
- const vector<size_t>& fieldOrder();
- void drain() {
- parser_.processImplicitActions();
- base_->drain();
- }
-public:
- ResolvingDecoderImpl(const ValidSchema& writer, const ValidSchema& reader,
- const DecoderPtr& base) :
- base_(base),
- handler_(base_),
- parser_(ResolvingGrammarGenerator().generate(writer, reader),
- &(*base_), handler_)
- {
- }
-};
-
-template <typename P>
-void ResolvingDecoderImpl<P>::init(InputStream& is)
-{
- handler_.reset();
- base_->init(is);
- parser_.reset();
-}
-
-template <typename P>
-void ResolvingDecoderImpl<P>::decodeNull()
-{
- parser_.advance(Symbol::sNull);
- base_->decodeNull();
-}
-
-template <typename P>
-bool ResolvingDecoderImpl<P>::decodeBool()
-{
- parser_.advance(Symbol::sBool);
- return base_->decodeBool();
-}
-
-template <typename P>
-int32_t ResolvingDecoderImpl<P>::decodeInt()
-{
- parser_.advance(Symbol::sInt);
- return base_->decodeInt();
-}
-
-template <typename P>
-int64_t ResolvingDecoderImpl<P>::decodeLong()
-{
- Symbol::Kind k = parser_.advance(Symbol::sLong);
- return k == Symbol::sInt ? base_->decodeInt() : base_->decodeLong();
-}
-
-template <typename P>
-float ResolvingDecoderImpl<P>::decodeFloat()
-{
- Symbol::Kind k = parser_.advance(Symbol::sFloat);
- return k == Symbol::sInt ? base_->decodeInt() :
- k == Symbol::sLong ? base_->decodeLong() :
- base_->decodeFloat();
-}
-
-template <typename P>
-double ResolvingDecoderImpl<P>::decodeDouble()
-{
- Symbol::Kind k = parser_.advance(Symbol::sDouble);
- return k == Symbol::sInt ? base_->decodeInt() :
- k == Symbol::sLong ? base_->decodeLong() :
- k == Symbol::sFloat ? base_->decodeFloat() :
- base_->decodeDouble();
-}
-
-template <typename P>
-void ResolvingDecoderImpl<P>::decodeString(string& value)
-{
- parser_.advance(Symbol::sString);
- base_->decodeString(value);
-}
-
-template <typename P>
-void ResolvingDecoderImpl<P>::skipString()
-{
- parser_.advance(Symbol::sString);
- base_->skipString();
-}
-
-template <typename P>
-void ResolvingDecoderImpl<P>::decodeBytes(vector<uint8_t>& value)
-{
- parser_.advance(Symbol::sBytes);
- base_->decodeBytes(value);
-}
-
-template <typename P>
-void ResolvingDecoderImpl<P>::skipBytes()
-{
- parser_.advance(Symbol::sBytes);
- base_->skipBytes();
-}
-
-template <typename P>
-void ResolvingDecoderImpl<P>::decodeFixed(size_t n, vector<uint8_t>& value)
-{
- parser_.advance(Symbol::sFixed);
- parser_.assertSize(n);
- return base_->decodeFixed(n, value);
-}
-
-template <typename P>
-void ResolvingDecoderImpl<P>::skipFixed(size_t n)
-{
- parser_.advance(Symbol::sFixed);
- parser_.assertSize(n);
- base_->skipFixed(n);
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::decodeEnum()
-{
- parser_.advance(Symbol::sEnum);
- size_t n = base_->decodeEnum();
- return parser_.enumAdjust(n);
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::arrayStart()
-{
- parser_.advance(Symbol::sArrayStart);
- size_t result = base_->arrayStart();
- parser_.pushRepeatCount(result);
- if (result == 0) {
- parser_.popRepeater();
- parser_.advance(Symbol::sArrayEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::arrayNext()
-{
- parser_.processImplicitActions();
- size_t result = base_->arrayNext();
- parser_.nextRepeatCount(result);
- if (result == 0) {
- parser_.popRepeater();
- parser_.advance(Symbol::sArrayEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::skipArray()
-{
- parser_.advance(Symbol::sArrayStart);
- size_t n = base_->skipArray();
- if (n == 0) {
- parser_.pop();
- } else {
- parser_.pushRepeatCount(n);
- parser_.skip(*base_);
- }
- parser_.advance(Symbol::sArrayEnd);
- return 0;
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::mapStart()
-{
- parser_.advance(Symbol::sMapStart);
- size_t result = base_->mapStart();
- parser_.pushRepeatCount(result);
- if (result == 0) {
- parser_.popRepeater();
- parser_.advance(Symbol::sMapEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::mapNext()
-{
- parser_.processImplicitActions();
- size_t result = base_->mapNext();
- parser_.nextRepeatCount(result);
- if (result == 0) {
- parser_.popRepeater();
- parser_.advance(Symbol::sMapEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::skipMap()
-{
- parser_.advance(Symbol::sMapStart);
- size_t n = base_->skipMap();
- if (n == 0) {
- parser_.pop();
- } else {
- parser_.pushRepeatCount(n);
- parser_.skip(*base_);
- }
- parser_.advance(Symbol::sMapEnd);
- return 0;
-}
-
-template <typename P>
-size_t ResolvingDecoderImpl<P>::decodeUnionIndex()
-{
- parser_.advance(Symbol::sUnion);
- return parser_.unionAdjust();
-}
-
-template <typename P>
-const vector<size_t>& ResolvingDecoderImpl<P>::fieldOrder()
-{
- parser_.advance(Symbol::sRecord);
- return parser_.sizeList();
-}
-
-} // namespace parsing
-
-ResolvingDecoderPtr resolvingDecoder(const ValidSchema& writer,
- const ValidSchema& reader, const DecoderPtr& base) {
- return make_shared<parsing::ResolvingDecoderImpl
- <parsing::SimpleParser<parsing::ResolvingDecoderHandler> > >(
- writer, reader, base);
-}
-
-} // namespace avro
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define __STDC_LIMIT_MACROS
+
+#include <string>
+#include <stack>
+#include <map>
+#include <algorithm>
+#include <memory>
+#include <ctype.h>
+
+#include "ValidatingCodec.hh"
+#include "Symbol.hh"
+#include "Types.hh"
+#include "ValidSchema.hh"
+#include "Decoder.hh"
+#include "Encoder.hh"
+#include "NodeImpl.hh"
+#include "Generic.hh"
+#include "Stream.hh"
+
+namespace avro {
+
+using std::make_shared;
+
+namespace parsing {
+
+using std::shared_ptr;
+using std::static_pointer_cast;
+using std::make_shared;
+
+using std::unique_ptr;
+using std::map;
+using std::pair;
+using std::vector;
+using std::string;
+using std::reverse;
+using std::ostringstream;
+using std::istringstream;
+using std::stack;
+using std::find_if;
+using std::make_pair;
+
+typedef pair<NodePtr, NodePtr> NodePair;
+
+class ResolvingGrammarGenerator : public ValidatingGrammarGenerator {
+ ProductionPtr doGenerate2(const NodePtr& writer,
+ const NodePtr& reader, map<NodePair, ProductionPtr> &m,
+ map<NodePtr, ProductionPtr> &m2);
+ ProductionPtr resolveRecords(const NodePtr& writer,
+ const NodePtr& reader, map<NodePair, ProductionPtr> &m,
+ map<NodePtr, ProductionPtr> &m2);
+ ProductionPtr resolveUnion(const NodePtr& writer,
+ const NodePtr& reader, map<NodePair, ProductionPtr> &m,
+ map<NodePtr, ProductionPtr> &m2);
+
+ static vector<pair<string, size_t> > fields(const NodePtr& n) {
+ vector<pair<string, size_t> > result;
+ size_t c = n->names();
+ for (size_t i = 0; i < c; ++i) {
+ result.push_back(make_pair(n->nameAt(i), i));
+ }
+ return result;
+ }
+
+ static int bestBranch(const NodePtr& writer, const NodePtr& reader);
+
+ ProductionPtr getWriterProduction(const NodePtr& n,
+ map<NodePtr, ProductionPtr>& m2);
+
+public:
+ Symbol generate(
+ const ValidSchema& writer, const ValidSchema& reader);
+};
+
+Symbol ResolvingGrammarGenerator::generate(
+ const ValidSchema& writer, const ValidSchema& reader) {
+ map<NodePtr, ProductionPtr> m2;
+
+ const NodePtr& rr = reader.root();
+ const NodePtr& rw = writer.root();
+ ProductionPtr backup = ValidatingGrammarGenerator::doGenerate(rw, m2);
+ fixup(backup, m2);
+
+ map<NodePair, ProductionPtr> m;
+ ProductionPtr main = doGenerate2(rw, rr, m, m2);
+ fixup(main, m);
+ return Symbol::rootSymbol(main, backup);
+}
+
+int ResolvingGrammarGenerator::bestBranch(const NodePtr& writer,
+ const NodePtr& reader)
+{
+ Type t = writer->type();
+
+ const size_t c = reader->leaves();
+ for (size_t j = 0; j < c; ++j) {
+ NodePtr r = reader->leafAt(j);
+ if (r->type() == AVRO_SYMBOLIC) {
+ r = resolveSymbol(r);
+ }
+ if (t == r->type()) {
+ if (r->hasName()) {
+ if (r->name() == writer->name()) {
+ return j;
+ }
+ } else {
+ return j;
+ }
+ }
+ }
+
+ for (size_t j = 0; j < c; ++j) {
+ const NodePtr& r = reader->leafAt(j);
+ Type rt = r->type();
+ switch (t) {
+ case AVRO_INT:
+ if (rt == AVRO_LONG || rt == AVRO_DOUBLE || rt == AVRO_FLOAT) {
+ return j;
+ }
+ break;
+ case AVRO_LONG:
+ case AVRO_FLOAT:
+ if (rt == AVRO_DOUBLE) {
+ return j;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ return -1;
+}
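For reference, bestBranch() implements Avro's reader-union branch selection in two passes: an exact type match (and, for named types, a name match) wins first, and only then do the numeric promotions int -> long/float/double and long/float -> double apply. A standalone sketch of the same selection, with invented names and plain enums rather than the library's NodePtr API:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    enum Type { T_INT, T_LONG, T_FLOAT, T_DOUBLE, T_STRING };

    // Mirrors the two-pass loop above: exact match first, then promotion.
    static int bestBranchSketch(Type writer, const std::vector<Type>& readerUnion) {
        for (std::size_t j = 0; j < readerUnion.size(); ++j)   // pass 1: exact
            if (readerUnion[j] == writer) return static_cast<int>(j);
        for (std::size_t j = 0; j < readerUnion.size(); ++j) { // pass 2: promote
            Type rt = readerUnion[j];
            if (writer == T_INT &&
                (rt == T_LONG || rt == T_FLOAT || rt == T_DOUBLE))
                return static_cast<int>(j);
            if ((writer == T_LONG || writer == T_FLOAT) && rt == T_DOUBLE)
                return static_cast<int>(j);
        }
        return -1;                                             // unresolvable
    }

    int main() {
        std::vector<Type> u = { T_STRING, T_DOUBLE };
        std::printf("%d\n", bestBranchSketch(T_INT, u));       // prints 1
    }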
+
+static shared_ptr<vector<uint8_t> > getAvroBinary(
+ const GenericDatum& defaultValue)
+{
+ EncoderPtr e = binaryEncoder();
+ unique_ptr<OutputStream> os = memoryOutputStream();
+ e->init(*os);
+ GenericWriter::write(*e, defaultValue);
+ e->flush();
+ return snapshot(*os);
+}
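A concrete illustration of the helper above, with an invented value: the default 42 for an int field zig-zag-encodes to 84, so the snapshot holds the single byte 0x54. Inside this translation unit one could write:

    GenericDatum defaultValue(static_cast<int32_t>(42)); // int default; value invented
    shared_ptr<vector<uint8_t> > bytes = getAvroBinary(defaultValue);
    // bytes->size() == 1 && (*bytes)[0] == 0x54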
+
+template<typename T1, typename T2>
+struct equalsFirst
+{
+ const T1& v_;
+ equalsFirst(const T1& v) : v_(v) { }
+ bool operator()(const pair<T1, T2>& p) {
+ return p.first == v_;
+ }
+};
+
+ProductionPtr ResolvingGrammarGenerator::getWriterProduction(
+ const NodePtr& n, map<NodePtr, ProductionPtr>& m2)
+{
+ const NodePtr& nn = (n->type() == AVRO_SYMBOLIC) ?
+ static_cast<const NodeSymbolic& >(*n).getNode() : n;
+ map<NodePtr, ProductionPtr>::const_iterator it2 = m2.find(nn);
+ if (it2 != m2.end()) {
+ return it2->second;
+ } else {
+ ProductionPtr result = ValidatingGrammarGenerator::doGenerate(nn, m2);
+ fixup(result, m2);
+ return result;
+ }
+}
+
+ProductionPtr ResolvingGrammarGenerator::resolveRecords(
+ const NodePtr& writer, const NodePtr& reader,
+ map<NodePair, ProductionPtr>& m,
+ map<NodePtr, ProductionPtr>& m2)
+{
+ ProductionPtr result = make_shared<Production>();
+
+ vector<pair<string, size_t> > wf = fields(writer);
+ vector<pair<string, size_t> > rf = fields(reader);
+ vector<size_t> fieldOrder;
+ fieldOrder.reserve(reader->names());
+
+    /*
+     * For each writer field, look for a matching reader field. If found,
+     * recursively resolve the pair and erase the reader field from the
+     * working list; if there is no match, arrange to skip the writer field.
+     */
+ for (vector<pair<string, size_t> >::const_iterator it = wf.begin();
+ it != wf.end(); ++it) {
+ vector<pair<string, size_t> >::iterator it2 =
+ find_if(rf.begin(), rf.end(),
+ equalsFirst<string, size_t>(it->first));
+ if (it2 != rf.end()) {
+ ProductionPtr p = doGenerate2(writer->leafAt(it->second),
+ reader->leafAt(it2->second), m, m2);
+ copy(p->rbegin(), p->rend(), back_inserter(*result));
+ fieldOrder.push_back(it2->second);
+ rf.erase(it2);
+ } else {
+ ProductionPtr p = getWriterProduction(
+ writer->leafAt(it->second), m2);
+ result->push_back(Symbol::skipStart());
+ if (p->size() == 1) {
+ result->push_back((*p)[0]);
+ } else {
+ result->push_back(Symbol::indirect(p));
+ }
+ }
+ }
+
+    /*
+     * Examine the remaining reader fields, i.e. those without a corresponding
+     * writer field; each one is filled in from its default value.
+     */
+ for (vector<pair<string, size_t> >::const_iterator it = rf.begin();
+ it != rf.end(); ++it) {
+
+ NodePtr s = reader->leafAt(it->second);
+ fieldOrder.push_back(it->second);
+
+ if (s->type() == AVRO_SYMBOLIC) {
+ s = resolveSymbol(s);
+ }
+ shared_ptr<vector<uint8_t> > defaultBinary =
+ getAvroBinary(reader->defaultValueAt(it->second));
+ result->push_back(Symbol::defaultStartAction(defaultBinary));
+ map<NodePair, shared_ptr<Production> >::const_iterator it2 =
+ m.find(NodePair(s, s));
+ ProductionPtr p = (it2 == m.end()) ?
+ doGenerate2(s, s, m, m2) : it2->second;
+ copy(p->rbegin(), p->rend(), back_inserter(*result));
+ result->push_back(Symbol::defaultEndAction());
+ }
+ reverse(result->begin(), result->end());
+ result->push_back(Symbol::sizeListAction(fieldOrder));
+ result->push_back(Symbol::recordAction());
+
+ return result;
+}
+
+ProductionPtr ResolvingGrammarGenerator::resolveUnion(
+ const NodePtr& writer, const NodePtr& reader,
+ map<NodePair, ProductionPtr>& m,
+ map<NodePtr, ProductionPtr>& m2)
+{
+ vector<ProductionPtr> v;
+ size_t c = writer->leaves();
+ v.reserve(c);
+ for (size_t i = 0; i < c; ++i) {
+ ProductionPtr p = doGenerate2(writer->leafAt(i), reader, m, m2);
+ v.push_back(p);
+ }
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::alternative(v));
+ result->push_back(Symbol::writerUnionAction());
+ return result;
+}
+
+ProductionPtr ResolvingGrammarGenerator::doGenerate2(
+ const NodePtr& w, const NodePtr& r,
+ map<NodePair, ProductionPtr> &m,
+ map<NodePtr, ProductionPtr> &m2)
+{
+ const NodePtr writer = w->type() == AVRO_SYMBOLIC ? resolveSymbol(w) : w;
+ const NodePtr reader = r->type() == AVRO_SYMBOLIC ? resolveSymbol(r) : r;
+ Type writerType = writer->type();
+ Type readerType = reader->type();
+
+ if (writerType == readerType) {
+ switch (writerType) {
+ case AVRO_NULL:
+ return make_shared<Production>(1, Symbol::nullSymbol());
+ case AVRO_BOOL:
+ return make_shared<Production>(1, Symbol::boolSymbol());
+ case AVRO_INT:
+ return make_shared<Production>(1, Symbol::intSymbol());
+ case AVRO_LONG:
+ return make_shared<Production>(1, Symbol::longSymbol());
+ case AVRO_FLOAT:
+ return make_shared<Production>(1, Symbol::floatSymbol());
+ case AVRO_DOUBLE:
+ return make_shared<Production>(1, Symbol::doubleSymbol());
+ case AVRO_STRING:
+ return make_shared<Production>(1, Symbol::stringSymbol());
+ case AVRO_BYTES:
+ return make_shared<Production>(1, Symbol::bytesSymbol());
+ case AVRO_FIXED:
+ if (writer->name() == reader->name() &&
+ writer->fixedSize() == reader->fixedSize()) {
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::sizeCheckSymbol(reader->fixedSize()));
+ result->push_back(Symbol::fixedSymbol());
+ m[make_pair(writer, reader)] = result;
+ return result;
+ }
+ break;
+ case AVRO_RECORD:
+ if (writer->name() == reader->name()) {
+ const pair<NodePtr, NodePtr> key(writer, reader);
+ map<NodePair, ProductionPtr>::const_iterator kp = m.find(key);
+ if (kp != m.end()) {
+ return (kp->second) ? kp->second :
+ make_shared<Production>(1, Symbol::placeholder(key));
+ }
+ m[key] = ProductionPtr();
+ ProductionPtr result = resolveRecords(writer, reader, m, m2);
+ m[key] = result;
+ return make_shared<Production>(1, Symbol::indirect(result));
+ }
+ break;
+
+ case AVRO_ENUM:
+ if (writer->name() == reader->name()) {
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::enumAdjustSymbol(writer, reader));
+ result->push_back(Symbol::enumSymbol());
+ m[make_pair(writer, reader)] = result;
+ return result;
+ }
+ break;
+
+ case AVRO_ARRAY:
+ {
+ ProductionPtr p = getWriterProduction(writer->leafAt(0), m2);
+ ProductionPtr p2 = doGenerate2(writer->leafAt(0), reader->leafAt(0), m, m2);
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::arrayEndSymbol());
+ result->push_back(Symbol::repeater(p2, p, true));
+ result->push_back(Symbol::arrayStartSymbol());
+ return result;
+ }
+ case AVRO_MAP:
+ {
+            ProductionPtr pp =
+                doGenerate2(writer->leafAt(1), reader->leafAt(1), m, m2);
+            ProductionPtr v(new Production(*pp));
+            v->push_back(Symbol::stringSymbol());
+
+            ProductionPtr pp2 = getWriterProduction(writer->leafAt(1), m2);
+            ProductionPtr v2(new Production(*pp2));
+            v2->push_back(Symbol::stringSymbol());
+
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::mapEndSymbol());
+ result->push_back(Symbol::repeater(v, v2, false));
+ result->push_back(Symbol::mapStartSymbol());
+ return result;
+ }
+ case AVRO_UNION:
+ return resolveUnion(writer, reader, m, m2);
+ case AVRO_SYMBOLIC:
+ {
+ shared_ptr<NodeSymbolic> w =
+ static_pointer_cast<NodeSymbolic>(writer);
+ shared_ptr<NodeSymbolic> r =
+ static_pointer_cast<NodeSymbolic>(reader);
+ NodePair p(w->getNode(), r->getNode());
+ map<NodePair, ProductionPtr>::iterator it = m.find(p);
+ if (it != m.end() && it->second) {
+ return it->second;
+ } else {
+ m[p] = ProductionPtr();
+ return make_shared<Production>(1, Symbol::placeholder(p));
+ }
+ }
+ default:
+ throw Exception("Unknown node type");
+ }
+ } else if (writerType == AVRO_UNION) {
+ return resolveUnion(writer, reader, m, m2);
+ } else {
+ switch (readerType) {
+ case AVRO_LONG:
+ if (writerType == AVRO_INT) {
+ return make_shared<Production>(1,
+ Symbol::resolveSymbol(Symbol::sInt, Symbol::sLong));
+ }
+ break;
+ case AVRO_FLOAT:
+ if (writerType == AVRO_INT || writerType == AVRO_LONG) {
+ return make_shared<Production>(1,
+ Symbol::resolveSymbol(writerType == AVRO_INT ?
+ Symbol::sInt : Symbol::sLong, Symbol::sFloat));
+ }
+ break;
+ case AVRO_DOUBLE:
+ if (writerType == AVRO_INT || writerType == AVRO_LONG
+ || writerType == AVRO_FLOAT) {
+ return make_shared<Production>(1,
+ Symbol::resolveSymbol(writerType == AVRO_INT ?
+ Symbol::sInt : writerType == AVRO_LONG ?
+ Symbol::sLong : Symbol::sFloat, Symbol::sDouble));
+ }
+ break;
+
+ case AVRO_UNION:
+ {
+ int j = bestBranch(writer, reader);
+ if (j >= 0) {
+ ProductionPtr p = doGenerate2(writer, reader->leafAt(j), m, m2);
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::unionAdjustSymbol(j, p));
+ result->push_back(Symbol::unionSymbol());
+ return result;
+ }
+ }
+ break;
+ case AVRO_NULL:
+ case AVRO_BOOL:
+ case AVRO_INT:
+ case AVRO_STRING:
+ case AVRO_BYTES:
+ case AVRO_ENUM:
+ case AVRO_ARRAY:
+ case AVRO_MAP:
+ case AVRO_RECORD:
+ break;
+ default:
+ throw Exception("Unknown node type");
+ }
+ }
+ return make_shared<Production>(1, Symbol::error(writer, reader));
+}
+
+class ResolvingDecoderHandler {
+ shared_ptr<vector<uint8_t> > defaultData_;
+ unique_ptr<InputStream> inp_;
+ DecoderPtr backup_;
+ DecoderPtr& base_;
+ const DecoderPtr binDecoder;
+ public:
+ ResolvingDecoderHandler(DecoderPtr& base) : base_(base),
+ binDecoder(binaryDecoder()) { }
+ size_t handle(const Symbol& s) {
+ switch (s.kind()) {
+ case Symbol::sWriterUnion:
+ return base_->decodeUnionIndex();
+ case Symbol::sDefaultStart:
+ defaultData_ = s.extra<shared_ptr<vector<uint8_t> > >();
+ backup_ = base_;
+ inp_ = memoryInputStream(&(*defaultData_)[0], defaultData_->size());
+ base_ = binDecoder;
+ base_->init(*inp_);
+ return 0;
+ case Symbol::sDefaultEnd:
+            base_ = backup_;
+ backup_.reset();
+ return 0;
+ default:
+ return 0;
+ }
+ }
+
+    void reset()
+    {
+        if (backup_ != nullptr)
+        {
+            base_ = backup_;
+            backup_.reset();
+        }
+    }
+};
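ResolvingDecoderHandler is where reader-side defaults are materialized. On sDefaultStart it parks the live decoder in backup_, points base_ (a reference into ResolvingDecoderImpl) at a binary decoder over the stored default bytes, and the caller's subsequent decode calls transparently read the default; sDefaultEnd restores the original stream. A minimal standalone sketch of that swap, with all names invented:

    #include <cstdio>
    #include <memory>
    #include <string>

    typedef std::shared_ptr<std::string> Src; // stand-in for DecoderPtr

    int main() {
        Src base = std::make_shared<std::string>("stream");
        Src defaults = std::make_shared<std::string>("default bytes");

        Src backup = base;                  // sDefaultStart: remember the decoder
        base = defaults;                    // ... and redirect reads
        std::printf("%s\n", base->c_str()); // "default bytes"

        base = backup;                      // sDefaultEnd: restore the decoder
        backup.reset();
        std::printf("%s\n", base->c_str()); // "stream"
    }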
+
+template <typename Parser>
+class ResolvingDecoderImpl : public ResolvingDecoder
+{
+ DecoderPtr base_;
+ ResolvingDecoderHandler handler_;
+ Parser parser_;
+
+ void init(InputStream& is);
+ void decodeNull();
+ bool decodeBool();
+ int32_t decodeInt();
+ int64_t decodeLong();
+ float decodeFloat();
+ double decodeDouble();
+ void decodeString(string& value);
+ void skipString();
+ void decodeBytes(vector<uint8_t>& value);
+ void skipBytes();
+ void decodeFixed(size_t n, vector<uint8_t>& value);
+ void skipFixed(size_t n);
+ size_t decodeEnum();
+ size_t arrayStart();
+ size_t arrayNext();
+ size_t skipArray();
+ size_t mapStart();
+ size_t mapNext();
+ size_t skipMap();
+ size_t decodeUnionIndex();
+ const vector<size_t>& fieldOrder();
+ void drain() {
+ parser_.processImplicitActions();
+ base_->drain();
+ }
+public:
+ ResolvingDecoderImpl(const ValidSchema& writer, const ValidSchema& reader,
+ const DecoderPtr& base) :
+ base_(base),
+ handler_(base_),
+ parser_(ResolvingGrammarGenerator().generate(writer, reader),
+ &(*base_), handler_)
+ {
+ }
+};
+
+template <typename P>
+void ResolvingDecoderImpl<P>::init(InputStream& is)
+{
+ handler_.reset();
+ base_->init(is);
+ parser_.reset();
+}
+
+template <typename P>
+void ResolvingDecoderImpl<P>::decodeNull()
+{
+ parser_.advance(Symbol::sNull);
+ base_->decodeNull();
+}
+
+template <typename P>
+bool ResolvingDecoderImpl<P>::decodeBool()
+{
+ parser_.advance(Symbol::sBool);
+ return base_->decodeBool();
+}
+
+template <typename P>
+int32_t ResolvingDecoderImpl<P>::decodeInt()
+{
+ parser_.advance(Symbol::sInt);
+ return base_->decodeInt();
+}
+
+template <typename P>
+int64_t ResolvingDecoderImpl<P>::decodeLong()
+{
+ Symbol::Kind k = parser_.advance(Symbol::sLong);
+ return k == Symbol::sInt ? base_->decodeInt() : base_->decodeLong();
+}
+
+template <typename P>
+float ResolvingDecoderImpl<P>::decodeFloat()
+{
+ Symbol::Kind k = parser_.advance(Symbol::sFloat);
+ return k == Symbol::sInt ? base_->decodeInt() :
+ k == Symbol::sLong ? base_->decodeLong() :
+ base_->decodeFloat();
+}
+
+template <typename P>
+double ResolvingDecoderImpl<P>::decodeDouble()
+{
+ Symbol::Kind k = parser_.advance(Symbol::sDouble);
+ return k == Symbol::sInt ? base_->decodeInt() :
+ k == Symbol::sLong ? base_->decodeLong() :
+ k == Symbol::sFloat ? base_->decodeFloat() :
+ base_->decodeDouble();
+}
+
+template <typename P>
+void ResolvingDecoderImpl<P>::decodeString(string& value)
+{
+ parser_.advance(Symbol::sString);
+ base_->decodeString(value);
+}
+
+template <typename P>
+void ResolvingDecoderImpl<P>::skipString()
+{
+ parser_.advance(Symbol::sString);
+ base_->skipString();
+}
+
+template <typename P>
+void ResolvingDecoderImpl<P>::decodeBytes(vector<uint8_t>& value)
+{
+ parser_.advance(Symbol::sBytes);
+ base_->decodeBytes(value);
+}
+
+template <typename P>
+void ResolvingDecoderImpl<P>::skipBytes()
+{
+ parser_.advance(Symbol::sBytes);
+ base_->skipBytes();
+}
+
+template <typename P>
+void ResolvingDecoderImpl<P>::decodeFixed(size_t n, vector<uint8_t>& value)
+{
+ parser_.advance(Symbol::sFixed);
+ parser_.assertSize(n);
+    base_->decodeFixed(n, value);
+}
+
+template <typename P>
+void ResolvingDecoderImpl<P>::skipFixed(size_t n)
+{
+ parser_.advance(Symbol::sFixed);
+ parser_.assertSize(n);
+ base_->skipFixed(n);
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::decodeEnum()
+{
+ parser_.advance(Symbol::sEnum);
+ size_t n = base_->decodeEnum();
+ return parser_.enumAdjust(n);
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::arrayStart()
+{
+ parser_.advance(Symbol::sArrayStart);
+ size_t result = base_->arrayStart();
+ parser_.pushRepeatCount(result);
+ if (result == 0) {
+ parser_.popRepeater();
+ parser_.advance(Symbol::sArrayEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::arrayNext()
+{
+ parser_.processImplicitActions();
+ size_t result = base_->arrayNext();
+ parser_.nextRepeatCount(result);
+ if (result == 0) {
+ parser_.popRepeater();
+ parser_.advance(Symbol::sArrayEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::skipArray()
+{
+ parser_.advance(Symbol::sArrayStart);
+ size_t n = base_->skipArray();
+ if (n == 0) {
+ parser_.pop();
+ } else {
+ parser_.pushRepeatCount(n);
+ parser_.skip(*base_);
+ }
+ parser_.advance(Symbol::sArrayEnd);
+ return 0;
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::mapStart()
+{
+ parser_.advance(Symbol::sMapStart);
+ size_t result = base_->mapStart();
+ parser_.pushRepeatCount(result);
+ if (result == 0) {
+ parser_.popRepeater();
+ parser_.advance(Symbol::sMapEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::mapNext()
+{
+ parser_.processImplicitActions();
+ size_t result = base_->mapNext();
+ parser_.nextRepeatCount(result);
+ if (result == 0) {
+ parser_.popRepeater();
+ parser_.advance(Symbol::sMapEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::skipMap()
+{
+ parser_.advance(Symbol::sMapStart);
+ size_t n = base_->skipMap();
+ if (n == 0) {
+ parser_.pop();
+ } else {
+ parser_.pushRepeatCount(n);
+ parser_.skip(*base_);
+ }
+ parser_.advance(Symbol::sMapEnd);
+ return 0;
+}
+
+template <typename P>
+size_t ResolvingDecoderImpl<P>::decodeUnionIndex()
+{
+ parser_.advance(Symbol::sUnion);
+ return parser_.unionAdjust();
+}
+
+template <typename P>
+const vector<size_t>& ResolvingDecoderImpl<P>::fieldOrder()
+{
+ parser_.advance(Symbol::sRecord);
+ return parser_.sizeList();
+}
+
+} // namespace parsing
+
+ResolvingDecoderPtr resolvingDecoder(const ValidSchema& writer,
+ const ValidSchema& reader, const DecoderPtr& base) {
+ return make_shared<parsing::ResolvingDecoderImpl
+ <parsing::SimpleParser<parsing::ResolvingDecoderHandler> > >(
+ writer, reader, base);
+}
+
+} // namespace avro
+
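A hypothetical end-to-end use of the resolvingDecoder() factory defined above, with schema strings and data bytes invented for illustration (header paths as installed by Avro C++): data written as an int is read back through a double reader schema, exercising the sInt -> sDouble resolve symbol.

    #include <avro/Compiler.hh>
    #include <avro/Decoder.hh>
    #include <avro/Stream.hh>
    #include <avro/ValidSchema.hh>
    #include <cstdint>
    #include <memory>

    int main() {
        avro::ValidSchema writer = avro::compileJsonSchemaFromString("\"int\"");
        avro::ValidSchema reader = avro::compileJsonSchemaFromString("\"double\"");
        uint8_t data[] = { 0x54 }; // Avro binary for int 42 (zig-zag varint)

        std::unique_ptr<avro::InputStream> in =
            avro::memoryInputStream(data, sizeof data);
        avro::ResolvingDecoderPtr d =
            avro::resolvingDecoder(writer, reader, avro::binaryDecoder());
        d->init(*in);
        double v = d->decodeDouble(); // reads the int and promotes it: v == 42.0
        return v == 42.0 ? 0 : 1;
    }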
diff --git a/contrib/libs/apache/avro/impl/parsing/Symbol.cc b/contrib/libs/apache/avro/impl/parsing/Symbol.cc
index 6eb83309be4..b59b9651334 100644
--- a/contrib/libs/apache/avro/impl/parsing/Symbol.cc
+++ b/contrib/libs/apache/avro/impl/parsing/Symbol.cc
@@ -1,111 +1,111 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "Symbol.hh"
-
-namespace avro {
-namespace parsing {
-
-using std::vector;
-using std::string;
-using std::ostringstream;
-
-const char* Symbol::stringValues[] = {
- "TerminalLow",
- "Null",
- "Bool",
- "Int",
- "Long",
- "Float",
- "Double",
- "String",
- "Bytes",
- "ArrayStart",
- "ArrayEnd",
- "MapStart",
- "MapEnd",
- "Fixed",
- "Enum",
- "Union",
- "TerminalHigh",
- "SizeCheck",
- "NameList",
- "Root",
- "Repeater",
- "Alternative",
- "Placeholder",
- "Indirect",
- "Symbolic",
- "EnumAdjust",
- "UnionAdjust",
- "SkipStart",
- "Resolve",
- "ImplicitActionLow",
- "RecordStart",
- "RecordEnd",
- "Field",
- "Record",
- "SizeList",
- "WriterUnion",
- "DefaultStart",
- "DefaultEnd",
- "ImplicitActionHigh",
- "Error"
-};
-
-Symbol Symbol::enumAdjustSymbol(const NodePtr& writer, const NodePtr& reader)
-{
- vector<string> rs;
- size_t rc = reader->names();
- for (size_t i = 0; i < rc; ++i) {
- rs.push_back(reader->nameAt(i));
- }
-
- size_t wc = writer->names();
- vector<int> adj;
- adj.reserve(wc);
-
- vector<string> err;
-
- for (size_t i = 0; i < wc; ++i) {
- const string& s = writer->nameAt(i);
- vector<string>::const_iterator it = find(rs.begin(), rs.end(), s);
- if (it == rs.end()) {
- int pos = err.size() + 1;
- adj.push_back(-pos);
- err.push_back(s);
- } else {
- adj.push_back(it - rs.begin());
- }
- }
- return Symbol(sEnumAdjust, make_pair(adj, err));
-}
-
-Symbol Symbol::error(const NodePtr& writer, const NodePtr& reader)
-{
- ostringstream oss;
- oss << "Cannot resolve: " << std::endl;
- writer->printJson(oss, 0);
- oss << std::endl << "with" << std::endl;
- reader->printJson(oss, 0);
- return Symbol(sError, oss.str());
-}
-
-} // namespace parsing
-} // namespace avro
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "Symbol.hh"
+
+namespace avro {
+namespace parsing {
+
+using std::vector;
+using std::string;
+using std::ostringstream;
+
+const char* Symbol::stringValues[] = {
+ "TerminalLow",
+ "Null",
+ "Bool",
+ "Int",
+ "Long",
+ "Float",
+ "Double",
+ "String",
+ "Bytes",
+ "ArrayStart",
+ "ArrayEnd",
+ "MapStart",
+ "MapEnd",
+ "Fixed",
+ "Enum",
+ "Union",
+ "TerminalHigh",
+ "SizeCheck",
+ "NameList",
+ "Root",
+ "Repeater",
+ "Alternative",
+ "Placeholder",
+ "Indirect",
+ "Symbolic",
+ "EnumAdjust",
+ "UnionAdjust",
+ "SkipStart",
+ "Resolve",
+ "ImplicitActionLow",
+ "RecordStart",
+ "RecordEnd",
+ "Field",
+ "Record",
+ "SizeList",
+ "WriterUnion",
+ "DefaultStart",
+ "DefaultEnd",
+ "ImplicitActionHigh",
+ "Error"
+};
+
+Symbol Symbol::enumAdjustSymbol(const NodePtr& writer, const NodePtr& reader)
+{
+ vector<string> rs;
+ size_t rc = reader->names();
+ for (size_t i = 0; i < rc; ++i) {
+ rs.push_back(reader->nameAt(i));
+ }
+
+ size_t wc = writer->names();
+ vector<int> adj;
+ adj.reserve(wc);
+
+ vector<string> err;
+
+ for (size_t i = 0; i < wc; ++i) {
+ const string& s = writer->nameAt(i);
+ vector<string>::const_iterator it = find(rs.begin(), rs.end(), s);
+ if (it == rs.end()) {
+ int pos = err.size() + 1;
+ adj.push_back(-pos);
+ err.push_back(s);
+ } else {
+ adj.push_back(it - rs.begin());
+ }
+ }
+ return Symbol(sEnumAdjust, make_pair(adj, err));
+}
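Worked example of the adjustment table built above, with invented schemas: writer symbols {"A", "B", "C"} against reader symbols {"C", "A"} yield adj = {1, -1, 0} and err = {"B"}, so a later enumAdjust(1) throws "Cannot resolve symbol: B". The same mapping restated standalone:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> writer = { "A", "B", "C" };
        std::vector<std::string> reader = { "C", "A" };
        std::vector<int> adj;
        std::vector<std::string> err;
        for (std::size_t i = 0; i < writer.size(); ++i) {
            std::vector<std::string>::const_iterator it =
                std::find(reader.begin(), reader.end(), writer[i]);
            if (it == reader.end()) {
                adj.push_back(-static_cast<int>(err.size() + 1)); // unresolvable
                err.push_back(writer[i]);
            } else {
                adj.push_back(static_cast<int>(it - reader.begin()));
            }
        }
        for (std::size_t i = 0; i < adj.size(); ++i)
            std::printf("%d ", adj[i]); // prints: 1 -1 0
        std::printf("\n");
    }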
+
+Symbol Symbol::error(const NodePtr& writer, const NodePtr& reader)
+{
+ ostringstream oss;
+ oss << "Cannot resolve: " << std::endl;
+ writer->printJson(oss, 0);
+ oss << std::endl << "with" << std::endl;
+ reader->printJson(oss, 0);
+ return Symbol(sError, oss.str());
+}
+
+} // namespace parsing
+} // namespace avro
diff --git a/contrib/libs/apache/avro/impl/parsing/Symbol.hh b/contrib/libs/apache/avro/impl/parsing/Symbol.hh
index f4ecfe6e839..d642341e16e 100644
--- a/contrib/libs/apache/avro/impl/parsing/Symbol.hh
+++ b/contrib/libs/apache/avro/impl/parsing/Symbol.hh
@@ -1,854 +1,854 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_parsing_Symbol_hh__
-#define avro_parsing_Symbol_hh__
-
-#include <vector>
-#include <map>
-#include <set>
-#include <stack>
-#include <sstream>
-
-#include <boost/any.hpp>
-#include <boost/tuple/tuple.hpp>
-
-#include "Node.hh"
-#include "Decoder.hh"
-#include "Exception.hh"
-
-namespace avro {
-namespace parsing {
-
-class Symbol;
-
-typedef std::vector<Symbol> Production;
-typedef std::shared_ptr<Production> ProductionPtr;
-typedef boost::tuple<std::stack<ssize_t>, bool, ProductionPtr, ProductionPtr> RepeaterInfo;
-typedef boost::tuple<ProductionPtr, ProductionPtr> RootInfo;
-
-class Symbol {
-public:
- enum Kind {
- sTerminalLow, // extra has nothing
- sNull,
- sBool,
- sInt,
- sLong,
- sFloat,
- sDouble,
- sString,
- sBytes,
- sArrayStart,
- sArrayEnd,
- sMapStart,
- sMapEnd,
- sFixed,
- sEnum,
- sUnion,
- sTerminalHigh,
- sSizeCheck, // Extra has size
- sNameList, // Extra has a vector<string>
-        sRoot,          // Root for a schema, extra is RootInfo
-        sRepeater,      // Array or Map, extra is RepeaterInfo
-        sAlternative,   // One of many (union), extra is vector<ProductionPtr>
- sPlaceholder, // To be fixed up later.
- sIndirect, // extra is shared_ptr<Production>
-        sSymbolic,      // extra is weak_ptr<Production>
- sEnumAdjust,
- sUnionAdjust,
- sSkipStart,
- sResolve,
-
- sImplicitActionLow,
- sRecordStart,
- sRecordEnd,
- sField, // extra is string
- sRecord,
- sSizeList,
- sWriterUnion,
- sDefaultStart, // extra has default value in Avro binary encoding
- sDefaultEnd,
- sImplicitActionHigh,
- sError
- };
-
-private:
- Kind kind_;
- boost::any extra_;
-
-
- explicit Symbol(Kind k) : kind_(k) { }
- template <typename T> Symbol(Kind k, T t) : kind_(k), extra_(t) { }
-public:
-
- Kind kind() const {
- return kind_;
- }
-
- template <typename T> T extra() const {
- return boost::any_cast<T>(extra_);
- }
-
- template <typename T> T* extrap() {
- return boost::any_cast<T>(&extra_);
- }
-
- template <typename T> const T* extrap() const {
- return boost::any_cast<T>(&extra_);
- }
-
- template <typename T> void extra(const T& t) {
- extra_ = t;
- }
-
- bool isTerminal() const {
- return kind_ > sTerminalLow && kind_ < sTerminalHigh;
- }
-
- bool isImplicitAction() const {
- return kind_ > sImplicitActionLow && kind_ < sImplicitActionHigh;
- }
-
- static const char* stringValues[];
- static const char* toString(Kind k) {
- return stringValues[k];
- }
-
- static Symbol rootSymbol(ProductionPtr& s)
- {
- return Symbol(Symbol::sRoot, RootInfo(s, std::make_shared<Production>()));
- }
-
- static Symbol rootSymbol(const ProductionPtr& main,
- const ProductionPtr& backup)
- {
- return Symbol(Symbol::sRoot, RootInfo(main, backup));
- }
-
- static Symbol nullSymbol() {
- return Symbol(sNull);
- }
-
- static Symbol boolSymbol() {
- return Symbol(sBool);
- }
-
- static Symbol intSymbol() {
- return Symbol(sInt);
- }
-
- static Symbol longSymbol() {
- return Symbol(sLong);
- }
-
- static Symbol floatSymbol() {
- return Symbol(sFloat);
- }
-
- static Symbol doubleSymbol() {
- return Symbol(sDouble);
- }
-
- static Symbol stringSymbol() {
- return Symbol(sString);
- }
-
- static Symbol bytesSymbol() {
- return Symbol(sBytes);
- }
-
- static Symbol sizeCheckSymbol(size_t s) {
- return Symbol(sSizeCheck, s);
- }
-
- static Symbol fixedSymbol() {
- return Symbol(sFixed);
- }
-
- static Symbol enumSymbol() {
- return Symbol(sEnum);
- }
-
- static Symbol arrayStartSymbol() {
- return Symbol(sArrayStart);
- }
-
- static Symbol arrayEndSymbol() {
- return Symbol(sArrayEnd);
- }
-
- static Symbol mapStartSymbol() {
- return Symbol(sMapStart);
- }
-
- static Symbol mapEndSymbol() {
- return Symbol(sMapEnd);
- }
-
- static Symbol repeater(const ProductionPtr& p,
- bool isArray) {
- return repeater(p, p, isArray);
- }
-
- static Symbol repeater(const ProductionPtr& read,
- const ProductionPtr& skip,
- bool isArray) {
- std::stack<ssize_t> s;
- return Symbol(sRepeater, RepeaterInfo(s, isArray, read, skip));
- }
-
- static Symbol defaultStartAction(std::shared_ptr<std::vector<uint8_t> > bb)
- {
- return Symbol(sDefaultStart, bb);
- }
-
- static Symbol defaultEndAction()
- {
- return Symbol(sDefaultEnd);
- }
-
- static Symbol alternative(
- const std::vector<ProductionPtr>& branches)
- {
- return Symbol(Symbol::sAlternative, branches);
- }
-
- static Symbol unionSymbol() {
- return Symbol(sUnion);
- }
-
- static Symbol recordStartSymbol() {
- return Symbol(sRecordStart);
- }
-
- static Symbol recordEndSymbol() {
- return Symbol(sRecordEnd);
- }
-
- static Symbol fieldSymbol(const std::string& name) {
- return Symbol(sField, name);
- }
-
- static Symbol writerUnionAction() {
- return Symbol(sWriterUnion);
- }
-
- static Symbol nameListSymbol(
- const std::vector<std::string>& v) {
- return Symbol(sNameList, v);
- }
-
- template <typename T>
- static Symbol placeholder(const T& n) {
- return Symbol(sPlaceholder, n);
- }
-
- static Symbol indirect(const ProductionPtr& p) {
- return Symbol(sIndirect, p);
- }
-
- static Symbol symbolic(const std::weak_ptr<Production>& p) {
- return Symbol(sSymbolic, p);
- }
-
- static Symbol enumAdjustSymbol(const NodePtr& writer,
- const NodePtr& reader);
-
- static Symbol unionAdjustSymbol(size_t branch,
- const ProductionPtr& p) {
- return Symbol(sUnionAdjust, std::make_pair(branch, p));
- }
-
- static Symbol sizeListAction(std::vector<size_t> order) {
- return Symbol(sSizeList, order);
- }
-
- static Symbol recordAction() {
- return Symbol(sRecord);
- }
-
- static Symbol error(const NodePtr& writer, const NodePtr& reader);
-
- static Symbol resolveSymbol(Kind w, Kind r) {
- return Symbol(sResolve, std::make_pair(w, r));
- }
-
- static Symbol skipStart() {
- return Symbol(sSkipStart);
- }
-
-};
-
-/**
- * Recursively replaces all placeholders in the production with the
- * corresponding values.
- */
-template<typename T>
-void fixup(const ProductionPtr& p,
- const std::map<T, ProductionPtr> &m)
-{
- std::set<ProductionPtr> seen;
- for (Production::iterator it = p->begin(); it != p->end(); ++it) {
- fixup(*it, m, seen);
- }
-}
-
-
-/**
- * Recursively replaces all placeholders in the production with the
- * corresponding values.
- */
-template<typename T>
-void fixup_internal(const ProductionPtr& p,
- const std::map<T, ProductionPtr> &m,
- std::set<ProductionPtr>& seen)
-{
- if (seen.find(p) == seen.end()) {
- seen.insert(p);
- for (Production::iterator it = p->begin(); it != p->end(); ++it) {
- fixup(*it, m, seen);
- }
- }
-}
-
-template<typename T>
-void fixup(Symbol& s, const std::map<T, ProductionPtr> &m,
- std::set<ProductionPtr>& seen)
-{
- switch (s.kind()) {
- case Symbol::sIndirect:
- fixup_internal(s.extra<ProductionPtr>(), m, seen);
- break;
- case Symbol::sAlternative:
- {
- const std::vector<ProductionPtr> *vv =
- s.extrap<std::vector<ProductionPtr> >();
- for (std::vector<ProductionPtr>::const_iterator it = vv->begin();
- it != vv->end(); ++it) {
- fixup_internal(*it, m, seen);
- }
- }
- break;
- case Symbol::sRepeater:
- {
- const RepeaterInfo& ri = *s.extrap<RepeaterInfo>();
- fixup_internal(boost::tuples::get<2>(ri), m, seen);
- fixup_internal(boost::tuples::get<3>(ri), m, seen);
- }
- break;
- case Symbol::sPlaceholder:
- {
- typename std::map<T, std::shared_ptr<Production> >::const_iterator it =
- m.find(s.extra<T>());
- if (it == m.end()) {
- throw Exception("Placeholder symbol cannot be resolved");
- }
- s = Symbol::symbolic(std::weak_ptr<Production>(it->second));
- }
- break;
- case Symbol::sUnionAdjust:
- fixup_internal(s.extrap<std::pair<size_t, ProductionPtr> >()->second,
- m, seen);
- break;
- default:
- break;
- }
-}
-
-template<typename Handler>
-class SimpleParser {
- Decoder* decoder_;
- Handler& handler_;
- std::stack<Symbol> parsingStack;
-
- static void throwMismatch(Symbol::Kind actual, Symbol::Kind expected)
- {
- std::ostringstream oss;
- oss << "Invalid operation. Schema requires: " <<
- Symbol::toString(expected) << ", got: " <<
- Symbol::toString(actual);
- throw Exception(oss.str());
- }
-
- static void assertMatch(Symbol::Kind actual, Symbol::Kind expected)
- {
- if (expected != actual) {
- throwMismatch(actual, expected);
- }
-
- }
-
- void append(const ProductionPtr& ss) {
- for (Production::const_iterator it = ss->begin();
- it != ss->end(); ++it) {
- parsingStack.push(*it);
- }
- }
-
- size_t popSize() {
- const Symbol& s = parsingStack.top();
- assertMatch(Symbol::sSizeCheck, s.kind());
- size_t result = s.extra<size_t>();
- parsingStack.pop();
- return result;
- }
-
- static void assertLessThan(size_t n, size_t s) {
- if (n >= s) {
- std::ostringstream oss;
- oss << "Size max value. Upper bound: " << s << " found " << n;
- throw Exception(oss.str());
- }
- }
-
-public:
- Symbol::Kind advance(Symbol::Kind k) {
- for (; ;) {
- Symbol& s = parsingStack.top();
-// std::cout << "advance: " << Symbol::toString(s.kind())
-// << " looking for " << Symbol::toString(k) << '\n';
- if (s.kind() == k) {
- parsingStack.pop();
- return k;
- } else if (s.isTerminal()) {
- throwMismatch(k, s.kind());
- } else {
- switch (s.kind()) {
- case Symbol::sRoot:
- append(boost::tuples::get<0>(*s.extrap<RootInfo>()));
- continue;
- case Symbol::sIndirect:
- {
- ProductionPtr pp =
- s.extra<ProductionPtr>();
- parsingStack.pop();
- append(pp);
- }
- continue;
- case Symbol::sSymbolic:
- {
- ProductionPtr pp(
- s.extra<std::weak_ptr<Production> >());
- parsingStack.pop();
- append(pp);
- }
- continue;
- case Symbol::sRepeater:
- {
- RepeaterInfo *p = s.extrap<RepeaterInfo>();
- std::stack<ssize_t>& ns = boost::tuples::get<0>(*p);
- if (ns.empty()) {
- throw Exception(
- "Empty item count stack in repeater advance");
- }
- if (ns.top() == 0) {
- throw Exception(
- "Zero item count in repeater advance");
- }
- --ns.top();
- append(boost::tuples::get<2>(*p));
- }
- continue;
- case Symbol::sError:
- throw Exception(s.extra<std::string>());
- case Symbol::sResolve:
- {
- const std::pair<Symbol::Kind, Symbol::Kind>* p =
- s.extrap<std::pair<Symbol::Kind, Symbol::Kind> >();
- assertMatch(p->second, k);
- Symbol::Kind result = p->first;
- parsingStack.pop();
- return result;
- }
- case Symbol::sSkipStart:
- parsingStack.pop();
- skip(*decoder_);
- break;
- default:
- if (s.isImplicitAction()) {
- size_t n = handler_.handle(s);
- if (s.kind() == Symbol::sWriterUnion) {
- parsingStack.pop();
- selectBranch(n);
- } else {
- parsingStack.pop();
- }
- } else {
- std::ostringstream oss;
- oss << "Encountered " << Symbol::toString(s.kind())
- << " while looking for " << Symbol::toString(k);
- throw Exception(oss.str());
- }
- }
- }
- }
- }
-
- void skip(Decoder& d) {
- const size_t sz = parsingStack.size();
- if (sz == 0) {
- throw Exception("Nothing to skip!");
- }
- while (parsingStack.size() >= sz) {
- Symbol& t = parsingStack.top();
- // std::cout << "skip: " << Symbol::toString(t.kind()) << '\n';
- switch (t.kind()) {
- case Symbol::sNull:
- d.decodeNull();
- break;
- case Symbol::sBool:
- d.decodeBool();
- break;
- case Symbol::sInt:
- d.decodeInt();
- break;
- case Symbol::sLong:
- d.decodeLong();
- break;
- case Symbol::sFloat:
- d.decodeFloat();
- break;
- case Symbol::sDouble:
- d.decodeDouble();
- break;
- case Symbol::sString:
- d.skipString();
- break;
- case Symbol::sBytes:
- d.skipBytes();
- break;
- case Symbol::sArrayStart:
- {
- parsingStack.pop();
- size_t n = d.skipArray();
- processImplicitActions();
- assertMatch(Symbol::sRepeater, parsingStack.top().kind());
- if (n == 0) {
- break;
- }
- Symbol& t = parsingStack.top();
- RepeaterInfo *p = t.extrap<RepeaterInfo>();
- boost::tuples::get<0>(*p).push(n);
- continue;
- }
- case Symbol::sArrayEnd:
- break;
- case Symbol::sMapStart:
- {
- parsingStack.pop();
- size_t n = d.skipMap();
- processImplicitActions();
- assertMatch(Symbol::sRepeater, parsingStack.top().kind());
- if (n == 0) {
- break;
- }
- Symbol& t = parsingStack.top();
- RepeaterInfo *p = t.extrap<RepeaterInfo>();
- boost::tuples::get<0>(*p).push(n);
- continue;
- }
- case Symbol::sMapEnd:
- break;
- case Symbol::sFixed:
- {
- parsingStack.pop();
- Symbol& t = parsingStack.top();
- d.decodeFixed(t.extra<size_t>());
- }
- break;
- case Symbol::sEnum:
- parsingStack.pop();
- d.decodeEnum();
- break;
- case Symbol::sUnion:
- {
- parsingStack.pop();
- size_t n = d.decodeUnionIndex();
- selectBranch(n);
- continue;
- }
- case Symbol::sRepeater:
- {
- RepeaterInfo *p = t.extrap<RepeaterInfo>();
- std::stack<ssize_t>& ns = boost::tuples::get<0>(*p);
- if (ns.empty()) {
- throw Exception(
- "Empty item count stack in repeater skip");
- }
- ssize_t& n = ns.top();
- if (n == 0) {
- n = boost::tuples::get<1>(*p) ? d.arrayNext()
- : d.mapNext();
- }
- if (n != 0) {
- --n;
- append(boost::tuples::get<3>(*p));
- continue;
- } else {
- ns.pop();
- }
- }
- break;
- case Symbol::sIndirect:
- {
- ProductionPtr pp =
- t.extra<ProductionPtr>();
- parsingStack.pop();
- append(pp);
- }
- continue;
- case Symbol::sSymbolic:
- {
- ProductionPtr pp(
- t.extra<std::weak_ptr<Production> >());
- parsingStack.pop();
- append(pp);
- }
- continue;
- default:
- {
- std::ostringstream oss;
- oss << "Don't know how to skip "
- << Symbol::toString(t.kind());
- throw Exception(oss.str());
- }
- }
- parsingStack.pop();
- }
- }
-
- void assertSize(size_t n) {
- size_t s = popSize();
- if (s != n) {
- std::ostringstream oss;
- oss << "Incorrect size. Expected: " << s << " found " << n;
- throw Exception(oss.str());
- }
- }
-
- void assertLessThanSize(size_t n) {
- assertLessThan(n, popSize());
- }
-
- size_t enumAdjust(size_t n) {
- const Symbol& s = parsingStack.top();
- assertMatch(Symbol::sEnumAdjust, s.kind());
- const std::pair<std::vector<int>, std::vector<std::string> >* v =
- s.extrap<std::pair<std::vector<int>, std::vector<std::string> > >();
- assertLessThan(n, v->first.size());
-
- int result = v->first[n];
- if (result < 0) {
- std::ostringstream oss;
- oss << "Cannot resolve symbol: " << v->second[-result - 1]
- << std::endl;
- throw Exception(oss.str());
- }
- parsingStack.pop();
- return result;
- }
-
- size_t unionAdjust() {
- const Symbol& s = parsingStack.top();
- assertMatch(Symbol::sUnionAdjust, s.kind());
- std::pair<size_t, ProductionPtr> p =
- s.extra<std::pair<size_t, ProductionPtr> >();
- parsingStack.pop();
- append(p.second);
- return p.first;
- }
-
- std::string nameForIndex(size_t e) {
- const Symbol& s = parsingStack.top();
- assertMatch(Symbol::sNameList, s.kind());
- const std::vector<std::string> names =
- s.extra<std::vector<std::string> >();
- if (e >= names.size()) {
- throw Exception("Not that many names");
- }
- std::string result = names[e];
- parsingStack.pop();
- return result;
- }
-
- size_t indexForName(const std::string &name) {
- const Symbol& s = parsingStack.top();
- assertMatch(Symbol::sNameList, s.kind());
- const std::vector<std::string> names =
- s.extra<std::vector<std::string> >();
- std::vector<std::string>::const_iterator it =
- std::find(names.begin(), names.end(), name);
- if (it == names.end()) {
- throw Exception("No such enum symbol");
- }
- size_t result = it - names.begin();
- parsingStack.pop();
- return result;
- }
-
- void pushRepeatCount(size_t n) {
- processImplicitActions();
- Symbol& s = parsingStack.top();
- assertMatch(Symbol::sRepeater, s.kind());
- RepeaterInfo *p = s.extrap<RepeaterInfo>();
- std::stack<ssize_t> &nn = boost::tuples::get<0>(*p);
- nn.push(n);
- }
-
- void nextRepeatCount(size_t n) {
- processImplicitActions();
- Symbol& s = parsingStack.top();
- assertMatch(Symbol::sRepeater, s.kind());
- RepeaterInfo *p = s.extrap<RepeaterInfo>();
- std::stack<ssize_t> &nn = boost::tuples::get<0>(*p);
- if (nn.empty() || nn.top() != 0) {
- throw Exception("Wrong number of items");
- }
- nn.top() = n;
- }
-
- void popRepeater() {
- processImplicitActions();
- Symbol& s = parsingStack.top();
- assertMatch(Symbol::sRepeater, s.kind());
- RepeaterInfo *p = s.extrap<RepeaterInfo>();
- std::stack<ssize_t> &ns = boost::tuples::get<0>(*p);
- if (ns.empty()) {
- throw Exception("Incorrect number of items (empty)");
- }
- if (ns.top() > 0) {
- throw Exception("Incorrect number of items (non-zero)");
- }
- ns.pop();
- parsingStack.pop();
- }
-
- void selectBranch(size_t n) {
- const Symbol& s = parsingStack.top();
- assertMatch(Symbol::sAlternative, s.kind());
- std::vector<ProductionPtr> v =
- s.extra<std::vector<ProductionPtr> >();
- if (n >= v.size()) {
- throw Exception("Not that many branches");
- }
- parsingStack.pop();
- append(v[n]);
- }
-
- const std::vector<size_t>& sizeList() {
- const Symbol& s = parsingStack.top();
- assertMatch(Symbol::sSizeList, s.kind());
- return *s.extrap<std::vector<size_t> >();
- }
-
- Symbol::Kind top() const {
- return parsingStack.top().kind();
- }
-
- void pop() {
- parsingStack.pop();
- }
-
- void processImplicitActions() {
- for (; ;) {
- Symbol& s = parsingStack.top();
- if (s.isImplicitAction()) {
- handler_.handle(s);
- parsingStack.pop();
- } else if (s.kind() == Symbol::sSkipStart) {
- parsingStack.pop();
- skip(*decoder_);
- } else {
- break;
- }
- }
- }
-
- SimpleParser(const Symbol& s, Decoder* d, Handler& h) :
- decoder_(d), handler_(h) {
- parsingStack.push(s);
- }
-
- void reset() {
- while (parsingStack.size() > 1) {
- parsingStack.pop();
- }
- }
-
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Symbol s);
-
-inline std::ostream& operator<<(std::ostream& os, const Production p)
-{
- os << '(';
- for (Production::const_iterator it = p.begin(); it != p.end(); ++it) {
- os << *it << ", ";
- }
- os << ')';
- return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Symbol s)
-{
- switch (s.kind()) {
- case Symbol::sRepeater:
- {
- const RepeaterInfo& ri = *s.extrap<RepeaterInfo>();
- os << '(' << Symbol::toString(s.kind())
- << ' ' << *boost::tuples::get<2>(ri)
- << ' ' << *boost::tuples::get<3>(ri)
- << ')';
- }
- break;
- case Symbol::sIndirect:
- {
- os << '(' << Symbol::toString(s.kind()) << ' '
- << *s.extra<std::shared_ptr<Production> >() << ')';
- }
- break;
- case Symbol::sAlternative:
- {
- os << '(' << Symbol::toString(s.kind());
- for (std::vector<ProductionPtr>::const_iterator it =
- s.extrap<std::vector<ProductionPtr> >()->begin();
- it != s.extrap<std::vector<ProductionPtr> >()->end();
- ++it) {
- os << ' ' << **it;
- }
- os << ')';
- }
- break;
- case Symbol::sSymbolic:
- {
- os << '(' << Symbol::toString(s.kind())
- << ' ' << s.extra<std::weak_ptr<Production> >().lock()
- << ')';
- }
- break;
- default:
- os << Symbol::toString(s.kind());
- break;
- }
- return os;
- }
-} // namespace parsing
-} // namespace avro
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_parsing_Symbol_hh__
+#define avro_parsing_Symbol_hh__
+
+#include <vector>
+#include <map>
+#include <set>
+#include <stack>
+#include <sstream>
+
+#include <boost/any.hpp>
+#include <boost/tuple/tuple.hpp>
+
+#include "Node.hh"
+#include "Decoder.hh"
+#include "Exception.hh"
+
+namespace avro {
+namespace parsing {
+
+class Symbol;
+
+typedef std::vector<Symbol> Production;
+typedef std::shared_ptr<Production> ProductionPtr;
+typedef boost::tuple<std::stack<ssize_t>, bool, ProductionPtr, ProductionPtr> RepeaterInfo;
+typedef boost::tuple<ProductionPtr, ProductionPtr> RootInfo;
+
+class Symbol {
+public:
+ enum Kind {
+ sTerminalLow, // extra has nothing
+ sNull,
+ sBool,
+ sInt,
+ sLong,
+ sFloat,
+ sDouble,
+ sString,
+ sBytes,
+ sArrayStart,
+ sArrayEnd,
+ sMapStart,
+ sMapEnd,
+ sFixed,
+ sEnum,
+ sUnion,
+ sTerminalHigh,
+ sSizeCheck, // Extra has size
+ sNameList, // Extra has a vector<string>
+        sRoot,          // Root for a schema, extra is RootInfo
+        sRepeater,      // Array or Map, extra is RepeaterInfo
+        sAlternative,   // One of many (union), extra is vector<ProductionPtr>
+ sPlaceholder, // To be fixed up later.
+ sIndirect, // extra is shared_ptr<Production>
+        sSymbolic,      // extra is weak_ptr<Production>
+ sEnumAdjust,
+ sUnionAdjust,
+ sSkipStart,
+ sResolve,
+
+ sImplicitActionLow,
+ sRecordStart,
+ sRecordEnd,
+ sField, // extra is string
+ sRecord,
+ sSizeList,
+ sWriterUnion,
+ sDefaultStart, // extra has default value in Avro binary encoding
+ sDefaultEnd,
+ sImplicitActionHigh,
+ sError
+ };
+
+private:
+ Kind kind_;
+ boost::any extra_;
+
+
+ explicit Symbol(Kind k) : kind_(k) { }
+ template <typename T> Symbol(Kind k, T t) : kind_(k), extra_(t) { }
+public:
+
+ Kind kind() const {
+ return kind_;
+ }
+
+ template <typename T> T extra() const {
+ return boost::any_cast<T>(extra_);
+ }
+
+ template <typename T> T* extrap() {
+ return boost::any_cast<T>(&extra_);
+ }
+
+ template <typename T> const T* extrap() const {
+ return boost::any_cast<T>(&extra_);
+ }
+
+ template <typename T> void extra(const T& t) {
+ extra_ = t;
+ }
+
+ bool isTerminal() const {
+ return kind_ > sTerminalLow && kind_ < sTerminalHigh;
+ }
+
+ bool isImplicitAction() const {
+ return kind_ > sImplicitActionLow && kind_ < sImplicitActionHigh;
+ }
+
+ static const char* stringValues[];
+ static const char* toString(Kind k) {
+ return stringValues[k];
+ }
+
+ static Symbol rootSymbol(ProductionPtr& s)
+ {
+ return Symbol(Symbol::sRoot, RootInfo(s, std::make_shared<Production>()));
+ }
+
+ static Symbol rootSymbol(const ProductionPtr& main,
+ const ProductionPtr& backup)
+ {
+ return Symbol(Symbol::sRoot, RootInfo(main, backup));
+ }
+
+ static Symbol nullSymbol() {
+ return Symbol(sNull);
+ }
+
+ static Symbol boolSymbol() {
+ return Symbol(sBool);
+ }
+
+ static Symbol intSymbol() {
+ return Symbol(sInt);
+ }
+
+ static Symbol longSymbol() {
+ return Symbol(sLong);
+ }
+
+ static Symbol floatSymbol() {
+ return Symbol(sFloat);
+ }
+
+ static Symbol doubleSymbol() {
+ return Symbol(sDouble);
+ }
+
+ static Symbol stringSymbol() {
+ return Symbol(sString);
+ }
+
+ static Symbol bytesSymbol() {
+ return Symbol(sBytes);
+ }
+
+ static Symbol sizeCheckSymbol(size_t s) {
+ return Symbol(sSizeCheck, s);
+ }
+
+ static Symbol fixedSymbol() {
+ return Symbol(sFixed);
+ }
+
+ static Symbol enumSymbol() {
+ return Symbol(sEnum);
+ }
+
+ static Symbol arrayStartSymbol() {
+ return Symbol(sArrayStart);
+ }
+
+ static Symbol arrayEndSymbol() {
+ return Symbol(sArrayEnd);
+ }
+
+ static Symbol mapStartSymbol() {
+ return Symbol(sMapStart);
+ }
+
+ static Symbol mapEndSymbol() {
+ return Symbol(sMapEnd);
+ }
+
+ static Symbol repeater(const ProductionPtr& p,
+ bool isArray) {
+ return repeater(p, p, isArray);
+ }
+
+ static Symbol repeater(const ProductionPtr& read,
+ const ProductionPtr& skip,
+ bool isArray) {
+ std::stack<ssize_t> s;
+ return Symbol(sRepeater, RepeaterInfo(s, isArray, read, skip));
+ }
+
+ static Symbol defaultStartAction(std::shared_ptr<std::vector<uint8_t> > bb)
+ {
+ return Symbol(sDefaultStart, bb);
+ }
+
+ static Symbol defaultEndAction()
+ {
+ return Symbol(sDefaultEnd);
+ }
+
+ static Symbol alternative(
+ const std::vector<ProductionPtr>& branches)
+ {
+ return Symbol(Symbol::sAlternative, branches);
+ }
+
+ static Symbol unionSymbol() {
+ return Symbol(sUnion);
+ }
+
+ static Symbol recordStartSymbol() {
+ return Symbol(sRecordStart);
+ }
+
+ static Symbol recordEndSymbol() {
+ return Symbol(sRecordEnd);
+ }
+
+ static Symbol fieldSymbol(const std::string& name) {
+ return Symbol(sField, name);
+ }
+
+ static Symbol writerUnionAction() {
+ return Symbol(sWriterUnion);
+ }
+
+ static Symbol nameListSymbol(
+ const std::vector<std::string>& v) {
+ return Symbol(sNameList, v);
+ }
+
+ template <typename T>
+ static Symbol placeholder(const T& n) {
+ return Symbol(sPlaceholder, n);
+ }
+
+ static Symbol indirect(const ProductionPtr& p) {
+ return Symbol(sIndirect, p);
+ }
+
+ static Symbol symbolic(const std::weak_ptr<Production>& p) {
+ return Symbol(sSymbolic, p);
+ }
+
+ static Symbol enumAdjustSymbol(const NodePtr& writer,
+ const NodePtr& reader);
+
+ static Symbol unionAdjustSymbol(size_t branch,
+ const ProductionPtr& p) {
+ return Symbol(sUnionAdjust, std::make_pair(branch, p));
+ }
+
+ static Symbol sizeListAction(std::vector<size_t> order) {
+ return Symbol(sSizeList, order);
+ }
+
+ static Symbol recordAction() {
+ return Symbol(sRecord);
+ }
+
+ static Symbol error(const NodePtr& writer, const NodePtr& reader);
+
+ static Symbol resolveSymbol(Kind w, Kind r) {
+ return Symbol(sResolve, std::make_pair(w, r));
+ }
+
+ static Symbol skipStart() {
+ return Symbol(sSkipStart);
+ }
+
+};
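Note that isTerminal() and isImplicitAction() above rely purely on the enum ordering: a kind is a terminal exactly when it lies strictly between the sTerminalLow/sTerminalHigh markers, and an implicit action when it lies strictly between the sImplicitActionLow/sImplicitActionHigh markers. A small check, assuming this header and Boost are on the include path:

    #include <cassert>
    #include "Symbol.hh"

    int main() {
        using avro::parsing::Symbol;
        assert(Symbol::intSymbol().isTerminal());          // sInt is in the terminal range
        assert(!Symbol::intSymbol().isImplicitAction());
        assert(Symbol::recordAction().isImplicitAction()); // sRecord is in the action range
    }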
+
+/**
+ * Recursively replaces all placeholders in the production with the
+ * corresponding values.
+ */
+template<typename T>
+void fixup(const ProductionPtr& p,
+ const std::map<T, ProductionPtr> &m)
+{
+ std::set<ProductionPtr> seen;
+ for (Production::iterator it = p->begin(); it != p->end(); ++it) {
+ fixup(*it, m, seen);
+ }
+}
+
+
+/**
+ * Recursively replaces all placeholders in the production with the
+ * corresponding values.
+ */
+template<typename T>
+void fixup_internal(const ProductionPtr& p,
+ const std::map<T, ProductionPtr> &m,
+ std::set<ProductionPtr>& seen)
+{
+ if (seen.find(p) == seen.end()) {
+ seen.insert(p);
+ for (Production::iterator it = p->begin(); it != p->end(); ++it) {
+ fixup(*it, m, seen);
+ }
+ }
+}
+
+template<typename T>
+void fixup(Symbol& s, const std::map<T, ProductionPtr> &m,
+ std::set<ProductionPtr>& seen)
+{
+ switch (s.kind()) {
+ case Symbol::sIndirect:
+ fixup_internal(s.extra<ProductionPtr>(), m, seen);
+ break;
+ case Symbol::sAlternative:
+ {
+ const std::vector<ProductionPtr> *vv =
+ s.extrap<std::vector<ProductionPtr> >();
+ for (std::vector<ProductionPtr>::const_iterator it = vv->begin();
+ it != vv->end(); ++it) {
+ fixup_internal(*it, m, seen);
+ }
+ }
+ break;
+ case Symbol::sRepeater:
+ {
+ const RepeaterInfo& ri = *s.extrap<RepeaterInfo>();
+ fixup_internal(boost::tuples::get<2>(ri), m, seen);
+ fixup_internal(boost::tuples::get<3>(ri), m, seen);
+ }
+ break;
+ case Symbol::sPlaceholder:
+ {
+ typename std::map<T, std::shared_ptr<Production> >::const_iterator it =
+ m.find(s.extra<T>());
+ if (it == m.end()) {
+ throw Exception("Placeholder symbol cannot be resolved");
+ }
+ s = Symbol::symbolic(std::weak_ptr<Production>(it->second));
+ }
+ break;
+ case Symbol::sUnionAdjust:
+ fixup_internal(s.extrap<std::pair<size_t, ProductionPtr> >()->second,
+ m, seen);
+ break;
+ default:
+ break;
+ }
+}
+
+template<typename Handler>
+class SimpleParser {
+ Decoder* decoder_;
+ Handler& handler_;
+ std::stack<Symbol> parsingStack;
+
+ static void throwMismatch(Symbol::Kind actual, Symbol::Kind expected)
+ {
+ std::ostringstream oss;
+ oss << "Invalid operation. Schema requires: " <<
+ Symbol::toString(expected) << ", got: " <<
+ Symbol::toString(actual);
+ throw Exception(oss.str());
+ }
+
+ static void assertMatch(Symbol::Kind actual, Symbol::Kind expected)
+ {
+ if (expected != actual) {
+ throwMismatch(actual, expected);
+ }
+ }
+
+ void append(const ProductionPtr& ss) {
+ for (Production::const_iterator it = ss->begin();
+ it != ss->end(); ++it) {
+ parsingStack.push(*it);
+ }
+ }
+
+ size_t popSize() {
+ const Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sSizeCheck, s.kind());
+ size_t result = s.extra<size_t>();
+ parsingStack.pop();
+ return result;
+ }
+
+ static void assertLessThan(size_t n, size_t s) {
+ if (n >= s) {
+ std::ostringstream oss;
+ oss << "Size out of range. Upper bound: " << s << ", found: " << n;
+ throw Exception(oss.str());
+ }
+ }
+
+public:
+ Symbol::Kind advance(Symbol::Kind k) {
+ for (; ;) {
+ Symbol& s = parsingStack.top();
+// std::cout << "advance: " << Symbol::toString(s.kind())
+// << " looking for " << Symbol::toString(k) << '\n';
+ if (s.kind() == k) {
+ parsingStack.pop();
+ return k;
+ } else if (s.isTerminal()) {
+ throwMismatch(k, s.kind());
+ } else {
+ switch (s.kind()) {
+ case Symbol::sRoot:
+ append(boost::tuples::get<0>(*s.extrap<RootInfo>()));
+ continue;
+ case Symbol::sIndirect:
+ {
+ ProductionPtr pp =
+ s.extra<ProductionPtr>();
+ parsingStack.pop();
+ append(pp);
+ }
+ continue;
+ case Symbol::sSymbolic:
+ {
+ ProductionPtr pp(
+ s.extra<std::weak_ptr<Production> >());
+ parsingStack.pop();
+ append(pp);
+ }
+ continue;
+ case Symbol::sRepeater:
+ {
+ RepeaterInfo *p = s.extrap<RepeaterInfo>();
+ std::stack<ssize_t>& ns = boost::tuples::get<0>(*p);
+ if (ns.empty()) {
+ throw Exception(
+ "Empty item count stack in repeater advance");
+ }
+ if (ns.top() == 0) {
+ throw Exception(
+ "Zero item count in repeater advance");
+ }
+ --ns.top();
+ append(boost::tuples::get<2>(*p));
+ }
+ continue;
+ case Symbol::sError:
+ throw Exception(s.extra<std::string>());
+ case Symbol::sResolve:
+ {
+ const std::pair<Symbol::Kind, Symbol::Kind>* p =
+ s.extrap<std::pair<Symbol::Kind, Symbol::Kind> >();
+ assertMatch(p->second, k);
+ Symbol::Kind result = p->first;
+ parsingStack.pop();
+ return result;
+ }
+ case Symbol::sSkipStart:
+ parsingStack.pop();
+ skip(*decoder_);
+ break;
+ default:
+ if (s.isImplicitAction()) {
+ size_t n = handler_.handle(s);
+ if (s.kind() == Symbol::sWriterUnion) {
+ parsingStack.pop();
+ selectBranch(n);
+ } else {
+ parsingStack.pop();
+ }
+ } else {
+ std::ostringstream oss;
+ oss << "Encountered " << Symbol::toString(s.kind())
+ << " while looking for " << Symbol::toString(k);
+ throw Exception(oss.str());
+ }
+ }
+ }
+ }
+ }
+
+ void skip(Decoder& d) {
+ const size_t sz = parsingStack.size();
+ if (sz == 0) {
+ throw Exception("Nothing to skip!");
+ }
+ while (parsingStack.size() >= sz) {
+ Symbol& t = parsingStack.top();
+ // std::cout << "skip: " << Symbol::toString(t.kind()) << '\n';
+ switch (t.kind()) {
+ case Symbol::sNull:
+ d.decodeNull();
+ break;
+ case Symbol::sBool:
+ d.decodeBool();
+ break;
+ case Symbol::sInt:
+ d.decodeInt();
+ break;
+ case Symbol::sLong:
+ d.decodeLong();
+ break;
+ case Symbol::sFloat:
+ d.decodeFloat();
+ break;
+ case Symbol::sDouble:
+ d.decodeDouble();
+ break;
+ case Symbol::sString:
+ d.skipString();
+ break;
+ case Symbol::sBytes:
+ d.skipBytes();
+ break;
+ case Symbol::sArrayStart:
+ {
+ parsingStack.pop();
+ size_t n = d.skipArray();
+ processImplicitActions();
+ assertMatch(Symbol::sRepeater, parsingStack.top().kind());
+ if (n == 0) {
+ break;
+ }
+ Symbol& t = parsingStack.top();
+ RepeaterInfo *p = t.extrap<RepeaterInfo>();
+ boost::tuples::get<0>(*p).push(n);
+ continue;
+ }
+ case Symbol::sArrayEnd:
+ break;
+ case Symbol::sMapStart:
+ {
+ parsingStack.pop();
+ size_t n = d.skipMap();
+ processImplicitActions();
+ assertMatch(Symbol::sRepeater, parsingStack.top().kind());
+ if (n == 0) {
+ break;
+ }
+ Symbol& t = parsingStack.top();
+ RepeaterInfo *p = t.extrap<RepeaterInfo>();
+ boost::tuples::get<0>(*p).push(n);
+ continue;
+ }
+ case Symbol::sMapEnd:
+ break;
+ case Symbol::sFixed:
+ {
+ parsingStack.pop();
+ Symbol& t = parsingStack.top();
+ d.decodeFixed(t.extra<size_t>());
+ }
+ break;
+ case Symbol::sEnum:
+ parsingStack.pop();
+ d.decodeEnum();
+ break;
+ case Symbol::sUnion:
+ {
+ parsingStack.pop();
+ size_t n = d.decodeUnionIndex();
+ selectBranch(n);
+ continue;
+ }
+ case Symbol::sRepeater:
+ {
+ RepeaterInfo *p = t.extrap<RepeaterInfo>();
+ std::stack<ssize_t>& ns = boost::tuples::get<0>(*p);
+ if (ns.empty()) {
+ throw Exception(
+ "Empty item count stack in repeater skip");
+ }
+ ssize_t& n = ns.top();
+ if (n == 0) {
+ n = boost::tuples::get<1>(*p) ? d.arrayNext()
+ : d.mapNext();
+ }
+ if (n != 0) {
+ --n;
+ append(boost::tuples::get<3>(*p));
+ continue;
+ } else {
+ ns.pop();
+ }
+ }
+ break;
+ case Symbol::sIndirect:
+ {
+ ProductionPtr pp =
+ t.extra<ProductionPtr>();
+ parsingStack.pop();
+ append(pp);
+ }
+ continue;
+ case Symbol::sSymbolic:
+ {
+ ProductionPtr pp(
+ t.extra<std::weak_ptr<Production> >());
+ parsingStack.pop();
+ append(pp);
+ }
+ continue;
+ default:
+ {
+ std::ostringstream oss;
+ oss << "Don't know how to skip "
+ << Symbol::toString(t.kind());
+ throw Exception(oss.str());
+ }
+ }
+ parsingStack.pop();
+ }
+ }
+
+ void assertSize(size_t n) {
+ size_t s = popSize();
+ if (s != n) {
+ std::ostringstream oss;
+ oss << "Incorrect size. Expected: " << s << ", found: " << n;
+ throw Exception(oss.str());
+ }
+ }
+
+ void assertLessThanSize(size_t n) {
+ assertLessThan(n, popSize());
+ }
+
+ size_t enumAdjust(size_t n) {
+ const Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sEnumAdjust, s.kind());
+ const std::pair<std::vector<int>, std::vector<std::string> >* v =
+ s.extrap<std::pair<std::vector<int>, std::vector<std::string> > >();
+ assertLessThan(n, v->first.size());
+
+ int result = v->first[n];
+ if (result < 0) {
+ std::ostringstream oss;
+ oss << "Cannot resolve symbol: " << v->second[-result - 1]
+ << std::endl;
+ throw Exception(oss.str());
+ }
+ parsingStack.pop();
+ return result;
+ }
+
+ size_t unionAdjust() {
+ const Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sUnionAdjust, s.kind());
+ std::pair<size_t, ProductionPtr> p =
+ s.extra<std::pair<size_t, ProductionPtr> >();
+ parsingStack.pop();
+ append(p.second);
+ return p.first;
+ }
+
+ std::string nameForIndex(size_t e) {
+ const Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sNameList, s.kind());
+ const std::vector<std::string> names =
+ s.extra<std::vector<std::string> >();
+ if (e >= names.size()) {
+ throw Exception("Not that many names");
+ }
+ std::string result = names[e];
+ parsingStack.pop();
+ return result;
+ }
+
+ size_t indexForName(const std::string &name) {
+ const Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sNameList, s.kind());
+ const std::vector<std::string> names =
+ s.extra<std::vector<std::string> >();
+ std::vector<std::string>::const_iterator it =
+ std::find(names.begin(), names.end(), name);
+ if (it == names.end()) {
+ throw Exception("No such enum symbol");
+ }
+ size_t result = it - names.begin();
+ parsingStack.pop();
+ return result;
+ }
+
+ void pushRepeatCount(size_t n) {
+ processImplicitActions();
+ Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sRepeater, s.kind());
+ RepeaterInfo *p = s.extrap<RepeaterInfo>();
+ std::stack<ssize_t> &nn = boost::tuples::get<0>(*p);
+ nn.push(n);
+ }
+
+ void nextRepeatCount(size_t n) {
+ processImplicitActions();
+ Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sRepeater, s.kind());
+ RepeaterInfo *p = s.extrap<RepeaterInfo>();
+ std::stack<ssize_t> &nn = boost::tuples::get<0>(*p);
+ if (nn.empty() || nn.top() != 0) {
+ throw Exception("Wrong number of items");
+ }
+ nn.top() = n;
+ }
+
+ void popRepeater() {
+ processImplicitActions();
+ Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sRepeater, s.kind());
+ RepeaterInfo *p = s.extrap<RepeaterInfo>();
+ std::stack<ssize_t> &ns = boost::tuples::get<0>(*p);
+ if (ns.empty()) {
+ throw Exception("Incorrect number of items (empty)");
+ }
+ if (ns.top() > 0) {
+ throw Exception("Incorrect number of items (non-zero)");
+ }
+ ns.pop();
+ parsingStack.pop();
+ }
+
+ void selectBranch(size_t n) {
+ const Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sAlternative, s.kind());
+ std::vector<ProductionPtr> v =
+ s.extra<std::vector<ProductionPtr> >();
+ if (n >= v.size()) {
+ throw Exception("Not that many branches");
+ }
+ parsingStack.pop();
+ append(v[n]);
+ }
+
+ const std::vector<size_t>& sizeList() {
+ const Symbol& s = parsingStack.top();
+ assertMatch(Symbol::sSizeList, s.kind());
+ return *s.extrap<std::vector<size_t> >();
+ }
+
+ Symbol::Kind top() const {
+ return parsingStack.top().kind();
+ }
+
+ void pop() {
+ parsingStack.pop();
+ }
+
+ void processImplicitActions() {
+ for (; ;) {
+ Symbol& s = parsingStack.top();
+ if (s.isImplicitAction()) {
+ handler_.handle(s);
+ parsingStack.pop();
+ } else if (s.kind() == Symbol::sSkipStart) {
+ parsingStack.pop();
+ skip(*decoder_);
+ } else {
+ break;
+ }
+ }
+ }
+
+ SimpleParser(const Symbol& s, Decoder* d, Handler& h) :
+ decoder_(d), handler_(h) {
+ parsingStack.push(s);
+ }
+
+ void reset() {
+ while (parsingStack.size() > 1) {
+ parsingStack.pop();
+ }
+ }
+
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Symbol s);
+
+inline std::ostream& operator<<(std::ostream& os, const Production p)
+{
+ os << '(';
+ for (Production::const_iterator it = p.begin(); it != p.end(); ++it) {
+ os << *it << ", ";
+ }
+ os << ')';
+ return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Symbol s)
+{
+ switch (s.kind()) {
+ case Symbol::sRepeater:
+ {
+ const RepeaterInfo& ri = *s.extrap<RepeaterInfo>();
+ os << '(' << Symbol::toString(s.kind())
+ << ' ' << *boost::tuples::get<2>(ri)
+ << ' ' << *boost::tuples::get<3>(ri)
+ << ')';
+ }
+ break;
+ case Symbol::sIndirect:
+ {
+ os << '(' << Symbol::toString(s.kind()) << ' '
+ << *s.extra<std::shared_ptr<Production> >() << ')';
+ }
+ break;
+ case Symbol::sAlternative:
+ {
+ os << '(' << Symbol::toString(s.kind());
+ for (std::vector<ProductionPtr>::const_iterator it =
+ s.extrap<std::vector<ProductionPtr> >()->begin();
+ it != s.extrap<std::vector<ProductionPtr> >()->end();
+ ++it) {
+ os << ' ' << **it;
+ }
+ os << ')';
+ }
+ break;
+ case Symbol::sSymbolic:
+ {
+ os << '(' << Symbol::toString(s.kind())
+ << ' ' << s.extra<std::weak_ptr<Production> >().lock()
+ << ')';
+ }
+ break;
+ default:
+ os << Symbol::toString(s.kind());
+ break;
+ }
+ return os;
+}
+} // namespace parsing
+} // namespace avro
+
+#endif
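
The header above compiles an Avro schema into a pushdown grammar: advance(k) pops a matching terminal off the parsing stack, and expands root, indirect, symbolic and repeater symbols in place until the requested kind surfaces. Below is a minimal, self-contained C++ sketch of that loop; the Kind enum and the toy record grammar are illustrative assumptions, not the Avro types.

#include <stack>
#include <stdexcept>
#include <vector>

enum class Kind { Int, String, Indirect };     // two terminals, one non-terminal

struct Sym {
    Kind kind;
    std::vector<Sym> expansion;                // used only when kind == Indirect
};

// Mirrors SimpleParser::advance: pop matching terminals, expand non-terminals.
void advance(std::stack<Sym>& st, Kind want) {
    for (;;) {
        if (st.empty()) throw std::runtime_error("empty parsing stack");
        Sym s = st.top();
        if (s.kind == want) { st.pop(); return; }         // matched the terminal
        if (s.kind != Kind::Indirect)
            throw std::runtime_error("schema mismatch");  // wrong terminal on top
        st.pop();                                         // expand the production,
        for (auto it = s.expansion.rbegin(); it != s.expansion.rend(); ++it)
            st.push(*it);                                 // reversed: first symbol on top
    }
}

int main() {
    // Toy grammar for record { int a; string b; } as a single production.
    Sym record{Kind::Indirect, {Sym{Kind::Int, {}}, Sym{Kind::String, {}}}};
    std::stack<Sym> st;
    st.push(record);
    advance(st, Kind::Int);      // consumes field a
    advance(st, Kind::String);   // consumes field b; a wrong Kind would throw
}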
diff --git a/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.cc b/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.cc
index fdf6ef898f9..8a291d43179 100644
--- a/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.cc
+++ b/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.cc
@@ -1,591 +1,591 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ValidatingCodec.hh"
-
-#include <string>
-#include <map>
-#include <algorithm>
-#include <memory>
-#include <boost/any.hpp>
-
-#include "ValidSchema.hh"
-#include "Decoder.hh"
-#include "Encoder.hh"
-#include "NodeImpl.hh"
-
-namespace avro {
-
-using std::make_shared;
-
-namespace parsing {
-
-using std::shared_ptr;
-using std::static_pointer_cast;
-
-using std::map;
-using std::vector;
-using std::pair;
-using std::string;
-using std::reverse;
-using std::ostringstream;
-
-/** Follows the design of Avro Parser in Java. */
-ProductionPtr ValidatingGrammarGenerator::generate(const NodePtr& n)
-{
- map<NodePtr, ProductionPtr> m;
- ProductionPtr result = doGenerate(n, m);
- fixup(result, m);
- return result;
-}
-
-Symbol ValidatingGrammarGenerator::generate(const ValidSchema& schema)
-{
- ProductionPtr r = generate(schema.root());
- return Symbol::rootSymbol(r);
-}
-
-ProductionPtr ValidatingGrammarGenerator::doGenerate(const NodePtr& n,
- map<NodePtr, ProductionPtr> &m) {
- switch (n->type()) {
- case AVRO_NULL:
- return make_shared<Production>(1, Symbol::nullSymbol());
- case AVRO_BOOL:
- return make_shared<Production>(1, Symbol::boolSymbol());
- case AVRO_INT:
- return make_shared<Production>(1, Symbol::intSymbol());
- case AVRO_LONG:
- return make_shared<Production>(1, Symbol::longSymbol());
- case AVRO_FLOAT:
- return make_shared<Production>(1, Symbol::floatSymbol());
- case AVRO_DOUBLE:
- return make_shared<Production>(1, Symbol::doubleSymbol());
- case AVRO_STRING:
- return make_shared<Production>(1, Symbol::stringSymbol());
- case AVRO_BYTES:
- return make_shared<Production>(1, Symbol::bytesSymbol());
- case AVRO_FIXED:
- {
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::sizeCheckSymbol(n->fixedSize()));
- result->push_back(Symbol::fixedSymbol());
- m[n] = result;
- return result;
- }
- case AVRO_RECORD:
- {
- ProductionPtr result = make_shared<Production>();
-
- m.erase(n);
- size_t c = n->leaves();
- for (size_t i = 0; i < c; ++i) {
- const NodePtr& leaf = n->leafAt(i);
- ProductionPtr v = doGenerate(leaf, m);
- copy(v->rbegin(), v->rend(), back_inserter(*result));
- }
- reverse(result->begin(), result->end());
-
- m[n] = result;
- return make_shared<Production>(1, Symbol::indirect(result));
- }
- case AVRO_ENUM:
- {
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::sizeCheckSymbol(n->names()));
- result->push_back(Symbol::enumSymbol());
- m[n] = result;
- return result;
- }
- case AVRO_ARRAY:
- {
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::arrayEndSymbol());
- result->push_back(Symbol::repeater(doGenerate(n->leafAt(0), m), true));
- result->push_back(Symbol::arrayStartSymbol());
- return result;
- }
- case AVRO_MAP:
- {
- ProductionPtr pp = doGenerate(n->leafAt(1), m);
- ProductionPtr v(new Production(*pp));
- v->push_back(Symbol::stringSymbol());
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::mapEndSymbol());
- result->push_back(Symbol::repeater(v, false));
- result->push_back(Symbol::mapStartSymbol());
- return result;
- }
- case AVRO_UNION:
- {
- vector<ProductionPtr> vv;
- size_t c = n->leaves();
- vv.reserve(c);
- for (size_t i = 0; i < c; ++i) {
- vv.push_back(doGenerate(n->leafAt(i), m));
- }
- ProductionPtr result = make_shared<Production>();
- result->push_back(Symbol::alternative(vv));
- result->push_back(Symbol::unionSymbol());
- return result;
- }
- case AVRO_SYMBOLIC:
- {
- shared_ptr<NodeSymbolic> ns = static_pointer_cast<NodeSymbolic>(n);
- NodePtr nn = ns->getNode();
- map<NodePtr, ProductionPtr>::iterator it =
- m.find(nn);
- if (it != m.end() && it->second) {
- return it->second;
- } else {
- m[nn] = ProductionPtr();
- return make_shared<Production>(1, Symbol::placeholder(nn));
- }
- }
- default:
- throw Exception("Unknown node type");
- }
-}
-
-struct DummyHandler {
- size_t handle(const Symbol& s) {
- return 0;
- }
-};
-
-template <typename P>
-class ValidatingDecoder : public Decoder {
- const shared_ptr<Decoder> base;
- DummyHandler handler_;
- P parser;
-
- void init(InputStream& is);
- void decodeNull();
- bool decodeBool();
- int32_t decodeInt();
- int64_t decodeLong();
- float decodeFloat();
- double decodeDouble();
- void decodeString(string& value);
- void skipString();
- void decodeBytes(vector<uint8_t>& value);
- void skipBytes();
- void decodeFixed(size_t n, vector<uint8_t>& value);
- void skipFixed(size_t n);
- size_t decodeEnum();
- size_t arrayStart();
- size_t arrayNext();
- size_t skipArray();
- size_t mapStart();
- size_t mapNext();
- size_t skipMap();
- size_t decodeUnionIndex();
- void drain() {
- base->drain();
- }
-
-public:
-
- ValidatingDecoder(const ValidSchema& s, const shared_ptr<Decoder> b) :
- base(b),
- parser(ValidatingGrammarGenerator().generate(s), NULL, handler_) { }
-
-};
-
-template <typename P>
-void ValidatingDecoder<P>::init(InputStream& is)
-{
- base->init(is);
-}
-
-template <typename P>
-void ValidatingDecoder<P>::decodeNull()
-{
- parser.advance(Symbol::sNull);
- base->decodeNull();
-}
-
-template <typename P>
-bool ValidatingDecoder<P>::decodeBool()
-{
- parser.advance(Symbol::sBool);
- return base->decodeBool();
-}
-
-template <typename P>
-int32_t ValidatingDecoder<P>::decodeInt()
-{
- parser.advance(Symbol::sInt);
- return base->decodeInt();
-}
-
-template <typename P>
-int64_t ValidatingDecoder<P>::decodeLong()
-{
- parser.advance(Symbol::sLong);
- return base->decodeLong();
-}
-
-template <typename P>
-float ValidatingDecoder<P>::decodeFloat()
-{
- parser.advance(Symbol::sFloat);
- return base->decodeFloat();
-}
-
-template <typename P>
-double ValidatingDecoder<P>::decodeDouble()
-{
- parser.advance(Symbol::sDouble);
- return base->decodeDouble();
-}
-
-template <typename P>
-void ValidatingDecoder<P>::decodeString(string& value)
-{
- parser.advance(Symbol::sString);
- base->decodeString(value);
-}
-
-template <typename P>
-void ValidatingDecoder<P>::skipString()
-{
- parser.advance(Symbol::sString);
- base->skipString();
-}
-
-template <typename P>
-void ValidatingDecoder<P>::decodeBytes(vector<uint8_t>& value)
-{
- parser.advance(Symbol::sBytes);
- base->decodeBytes(value);
-}
-
-template <typename P>
-void ValidatingDecoder<P>::skipBytes()
-{
- parser.advance(Symbol::sBytes);
- base->skipBytes();
-}
-
-template <typename P>
-void ValidatingDecoder<P>::decodeFixed(size_t n, vector<uint8_t>& value)
-{
- parser.advance(Symbol::sFixed);
- parser.assertSize(n);
- base->decodeFixed(n, value);
-}
-
-template <typename P>
-void ValidatingDecoder<P>::skipFixed(size_t n)
-{
- parser.advance(Symbol::sFixed);
- parser.assertSize(n);
- base->skipFixed(n);
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::decodeEnum()
-{
- parser.advance(Symbol::sEnum);
- size_t result = base->decodeEnum();
- parser.assertLessThanSize(result);
- return result;
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::arrayStart()
-{
- parser.advance(Symbol::sArrayStart);
- size_t result = base->arrayStart();
- parser.pushRepeatCount(result);
- if (result == 0) {
- parser.popRepeater();
- parser.advance(Symbol::sArrayEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::arrayNext()
-{
- size_t result = base->arrayNext();
- parser.nextRepeatCount(result);
- if (result == 0) {
- parser.popRepeater();
- parser.advance(Symbol::sArrayEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::skipArray()
-{
- parser.advance(Symbol::sArrayStart);
- size_t n = base->skipArray();
- if (n == 0) {
- parser.pop();
- } else {
- parser.pushRepeatCount(n);
- parser.skip(*base);
- }
- parser.advance(Symbol::sArrayEnd);
- return 0;
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::mapStart()
-{
- parser.advance(Symbol::sMapStart);
- size_t result = base->mapStart();
- parser.pushRepeatCount(result);
- if (result == 0) {
- parser.popRepeater();
- parser.advance(Symbol::sMapEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::mapNext()
-{
- size_t result = base->mapNext();
- parser.nextRepeatCount(result);
- if (result == 0) {
- parser.popRepeater();
- parser.advance(Symbol::sMapEnd);
- }
- return result;
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::skipMap()
-{
- parser.advance(Symbol::sMapStart);
- size_t n = base->skipMap();
- if (n == 0) {
- parser.pop();
- } else {
- parser.pushRepeatCount(n);
- parser.skip(*base);
- }
- parser.advance(Symbol::sMapEnd);
- return 0;
-}
-
-template <typename P>
-size_t ValidatingDecoder<P>::decodeUnionIndex()
-{
- parser.advance(Symbol::sUnion);
- size_t result = base->decodeUnionIndex();
- parser.selectBranch(result);
- return result;
-}
-
-template <typename P>
-class ValidatingEncoder : public Encoder {
- DummyHandler handler_;
- P parser_;
- EncoderPtr base_;
-
- void init(OutputStream& os);
- void flush();
- int64_t byteCount() const;
- void encodeNull();
- void encodeBool(bool b);
- void encodeInt(int32_t i);
- void encodeLong(int64_t l);
- void encodeFloat(float f);
- void encodeDouble(double d);
- void encodeString(const std::string& s);
- void encodeBytes(const uint8_t *bytes, size_t len);
- void encodeFixed(const uint8_t *bytes, size_t len);
- void encodeEnum(size_t e);
- void arrayStart();
- void arrayEnd();
- void mapStart();
- void mapEnd();
- void setItemCount(size_t count);
- void startItem();
- void encodeUnionIndex(size_t e);
-public:
- ValidatingEncoder(const ValidSchema& schema, const EncoderPtr& base) :
- parser_(ValidatingGrammarGenerator().generate(schema), NULL, handler_),
- base_(base) { }
-};
-
-template<typename P>
-void ValidatingEncoder<P>::init(OutputStream& os)
-{
- base_->init(os);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::flush()
-{
- base_->flush();
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeNull()
-{
- parser_.advance(Symbol::sNull);
- base_->encodeNull();
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeBool(bool b)
-{
- parser_.advance(Symbol::sBool);
- base_->encodeBool(b);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeInt(int32_t i)
-{
- parser_.advance(Symbol::sInt);
- base_->encodeInt(i);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeLong(int64_t l)
-{
- parser_.advance(Symbol::sLong);
- base_->encodeLong(l);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeFloat(float f)
-{
- parser_.advance(Symbol::sFloat);
- base_->encodeFloat(f);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeDouble(double d)
-{
- parser_.advance(Symbol::sDouble);
- base_->encodeDouble(d);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeString(const std::string& s)
-{
- parser_.advance(Symbol::sString);
- base_->encodeString(s);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeBytes(const uint8_t *bytes, size_t len)
-{
- parser_.advance(Symbol::sBytes);
- base_->encodeBytes(bytes, len);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeFixed(const uint8_t *bytes, size_t len)
-{
- parser_.advance(Symbol::sFixed);
- parser_.assertSize(len);
- base_->encodeFixed(bytes, len);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeEnum(size_t e)
-{
- parser_.advance(Symbol::sEnum);
- parser_.assertLessThanSize(e);
- base_->encodeEnum(e);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::arrayStart()
-{
- parser_.advance(Symbol::sArrayStart);
- parser_.pushRepeatCount(0);
- base_->arrayStart();
-}
-
-template<typename P>
-void ValidatingEncoder<P>::arrayEnd()
-{
- parser_.popRepeater();
- parser_.advance(Symbol::sArrayEnd);
- base_->arrayEnd();
-}
-
-template<typename P>
-void ValidatingEncoder<P>::mapStart()
-{
- parser_.advance(Symbol::sMapStart);
- parser_.pushRepeatCount(0);
- base_->mapStart();
-}
-
-template<typename P>
-void ValidatingEncoder<P>::mapEnd()
-{
- parser_.popRepeater();
- parser_.advance(Symbol::sMapEnd);
- base_->mapEnd();
-}
-
-template<typename P>
-void ValidatingEncoder<P>::setItemCount(size_t count)
-{
- parser_.nextRepeatCount(count);
- base_->setItemCount(count);
-}
-
-template<typename P>
-void ValidatingEncoder<P>::startItem()
-{
- if (parser_.top() != Symbol::sRepeater) {
- throw Exception("startItem called when not at an item boundary");
- }
- base_->startItem();
-}
-
-template<typename P>
-void ValidatingEncoder<P>::encodeUnionIndex(size_t e)
-{
- parser_.advance(Symbol::sUnion);
- parser_.selectBranch(e);
- base_->encodeUnionIndex(e);
-}
-
-template<typename P>
-int64_t ValidatingEncoder<P>::byteCount() const
-{
- return base_->byteCount();
-}
-
-} // namespace parsing
-
-DecoderPtr validatingDecoder(const ValidSchema& s,
- const DecoderPtr& base)
-{
- return make_shared<parsing::ValidatingDecoder<parsing::SimpleParser<parsing::DummyHandler> > >(s, base);
-}
-
-EncoderPtr validatingEncoder(const ValidSchema& schema, const EncoderPtr& base)
-{
- return make_shared<parsing::ValidatingEncoder<parsing::SimpleParser<parsing::DummyHandler> > >(schema, base);
-}
-
-} // namespace avro
-
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ValidatingCodec.hh"
+
+#include <string>
+#include <map>
+#include <algorithm>
+#include <memory>
+#include <boost/any.hpp>
+
+#include "ValidSchema.hh"
+#include "Decoder.hh"
+#include "Encoder.hh"
+#include "NodeImpl.hh"
+
+namespace avro {
+
+using std::make_shared;
+
+namespace parsing {
+
+using std::shared_ptr;
+using std::static_pointer_cast;
+
+using std::map;
+using std::vector;
+using std::pair;
+using std::string;
+using std::reverse;
+using std::ostringstream;
+
+/** Follows the design of Avro Parser in Java. */
+ProductionPtr ValidatingGrammarGenerator::generate(const NodePtr& n)
+{
+ map<NodePtr, ProductionPtr> m;
+ ProductionPtr result = doGenerate(n, m);
+ fixup(result, m);
+ return result;
+}
+
+Symbol ValidatingGrammarGenerator::generate(const ValidSchema& schema)
+{
+ ProductionPtr r = generate(schema.root());
+ return Symbol::rootSymbol(r);
+}
+
+ProductionPtr ValidatingGrammarGenerator::doGenerate(const NodePtr& n,
+ map<NodePtr, ProductionPtr> &m) {
+ switch (n->type()) {
+ case AVRO_NULL:
+ return make_shared<Production>(1, Symbol::nullSymbol());
+ case AVRO_BOOL:
+ return make_shared<Production>(1, Symbol::boolSymbol());
+ case AVRO_INT:
+ return make_shared<Production>(1, Symbol::intSymbol());
+ case AVRO_LONG:
+ return make_shared<Production>(1, Symbol::longSymbol());
+ case AVRO_FLOAT:
+ return make_shared<Production>(1, Symbol::floatSymbol());
+ case AVRO_DOUBLE:
+ return make_shared<Production>(1, Symbol::doubleSymbol());
+ case AVRO_STRING:
+ return make_shared<Production>(1, Symbol::stringSymbol());
+ case AVRO_BYTES:
+ return make_shared<Production>(1, Symbol::bytesSymbol());
+ case AVRO_FIXED:
+ {
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::sizeCheckSymbol(n->fixedSize()));
+ result->push_back(Symbol::fixedSymbol());
+ m[n] = result;
+ return result;
+ }
+ case AVRO_RECORD:
+ {
+ ProductionPtr result = make_shared<Production>();
+
+ m.erase(n);
+ size_t c = n->leaves();
+ for (size_t i = 0; i < c; ++i) {
+ const NodePtr& leaf = n->leafAt(i);
+ ProductionPtr v = doGenerate(leaf, m);
+ copy(v->rbegin(), v->rend(), back_inserter(*result));
+ }
+ reverse(result->begin(), result->end());
+
+ m[n] = result;
+ return make_shared<Production>(1, Symbol::indirect(result));
+ }
+ case AVRO_ENUM:
+ {
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::sizeCheckSymbol(n->names()));
+ result->push_back(Symbol::enumSymbol());
+ m[n] = result;
+ return result;
+ }
+ case AVRO_ARRAY:
+ {
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::arrayEndSymbol());
+ result->push_back(Symbol::repeater(doGenerate(n->leafAt(0), m), true));
+ result->push_back(Symbol::arrayStartSymbol());
+ return result;
+ }
+ case AVRO_MAP:
+ {
+ ProductionPtr pp = doGenerate(n->leafAt(1), m);
+ ProductionPtr v(new Production(*pp));
+ v->push_back(Symbol::stringSymbol());
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::mapEndSymbol());
+ result->push_back(Symbol::repeater(v, false));
+ result->push_back(Symbol::mapStartSymbol());
+ return result;
+ }
+ case AVRO_UNION:
+ {
+ vector<ProductionPtr> vv;
+ size_t c = n->leaves();
+ vv.reserve(c);
+ for (size_t i = 0; i < c; ++i) {
+ vv.push_back(doGenerate(n->leafAt(i), m));
+ }
+ ProductionPtr result = make_shared<Production>();
+ result->push_back(Symbol::alternative(vv));
+ result->push_back(Symbol::unionSymbol());
+ return result;
+ }
+ case AVRO_SYMBOLIC:
+ {
+ shared_ptr<NodeSymbolic> ns = static_pointer_cast<NodeSymbolic>(n);
+ NodePtr nn = ns->getNode();
+ map<NodePtr, ProductionPtr>::iterator it =
+ m.find(nn);
+ if (it != m.end() && it->second) {
+ return it->second;
+ } else {
+ m[nn] = ProductionPtr();
+ return make_shared<Production>(1, Symbol::placeholder(nn));
+ }
+ }
+ default:
+ throw Exception("Unknown node type");
+ }
+}
+
+struct DummyHandler {
+ size_t handle(const Symbol& s) {
+ return 0;
+ }
+};
+
+template <typename P>
+class ValidatingDecoder : public Decoder {
+ const shared_ptr<Decoder> base;
+ DummyHandler handler_;
+ P parser;
+
+ void init(InputStream& is);
+ void decodeNull();
+ bool decodeBool();
+ int32_t decodeInt();
+ int64_t decodeLong();
+ float decodeFloat();
+ double decodeDouble();
+ void decodeString(string& value);
+ void skipString();
+ void decodeBytes(vector<uint8_t>& value);
+ void skipBytes();
+ void decodeFixed(size_t n, vector<uint8_t>& value);
+ void skipFixed(size_t n);
+ size_t decodeEnum();
+ size_t arrayStart();
+ size_t arrayNext();
+ size_t skipArray();
+ size_t mapStart();
+ size_t mapNext();
+ size_t skipMap();
+ size_t decodeUnionIndex();
+ void drain() {
+ base->drain();
+ }
+
+public:
+
+ ValidatingDecoder(const ValidSchema& s, const shared_ptr<Decoder> b) :
+ base(b),
+ parser(ValidatingGrammarGenerator().generate(s), NULL, handler_) { }
+
+};
+
+template <typename P>
+void ValidatingDecoder<P>::init(InputStream& is)
+{
+ base->init(is);
+}
+
+template <typename P>
+void ValidatingDecoder<P>::decodeNull()
+{
+ parser.advance(Symbol::sNull);
+ base->decodeNull();
+}
+
+template <typename P>
+bool ValidatingDecoder<P>::decodeBool()
+{
+ parser.advance(Symbol::sBool);
+ return base->decodeBool();
+}
+
+template <typename P>
+int32_t ValidatingDecoder<P>::decodeInt()
+{
+ parser.advance(Symbol::sInt);
+ return base->decodeInt();
+}
+
+template <typename P>
+int64_t ValidatingDecoder<P>::decodeLong()
+{
+ parser.advance(Symbol::sLong);
+ return base->decodeLong();
+}
+
+template <typename P>
+float ValidatingDecoder<P>::decodeFloat()
+{
+ parser.advance(Symbol::sFloat);
+ return base->decodeFloat();
+}
+
+template <typename P>
+double ValidatingDecoder<P>::decodeDouble()
+{
+ parser.advance(Symbol::sDouble);
+ return base->decodeDouble();
+}
+
+template <typename P>
+void ValidatingDecoder<P>::decodeString(string& value)
+{
+ parser.advance(Symbol::sString);
+ base->decodeString(value);
+}
+
+template <typename P>
+void ValidatingDecoder<P>::skipString()
+{
+ parser.advance(Symbol::sString);
+ base->skipString();
+}
+
+template <typename P>
+void ValidatingDecoder<P>::decodeBytes(vector<uint8_t>& value)
+{
+ parser.advance(Symbol::sBytes);
+ base->decodeBytes(value);
+}
+
+template <typename P>
+void ValidatingDecoder<P>::skipBytes()
+{
+ parser.advance(Symbol::sBytes);
+ base->skipBytes();
+}
+
+template <typename P>
+void ValidatingDecoder<P>::decodeFixed(size_t n, vector<uint8_t>& value)
+{
+ parser.advance(Symbol::sFixed);
+ parser.assertSize(n);
+ base->decodeFixed(n, value);
+}
+
+template <typename P>
+void ValidatingDecoder<P>::skipFixed(size_t n)
+{
+ parser.advance(Symbol::sFixed);
+ parser.assertSize(n);
+ base->skipFixed(n);
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::decodeEnum()
+{
+ parser.advance(Symbol::sEnum);
+ size_t result = base->decodeEnum();
+ parser.assertLessThanSize(result);
+ return result;
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::arrayStart()
+{
+ parser.advance(Symbol::sArrayStart);
+ size_t result = base->arrayStart();
+ parser.pushRepeatCount(result);
+ if (result == 0) {
+ parser.popRepeater();
+ parser.advance(Symbol::sArrayEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::arrayNext()
+{
+ size_t result = base->arrayNext();
+ parser.nextRepeatCount(result);
+ if (result == 0) {
+ parser.popRepeater();
+ parser.advance(Symbol::sArrayEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::skipArray()
+{
+ parser.advance(Symbol::sArrayStart);
+ size_t n = base->skipArray();
+ if (n == 0) {
+ parser.pop();
+ } else {
+ parser.pushRepeatCount(n);
+ parser.skip(*base);
+ }
+ parser.advance(Symbol::sArrayEnd);
+ return 0;
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::mapStart()
+{
+ parser.advance(Symbol::sMapStart);
+ size_t result = base->mapStart();
+ parser.pushRepeatCount(result);
+ if (result == 0) {
+ parser.popRepeater();
+ parser.advance(Symbol::sMapEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::mapNext()
+{
+ size_t result = base->mapNext();
+ parser.nextRepeatCount(result);
+ if (result == 0) {
+ parser.popRepeater();
+ parser.advance(Symbol::sMapEnd);
+ }
+ return result;
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::skipMap()
+{
+ parser.advance(Symbol::sMapStart);
+ size_t n = base->skipMap();
+ if (n == 0) {
+ parser.pop();
+ } else {
+ parser.pushRepeatCount(n);
+ parser.skip(*base);
+ }
+ parser.advance(Symbol::sMapEnd);
+ return 0;
+}
+
+template <typename P>
+size_t ValidatingDecoder<P>::decodeUnionIndex()
+{
+ parser.advance(Symbol::sUnion);
+ size_t result = base->decodeUnionIndex();
+ parser.selectBranch(result);
+ return result;
+}
+
+template <typename P>
+class ValidatingEncoder : public Encoder {
+ DummyHandler handler_;
+ P parser_;
+ EncoderPtr base_;
+
+ void init(OutputStream& os);
+ void flush();
+ int64_t byteCount() const;
+ void encodeNull();
+ void encodeBool(bool b);
+ void encodeInt(int32_t i);
+ void encodeLong(int64_t l);
+ void encodeFloat(float f);
+ void encodeDouble(double d);
+ void encodeString(const std::string& s);
+ void encodeBytes(const uint8_t *bytes, size_t len);
+ void encodeFixed(const uint8_t *bytes, size_t len);
+ void encodeEnum(size_t e);
+ void arrayStart();
+ void arrayEnd();
+ void mapStart();
+ void mapEnd();
+ void setItemCount(size_t count);
+ void startItem();
+ void encodeUnionIndex(size_t e);
+public:
+ ValidatingEncoder(const ValidSchema& schema, const EncoderPtr& base) :
+ parser_(ValidatingGrammarGenerator().generate(schema), NULL, handler_),
+ base_(base) { }
+};
+
+template<typename P>
+void ValidatingEncoder<P>::init(OutputStream& os)
+{
+ base_->init(os);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::flush()
+{
+ base_->flush();
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeNull()
+{
+ parser_.advance(Symbol::sNull);
+ base_->encodeNull();
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeBool(bool b)
+{
+ parser_.advance(Symbol::sBool);
+ base_->encodeBool(b);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeInt(int32_t i)
+{
+ parser_.advance(Symbol::sInt);
+ base_->encodeInt(i);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeLong(int64_t l)
+{
+ parser_.advance(Symbol::sLong);
+ base_->encodeLong(l);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeFloat(float f)
+{
+ parser_.advance(Symbol::sFloat);
+ base_->encodeFloat(f);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeDouble(double d)
+{
+ parser_.advance(Symbol::sDouble);
+ base_->encodeDouble(d);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeString(const std::string& s)
+{
+ parser_.advance(Symbol::sString);
+ base_->encodeString(s);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeBytes(const uint8_t *bytes, size_t len)
+{
+ parser_.advance(Symbol::sBytes);
+ base_->encodeBytes(bytes, len);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeFixed(const uint8_t *bytes, size_t len)
+{
+ parser_.advance(Symbol::sFixed);
+ parser_.assertSize(len);
+ base_->encodeFixed(bytes, len);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeEnum(size_t e)
+{
+ parser_.advance(Symbol::sEnum);
+ parser_.assertLessThanSize(e);
+ base_->encodeEnum(e);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::arrayStart()
+{
+ parser_.advance(Symbol::sArrayStart);
+ parser_.pushRepeatCount(0);
+ base_->arrayStart();
+}
+
+template<typename P>
+void ValidatingEncoder<P>::arrayEnd()
+{
+ parser_.popRepeater();
+ parser_.advance(Symbol::sArrayEnd);
+ base_->arrayEnd();
+}
+
+template<typename P>
+void ValidatingEncoder<P>::mapStart()
+{
+ parser_.advance(Symbol::sMapStart);
+ parser_.pushRepeatCount(0);
+ base_->mapStart();
+}
+
+template<typename P>
+void ValidatingEncoder<P>::mapEnd()
+{
+ parser_.popRepeater();
+ parser_.advance(Symbol::sMapEnd);
+ base_->mapEnd();
+}
+
+template<typename P>
+void ValidatingEncoder<P>::setItemCount(size_t count)
+{
+ parser_.nextRepeatCount(count);
+ base_->setItemCount(count);
+}
+
+template<typename P>
+void ValidatingEncoder<P>::startItem()
+{
+ if (parser_.top() != Symbol::sRepeater) {
+ throw Exception("startItem called when not at an item boundary");
+ }
+ base_->startItem();
+}
+
+template<typename P>
+void ValidatingEncoder<P>::encodeUnionIndex(size_t e)
+{
+ parser_.advance(Symbol::sUnion);
+ parser_.selectBranch(e);
+ base_->encodeUnionIndex(e);
+}
+
+template<typename P>
+int64_t ValidatingEncoder<P>::byteCount() const
+{
+ return base_->byteCount();
+}
+
+} // namespace parsing
+
+DecoderPtr validatingDecoder(const ValidSchema& s,
+ const DecoderPtr& base)
+{
+ return make_shared<parsing::ValidatingDecoder<parsing::SimpleParser<parsing::DummyHandler> > >(s, base);
+}
+
+EncoderPtr validatingEncoder(const ValidSchema& schema, const EncoderPtr& base)
+{
+ return make_shared<parsing::ValidatingEncoder<parsing::SimpleParser<parsing::DummyHandler> > >(schema, base);
+}
+
+} // namespace avro
+
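The two factory functions at the end are the public entry points: they sandwich a SimpleParser between the caller and a real codec, so every encode/decode call is checked against the schema grammar before being forwarded. A usage sketch against the public Avro C++ API (the record schema is an illustrative assumption):

#include <avro/Compiler.hh>
#include <avro/Decoder.hh>
#include <avro/Encoder.hh>
#include <avro/Stream.hh>
#include <memory>
#include <string>

int main() {
    avro::ValidSchema schema = avro::compileJsonSchemaFromString(
        R"({"type":"record","name":"r","fields":[)"
        R"({"name":"a","type":"long"},{"name":"b","type":"string"}]})");

    // Encode: the validating wrapper checks each call against the grammar.
    std::unique_ptr<avro::OutputStream> out = avro::memoryOutputStream();
    avro::EncoderPtr e = avro::validatingEncoder(schema, avro::binaryEncoder());
    e->init(*out);
    e->encodeLong(42);            // calling encodeString() first would throw
    e->encodeString("hi");
    e->flush();

    // Decode: the same grammar validates the read sequence.
    std::unique_ptr<avro::InputStream> in = avro::memoryInputStream(*out);
    avro::DecoderPtr d = avro::validatingDecoder(schema, avro::binaryDecoder());
    d->init(*in);
    auto a = d->decodeLong();
    std::string b;
    d->decodeString(b);
    (void)a;
}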
diff --git a/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.hh b/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.hh
index 39ceda033e0..b90b3ea64a7 100644
--- a/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.hh
+++ b/contrib/libs/apache/avro/impl/parsing/ValidatingCodec.hh
@@ -1,51 +1,51 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef avro_parsing_ValidatingCodec_hh__
-#define avro_parsing_ValidatingCodec_hh__
-
-#include <map>
-#include <vector>
-
-#include "Symbol.hh"
-#include "ValidSchema.hh"
-#include "NodeImpl.hh"
-
-namespace avro {
-namespace parsing {
-
-class ValidatingGrammarGenerator {
-protected:
- template<typename T>
- static void doFixup(Production& p, const std::map<T, ProductionPtr> &m);
-
- template<typename T>
- static void doFixup(Symbol &s, const std::map<T, ProductionPtr> &m);
- virtual ProductionPtr doGenerate(const NodePtr& n,
- std::map<NodePtr, ProductionPtr> &m);
-
- ProductionPtr generate(const NodePtr& schema);
-public:
- Symbol generate(const ValidSchema& schema);
-
-};
-
-} // namespace parsing
-} // namespace avro
-
-#endif
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef avro_parsing_ValidatingCodec_hh__
+#define avro_parsing_ValidatingCodec_hh__
+
+#include <map>
+#include <vector>
+
+#include "Symbol.hh"
+#include "ValidSchema.hh"
+#include "NodeImpl.hh"
+
+namespace avro {
+namespace parsing {
+
+class ValidatingGrammarGenerator {
+protected:
+ template<typename T>
+ static void doFixup(Production& p, const std::map<T, ProductionPtr> &m);
+
+ template<typename T>
+ static void doFixup(Symbol &s, const std::map<T, ProductionPtr> &m);
+ virtual ProductionPtr doGenerate(const NodePtr& n,
+ std::map<NodePtr, ProductionPtr> &m);
+
+ ProductionPtr generate(const NodePtr& schema);
+public:
+ Symbol generate(const ValidSchema& schema);
+
+};
+
+} // namespace parsing
+} // namespace avro
+
+#endif
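
Note how the pieces fit: generate(const ValidSchema&) compiles the schema into a grammar rooted at a single Symbol, which is then handed to a SimpleParser together with a handler. A hedged sketch of that wiring (these are implementation-private headers, and the helper below is hypothetical):

#include "parsing/ValidatingCodec.hh"   // implementation-private header

namespace avro { namespace parsing {

struct NoopHandler {                    // plays the role of DummyHandler
    size_t handle(const Symbol&) { return 0; }
};

// Hypothetical helper: compile a schema and wrap it in a parser. The handler
// must outlive the parser, which holds it by reference; base may be null on
// the encoder side, as in ValidatingEncoder.
SimpleParser<NoopHandler> makeParser(const ValidSchema& schema,
                                     Decoder* base, NoopHandler& h) {
    Symbol root = ValidatingGrammarGenerator().generate(schema);
    return SimpleParser<NoopHandler>(root, base, h);
}

}} // namespace avro::parsing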
diff --git a/contrib/libs/apache/avro/ya.make b/contrib/libs/apache/avro/ya.make
index 80197fdc47c..11e72e643ac 100644
--- a/contrib/libs/apache/avro/ya.make
+++ b/contrib/libs/apache/avro/ya.make
@@ -1,14 +1,14 @@
-# Generated by devtools/yamaker from nixpkgs 5701e5bc3bfb317e1f37ff3fb889eae7584a1206.
-
-LIBRARY()
-
+# Generated by devtools/yamaker from nixpkgs 5701e5bc3bfb317e1f37ff3fb889eae7584a1206.
+
+LIBRARY()
+
OWNER(
- g:cpp-contrib
+ g:cpp-contrib
g:yql
)
-
-VERSION(1.10.2)
-
+
+VERSION(1.10.2)
+
ORIGINAL_SOURCE(https://github.com/apache/avro/archive/release-1.10.2.tar.gz)
LICENSE(
@@ -16,53 +16,53 @@ LICENSE(
BSL-1.0 AND
FSFAP
)
-
+
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
- contrib/libs/snappy
- contrib/restricted/boost
+PEERDIR(
+ contrib/libs/snappy
+ contrib/restricted/boost
contrib/restricted/boost/libs/iostreams
-)
-
+)
+
ADDINCL(
contrib/libs/apache/avro/api
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-CFLAGS(
- -DAVRO_SOURCE
- -DSNAPPY_CODEC_AVAILABLE
-)
-
-SRCS(
- impl/BinaryDecoder.cc
- impl/BinaryEncoder.cc
- impl/Compiler.cc
- impl/DataFile.cc
- impl/FileStream.cc
- impl/Generic.cc
- impl/GenericDatum.cc
- impl/LogicalType.cc
- impl/Node.cc
- impl/NodeImpl.cc
- impl/Resolver.cc
- impl/ResolverSchema.cc
- impl/Schema.cc
- impl/Stream.cc
- impl/Types.cc
- impl/ValidSchema.cc
- impl/Validator.cc
- impl/Zigzag.cc
- impl/json/JsonDom.cc
- impl/json/JsonIO.cc
- impl/parsing/JsonCodec.cc
- impl/parsing/ResolvingDecoder.cc
- impl/parsing/Symbol.cc
- impl/parsing/ValidatingCodec.cc
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+CFLAGS(
+ -DAVRO_SOURCE
+ -DSNAPPY_CODEC_AVAILABLE
+)
+
+SRCS(
+ impl/BinaryDecoder.cc
+ impl/BinaryEncoder.cc
+ impl/Compiler.cc
+ impl/DataFile.cc
+ impl/FileStream.cc
+ impl/Generic.cc
+ impl/GenericDatum.cc
+ impl/LogicalType.cc
+ impl/Node.cc
+ impl/NodeImpl.cc
+ impl/Resolver.cc
+ impl/ResolverSchema.cc
+ impl/Schema.cc
+ impl/Stream.cc
+ impl/Types.cc
+ impl/ValidSchema.cc
+ impl/Validator.cc
+ impl/Zigzag.cc
+ impl/json/JsonDom.cc
+ impl/json/JsonIO.cc
+ impl/parsing/JsonCodec.cc
+ impl/parsing/ResolvingDecoder.cc
+ impl/parsing/Symbol.cc
+ impl/parsing/ValidatingCodec.cc
+)
+
+END()
diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.hh b/contrib/libs/apache/orc/c++/src/Adaptor.hh
index a91b9c894db..95d6fccec11 100644
--- a/contrib/libs/apache/orc/c++/src/Adaptor.hh
+++ b/contrib/libs/apache/orc/c++/src/Adaptor.hh
@@ -21,9 +21,9 @@
/* #undef INT64_IS_LL */
#define HAS_CONSTEXPR
-#ifndef _MSC_VER
+#ifndef _MSC_VER
#define HAS_PREAD
-#endif
+#endif
#define HAS_STRPTIME
#define HAS_STOLL
#define HAS_DIAGNOSTIC_PUSH
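
Adaptor.hh centralizes platform feature probes; here HAS_PREAD is defined everywhere except under MSVC, and downstream code branches on it. An illustrative sketch of how such a macro is typically consumed (not the actual ORC read path):

#include <sys/types.h>
#include <unistd.h>

// Positioned read with a portable fallback; HAS_PREAD comes from Adaptor.hh.
ssize_t readAt(int fd, void* buf, size_t len, off_t off) {
#ifdef HAS_PREAD
    return pread(fd, buf, len, off);      // single syscall, no shared offset
#else
    if (lseek(fd, off, SEEK_SET) < 0)     // e.g. the non-pread path
        return -1;
    return read(fd, buf, len);
#endif
}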
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
index 6e54b1412fd..dd1fadfa0ab 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc
@@ -115,7 +115,7 @@ namespace orc {
return false;
}
- int64_t SeekableArrayInputStream::ByteCount() const {
+ int64_t SeekableArrayInputStream::ByteCount() const {
return static_cast<google::protobuf::int64>(position);
}
diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
index d8bd3d4d8ce..68506a5a017 100644
--- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh
@@ -76,7 +76,7 @@ namespace orc {
virtual bool Next(const void** data, int*size) override;
virtual void BackUp(int count) override;
virtual bool Skip(int count) override;
- virtual int64_t ByteCount() const override;
+ virtual int64_t ByteCount() const override;
virtual void seek(PositionProvider& position) override;
virtual std::string getName() const override;
};
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
index 11a21c0bd35..db894f38d32 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
@@ -67,7 +67,7 @@ namespace orc {
}
}
- int64_t BufferedOutputStream::ByteCount() const {
+ int64_t BufferedOutputStream::ByteCount() const {
return static_cast<google::protobuf::int64>(dataBuffer->size());
}
diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
index 7ce9fafa240..f3c5e6332ac 100644
--- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
+++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh
@@ -55,7 +55,7 @@ namespace orc {
virtual bool Next(void** data, int*size) override;
virtual void BackUp(int count) override;
- virtual int64_t ByteCount() const override;
+ virtual int64_t ByteCount() const override;
virtual bool WriteAliasedRaw(const void * data, int size) override;
virtual bool AllowsAliasing() const override;
diff --git a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
index 605fbf826ce..19d2761263d 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h
@@ -28,7 +28,7 @@ DIAGNOSTIC_PUSH
DIAGNOSTIC_IGNORE("-Wconversion")
#endif
-#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/io/coded_stream.h>
DIAGNOSTIC_POP
diff --git a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
index 5c161660cca..d545e73d772 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
+++ b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh
@@ -40,7 +40,7 @@ DIAGNOSTIC_PUSH
DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false'
#endif
-#include "contrib/libs/apache/orc/proto/orc_proto.pb.h"
+#include "contrib/libs/apache/orc/proto/orc_proto.pb.h"
DIAGNOSTIC_POP
diff --git a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
index 1af0bd002d4..52af9dfa40c 100644
--- a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
+++ b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h
@@ -29,7 +29,7 @@ DIAGNOSTIC_PUSH
DIAGNOSTIC_IGNORE("-Wreserved-id-macro")
#endif
-#include <google/protobuf/io/zero_copy_stream.h>
+#include <google/protobuf/io/zero_copy_stream.h>
DIAGNOSTIC_POP
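
These wrap/*.h headers all follow one pattern: push the diagnostic state, silence a specific warning, include the noisy third-party header, then pop so the project's own warnings stay enabled. A self-contained sketch of the pattern (the macro definitions are modeled on ORC's Adaptor.hh, not copied from it):

#if defined(__GNUC__) || defined(__clang__)
  #define PRAGMA(txt) _Pragma(#txt)
  #define DIAGNOSTIC_PUSH PRAGMA(GCC diagnostic push)
  #define DIAGNOSTIC_POP PRAGMA(GCC diagnostic pop)
  #define DIAGNOSTIC_IGNORE(w) PRAGMA(GCC diagnostic ignored w)
#else
  #define DIAGNOSTIC_PUSH
  #define DIAGNOSTIC_POP
  #define DIAGNOSTIC_IGNORE(w)
#endif

DIAGNOSTIC_PUSH
DIAGNOSTIC_IGNORE("-Wconversion")               // silenced for this include only
#include <google/protobuf/io/zero_copy_stream.h>
DIAGNOSTIC_POP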
diff --git a/contrib/libs/apache/orc/ya.make b/contrib/libs/apache/orc/ya.make
index 5672ba95db8..050fd90f1c0 100644
--- a/contrib/libs/apache/orc/ya.make
+++ b/contrib/libs/apache/orc/ya.make
@@ -1,4 +1,4 @@
-# Generated by devtools/yamaker from nixpkgs e392df43c9f302d4a0892caaadcad3cd693edf9e.
+# Generated by devtools/yamaker from nixpkgs e392df43c9f302d4a0892caaadcad3cd693edf9e.
LIBRARY()
@@ -12,7 +12,7 @@ VERSION(1.6.12)
ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-1.6.12.tar.gz)
LICENSE(Apache-2.0)
-
+
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
PEERDIR(
@@ -23,11 +23,11 @@ PEERDIR(
)
ADDINCL(
- contrib/libs/apache/orc/c++/include
- contrib/libs/apache/orc/c++/src
- contrib/libs/apache/orc/proto
+ contrib/libs/apache/orc/c++/include
+ contrib/libs/apache/orc/c++/src
+ contrib/libs/apache/orc/proto
contrib/libs/lz4
- contrib/libs/zstd/include
+ contrib/libs/zstd/include
)
NO_COMPILER_WARNINGS()
diff --git a/contrib/libs/apache/ya.make b/contrib/libs/apache/ya.make
index 31f70e39c5b..a29f79c3114 100644
--- a/contrib/libs/apache/ya.make
+++ b/contrib/libs/apache/ya.make
@@ -1,5 +1,5 @@
-RECURSE(
- arrow
- avro
- orc
-)
+RECURSE(
+ arrow
+ avro
+ orc
+)